In [1]:
import numpy as np
import pandas as pd
import requests
import re

from bs4 import BeautifulSoup

DATAFOLDER = '../data/'

In [94]:
#
# Data cleaning for the crawler
#

def clean_breed(breed):
    breed = breed.upper()
    breed = re.sub(' *MIX *', '', breed)
    breed = breed.replace(' Mix ', '')
    breed = breed.replace(' Mix', '')
    breed = breed.replace('Mix ', '')
    breed = breed.replace('Shorthair', '')
    breed = breed.replace(' ', '+')
    return breed.split('/')

breeds = pd.read_csv(DATAFOLDER + 'dog_breeds.csv')['Breeds']
breeds = list(set(b for bb in breeds for b in clean_breed(bb)))

In [103]:
#
# CRAWLER
#
# Objective:
# ----------
# Find the group of each breed that we have in the dataset.
#
# Steps:
# ------
# 1. Simulate a search with a breed name on the FCI website
# 2. Visit all the result pages until:
#     - there is no more pages
#     - the breed is found
#
# Results:
# --------
# 1. dictionary with all the breeds found by the crawler with
#    the corresponding group
# 2. list of the not found breeds that need a manual search on the web
#
# Examples of special breeds:
# ---------------------------
# -> valid answer: ENGLISH POINTER
# -> no answer: POMERANIAN
# -> no valid answer: ENGLISH SHEPHERD
#

search_url = 'http://www.fci.be/en/nomenclature/races.aspx?search='
for breed in breeds[:30]:
    print(breed)
    url = search_url + breed
    r = requests.get(url)
    bs = BeautifulSoup(r.content, 'html.parser')
    search_results = bs.select('ul.listeraces a')
    ok = False
    result_counter = 0
    while not ok and result_counter < len(search_results):
        result_href = search_results[result_counter]
        result_counter += 1
        result_url = 'http://www.fci.be' + result_href.get('href')
        r2 = requests.get(result_url)
        bs2 = BeautifulSoup(r2.content, 'html.parser')
        css_ids = ['#ContentPlaceHolder1_NomEnLabel',
                   '#ContentPlaceHolder1_NomFRLabel',
                   '#ContentPlaceHolder1_NomDELabel',
                   '#ContentPlaceHolder1_NomESLabel']
        possible_names = [bs2.select(css_id)[0].text.replace(' ', '+').upper() 
                              for css_id in css_ids]
        # Check if the result correspond to the breed name
        if any(breed in name or name in breed for name in possible_names):
            print('\t ok')
            group = bs2.select('#ContentPlaceHolder1_GroupeHyperLink')
            group[0].get('href').find('clature/')
            print('\t Group', group[0].get('href')[9+len('clature/')])
            ok = True
        else:
            print('\t not ok')

ENGLISH+POINTER
	 ok
	 Group 7
ENGLISH+SHEPHERD
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
POMERANIAN
TAN+HOUND
	 ok
	 Group 6
POINTER
	 ok
	 Group 7
REDBONE+HOUND
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
DOGUE+DE+BORDEAUX
	 ok
	 Group 2
AUSTRALIAN+CATTLE+DOG
	 ok
	 Group 1
SWISS+HOUND
	 ok
	 Group 6
BELGIAN+SHEEPDOG
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
	 not ok
CANE+CORSO
	 not ok
	 not ok
	 not ok
	 not ok
NORFOLK+TERRIER
	 not ok
	 ok
	 Group 3
TREEING+TENNESSE+BRIN