# Dog breed preprocessing

This notebook maps each dog breed to its group as defined in the [FCI Nomenclature](http://www.fci.be/en/Nomenclature/).

Note: to run this notebook, set the `DATAFOLDER` variable to the folder that contains your `train.csv` file.

In [1]:
import numpy as np
import pandas as pd
import requests
import re

from bs4 import BeautifulSoup

# Folder that holds train.csv; breed_group.csv is written to the same folder.
# Keep the trailing slash: file paths are built by plain string concatenation.
DATAFOLDER = '../data/'

In [None]:
#
# Data cleaning for the crawler
#

def clean_breed(breed):
    """Turn a raw breed label into a list of search-ready breed names.

    The label is upper-cased, every occurrence of 'MIX' (together with the
    spaces around it) is removed, the remaining spaces become '+', and the
    result is split on '/' so that a crossbreed yields one entry per breed.

    Parameters
    ----------
    breed : str
        Raw breed label, e.g. 'Pit Bull/Boxer'.

    Returns
    -------
    list of str
        One normalised name per '/'-separated part of the label.
    """
    shouted = breed.upper()
    without_mix = re.sub(' *MIX *', '', shouted)
    return without_mix.replace(' ', '+').split('/')

# Load the training set and keep the distinct breed labels of the dogs.
# Each stage gets its own name so `breeds` is only ever the final list.
train_raw = pd.read_csv(DATAFOLDER + 'train.csv')
dog_breed_labels = (
    train_raw.loc[train_raw['AnimalType'] == 'Dog', 'Breed']
    .dropna()
    .unique()
)

# A label may name several breeds ('A/B'), so flatten the cleaned parts and
# de-duplicate them. Sorting makes the crawl order (and everything derived
# from it) reproducible; a bare set iterates in a different order on each
# kernel start because string hashing is randomised.
breeds = sorted({name for label in dog_breed_labels for name in clean_breed(label)})

In [None]:
#
# CRAWLER
#
# Objective:
# ----------
# Find the group of each breed that we have in the dataset.
#
# Steps:
# ------
# 1. Simulate a search with a breed name on the FCI website
# 2. Visit all the result pages until:
#     - there are no more pages
#     - the breed is found
#
# Results:
# --------
# 1. dictionary with all the breeds found by the crawler with
#    the corresponding group
# 2. list of the not found breeds that need a manual search on the web
#
# Examples of special breeds:
# ---------------------------
# -> valid answer: ENGLISH POINTER
# -> no answer: POMERANIAN
# -> no valid answer: ENGLISH SHEPHERD
#

# Seconds to wait for the FCI server before giving up on a request; without
# a timeout a stalled connection would block the notebook forever.
REQUEST_TIMEOUT = 30

site_url = 'http://www.fci.be'
search_url = 'http://www.fci.be/en/nomenclature/races.aspx?search='

# Elements of a breed page that hold the breed name (judging by the ids:
# English, French, German and Spanish variants).
css_ids = ['#ContentPlaceHolder1_NomEnLabel',
           '#ContentPlaceHolder1_NomFRLabel',
           '#ContentPlaceHolder1_NomDELabel',
           '#ContentPlaceHolder1_NomESLabel']


def fetch_page(url):
    """Download `url` and return its content parsed with 'html.parser'."""
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    return BeautifulSoup(response.content, 'html.parser')


def page_names(page):
    """Return the breed names shown on a breed page, normalised like `breeds`.

    Missing, empty or whitespace-only labels are skipped: an empty name is a
    substring of every breed and would make any page look like a match.
    """
    names = []
    for css_id in css_ids:
        labels = page.select(css_id)
        if not labels:
            continue
        name = labels[0].text.strip().replace(' ', '+').upper()
        if name:
            names.append(name)
    return names


def group_from_href(href):
    """Return the group number found in the href of a group link.

    The number is read at the offset this crawler has always used, i.e.
    right after a 17-character prefix (presumably '/en/nomenclature/' --
    TODO confirm against the live site). All consecutive digits are kept so
    that group 10 is not truncated to '1'; if no digit is found there, the
    single character at that offset is returned, as before.
    """
    offset = 9 + len('clature/')
    digits = re.match(r'\d+', href[offset:])
    return digits.group() if digits else href[offset]


breed_group = dict()
breed_not_found = []
for breed in breeds:
    search_page = fetch_page(search_url + breed)
    for result_link in search_page.select('ul.listeraces a'):
        breed_page = fetch_page(site_url + result_link.get('href'))
        # Check whether the result corresponds to the breed name
        if any(breed in name or name in breed for name in page_names(breed_page)):
            group_link = breed_page.select('#ContentPlaceHolder1_GroupeHyperLink')
            breed_group[breed] = group_from_href(group_link[0].get('href'))
            break
    else:
        # Reached when no result matched, including when the search returned
        # no result at all, so every breed ends up in exactly one of the two
        # collections.
        breed_not_found.append(breed)

In [None]:
size = 20


def _print_banner(title, blank_line_before=False):
    """Print `title` framed by two rules made of `size` '#' characters."""
    rule = '#' * size
    print('\n' + rule if blank_line_before else rule)
    print(title)
    print(rule)


# Breeds resolved so far, with '+' turned back into spaces for readability.
_print_banner('# Known Groups')
row = '{:35} {}'
print(row.format('BREED', 'GROUP'))
for name, group in breed_group.items():
    print(row.format(name.replace('+', ' '), group))

# Breeds that still need a manual lookup, in alphabetical order.
_print_banner('# Unknown Groups', blank_line_before=True)
for name in sorted(breed_not_found):
    print(name.replace('+', ' '))

# Size of each of the two collections.
_print_banner('# Number of each category', blank_line_before=True)
print('GROUP    ', len(breed_group))
print('W/O GROUP', len(breed_not_found))

Now, for all the breeds that were not found, I am going to check them manually, using both the FCI website and Wikipedia.

The FCI groups are numbered as follows:

1. Sheepdogs and Cattledogs (except Swiss Cattledogs)
2. Pinscher and Schnauzer - Molossoid and Swiss Mountain and Cattledogs
3. Terriers
4. Dachshunds
5. Spitz and primitive types
6. Scent hounds and related breeds
7. Pointing Dogs
8. Retrievers - Flushing Dogs - Water Dogs
9. Companion and Toy Dogs
10. Sighthounds

In [None]:
# Groups assigned by hand to breeds the crawler could not resolve.
# Each pair is (breed name in the crawler's upper-case, '+'-separated form,
# group number as a string).
manual_pairs = [
    ('ALASKAN+HUSKY',               '5'),
    ('AMERICAN+ESKIMO',             '6'),  # NOTE(review): spitz-type breed, group 5 may have been intended -- confirm
    ('ANATOL+SHEPHERD',             '1'),  # NOTE(review): FCI appears to list the Anatolian/Kangal shepherd in group 2 -- confirm
    ('BELGIAN+MALINOIS',            '1'),
    ('BELGIAN+SHEEPDOG',            '1'),
    ('BELGIAN+TERVUREN',            '1'),
    ('BLACK+MOUTH+CUR',             '6'),
    ('BLUE+LACY',                   '6'),  # not an official breed
    ('BLUETICK+HOUND',              '6'),
    ('BRUSS+GRIFFON',               '9'),  # Brussels Griffon
    ('CANE+CORSO',                  '2'),
    ('CARDIGAN+WELSH+CORGI',        '1'),
    ('CAROLINA+DOG',                '1'),  # not an official breed
    ('CAVALIER+SPAN',               '9'),
    ('CHESA+BAY+RETR',              '8'),
    ('CHINESE+SHARPEI',             '2'),
    ('DOBERMAN+PINSCH',             '2'),
    ('ENGLISH+COONHOUND',           '6'),
    ('ENGLISH+SHEPHERD',            '1'),
    ('FLAT+COAT+RETRIEVER',         '8'),
    ('GERMAN+SHORTHAIR+POINTER',    '7'),
    ('GERMAN+WIREHAIRED+POINTER',   '7'),
    ('GREAT+PYRENEES',              '2'),
    ('GREATER+SWISS+MOUNTAIN+DOG',  '2'),
    ('IBIZAN+HOUND',                '5'),
    ('IRISH+SETTER',                '7'),
    ('MEXICAN+HAIRLESS',            '5'),
    ('PEMBROKE+WELSH+CORGI',        '1'),
    ('PIT+BULL',                    '3'),
    ('PLOTT+HOUND',                 '6'),
    ('PODENGO+PEQUENO',             '5'),
    ('PORT+WATER+DOG',              '8'),  # Portuguese Water Dog
    ('PRESA+CANARIO',               '2'),
    ('RAT+TERRIER',                 '3'),
    ('REDBONE+HOUND',               '6'),
    ('RHOD+RIDGEBACK',              '6'),  # Rhodesian Ridgeback
    ('SMOOTH+FOX+TERRIER',          '3'),
    ('TOY+FOX+TERRIER',             '9'),
    ('TREEING+CUR',                 '2'),  # not an official breed
    ('TREEING+WALKER+COONHOUND',    '6'),
    ('WIRE+HAIR+FOX+TERRIER',       '3'),
    ('WIREHAIRED+POINTING+GRIFFON', '7'),
]
breed_group_manual = dict(manual_pairs)

# Merge the hand-made entries into the crawled ones; on a key present in
# both, the manual value replaces the crawled one.
for manual_breed, manual_group in breed_group_manual.items():
    breed_group[manual_breed] = manual_group

In [None]:
#
# Dump breed/group correspondence in file
#
# pandas handles CSV quoting, the encoding is explicit, and rows are sorted
# by breed so the file is identical from one run to the next.
breed_group_table = pd.DataFrame(sorted(breed_group.items()),
                                 columns=['BREED', 'GROUP'])
breed_group_table.to_csv(DATAFOLDER + 'breed_group.csv',
                         index=False, encoding='utf-8')