# Model 0.05 - Breeds
 - tbc

In [2]:
import ml_metrics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

  from numpy.core.umath_tests import inner1d


In [3]:
rnd = 42  # random state for scoring consistency

## Importing the data

In [46]:
df_train = pd.read_csv("../input/train/train.csv", index_col="PetID")
df_test = pd.read_csv("../input/test/test.csv", index_col="PetID")
df_breeds = pd.read_csv("../input/breed_labels.csv", index_col="BreedID")
df_colors = pd.read_csv("../input/color_labels.csv")

In [48]:
colors = df_colors['ColorID']
breeds = df_breeds.index

## Analysing Breeds
 - there is a breed category for mixed breeds
 - test data has 13 unseen breeds
 - 65 breeds are exclusively in training data
 - breeds meta data has 106 breeds unseen by test or training data
 - there are breeds only seen once

### Plan
 - ~~Recreate breeds list, using grouping to group similar breeds (alleviate overfitting)~~ Extract all words from breed(s) name and use each as a feature
 - Only consider breeds present in the test data ✓
 - count of breeds (1 or 2) ✓ _insignificant difference_


In [7]:
# count of distinct training Breeds
print(len(df_train['Breed1'].unique()))
print(len(df_train['Breed2'].unique()))

176
135


In [36]:
# distinct count of all training Breeds
all_train_breeds = df_train['Breed1'].append(df_train['Breed2'])
len(all_train_breeds.unique())

188

In [37]:
# distinct count of all test Breeds
all_test_breeds = df_test['Breed1'].append(df_test['Breed2'])
len(all_test_breeds.unique())

136

In [26]:
# common Breeds between test and training data
len(set(all_train_breeds) & set(all_test_breeds))

123

In [34]:
# breeds only present in training data
train_only_breeds = set(all_train_breeds) - set(all_test_breeds)
len(train_only_breeds)

65

In [32]:
# breeds only present in test data
test_only_breeds = set(all_test_breeds) - set(all_train_breeds)
len(test_only_breeds)

13

In [56]:
# top breeds (training data)
cnt1 = Counter(all_train_breeds.map(df_breeds['BreedName']))
cnt1.most_common(10)

[(nan, 10767),
 ('Mixed Breed', 7654),
 ('Domestic Short Hair', 4233),
 ('Domestic Medium Hair', 1579),
 ('Tabby', 480),
 ('Domestic Long Hair', 421),
 ('Siamese', 369),
 ('Persian', 299),
 ('Labrador Retriever', 291),
 ('Terrier', 253)]

In [59]:
# least common breeds (training data)
cnt1.most_common()[-10:]

[('Dutch Shepherd', 1),
 ('Bluetick Coonhound', 1),
 ('Akbash', 1),
 ('Border Terrier', 1),
 ('Norfolk Terrier', 1),
 ('Afghan Hound', 1),
 ('German Shorthaired Pointer', 1),
 ('Selkirk Rex', 1),
 ('Smooth Fox Terrier', 1),
 ('Harrier', 1)]

In [69]:
# top breeds (test data)
cnt2 = Counter(all_test_breeds.map(df_breeds['BreedName']))
cnt2.most_common(10)

[(nan, 3078),
 ('Mixed Breed', 1604),
 ('Domestic Short Hair', 1236),
 ('Domestic Medium Hair', 428),
 ('Tabby', 189),
 ('Domestic Long Hair', 126),
 ('Persian', 82),
 ('Siamese', 77),
 ('Shih Tzu', 74),
 ('Calico', 57)]

In [60]:
# least common breeds (test data)
cnt2.most_common()[-10:]

[('Chausie', 1),
 ('Devon Rex', 1),
 ('Chinese Foo Dog', 1),
 ('Shepherd', 1),
 ('Great Dane', 1),
 ('Patterdale Terrier (Fell Terrier)', 1),
 ('Eskimo Dog', 1),
 ('French Bulldog', 1),
 ('Turkish Van', 1),
 ('Jack Russell Terrier (Parson Russell Terrier)', 1)]

In [87]:
# Breeds containing "Jack"
df_breeds['BreedName'][df_breeds['BreedName'].str.find("Jack") >= 0]

BreedID
128                             Jack Russell Terrier
129    Jack Russell Terrier (Parson Russell Terrier)
Name: BreedName, dtype: object

In [88]:
# Breeds containing "Terrier"
df_breeds['BreedName'][df_breeds['BreedName'].str.find("Terrier") >= 0]

BreedID
3                                   Airedale Terrier
9                          American Hairless Terrier
10                    American Staffordshire Terrier
17                                Australian Terrier
23                                Bedlington Terrier
33                             Black Russian Terrier
40                                    Border Terrier
42                                    Boston Terrier
49                                      Bull Terrier
51                                     Cairn Terrier
77                             Dandi Dinmont Terrier
98                                       Fox Terrier
108                            Glen of Imaal Terrier
123                                    Irish Terrier
128                             Jack Russell Terrier
129    Jack Russell Terrier (Parson Russell Terrier)
135                               Kerry Blue Terrier
142                                 Lakeland Terrier
148                               Manc

In [89]:
# Breeds containing "Shepherd"
df_breeds['BreedName'][df_breeds['BreedName'].str.find("Shepherd") >= 0]

BreedID
12                Anatolian Shepherd
16               Australian Shepherd
24     Belgian Shepherd Dog Sheepdog
25        Belgian Shepherd Laekenois
26         Belgian Shepherd Malinois
27         Belgian Shepherd Tervuren
81                    Dutch Shepherd
87                  English Shepherd
103              German Shepherd Dog
202                         Shepherd
234            White German Shepherd
Name: BreedName, dtype: object

In [77]:
# test data with no breed at all
len(df_test[df_test['Breed1'] == 0])

0

In [79]:
# training data with no breed at all - Breed in Breed2 column
df_train[df_train['Breed1'] == 0]

Unnamed: 0_level_0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Sterilized,Health,Quantity,Fee,State,RescuerID,VideoAmt,Description,PhotoAmt,AdoptionSpeed
PetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
375905770,1,Lil Milo,2,0,26,2,2,0,0,2,...,2,1,1,0,41326,1a2113010d6048d5410b265347b35c91,0,Milo went missing after a week with her new ad...,3.0,3
da8d4a273,1,Bella 4 Months Puppy!,4,0,307,2,2,3,0,2,...,2,1,1,100,41326,3673e167fc9932b13149bed1f2a0180a,0,"She's only 4 months old, very friendly and lov...",5.0,4
27e74e45c,2,,3,0,266,3,1,4,7,1,...,2,1,3,0,41401,f7cff59d10c867bdee12c3f35f34d086,0,Mama cat came to house and gave birth to these...,11.0,2
7b5bee232,1,"""Boy Boy""",72,0,307,1,1,2,0,2,...,2,1,1,0,41326,94b991f8dc1e0bb903ca8d4d492c8d43,0,He is a stray dog found wandering around Unive...,5.0,4
0327b8e94,1,Looking Newborn Puppy For Adoption,2,0,205,2,2,5,7,1,...,2,1,1,1,41336,5232bdd1444960a257ccec9a41404320,0,I want to adopt a newborn puppy Prefer small b...,0.0,3


In [85]:
# Breed IDs not included in either test or training data
len(df_breeds) - len((all_test_breeds.append(all_train_breeds)).unique())

106

## Functions

In [9]:
def apply_word_flags(df, words):
    """Creates binary columns for words which appear in the description"""
    for word in words:
        df[word] = 0
    for i, desc in df['Description'].items():
        try:
            for word in desc.split():
                word = word.lower()
                if word in words:
                    df.at[i,word] = 1
        except AttributeError:
            continue
    df = df.drop(columns=['Description'])
    return df

In [10]:
keywords = ['home', 'good' , 'adopt', 'loving', 'give', 'looking', 'playful', 'rescued', 'cat', 'contact']

In [11]:
def apply_color_flags(df, colors):
    """Combines Colors 1,2 & 3 into binary columns for each possible colours"""
    for c in colors:
        df[f'C{c}'] = 0
    for i,colors in df[['Color1', 'Color2', 'Color3']].iterrows():
        for c in colors:
            if c != 0:
                df.at[i,f'C{c}'] = 1
    df = df.drop(columns=['Color1', 'Color2', 'Color3'])
    return df

In [120]:
def apply_breed_flags(df, breeds):
    """Combines Breeds 1 & 2 into binary columns for each possible breed"""
    for b in breeds:
        df[f'B{b}'] = 0
    for i,pair in df[['Breed1', 'Breed2']].iterrows():
        for b in pair:
            if b in breeds:
                df.at[i,f'B{b}'] = 1
    return df

In [134]:
df_combined = pd.concat([df_test, df_train], sort=False)


Unnamed: 0_level_0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Health,Quantity,Fee,State,RescuerID,VideoAmt,Description,PhotoAmt,AdoptionSpeed,BreedCount
PetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
378fcc4fc,1,Puppy,2,307,0,1,1,0,0,2,...,1,1,150,41326,4475f31553f0170229455e3c5645644f,0,"Puppy is calm for a young dog, but he becomes ...",3.0,,1
73c10e136,2,London,24,266,0,1,2,7,0,2,...,1,1,0,41326,4475f31553f0170229455e3c5645644f,0,Urgently seeking adoption. Please contact for ...,1.0,,1
72000c4c5,2,Snowball,20,266,0,2,7,0,0,2,...,1,1,150,41326,4475f31553f0170229455e3c5645644f,0,Snowball... doesn't look so good (she is healt...,1.0,,1
e147a4b9f,2,Malibu,5,266,252,2,1,6,7,2,...,1,1,100,41326,4475f31553f0170229455e3c5645644f,0,"Malibu: Female, Local Mix, 4-5 months, vaccina...",1.0,,2
43fbba852,1,Lala Girl,6,307,0,2,1,2,7,2,...,1,1,150,41326,4475f31553f0170229455e3c5645644f,0,LALA! That's my name. I'm a 6 month old girl d...,1.0,,1
77a490ec9,1,Tipo,12,307,75,2,1,7,0,1,...,1,1,0,41326,4475f31553f0170229455e3c5645644f,0,This lovely girl just want to be loved. She's ...,2.0,,2
28c4b1b13,2,Emma,24,266,0,2,6,7,0,2,...,1,1,100,41326,4475f31553f0170229455e3c5645644f,0,"Emma: Female, Local Mix, 2-3 years old, vaccin...",1.0,,1
d1eada628,1,Bryani,12,307,0,1,3,7,0,2,...,1,1,0,41326,4475f31553f0170229455e3c5645644f,0,"I’m a very loyal dog, I love to be with other ...",2.0,,1
d134dec34,2,Shay,12,266,0,2,2,7,0,1,...,1,1,0,41326,4475f31553f0170229455e3c5645644f,0,Urgently seeking adoption. Please contact for ...,1.0,,1
bcd464bb8,1,Rusty,3,218,307,1,2,7,0,3,...,1,1,200,41326,4475f31553f0170229455e3c5645644f,0,Rusty is a Terrier Mix and despite being just ...,2.0,,2


## Preparing training data

In [135]:
# Combine test and training data
df_combined = pd.concat([df_test, df_train], sort=False)
df_combined['test'] = df_combined['AdoptionSpeed'].isna()

# Rescuer
rescue_map = Counter(df_combined['RescuerID'])
rescuer_counts = df_combined['RescuerID'].map(rescue_map)

# Breed
test_breeds = df_test['Breed1'].append(df_test['Breed2']).unique()
breed1_flag = (df_combined['Breed1'] > 0).astype(int)
breed2_flag = (df_combined['Breed2'] > 0).astype(int)
breed_count = breed1_flag + breed2_flag

# Prepare data for modelling 
df_combined['rescuer_counts'] = rescuer_counts
df_combined['BreedCount']  = breed_count
df_combined = apply_word_flags(df_combined, keywords)
df_combined = apply_color_flags(df_combined, colors)
df_combined = apply_breed_flags(df_combined, test_breeds)
df_combined = pd.get_dummies(df_combined, columns=['Gender',
                                                   'Vaccinated', 'Dewormed', 'Sterilized', 
                                                   'State'])
y_train_all = df_combined['AdoptionSpeed'][df_combined['test'] != 1]
X_all       = df_combined.drop(columns=['Name', 'RescuerID', 'AdoptionSpeed'])
X_train_all = X_all[X_all['test'] != 1].drop(columns=['test'])
X_test_all  = X_all[X_all['test'] == 1].drop(columns=['test'])

## Test Random Forest model

In [136]:
scores = []

folds = KFold(10, True, rnd).split(X_train_all)

for train_indx, test_indx in folds:
    
    X_train, X_test = X_train_all.iloc[train_indx], X_train_all.iloc[test_indx]
    y_train, y_test = y_train_all.iloc[train_indx], y_train_all.iloc[test_indx]
    
    rfc = RandomForestClassifier(n_estimators=200, random_state=rnd)
    rfc.fit(X_train, y_train)
    prediction = rfc.predict(X_test)
    scores.append(ml_metrics.quadratic_weighted_kappa(rater_a=y_test, rater_b=prediction))
    print(scores[-1])

0.40444256200291784
0.4247129156006203
0.374896010738235
0.37968963170365055
0.3753216086157689
0.39675426181083506
0.3792808515784073
0.3969110913470977
0.36032518128392177
0.3710868093027614


In [137]:
# 0.381 
# only considering breeds from test data --> 0.340
# correction to breeds function          --> 0.386 (LB increase)
# added count of breed                   --> 0.386 (insignificant difference)
np.mean(scores)

0.38634209239842154

# Submitting test data

In [16]:
rfc = RandomForestClassifier(n_estimators=200, random_state=rnd)
rfc.fit(X_train_all, y_train_all)
prediction = rfc.predict(X_test_all)

In [17]:
submission = pd.DataFrame({'AdoptionSpeed': prediction.astype(int)}, index=X_test_all.index)
submission.to_csv("submission.csv", index=True, index_label='PetID', header=['AdoptionSpeed'])