# Model 0.05 - Breeds
 - Some breeds not shared between test and train data
 - Breeds could be categorised (e.g. Terrier), so decided to split breed name into keywords
 - Used only test data breeds for flags (1.7 pct point increase in LB)

In [85]:
import re
import ml_metrics
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [86]:
# Single seed passed to the KFold shuffle and to every RandomForestClassifier
# below, so the splits and the forests are seeded the same way on each run.
rnd = 42

## Importing the data

In [87]:
# Input files, read relative to the notebook's working directory.
# Train and test are indexed by PetID; the breed labels are indexed by BreedID
# so they can be used directly as a lookup with .map() / .loc further down.
df_train = pd.read_csv("../input/train/train.csv", index_col="PetID")
df_test = pd.read_csv("../input/test/test.csv", index_col="PetID")
df_breeds = pd.read_csv("../input/breed_labels.csv", index_col="BreedID")
# Colour labels keep the default integer index; ColorID stays a regular column.
df_colors = pd.read_csv("../input/color_labels.csv")

In [88]:
# Colour ids; passed to apply_color_flags when the modelling frame is built.
colors = df_colors['ColorID']
# Breed ids (the BreedID index of the breed label table).
breeds = df_breeds.index

## Analysis of animal types vs breeds types
(1 = Dog, 2 = Cat)

In [106]:
# Pet types inferred from breed (training data):
# each breed id is replaced by the Type recorded for that breed in the label table.
df_types = df_train[['Type', 'Breed1', 'Breed2']].copy()
breed_to_type = df_breeds['Type']
for breed_col in ('Breed1', 'Breed2'):
    df_types[breed_col] = df_types[breed_col].map(breed_to_type)
df_types.head()

Unnamed: 0_level_0,Type,Breed1,Breed2
PetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
86e1089a3,2,2.0,
6296e909a,2,2.0,
3422e4906,1,1.0,
5842f1ff5,1,1.0,
850a43f90,1,1.0,


In [107]:
# Check type mismatch between breed 1 and breed 2 (training data):
# keep rows whose second breed has a type that is not equal to the first breed's type.
breed2_known = df_types['Breed2'].notna()
breeds_differ = df_types['Breed1'].ne(df_types['Breed2'])
df_types[breeds_differ & breed2_known]

Unnamed: 0_level_0,Type,Breed1,Breed2
PetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
375905770,1,,1.0
da8d4a273,1,,1.0
27e74e45c,2,,2.0
7b5bee232,1,,1.0
0327b8e94,1,,1.0


In [108]:
# Check type mismatch between breed 1 and type (training data):
# keep rows whose primary breed has a type that is not equal to the declared Type.
breed1_known = df_types['Breed1'].notna()
type_differs = df_types['Type'].ne(df_types['Breed1'])
breed1_mismatch = df_types[type_differs & breed1_known]
breed1_mismatch

Unnamed: 0_level_0,Type,Breed1,Breed2
PetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1bc0f89d8,2,1.0,
15a206d0d,2,1.0,
f8654865f,2,1.0,1.0
36b20cfb5,2,1.0,
699a81c51,2,1.0,1.0
85ec1aac0,2,1.0,
6a72cfda7,2,1.0,1.0
6c399cb06,2,1.0,
504134fd6,2,1.0,
44f198a24,2,1.0,


In [109]:
# Investigate breed 1 mismatch (training data): print the breed names next to the
# free-text description so the declared Type can be judged by eye.
# .copy() makes the selection an independent frame, so the column assignments
# below cannot raise SettingWithCopyWarning or write through to df_train.
breed1_mismatch_df = df_train.loc[breed1_mismatch.index].copy()
breed1_mismatch_df['Breed1'] = breed1_mismatch_df['Breed1'].map(df_breeds['BreedName'])
breed1_mismatch_df['Breed2'] = breed1_mismatch_df['Breed2'].map(df_breeds['BreedName'])
for i, Data in breed1_mismatch_df[['Type', 'Breed1', 'Breed2', 'Description']].iterrows():
    # Access by label: integer keys on a string-labelled Series are a deprecated
    # positional fallback in current pandas.
    print(f"{i} \t type:{Data['Type']} \t breeds:{Data['Breed1']} & {Data['Breed2']} \n {Data['Description']}", end='\n\n')
    

1bc0f89d8 	 type:2 	 breeds:Greyhound & nan 
 My cats name is Kenit, Kenot,Owen, Keyad, Manja, Techit and 4 Kittens, Baby and family. All my cats are pamper.

15a206d0d 	 type:2 	 breeds:Belgian Shepherd Laekenois & nan 
 so cute and kind

f8654865f 	 type:2 	 breeds:Bearded Collie & Bearded Collie 
 New born baby cats 2 weeks time..at my house all five of them are males

36b20cfb5 	 type:2 	 breeds:Belgian Shepherd Laekenois & nan 
 THERE ARE FOUR KITTENS GINGER, BLACKIE, SILVER, TUTU THE MOTHER IS A PART SIAMESE LILAC POINT I LIVE IN A CONDO AND I HAVE TO FIND HOMES FOR THE KITTENS WELL FED, CLEAN AND VERY LIVELY AND LOVABLE I AM MAGGIE, TEL KL AMPANG HILIR,

699a81c51 	 type:2 	 breeds:Terrier & Terrier 
 Mo-joe is adopted and is in good hand with sofia..im proud of him in any where coz even he so cute and adorable, he is the most behave kitten i have..the cutest among all..Mummy wish u all the best in your life baby..of coz u not going to be a lawyer or something.. :) but mummy hop

In [110]:
# Full rows of the mismatching pets; Breed1/Breed2 were replaced by breed names in the previous cell.
breed1_mismatch_df

Unnamed: 0_level_0,Type,Name,Age,Breed1,Breed2,Gender,Color1,Color2,Color3,MaturitySize,...,Sterilized,Health,Quantity,Fee,State,RescuerID,VideoAmt,Description,PhotoAmt,AdoptionSpeed
PetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1bc0f89d8,2,"Kenit, Kenot, Techit, Keyad, Owen",0,Greyhound,,3,3,6,7,2,...,3,1,10,0,41401,97be67995b53f86f64be212c867009fa,0,"My cats name is Kenit, Kenot,Owen, Keyad, Manj...",0.0,4
15a206d0d,2,Shuka,3,Belgian Shepherd Laekenois,,2,6,0,0,1,...,2,1,1,0,41401,4e271c777d05f0d87034c9db0ea6663f,0,so cute and kind,2.0,4
f8654865f,2,Mi Cai 2,1,Bearded Collie,Bearded Collie,1,1,3,6,2,...,2,1,5,0,41401,f3e6eb8b610ab73fb0b2f91cc4195091,0,New born baby cats 2 weeks time..at my house a...,5.0,3
36b20cfb5,2,,3,Belgian Shepherd Laekenois,,3,1,3,6,2,...,2,1,2,0,41401,969f67f283480a4d6c52173f80a10f75,0,"THERE ARE FOUR KITTENS GINGER, BLACKIE, SILVER...",0.0,4
699a81c51,2,Mo-Joe,1,Terrier,Terrier,1,2,7,0,3,...,2,1,1,25,41326,35ca0af7f781e96744e8371c35b07944,0,Mo-joe is adopted and is in good hand with sof...,3.0,0
85ec1aac0,2,Munchi,1,Australian Kelpie,,1,1,7,0,1,...,2,1,1,0,41345,ec0c78ffa25dae7dffdb9669b9ac9e95,0,He's very naughty and cute ..,1.0,4
6a72cfda7,2,Mao Mao,1,Collie,Akita,2,2,0,0,1,...,2,1,1,0,41326,35ca0af7f781e96744e8371c35b07944,0,i rescued mao mao last two weeks or 3 weeks ag...,3.0,0
6c399cb06,2,Bobby The Smiling Shih Tzu,36,Shih Tzu,,1,2,5,7,2,...,2,1,1,0,41336,0e9d19f9046c000d494f2f7496ef6482,0,How he was found: Bobby was abandoned and was ...,2.0,1
504134fd6,2,Kittens Encik Faisal,3,Mixed Breed,,3,2,4,0,2,...,2,1,3,0,41326,73ba1badbbec9b9a37671b0dc84c1d5e,0,Ada 3 ekor kitten d kedai nasi ayam faisal di ...,2.0,4
44f198a24,2,Tabby,3,Mixed Breed,,2,1,3,7,1,...,3,1,1,0,41401,27798b29cd4d37d2d2295ac2056142ca,0,I just rescue this kitty at pm near my apartme...,7.0,4


In [111]:
# Pet types inferred from breed (test data):
# each breed id is replaced by the Type recorded for that breed in the label table.
df_types = df_test[['Type', 'Breed1', 'Breed2']].copy()
breed_to_type = df_breeds['Type']
for breed_col in ('Breed1', 'Breed2'):
    df_types[breed_col] = df_types[breed_col].map(breed_to_type)
df_types.head()

Unnamed: 0_level_0,Type,Breed1,Breed2
PetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
378fcc4fc,1,1,
73c10e136,2,2,
72000c4c5,2,2,
e147a4b9f,2,2,2.0
43fbba852,1,1,


In [112]:
# Check type mismatch between breed 1 and breed 2 (test data):
# keep rows whose second breed has a type that is not equal to the first breed's type.
breed2_known = df_types['Breed2'].notna()
breeds_differ = df_types['Breed1'].ne(df_types['Breed2'])
df_types[breeds_differ & breed2_known]

Unnamed: 0_level_0,Type,Breed1,Breed2
PetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1


In [113]:
# Check type mismatch between breed 1 and type (test data):
# keep rows whose primary breed has a type that is not equal to the declared Type.
breed1_known = df_types['Breed1'].notna()
type_differs = df_types['Type'].ne(df_types['Breed1'])
breed1_mismatch = df_types[type_differs & breed1_known]
breed1_mismatch

Unnamed: 0_level_0,Type,Breed1,Breed2
PetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
f2847575d,2,1,
3bb3c3f41,2,1,
199ae3a3d,2,1,
c1509a54e,2,1,
66b38c4a7,2,1,


In [114]:
# Investigate breed 1 mismatch (test data): print the breed names next to the
# free-text description so the declared Type can be judged by eye.
# .copy() makes the selection an independent frame, so the column assignments
# below cannot raise SettingWithCopyWarning or write through to df_test.
breed1_mismatch_df = df_test.loc[breed1_mismatch.index].copy()
breed1_mismatch_df['Breed1'] = breed1_mismatch_df['Breed1'].map(df_breeds['BreedName'])
breed1_mismatch_df['Breed2'] = breed1_mismatch_df['Breed2'].map(df_breeds['BreedName'])
for i, Data in breed1_mismatch_df[['Type', 'Breed1', 'Breed2', 'Description']].iterrows():
    # Access by label: integer keys on a string-labelled Series are a deprecated
    # positional fallback in current pandas.
    print(f"{i} \t type:{Data['Type']} \t breeds:{Data['Breed1']} & {Data['Breed2']} \n {Data['Description']}", end='\n\n')

f2847575d 	 type:2 	 breeds:Mixed Breed & nan 
 This is a very cute male cat , around 2 yrs of age. the owners have abandoned the cat and he is homeless.. he was previously a home cat , litter box trained and only eats good cat food.. currently living on the streets and begging for food. it breaks my heart to see such irresponsible ppl having pets and they dont care about. i hope someone can give toby a home , if they prefer adult cats then kittens , Then Toby is the one for you. pls call me if interested , i will putting up pics of Toby really soon.

3bb3c3f41 	 type:2 	 breeds:Mixed Breed & nan 
 They were born on the 28th Nov . There were 4 kittens , two were adopted. 2 weeks ago. Their mom was a stray which my family rescued from the drain. The kittens are intelligent, lovable and healthy. They are trained to use the cat sand.

199ae3a3d 	 type:2 	 breeds:Belgian Shepherd Laekenois & nan 
 I have too many cats in house and due to that I really need someone who can taking care of my

### Decision
Ultimately the mismatch is on a very small scale and will have minimal impact to the overall model. Further analysis can be performed later on if necessary.

Potential steps are as follows:

#### Steps:
- if type=2 and breed is mixed breed, then change to domestic short, medium or long hair
- count dog, dogs, puppy, puppies and cat, cats, kitten, kittens in description (highest count wins)
- if type incorrect, then change type to correct type
- if breed incorrect, then change breed to mixed for dog or domestic for cat

### Try to predict animal type from description using key words
Although a few of the recorded types look incorrect, the majority look correct, and it is the keyword-based prediction that is inaccurate. For example, someone might mention that they have a cat up for adoption but that it doesn't get on with their dogs. This basic model cannot make accurate predictions. Further analysis might be easier using images.

In [115]:
# Combine test and training data into one frame (test rows first) so the
# keyword-based type prediction below is reviewed on both sets at once.
df_combined = pd.concat([df_test, df_train], sort=False)

In [162]:
# Create new predicted type based on description keywords
# (0 = unknown, 1 = dog, 2 = cat, matching the Type coding)
pets = {'dog': 1, 'dogs': 1, 'puppy': 1, 'puppies': 1, 'cat': 2, 'cats': 2, 'kittens': 2, 'kitten': 2}


def _type_from_description(desc):
    """Return the species (1 or 2) named most often in ``desc``, or 0 if undecided.

    ``desc`` is split on whitespace and each token is lower-cased before it is
    looked up in ``pets``; tokens with attached punctuation ("cat,") do not match.
    Returns 0 when ``desc`` is not a string (missing description), when no
    species word occurs, or when both species are mentioned equally often.
    """
    if not isinstance(desc, str):
        # Missing descriptions are handled explicitly rather than by catching
        # AttributeError, which would also hide unrelated bugs.
        return 0
    votes = Counter(pets[token] for token in map(str.lower, desc.split()) if token in pets)
    if votes[1] == votes[2]:  # covers "no species word at all" (0 == 0) and ties
        return 0
    return 1 if votes[1] > votes[2] else 2


df_combined['pred_type'] = df_combined['Description'].map(_type_from_description)

In [163]:
# Review prediction and source description
# None disables truncation of cell text; the former -1 sentinel has been
# deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)
# Only rows where a prediction was made (pred_type != 0) and it contradicts Type.
pred_conflicts = (df_combined['Type'] != df_combined['pred_type']) & (df_combined['pred_type'] != 0)
df_combined.loc[pred_conflicts, ['Type', 'pred_type', 'Description']]

Unnamed: 0_level_0,Type,pred_type,Description
PetID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
64e00ab2c,2,1,"She was found wandering near a children's orphanage. The children love her but there are many stray dogs living in the area.Tikki is very playful and affectionate and would make a lovely addition to a family. She has fluffy fur and beautiful green eyes. She has two orange paws and two black paws! She is also toilet trained. Hoping to find a forever home for her. Future fur-mommy/daddy preferably has own car, has prior experience owning cats, and won't keep her in a cage."
be10ae529,1,2,"These 2 pets were rescued form the bushes nearby Second Chance shelter at Old Klang Rd. They were covered with cat fleas , but after a good Frontline spray by the rescuers, they looked very alert and healthy. Both of the pups were very pretty and though initially they were wary of people but they warmed up to us in a short time. Please give them a lovely home, they deserve more! Pl call Mrs Lai , KIm, mei Leng for more adoption details."
6069f575a,1,2,I ping this handsome small pup outside my apartment. I already have a cat at home. But I do not have the heart to leave out there. I hope someone can take him in. If interested please call. We are from Penang. I can send him to you.
b78ea4bc0,1,2,"This puppy is very smart, he learned to use cat litter tray by himself. A little bit fussy, like to eat when the kibbles mixed with a little bit chicken or canned food. I'm a cat rescuer, there are a lot of cats in my house but the puppy can get along with all my cats very well."
071df417e,2,1,"I've found their mother a month ago at my restaurant. obviously they been dumped by their owner just because they pregnant .. After She delivered 3 kittens, we put them in our care for a month now.. because we afraid the dogs might hurt them. So now I think they ready for adoption as they can eat by their own. . Anyone interested can gve a me sms Or wassup... azlin..tq"
3210b7021,2,1,"A cute, friendly, timid boy. He is very playful,very affectionate and has the cutest tiny meow. He is not a vocal one, only meowing softly when he is hungry. He is adjusted to living with other kitties or dogs in the house. He loves hugs and cuddles. His fur is soft and silky smooth, medium length hair. Has the cutest orange button nose. Found him outside my office meowing and begging at passerby, maybe abandoned. I can't keep him as I have too many animals in the house,. He is toilet trained. Does not use claws when playing. Loves climbing the gate or trees. His favourite toy is anything that roles on the floor"
3b068ba06,2,1,"She's extremely loving and such a well behaved cat. When I first saw her she was roaming in my condo pretty much going to everyone for a lil pat and attention. In couple of days she came to my unit. I fed her and send her back to the management, told them to find out the owner and pass her to them. Clearly they did not. She found my house again. This time I went to management spoke to a few people and try to find out myself. Went from units to units and no one claims her. Her claws been trimmed, very clean and she's so comfortable in my house. Based on how she is, it's very obvious she belongs to someone but no one is coming forth to claim her. I sincerely believe she would make an amazing pet. I have two dogs on my own and she's not getting along in with them or else I would love to keep her. Please call, leave a text or whatsapp (). Thanks heaps!"
1710b366c,1,2,"She is so adorable... Can understand simple instruction such as sit, stay and shake hand... I found her alone in this area( Bandar Baru Ampang ) and sometime there are peoples bullying her and she also exposed by hit and run hazard... I just can feed her and protect her from human harm but i cant take her since i live in apartment plus my 11 cats are anti-dog... Before this she suffer from slight minor injury but already heal... Plz give home for her so she can have a better life... May God bless u..."
617d58794,1,2,"Cutest pet ever,loyal,always smiling to cheer u up no matter how down u are. Take good care of her and she will always be there for you and protect your family from danger. FYI,She got a small scar at the back after a male cat tried to kill her,but i sent it to the vet, furs are growing and will patch the scar up."
2fb04c8d1,2,1,"I found Meow on the side of the street and decided to take her home as she was all alone and so super tiny. It's really a bad time because I'm leaving to the Netherlands on 9th August, but hey, I can't leave her there when there's stray dogs running around. So it's been 3 weeks she's staying at my place and she's learnt how to use the litter box. I'd very much like her to find a loving family to continue to provide her with affection and care. She's very playful and her favourite toy is a wine cork tied to a string. After a long day's playing she'll fall asleep on your lap while you're watching TV or reading emails. Stroking her hair can be very relaxing and therapeutic after a long day. She'll open her eyes occasionally to check if everything's alright and fall asleep again. After Meow is fully recharged with adequate sleep and yummy in her belly, she can run around the house at the speed of light to prove herself worthy of another playing session. Before you know it, you'll be in love with this hairy this creature. She has been dewormed and got her first vaccination. She needs another 2 vaccinations before she can be spayed."


## Functions

In [89]:
def apply_word_flags(df, words):
    """Creates binary columns for words which appear in the description.

    Each description is split on whitespace and every token is lower-cased
    before it is compared with ``words``, so ``words`` should be lower case and
    tokens with attached punctuation ("cat,") do not match.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``Description`` column. It is not modified.
    words : iterable of str
        Words to flag; one int column (0/1) is created per word.

    Returns
    -------
    pandas.DataFrame
        A new frame without ``Description`` and with one flag column per word
        appended in the order given. Rows whose description is not a string
        (e.g. missing) get 0 in every flag column.
    """
    words = list(words)
    # Tokenise each description once; non-string values yield no tokens.
    token_sets = [
        {token.lower() for token in desc.split()} if isinstance(desc, str) else set()
        for desc in df['Description']
    ]
    flagged = df.drop(columns=['Description'])
    for word in words:
        flagged[word] = [int(word in tokens) for tokens in token_sets]
    return flagged

In [90]:
# Description words that become binary features via apply_word_flags
# (lower case, because that function lower-cases each token before matching).
keywords = ['home', 'good' , 'adopt', 'loving', 'give', 'looking', 'playful', 'rescued', 'cat', 'contact']

In [91]:
def apply_color_flags(df, colors):
    """Combines Colors 1,2 & 3 into binary columns for each possible colour.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Color1``, ``Color2`` and ``Color3``. It is not modified.
    colors : iterable
        Colour ids; a column named ``C<id>`` is created for each one.

    Returns
    -------
    pandas.DataFrame
        A new frame without the three colour columns and with one int column
        (0/1) per id in ``colors``, set to 1 where any of the three colour
        columns equals that id. The value 0 is treated as "no colour" and is
        never flagged.
    """
    color_cols = ['Color1', 'Color2', 'Color3']
    palette = df[color_cols]
    flagged = df.drop(columns=color_cols)
    for color_id in colors:
        if color_id == 0:
            # 0 means "no colour" in the source columns, so it never sets a flag.
            flagged[f'C{color_id}'] = 0
        else:
            flagged[f'C{color_id}'] = palette.eq(color_id).any(axis=1).astype('int64')
    return flagged

In [104]:
def create_breed_keywords(df):
    """Return the set of distinct words used in the ``BreedName`` column of ``df``.

    The characters '(', ')' and '/' are deleted from each name (nothing is
    inserted in their place) before the name is split on whitespace.
    """
    unwanted = re.compile(r'[/(/)]')
    return {
        token
        for name in df['BreedName']
        for token in unwanted.sub('', name).split()
    }

In [93]:
def apply_breed_flags(df, keywords, breeds):
    """Creates binary columns for keywords which appear in the breed name.

    Breed names are tokenised by deleting '(', ')' and '/' and splitting on
    whitespace. A row gets 1 in a keyword column when the name of its
    ``Breed1`` or ``Breed2`` contains that keyword as a token.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``Breed1`` and ``Breed2`` holding breed ids; 0 means
        "no breed" and is ignored. The flag columns are added to this frame.
    keywords : iterable of str
        Keywords to flag. Columns are created in sorted keyword order so the
        resulting layout does not depend on set iteration order, which can
        differ between interpreter sessions.
    breeds : pandas.DataFrame
        Breed labels indexed by breed id, with a ``BreedName`` column.

    Returns
    -------
    pandas.DataFrame
        ``df`` itself, with one int column (0/1) per keyword.

    Raises
    ------
    KeyError
        If ``df`` references a non-zero breed id that is not in ``breeds``.
    """
    ordered_keywords = sorted(set(keywords))
    wanted = set(ordered_keywords)

    # Snapshot the id columns so later column writes cannot affect the lookups.
    breed_ids = df[['Breed1', 'Breed2']].copy()
    unknown = [
        breed_id
        for breed_id in pd.unique(breed_ids.to_numpy().ravel())
        if breed_id != 0 and breed_id not in breeds.index
    ]
    if unknown:
        raise KeyError(f"breed ids not found in breed labels: {unknown}")

    # Tokenise every breed name once instead of once per row.
    ids_by_keyword = {word: [] for word in ordered_keywords}
    for breed_id, name in breeds['BreedName'].items():
        if not isinstance(name, str):
            continue  # a breed without a name cannot contribute keywords
        tokens = re.sub(r'[/(/)]', '', name).split()
        for word in wanted.intersection(tokens):
            ids_by_keyword[word].append(breed_id)

    for word in ordered_keywords:
        df[word] = breed_ids.isin(ids_by_keyword[word]).any(axis=1).astype('int64')

    return df

## Preparing training data

In [94]:
# Combine test and training data so both receive identical feature columns.
# Test rows are the ones without an AdoptionSpeed label.
df_combined = pd.concat([df_test, df_train], sort=False)
df_combined['test'] = df_combined['AdoptionSpeed'].isna()

# Rescuer: number of listings (train and test together) posted by each rescuer
rescue_map = Counter(df_combined['RescuerID'])
rescuer_counts = df_combined['RescuerID'].map(rescue_map)

# Breeds: keywords are taken only from breeds that occur in the test data.
# pd.concat is used because Series.append was removed in pandas 2.0.
all_test_breeds = pd.concat([df_test['Breed1'], df_test['Breed2']])
# Breed id 0 means "no breed" and has no entry in the label table.
df_test_breeds = df_breeds.loc[all_test_breeds[all_test_breeds > 0].unique(), :]
breed_keywords = create_breed_keywords(df_test_breeds)

# Prepare data for modelling
df_combined['rescuer_counts'] = rescuer_counts
df_combined = apply_word_flags(df_combined, keywords)
df_combined = apply_color_flags(df_combined, colors)
df_combined = apply_breed_flags(df_combined, breed_keywords, df_breeds)
df_combined = pd.get_dummies(df_combined, columns=['Gender',
                                                   'Vaccinated', 'Dewormed', 'Sterilized',
                                                   'State'])
# Split back into labelled (train) and unlabelled (test) rows using the 'test' marker.
y_train_all = df_combined['AdoptionSpeed'][df_combined['test'] != 1]
X_all       = df_combined.drop(columns=['Name', 'RescuerID', 'AdoptionSpeed', 'Breed1', 'Breed2'])
X_train_all = X_all[X_all['test'] != 1].drop(columns=['test'])
X_test_all  = X_all[X_all['test'] == 1].drop(columns=['test'])

## Test Random Forest model

In [102]:
# Quadratic weighted kappa of a random forest on each of 10 shuffled folds.
scores = []

# shuffle and random_state are passed by keyword: current scikit-learn
# releases accept only n_splits positionally.
folds = KFold(n_splits=10, shuffle=True, random_state=rnd).split(X_train_all)

for train_indx, test_indx in folds:

    X_train, X_test = X_train_all.iloc[train_indx], X_train_all.iloc[test_indx]
    y_train, y_test = y_train_all.iloc[train_indx], y_train_all.iloc[test_indx]

    rfc = RandomForestClassifier(n_estimators=200, random_state=rnd)
    rfc.fit(X_train, y_train)
    prediction = rfc.predict(X_test)
    scores.append(ml_metrics.quadratic_weighted_kappa(rater_a=y_test, rater_b=prediction))
    print(scores[-1])

0.39197471630129943
0.43253971271491254
0.40166675216505254
0.41534974116757917
0.4095289569630458
0.39017748800824703
0.40852032410397354
0.3888393367063733
0.3687232047766609
0.39348361520063546


In [103]:
# Mean quadratic weighted kappa over the 10 folds (0.400 when this was last run)
np.mean(scores)

0.4000803848107779

# Submitting test data

In [40]:
# Refit on every labelled row with the same settings used in cross-validation,
# then predict the unlabelled (test) rows.
rfc = RandomForestClassifier(n_estimators=200, random_state=rnd)
rfc.fit(X_train_all, y_train_all)
prediction = rfc.predict(X_test_all)

In [41]:
# One row per test PetID (the index of X_test_all); predictions are cast to int
# and written as a two-column CSV: PetID, AdoptionSpeed.
submission = pd.DataFrame({'AdoptionSpeed': prediction.astype(int)}, index=X_test_all.index)
submission.to_csv("submission.csv", index=True, index_label='PetID', header=['AdoptionSpeed'])