In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import seaborn as sns
from xgboost import XGBClassifier
from sklearn.metrics import cohen_kappa_score,make_scorer
from sklearn.linear_model import LogisticRegression
import matplotlib.pyplot as plt
import plotly.graph_objs as go
import plotly.offline as py
import nltk
from nltk.corpus import stopwords
from string import punctuation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder,MinMaxScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, KFold, cross_validate,RandomizedSearchCV

import warnings
warnings.filterwarnings('ignore')

RANDOM_STATE = 0

# PetFinder.my Adoption Prediction
Millions of stray animals suffer on the streets or are euthanized in shelters every day around the world. If homes can be found for them, many precious lives can be saved — and more happy families created.

PetFinder.my has been Malaysia’s leading animal welfare platform since 2008, with a database of more than 150,000 animals. PetFinder collaborates closely with animal lovers, media, corporations, and global organizations to improve animal welfare.

Animal adoption rates are strongly correlated to the metadata associated with their online profiles, such as descriptive text and photo characteristics. As one example, PetFinder is currently experimenting with a simple AI tool called the Cuteness Meter, which ranks how cute a pet is based on qualities present in their photos.

The goal of this project is to predict the adoptability of pets - specifically, how quickly is a pet adopted? If successful, they will be adapted into AI tools that will guide shelters and rescuers around the world on improving their pet profiles' appeal, reducing animal suffering and euthanization.

## Evaluation
Results are evaluated using the quadratic weighted kappa, which measures the agreement between two ratings. This metric typically varies from 0 (random agreement between raters) to 1 (complete agreement between raters). In the event that there is less agreement between the raters than expected by chance, the metric may go below 0. The quadratic weighted kappa is calculated between the scores which are expected/known and the predicted scores.

In [None]:
def evaluation_metric(model, actual, predicted):
    '''
        Return the quadratic weighted kappa between the actual and the
        predicted labels.

        ``model`` is accepted for signature compatibility but is not used
        in the computation.
    '''
    kappa = cohen_kappa_score(actual, predicted, weights="quadratic")
    return kappa

def get_cross_breed(breed1, breed2) -> str:
    '''
        Tell whether an animal is a cross breed.

        An animal counts as a cross breed when a secondary breed is recorded
        (``breed2`` is not 0) and that secondary breed differs from the
        primary breed ``breed1``.

        Returns 'Yes' for a cross breed and 'No' otherwise.
    '''
    # Logical `and` states the intent directly; the bitwise `&` only worked
    # because both operands happened to be booleans.
    if breed2 != 0 and breed1 != breed2:
        return 'Yes'
    return 'No'
    
def clean_text(data):
    '''
        Add simple text statistics for the ``Description`` column and
        normalise the description itself.

        The frame is modified in place and also returned. New columns:
        ``text_char_count``, ``text_word_count``, ``text_density``
        (characters per word, with 1 added to the word count to avoid
        dividing by zero), ``text_punctuation_count``, ``title_word_count``
        and ``text_upper_case``. Afterwards ``Description`` has its
        punctuation removed and is lower-cased.

        Missing descriptions are treated as empty text.
    '''
    # len()/split() raise on None/NaN, so missing text becomes ''.
    description = data['Description'].fillna('').astype(str)

    # Build the lookup structures once instead of once per row.
    punctuation_chars = set(punctuation)
    strip_punctuation = str.maketrans('', '', punctuation)

    data['text_char_count'] = description.apply(len)
    data['text_word_count'] = description.apply(lambda x: len(x.split()))
    data['text_density'] = data['text_char_count'] / (data['text_word_count'] + 1)
    data['text_punctuation_count'] = description.apply(
        lambda x: sum(1 for char in x if char in punctuation_chars))
    data['title_word_count'] = description.apply(
        lambda x: sum(1 for wrd in x.split() if wrd.istitle()))
    data['text_upper_case'] = description.apply(
        lambda x: sum(1 for wrd in x.split() if wrd.isupper()))

    # The statistics above need the raw text, so normalisation comes last.
    data['Description'] = description.apply(
        lambda x: x.translate(strip_punctuation)).str.lower()
    return data

def remove_stopwords(text):
    """
    Remove English stopwords from an iterable of tokens.

    Each element of ``text`` is compared as-is against NLTK's English
    stopword list; the elements that are not in the list are returned
    as a new list, in their original order.

    NOTE(review): passing a plain string iterates over its characters,
    so callers presumably pass an already tokenised list - confirm.
    """
    # Fetch the stopword list once, not once per token, and keep it in a
    # set so each membership test is a constant-time lookup.
    english_stopwords = set(stopwords.words('english'))
    return [w for w in text if w not in english_stopwords]

def combine_text(list_of_text):
    '''Join the given pieces of text into one string, separated by single spaces.'''
    separator = ' '
    return separator.join(list_of_text)

def determine_sentiments(score, magnitude):
    '''
        Turn a sentiment score and magnitude into a text label -
        https://cloud.google.com/natural-language/docs/basics

        A score of at least 0.1 is positive, a score of at most -0.1 is
        negative, anything else (including NaN) is neutral. For positive
        and negative scores the magnitude picks the strength:
        below 3 weak, from 3 up to (not including) 6 medium, 6 and above
        clear. When the magnitude cannot be compared (NaN) no label
        matches and None is returned.
    '''
    # NOTE: the labels are kept exactly as spelled; they end up as category
    # values downstream.
    if score >= 0.1:
        weak, medium, clear = 'Weak Postive', 'Medium Postive', 'Clearly Postive'
    elif score <= -0.1:
        weak, medium, clear = 'Weak Negative', 'Medium Negative', 'Clearly Negative'
    else:
        return 'Netural'

    if magnitude < 3:
        return weak
    if 3 <= magnitude < 6:
        return medium
    if magnitude >= 6:
        return clear
    return None

# Text labels for the integer-coded categorical columns.
mapping_maturity = {
    1: 'Small',
    2: 'Medium',
    3: 'Large',
    4: 'Extra Large',
    0: 'Not Specified',
}
mapping_fur = {
    1: 'Short',
    2: 'Medium',
    3: 'Long',
    0: 'Not Specified',
}
# Shared by the Vaccinated, Dewormed and Sterilized columns.
mapping_vet = {
    1: 'Yes',
    2: 'No',
    3: 'Not Sure',
}
mapping_health = {
    1: 'Healthy',
    2: 'Minor Injury',
    3: 'Serious Injury',
    0: 'Not Specified',
}

# Quadratic weighted kappa wrapped as a scorer for cross-validation.
kappa_scorer = make_scorer(cohen_kappa_score, weights="quadratic")

# Data Exploration and Evaluation



In [None]:
# Main tables: one row per pet listing.
train = pd.read_csv('/kaggle/input/petfinder-adoption-prediction/train/train.csv')
test = pd.read_csv('/kaggle/input/petfinder-adoption-prediction/test/test.csv')

# Label tables for the coded state, colour and breed columns.
breeds = pd.read_csv('/kaggle/input/petfinder-adoption-prediction/breed_labels.csv')
colors = pd.read_csv('/kaggle/input/petfinder-adoption-prediction/color_labels.csv')
states = pd.read_csv('/kaggle/input/petfinder-adoption-prediction/state_labels.csv')

# Fixed colours so dogs and cats look the same in every plot.
palette = {"Dog": "C0", "Cat": "C1"}

## Train Dataset
The train dataset has no duplicate records. It has missing values in the `Name` and `Description` features.

In [None]:
train.isnull().values.any()

In [None]:
train.isnull().sum()

In [None]:
train.duplicated().sum()

## Breeds
Breeds dataset holds the breed of the animal in the shelter and has no missing value. 

There are 241 breeds of Dogs and 66 breeds of cat. The breeds information is merged with the train and test dataset

There is no mixed species in the dataset, i.e dog mixed with cats. 

There are 5 animals without main breed details in the train set and none in the test set. These 5 observations will be deleted.

Dog population in train and test dataset is 54% and 53% respectively

In [None]:
breeds.isnull().values.any()

In [None]:
breeds['Type'].value_counts()

In [None]:
breeds['animal'] = breeds['Type'].apply(lambda x: "Dog" if x == 1 else "Cat")

In [None]:
# BreedID -> breed name, used to label the primary breed.
mapping = dict(breeds[['BreedID', "BreedName"]].values)
# BreedID -> 'Dog' / 'Cat', applied to both the primary and secondary breed.
map_animal = dict(breeds[['BreedID', 'animal']].values)

for frame in (train, test):
    frame["Main_Breed"] = frame['Breed1'].map(mapping)
    frame['Main_Breed_animal'] = frame['Breed1'].map(map_animal)
    frame['Sec_Breed_animal'] = frame['Breed2'].map(map_animal)

In [None]:
# Column -> lookup used to replace its integer codes with text labels.
categorical_mappings = {
    'MaturitySize': mapping_maturity,
    'FurLength': mapping_fur,
    'Vaccinated': mapping_vet,
    'Dewormed': mapping_vet,
    'Sterilized': mapping_vet,
    'Health': mapping_health,
}

for column, labels in categorical_mappings.items():
    train[column] = train[column].map(labels)
    test[column] = test[column].map(labels)

In [None]:
# Rows with a secondary breed whose species differs from the main breed's.
species_mismatch = train['Sec_Breed_animal'].notna() & (train['Main_Breed_animal'] != train['Sec_Breed_animal'])
train.drop(train[species_mismatch].index, inplace = True)

In [None]:
# Name the column by keyword, like the other countplot calls in this
# notebook; a positionally passed Series is not read as `x` by every
# seaborn version.
g = sns.countplot(x='Main_Breed_animal', data=train, palette=palette)
g.set_title("Distribution of Animals in Train set")
# Label every bar with its share of all training rows.
for bar in g.patches:
    corners = bar.get_bbox().get_points()
    bar_top = corners[1, 1]
    g.annotate('{:.2g}%'.format(100. * bar_top / len(train)), (corners[:, 0].mean(), bar_top), ha='center', va='bottom');

In [None]:
# Name the column by keyword, like the other countplot calls in this
# notebook; a positionally passed Series is not read as `x` by every
# seaborn version.
g = sns.countplot(x='Main_Breed_animal', data=test, palette=palette)
g.set_title("Distribution of Animals in Test set")
# Label every bar with its share of all test rows.
for bar in g.patches:
    corners = bar.get_bbox().get_points()
    bar_top = corners[1, 1]
    g.annotate('{:.2g}%'.format(100. * bar_top / len(test)), (corners[:, 0].mean(), bar_top), ha='center', va='bottom');

Cross-breed animals are animals mixed with another breed, i.e. where `Breed2` is not zero and `Breed1 != Breed2`.

Dogs are more often cross-bred than cats. There are more pure breeds in the train dataset.

In [None]:
# Flag each row as cross breed ('Yes') or not ('No') from its two breed ids.
for frame in (train, test):
    frame['Cross_Breed'] = frame.apply(lambda row: get_cross_breed(row['Breed1'], row['Breed2']), axis = 1)

In [None]:
train.Cross_Breed.value_counts()

In [None]:
g = sns.countplot(data = train, x='Cross_Breed', hue = "Main_Breed_animal", palette=palette)
g.set_title("Distribution of Cross Breed in the Animals")
# Label every bar with its share of all training rows.
for bar in g.patches:
    corners = bar.get_bbox().get_points()
    bar_top = corners[1, 1]
    g.annotate('{:.2g}%'.format(100. * bar_top / len(train)), (corners[:, 0].mean(), bar_top), ha='center', va='bottom');

## Adoption Speed

* 0 - Pet was adopted on the same day as it was listed.
* 1 - Pet was adopted between 1 and 7 days (1st week) after being listed.
* 2 - Pet was adopted between 8 and 30 days (1st month) after being listed.
* 3 - Pet was adopted between 31 and 90 days (2nd & 3rd month) after being listed.
* 4 - No adoption after 100 days of being listed. (There are no pets in this dataset that waited between 90 and 100 days).

2.7% of pets were adopted immediately

Cats are adopted within 7 days more often than dogs (i.e. adoption speeds 0 and 1).
Domestic Hair cats are more likely to be adopted earlier than other breeds.

Mixed breeds are adopted faster than pure breeds, especially cats.

In [None]:
train['AdoptionSpeed'].value_counts()

In [None]:
plt.figure(figsize=(15, 8))
# Name the column by keyword, like the other countplot calls in this
# notebook; a positionally passed Series is not read as `x` by every
# seaborn version.
g = sns.countplot(x='AdoptionSpeed', data=train)
g.set_title("Adoption Speed of Animals")
# Label every bar with its share of all training rows.
for bar in g.patches:
    corners = bar.get_bbox().get_points()
    bar_top = corners[1, 1]
    g.annotate('{:.2g}%'.format(100. * bar_top / len(train)), (corners[:, 0].mean(), bar_top), ha='center', va='bottom');
# Replace the numeric speed codes 0-4 with readable labels.
g=g.set_xticklabels(['Same Day', '1-7days','8-30 days','31-90 days','> 100 days'])

In [None]:
plt.figure(figsize=(15, 8))
g = sns.countplot(data = train, x='AdoptionSpeed', hue = "Main_Breed_animal", palette=palette)
g.set_title("Adoption Speed per Animal")
# Label every bar with its share of all training rows.
for bar in g.patches:
    corners = bar.get_bbox().get_points()
    bar_top = corners[1, 1]
    g.annotate('{:.2g}%'.format(100. * bar_top / len(train)), (corners[:, 0].mean(), bar_top), ha='center', va='bottom');
# Replace the numeric speed codes 0-4 with readable labels.
g=g.set_xticklabels(['Same Day', '1-7days','8-30 days','31-90 days','> 100 days'])

In [None]:
cats = train[train['Main_Breed_animal'] == 'Cat']

In [None]:
pd.crosstab(cats['Main_Breed'], cats['AdoptionSpeed'])

In [None]:
# One panel per group. No extra plt.figure() call here: it would only open
# a second, empty figure next to the subplots.
fig, axes = plt.subplots(1, 2, figsize = (20,8))

# Cross_Breed == 'No' -> no differing secondary breed (pure breed);
# Cross_Breed == 'Yes' -> mixed breed.
not_mixed = train[train['Cross_Breed'] == 'No']
mixed = train[train['Cross_Breed'] == 'Yes']

speed_labels = ['Same Day', '1-7days','8-30 days','31-90 days','> 100 days']
panels = [
    (not_mixed, "Adoption Speed for Non-mixed Breed Animals (Pure Breed)", axes[0]),
    (mixed, "Adoption Speed per Mixed Breed Animal", axes[1]),
]

for subset, title, ax in panels:
    g = sns.countplot(x='AdoptionSpeed', hue = "Main_Breed_animal", data = subset, palette=palette, ax=ax)
    g.set_title(title)
    # Label every bar with its share of the rows in this group.
    for bar in g.patches:
        corners = bar.get_bbox().get_points()
        bar_top = corners[1, 1]
        g.annotate('{:.2g}%'.format(100. * bar_top / len(subset)), (corners[:, 0].mean(), bar_top), ha='center', va='bottom')
    g.set_xticklabels(speed_labels)

## Health

Health Condition (1 = Healthy, 2 = Minor Injury, 3 = Serious Injury, 0 = Not Specified). Most of the animals were healthy

In [None]:
train.Health.value_counts()

## Age

Most of the animals are young(new borns to few weeks old). 

Younger animals are adopted more quickly than older ones.

In [None]:
fig, axes = plt.subplots(1, 3, figsize = (25,8))
age_hist_ax, age_scatter_ax, age_violin_ax = axes

# Left: how ages are distributed over all training rows.
age_hist_ax.set_title('Distribution of Animal Ages')
sns.histplot(data=train, x="Age", kde=True, ax = age_hist_ax);

# Middle: age against adoption speed, coloured by species.
age_scatter_ax.set_title("Scatterplot of Age and AdoptionSpeed")
sns.scatterplot(data=train, x="Age", y="AdoptionSpeed", hue="Main_Breed_animal",palette=palette, ax = age_scatter_ax);

# Right: age distribution per adoption speed and species.
age_violin_ax.set_title('AdoptionSpeed by Animal Specie and age')
sns.violinplot(x="AdoptionSpeed", y="Age", hue="Main_Breed_animal", data=train,palette=palette, ax = age_violin_ax);


In [None]:
# One line per adoption speed: number of pets at each age.
data = []
for a in range(5):
    age_counts = train.loc[train['AdoptionSpeed'] == a, 'Age'].value_counts().sort_index()
    trace = go.Scatter(
        x = age_counts.index,
        y = age_counts.values,
        name = str(a)
    )
    data.append(trace)

layout = go.Layout(dict(
    title = "AdoptionSpeed trends by Age",
    xaxis = dict(title = 'Age (months)'),
    yaxis = dict(title = 'Counts'),
))
py.iplot(dict(data=data, layout=layout), filename='basic-line')

## PhotoAmt

The maximum number of photos is 30. There is no sufficient evidence that the number of photos influences the Adoption Speed

In [None]:
plt.figure(figsize=(30, 8))
g = sns.countplot(data = train, x='PhotoAmt', hue = "AdoptionSpeed")
g.set_title("Adoption Speed per PhotoAmt")
# Label every bar with its share of all training rows.
for bar in g.patches:
    corners = bar.get_bbox().get_points()
    bar_top = corners[1, 1]
    g.annotate('{:.2g}%'.format(100. * bar_top / len(train)), (corners[:, 0].mean(), bar_top), ha='center', va='bottom');


In [None]:
fig, axes = plt.subplots(1, 3, figsize = (25,8))
photo_hist_ax, photo_scatter_ax, photo_violin_ax = axes

# Left: how the number of photos is distributed over all training rows.
photo_hist_ax.set_title('Distribution of Animal Photos')
sns.histplot(data=train, x="PhotoAmt", kde=True, ax = photo_hist_ax);

# Middle: number of photos against adoption speed, coloured by species.
photo_scatter_ax.set_title("Scatterplot of PhotoAmt and AdoptionSpeed")
sns.scatterplot(data=train, x="PhotoAmt", y="AdoptionSpeed", hue="Main_Breed_animal",palette=palette, ax = photo_scatter_ax);

# Right: photo count distribution per adoption speed and species.
photo_violin_ax.set_title('AdoptionSpeed by Animal Photos and Type')
sns.violinplot(x="AdoptionSpeed", y="PhotoAmt", hue="Main_Breed_animal", data=train,palette=palette, ax = photo_violin_ax);

## Veterinary Treatment

1. Vaccinated - Pet has been vaccinated (1 = Yes, 2 = No, 3 = Not Sure)
1. Dewormed - Pet has been dewormed (1 = Yes, 2 = No, 3 = Not Sure)
1. Sterilized - Pet has been spayed / neutered (1 = Yes, 2 = No, 3 = Not Sure)

Vaccination is not a factor in early adoption, as people prefer non-vaccinated pets.
Dewormed pets and healthy pets are favorites for adoption, and people generally prefer non-sterilized pets.


In [None]:
# Same colour for each answer in all three panels.
palette_vet ={'Yes': "#FF0B04", 'No' : "#4374B3", 'Not Sure': "#4d4d4d"}

# One panel per treatment column. No extra plt.figure() call here: it would
# only open a second, empty figure next to the subplots.
fig, axes = plt.subplots(1, 3, figsize = (20,8));

speed_labels = ['Same Day', '1-7days','8-30 days','31-90 days','> 100 days']
panels = [
    ("Vaccinated", "Adoption Speed for Vaccinated Animals", axes[0]),
    ("Dewormed", "Adoption Speed per Dewormed Animals", axes[1]),
    ("Sterilized", "Adoption Speed per Sterilized Animals", axes[2]),
]

for treatment, title, ax in panels:
    g = sns.countplot(x='AdoptionSpeed', hue = treatment, palette = palette_vet, data = train, ax=ax)
    g.set_title(title)
    # Label every bar with its share of all training rows.
    for bar in g.patches:
        corners = bar.get_bbox().get_points()
        bar_top = corners[1, 1]
        g.annotate('{:.2g}%'.format(100. * bar_top / len(train)), (corners[:, 0].mean(), bar_top), ha='center', va='bottom')
    g.set_xticklabels(speed_labels)

## Description

Null descriptions were filled with an empty string.

Taking a first look at the descriptions of the animals in wordcloud for early and late adoptions, the most common words are generic.



In [None]:
train['Description'].nunique()

In [None]:
test['Description'] = test['Description'].fillna('')
train['Description'] = train['Description'].fillna('')

In [None]:
train = clean_text(train)
test = clean_text(test)

In [None]:
from wordcloud import WordCloud
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=[26, 8])

# Early adoption = adoption speed 0 or 1; one word cloud per species.
early = train[train['AdoptionSpeed'] <=1]
early_panels = [
    (ax1, 'Dog', 'Early Adoption for Dogs (0 and 1)'),
    (ax2, 'Cat', 'Early Adoption for Cats (0 and 1)'),
]
for ax, animal, title in early_panels:
    descriptions = early[early['Main_Breed_animal'] == animal]['Description']
    cloud = WordCloud(background_color='white', width=600, height=400).generate(" ".join(descriptions))
    ax.imshow(cloud)
    ax.axis('off')
    ax.set_title(title, fontsize=15)

In [None]:
fig, (ax3, ax4) = plt.subplots(1, 2, figsize=[26, 8])

# Late adoption = adoption speed 3 or 4; one word cloud per species.
late = train[train['AdoptionSpeed'] >= 3]
late_panels = [
    (ax3, 'Dog', 'Late Adoption for Dogs (3 and 4)'),
    (ax4, 'Cat', 'Late Adoption for Cats (3 and 4)'),
]
for ax, animal, title in late_panels:
    descriptions = late[late['Main_Breed_animal'] == animal]['Description']
    cloud = WordCloud(background_color='white', width=600, height=400).generate(" ".join(descriptions))
    ax.imshow(cloud)
    ax.axis('off')
    ax.set_title(title, fontsize=15)

## Sentiments

This section analyzes the sentiments of the pet descriptions, which were already run through Google's Natural Language API. Pets with a missing sentiment magnitude and score will be filled with 0, which translates to Neutral.

Analysing the description sentiments shows that:
* Most of the description sentiments for the pets are positive
* Positive sentiment does not translate to early adoption


In [None]:
import glob
import json

# One JSON file per pet; the file name (without extension) is the PetID.
train_sentiment_files = sorted(glob.glob('/kaggle/input/petfinder-adoption-prediction/train_sentiment/*.json'))
data = []
for path in train_sentiment_files:
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(path, encoding='utf-8') as jsonFile:
        jsonObject = json.load(jsonFile)
    score = jsonObject['documentSentiment']['score']
    magnitude = jsonObject['documentSentiment']['magnitude']
    PetID = os.path.splitext(os.path.basename(path))[0]
    # The fourth value is the score weighted by the magnitude.
    data.append([PetID, score, magnitude, score * magnitude])
sentiment_train = pd.DataFrame(data, columns=['PetID','Score', 'Magnitude','Sentiment'])


In [None]:
sentiment_train.fillna(0, inplace = True)

In [None]:
train = pd.merge(train, sentiment_train, how = "left", left_on ="PetID", right_on = 'PetID' )

In [None]:
train['description_sentiment'] = train.apply(lambda x: determine_sentiments(x['Score'], x['Magnitude']), axis = 1)

In [None]:
plt.figure(figsize=(30, 8))
g = sns.countplot(data = train, x='description_sentiment', hue = "AdoptionSpeed")
g.set_title("Adoption Speed per Description Sentiment")
# Label every bar with its share of all training rows.
for bar in g.patches:
    corners = bar.get_bbox().get_points()
    bar_top = corners[1, 1]
    g.annotate('{:.2g}%'.format(100. * bar_top / len(train)), (corners[:, 0].mean(), bar_top), ha='center', va='bottom');

In [None]:
# One JSON file per pet; the file name (without extension) is the PetID.
test_sentiment_files = sorted(glob.glob('/kaggle/input/petfinder-adoption-prediction/test_sentiment/*.json'))
data_test = []
for path in test_sentiment_files:
    # The with-block closes the file on exit, so no explicit close() is needed.
    with open(path, encoding='utf-8') as jsonFile:
        jsonObject = json.load(jsonFile)
    score = jsonObject['documentSentiment']['score']
    magnitude = jsonObject['documentSentiment']['magnitude']
    # The id must come from the test file that was just read. Taking it from
    # the train file list attaches train PetIDs to test sentiments, so the
    # later merge into `test` cannot match the rows.
    PetID = os.path.splitext(os.path.basename(path))[0]
    # The fourth value is the score weighted by the magnitude.
    data_test.append([PetID, score, magnitude, score * magnitude])
sentiment_test = pd.DataFrame(data_test, columns=['PetID','Score', 'Magnitude','Sentiment'])
sentiment_test.fillna(0, inplace = True)

In [None]:
test = pd.merge(test, sentiment_test, how = "left", left_on="PetID", right_on = 'PetID')

In [None]:
test['description_sentiment'] = test.apply(lambda x: determine_sentiments(x['Score'], x['Magnitude']), axis = 1)

# Feature Engineering

In [None]:
# Keep the label and the test ids before the columns are dropped.
target = train['AdoptionSpeed']
test_petID = test['PetID']

# Columns not used as model inputs. One shared list keeps train and test in
# step; each column is named once.
drop_cols = ['Name', 'State', 'RescuerID', 'PetID', 'Breed1', 'Breed2', 'Sec_Breed_animal', 'Description',
             'text_char_count', 'text_word_count', 'text_density', 'text_punctuation_count',
             'title_word_count', 'text_upper_case']
train = train.drop(columns=['AdoptionSpeed'] + drop_cols)
test = test.drop(columns=drop_cols)

# Columns to one-hot encode and columns to scale.
cat_cols = ['Type','Gender', 'Color1', 'Color2','Color3', 'MaturitySize', 'FurLength', 'Vaccinated', 'Dewormed',
       'Sterilized', 'Health','Main_Breed','Main_Breed_animal', 'Cross_Breed','description_sentiment' ]
float_cols = ['Age','Quantity', 'Fee', 'VideoAmt','PhotoAmt','Score', 'Magnitude','Sentiment']

## One Hot Encoding
Converting categorical values to numeric correspondence

In [None]:
# Newer scikit-learn names the dense-output switch `sparse_output`; older
# releases only know `sparse`. Try the new name first so the cell runs on both.
try:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)

# Fit on train only; categories seen only in test are ignored (all zeros).
ohe.fit(train[cat_cols])

# `get_feature_names` was replaced by `get_feature_names_out`; use whichever
# the installed version provides.
if hasattr(ohe, 'get_feature_names_out'):
    ohe_columns = ohe.get_feature_names_out(cat_cols)
else:
    ohe_columns = ohe.get_feature_names(cat_cols)

ohe_train_x = pd.DataFrame(ohe.transform(train[cat_cols]), columns=ohe_columns, index=train.index)
ohe_test_x = pd.DataFrame(ohe.transform(test[cat_cols]), columns=ohe_columns, index=test.index)

In [None]:
# Swap the raw categorical columns for their one-hot encoded versions.
train = pd.concat([train.drop(columns=cat_cols), ohe_train_x], axis=1)
test = pd.concat([test.drop(columns=cat_cols), ohe_test_x], axis=1)

## Scaling

Scaling the float columns using MinMaxScaler to have consistency of 0-1 values

In [None]:
# Pets without a sentiment file have NaN in these columns after the left
# merge. Fill them with the neutral value 0 before scaling: MinMaxScaler
# passes NaN through, so a later fillna(0) would place those pets at the
# column minimum (e.g. the most negative score) instead of at neutral.
sentiment_cols = ['Score', 'Magnitude', 'Sentiment']
train[sentiment_cols] = train[sentiment_cols].fillna(0)
test[sentiment_cols] = test[sentiment_cols].fillna(0)

# Each column is scaled independently; the ranges come from train only.
mm_scalar = MinMaxScaler()
mm_scalar.fit(train[float_cols])
train[float_cols] = mm_scalar.transform(train[float_cols])
test[float_cols] = mm_scalar.transform(test[float_cols])

In [None]:
train.columns[train.isnull().any()]

In [None]:
test.columns[test.isnull().any()]

In [None]:
train.fillna(0, inplace = True)
test.fillna(0, inplace = True)

### PCA
To reduce the dimension of the data, 95% n_components PCA is used. This resulted in 35 columns

In [None]:
# Keep as many components as needed to explain 95% of the variance.
pca = PCA(n_components=0.95, random_state=RANDOM_STATE)
# Fitted on train only; test is projected with the same components.
train_pca = pca.fit(train).transform(train)
test_pca = pca.transform(test)

In [None]:
train_pca.shape

In [None]:
test_pca.shape

In [None]:
plt.figure(figsize = (10,7))
plt.plot(train_pca)
plt.xlabel('Observation')
plt.ylabel('Transformed Train Data')
plt.title('Transformed data by 95% PCA');

In [None]:
pca_features = pd.DataFrame(pca.components_,columns=train.columns)

# Modeling

First, in the modeling section, simple models will be used with the train dataset and the PCA dataset.

In [None]:
# Baseline models with default hyperparameters. Every model is seeded with
# RANDOM_STATE so repeated runs give the same scores.
models = [
    ('RF', RandomForestClassifier(random_state=RANDOM_STATE)),
    ('GB', GradientBoostingClassifier(random_state=RANDOM_STATE)),
    ('DT', DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ('Xgb', XGBClassifier(random_state=RANDOM_STATE)),
]
dfs = []
results = []
names = []
scoring = {"F1": "f1_weighted", "kappa": kappa_scorer}

# The same shuffled 5-fold split is used for every model.
kfold = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
for name, model in models:
    cv_results = cross_validate(model, train, target, cv=kfold, scoring=scoring, return_train_score=True)
    results.append(cv_results)
    names.append(name)

    # One row per fold, tagged with the model name.
    this_df = pd.DataFrame(cv_results)
    this_df['model'] = name
    dfs.append(this_df)

# Combine the per-model tables once, after all models have been evaluated.
final = pd.concat(dfs, ignore_index=True)

In [None]:
final.groupby(['model']).agg({'fit_time':'mean', 'score_time':'mean', 'test_F1':'mean',  'train_F1':'mean', 'test_kappa':'mean', 'train_kappa':'mean'}).reset_index().sort_values(by="test_kappa", ascending = False)


From the result table, XGBoost performed better but has a wide difference in test and train kappa result. Next the simple models will be used with PCA

In [None]:
# Same evaluation as before, this time on the PCA-transformed features.
results, names, dfs = [], [], []

kfold = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
for name, model in models:
    cv_results = cross_validate(model, train_pca, target, cv=kfold, scoring=scoring, return_train_score=True)
    results.append(cv_results)
    names.append(name)
    # One row per fold, tagged with the model name.
    dfs.append(pd.DataFrame(cv_results).assign(model=name))

final_pca = pd.concat(dfs, ignore_index=True)
    


In [None]:
final_pca.groupby(['model']).agg({'fit_time':'mean', 'score_time':'mean', 'test_F1':'mean',  'train_F1':'mean', 'test_kappa':'mean', 'train_kappa':'mean'}).reset_index().sort_values(by="test_kappa", ascending = False)


The result of simple model showed that PCA train data performed worse than all the features. Gradient Booster performed better with lower train and test kappa difference

## Randomized Search

Randomized search was done to determine the best hyperparameters for the models. 

In [None]:
 models = [
           ('RF', RandomForestClassifier(class_weight='balanced', n_estimators=200,max_features = 0.3,max_depth = 50, criterion ='entropy'  )),
         ('Xgb', XGBClassifier(n_estimators = 200)),
          ('GB', GradientBoostingClassifier(n_estimators = 200, max_depth=25, loss = 'deviance', learning_rate =0.5)),
          ('DT', DecisionTreeClassifier(class_weight="balanced", max_features=0.3, max_depth=5, criterion ='entropy'))
        ]
dfs = []
results = []
names = []
scoring = {"F1": "f1_weighted", "kappa": kappa_scorer}

for name, model in models:
    kfold = KFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)
    cv_results = cross_validate(model, train, target, cv=kfold, scoring=scoring, return_train_score=True)
    results.append(cv_results)
    names.append(name)
    
    this_df = pd.DataFrame(cv_results)
    this_df['model'] = name
    dfs.append(this_df)
    final = pd.concat(dfs, ignore_index=True)

In [None]:
final.groupby(['model']).agg({'fit_time':'mean', 'score_time':'mean', 'test_F1':'mean',  'train_F1':'mean', 'test_kappa':'mean', 'train_kappa':'mean'}).reset_index().sort_values(by="test_kappa", ascending = False)


Overfitting was observed in the results of hypertuning. 
The important features of Gradient Booster will be used to run the model to observe the results and reduce overfitting

In [None]:
gb = GradientBoostingClassifier()
gb.fit(train, target)

In [None]:
# Feature importances of the fitted gradient boosting model, largest first.
importance_gb_features = (
    pd.DataFrame({'Attribute': train.columns, 'Importance': gb.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)

In [None]:
importance_gb_features

## Using Important Features of Gradient Boosting

In [None]:
inportant_features = importance_gb_features[importance_gb_features['Importance'] > 0]['Attribute'].values

In [None]:
cv_results = cross_validate(gb, train[inportant_features], target, cv=kfold, scoring=scoring, return_train_score=True)

In [None]:
cv_results['test_kappa'].mean()

In [None]:
cv_results['train_kappa'].mean()

# Conclusion

The Gradient Boosting result was more favourable and didn't overfit. Therefore, Gradient Boosting with the important features will be used for the submission.

In [None]:
gb = GradientBoostingClassifier()
gb.fit(train[inportant_features], target)

In [None]:
# Predict on the selected features and pair each prediction with its PetID.
db_test_pred = gb.predict(test[inportant_features])
submission_columns = {'PetID': test_petID, 'AdoptionSpeed': db_test_pred}
output_rf = pd.DataFrame(submission_columns)
output_rf.to_csv('submission.csv', index=False)