# Model 0.12 - Description
- Removed Tfidf, as results were disappointing
- Added basic description counts
- Insignificant on top of the existing model
- With the breed features removed, the counts became fairly significant (~1%)
- Submitting this version to the leaderboard (LB) reduces the score

In [1]:
import re
import ml_metrics
import string
import nltk
import scipy
import numpy as np
import pandas as pd
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from scipy.sparse import hstack
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer

  from numpy.core.umath_tests import inner1d


In [2]:
# hides warnings - think it needs running after modules imported
import warnings
warnings.simplefilter("ignore")

In [3]:
rnd = 42  # shared seed: used for the KFold shuffle and XGBoost's random_state so scores are repeatable

## Importing the data

In [4]:
# Load the input tables. Train and test pets are indexed by PetID and the
# breed label table by BreedID; the colour label table keeps its default index.
df_train = pd.read_csv("../input/train/train.csv", index_col="PetID")
df_test = pd.read_csv("../input/test/test.csv", index_col="PetID")
df_breeds = pd.read_csv("../input/breed_labels.csv", index_col="BreedID")
df_colors = pd.read_csv("../input/color_labels.csv")

In [5]:
# Colour codes (the ColorID column); later passed to apply_color_flags to name the C<id> columns
colors = df_colors['ColorID']
# Index of the breed label table (BreedID); not referenced again in the cells shown here
breeds = df_breeds.index

## Functions

In [30]:
def apply_word_flags(df, words, drop=True, strip_punctuation=False):
    """Add one 0/1 column per keyword telling whether it occurs in ``Description``.

    A description is lower-cased and split on whitespace; a keyword is flagged
    when it equals one of the resulting tokens, so keywords are expected to be
    lower case. Rows whose description is not a string (e.g. NaN) get 0 for
    every keyword.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Description`` column. It is not modified; the flags are
        written to a copy.
    words : iterable of str
        Keywords to flag. One column named after each keyword is created.
    drop : bool, default True
        Remove the ``Description`` column from the returned frame.
    strip_punctuation : bool, default False
        When True, leading/trailing punctuation is stripped from every token
        before matching, so ``"home."`` counts as ``"home"``. The default keeps
        the plain whitespace tokenisation.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the keyword columns appended.
    """
    vocabulary = list(words)
    lookup = set(vocabulary)

    def matched_keywords(desc):
        # Missing descriptions arrive as NaN/None rather than str.
        if not isinstance(desc, str):
            return set()
        tokens = [token.lower() for token in desc.split()]
        if strip_punctuation:
            tokens = [token.strip(string.punctuation) for token in tokens]
        return lookup.intersection(tokens)

    # Tokenise each description once, then derive every flag column from it.
    found = df['Description'].map(matched_keywords)

    flagged = df.copy()
    for word in vocabulary:
        flagged[word] = found.map(lambda hits, w=word: w in hits).astype('int64')

    if drop:
        flagged = flagged.drop(columns=['Description'])
    return flagged

In [7]:
# Description words that each get their own 0/1 column via apply_word_flags
keywords = [
    'home', 'good', 'adopt', 'loving', 'give',
    'looking', 'playful', 'rescued', 'cat', 'contact',
]

In [8]:
def apply_color_flags(df, colors):
    """Replace ``Color1``/``Color2``/``Color3`` with one 0/1 column per colour code.

    For every code ``c`` in ``colors`` a column ``C{c}`` is created that is 1
    when any of the three colour columns equals ``c`` and 0 otherwise. The
    comparison is done on values, so it behaves the same whether the colour
    columns hold ints or floats. Values that are not listed in ``colors``
    (including 0 and NaN) do not produce a column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``Color1``, ``Color2`` and ``Color3``. It is not
        modified.
    colors : iterable
        Colour codes to create flag columns for.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` without the three colour columns and with the ``C{c}``
        columns appended in the order given by ``colors``.
    """
    color_cols = ['Color1', 'Color2', 'Color3']
    palette = df[color_cols]
    # One vectorised comparison per colour code instead of a per-row loop.
    flags = {
        f'C{c}': palette.eq(c).any(axis=1).astype('int64')
        for c in colors
    }
    return df.drop(columns=color_cols).assign(**flags)

In [9]:
def create_breed_keywords(df):
    """Return the set of distinct words found in the ``BreedName`` column of ``df``.

    Each name is cleaned with the character class ``[/(/)]``, which removes
    '(' and ')' as well as '/', and is then split on whitespace.
    """
    return {
        token
        for name in df['BreedName']
        for token in re.sub(r'[/(/)]', '', name).split()
    }

In [10]:
def apply_breed_flags(df, keywords, breeds):
    """Flag, per row, which breed-name keywords describe ``Breed1``/``Breed2``.

    A 0/1 column is added to ``df`` (in place) for every entry of ``keywords``.
    Each non-zero breed id is looked up in ``breeds`` (``BreedName`` column),
    the name is cleaned with the same ``[/(/)]`` character class used to build
    the keywords, and every resulting word that is a keyword sets that row's
    column to 1. The same ``df`` object is returned.
    """
    for kw in keywords:
        df[kw] = 0

    for row_label, breed_ids in df[['Breed1', 'Breed2']].iterrows():
        for breed_id in breed_ids:
            # 0 means "no breed given" for this slot
            if breed_id != 0:
                name = re.sub(r'[/(/)]', '', breeds.loc[breed_id, 'BreedName'])
                for token in name.split():
                    if token in keywords:
                        df.at[row_label, token] = 1

    return df

## Preparing training data

In [113]:
# Combine test and training data so every engineered column exists in both
df_combined = pd.concat([df_test, df_train], sort=False)
# Rows without a target are treated as test rows
df_combined['test'] = df_combined['AdoptionSpeed'].isna()

# Rescuer: number of listings per RescuerID, counted over train and test together
rescue_map = Counter(df_combined['RescuerID'])
rescuer_counts = df_combined['RescuerID'].map(rescue_map)

# Breeds: keywords are built only from breeds that occur in the test set.
# pd.concat replaces Series.append, which was removed in pandas 2.0.
all_test_breeds = pd.concat([df_test['Breed1'], df_test['Breed2']])
df_test_breeds = df_breeds.loc[all_test_breeds[all_test_breeds > 0].unique(), :]
breed_keywords = create_breed_keywords(df_test_breeds)

# Prepare data for modelling
df_combined['rescuer_counts'] = rescuer_counts
# Description is kept here (drop=False); the count features are derived from it later
df_combined = apply_word_flags(df_combined, keywords, drop=False)
df_combined = apply_color_flags(df_combined, colors)
df_combined = apply_breed_flags(df_combined, breed_keywords, df_breeds)
df_combined = pd.get_dummies(df_combined, columns=['Gender',
                                                   'Vaccinated', 'Dewormed', 'Sterilized',
                                                   'State'])

# Split back into target, training features and test features
y_train_all = df_combined['AdoptionSpeed'][df_combined['test'] != 1]
X_all       = df_combined.drop(columns=['Name', 'RescuerID', 'AdoptionSpeed', 'Breed1', 'Breed2'])
X_train_all = X_all[X_all['test'] != 1].drop(columns=['test'])
X_test_all  = X_all[X_all['test'] == 1].drop(columns=['test'])

In [114]:
# missing descriptions become empty strings so they can be measured below
X_train_all = X_train_all.assign(Description=X_train_all['Description'].fillna(''))

### Count of Description characters

In [115]:
# number of characters in each description
X_train_all = X_train_all.assign(desc_length=X_train_all['Description'].map(len))

### Count of Description words

In [116]:
# number of whitespace-separated words in each description
X_train_all = X_train_all.assign(
    desc_word_count=X_train_all['Description'].map(lambda text: len(text.split()))
)

In [117]:
# the raw text is no longer needed once the counts exist
X_train_all = X_train_all.drop('Description', axis='columns')

### Apply Counts to test data

In [118]:
# Same steps as for the training features: fill missing text, add the
# character and word counts, then discard the raw description.
X_test_all = (
    X_test_all
    .assign(Description=lambda frame: frame['Description'].fillna(''))
    .assign(
        desc_length=lambda frame: frame['Description'].map(len),
        desc_word_count=lambda frame: frame['Description'].map(lambda text: len(text.split())),
    )
    .drop(columns=['Description'])
)

## Test XGBoost model

In [83]:
# Keyword arguments for xgb.XGBClassifier(**params).
# The previous dict set 'silent': True, 'verbose': 0 and 'verbosity': 3 at the
# same time. 'verbosity': 3 requests debug logging, which contradicts the other
# two, and 'silent'/'verbose' are not current XGBoost parameters. A single
# 'verbosity': 0 expresses the intended quiet run.
params = {'max_depth': 4,
          'learning_rate': 0.2,
          'n_estimators': 200,
          'objective': 'multi:softprob',
          'booster': 'gbtree',
          'tree_method': 'hist',
          'n_jobs': 3,
          'gamma': 0,
          'min_child_weight': 1,
          'max_delta_step': 0,
          'subsample': 0.8,
          'colsample_bytree': 1,
          'colsample_bylevel': 1,
          'reg_alpha': 0,
          'reg_lambda': 1,
          'scale_pos_weight': 1,
          'base_score': 0.2,
          'random_state': rnd,  # shared notebook seed
          'missing': None,
          'verbosity': 0}

In [13]:
def cv_testing(X_train_all, params, folds=5, dataframe=True, y=None):
    """Cross-validate an XGBoost classifier and return one kappa score per fold.

    The data is split with a shuffled ``KFold`` seeded by the notebook-level
    ``rnd``. For every fold a fresh ``xgb.XGBClassifier(**params)`` is fitted
    and scored with ``ml_metrics.quadratic_weighted_kappa``; each score is
    printed (tab separated) as soon as it is available.

    Parameters
    ----------
    X_train_all : pandas.DataFrame or array-like
        Feature rows. Indexed with ``.iloc`` when ``dataframe`` is True and
        with plain ``[...]`` otherwise (e.g. for sparse matrices).
    params : dict
        Keyword arguments for ``xgb.XGBClassifier``.
    folds : int, default 5
        Number of cross-validation splits.
    dataframe : bool, default True
        Selects how ``X_train_all`` is row-indexed (see above).
    y : pandas.Series or array-like, optional
        Targets aligned row-for-row with ``X_train_all``. Defaults to the
        notebook-level ``y_train_all``.

    Returns
    -------
    list
        The kappa score of each fold, in split order.
    """
    targets = y_train_all if y is None else y
    # Targets are always selected by position. A Series needs .iloc for that,
    # because plain [...] with integers is label-based on a non-integer index.
    targets_use_iloc = hasattr(targets, 'iloc')

    def take_rows(data, indx, use_iloc):
        return data.iloc[indx] if use_iloc else data[indx]

    # Keyword arguments: scikit-learn no longer accepts shuffle/random_state positionally.
    splitter = KFold(n_splits=folds, shuffle=True, random_state=rnd)

    scores = []
    for train_indx, test_indx in splitter.split(X_train_all):
        X_train = take_rows(X_train_all, train_indx, dataframe)
        X_test = take_rows(X_train_all, test_indx, dataframe)
        y_train = take_rows(targets, train_indx, targets_use_iloc)
        y_test = take_rows(targets, test_indx, targets_use_iloc)

        clf = xgb.XGBClassifier(**params)
        clf.fit(X_train, y_train)
        prediction = clf.predict(X_test)

        scores.append(ml_metrics.quadratic_weighted_kappa(rater_a=y_test, rater_b=prediction))
        print("{:.3f}".format(scores[-1]), end="\t")

    print()

    return scores

In [109]:
# 10-fold cross-validation of the current feature set
scores = cv_testing(
    X_train_all=X_train_all,
    params=params,
    folds=10,
    dataframe=True,
)

[22:05:20] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.396	[22:05:28] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.428	[22:05:37] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.408	[22:05:45] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.392	[22:05:53] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.373	[22:06:02] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.393	[22:06:10] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.390	[22:06:18] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.378	[22:06:27] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.376	[22:06:35] Tree method is selected to be 'hist', which uses a si

In [110]:
# Mean CV kappa recorded after each experiment (previous --> new):
#   0.390 --> 0.389  new initialised score
#   0.389 --> 0.388  count of description characters - insignificant
#   0.389 --> 0.385  count of description words - insignificant
#   0.389 --> 0.388  count of chars and words - insignificant
#   0.389 --> 0.339  remove breeds
#   0.339 --> 0.351  add counts - significant increase
np.asarray(scores).mean()

0.38858164531908523

## Submitting test data

In [119]:
# Refit on the complete training set, then predict the test rows
clf = xgb.XGBClassifier(**params).fit(X_train_all, y_train_all)
prediction = clf.predict(X_test_all)

[08:10:52] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.


In [120]:
# One integer AdoptionSpeed per test pet, keyed by the test frame's index
submission = pd.DataFrame(
    data={'AdoptionSpeed': prediction.astype(int)},
    index=X_test_all.index,
)
submission.to_csv(
    "submission.csv",
    header=['AdoptionSpeed'],
    index=True,
    index_label='PetID',
)