# Model 0.10 - Description
- Changed _tree method_ parametor to _hist_, similar method to LGBM: increased speed and slightly better CV (0.386 --> 0.390)

In [48]:
import re
import ml_metrics
import string
import nltk
import scipy
import numpy as np
import pandas as pd
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer

In [2]:
# hides warnings - think it needs running after modules imported
import warnings
warnings.simplefilter("ignore")

In [3]:
rnd = 42  # random state for scoring consistency

## Importing the data

In [4]:
df_train = pd.read_csv("../input/train/train.csv", index_col="PetID")
df_test = pd.read_csv("../input/test/test.csv", index_col="PetID")
df_breeds = pd.read_csv("../input/breed_labels.csv", index_col="BreedID")
df_colors = pd.read_csv("../input/color_labels.csv")

In [5]:
colors = df_colors['ColorID']
breeds = df_breeds.index

##  Tfidf Vectorizer (Better BoW technique)
- Extracts non-stopwords from descriptions
- Applies weighting to text depending on commonalities
- Weight is dependent on how many other words are shared and quantity.

In [6]:
# tokenizer for description, to return list of word tokens
def tokenize(text):
    tokens = nltk.word_tokenize(text)
    stems = []
    for item in tokens:
        stems.append(PorterStemmer().stem(item))
    return stems

In [7]:
# sample of tokenized text
text = df_test['Description'][3]
tokenize(text)

['malibu',
 ':',
 'femal',
 ',',
 'local',
 'mix',
 ',',
 '4-5',
 'month',
 ',',
 'vaccin',
 'and',
 'spay',
 '.',
 'strike',
 'featur',
 'with',
 'fade',
 'beig',
 'fur',
 'and',
 'jet-yellow',
 'eye',
 '.',
 'natur',
 'curiou',
 'explor',
 ',',
 'immedi',
 'taken',
 'to',
 'human',
 'interact',
 'and',
 'love',
 'to',
 'play',
 'around',
 '.',
 'come',
 'and',
 'meet',
 'our',
 'anim',
 'for',
 'adopt',
 'at',
 'selangor',
 '(',
 'tue',
 '-',
 'sun',
 ',',
 '10am',
 '-',
 '4pm',
 ')',
 'and',
 'fall',
 'in',
 'love',
 '!',
 'www.spca.org.mi']

In [8]:
# remove punctuation from text
remove_punc = str.maketrans({key: None for key in string.punctuation})
text = text.translate(remove_punc).lower()
print(text)

malibu female local mix 45 months vaccinated and spayed striking features with faded beige fur and jetyellow eyes naturally curious explorer immediately taken to human interaction and loves to play around come and meet our animals for adoption at selangor tues  sun 10am  4pm and fall in love wwwspcaorgmy


In [9]:
# Prepare Tfidf
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
tfs = tfidf.fit_transform([text])

In [10]:
# create fake entry to test
features = tfidf.get_feature_names()
response = tfidf.transform(['Malibu is a wonderful kitty cat who fell and was taken immediately LOVE! \
                             where a human without hair cared for it. Now love it needs a new home and you \
                             might be able to help. Please help. Call 0800-I-CARE now!!!!'])

In [11]:
# output non-zero
for col in response.nonzero()[1]:
    print("{} \t {:.3f}".format(features[col], response[0, col]))

taken 	 0.354
malibu 	 0.354
love 	 0.707
immedi 	 0.354
human 	 0.354


### Implementation of Tfidf

In [12]:
# create dictionary of all test data descriptions
token_dict = {}
for idx, desc in df_test['Description'].items():
    try:
        token_dict[idx] = desc.translate(remove_punc).lower()
    except AttributeError:
        continue

In [13]:
# Prepare Tfidf again
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
tfs = tfidf.fit_transform(token_dict.values())

In [14]:
# Check feature count - 10,506 (too many??)
features = tfidf.get_feature_names()
len(features)

10506

In [15]:
# create dictionary of all training data descriptions
token_dict = {}
for idx, desc in df_train['Description'].items():
    try:
        token_dict[idx] = desc.translate(remove_punc).lower()
    except AttributeError:
        continue

In [16]:
# Prepare Tfidf for training data
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
tfs = tfidf.fit_transform(token_dict.values())

In [17]:
# Check training features count
features_training = tfidf.get_feature_names()
len(features_training)

22103

In [18]:
# intersecting features between test and training
combined = set(features) & set(features_training)
len(combined)

6408

In [19]:
# REPEAT: create dictionary of all test data descriptions
token_dict = {}
for idx, desc in df_test['Description'].items():
    try:
        token_dict[idx] = desc.translate(remove_punc).lower()
    except AttributeError:
        continue

In [20]:
# Prepare Tfidf for test again
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
tfs = tfidf.fit_transform(token_dict.values())

In [21]:
# Combine test and training data
df_combined = pd.concat([df_test, df_train], sort=False)
df_combined['test'] = df_combined['AdoptionSpeed'].isna()

In [22]:
# Check training features count
features = tfidf.get_feature_names()
len(features)

10506

In [23]:
# ValueError where missing description is represented as np.nan (not a number - float object)
try:
    response = tfidf.transform(df_combined['Description'])
except ValueError:
    print("Description is not text/string object")

Description is not text/string object


In [24]:
# Replace np.nan with blank text
df_combined['Description'] = df_combined['Description'].fillna('')

In [25]:
# transform all data to test data tokens
response = tfidf.transform(df_combined['Description'])

In [26]:
# convert response to array (if becomes issue will use sparse array but might need to convert rest of data)
response_arr = response.toarray()

In [44]:
# tested adding this to the df but got Memory error - will need to use sparse
response_arr.shape

(18941, 10506)

## Functions

In [30]:
def apply_word_flags(df, words):
    """Creates binary columns for words which appear in the description"""
    for word in words:
        df[word] = 0
    for i, desc in df['Description'].items():
        try:
            for word in desc.split():
                word = word.lower()
                if word in words:
                    df.at[i,word] = 1
        except AttributeError:
            continue
    return df.drop(columns=['Description'])

In [31]:
keywords = ['home', 'good' , 'adopt', 'loving', 'give', 'looking', 'playful', 'rescued', 'cat', 'contact']

In [32]:
def apply_color_flags(df, colors):
    """Combines Colors 1,2 & 3 into binary columns for each possible colours"""
    for c in colors:
        df[f'C{c}'] = 0
    for i,colors in df[['Color1', 'Color2', 'Color3']].iterrows():
        for c in colors:
            if c != 0:
                df.at[i,f'C{c}'] = 1
    df = df.drop(columns=['Color1', 'Color2', 'Color3'])
    return df

In [33]:
def create_breed_keywords(df):
    """Creates unique list of keywords from provided breeds dataframe"""
    breed_keywords = []
    for breed in df['BreedName']:
        breed = re.sub(r'[/(/)]', '', breed)  # remove braces
        keywords = breed.split()
        breed_keywords += keywords
    return set(breed_keywords)

In [34]:
def apply_breed_flags(df, keywords, breeds):
    """Creates binary columns for keywords which appear in the breed name"""
    for word in keywords:
        df[word] = 0
        
    for i,pair in df[['Breed1', 'Breed2']].iterrows():
        for indx in pair:
            if indx == 0: continue
            breed = breeds.loc[indx,'BreedName']
            breed = re.sub(r'[/(/)]', '', breed)
            new_keywords = breed.split()
            for word in new_keywords:
                if word in keywords: 
                    df.at[i,word] = 1
                    
    return df

## Preparing training data

In [35]:
# Combine test and training data
df_combined = pd.concat([df_test, df_train], sort=False)
df_combined['test'] = df_combined['AdoptionSpeed'].isna()

# Rescuer
rescue_map = Counter(df_combined['RescuerID'])
rescuer_counts = df_combined['RescuerID'].map(rescue_map)

# Breeds
all_test_breeds = df_test['Breed1'].append(df_test['Breed2'])
df_test_breeds = df_breeds.loc[all_test_breeds[all_test_breeds > 0].unique(), :]
breed_keywords = create_breed_keywords(df_test_breeds)

# Prepare data for modelling 
df_combined['rescuer_counts'] = rescuer_counts
df_combined = apply_word_flags(df_combined, keywords)
df_combined = apply_color_flags(df_combined, colors)
df_combined = apply_breed_flags(df_combined, breed_keywords, df_breeds)
df_combined = pd.get_dummies(df_combined, columns=['Gender',
                                                   'Vaccinated', 'Dewormed', 'Sterilized', 
                                                   'State'])
y_train_all = df_combined['AdoptionSpeed'][df_combined['test'] != 1]
X_all       = df_combined.drop(columns=['Name', 'RescuerID', 'AdoptionSpeed', 'Breed1', 'Breed2'])
X_train_all = X_all[X_all['test'] != 1].drop(columns=['test'])
X_test_all  = X_all[X_all['test'] == 1].drop(columns=['test'])

### Experimenting with sparse data arrays

In [74]:
# convert all data to numpy and convert to integers (scipy doesn't like mixed data types)
X_all_values = np.array(X_all.values.astype(int))
X_all_values

array([[ 1,  2,  2, ...,  0,  0,  0],
       [ 2, 24,  2, ...,  0,  0,  0],
       [ 2, 20,  2, ...,  0,  0,  0],
       ...,
       [ 2,  2,  3, ...,  0,  0,  0],
       [ 2,  9,  1, ...,  0,  0,  0],
       [ 1,  1,  2, ...,  0,  0,  0]])

In [71]:
# convert all data to sparse data array
X_all_sparse = scipy.sparse.csr_matrix(np.array(X_all.values.astype(int)))

In [80]:
# shape of sparse array, compared to pandas dataframe
print(f"Pandas DataFrame:{X_all.shape}")
print(f"Sparse Array: \t {X_all_sparse.shape}")

Pandas DataFrame:(18941, 215)
Sparse Array: 	 (18941, 215)


In [82]:
# get index of test column
list(X_all.columns).index('test')

9

In [100]:
# split all data into test and training
mask = X_all_sparse[:,9]
X_all_sparse[mask]

IndexError: Indexing with sparse matrices is not supported except boolean indexing where matrix and index are equal shapes.

## Test XGBoost model

In [101]:
params = {'max_depth': 4, 
          'learning_rate': 0.2, 
          'n_estimators': 200, 
          'silent': True, 
          'objective': 'multi:softprob', 
          'booster': 'gbtree',
          'tree_method': 'hist',
          'n_jobs': 3,
          'gamma': 0, 
          'min_child_weight': 1, 
          'max_delta_step': 0, 
          'subsample': 0.8, 
          'colsample_bytree': 1, 
          'colsample_bylevel': 1, 
          'reg_alpha': 0, 
          'reg_lambda': 1, 
          'scale_pos_weight': 1, 
          'base_score': 0.2, 
          'random_state': rnd, 
          'missing': None,
          'verbose': 0,}

In [37]:
def cv_testing(X_train_all, params, folds=5):

    scores = []

    folds = KFold(folds, True, rnd).split(X_train_all)

    for train_indx, test_indx in folds:

        X_train, X_test = X_train_all.iloc[train_indx], X_train_all.iloc[test_indx]
        y_train, y_test = y_train_all.iloc[train_indx], y_train_all.iloc[test_indx]


        clf = xgb.XGBClassifier(**params)

        clf.fit(X_train, y_train)

        prediction = clf.predict(X_test)

        scores.append(ml_metrics.quadratic_weighted_kappa(rater_a=y_test, rater_b=prediction))
        print("{:.3f}".format(scores[-1]), end="\t")
    
    print()
    
    return scores

In [102]:
scores = cv_testing(X_train_all=X_train_all, folds=10, params=params)

[21:34:08] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.394	[21:34:15] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.436	[21:34:23] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.


KeyboardInterrupt: 

In [70]:
# 0.386 --> 0.390 (changed tree method to 'hist')
np.mean(scores)

0.3909419241621025

# Submitting test data

In [40]:
clf = xgb.XGBClassifier(**params)
clf.fit(X_train_all, y_train_all)
prediction = clf.predict(X_test_all)

In [41]:
submission = pd.DataFrame({'AdoptionSpeed': prediction.astype(int)}, index=X_test_all.index)
submission.to_csv("submission.csv", index=True, index_label='PetID', header=['AdoptionSpeed'])