# Model 0.11 - Test Tfidf
- Clean up Tfidf after previous experimentation and playing about with tfidf and sparse arrays
- Reduced features for testing and using dense, instead of sparse
- "Trained" Tfidf on all data, not just test or training corpus
- Tried top 100, 10 and 5 features: no improvement/worse

In [1]:
import re
import ml_metrics
import string
import nltk
import scipy
import numpy as np
import pandas as pd
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from scipy.sparse import hstack
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer

  from numpy.core.umath_tests import inner1d


In [2]:
# Silence every Python warning for the rest of the session so cell outputs
# stay readable. Original author's note: this is thought to need running
# after the other modules have been imported (not verified here).
import warnings
warnings.simplefilter("ignore")

In [3]:
# Fixed seed shared by the KFold shuffle in cv_testing and by the XGBoost
# 'random_state' entry in params, so scores are comparable between runs.
rnd = 42  # random state for scoring consistency

## Importing the data

In [4]:
# Load the raw CSV files. The train, test and breed tables are indexed by
# their ID column; the colour table keeps pandas' default integer index.
df_train = pd.read_csv("../input/train/train.csv", index_col="PetID")
df_test = pd.read_csv("../input/test/test.csv", index_col="PetID")
df_breeds = pd.read_csv("../input/breed_labels.csv", index_col="BreedID")
df_colors = pd.read_csv("../input/color_labels.csv")

In [5]:
# Colour IDs (a Series) and breed IDs (the index of the breed table).
# `colors` is later passed to apply_color_flags.
# NOTE(review): the module-level `breeds` is not referenced again in the
# cells shown here (apply_breed_flags receives df_breeds as its own argument).
colors = df_colors['ColorID']
breeds = df_breeds.index

## Functions

In [6]:
def apply_word_flags(df, words):
    """Add a 0/1 column per keyword showing whether it occurs in the description.

    Each description is split on whitespace and compared token by token after
    lower-casing. Punctuation is not stripped, so "home," does not match
    "home", and a keyword containing upper-case letters can never match.

    Args:
        df: DataFrame with a 'Description' column. The flag columns are added
            to this frame in place.
        words: Iterable of keywords; each one becomes a column name.

    Returns:
        A new DataFrame with the flag columns and without 'Description'.
    """
    # Materialise once so the iterable can be walked twice (columns + lookup).
    keywords = list(words)
    for word in keywords:
        df[word] = 0

    # A set gives constant-time membership tests inside the per-token loop.
    wanted = set(keywords)

    for i, desc in df['Description'].items():
        # Missing descriptions arrive as NaN floats, which have no .split();
        # only that call is guarded so unrelated errors are not swallowed.
        try:
            tokens = desc.split()
        except AttributeError:
            continue
        for token in tokens:
            token = token.lower()
            if token in wanted:
                df.at[i, token] = 1

    return df.drop(columns=['Description'])

In [7]:
# Description keywords intended for apply_word_flags; the call that uses them
# is currently commented out in the data-preparation cell.
keywords = ['home', 'good' , 'adopt', 'loving', 'give', 'looking', 'playful', 'rescued', 'cat', 'contact']

In [8]:
def apply_color_flags(df, colors):
    """Replace Color1/Color2/Color3 with one 0/1 ``C<id>`` column per colour.

    A ``C<id>`` column is added to ``df`` (in place) for every entry of
    ``colors`` and set to 1 on each row where that id appears in any of the
    three colour slots. A slot value of 0 is skipped rather than flagged.

    Returns a new DataFrame without the three original colour columns.
    """
    slot_columns = ['Color1', 'Color2', 'Color3']

    for colour_id in colors:
        df[f'C{colour_id}'] = 0

    for row_label, slot_values in df[slot_columns].iterrows():
        for value in slot_values:
            if value == 0:
                continue
            df.at[row_label, f'C{value}'] = 1

    return df.drop(columns=slot_columns)

In [9]:
def create_breed_keywords(df):
    """Return the set of distinct words found across ``df['BreedName']``.

    The characters '/', '(' and ')' are deleted from each name before it is
    split on whitespace, so "Black/Tan" contributes the single word "BlackTan".
    """
    unique_words = set()
    for name in df['BreedName']:
        cleaned = re.sub(r'[/(/)]', '', name)
        unique_words.update(cleaned.split())
    return unique_words

In [10]:
def apply_breed_flags(df, keywords, breeds):
    """Flag, per row, which breed-name keywords describe the pet's breeds.

    A 0-initialised column is added to ``df`` (in place) for every keyword.
    For each non-zero id in 'Breed1' / 'Breed2' the breed name is looked up in
    ``breeds`` (indexed by breed id, with a 'BreedName' column), stripped of
    '/', '(' and ')', split on whitespace, and every resulting word that is a
    keyword has its column set to 1 for that row.

    Returns the same ``df`` object; 'Breed1' and 'Breed2' are left in place.
    """
    for flag in keywords:
        df[flag] = 0

    for row_label, breed_ids in df[['Breed1', 'Breed2']].iterrows():
        for breed_id in breed_ids:
            if breed_id != 0:
                name = re.sub(r'[/(/)]', '', breeds.loc[breed_id, 'BreedName'])
                matches = [token for token in name.split() if token in keywords]
                for token in matches:
                    df.at[row_label, token] = 1

    return df

In [11]:
# tokenizer for description, to return list of word tokens
def tokenize(text):
    """Split ``text`` into word tokens and return their Porter stems.

    Args:
        text: String to tokenize with ``nltk.word_tokenize``.

    Returns:
        List of stemmed tokens, in the order the tokens appear in ``text``.
    """
    # Build the stemmer once per call rather than once per token; the
    # vectorizer calls this function for every description.
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in nltk.word_tokenize(text)]

## Preparing training data

In [12]:
# Combine test and training data. Rows whose AdoptionSpeed is missing are
# flagged as test rows (presumably the test CSV has no such column).
df_combined = pd.concat([df_test, df_train], sort=False)
df_combined['test'] = df_combined['AdoptionSpeed'].isna()

# Rescuer: how many rows (train + test) share each row's RescuerID
rescue_map = Counter(df_combined['RescuerID'])
rescuer_counts = df_combined['RescuerID'].map(rescue_map)

# Breeds: keywords are drawn only from breeds that occur in the test set.
# pd.concat is used because Series.append was removed in pandas 2.0.
all_test_breeds = pd.concat([df_test['Breed1'], df_test['Breed2']])
df_test_breeds = df_breeds.loc[all_test_breeds[all_test_breeds > 0].unique(), :]
breed_keywords = create_breed_keywords(df_test_breeds)

# Prepare data for modelling
df_combined['rescuer_counts'] = rescuer_counts
# NOTE: enabling the next line would drop 'Description', which the Tfidf
# cells further down still read.
# df_combined = apply_word_flags(df_combined, keywords)
df_combined = apply_color_flags(df_combined, colors)
df_combined = apply_breed_flags(df_combined, breed_keywords, df_breeds)
df_combined = pd.get_dummies(df_combined, columns=['Gender',
                                                   'Vaccinated', 'Dewormed', 'Sterilized',
                                                   'State'])
# Targets come from the non-test rows only; X_all keeps the 'test' flag so
# it can be split again after the Tfidf columns are added.
y_train_all = df_combined['AdoptionSpeed'][df_combined['test'] != 1]
X_all       = df_combined.drop(columns=['Name', 'RescuerID', 'AdoptionSpeed', 'Breed1', 'Breed2'])
X_train_all = X_all[X_all['test'] != 1].drop(columns=['test'])
X_test_all  = X_all[X_all['test'] == 1].drop(columns=['test'])

### Tfidf

In [48]:
# Translation table for str.translate that deletes every character listed in
# string.punctuation.
remove_punc = str.maketrans('', '', string.punctuation)

In [49]:
# Map each PetID in the combined (train + test) frame to its description with
# punctuation removed and text lower-cased. Rows whose description is not a
# string (e.g. NaN) are left out of the dictionary.
token_dict = {}
for pet_id, raw_text in df_combined['Description'].items():
    try:
        without_punc = raw_text.translate(remove_punc)
    except AttributeError:
        continue
    token_dict[pet_id] = without_punc.lower()

In [66]:
# Prepare Tfidf
# Fit on every cleaned description collected in token_dict, stemming through
# tokenize; max_features=5 caps the vocabulary at five terms.
# NOTE(review): `tfs` is not used again in the cells shown here.
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english', max_features=5)
tfs = tfidf.fit_transform(token_dict.values())

In [67]:
# Vocabulary terms learned by the fitted vectorizer (inspect to check them).
# NOTE(review): get_feature_names() is absent from recent scikit-learn
# releases (get_feature_names_out() replaces it) - confirm the installed
# version before upgrading.
features = tfidf.get_feature_names()

In [68]:
# Replace np.nan with blank text so that the transform below receives a
# string for every row.
df_combined['Description'] = df_combined['Description'].fillna('')

In [69]:
# Project every description (train and test rows) onto the fitted Tfidf
# vocabulary.
# NOTE(review): fitting used the punctuation-stripped text in token_dict,
# whereas this call receives the raw descriptions - confirm that is intended.
response = tfidf.transform(df_combined['Description'])

In [70]:
# convert response to array (if becomes issue will use sparse array but might need to convert rest of data)
# NOTE(review): `response_arr` is not used again in the cells shown here.
response_arr = response.toarray()

In [71]:
# Recompute the Tfidf matrix from X_all's descriptions, overwriting the
# earlier `response`. X_all was derived from df_combined before its NaN
# descriptions were blanked, hence the fillna('') here.
response = tfidf.transform(X_all['Description'].fillna(''))

In [72]:
# get features for checking: the vocabulary terms, used below to label the
# columns of `response`.
# NOTE(review): get_feature_names() is absent from recent scikit-learn
# releases (get_feature_names_out() replaces it) - confirm the installed
# version before upgrading.
features = tfidf.get_feature_names()

In [73]:
# check sample rows: print one description followed by each Tfidf term that
# has a non-zero weight on that row.
sample_row = 4000
print(X_all.iloc[sample_row]['Description'], end='\n\n')
values = []
# nonzero() returns parallel arrays of row and column positions; compute it
# once instead of once per zip argument.
nz_rows, nz_cols = response.nonzero()
for row, col in zip(nz_rows, nz_cols):
    if row == sample_row:
        # collect the weight from the row being inspected, not from row 0
        values.append(response[row, col])
        print("{} \t {:.3f}".format(features[col], response[row, col]))


Bouncer is a happy pup that needs a happy loving environment to grow up in and of course till its last day in this world. If you can do that for him please call or whatsapp Jocelyn at. Adoption requirement is commit to neutering. I collect back vaccination fee. Please Don't sms or email.

love 	 0.761
adopt 	 0.649


In [74]:
# Combine features: keep the Tfidf vocabulary under its own name, then rebuild
# `features` as X_all's modelling columns followed by the Tfidf terms.
# NOTE(review): the frame built in the next cell puts the Tfidf columns first
# (pd.concat([df_response, X_all], axis=1)), so the order of this list differs
# from the column order of `result` - confirm before using it to label model
# columns.
tfidf_features = features.copy()
features = list(X_all.drop(columns=['test','Description']).columns)
features.extend(tfidf_features)

In [75]:
# Wrap the dense Tfidf weights in a DataFrame that shares X_all's index, then
# place those columns in front of the existing X_all columns.
tfidf_weights = response.toarray()
df_response = pd.DataFrame(tfidf_weights, index=X_all.index, columns=tfidf_features)
result = pd.concat([df_response, X_all], axis=1)

In [95]:
# check sample rows: print the description at position 2, then display the
# first 20 values of the row at the same position in `result` (the Tfidf
# columns come first).
print(df_combined.iloc[2]['Description'])
result.iloc[2][:20]

Snowball... doesn't look so good (she is healthy)... a bit cranky but still... she is for adoption... She is actually a very manja and gentle cat. May be she doesn't like her photo to be taken...


adopt                                                      0.466757
cat                                                        0.683185
home                                                              0
love                                                              0
veri                                                       0.561601
Type                                                              2
Age                                                              20
MaturitySize                                                      2
FurLength                                                         1
Health                                                            1
Quantity                                                          1
Fee                                                             150
VideoAmt                                                          0
Description       Snowball... doesn't look so good (she is healt...
PhotoAmt                                        

In [77]:
# split training and test data ready for testing model
X_train_df = result[result['test'] != 1].drop(columns=['test', 'Description'])
X_test_df = result[result['test'] == 1].drop(columns=['test', 'Description'])

## Test XGBoost model

In [78]:
# Keyword arguments handed to xgb.XGBClassifier inside cv_testing.
# 'random_state' reuses the notebook-wide seed `rnd`.
# NOTE(review): 'silent', 'verbose' and 'verbosity' all relate to logging and
# look contradictory (silent=True alongside verbosity=3); the per-fold
# "Tree method is selected ..." lines in the recorded output suggest that
# 'verbosity' is the one taking effect - confirm against the installed
# XGBoost version.
params = {'max_depth': 4, 
          'learning_rate': 0.2, 
          'n_estimators': 200, 
          'silent': True, 
          'objective': 'multi:softprob', 
          'booster': 'gbtree',
          'tree_method': 'hist',
          'n_jobs': 3,
          'gamma': 0, 
          'min_child_weight': 1, 
          'max_delta_step': 0, 
          'subsample': 0.8, 
          'colsample_bytree': 1, 
          'colsample_bylevel': 1, 
          'reg_alpha': 0, 
          'reg_lambda': 1, 
          'scale_pos_weight': 1, 
          'base_score': 0.2, 
          'random_state': rnd, 
          'missing': None,
          'verbose': 0,
          'verbosity': 3}

In [79]:
def cv_testing(X_train_all, params, folds=5, dataframe=True):
    """Cross-validate an XGBoost classifier and return the per-fold kappa scores.

    The targets are read from the module-level ``y_train_all`` Series and the
    shuffle seed from the module-level ``rnd``; neither is a parameter.

    Args:
        X_train_all: Feature rows, aligned by position with ``y_train_all``.
        params: Keyword arguments for ``xgb.XGBClassifier``.
        folds: Number of KFold splits.
        dataframe: True when ``X_train_all`` is a DataFrame (rows are taken
            with ``.iloc``); False for arrays or sparse matrices that accept
            an integer index array directly.

    Returns:
        List with one quadratic weighted kappa score per fold. Each score is
        also printed, tab separated, as soon as its fold finishes.
    """
    scores = []

    # shuffle / random_state are passed by keyword: current scikit-learn
    # releases reject them as positional arguments.
    splitter = KFold(n_splits=folds, shuffle=True, random_state=rnd)

    for train_indx, test_indx in splitter.split(X_train_all):

        # flag dataframe determines whether to use iloc or "normal" masking for (sparse) arrays
        if dataframe:
            X_train, X_test = X_train_all.iloc[train_indx], X_train_all.iloc[test_indx]
        else:
            X_train, X_test = X_train_all[train_indx], X_train_all[test_indx]

        # KFold yields positions, not labels, and y_train_all is a Series
        # indexed by PetID, so targets are always selected positionally.
        y_train, y_test = y_train_all.iloc[train_indx], y_train_all.iloc[test_indx]

        clf = xgb.XGBClassifier(**params)
        clf.fit(X_train, y_train)
        prediction = clf.predict(X_test)

        scores.append(ml_metrics.quadratic_weighted_kappa(rater_a=y_test, rater_b=prediction))
        print("{:.3f}".format(scores[-1]), end="\t")

    print()

    return scores

In [80]:
# 10-fold cross-validation on the training rows of `result` (original
# features plus the Tfidf columns).
scores = cv_testing(X_train_all=X_train_df, folds=10, params=params, dataframe=True)

[09:15:06] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.391	[09:15:14] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.418	[09:15:22] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.391	[09:15:31] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.377	[09:15:39] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.366	[09:15:47] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.409	[09:15:55] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.373	[09:16:03] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.390	[09:16:11] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.357	[09:16:19] Tree method is selected to be 'hist', which uses a si

In [81]:
# Mean kappa over the folds. Log of earlier runs (presumably the score
# without Tfidf --> the score with the listed Tfidf setting):
# 0.390 --> 0.382 (dense dataframe of tfidf (all data) - 10 features)
# 0.390 --> 0.371 (dense dataframe of tfidf (all data) - 100 features)
# 0.390 --> 0.383 (dense dataframe of tfidf (all data) - 5 features)
np.mean(scores)

0.383182483471643

# Submitting test data

In [None]:
# clf = xgb.XGBClassifier(**params)
# clf.fit(X_train_all, y_train_all)
# prediction = clf.predict(X_test_all)

In [None]:
# submission = pd.DataFrame({'AdoptionSpeed': prediction.astype(int)}, index=X_test_all.index)
# submission.to_csv("submission.csv", index=True, index_label='PetID', header=['AdoptionSpeed'])