# Model 0.10 - Description
- Changed the _tree method_ parameter to _hist_ (a histogram-based method similar to LGBM's): increased speed and slightly better CV (0.386 --> 0.390)

In [144]:
import re
import ml_metrics
import string
import nltk
import scipy
import numpy as np
import pandas as pd
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from scipy.sparse import hstack
from sklearn import preprocessing
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem.porter import PorterStemmer

In [2]:
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used
# below; consider moving the import into the import cell and narrowing the filter.
import warnings
warnings.simplefilter("ignore")

In [3]:
rnd = 42  # seed shared by the KFold splitter and the XGBoost params, for repeatable CV scores

## Importing the data

In [4]:
# Load the competition tables: pets are keyed by PetID, breed labels by BreedID;
# the colour labels keep their default integer index.
df_train = pd.read_csv("../input/train/train.csv", index_col="PetID")
df_test = pd.read_csv("../input/test/test.csv", index_col="PetID")
df_breeds = pd.read_csv("../input/breed_labels.csv", index_col="BreedID")
df_colors = pd.read_csv("../input/color_labels.csv")

In [5]:
colors = df_colors['ColorID']
breeds = df_breeds.index

##  Tfidf Vectorizer (Better BoW technique)
- Extracts non-stopwords from descriptions
- Applies weighting to text depending on commonalities
- A word's weight increases with how often it appears in a description and decreases with how many other descriptions also contain it.

In [6]:
# tokenizer for description, to return list of word tokens
def tokenize(text):
    """Split ``text`` into word tokens and reduce each token to its Porter stem.

    Passed as the ``tokenizer`` callable of ``TfidfVectorizer`` in this notebook.

    Parameters
    ----------
    text : str
        Description text to tokenize.

    Returns
    -------
    list of str
        One stem per token, in the order produced by ``nltk.word_tokenize``.
        Punctuation tokens are kept (they are stemmed like any other token).
    """
    # Build the stemmer once per call instead of once per token.
    stemmer = PorterStemmer()
    return [stemmer.stem(token) for token in nltk.word_tokenize(text)]

In [7]:
# sample of tokenized text
# df_test is indexed by PetID, so select the 4th row explicitly by position;
# a bare [3] depends on pandas' deprecated positional fallback for integer keys.
text = df_test['Description'].iloc[3]
tokenize(text)

['malibu',
 ':',
 'femal',
 ',',
 'local',
 'mix',
 ',',
 '4-5',
 'month',
 ',',
 'vaccin',
 'and',
 'spay',
 '.',
 'strike',
 'featur',
 'with',
 'fade',
 'beig',
 'fur',
 'and',
 'jet-yellow',
 'eye',
 '.',
 'natur',
 'curiou',
 'explor',
 ',',
 'immedi',
 'taken',
 'to',
 'human',
 'interact',
 'and',
 'love',
 'to',
 'play',
 'around',
 '.',
 'come',
 'and',
 'meet',
 'our',
 'anim',
 'for',
 'adopt',
 'at',
 'selangor',
 '(',
 'tue',
 '-',
 'sun',
 ',',
 '10am',
 '-',
 '4pm',
 ')',
 'and',
 'fall',
 'in',
 'love',
 '!',
 'www.spca.org.mi']

In [8]:
# Translation table that deletes every ASCII punctuation character; it is reused
# by the description-cleaning cells further down.
remove_punc = str.maketrans('', '', string.punctuation)
text = text.translate(remove_punc).lower()
print(text)

malibu female local mix 45 months vaccinated and spayed striking features with faded beige fur and jetyellow eyes naturally curious explorer immediately taken to human interaction and loves to play around come and meet our animals for adoption at selangor tues  sun 10am  4pm and fall in love wwwspcaorgmy


In [9]:
# Prepare Tfidf: fit a vectorizer on the single cleaned sample description.
# NOTE(review): stop_words='english' is compared against the *stemmed* tokens that
# tokenize returns, so stop words whose stem differs (e.g. 'was' -> 'wa') are
# presumably not filtered — confirm against the sklearn stop_words documentation.
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
tfs = tfidf.fit_transform([text])

In [10]:
# create fake entry to test
features = tfidf.get_feature_names()
response = tfidf.transform(['Malibu is a wonderful kitty cat who fell and was taken immediately LOVE! \
                             where a human without hair cared for it. Now love it needs a new home and you \
                             might be able to help. Please help. Call 0800-I-CARE now!!!!'])

In [11]:
# output non-zero
for col in response.nonzero()[1]:
    print("{} \t {:.3f}".format(features[col], response[0, col]))

taken 	 0.354
malibu 	 0.354
love 	 0.707
immedi 	 0.354
human 	 0.354


### Implementation of Tfidf

In [12]:
# Map PetID -> lower-cased, punctuation-free description for the test set.
# Entries that are not strings (missing descriptions) are left out.
token_dict = {
    idx: desc.translate(remove_punc).lower()
    for idx, desc in df_test['Description'].items()
    if isinstance(desc, str)
}

In [13]:
# Prepare Tfidf again
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
tfs = tfidf.fit_transform(token_dict.values())

In [14]:
# Check feature count - 10,506 (too many??)
features = tfidf.get_feature_names()
len(features)

10506

In [15]:
# Map PetID -> lower-cased, punctuation-free description for the training set.
# Entries that are not strings (missing descriptions) are left out.
token_dict = {
    idx: desc.translate(remove_punc).lower()
    for idx, desc in df_train['Description'].items()
    if isinstance(desc, str)
}

In [16]:
# Prepare Tfidf for training data
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
tfs = tfidf.fit_transform(token_dict.values())

In [17]:
# Check training features count
features_training = tfidf.get_feature_names()
len(features_training)

22103

In [18]:
# intersecting features between test and training
combined = set(features) & set(features_training)
len(combined)

6408

In [19]:
# REPEAT: rebuild the PetID -> cleaned description mapping from the test set, so
# the vectorizer fitted next uses the test vocabulary again.
token_dict = {
    idx: desc.translate(remove_punc).lower()
    for idx, desc in df_test['Description'].items()
    if isinstance(desc, str)
}

In [20]:
# Prepare Tfidf for test again: fit on the cleaned test descriptions held in
# token_dict. This is the vectorizer later used to transform the combined data.
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english')
tfs = tfidf.fit_transform(token_dict.values())

In [21]:
# Combine test and training data
df_combined = pd.concat([df_test, df_train], sort=False)
df_combined['test'] = df_combined['AdoptionSpeed'].isna()

In [22]:
# Check the feature count of the vectorizer fitted on the test descriptions
features = tfidf.get_feature_names()
len(features)

10506

In [23]:
# ValueError where missing description is represented as np.nan (not a number - float object)
try:
    response = tfidf.transform(df_combined['Description'])
except ValueError:
    print("Description is not text/string object")

Description is not text/string object


In [24]:
# Replace np.nan with blank text
df_combined['Description'] = df_combined['Description'].fillna('')

In [25]:
# transform all data to test data tokens
response = tfidf.transform(df_combined['Description'])

In [26]:
# convert response to array (if becomes issue will use sparse array but might need to convert rest of data)
# NOTE: the dense float64 array of shape (18941, 10506) needs roughly 1.6 GB of memory.
response_arr = response.toarray()

In [44]:
# tested adding this to the df but got Memory error - will need to use sparse
response_arr.shape

(18941, 10506)

## Functions

In [30]:
def apply_word_flags(df, words):
    """Create one binary column per keyword that appears in the description.

    A description is split on whitespace and each piece is lower-cased before it
    is compared with ``words``; no punctuation is stripped, so ``"home."`` does
    not match ``"home"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'Description'`` column. It is not modified.
    words : iterable of str
        Keywords to flag; each becomes a column named after the keyword.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the keyword columns added (1 if present, else 0)
        and the ``'Description'`` column removed.
    """
    words = list(words)
    vocabulary = set(words)  # constant-time membership tests

    # Work on a copy so the caller's frame is not left with stray flag columns.
    flagged = df.copy()
    for word in words:
        flagged[word] = 0

    for i, desc in flagged['Description'].items():
        # Missing descriptions arrive as NaN/None: nothing to flag.
        if not isinstance(desc, str):
            continue
        present = {piece.lower() for piece in desc.split()} & vocabulary
        for word in present:
            flagged.at[i, word] = 1

    return flagged.drop(columns=['Description'])

In [31]:
keywords = ['home', 'good' , 'adopt', 'loving', 'give', 'looking', 'playful', 'rescued', 'cat', 'contact']

In [32]:
def apply_color_flags(df, colors):
    """Combine Color1, Color2 and Color3 into one binary column per colour id.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``'Color1'``, ``'Color2'`` and ``'Color3'`` columns, where 0
        means "no colour". It is not modified.
    colors : iterable
        Colour ids to create flags for; id ``c`` produces a column ``f'C{c}'``.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the ``C<id>`` columns added (1 if any of the three
        colour columns equals that id, else 0) and the three colour columns
        removed. Colour values not listed in ``colors`` get no column.
    """
    source_columns = ['Color1', 'Color2', 'Color3']

    # Work on a copy so the caller's frame is not left with stray flag columns.
    flagged = df.copy()
    palette = flagged[source_columns]

    for color_id in colors:
        if color_id == 0:
            # 0 is the "no colour" marker and never sets a flag.
            flagged[f'C{color_id}'] = 0
        else:
            # Vectorised row-wise test instead of iterating over every row.
            flagged[f'C{color_id}'] = palette.eq(color_id).any(axis=1).astype(int)

    return flagged.drop(columns=source_columns)

In [33]:
def create_breed_keywords(df):
    """Return the set of distinct words used in the ``'BreedName'`` column.

    Slashes and parentheses are deleted from each name (without inserting a
    space) before the name is split on whitespace.
    """
    return {
        word
        for name in df['BreedName']
        for word in re.sub(r'[/(/)]', '', name).split()
    }

In [34]:
def apply_breed_flags(df, keywords, breeds):
    """Create binary columns for keywords which appear in a pet's breed names.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``'Breed1'`` and ``'Breed2'`` id columns, where 0 means
        "no breed". The flag columns are added to this frame in place.
    keywords : collection of str
        Words to flag; each becomes a column named after the word.
    breeds : pandas.DataFrame
        Breed table indexed by breed id with a ``'BreedName'`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with one 0/1 column per keyword.

    Raises
    ------
    KeyError
        If a non-zero breed id is missing from ``breeds``.
    """
    for word in keywords:
        df[word] = 0

    # breed id -> keywords found in that breed's name; each name is cleaned and
    # split only once instead of once per row that references it.
    matched_by_breed = {}

    for i, pair in df[['Breed1', 'Breed2']].iterrows():
        for indx in pair:
            if indx == 0:
                continue
            if indx not in matched_by_breed:
                # Delete slashes and parentheses, as create_breed_keywords does.
                name = re.sub(r'[/(/)]', '', breeds.loc[indx, 'BreedName'])
                matched_by_breed[indx] = [w for w in name.split() if w in keywords]
            for word in matched_by_breed[indx]:
                df.at[i, word] = 1

    return df

## Preparing training data

In [251]:
# Combine test and training data; rows without an AdoptionSpeed are the test rows
df_combined = pd.concat([df_test, df_train], sort=False)
df_combined['test'] = df_combined['AdoptionSpeed'].isna()

# Rescuer: number of listings per rescuer across train and test
rescue_map = Counter(df_combined['RescuerID'])
rescuer_counts = df_combined['RescuerID'].map(rescue_map)

# Breeds: keywords are taken from the breeds that occur in the test set only
# (pd.concat replaces Series.append, which newer pandas versions no longer provide)
all_test_breeds = pd.concat([df_test['Breed1'], df_test['Breed2']])
df_test_breeds = df_breeds.loc[all_test_breeds[all_test_breeds > 0].unique(), :]
breed_keywords = create_breed_keywords(df_test_breeds)

# Prepare data for modelling 
df_combined['rescuer_counts'] = rescuer_counts
# df_combined = apply_word_flags(df_combined, keywords)
df_combined = apply_color_flags(df_combined, colors)
df_combined = apply_breed_flags(df_combined, breed_keywords, df_breeds)
df_combined = pd.get_dummies(df_combined, columns=['Gender',
                                                   'Vaccinated', 'Dewormed', 'Sterilized', 
                                                   'State'])
y_train_all = df_combined['AdoptionSpeed'][df_combined['test'] != 1]
X_all       = df_combined.drop(columns=['Name', 'RescuerID', 'AdoptionSpeed', 'Breed1', 'Breed2'])
# With apply_word_flags disabled nothing removes the free-text Description column,
# and the classifier cannot be fitted on text: drop it from the model matrices.
# errors='ignore' keeps the cell working when Description has already been removed.
X_train_all = X_all[X_all['test'] != 1].drop(columns=['test', 'Description'], errors='ignore')
X_test_all  = X_all[X_all['test'] == 1].drop(columns=['test', 'Description'], errors='ignore')

### Experimenting with sparse data arrays

In [74]:
# convert all data to numpy and convert to integers (scipy doesn't like mixed data types)
# The free-text Description column cannot be cast to int, so it is dropped first
# when it is still present (errors='ignore' makes the drop a no-op otherwise).
X_all_values = X_all.drop(columns=['Description'], errors='ignore').values.astype(int)
X_all_values

array([[ 1,  2,  2, ...,  0,  0,  0],
       [ 2, 24,  2, ...,  0,  0,  0],
       [ 2, 20,  2, ...,  0,  0,  0],
       ...,
       [ 2,  2,  3, ...,  0,  0,  0],
       [ 2,  9,  1, ...,  0,  0,  0],
       [ 1,  1,  2, ...,  0,  0,  0]])

In [148]:
# convert all data to sparse data array and remove test column
# Description (free text) is removed as well when present, since it cannot be cast to int.
X_all_sparse = scipy.sparse.csr_matrix(
    X_all.drop(columns=['test', 'Description'], errors='ignore').values.astype(int)
)

In [149]:
# shape of sparse array, compared to pandas dataframe
print(f"Pandas DataFrame:{X_all.shape}")
print(f"Sparse Array: \t {X_all_sparse.shape}")

Pandas DataFrame:(18941, 215)
Sparse Array: 	 (18941, 214)


In [150]:
# get index of test column
list(X_all.columns).index('test')

9

In [151]:
# Row positions of the test and training pets inside the combined matrix,
# used to slice the tabular-only sparse matrix into its two parts.
is_test_row = (X_all['test'] == 1).values
mask_test = np.where(is_test_row)[0]
mask_train = np.where(~is_test_row)[0]
X_train_sparse, X_test_sparse = X_all_sparse[mask_train], X_all_sparse[mask_test]

In [227]:
# same as 'cv_testing' from below, but iloc removed and important features returned
def cv_testing_sparse(X_train_all, params, folds=5):
    """K-fold cross-validate an XGBoost classifier on a row-indexable matrix.

    The targets come from the notebook-level ``y_train_all`` Series and the
    shuffling seed from the notebook-level ``rnd``. The score of every fold is
    printed as soon as it is available.

    Parameters
    ----------
    X_train_all : sparse matrix or array
        Training features; rows are selected with ``X_train_all[indices]`` and
        must be in the same order as ``y_train_all``.
    params : dict
        Keyword arguments passed to ``xgb.XGBClassifier``.
    folds : int, default 5
        Number of shuffled folds.

    Returns
    -------
    scores : list of float
        Quadratic weighted kappa of each fold.
    feature_importances : numpy.ndarray
        ``feature_importances_`` of the classifier fitted on the last fold only.
    """
    scores = []

    # shuffle / random_state are passed by keyword: they are keyword-only
    # arguments in current scikit-learn releases.
    splitter = KFold(n_splits=folds, shuffle=True, random_state=rnd)

    for train_indx, test_indx in splitter.split(X_train_all):

        X_train, X_test = X_train_all[train_indx], X_train_all[test_indx]
        # y_train_all is indexed by PetID, so fold rows are taken by position.
        y_train, y_test = y_train_all.iloc[train_indx], y_train_all.iloc[test_indx]

        clf = xgb.XGBClassifier(**params)
        clf.fit(X_train, y_train)
        prediction = clf.predict(X_test)

        scores.append(ml_metrics.quadratic_weighted_kappa(rater_a=y_test, rater_b=prediction))
        print("{:.3f}".format(scores[-1]), end="\t")

    print()

    return scores, clf.feature_importances_ # latest fold features

In [154]:
# scores are very much the same as using dataframe
# NOTE(review): `params` is only defined further down, in the "Test XGBoost model"
# section, so this cell fails on a fresh kernel unless that cell is run first.
scores, _ = cv_testing_sparse(X_train_all=X_train_sparse, folds=10, params=params)

[21:11:20] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.384	[21:11:23] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.422	[21:11:26] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.416	[21:11:30] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.387	[21:11:33] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.367	[21:11:36] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.416	[21:11:39] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.384	[21:11:42] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.368	[21:11:46] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.383	[21:11:49] Tree method is selected to be 'hist', which uses a si

In [155]:
# 0.390 (unchanged scores as expected)
np.mean(scores)

0.39048733629151666

### Combine all sparse data with description features

In [167]:
# check response 
print(response.shape)
response

(18941, 10506)


<18941x10506 sparse matrix of type '<class 'numpy.float64'>'
	with 499323 stored elements in Compressed Sparse Row format>

In [261]:
# check sparse data
X_all_sparse.shape

(18941, 214)

In [171]:
# scipy hstack
X_sparse_inc_desc = hstack([X_all_sparse, response]).tocsr()

In [172]:
# check new sparse dataset
print(X_sparse_inc_desc.shape)
X_sparse_inc_desc

(18941, 10720)


<18941x10720 sparse matrix of type '<class 'numpy.float64'>'
	with 863884 stored elements in Compressed Sparse Row format>

In [188]:
# Slice the stacked matrix (tabular + tf-idf columns) into training and test
# rows, using the row positions given by the 'test' flag of X_all.
is_test_row = (X_all['test'] == 1).values
mask_test = np.where(is_test_row)[0]
mask_train = np.where(~is_test_row)[0]
X_train_sparse, X_test_sparse = X_sparse_inc_desc[mask_train], X_sparse_inc_desc[mask_test]

In [215]:
# CV on the stacked matrix that includes the tf-idf columns; the logged timestamps
# show roughly a minute and a half per fold. The last fold's importances are kept.
scores, feature_importances = cv_testing_sparse(X_train_all=X_train_sparse, folds=10, params=params)

[21:40:25] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.413	[21:41:59] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.407	[21:43:31] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.367	[21:45:10] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.347	[21:46:49] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.361	[21:48:22] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.403	[21:49:50] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.362	[21:51:23] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.366	[21:52:55] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.375	[21:54:22] Tree method is selected to be 'hist', which uses a si

### Determine best features

In [216]:
# Adjusted model to get feature importances, however needs labelling
feature_importances

array([0.00257175, 0.02880362, 0.00524637, ..., 0.        , 0.        ,
       0.        ], dtype=float32)

In [217]:
features

['\x01iq',
 '\x01ǩbty9s̷\x02˂0dgmoj\x17qxbqu\x05gd',
 '\x03about',
 '\x04\x13ժ\x7fdgjoqf\x076',
 '\x06b8tw\x1bq眂̽ԣ\x15nwg\x7fnq\x06iqgkz\x0fl\x15h\x16gsŗm\x05\x04\x0ecm\x15f\x1a\x03esm\x05y\x05\x1bs\x05qttjywwh\x16k\x15ٯ\x04',
 '\x0fإw6k\x17pvcxqxnt',
 '\x10c\x0fusjn9tlodnb\x17w7ֹg\x16\x17g\x7fr\x15dl',
 '\x13',
 '\x14\x17',
 '\x14mh\x10',
 '\x175yz',
 '\x18t5dl',
 '\x1alys\x12k',
 '\x1avvh這mniysd\x17ilu\x18b\x06\x7f\x01\u07b9ab\x12mvit9xgovē\x11ẑlwet8r\x1b\x07zr',
 '0',
 '000',
 '02',
 '03',
 '06',
 '08',
 '09',
 '0for',
 '1',
 '10',
 '108',
 '109',
 '10am',
 '10in1',
 '10kg',
 '10lx3di',
 '10pm',
 '10second',
 '10th',
 '11',
 '1111',
 '113',
 '116',
 '11am',
 '11juli',
 '11month',
 '11pm',
 '11th',
 '12',
 '121',
 '1213',
 '122',
 '123',
 '12310',
 '128kg',
 '12916',
 '12hr',
 '12midnit',
 '12month',
 '12pm',
 '12pm2pm',
 '12th',
 '12yr',
 '13',
 '13615',
 '13814',
 '13th',
 '13yearsnow',
 '14',
 '142',
 '144',
 '145',
 '149',
 '14dogsreal',
 '14juli',
 '15',
 '156',
 '15715',
 '15ju

In [218]:
# Combine labels from original features and tfidf features
# The tabular labels must match the columns that went into the sparse matrix, so
# Description (free text, never part of it) is excluded when still present.
labels = list(X_all.drop(columns=['test', 'Description'], errors='ignore').columns) + features
len(labels)

10720

In [219]:
# Create dataframe combining labels with importance, sorted by descending importance
importance_df = pd.DataFrame(feature_importances,
                             index = labels,
                             columns=['importance']).sort_values('importance', ascending=False)

In [220]:
importance_df

Unnamed: 0,importance
rescuer_counts,0.031684
Age,0.028804
PhotoAmt,0.023763
adopt,0.020574
veri,0.016048
love,0.014299
home,0.013887
dog,0.010493
pleas,0.010390
wa,0.008950


In [223]:
# only positive importance makes up 1279, which will significantly improve speed at least
nonzero_importance = importance_df[importance_df['importance'] > 0]
nonzero_importance

Unnamed: 0,importance
rescuer_counts,0.031684
Age,0.028804
PhotoAmt,0.023763
adopt,0.020574
veri,0.016048
love,0.014299
home,0.013887
dog,0.010493
pleas,0.010390
wa,0.008950


### Rerun classifier for ALL training data and extract feature importances from run on test data

In [229]:
# Create classifier to train all data, to pick out important features
clf = xgb.XGBClassifier(**params)
clf.fit(X_train_sparse, y_train_all)

[17:56:33] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.


XGBClassifier(base_score=0.2, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.2, max_delta_step=0,
       max_depth=4, min_child_weight=1, missing=None, n_estimators=200,
       n_jobs=3, nthread=None, objective='multi:softprob', random_state=42,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.8, tree_method='hist', verbose=0)

In [230]:
# extract important features
importance_df = pd.DataFrame(clf.feature_importances_,
                             index = labels,
                             columns=['importance']).sort_values('importance', ascending=False)

In [232]:
importance_df.head()

Unnamed: 0,importance
Age,0.027684
rescuer_counts,0.027479
PhotoAmt,0.024813
adopt,0.019994
home,0.016098


In [234]:
# only positive importance makes up 1305
nonzero_importance = importance_df[importance_df['importance'] > 0]
nonzero_importance

Unnamed: 0,importance
Age,0.027684
rescuer_counts,0.027479
PhotoAmt,0.024813
adopt,0.019994
home,0.016098
love,0.013534
veri,0.012714
pleas,0.011996
dog,0.010971
play,0.009125


### Rebuild Tfidf using only top features
I was going to remove the above features from the tfidf vector, but for now will vary the amount arbitrarily, based on count. This is perhaps something I can implement later on, when and if I have time to do so.

In [328]:
# Map PetID -> lower-cased, punctuation-free description for the training set
# (the corpus used for the reduced-vocabulary vectorizer below).
# Entries that are not strings (missing descriptions) are left out.
token_dict = {
    idx: desc.translate(remove_punc).lower()
    for idx, desc in df_train['Description'].items()
    if isinstance(desc, str)
}

In [329]:
# Prepare Tfidf on the training descriptions held in token_dict,
# varying the max_features parameter to cap the vocabulary size
tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words='english', max_features=10)
tfs = tfidf.fit_transform(token_dict.values())

In [330]:
# Check training features count
features = tfidf.get_feature_names()
len(features)

10

In [331]:
# transform all data to test data tokens
response = tfidf.transform(df_combined['Description'].fillna(''))

In [332]:
# shape of response
response.shape

(18941, 10)

In [344]:
# convert all data to sparse data array and remove test column
X_all_sparse = scipy.sparse.csr_matrix(np.array(X_all.drop(columns=['test', 'Description']).values.astype(int)))

In [334]:
# scipy hstack
X_sparse_inc_desc = hstack([X_all_sparse, response]).tocsr()

In [335]:
# check new sparse dataset
print(X_sparse_inc_desc.shape)
X_sparse_inc_desc

(18941, 214)


<18941x214 sparse matrix of type '<class 'numpy.float64'>'
	with 391127 stored elements in Compressed Sparse Row format>

In [345]:
# split all data into test and training
# USE_TFIDF = False reproduces the last recorded run ("no tfidf", CV 0.386), which
# trains on the tabular features only. Set it to True to train on the matrix that
# also contains the tf-idf columns (X_sparse_inc_desc); previously that matrix was
# built but silently ignored by this cell.
USE_TFIDF = False
X_model_sparse = X_sparse_inc_desc if USE_TFIDF else X_all_sparse

mask_test = np.where(X_all['test'] == 1)[0]
mask_train = np.where(X_all['test'] == 0)[0]
X_train_sparse = X_model_sparse[mask_train]
X_test_sparse  = X_model_sparse[mask_test]

## Test XGBoost model

In [337]:
# Keyword arguments handed to xgb.XGBClassifier for every CV run and the final fit.
# NOTE(review): 'silent' and 'verbosity' look contradictory (quiet vs. level 3) —
# confirm which of them the installed xgboost version honours.
params = dict(
    # tree shape and boosting schedule
    max_depth=4,
    learning_rate=0.2,
    n_estimators=200,
    silent=True,
    # learning task and tree construction
    objective='multi:softprob',
    booster='gbtree',
    tree_method='hist',
    n_jobs=3,
    # split constraints and sampling
    gamma=0,
    min_child_weight=1,
    max_delta_step=0,
    subsample=0.8,
    colsample_bytree=1,
    colsample_bylevel=1,
    # regularisation and class balance
    reg_alpha=0,
    reg_lambda=1,
    scale_pos_weight=1,
    base_score=0.2,
    # reproducibility, missing values and logging
    random_state=rnd,
    missing=None,
    verbose=0,
    verbosity=3,
)

In [338]:
def cv_testing(X_train_all, params, folds=5, dataframe=True):
    """K-fold cross-validate an XGBoost classifier and return the fold scores.

    The targets come from the notebook-level ``y_train_all`` Series and the
    shuffling seed from the notebook-level ``rnd``. The score of every fold is
    printed as soon as it is available.

    Parameters
    ----------
    X_train_all : pandas.DataFrame, sparse matrix or array
        Training features, with rows in the same order as ``y_train_all``.
    params : dict
        Keyword arguments passed to ``xgb.XGBClassifier``.
    folds : int, default 5
        Number of shuffled folds.
    dataframe : bool, default True
        True selects feature rows with ``.iloc`` (pandas input); False selects
        them with plain ``X[indices]`` (sparse matrices / arrays).

    Returns
    -------
    list of float
        Quadratic weighted kappa of each fold.
    """
    scores = []

    # shuffle / random_state are passed by keyword: they are keyword-only
    # arguments in current scikit-learn releases.
    splitter = KFold(n_splits=folds, shuffle=True, random_state=rnd)

    for train_indx, test_indx in splitter.split(X_train_all):

        if dataframe:
            X_train, X_test = X_train_all.iloc[train_indx], X_train_all.iloc[test_indx]
        else:
            X_train, X_test = X_train_all[train_indx], X_train_all[test_indx]

        # y_train_all is always a PetID-indexed Series, whatever the type of the
        # feature matrix, so its fold rows are taken by position in both modes.
        y_train, y_test = y_train_all.iloc[train_indx], y_train_all.iloc[test_indx]

        clf = xgb.XGBClassifier(**params)
        clf.fit(X_train, y_train)
        prediction = clf.predict(X_test)

        scores.append(ml_metrics.quadratic_weighted_kappa(rater_a=y_test, rater_b=prediction))
        print("{:.3f}".format(scores[-1]), end="\t")

    print()

    return scores

In [346]:
scores = cv_testing(X_train_all=X_train_sparse, folds=10, params=params, dataframe=False)

[16:58:48] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.389	[16:58:50] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.421	[16:58:53] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.413	[16:58:56] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.384	[16:58:58] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.367	[16:59:01] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.407	[16:59:04] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.368	[16:59:06] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.385	[16:59:09] Tree method is selected to be 'hist', which uses a single updater grow_fast_histmaker.
0.362	[16:59:12] Tree method is selected to be 'hist', which uses a si

In [349]:
# 0.386 --> 0.390 (changed tree method to 'hist')
# 0.390 --> 0.374 (tfidf features 10)
# 0.390 --> 0.366 (tfidf features 100)
# 0.390 --> 0.370 (tfidf features 100, using training data corpus)
# 0.390 --> 0.374 (tfidf features 10, using training data corpus)
# 0.390 --> 0.386 (no tfidf?! note: also no keyword flags)
np.mean(scores)

0.38597978973077507

The results of adding tfidf features appear to be backwards (the CV score drops when they are included). This could be a result of multiple factors:
 - Made a mistake somewhere
 - Tfidf is not suitable
 - Model is overfitting during training

Will clean up and reassess.

# Submitting test data

In [40]:
# Fit on the full training DataFrame (X_train_all, i.e. without the sparse tf-idf
# columns) and predict the adoption speed of every test pet.
clf = xgb.XGBClassifier(**params)
clf.fit(X_train_all, y_train_all)
prediction = clf.predict(X_test_all)

In [41]:
# Write one row per test pet: PetID (the index of X_test_all) and the predicted
# AdoptionSpeed cast to integer.
submission = pd.DataFrame({'AdoptionSpeed': prediction.astype(int)}, index=X_test_all.index)
submission.to_csv("submission.csv", index=True, index_label='PetID', header=['AdoptionSpeed'])