In [1]:
import os
import re
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import seaborn as sns
import datetime
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, f_classif

In [2]:
df_train = pd.read_csv('data/Hackerearth/train.csv')
df_test = pd.read_csv('data/Hackerearth/test.csv')
print(df_train.shape)
print(df_test.shape)

(18834, 11)
(8072, 9)


In [3]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18834 entries, 0 to 18833
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   pet_id          18834 non-null  object 
 1   issue_date      18834 non-null  object 
 2   listing_date    18834 non-null  object 
 3   condition       17357 non-null  float64
 4   color_type      18834 non-null  object 
 5   length(m)       18834 non-null  float64
 6   height(cm)      18834 non-null  float64
 7   X1              18834 non-null  int64  
 8   X2              18834 non-null  int64  
 9   breed_category  18834 non-null  float64
 10  pet_category    18834 non-null  int64  
dtypes: float64(4), int64(3), object(4)
memory usage: 1.6+ MB


In [4]:
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8072 entries, 0 to 8071
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   pet_id        8072 non-null   object 
 1   issue_date    8072 non-null   object 
 2   listing_date  8072 non-null   object 
 3   condition     7453 non-null   float64
 4   color_type    8072 non-null   object 
 5   length(m)     8072 non-null   float64
 6   height(cm)    8072 non-null   float64
 7   X1            8072 non-null   int64  
 8   X2            8072 non-null   int64  
dtypes: float64(3), int64(2), object(4)
memory usage: 567.7+ KB


In [5]:
df_train.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category
0,ANSL_69903,2016-07-10 00:00:00,2016-09-21 16:25:00,2.0,Brown Tabby,0.8,7.78,13,9,0.0,1
1,ANSL_66892,2013-11-21 00:00:00,2018-12-27 17:47:00,1.0,White,0.72,14.19,13,9,0.0,2
2,ANSL_69750,2014-09-28 00:00:00,2016-10-19 08:24:00,,Brown,0.15,40.9,15,4,2.0,4
3,ANSL_71623,2016-12-31 00:00:00,2019-01-25 18:30:00,1.0,White,0.62,17.82,0,1,0.0,2
4,ANSL_57969,2017-09-28 00:00:00,2017-11-19 09:38:00,2.0,Black,0.5,11.06,18,4,0.0,1


In [6]:
df_train['length(m)'] = df_train['length(m)'].apply(lambda each: each * 100)
df_test['length(m)'] = df_test['length(m)'].apply(lambda each: each * 100)

In [7]:
df_train.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category
0,ANSL_69903,2016-07-10 00:00:00,2016-09-21 16:25:00,2.0,Brown Tabby,80.0,7.78,13,9,0.0,1
1,ANSL_66892,2013-11-21 00:00:00,2018-12-27 17:47:00,1.0,White,72.0,14.19,13,9,0.0,2
2,ANSL_69750,2014-09-28 00:00:00,2016-10-19 08:24:00,,Brown,15.0,40.9,15,4,2.0,4
3,ANSL_71623,2016-12-31 00:00:00,2019-01-25 18:30:00,1.0,White,62.0,17.82,0,1,0.0,2
4,ANSL_57969,2017-09-28 00:00:00,2017-11-19 09:38:00,2.0,Black,50.0,11.06,18,4,0.0,1


In [8]:
df_train.isna().sum()

pet_id               0
issue_date           0
listing_date         0
condition         1477
color_type           0
length(m)            0
height(cm)           0
X1                   0
X2                   0
breed_category       0
pet_category         0
dtype: int64

In [9]:
df_train.condition.value_counts()

1.0    6819
0.0    6281
2.0    4257
Name: condition, dtype: int64

In [10]:
df_train[df_train['length(m)'] == 0]

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category
83,ANSL_73664,2017-10-04 00:00:00,2018-12-02 08:09:00,1.0,Black,0.0,32.98,7,1,0.0,2
91,ANSL_72100,2017-06-09 00:00:00,2018-07-03 14:53:00,0.0,Tan,0.0,22.20,0,7,1.0,2
174,ANSL_55528,2016-07-07 00:00:00,2017-09-28 14:19:00,0.0,Black,0.0,17.10,0,1,1.0,2
220,ANSL_59822,2018-05-11 00:00:00,2018-09-22 17:22:00,0.0,Brown,0.0,20.55,0,1,1.0,2
404,ANSL_58956,2018-07-14 00:00:00,2018-10-11 17:12:00,1.0,Lynx Point,0.0,39.49,7,1,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...
18043,ANSL_53008,2017-11-30 00:00:00,2018-03-02 13:26:00,1.0,Black,0.0,40.46,0,1,0.0,1
18078,ANSL_53838,2013-04-25 00:00:00,2017-09-04 16:11:00,0.0,Tricolor,0.0,27.51,0,7,1.0,2
18174,ANSL_69101,2018-05-31 00:00:00,2018-06-24 15:11:00,,Orange Tabby,0.0,35.10,18,4,2.0,1
18368,ANSL_64466,2017-09-26 00:00:00,2017-12-27 12:13:00,2.0,Black,0.0,15.55,13,9,0.0,1


In [11]:
df_train['length(m)'] = df_train['length(m)'].replace(0,df_train['length(m)'].mean())
df_test['length(m)'] = df_test['length(m)'].replace(0,df_test['length(m)'].mean())

In [12]:
df_train.condition.fillna(3.0, inplace = True)
df_test.condition.fillna(3.0, inplace = True)

In [13]:
df_test.isna().sum()

pet_id          0
issue_date      0
listing_date    0
condition       0
color_type      0
length(m)       0
height(cm)      0
X1              0
X2              0
dtype: int64

In [16]:
df_train.color_type.value_counts().sort_values(ascending = False)

Black                4620
White                2453
Brown                1791
Brown Tabby          1687
Tan                  1349
Blue                  852
Orange Tabby          791
Red                   526
Brown Brindle         496
Tricolor              469
Blue Tabby            386
Tortie                366
Calico                343
Gray                  307
Chocolate             259
Torbie                242
Cream Tabby           191
Sable                 167
Cream                 162
Fawn                  159
Yellow                143
Buff                  125
Lynx Point            117
Blue Merle            104
Seal Point             78
Black Brindle          66
Gray Tabby             65
Black Tabby            55
Flame Point            52
Brown Merle            39
Orange                 39
Black Smoke            32
Gold                   31
Tortie Point           26
Silver                 24
Red Tick               23
Blue Tick              21
Blue Point             20
Lilac Point 

In [19]:
df_train[df_train['breed_category'] == 0.0].pet_category.value_counts()

2    5692
1    3195
4      83
0      30
Name: pet_category, dtype: int64

In [20]:
df_train[df_train['breed_category'] == 1.0].pet_category.value_counts()

2    4869
1    3406
4      75
0       7
Name: pet_category, dtype: int64

In [21]:
df_train[df_train['breed_category'] == 2.0].pet_category.value_counts()

4    783
1    583
2     60
0     51
Name: pet_category, dtype: int64

In [32]:
print(df_train[df_train['condition'] == 0.0].breed_category.value_counts())
print(df_train[df_train['condition'] == 1.0].breed_category.value_counts())
print(df_train[df_train['condition'] == 2.0].breed_category.value_counts())
print(df_train[df_train['condition'] == 3.0].breed_category.value_counts())

1.0    6281
Name: breed_category, dtype: int64
0.0    6819
Name: breed_category, dtype: int64
0.0    2181
1.0    2076
Name: breed_category, dtype: int64
2.0    1477
Name: breed_category, dtype: int64


In [53]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df_train['color_type'] = le.fit_transform(df_train['color_type'])
df_test['color_type'] = le.transform(df_test['color_type'])

In [54]:
def diffdates(rows):
    temp1 = rows['issue_date']
    temp2 = rows['listing_date']
    search = r'\d{4}-\d{2}-\d{2}'
    temp1match = re.search(search, temp1)
    temp2match = re.search(search, temp2)
    return (datetime.datetime.strptime(temp2match.group(), '%Y-%m-%d').date() - datetime.datetime.strptime(temp1match.group(), '%Y-%m-%d').date()).days

In [56]:
df_test['duration'] = df_test.apply(diffdates, axis = 'columns')
df_test.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,duration
0,ANSL_75005,2005-08-17 00:00:00,2017-09-07 15:35:00,0.0,2,87.0,42.73,0,7,4404
1,ANSL_76663,2018-11-15 00:00:00,2019-05-08 17:24:00,1.0,38,6.0,6.71,0,1,174
2,ANSL_58259,2012-10-11 00:00:00,2018-04-02 16:51:00,1.0,2,24.0,41.21,0,7,1999
3,ANSL_67171,2015-02-13 00:00:00,2018-04-06 07:25:00,1.0,2,29.0,8.46,7,1,1148
4,ANSL_72871,2017-01-18 00:00:00,2018-04-26 13:42:00,1.0,15,71.0,30.92,0,7,463


In [55]:
df_train['duration'] = df_train.apply(diffdates, axis = 'columns')
df_train.head()

Unnamed: 0,pet_id,issue_date,listing_date,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category,duration
0,ANSL_69903,2016-07-10 00:00:00,2016-09-21 16:25:00,2.0,18,80.0,7.78,13,9,0.0,1,73
1,ANSL_66892,2013-11-21 00:00:00,2018-12-27 17:47:00,1.0,53,72.0,14.19,13,9,0.0,2,1862
2,ANSL_69750,2014-09-28 00:00:00,2016-10-19 08:24:00,3.0,15,15.0,40.9,15,4,2.0,4,752
3,ANSL_71623,2016-12-31 00:00:00,2019-01-25 18:30:00,1.0,53,62.0,17.82,0,1,0.0,2,755
4,ANSL_57969,2017-09-28 00:00:00,2017-11-19 09:38:00,2.0,2,50.0,11.06,18,4,0.0,1,52


In [76]:
dfd = df_train.copy()
dfd['diff'] = abs(dfd['X1'] - dfd['X2'])
dfd.corr()[dfd.corr() > 0.2]

Unnamed: 0,condition,color_type,length(m),height(cm),X1,X2,breed_category,pet_category,duration,diff
condition,1.0,,,,0.457061,0.359141,,,,0.367725
color_type,,1.0,,,,,,,,
length(m),,,1.0,,,,,,,
height(cm),,,,1.0,,,,,,
X1,0.457061,,,,1.0,0.584396,0.240729,,,0.504951
X2,0.359141,,,,0.584396,1.0,,,,0.44859
breed_category,,,,,0.240729,,1.0,0.20923,,
pet_category,,,,,,,0.20923,1.0,,0.201639
duration,,,,,,,,,1.0,
diff,0.367725,,,,0.504951,0.44859,,0.201639,,1.0


In [77]:
df = df_test.copy()
df_train.drop(['pet_id', 'issue_date', 'listing_date'], axis = 1, inplace = True)
df_test.drop(['pet_id', 'issue_date', 'listing_date'], axis = 1, inplace = True)

In [78]:
x_train = df_train.copy()
x_train = x_train.drop(['breed_category','pet_category'], axis = 1)
x_test = df_test.copy()
y_breed = df_train['breed_category']
y_pet = df_train['pet_category']

In [79]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.fit_transform(x_test)

In [80]:
import xgboost as xgb
xgb_clf = xgb.XGBClassifier(tree_method = "exact", predictor = "cpu_predictor",
                            objective = "multi:softmax")

# Create parameter grid
parameters = {"learning_rate": [0.1, 0.01, 0.001],
               "gamma" : [0.01, 0.1, 0.3, 0.5, 1, 1.5, 2],
               "max_depth": [2, 4, 7, 10],
               "colsample_bytree": [0.3, 0.6, 0.8, 1.0],
               "subsample": [0.2, 0.4, 0.5, 0.6, 0.7],
               "reg_alpha": [0, 0.5, 1],
               "reg_lambda": [1, 1.5, 2, 3, 4.5],
               "min_child_weight": [1, 3, 5, 7],
               "n_estimators": [100, 250, 500, 1000]}

from sklearn.model_selection import RandomizedSearchCV
xgb_rscv = RandomizedSearchCV(xgb_clf, param_distributions = parameters, scoring = "f1_micro",
                             cv = 10, verbose = 3, random_state = 40 )
model_xgboost = xgb_rscv.fit(x_train, y_pet)

print("Learning Rate: ", model_xgboost.best_estimator_.get_params()["learning_rate"])
print("Gamma: ", model_xgboost.best_estimator_.get_params()["gamma"])
print("Max Depth: ", model_xgboost.best_estimator_.get_params()["max_depth"])
print("Subsample: ", model_xgboost.best_estimator_.get_params()["subsample"])
print("Max Features at Split: ", model_xgboost.best_estimator_.get_params()["colsample_bytree"])
print("Alpha: ", model_xgboost.best_estimator_.get_params()["reg_alpha"])
print("Lamda: ", model_xgboost.best_estimator_.get_params()["reg_lambda"])
print("Minimum Sum of the Instance Weight Hessian to Make a Child: ",
      model_xgboost.best_estimator_.get_params()["min_child_weight"])
print("Number of Trees: ", model_xgboost.best_estimator_.get_params()["n_estimators"])

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] subsample=0.6, reg_lambda=1, reg_alpha=0, n_estimators=100, min_child_weight=5, max_depth=2, learning_rate=0.1, gamma=1.5, colsample_bytree=1.0 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  subsample=0.6, reg_lambda=1, reg_alpha=0, n_estimators=100, min_child_weight=5, max_depth=2, learning_rate=0.1, gamma=1.5, colsample_bytree=1.0, score=0.878, total=   1.8s
[CV] subsample=0.6, reg_lambda=1, reg_alpha=0, n_estimators=100, min_child_weight=5, max_depth=2, learning_rate=0.1, gamma=1.5, colsample_bytree=1.0 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.7s remaining:    0.0s


[CV]  subsample=0.6, reg_lambda=1, reg_alpha=0, n_estimators=100, min_child_weight=5, max_depth=2, learning_rate=0.1, gamma=1.5, colsample_bytree=1.0, score=0.869, total=   1.8s
[CV] subsample=0.6, reg_lambda=1, reg_alpha=0, n_estimators=100, min_child_weight=5, max_depth=2, learning_rate=0.1, gamma=1.5, colsample_bytree=1.0 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    3.5s remaining:    0.0s


[CV]  subsample=0.6, reg_lambda=1, reg_alpha=0, n_estimators=100, min_child_weight=5, max_depth=2, learning_rate=0.1, gamma=1.5, colsample_bytree=1.0, score=0.876, total=   1.8s
[CV] subsample=0.6, reg_lambda=1, reg_alpha=0, n_estimators=100, min_child_weight=5, max_depth=2, learning_rate=0.1, gamma=1.5, colsample_bytree=1.0 
[CV]  subsample=0.6, reg_lambda=1, reg_alpha=0, n_estimators=100, min_child_weight=5, max_depth=2, learning_rate=0.1, gamma=1.5, colsample_bytree=1.0, score=0.889, total=   1.8s
[CV] subsample=0.6, reg_lambda=1, reg_alpha=0, n_estimators=100, min_child_weight=5, max_depth=2, learning_rate=0.1, gamma=1.5, colsample_bytree=1.0 
[CV]  subsample=0.6, reg_lambda=1, reg_alpha=0, n_estimators=100, min_child_weight=5, max_depth=2, learning_rate=0.1, gamma=1.5, colsample_bytree=1.0, score=0.885, total=   1.9s
[CV] subsample=0.6, reg_lambda=1, reg_alpha=0, n_estimators=100, min_child_weight=5, max_depth=2, learning_rate=0.1, gamma=1.5, colsample_bytree=1.0 
[CV]  subsample=

[CV]  subsample=0.2, reg_lambda=4.5, reg_alpha=0.5, n_estimators=250, min_child_weight=1, max_depth=2, learning_rate=0.001, gamma=0.5, colsample_bytree=1.0, score=0.773, total=   4.1s
[CV] subsample=0.2, reg_lambda=4.5, reg_alpha=0.5, n_estimators=250, min_child_weight=1, max_depth=2, learning_rate=0.001, gamma=0.5, colsample_bytree=1.0 
[CV]  subsample=0.2, reg_lambda=4.5, reg_alpha=0.5, n_estimators=250, min_child_weight=1, max_depth=2, learning_rate=0.001, gamma=0.5, colsample_bytree=1.0, score=0.768, total=   3.9s
[CV] subsample=0.2, reg_lambda=4.5, reg_alpha=0.5, n_estimators=250, min_child_weight=1, max_depth=2, learning_rate=0.001, gamma=0.5, colsample_bytree=1.0 
[CV]  subsample=0.2, reg_lambda=4.5, reg_alpha=0.5, n_estimators=250, min_child_weight=1, max_depth=2, learning_rate=0.001, gamma=0.5, colsample_bytree=1.0, score=0.764, total=   4.1s
[CV] subsample=0.7, reg_lambda=1, reg_alpha=0, n_estimators=100, min_child_weight=7, max_depth=4, learning_rate=0.1, gamma=0.1, colsampl

[CV]  subsample=0.7, reg_lambda=3, reg_alpha=1, n_estimators=100, min_child_weight=3, max_depth=10, learning_rate=0.001, gamma=0.01, colsample_bytree=0.6, score=0.887, total=   3.2s
[CV] subsample=0.7, reg_lambda=3, reg_alpha=1, n_estimators=100, min_child_weight=3, max_depth=10, learning_rate=0.001, gamma=0.01, colsample_bytree=0.6 
[CV]  subsample=0.7, reg_lambda=3, reg_alpha=1, n_estimators=100, min_child_weight=3, max_depth=10, learning_rate=0.001, gamma=0.01, colsample_bytree=0.6, score=0.903, total=   3.2s
[CV] subsample=0.7, reg_lambda=3, reg_alpha=1, n_estimators=100, min_child_weight=3, max_depth=10, learning_rate=0.001, gamma=0.01, colsample_bytree=0.6 
[CV]  subsample=0.7, reg_lambda=3, reg_alpha=1, n_estimators=100, min_child_weight=3, max_depth=10, learning_rate=0.001, gamma=0.01, colsample_bytree=0.6, score=0.900, total=   4.0s
[CV] subsample=0.7, reg_lambda=3, reg_alpha=1, n_estimators=100, min_child_weight=3, max_depth=10, learning_rate=0.001, gamma=0.01, colsample_bytr

[CV]  subsample=0.4, reg_lambda=1.5, reg_alpha=0, n_estimators=1000, min_child_weight=1, max_depth=7, learning_rate=0.01, gamma=0.3, colsample_bytree=0.6, score=0.900, total=  32.3s
[CV] subsample=0.4, reg_lambda=1.5, reg_alpha=0, n_estimators=1000, min_child_weight=1, max_depth=7, learning_rate=0.01, gamma=0.3, colsample_bytree=0.6 
[CV]  subsample=0.4, reg_lambda=1.5, reg_alpha=0, n_estimators=1000, min_child_weight=1, max_depth=7, learning_rate=0.01, gamma=0.3, colsample_bytree=0.6, score=0.904, total=  34.6s
[CV] subsample=0.4, reg_lambda=1.5, reg_alpha=0, n_estimators=1000, min_child_weight=1, max_depth=7, learning_rate=0.01, gamma=0.3, colsample_bytree=0.6 
[CV]  subsample=0.4, reg_lambda=1.5, reg_alpha=0, n_estimators=1000, min_child_weight=1, max_depth=7, learning_rate=0.01, gamma=0.3, colsample_bytree=0.6, score=0.901, total=  30.7s
[CV] subsample=0.7, reg_lambda=3, reg_alpha=0, n_estimators=250, min_child_weight=7, max_depth=10, learning_rate=0.01, gamma=0.3, colsample_bytree

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 16.6min finished


Learning Rate:  0.01
Gamma:  0.3
Max Depth:  7
Subsample:  0.4
Max Features at Split:  0.6
Alpha:  0
Lamda:  1.5
Minimum Sum of the Instance Weight Hessian to Make a Child:  1
Number of Trees:  1000


In [83]:
import xgboost as xgb
xgb_clf = xgb.XGBClassifier(tree_method = "exact", predictor = "cpu_predictor",
                            objective = "multi:softmax")

# Create parameter grid
parameters = {"learning_rate": [0.1, 0.01, 0.001],
               "gamma" : [0.01, 0.1, 0.3, 0.5, 1, 1.5, 2],
               "max_depth": [2, 4, 7, 10],
               "colsample_bytree": [0.3, 0.6, 0.8, 1.0],
               "subsample": [0.2, 0.4, 0.5, 0.6, 0.7],
               "reg_alpha": [0, 0.5, 1],
               "reg_lambda": [1, 1.5, 2, 3, 4.5],
               "min_child_weight": [1, 3, 5, 7],
               "n_estimators": [100, 250, 500, 1000]}

from sklearn.model_selection import RandomizedSearchCV
xgb_rscv = RandomizedSearchCV(xgb_clf, param_distributions = parameters, scoring = "f1_micro",
                             cv = 10, verbose = 3, random_state = 40 )
model_xgboost = xgb_rscv.fit(x_train, y_breed)

print("Learning Rate: ", model_xgboost.best_estimator_.get_params()["learning_rate"])
print("Gamma: ", model_xgboost.best_estimator_.get_params()["gamma"])
print("Max Depth: ", model_xgboost.best_estimator_.get_params()["max_depth"])
print("Subsample: ", model_xgboost.best_estimator_.get_params()["subsample"])
print("Max Features at Split: ", model_xgboost.best_estimator_.get_params()["colsample_bytree"])
print("Alpha: ", model_xgboost.best_estimator_.get_params()["reg_alpha"])
print("Lamda: ", model_xgboost.best_estimator_.get_params()["reg_lambda"])
print("Minimum Sum of the Instance Weight Hessian to Make a Child: ",
      model_xgboost.best_estimator_.get_params()["min_child_weight"])
print("Number of Trees: ", model_xgboost.best_estimator_.get_params()["n_estimators"])

Fitting 10 folds for each of 10 candidates, totalling 100 fits
[CV] subsample=0.6, reg_lambda=1, reg_alpha=0, n_estimators=100, min_child_weight=5, max_depth=2, learning_rate=0.1, gamma=1.5, colsample_bytree=1.0 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV]  subsample=0.6, reg_lambda=1, reg_alpha=0, n_estimators=100, min_child_weight=5, max_depth=2, learning_rate=0.1, gamma=1.5, colsample_bytree=1.0, score=0.904, total=   1.3s
[CV] subsample=0.6, reg_lambda=1, reg_alpha=0, n_estimators=100, min_child_weight=5, max_depth=2, learning_rate=0.1, gamma=1.5, colsample_bytree=1.0 


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.2s remaining:    0.0s


[CV]  subsample=0.6, reg_lambda=1, reg_alpha=0, n_estimators=100, min_child_weight=5, max_depth=2, learning_rate=0.1, gamma=1.5, colsample_bytree=1.0, score=0.893, total=   1.3s
[CV] subsample=0.6, reg_lambda=1, reg_alpha=0, n_estimators=100, min_child_weight=5, max_depth=2, learning_rate=0.1, gamma=1.5, colsample_bytree=1.0 


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.4s remaining:    0.0s


[CV]  subsample=0.6, reg_lambda=1, reg_alpha=0, n_estimators=100, min_child_weight=5, max_depth=2, learning_rate=0.1, gamma=1.5, colsample_bytree=1.0, score=0.902, total=   1.3s
[CV] subsample=0.6, reg_lambda=1, reg_alpha=0, n_estimators=100, min_child_weight=5, max_depth=2, learning_rate=0.1, gamma=1.5, colsample_bytree=1.0 
[CV]  subsample=0.6, reg_lambda=1, reg_alpha=0, n_estimators=100, min_child_weight=5, max_depth=2, learning_rate=0.1, gamma=1.5, colsample_bytree=1.0, score=0.908, total=   1.3s
[CV] subsample=0.6, reg_lambda=1, reg_alpha=0, n_estimators=100, min_child_weight=5, max_depth=2, learning_rate=0.1, gamma=1.5, colsample_bytree=1.0 
[CV]  subsample=0.6, reg_lambda=1, reg_alpha=0, n_estimators=100, min_child_weight=5, max_depth=2, learning_rate=0.1, gamma=1.5, colsample_bytree=1.0, score=0.919, total=   1.2s
[CV] subsample=0.6, reg_lambda=1, reg_alpha=0, n_estimators=100, min_child_weight=5, max_depth=2, learning_rate=0.1, gamma=1.5, colsample_bytree=1.0 
[CV]  subsample=

[CV]  subsample=0.2, reg_lambda=4.5, reg_alpha=0.5, n_estimators=250, min_child_weight=1, max_depth=2, learning_rate=0.001, gamma=0.5, colsample_bytree=1.0, score=0.898, total=   2.8s
[CV] subsample=0.2, reg_lambda=4.5, reg_alpha=0.5, n_estimators=250, min_child_weight=1, max_depth=2, learning_rate=0.001, gamma=0.5, colsample_bytree=1.0 
[CV]  subsample=0.2, reg_lambda=4.5, reg_alpha=0.5, n_estimators=250, min_child_weight=1, max_depth=2, learning_rate=0.001, gamma=0.5, colsample_bytree=1.0, score=0.892, total=   2.8s
[CV] subsample=0.2, reg_lambda=4.5, reg_alpha=0.5, n_estimators=250, min_child_weight=1, max_depth=2, learning_rate=0.001, gamma=0.5, colsample_bytree=1.0 
[CV]  subsample=0.2, reg_lambda=4.5, reg_alpha=0.5, n_estimators=250, min_child_weight=1, max_depth=2, learning_rate=0.001, gamma=0.5, colsample_bytree=1.0, score=0.874, total=   2.9s
[CV] subsample=0.7, reg_lambda=1, reg_alpha=0, n_estimators=100, min_child_weight=7, max_depth=4, learning_rate=0.1, gamma=0.1, colsampl

[CV]  subsample=0.7, reg_lambda=3, reg_alpha=1, n_estimators=100, min_child_weight=3, max_depth=10, learning_rate=0.001, gamma=0.01, colsample_bytree=0.6, score=0.898, total=   2.5s
[CV] subsample=0.7, reg_lambda=3, reg_alpha=1, n_estimators=100, min_child_weight=3, max_depth=10, learning_rate=0.001, gamma=0.01, colsample_bytree=0.6 
[CV]  subsample=0.7, reg_lambda=3, reg_alpha=1, n_estimators=100, min_child_weight=3, max_depth=10, learning_rate=0.001, gamma=0.01, colsample_bytree=0.6, score=0.907, total=   2.6s
[CV] subsample=0.7, reg_lambda=3, reg_alpha=1, n_estimators=100, min_child_weight=3, max_depth=10, learning_rate=0.001, gamma=0.01, colsample_bytree=0.6 
[CV]  subsample=0.7, reg_lambda=3, reg_alpha=1, n_estimators=100, min_child_weight=3, max_depth=10, learning_rate=0.001, gamma=0.01, colsample_bytree=0.6, score=0.908, total=   2.6s
[CV] subsample=0.7, reg_lambda=3, reg_alpha=1, n_estimators=100, min_child_weight=3, max_depth=10, learning_rate=0.001, gamma=0.01, colsample_bytr

[CV]  subsample=0.4, reg_lambda=1.5, reg_alpha=0, n_estimators=1000, min_child_weight=1, max_depth=7, learning_rate=0.01, gamma=0.3, colsample_bytree=0.6, score=0.898, total=  18.5s
[CV] subsample=0.4, reg_lambda=1.5, reg_alpha=0, n_estimators=1000, min_child_weight=1, max_depth=7, learning_rate=0.01, gamma=0.3, colsample_bytree=0.6 
[CV]  subsample=0.4, reg_lambda=1.5, reg_alpha=0, n_estimators=1000, min_child_weight=1, max_depth=7, learning_rate=0.01, gamma=0.3, colsample_bytree=0.6, score=0.896, total=  21.0s
[CV] subsample=0.4, reg_lambda=1.5, reg_alpha=0, n_estimators=1000, min_child_weight=1, max_depth=7, learning_rate=0.01, gamma=0.3, colsample_bytree=0.6 
[CV]  subsample=0.4, reg_lambda=1.5, reg_alpha=0, n_estimators=1000, min_child_weight=1, max_depth=7, learning_rate=0.01, gamma=0.3, colsample_bytree=0.6, score=0.892, total=  22.8s
[CV] subsample=0.7, reg_lambda=3, reg_alpha=0, n_estimators=250, min_child_weight=7, max_depth=10, learning_rate=0.01, gamma=0.3, colsample_bytree

[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed: 10.0min finished


Learning Rate:  0.1
Gamma:  1.5
Max Depth:  2
Subsample:  0.6
Max Features at Split:  1.0
Alpha:  0
Lamda:  1
Minimum Sum of the Instance Weight Hessian to Make a Child:  5
Number of Trees:  100


In [91]:
XGBClassifier(learning_rate=0.01, n_estimators=10,colsample_bytree = 1.0,
                      subsample = 0.4,reg_alpha = 0.5, reg_lambda=1.5,
                      max_depth=6, 
                      gamma=0.5, min_child_weight=1)

# Learning Rate:  0.01
# Gamma:  0.3
# Max Depth:  7
# Subsample:  0.4
# Max Features at Split:  0.6
# Alpha:  0
# Lamda:  1.5
# Minimum Sum of the Instance Weight Hessian to Make a Child:  1
# Number of Trees:  1000

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1.0, gamma=0.5,
              learning_rate=0.01, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=None, n_estimators=10, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0.5, reg_lambda=1.5, scale_pos_weight=1, seed=None,
              silent=None, subsample=0.4, verbosity=1)

In [108]:
pet = XGBClassifier(learning_rate=0.1, n_estimators=600,colsample_bytree = 1.0,
                      subsample = 0.4,reg_alpha = 0.2, reg_lambda=1.5,
                      max_depth=6, 
                      gamma=0.5, min_child_weight=1)
pet.fit(x_train, y_pet)
y_petPred = pet.predict(x_test)
breed = XGBClassifier(learning_rate=0.1, n_estimators=500,colsample_bytree = 0.6,
                      subsample = 0.5,reg_alpha = 0.2, reg_lambda=1.5,
                      max_depth=7, 
                      gamma=0.3, min_child_weight=1)
breed.fit(x_train, y_breed)
y_breedPred = breed.predict(x_test)
from sklearn.metrics import accuracy_score

print(accuracy_score(y_breed, breed.predict(x_train)))
print(accuracy_score(y_pet, pet.predict(x_train)))

0.9961771264733992
0.9725496442603802


In [109]:
submission = pd.DataFrame({
    'pet_id':df['pet_id'],
    'breed_category':y_breedPred,
    'pet_category':y_petPred
})
submission.to_csv("submission10.csv", index = False)