In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import string

from scipy import stats

#from google.colab import files
from pandas.api.types import CategoricalDtype
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

In [2]:
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold
from xgboost import XGBClassifier
import xgboost as xgb

In [3]:
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING
from functools import partial

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

import gc

In [4]:
def resumetable(df):
    print(f"Dataset Shape: {df.shape}")
    summary = pd.DataFrame(df.dtypes,columns=['dtypes'])
    summary = summary.reset_index()
    summary['Name'] = summary['index']
    summary = summary[['Name','dtypes']]
    summary['Missing'] = df.isnull().sum().values    
    summary['Uniques'] = df.nunique().values
    summary['First Value'] = df.loc[0].values
    summary['Second Value'] = df.loc[1].values
    summary['Third Value'] = df.loc[2].values

    for name in summary['Name'].value_counts().index:
        summary.loc[summary['Name'] == name, 'Entropy'] = round(stats.entropy(df[name].value_counts(normalize=True), base=2),2) 

    return summary

## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [5]:
#import io
#uploaded = files.upload()
#df_train = pd.read_csv(io.BytesIO(uploaded['train.csv']))

df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

import io
uploaded2 = files.upload()
df_test = pd.read_csv(io.BytesIO(uploaded2['test.csv']))

df

In [6]:
df_train

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0,0,0,T,Y,Green,Triangle,Snake,Finland,...,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2,0
1,1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,...,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8,0
2,2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,...,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2,0
3,3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,...,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1,1
4,4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,...,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8,0
5,5,0,1,1,T,N,Blue,Polygon,Lion,Costa Rica,...,51e27c16d,1,Novice,Freezing,j,E,PZ,2,2,0
6,6,0,1,1,T,N,Green,Trapezoid,Cat,China,...,7e3d79a0d,2,Grandmaster,Lava Hot,g,P,wy,5,4,0
7,7,1,0,1,T,Y,Red,Triangle,Dog,Russia,...,feb72ecc2,1,Novice,Lava Hot,j,K,Ed,4,2,0
8,8,1,0,1,T,Y,Blue,Square,Hamster,Canada,...,34a7273bf,2,Novice,Boiling Hot,e,V,qo,3,4,0
9,9,0,0,0,F,Y,Red,Trapezoid,Lion,China,...,0ece7a511,1,Expert,Freezing,h,Q,CZ,3,2,0


In [7]:
df_train['target'].describe()

count    300000.00000
mean          0.30588
std           0.46078
min           0.00000
25%           0.00000
50%           0.00000
75%           1.00000
max           1.00000
Name: target, dtype: float64

sns.distplot(df_train['target'])

corrmat = df_train.corr()
f, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=.8, square=True);

total = df_test.isnull().sum().sort_values(ascending=False)
percent = (df_test.isnull().sum()/df_test.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
missing_data.head(20)

In [8]:
y_train = df_train["target"]

In [9]:
train_ID = df_train['id']
test_ID = df_test['id']

df_train.drop("id", axis = 1, inplace = True)
df_test.drop("id", axis = 1, inplace = True)

In [10]:
ntrain = df_train.shape[0]
ntest = df_test.shape[0]
all_data = pd.concat((df_train, df_test)).reset_index(drop=True)
all_data.drop(['target'], axis=1, inplace=True)
all_data.shape[1]

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


23

In [11]:
all_data

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,day,month,nom_0,nom_1,nom_2,...,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5
0,0,0,0,T,Y,2,2,Green,Triangle,Snake,...,3ac1b8814,68f6ad3e9,c389000ab,2f4cb3d51,2,Grandmaster,Cold,h,D,kr
1,0,1,0,T,Y,7,8,Green,Trapezoid,Hamster,...,fbcb50fc1,3b6dd5612,4cd920251,f83c56c21,1,Grandmaster,Hot,a,A,bF
2,0,0,0,F,Y,7,2,Blue,Trapezoid,Lion,...,0922e3cb8,a6a36f527,de9c9f684,ae6800dd0,1,Expert,Lava Hot,h,R,Jc
3,0,1,0,F,Y,2,1,Red,Trapezoid,Snake,...,50d7ad46a,ec69236eb,4ade6ab69,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW
4,0,0,0,F,N,7,8,Red,Trapezoid,Lion,...,1fe17a1fd,04ddac2be,cb43ab175,b164b72a7,1,Grandmaster,Freezing,a,R,qP
5,0,1,1,T,N,2,2,Blue,Polygon,Lion,...,29a854620,ff5b35098,b7e6f8e6f,51e27c16d,1,Novice,Freezing,j,E,PZ
6,0,1,1,T,N,5,4,Green,Trapezoid,Cat,...,3393a0f78,c6587685d,06f5ae149,7e3d79a0d,2,Grandmaster,Lava Hot,g,P,wy
7,1,0,1,T,Y,4,2,Red,Triangle,Dog,...,55eed5058,2dd9daf45,98addc2c9,feb72ecc2,1,Novice,Lava Hot,j,K,Ed
8,1,0,1,T,Y,3,4,Blue,Square,Hamster,...,3e44d44eb,3f0057c9b,a2d110837,34a7273bf,2,Novice,Boiling Hot,e,V,qo
9,0,0,0,F,Y,3,2,Red,Trapezoid,Lion,...,8ed6221ae,4fbfe4a84,2c15d0173,0ece7a511,1,Expert,Freezing,h,Q,CZ


In [12]:
""" 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9'
all_data = all_data.drop([], axis=1)"""

" 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9'\nall_data = all_data.drop([], axis=1)"

In [13]:
bin_dict = {'T':1, 'F':0, 'Y':1, 'N':0}
all_data['bin_3'] = all_data['bin_3'].map(bin_dict)
all_data['bin_4'] = all_data['bin_4'].map(bin_dict)

In [14]:
ord_1 = CategoricalDtype(categories=['Novice', 'Contributor','Expert', 
                                     'Master', 'Grandmaster'], ordered=True)
ord_2 = CategoricalDtype(categories=['Freezing', 'Cold', 'Warm', 'Hot',
                                     'Boiling Hot', 'Lava Hot'], ordered=True)
ord_3 = CategoricalDtype(categories=['a', 'b', 'c', 'd', 'e', 'f', 'g',
                                     'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o'], ordered=True)
ord_4 = CategoricalDtype(categories=['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I',
                                     'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R',
                                     'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'], ordered=True)

In [15]:
all_data.ord_1 = all_data.ord_1.astype(ord_1)
all_data.ord_2 = all_data.ord_2.astype(ord_2)
all_data.ord_3 = all_data.ord_3.astype(ord_3)
all_data.ord_4 = all_data.ord_4.astype(ord_4)

all_data.ord_1 = all_data.ord_1.cat.codes
all_data.ord_2 = all_data.ord_2.cat.codes
all_data.ord_3 = all_data.ord_3.cat.codes
all_data.ord_4 = all_data.ord_4.cat.codes

In [16]:
all_data = pd.get_dummies(all_data, columns=['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4'],\
                          prefix=['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4'], drop_first=True)

In [17]:
all_data['day_sin'] = np.sin(2 * np.pi * all_data['day']/6.0)
all_data['day_cos'] = np.cos(2 * np.pi * all_data['day']/6.0)

all_data['month_sin'] = np.sin(2 * np.pi * all_data['month']/11.0)
all_data['month_cos'] = np.cos(2 * np.pi * all_data['month']/11.0)

In [18]:
all_data['ord_5_oe_add'] = all_data['ord_5'].apply(lambda x:sum([(string.ascii_letters.find(letter)+1) for letter in x]))
all_data['ord_5_oe_join'] = all_data['ord_5'].apply(lambda x:float(''.join(str(string.ascii_letters.find(letter)+1) for letter in x)))

all_data['ord_5_oe1'] = all_data['ord_5'].apply(lambda x:(string.ascii_letters.find(x[0])+1))
all_data['ord_5_oe2'] = all_data['ord_5'].apply(lambda x:(string.ascii_letters.find(x[1])+1))

for col in ['ord_5_oe1', 'ord_5_oe2', 'ord_5_oe_add', 'ord_5_oe_join']:
    all_data[col]= all_data[col].astype('float64')

In [19]:
all_data

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,day,month,nom_5,nom_6,nom_7,...,nom_4_Piano,nom_4_Theremin,day_sin,day_cos,month_sin,month_cos,ord_5_oe_add,ord_5_oe_join,ord_5_oe1,ord_5_oe2
0,0,0,0,1,1,2,2,50f116bcf,3ac1b8814,68f6ad3e9,...,0,0,8.660254e-01,-0.5,9.096320e-01,0.415415,29.0,1118.0,11.0,18.0
1,0,1,0,1,1,7,8,b3b4d25d0,fbcb50fc1,3b6dd5612,...,1,0,8.660254e-01,0.5,-9.898214e-01,-0.142315,34.0,232.0,2.0,32.0
2,0,0,0,0,1,7,2,3263bdce5,0922e3cb8,a6a36f527,...,0,1,8.660254e-01,0.5,9.096320e-01,0.415415,39.0,363.0,36.0,3.0
3,0,1,0,0,1,2,1,f12246592,50d7ad46a,ec69236eb,...,0,0,8.660254e-01,-0.5,5.406408e-01,0.841254,60.0,1149.0,11.0,49.0
4,0,0,0,0,0,7,8,5b0f5acd5,1fe17a1fd,04ddac2be,...,0,0,8.660254e-01,0.5,-9.898214e-01,-0.142315,59.0,1742.0,17.0,42.0
5,0,1,1,1,0,2,2,46cab09da,29a854620,ff5b35098,...,0,0,8.660254e-01,-0.5,9.096320e-01,0.415415,94.0,4252.0,42.0,52.0
6,0,1,1,1,0,5,4,be5592604,3393a0f78,c6587685d,...,1,0,-8.660254e-01,0.5,7.557496e-01,-0.654861,48.0,2325.0,23.0,25.0
7,1,0,1,1,1,4,2,72f8028dc,55eed5058,2dd9daf45,...,0,0,-8.660254e-01,-0.5,9.096320e-01,0.415415,35.0,314.0,31.0,4.0
8,1,0,1,1,1,3,4,4604905e7,3e44d44eb,3f0057c9b,...,0,0,1.224647e-16,-1.0,7.557496e-01,-0.654861,32.0,1715.0,17.0,15.0
9,0,0,0,0,1,3,2,ad95dc0ee,8ed6221ae,4fbfe4a84,...,1,0,1.224647e-16,-1.0,9.096320e-01,0.415415,81.0,2952.0,29.0,52.0


In [20]:
all_data = all_data.drop(['day', 'month'], axis=1)

In [21]:
high_card_feats = ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']

In [22]:
for col in high_card_feats:
    all_data[f'hash_{col}'] = all_data[col].apply( lambda x: hash(str(x)) % 5000 )

In [23]:
for col in high_card_feats:
    enc_nom_1 = (all_data.groupby(col).size()) / len(all_data)
    all_data[f'freq_{col}'] = all_data[col].apply(lambda x : enc_nom_1[x])

In [24]:
for f in ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']:
    if all_data[f].dtype=='object' : 
        lbl = LabelEncoder()
        lbl.fit(list(all_data[f].values))
        all_data[f'le_{f}'] = lbl.transform(list(all_data[f].values))

In [25]:
new_feat = ['hash_nom_5', 'hash_nom_6', 'hash_nom_7', 'hash_nom_8',
            'hash_nom_9',  'freq_nom_5', 'freq_nom_6', 'freq_nom_7', 
            'freq_nom_8', 'freq_nom_9', 'le_nom_5', 'le_nom_6',
            'le_nom_7', 'le_nom_8', 'le_nom_9']

resumetable(all_data[high_card_feats + new_feat])

Dataset Shape: (500000, 20)


Unnamed: 0,Name,dtypes,Missing,Uniques,First Value,Second Value,Third Value,Entropy
0,nom_5,object,0,222,50f116bcf,b3b4d25d0,3263bdce5,7.49
1,nom_6,object,0,522,3ac1b8814,fbcb50fc1,0922e3cb8,8.74
2,nom_7,object,0,1220,68f6ad3e9,3b6dd5612,a6a36f527,9.97
3,nom_8,object,0,2219,c389000ab,4cd920251,de9c9f684,10.84
4,nom_9,object,0,12068,2f4cb3d51,f83c56c21,ae6800dd0,13.28
5,hash_nom_5,int64,0,218,2742,2084,1207,7.46
6,hash_nom_6,int64,0,489,3863,1257,2457,8.61
7,hash_nom_7,int64,0,1088,1441,722,2305,9.79
8,hash_nom_8,int64,0,1788,1159,3472,4157,10.48
9,hash_nom_9,int64,0,4556,4773,3467,587,11.86


In [26]:
all_data.drop([ 'ord_5', 'hash_nom_5', 
                'hash_nom_6', 'hash_nom_7', 'hash_nom_8', 'hash_nom_9',
               #'le_nom_5', 'le_nom_6', 'le_nom_7', 'le_nom_8', 'le_nom_9',
               #'freq_nom_5','freq_nom_6', 'freq_nom_7', 'freq_nom_8', 'freq_nom_9',
                'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9'
              ], axis=1, inplace=True)

#'feq_nom_5', 'feq_nom_6', 'feq_nom_7', 'feq_nom_8', 'feq_nom_9', 

In [27]:
all_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500000 entries, 0 to 499999
Data columns (total 48 columns):
bin_0               500000 non-null int64
bin_1               500000 non-null int64
bin_2               500000 non-null int64
bin_3               500000 non-null int64
bin_4               500000 non-null int64
ord_0               500000 non-null int64
ord_1               500000 non-null int8
ord_2               500000 non-null int8
ord_3               500000 non-null int8
ord_4               500000 non-null int8
nom_0_Green         500000 non-null uint8
nom_0_Red           500000 non-null uint8
nom_1_Polygon       500000 non-null uint8
nom_1_Square        500000 non-null uint8
nom_1_Star          500000 non-null uint8
nom_1_Trapezoid     500000 non-null uint8
nom_1_Triangle      500000 non-null uint8
nom_2_Cat           500000 non-null uint8
nom_2_Dog           500000 non-null uint8
nom_2_Hamster       500000 non-null uint8
nom_2_Lion          500000 non-null uint8
nom_2_Snake

all_data['nom_5'] = all_data['nom_5'].apply(lambda x: int(x, 16))
all_data['nom_6'] = all_data['nom_6'].apply(lambda x: int(x, 16))
all_data['nom_7'] = all_data['nom_7'].apply(lambda x: int(x, 16))
all_data['nom_8'] = all_data['nom_8'].apply(lambda x: int(x, 16))
all_data['nom_9'] = all_data['nom_9'].apply(lambda x: int(x, 16))

In [28]:
for col in all_data.columns:
    print(col)
    print(all_data[col].unique())

bin_0
[0 1]
bin_1
[0 1]
bin_2
[0 1]
bin_3
[1 0]
bin_4
[1 0]
ord_0
[2 1 3]
ord_1
[4 2 0 1 3]
ord_2
[1 3 5 4 0 2]
ord_3
[ 7  0  8  9  6  4  3  1 10  5 11 13 14  2 12]
ord_4
[ 3  0 17  4 15 10 21 16 25 11  5 19 20 18 24  1  7  9 13  6 22  8 14  2
 23 12]
nom_0_Green
[1 0]
nom_0_Red
[0 1]
nom_1_Polygon
[0 1]
nom_1_Square
[0 1]
nom_1_Star
[0 1]
nom_1_Trapezoid
[0 1]
nom_1_Triangle
[1 0]
nom_2_Cat
[0 1]
nom_2_Dog
[0 1]
nom_2_Hamster
[0 1]
nom_2_Lion
[0 1]
nom_2_Snake
[1 0]
nom_3_China
[0 1]
nom_3_Costa Rica
[0 1]
nom_3_Finland
[1 0]
nom_3_India
[0 1]
nom_3_Russia
[0 1]
nom_4_Oboe
[0 1]
nom_4_Piano
[0 1]
nom_4_Theremin
[0 1]
day_sin
[ 8.66025404e-01  8.66025404e-01 -8.66025404e-01 -8.66025404e-01
  1.22464680e-16  8.66025404e-01 -2.44929360e-16]
day_cos
[-0.5  0.5  0.5 -0.5 -1.   1. ]
month_sin
[ 9.09631995e-01 -9.89821442e-01  5.40640817e-01  7.55749574e-01
 -5.40640817e-01  9.89821442e-01 -7.55749574e-01 -9.09631995e-01
  5.40640817e-01 -1.13310778e-15  2.81732557e-01 -2.81732557e-01]
month

 1.340e-04 1.200e-04 3.800e-05 2.000e-05]
freq_nom_7
[8.200e-04 9.240e-04 1.600e-03 1.500e-03 1.520e-03 1.306e-03 1.104e-03
 1.072e-03 1.566e-03 6.320e-04 1.340e-03 1.324e-03 1.100e-03 1.176e-03
 1.050e-03 1.540e-03 8.240e-04 3.200e-04 1.044e-03 1.456e-03 1.350e-03
 8.280e-04 8.500e-04 1.218e-03 1.204e-03 9.820e-04 1.628e-03 1.468e-03
 1.004e-03 1.336e-03 5.120e-04 1.084e-03 1.410e-03 4.160e-04 5.740e-04
 1.394e-03 1.052e-03 1.264e-03 1.486e-03 1.490e-03 1.134e-03 1.630e-03
 1.330e-03 4.760e-04 1.318e-03 1.594e-03 1.102e-03 1.150e-03 1.120e-03
 1.374e-03 8.620e-04 6.240e-04 6.040e-04 8.600e-04 1.696e-03 1.650e-03
 1.606e-03 7.740e-04 1.208e-03 1.638e-03 1.422e-03 1.608e-03 1.444e-03
 1.484e-03 1.316e-03 1.206e-03 1.280e-03 1.224e-03 1.190e-03 1.128e-03
 1.590e-03 1.276e-03 1.158e-03 1.114e-03 1.138e-03 3.040e-04 9.320e-04
 6.360e-04 4.660e-04 1.438e-03 1.386e-03 1.286e-03 7.240e-04 1.130e-03
 7.820e-04 1.458e-03 7.420e-04 1.164e-03 1.216e-03 1.226e-03 7.840e-04
 1.296e-03 9.940e-04 8.0

 5.40e-05 7.00e-05 1.40e-05 4.00e-06]
freq_nom_9
[6.40e-05 4.60e-05 1.00e-04 1.46e-04 1.26e-04 1.34e-04 3.60e-05 1.64e-04
 1.40e-04 1.38e-04 8.80e-05 9.20e-05 5.00e-05 5.20e-05 1.36e-04 1.76e-04
 1.60e-04 1.74e-04 4.40e-05 1.54e-04 7.40e-05 1.20e-04 1.50e-04 1.68e-04
 1.62e-04 9.40e-05 8.00e-05 7.20e-05 1.90e-04 1.48e-04 1.42e-04 6.00e-05
 1.08e-04 1.24e-04 1.44e-04 9.80e-05 1.02e-04 1.32e-04 1.14e-04 1.70e-04
 7.60e-05 1.10e-04 5.80e-05 1.66e-04 1.58e-04 5.60e-05 1.06e-04 6.20e-05
 3.80e-05 1.30e-04 1.12e-04 9.60e-05 1.18e-04 4.80e-05 2.40e-05 7.80e-05
 7.00e-05 1.16e-04 1.52e-04 1.80e-04 3.20e-05 1.28e-04 8.40e-05 1.22e-04
 6.60e-05 9.00e-05 3.00e-05 8.20e-05 2.80e-05 1.82e-04 1.56e-04 1.84e-04
 1.72e-04 1.04e-04 1.80e-05 1.60e-05 6.80e-05 5.40e-05 4.00e-05 2.20e-05
 1.92e-04 3.40e-05 4.20e-05 1.88e-04 1.78e-04 8.60e-05 2.60e-05 2.00e-05
 2.02e-04 1.96e-04 1.40e-05 1.20e-05 1.86e-04 1.94e-04 6.00e-06 8.00e-06
 2.06e-04 2.04e-04 2.14e-04 4.00e-06 2.00e-04 2.24e-04 1.00e-05 2.00e-06
 1

all_data = pd.get_dummies(all_data)

In [29]:
all_data

Unnamed: 0,bin_0,bin_1,bin_2,bin_3,bin_4,ord_0,ord_1,ord_2,ord_3,ord_4,...,freq_nom_5,freq_nom_6,freq_nom_7,freq_nom_8,freq_nom_9,le_nom_5,le_nom_6,le_nom_7,le_nom_8,le_nom_9
0,0,0,0,1,1,2,4,1,7,3,...,0.008624,0.003752,0.000820,0.000978,0.000064,78,120,491,1689,2192
1,0,1,0,1,1,1,4,3,0,0,...,0.002608,0.002736,0.000924,0.000358,0.000046,159,510,260,652,11719
2,0,0,0,0,1,1,2,5,7,17,...,0.008518,0.003868,0.001600,0.000878,0.000100,44,14,766,1935,8140
3,0,1,0,0,1,1,4,4,8,3,...,0.003256,0.003874,0.001500,0.000714,0.000146,209,165,1121,631,6098
4,0,0,0,0,0,1,4,0,0,17,...,0.006602,0.003858,0.001520,0.000690,0.000126,90,61,34,1763,8293
5,0,1,1,1,0,1,0,0,9,4,...,0.008752,0.003334,0.001306,0.000686,0.000134,65,86,1213,1577,3859
6,0,1,1,1,0,2,4,5,6,15,...,0.005032,0.000524,0.001104,0.000574,0.000100,169,110,928,65,5900
7,1,0,1,1,1,1,0,5,9,10,...,0.008876,0.002160,0.001072,0.000060,0.000036,106,177,188,1320,12009
8,1,0,1,1,1,2,0,4,4,21,...,0.006288,0.003588,0.001566,0.000324,0.000164,63,131,272,1418,2462
9,0,0,0,0,1,1,2,0,7,16,...,0.000726,0.001922,0.000632,0.000894,0.000140,154,282,353,387,697


In [30]:
train = all_data[:ntrain]
test =  all_data[ntrain:]

In [31]:
train.shape

(300000, 48)

rf = RandomForestClassifier(n_estimators=500, random_state=0)
rf.fit(train, y_train)

train_acc = accuracy_score(y_train, rf.predict(train))
print(train_acc)

svclassifier = SVC(kernel='poly', degree=5)
svclassifier.fit(train, y_train)

train_acc = accuracy_score(y_train, svclassifier.predict(train)) 
print(train_acc)

submission = pd.DataFrame({'id' : test_ID, 'target' : svclassifier.predict(test)})
submission.to_csv('submission4.csv', index=False)

In [32]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.metrics import accuracy_score

#Models
import warnings
warnings.filterwarnings("ignore")

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifier, SGDClassifier, LogisticRegression
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, ExtraTreesClassifier, BaggingClassifier, VotingClassifier, RandomTreesEmbedding

"""
clfs = []
seed = 42

clfs.append(("LogReg", 
             Pipeline([("Scaler", StandardScaler()),
                       ("LogReg", LogisticRegression())])))

clfs.append(("XGBClassifier", XGBClassifier()))

# clfs.append(("KNN", 
#              Pipeline([("Scaler", StandardScaler()),
#                        ("KNN", KNeighborsClassifier(n_neighbors=5))])))

clfs.append(("DecisionTreeClassifier", DecisionTreeClassifier()))

clfs.append(("RandomForestClassifier", RandomForestClassifier(n_estimators=100)))

clfs.append(("GradientBoostingClassifier", GradientBoostingClassifier(n_estimators=100)))

clfs.append(("RidgeClassifier", 
             Pipeline([("Scaler", StandardScaler()),
                       ("RidgeClassifier", RidgeClassifier())])))

clfs.append(("BaggingClassifier",
             Pipeline([("Scaler", StandardScaler()),
                       ("BaggingClassifier", BaggingClassifier())])))

clfs.append(("ExtraTreesClassifier",ExtraTreesClassifier()))

#'neg_mean_absolute_error', 'neg_mean_squared_error','r2'
scoring = 'roc_auc'
n_folds = 7

results, names  = [], [] 

for name, model  in clfs:
    kfold = KFold(n_splits=n_folds, shuffle=False, random_state=seed)
    
    cv_results = cross_val_score(model, 
                                 train.values, y_train, 
                                 cv= kfold, scoring=scoring,
                                 n_jobs=-1)    
    names.append(name)
    results.append(cv_results)    
    msg = "%s: %f (+/- %f)" % (name, cv_results.mean(),  
                               cv_results.std())
    print(msg)
    
# boxplot algorithm comparison
fig = plt.figure(figsize=(15,6))
fig.suptitle('Classifier Algorithm Comparison', fontsize=22)
ax = fig.add_subplot(111)
sns.boxplot(x=names, y=results)
ax.set_xticklabels(names)
ax.set_xlabel("Algorithmn", fontsize=20) 
ax.set_ylabel("Accuracy of Models", fontsize=18)
ax.set_xticklabels(ax.get_xticklabels(),rotation=45)

plt.show()
"""

In [33]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from xgboost import plot_importance
from sklearn.metrics import make_scorer

import time
def objective(params):
    time1 = time.time()
    params = {
        'max_depth': int(params['max_depth']),
        'gamma': "{:.3f}".format(params['gamma']),
        'subsample': "{:.2f}".format(params['subsample']),
        'reg_alpha': "{:.3f}".format(params['reg_alpha']),
        'reg_lambda': "{:.3f}".format(params['reg_lambda']),
        'learning_rate': "{:.3f}".format(params['learning_rate']),
        'num_leaves': '{:.3f}'.format(params['num_leaves']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'min_child_samples': '{:.3f}'.format(params['min_child_samples']),
        'feature_fraction': '{:.3f}'.format(params['feature_fraction']),
        'bagging_fraction': '{:.3f}'.format(params['bagging_fraction'])
    }

    print("\n############## New Run ################")
    print(f"params = {params}")
    FOLDS = 12
    count=1
    kf = KFold(n_splits=FOLDS, shuffle=False, random_state=42)

    # tss = TimeSeriesSplit(n_splits=FOLDS)
    y_preds = np.zeros(submission.shape[0])
    # y_oof = np.zeros(X_train.shape[0])
    score_mean = 0
    for tr_idx, val_idx in kf.split(train, y_train):
        clf = xgb.XGBClassifier(
            n_estimators=500, random_state=4, 
            verbose=True, 
            tree_method='gpu_hist', 
            **params
        )

        X_tr, X_vl = train.iloc[tr_idx, :], train.iloc[val_idx, :]
        y_tr, y_vl = y_train.iloc[tr_idx], y_train.iloc[val_idx]
        
        clf.fit(X_tr, y_tr)
        #y_pred_train = clf.predict_proba(X_vl)[:,1]
        #print(y_pred_train)
        score = make_scorer(roc_auc_score, needs_proba=True)(clf, X_vl, y_vl)
        # plt.show()
        score_mean += score
        print(f'{count} CV - score: {round(score, 4)}')
        count += 1
    time2 = time.time() - time1
    print(f"Total Time Run: {round(time2 / 60,2)}")
    gc.collect()
    print(f'Mean ROC_AUC: {score_mean / FOLDS}')
    del X_tr, X_vl, y_tr, y_vl, clf, score
    
    return -(score_mean / FOLDS)

space = {
    # The maximum depth of a tree, same as GBM.
    # Used to control over-fitting as higher depth will allow model 
    # to learn relations very specific to a particular sample.
    # Should be tuned using CV.
    # Typical values: 3-10
    'max_depth': hp.quniform('max_depth', 2, 8, 1),
    
    # reg_alpha: L1 regularization term. L1 regularization encourages sparsity 
    # (meaning pulling weights to 0). It can be more useful when the objective
    # is logistic regression since you might need help with feature selection.
    'reg_alpha':  hp.uniform('reg_alpha', 0.01, 0.4),
    
    # reg_lambda: L2 regularization term. L2 encourages smaller weights, this
    # approach can be more useful in tree-models where zeroing 
    # features might not make much sense.
    'reg_lambda': hp.uniform('reg_lambda', 0.01, .4),
    
    # eta: Analogous to learning rate in GBM
    # Makes the model more robust by shrinking the weights on each step
    # Typical final values to be used: 0.01-0.2
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.15),
    
    # colsample_bytree: Similar to max_features in GBM. Denotes the 
    # fraction of columns to be randomly samples for each tree.
    # Typical values: 0.5-1
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1),
    
    # A node is split only when the resulting split gives a positive
    # reduction in the loss function. Gamma specifies the 
    # minimum loss reduction required to make a split.
    # Makes the algorithm conservative. The values can vary depending on the loss function and should be tuned.
    'gamma': hp.uniform('gamma', 0.01, .7),
    
    # more increases accuracy, but may lead to overfitting.
    # num_leaves: the number of leaf nodes to use. Having a large number 
    # of leaves will improve accuracy, but will also lead to overfitting.
    'num_leaves': hp.choice('num_leaves', list(range(20, 200, 5))),
    
    # specifies the minimum samples per leaf node.
    # the minimum number of samples (data) to group into a leaf. 
    # The parameter can greatly assist with overfitting: larger sample
    # sizes per leaf will reduce overfitting (but may lead to under-fitting).
    'min_child_samples': hp.choice('min_child_samples', list(range(100, 250, 10))),
    
    # subsample: represents a fraction of the rows (observations) to be 
    # considered when building each subtree. Tianqi Chen and Carlos Guestrin
    # in their paper A Scalable Tree Boosting System recommend 
    'subsample': hp.choice('subsample', [.5, 0.6, 0.7, .8]),
    
    # randomly select a fraction of the features.
    # feature_fraction: controls the subsampling of features used
    # for training (as opposed to subsampling the actual training data in 
    # the case of bagging). Smaller fractions reduce overfitting.
    'feature_fraction': hp.uniform('feature_fraction', 0.4, .8),
    
    # randomly bag or subsample training data.
    'bagging_fraction': hp.uniform('bagging_fraction', 0.4, .9)
    
    # bagging_fraction and bagging_freq: enables bagging (subsampling) 
    # of the training data. Both values need to be set for bagging to be used.
    # The frequency controls how often (iteration) bagging is used. Smaller
    # fractions and frequencies reduce overfitting.
}

In [34]:
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=30, 
            # trials=trials
           )

                                                                                                                       
############## New Run ################
params = {'max_depth': 2, 'gamma': '0.646', 'subsample': '0.60', 'reg_alpha': '0.286', 'reg_lambda': '0.378', 'learning_rate': '0.149', 'num_leaves': '25.000', 'colsample_bytree': '0.748', 'min_child_samples': '180.000', 'feature_fraction': '0.656', 'bagging_fraction': '0.712'}
1 CV - score: 0.765                                                                                                    
2 CV - score: 0.7731                                                                                                   
3 CV - score: 0.7734                                                                                                   
4 CV - score: 0.7713                                                                                                   
5 CV - score: 0.7714                                                             

12 CV - score: 0.7679                                                                                                  
Total Time Run: 27.81                                                                                                  
Mean ROC_AUC: 0.771846780567298                                                                                        
                                                                                                                       
############## New Run ################
params = {'max_depth': 3, 'gamma': '0.632', 'subsample': '0.80', 'reg_alpha': '0.300', 'reg_lambda': '0.318', 'learning_rate': '0.017', 'num_leaves': '130.000', 'colsample_bytree': '0.550', 'min_child_samples': '220.000', 'feature_fraction': '0.519', 'bagging_fraction': '0.703'}
1 CV - score: 0.7472                                                                                                   
2 CV - score: 0.7542                                                            

9 CV - score: 0.7734                                                                                                   
10 CV - score: 0.7779                                                                                                  
11 CV - score: 0.7751                                                                                                  
12 CV - score: 0.7709                                                                                                  
Total Time Run: 12.49                                                                                                  
Mean ROC_AUC: 0.7758321591460732                                                                                       
                                                                                                                       
############## New Run ################
params = {'max_depth': 4, 'gamma': '0.226', 'subsample': '0.70', 'reg_alpha': '0.399', 'reg_lambda': '0.323', 'learning_

6 CV - score: 0.77                                                                                                     
7 CV - score: 0.7677                                                                                                   
8 CV - score: 0.7673                                                                                                   
9 CV - score: 0.761                                                                                                    
10 CV - score: 0.766                                                                                                   
11 CV - score: 0.7617                                                                                                  
12 CV - score: 0.7578                                                                                                  
Total Time Run: 10.8                                                                                                   
Mean ROC_AUC: 0.763515980271097         

3 CV - score: 0.7791                                                                                                   
4 CV - score: 0.7755                                                                                                   
5 CV - score: 0.7754                                                                                                   
6 CV - score: 0.7826                                                                                                   
7 CV - score: 0.782                                                                                                    
8 CV - score: 0.7794                                                                                                   
9 CV - score: 0.7736                                                                                                   
10 CV - score: 0.7799                                                                                                  
11 CV - score: 0.7759                   

params = {'max_depth': 3, 'gamma': '0.089', 'subsample': '0.60', 'reg_alpha': '0.088', 'reg_lambda': '0.091', 'learning_rate': '0.149', 'num_leaves': '165.000', 'colsample_bytree': '0.375', 'min_child_samples': '110.000', 'feature_fraction': '0.626', 'bagging_fraction': '0.501'}
1 CV - score: 0.7704                                                                                                   
2 CV - score: 0.7777                                                                                                   
3 CV - score: 0.7772                                                                                                   
4 CV - score: 0.7749                                                                                                   
5 CV - score: 0.776                                                                                                    
6 CV - score: 0.7834                                                                                                   


Total Time Run: 15.99                                                                                                  
Mean ROC_AUC: 0.778940021369483                                                                                        
                                                                                                                       
############## New Run ################
params = {'max_depth': 4, 'gamma': '0.595', 'subsample': '0.70', 'reg_alpha': '0.332', 'reg_lambda': '0.343', 'learning_rate': '0.137', 'num_leaves': '55.000', 'colsample_bytree': '0.825', 'min_child_samples': '230.000', 'feature_fraction': '0.405', 'bagging_fraction': '0.628'}
1 CV - score: 0.7715                                                                                                   
2 CV - score: 0.7777                                                                                                   
3 CV - score: 0.7812                                                             

10 CV - score: 0.7808                                                                                                  
11 CV - score: 0.7761                                                                                                  
12 CV - score: 0.7724                                                                                                  
Total Time Run: 14.19                                                                                                  
Mean ROC_AUC: 0.7773601338578985                                                                                       
                                                                                                                       
############## New Run ################
params = {'max_depth': 6, 'gamma': '0.695', 'subsample': '0.80', 'reg_alpha': '0.067', 'reg_lambda': '0.029', 'learning_rate': '0.120', 'num_leaves': '105.000', 'colsample_bytree': '0.559', 'min_child_samples': '130.000', 'feature_fraction'

In [35]:
best_params = space_eval(space, best)
best_params['max_depth'] = int(best_params['max_depth'])
best_params

{'bagging_fraction': 0.5711628873176735,
 'colsample_bytree': 0.9654510436650519,
 'feature_fraction': 0.6589149027018624,
 'gamma': 0.5455961535680991,
 'learning_rate': 0.1426493516414294,
 'max_depth': 5,
 'min_child_samples': 130,
 'num_leaves': 120,
 'reg_alpha': 0.21001898581814585,
 'reg_lambda': 0.37184999251023865,
 'subsample': 0.8}

In [37]:
clf = xgb.XGBClassifier(
    n_estimators=500,
    **best_params,
    tree_method='gpu_hist'
)

clf.fit(train, y_train)

y_preds = clf.predict_proba(test)[:,1] 

In [38]:
feature_important = clf.get_booster().get_score(importance_type="weight")
keys = list(feature_important.keys())
values = list(feature_important.values())

data = pd.DataFrame(data=values, index=keys, columns=["score"]).sort_values(by = "score", ascending=False)

# Top 10 features
data.head(20)

Unnamed: 0,score
freq_nom_5,1102
le_nom_6,1081
freq_nom_6,1081
le_nom_5,1013
le_nom_7,954
freq_nom_7,953
le_nom_8,921
freq_nom_8,888
le_nom_9,747
freq_nom_9,683


In [48]:
submission['target'] = y_preds
submission.to_csv('XGB_hypopt_model2.csv', index=False, header =1)

In [46]:
submission.head()

Unnamed: 0,id,target
0,300000,0.35578
1,300001,0.677692
2,300002,0.180116
3,300003,0.327095
4,300004,0.831369
