# 1.Feature Engineering

In [1]:
# Importing Libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
 
from sklearn.metrics import roc_auc_score

from category_encoders import TargetEncoder

import string
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the data
X = pd.read_csv('./train.csv', index_col='id')
X_test = pd.read_csv('./test.csv', index_col='id')

In [3]:
# Checking if there are any columns with missing values
X_cols_with_missing = [col for col in X.columns if X[col].isnull().any()]
X_test_cols_with_missing = [col for col in X_test.columns if X_test[col].isnull().any()]

print("X has ",len(X_cols_with_missing), "empty columns")
print("X_test has ",len(X_test_cols_with_missing), "empty columns")

X has  0 empty columns
X_test has  0 empty columns


In [4]:
# Checking features categories coherence between X and X_test
for var in X_test.columns:
    if len(set(X_test[var].values) - set(X[var].values)) > 0:
        print(var, len(set(X_test[var].values) - set(X[var].values)))

nom_8 4
nom_9 87


**=>** The features **nom_8** and **nom_9** has values in **X_test** that are not in **X**!

In [5]:
for var in X_test.columns:
    if len(set(X[var].values) - set(X_test[var].values)) > 0:
        print(var, len(set(X[var].values) - set(X_test[var].values)))

nom_7 1
nom_8 5
nom_9 229


**=>** The features **nom_7**, **nom_8** and **nom_9** has values in **X** that are not in **X_test**!

In [6]:
# Assign the variable count to y
y = X['target']
# Remove the variables we do not want to be in X
X.drop(['target'], axis=1, inplace=True)

##  OneHotEncode ALL

In [9]:
columns_to_onehot_encode = ['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4',
                            'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9', 
                            'ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5', 
                            'day', 'month']


X_OneHot_Encoded = pd.get_dummies(X,
                                  columns=columns_to_onehot_encode,
                                  prefix=columns_to_onehot_encode, 
                                  drop_first=True)

X_test_OneHot_Encoded = pd.get_dummies(X_test,
                                       columns=columns_to_onehot_encode,
                                       prefix=columns_to_onehot_encode, 
                                       drop_first=True)


In [11]:
print(X_OneHot_Encoded.shape)
print(X_test_OneHot_Encoded.shape)

(300000, 16423)
(200000, 16279)


In [17]:
common_columns = set(X_OneHot_Encoded.columns).intersection(set(X_test_OneHot_Encoded.columns))

len(common_columns)

16188

In [18]:
X_OneHot_Encoded = X_OneHot_Encoded[common_columns]
X_test_OneHot_Encoded = X_test_OneHot_Encoded[common_columns]

In [19]:
print(X_OneHot_Encoded.shape)
print(X_test_OneHot_Encoded.shape)

(300000, 16188)
(200000, 16188)


## 1.1 Binary features

In [58]:
# Binary features
bin_features = pd.Series(X.columns).str.startswith('bin')

# First five observations of the binary features
X[X.columns[bin_features]].head()

Unnamed: 0_level_0,bin_0,bin_1,bin_2,bin_3,bin_4
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,0,0,T,Y
1,0,1,0,T,Y
2,0,0,0,F,Y
3,0,1,0,F,Y
4,0,0,0,F,N


As we can see here, two features (bin_3 and bin_4) should be mapped to 1 and 0.

In [59]:
# dictionary to map the feature
bin_dict = {'T':1, 'F':0, 'Y':1, 'N':0}

# Maping the category values in our dict
X['bin_3'] = X['bin_3'].map(bin_dict)
X['bin_4'] = X['bin_4'].map(bin_dict)
X_test['bin_3'] = X_test['bin_3'].map(bin_dict)
X_test['bin_4'] = X_test['bin_4'].map(bin_dict)

In [60]:
# First five observations of the binary features
X[X.columns[bin_features]].head()

Unnamed: 0_level_0,bin_0,bin_1,bin_2,bin_3,bin_4
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,0,0,1,1
1,0,1,0,1,1
2,0,0,0,0,1
3,0,1,0,0,1
4,0,0,0,0,0


## 1.2 Nominal features

In [61]:
# Nominal features
nom_features = pd.Series(X.columns).str.startswith('nom')

# First five observations of the nominal features
X[X.columns[nom_features]].head()

Unnamed: 0_level_0,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,Green,Triangle,Snake,Finland,Bassoon,50f116bcf,3ac1b8814,68f6ad3e9,c389000ab,2f4cb3d51
1,Green,Trapezoid,Hamster,Russia,Piano,b3b4d25d0,fbcb50fc1,3b6dd5612,4cd920251,f83c56c21
2,Blue,Trapezoid,Lion,Russia,Theremin,3263bdce5,0922e3cb8,a6a36f527,de9c9f684,ae6800dd0
3,Red,Trapezoid,Snake,Canada,Oboe,f12246592,50d7ad46a,ec69236eb,4ade6ab69,8270f0d71
4,Red,Trapezoid,Lion,Canada,Oboe,5b0f5acd5,1fe17a1fd,04ddac2be,cb43ab175,b164b72a7


In [62]:
X[X.columns[nom_features]].nunique()

nom_0        3
nom_1        6
nom_2        6
nom_3        6
nom_4        4
nom_5      222
nom_6      522
nom_7     1220
nom_8     2215
nom_9    11981
dtype: int64

We can split these features into 2 groups:

* nom_0..nom_4: Low cardinality features.

* nom_5..nom_9: High cardinality features. 

### 1.2.1 Low cardinality nominal features

In [63]:
# Add a column containing the number of sides in each shape of the values in nom_01
X.nom_1.unique()

# dictionary to map the feature ord_1
nom_1_dict = {'Circle':1, 'Triangle':3, 'Square':4, 'Trapezoid':4, 'Polygon':5, 'Star':10}

# Maping the category values in our dict
X['nom_1_sides'] = X['nom_1'].map(nom_1_dict)
X_test['nom_1_sides'] = X_test['nom_1'].map(nom_1_dict)

In [64]:
# One Hot Encoding low cardinality features
X = pd.get_dummies(X,
                   columns=['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4'],
                   prefix=['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4'], 
                   drop_first=True)

X_test = pd.get_dummies(X_test,
                        columns=['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4'],
                        prefix=['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4'], 
                        drop_first=True)

### 1.2.2 High cardinality nominal features

target encoding

In [65]:
ce_target_encoder = TargetEncoder(cols = ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9'])

ce_target_encoder.fit(X, y)

ce_target_encoder.transform(X).head()

Unnamed: 0_level_0,bin_0,bin_1,bin_2,bin_3,bin_4,nom_5,nom_6,nom_7,nom_8,nom_9,...,nom_2_Lion,nom_2_Snake,nom_3_China,nom_3_Costa Rica,nom_3_Finland,nom_3_India,nom_3_Russia,nom_4_Oboe,nom_4_Piano,nom_4_Theremin
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0,0,0,1,1,0.358134,0.312718,0.232365,0.372694,0.368421,...,0,1,0,0,1,0,0,0,0,0
1,0,1,0,1,1,0.388889,0.263658,0.310105,0.189189,0.076924,...,0,0,0,0,0,0,1,0,1,0
2,0,0,0,0,1,0.274564,0.296835,0.206316,0.223022,0.172414,...,1,0,0,0,0,0,1,0,0,1
3,0,1,0,0,1,0.234872,0.364255,0.350679,0.325123,0.227273,...,0,1,0,0,0,0,0,1,0,0
4,0,0,0,0,0,0.312438,0.305964,0.272936,0.376812,0.2,...,1,0,0,0,0,0,0,1,0,0


In [66]:
# Nominal features
nom_features = pd.Series(X.columns).str.startswith('nom')

# First five observations of the nominal features
X[X.columns[nom_features]].head()

Unnamed: 0_level_0,nom_5,nom_6,nom_7,nom_8,nom_9,nom_1_sides,nom_0_Green,nom_0_Red,nom_1_Polygon,nom_1_Square,...,nom_2_Lion,nom_2_Snake,nom_3_China,nom_3_Costa Rica,nom_3_Finland,nom_3_India,nom_3_Russia,nom_4_Oboe,nom_4_Piano,nom_4_Theremin
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,50f116bcf,3ac1b8814,68f6ad3e9,c389000ab,2f4cb3d51,3,1,0,0,0,...,0,1,0,0,1,0,0,0,0,0
1,b3b4d25d0,fbcb50fc1,3b6dd5612,4cd920251,f83c56c21,4,1,0,0,0,...,0,0,0,0,0,0,1,0,1,0
2,3263bdce5,0922e3cb8,a6a36f527,de9c9f684,ae6800dd0,4,0,0,0,0,...,1,0,0,0,0,0,1,0,0,1
3,f12246592,50d7ad46a,ec69236eb,4ade6ab69,8270f0d71,4,0,1,0,0,...,0,1,0,0,0,0,0,1,0,0
4,5b0f5acd5,1fe17a1fd,04ddac2be,cb43ab175,b164b72a7,4,0,1,0,0,...,1,0,0,0,0,0,0,1,0,0


## 1.3 Ordinal features

In [67]:
# Ordinal features
ord_features = pd.Series(X.columns).str.startswith('ord')

# First five observations of the ordinal features
X[X.columns[ord_features]].head()

Unnamed: 0_level_0,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2,Grandmaster,Cold,h,D,kr
1,1,Grandmaster,Hot,a,A,bF
2,1,Expert,Lava Hot,h,R,Jc
3,1,Grandmaster,Boiling Hot,i,D,kW
4,1,Grandmaster,Freezing,a,R,qP


In [68]:
# dictionary to map the feature ord_1
ord_1_dict = {'Novice':0, 'Contributor':1, 'Expert':2, 'Master':3, 'Grandmaster':4}

# Maping the category values in our dict
X['ord_1'] = X['ord_1'].map(ord_1_dict)
X_test['ord_1'] = X_test['ord_1'].map(ord_1_dict)

In [69]:
# dictionary to map the feature ord_2
ord_2_dict = {'Freezing':0, 'Cold':1, 'Warm':2, 'Hot':3, 'Boiling Hot':4, 'Lava Hot':5}

# Maping the category values in our dict
X['ord_2'] = X['ord_2'].map(ord_2_dict)
X_test['ord_2'] = X_test['ord_2'].map(ord_2_dict)

In [70]:
# Converting ord_3 and ord_4 values to their code ascii
X['ord_3'] = X['ord_3'].apply(lambda x : string.ascii_letters.find(x)+1)
X['ord_4'] = X['ord_4'].apply(lambda x : string.ascii_letters.find(x)+1)

X_test['ord_3'] = X_test['ord_3'].apply(lambda x : string.ascii_letters.find(x)+1)
X_test['ord_4'] = X_test['ord_4'].apply(lambda x : string.ascii_letters.find(x)+1)

In [71]:
# Adding the sum of the code ascii of the two letters making up ord_5
X['ord_5_ascii'] = X['ord_5'].apply(lambda x : sum([(string.ascii_letters.find(letter)+1) for letter in x]))
X_test['ord_5_ascii'] = X_test['ord_5'].apply(lambda x : sum([(string.ascii_letters.find(letter)+1) for letter in x]))

# Splitting ord_5 into the alphabet rank of each of the letters making up ord_5
X['ord_5_0_ascii'] = X['ord_5'].apply(lambda x : string.ascii_letters.find(x[0])+1)
X['ord_5_1_ascii'] = X['ord_5'].apply(lambda x : string.ascii_letters.find(x[1])+1)

X_test['ord_5_0_ascii'] = X_test['ord_5'].apply(lambda x : string.ascii_letters.find(x[0])+1)
X_test['ord_5_1_ascii'] = X_test['ord_5'].apply(lambda x : string.ascii_letters.find(x[1])+1)

# Adding alphabetical order of ord_5
ord_5 = sorted(list(set(X['ord_5'].values)))
ord_5 = dict(zip(ord_5, range(len(ord_5))))
X.loc[:, 'ord_5'] = X['ord_5'].apply(lambda x: ord_5[x]).astype(int)
X_test.loc[:, 'ord_5'] = X_test['ord_5'].apply(lambda x: ord_5[x]).astype(int)

In [72]:
# Ordinal features
ord_features = pd.Series(X.columns).str.startswith('ord')

# First five observations of the ordinal features
X[X.columns[ord_features]].head()

Unnamed: 0_level_0,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,ord_5_ascii,ord_5_0_ascii,ord_5_1_ascii
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
0,2,4,1,8,30,136,29,11,18
1,1,4,3,1,27,93,34,2,32
2,1,2,5,8,44,31,39,36,3
3,1,4,4,9,30,134,60,11,49
4,1,4,0,1,44,158,59,17,42


## 1.4 Cyclical features


In [73]:
# Cyclical features
cyc_features = ['day', 'month']

# First five observations of the cyclical features
X[cyc_features].head()

Unnamed: 0_level_0,day,month
id,Unnamed: 1_level_1,Unnamed: 2_level_1
0,2,2
1,7,8
2,7,2
3,2,1
4,7,8


In [74]:
# Adding cos and sin coordinates of the day and month values
X['month_sin'] = np.sin((2*np.pi*X['month'])/12)
X['month_cos'] = np.cos((2*np.pi*X['month'])/12)
X['day_sin'] = np.sin((2*np.pi*X['month'])/7)
X['day_cos'] = np.cos((2*np.pi*X['month'])/7)
#X.drop(cyc_features, axis=1, inplace=True)                    
                       
X_test['month_sin'] = np.sin((2*np.pi*X_test['month'])/12)
X_test['month_cos'] = np.cos((2*np.pi*X_test['month'])/12)
X_test['day_sin'] = np.sin((2*np.pi*X_test['month'])/7)
X_test['day_cos'] = np.cos((2*np.pi*X_test['month'])/7)
#X_test.drop(cyc_features, axis=1, inplace=True) 

In [75]:
# Cyclical features
cyc_features = ['day', 'day_sin', 'day_cos', 'month', 'month_sin', 'month_cos']

# First five observations of the cyclical features
X[cyc_features].head()

Unnamed: 0_level_0,day,day_sin,day_cos,month,month_sin,month_cos
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2,0.974928,-0.222521,2,0.866025,0.5
1,7,0.781831,0.62349,8,-0.866025,-0.5
2,7,0.974928,-0.222521,2,0.866025,0.5
3,2,0.781831,0.62349,1,0.5,0.866025
4,7,0.781831,0.62349,8,-0.866025,-0.5


# 2.Modeling

In [76]:
print("X has",X.shape[1], "columns.")
print("X_test has",X_test.shape[1], "columns.")

X has 46 columns.
X_test has 46 columns.


In [77]:
# Splitting the training data into training and validation data 
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=0)

## 2.1.Logistic Regression

In [78]:
# # Choose the type of classifier. 
# lr_model = Pipeline([("Target_Encoder", TargetEncoder(cols = ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9'])),
#                      ("Scaler", StandardScaler()),
#                      ("LogReg", LogisticRegression(solver="liblinear"))])

# # Choose some parameter combinations to try
# lr_parameters = {'LogReg__C': [0.1, 0.25, 0.5, 1.0, 1.5],
#                  'LogReg__max_iter':[50, 100, 200, 500, 1000],
#                  'LogReg__tol':[0.00001, 0.0001, 0.0002, 0.0005, 0.001]}

# # Run the grid search
# lr_grid_obj = GridSearchCV(lr_model, lr_parameters, cv=5, scoring='roc_auc', n_jobs=3)
# lr_grid_obj = lr_grid_obj.fit(X_train, y_train)

# # Set the rf_model to the best combination of parameters
# lr_model = lr_grid_obj.best_estimator_

# # Show parameters
# lr_model

In [79]:
# Choose the type of classifier. 
lr_model = Pipeline([("Target_Encoder", TargetEncoder(cols = ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9'])),
                     ("Scaler", StandardScaler()),
                     ("LogReg", LogisticRegression(C=1.5, max_iter=50, solver="liblinear", tol=0.001))])

# 5-Fold Cross Validation
lr_cv_results = cross_val_score(lr_model,
                                X_train, y_train,
                                cv=KFold(n_splits=5, shuffle=False, random_state=42), 
                                scoring='roc_auc',
                                n_jobs=3)

In [80]:
# CV results
print("LR CV AUC mean: ", lr_cv_results.mean())
print("LR CV AUC std:  ", lr_cv_results.std())

LR CV AUC mean:  0.7767579103315474
LR CV AUC std:   0.0016620086520674274


In [27]:
#0.7767579103315474

In [81]:
# Validation
lr_model.fit(X_train, y_train)
lr_predictions = lr_model.predict_proba(X_valid)[:,1]
print("LR Validation AUC: ", roc_auc_score(y_valid, lr_predictions))

LR Validation AUC:  0.7825783392156936


## 2.2.Random Forest 

In [82]:
# Choose the type of classifier. 
rf_model = Pipeline([("Target_Encoder", TargetEncoder(cols = ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9'])),
                     ("rf", RandomForestClassifier(n_estimators=100))])



# 5-Fold Cross Validation
rf_cv_results = cross_val_score(rf_model,
                                X_train, y_train,
                                cv=KFold(n_splits=5, shuffle=False, random_state=42), 
                                scoring='roc_auc',
                                n_jobs=3)

In [84]:
# CV results
print("RF CV AUC mean: ", rf_cv_results.mean())
print("RF CV AUC std:  ", rf_cv_results.std())

RF CV AUC mean:  0.7442606744361219
RF CV AUC std:   0.0012855158011247792


In [31]:
#0.7442606744361219

In [85]:
# Validation
rf_model.fit(X_train, y_train)
rf_predictions = rf_model.predict_proba(X_valid)[:,1]
print("RF Validation AUC: ", roc_auc_score(y_valid, rf_predictions))

RF Validation AUC:  0.7498459438790318


## 2.3.Gradient Boosting

In [86]:
# Choose the type of classifier.
gb_model = Pipeline([("Target_Encoder", TargetEncoder(cols = ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9'])),
                     ("gb", GradientBoostingClassifier(n_estimators=100))])

# 5-Fold Cross Validation
gb_cv_results = cross_val_score(gb_model,
                                X_train, y_train,
                                cv=KFold(n_splits=5, shuffle=False, random_state=42), 
                                scoring='roc_auc',
                                n_jobs=3)

In [87]:
# CV results
print("GB CV AUC mean: ", gb_cv_results.mean())
print("GB CV AUC std:  ", gb_cv_results.std())

GB CV AUC mean:  0.750820545030749
GB CV AUC std:   0.002015165108214244


In [35]:
#0.750820545030749

In [88]:
# Validation
gb_model.fit(X_train, y_train)
gb_predictions = gb_model.predict_proba(X_valid)[:,1]
print("GB Validation AUC: ", roc_auc_score(y_valid, gb_predictions))

GB Validation AUC:  0.7598435039881575


## 2.4.Combining models

In [89]:
mean_predictions = np.mean(np.array([lr_predictions, rf_predictions, gb_predictions]), axis=0)
print("Stacked Validation AUC: ", roc_auc_score(y_valid, mean_predictions))                                    

Stacked Validation AUC:  0.7728354658627712


In [38]:
#0.7728354658627712

In [39]:
X.columns

Index(['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4', 'ord_0', 'ord_1', 'ord_2',
       'ord_3', 'ord_4', 'day', 'month', 'nom_1_sides', 'nom_0_Green',
       'nom_0_Red', 'nom_1_Polygon', 'nom_1_Square', 'nom_1_Star',
       'nom_1_Trapezoid', 'nom_1_Triangle', 'nom_2_Cat', 'nom_2_Dog',
       'nom_2_Hamster', 'nom_2_Lion', 'nom_2_Snake', 'nom_3_China',
       'nom_3_Costa Rica', 'nom_3_Finland', 'nom_3_India', 'nom_3_Russia',
       'nom_4_Oboe', 'nom_4_Piano', 'nom_4_Theremin', 'ord_5_ascii',
       'ord_5_0_ascii', 'ord_5_1_ascii', 'month_sin', 'month_cos', 'day_sin',
       'day_cos'],
      dtype='object')

# 3.Submission

In [90]:
lr_model = Pipeline([("Target_Encoder", TargetEncoder(cols = ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9'])),
                     ("Scaler", StandardScaler()),
                     ("LogReg", LogisticRegression(C=1.5, max_iter=50, solver="liblinear", tol=0.001))])
lr_model.fit(X, y)
lr_predictions = lr_model.predict_proba(X_test)[:,1]

# rf_model = Pipeline([("Target_Encoder", TargetEncoder(cols = ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9'])),
#                      ("rf", RandomForestClassifier(n_estimators=100))])
# rf_model.fit(X, y)
# rf_predictions = rf_model.predict_proba(X_test)[:,1]

# gb_model = Pipeline([("Target_Encoder", TargetEncoder(cols = ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9'])),
#                      ("gb", GradientBoostingClassifier(n_estimators=100))])
# gb_model.fit(X, y)
# gb_predictions = gb_model.predict_proba(X_test)[:,1]

# mean_predictions = np.mean(np.array([lr_predictions, rf_predictions, gb_predictions]), axis=0)

In [91]:
output = pd.DataFrame({'id': X_test.index,
                       'target': lr_predictions})

output.to_csv('submission.csv', index=False)