In [1]:
import pandas as pd
data = pd.read_csv("newTrainingDataset.csv")

In [2]:
# Training set 

y = data.click
X = data.iloc[:, 1:49]

In [3]:
# Validation set 

vali = pd.read_csv("newValidationDataset.csv")
X_test = vali.iloc[:, 1:49]
y_true = vali.click

In [None]:
# Optimize decision tree 

from sklearn.linear_model import LogisticRegression 
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
inner_cv = KFold(n_splits=5, shuffle=True)
outer_cv = KFold(n_splits=5, shuffle=True)

gs = GridSearchCV(estimator=DecisionTreeClassifier(random_state=0),
                 param_grid=[{'max_depth': [4, 6, 8, 10, None], 'criterion':['gini','entropy'], 
                              'min_samples_leaf':[1,2,3,4,5],
                              'min_samples_split':[2,3,4,5]}],
                  scoring='neg_log_loss',
                  cv=inner_cv)
gs = gs.fit(X,y)
print("Optimal Parameter: ", gs.best_params_)
print("Optimal Estimator: ", gs.best_estimator_) 

In [23]:
# Performance of DT 

import sklearn 
from sklearn.metrics import log_loss
dt = DecisionTreeClassifier(max_depth=8, criterion='gini', min_samples_leaf = 5, random_state=0)

y_pred = dt.fit(X, y).predict_proba(X_test)
sklearn.metrics.log_loss(y_true, y_pred, normalize=True, sample_weight=None, labels=None)

In [4]:
# Performance of Naive Baynes 

from sklearn.naive_bayes import GaussianNB
import sklearn 
from sklearn.metrics import log_loss

gnb = GaussianNB()
y_pred2 = gnb.fit(X, y).predict_proba(X_test)

sklearn.metrics.log_loss(y_true, y_pred2, normalize=True, sample_weight=None, labels=None)

2.195075728470081

In [5]:
# A small sample of training and validation sets to run code faster 

train_X = X.sample(n=2000000)
train_y = y.sample(n=2000000)
vali_sample = vali.sample(n=1000000)
X_sample = X_test.sample(n=1000000)
y_sample = y_true.sample(n=1000000)

In [None]:
# Optimize SVM

from sklearn.model_selection import GridSearchCV
from sklearn import svm
from sklearn.svm import SVC 

op_sup = [{'kernel': ['rbf','linear','poly','sigmoid'], 'gamma': ['scale', 'auto'],
                     'C': [1, 10, 100, 1000, 10000]}]

sup = GridSearchCV(SVC(random_state=0,probability=True),
                 op_sup,
                  scoring='neg_log_loss')
sup = sup.fit(train_X,train_y)
print("Optimal Parameter: ", sup.best_params_)
print("Optimal Estimator: ", sup.best_estimator_) 

In [49]:
# Performance of SVM

from sklearn import svm
import sklearn 
from sklearn.metrics import log_loss

svc = svm.SVC(probability=True, C=100, gamma='scale', kernel='rbf', random_state=0)
y_pred3 = svc.fit(train_X, train_y).predict_proba(X_sample)

sklearn.metrics.log_loss(y_sample, y_pred3, normalize=True, sample_weight=None, labels=None)

0.45752086747900683

In [50]:
# Fiting DT with the small sample  

from sklearn.tree import DecisionTreeClassifier
gs_new = GridSearchCV(estimator=DecisionTreeClassifier(random_state=0),
                 param_grid=[{'max_depth': [6, 8, 10, None], 'criterion':['gini','entropy'], 
                              'min_samples_leaf':[2,3,4],
                              'min_samples_split':[2,3,4]}],
                  scoring='neg_log_loss')
gs_new = gs_new.fit(train_X,train_y)
print("Optimal Parameter: ", gs_new.best_params_)
print("Optimal Estimator: ", gs_new.best_estimator_) 

Optimal Parameter:  {'criterion': 'entropy', 'max_depth': 6, 'min_samples_leaf': 3, 'min_samples_split': 2}
Optimal Estimator:  DecisionTreeClassifier(criterion='entropy', max_depth=6, min_samples_leaf=3,
                       random_state=0)


In [51]:
# Performance of DT with the samll sample 

dt = DecisionTreeClassifier(max_depth=6, criterion='entropy', min_samples_leaf = 4, random_state=0)
y_pred5 = dt.fit(train_X, train_y).predict_proba(X_sample)

sklearn.metrics.log_loss(y_sample, y_pred5, normalize=True, sample_weight=None, labels=None)

0.6011251125519442

In [None]:
# Optimize Neural Nets 

import numpy as np 
from sklearn.neural_network import MLPClassifier
import sklearn 
from sklearn.metrics import log_loss
from sklearn.model_selection import GridSearchCV

parameters = {'solver': ['lbfgs','sgd', 'adam'], 'max_iter': [500,1000,1500], 'alpha': 10.0 ** -np.arange(1, 7), 
              'hidden_layer_sizes':np.arange(5, 12), 'activation': ['identity', 'logistic', 'tanh', 'relu']}

net = GridSearchCV(MLPClassifier(random_state=1), parameters, n_jobs=-1,
                  scoring='neg_log_loss')

gs_net = net.fit(train_X, train_y)

print("Optimal Estimator: ", gs_net.best_estimator_) 

In [54]:
# Performance of Neural Nets 

net_1 = MLPClassifier(alpha=0.1, hidden_layer_sizes=5, max_iter=500, random_state=1,solver='lbfgs')

y_pred6 = net_1.fit(train_X, train_y).predict_proba(X_sample)

sklearn.metrics.log_loss(y_sample, y_pred6, normalize=True, sample_weight=None, labels=None)

0.45602347354500145