In [1]:
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import SGDClassifier
import xgboost as xgb

In [2]:
df = pd.read_csv("preprocessed2.csv")
df.shape

(404287, 810)

In [3]:
y = df['is_duplicate'].values

df = df.drop(['id','is_duplicate'], axis=1)

x = df.values

In [4]:
print(x.shape)

print(y.shape)

(404287, 808)
(404287,)


## Train test split

In [5]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size = 0.2, random_state = 0)

xtrain, xcv, ytrain, ycv = train_test_split(xtrain, ytrain, test_size = 0.2, random_state = 0)

In [6]:
print("shape of training features:", xtrain.shape)
print("shape of training labels:", ytrain.shape)
print("shape of cross-validation features:", xcv.shape)
print("shape of cross-validation labels:", ycv.shape)
print("shape of test features:", xtest.shape)
print("shape of test labels:", ytest.shape)

shape of training features: (258743, 808)
shape of training labels: (258743,)
shape of cross-validation features: (64686, 808)
shape of cross-validation labels: (64686,)
shape of test features: (80858, 808)
shape of test labels: (80858,)


## random model

In [7]:
y_pred = np.zeros([80858,2])
for i in range(80858):
    x = np.random.rand(1,2)
    x = x/x.sum()
    y_pred[i,:] = x

In [8]:
print("log loss of test data is:",log_loss(ytest,y_pred))

log loss of test data is: 0.8906748995311722


In [9]:
y_pred_cv = np.zeros([64686,2])
for i in range(64686):
    x = np.random.rand(1,2)
    x = x/x.sum()
    y_pred_cv[i,:] = x

In [10]:
print("log loss of cross_validation data is:",log_loss(ycv,y_pred_cv))

log loss of cross_validation data is: 0.8829634963221697


## Logistic Regression

In [11]:
alpha = [10 ** x for x in range(-5, 5)]

errors = []

for i in alpha:
    lr = SGDClassifier(alpha=i, penalty='l2', loss='log', random_state=0)
    lr.fit(xtrain, ytrain)
    
    cc = CalibratedClassifierCV(lr, method="sigmoid")
    cc.fit(xtrain, ytrain)
    
    y_pred_cv = cc.predict_proba(xcv)
    
    errors.append(log_loss(ycv, y_pred_cv, labels=lr.classes_, eps=1e-15))
    
    print('For values of alpha = ', i, "The log loss is:",log_loss(ycv, y_pred_cv, labels=lr.classes_, eps=1e-15))

For values of alpha =  1e-05 The log loss is: 0.6577730802676728
For values of alpha =  0.0001 The log loss is: 0.6577730802676728
For values of alpha =  0.001 The log loss is: 0.6577730802676728
For values of alpha =  0.01 The log loss is: 0.6577730802676728
For values of alpha =  0.1 The log loss is: 0.656342052086407
For values of alpha =  1 The log loss is: 0.6537536594247011
For values of alpha =  10 The log loss is: 0.6536525827033991
For values of alpha =  100 The log loss is: 0.6494018573169754
For values of alpha =  1000 The log loss is: 0.648741290983299
For values of alpha =  10000 The log loss is: 0.6496127181385953


In [12]:
lr = SGDClassifier(alpha=1000, penalty='l2', loss='log', random_state=0)
lr.fit(xtrain, ytrain)

cc = CalibratedClassifierCV(lr, method="sigmoid")
cc.fit(xtrain, ytrain)

y_pred_cv = cc.predict_proba(xcv)

y_pred = cc.predict_proba(xtest)

print("log loss on cross validation data:",log_loss(ycv, y_pred_cv, labels=lr.classes_, eps=1e-15))

print("log loss on test data:",log_loss(ytest, y_pred, labels=lr.classes_, eps=1e-15))

log loss on cross validation data: 0.648741290983299
log loss on test data: 0.6485601494195274


## Linear SVM

In [13]:
alpha = [10 ** x for x in range(-5, 5)]

errors = []

for i in alpha:
    lrsvm = SGDClassifier(alpha=i, penalty='l1', loss='hinge', random_state=42)
    lrsvm.fit(xtrain, ytrain)
    
    cc = CalibratedClassifierCV(lrsvm, method="sigmoid")
    cc.fit(xtrain, ytrain)
    
    y_pred_cv = cc.predict_proba(xcv)
    
    errors.append(log_loss(ycv, y_pred_cv, labels = lrsvm.classes_, eps=1e-15))
    
    print('For values of alpha = ', i, "The log loss is:",log_loss(ycv, y_pred_cv, labels = lrsvm.classes_, eps=1e-15))

For values of alpha =  1e-05 The log loss is: 0.6577730802676728
For values of alpha =  0.0001 The log loss is: 0.6577730802676728
For values of alpha =  0.001 The log loss is: 0.6577730802676728
For values of alpha =  0.01 The log loss is: 0.5211240195353466
For values of alpha =  0.1 The log loss is: 0.5381584493759366
For values of alpha =  1 The log loss is: 0.5852859436496413
For values of alpha =  10 The log loss is: 0.6542062255030273
For values of alpha =  100 The log loss is: 0.6577730604935831
For values of alpha =  1000 The log loss is: 0.65442382242152
For values of alpha =  10000 The log loss is: 0.6544238224231584


In [14]:
lrsvm = SGDClassifier(alpha=0.01, penalty='l1', loss='hinge', random_state=42)
lrsvm.fit(xtrain, ytrain)

cc = CalibratedClassifierCV(lrsvm, method="sigmoid")
cc.fit(xtrain, ytrain)

y_pred_cv = cc.predict_proba(xcv)

y_pred = cc.predict_proba(xtest)

print("log loss on cross validation data:",log_loss(ycv, y_pred_cv, labels = lrsvm.classes_, eps=1e-15))

print("log loss on test data:",log_loss(ytest, y_pred, labels = lrsvm.classes_, eps=1e-15))

log loss on cross validation data: 0.5211240195353466
log loss on test data: 0.5176347880715397


## XG Boost

In [15]:
params = {}
params['objective'] = 'binary:logistic'
params['eval_metric'] = 'logloss'
params['eta'] = 0.02
params['max_depth'] = 4

d_train = xgb.DMatrix(xtrain, label = ytrain)
d_cv = xgb.DMatrix(xcv, label = ycv)
d_test = xgb.DMatrix(xtest, label = ytest)

watchlist = [(d_train, 'train'), (d_cv, 'valid'), (d_test, 'test')]

bst = xgb.train(params, d_train, 400, watchlist, early_stopping_rounds=20, verbose_eval=10)

xgdmat = xgb.DMatrix(xtrain, ytrain)

y_pred_cv = bst.predict(d_cv)

y_pred = bst.predict(d_test)

print("The cross validation log loss is:",log_loss(ycv, y_pred_cv, labels=lr.classes_, eps=1e-15))
print("The test log loss is:",log_loss(ytest, y_pred, labels=lr.classes_, eps=1e-15))

[15:59:33] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[0]	train-logloss:0.685336	valid-logloss:0.6853	test-logloss:0.685335
Multiple eval metrics have been passed: 'test-logloss' will be used for early stopping.

Will train until test-logloss hasn't improved in 20 rounds.
[15:59:43] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[15:59:52] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:00:02] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:00:12] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:00:22] d:\build\xgboost\xgboost-0.81.git\src\tr

[16:09:04] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:09:14] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:09:24] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:09:34] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:09:44] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:09:55] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:10:05] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 prun

[16:18:44] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:18:56] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:19:08] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:19:20] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[110]	train-logloss:0.424998	valid-logloss:0.424766	test-logloss:0.424367
[16:19:31] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:19:41] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:19:52] d:\build\xgboost\xgboost-0.81.git\src\t

[160]	train-logloss:0.405961	valid-logloss:0.40606	test-logloss:0.405314
[16:29:42] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:29:50] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:29:57] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:30:05] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:30:13] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:30:20] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:30:28] d:\build\xgboost\xgboost-0.81.git\src\tr

[16:38:21] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:38:29] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:38:38] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:38:46] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:38:54] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:39:03] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[220]	train-logloss:0.395068	valid-logloss:0.395807	test-logloss:0.394557
[16:39:11] d:\build\xgboost\xgboost-0.81.git\src\t

[16:46:06] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 28 extra nodes, 0 pruned nodes, max_depth=4
[16:46:16] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[270]	train-logloss:0.389753	valid-logloss:0.390866	test-logloss:0.389476
[16:46:24] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:46:35] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:46:43] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:46:52] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:47:01] d:\build\xgboost\xgboost-0.81.git\src\t

[16:53:59] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:54:06] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:54:13] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 28 extra nodes, 0 pruned nodes, max_depth=4
[16:54:20] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 28 extra nodes, 0 pruned nodes, max_depth=4
[16:54:27] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[16:54:34] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 28 extra nodes, 0 pruned nodes, max_depth=4
[16:54:42] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 28 extra nodes, 0 prun

[17:01:00] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[17:01:07] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[17:01:15] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[17:01:23] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[380]	train-logloss:0.381876	valid-logloss:0.384075	test-logloss:0.382391
[17:01:30] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[17:01:37] d:\build\xgboost\xgboost-0.81.git\src\tree\updater_prune.cc:74: tree pruning end, 1 roots, 30 extra nodes, 0 pruned nodes, max_depth=4
[17:01:45] d:\build\xgboost\xgboost-0.81.git\src\t