## Loading Data

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X = np.genfromtxt('X_train.txt', delimiter=None)
Y = np.genfromtxt('Y_train.txt', delimiter=None)
Xte = np.genfromtxt('X_test.txt', delimiter=None)

Xtr, Xva, Ytr, Yva = train_test_split(X, Y, test_size = 0.1, random_state=0)
XtrP = StandardScaler().fit_transform(Xtr)
XvaP = StandardScaler().fit_transform(Xva)

print(Xtr.shape, Xva.shape, Ytr.shape, Yva.shape)

(180000, 14) (20000, 14) (180000,) (20000,)


## KNN

In [2]:
from sklearn.neighbors import KNeighborsClassifier

In [3]:
correlations = dict([])
for i in range(14):
    for j in range(i+1, 14):
        correlations[np.corrcoef(XtrP[:,i], XtrP[:,j])[0][1]] = str(i) + ',' + str(j)
print('min correlation of', min(correlations), 'at', correlations[min(correlations)])
print('max correlation of', max(correlations), 'at', correlations[max(correlations)])

min correlation of -0.9692805658978799 at 3,13
max correlation of 0.9456425468737583 at 0,3


In [4]:
XtrKNN = XtrP[:, 0]
XvaKNN = XvaP[:, 0]
for i in range(1, 14):
    if i not in [3, 13]:
        XtrKNN = np.column_stack((XtrKNN, XtrP[:,i]))
        XvaKNN = np.column_stack((XvaKNN, XvaP[:,i]))
XtrKNN = np.column_stack((XtrKNN, np.multiply(-XtrP[:,3], XtrP[:,13])))
XvaKNN = np.column_stack((XvaKNN, np.multiply(-XvaP[:,3], XvaP[:,13])))

In [5]:
# for n_neighbors in [7,8,9,10,11,12,13,14,15]:
#     neigh = KNeighborsClassifier(n_neighbors)
#     neigh.fit(XtrKNN, Ytr)
#     print(n_neighbors, metrics.roc_auc_score(Yva,neigh.predict_proba(XvaKNN)[:,1]))

In [6]:
neigh = KNeighborsClassifier(10)
neigh.fit(XtrKNN, Ytr)
print('Training AUC', metrics.roc_auc_score(Ytr,neigh.predict_proba(XtrKNN)[:,1]))
print('Validation AUC', metrics.roc_auc_score(Yva,neigh.predict_proba(XvaKNN)[:,1]))

Training AUC 0.8269270001007643
Validation AUC 0.731799663583667


## Linear Model

In [7]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import PolynomialFeatures

In [8]:
XtrPoly = PolynomialFeatures(degree=2).fit_transform(XtrP)
XvaPoly = PolynomialFeatures(degree=2).fit_transform(XvaP)

print(XtrPoly.shape, XvaP.shape)

(180000, 120) (20000, 14)


In [11]:
lr = LogisticRegression(tol=1e-4, max_iter=5000)
lr.fit(XtrPoly, Ytr)

LogisticRegression(max_iter=5000)

In [12]:
print("Training AUC:", metrics.roc_auc_score(Ytr, lr.predict_proba(XtrPoly)[:,1]))
print("Validation AUC:", metrics.roc_auc_score(Yva, lr.predict_proba(XvaPoly)[:,1]))

Training AUC: 0.6811936251633213
Validation AUC: 0.6818634463949991


## Random Forests

In [6]:
from sklearn.ensemble import RandomForestClassifier

In [7]:
rf = RandomForestClassifier(n_estimators=500, max_depth=35, min_samples_leaf=4, min_samples_split = 4, max_features=4)
rf.fit(XtrP, Ytr)

RandomForestClassifier(max_depth=35, max_features=4, min_samples_leaf=4,
                       min_samples_split=4, n_estimators=500)

In [8]:
print("Training AUC:", metrics.roc_auc_score(Ytr, rf.predict_proba(XtrP)[:,1]))
print("Validation AUC:", metrics.roc_auc_score(Yva, rf.predict_proba(XvaP)[:,1]))

Training AUC: 0.952404273127158
Validation AUC: 0.7871700963858093


    100 estimators, max_depth=30: 0.7556847954460596
    100 estimators, max_depth=30, min_samples_split=4: 0.7676214147735156
    100 estimators, max_depth=30, min_samples_leaf=4: 0.7796876858950488
    100 estimators, min_samples_leaf=3, min_samples_split = 3, max_features=2,0.7796945078531264
    100 estimators, min_samples_leaf=3, min_samples_split = 3, max_features=3,0.7837683270635156
    100 estimators, min_samples_leaf=4, min_samples_split = 4, max_features=4,0.7850483251095345
    100 estimators, min_samples_leaf=5, min_samples_split = 5, max_features=4,0.7843246789470857
    100 estimators, min_samples_leaf=5, min_samples_split = 5, max_features=5,0.7836942830214536


## Boosted Learners

In [17]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from lightgbm import LGBMClassifier

In [21]:
# num_leaves = [3300, 3650, 4000, 4300, 4650]
# min_child_samples = [1, 2, 4]
# for n in num_leaves:
#     for m in min_child_samples:
#         lgbm = LGBMClassifier(num_leaves=n, learning_rate=0.01, min_child_samples=m, n_estimators=200)
#         lgbm.fit(XtrP, Ytr)
#         print(n,m,"Validation AUC:", metrics.roc_auc_score(Yva, lgbm.predict_proba(XvaP)[:,1]))

# max_depths = [50, 75, 100, 125]
# min_child_samples = [1, 2, 4]
# for n in num_leaves:
#     for d in max_depths:
#         for m in min_child_samples:
#             lgbm = LGBMClassifier(num_leaves=n, max_depth=d, learning_rate=0.01, min_child_samples=m, n_estimators=200)
#             lgbm.fit(XtrP, Ytr)
#             print(n,d,m,"Validation AUC:", metrics.roc_auc_score(Yva, lgbm.predict_proba(XvaP)[:,1]))

4000 50 1 Validation AUC: 0.7846730722253169
4000 50 2 Validation AUC: 0.784184030568597
4000 50 4 Validation AUC: 0.7833061076928735
4000 75 1 Validation AUC: 0.7846730722253169
4000 75 2 Validation AUC: 0.7841835780281874
4000 75 4 Validation AUC: 0.783529839256322
4000 100 1 Validation AUC: 0.7846730722253169
4000 100 2 Validation AUC: 0.7841835780281874
4000 100 4 Validation AUC: 0.783529839256322
4000 125 1 Validation AUC: 0.7846730722253169
4000 125 2 Validation AUC: 0.7841835780281874
4000 125 4 Validation AUC: 0.783529839256322
4300 50 1 Validation AUC: 0.7840812983768457
4300 50 2 Validation AUC: 0.7839019212923176
4300 50 4 Validation AUC: 0.7832117971677681
4300 75 1 Validation AUC: 0.7840812983768457
4300 75 2 Validation AUC: 0.7839004753705214
4300 75 4 Validation AUC: 0.78312128908586
4300 100 1 Validation AUC: 0.7840812983768457
4300 100 2 Validation AUC: 0.7839004753705214
4300 100 4 Validation AUC: 0.78312128908586
4300 125 1 Validation AUC: 0.7840812983768457
4300 125

In [18]:
lgbm = LGBMClassifier(num_leaves=1000, learning_rate=0.01, min_child_samples=1, n_estimators=2000)
lgbm.fit(XtrP, Ytr)
print("Training AUC:", metrics.roc_auc_score(Ytr, lgbm.predict_proba(XtrP)[:,1]))
print("Validation AUC:", metrics.roc_auc_score(Yva, lgbm.predict_proba(XvaP)[:,1]))

Training AUC: 0.9771825304995541
Validation AUC: 0.7982248307730657


    num_leaves=4000, min_child_samples=1: 0.9822267654079306, 0.7880581959019618


## Neural Networks

In [22]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense

n_features = XtrP.shape[1]
# define model
nn = Sequential()
nn.add(Dense(500, activation='relu', kernel_initializer='he_normal', input_shape=(n_features,)))
nn.add(Dense(100, activation='relu', kernel_initializer='he_normal'))
nn.add(Dense(10, activation='relu', kernel_initializer='he_normal'))
nn.add(Dense(1, activation='sigmoid'))
# compile the model
nn.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# fit the model
nn.fit(XtrP, Ytr, epochs=200, batch_size=128, verbose=2)

Epoch 1/200
1407/1407 - 2s - loss: 0.5997 - accuracy: 0.6909
Epoch 2/200
1407/1407 - 2s - loss: 0.5898 - accuracy: 0.6976
Epoch 3/200
1407/1407 - 2s - loss: 0.5843 - accuracy: 0.7017
Epoch 4/200
1407/1407 - 2s - loss: 0.5807 - accuracy: 0.7038
Epoch 5/200
1407/1407 - 2s - loss: 0.5778 - accuracy: 0.7065
Epoch 6/200
1407/1407 - 2s - loss: 0.5748 - accuracy: 0.7076
Epoch 7/200
1407/1407 - 2s - loss: 0.5723 - accuracy: 0.7093
Epoch 8/200
1407/1407 - 2s - loss: 0.5700 - accuracy: 0.7109
Epoch 9/200
1407/1407 - 2s - loss: 0.5675 - accuracy: 0.7125
Epoch 10/200
1407/1407 - 2s - loss: 0.5654 - accuracy: 0.7139
Epoch 11/200
1407/1407 - 2s - loss: 0.5629 - accuracy: 0.7160
Epoch 12/200
1407/1407 - 2s - loss: 0.5609 - accuracy: 0.7168
Epoch 13/200
1407/1407 - 2s - loss: 0.5593 - accuracy: 0.7175
Epoch 14/200
1407/1407 - 2s - loss: 0.5572 - accuracy: 0.7192
Epoch 15/200
1407/1407 - 2s - loss: 0.5556 - accuracy: 0.7201
Epoch 16/200
1407/1407 - 2s - loss: 0.5538 - accuracy: 0.7216
Epoch 17/200
1407

Epoch 133/200
1407/1407 - 2s - loss: 0.4733 - accuracy: 0.7674
Epoch 134/200
1407/1407 - 2s - loss: 0.4728 - accuracy: 0.7679
Epoch 135/200
1407/1407 - 2s - loss: 0.4723 - accuracy: 0.7681
Epoch 136/200
1407/1407 - 2s - loss: 0.4720 - accuracy: 0.7689
Epoch 137/200
1407/1407 - 2s - loss: 0.4715 - accuracy: 0.7682
Epoch 138/200
1407/1407 - 2s - loss: 0.4718 - accuracy: 0.7678
Epoch 139/200
1407/1407 - 2s - loss: 0.4709 - accuracy: 0.7686
Epoch 140/200
1407/1407 - 2s - loss: 0.4711 - accuracy: 0.7681
Epoch 141/200
1407/1407 - 2s - loss: 0.4698 - accuracy: 0.7694
Epoch 142/200
1407/1407 - 2s - loss: 0.4698 - accuracy: 0.7688
Epoch 143/200
1407/1407 - 2s - loss: 0.4692 - accuracy: 0.7689
Epoch 144/200
1407/1407 - 2s - loss: 0.4698 - accuracy: 0.7692
Epoch 145/200
1407/1407 - 2s - loss: 0.4685 - accuracy: 0.7687
Epoch 146/200
1407/1407 - 2s - loss: 0.4688 - accuracy: 0.7695
Epoch 147/200
1407/1407 - 2s - loss: 0.4682 - accuracy: 0.7695
Epoch 148/200
1407/1407 - 2s - loss: 0.4681 - accuracy:

<tensorflow.python.keras.callbacks.History at 0x23a0ec71988>

In [23]:
print("Training AUC:", metrics.roc_auc_score(Ytr, nn.predict_proba(XtrP)))
print("Validation AUC:", metrics.roc_auc_score(Yva, nn.predict_proba(XvaP)))



Training AUC: 0.8480251247055066




Validation AUC: 0.7535464322575043


## Ensemble

In [40]:
abc_pred = abc.predict_proba(XvaP)[:,1]
rf_pred = rf.predict_proba(XvaP)[:,1]
neigh_pred = neigh.predict_proba(XvaKNN)[:,1]
nn_pred = nn.predict_proba(XvaP)

XvaStack = np.column_stack((abc_pred, rf_pred, neigh_pred, nn_pred))



In [35]:
print("Validation AUC lgbm:", metrics.roc_auc_score(Yva, lgbm_pred))
print("Validation AUC rf:", metrics.roc_auc_score(Yva, rf_pred))
print("Validation AUC neigh:", metrics.roc_auc_score(Yva, neigh_pred))
print("Validation AUC nn:", metrics.roc_auc_score(Yva, nn_pred))
print("Validation AUC:", metrics.roc_auc_score(Yva, stacked.predict_proba(XvaStack)[:,1]))

Validation AUC lgbm: 0.7982248307730657
Validation AUC rf: 0.7871700963858093
Validation AUC neigh: 0.7353135074128989
Validation AUC nn: 0.7535464322575043
Validation AUC: 0.7999517194570875


In [29]:
lgbm_pred = lgbm.predict_proba(XvaP)[:,1]
rf_pred = rf.predict_proba(XvaP)[:,1]
neigh_pred = neigh.predict_proba(XvaKNN)[:,1]
nn_pred = nn.predict_proba(XvaP)



In [33]:
XvaStack = np.column_stack((lgbm_pred, rf_pred, neigh_pred, nn_pred))
# for c in [.1, 1, 10, 100, 1000, 10000, 1000000]:
#     stacked = LogisticRegression(max_iter=1000, C=c)
#     stacked.fit(XvaStack, Yva)
#     print(c, "Validation AUC:", metrics.roc_auc_score(Yva, stacked.predict_proba(XvaStack)[:,1]))

0.1 Validation AUC: 0.7994593444539362
1 Validation AUC: 0.7998947435157693
10 Validation AUC: 0.7999470616021406
100 Validation AUC: 0.7999513221045328
1000 Validation AUC: 0.7999516642692327
10000 Validation AUC: 0.7999517194570874
1000000 Validation AUC: 0.7999517194570875


In [41]:
stacked = LogisticRegression(max_iter=1000, C=100000)
stacked.fit(XvaStack, Yva)

LogisticRegression(C=100000, max_iter=1000)

## Submission

In [42]:
Xte = np.genfromtxt('X_test.txt', delimiter=None)
XteP, params = ml.rescale(Xte)
XteKNN = XteP[:, 0]
for i in range(1, 14):
    if i not in [3, 13]:
        XteKNN = np.column_stack((XteKNN, XteP[:,i]))
XteKNN = np.column_stack((XteKNN, np.multiply(-XteP[:,3], XteP[:,13])))

In [43]:
lgbm_pred = lgbm.predict_proba(XteP)[:,1]
rf_pred = rf.predict_proba(XteP)[:,1]
neigh_pred = neigh.predict_proba(XteKNN)[:,1]
nn_pred = nn.predict_proba(XteP)

XteStack = np.column_stack((lgbm_pred, rf_pred, neigh_pred, nn_pred))



In [44]:
Yte = stacked.predict_proba(XteStack)[:,1]
Y_sub = np.vstack([np.arange(Xte.shape[0]), Yte]).T
np.savetxt('Y_submit.txt',Y_sub,'%d,%.10f',header='ID,Prob1',comments='',delimiter=',')

In [52]:
XtrP.shape

AttributeError: 'StandardScaler' object has no attribute 'shape'

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=0d2657fc-2bca-4e81-a161-275a5e7f5cf6' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>