In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
import catboost as cb
from catboost import CatBoostClassifier, Pool
import random 
from tqdm.notebook import tqdm
import sklearn
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn import preprocessing
from sklearn.decomposition import TruncatedSVD

import warnings
warnings.filterwarnings("ignore")

print("pandas:", pd.__version__)
print("numpy:", np.__version__)
print("sklearn:", sklearn.__version__)
print("lightgbm:", lgb.__version__)
print("catboost:", cb.__version__)

pandas: 0.24.2
numpy: 1.18.1
sklearn: 0.19.1
lightgbm: 2.1.1
catboost: 0.21


  from numpy.core.umath_tests import inner1d


In [2]:
train = pd.read_csv('data/TRAIN_PREPARED.csv')
train, val = train.iloc[:150000], train.iloc[150000:]
test  = pd.read_csv('data/TEST_PREPARED.csv')

In [3]:
y = train.label.values
y_val = val.label.values

train = train.drop(['email', 'label'], axis=1)
val  = val.drop(['email', 'label'], axis=1)
test  = test.drop('email', axis=1)

In [4]:
def convert_datetime(data):
    data.first_date = (pd.to_datetime(data.first_date).astype(int) / 10**9).astype(int)
    data.cut_date = (pd.to_datetime(data.cut_date).astype(int) / 10**9).astype(int)
    data.last_date = (pd.to_datetime(data.last_date).astype(int) / 10**9).astype(int)
    
    return data

In [5]:
train = convert_datetime(train)
val = convert_datetime(val)
test = convert_datetime(test)

In [6]:
train = train.fillna(-1)
val = val.fillna(-1)
test = test.fillna(-1)

for f in tqdm(train.columns):
    if train[f].dtype=='object':
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(train[f].values) + 
                list(val[f].values) +
                list(test[f].values))
        
        train[f] = lbl.transform(list(train[f].values))
        val[f] = lbl.transform(list(val[f].values))
        test[f] = lbl.transform(list(test[f].values))

HBox(children=(FloatProgress(value=0.0, max=144.0), HTML(value='')))




In [7]:
cols = list(train.columns)
len(cols)

144

In [8]:
scaler = StandardScaler()
train[cols] = scaler.fit_transform(train[cols])
val[cols] = scaler.transform(val[cols])
test[cols] = scaler.transform(test[cols])

In [9]:
N = 32

svd = TruncatedSVD(n_components=N, random_state=18)
X = svd.fit_transform(train[cols], y)
X_val = svd.transform(val[cols])
X_test = svd.transform(test[cols])

In [10]:
%%time
arch = "random_forest"
train[arch] = 0

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=18)
for i, (train_index, valid_index) in enumerate(skf.split(X, y)):

    X_train = X[train_index]
    X_valid = X[valid_index]

    y_train = y[train_index]
    y_valid = y[valid_index]
    
    clf = RandomForestClassifier(n_estimators=150,
                                 n_jobs=-1).fit(X_train, y_train) 
    
    y_pred = clf.predict_proba(X_valid)[:,1]
    train.loc[valid_index, arch] = y_pred
    print(i, "ROC AUC:", round(roc_auc_score(y_valid, y_pred), 6))

test[arch] = clf.predict_proba(X_test)[:,1]
val[arch]  = clf.predict_proba(X_val)[:, 1]

print("\n FIN ROC AUC:", round(roc_auc_score(y, train[arch]), 6))

0 ROC AUC: 0.927572
1 ROC AUC: 0.929709
2 ROC AUC: 0.921921
4 ROC AUC: 0.929309

 FIN ROC AUC: 0.927503
CPU times: user 43min 53s, sys: 19.7 s, total: 44min 13s
Wall time: 37.3 s


In [11]:
%%time
arch = "lgb"
train[arch] = 0
rounds = 5000
early_stop_rounds = 100

params = {'objective': 'binary',
          'boosting_type': 'gbrt',
          'metric': 'auc',
          'seed': 18,
          'max_depth': 7,
          'leraning_rate': 0.05,
          'bagging_freq': 3,
          'bagging_fraction': 0.7,
          'bagging_seed': 18,
          'verbose': -1,
          'n_jobs': -1}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=18)
for i, (train_index, valid_index) in enumerate(skf.split(X, y)):
    
    X_train = X[train_index]
    X_valid = X[valid_index]

    y_train = y[train_index]
    y_valid = y[valid_index]

    d_train = lgb.Dataset(X_train, y_train)
    d_valid = lgb.Dataset(X_valid, y_valid)    

    model = lgb.train(params,
                      d_train,
                      num_boost_round=rounds,
                      valid_sets=[d_train, d_valid],
                      valid_names=['train','valid'],
                      early_stopping_rounds=early_stop_rounds,
                      verbose_eval=200) 

    y_pred = model.predict(X_valid)
    train.loc[valid_index, arch] = y_pred
    print(i, "ROC AUC:", round(roc_auc_score(y_valid, y_pred), 6))

test[arch] = model.predict(X_test)
val[arch]  = model.predict(X_val)

print("\n FIN ROC AUC:", round(roc_auc_score(y, train[arch]), 6))

Training until validation scores don't improve for 100 rounds.
[200]	train's auc: 0.81687	valid's auc: 0.745551
[400]	train's auc: 0.881366	valid's auc: 0.777594
[600]	train's auc: 0.921666	valid's auc: 0.798839
[800]	train's auc: 0.948915	valid's auc: 0.816327
[1000]	train's auc: 0.966221	valid's auc: 0.83031
[1200]	train's auc: 0.977716	valid's auc: 0.840318
[1400]	train's auc: 0.985327	valid's auc: 0.848883
[1600]	train's auc: 0.990555	valid's auc: 0.856857
[1800]	train's auc: 0.993961	valid's auc: 0.863083
[2000]	train's auc: 0.996067	valid's auc: 0.867708
[2200]	train's auc: 0.997479	valid's auc: 0.872019
[2400]	train's auc: 0.998459	valid's auc: 0.876237
[2600]	train's auc: 0.99907	valid's auc: 0.880007
[2800]	train's auc: 0.999443	valid's auc: 0.883449
[3000]	train's auc: 0.999673	valid's auc: 0.88611
[3200]	train's auc: 0.999808	valid's auc: 0.888681
[3400]	train's auc: 0.99989	valid's auc: 0.890813
[3600]	train's auc: 0.999937	valid's auc: 0.893125
[3800]	train's auc: 0.999963

In [12]:
%%time
arch = "cat"
train[arch] = 0
iters = 5000
early_stop_rounds = 100

params = {'iterations': iters,
          'loss_function': 'Logloss',
          'eval_metric':'AUC',
          'random_seed': 18,
          'learning_rate': 0.05,
          'depth': 8}

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=18)
for i, (train_index, valid_index) in enumerate(skf.split(X, y)):
    
    X_train = X[train_index]
    X_valid = X[valid_index]

    y_train = y[train_index]
    y_valid = y[valid_index]
    
    trn_data = Pool(X_train, y_train)
    val_data = Pool(X_valid, y_valid)
    
    clf = CatBoostClassifier(**params)
    clf.fit(trn_data,
            eval_set=val_data,
            use_best_model=True,
            early_stopping_rounds=early_stop_rounds,
            verbose=0)
    
    y_pred = clf.predict_proba(X_valid)[:, 1]
    train.loc[valid_index, arch] = y_pred
    print(i, "ROC AUC:", round(roc_auc_score(y_valid, y_pred), 6))

test[arch] = clf.predict_proba(X_test)[:,1]
val[arch]  = clf.predict_proba(X_val)[:, 1]

print("\n FIN ROC AUC:", round(roc_auc_score(y, train[arch]), 6))


0 ROC AUC: 0.90691
1 ROC AUC: 0.908611
2 ROC AUC: 0.90018
3 ROC AUC: 0.907115
4 ROC AUC: 0.906808

 FIN ROC AUC: 0.905937
CPU times: user 6d 14h 22min 46s, sys: 28min 49s, total: 6d 14h 51min 35s
Wall time: 1h 57min 47s


In [13]:
models = ["cat", "random_forest", "lgb"]

train[models].corr()

Unnamed: 0,cat,random_forest,lgb
cat,1.0,0.893267,0.928884
random_forest,0.893267,1.0,0.889688
lgb,0.928884,0.889688,1.0


In [14]:
%%time
arch = "stack"
train[arch] = 0

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=18)
for i, (train_index, valid_index) in enumerate(skf.split(X, y)):
    
    X_train = train.loc[train_index, models]
    X_valid = train.loc[valid_index, models]

    y_train = y[train_index]
    y_valid = y[valid_index]
    
    reg = LogisticRegression(C=1,
                             solver="newton-cg", 
                             penalty="l2", 
                             n_jobs=-1, 
                             max_iter=100).fit(X_train, y_train) 
    
    y_pred = reg.predict_proba(X_valid)[:,1]
    train.loc[valid_index, arch] = y_pred
    print(i, "ROC AUC:", round(roc_auc_score(y_valid, y_pred), 5))
    
test[arch] = reg.predict_proba(test[models])[:, 1]
val[arch]  = reg.predict_proba(val[models])[:, 1]

print("\n FIN ROC AUC:", round(roc_auc_score(y, train[arch]), 5))

0 ROC AUC: 0.9322
1 ROC AUC: 0.93371
2 ROC AUC: 0.92592
3 ROC AUC: 0.93314
4 ROC AUC: 0.93312

 FIN ROC AUC: 0.9316
CPU times: user 8.11 s, sys: 1min 1s, total: 1min 9s
Wall time: 38.3 s


In [15]:
print('model coefs:', reg.coef_[0] / np.sum(reg.coef_[0]) * 100)

model coefs: [11.40791107 78.05467384 10.53741509]


In [16]:
print('val score:', round(roc_auc_score(y_val, val['stack']), 6))

val score: 0.931511


In [17]:
submit = pd.read_csv('data/test_submit_example.csv')
submit['prediction'] = test['stack']
submit.head()

Unnamed: 0,email,prediction
0,e98a93bfc7,0.121071
1,d813ccb3f5,0.653664
2,3d07a90a91,0.352418
3,5791e966d8,0.192841
4,d982336d49,0.185631


In [18]:
submit.to_csv('data/submission.csv')