In [1]:
# Import libraries and set desired options
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from scipy import sparse, stats
from scipy.linalg import svd
import umap
from sklearn import preprocessing
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import (KFold, StratifiedKFold, cross_val_score,
                                     cross_validate, train_test_split)
from tqdm import tqdm
from lightgbm import LGBMClassifier
from code.cross_validation import *
from code.read_data import *
from code.feature_engineering import *

In [2]:
pd.set_option('display.max_columns', None) 
pd.set_option('display.max_rows', None)
sns.set()
import warnings
warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

# Read data sets

In [3]:
# read_data() is a project helper; its seven return values are unpacked here
# (three feature tables, the label table, and three test-side tables by name).
X1, X2, X3, Y, X1_test, X2_test, X3_test = read_data()

# Label columns '1'..'5' become 'target_1'..'target_5'.
target_column_names = {str(i): f'target_{i}' for i in range(1, 6)}
Y = Y.rename(columns=target_column_names)

In [4]:
%%time
# Feature pipeline. The helpers below are presumably provided by the
# star-imported project modules; their internals are not visible in this notebook.
# Recorded run: about 6 minutes wall time, printing 'umap...' and 'tsne...'.

# Build one feature table for the train side and one for the test side.
X_train = agg_and_merge(X1, X2, X3)
X_test = agg_and_merge(X1_test, X2_test, X3_test)
# NOTE(review): both frames are passed together — presumably so train and test
# end up with identical feature columns; confirm in the helper.
X_train, X_test = eng(X_train, X_test)
# 'add_emedding_features' (sic) is the helper's actual name; seeded for reproducibility.
X_train, X_test = add_emedding_features(X_train, X_test, random_state=42)
# Normalized versions are kept under separate names; the logistic-regression
# cells use these, the tree-based models use the un-normalized frames.
X_train_norm, X_test_norm = normalize(X_train, X_test)

umap...
tsne...
Wall time: 5min 58s


# Simple models

In [5]:
# Names of the five label columns.
targets = [f'target_{i}' for i in range(1, 6)]

# Attach the labels to the raw and to the normalized feature tables by 'id'.
merged = X_train.merge(Y, on='id')
logreg_merged = X_train_norm.merge(Y, on='id')
assert len(logreg_merged) == len(merged)

# Filled in by the tuning cells: '<target>_<model>' -> estimator / best CV ROC AUC.
final_models, final_models_roc_auc = {}, {}

## Logistic regression

In [6]:
# Candidate values of C (inverse regularization strength), ten per target.
logreg_C_grids = {
    'target_1': np.linspace(0.01, 0.1, 10),
    'target_2': np.linspace(0.01, 0.1, 10),
    'target_3': np.linspace(0.001, 0.01, 10),
    'target_4': np.linspace(0.005, 0.1, 10),
    'target_5': np.linspace(0.001, 0.01, 10),
}

for target in targets:
    print(target)
    candidate_Cs = logreg_C_grids[target]
    # CV_metrics is a project helper; judging by the recorded output it prints
    # per-fold ROC AUC and returns the mean.
    cv_scores = [
        CV_metrics(
            LogisticRegression(C=C, class_weight='balanced', random_state=42, n_jobs=4),
            X_train_norm,
            Y[target].values,
        )
        for C in candidate_Cs
    ]
    print(cv_scores)

    # First candidate reaching the highest mean score wins.
    best_score = max(cv_scores)
    best_idx = cv_scores.index(best_score)
    print(f'index: {best_idx}')

    # Store an unfitted model with the winning C; it is fitted in the final-model cell.
    final_models[f'{target}_logreg'] = LogisticRegression(
        C=candidate_Cs[best_idx],
        class_weight='balanced',
        random_state=42,
        n_jobs=-1,
    )
    final_models_roc_auc[f'{target}_logreg'] = best_score

target_1
roc_auc  avg:  0.536   ['0.510', '0.532', '0.566']
roc_auc  avg:  0.539   ['0.512', '0.534', '0.571']
roc_auc  avg:  0.540   ['0.512', '0.534', '0.572']
roc_auc  avg:  0.540   ['0.513', '0.534', '0.573']
roc_auc  avg:  0.540   ['0.514', '0.534', '0.573']
roc_auc  avg:  0.540   ['0.514', '0.534', '0.572']
roc_auc  avg:  0.540   ['0.514', '0.534', '0.572']
roc_auc  avg:  0.540   ['0.514', '0.534', '0.572']
roc_auc  avg:  0.540   ['0.514', '0.533', '0.573']
roc_auc  avg:  0.540   ['0.514', '0.533', '0.572']
[0.5360431976947645, 0.5391695738729793, 0.5395739764670948, 0.5400198267040781, 0.5404247228702622, 0.5401109275061202, 0.5400452623726011, 0.5400460709906711, 0.5399626792688123, 0.5397728917388904]
index: 4
target_2
roc_auc  avg:  0.567   ['0.567', '0.575', '0.559']
roc_auc  avg:  0.569   ['0.569', '0.577', '0.563']
roc_auc  avg:  0.570   ['0.568', '0.577', '0.564']
roc_auc  avg:  0.570   ['0.568', '0.578', '0.565']
roc_auc  avg:  0.571   ['0.568', '0.578', '0.566']
roc_auc

## Random forest

In [7]:
# Three candidate RandomForest settings per target: targets 1-3 vary the tree
# depth at a fixed forest size, targets 4-5 vary the forest size at depth 3.
rf_param_grids = {
    'target_1': [{'n_estimators': 500, 'max_depth': depth} for depth in (7, 9, 12)],
    'target_2': [{'n_estimators': 700, 'max_depth': depth} for depth in (7, 9, 12)],
    'target_3': [{'n_estimators': 600, 'max_depth': depth} for depth in (7, 9, 12)],
    'target_4': [{'n_estimators': n_trees, 'max_depth': 3} for n_trees in (500, 600, 700)],
    'target_5': [{'n_estimators': n_trees, 'max_depth': 3} for n_trees in (400, 500, 600)],
}

for target in targets:
    print(target)
    candidates = rf_param_grids[target]
    cv_scores = []
    for params in candidates:
        forest = RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=4, **params)
        cv_scores.append(CV_metrics(forest, X_train, Y[target].values))
    print(cv_scores)

    # First candidate reaching the highest mean score wins.
    best_score = max(cv_scores)
    best_idx = cv_scores.index(best_score)
    print(f'index: {best_idx}')

    # Store an unfitted forest with the winning settings for the final-model cell.
    final_models[f'{target}_rf'] = RandomForestClassifier(
        class_weight='balanced',
        random_state=42,
        n_jobs=-1,
        **candidates[best_idx],
    )
    final_models_roc_auc[f'{target}_rf'] = best_score

target_1
roc_auc  avg:  0.569   ['0.542', '0.575', '0.590']
roc_auc  avg:  0.566   ['0.536', '0.573', '0.590']
roc_auc  avg:  0.563   ['0.534', '0.566', '0.588']
[0.5692151616021781, 0.5662488192457745, 0.5625744071827128]
index: 0
target_2
roc_auc  avg:  0.586   ['0.599', '0.589', '0.569']
roc_auc  avg:  0.587   ['0.598', '0.591', '0.573']
roc_auc  avg:  0.585   ['0.592', '0.588', '0.576']
[0.5855792979050577, 0.5872319700268412, 0.585243850291849]
index: 1
target_3
roc_auc  avg:  0.618   ['0.607', '0.618', '0.628']
roc_auc  avg:  0.618   ['0.610', '0.617', '0.626']
roc_auc  avg:  0.618   ['0.609', '0.619', '0.625']
[0.6177202201077125, 0.61793286129407, 0.6175481154978009]
index: 1
target_4
roc_auc  avg:  0.586   ['0.597', '0.595', '0.566']
roc_auc  avg:  0.588   ['0.599', '0.598', '0.566']
roc_auc  avg:  0.587   ['0.598', '0.597', '0.565']
[0.5862315689156831, 0.5877944732194157, 0.5869352055896664]
index: 1
target_5
roc_auc  avg:  0.540   ['0.534', '0.531', '0.554']
roc_auc  avg:  

## Gradient boosting (LightGBM)

In [8]:
# Three candidate LightGBM settings per target.
gbm_param_grids = {
    'target_1': [{'n_estimators': 100, 'num_leaves': leaves, 'reg_lambda': 10.0} for leaves in (3, 5, 7)],
    'target_2': [{'n_estimators': n_trees, 'num_leaves': 7, 'reg_lambda': 10.0} for n_trees in (400, 500, 600)],
    'target_3': [{'n_estimators': 150, 'num_leaves': leaves, 'reg_lambda': 15.0} for leaves in (7, 9, 12)],
    'target_4': [{'n_estimators': n_trees, 'num_leaves': 3, 'reg_lambda': 10.0} for n_trees in (100, 150, 200)],
    'target_5': [{'n_estimators': 100, 'num_leaves': 3, 'reg_lambda': reg} for reg in (75.0, 100.0, 125.0)],
}

# Settings shared by every candidate AND by the stored final model.
# The original final model omitted learning_rate, so it silently fell back to
# LightGBM's default instead of the 0.01 that the n_estimators grids were
# cross-validated with; keeping one shared dict prevents that divergence.
gbm_fixed_params = {'learning_rate': 0.01, 'random_state': 42, 'class_weight': 'balanced'}

for target in targets:
    print(target)
    candidates = gbm_param_grids[target]
    cv_scores = []
    for params in candidates:
        gbm = LGBMClassifier(**params, **gbm_fixed_params)
        cv_scores.append(CV_metrics(gbm, X_train, Y[target].values))
    print(cv_scores)

    # First candidate reaching the highest mean score wins.
    best_score = max(cv_scores)
    best_idx = cv_scores.index(best_score)
    print(f'index: {best_idx}')

    # Store an unfitted model configured exactly like the winning CV candidate.
    final_models[f'{target}_gbm'] = LGBMClassifier(**candidates[best_idx], **gbm_fixed_params)
    final_models_roc_auc[f'{target}_gbm'] = best_score

target_1
roc_auc  avg:  0.589   ['0.584', '0.590', '0.592']
roc_auc  avg:  0.593   ['0.581', '0.593', '0.605']
roc_auc  avg:  0.591   ['0.579', '0.592', '0.604']
[0.5886782114223689, 0.593348734977652, 0.5914525838392782]
index: 1
target_2
roc_auc  avg:  0.590   ['0.600', '0.592', '0.577']
roc_auc  avg:  0.590   ['0.600', '0.591', '0.580']
roc_auc  avg:  0.590   ['0.599', '0.590', '0.580']
[0.5895035848440997, 0.5900164210451609, 0.5898491632418796]
index: 1
target_3
roc_auc  avg:  0.618   ['0.608', '0.622', '0.624']
roc_auc  avg:  0.618   ['0.608', '0.622', '0.624']
roc_auc  avg:  0.618   ['0.608', '0.621', '0.625']
[0.6178405803031389, 0.6179397683626111, 0.6175471775745276]
index: 1
target_4
roc_auc  avg:  0.600   ['0.612', '0.610', '0.579']
roc_auc  avg:  0.601   ['0.614', '0.608', '0.582']
roc_auc  avg:  0.600   ['0.615', '0.608', '0.575']
[0.600153745926977, 0.6010678895906593, 0.599614121595269]
index: 1
target_5
roc_auc  avg:  0.542   ['0.546', '0.527', '0.553']
roc_auc  avg:  

In [9]:
# One line per target: best CV ROC AUC for logreg, random forest and LightGBM.
model_kinds = ('logreg', 'rf', 'gbm')
for target in targets:
    print(*[final_models_roc_auc[f'{target}_{kind}'] for kind in model_kinds])

# Mean of every stored best score (displayed as the cell's output).
np.mean(list(final_models_roc_auc.values()))

0.5404247228702622 0.5692151616021781 0.593348734977652
0.5705769474244217 0.5872319700268412 0.5900164210451609
0.6087714934443638 0.61793286129407 0.6179397683626111
0.5985255218024504 0.5877944732194157 0.6010678895906593
0.5417110908229578 0.5396961452270851 0.5417901799569075


0.5804028921111357

## Final ensemble and submission

In [10]:
# Blend weights of the three per-target models (they sum to 1.0).
# NOTE(review): logreg gets the largest weight although its recorded CV ROC AUC
# is the lowest for most targets — confirm this weighting is intentional.
blend_weights = {'gbm': 0.2, 'rf': 0.3, 'logreg': 0.5}

probas = []
# Training features = merged frames minus the label columns.
X_train = merged.drop(columns=targets)
X_train_logreg = logreg_merged.drop(columns=targets)
for target in targets:
    # Labels are read from the same merged frame as the features they are
    # paired with, so rows and labels cannot get out of step.
    y_train = merged[target].values
    y_train_logreg = logreg_merged[target].values

    # lightgbm
    y_proba = final_models[f'{target}_gbm'].fit(X_train, y_train).predict_proba(X_test)[:, 1] * blend_weights['gbm']
    # random forest
    y_proba += final_models[f'{target}_rf'].fit(X_train, y_train).predict_proba(X_test)[:, 1] * blend_weights['rf']
    # logreg — fitted on X_train_logreg (the original built this frame but then
    # fitted on X_train_norm with labels taken from `merged`).
    y_proba += (
        final_models[f'{target}_logreg']
        .fit(X_train_logreg, y_train_logreg)
        .predict_proba(X_test_norm)[:, 1]
        * blend_weights['logreg']
    )
    probas.append(y_proba)

In [11]:
# Assemble the submission: one column of blended probabilities per target,
# rows in the same order as X_test (the order predict_proba was called on).
submission_columns = ['1', '2', '3', '4', '5']
baseline = pd.DataFrame(np.column_stack(probas), columns=submission_columns)

# Assign ids positionally. Assigning the Series itself would align on index
# labels and yield NaN / misplaced ids whenever X_test does not carry a
# default 0..n-1 index.
baseline['id'] = X_test['id'].values

baseline[['id'] + submission_columns].to_csv('baseline.csv', index=False)