In [1]:
# Import libraries and set desired options
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from scipy import sparse, stats
from scipy.linalg import svd
import umap
from sklearn import preprocessing
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import (KFold, StratifiedKFold, cross_val_score,
                                     cross_validate, train_test_split)
from tqdm import tqdm
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from code.cross_validation import *
from code.read_data import *
from code.feature_engineering import *
from code.autoencoder import *

Using TensorFlow backend.


In [2]:
pd.set_option('display.max_columns', None) 
pd.set_option('display.max_rows', None)
sns.set()
import warnings
warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

# Read data sets

In [3]:
%%time
X1, X2, X3, Y, X1_test, X2_test, X3_test = read_data()
Y = Y.rename(columns={f'{i}': f'target_{i}' for i in range(1, 6)})
targets = ['target_1', 'target_2', 'target_3', 'target_4', 'target_5']
train_len = len(X1)
X2_all = pd.concat([X2, X2_test], ignore_index=True)
good_A_labels = set(X2['A'].values) & set(X2_test['A'].values)
only_train_A_labels = set(X2['A'].values) - set(X2_test['A'].values)
only_test_A_labels = set(X2_test['A'].values) - set(X2['A'].values)

Wall time: 1.42 s


In [4]:
%%time
def aggregate_X2(X2, X2_test):
    X2_agg = pd.concat([X2, X2_test], ignore_index=True)
    X2_agg['A'] = X2_agg['A'].apply(lambda x: [x])
    X2_agg = X2_agg.groupby('id').agg(sum).reset_index()
    X2_agg['A'] = X2_agg['A'].apply(lambda x: set(x))
    return X2_agg

    
def get_X2_features(most_frequent_A):
    X2_all_cp = aggregate_X2(X2, X2_test)
    print(len(most_frequent_A['A'].values))
    for item in tqdm(most_frequent_A['A'].values):
        X2_all_cp[f'A_feature_{item}'] = X2_all_cp['A'].apply(lambda x: item in x)
    X2_all_cp['has_only_train_A_labels'] = X2_all_cp['A'].apply(lambda x: x & only_train_A_labels != set())
    X2_all_cp['only_test_A_labels'] = X2_all_cp['A'].apply(lambda x: x & only_test_A_labels != set())
    X2_all_cp.drop(columns=['A'], inplace=True)
    return X2_all_cp


occurences = X2_all.groupby('A')['id'].nunique().reset_index()
occurences['good'] = occurences['A'].apply(lambda x: x in good_A_labels)
occurences = occurences[occurences['good']]

Wall time: 313 ms


In [5]:
%%time
# autoencoder
X2_all_large = get_X2_features(occurences[occurences['id'] >= 4])
ids = X2_all_large['id']
X2_all_encoded = encode(X2_all_large.drop(columns=['id']), encoding_dim=32)
X2_all_encoded['id'] = ids

31863


100%|██████████| 31863/31863 [03:24<00:00, 156.10it/s]


Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Train on 6043 samples, validate on 2015 samples
Epoch 1/50
 - 9s - loss: 0.5613 - val_loss: 0.2105
Epoch 2/50
 - 6s - loss: 0.0928 - val_loss: 0.0516
Epoch 3/50
 - 6s - loss: 0.0454 - val_loss: 0.0436
Epoch 4/50
 - 6s - loss: 0.0423 - val_loss: 0.0423
Epoch 5/50
 - 6s - loss: 0.0415 - val_loss: 0.0417
Epoch 6/50
 - 6s - loss: 0.0411 - val_loss: 0.0414
Epoch 7/50
 - 6s - loss: 0.0407 - val_loss: 0.0410
Epoch 8/50
 - 6s - loss: 0.0404 - val_loss: 0.0406
Epoch 9/50
 - 6s - loss: 0.0398 - val_loss: 0.0399
Epoch 10/50
 - 6s - loss: 0.0390 - val_loss: 0.0389
Epoch 11/50
 - 6s - loss: 0.0374 - val_loss: 0.0372
Epoch 12/50
 - 6s - loss: 0.0357 - val_loss: 0.0359
Epoch 13/50
 - 6s - loss: 0.0345 - val_loss: 0.0349
Epoch 14/50
 - 7s - loss: 0.0336 - val_loss: 0.0342
Epoch 15/50
 - 6s - loss: 0.0328 - val_loss: 0.0335
Epoch 16/50
 - 7s - loss: 0.0321 - val_loss: 0.0329
Epoch 17/

In [6]:
%%time
# X2 lots of features
X2_agg = aggregate_X2(X2, X2_test)
X2_agg['good_features_count'] = X2_agg['A'].apply(lambda x: len(x & good_A_labels))
X2_agg['only_train_A_labels_count'] = X2_agg['A'].apply(lambda x: len(x & only_train_A_labels))
X2_agg['only_test_A_labels_count'] = X2_agg['A'].apply(lambda x: len(x & only_test_A_labels))
X2_agg['A_label_sum_for_ratio'] = X2_agg['good_features_count'] + X2_agg['only_train_A_labels_count'] + X2_agg['only_test_A_labels_count']
X2_agg['good_features_count_ratio'] = X2_agg['good_features_count'] / X2_agg['A_label_sum_for_ratio']
X2_agg['only_train_A_labels_count'] = X2_agg['only_train_A_labels_count'] / X2_agg['A_label_sum_for_ratio']
X2_agg['only_test_A_labels_count'] = X2_agg['only_test_A_labels_count'] / X2_agg['A_label_sum_for_ratio']
X2_agg.drop(columns=['A'], inplace=True)
X2_agg.fillna(0, inplace=True)
X2_features = pd.merge(X2_all_encoded, X2_agg, on='id')

Wall time: 3.94 s


In [7]:
%%time
# X1
X1_all = pd.concat([X1, X1_test], ignore_index=True)
bool_columns = [col for col in X1_all.columns if X1_all[col].nunique() == 2]
not_bool_columns = list(set(X1_all.columns) - set(bool_columns) - set(['id']))
# X1 categorical
X1_categorical_embedding = umap.UMAP(metric='dice').fit_transform(X1_all[bool_columns])
X1_categorical_embedding_df = pd.DataFrame(X1_categorical_embedding, columns=['X1_all_embedding_1', 'X1_all_embedding_2'])
X1_categorical_embedding_df['id'] = X1_all['id']
# X1 other
X1_other_embedding = umap.UMAP().fit_transform(X1_all[not_bool_columns])
X1_other_embedding_df = pd.DataFrame(X1_other_embedding, columns=['X1_all_embedding_3', 'X1_all_embedding_4'])
X1_other_embedding_df['id'] = X1_all['id']
# X2
X2_all_cp = get_X2_features(occurences[occurences['id'] >= 4])
X2_all_embedding = umap.UMAP(metric='dice').fit_transform(X2_all_cp.drop(columns=['id']))
X2_all_embedding_df = pd.DataFrame(X2_all_embedding, columns=['X2_all_embedding_1', 'X2_all_embedding_2'])
X2_all_embedding_df['id'] = X2_all_cp['id']
# unite
embeddings = [X1_categorical_embedding_df, X1_other_embedding_df, X2_all_embedding_df]

31863


100%|██████████| 31863/31863 [03:36<00:00, 147.29it/s]


Wall time: 1h 52min 4s


In [8]:
import pickle
with open('data/embeddings_4.pickle','wb') as f:
    pickle.dump(embeddings, f)

In [9]:
%%time
X_train = agg_and_merge(X1, X2, X3, embeddings)
X_test = agg_and_merge(X1_test, X2_test, X3_test, embeddings)
X_train, X_test = eng(X_train, X_test)
X_train, X_test = add_emedding_features(X_train, X_test, random_state=42)
X_train_norm, X_test_norm = normalize(X_train, X_test)

umap...
Wall time: 56.4 s


In [10]:
%%time
X_train_norm = pd.merge(X_train_norm, X2_features, on='id')
X_test_norm = pd.merge(X_test_norm, X2_features, on='id')
X_train = pd.merge(X_train, X2_features, on='id')
X_test = pd.merge(X_test, X2_features, on='id')

Wall time: 365 ms


In [11]:
%%time
X_train_norm_ext = pd.merge(X_train_norm, X2_all_large, on='id')
X_test_norm_ext = pd.merge(X_test_norm, X2_all_large, on='id')
X_train_ext = pd.merge(X_train, X2_all_large, on='id')
X_test_ext = pd.merge(X_test, X2_all_large, on='id')

Wall time: 2.11 s


In [12]:
%%time
et_multi = ExtraTreesClassifier(n_estimators=1000,
                                max_depth=7,
                                class_weight='balanced',
                                random_state=42,
                                n_jobs=4)
#CV_multilabel(et_multi, X_train_ext, Y.drop(columns=['id']))

Wall time: 0 ns


In [13]:
%%time
rf_multi = RandomForestClassifier(n_estimators=1000,
                                  max_depth=10,
                                  class_weight='balanced',
                                  random_state=42,
                                  n_jobs=4)
#CV_multilabel(rf_multi, X_train_ext, Y.drop(columns=['id']))

Wall time: 0 ns


In [14]:
%%time
step = 500
intervals = [(i, i + step) for i in range(0, 4000, step)]
splits = [[item for j, item in enumerate(intervals) if i != j] for i in range(len(intervals))]
out_dims = [[item for j, item in enumerate(intervals) if i == j] for i in range(len(intervals))]
X_train_splits = [pd.concat([X_train_ext[lb:ub] for (lb, ub) in split]) for split in splits]
y_train_splits = [pd.concat([Y[lb:ub] for (lb, ub) in split]) for split in splits]

Wall time: 3.17 s


In [25]:
%%time
metafeatures = pd.concat([X_train[['id']], X_test[['id']]], ignore_index=True)
for model_name, model in tqdm(zip(['rf_multi', 'et_multi'], [rf_multi, et_multi])):
    X_train_predicts = []
    X_test_predicts = []
    for X_train_split, y_train_split, out_dim in zip(X_train_splits, y_train_splits, out_dims):
        model.fit(X_train_split, y_train_split.drop(columns=['id']))
        X_train_predicts.append(model.predict_proba(X_train_ext[out_dim[0][0]:out_dim[0][1]]))
        X_test_predicts.append(model.predict_proba(X_test_ext))
    
    # X train features
    X_train_features = {target: [] for target in targets}
    for chunk in X_train_predicts:
        for target, probas in zip(targets, chunk):
            X_train_features[target].append(probas[:, 1])
    for target in targets:
        X_train_features[target] = np.concatenate(X_train_features[target])
    # X test features
    X_test_features = {target: [] for target in targets}
    for chunk in X_test_predicts:
        for target, probas in zip(targets, chunk):
            X_test_features[target].append(probas[:, 1])
    for target in targets:
        X_test_features[target] = sum(X_test_features[target]) / len(X_test_features[target])
    # metafeatures
    for target in targets:
        metafeatures[f'meta_{model_name}_{target}'] = np.concatenate((X_train_features[target], X_test_features[target]))


0it [00:00, ?it/s]
1it [03:10, 190.44s/it]
2it [05:55, 182.95s/it]

Wall time: 5min 55s


In [26]:
%%time
X_train_norm = pd.merge(X_train_norm, metafeatures, on='id')
X_test_norm = pd.merge(X_test_norm, metafeatures, on='id')
X_train = pd.merge(X_train, metafeatures, on='id')
X_test = pd.merge(X_test, metafeatures, on='id')

Wall time: 365 ms


# Simple models

In [27]:
merged = pd.merge(X_train, Y, on='id')
logreg_merged = pd.merge(X_train_norm, Y, on='id')
assert len(merged) == len(logreg_merged)
final_models = {}
final_models_roc_auc = {}

## logregs

In [28]:
%%time
logreg = LogisticRegression(C=0.05, class_weight='balanced', random_state=42, n_jobs=4)
roc_aucs = []
for target in targets:
    roc_aucs.append(CV_metrics(logreg, X_train_norm, Y[target].values))
print(np.mean(roc_aucs))

roc_auc  avg:  0.588   ['0.576', '0.568', '0.589', '0.585', '0.624']
roc_auc  avg:  0.618   ['0.620', '0.613', '0.639', '0.597', '0.619']
roc_auc  avg:  0.615   ['0.585', '0.611', '0.633', '0.658', '0.588']
roc_auc  avg:  0.603   ['0.630', '0.623', '0.609', '0.574', '0.581']
roc_auc  avg:  0.610   ['0.625', '0.624', '0.610', '0.632', '0.559']
0.6069017423916294
Wall time: 13.6 s


## RF

In [29]:
%%time
rf = RandomForestClassifier(n_estimators=1000,
                            max_depth=6,
                            class_weight='balanced',
                            random_state=42,
                            n_jobs=4)
roc_aucs = []
for target in targets:
    roc_aucs.append(CV_metrics(rf, X_train, Y[target].values))
print(np.mean(roc_aucs))

roc_auc  avg:  0.592   ['0.552', '0.579', '0.592', '0.593', '0.645']
roc_auc  avg:  0.619   ['0.623', '0.636', '0.631', '0.597', '0.610']
roc_auc  avg:  0.615   ['0.603', '0.609', '0.624', '0.629', '0.610']
roc_auc  avg:  0.606   ['0.626', '0.617', '0.613', '0.581', '0.595']
roc_auc  avg:  0.602   ['0.595', '0.625', '0.608', '0.621', '0.558']
0.6068022973124194
Wall time: 1min 12s


## ET

In [34]:
%%time
et = ExtraTreesClassifier(n_estimators=1000,
                          max_depth=9,
                          class_weight='balanced',
                          random_state=42,
                          n_jobs=4)
roc_aucs = []
for target in targets:
    roc_aucs.append(CV_metrics(et, X_train, Y[target].values))
print(np.mean(roc_aucs))

roc_auc  avg:  0.565   ['0.512', '0.548', '0.564', '0.585', '0.620']
roc_auc  avg:  0.618   ['0.627', '0.633', '0.630', '0.602', '0.600']
roc_auc  avg:  0.618   ['0.599', '0.611', '0.626', '0.642', '0.612']
roc_auc  avg:  0.609   ['0.636', '0.620', '0.614', '0.589', '0.589']
roc_auc  avg:  0.596   ['0.592', '0.625', '0.589', '0.616', '0.560']
0.6014856557402766
Wall time: 40.4 s


## GBM

In [31]:
%%time
gbm = LGBMClassifier(num_leaves=3,
                     learning_rate=0.05,
                     reg_lambda=75.0,
                     random_state=42,
                     class_weight='balanced')
roc_aucs = []
for target in targets:
    roc_aucs.append(CV_metrics(gbm, X_train, Y[target].values))
print(np.mean(roc_aucs))

roc_auc  avg:  0.611   ['0.591', '0.599', '0.612', '0.607', '0.648']
roc_auc  avg:  0.620   ['0.623', '0.636', '0.632', '0.598', '0.610']
roc_auc  avg:  0.624   ['0.605', '0.613', '0.649', '0.632', '0.620']
roc_auc  avg:  0.601   ['0.621', '0.609', '0.623', '0.573', '0.581']
roc_auc  avg:  0.603   ['0.603', '0.625', '0.599', '0.622', '0.563']
0.6117977038250448
Wall time: 13.9 s


## Model

In [32]:
%%time
probas = []
X_train = merged.drop(columns=targets)
X_train_logreg = logreg_merged.drop(columns=targets)
for target in targets:
    print(target)
    y_train = merged[target].values
    # models
    # lightgbm
    y_proba = gbm.fit(X_train, y_train).predict_proba(X_test)[:, 1]
    # random forest
    y_proba += rf.fit(X_train, y_train).predict_proba(X_test)[:, 1]
    # extra tree
    y_proba += et.fit(X_train, y_train).predict_proba(X_test)[:, 1]
    # logreg
    y_proba += logreg.fit(X_train_norm, y_train).predict_proba(X_test_norm)[:, 1]
    y_proba /= 4.0
    final_proba = y_proba
    probas.append(final_proba)

target_1
target_2
target_3
target_4
target_5
Wall time: 37.2 s


In [33]:
tmp = pd.DataFrame(probas).T
baseline = pd.DataFrame(tmp.values, columns=['1', '2', '3', '4', '5'])
baseline['id'] = X_test['id']
baseline[['id', '1', '2', '3', '4', '5']].to_csv('baseline.csv', index=False)