In [1]:
# Import libraries and set desired options
from itertools import permutations
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from IPython.display import display
from scipy import sparse, stats
from scipy.linalg import svd
import umap
from sklearn import preprocessing
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import (KFold, StratifiedKFold, cross_val_score,
                                     cross_validate, train_test_split)
from tqdm import tqdm
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from code.cross_validation import *
from code.read_data import *
from code.feature_engineering import *
from code.autoencoder import *

Using TensorFlow backend.


In [2]:
pd.set_option('display.max_columns', None) 
pd.set_option('display.max_rows', None)
sns.set()
import warnings
warnings.filterwarnings('ignore')
%load_ext autoreload
%autoreload 2

# Read data sets

In [74]:
%%time
X1, X2, X3, Y, X1_test, X2_test, X3_test = read_data()
targets = [col for col in Y.columns if col != 'id']
train_len = len(X1)

Wall time: 919 ms


## Deal with X2 features

In [75]:
%%time
def aggregate_X2(X2, X2_test):
    X2_agg = pd.concat([X2, X2_test], ignore_index=True)
    X2_agg['A'] = X2_agg['A'].apply(lambda x: [x])
    X2_agg = X2_agg.groupby('id').agg(sum).reset_index()
    X2_agg['A'] = X2_agg['A'].apply(lambda x: set(x))
    return X2_agg


def X2_freq_features(X2, X2_test):
    X2_all = pd.concat([X2, X2_test], ignore_index=True)
    A_mapping_df = X2_all.groupby('A')['id'].nunique().reset_index().rename(columns={'id': 'freq'})
    A_mapping_df['freq'] = A_mapping_df['freq'] / X2_all['id'].nunique()
    A_mapping = {k: v for k, v in zip(A_mapping_df['A'].values, A_mapping_df['freq'].values)}
    X2_agg = aggregate_X2(X2, X2_test)
    X2_agg['A'] = X2_agg['A'].apply(lambda x: np.array([A_mapping[item] for item in x]))
    X2_agg['min_freq_A'] = X2_agg['A'].apply(lambda x: x.min())
    X2_agg['max_freq_A'] = X2_agg['A'].apply(lambda x: x.max())
    X2_agg['mean_freq_A'] = X2_agg['A'].apply(lambda x: x.mean())
    X2_agg['median_freq_A'] = X2_agg['A'].apply(lambda x: np.median(x))
    X2_agg['var_freq_A'] = X2_agg['A'].apply(lambda x: x.var())
    X2_agg['min_max_freq_ratio_A'] = X2_agg['min_freq_A'] / X2_agg['max_freq_A']
    X2_agg['mean_median_freq_delta_A'] = np.abs((X2_agg['mean_freq_A'] - X2_agg['median_freq_A']) / X2_agg['mean_freq_A'])
    return X2_agg.drop(columns=['A'])


X2_freq = X2_freq_features(X2, X2_test)

Wall time: 4.7 s


In [76]:
%%time
# keep categories existing in both train and test
good_A_labels = set(X2['A'].values) & set(X2_test['A'].values)
X2['A'] = X2['A'].apply(lambda x: x if x in good_A_labels else -1)
X2_test['A'] = X2_test['A'].apply(lambda x: x if x in good_A_labels else -1)

Wall time: 821 ms


In [77]:
X2_all = pd.concat([X2, X2_test], ignore_index=True)
len(set(X2_all['A'].values))

53116

In [60]:
%%time
def get_X2_features(most_frequent_A):
    X2_all_cp = aggregate_X2(X2, X2_test)
    print(len(most_frequent_A['A'].values))
    for item in tqdm(most_frequent_A['A'].values):
        X2_all_cp[f'A_feature_{item}'] = X2_all_cp['A'].apply(lambda x: item in x)
    X2_all_cp.drop(columns=['A'], inplace=True)
    return X2_all_cp


occurences = X2_all.groupby('A')['id'].nunique().reset_index()
occurences['good'] = occurences['A'].apply(lambda x: x in good_A_labels)
occurences = occurences[occurences['good']]

Wall time: 190 ms


In [61]:
%%time
# autoencoder
X2_all_large = get_X2_features(occurences[occurences['id'] >= 1])


#ids = X2_all_large['id']
#X2_all_encoded = encode(X2_all_large.drop(columns=['id']), encoding_dim=32)
#X2_all_encoded['id'] = ids

53115


100%|██████████| 53115/53115 [08:27<00:00, 104.63it/s]


Wall time: 8min 31s


In [13]:
%%time
# X1
X1_all = pd.concat([X1, X1_test], ignore_index=True)
bool_columns = [col for col in X1_all.columns if X1_all[col].nunique() == 2]
not_bool_columns = list(set(X1_all.columns) - set(bool_columns) - set(['id']))
# X1 categorical
X1_categorical_embedding = umap.UMAP(n_components=3, metric='dice').fit_transform(X1_all[bool_columns])
X1_categorical_embedding_df = pd.DataFrame(X1_categorical_embedding, columns=[f'X1_all_embedding_{i+1}' for i in range(3)])
X1_categorical_embedding_df['id'] = X1_all['id']
# X1 other
X1_other_embedding = umap.UMAP().fit_transform(X1_all[not_bool_columns])
X1_other_embedding_df = pd.DataFrame(X1_other_embedding, columns=['X1_all_embedding_5', 'X1_all_embedding_6'])
X1_other_embedding_df['id'] = X1_all['id']
# X2
X2_all_cp = get_X2_features(occurences[occurences['id'] >= 5])
X2_all_embedding = umap.UMAP(n_components=32, metric='dice').fit_transform(X2_all_cp.drop(columns=['id']))
X2_all_embedding_df = pd.DataFrame(X2_all_embedding, columns=[f'X2_all_embedding_{i+1}' for i in range(32)])
X2_all_embedding_df['id'] = X2_all_cp['id']
# unite
embeddings = [X1_categorical_embedding_df, X1_other_embedding_df, X2_all_embedding_df]

26508


100%|██████████| 26508/26508 [02:32<00:00, 174.19it/s]


Wall time: 1h 42min 39s


In [14]:
import pickle
with open('data/embeddings_5_n.pickle','wb') as f:
    pickle.dump(embeddings, f)

In [79]:
embeddings.append(X2_freq)

In [86]:
%%time
X_train = agg_and_merge(X1, X2, X3, embeddings)
X_test = agg_and_merge(X1_test, X2_test, X3_test, embeddings)
X_train, X_test = eng(X_train, X_test)
X_train, X_test = add_emedding_features(X_train, X_test, random_state=42)
X_train_norm, X_test_norm = normalize(X_train, X_test)

umap...
Wall time: 1min 5s


In [87]:
%%time
X_train_norm = pd.merge(X_train_norm, X2_all_encoded, on='id')
X_test_norm = pd.merge(X_test_norm, X2_all_encoded, on='id')
X_train = pd.merge(X_train, X2_all_encoded, on='id')
X_test = pd.merge(X_test, X2_all_encoded, on='id')
assert len(X_train) == 4000

Wall time: 309 ms


In [82]:
%%time
X_train_norm_ext = pd.merge(X_train_norm, X2_all_large, on='id')
X_test_norm_ext = pd.merge(X_test_norm, X2_all_large, on='id')
X_train_ext = pd.merge(X_train, X2_all_large, on='id')
X_test_ext = pd.merge(X_test, X2_all_large, on='id')
assert len(X_train_norm_ext) == 4000

Wall time: 4.01 s


In [85]:
%%time
et_multi = ExtraTreesClassifier(n_estimators=100,
                                max_depth=7,
                                class_weight='balanced',
                                random_state=42,
                                n_jobs=4)
CV_multilabel(et_multi, X_train_ext, Y.drop(columns=['id']))

target_1: 0.547, [0.5517395499304748, 0.5299615338382018, 0.5181210936342902, 0.5636546229886608, 0.5731555967582763]
target_2: 0.609, [0.5989356387555161, 0.5905393778064585, 0.6118233766233766, 0.6113504464285714, 0.6327650948273517]
target_3: 0.604, [0.5855873358517719, 0.6078719194099659, 0.6085276884191176, 0.6153430687487763, 0.6045316808363971]
target_4: 0.598, [0.6006241565452092, 0.5902395915161037, 0.6368010160763714, 0.5708417374086634, 0.5894440630452816]
target_5: 0.567, [0.5439161966156325, 0.5513727517618568, 0.59996, 0.539924248824308, 0.5976504040706375]
0.5849872876288508
Wall time: 56.2 s


In [84]:
%%time
rf_multi = RandomForestClassifier(n_estimators=1000,
                                  max_depth=10,
                                  class_weight='balanced',
                                  random_state=42,
                                  n_jobs=4)
CV_multilabel(rf_multi, X_train_ext, Y.drop(columns=['id']))

target_1: 0.546, [0.5533570759669684, 0.5515010818608006, 0.500118486922006, 0.5593752576237906, 0.5644980412093663]
target_2: 0.622, [0.6125766700896451, 0.6128834933182759, 0.6112, 0.6344940476190476, 0.6400710184756877]
target_3: 0.601, [0.5730742939377587, 0.6283432271991365, 0.593240176930147, 0.5808437810139144, 0.6287482766544118]
target_4: 0.602, [0.6005704821494295, 0.6001500953877231, 0.6244797095346895, 0.5957547413761258, 0.5886610028890986]
target_5: 0.593, [0.5818159657094102, 0.5870860656852874, 0.6055000000000001, 0.5872968939201937, 0.6014666267584556]
0.5926842604892549
Wall time: 1min 47s


In [20]:
%%time
step = 500
intervals = [(i, i + step) for i in range(0, 4000, step)]
splits = [[item for j, item in enumerate(intervals) if i != j] for i in range(len(intervals))]
out_dims = [[item for j, item in enumerate(intervals) if i == j] for i in range(len(intervals))]
X_train_splits = [pd.concat([X_train_ext[lb:ub] for (lb, ub) in split]) for split in splits]
y_train_splits = [pd.concat([Y[lb:ub] for (lb, ub) in split]) for split in splits]

Wall time: 1.48 s


In [21]:
%%time
metafeatures = pd.concat([X_train[['id']], X_test[['id']]], ignore_index=True)
for model_name, model in tqdm(zip(['rf_multi', 'et_multi'], [rf_multi, et_multi])):
    X_train_predicts = []
    X_test_predicts = []
    for X_train_split, y_train_split, out_dim in zip(X_train_splits, y_train_splits, out_dims):
        model.fit(X_train_split, y_train_split.drop(columns=['id']))
        X_train_predicts.append(model.predict_proba(X_train_ext[out_dim[0][0]:out_dim[0][1]]))
        X_test_predicts.append(model.predict_proba(X_test_ext))
    
    # X train features
    X_train_features = {target: [] for target in targets}
    for chunk in X_train_predicts:
        for target, probas in zip(targets, chunk):
            X_train_features[target].append(probas[:, 1])
    for target in targets:
        X_train_features[target] = np.concatenate(X_train_features[target])
    # X test features
    X_test_features = {target: [] for target in targets}
    for chunk in X_test_predicts:
        for target, probas in zip(targets, chunk):
            X_test_features[target].append(probas[:, 1])
    for target in targets:
        X_test_features[target] = sum(X_test_features[target]) / len(X_test_features[target])
    # metafeatures
    for target in targets:
        metafeatures[f'meta_{model_name}_{target}'] = np.concatenate((X_train_features[target], X_test_features[target]))

2it [04:53, 152.44s/it]


Wall time: 4min 53s


In [22]:
%%time
X_train_norm = pd.merge(X_train_norm, metafeatures, on='id')
X_test_norm = pd.merge(X_test_norm, metafeatures, on='id')
X_train = pd.merge(X_train, metafeatures, on='id')
X_test = pd.merge(X_test, metafeatures, on='id')

Wall time: 169 ms


# Simple models

In [88]:
merged = pd.merge(X_train, Y, on='id')
logreg_merged = pd.merge(X_train_norm, Y, on='id')
assert len(merged) == len(logreg_merged)
final_models = {}
final_models_roc_auc = {}

## logregs

In [101]:
%%time
logreg = LogisticRegression(C=0.04, class_weight='balanced', random_state=42, n_jobs=4)
roc_aucs = []
for target in targets:
    roc_aucs.append(CV_metrics(logreg, X_train_norm, Y[target].values))
print(np.mean(roc_aucs))

avg: 0.597, delta: 0.068, std: 0.023
avg: 0.625, delta: 0.036, std: 0.014
avg: 0.619, delta: 0.054, std: 0.022
avg: 0.613, delta: 0.057, std: 0.021
avg: 0.619, delta: 0.052, std: 0.019
0.6144235644664617
Wall time: 9.14 s


## RF

In [120]:
%%time
rf = RandomForestClassifier(n_estimators=1000,
                            max_depth=4,
                            min_samples_leaf=50,
                            max_features=0.2,
                            oob_score=True,
                            class_weight='balanced',
                            random_state=42,
                            n_jobs=4)
roc_aucs = []
for target in targets:
    roc_aucs.append(CV_metrics(rf, X_train, Y[target].values))
print(np.mean(roc_aucs))

avg: 0.615, delta: 0.064, std: 0.022
avg: 0.623, delta: 0.032, std: 0.014
avg: 0.621, delta: 0.032, std: 0.011
avg: 0.618, delta: 0.041, std: 0.017
avg: 0.609, delta: 0.061, std: 0.021
0.6170389950011488
Wall time: 4min 31s


## ET

In [135]:
%%time
et = ExtraTreesClassifier(n_estimators=100,
                          max_depth=8,
                          min_samples_leaf=50,
                          max_features=0.25,
                          class_weight='balanced',
                          random_state=42,
                          n_jobs=4)
roc_aucs = []
for target in targets:
    roc_aucs.append(CV_metrics(et, X_train, Y[target].values))
print(np.mean(roc_aucs))

avg: 0.600, delta: 0.069, std: 0.025
avg: 0.625, delta: 0.036, std: 0.015
avg: 0.621, delta: 0.045, std: 0.018
avg: 0.622, delta: 0.052, std: 0.019
avg: 0.602, delta: 0.048, std: 0.018
0.614140259114123
Wall time: 22.1 s


## GBM

In [142]:
%%time
gbm = LGBMClassifier(num_leaves=3,
                     learning_rate=0.055,
                     reg_lambda=75.0,
                     random_state=42,
                     class_weight='balanced')
roc_aucs = []
for target in targets:
    roc_aucs.append(CV_metrics(gbm, X_train, Y[target].values))
print(np.mean(roc_aucs))

avg: 0.611, delta: 0.053, std: 0.019
avg: 0.621, delta: 0.036, std: 0.014
avg: 0.633, delta: 0.054, std: 0.019
avg: 0.617, delta: 0.043, std: 0.018
avg: 0.608, delta: 0.064, std: 0.022
0.6179262525091284
Wall time: 14.6 s


## Model

In [None]:
%%time
probas = []
X_train = merged.drop(columns=targets)
X_train_logreg = logreg_merged.drop(columns=targets)
for target in targets:
    print(target)
    y_train = merged[target].values
    # models
    # lightgbm
    y_proba = gbm.fit(X_train, y_train).predict_proba(X_test)[:, 1]
    # random forest
    y_proba += rf.fit(X_train, y_train).predict_proba(X_test)[:, 1]
    # extra tree
    y_proba += et.fit(X_train, y_train).predict_proba(X_test)[:, 1]
    # logreg
    y_proba += logreg.fit(X_train_norm, y_train).predict_proba(X_test_norm)[:, 1]
    y_proba /= 4.0
    final_proba = y_proba
    probas.append(final_proba)

target_1
target_2
target_3


In [None]:
tmp = pd.DataFrame(probas).T
baseline = pd.DataFrame(tmp.values, columns=['1', '2', '3', '4', '5'])
baseline['id'] = X_test['id']
baseline[['id', '1', '2', '3', '4', '5']].to_csv('baseline.csv', index=False)