A minimalistic **EDA**  

In [None]:
from contextlib import contextmanager
from IPython.display import display
import matplotlib.pyplot as plt
import category_encoders as ce
from pathlib import Path
from scipy import stats
import pandas_profiling 
import seaborn as sns
import pandas as pd
import numpy as np
import scipy as sp
import os, gc, re
import warnings
import logging
import random
import time
import cuml
import sys



# Seeds reused by later cells for row sampling and for the clustering /
# embedding models (only seeds[0] is read in the EDA part).
seeds = [2021, 42]
# Silence every warning for the rest of the session.
warnings.simplefilter('ignore')
# Apply seaborn's default theme, then switch to a matplotlib style sheet.
sns.set()
# NOTE(review): this style name presumably only exists on older matplotlib
# releases (the seaborn styles were renamed later) — confirm for the target env.
plt.style.use('seaborn-whitegrid')
# IPython magic: render figures inline in the notebook output.
%matplotlib inline 

In [None]:
def apply_pallete(colors):
    """Make ``colors`` the active seaborn palette and preview it as swatches.

    Parameters
    ----------
    colors : sequence
        Colour specifications accepted by ``sns.color_palette`` (the notebook
        passes a list of hex strings).

    Returns
    -------
    None
        The function is called purely for its plotting side effects.
    """
    # Resolve the palette once and reuse it for both seaborn calls.
    palette = sns.color_palette(colors)
    # The result of ``set_palette`` used to be bound to an unused local; the
    # call is only needed for its side effect on the global colour cycle.
    sns.set_palette(palette)
    sns.palplot(palette, size=0.5)
    # Hide tick labels and tick marks so only the swatches remain visible.
    plt.tick_params(axis='both', labelsize=0, length=0)
# Hex colours of the notebook-wide palette, in the order they are handed to
# seaborn; the list is also passed as ``palette=`` to later plots.
pallete = [
    "#6930c3", "#ff7f51", "#aa4465", "#ffa3a5",
    "#5e60ce", "#ff9b54", "#dd2d4a", "#f49cbb",
    "#0096c7", "#ffbf69", "#f26a8d", "#ffcbf2",
    "#48cae4", "#e9b827", "#f49cbb", "#e2afff",
    "#ade8f4", "#f9e576", "#ff86c8",
]
apply_pallete(pallete)

In [None]:
# Read both competition splits and report their shapes.
train, test = (
    pd.read_csv('../input/tabular-playground-series-mar-2021/train.csv'),
    pd.read_csv('../input/tabular-playground-series-mar-2021/test.csv'),
)
print(f'train: {train.shape}, test: {test.shape}')

In [None]:
# Correlation matrices the report is asked to calculate (Pearson, phi_k and
# Cramér's V on; Spearman and Kendall off).
correlation_settings = {
    'pearson': {'calculate': True},
    'spearman': {'calculate': False},
    'kendall': {'calculate': False},
    'phi_k': {'calculate': True},
    'cramers': {'calculate': True},
}
# Build the profiling report for the training frame without a progress bar;
# ``missing_diagrams`` is passed as None.
profile = train.profile_report(
    progress_bar=False,
    missing_diagrams=None,
    correlations=correlation_settings,
)
# Report theming: accent colour and full-width layout.
html_style = {
    'primary_color': '#5e60ce',
    'full_width': True,
}
profile.set_variable('html.style', html_style)

In [None]:
# Bare expression as the cell's last statement so the notebook renders the report.
profile

In [None]:
# Bar chart of how many training rows carry each target value.
target_counts = train.target.value_counts().add_prefix('target_')
plt.figure(figsize=(5, 7.5))
target_counts.plot.bar()
plt.title('Target Count')
plt.show()

In [None]:
# Bar chart of how many feature columns (id and target excluded) have each dtype.
feature_dtypes = train.drop(columns=['id', 'target']).dtypes
dtype_counts = feature_dtypes.value_counts().add_suffix('_dtype')
plt.figure(figsize=(5, 7.5))
dtype_counts.plot.bar()
plt.title('DataType Count')
plt.show()

In [None]:
# Column groups are derived from the column names: 'cont' marks continuous
# features and 'cat' marks categorical ones.
target = ['target']
dense_features = [name for name in train.columns if 'cont' in name]
sparse_features = [name for name in train.columns if 'cat' in name]
# Distinct-value count per categorical column, computed once; columns with
# fewer than 17 levels are treated as low cardinality.
cardinality = train[sparse_features].nunique()
low_cardinality_sparse_features = [name for name in sparse_features if cardinality[name] < 17]
high_cardinality_sparse_features = [name for name in sparse_features if cardinality[name] >= 17]


In [None]:
def autolabel(rects, ax=None):
    """Write each bar's height, truncated to an integer, just above the bar.

    Parameters
    ----------
    rects : iterable
        Bar artists (for example the container returned by ``Axes.bar``).
        Each item must provide ``get_height()``, ``get_x()`` and
        ``get_width()``.
    ax : optional
        Axes to draw the labels on. When omitted, each bar's own ``axes``
        attribute is used, so the function no longer depends on a global
        ``ax`` having been defined by the calling cell.
    """
    for rect in rects:
        target_ax = rect.axes if ax is None else ax
        height = rect.get_height()
        target_ax.text(
            rect.get_x() + rect.get_width() / 2.,  # horizontal centre of the bar
            1.05 * height,                         # 5% above the bar's top
            f'{int(height)}',
            ha='center',
            va='bottom',
            size=14,
        )


In [None]:
# Bar chart of the number of distinct levels of every categorical column in train.
train_cardinality = train[sparse_features].nunique()
fig, ax = plt.subplots(figsize=(12, 6))
barp = ax.bar(sparse_features, train_cardinality)
ax.set(
    title='Train Cardinality',
    xlabel='Variables',
    ylabel='Number of unique categories',
    ylim=[0, 350],
)
ax.tick_params(axis='x', rotation=45)
# Print the count above each bar.
autolabel(barp)


In [None]:
# Bar chart of the number of distinct levels of every categorical column in test.
test_cardinality = test[sparse_features].nunique()
fig, ax = plt.subplots(figsize=(12, 6))
barp = ax.bar(sparse_features, test_cardinality)
ax.set(
    title='Test Cardinality',
    xlabel='Variables',
    ylabel='Number of unique categories',
    ylim=[0, 350],
)
ax.tick_params(axis='x', rotation=45)
# Print the count above each bar.
autolabel(barp)

In [None]:
# One box per continuous feature of the training data (long format: variable/value).
long_train = train[dense_features].melt()
plt.figure(figsize=(20, 8))
sns.boxplot(data=long_train, x="variable", y="value", palette=pallete)
plt.title('Train Continous')
plt.show()

In [None]:
# One box per continuous feature of the test data (long format: variable/value).
long_test = test[dense_features].melt()
plt.figure(figsize=(20, 8))
sns.boxplot(data=long_test, x="variable", y="value", palette=pallete)
plt.title('Test Continous')
plt.show()

In [None]:
# Vertical offsets (in data units) added to the y position of the median / max /
# min annotations.
vertical_offset_median = 0.025
vertical_offset_max = 0.0125
vertical_offset_min = -0.025
# Keyword arguments shared by every annotation.
annotation_style = dict(horizontalalignment='center', size='large',
                        color='black', weight='semibold')

# Fix: the ``plt.figure()`` call that used to precede ``plt.subplots`` only
# created an extra, empty figure and has been removed.
fig, axes = plt.subplots(3, 4, figsize=(16, 30))
panels = axes.ravel()
for panel, c in zip(panels, dense_features):
    # Explicit box order on the x axis: train first, test second.
    order = [f'{c}_tr', f'{c}_te']
    data = pd.melt(pd.concat([
        train[[c]].rename(columns={c: order[0]}),
        test[[c]].rename(columns={c: order[1]}),
    ]))
    # Fix: the statistics used to be looked up by tick position on a groupby
    # result, whose keys are sorted ('*_te' before '*_tr'), so a box could be
    # annotated with the other split's numbers. They are now re-indexed to the
    # plotting order and looked up by label.
    stats = (
        data.groupby('variable')['value']
        .agg(['median', 'min', 'max'])
        .round(4)
        .reindex(order)
    )
    box_plot = sns.boxplot(x="variable", y="value", data=data, order=order,
                           palette=pallete, ax=panel)
    for xtick, label in enumerate(order):
        box_plot.text(xtick, stats['median'].max() + vertical_offset_median,
                      stats.loc[label, 'median'], **annotation_style)
        box_plot.text(xtick, stats['max'].max() + vertical_offset_max,
                      stats.loc[label, 'max'], **annotation_style)
        box_plot.text(xtick, stats['min'].min() + vertical_offset_min,
                      stats.loc[label, 'min'], **annotation_style)
    panel.set_xlabel(c, fontsize=15)
    panel.set_ylabel(None)
# Hide grid cells that have no feature to show.
for unused in panels[len(dense_features):]:
    unused.set_visible(False)
plt.show()

In [None]:
# For every low-cardinality categorical column: box plots of all continuous
# features, split by the levels of that column.
for c in low_cardinality_sparse_features:
    # Long format keyed by the categorical column.
    melt_df = train[[*dense_features, c]].melt(
        id_vars=c, var_name='Numeric Feaures', value_name='Value')
    fig, ax = plt.subplots(figsize=(20, 8))
    sns.boxplot(x='Numeric Feaures', y='Value', hue=c, data=melt_df,
                palette=pallete, ax=ax)
    # ncol is the number of line artists on the axes — presumably just a large
    # enough count to keep the legend on a single row; TODO confirm.
    ax.legend(loc="lower left", ncol=len(ax.lines))
    ax.set_title(f'{c}')
    ax.set_ylabel(None)
    # ``ax`` was just created, so it is the current axes; clear its x label.
    ax.xaxis.set_label_text(None)


In [None]:
# For every low-cardinality categorical column: share of rows and mean target
# per level, drawn as grouped bars.
for cat in low_cardinality_sparse_features:
    summary = train.groupby(cat).agg(
        freq=('target', 'count'),
        mean_target=('target', 'mean'),
    )
    # Turn the per-level row count into a fraction of the whole training set.
    summary['freq'] = summary['freq'] / len(train)
    # Figure width grows with the number of levels.
    bar_ax = summary.plot(kind='bar', figsize=(5 + train[cat].nunique(), 7.5))
    bar_ax.set_title(f'{cat}')
    bar_ax.set_ylabel('Frequency & Target Mean')



In [None]:
# For every continuous feature: bin it into 10 quantile buckets (stored on
# ``train`` as ``q_<feature>``) and plot the mean target per bucket.
for c in dense_features:
    decile_col = f'q_{c}'
    train[decile_col] = pd.qcut(train[c], 10, labels=list(range(10)))
    melt_df = train[[*target, decile_col]].melt(
        id_vars=decile_col, var_name='Value', value_name='Target')
    fig, ax = plt.subplots(figsize=(20, 8))
    sns.barplot(x='Value', y='Target', hue=decile_col, data=melt_df,
                palette=pallete, ax=ax)
    ax.legend(loc="lower left", ncol=len(ax.lines))
    ax.set_title(f'Quantized {c}')
    ax.set_ylabel('Mean Target', fontsize=15)
    ax.set_xlabel(None)
    # The x axis only carries the single melted variable name, so hide it.
    ax.xaxis.set_visible(False)


In [None]:
# Histogram + KDE of every continuous feature on a 5% sample of train.
train_ = train.sample(frac=0.05, random_state=seeds[0])
# Fix: the ``plt.figure()`` call that used to precede ``plt.subplots`` only
# created an extra, empty figure; panels are now addressed through the axes
# returned by ``subplots`` instead of pyplot's "current axes" state.
fig, axes = plt.subplots(4, 3, figsize=(12, 16))
panels = axes.ravel()
fig.suptitle('Continous Train Features', fontsize=18)
for panel, col in zip(panels, dense_features):
    sns.distplot(train_[col], color="darkblue", kde=True, bins=60, label=col, ax=panel)
    panel.set_xlabel(col, fontsize=9)
# Hide grid cells that have no feature to show.
for unused in panels[len(dense_features):]:
    unused.set_visible(False)
plt.show()

In [None]:
# Histogram + KDE of every continuous feature on a 5% sample of test.
test_ = test.sample(frac=0.05, random_state=seeds[0])
# Fix: the ``plt.figure()`` call that used to precede ``plt.subplots`` only
# created an extra, empty figure; panels are now addressed through the axes
# returned by ``subplots`` instead of pyplot's "current axes" state.
fig, axes = plt.subplots(4, 3, figsize=(12, 16))
panels = axes.ravel()
fig.suptitle('Continous Test Features', fontsize=18)
for panel, col in zip(panels, dense_features):
    sns.distplot(test_[col], color="darkcyan", kde=True, bins=60, label=col, ax=panel)
    panel.set_xlabel(col, fontsize=9)
# Hide grid cells that have no feature to show.
for unused in panels[len(dense_features):]:
    unused.set_visible(False)
plt.show()

In [None]:
# Binary-encode every categorical column, append the encoded columns to
# ``train`` and remember their names in ``benc_features``.
benc_features = []
for col in sparse_features:
    benc = ce.BinaryEncoder(cols=col, drop_invariant=True).fit(train[col])
    train_be = benc.transform(train[col])
    be_features = list(train_be.columns)
    # Assign raw values so the assignment is positional, not index-aligned.
    train[be_features] = train_be.to_numpy()
    benc_features += be_features


In [None]:
# Model inputs for the clustering / embedding cells: binary-encoded categoricals
# followed by the continuous columns, on a 5% sample that also keeps the target.
features = [*benc_features, *dense_features]
train_ = train[[*features, *target]].sample(frac=0.05, random_state=seeds[0])


In [None]:
# Cluster the sampled rows into 4 groups; inputs are cast to float32 first.
kmeans_input = train_[features].astype(np.float32)
model = cuml.cluster.KMeans(
    n_clusters=4,
    init="k-means||",
    oversampling_factor=40,
    random_state=seeds[0],
)
y_pred = model.fit_predict(kmeans_input)


In [None]:
# Project the sampled rows onto two principal components and store them as
# ``pca_1`` / ``pca_2`` on the sample frame.
pca = cuml.decomposition.PCA(n_components=2, whiten=False, random_state=seeds[0])
pca_tr = pca.fit_transform(train_[features].astype(np.float32))
for component in range(2):
    train_[f'pca_{component + 1}'] = pca_tr[:, component]

In [None]:

# PCA projection coloured by KMeans cluster id.
cmap = plt.cm.get_cmap("Accent")
plt.figure(figsize=(15, 15))
points = plt.scatter(train_.pca_1, train_.pca_2, c=y_pred, s=10, cmap=cmap)
# Fix: the scatter carries no ``label``, so a bare ``plt.legend()`` had nothing
# to list. The legend entries are now built from the scatter's colour mapping.
plt.legend(*points.legend_elements(), title='KMeans cluster')
plt.title('PCA-KMEANS')
plt.show()

In [None]:
# PCA projection coloured by the target value.
cmap = plt.cm.get_cmap("flag")
plt.figure(figsize=(15, 15))
points = plt.scatter(train_.pca_1, train_.pca_2, c=train_.target, s=10, cmap=cmap)
# Fix: the scatter carries no ``label``, so a bare ``plt.legend()`` had nothing
# to list. The legend entries are now built from the scatter's colour mapping.
plt.legend(*points.legend_elements(), title='target')
plt.title('PCA-TARGET')
plt.show()

In [None]:
# Two-dimensional t-SNE embedding of the sampled rows, stored as
# ``tsne_1`` / ``tsne_2`` on the sample frame.
tsne_input = train_[features].astype(np.float32)
model = cuml.TSNE(n_components=2, perplexity=20.0, random_state=seeds[0])
tsne_embed = model.fit_transform(tsne_input)
for component in range(2):
    train_[f'tsne_{component + 1}'] = tsne_embed[:, component]

In [None]:
# t-SNE embedding, one labelled scatter per target class. The positive class is
# drawn first, the negative class second (same draw order as before).
plt.figure(figsize=(16, 16))
class_styles = (
    (1, 'blue', 'Positive Target'),
    (0, 'orange', 'Negative Target'),
)
for target_value, colour, legend_label in class_styles:
    subset = train_.loc[train_.target == target_value]
    plt.scatter(subset.tsne_1, subset.tsne_2, color=colour, s=10, label=legend_label)
plt.legend()
plt.title('TSNE-TARGET')
plt.show()

In [None]:
# Fit UMAP on the sampled rows, then transform the same rows and store the two
# coordinates as ``umap_1`` / ``umap_2``.
# NOTE(review): unlike the KMeans / PCA / t-SNE cells, the input is not cast to
# float32 here — confirm whether that is intentional.
umap_model = cuml.manifold.UMAP(
    n_neighbors=20,
    n_components=2,
    n_epochs=500,
    learning_rate=0.5,
    random_state=seeds[0],
)
mapper = umap_model.fit(train_[features])
umap_embed = mapper.transform(train_[features])
for component in range(2):
    train_[f'umap_{component + 1}'] = umap_embed[:, component]

In [None]:
# UMAP embedding coloured by KMeans cluster id.
cmap = plt.cm.get_cmap("Accent")
plt.figure(figsize=(15, 15))
points = plt.scatter(train_.umap_1, train_.umap_2, c=y_pred, s=10, cmap=cmap)
# Fix: the scatter carries no ``label``, so a bare ``plt.legend()`` had nothing
# to list. The legend entries are now built from the scatter's colour mapping.
plt.legend(*points.legend_elements(), title='KMeans cluster')
plt.title('UMAP-KMEANS')
plt.show()

In [None]:
# UMAP embedding coloured by the target value.
cmap = plt.cm.get_cmap("flag")
plt.figure(figsize=(15, 15))
points = plt.scatter(train_.umap_1, train_.umap_2, c=train_.target, s=10, cmap=cmap)
# Fix: the scatter carries no ``label``, so a bare ``plt.legend()`` had nothing
# to list. The legend entries are now built from the scatter's colour mapping.
plt.legend(*points.legend_elements(), title='target')
plt.title('UMAP-TARGET')
plt.show()

In [None]:
!pip install --no-warn-conflicts -q --upgrade xgboost

A Simple Blend of **XGB-CB-LGBM**

In [None]:
from sklearn.model_selection import StratifiedKFold
from category_encoders import LeaveOneOutEncoder
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score
from collections import defaultdict
from lightgbm import LGBMClassifier
from IPython.display import display
from scipy.stats import rankdata
from xgboost import DMatrix
import lightgbm as lgbm
import xgboost as xgb 
import pandas as pd
import numpy as np

# seeds[bag] seeds the fold split of each bag; seeds[0] seeds all three models.
seeds = [2021, 0, 42]
# Number of stratified folds per bag.
n_splits = 10
# Whether the fold splitter shuffles rows before splitting.
shuffle=True
# Upper bound on boosting rounds (CatBoost ``iterations`` and the
# ``num_boost_round`` of XGBoost / LightGBM); early stopping normally ends sooner.
iterations = 50000
# Patience passed as ``early_stopping_rounds`` to every model fit.
early_stopping_rounds = 400
# Verbosity passed to CatBoost (``verbose``) and XGBoost (``verbose_eval``).
verbose_eval = 0
# Number of passes of the second-stage (baseline-initialised) CatBoost/XGBoost loop.
baseline_rounds = 1
# Base learning rates; the second stage divides them by the round number.
cb_learning_rate = 0.006
xgb_learning_rate = 0.01
# Number of outer repetitions and number of prediction columns stored per bag.
n_bags = 1
n_models = 3

# Re-read both splits from disk and derive the column groups from the column
# names ('cat' marks categorical columns, 'con' continuous ones).
train, test = (
    pd.read_csv('../input/tabular-playground-series-mar-2021/train.csv'),
    pd.read_csv('../input/tabular-playground-series-mar-2021/test.csv'),
)

cat_cols = [name for name in train.columns if 'cat' in name]
cont_cols = [name for name in train.columns if 'con' in name]

# Make train and test share the same category levels: a level seen in only one
# split is blanked out and replaced by the most frequent train level.
for col in cat_cols:
    # Compute each split's level set once (the original recomputed ``unique()``
    # six times per column and built an unused union of both sets).
    train_levels = set(train[col].unique())
    test_levels = set(test[col].unique())
    train_only = list(train_levels - test_levels)
    test_only = list(test_levels - train_levels)
    train.loc[train[col].isin(train_only), col] = np.nan
    test.loc[test[col].isin(test_only), col] = np.nan
    # The mode is taken after blanking, so it is always a level both splits keep.
    mode = train[col].mode().values[0]
    train[col] = train[col].fillna(mode)
    test[col] = test[col].fillna(mode)
    
# Separate identifiers / labels from the feature frames and cache row counts.
y_train = train.target
test_id = test.id
X_train = train.drop(columns=['id', 'target'])
X_test_ = test.drop(columns=['id'])
ltrn, lte = len(train), len(test)

# Integer-encode every categorical column with an encoder fitted on train, then
# mark the encoded column as pandas 'category' in both frames.
for col in cat_cols:
    label_encoder = LabelEncoder().fit(X_train[col])
    for frame in (X_train, X_test_):
        frame[col] = label_encoder.transform(frame[col])
        frame[col] = frame[col].astype('category')
    
# Add a target-encoded companion column ``<col>_leave_one_out`` for every
# categorical column, using an encoder fitted on the full training set.
# NOTE(review): ``transform`` is called on the training rows without ``y``, which
# presumably yields plain per-level target statistics rather than true
# leave-one-out values, and the encoder sees all folds' labels — confirm this is
# acceptable for the cross-validated scores computed later.
leave_one_out_cols = []
for col in cat_cols:
    new_col = f'{col}_leave_one_out'
    leave_one_out_encoder = LeaveOneOutEncoder().fit(X_train[col].astype('str'), y_train)
    for frame in (X_train, X_test_):
        frame[new_col] = leave_one_out_encoder.transform(frame[col].astype('str'))
    leave_one_out_cols.append(new_col)
    
# Two feature views: raw categoricals + continuous columns, and the same with the
# categoricals replaced by their target-encoded companions.
default_cols = cat_cols + cont_cols
replaced_cat_cols_with_loo = leave_one_out_cols + cont_cols


X_test = X_test_[default_cols]
X_test_loo = X_test_[replaced_cat_cols_with_loo]
# One column per (bag, model): out-of-fold predictions and test predictions.
bags_oof = np.zeros((ltrn, n_bags*n_models))
bags_preds = np.zeros((lte, n_bags*n_models))

# Fix: the parameter sets were built with ``defaultdict(**kwargs)`` and no
# default factory, i.e. used as ordinary dicts; plain ``dict`` says so directly.
xgb_params = dict(
    seed=seeds[0],
    objective='binary:logistic',
    max_depth=14,
    eta=xgb_learning_rate,
    colsample_bytree=0.4,
    subsample=0.8,
    alpha=7.5,
    gamma=0.75,
    min_child_weight=1.5,
    max_bin=333,
    n_jobs=2,
    eval_metric='logloss',
    tree_method='gpu_hist',
    gpu_id=0,
    predictor='gpu_predictor',
)
cb_params = dict(
    random_seed=seeds[0],
    iterations=iterations,
    learning_rate=cb_learning_rate,
    depth=7,
    bootstrap_type='Bernoulli',
    random_strength=1,
    min_data_in_leaf=2,
    l2_leaf_reg=9,
    loss_function='Logloss',
    eval_metric='AUC',
    grow_policy='Depthwise',
    max_bin=1024,
    model_size_reg=0,
    task_type='GPU',
    od_type='IncToDec',
    od_wait=100,
    verbose=verbose_eval,
    subsample=0.8,
    od_pval=1e-10,
    max_ctr_complexity=8,
    has_time=False,
    simple_ctr='FeatureFreq',
    combinations_ctr='FeatureFreq',
)
# NOTE(review): no ``objective`` is set for LightGBM, so it presumably falls back
# to the library default — confirm that is intended for a 0/1 target.
lgbm_params = dict(
    seed=seeds[0],
    max_depth=75,
    subsample=0.85,
    colsample_bytree=0.25,
    learning_rate=0.01,
    reg_lambda=7.5,
    reg_alpha=0.75,
    min_child_samples=200,
    num_leaves=200,
    max_bin=777,
    cat_smooth=80,
    cat_l2=5,
    metric='auc',
    n_jobs=-1,
    verbose=-1,
)
   
# Cross-validated two-stage blend.
#   Stage 1: CatBoost, XGBoost and LightGBM are fitted on the raw feature view;
#            their rank-normalised predictions are averaged into a baseline.
#   Stage 2: CatBoost (raw view) and XGBoost (target-encoded view) are fitted
#            starting from that baseline.
# Per bag, three columns are stored: LightGBM, stage-2 CatBoost, stage-2 XGBoost.
#
# Fix: test predictions used to be assigned with ``=`` inside the fold loop, so
# each fold overwrote the previous one and only the last fold's models reached
# ``bags_preds``. They are now accumulated as an average over the ``n_splits``
# folds (``bags_preds`` starts out as zeros).

# Positions of the categorical columns inside ``default_cols`` (which is built
# as ``cat_cols + cont_cols``), shared by every CatBoost Pool below.
cat_feature_idx = list(range(len(cat_cols)))

for bag in range(n_bags):
    # Per-model out-of-fold buffers (filled below, not read again in this cell).
    oof_cb = np.zeros((ltrn))
    oof_xgb = np.zeros((ltrn))
    oof_cbx = np.zeros((ltrn))
    oof_xgbx = np.zeros((ltrn))
    oof_lgb = np.zeros((ltrn))
    folds = StratifiedKFold(n_splits=n_splits, shuffle=shuffle, random_state=seeds[bag])
    for fold, (train_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
        print(f'Bag {bag+1} Fold {fold+1}')
        X_tr = X_train[default_cols].iloc[train_idx]
        X_val = X_train[default_cols].iloc[val_idx]
        X_tr_loo = X_train[replaced_cat_cols_with_loo].iloc[train_idx]
        X_val_loo = X_train[replaced_cat_cols_with_loo].iloc[val_idx]
        y_tr = y_train.iloc[train_idx]
        y_val = y_train.iloc[val_idx]
        # Row counts used to scale ranks into (0, 1].
        ltr = len(X_tr)
        lv = len(X_val)

        # ---- Stage 1: CatBoost on the raw feature view ----
        ptrain = Pool(data=X_tr, label=y_tr, cat_features=cat_feature_idx)
        pvalid = Pool(data=X_val, label=y_val, cat_features=cat_feature_idx)
        ptest = Pool(data=X_test, cat_features=cat_feature_idx)
        CModel = CatBoostClassifier(**cb_params)
        CModel.fit(ptrain,
                   eval_set=pvalid,
                   use_best_model=True,
                   early_stopping_rounds=early_stopping_rounds)
        temp_fold_preds = rankdata(CModel.predict_proba(pvalid)[:,1])/lv
        oof_cb[val_idx] = temp_fold_preds
        first_cb_auc = roc_auc_score(y_val, temp_fold_preds)
        print(f'AUC of CB model is {first_cb_auc}')
        baseline_preds_tr_cb = rankdata(CModel.predict_proba(ptrain)[:,1])/ltr
        baseline_preds_vl_cb = temp_fold_preds
        test_preds_cb = rankdata(CModel.predict_proba(ptest)[:,1])/lte

        # ---- Stage 1: XGBoost with native categorical support ----
        xtrain = DMatrix(data=X_tr, label=y_tr, nthread=2, enable_categorical=True)
        xvalid = DMatrix(data=X_val, label=y_val, nthread=2, enable_categorical=True)
        xtest = DMatrix(data=X_test, nthread=2, enable_categorical=True)
        XModel = xgb.train(xgb_params, xtrain,
                           evals=[(xvalid,'validation')],
                           verbose_eval=verbose_eval,
                           early_stopping_rounds=early_stopping_rounds,
                           xgb_model=None,
                           num_boost_round=iterations)
        temp_fold_preds = rankdata(XModel.predict(xvalid))/lv
        oof_xgb[val_idx] = temp_fold_preds
        first_xgb_auc = roc_auc_score(y_val, temp_fold_preds)
        print(f'AUC of XGB model with categorical data is {first_xgb_auc}')
        baseline_preds_tr_xgb = rankdata(XModel.predict(xtrain))/ltr
        baseline_preds_vl_xgb = temp_fold_preds
        test_preds_xgb = rankdata(XModel.predict(xtest))/lte

        # ---- Stage 1: LightGBM (stored as model column 0 of the bag) ----
        ltrain = lgbm.Dataset(X_tr, label=y_tr, init_score=None, categorical_feature=cat_cols)
        lvalid = lgbm.Dataset(X_val, label=y_val, init_score=None, categorical_feature=cat_cols)
        # Fix: an unused ``ltest`` Dataset that attached the validation labels to
        # the test rows was removed; test predictions use ``X_test`` directly.
        LModel = lgbm.train(lgbm_params,
                        train_set=ltrain,
                        num_boost_round=iterations,
                        valid_sets=lvalid,
                        init_model=None,
                        early_stopping_rounds=early_stopping_rounds,
                        categorical_feature=cat_cols,
                        verbose_eval=False)
        temp_fold_preds = rankdata(LModel.predict(X_val))/lv
        oof_lgb[val_idx] = temp_fold_preds
        test_preds_lgb = rankdata(LModel.predict(X_test))/lte
        bags_oof[val_idx,bag*n_models] = temp_fold_preds
        # Average over folds instead of keeping only the last fold.
        bags_preds[:,bag*n_models] += test_preds_lgb / n_splits
        first_lgb_auc = roc_auc_score(y_val, temp_fold_preds)
        print(f'AUC of LGBM model is {first_lgb_auc}')
        baseline_preds_tr_lgb = rankdata(LModel.predict(X_tr))/ltr
        baseline_preds_vl_lgb = temp_fold_preds

        # Equal-weight average of the three stage-1 models' rank-normalised scores.
        baseline_train = (baseline_preds_tr_xgb+baseline_preds_tr_lgb+baseline_preds_tr_cb)/3
        baseline_valid = (baseline_preds_vl_xgb+baseline_preds_vl_lgb+baseline_preds_vl_cb)/3
        baseline_test = (test_preds_xgb+test_preds_lgb+test_preds_cb)/3

        # ---- Stage 2: models initialised from the running baseline ----
        # NOTE(review): CatBoost ``baseline`` and XGBoost ``base_margin`` are
        # presumably interpreted as raw (pre-link) scores, while the values passed
        # here are ranks scaled into (0, 1] — confirm this is intended.
        for baseline in range(baseline_rounds):
            ptrain = Pool(data=X_tr, label=y_tr, cat_features=cat_feature_idx, baseline=baseline_train)
            pvalid = Pool(data=X_val, label=y_val, cat_features=cat_feature_idx, baseline=baseline_valid)
            ptest = Pool(data=X_test, cat_features=cat_feature_idx, baseline=baseline_test)
            cb_params_ = cb_params.copy()
            # The learning rate shrinks with every additional baseline round.
            cb_params_.update({'learning_rate': cb_learning_rate*(1/(baseline+1))})
            CModel = CatBoostClassifier(**cb_params_)
            CModel.fit(ptrain,
                       eval_set=pvalid,
                       use_best_model=True,
                       early_stopping_rounds=early_stopping_rounds)
            temp_fold_preds = rankdata(CModel.predict_proba(pvalid)[:,1])/lv
            oof_cbx[val_idx] = temp_fold_preds
            second_cb_auc = roc_auc_score(y_val, temp_fold_preds)
            print(f'AUC of CB model with baseline round is {baseline+1} {second_cb_auc}')
            # The CatBoost output becomes the baseline of the XGBoost model below.
            baseline_train = rankdata(CModel.predict_proba(ptrain)[:,1])/ltr
            baseline_valid = temp_fold_preds
            baseline_test = rankdata(CModel.predict_proba(ptest)[:,1])/lte
            if baseline == baseline_rounds - 1:
                bags_oof[val_idx,bag*n_models+1] = temp_fold_preds
                # Average over folds instead of keeping only the last fold.
                bags_preds[:,bag*n_models+1] += baseline_test / n_splits

            # XGBoost on the target-encoded view, started from the CatBoost scores.
            xtrain = DMatrix(data=X_tr_loo, label=y_tr, enable_categorical=False, base_margin=baseline_train)
            xvalid = DMatrix(data=X_val_loo, label=y_val, enable_categorical=False, base_margin=baseline_valid)
            xtest =  DMatrix(data=X_test_loo, enable_categorical=False, base_margin=baseline_test)
            xgb_params_ = xgb_params.copy()
            xgb_params_.update({'learning_rate': xgb_learning_rate*(1/(baseline+1))})
            XModel = xgb.train(xgb_params_, xtrain,
                               evals=[(xvalid,'validation')],
                               verbose_eval=verbose_eval,
                               early_stopping_rounds=early_stopping_rounds,
                               xgb_model=None,
                               num_boost_round=iterations)
            temp_fold_preds = rankdata(XModel.predict(xvalid))/lv
            oof_xgbx[val_idx] = temp_fold_preds
            # The XGBoost output becomes the baseline of the next round (if any).
            baseline_train = rankdata(XModel.predict(xtrain))/ltr
            baseline_valid = temp_fold_preds
            baseline_test = rankdata(XModel.predict(xtest))/lte
            if baseline == baseline_rounds - 1:
                bags_oof[val_idx,bag*n_models+2] = temp_fold_preds
                # Average over folds instead of keeping only the last fold.
                bags_preds[:,bag*n_models+2] += baseline_test / n_splits
            second_xgb_auc = roc_auc_score(y_val, temp_fold_preds)
            print(f'AUC of XGB model with baseline round is {baseline+1} {second_xgb_auc}')
            print('*' * 75)
    # Out-of-fold AUC of the equal-weight blend of this bag's three stored models.
    bag_auc = roc_auc_score(y_train, np.mean(bags_oof[:,bag*n_models:(bag+1)*n_models], axis=1))
    print(f'AUC of average bag {bag+1} models is {bag_auc}')
    print('#' * 80)


In [None]:
# Equal-weight blend across every stored (bag, model) column.
blend_oof = bags_oof.mean(axis=1)
blend_test = bags_preds.mean(axis=1)

# Out-of-fold AUC of the blend.
final_auc = roc_auc_score(y_train, blend_oof)
print(f'AUC of average of bags is {final_auc}')

# Write the submission file and show its first rows.
submission = pd.DataFrame({'id': test_id, 'target': blend_test})
submission.to_csv('submission.csv', index=False)
display(submission.head(3))

# Persist the raw per-model matrices for later reuse.
for file_stem, matrix in (('bags_oof', bags_oof), ('bags_preds', bags_preds)):
    np.save(file_stem, matrix)