# Overview

The purpose of this kernel is to take a look at the data, come up with some insights, and attempt to create a predictive model or two. This notebook is still very raw. I will work on it as my very limited time permits, and hope to expand it in the upcoming days and weeks.

# Packages

First, let's load a few useful Python packages. This section will keep growing in subsequent versions of this EDA.

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gc
import json
import math


import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm
from sklearn.decomposition import PCA
from pandas_profiling import ProfileReport
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier
from lightgbm import LGBMClassifier

import os

%matplotlib inline

Let's see what files we have in the input directory:

In [None]:
import os  # also imported in the packages cell above

# Print the names of the files shipped with the competition dataset.
print(os.listdir(path="../input/tabular-playground-series-mar-2021/"))

In [None]:
# Load the train and test tables plus the sample submission, which is reused
# later as the template for every submission file.
train = pd.read_csv("../input/tabular-playground-series-mar-2021/train.csv")
test = pd.read_csv("../input/tabular-playground-series-mar-2021/test.csv")
sample_submission = pd.read_csv("../input/tabular-playground-series-mar-2021/sample_submission.csv")

In [None]:
# Preview the first rows of the training table.
train.head()

In [None]:
# Preview the first rows of the test table.
test.head()

In [None]:
# Column dtypes, non-null counts and memory footprint of the training table.
train.info()

In [None]:
# Column dtypes, non-null counts and memory footprint of the test table.
test.info()

In [None]:
# Summary statistics of the training table.
train.describe()

In [None]:
# Summary statistics of the test table, for comparison with train.
test.describe()

Let's look at the distribution of the target:



In [None]:
# How many training rows fall into each target class.
train.target.value_counts()

In [None]:
# Mean of the target column (the share of 1s if the target is coded 0/1).
train['target'].mean()

So this is a binary classification problem with imbalanced data.

In [None]:
# Frequency of each cat0 level in the training set.
train.cat0.value_counts()

In [None]:
# Frequency of each cat1 level in the training set.
train.cat1.value_counts()

In [None]:
# Histogram of the raw cat1 values in the training set.
fig, ax = plt.subplots(figsize=(12, 5))
ax.hist(train['cat1'].values, bins=200)
ax.set_title('Histogram cat1 counts in train')
ax.set_xlabel('Value')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Frequency of each cat2 level in the training set.
train.cat2.value_counts()

In [None]:
# Histogram of the raw cat2 values in the training set.
fig, ax = plt.subplots(figsize=(12, 5))
ax.hist(train['cat2'].values, bins=200)
ax.set_title('Histogram cat2 counts in train')
ax.set_xlabel('Value')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Frequency of each cat3 level in the training set.
train.cat3.value_counts()

In [None]:
# Histogram of the raw cat3 values in the training set.
fig, ax = plt.subplots(figsize=(12, 5))
ax.hist(train['cat3'].values, bins=200)
ax.set_title('Histogram cat3 counts in train')
ax.set_xlabel('Value')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Frequency of each cat4 level in the training set.
train.cat4.value_counts()

In [None]:
# Histogram of the raw cat4 values in the training set.
fig, ax = plt.subplots(figsize=(12, 5))
ax.hist(train['cat4'].values, bins=200)
ax.set_title('Histogram cat4 counts in train')
ax.set_xlabel('Value')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Frequency of each cat5 level in the training set.
train.cat5.value_counts()

In [None]:
# Histogram of the raw cat5 values in the training set.
fig, ax = plt.subplots(figsize=(12, 5))
ax.hist(train['cat5'].values, bins=200)
ax.set_title('Histogram cat5 counts in train')
ax.set_xlabel('Value')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Frequency of each cat6 level in the training set.
train.cat6.value_counts()

In [None]:
# Histogram of the raw cat6 values in the training set.
plt.figure(figsize=(12, 5))
plt.hist(train['cat6'].values, bins=200)
# The title used to read "cat5" (copy-paste slip) although cat6 is what is plotted.
plt.title('Histogram cat6 counts in train')
plt.xlabel('Value')
plt.ylabel('Count')
plt.show()

In [None]:
# Build the pandas-profiling report for the training table (rendered in the next cell).
profile_train = ProfileReport(
    train,
    title='Pandas Train Profiling Report',
    html=dict(style=dict(full_width=True)),
)

In [None]:
# Display the training-set profiling report as the cell output.
profile_train

In [None]:
# Build the pandas-profiling report for the test table.
# The title used to say "Train", which mislabelled the rendered test report.
profile_test = ProfileReport(test, title='Pandas Test Profiling Report', html={'style':{'full_width':True}})

In [None]:
# Display the test-set profiling report as the cell output.
profile_test

Now let's do some simple modeling. First, we'll have to encode all the categorical variables so that we can use them with numerical algorithms.

In [None]:
# Target vector, and feature matrices without the id (and target) columns.
y = train["target"]
X = train.drop(columns=["id", "target"])
X_test = test.drop(columns=["id"])

In [None]:

# Columns whose name starts with "cat" are treated as categorical.
list_cat = [name for name in X.columns if name.startswith("cat")]

# Stack train and test so each column gets one integer mapping shared by both.
X_all = pd.concat([X, X_test])

le = LabelEncoder()
for col in list_cat:
    # The same encoder object is refit on every column in turn.
    X_all[col] = le.fit_transform(X_all[col])

X_all.head()

In [None]:
# Undo the stacking: the first len(train) rows are train, the remainder is test.
X = X_all.iloc[:train.shape[0]]
X_test = X_all.iloc[train.shape[0]:]


In [None]:
# One out-of-fold prediction slot per training row.
train_oof = np.zeros(len(X))
# Accumulator for the fold-averaged test predictions; starts as the scalar 0.
test_preds = 0
train_oof.shape

In [None]:
%%time
NUM_FOLDS = 10
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=137)
max_iter = 350

for f, (train_ind, val_ind) in tqdm(enumerate(kf.split(X, y))):
        #print(f'Fold {f}')
        train_df, val_df = X.iloc[train_ind], X.iloc[val_ind]
        train_target, val_target = y.iloc[train_ind], y.iloc[val_ind]
        
        
        model = HistGradientBoostingClassifier(max_iter=max_iter, validation_fraction=None, learning_rate=0.05, 
                                               max_depth=9, min_samples_leaf=23, max_leaf_nodes=100)
        

        model =  model.fit(train_df, train_target)
        temp_oof = model.predict_proba(val_df)[:,1]
        temp_test = model.predict_proba(X_test)[:,1]

        train_oof[val_ind] = temp_oof
        test_preds += temp_test/NUM_FOLDS
        
        print(roc_auc_score(val_target, temp_oof))

In [None]:
# AUC of the out-of-fold predictions over the full training set.
roc_auc_score(y_true=y, y_score=train_oof)


In [None]:
# Bare literal; presumably the OOF AUC noted down from an earlier run — TODO confirm.
0.8912443537006325

In [None]:
# Persist the current OOF and test predictions as .npy files (np.save appends the extension).
np.save('train_oof_hgb_0', train_oof)
np.save('test_preds_hgb_0', test_preds)

Now let's take a look at LightGBM instead.

In [None]:
%%time

train_oof_lgbm_0 = np.zeros((X.shape[0],))
test_preds_lgbm_0 = 0
train_oof_lgbm_0.shape

NUM_FOLDS = 10
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=137)


for f, (train_ind, val_ind) in tqdm(enumerate(kf.split(X, y))):
        #print(f'Fold {f}')
        train_df, val_df = X.iloc[train_ind], X.iloc[val_ind]
        train_target, val_target = y.iloc[train_ind], y.iloc[val_ind]
        
        
        model = LGBMClassifier(
                    cat_feature=[x for x in range(19)],
                    random_state=777,
                    cat_l2=25.999876242730252,
                    cat_smooth=89.2699690675538,
                    colsample_bytree=0.2557260109926193,
                    learning_rate=0.004,
                    max_bin=788,
                    max_depth=81,
                    metric="auc",
                    min_child_samples=292,
                    min_data_per_group=177,
                    n_estimators=4000000,
                    n_jobs=-1,
                    num_leaves=171,
                    reg_alpha=0.7115353581785044,
                    reg_lambda=5.658115293998945,
                    subsample=0.9262904583735796,
                    subsample_freq=1,
                    verbose=-1,
                )
        

        model =  model.fit(train_df, train_target, eval_set=[(val_df,val_target)],early_stopping_rounds=450,verbose=False)
        temp_oof = model.predict_proba(val_df)[:,1]
        temp_test = model.predict_proba(X_test)[:,1]

        train_oof_lgbm_0[val_ind] = temp_oof
        test_preds_lgbm_0 += temp_test/NUM_FOLDS
        
        print(roc_auc_score(val_target, temp_oof))
        
print(roc_auc_score(y, train_oof_lgbm_0))
np.save('train_oof_lgbm_0', train_oof_lgbm_0)
np.save('test_preds_lgbm_0', test_preds_lgbm_0)


Instead of label encoding, we could also use one-hot encoding.

In [None]:
# Start again from the raw tables: this replaces the label-encoded X / X_test.
y = train["target"]
X = train.drop(columns=["id", "target"])
X_test = test.drop(columns=["id"])

In [None]:
# Preview the first rows of the feature matrix.
X.head()

In [None]:
# Column names cat0..cat18 and cont0..cont10.
categorical_cols = [f'cat{i}' for i in range(19)]
continous_cols = [f'cont{i}' for i in range(11)]

In [None]:
# One-hot encode train and test together so both end up with the same dummy columns.
cols = [*categorical_cols, *continous_cols]
train_objs_num = train.shape[0]

dataset = pd.concat(objs=[X[cols], X_test[cols]], axis=0)
dataset_preprocessed = pd.get_dummies(dataset, columns=categorical_cols)

# Split positionally: the first train_objs_num rows are train, the rest are test.
train_preprocessed = dataset_preprocessed.iloc[:train_objs_num]
test_preprocessed = dataset_preprocessed.iloc[train_objs_num:]

In [None]:
# Preview the first rows of the one-hot encoded training frame.
train_preprocessed.head()

In [None]:
# One out-of-fold prediction slot per training row for the logistic regression run.
train_oof_lr_0 = np.zeros(len(X))
# Accumulator for the fold-averaged test predictions; starts as the scalar 0.
test_preds_lr_0 = 0
train_oof_lr_0.shape

In [None]:
%%time
NUM_FOLDS = 10
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=137)


for f, (train_ind, val_ind) in tqdm(enumerate(kf.split(train_preprocessed, y))):
        #print(f'Fold {f}')
        train_df, val_df = train_preprocessed.iloc[train_ind], train_preprocessed.iloc[val_ind]
        train_target, val_target = y.iloc[train_ind], y.iloc[val_ind]
        
        
        model = LogisticRegression(max_iter=200)
        

        model =  model.fit(train_df, train_target)
        temp_oof = model.predict_proba(val_df)[:,1]
        temp_test = model.predict_proba(test_preprocessed)[:,1]

        train_oof_lr_0[val_ind] = temp_oof
        test_preds_lr_0 += temp_test/NUM_FOLDS
        
        print(roc_auc_score(val_target, temp_oof))

In [None]:
# Persist the logistic regression OOF and test predictions as .npy files.
np.save('train_oof_lr_0', train_oof_lr_0)
np.save('test_preds_lr_0', test_preds_lr_0)

In [None]:
# AUC of the logistic regression out-of-fold predictions.
print(roc_auc_score(y_true=y, y_score=train_oof_lr_0))

In [None]:
# OOF AUC of an 85/15 weighted blend of train_oof and the logistic regression OOF.
print(roc_auc_score(y_true=y, y_score=0.85 * train_oof + 0.15 * train_oof_lr_0))

In [None]:
# Bare literal; presumably the blend AUC noted down from an earlier run — TODO confirm.
0.8925740557816217

In [None]:
# One out-of-fold prediction slot per training row for the second LightGBM run.
train_oof_lgbm_1 = np.zeros(len(X))
# Accumulator for the fold-averaged test predictions; starts as the scalar 0.
test_preds_lgbm_1 = 0
train_oof_lgbm_1.shape

In [None]:
# Keyword arguments passed to LGBMClassifier via **lgbm_params.
lgbm_params = dict(
    metric='auc',
    reg_alpha=6.010538011450937,
    reg_lambda=0.031702113663443346,
    colsample_bytree=0.27,
    subsample=0.6,
    learning_rate=0.005,
    max_depth=100,
    num_leaves=100,
    min_child_samples=216,
    cat_smooth=87,
    random_state=77,
    n_estimators=200000,
)

In [None]:
%%time
NUM_FOLDS = 10
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=137)


for f, (train_ind, val_ind) in tqdm(enumerate(kf.split(train_preprocessed, y))):
        #print(f'Fold {f}')
        train_df, val_df = train_preprocessed.iloc[train_ind], train_preprocessed.iloc[val_ind]
        train_target, val_target = y.iloc[train_ind], y.iloc[val_ind]
        
        
        model = LGBMClassifier(**lgbm_params) 
        

        model =  model.fit(train_df, train_target, eval_set=[(val_df,val_target)],early_stopping_rounds=1100,verbose=False)
        temp_oof = model.predict_proba(val_df)[:,1]
        temp_test = model.predict_proba(test_preprocessed)[:,1]

        train_oof_lgbm_1[val_ind] = temp_oof
        test_preds_lgbm_1 += temp_test/NUM_FOLDS
        
        print(roc_auc_score(val_target, temp_oof))
        
print(roc_auc_score(y, train_oof_lgbm_1))


In [None]:
%%time
train_oof_lgbm_2 = np.zeros((X.shape[0],))
test_preds_lgbm_2 = 0
train_oof_lgbm_2.shape

lgbm_parameters = {
    'cat_feature': categorical_cols,
    'metric': 'auc', 
    'n_estimators': 20000,
    'reg_alpha': 0.000721024661208569,
    'reg_lambda': 47.79748127808107,
    'colsample_bytree': 0.24493010466517195,
    'subsample': 0.12246675404710294,
    'learning_rate': 0.013933182980403087,
    'max_depth': 21,
    'num_leaves': 90,
    'min_child_samples': 144,
    'cat_smooth': 63
}

NUM_FOLDS = 10
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=137)


for f, (train_ind, val_ind) in tqdm(enumerate(kf.split(train_preprocessed, y))):
        #print(f'Fold {f}')
        train_df, val_df = train_preprocessed.iloc[train_ind], train_preprocessed.iloc[val_ind]
        train_target, val_target = y.iloc[train_ind], y.iloc[val_ind]
        
        
        model = LGBMClassifier(**lgbm_params) 
        

        model =  model.fit(train_df, train_target, eval_set=[(val_df,val_target)],early_stopping_rounds=1100,verbose=False)
        temp_oof = model.predict_proba(val_df)[:,1]
        temp_test = model.predict_proba(test_preprocessed)[:,1]

        train_oof_lgbm_2[val_ind] = temp_oof
        test_preds_lgbm_2 += temp_test/NUM_FOLDS
        
        print(roc_auc_score(val_target, temp_oof))
        
print(roc_auc_score(y, train_oof_lgbm_2))

In [None]:
# OOF AUC of a 95/5 weighted blend of train_oof_lgbm_1 and train_oof.
print(roc_auc_score(y_true=y, y_score=0.95 * train_oof_lgbm_1 + 0.05 * train_oof))

In [None]:
# OOF AUC of an equal-weight blend of the first two LightGBM runs.
print(roc_auc_score(y_true=y, y_score=0.5 * train_oof_lgbm_0 + 0.5 * train_oof_lgbm_1))

In [None]:
# OOF AUC of a 25/25/50 weighted blend of the three LightGBM runs.
print(roc_auc_score(
    y_true=y,
    y_score=0.25 * train_oof_lgbm_0 + 0.25 * train_oof_lgbm_1 + 0.5 * train_oof_lgbm_2,
))

In [None]:
# Persist the OOF and test predictions of the second LightGBM run as .npy files.
np.save('train_oof_lgbm_1', train_oof_lgbm_1)
np.save('test_preds_lgbm_1', test_preds_lgbm_1)

In [None]:
# Persist the OOF and test predictions of the third LightGBM run as .npy files.
np.save('train_oof_lgbm_2', train_oof_lgbm_2)
np.save('test_preds_lgbm_2', test_preds_lgbm_2)

In [None]:
# Buffers saved further down as train_oof_hgb_1 / test_preds_hgb_1.
train_oof_hgb_1 = np.zeros(len(X))
# Accumulator for the fold-averaged test predictions; starts as the scalar 0.
test_preds_hgb_1 = 0
train_oof_hgb_1.shape

In [None]:
%%time
NUM_FOLDS = 10
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=137)
max_iter = 350

for f, (train_ind, val_ind) in tqdm(enumerate(kf.split(train_preprocessed, y))):
        #print(f'Fold {f}')
        train_df, val_df = train_preprocessed.iloc[train_ind], train_preprocessed.iloc[val_ind]
        train_target, val_target = y.iloc[train_ind], y.iloc[val_ind]
        
        
        model = HistGradientBoostingClassifier(max_iter=max_iter, validation_fraction=None, learning_rate=0.05, 
                                               max_depth=9, min_samples_leaf=23, max_leaf_nodes=100)
        

        model =  model.fit(train_df, train_target)
        temp_oof = model.predict_proba(val_df)[:,1]
        temp_test = model.predict_proba(test_preprocessed)[:,1]

        train_oof[val_ind] = temp_oof
        test_preds += temp_test/NUM_FOLDS
        
        print(roc_auc_score(val_target, temp_oof))


In [None]:
# Persist the train_oof_hgb_1 / test_preds_hgb_1 buffers as .npy files.
np.save('train_oof_hgb_1', train_oof_hgb_1)
np.save('test_preds_hgb_1', test_preds_hgb_1)

In [None]:
# Disabled submission: the code below sits inside a string literal, so it is
# not executed and no submission.csv is written by this cell.
'''sample_submission['target'] = 0.85*test_preds+0.15*test_preds_lr_0
sample_submission.to_csv('submission.csv', index=False)'''

In [None]:
# Submission built from test_preds; overwrites the template's target column in place.
sample_submission['target'] = test_preds
sample_submission.to_csv('submission_hgb_0.csv', index=False)

In [None]:
# Submission built from test_preds_hgb_1.
sample_submission['target'] = test_preds_hgb_1
sample_submission.to_csv('submission_hgb_1.csv', index=False)

In [None]:
# Submission built from the logistic regression test predictions.
sample_submission['target'] = test_preds_lr_0
sample_submission.to_csv('submission_lr_0.csv', index=False)

In [None]:
# Submission built from the first LightGBM run.
sample_submission['target'] = test_preds_lgbm_0
sample_submission.to_csv('submission_lgbm_0.csv', index=False)

In [None]:
# Submission built from the second LightGBM run.
sample_submission['target'] = test_preds_lgbm_1
sample_submission.to_csv('submission_lgbm_1.csv', index=False)

In [None]:
# Blend with a negative weight on test_preds: 1.1 * lgbm_1 - 0.1 * test_preds.
# The result can leave [0, 1]; that only matters if the scoring metric is not
# rank-based (this notebook evaluates with AUC).
sample_submission['target'] = 1.1*test_preds_lgbm_1-0.1*test_preds
# Written under its own name: this cell used to write 'submission_blend_0.csv',
# which the following cell writes as well, so this blend was overwritten and lost.
sample_submission.to_csv('submission_blend_lgbm_1_hgb.csv', index=False)

In [None]:
# Equal-weight blend of the first two LightGBM runs.
sample_submission['target'] = 0.5*test_preds_lgbm_0+0.5*test_preds_lgbm_1
sample_submission.to_csv('submission_blend_0.csv', index=False)

In [None]:
# 55/45 weighted blend of the first two LightGBM runs.
sample_submission['target'] = 0.55*test_preds_lgbm_0+0.45*test_preds_lgbm_1
sample_submission.to_csv('submission_blend_1.csv', index=False)

In [None]:
# 55/45 LightGBM blend scaled by 1.05, minus 0.05 * test_preds (negative weight,
# so values are not guaranteed to stay inside [0, 1]).
sample_submission['target'] = 1.05*(0.55*test_preds_lgbm_0+0.45*test_preds_lgbm_1)-0.05*test_preds
sample_submission.to_csv('submission_blend_2.csv', index=False)

In [None]:
# 50/50 LightGBM blend scaled by 1.05, minus 0.05 * test_preds (negative weight,
# so values are not guaranteed to stay inside [0, 1]).
sample_submission['target'] = 1.05*(0.5*test_preds_lgbm_0+0.5*test_preds_lgbm_1)-0.05*test_preds
sample_submission.to_csv('submission_blend_3.csv', index=False)

In [None]:
# 25/25/50 weighted blend of the three LightGBM runs.
sample_submission['target'] = 0.25*test_preds_lgbm_0+0.25*test_preds_lgbm_1+0.5*test_preds_lgbm_2
sample_submission.to_csv('submission_blend_4.csv', index=False)

In [None]:
# 25/25/50 LightGBM blend scaled by 1.05, minus 0.05 * test_preds (negative weight,
# so values are not guaranteed to stay inside [0, 1]).
sample_submission['target'] = 1.05*(0.25*test_preds_lgbm_0+0.25*test_preds_lgbm_1+0.5*test_preds_lgbm_2)-0.05*test_preds
sample_submission.to_csv('submission_blend_5.csv', index=False)