<div style="color:White; display:fill; border-radius:5px;background-color:#336b87;font-size:270%;font-family:sans-serif;letter-spacing:0.5px;text-align: center">
Tabular Playground Series - Nov 2021
</div>

<a><img src="https://i.ibb.co/PWvpT9F/header.png" alt="header" border="0" width=800 height=400></a>

In [None]:
import pandas as pd, numpy as np, os
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import warnings
# Silence library warnings for the whole notebook.
warnings.filterwarnings('ignore')
# Use the fully-qualified option name: the bare 'max_columns' pattern also
# matches 'styler.render.max_columns' on newer pandas and raises OptionError.
pd.set_option('display.max_columns', None)

import random
import math
from scipy import stats
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing, model_selection, metrics

<div style="color:White; display:fill; border-radius:5px;background-color:#336b87;font-size:170%;font-family:sans-serif;letter-spacing:0.5px;text-align: center">
Preprocessing
</div>

___

In [None]:
%%time

dir = '../input/tabular-playground-series-nov-2021/'
z = '.csv'

train = pd.read_csv('../input/november21/train.csv')
test = pd.read_csv(dir+'test'+z)

sample_submission = pd.read_csv(dir+'sample_submission'+z)

train_indx = train['id']
test_indx = test['id']

y = train['target']

train.drop(['id','target'],axis=1,inplace=True)
test.drop('id',axis=1,inplace=True)


In [None]:
def seed_everything(seed=42):
    """Seed Python's ``random``, NumPy's global RNG and ``PYTHONHASHSEED`` with ``seed``."""
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)

    
# --- Run configuration -------------------------------------------------
# Only SEED is consumed in this cell; check the rest of the notebook before
# relying on (or removing) any of the other constants.
SEED = 42
FOLD = 5
TARGET = 'target'

N_ESTIMATORS = 15000
DEVICE = 'GPU'
LOSS = 'CrossEntropy'
EVAL_METRIC = "AUC"

# Evaluates to 28800; presumably a time budget in seconds (8 hours) -- TODO confirm.
STUDY_TIME = 8 * 60 * 60

# Make the run repeatable before any random work happens.
seed_everything(SEED)

In [None]:
#reduce memory
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to narrower dtypes.

    Integer columns (int16/int32/int64) are cast to the narrowest of
    int8/int16/int32/int64 whose range contains the column's min and max
    (bounds inclusive). float64 columns whose values lie within the float32
    range are cast to float32, which reduces precision. float16/float32
    columns and all non-numeric columns are left untouched, so no column is
    ever widened.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. It is modified in place.
    verbose : bool, default True
        When true, print the final memory footprint and the percentage saved.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for convenient re-assignment.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Candidates are ordered narrowest first; stop at the first that fits.
            # An empty column has NaN min/max, fits nothing, and stays unchanged.
            for candidate in int_candidates:
                bounds = np.iinfo(candidate)
                if bounds.min <= c_min and c_max <= bounds.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type == 'float64':
            # Only float64 can be narrowed; float16/float32 are already small.
            bounds = np.finfo(np.float32)
            if bounds.min <= c_min and c_max <= bounds.max:
                df[col] = df[col].astype(np.float32)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a frame reporting zero bytes does not yield NaN.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [None]:
# Reduce memory by downcasting the column dtypes of both frames (train first, then test)
train, test = (reduce_mem_usage(frame) for frame in (train, test))

In [None]:
# helper functions
def get_auc(y_true, y_hat):
    """Return the area under the ROC curve for scores ``y_hat`` against labels ``y_true``.

    Uses the ``metrics`` module imported at the top of the notebook, so the
    helper no longer depends on ``roc_curve``/``auc`` being imported by a
    later cell before it is first called.
    """
    fpr, tpr, _ = metrics.roc_curve(y_true, y_hat)
    return metrics.auc(fpr, tpr)

In [None]:
# From here on the feature frames are called X / X_test; the old names are
# removed so that only one reference to each frame remains.
X, X_test = train, test

del train
del test
gc.collect()

In [None]:
# Partition the feature columns by the absolute skewness of their values:
# |skew| > 1 goes to skew_cols, everything else (including NaN skew) to cols.
skew_cols, cols = [], []

for col in X.columns:
    is_skewed = abs(X[col].skew(axis=0)) > 1
    (skew_cols if is_skewed else cols).append(col)

# All original feature columns, skewed ones first
main_cols = skew_cols + cols

In [None]:
#adding new columns and checking the score

def _add_row_stats(df, columns, prefix=''):
    """Append row-wise mean/median/std/skew of ``df[columns]`` to ``df`` in place.

    The statistics are taken from a snapshot of ``df[columns]``, so columns
    added by this function never feed into each other. New columns are named
    ``prefix + 'mean'``, ``prefix + 'median'``, ``prefix + 'std'`` and
    ``prefix + 'skew'``, added in that order.
    """
    block = df[columns]
    df[prefix + 'mean'] = block.mean(axis=1)
    df[prefix + 'median'] = block.median(axis=1)
    df[prefix + 'std'] = block.std(axis=1)
    df[prefix + 'skew'] = block.skew(axis=1)


for frame in (X, X_test):
    # statistics over the skewed and the non-skewed feature groups
    _add_row_stats(frame, skew_cols, prefix='skew_')
    _add_row_stats(frame, cols, prefix='col_')
    # Statistics over all original feature columns. Previously these were
    # computed over the whole frame, which by then already contained the
    # group statistics above (and 'mean' fed into 'median', and so on).
    _add_row_stats(frame, main_cols)

<div style="color:White; display:fill; border-radius:5px;background-color:#336b87;font-size:170%;font-family:sans-serif;letter-spacing:0.5px;text-align: center">
Autoviz
</div>

___

In [None]:
!pip install autoviz

In [None]:
from autoviz.AutoViz_Class import AutoViz_Class

# Features and target are joined back into a single frame for the report.
train = X.join(y)

# The data is handed over in memory via `dfte`, so the filename is left empty.
filename = ""
sep = ","

AV = AutoViz_Class()
autoviz_options = dict(
    sep=",",
    depVar="target",
    dfte=train,
    header=0,
    verbose=0,
    lowess=False,
    chart_format="svg",
    max_rows_analyzed=150000,
    max_cols_analyzed=30,
)
dft = AV.AutoViz(filename, **autoviz_options)

### Idea From DLastStark

After applying a cube root, each distribution can be split into two bins: the first bin holds all the negative values and the second holds all the positive values. As the distributions below show, the two bins are easily separated at the centre, 0.

In [None]:
# 2x2 grid: raw distribution (left) vs. cube-root distribution (right) for f1 and f2.
plt.rcParams['figure.dpi'] = 600

fig = plt.figure(figsize=(4,2), facecolor='#f6f5f5')
gs = fig.add_gridspec(2,2)
gs.update(wspace=0.1, hspace=0.5)

background_color = "#f6f5f5"
sns.set_palette(['#ffd514','#ff355d'])

ax0 = plt.subplot(gs[0,0])
ax1 = plt.subplot(gs[0,1])
ax2 = plt.subplot(gs[1,0])
ax3 = plt.subplot(gs[1,1])

# Style every panel identically; iterate over the axes objects directly
# instead of looking them up by name through locals().
for ax in (ax0, ax1, ax2, ax3):
    for s in ["right", "top","left"]:
        ax.spines[s].set_visible(False)

    ax.set_facecolor(background_color)
    ax.set_xlabel('',fontsize=4, weight='bold',)
    ax.set_ylabel('',fontsize=4, weight='bold')

    ax.tick_params(labelsize=3, width=0.5, length=1.5)
    ax.grid(which='major', axis='x', zorder=0, color='#EEEEEE', linewidth=0.4)
    ax.grid(which='major', axis='y', zorder=0, color='#EEEEEE', linewidth=0.4)

    ax.axes.yaxis.set_visible(False)

sns.kdeplot(X['f1'],ax=ax0,fill=True,ec='black',alpha=1,zorder=5,linewidth=0.4)
sns.kdeplot(np.cbrt(X['f1']),ax=ax1,fill=True,ec='black',alpha=1,zorder=5,linewidth=0.4,color='#ff355d')
sns.kdeplot(X['f2'],ax=ax2,fill=True,ec='black',alpha=1,zorder=5,linewidth=0.4)
sns.kdeplot(np.cbrt(X['f2']),ax=ax3,fill=True,ec='black',alpha=1,zorder=5,linewidth=0.4,color='#ff355d')


plt.show()


In [None]:
#thanks to DLastStark (copied from)
from tqdm import tqdm

# One 0/1 flag per original feature: 1 where the value is positive.
# The cube root keeps the sign of its argument, so cbrt(x) > 0 exactly when
# x > 0; comparing the column directly gives the same flags (NaN -> 0) without
# a Python-level call per element. int64 matches the dtype produced before.
for col in tqdm(main_cols):
    X[col+'_bin'] = (X[col] > 0).astype('int64')
    X_test[col+'_bin'] = (X_test[col] > 0).astype('int64')

In [None]:
# NOTE: despite its name, `std` is a min-max scaler mapping each feature to [0, 1].
std = preprocessing.MinMaxScaler(feature_range=(0,1))

# Fit on train only, then apply the same scaling to test.
# The scaled arrays are wrapped with the ORIGINAL index: column assignment
# aligns on index, so a default RangeIndex would silently produce NaNs if
# X / X_test ever carried a non-default index.
X[main_cols] = pd.DataFrame(std.fit_transform(X[main_cols]), columns=main_cols, index=X.index)
X_test[main_cols] = pd.DataFrame(std.transform(X_test[main_cols]), columns=main_cols, index=X_test.index)

In [None]:
# Count, per row, how many features are positive by summing the 0/1 `_bin` flags.
# Select the flags by their suffix: "every column not in main_cols" would also
# pull in the row-statistic columns (mean/median/std/skew...), which are not
# flags and would distort the count.
cols_bin = [col for col in X.columns.to_list() if col.endswith('_bin')]

X['bin_count'] = X[cols_bin].sum(axis=1)
X_test['bin_count'] = X_test[cols_bin].sum(axis=1)

<div style="color:White; display:fill; border-radius:5px;background-color:#336b87;font-size:170%;font-family:sans-serif;letter-spacing:0.5px;text-align: center">
Mutual info regression
</div>

___

In [None]:
%%time
from sklearn.feature_selection import mutual_info_regression
_x = X.iloc[:5000,:].copy()
_y = y.iloc[:5000].copy()
mi_scores = mutual_info_regression(_x, _y)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=_x.columns)
mi_scores = mi_scores.sort_values(ascending=False)

In [None]:
import plotly.figure_factory as ff
import plotly.express as px

# Horizontal bar chart of the `top` highest-scoring features
# (mi_scores is already sorted in descending order).
top = 100
top_values = mi_scores.values[:top]
top_features = mi_scores.index[:top]

fig = px.bar(mi_scores, x=top_values, y=top_features)

layout_options = {
    'title': f"Top {top} Strong Relationships Between Feature Columns and Target Column",
    'xaxis_title': "Relationship with Target",
    'yaxis_title': "Feature Columns",
    'yaxis': {'categoryorder':'total ascending'},
    'colorway': ["blue"],
}
fig.update_layout(**layout_options)
fig.show()

# The 5000-row samples are no longer needed.
del _x, _y
gc.collect()

<div style="color:White; display:fill; border-radius:5px;background-color:#336b87;font-size:170%;font-family:sans-serif;letter-spacing:0.5px;text-align: center">
Logistic Regression
</div>

___

In [None]:
# Train and test must expose the same feature columns, in the same order.
train_columns, test_columns = list(X.columns), list(X_test.columns)
assert train_columns == test_columns and len(train_columns) == len(test_columns)

In [None]:
%%time

# Cross-validated training of every model in `models` on X / y.
# For each fold this collects: the out-of-fold predictions, the fold AUC and
# the predictions on X_test. Results are stored in the three *_tmp dicts,
# keyed by model name, and read by the cells that follow.
#X, y, X_test
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, auc
import lightgbm as lgb
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier

# create list[tuples] of base_models
# Only logistic regression is listed; the other classifiers imported above
# are not used in this cell.
models = [
    ('lr',LogisticRegression(solver='liblinear')),
]

# create dictionaries to store predictions
oof_pred_tmp = dict()
test_pred_tmp = dict()
scores_tmp = dict()

# create cv
# 20 stratified, shuffled folds with a fixed random_state.
kf = StratifiedKFold(n_splits=20, shuffle=True, random_state=1)

for fold, (idx_train, idx_valid) in enumerate(kf.split(X, y)):
    # create train, validation sets
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]
    
    # fit & predict all models on the same fold
    for name, model in models:
        # First time a model name is seen (i.e. during the first fold): create
        # its result lists. 'y_valid' is (re)created here as well, which is
        # harmless because labels are only appended after this inner loop.
        # The insertion order of these keys fixes the column order of the
        # DataFrames built from the dicts later on.
        if name not in scores_tmp:
            oof_pred_tmp[name] = list()
            oof_pred_tmp['y_valid'] = list()
            test_pred_tmp[name] = list()
            scores_tmp[name] = list()
        # NOTE(review): any model not named 'lr' is fitted with eval_set /
        # early_stopping_rounds / verbose; presumably meant for the boosting
        # libraries imported above -- confirm these fit() keywords are accepted
        # by the installed versions before adding such models.
        if name != 'lr':
            model.fit(
                X_train, y_train,
                eval_set=[(X_valid,y_valid)],
                early_stopping_rounds=200,
                verbose=0
            )
        else:
            model.fit(
                X_train, y_train,

            )

        
        # validation prediction
        # Column 1 of predict_proba is used as the score for the AUC.
        pred_valid = model.predict_proba(X_valid)[:,1]
        score = get_auc(y_valid, pred_valid)
        
        # Predictions are appended fold by fold, so the list is in fold order,
        # not in the original row order of X.
        scores_tmp[name].append(score)
        oof_pred_tmp[name].extend(pred_valid)
        
        print(f"Fold: {fold + 1} Model: {name} Score: {score}")
        print('--'*20)
        
        # test prediction
        # One prediction vector per fold; they are combined in a later cell.
        y_hat = model.predict_proba(X_test)[:,1]
        test_pred_tmp[name].append(y_hat)
    
    # store y_validation for later use
    # Appended in the same fold order as the predictions above, so both stay aligned.
    oof_pred_tmp['y_valid'].extend(y_valid)
        
# print overall validation scores
# This is the mean of the per-fold AUCs, not the AUC of the pooled out-of-fold predictions.
for name, model in models:
    print(f"Overall Validation Score | {name}: {np.mean(scores_tmp[name])}")
    print('::'*20)

In [None]:
# One column per base model: its per-fold test predictions stacked side by
# side and averaged across folds.
base_test_predictions = pd.DataFrame({
    model_name: np.column_stack(fold_preds).mean(axis=1)
    for model_name, fold_preds in test_pred_tmp.items()
})

# Checkpoint the per-model predictions before the blend column is added.
base_test_predictions.to_csv('./base_test_predictions.csv', index=False)

# Simple blend: unweighted mean over the model columns.
base_test_predictions['simple_avg'] = base_test_predictions.mean(axis=1)

# Submission file: a copy of the sample submission with the blend as 'target'.
simple_blend_submission = sample_submission.assign(
    target=base_test_predictions['simple_avg']
)
simple_blend_submission.to_csv('./simple_blend_submission.csv', index=False)

In [None]:
# Out-of-fold predictions of every base model plus the matching labels
# ('y_valid'), one column per dict key -- the training set for a meta learner.
oof_predictions = pd.DataFrame(dict(oof_pred_tmp))

# Checkpoint to disk.
oof_predictions.to_csv('./oof_predictions.csv', index=False)

# Validation score of the simple blend: average the model columns row-wise
# and score the result against the stored labels.
y_valid = oof_predictions['y_valid'].copy()
model_columns = oof_predictions.drop('y_valid', axis=1)
y_hat_blend = model_columns.mean(axis=1)
score = get_auc(y_valid, y_hat_blend)

print(f"Overall Validation Score | Simple Blend: {score}")
print('::'*20)