In [None]:
# credit to this kernel: https://www.kaggle.com/vishwas21/tps-oct-21-eda-modeling/notebook
# I modified from there.

## Imports

In [None]:
import os
import gc
gc.enable()
import time
import random
import warnings
import pdb
from datetime import datetime
import pickle

import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn import svm
from sklearn import tree
from sklearn import impute
from sklearn import metrics
from sklearn import ensemble
from sklearn import linear_model
from sklearn import decomposition
from sklearn import preprocessing
from sklearn import model_selection



In [None]:
# Run date encoded as a YYYYMMDD integer (e.g. 20211011).
# The clock is read once so year, month and day always come from the same instant
# (three separate now() calls could straddle midnight and mix two dates).
today = int(datetime.now().strftime('%Y%m%d'))
today

In [None]:
# Run configuration shared by the cells below.
#   verbose : forwarded to the boosters' fit() calls to control training logs
#   trees   : number of boosting rounds given to every gradient-boosting model
#   nrows   : rows read from each CSV (None reads the whole file)
verbose = True
trees = 10000
nrows = None

# Quick smoke-test values -- uncomment to override the full-run settings above.
# nrows = 1000
# trees = 10

In [None]:
# Silence every warning for cleaner cell output.
# NOTE(review): this also hides deprecation warnings raised by the modeling libraries.
warnings.filterwarnings('ignore')

# The run date (a YYYYMMDD integer) doubles as the random seed.
# NOTE(review): results therefore change from one day to the next -- pin a constant for reproducibility.
SEED = today
np.random.seed(SEED)

# Display every row/column of a frame and print floats in fixed-point notation.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('float_format', '{:f}'.format)

# Plot styling: seaborn dark grid and high-resolution figures rendered inline.
sns.set_style("darkgrid")
mpl.rcParams['figure.dpi'] = 600
%matplotlib inline

## Data

In [None]:
# Read the competition files; `nrows` caps the number of rows (None reads everything).
train_df = pd.read_csv(
    '../input/tabular-playground-series-oct-2021/train.csv',
    nrows=nrows,
)
test_df = pd.read_csv(
    '../input/tabular-playground-series-oct-2021/test.csv',
    nrows=nrows,
)

print('Quick view of training data: ')
train_df.head()

In [None]:
# Label column, and every other training column except the row identifier as a model feature.
TARGET = 'target'
FEATURES = [name for name in train_df.columns if name != 'id' and name != TARGET]
# # print(f'Training data:\n\t Number of rows: {train_df.shape[0]}, Number of columns: {train_df.shape[1]}')
# # print(f'Testing data:\n\t Number of rows: {test_df.shape[0]}, Number of columns: {test_df.shape[1]}')


In [None]:
# print('Basic statistics of training data:')
# train_df[FEATURES+[TARGET]].describe()

In [None]:
# print('Basic statistics of testing data:')
# test_df[FEATURES].describe()

In [None]:
# print(f'Number of missing values in training data: {train_df.isna().sum().sum()}')
# print(f'Number of missing values in testing data: {test_df.isna().sum().sum()}')

## EDA

In [None]:
# This is interesting: it identifies categorical features by counting unique values, treating a
# feature as categorical (which includes booleans) if it has fewer than 25 distinct values.

# df = pd.concat([train_df[FEATURES], test_df[FEATURES]], axis=0)

# cat_features = [col for col in FEATURES if df[col].nunique() < 25]
# cont_features = [col for col in FEATURES if df[col].nunique() >= 25]

# del df
# print(f'Total number of features: {len(FEATURES)}')
# print(f'Number of categorical features: {len(cat_features)}')
# print(f'Number of continuos features: {len(cont_features)}')

# plt.pie([len(cat_features), len(cont_features)], 
#         labels=['Categorical', 'Continuos'],
#         colors=['#76D7C4', '#F5B7B1'],
#         textprops={'fontsize': 13},
#         autopct='%1.1f%%')
# plt.show()

In [None]:
# this takes a long time.  How does this help me?

# print("Feature distribution of continous features: ")
# ncols = 5
# nrows = int(len(cont_features) / ncols + (len(FEATURES) % ncols > 0))

# fig, axes = plt.subplots(nrows, ncols, figsize=(18, 150), facecolor='#EAEAF2')

# for r in range(nrows):
#     for c in range(ncols):
#         col = cont_features[r*ncols+c]
#         sns.kdeplot(x=train_df[col], ax=axes[r, c], color='#58D68D', label='Train data')
#         sns.kdeplot(x=test_df[col], ax=axes[r, c], color='#DE3163', label='Test data')
#         axes[r, c].set_ylabel('')
#         axes[r, c].set_xlabel(col, fontsize=8, fontweight='bold')
#         axes[r, c].tick_params(labelsize=5, width=0.5)
#         axes[r, c].xaxis.offsetText.set_fontsize(4)
#         axes[r, c].yaxis.offsetText.set_fontsize(4)
# plt.show()

In [None]:
# looks like all of these are actually binary.

# print("Feature distribution of categorical features: ")
# ncols = 5
# nrows = int(len(cat_features) / ncols + (len(FEATURES) % ncols > 0))

# fig, axes = plt.subplots(nrows, ncols, figsize=(18, 45), facecolor='#EAEAF2')

# for r in range(nrows):
#     for c in range(ncols):
#         col = cat_features[r*ncols+c]
#         sns.histplot(x=train_df[col], ax=axes[r, c], color='#58D68D', label='Train data')
#         sns.histplot(x=test_df[col], ax=axes[r, c], color='#DE3163', label='Test data')
#         axes[r, c].set_ylabel('')
#         axes[r, c].set_xlabel(col, fontsize=8, fontweight='bold')
#         axes[r, c].tick_params(labelsize=5, width=0.5)
#         axes[r, c].xaxis.offsetText.set_fontsize(4)
#         axes[r, c].yaxis.offsetText.set_fontsize(4)
# plt.show()

In [None]:
# Looks like all the categorical features are binary.

In [None]:
# print("Target Distribution: ")

# target_df = pd.DataFrame(train_df[TARGET].value_counts()).reset_index()
# target_df.columns = [TARGET, 'count']

# fig, ax = plt.subplots(1, 1, figsize=(25, 8), facecolor='#EAEAF2')
# sns.barplot(y=TARGET, x='count', data=target_df, palette=['#58D68D', '#DE3163'], ax=ax, orient='h')
# ax.set_xlabel('Count', fontsize=16)
# ax.set_ylabel('Target', fontsize=16)
# plt.show()

## Feature Engineering

In [None]:
# each row gets a mean, std, min, and a max.  Nice.
# easy and interesting

# Row-wise summary statistics, computed over the original feature columns only.
# Filtering the statistic names out of FEATURES first keeps this cell safe to re-run:
# the statistics are never computed over themselves and are never registered twice.
STAT_FEATURES = ['mean', 'std', 'min', 'max']
base_features = [col for col in FEATURES if col not in STAT_FEATURES]

for frame in (train_df, test_df):
    frame["mean"] = frame[base_features].mean(axis=1)
    frame["std"] = frame[base_features].std(axis=1)
    frame["min"] = frame[base_features].min(axis=1)
    frame["max"] = frame[base_features].max(axis=1)

# Register every statistic exactly once so that 'std' reaches the models and
# no column appears twice in the feature matrix.
FEATURES.extend(col for col in STAT_FEATURES if col not in FEATURES)

## Utils

In [None]:
def format_time(seconds):
    """
    Format a duration in a human readable form.

    Args:
        seconds: elapsed time in seconds (int or float; fractions are truncated)
    Return:
        string such as '05 sec', '01 min 05 sec' or '01 Hr 02 min 05 sec'.
        The hour and minute parts are omitted when they are zero; the seconds
        part is always present. Every number is zero-padded to two digits.
    """
    hours = int(seconds // 3600)
    minutes = int((seconds % 3600) // 60)
    secs = int(seconds % 60)

    parts = []
    if hours > 0:
        parts.append(f"{hours:02d} Hr")
    if minutes > 0:
        parts.append(f"{minutes:02d} min")
    parts.append(f"{secs:02d} sec")

    return ' '.join(parts)

# Representable ranges of the NumPy dtypes used as downcast targets.
_int8_info, _int16_info, _int32_info = (
    np.iinfo(kind) for kind in (np.int8, np.int16, np.int32)
)
_float16_info, _float32_info = (
    np.finfo(kind) for kind in (np.float16, np.float32)
)

INT8_MIN, INT8_MAX = _int8_info.min, _int8_info.max
INT16_MIN, INT16_MAX = _int16_info.min, _int16_info.max
INT32_MIN, INT32_MAX = _int32_info.min, _int32_info.max

FLOAT16_MIN, FLOAT16_MAX = _float16_info.min, _float16_info.max
FLOAT32_MIN, FLOAT32_MAX = _float32_info.min, _float32_info.max


def memory_usage(data, detail=1):
    """
    Print and return the memory footprint of a DataFrame in megabytes.

    Args:
        data: pandas DataFrame to measure
        detail: when truthy, also display the per-column byte counts
    Return:
        total memory usage in MB (bytes / 1024**2)
    """
    per_column = data.memory_usage()
    if detail:
        display(per_column)
    memory = per_column.sum() / (1024 * 1024)
    print("Memory usage : {0:.2f}MB".format(memory))
    return memory


def compress_dataset(data):
    """
    Downcast the numeric columns of a DataFrame in place to save memory, logging each step.

    float64 columns are cast to float16 when their min and max lie strictly inside the
    float16 range, otherwise to float32 when they fit there. int64 columns are cast to
    int8, int16 or int32 according to the range checks below. Columns of any other dtype
    are left untouched.

    NOTE(review): only the value range is checked. float16 keeps roughly three significant
    decimal digits, so the float64 -> float16 cast can lose precision.

    Args:
        data: pandas DataFrame; its columns are replaced in place
    Return:
        the same DataFrame object after downcasting
    """
    memory_before_compress = memory_usage(data, 0)
    print()
    length_interval      = 50
    length_float_decimal = 4

    def print_compress_rate():
        # Report current memory and the cumulative saving relative to the starting size.
        memory_after_compress = memory_usage(data, 0)
        print("Compress Rate: [{0:.2%}]".format((memory_before_compress-memory_after_compress) / memory_before_compress))

    print('='*length_interval)
    for col in data.columns:
        col_dtype = data[col].dtype

        if col_dtype != 'object':
            print("Name: {0:24s} Type: {1}".format(col, col_dtype))
            col_series = data[col]
            col_min = col_series.min()
            col_max = col_series.max()

            if col_dtype == 'float64':
                print(" variable min: {0:15s} max: {1:15s}".format(str(np.round(col_min, length_float_decimal)), str(np.round(col_max, length_float_decimal))))
                if (col_min > FLOAT16_MIN) and (col_max < FLOAT16_MAX):
                    data[col] = data[col].astype(np.float16)
                    print("  float16 min: {0:15s} max: {1:15s}".format(str(FLOAT16_MIN), str(FLOAT16_MAX)))
                    print("compress float64 --> float16")
                elif (col_min > FLOAT32_MIN) and (col_max < FLOAT32_MAX):
                    data[col] = data[col].astype(np.float32)
                    print("  float32 min: {0:15s} max: {1:15s}".format(str(FLOAT32_MIN), str(FLOAT32_MAX)))
                    print("compress float64 --> float32")
                print_compress_rate()
                print('='*length_interval)

            elif col_dtype == 'int64':
                print(" variable min: {0:15s} max: {1:15s}".format(str(col_min), str(col_max)))
                # Name of the dtype the column was actually cast to; None means it stayed int64.
                compressed_to = None
                # int8 is only used for values within half of its range (conservative check
                # kept from the original implementation).
                if (col_min > INT8_MIN/2) and (col_max < INT8_MAX/2):
                    compressed_to = 'int8'
                    data[col] = data[col].astype(np.int8)
                    print("     int8 min: {0:15s} max: {1:15s}".format(str(INT8_MIN), str(INT8_MAX)))
                elif (col_min > INT16_MIN) and (col_max < INT16_MAX):
                    compressed_to = 'int16'
                    data[col] = data[col].astype(np.int16)
                    print("    int16 min: {0:15s} max: {1:15s}".format(str(INT16_MIN), str(INT16_MAX)))
                elif (col_min > INT32_MIN) and (col_max < INT32_MAX):
                    compressed_to = 'int32'
                    data[col] = data[col].astype(np.int32)
                    print("    int32 min: {0:15s} max: {1:15s}".format(str(INT32_MIN), str(INT32_MAX)))
                print_compress_rate()
                # Log the cast that really happened (int32 and "no cast" used to be
                # reported as int8).
                if compressed_to is None:
                    print("no compression (int64 kept)")
                else:
                    print("compress (int64) ==> ({0})".format(compressed_to))
                print('='*length_interval)

    print()
    print_compress_rate()

    return data


## Data preprocessing

In [None]:
%%time
# Downcast the numeric columns of both frames to smaller dtypes (see compress_dataset);
# the frames are modified in place and returned.
train_df = compress_dataset(train_df)
test_df = compress_dataset(test_df)

In [None]:
# Every feature is scaled - even the categoricals.

# Even though everything is 0 to 1 (in this dataset at least), it's scaled anyway to be centered on 0.

# One scaler object is refit for each column: statistics come from the training column
# and are then applied to the matching test column.
scaler = preprocessing.StandardScaler()
for col in FEATURES:
    train_df[col] = scaler.fit_transform(train_df[col].to_numpy().reshape(-1,1))
    test_df[col] = scaler.transform(test_df[col].to_numpy().reshape(-1,1))
    
# Materialize float32 model inputs in FEATURES column order.
X = train_df[FEATURES].to_numpy().astype(np.float32)
Y = train_df[TARGET].to_numpy().astype(np.float32)
X_test = test_df[FEATURES].to_numpy().astype(np.float32)

# The frames are no longer needed; drop them to free memory.
# NOTE(review): this makes the cell non-re-runnable without reloading the data.
del train_df, test_df
gc.collect()
# np.save appends the ".npy" extension to these names.
np.save("X_11Oct2021", X, fix_imports=False)
np.save("Y_11Oct2021", Y, fix_imports=False)
np.save("X_test_11Oct2021", X_test, fix_imports=False)

In [None]:
# Reload the preprocessed train features, train targets and test features from disk.
X, Y, X_test = [
    np.load(path)
    for path in ("X_11Oct2021.npy", "Y_11Oct2021.npy", "X_test_11Oct2021.npy")
]


## Modeling

In [None]:
# No effective early stopping: the number of trees (e.g. 9500 in xgboost's case) must have been
# determined beforehand.

# Even without early stopping, an eval set is held out in each fold so that out-of-fold
# predictions are generated, presumably for ensembling.

# Keyword arguments for XGBClassifier (binary classification, AUC as evaluation metric).
# NOTE(review): the long decimal values look like output of an automated hyper-parameter
# search -- confirm their provenance before changing them.
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'max_depth': 6,
    # Tree budget comes from the shared `trees` setting.
    'n_estimators': trees, #9500,
    'learning_rate': 0.007279718158350149,
    'subsample': 0.7,
    'colsample_bytree': 0.2,
    'colsample_bylevel': 0.6000000000000001,
    'min_child_weight': 56.41980735551558,
    'reg_lambda': 75.56651890088857,
    'reg_alpha': 0.11766857055687065,
    'gamma': 0.6407823221122686,
    # GPU training/prediction on device 0; a GPU runtime is required.
    'tree_method': 'gpu_hist',
    'gpu_id': 0,
    'predictor': 'gpu_predictor',
#     'verbosity': 2,
}

# Keyword arguments for LGBMClassifier (binary classification, AUC metric, GPU device).
# Each setting appears exactly once:
#   * 'num_leaves' is no longer repeated further down the literal;
#   * the number of boosting rounds is set only through 'num_iterations'. 'n_estimators'
#     is an alias of the same LightGBM parameter, so a second, different value for it
#     was contradictory and has been dropped.
lgb_params = {
    'objective' : 'binary',
    'metric' : 'auc',
    'num_leaves' : 7,
    'learning_rate' : 0.08,
    'device' : 'gpu',
    'feature_pre_filter': False,
    'reg_alpha': 9.314037635261775,
    'reg_lambda': 0.10613573572440353,
    'colsample_bytree': 0.4,
    'subsample': 0.8391963650875751,
    'subsample_freq': 5,
    'min_child_samples': 100,
    # Tree budget comes from the shared `trees` setting.
    'num_iterations': trees, #10000,
}

# Keyword arguments for CatBoostClassifier (cross-entropy objective, AUC metric, GPU training).
# This dict is used by both the Level-1 and the Level-2 model lists.
catb_params = {    
    "objective": "CrossEntropy",
    "eval_metric" : "AUC",
    "task_type": "GPU",
    "grow_policy": "SymmetricTree",
    "learning_rate": 0.08,
    # Tree budget comes from the shared `trees` setting.
    "n_estimators":  trees, #10_000,
    "random_strength" : 1.0,
    "max_bin": 128,
    "l2_leaf_reg": 0.002550319996478972,
    "max_depth": 4,
    "min_data_in_leaf": 193,
    # Silent by default; the Level-1 fit call passes its own verbose_eval.
    'verbose': 0
}

### Level - 1

In [None]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Level-1 model zoo. Each entry is [estimator class, constructor kwargs, short name];
# the short name selects the fit/predict branch in the training loop and prefixes
# the prediction column names.
models = [

    [CatBoostClassifier, catb_params, 'catb'],
    [LGBMClassifier, lgb_params, 'lgbm'],
    [XGBClassifier, xgb_params, 'xgb'],

]

In [None]:
# stratified kfolds versus regular one.  Makes sense.

# I think shrinking the size of the data makes the training go faster. At least it appears that this xgb is moving
# through the trees faster than mine.  I should test if it's the compression that does this or perhaps one of the
# parameters

# This runs each model twice with 2 different seeds interestingly.  

# instead of training on the whole dataset, it's doing a prediction on the test 
# data using each of the folds training model * 1/5.  Hmm.  

from collections import defaultdict

# Accumulators keyed by "<model>_<seed>":
#   oof_df         -> out-of-fold predictions (plus the validation targets under TARGET),
#                     appended fold by fold, i.e. in fold order rather than original row order
#   num_boost_dict -> best iteration found in each fold
#   test_df        -> test predictions averaged over the folds
oof_df = defaultdict(list)
num_boost_dict = defaultdict(list)
test_df = defaultdict(lambda: np.zeros(X_test.shape[0]))

# Every model is trained once per seed in every fold.
SEEDS = [today % 10000, today]
N_FOLDS = 5
start = time.time()

skfolds = model_selection.StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)


for fold, (t, v) in enumerate(skfolds.split(X, Y)):
    x_train, x_val = X[t], X[v]
    y_train, y_val = Y[t], Y[v]
    
    oof_df[TARGET].extend(y_val)
    print('-'*38, f"\n{'-'*15} FOLD-{fold} {'-'*15}")
    print('-'*38)

    for i in range(len(SEEDS)):
        for class_name, class_params, name in models:
            tic = time.time()
            # Work on a per-fit copy: writing the seed into class_params would mutate the
            # shared *_params dicts, and catb_params is reused by the Level-2 model list.
            fit_params = dict(class_params)
            if name in ['qda', 'gnb']:
                # Unseeded models would give identical results for every seed; run them once.
                if i > 0:
                    continue
            else:
                fit_params['random_state'] = SEEDS[i]

            clf = class_name(**fit_params)
            if name in ['xgb', 'lgbm']:
                # early_stopping_rounds equals the tree budget, so training never stops early;
                # the eval set is only used to track the best iteration.
                clf = clf.fit(x_train, y_train, eval_set=[(x_val, y_val)], 
                              eval_metric='auc', verbose=verbose, early_stopping_rounds=trees)
            elif name in ['catb']:
                clf = clf.fit(x_train, y_train, eval_set=[(x_val, y_val)], 
                              verbose_eval=verbose) # early_stopping_rounds=10000)
            else:
                # Explicit error instead of `assert False`, which is stripped under `python -O`.
                raise ValueError(f"Unsupported level-1 model: {name!r}")

            # Number of boosting rounds to predict with, per library.
            if name == 'xgb':
                num_boost_rounds = (clf.best_iteration if clf.best_iteration != 0 else 
                                    class_params['n_estimators']-1)
            elif name == 'lgbm':
                num_boost_rounds = (clf.best_iteration_ if clf.best_iteration_ is not None else 
                                    class_params['num_iterations'])
            elif name in ['catb']:
                num_boost_rounds = clf.best_iteration_

            key = f'{name}_{SEEDS[i]}'
            num_boost_dict[key].append(num_boost_rounds)

            # Positive-class probabilities for the validation fold, and this fold's
            # 1/N_FOLDS share of the averaged test prediction.
            if name == 'catb':
                preds = clf.predict_proba(x_val, ntree_end=num_boost_rounds+1)[:, 1].tolist()
                test_df[key] += (clf.predict_proba(
                    X_test, ntree_end=num_boost_rounds+1)[:, 1] / N_FOLDS)
            elif name == 'xgb':
                preds = clf.predict_proba(x_val, ntree_limit=num_boost_rounds+1)[:, 1].tolist()
                test_df[key] += (clf.predict_proba(
                    X_test, ntree_limit=num_boost_rounds+1)[:, 1] / N_FOLDS)
            else:
                preds = clf.predict_proba(x_val)[:, 1].tolist() # lgbm uses best iteration automatically.
                test_df[key] += (clf.predict_proba(X_test)[:, 1] / N_FOLDS)
            oof_df[key].extend(preds)

            score = metrics.roc_auc_score(y_val, preds)
            print(f"MODEL: {name}\tSEED: {SEEDS[i]}\tSCORE: {score}\tTIME: {format_time(time.time()-tic)}")
        
            del clf
            gc.collect()
    del x_train, x_val, y_train, y_val
    gc.collect()

# Overall out-of-fold AUC per model/seed across all folds.
print('='*38)
for k, v in oof_df.items():
    if k != TARGET:
        score = metrics.roc_auc_score(oof_df[TARGET], v)
        print(f'Overall ROC AUC of {k}: {score}')
        
oof_df = pd.DataFrame(oof_df)
test_df = pd.DataFrame(test_df)

print()
print(f'TOTAL TIME: {format_time(time.time() - start)}')

In [None]:
# Preview the Level-1 out-of-fold prediction frame (target plus one column per model/seed).
oof_df.head()

In [None]:
# Persist the Level-1 outputs: out-of-fold predictions, best-iteration counts and
# fold-averaged test predictions.
oof_df.to_csv('oof_df.csv', index=False)
# A context manager guarantees the pickle file is flushed and closed.
with open("num_boost_dict_10Oct2021.pkl", "wb") as num_boost_file:
    pickle.dump(dict(num_boost_dict), num_boost_file)
test_df.to_csv('test_df.csv', index=False)

In [None]:
# Reload the saved Level-1 outputs -- presumably so the stacking levels can be
# re-run from disk without retraining Level 1.
oof_df = pd.read_csv('oof_df.csv')
test_df = pd.read_csv('test_df.csv')

### Level - 2

In [None]:
# looks like it's building lots of ensemblers.  Hmm.

# Level-2 inputs: every Level-1 prediction column is a feature, the stored target is the label.
FEATURES = [name for name in oof_df.columns if name != TARGET]
X, Y = oof_df[FEATURES].to_numpy(), oof_df[TARGET].to_numpy()
X_test = test_df[FEATURES].to_numpy()

In [None]:
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression,LinearRegression
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Level-2 meta-models trained on the Level-1 predictions.
# Each entry is [estimator class, constructor kwargs, short name]; 'reg' (LinearRegression)
# is the only one scored with predict() instead of predict_proba() in the loop below.
models = [
    [CatBoostClassifier, catb_params, 'catb'],
    [QuadraticDiscriminantAnalysis, {}, 'qda'],
    [GaussianNB, {}, 'gnb'],
    [LogisticRegression, {}, 'log'],
    [LinearRegression, {}, 'reg']
]

In [None]:
from collections import defaultdict

# Accumulators keyed by model short name: out-of-fold predictions (plus the validation
# targets under TARGET) and fold-averaged test predictions.
oof_df = defaultdict(list)
test_df = defaultdict(lambda: np.zeros(X_test.shape[0]))

N_FOLDS = 5
start = time.time()

# Unshuffled stratified folds over the Level-1 out-of-fold rows.
skfolds = model_selection.StratifiedKFold(n_splits=N_FOLDS, shuffle=False) #, random_state=SEED)

for fold, (train_idx, val_idx) in enumerate(skfolds.split(X, Y)):
    x_train, y_train = X[train_idx], Y[train_idx]
    x_val, y_val = X[val_idx], Y[val_idx]

    oof_df[TARGET].extend(y_val)
    print('-'*38, f"\n{'-'*15} FOLD-{fold} {'-'*15}")
    print('-'*38)

    for class_name, class_params, name in models:
        tic = time.time()

        clf = class_name(**class_params)
        clf = clf.fit(x_train, y_train)

        if name == 'reg':
            # The regressor has no predict_proba; its raw output is used as the score.
            preds = clf.predict(x_val).tolist()
            test_df[name] += clf.predict(X_test) / N_FOLDS
        else:
            # Positive-class probability for classifiers.
            preds = clf.predict_proba(x_val)[:, 1].tolist()
            test_df[name] += clf.predict_proba(X_test)[:, 1] / N_FOLDS

        oof_df[name].extend(preds)
        score = metrics.roc_auc_score(y_val, preds)
        print(f"MODEL: {name}\tSCORE: {score}\tTIME: {format_time(time.time()-tic)}")

        del clf
        gc.collect()

    del x_train, x_val, y_train, y_val
    gc.collect()

# Overall out-of-fold AUC of every meta-model.
print('='*38)
for column, column_preds in oof_df.items():
    if column == TARGET:
        continue
    score = metrics.roc_auc_score(oof_df[TARGET], column_preds)
    print(f'Overall ROC AUC of {column}: {score}')

oof_df = pd.DataFrame(oof_df)
test_df = pd.DataFrame(test_df)

print()
print(f'TOTAL TIME: {format_time(time.time() - start)}')

In [None]:
# Persist the Level-2 out-of-fold and fold-averaged test predictions.
oof_df.to_csv('oof_df_l2.csv', index=False)
test_df.to_csv('test_df_l2.csv', index=False)

### Level - 3

In [None]:
# Add the Level-1 prediction columns (read back from disk) to the Level-2 frames that are
# still in memory, so Level 3 sees the predictions of both levels.
oof_l1 = pd.read_csv('oof_df.csv')
test_l1 = pd.read_csv('test_df.csv')

# Columns are matched row by row on the index.
# NOTE(review): this assumes the Level-2 out-of-fold rows are in the same order as the
# Level-1 out-of-fold rows -- confirm that the unshuffled Level-2 folds preserve that order.
for col in oof_l1:
    if col != TARGET:
        oof_df[col] = oof_l1[col]
        test_df[col] = test_l1[col]
        
oof_df.head()

In [None]:
# Level-3 inputs: every prediction column in oof_df is a feature, the stored target is the label.
FEATURES = [name for name in oof_df.columns if name != TARGET]
X, Y = oof_df[FEATURES].to_numpy(), oof_df[TARGET].to_numpy()
X_test = test_df[FEATURES].to_numpy()

In [None]:
from collections import defaultdict

# Level-3 blender: a LinearRegression fitted on the stacked prediction columns.
# test_preds accumulates the fold-averaged test prediction used for the submission;
# oof_y / oof_preds collect validation targets and predictions for the overall score.
test_preds = np.zeros((test_df.shape[0]))
oof_y = []
oof_preds = []

N_FOLDS = 5
start = time.time()

skfolds = model_selection.StratifiedKFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

for fold, (t, v) in enumerate(skfolds.split(X, Y)):
    x_train, x_val = X[t], X[v]
    y_train, y_val = Y[t], Y[v]
    
    print(f"\n{'-'*15} FOLD-{fold} {'-'*15}")
    tic = time.time()

    clf = LinearRegression()
    clf = clf.fit(x_train, y_train)
    # Raw regression outputs are used as scores; ROC AUC only depends on their ranking.
    preds = clf.predict(x_val).tolist()
    test_preds += (clf.predict(X_test) / N_FOLDS)
    oof_y.extend(y_val.tolist())
    oof_preds.extend(preds)    

    score = metrics.roc_auc_score(y_val, preds)
    
    # Label the fold score 'reg' (the model is a LinearRegression), matching the summary line.
    print(f"MODEL: reg\tSCORE: {score}\tTIME: {format_time(time.time()-tic)}")
    
    del x_train, x_val, y_train, y_val, clf
    gc.collect()

score = metrics.roc_auc_score(oof_y, oof_preds)
print(f'Overall ROC AUC of reg: {score}')

print()
print(f'TOTAL TIME: {format_time(time.time() - start)}')

## Submission

In [None]:
# Fill the sample submission with the Level-3 test predictions. `nrows` is the same
# limit used when loading the test set, which keeps the row counts equal.
submission = pd.read_csv('../input/tabular-playground-series-oct-2021/sample_submission.csv', nrows=nrows)
submission[TARGET] = test_preds

submission.head()

In [None]:
# Write the submission file without the index column.
submission.to_csv('submission.csv', index=False)