In [None]:
%%time

import os, psutil
import gc

import numpy as np 
import pandas as pd 

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import cross_validate,cross_val_score,train_test_split, KFold, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, log_loss, roc_auc_score,make_scorer, precision_score, recall_score,f1_score, roc_curve,auc
from sklearn import metrics
from sklearn import ensemble,metrics,model_selection,neighbors,preprocessing, svm, tree
from sklearn.preprocessing import MinMaxScaler
import lightgbm as lgb
from lightgbm import LGBMClassifier
from sklearn.ensemble import GradientBoostingClassifier
import optuna
from optuna.integration import LightGBMPruningCallback
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

import scikitplot.metrics as skplot
import datatable as dt

# machine learning tools
import h2o
from h2o.estimators import H2OGeneralizedLinearEstimator, H2ORandomForestEstimator, H2OGradientBoostingEstimator
from h2o.automl import H2OAutoML

# Let pandas render up to 1000 rows and 1000 columns before truncating output.
for _display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(_display_option, 1000)
# from IPython.core.interactiveshell import InteractiveShell
# InteractiveShell.ast_node_interactivity = 'all'

import os
# Print the full path of every file found under the Kaggle input directory.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for path in (os.path.join(folder, name) for name in names):
        print(path)

## Helper Functions

In [None]:
def cpu_usage():
    """Return a short text line with the current process's memory footprint.

    Despite the name, this reports memory, not CPU: it reads the first field
    of psutil's ``memory_info()`` (the resident set size, in bytes) for the
    running Python process and expresses it in units of 2**30 bytes, rounded
    to two decimals.

    Returns:
        str: ``'Memory Usage : <value>'``.
    """
    resident_bytes = psutil.Process(os.getpid()).memory_info().rss
    resident_gib = resident_bytes / 2**30
    return f'Memory Usage : {round(resident_gib,2)}'

In [None]:
# function to reduce data memory size
def reduce_memory_usage(df):
    """Downcast the numeric columns of ``df`` in place to the narrowest fitting dtype.

    Only columns whose dtype is one of int8/16/32/64 or float16/32/64 are
    touched; every other column (bool, object, ...) is left as is.

    * Integer columns become the smallest of int8/int16/int32/int64 whose
      range contains the column's min and max (bounds inclusive, so a column
      reaching exactly 127 still fits int8).
    * Float columns become float16 or float32 when their min and max fit,
      otherwise float64. The check is on range only: float16/float32 store
      fewer significant digits, so values may be rounded by the cast.
    * Columns whose min/max are NaN (e.g. all-NaN floats) fall through to
      float64; integer columns with no usable min/max are left untouched.

    Prints the frame's memory footprint (in MiB) before and after, the
    percentage saved, and the process memory line from ``cpu_usage()``.

    Args:
        df: pandas DataFrame to shrink. It is modified in place.

    Returns:
        None.
    """
    start_mem = df.memory_usage().sum()/1024**2
    numerics = ['int8', 'int16', 'int32','int64', 'float16','float32','float64']
    # Candidate dtypes, narrowest first.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a value equal to the dtype's limit is representable.
            target = next(
                (t for t in int_candidates
                 if np.iinfo(t).min <= c_min and c_max <= np.iinfo(t).max),
                None,
            )
            if target is not None:
                df[col] = df[col].astype(target)
        else:
            # NaN or infinite bounds fail every comparison and end up as float64.
            target = next(
                (t for t in float_candidates
                 if np.finfo(t).min <= c_min and c_max <= np.finfo(t).max),
                np.float64,
            )
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum()/1024**2
    # Guard the percentage against a zero-sized starting footprint.
    reduction_pct = (start_mem - end_mem)/start_mem*100 if start_mem else 0.0
    print(f'Memory reduced from {round(start_mem,2)} -> {round(end_mem,2)}.\nReduction in memory size by {round(reduction_pct,2)}%')
    # cpu_usage() only returns its message, so it has to be printed to be seen.
    print(cpu_usage())

In [None]:
def dtype_graph(df):
    """Print and plot how many boolean and float16 columns ``df`` contains.

    Boolean columns are counted as "categorical" and float16 columns as
    "numerical"/"continuous"; columns of any other dtype are ignored. The
    counts and their share of the two groups combined are printed, then
    shown as a two-bar chart.

    Args:
        df: pandas DataFrame to summarise.

    Returns:
        None.
    """
    cat_cols = df.select_dtypes(include = bool).columns
    cont_cols = df.select_dtypes(include = 'float16').columns
    n_cat, n_cont = len(cat_cols), len(cont_cols)
    total = n_cat + n_cont

    def share(count):
        # Report 0.0% instead of dividing by zero when neither group has columns.
        return round(count/total*100,0) if total else 0.0

    print(f'Categorical columns: {n_cat} | {share(n_cat)}%')
    print(f'Numerical columns: {n_cont} | {share(n_cont)}%')
    plt.bar([1,2],[n_cat, n_cont])
    plt.xticks([1,2],('Categorical', 'Continuous'))
    plt.show()

## Get data

In [None]:
%%time
# Parse the competition's train and test CSVs with datatable's fread, then
# convert each parsed frame to a pandas DataFrame.
train = dt.fread('../input/tabular-playground-series-oct-2021/train.csv').to_pandas()
test = dt.fread('../input/tabular-playground-series-oct-2021/test.csv').to_pandas()

In [None]:
%%time
# Downcast the numeric columns of both frames in place (train first, then
# test); each call prints its own before/after memory report.
for _frame in (train, test):
    reduce_memory_usage(_frame)

In [None]:
# Column groups are read off `test`: boolean columns are treated as categorical,
# float16 columns (the dtype reduce_memory_usage downcasts small-range floats to)
# as continuous.
cat_cols, cont_cols = (
    test.select_dtypes(include = dtype_filter).columns
    for dtype_filter in (bool, 'float16')
)

In [None]:
# Model inputs: every training column except the row id and the label.
features = train.columns.tolist()
for _excluded in ('id', 'target'):
    features.remove(_excluded)

# Cast every boolean column of train to int (test is not modified here).
_bool_cols = train.select_dtypes(include = bool).columns
train[_bool_cols] = train[_bool_cols].astype(int)

In [None]:
# Store the label column as integers.
train['target'] = train['target'].astype(int)

In [None]:
# train_sub = train.sample(frac = .20, random_state = 42) # getting a sample of training data to run the models faster
# X_train, X_test, y_train, y_test = train_test_split(train_sub[features],train_sub['target'],
#                                                     train_size = 0.8, test_size = 0.2, 
#                                                     random_state = 42,stratify = train_sub['target'])

In [None]:
# # logreg = LogisticRegression()
# rf = RandomForestClassifier()
# xgb = XGBClassifier(enable_categorical = True)
# lgbm = LGBMClassifier()
# cb = CatBoostClassifier(allow_writing_files = False, logging_level = 'Silent')

# # model_list = [rf, xgb, lgbm, cb]
# model_list = [lgbm,xgb]

# scoring = {'auc_score' : make_scorer(roc_auc_score),
#            'accuracy' : make_scorer(accuracy_score), 
#            'precision' : make_scorer(precision_score),
#            'recall' : make_scorer(recall_score), 
#            'f1_score' : make_scorer(f1_score)}

In [None]:
# def model_fit(model_list, cv = 3):
#     results = pd.DataFrame(index = ['auc_score_train','auc_score_test','fit_time','precision', 'recall', 'f1_score'])
#     for model in model_list:
#         cv_score_list = []
#         model.fit(X_train,y_train)
#         y_preds = model.predict(X_test)
#         roc_auc_score_test = roc_auc_score(y_test,y_preds)
        
#         cv_score =  cross_validate(model,X_train, y_train, 
#                          cv = StratifiedKFold(n_splits = 3, random_state = 42),
#                          scoring = scoring, verbose = 2)
        
#         cv_score_list.append(cv_score['test_auc_score'].mean())
#         cv_score_list.append(roc_auc_score_test)
#         cv_score_list.append(cv_score['fit_time'].mean())
#         cv_score_list.append(cv_score['test_precision'].mean())
#         cv_score_list.append(cv_score['test_recall'].mean())
#         cv_score_list.append(cv_score['test_f1_score'].mean())
        
#         results[model.__class__.__name__] = cv_score_list
#         print(f'-----------------{model.__class__.__name__} Fitted -----------------')
#     return results

In [None]:
# all_models_results = model_fit(model_list, cv = 3)

In [None]:
# all_models_results

In [None]:

### LGBM parameter tuning using Optuna

In [None]:
# train_sub = train_sub.reset_index(drop = True)

In [None]:
# def objective(trial, X = train_sub[features], y = train_sub['target']):
    
#     param_grid = {
# #     "device_type": trial.suggest_categorical("device_type", ['gpu']),
#     'n_estimators' : trial.suggest_categorical('n_estimators' ,[10000]),
#     'learning_rate' : trial.suggest_float('learning_rate', .01,.3),
#     'num_leaves' : trial.suggest_int('num_leaves', 20, 3000, step = 200),
#     'max_depth' : trial.suggest_int('max_depth', 3, 12),
#     'min_data_in_leaf' : trial.suggest_int('min_data_in_leaf', 200, 10000, step = 10),
#     'reg_alpha' : trial.suggest_int('reg_alpha', 0, 100, step = 5),
#     'reg_lambda' : trial.suggest_int('reg_lambda', 0, 100, step = 5),
#     'min_gain_to_split' : trial.suggest_float('min_gain_to_split', 0, 15),
#     'bagging_fraction': trial.suggest_float('bagging_fraction', 0.2, 0.95, step=0.1),
#     'bagging_freq': trial.suggest_categorical('bagging_freq', [1]),
#     'feature_fraction': trial.suggest_float('feature_fraction', 0.2, 0.95, step=0.1)
#     }
    
#     cv = StratifiedKFold(shuffle= True, random_state= 42)
#     cv_scores = np.empty(5)
    
#     for idx, (train_idx, test_idx) in enumerate(cv.split(X,y)):
#         X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
#         y_train, y_test = y[train_idx], y[test_idx]
    
    
#         model = LGBMClassifier(objective = 'binary', **param_grid, random_state= 42,
#                                device = 'gpu', gpu_platform_id = 0, gpu_device_id = 0, silent = True)

#         model.fit(X_train, y_train, 
#                   eval_set = [(X_test, y_test), (X_train, y_train)], 
#                   eval_metric= 'auc', 
#                   early_stopping_rounds= 100,
#                   callbacks=[LightGBMPruningCallback(trial, 'auc')],verbose = False)

#         y_preds = model.predict(X_test)
#         auc_score = roc_auc_score(y_test, y_preds)
#         cv_scores[idx] = auc_score
    
#     return np.mean(cv_scores)


In [None]:
# study = optuna.create_study(direction = 'maximize', study_name = 'LGBM Classifier')
# func = lambda trial : objective(trial)
# study.optimize(func, n_trials = 10, timeout= 3600*2)

In [None]:
# for param, value in study.best_params.items():
#     print(f'{param} : {value}')

In [None]:
# tuned_lgb_params = {'n_estimators' : 10000,
# 'learning_rate' : 0.15635058762282747,
# 'num_leaves' : 1620,
# 'max_depth' : 6,
# 'min_data_in_leaf' : 3430,
# 'reg_alpha' : 100,
# 'reg_lambda' : 40,
# 'min_gain_to_split' : 2.20631337496675,
# 'bagging_fraction' : 0.8,
# 'bagging_freq' : 1,
# 'feature_fraction' : 0.5}

In [None]:
# features = train.columns.tolist()
# features.remove('id')
# features.remove('target')
# # train[train.select_dtypes(include = bool).columns] = train[train.select_dtypes(include = bool).columns].astype('int16').astype('object')
# X_train, X_test, y_train, y_test = train_test_split(train[features],train['target'],
#                                                     train_size = 0.8, test_size = 0.2, 
#                                                     random_state = 42)


In [None]:
# lgbm_tuned = LGBMClassifier(**tuned_lgb_params,device = 'gpu', gpu_platform_id = 0, gpu_device_id = 0)
# lgbm_tuned.fit(X_train, y_train, 
#                   eval_set = [(X_test, y_test), (X_train, y_train)], 
#                   eval_metric= 'auc', 
#                   early_stopping_rounds= 100,verbose = False)

In [None]:
# y_preds  = lgbm_tuned.predict(X_test)
# y_probas = lgbm_tuned.predict_proba(X_test)

In [None]:
# print(classification_report(y_test,y_preds))
# fig, axes = plt.subplots(2,2,figsize = (20,10));
# skplot.plot_confusion_matrix(y_test,y_preds, ax = axes[0,0], normalize = True);
# skplot.plot_lift_curve(y_test,y_probas, ax = axes[0,1]);
# skplot.plot_precision_recall_curve(y_test,y_probas, ax = axes[1,0]);
# skplot.plot_roc_curve(y_test,y_probas, ax = axes[1,1]);
# fig.tight_layout()


## Trying xgb

In [None]:
from category_encoders import TargetEncoder

In [None]:
# Work on a reproducible 20% sample of the training rows so the models run faster.
train_sub = train.sample(frac = .20, random_state = 42)
# Split the sample into the feature matrix and the label vector.
X, y = train_sub[features], train_sub['target']

In [None]:
# Target encoder restricted to the columns in cat_cols (the boolean columns of test).
# NOTE(review): cat_cols is a pandas Index, not a plain list — confirm the
# installed category_encoders version accepts that for `cols`.
enc = TargetEncoder(cols = cat_cols)

In [None]:
# Fit the encoder on the sampled training features and encode them. Reading
# from train_sub (rather than re-encoding X) keeps this cell idempotent on re-run.
# NOTE(review): the encoder is fitted on all sampled rows before the
# cross-validation split below, so each validation fold's labels contribute to
# its own encoded features — fold scores may be optimistic.
X = enc.fit_transform(train_sub[features], y)

# train's boolean columns were cast to int earlier, while test still holds raw
# booleans. Align the dtypes so the encoder sees the same category values (0/1)
# it was fitted on.
_test_cat_dtypes = {col: int for col in cat_cols if col in features}
X_test = enc.transform(test[features].astype(_test_cat_dtypes))

In [None]:
# Base XGBoost settings shared by every training stage; the learning rate is
# supplied separately by the training loop.
params = dict(
    # tree shape and ensemble size
    max_depth=6,
    n_estimators=9500,
    # row / column subsampling
    subsample=0.7,
    colsample_bytree=0.2,
    colsample_bylevel=0.6000000000000001,
    # regularisation
    min_child_weight=56.41980735551558,
    reg_lambda=75.56651890088857,
    reg_alpha=0.11766857055687065,
    gamma=0.6407823221122686,
    # booster, metric and GPU execution
    booster='gbtree',
    eval_metric='auc',
    tree_method='gpu_hist',
    predictor='gpu_predictor',
    use_label_encoder=False,
)

In [None]:
%%time
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=786)

# Each fold is trained in three consecutive stages, one per learning rate;
# every stage after the first resumes boosting from the previous stage's model.
stage_learning_rates = (0.007, 0.01, 0.05)

preds = []   # one array of test-set class-1 probabilities per fold
scores = []  # validation AUC per fold

for fold, (idx_train, idx_valid) in enumerate(kf.split(X, y)):
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

    model = None
    for learning_rate in stage_learning_rates:
        previous_stage = model  # None on the first stage -> train from scratch
        # Build the stage's settings in a fresh dict so the shared `params`
        # dict is never mutated by this cell.
        model = XGBClassifier(**{**params, 'learning_rate': learning_rate})
        model.fit(X_train, y_train,
                  eval_set=[(X_train, y_train), (X_valid, y_valid)],
                  early_stopping_rounds=200,
                  verbose=False,
                  xgb_model=previous_stage)

    # Score the final stage on the held-out fold (area under the ROC curve).
    pred_valid = model.predict_proba(X_valid)[:,1]
    fpr, tpr, _ = roc_curve(y_valid, pred_valid)
    score = auc(fpr, tpr)
    scores.append(score)

    print(f"Fold: {fold + 1} Score: {score}")
    print('||'*40)

    # Keep this fold's test-set probabilities for the final blend.
    test_preds = model.predict_proba(X_test)[:,1]
    preds.append(test_preds)

print(f"Overall Validation Score: {np.mean(scores)}")

## Submission file

In [None]:
# lgbm_preds = lgbm_tuned.predict(test[features])
# lgbm_submission = pd.read_csv("../input/tabular-playground-series-oct-2021/sample_submission.csv")
# lgbm_submission.target = lgbm_preds
# lgbm_submission.to_csv("lgbm_pbaseline_submission.csv",index=False)
# print('lgbm submission complete')   

In [None]:
# Blend the folds: one column per fold, then average across columns so each
# test row gets the mean of its per-fold predicted probabilities.
fold_matrix = np.column_stack(preds)
predictions = fold_matrix.mean(axis=1)

# Fill the sample-submission template's target column with the blend and save it.
xgb_submission = (
    pd.read_csv("../input/tabular-playground-series-oct-2021/sample_submission.csv")
    .assign(target=predictions)
)
xgb_submission.to_csv('./xgb_submission.csv', index=False)