In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below the input directory.
for root, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, roc_auc_score, roc_curve
import matplotlib.pyplot as plt
import seaborn as sns
import random
import os
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, pyll
import time
import colorama
import pickle
from collections import Counter
from sklearn.model_selection import KFold, StratifiedKFold
from catboost import CatBoostClassifier, Pool
%matplotlib inline

In [None]:
import warnings
# Install a blanket "ignore" filter: from here on every warning routed through
# the `warnings` module is hidden, including deprecation notices and any
# library messages about unrecognised arguments.
warnings.filterwarnings('ignore')

In [None]:
# Single seed shared by every stochastic step in the notebook.
SEED = 2021

# Export the hash seed and seed both the stdlib and NumPy global generators.
os.environ['PYTHONHASHSEED'] = str(SEED)
for seed_fn in (random.seed, np.random.seed):
    seed_fn(SEED)

In [None]:
# Read the three competition files in order: train, test, sample submission.
train_df, test_df, sub_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/tabular-playground-series-mar-2021/train.csv",
        "/kaggle/input/tabular-playground-series-mar-2021/test.csv",
        "/kaggle/input/tabular-playground-series-mar-2021/sample_submission.csv",
    )
)

In [None]:
# Preview the first rows of the training frame.
train_df.head()

In [None]:
# Preview the first rows of the test frame.
test_df.head()

In [None]:
# (rows, columns) of the raw train and test frames.
train_df.shape, test_df.shape

In [None]:
# Column dtypes, non-null counts and memory footprint of the training frame.
train_df.info()

In [None]:
# Fraction of training rows carrying each distinct target value.
train_df.target.value_counts() / len(train_df)

In [None]:
# Column dtypes, non-null counts and memory footprint of the test frame.
test_df.info()

In [None]:
# Print each object-typed training column next to its number of distinct values.
for name, n_distinct in train_df.select_dtypes('object').nunique().items():
    print(name, n_distinct)

In [None]:
# Fraction of missing values in each training column.
train_df.isnull().sum() / len(train_df)

In [None]:
# Label column, and the columns dropped from the feature frames.
target = "target"
cols_to_remove = ["id"]

In [None]:
# Label is kept as a one-column DataFrame (not a Series).
y = train_df.loc[:, [target]]

# Feature frames: drop the non-feature columns (and the label for train).
_X = train_df.drop(columns=cols_to_remove + [target])
_XTEST = test_df.drop(columns=cols_to_remove)

In [None]:
# Shapes of the train/test feature frames after the drops above.
_X.shape, _XTEST.shape

In [None]:
# Stack train features on top of test features with a fresh 0..n-1 index,
# so both can be encoded together.
X_all = pd.concat([_X, _XTEST], ignore_index=True)
X_all.shape

In [None]:
# Remember which columns are object-typed, then replace each one in place
# with integer codes fitted on the stacked train+test values.
cat_columns = list(X_all.select_dtypes('object').columns)
for col in cat_columns:
    print(col)
    le = LabelEncoder().fit(X_all[col])
    X_all[col] = le.transform(X_all[col])

In [None]:
# Shape check after label encoding (columns are replaced, none added).
X_all.shape

In [None]:
# The first len(y) rows of the stacked frame are train, the rest are test.
n_train = len(y)
X = X_all.iloc[:n_train]
XTEST = X_all.iloc[n_train:]
X.shape, XTEST.shape

In [None]:
# Column names of the training feature matrix.
X.columns

In [None]:
# Number of boosting iterations; used as `n_estimators` in both parameter dicts.
NUM_OF_BOOST_ROUND = 1_000
# Early-stopping patience handed to the CatBoost model.
EARLY_STOPPING = 300

In [None]:
# Positional indices (within X.columns) of the label-encoded categorical columns.
cat_features_index = [
    position
    for position, column in enumerate(X.columns)
    if column in cat_columns
]
cat_features_index

In [None]:
# 80/20 hold-out split, stratified on the label and seeded for repeatability.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=SEED,
)

LightGBM Classifier

In [None]:
# LightGBM hyper-parameters, spelled with LightGBM's own parameter names.
# The CatBoost-style keys `cat_features` and `eval_metric` are neither LightGBM
# parameters nor documented aliases, so LightGBM does not act on them: the
# label-encoded columns would be treated as ordinary numerics and the
# validation metric would fall back to the objective's default instead of AUC.
params = {
    # `cat_feature` is a documented alias of `categorical_feature`.
    'cat_feature': cat_features_index,
    # Metric evaluated on the eval set (and therefore used for early stopping).
    'metric': 'auc',
    # Constructor argument of the scikit-learn wrapper for the random seed.
    'random_state': SEED,
    'n_estimators': NUM_OF_BOOST_ROUND,
}

In [None]:
# LightGBM classifier built from `params`, using all available cores.
clf = lgb.LGBMClassifier(n_jobs=-1, **params)

In [None]:
# Fit on the training split while scoring the validation split each round;
# training stops after 100 rounds without improvement on the validation set.
# NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords are
# presumably tied to the LightGBM 3.x API — confirm before upgrading LightGBM.
clf.fit(
    X_train,
    y_train,
    eval_set=(X_valid, y_valid),
    early_stopping_rounds=100,
    verbose=-1,
)

In [None]:
# Validation scores from LightGBM: second column of predict_proba (classes_[1]).
ypred_lgb = clf.predict_proba(X_valid)[:,1]

In [None]:
# ROC AUC of the LightGBM scores on the hold-out split.
roc_auc_score(y_valid, ypred_lgb)

In [None]:
# LightGBM scores for the test features: second column of predict_proba.
ytest = clf.predict_proba(XTEST)[:,1]

In [None]:
# Inspect the LightGBM test scores.
ytest

In [None]:
sub_df.Response = ytest

In [None]:
sns.distplot(sub_df.Response)

CatBoost Classifier

In [None]:
# CatBoost hyper-parameters. This rebinds `params`, replacing the dict used
# for the LightGBM model above.
params = dict(
    cat_features=cat_features_index,
    eval_metric='AUC',
    random_seed=SEED,
    n_estimators=NUM_OF_BOOST_ROUND,
)

In [None]:
# Build the CatBoost model with early stopping and all threads, then fit it
# against the validation split with the live training plot enabled.
bst = CatBoostClassifier(
    thread_count=-1,
    early_stopping_rounds=EARLY_STOPPING,
    **params,
)
_ = bst.fit(
    X_train,
    y_train,
    eval_set=(X_valid, y_valid),
    verbose=False,
    plot=True,
)

In [None]:
# Validation scores from CatBoost: second column of predict_proba.
ypred_cat = bst.predict_proba(X_valid)[:,1]

In [None]:
# ROC AUC of the CatBoost scores on the hold-out split.
roc_auc_score(y_valid, ypred_cat)

In [None]:
# One row per feature holding CatBoost's importance value, largest first.
f_importance_df = pd.DataFrame(
    {'importance': bst.get_feature_importance()},
    index=X_valid.columns,
).sort_values(by='importance', ascending=False)

In [None]:
# Horizontal bar chart of the (at most) 500 highest-ranked features.
top_importance = f_importance_df.iloc[:500]
plt.figure(figsize=(10, 5))
sns.barplot(x=top_importance.importance, y=top_importance.index);


In [None]:
# CatBoost scores for the test features: second column of predict_proba.
ypred_test = bst.predict_proba(XTEST)[:,1]

In [None]:
# Distribution of the CatBoost test scores.
sns.distplot(ypred_test)

In [None]:
# Output of `bst.predict` on the test features.
# NOTE(review): despite the `_df` suffix this is whatever `predict` returns
# (presumably an array of predicted classes rather than a DataFrame) — confirm.
pred_test_df = bst.predict(XTEST)

In [None]:
# Inspect the CatBoost `predict` output for the test set.
pred_test_df

In [None]:
# Display the submission frame.
# NOTE(review): no visible cell writes this frame to disk (e.g. `to_csv`) or
# stores the CatBoost scores in it — confirm the submission is produced elsewhere.
sub_df

In [None]:
def plot_roc(y_trues, y_preds, labels, x_max=1.0):
    """Overlay one ROC curve per model on a single set of axes.

    Args:
        y_trues: Indexable collection of ground-truth labels, one entry per curve.
        y_preds: Iterable of predicted scores, one entry per curve.
        labels: Indexable collection of legend names, one entry per curve.
        x_max: Right-hand limit of the false-positive-rate axis.

    Returns:
        None. The curves are drawn on a figure created with ``plt.subplots()``.
    """
    _, axes = plt.subplots()

    for idx, scores in enumerate(y_preds):
        truth = y_trues[idx]
        false_pos, true_pos, _ = roc_curve(truth, scores)
        area = roc_auc_score(truth, scores)
        axes.plot(
            false_pos,
            true_pos,
            label='%s; AUC=%.3f' % (labels[idx], area),
            marker='o',
            markersize=1,
        )

    # The legend is built before the diagonal is drawn, and the diagonal
    # carries no label, so only the model curves are listed.
    axes.legend()
    axes.grid()
    axes.plot(np.linspace(0, 1, 20), np.linspace(0, 1, 20), linestyle='--')

    axes.set_title('ROC curve')
    axes.set_xlabel('False Positive Rate')
    axes.set_ylabel('True Positive Rate')
    axes.set_xlim([-0.01, x_max])

In [None]:
# Compare both models on the same hold-out labels.
plot_roc(
    [y_valid, y_valid],
    [ypred_lgb, ypred_cat],
    ['LGBM', 'CATBOOST'],
)