## Libraries Imported

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_theme(style="darkgrid")
import plotly.express as px

from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold

from xgboost import XGBClassifier
from catboost import CatBoostClassifier
import lightgbm as lgb

from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings("ignore")

## Loading & Analyzing Data

In [None]:
# Allow up to 200 characters per cell when frames are displayed.
pd.set_option("display.max_colwidth", 200)

# Read both competition files and discard the `id` column straight away.
train = pd.read_csv('../input/tabular-playground-series-may-2022/train.csv').drop(columns='id')
test = pd.read_csv('../input/tabular-playground-series-may-2022/test.csv').drop(columns='id')

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# (rows, columns) of each frame after `id` has been dropped.
print('Shape of Train data:', train.shape)
print('Shape of Test data:', test.shape)

In [None]:
# Report, per frame, whether at least one cell is missing.
print('Null Values in Train data:', train.isna().to_numpy().any())
print('Null Values in Test data:', test.isna().to_numpy().any())

In [None]:
# Count rows that are exact repeats of an earlier row in the same frame.
print('Duplicate Values in Train data:', train.duplicated().sum())
print('Duplicate Values in Test data:', test.duplicated().sum())

In [None]:
# Number of distinct values per column, for train and then for test.
print('No. of Unique elements in Train data:')
print(train.nunique())
print('------------------------------------')
print('No. of Unique elements in Test data:')
print(test.nunique())

In [None]:
# Column dtypes of the training frame (shown as the cell's output).
print('Checking the type of our data:')
train.dtypes

In [None]:
# Share of each target class in the training data.
train['target'].value_counts(normalize=True)

## Data Visualization

In [None]:
# Donut chart (hole=0.2) of the target class shares.
px.pie(train,names='target',title='Target Distribution',hole=0.2)

In [None]:
# Plot a 100-bin histogram for every float64 column of the training frame.
float_cols = train.select_dtypes(include="float64").columns

# Size the grid from the number of columns rather than hard-coding 4x4:
# a fixed grid raises IndexError with more than 16 columns and leaves empty
# panels with fewer. Sixteen columns still produce the same 4x4 layout.
n_grid_cols = 4
n_grid_rows = max(1, -(-len(float_cols) // n_grid_cols))  # ceiling division
fig, ax = plt.subplots(n_grid_rows, n_grid_cols, figsize=(30, 25), sharey=True)
ax = np.atleast_1d(ax).ravel()

for i, col in enumerate(float_cols):
    train[col].plot(ax=ax[i], kind="hist", bins=100, color="r")
    ax[i].set_title(f"{col}")

# Hide panels that are left over when the columns do not fill the grid.
for unused_panel in ax[len(float_cols):]:
    unused_panel.set_visible(False)

fig.suptitle("Histogram of Continuous columns", fontsize=35)
plt.tight_layout()
plt.show()

In [None]:
# One value-count bar chart per int64 feature column; `target` is left out.
int_feature_cols = [c for c in train.select_dtypes(include="int64").columns if c != "target"]

fig, ax = plt.subplots(nrows=7, ncols=2, figsize=(20, 15))
ax = ax.ravel()

for i, col in enumerate(int_feature_cols):
    counts = train[col].value_counts()
    counts.plot(kind="bar", color="r", ax=ax[i])
    ax[i].set_title(f"{col}")

fig.suptitle("Histogram of Categorical Columns", fontsize=23)
plt.tight_layout()
plt.show()

In [None]:
plt.figure(figsize=(30, 2))

# Correlate numeric columns only: `train` still contains `f_27` (presumably
# strings), and DataFrame.corr() raises on non-numeric columns in pandas >= 2.0.
# The `target` row is picked by label so the plot does not depend on `target`
# being the last column.
target_corr = train.select_dtypes(include="number").corr().loc[["target"]]
sns.heatmap(target_corr, cmap="viridis", annot=True)

plt.title('Correlation with Target Feature')
plt.show()

In [None]:
# Give test rows a sentinel target of -1 so they stay identifiable once the
# two frames are stacked together.
test = test.assign(target=-1)
test.head()

In [None]:
# Stack train on top of test so feature transforms are applied to both at once.
# The index is not reset here, so row labels from the two frames repeat.
df = pd.concat([train,test])
df.head()

In [None]:
# Distinct values per column over the combined train + test frame.
df.nunique()

## Feature Engineering

In [None]:
# New feature: how many distinct elements each `f_27` value contains.
df['f_27_engineered'] = df['f_27'].map(lambda value: len(set(value)))
df.head()

## Label Encoding

In [None]:
# Columns that get re-coded to consecutive integer labels.
col_to_encode = [
    'f_07', 'f_08', 'f_09', 'f_10', 'f_11', 'f_12',
    'f_13', 'f_14', 'f_15', 'f_16', 'f_17', 'f_18',
    'f_27_engineered', 'f_29', 'f_30',
]

# A fresh encoder is fitted per column on the combined frame, so train and
# test rows share one mapping.
for col in col_to_encode:
    df.loc[:, col] = LabelEncoder().fit_transform(df[col])

df.head()

In [None]:
# Separate the combined frame again using the -1 sentinel carried by test rows.
is_test_row = df["target"] == -1
train = df[~is_test_row].reset_index(drop=True)
test = df[is_test_row].reset_index(drop=True)

In [None]:
# Preview the training frame after feature engineering and encoding.
train.head()

In [None]:
# Remove the raw `f_27` column and the sentinel target from the test features.
test = test.drop(columns=['f_27', 'target'])

test.head()

In [None]:
# Class shares of the target in the re-extracted training frame.
train['target'].value_counts(normalize=True)

## Train-Test Split

In [None]:
# Feature matrix (without raw `f_27` and the label) and label vector.
X = train.drop(columns=['f_27', 'target'])
y = train['target']

# Hold out 20% of the rows for validation; the seed fixes the shuffle.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Shapes of the training and validation parts of the hold-out split.
print(X_train.shape,y_train.shape)
print(X_valid.shape,y_valid.shape)

## XGBoost

In [None]:
%%time
xgb = XGBClassifier(n_estimators=5000,tree_method='gpu_hist',objective='binary:logistic',eval_metric='auc',random_state=42)
xgb.fit(X_train,y_train)

In [None]:
%%time
# Column 1 of predict_proba is used as the positive-class score for ROC AUC.
xgb_pred = xgb.predict_proba(X_valid)[:, 1]
print('XGBoost Model AUC :', roc_auc_score(y_valid,xgb_pred))

In [None]:
# ROC curve of the XGBoost scores on the validation split.
fpr, tpr, _ = roc_curve(y_valid, xgb_pred)

fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.set_title('ROC Curve for XGB Model')
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()

## LightGBM

In [None]:
%%time
lgbm = lgb.LGBMClassifier(objective= 'binary',
                          metric= "auc",
                          n_estimators = 5000,
                          num_threads= -1,
                          learning_rate= 0.18319492258552644,
                          boosting='gbdt',
                          lambda_l1=0.00028648667113792726,
                          lambda_l2=0.00026863027834978876,
                          num_leaves=229,
                          max_depth= 0,
                          min_child_samples=80,
                          device='gpu',
                          random_state=42
                         )
lgbm.fit(X_train, y_train, eval_set=[(X_valid,y_valid)],callbacks=[lgb.early_stopping(30)],eval_metric="auc")

In [None]:
%%time
# Column 1 of predict_proba is used as the positive-class score for ROC AUC.
lgbm_pred = lgbm.predict_proba(X_valid)[:, 1]
print('LightGBM Model AUC :', roc_auc_score(y_valid,lgbm_pred))

In [None]:
# ROC curve of the LightGBM scores on the validation split.
fpr, tpr, _ = roc_curve(y_valid, lgbm_pred)

fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.set_title('ROC Curve for LGBM Model')
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()

## Cross-Validation

In [None]:
%%time
# initialize kfold column to -1
train['kfold'] = -1

# fetch labels
y = train['target']

# initialize the KFold class from model_selection module
# n_splits = number of folds = 5, don't forget to initialize random_state
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

# fill the new kfold column
for f, (t_, v_) in enumerate(skf.split(X=train, y=y)):
    train.loc[v_, 'kfold'] = f
    
# save the new folds file
train.to_csv("train_folds.csv", index=False)

In [None]:
# Number of rows assigned to each fold.
train['kfold'].value_counts()

In [None]:
# Print the class shares of the target within each of the five folds.
for i in range(5):
    in_fold = train['kfold'] == i
    fold_target = train.loc[in_fold, 'target']
    print(f"Fold: {i}")
    print(fold_target.value_counts(normalize=True))
    print()

In [None]:
%%time
# iterate over each fold
scores = []
test_preds = []

for fold in tqdm(range(5)):

    print("Getting df_train and df_valid")
    df_train = train.query("kfold != @fold").reset_index(drop=True)
    df_valid = train.query("kfold == @fold").reset_index(drop=True)

    lgbm = lgb.LGBMClassifier(objective= 'binary',
                              metric= "auc",
                              n_estimators = 5000,
                              num_threads= -1,
                              learning_rate= 0.18319492258552644,
                              boosting='gbdt',
                              lambda_l1=0.00028648667113792726,
                              lambda_l2=0.00026863027834978876,
                              num_leaves=229,
                              max_depth= 0,
                              min_child_samples=80,
                              device='gpu',
                              random_state=42
                            )

    print("Splitting into X_train and X_valid")
    Xtrain = df_train.drop(['f_27','target', 'kfold'], axis=1)
    Xvalid = df_valid.drop(['f_27','target', 'kfold'], axis=1)

    ytrain = df_train['target']
    yvalid = df_valid['target']

    print("Fitting model")
    lgbm.fit(Xtrain,ytrain,eval_set=[(Xvalid,yvalid)],callbacks=[lgb.early_stopping(30)],eval_metric="auc")

    print("Getting predictions")
    # we need probabilities of class 1 to calculate roc_auc_score
    y_preds = lgbm.predict_proba(Xvalid)[:, 1]

    test_pred = lgbm.predict_proba(test)[:, 1]

    test_preds.append(test_pred)

    auc = roc_auc_score(yvalid, y_preds)
    
    print("*"*50)
    print(f"Fold = {fold}, AUC = {auc}")
    print("*"*50)

    scores.append(auc)

print(f"CV average: {np.mean(scores)}")

In [None]:
# NOTE(review): `yvalid` and `y_preds` hold whatever the CV loop assigned last,
# so this curve shows the final fold only, not all folds combined.
fpr, tpr, _ = roc_curve(yvalid, y_preds)

fig, ax = plt.subplots()
ax.plot(fpr, tpr)
ax.set_title('ROC Curve for Cross Validated LGBM Model')
ax.set_ylabel('True Positive Rate')
ax.set_xlabel('False Positive Rate')
plt.show()

In [None]:
# Average the per-fold test probabilities row by row (one value per test row).
pred = np.mean(test_preds, axis=0)

## Submission File

In [None]:
# Fill the sample submission's `target` column with the averaged predictions
# and write it out, keeping the first CSV column as the index.
df_submit = pd.read_csv(
    '../input/tabular-playground-series-may-2022/sample_submission.csv',
    index_col=0,
)
df_submit = df_submit.assign(target=pred)
df_submit.to_csv('submission.csv', index=True)