In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere below /kaggle/input.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        full_path = os.path.join(folder, file_name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Read the three competition CSV files; the last expression displays `train`.
train, test, sample_submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-may-2022/train.csv',
        '../input/tabular-playground-series-may-2022/test.csv',
        '../input/tabular-playground-series-may-2022/sample_submission.csv',
    )
)
train

In [None]:
train.describe()

In [None]:
test.describe()

In [None]:
# Column names f_00 .. f_30.
features = list(map('f_{:02d}'.format, range(31)))
# Columns handled as categorical: f_07 .. f_18 plus f_29 and f_30.
categorical_features = list(map('f_{:02d}'.format, range(7, 19))) + ['f_29', 'f_30']
categorical_features

In [None]:
# f27_unique = train['f_27'].unique()
# f27_unique.sort()

In [None]:
def count_sequence(df, field):
    """Add one `<LETTER>_count` column per uppercase letter A-Z.

    Each new column holds, for every row, how many times that letter occurs
    in the string column `field`. The frame is modified in place and the
    same object is returned.
    """
    for symbol in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ':
        column_name = symbol + '_count'
        df[column_name] = df[field].str.count(symbol)
    return df

In [None]:
def find_char(df, field):
    """Add one 0/1 `ch_<LETTER>` column per uppercase letter A-Z.

    `ch_X` is 1 when the string in column `field` contains the letter X and
    0 otherwise. The frame is modified in place and the same object is
    returned.

    Uses the vectorised `.str.contains` accessor (literal match, no regex)
    instead of calling a Python lambda once per row and per letter.
    """
    for letter in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ':
        df['ch_' + letter] = df[field].str.contains(letter, regex=False).astype(int)
    return df

# Plotting feature interactions

In [None]:
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
%matplotlib inline

# Default figure size (in inches) and resolution for every plot below.
plt.rcParams['figure.figsize'] = (5, 5)
plt.rcParams['figure.dpi'] = 100

# Two-colour map used when colouring scatter points by `target`.
cmap = ListedColormap(["#ffd700", "#0057b8"])

In [None]:
# Scatter of f_01 against f_03, coloured by target.
fig, ax = plt.subplots()
ax.scatter(train['f_01'], train['f_03'], c=train['target'], cmap=cmap)
plt.show()

In [None]:
# One panel per feature combination that feature_engineering() turns into an
# interaction feature: f_02/f_21, f_05/f_22 and (f_00 + f_01)/f_26.
# The middle panel previously plotted f_03, which did not match i_05_22.
interaction_panels = [
    (train['f_02'], train['f_21'], 'f_02', 'f_21'),
    (train['f_05'], train['f_22'], 'f_05', 'f_22'),
    (train['f_00'] + train['f_01'], train['f_26'], 'f_00 + f_01', 'f_26'),
]

# A wider figure than the 5x5 default so the three panels are not squeezed.
fig, ax = plt.subplots(1, 3, sharex='col', sharey='row', figsize=(15, 5))
for axis, (x_values, y_values, x_label, y_label) in zip(ax, interaction_panels):
    axis.scatter(x_values, y_values, cmap=cmap, c=train['target'])
    axis.set_xlabel(x_label)
    axis.set_ylabel(y_label)
plt.show()

In [None]:
def feature_engineering(df):
    """Append hand-crafted features to `df` in place and return it.

    Added columns:
      * unique_characters - number of distinct characters in `f_27`.
      * i_02_21, i_05_22, i_00_01_26 - three-valued flags (+1 / 0 / -1)
        telling whether a sum of features lies above an upper threshold,
        below a lower threshold, or in between.
    """
    def _three_way(total, upper, lower):
        # +1 above `upper`, -1 below `lower`, 0 otherwise.
        above = (total > upper).astype(int)
        below = (total < lower).astype(int)
        return above - below

    df["unique_characters"] = df["f_27"].apply(lambda text: len(set(text)))
    df['i_02_21'] = _three_way(df["f_21"] + df["f_02"], 5.2, -5.3)
    df['i_05_22'] = _three_way(df["f_22"] + df["f_05"], 5.1, -5.4)
    df['i_00_01_26'] = _three_way(df["f_00"] + df["f_01"] + df["f_26"], 5.0, -5.0)
    return df

In [None]:
from tqdm import tqdm
from sklearn import preprocessing
stdl = preprocessing.StandardScaler()
lbl = preprocessing.LabelEncoder()


def preprocess_data(df, categorical_features, fit_scaler=True):
    """Build the model-ready feature frame from a raw train or test frame.

    Works on a copy of `df`, so the input frame is left untouched.

    Steps:
      1. Encode each of the first 10 characters of `f_27` as an integer
         offset from 'A' (columns ch0 .. ch9).
      2. Add per-letter occurrence counts of `f_27` (count_sequence).
      3. Add the hand-crafted features (feature_engineering).
      4. Standardise the columns in `scaled_features` with the module-level
         StandardScaler `stdl`.
      5. Drop the `id` and `f_27` columns.

    Parameters
    ----------
    df : DataFrame containing `id`, `f_27` and the numeric feature columns.
    categorical_features : currently unused; kept so existing callers keep
        working.
    fit_scaler : when True (default) the scaler is fitted on `df` before
        transforming it. Pass False for data that must be scaled with the
        statistics learned from an earlier call (e.g. the test set), so
        that train and test share the same scaling.

    Returns
    -------
    A new DataFrame with the engineered features.
    """
    scaled_features = ['f_19', 'f_20', 'f_21', 'f_22', 'f_23', 'f_24', 'f_25', 'f_26', 'f_28']
    data = df.copy()
    for i in range(10):
        data[f'ch{i}'] = data.f_27.str.get(i).apply(ord) - ord('A')
    data = count_sequence(data, 'f_27')
    # Alternatives tried and currently disabled: find_char() presence flags,
    # label-encoding f_27, and one-hot encoding of `categorical_features`.
    data = feature_engineering(data)
    # StandardScaler standardises each column independently, so scaling all
    # columns in one call matches a per-column loop while keeping the fitted
    # statistics of every column available for later `transform` calls.
    if fit_scaler:
        data[scaled_features] = stdl.fit_transform(data[scaled_features])
    else:
        data[scaled_features] = stdl.transform(data[scaled_features])
    data = data.drop(['id', 'f_27'], axis=1)
    return data

train_df = preprocess_data(train, categorical_features)
# Reuse the scaling statistics learned on the training data instead of
# re-fitting the scaler on the test set.
test_df = preprocess_data(test, categorical_features, fit_scaler=False)

In [None]:
train_df

In [None]:
test_df

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

# Separate the label column from the model inputs.
y = train_df['target']
X = train_df.drop(columns=['target'])

# Hold out 20% of the rows for validation, using a fixed seed.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = split_parts
print(*(part.shape for part in split_parts))

# Trying out XGBoost model

In [None]:
from xgboost import XGBClassifier
# Hyper-parameters of the XGBoost classifier, collected in one place.
xgb_params = dict(
    max_depth=10,
    learning_rate=0.01,
    n_estimators=8192,
    colsample_bytree=0.80,
    subsample=0.80,
    reg_lambda=1,
    reg_alpha=1,
    gamma=1,
    objective='binary:logistic',
    tree_method='gpu_hist',
    early_stopping_rounds=256,
    eval_metric=['auc'],
    seed=42,
)
xgb_model = XGBClassifier(**xgb_params)
xgb_model

In [None]:
# Training: evaluate on both the training and the validation split,
# logging every 250 rounds.
xgb_eval_sets = [(X_train, y_train), (X_valid, y_valid)]
xgb_model.fit(X_train, y_train, eval_set=xgb_eval_sets, verbose=250)
#xgb_model.save_model('xgb.model')

In [None]:
# Positive-class probabilities on the validation split, scored with ROC AUC.
xgb_valid_proba = xgb_model.predict_proba(X_valid)
preds1 = xgb_valid_proba[:, 1]
roc_auc_score(y_valid, preds1)

# Model Analysis

In [None]:
from matplotlib import pyplot as plt
from xgboost import plot_importance
def plot_features(booster, figsize):
    """Plot the feature importances of `booster` on a new figure.

    A single-axes figure of size `figsize` is created and handed to
    xgboost's `plot_importance`; whatever that call returns is returned.
    """
    _, axes = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    importance_plot = plot_importance(booster=booster, ax=axes)
    return importance_plot
plot_features(xgb_model, (10,14))

In [None]:
#!pip install shap

In [None]:
import shap
# Compute SHAP values of the XGBoost model for the whole validation split.
explainer = shap.Explainer(xgb_model)
shap_values = explainer(X_valid)

# visualize the first prediction's explanation
first_explanation = shap_values[0]
shap.plots.waterfall(first_explanation)

# Using Stratified K-Fold Splits and Training an XGBoost Model

In [None]:
# kfold = StratifiedKFold(n_splits=3, random_state=42, shuffle=True)
# X = train_df.drop(['target'], axis=1)
# y = train_df['target']
# for train_index, valid_index in kfold.split(X, y):
#     print("TRAIN:", train_index, "TEST:", valid_index)
#     X_t, X_v = X.iloc[train_index], X.iloc[valid_index]
#     y_t, y_v = y.iloc[train_index], y.iloc[valid_index]
#     xgb_model.fit(X_train,y_train, eval_set=[(X_t, y_t),(X_v, y_v)], xgb_model='xgb.model', verbose=250)
#     xgb_model.save_model('xgb.model')

In [None]:
# preds = xgb_model.predict_proba(X_valid)[:,1]
# roc_auc_score(y_valid, preds)

# Trying out LGBM model
* Setting categorical features in LGBM model

In [None]:
# Column positions of the categorical features inside X, the matrix the
# models are actually trained on. Looking them up in train_df (which still
# contains 'target') is only correct while 'target' sits after every
# categorical column.
cat_index = [X.columns.get_loc(col_name) for col_name in categorical_features]
cat_index

In [None]:
import lightgbm as lgb
# Hyper-parameters of the LightGBM classifier, collected in one place.
lgbm_params = dict(
    learning_rate=0.15,
    objective='binary',
    num_iterations=8196,
    early_stopping_round=256,
    categorical_feature=cat_index,
    lambda_l1=1.5,
    lambda_l2=1.5,
    random_state=42,
)
lgbm_model = lgb.LGBMClassifier(**lgbm_params)
lgbm_model

In [None]:
# Fit LightGBM, evaluating AUC on the training and validation splits.
# * Progress is logged every 50 iterations through the log_evaluation
#   callback; the `verbose` argument of fit() was deprecated in
#   LightGBM 3.3 and removed in 4.0.
# * The categorical columns are passed to fit(), the documented place for
#   them with DataFrame input; the value given only to the constructor is
#   not reliably applied.
lgbm_model.fit(
    X_train,
    y_train,
    eval_set=[(X_train, y_train), (X_valid, y_valid)],
    eval_metric='auc',
    categorical_feature=cat_index,
    callbacks=[lgb.log_evaluation(period=50)],
)

In [None]:
# Positive-class probabilities on the validation split, scored with ROC AUC.
lgbm_valid_proba = lgbm_model.predict_proba(X_valid)
preds2 = lgbm_valid_proba[:, 1]
roc_auc_score(y_valid, preds2)

In [None]:
# preds = (preds1+preds2)/2
# roc_auc_score(y_valid, preds)

# Trying out RandomizedSearchCV

In [None]:
# from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# params = {
#         'min_child_weight': [1, 5, 10],
#         'gamma': [0.5, 1, 1.5, 2, 5],
#         'subsample': [0.6, 0.8, 1.0],
#         'colsample_bytree': [0.6, 0.8, 1.0],
#         'max_depth': [6, 8, 10]
#         }

In [None]:
# folds = 3
# param_comb = 5
# skf = StratifiedKFold(n_splits=folds, shuffle = True, random_state = 1001)
# xgb = XGBClassifier(learning_rate=0.15, n_estimators=8192, reg_lambda=1.5, reg_alpha=1.5, objective='binary:logistic', tree_method='gpu_hist', eval_metric = ['auc'], seed=42)
# random_search = RandomizedSearchCV(xgb, param_distributions=params, n_iter=param_comb, scoring='roc_auc', n_jobs=4, cv=skf.split(X,y), verbose=3, random_state=1001 )

In [None]:
# random_search.fit(X, y)

# Trying out Neural Network with FastAI

In [None]:
from fastai.tabular.all import *

In [None]:
train_df_nn = train_df.copy()
test_df_nn = test_df.copy()

In [None]:
# Column groups for the neural network: the categorical names as listed,
# and every other column except the label as continuous.
cat_nn = categorical_features
cont_nn = []
for column in train_df_nn.columns:
    if column in categorical_features:
        continue
    cont_nn.append(column)
cont_nn.remove('target')
print(cat_nn, cont_nn)

In [None]:
# for cf in cat_nn:
#     train_df_nn[cf] = train_df_nn[cf].astype('category')
#     test_df_nn[cf] = test_df_nn[cf].astype('category')

In [None]:
train_df_nn['target'] = train_df_nn['target'].astype(np.float32)
train_df_nn

In [None]:
splits = TrainTestSplitter(test_size=0.2, random_state=42)(range_of(train_df_nn))

In [None]:
#procs_nn = [Categorify, FillMissing]
# No preprocessing procs and no categorical columns: every feature
# (cat_nn as well as cont_nn) is handed to fastai as a continuous input.
to_nn = TabularPandas(
    train_df_nn,
    procs=[],
    cat_names=[],
    cont_names=cat_nn + cont_nn,
    y_names=['target'],
    splits=splits,
)

In [None]:
dls = to_nn.dataloaders(256)

In [None]:
dls.train.show_batch()

In [None]:
# Tabular network with three hidden layers and a single output limited to
# the (0, 1) range, trained with binary cross-entropy.
learner = tabular_learner(
    dls,
    layers=[512, 256, 64],
    n_out=1,
    y_range=(0, 1),
    loss_func=F.binary_cross_entropy,
)

In [None]:
learner.lr_find()

In [None]:
learner.fit_one_cycle(5, 1e-2)

In [None]:
preds,targs = learner.get_preds()

In [None]:
roc_auc_score(targs.numpy().reshape(-1), preds.numpy().reshape(-1))

In [None]:
# Build a test dataloader from the preprocessed test frame and keep only
# the predictions returned by get_preds.
dl = learner.dls.test_dl(test_df_nn)
y_nn = learner.get_preds(dl=dl)[0]
y_nn

## Selecting predictions from the models

In [None]:
# Only the XGBoost probabilities are used for the submission; the LightGBM
# and neural-network alternatives are left disabled.
X_test = test_df
xgb_test_proba = xgb_model.predict_proba(X_test)
y_test1 = xgb_test_proba[:, 1]
#y_test2 = lgbm_model.predict_proba(X_test)[:, 1]
#y_test3 = y_nn.numpy().reshape(-1)
y_test = y_test1
y_test

In [None]:
# Copy of the sample submission with its target column replaced by the
# selected test predictions.
submission = sample_submission.assign(target=y_test)
submission

In [None]:
submission.to_csv("tpg_0522_submission.csv", index=False)