#  <center> Workflow </center>

![img](https://i.imgur.com/1IsBI7H.png)

N.B. 
* Run this notebook with the GPU accelerator turned on (LightGBM is configured with `device_type='gpu'`)
* The dataset is already clean and has no missing values, so no imputation is needed
* Scaling did not improve the score, so it is not used in the final model
* LGBM performed best of all the algorithms tried; the low-scoring models were removed because they took too long to run
* As the dataset is imbalanced, stratified splits are used

In [None]:
# Count the time span of the notebook
import datetime
start_time = datetime.datetime.now()

# 1. Import Libraries

In [None]:
from lightgbm import LGBMClassifier
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from optuna.visualization import plot_optimization_history, plot_param_importances
from IPython.display import display
from pandas import set_option
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.callbacks import LearningRateScheduler
from tensorflow.keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import Input,Dense,Dropout
from tensorflow.keras import Model
from  tensorflow.keras.regularizers import l2


import lightgbm as lgb
import numpy as np
import pandas as pd
import optuna
import tensorflow as tf


# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Show all the columns
pd.set_option('display.max_columns', None) 

### Set Constants

In [None]:
# Seed the random number generators in use so that runs are as repeatable as possible
SEED = 42
np.random.seed(SEED)
# Keras draws weight initialisation, dropout masks and shuffling from TensorFlow's
# own generator, which np.random.seed() does not touch
tf.random.set_seed(SEED)

# Training settings for the Keras network
batch_size = 128
epochs = 100

# Number of cross-validation folds; both names are used further down
K = 15
num_folds = K

# Fetch Data

In [None]:
# Consider first column as index
train = pd.read_csv('../input/tabular-playground-series-mar-2021/train.csv', index_col=0)
test = pd.read_csv('../input/tabular-playground-series-mar-2021/test.csv', index_col=0)
train.head()

In [None]:
print("This dataset has {:.1f}".format(100*train.isna().to_numpy().sum()/(train.shape[0]*train.shape[1])) + "% missing values")

In [None]:
# Row and Column size of the training dataset
train.shape

# 2. Feature Engineering

In [None]:
# Show the number of missing values
missing = train.isnull().sum()
print(missing)

# Remove Outliers

### dropna()

In [None]:
def remove_outliers_using_quantiles(qu_dataset, qu_field, qu_fence):
    """Print IQR-based outlier statistics for one column and optionally filter rows.

    The quartiles come from ``describe()`` on ``qu_dataset[qu_field]``. The inner
    fences lie 1.5 * IQR beyond the quartiles and the outer fences 3 * IQR.

    Parameters
    ----------
    qu_dataset : pandas.DataFrame
        Frame to inspect; it is not modified.
    qu_field : str
        Name of the numeric column to analyse.
    qu_fence : str
        ``"inner"`` or ``"outer"`` keeps only rows whose value lies within the
        corresponding fences (bounds included). Any other value disables
        filtering.

    Returns
    -------
    pandas.DataFrame
        The filtered frame, or ``qu_dataset`` itself when no filtering is requested.
    """
    summary = qu_dataset[qu_field].describe()
    first_quartile, third_quartile = summary["25%"], summary["75%"]
    n_values = summary["count"]

    iqr = third_quartile - first_quartile
    print("interquartile range:", iqr)

    upper_inner_fence = third_quartile + 1.5 * iqr
    lower_inner_fence = first_quartile - 1.5 * iqr
    print("upper_inner_fence:", upper_inner_fence)
    print("lower_inner_fence:", lower_inner_fence)

    upper_outer_fence = third_quartile + 3 * iqr
    lower_outer_fence = first_quartile - 3 * iqr
    print("upper_outer_fence:", upper_outer_fence)
    print("lower_outer_fence:", lower_outer_fence)

    column = qu_dataset[qu_field]

    def percent_outside(low, high):
        # Share of rows strictly below `low` or strictly above `high`
        n_above = int((column > high).sum())
        n_below = int((column < low).sum())
        return 100 * (n_below + n_above) / n_values

    print("percentage of records out of inner fences: %.2f"
          % (percent_outside(lower_inner_fence, upper_inner_fence)))
    print("percentage of records out of outer fences: %.2f"
          % (percent_outside(lower_outer_fence, upper_outer_fence)))

    # Pick the bounds to filter on; None means "return the input unchanged"
    if qu_fence == "inner":
        bounds = (lower_inner_fence, upper_inner_fence)
    elif qu_fence == "outer":
        bounds = (lower_outer_fence, upper_outer_fence)
    else:
        bounds = None

    if bounds is None:
        output_dataset = qu_dataset
    else:
        low, high = bounds
        output_dataset = qu_dataset[(column <= high) & (column >= low)]

    print("length of input dataframe:", len(qu_dataset))
    print("length of new dataframe after outlier removal:", len(output_dataset))

    return output_dataset

# Drop rows that contain missing values
train.dropna(inplace=True)
# Print the IQR outlier statistics for the target column. The filtered frame is
# stored in new_dataset; `train` itself is not modified by this call
new_dataset = remove_outliers_using_quantiles(train, "target", "inner")

# The dataset is already clean, so no rows are expected to be removed as outliers

In [None]:
# Show is there any imbalance in the target value, actually there's a huge imbalance,
# so stratification is needed
train['target'].value_counts()

# Denoising AutoEncoder (DAE)

In [None]:
# Column names of the categorical (cat0..cat18) and continuous (cont0..cont10) features
categorical_cols = [f'cat{i}' for i in range(19)]
continous_cols = [f'cont{i}' for i in range(11)]

In [None]:
# All feature columns, categorical ones first
cols=categorical_cols+continous_cols
# Number of training rows, used below to split the combined frame apart again
train_objs_num = len(train)

# Stack the train and test features so that both receive the same one-hot columns
dataset = pd.concat(objs=[train[cols], test[cols]], axis=0)

# One-hot encode the categorical columns only. LGBM doesn't need that conversion,
# but the other models compared in this notebook are fed the same frame
dataset_preprocessed = pd.get_dummies(dataset,columns=categorical_cols)

# Split back by position: the first train_objs_num rows came from train
train_preprocessed = dataset_preprocessed[:train_objs_num]
test_preprocessed = dataset_preprocessed[train_objs_num:]

train_preprocessed.head()

In [None]:
def get_DAE(n_features=11, hidden_units=1500):
    """Build and compile the dense autoencoder used on the continuous features.

    The network maps ``n_features`` inputs through three ReLU hidden layers of
    ``hidden_units`` units each and back to ``n_features`` outputs. It is
    compiled with the Adam optimizer and mean squared error loss.

    Parameters
    ----------
    n_features : int, default 11
        Width of the input and output layers. The default matches the 11
        continuous columns (``cont0``..``cont10``).
    hidden_units : int, default 1500
        Number of units in each of the three hidden layers.

    Returns
    -------
    tensorflow.keras.Model
        The compiled, untrained autoencoder.
    """
    inputs = Input((n_features,))
    x = Dense(hidden_units, activation='relu')(inputs)
    # The middle layer is named so that it can be looked up by name
    x = Dense(hidden_units, activation='relu', name="feature")(x)
    x = Dense(hidden_units, activation='relu')(x)
    # NOTE(review): a ReLU output layer cannot produce negative values; confirm the
    # continuous features are non-negative, otherwise a linear output fits better
    outputs = Dense(n_features, activation='relu')(x)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='mse')

    return model

In [None]:
# Fit the autoencoder on the continuous features of train and test combined.
# No target column is needed: the network is trained to reproduce its own input.
# NOTE(review): a denoising autoencoder normally corrupts the input and keeps the
# clean data as the target; here the same uncorrupted frame is passed as both input
# and target, so no noise is injected -- confirm whether that is intended.
alldata = pd.concat([train[continous_cols],test[continous_cols]],axis=0)
print(alldata.shape)
autoencoder = get_DAE()
autoencoder.fit(alldata[continous_cols], alldata[continous_cols],
                    epochs=20,
                    batch_size=256,
                    shuffle=True
                    )

In [None]:
# Replace the continuous columns of both sets with the autoencoder's reconstruction
# of them; the one-hot encoded columns are left untouched. Copies are used so that
# train_preprocessed / test_preprocessed keep the original values
test_denoised = test_preprocessed.copy()
test_denoised[continous_cols] = autoencoder.predict(test_denoised[continous_cols])
train_denoised = train_preprocessed.copy()
train_denoised[continous_cols] = autoencoder.predict(train_denoised[continous_cols])

In [None]:
train_denoised['target'] = train.target

In [None]:
X = train_denoised.drop(['target'], axis=1)
Y = train_denoised.target
X_TEST = test_denoised

In [None]:
train.head()

In [None]:
X.head()

# Train Test Split

In [None]:
# Hold out 20% of the training rows. The classes are imbalanced, so the split is
# stratified on the target. Y and SEED are reused so that the labels and the seed
# are each defined in one place only
X_train, X_test, y_train, y_test = train_test_split(X, Y,
                                                    test_size=0.20,
                                                    random_state=SEED,
                                                    stratify=Y)

# 3. Models Implementation

### Deep Neural Network

In [None]:
# Exponentially decay the learning rate: 1e-3 at epoch 0, multiplied by 0.95 each epoch
annealer = LearningRateScheduler(lambda x: 1e-3 * 0.95 ** x)
# Early stopping should be based on loss, not on accuracy
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=10)
# The model is compiled with BinaryAccuracy(), which Keras logs as 'binary_accuracy'
# ('val_binary_accuracy' on the validation split). Monitoring the non-existent
# 'val_acc' makes ModelCheckpoint skip every save, so the real metric name is used
mc = ModelCheckpoint('best_model.h5', monitor='val_binary_accuracy', mode='max', verbose=0, save_best_only=True)

In [None]:
# Fully connected binary classifier: three ReLU hidden layers (50, 100, 150 units),
# each followed by dropout, and a single sigmoid output unit
model = keras.Sequential([
    ## input layer
    # the input width is taken from X (continuous columns plus the one-hot encoded categoricals)
    keras.layers.Dense(50, input_shape=(X.shape[1],), activation='relu'),  
    keras.layers.Dropout(0.50),    ## regularisation: drops 50% of the units during training

    ## hidden layers
    keras.layers.Dense(100,activation='relu'),
    keras.layers.Dropout(0.70),    ## regularisation: drops 70% of the units during training
    
    keras.layers.Dense(150,activation='relu'),
    keras.layers.Dropout(0.70),     ## regularisation: drops 70% of the units during training
 
    # sigmoid as this is a binary classification problem
    ## final layer
    keras.layers.Dense(1,activation='sigmoid')
    
])


# Adam optimizer; binary accuracy and AUC are tracked in addition to the loss
model.compile(optimizer='adam',
             loss='binary_crossentropy',  ## since the target is 0 or 1
             metrics=[tf.keras.metrics.BinaryAccuracy(),tf.keras.metrics.AUC()])

# Train on the full training frame; Keras holds out 10% of the rows for validation,
# which is what the callbacks monitor
model.fit(X,Y,epochs=epochs, 
          batch_size=batch_size,
          callbacks=[annealer,es,mc],
          validation_split=0.1,
          shuffle=True,
         )

# Predicted probabilities for the competition test set (despite the name, this is a
# dense network, not a CNN)
y_pred_cnn = model.predict(X_TEST)

# evaluate() returns [loss, binary accuracy, AUC]; index 1 is the binary accuracy.
# NOTE(review): this is measured on the same rows the model was trained on, so it is
# an optimistic score
cnn_score = model.evaluate(X, Y)[1]
cnn_score

# Try different Machine Learning Base Models

In [None]:
# Spot-Check Algorithms
def GetBasedModel():
    """Return the baseline classifiers to compare, as ``(label, estimator)`` pairs.

    Every estimator uses its default settings, except that XGBoost logging is
    silenced with ``verbosity=0``.
    """
    # Previously tried and currently disabled: LR, LDA, KNN, CART, SVM, AB, GBM,
    # RF, ET and CBC
    return [
        ('NB', GaussianNB()),
        ('XGB', XGBClassifier(verbosity = 0)),
        ('LGB', LGBMClassifier()),
    ]

In [None]:
def BasedLine2(X_train, y_train,models):
    """Cross-validate each model with stratified folds and report its ROC AUC.

    Parameters
    ----------
    X_train, y_train
        Features and labels handed to ``cross_val_score``.
    models : iterable of (str, estimator)
        Label and estimator pairs to evaluate.

    Returns
    -------
    tuple(list, list)
        The labels and, in the same order, the per-fold score arrays.
    """
    scoring = 'roc_auc'

    names, results = [], []
    for label, estimator in models:
        # Stratified folds keep the class ratio of the imbalanced target in every split
        splitter = StratifiedKFold(n_splits=num_folds)
        fold_scores = cross_val_score(estimator, X_train, y_train,
                                      cv=splitter, scoring=scoring)
        results.append(fold_scores)
        names.append(label)
        # Printed as "label: mean (standard deviation)" over the folds
        print("%s: %f (%f)" % (label, fold_scores.mean(), fold_scores.std()))

    return names, results

In [None]:
models = GetBasedModel()

# Analyze all the base models
names,results = BasedLine2(X_train, y_train,models)

In [None]:
# Compare all the models according to the baseline score

def ScoreDataFrame(names,results):
    """Summarise cross-validation results as one row per model.

    Parameters
    ----------
    names : list of str
        Model labels.
    results : list
        One array of fold scores per model, in the same order as ``names``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Model`` and ``Score``; ``Score`` is the mean fold score rounded
        to four decimals through fixed-point string formatting.
    """
    mean_scores = [float("{:.4f}".format(fold_scores.mean())) for fold_scores in results]
    return pd.DataFrame({'Model': names, 'Score': mean_scores})

In [None]:
basedLineScore = ScoreDataFrame(names,results)
basedLineScore

# 4. Scaling the Dataframe

In [None]:
# Standard and MinMax scalers are considered
# Let's see which one performs better
def GetScaledModel(nameOfScaler):
    """Return scaler + classifier pipelines as ``(label, pipeline)`` pairs.

    Parameters
    ----------
    nameOfScaler : str
        ``'standard'`` for ``StandardScaler`` or ``'minmax'`` for ``MinMaxScaler``.

    Returns
    -------
    list of (str, Pipeline)
        One pipeline per classifier (NB, XGB, LGB). Each label is the scaler name
        followed by the classifier label, e.g. ``'standardNB'``.

    Raises
    ------
    ValueError
        If ``nameOfScaler`` is not one of the two supported names.
    """
    # Validate first: an unknown name used to surface as an UnboundLocalError
    if nameOfScaler not in ('standard', 'minmax'):
        raise ValueError(
            "Unknown scaler %r; expected 'standard' or 'minmax'" % (nameOfScaler,)
        )

    scaler_cls = StandardScaler if nameOfScaler == 'standard' else MinMaxScaler

    # Previously tried and currently disabled: LR, LDA, KNN, CART, SVM, AB, GBM, RF, ET
    classifiers = (
        ('NB', GaussianNB),
        ('XGB', XGBClassifier),
        ('LGB', LGBMClassifier),
    )

    # Every pipeline gets its own scaler instance, so fitting one pipeline can
    # never alter the scaler held by another
    pipelines = [
        (nameOfScaler + label, Pipeline([('Scaler', scaler_cls()), (label, classifier_cls())]))
        for label, classifier_cls in classifiers
    ]

    return pipelines

### Standard Scaler

In [None]:
models = GetScaledModel('standard')
names,results = BasedLine2(X_train, y_train,models)
scaledScoreStandard = ScoreDataFrame(names,results)
compareModels = pd.concat([basedLineScore,
                           scaledScoreStandard], axis=1)
compareModels

### MinMax Scaler

In [None]:
# Build the MinMax-scaled pipelines first; without this line the `models` list left
# over from the Standard Scaler cell is evaluated a second time and reported as MinMax
models = GetScaledModel('minmax')
names,results = BasedLine2(X_train, y_train,models)
scaledScoreMinMax = ScoreDataFrame(names,results)
# Put the unscaled, standard-scaled and minmax-scaled scores side by side
compareModels = pd.concat([basedLineScore,
                           scaledScoreStandard,
                          scaledScoreMinMax], axis=1)
compareModels

## Functions for K-fold cross-validation, training, evaluation and prediction

In [None]:
# Functions for KFold evaluation
def create(hyperparams):
    """Build an untrained LGBMClassifier from a dict of keyword arguments."""
    return LGBMClassifier(**hyperparams)

def fit(model, X, y):
    """Train ``model`` on ``(X, y)`` without a validation set and return it.

    The model is fitted in place; the same object is handed back.
    """
    model.fit(X, y)
    return model

def fit_with_stop(model, X, y, X_val, y_val, esr):
    """Train ``model`` with early stopping on a validation set and return it.

    Parameters
    ----------
    model : LGBMClassifier
        Untrained estimator; it is fitted in place.
    X, y
        Training features and labels.
    X_val, y_val
        Validation features and labels used to decide when to stop.
    esr : int
        Number of rounds without improvement on the validation set after which
        training stops.

    Returns
    -------
    LGBMClassifier
        The same ``model`` object, now fitted.
    """
    eval_set = [(X_val, y_val)]

    if hasattr(lgb, "log_evaluation"):
        # LightGBM >= 3.3: early stopping and logging are configured via callbacks.
        # The `early_stopping_rounds` / `verbose` fit arguments were removed in 4.0
        model.fit(X, y,
                  eval_set=eval_set,
                  callbacks=[lgb.early_stopping(esr), lgb.log_evaluation(200)])
    else:
        # Older LightGBM releases only understand the legacy keyword arguments
        model.fit(X, y,
                  eval_set=eval_set,
                  early_stopping_rounds=esr,
                  verbose=200)
    return model

def evaluate(model, X, y):
    """Return the ROC AUC of ``model`` on ``(X, y)``.

    The score is computed from the second column of ``predict_proba``.
    """
    second_column_proba = model.predict_proba(X)[:, 1]
    return roc_auc_score(y, second_column_proba)

def kfold_evaluation(X, y, k, hyperparams, esr=100):
    """Score one hyper-parameter set with stratified k-fold cross-validation.

    For every fold a fresh model is created, trained with early stopping against
    the held-out part, and scored (AUC) on both the training and held-out rows.

    Parameters
    ----------
    X, y : pandas objects
        Features and labels; rows are selected positionally with ``.iloc``.
    k : int
        Number of folds.
    hyperparams : dict
        Keyword arguments forwarded to ``create``.
    esr : int, default 100
        Early-stopping patience forwarded to ``fit_with_stop``.

    Returns
    -------
    pandas.DataFrame
        One row per fold with columns ``train score`` and ``validation score``.
    """
    print(f"\n------ {k}-fold evaluation -----")
    print(hyperparams)

    fold_scores = []
    splitter = StratifiedKFold(k)
    for fold, (fit_rows, holdout_rows) in enumerate(splitter.split(X, y)):
        print(f"\n----- FOLD {fold} -----")

        X_fit, y_fit = X.iloc[fit_rows], y.iloc[fit_rows]
        X_holdout, y_holdout = X.iloc[holdout_rows], y.iloc[holdout_rows]

        booster = fit_with_stop(create(hyperparams), X_fit, y_fit,
                                X_holdout, y_holdout, esr)
        fit_auc = evaluate(booster, X_fit, y_fit)
        holdout_auc = evaluate(booster, X_holdout, y_holdout)
        fold_scores.append((fit_auc, holdout_auc))

        # AUC is the competition metric
        print(f"Fold {fold} | Eval AUC: {holdout_auc}")

    return pd.DataFrame(fold_scores, columns=['train score', 'validation score'])


def kfold_prediction(X, y, X_test, k, hyperparams, esr=100):
    """Predict ``X_test`` with the average of ``k`` models trained on stratified folds.

    Each fold trains a fresh model on ``k - 1`` parts of the data, uses the
    remaining part for early stopping only, and contributes ``1 / k`` of its
    predicted probability to the result.

    Parameters
    ----------
    X, y : pandas objects
        Training features and labels; rows are selected positionally with ``.iloc``.
    X_test
        Features to predict.
    k : int
        Number of folds, and therefore of averaged models.
    hyperparams : dict
        Keyword arguments forwarded to ``create``.
    esr : int, default 100
        Early-stopping patience forwarded to ``fit_with_stop``.

    Returns
    -------
    numpy.ndarray
        Averaged probability from the second ``predict_proba`` column, one value
        per row of ``X_test``.
    """
    yp = np.zeros(len(X_test))

    print(f"\n------ {k}-fold prediction -----")
    print(hyperparams)

    # Stratify on the target so that every fold keeps the class ratio of the
    # imbalanced data, in line with kfold_evaluation
    kf = StratifiedKFold(k)
    for i, (train_idx, val_idx) in enumerate(kf.split(X, y)):
        print(f"\n----- FOLD {i} -----")
        X_train = X.iloc[train_idx]
        y_train = y.iloc[train_idx]
        X_val = X.iloc[val_idx]
        y_val = y.iloc[val_idx]

        model = create(hyperparams)
        model = fit_with_stop(model, X_train, y_train, X_val, y_val, esr)
        # Accumulate this fold's share of the bagged prediction
        yp += model.predict_proba(X_test)[:, 1] / k

    return yp

# 5. Hyperparameter tuning (OPTUNA)

In [None]:
# Fixed LightGBM settings used by the hyperparameter search and the final prediction
BEST_PARAMS = {
    'n_estimators': 10000, # Maximum number of boosted trees to fit
    'learning_rate': 0.05, # Boosting learning rate (shrinkage)
    'metric': 'auc', # Metric evaluated on the validation set
    'device_type': 'gpu' # Train on the GPU (run the notebook with the accelerator on)
}

In [None]:
# Objective function
# Scrutinize the best hyperparameter using this function
def objective(trial):
    """Optuna objective: mean validation AUC over K stratified folds.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the tunable hyper-parameters.

    Returns
    -------
    float
        Mean of the per-fold ``validation score`` column from ``kfold_evaluation``.
    """
    # Values sampled by Optuna for this trial
    sampled = {
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-2),
        'max_depth': trial.suggest_int('max_depth', 6, 127),
        'num_leaves': trial.suggest_int('num_leaves', 31, 128),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.9),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        'subsample_freq': trial.suggest_int('subsample_freq', 1, 10),
        'subsample': trial.suggest_float('subsample', 0.3, 0.9),
        'min_data_per_group': trial.suggest_int('min_data_per_group', 50, 200),
        'cat_smooth': trial.suggest_int('cat_smooth', 10, 100),
        'cat_l2': trial.suggest_int('cat_l2', 1, 20),
    }

    # Settings that are the same for every trial
    fixed = {
        'random_state': 2021,
        'metric': 'auc',
        'n_estimators': 20000,
        'n_jobs': -1,
        'bagging_seed': 2021,
        'feature_fraction_seed': 2021,
    }

    # Precedence, lowest to highest: fixed < BEST_PARAMS < sampled. BEST_PARAMS still
    # overrides the fixed settings as before, but it no longer overwrites values the
    # trial has just sampled. Previously its 'learning_rate' replaced the sampled one,
    # so that search dimension had no effect on the score.
    hyperparams = {**fixed, **BEST_PARAMS, **sampled}

    # Evaluation
    scores = kfold_evaluation(X, Y, K, hyperparams, 100)

    return scores['validation score'].mean()

In [None]:
# Optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=5)
# study.optimize(objective, timeout=3600*7)

In [None]:
# Best score
study.best_value

In [None]:
# Historic
plot_optimization_history(study)

In [None]:
#plot_parallel_coordinate: interactively visualizes the hyperparameters and scores
optuna.visualization.plot_parallel_coordinate(study)

In [None]:
plot_param_importances(study)

In [None]:
# plot_slice: shows the evolution of the search. You can see where in the hyperparameter space your search
# went and which parts of the space were explored more.
optuna.visualization.plot_slice(study)

In [None]:
# Best parameters
BEST_PARAMS.update(study.best_params)
BEST_PARAMS

# 6. Inference

In [None]:
# Update hyperparams for prediction
# NOTE(review): this hard-coded learning rate replaces the value copied from
# study.best_params in the previous cell, and it lies outside the 1e-4..1e-2 range
# searched in objective() -- presumably carried over from an earlier study; confirm
# before relying on it
BEST_PARAMS['learning_rate'] = 0.03238848685934311

In [None]:
# Finally predictions on test set and submission using all the hypertuned parameters and relevant dataframe
test['target'] = kfold_prediction(X, Y, X_TEST, K, BEST_PARAMS, 500)
test['target'].to_csv('submission.csv')

In [None]:
end_time = datetime.datetime.now()
print(end_time - start_time)