In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install imblearn
!pip install LightGBM
!pip install XGBosst
!pip install CatBoost

In [None]:
# Import Library
import gc
from itertools import product

# Graphic Components
import seaborn as sns
import matplotlib.pyplot as plt

# Statistical Inference Analysis
import statsmodels.api as sm
import scipy.stats as stats

# Data Preprocessing
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer

# Model
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
import lightgbm as lgbm
import xgboost as xgb 
import catboost

# Feature Selection/ Model Optimization
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import BaggingClassifier

# Validation 
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

In [None]:
# Load the training set from its feather file and preview the first rows.
Train_df = pd.read_feather('../input/amexfeather/train_data.ftr')
Train_df.head()

# Loading of the test set is left disabled in this notebook.
#Test_df = pd.read_feather('../input/amexfeather/test_data.ftr')
#Test_df.head()

In [None]:
# (rows, columns) of the training frame as loaded.
Train_df.shape

As all features of the dataset are masked, we cannot rely on domain knowledge or common sense to get a general picture of the dataset. Let us make some educated guesses by examining the data type of each feature.

In [None]:
# dtype of every column, in column order.
Train_df.dtypes.to_list()

We observe that the customer ID is duplicated and there is a datetime column (S_2). 11 features are categorical, and the remaining ones are all numeric. Although the information is still very limited, the above result implies that the dataset is very likely transaction data.

Given this observation, there are two ways ahead. The first is a time-series analysis of the features, and the second is to take only the latest snapshot of each customer, which gives a much smaller dataset. However, the second way is quite dangerous, as it may overlook time-dependent features and distort the feature importance.

However, under the limited capacity of the Kaggle environment, we can only take the second way (the latest snapshot per customer). Let us return to the time-series approach if we get the chance!

In [None]:
# Keep only the last row of each customer_ID group (last in the frame's current row order).
# NOTE(review): this is the most recent statement only if rows are already ordered
# by S_2 within each customer — confirm against the source file.
Train_df = Train_df.groupby('customer_ID').tail(1)
Train_df.shape

The dataset has been scaled down about 10x by keeping only the latest record of each customer.

In [None]:
# Share of missing values per column, highest first.
Null_Check = (
    (Train_df.isna().sum() / len(Train_df))
    .rename_axis('Columns')
    .reset_index(name='Null Ratio')
    .sort_values(by=['Null Ratio'], ascending=False)
)
Null_Check.head(20)

In [None]:
# Distribution of the per-column null ratios.
sns.set_theme()
plt.figure(figsize=(12, 6))
sns.histplot(Null_Check['Null Ratio'])
plt.title('Histogram of Null Ratio')
plt.show()

In [None]:
# Central tendency of the null ratios: median (0.5 quantile) and mean.
print('Null Ratio Median:', Null_Check['Null Ratio'].quantile(.5))
print('Null Ratio Average:', Null_Check['Null Ratio'].mean())

In [None]:
# For each threshold 0.0, 0.1, ..., 1.0 print how many columns have a null ratio above it.
for i in np.linspace(0,1, 11).round(1):
    print(i, len(Null_Check[Null_Check['Null Ratio'] > i]))
    
# Names of the columns whose null ratio is above 0.7.
Drop_Columns = Null_Check[Null_Check['Null Ratio'] > 0.7]['Columns']
Drop_Columns

The histogram shows that most of the columns are fine (close to the left tail, i.e. a null ratio near 0%), except for some outliers. Let us remove these cases, as they should have very limited importance to our models.

In [None]:
# Remove the columns whose null ratio is above 0.7. The list was already computed
# as Drop_Columns in the previous cell, so it is reused instead of being rebuilt
# from Null_Check (which is deleted a few cells later).
# errors='ignore' keeps the cell re-runnable once the columns are already gone.
Train_df = Train_df.drop(columns = Drop_Columns, errors = 'ignore')
Train_df.shape

In [None]:
# Number of rows per target class.
Train_df['target'].value_counts()

The distribution of the target is roughly 3:1. We will keep this in mind and handle it in a later section.

In [None]:
# Null_Check is no longer needed; drop the reference and run a garbage-collection pass.
del Null_Check
gc.collect()

# Exploratory Data Analysis

In [None]:
# Correlation heatmap over all numeric features.
# Non-numeric columns (ID, date, categoricals) are removed explicitly: DataFrame.corr()
# no longer drops them silently on recent pandas releases and raises instead.
sns.set_theme()
plt.figure(figsize=(15, 15))
sns.heatmap(Train_df.select_dtypes(include='number').corr())
plt.title('Correlation Matrix')
plt.show()

In [None]:
# Correlation heatmap of the S_* feature group.
# Columns are selected by their 'S_' prefix (a plain substring test matches any name that
# merely contains the letter) and restricted to numeric dtypes so corr() does not
# fail on the date column S_2.
sns.set_theme()
plt.figure(figsize=(15, 15))
sns.heatmap(Train_df[[i for i in Train_df.columns if i.startswith('S_')]].select_dtypes(include='number').corr(),
            annot = True, fmt='.1f')
plt.title('Correlation Matrix - S Type')
plt.show()

In [None]:
# Correlation heatmap of the R_* feature group.
# Columns are selected by their 'R_' prefix rather than by a substring test, and
# restricted to numeric dtypes so corr() never sees non-numeric data.
sns.set_theme()
plt.figure(figsize=(15, 15))
sns.heatmap(Train_df[[i for i in Train_df.columns if i.startswith('R_')]].select_dtypes(include='number').corr(),
            annot = True, fmt='.1f')
plt.title('Correlation Matrix - R Type')
plt.show()

In [None]:
# Correlation heatmap of the P_* feature group.
# Columns are selected by their 'P_' prefix rather than by a substring test, and
# restricted to numeric dtypes so corr() never sees non-numeric data.
sns.set_theme()
plt.figure(figsize=(10, 10))
sns.heatmap(Train_df[[i for i in Train_df.columns if i.startswith('P_')]].select_dtypes(include='number').corr(),
            annot = True, fmt='.1f')
plt.title('Correlation Matrix - P Type')
plt.show()

In [None]:
# Correlation heatmap of the B_* feature group.
# Columns are selected by their 'B_' prefix rather than by a substring test, and
# restricted to numeric dtypes so categorical B_* columns do not break corr().
sns.set_theme()
plt.figure(figsize=(25, 25))
sns.heatmap(Train_df[[i for i in Train_df.columns if i.startswith('B_')]].select_dtypes(include='number').corr(),
            annot = True, fmt='.1f')
plt.title('Correlation Matrix - B Type')
plt.show()

In [None]:
# Correlation heatmap of the D_* feature group.
# The original substring test ('D' in name) also matched 'customer_ID'; selecting by
# the 'D_' prefix avoids that, and the numeric-only restriction keeps categorical
# D_* columns out of corr().
sns.set_theme()
plt.figure(figsize=(20, 20))
sns.heatmap(Train_df[[i for i in Train_df.columns if i.startswith('D_')]].select_dtypes(include='number').corr())
plt.title('Correlation Matrix - D Type')
plt.show()

From the above heatmaps, we found a lot of collinear variables. These variables cause not only the curse of dimensionality but also noise during computation, which undermines our statistical inference and thus model performance. Let us do a dimensionality reduction by category.

# PCA

In [None]:
from sklearn.decomposition import PCA
from sklearn.decomposition import FactorAnalysis
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

import statsmodels.api as sm

In [None]:
# Prepare for a PCA 
# Master_df keeps the identifier and the label; PCA features are attached to it later.
Master_df = Train_df[['customer_ID','target']].reset_index(drop = True)

# Categorical features: impute, then one-hot encode.
PCA_Cat = Train_df.select_dtypes(include='category').reset_index(drop = True)

for col in PCA_Cat.columns:
    # Fill gaps with the most frequent category. A median/quantile is not defined for
    # nominal categories, and the mode is always an existing category, so fillna
    # cannot fail on an unknown level. Columns that are entirely null are left as-is.
    most_frequent = PCA_Cat[col].mode(dropna = True)
    if not most_frequent.empty:
        PCA_Cat[col] = PCA_Cat[col].fillna(most_frequent.iloc[0])

PCA_Cat = pd.get_dummies(PCA_Cat, drop_first= True)

# Numeric features: widen float16 to float64 (avoids overflow/precision loss in the
# mean and in scaling), mean-impute, then standardize.
PCA_Numeric = Train_df.select_dtypes(include=['float16']).reset_index(drop = True).astype('float64')
PCA_Numeric = PCA_Numeric.fillna(PCA_Numeric.mean())

PCA_Numeric = pd.DataFrame(StandardScaler().fit_transform(PCA_Numeric), columns = PCA_Numeric.columns)

# Concat: both parts share the same 0..n-1 index, so rows line up.
PCA_df = pd.concat([PCA_Cat, PCA_Numeric], axis = 1)

In [None]:
# Reduce each feature group (S, R, B, P, D) to its first 3 principal components.
PCA_Model = PCA(n_components=3, random_state=0)

# Start from the two identifier/label columns only, so re-running this cell does not
# keep appending duplicate component columns to Master_df.
group_frames = [Master_df[['customer_ID', 'target']]]

for cat in ['S','R','B','P','D']:

    # Select by the '<group>_' prefix. A substring test would also pick up one-hot
    # columns of other groups whose category suffix happens to contain the letter.
    group_columns = [i for i in PCA_df.columns if i.startswith(cat + '_')]

    Temp = pd.DataFrame(PCA_Model.fit_transform(PCA_df[group_columns]))
    Temp.columns = [cat +'_'+ str(name) for name in Temp.columns]
    group_frames.append(Temp)

Master_df = pd.concat(group_frames, axis = 1)

Master_df.head()

In [None]:
# Correlation between the PCA components (every column after customer_ID and target).
sns.set_theme()
plt.figure(figsize=(15, 15))
sns.heatmap(Master_df.iloc[:, 2:].corr(),annot = True, fmt='.1f')
plt.title('Correlation Matrix - PCA')
plt.show()

We have reduced the data to only 15 variables (3 components for each of the 5 feature groups), and the multicollinearity problem is now much improved: the maximum R coefficient is around .7. However, it still exceeds the conventional collinearity threshold, where the absolute value of the R coefficient is equal to or higher than .6. It seems that the variable categories (R, S, B, D, P) are not as independent as we thought. There are some intertwined effects among these categories.

Here are some observations:
1. S, R features are quite independent and non-collinear.
2. D, P and B are still quite collinear (D_0 and B_0 and P_0).

In [None]:
# Logistic regression of target on the PCA components plus an intercept, for inference.
import statsmodels.api as sm
Regression = sm.add_constant(Master_df.iloc[:,2:])
logit_mod = sm.Logit(Master_df['target'],Regression)
logit_res = logit_mod.fit()
print(logit_res.summary())

Most of the variables are significant after PCA. We also observe that the importance of the Spending variables is quite low; the most significant variables are payment related. Let us instead try a PCA over all features at once.

In [None]:
# Scree 
# Fit a 10-component PCA on all prepared features and plot the share of variance
# explained by each component.
PCA_Model = PCA(n_components=10, random_state=0)
PCA_Model.fit(PCA_df)

plt.figure(figsize=(12, 7))
plt.plot(PCA_Model.explained_variance_ratio_)
plt.title('Scree Plot')
plt.xlabel('Principal Component')
plt.ylabel('Variance Explained')
plt.show()

The optimal number of components should be around 4-6. Let us use 6 components for our models.

In [None]:
# Replace the per-group components with 6 components fitted on all features at once.
PCA_Model = PCA(n_components=6, random_state=0)
Temp = pd.DataFrame(PCA_Model.fit_transform(PCA_df))
# Keep only the first two columns of Master_df (customer_ID, target) and append the
# new components; their column labels are the integers 0..5.
Master_df = pd.concat([Master_df.iloc[:, :2], Temp], axis = 1)
Master_df

In [None]:
# Correlation between the 6 global PCA components.
sns.set_theme()
plt.figure(figsize=(15, 15))
sns.heatmap(Master_df.iloc[:, 2:].corr(),annot = True, fmt='.1f')
plt.title('Correlation Matrix - PCA')
plt.show()

In [None]:
# Logistic regression of target on the 6 global PCA components plus an intercept.
import statsmodels.api as sm
Regression = sm.add_constant(Master_df.iloc[:,2:])
logit_mod = sm.Logit(Master_df['target'],Regression)
logit_res = logit_mod.fit()
print(logit_res.summary())

After a complete PCA, all variables are now significant (p<0.05). The pseudo R-squared (0.5459) is just slightly lower than that of the model above, which still had collinear variables. The next step is to build predictive models on the PCA data.

# Data Models

As mentioned above, the target ratio is 3:1, which may bias our models. Let us use the SMOTE oversampling method to create a balanced sample.

In [None]:
from imblearn.over_sampling import SMOTE 

# Oversample the minority class so both target classes are equally represented.
# The sampler is named `smote` rather than `sm`: `sm` is the alias of
# statsmodels.api in this notebook, and rebinding it would break any later
# statsmodels call that does not re-import the module first.
# NOTE(review): resampling happens before the train/test split below, so synthetic
# rows derived from test points can land in the training set — consider splitting
# first and resampling the training part only.
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(Master_df.iloc[:,2:], Master_df['target'])

In [None]:
# Class counts after resampling.
y.value_counts()

In [None]:
# Train Test Split
# 70% train / 30% test with a fixed seed; the test part is the final hold-out used
# by every "Validation" cell below.
train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
# Build a DataFrame To Score the Performance
# Starts empty; each model's validation cell adds one row.

Performance_df = pd.DataFrame(columns = ['Model', 'Feature Selection', 'Accuracy', 'Log Loss', 'ROC', 'Amex Metric'])
Performance_df

In [None]:
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Average of the normalized weighted Gini and the top-4% capture rate.

    Parameters
    ----------
    y_true : frame with a ``target`` column.
    y_pred : frame with a ``prediction`` column.

    The two frames are joined column-wise, so their indexes must match.
    Rows with ``target == 0`` carry weight 20, every other row weight 1.
    """

    def _ranked(actual: pd.DataFrame, scored: pd.DataFrame) -> pd.DataFrame:
        # Join labels and scores, order by score (highest first) and attach the row weights.
        ranked = pd.concat([actual, scored], axis='columns')
        ranked = ranked.sort_values('prediction', ascending=False)
        ranked['weight'] = ranked['target'].apply(lambda label: 20 if label == 0 else 1)
        return ranked

    def _capture_rate(actual: pd.DataFrame, scored: pd.DataFrame) -> float:
        # Share of all positives found within the top 4% of the total weight.
        ranked = _ranked(actual, scored)
        cutoff = int(0.04 * ranked['weight'].sum())
        inside = ranked.loc[ranked['weight'].cumsum() <= cutoff]
        return (inside['target'] == 1).sum() / (ranked['target'] == 1).sum()

    def _gini(actual: pd.DataFrame, scored: pd.DataFrame) -> float:
        # Weighted area between the cumulative-positives curve and the random curve.
        ranked = _ranked(actual, scored)
        weight = ranked['weight']
        weighted_positive = ranked['target'] * weight
        random_curve = (weight / weight.sum()).cumsum()
        lorentz_curve = weighted_positive.cumsum() / weighted_positive.sum()
        return ((lorentz_curve - random_curve) * weight).sum()

    # Normalize by the Gini of a perfect ranking (the labels used as their own scores).
    perfect = y_true.rename(columns={'target': 'prediction'})
    gini_part = _gini(y_true, y_pred) / _gini(y_true, perfect)
    capture_part = _capture_rate(y_true, y_pred)

    return 0.5 * (gini_part + capture_part)

# Logit Regression

In [None]:
# Randomized Cross Validation for Hyperparameters Tuning

def Logit_Randomize_CV(X,y, parameters, k = -1, random_state = None):
    """Pick the best LogisticRegression hyperparameters on a single 70/30 hold-out.

    Parameters
    ----------
    X, y : features and binary labels.
    parameters : dict mapping a LogisticRegression argument name to a list of candidate values.
    k : number of combinations to sample without replacement; -1 evaluates the full grid.
    random_state : optional seed for the sampling of combinations, so a run can be
        reproduced. ``None`` keeps the previous behaviour (NumPy's global random state).

    Returns
    -------
    The fitted model with the highest Amex metric on the hold-out, or ``None`` when
    no candidate produced a score above -1 (e.g. every score was NaN).

    Raises
    ------
    ValueError if ``k`` is larger than the number of combinations.
    """

    # Create Parameter Combination
    keys, values = zip(*parameters.items())
    result = [dict(zip(keys, p)) for p in product(*values)]

    if k != -1:
        # Sample positions instead of the dicts themselves, so the candidates stay plain dicts.
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        chosen = rng.choice(len(result), size = k, replace = False)
        result = [result[j] for j in chosen]

    best_score = -1
    best_parameter = {}
    best_model = None

    # Train Test Split (fixed seed: every candidate is scored on the same hold-out)
    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=42)

    # The labels of the hold-out do not change between candidates.
    amex_actual = pd.DataFrame({'target':test_y}).reset_index(drop=True)

    for i in result:

        Logit = LogisticRegression(**i)
        Logit.fit(train_X, train_y)

        # The Amex metric ranks by the predicted probability of the positive class.
        amex_prediction = pd.DataFrame({'prediction': Logit.predict_proba(test_X)[:, 1]}).reset_index(drop=True)
        score = amex_metric(amex_actual,amex_prediction)

        if score > best_score:
            best_score = score
            best_parameter = i
            best_model = Logit

        print(f'{i}: {score}')

    print(f'Best Parameters - {best_parameter}: {best_score}')

    return best_model

In [None]:
# Pools of Parameters
random_parameters = {'solver': ['newton-cg', 'lbfgs', 'liblinear'],
                     'C' :[100, 10, 1.0, 0.1, 0.01]
                    }

# k is left at its default (-1), so every solver/C combination is evaluated.
Logit = Logit_Randomize_CV(train_X,train_y, random_parameters)

In [None]:
# Validation
# Hard labels feed accuracy and the confusion matrix; log loss, ROC AUC and the Amex
# metric are computed from the predicted probability of the positive class, which is
# what those metrics are defined on (hard 0/1 labels make log loss and AUC meaningless).
pred = Logit.predict(test_X)
proba = Logit.predict_proba(test_X)[:, 1]
amex_actual = pd.DataFrame({'target':test_y}).reset_index(drop=True)
amex_prediction = pd.DataFrame({'prediction': proba}).reset_index(drop=True)

accuracy = accuracy_score(test_y, pred)
logloss = log_loss(test_y, proba)
roc_auc = roc_auc_score(test_y, proba)
amex_score = amex_metric(amex_actual,amex_prediction)

# DataFrame.append no longer exists in pandas 2.x; pd.concat works on every version.
# A previous row for this model is replaced, so re-running the cell does not duplicate it.
Performance_df = pd.concat([Performance_df[Performance_df['Model'] != 'Logit'],
                            pd.DataFrame([['Logit', 'Full', accuracy, logloss, roc_auc, amex_score]],
                                         columns = ['Model', 'Feature Selection','Accuracy', 'Log Loss', 'ROC','Amex Metric'])],
                           ignore_index = True, sort = False)

print('Accuracy:', accuracy)
print('Log Loss:', logloss)
print('ROC Accuracy:', roc_auc)
print('Confusion Matrix:\n', 
       confusion_matrix(test_y, pred))
print('Amex Metric:', amex_score)

# Support Vector

In [None]:
def SVM_Bagging_Randomize_CV(X,y, parameters, k = -1, random_state = None):
    """Pick the best SVC hyperparameters for a bagged SVM on a single 70/30 hold-out.

    Each candidate is a BaggingClassifier of 64 SVCs, each trained on 128 sampled rows.

    Parameters
    ----------
    X, y : features and binary labels.
    parameters : dict mapping an SVC argument name to a list of candidate values.
    k : number of combinations to sample without replacement; -1 evaluates the full grid.
    random_state : optional seed for the sampling of combinations, so a run can be
        reproduced. ``None`` keeps the previous behaviour (NumPy's global random state).

    Returns
    -------
    The fitted ensemble with the highest Amex metric on the hold-out, or ``None``
    when no candidate produced a score above -1.

    Raises
    ------
    ValueError if ``k`` is larger than the number of combinations.
    """

    # Create Parameter Combination
    keys, values = zip(*parameters.items())
    result = [dict(zip(keys, p)) for p in product(*values)]

    if k != -1:
        # Sample positions instead of the dicts themselves, so the candidates stay plain dicts.
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        chosen = rng.choice(len(result), size = k, replace = False)
        result = [result[j] for j in chosen]

    best_score = -1
    best_parameter = {}
    best_model = None

    # Train Test Split (fixed seed: every candidate is scored on the same hold-out)
    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=42)

    # The labels of the hold-out do not change between candidates.
    amex_actual = pd.DataFrame({'target':test_y}).reset_index(drop=True)

    for i in result:

        # The base estimator is passed positionally: its keyword was renamed from
        # `base_estimator` to `estimator` across scikit-learn releases, while the
        # first positional slot is the same in both.
        Support_Vector = BaggingClassifier(SVC(**i),
                                           n_estimators=64,
                                           max_samples = 128,
                                           random_state=0,
                                           n_jobs = -1)

        Support_Vector.fit(train_X, train_y)

        amex_prediction = pd.DataFrame({'prediction': Support_Vector.predict_proba(test_X)[:, 1]}).reset_index(drop=True)
        score = amex_metric(amex_actual,amex_prediction)

        if score > best_score:
            best_score = score
            best_parameter = i
            best_model = Support_Vector

        print(f'{i}: {score}')

    print(f'Best Parameters - {best_parameter}: {best_score}')

    return best_model

In [None]:
# Pools of Parameters

random_parameters = {'C': [1, 10, 100], 
                     'gamma': [0.1,0.01, 0.001],
                     'kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 
                     }

# Evaluate 15 randomly sampled combinations out of the 36 possible ones.
Support_Vector = SVM_Bagging_Randomize_CV(train_X,train_y, random_parameters, 15)

In [None]:
# Validation
# Hard labels feed accuracy and the confusion matrix; log loss, ROC AUC and the Amex
# metric are computed from the predicted probability of the positive class, which is
# what those metrics are defined on (hard 0/1 labels make log loss and AUC meaningless).
pred = Support_Vector.predict(test_X)
proba = Support_Vector.predict_proba(test_X)[:, 1]
amex_actual = pd.DataFrame({'target':test_y}).reset_index(drop=True)
amex_prediction = pd.DataFrame({'prediction': proba}).reset_index(drop=True)

accuracy = accuracy_score(test_y, pred)
logloss = log_loss(test_y, proba)
roc_auc = roc_auc_score(test_y, proba)
amex_score = amex_metric(amex_actual,amex_prediction)

# DataFrame.append no longer exists in pandas 2.x; pd.concat works on every version.
# A previous row for this model is replaced, so re-running the cell does not duplicate it.
Performance_df = pd.concat([Performance_df[Performance_df['Model'] != 'Support_Vector'],
                            pd.DataFrame([['Support_Vector', 'Full', accuracy, logloss, roc_auc, amex_score]],
                                         columns = ['Model', 'Feature Selection','Accuracy', 'Log Loss', 'ROC','Amex Metric'])],
                           ignore_index = True, sort = False)

print('Accuracy:', accuracy)
print('Log Loss:', logloss)
print('ROC Accuracy:', roc_auc)
print('Confusion Matrix:\n', 
       confusion_matrix(test_y, pred))
print('Amex Metric:', amex_score)

# XGBoost

In [None]:
def XGBoost_Randomize_CV(X,y, parameters, k = -1, random_state = None):
    """Pick the best XGBClassifier hyperparameters on a single 70/30 hold-out.

    The hold-out is used both as the early-stopping evaluation set and for scoring.

    Parameters
    ----------
    X, y : features and binary labels.
    parameters : dict mapping an XGBClassifier argument name to a list of candidate values.
    k : number of combinations to sample without replacement; -1 evaluates the full grid.
    random_state : optional seed for the sampling of combinations, so a run can be
        reproduced. ``None`` keeps the previous behaviour (NumPy's global random state).

    Returns
    -------
    The fitted model with the highest Amex metric on the hold-out, or ``None`` when
    no candidate produced a score above -1.

    Raises
    ------
    ValueError if ``k`` is larger than the number of combinations.
    """

    # Create Parameter Combination
    keys, values = zip(*parameters.items())
    result = [dict(zip(keys, p)) for p in product(*values)]

    # If -1 Then Grid Search
    if k != -1:
        # Sample positions instead of the dicts themselves, so the candidates stay plain dicts.
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        chosen = rng.choice(len(result), size = k, replace = False)
        result = [result[j] for j in chosen]

    best_score = -1
    best_parameter = {}
    # Must exist before the loop: without it the final `return` raised
    # UnboundLocalError whenever no candidate improved on the initial score.
    best_model = None

    # Train Test Split (fixed seed: every candidate is scored on the same hold-out)
    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=42)

    # The labels of the hold-out do not change between candidates.
    amex_actual = pd.DataFrame({'target':test_y}).reset_index(drop=True)

    for i in result:

        XGB_Model = xgb.XGBClassifier(**i,
                                      early_stopping_rounds = 10,
                                      verbosity = 0,
                                      n_jobs = -1).fit(train_X, train_y, eval_set = [(test_X, test_y)], verbose=False)

        amex_prediction = pd.DataFrame({'prediction':XGB_Model.predict_proba(test_X)[:, 1]}).reset_index(drop=True)
        score = amex_metric(amex_actual,amex_prediction)

        if score > best_score:
            best_score = score
            best_parameter = i
            best_model = XGB_Model

        print(f'{i}: {score}')

    print(f'Best Parameters - {best_parameter}: {best_score}')

    return best_model

In [None]:
# Candidate values for the XGBoost search; single-element lists are fixed settings.
# NOTE(review): 'rmsle' is a regression-style metric; since it drives early stopping
# for a binary classifier here, confirm that it is intended (e.g. vs. 'logloss').
random_parameters = {'max_depth':[9,10,11,12],
                      'min_child_weight': [5,6,7,8],
                      'eta':[.1, .01, .001],
                      'objective':['binary:logistic'],
                      'tree_method': ['gpu_hist'],
                      'eval_metric': ['rmsle'],
}

# k is left at its default (-1), so the full grid is evaluated.
XGB_Best = XGBoost_Randomize_CV(train_X,train_y, random_parameters)

In [None]:
# Validation
# Hard labels feed accuracy and the confusion matrix; log loss, ROC AUC and the Amex
# metric are computed from the predicted probability of the positive class, which is
# what those metrics are defined on (hard 0/1 labels make log loss and AUC meaningless).
pred = XGB_Best.predict(test_X).round().astype(int)
proba = XGB_Best.predict_proba(test_X)[:, 1]
amex_actual = pd.DataFrame({'target':test_y}).reset_index(drop=True)
amex_prediction = pd.DataFrame({'prediction': proba}).reset_index(drop=True)

accuracy = accuracy_score(test_y, pred)
logloss = log_loss(test_y, proba)
roc_auc = roc_auc_score(test_y, proba)
amex_score = amex_metric(amex_actual,amex_prediction)

# DataFrame.append no longer exists in pandas 2.x; pd.concat works on every version.
# A previous row for this model is replaced, so re-running the cell does not duplicate it.
Performance_df = pd.concat([Performance_df[Performance_df['Model'] != 'XGBC'],
                            pd.DataFrame([['XGBC', 'Full', accuracy, logloss, roc_auc, amex_score]],
                                         columns = ['Model', 'Feature Selection','Accuracy', 'Log Loss', 'ROC','Amex Metric'])],
                           ignore_index = True, sort = False)

print('Accuracy:', accuracy)
print('Log Loss:', logloss)
print('ROC Accuracy:', roc_auc)
print('Confusion Matrix:\n', 
       confusion_matrix(test_y, pred))
print('Amex Metric:', amex_score)

# LGBM

In [None]:
def LGBM_Randomize_CV(X,y, parameters, k = -1, random_state = None):
    """Pick the best LGBMClassifier hyperparameters on a single 70/30 hold-out.

    Models are trained with device='gpu' on platform 0 / device 0.

    Parameters
    ----------
    X, y : features and binary labels.
    parameters : dict mapping an LGBMClassifier argument name to a list of candidate values.
    k : number of combinations to sample without replacement; -1 evaluates the full grid.
    random_state : optional seed for the sampling of combinations, so a run can be
        reproduced. ``None`` keeps the previous behaviour (NumPy's global random state).

    Returns
    -------
    The fitted model with the highest Amex metric on the hold-out, or ``None`` when
    no candidate produced a score above -1.

    Raises
    ------
    ValueError if ``k`` is larger than the number of combinations.
    """

    # Create Parameter Combination
    keys, values = zip(*parameters.items())
    result = [dict(zip(keys, p)) for p in product(*values)]

    # If -1 Then Grid Search
    if k != -1:
        # Sample positions instead of the dicts themselves, so the candidates stay plain dicts.
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        chosen = rng.choice(len(result), size = k, replace = False)
        result = [result[j] for j in chosen]

    best_score = -1
    best_parameter = {}
    best_model = None

    # Train Test Split (fixed seed: every candidate is scored on the same hold-out)
    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=42)

    # The labels of the hold-out do not change between candidates.
    amex_actual = pd.DataFrame({'target':test_y}).reset_index(drop=True)

    for i in result:

        LGBM_Model = lgbm.LGBMClassifier(**i,
                                         device = 'gpu',
                                         gpu_platform_id= 0,
                                         gpu_device_id= 0,
                                         n_jobs = -1).fit(train_X, train_y)

        amex_prediction = pd.DataFrame({'prediction':LGBM_Model.predict_proba(test_X)[:, 1]}).reset_index(drop=True)
        score = amex_metric(amex_actual,amex_prediction)

        if score > best_score:
            best_score = score
            best_parameter = i
            best_model = LGBM_Model

        print(f'{i}: {score}')

    print(f'Best Parameters - {best_parameter}: {best_score}')

    return best_model

In [None]:
# Candidate values for the LightGBM search; single-element lists are fixed settings.
random_parameters = {'objective': ['binary'],
                      'metric': ['binary_logloss'],
                      'learning_rate':[0.1,0.01,0.001],
                      'num_leaves':[6,7,8,9],
                      'max_depth':[9,10,11,12]}

# k = -1 evaluates the full grid.
LGBM_Model = LGBM_Randomize_CV(train_X,train_y, random_parameters, k = -1)

In [None]:
# Validation
# Hard labels feed accuracy and the confusion matrix; log loss, ROC AUC and the Amex
# metric are computed from the predicted probability of the positive class, which is
# what those metrics are defined on (hard 0/1 labels make log loss and AUC meaningless).
pred = LGBM_Model.predict(test_X)
proba = LGBM_Model.predict_proba(test_X)[:, 1]
amex_actual = pd.DataFrame({'target':test_y}).reset_index(drop=True)
amex_prediction = pd.DataFrame({'prediction': proba}).reset_index(drop=True)

accuracy = accuracy_score(test_y, pred)
logloss = log_loss(test_y, proba)
roc_auc = roc_auc_score(test_y, proba)
amex_score = amex_metric(amex_actual,amex_prediction)

# DataFrame.append no longer exists in pandas 2.x; pd.concat works on every version.
# A previous row for this model is replaced, so re-running the cell does not duplicate it.
Performance_df = pd.concat([Performance_df[Performance_df['Model'] != 'LGBM'],
                            pd.DataFrame([['LGBM', 'Full', accuracy, logloss, roc_auc, amex_score]],
                                         columns = ['Model', 'Feature Selection','Accuracy', 'Log Loss', 'ROC','Amex Metric'])],
                           ignore_index = True, sort = False)

print('Accuracy:', accuracy)
print('Log Loss:', logloss)
print('ROC Accuracy:', roc_auc)
print('Confusion Matrix:\n', 
       confusion_matrix(test_y, pred))
print('Amex Metric:', amex_score)

# CatBoost

In [None]:
def CatBoost_Randomize_CV(X,y, parameters, k = -1, random_state = None):
    """Pick the best CatBoostClassifier hyperparameters on a single 70/30 hold-out.

    Models are trained on GPU device '0'. The hold-out is used both as the
    early-stopping evaluation set and for scoring, as in XGBoost_Randomize_CV.

    Parameters
    ----------
    X, y : features and binary labels.
    parameters : dict mapping a CatBoostClassifier argument name to a list of candidate values.
    k : number of combinations to sample without replacement; -1 evaluates the full grid.
    random_state : optional seed for the sampling of combinations, so a run can be
        reproduced. ``None`` keeps the previous behaviour (NumPy's global random state).

    Returns
    -------
    The fitted model with the highest Amex metric on the hold-out, or ``None`` when
    no candidate produced a score above -1.

    Raises
    ------
    ValueError if ``k`` is larger than the number of combinations.
    """

    # Create Parameter Combination
    keys, values = zip(*parameters.items())
    result = [dict(zip(keys, p)) for p in product(*values)]

    # If -1 Then Grid Search
    if k != -1:
        # Sample positions instead of the dicts themselves, so the candidates stay plain dicts.
        rng = np.random if random_state is None else np.random.RandomState(random_state)
        chosen = rng.choice(len(result), size = k, replace = False)
        result = [result[j] for j in chosen]

    best_score = -1
    best_parameter = {}
    best_model = None

    # Train Test Split (fixed seed: every candidate is scored on the same hold-out)
    train_X, test_X, train_y, test_y = train_test_split(X, y, test_size=0.3, random_state=42)

    # The labels of the hold-out do not change between candidates.
    amex_actual = pd.DataFrame({'target':test_y}).reset_index(drop=True)

    for i in result:

        # early_stopping_rounds needs an evaluation set to watch; without eval_set
        # the setting had nothing to monitor.
        CatBoost_Model = catboost.CatBoostClassifier(**i,
                                                     verbose = 0,
                                                     task_type="GPU",
                                                     devices='0',
                                                     early_stopping_rounds = 10).fit(train_X, train_y,
                                                                                     eval_set = (test_X, test_y))

        amex_prediction = pd.DataFrame({'prediction':CatBoost_Model.predict_proba(test_X)[:, 1]}).reset_index(drop=True)
        score = amex_metric(amex_actual,amex_prediction)

        if score > best_score:
            best_score = score
            best_parameter = i
            best_model = CatBoost_Model

        print(f'{i}: {score}')

    print(f'Best Parameters - {best_parameter}: {best_score}')

    return best_model

In [None]:
# Candidate values for the CatBoost search; single-element lists are fixed settings.
random_parameters = {'loss_function': ['Logloss'],
                      'learning_rate':[0.1,0.01,0.001],
                      'depth':[6,7,8,9,10]}

# k = -1 evaluates the full grid.
CatBoost_Model = CatBoost_Randomize_CV(train_X,train_y, random_parameters, k = -1)

In [None]:
# Validation
# Hard labels feed accuracy and the confusion matrix; log loss, ROC AUC and the Amex
# metric are computed from the predicted probability of the positive class, which is
# what those metrics are defined on (hard 0/1 labels make log loss and AUC meaningless).
pred = CatBoost_Model.predict(test_X)
proba = CatBoost_Model.predict_proba(test_X)[:, 1]
amex_actual = pd.DataFrame({'target':test_y}).reset_index(drop=True)
amex_prediction = pd.DataFrame({'prediction': proba}).reset_index(drop=True)

accuracy = accuracy_score(test_y, pred)
logloss = log_loss(test_y, proba)
roc_auc = roc_auc_score(test_y, proba)
amex_score = amex_metric(amex_actual,amex_prediction)

# DataFrame.append no longer exists in pandas 2.x; pd.concat works on every version.
# A previous row for this model is replaced, so re-running the cell does not duplicate it.
Performance_df = pd.concat([Performance_df[Performance_df['Model'] != 'CatBoost'],
                            pd.DataFrame([['CatBoost', 'Full', accuracy, logloss, roc_auc, amex_score]],
                                         columns = ['Model', 'Feature Selection','Accuracy', 'Log Loss', 'ROC','Amex Metric'])],
                           ignore_index = True, sort = False)

print('Accuracy:', accuracy)
print('Log Loss:', logloss)
print('ROC Accuracy:', roc_auc)
print('Confusion Matrix:\n', 
       confusion_matrix(test_y, pred))
print('Amex Metric:', amex_score)

# Tensorflow

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Show the name of the GPU device TensorFlow reports for this session.
tf.test.gpu_device_name()

In [None]:
# Defining Early Stopping 
early_stopping = EarlyStopping(
    min_delta=0.001, # minimum amount of change to count as an improvement
    patience=20, # how many epochs to wait before stopping
    restore_best_weights=True,
)

# Defining Neural Model
# The input width follows the training data instead of a hard-coded 6, so the model
# keeps working if the number of PCA components changes.
# NOTE(review): the first Dense layer has no activation (it is linear) — confirm
# this is intended.
DL_Model =  keras.Sequential([
    layers.Dense(32, input_shape = [train_X.shape[1]]),
    layers.Dropout(.2,),
    layers.Dense(16, activation = 'relu'),
    layers.Dense(1, activation='sigmoid'),
])

# Compile Model Fit
# `metrics` is given as a list, the documented form; a bare string is not accepted
# by every Keras version.
DL_Model.compile(
    optimizer='adam',
    loss='BinaryCrossentropy',
    metrics = ['MeanSquaredLogarithmicError']
)

In [None]:
# Record The Epoch 
# Train on the first GPU; the hold-out serves as validation data, which the
# early-stopping callback observes. History keeps the per-epoch metrics.

with tf.device('/gpu:0'):
    History = DL_Model.fit(train_X, train_y,
                           validation_data=(test_X, test_y,),
                           callbacks=[early_stopping],
                           batch_size=500,
                           epochs=1000,
                           verbose=1)

In [None]:
# Convert the Training History to a Dataframe
history_df = pd.DataFrame(History.history)
# Pandas native plot method: training vs. validation curves, first for the loss,
# then for the mean squared logarithmic error metric.
history_df.loc[:, ['loss', 'val_loss']].plot();
history_df.loc[:, ['mean_squared_logarithmic_error', 'val_mean_squared_logarithmic_error']].plot()

In [None]:
# Validation
# The network is evaluated once; its sigmoid output is the probability of the positive
# class. Rounded labels feed accuracy and the confusion matrix, while log loss,
# ROC AUC and the Amex metric use the probabilities, which is what they are defined on.
proba = DL_Model.predict(test_X).ravel()
pred = proba.round(0).astype(int)
amex_actual = pd.DataFrame({'target':test_y}).reset_index(drop=True)
amex_prediction = pd.DataFrame({'prediction': proba}).reset_index(drop=True)

accuracy = accuracy_score(test_y, pred)
logloss = log_loss(test_y, proba)
roc_auc = roc_auc_score(test_y, proba)
amex_score = amex_metric(amex_actual,amex_prediction)

# DataFrame.append no longer exists in pandas 2.x; pd.concat works on every version.
# A previous row for this model is replaced, so re-running the cell does not duplicate it.
Performance_df = pd.concat([Performance_df[Performance_df['Model'] != 'Tensorflow'],
                            pd.DataFrame([['Tensorflow', 'Full', accuracy, logloss, roc_auc, amex_score]],
                                         columns = ['Model', 'Feature Selection','Accuracy', 'Log Loss', 'ROC','Amex Metric'])],
                           ignore_index = True, sort = False)

print('Accuracy:', accuracy)
print('Log Loss:', logloss)
print('ROC Accuracy:', roc_auc)
print('Confusion Matrix:\n', 
       confusion_matrix(test_y, pred))
print('Amex Metric:\n', amex_score)

# Evaluation

In [None]:
# Accuracy against log loss, one point per model.
sns.set_theme()
sns.scatterplot(data = Performance_df, x = 'Log Loss',y = 'Accuracy', hue = 'Model', legend = 'brief')

In [None]:
# Accuracy against the Amex metric, one point per model.
sns.set_theme()
sns.scatterplot(data = Performance_df, x = 'Amex Metric',y = 'Accuracy', hue = 'Model', legend = 'brief')

In [None]:
# Amex metric per model.
sns.barplot(data = Performance_df, x = 'Model',y = 'Amex Metric', hue = 'Model')

In [None]:
# Final score table with one row per evaluated model.
Performance_df

From the performance table, we found that the XGBC model has the highest Amex metric score. Let us use this model for our prediction submission. However, the testing data is enormous and we have a memory limitation. We will complete the submission in part two!