# CS5228 - Team pandas - Dataset and Classifier Selection


A0105650R - Wang Gejing 

A0198889R - Chen Ningshuang 

A0210996X - Zhang Hao 

## Import libraries

In [38]:
# All Imports
from utils import *
import pandas as pd
import locale
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing

# model training
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# model evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# classifiers
from sklearn.naive_bayes import GaussianNB # naive bayes
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.linear_model import LogisticRegression # logistic regression
from sklearn.tree import DecisionTreeClassifier # decision Tree
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import *

# Silence every warning for the rest of the session. Note that this also
# hides deprecation notices emitted by pandas and scikit-learn.
import warnings
warnings.filterwarnings('ignore')
# Adopt the user's default locale for all locale categories.
locale.setlocale(locale.LC_ALL,'')
# Never truncate the column list when a wide DataFrame is displayed.
pd.set_option('display.max_columns', None)

## Import training data without dropping any columns

In [39]:
# Baseline run: keep every column.
drop_columns = []

le = generate_labels()

# Keyword arguments shared by all four training-set variants; only `dropna`
# and `feature_split` differ between them.
# NOTE(review): the exact meaning of these flags lives in utils.get_data --
# the recorded outputs only show that dropna=True yields fewer rows and
# feature_split=True yields a different column set.
train_options = dict(
    le=le,
    type='train',
    get_dummy=True,
    values_only=True,
    drop_columns=drop_columns,
)

base_dropna = get_data(dropna=True, feature_split=False, **train_options)
base_fillna = get_data(dropna=False, feature_split=False, **train_options)
feature_dropna = get_data(dropna=True, feature_split=True, **train_options)
feature_fillna = get_data(dropna=False, feature_split=True, **train_options)

## Training dataset information

In [40]:
base_dropna.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49808 entries, 0 to 49999
Data columns (total 28 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               49808 non-null  int32  
 1   City               49808 non-null  int32  
 2   State              49808 non-null  int32  
 3   Zip                49808 non-null  int32  
 4   Bank               49808 non-null  int32  
 5   BankState          49808 non-null  int32  
 6   NAICS              49808 non-null  int32  
 7   ApprovalDate       49808 non-null  int64  
 8   ApprovalFY         49808 non-null  int16  
 9   Term               49808 non-null  int64  
 10  NoEmp              49808 non-null  int64  
 11  CreateJob          49808 non-null  int64  
 12  RetainedJob        49808 non-null  int64  
 13  FranchiseCode      49808 non-null  int32  
 14  DisbursementDate   49808 non-null  int64  
 15  DisbursementGross  49808 non-null  float32
 16  GrAppv             498

In [41]:
base_fillna.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 28 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               50000 non-null  int32  
 1   City               50000 non-null  int32  
 2   State              50000 non-null  int32  
 3   Zip                50000 non-null  int32  
 4   Bank               50000 non-null  int32  
 5   BankState          50000 non-null  int32  
 6   NAICS              50000 non-null  int32  
 7   ApprovalDate       50000 non-null  int64  
 8   ApprovalFY         50000 non-null  int16  
 9   Term               50000 non-null  int64  
 10  NoEmp              50000 non-null  int64  
 11  CreateJob          50000 non-null  int64  
 12  RetainedJob        50000 non-null  int64  
 13  FranchiseCode      50000 non-null  int32  
 14  DisbursementDate   50000 non-null  int64  
 15  DisbursementGross  50000 non-null  float32
 16  GrAppv             500

In [42]:
feature_dropna.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49808 entries, 0 to 49999
Data columns (total 34 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               49808 non-null  int32  
 1   City               49808 non-null  int32  
 2   State              49808 non-null  int32  
 3   Zip                49808 non-null  int32  
 4   Bank               49808 non-null  int32  
 5   BankState          49808 non-null  int32  
 6   NAICS              49808 non-null  int32  
 7   ApprovalDate       49808 non-null  int64  
 8   ApprovalFY         49808 non-null  int16  
 9   CreateJob          49808 non-null  int64  
 10  RetainedJob        49808 non-null  int64  
 11  FranchiseCode      49808 non-null  int32  
 12  DisbursementDate   49808 non-null  int64  
 13  DisbursementGross  49808 non-null  float32
 14  GrAppv             49808 non-null  float32
 15  SBA_Appv           49808 non-null  float32
 16  ChargeOff          498

In [43]:
feature_fillna.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 34 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               50000 non-null  int32  
 1   City               50000 non-null  int32  
 2   State              50000 non-null  int32  
 3   Zip                50000 non-null  int32  
 4   Bank               50000 non-null  int32  
 5   BankState          50000 non-null  int32  
 6   NAICS              50000 non-null  int32  
 7   ApprovalDate       50000 non-null  int64  
 8   ApprovalFY         50000 non-null  int16  
 9   CreateJob          50000 non-null  int64  
 10  RetainedJob        50000 non-null  int64  
 11  FranchiseCode      50000 non-null  int32  
 12  DisbursementDate   50000 non-null  int64  
 13  DisbursementGross  50000 non-null  float32
 14  GrAppv             50000 non-null  float32
 15  SBA_Appv           50000 non-null  float32
 16  ChargeOff          500

## Import test data

In [44]:
# Keyword arguments shared by both test-set variants; only `feature_split`
# differs. The label encoder and drop list are the ones used for training.
test_options = dict(
    le=le,
    type='test',
    dropna=False,
    get_dummy=True,
    values_only=True,
    drop_columns=drop_columns,
)

feature_test = get_data(feature_split=True, **test_options)
base_test = get_data(feature_split=False, **test_options)

## Test dataset information

In [45]:
feature_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 33 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Name               100000 non-null  int32  
 1   City               100000 non-null  int32  
 2   State              100000 non-null  int32  
 3   Zip                100000 non-null  int32  
 4   Bank               100000 non-null  int32  
 5   BankState          100000 non-null  int32  
 6   NAICS              100000 non-null  int32  
 7   ApprovalDate       100000 non-null  int64  
 8   ApprovalFY         100000 non-null  float64
 9   CreateJob          100000 non-null  int64  
 10  RetainedJob        100000 non-null  int64  
 11  FranchiseCode      100000 non-null  int32  
 12  DisbursementDate   100000 non-null  int64  
 13  DisbursementGross  100000 non-null  float32
 14  GrAppv             100000 non-null  float32
 15  SBA_Appv           100000 non-null  float32
 16  New

In [46]:
base_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 27 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Name               100000 non-null  int32  
 1   City               100000 non-null  int32  
 2   State              100000 non-null  int32  
 3   Zip                100000 non-null  int32  
 4   Bank               100000 non-null  int32  
 5   BankState          100000 non-null  int32  
 6   NAICS              100000 non-null  int32  
 7   ApprovalDate       100000 non-null  int64  
 8   ApprovalFY         100000 non-null  float64
 9   Term               100000 non-null  int64  
 10  NoEmp              100000 non-null  int64  
 11  CreateJob          100000 non-null  int64  
 12  RetainedJob        100000 non-null  int64  
 13  FranchiseCode      100000 non-null  int32  
 14  DisbursementDate   100000 non-null  int64  
 15  DisbursementGross  100000 non-null  float32
 16  GrA

## Data and model selection

In [47]:
# Display labels for the benchmarked classifiers, listed in the order in
# which train_model() evaluates them.
model_names = ['KNN', 'LR', 'DT', 'RF', 'GBM', 'Ada Boost']

# One (weighted F1, accuracy) pair of score lists per dataset variant;
# train_model() appends one entry per classifier to each pair.
base_dropna_f1, base_dropna_acc = [], []
base_fillna_f1, base_fillna_acc = [], []
feature_dropna_f1, feature_dropna_acc = [], []
feature_fillna_f1, feature_fillna_acc = [], []

def calculate_acc_and_f1(classifier, x_train, y_train, x_test, y_test):
    """Fit ``classifier`` on the training split and score it on the test split.

    Args:
        classifier: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place.
        x_train, y_train: Features and labels used for fitting.
        x_test, y_test: Held-out features and labels used for scoring.

    Returns:
        A ``(f1, acc)`` tuple: the weighted-average F1 score and the accuracy,
        each expressed as a percentage rounded to two decimals.
    """
    def as_percent(score):
        # 0..1 score -> percentage with two decimals.
        return round(score * 100, 2)

    classifier.fit(x_train, y_train)
    predicted = classifier.predict(x_test)

    weighted_f1 = as_percent(f1_score(y_test, predicted, average='weighted'))
    accuracy = as_percent(accuracy_score(y_test, predicted))
    return weighted_f1, accuracy

    
def train_single_classifier(classifier, df_in, f1_list, acc_list):
    """Evaluate one classifier on a fixed 75/25 split of ``df_in``.

    Args:
        classifier: Unfitted estimator to train and score.
        df_in: DataFrame holding the features plus the ``ChargeOff`` target.
        f1_list: List that receives the weighted F1 score (percent).
        acc_list: List that receives the accuracy (percent).

    Returns:
        None. The scores are appended to ``f1_list`` and ``acc_list``.
    """
    target = df_in['ChargeOff']
    features = df_in.drop(columns='ChargeOff')

    # Fixed seed: every classifier is scored on the same held-out rows.
    split = train_test_split(features, target, test_size=0.25, random_state=0)
    x_train, x_test, y_train, y_test = split

    f1_value, acc_value = calculate_acc_and_f1(classifier, x_train, y_train, x_test, y_test)
    f1_list.append(f1_value)
    acc_list.append(acc_value)
    

def train_model(df_in, f1_list, acc_list):
    """Benchmark the six candidate classifiers on ``df_in``.

    The classifiers are evaluated in the order KNN, logistic regression,
    decision tree, random forest, gradient boosting, AdaBoost, so the appended
    scores line up with the ``model_names`` labels.

    Args:
        df_in: DataFrame holding the features plus the ``ChargeOff`` target.
        f1_list: List that receives one weighted F1 score per classifier.
        acc_list: List that receives one accuracy per classifier.

    Returns:
        None. Results are appended to ``f1_list`` and ``acc_list``.
    """
    classifiers = [
        KNeighborsClassifier(),
        LogisticRegression(random_state=1234),
        DecisionTreeClassifier(random_state=1234),
        RandomForestClassifier(random_state=1234),
        GradientBoostingClassifier(random_state=1234),
        # The base learner is passed positionally on purpose: scikit-learn
        # renamed the keyword from `base_estimator` to `estimator` in 1.2 and
        # removed the old name in 1.4, but it is the first positional
        # parameter under either name.
        AdaBoostClassifier(DecisionTreeClassifier(random_state=1234), random_state=1234),
    ]
    for classifier in classifiers:
        train_single_classifier(classifier, df_in, f1_list, acc_list)


# Benchmark every classifier on each of the four dataset variants, filling
# the matching pair of score lists.
for frame, f1_scores, acc_scores in (
    (base_dropna, base_dropna_f1, base_dropna_acc),
    (base_fillna, base_fillna_f1, base_fillna_acc),
    (feature_dropna, feature_dropna_f1, feature_dropna_acc),
    (feature_fillna, feature_fillna_f1, feature_fillna_acc),
):
    train_model(frame, f1_scores, acc_scores)


In [48]:
# Index both tables by model name from the start so that every column is
# numeric. Averaging while the text 'Model' column is still a regular column
# relies on pandas silently dropping it, which raises a TypeError on
# pandas >= 2.0.
model_index = pd.Index(model_names, name='Model')

accuracy_record = pd.DataFrame(
    {
        'base_dropna_acc': base_dropna_acc,
        'base_fillna_acc': base_fillna_acc,
        'feature_dropna_acc': feature_dropna_acc,
        'feature_fillna_acc': feature_fillna_acc,
    },
    index=model_index,
)
# Per-model mean across the four dataset variants.
accuracy_record['acc_mean'] = accuracy_record.mean(axis=1).round(2)
# Extra 'avg' row: per-column mean across all models (left unrounded).
accuracy_record.loc['avg'] = accuracy_record.mean()

F1_record = pd.DataFrame(
    {
        'base_dropna_f1': base_dropna_f1,
        'base_fillna_f1': base_fillna_f1,
        'feature_dropna_f1': feature_dropna_f1,
        'feature_fillna_f1': feature_fillna_f1,
    },
    index=model_index,
)
# Per-model mean across the four dataset variants.
F1_record['F1_mean'] = F1_record.mean(axis=1).round(2)
# Extra 'avg' row: per-column mean across all models (left unrounded).
F1_record.loc['avg'] = F1_record.mean()


In [49]:
accuracy_record.head(10)

Unnamed: 0_level_0,base_dropna_acc,base_fillna_acc,feature_dropna_acc,feature_fillna_acc,acc_mean
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
KNN,68.5,68.46,68.5,68.46,68.48
LR,63.76,61.22,63.75,61.22,62.49
DT,87.91,87.45,71.94,71.86,79.79
RF,90.22,89.9,79.56,79.16,84.71
GBM,90.1,89.59,77.68,77.87,83.81
Ada Boost,87.83,87.57,71.64,71.95,79.75
avg,81.386667,80.698333,72.178333,71.753333,76.505


In [50]:
F1_record.head(10)

Unnamed: 0_level_0,base_dropna_f1,base_fillna_f1,feature_dropna_f1,feature_fillna_f1,F1_mean
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
KNN,68.49,68.46,68.49,68.46,68.47
LR,63.03,59.95,63.01,59.95,61.48
DT,87.91,87.45,71.94,71.86,79.79
RF,90.21,89.89,79.54,79.13,84.69
GBM,90.1,89.59,77.68,77.86,83.81
Ada Boost,87.83,87.57,71.64,71.95,79.75
avg,81.261667,80.485,72.05,71.535,76.331667


## Import training data - drop low-correlation columns

In [51]:
# Columns excluded in this run (described as low-correlation in the section
# heading).
drop_columns = ['CreateJob','RetainedJob','City','Name','Zip','BankState']

# Keyword arguments shared by all four training-set variants; only `dropna`
# and `feature_split` differ between them. `le` is the label encoder created
# earlier in the notebook.
train_options = dict(
    le=le,
    type='train',
    get_dummy=True,
    values_only=True,
    drop_columns=drop_columns,
)

base_dropna = get_data(dropna=True, feature_split=False, **train_options)
base_fillna = get_data(dropna=False, feature_split=False, **train_options)
feature_dropna = get_data(dropna=True, feature_split=True, **train_options)
feature_fillna = get_data(dropna=False, feature_split=True, **train_options)

## Training dataset information

In [52]:
base_dropna.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49808 entries, 0 to 49999
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   State              49808 non-null  int32  
 1   Bank               49808 non-null  int32  
 2   NAICS              49808 non-null  int32  
 3   ApprovalDate       49808 non-null  int64  
 4   ApprovalFY         49808 non-null  int16  
 5   Term               49808 non-null  int64  
 6   NoEmp              49808 non-null  int64  
 7   FranchiseCode      49808 non-null  int32  
 8   DisbursementDate   49808 non-null  int64  
 9   DisbursementGross  49808 non-null  float32
 10  GrAppv             49808 non-null  float32
 11  SBA_Appv           49808 non-null  float32
 12  ChargeOff          49808 non-null  int64  
 13  NewExist_1         49808 non-null  uint8  
 14  NewExist_2         49808 non-null  uint8  
 15  UrbanRural_0       49808 non-null  uint8  
 16  UrbanRural_1       498

In [53]:
base_fillna.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 22 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   State              50000 non-null  int32  
 1   Bank               50000 non-null  int32  
 2   NAICS              50000 non-null  int32  
 3   ApprovalDate       50000 non-null  int64  
 4   ApprovalFY         50000 non-null  int16  
 5   Term               50000 non-null  int64  
 6   NoEmp              50000 non-null  int64  
 7   FranchiseCode      50000 non-null  int32  
 8   DisbursementDate   50000 non-null  int64  
 9   DisbursementGross  50000 non-null  float32
 10  GrAppv             50000 non-null  float32
 11  SBA_Appv           50000 non-null  float32
 12  ChargeOff          50000 non-null  int64  
 13  NewExist_1         50000 non-null  uint8  
 14  NewExist_2         50000 non-null  uint8  
 15  UrbanRural_0       50000 non-null  uint8  
 16  UrbanRural_1       500

In [54]:
feature_dropna.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49808 entries, 0 to 49999
Data columns (total 28 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   State              49808 non-null  int32  
 1   Bank               49808 non-null  int32  
 2   NAICS              49808 non-null  int32  
 3   ApprovalDate       49808 non-null  int64  
 4   ApprovalFY         49808 non-null  int16  
 5   FranchiseCode      49808 non-null  int32  
 6   DisbursementDate   49808 non-null  int64  
 7   DisbursementGross  49808 non-null  float32
 8   GrAppv             49808 non-null  float32
 9   SBA_Appv           49808 non-null  float32
 10  ChargeOff          49808 non-null  int64  
 11  NewExist_1         49808 non-null  uint8  
 12  NewExist_2         49808 non-null  uint8  
 13  UrbanRural_0       49808 non-null  uint8  
 14  UrbanRural_1       49808 non-null  uint8  
 15  UrbanRural_2       49808 non-null  uint8  
 16  RevLineCr_N        498

In [55]:
feature_fillna.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 28 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   State              50000 non-null  int32  
 1   Bank               50000 non-null  int32  
 2   NAICS              50000 non-null  int32  
 3   ApprovalDate       50000 non-null  int64  
 4   ApprovalFY         50000 non-null  int16  
 5   FranchiseCode      50000 non-null  int32  
 6   DisbursementDate   50000 non-null  int64  
 7   DisbursementGross  50000 non-null  float32
 8   GrAppv             50000 non-null  float32
 9   SBA_Appv           50000 non-null  float32
 10  ChargeOff          50000 non-null  int64  
 11  NewExist_1         50000 non-null  uint8  
 12  NewExist_2         50000 non-null  uint8  
 13  UrbanRural_0       50000 non-null  uint8  
 14  UrbanRural_1       50000 non-null  uint8  
 15  UrbanRural_2       50000 non-null  uint8  
 16  RevLineCr_N        500

## Import test data

In [56]:
# Keyword arguments shared by both test-set variants; only `feature_split`
# differs. The label encoder and drop list are the ones used for training.
test_options = dict(
    le=le,
    type='test',
    dropna=False,
    get_dummy=True,
    values_only=True,
    drop_columns=drop_columns,
)

feature_test = get_data(feature_split=True, **test_options)
base_test = get_data(feature_split=False, **test_options)

## Test dataset information

In [57]:
feature_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 27 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   State              100000 non-null  int32  
 1   Bank               100000 non-null  int32  
 2   NAICS              100000 non-null  int32  
 3   ApprovalDate       100000 non-null  int64  
 4   ApprovalFY         100000 non-null  float64
 5   FranchiseCode      100000 non-null  int32  
 6   DisbursementDate   100000 non-null  int64  
 7   DisbursementGross  100000 non-null  float32
 8   GrAppv             100000 non-null  float32
 9   SBA_Appv           100000 non-null  float32
 10  NewExist_1         100000 non-null  uint8  
 11  NewExist_2         100000 non-null  uint8  
 12  UrbanRural_0       100000 non-null  uint8  
 13  UrbanRural_1       100000 non-null  uint8  
 14  UrbanRural_2       100000 non-null  uint8  
 15  RevLineCr_N        100000 non-null  uint8  
 16  Rev

In [58]:
base_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 21 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   State              100000 non-null  int32  
 1   Bank               100000 non-null  int32  
 2   NAICS              100000 non-null  int32  
 3   ApprovalDate       100000 non-null  int64  
 4   ApprovalFY         100000 non-null  float64
 5   Term               100000 non-null  int64  
 6   NoEmp              100000 non-null  int64  
 7   FranchiseCode      100000 non-null  int32  
 8   DisbursementDate   100000 non-null  int64  
 9   DisbursementGross  100000 non-null  float32
 10  GrAppv             100000 non-null  float32
 11  SBA_Appv           100000 non-null  float32
 12  NewExist_1         100000 non-null  uint8  
 13  NewExist_2         100000 non-null  uint8  
 14  UrbanRural_0       100000 non-null  uint8  
 15  UrbanRural_1       100000 non-null  uint8  
 16  Urb

## Data and model selection

In [59]:
# Display labels for the benchmarked classifiers, listed in the order in
# which train_model() evaluates them.
model_names = ['KNN', 'LR', 'DT', 'RF', 'GBM', 'Ada Boost']

# Fresh (weighted F1, accuracy) score lists for this run, one pair per
# dataset variant; train_model() appends one entry per classifier.
base_dropna_f1, base_dropna_acc = [], []
base_fillna_f1, base_fillna_acc = [], []
feature_dropna_f1, feature_dropna_acc = [], []
feature_fillna_f1, feature_fillna_acc = [], []

# NOTE(review): this duplicates the definition in the first "Data and model
# selection" cell; defining it once would be enough.
def calculate_acc_and_f1(classifier, x_train, y_train, x_test, y_test):
    """Fit ``classifier`` on the training split and score it on the test split.

    Args:
        classifier: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place.
        x_train, y_train: Features and labels used for fitting.
        x_test, y_test: Held-out features and labels used for scoring.

    Returns:
        A ``(f1, acc)`` tuple: the weighted-average F1 score and the accuracy,
        each expressed as a percentage rounded to two decimals.
    """
    def as_percent(score):
        # 0..1 score -> percentage with two decimals.
        return round(score * 100, 2)

    classifier.fit(x_train, y_train)
    predicted = classifier.predict(x_test)

    weighted_f1 = as_percent(f1_score(y_test, predicted, average='weighted'))
    accuracy = as_percent(accuracy_score(y_test, predicted))
    return weighted_f1, accuracy

    
# NOTE(review): this duplicates the definition in the first "Data and model
# selection" cell; defining it once would be enough.
def train_single_classifier(classifier, df_in, f1_list, acc_list):
    """Evaluate one classifier on a fixed 75/25 split of ``df_in``.

    Args:
        classifier: Unfitted estimator to train and score.
        df_in: DataFrame holding the features plus the ``ChargeOff`` target.
        f1_list: List that receives the weighted F1 score (percent).
        acc_list: List that receives the accuracy (percent).

    Returns:
        None. The scores are appended to ``f1_list`` and ``acc_list``.
    """
    target = df_in['ChargeOff']
    features = df_in.drop(columns='ChargeOff')

    # Fixed seed: every classifier is scored on the same held-out rows.
    split = train_test_split(features, target, test_size=0.25, random_state=0)
    x_train, x_test, y_train, y_test = split

    f1_value, acc_value = calculate_acc_and_f1(classifier, x_train, y_train, x_test, y_test)
    f1_list.append(f1_value)
    acc_list.append(acc_value)
    

# NOTE(review): this duplicates the definition in the first "Data and model
# selection" cell; defining it once would be enough.
def train_model(df_in, f1_list, acc_list):
    """Benchmark the six candidate classifiers on ``df_in``.

    The classifiers are evaluated in the order KNN, logistic regression,
    decision tree, random forest, gradient boosting, AdaBoost, so the appended
    scores line up with the ``model_names`` labels.

    Args:
        df_in: DataFrame holding the features plus the ``ChargeOff`` target.
        f1_list: List that receives one weighted F1 score per classifier.
        acc_list: List that receives one accuracy per classifier.

    Returns:
        None. Results are appended to ``f1_list`` and ``acc_list``.
    """
    classifiers = [
        KNeighborsClassifier(),
        LogisticRegression(random_state=1234),
        DecisionTreeClassifier(random_state=1234),
        RandomForestClassifier(random_state=1234),
        GradientBoostingClassifier(random_state=1234),
        # The base learner is passed positionally on purpose: scikit-learn
        # renamed the keyword from `base_estimator` to `estimator` in 1.2 and
        # removed the old name in 1.4, but it is the first positional
        # parameter under either name.
        AdaBoostClassifier(DecisionTreeClassifier(random_state=1234), random_state=1234),
    ]
    for classifier in classifiers:
        train_single_classifier(classifier, df_in, f1_list, acc_list)
    
    

# Benchmark every classifier on each of the four dataset variants, filling
# the matching pair of score lists.
for frame, f1_scores, acc_scores in (
    (base_dropna, base_dropna_f1, base_dropna_acc),
    (base_fillna, base_fillna_f1, base_fillna_acc),
    (feature_dropna, feature_dropna_f1, feature_dropna_acc),
    (feature_fillna, feature_fillna_f1, feature_fillna_acc),
):
    train_model(frame, f1_scores, acc_scores)


In [60]:
# Index both tables by model name from the start so that every column is
# numeric. Averaging while the text 'Model' column is still a regular column
# relies on pandas silently dropping it, which raises a TypeError on
# pandas >= 2.0.
model_index = pd.Index(model_names, name='Model')

accuracy_record = pd.DataFrame(
    {
        'base_dropna_acc': base_dropna_acc,
        'base_fillna_acc': base_fillna_acc,
        'feature_dropna_acc': feature_dropna_acc,
        'feature_fillna_acc': feature_fillna_acc,
    },
    index=model_index,
)
# Per-model mean across the four dataset variants.
accuracy_record['acc_mean'] = accuracy_record.mean(axis=1).round(2)
# Extra 'avg' row: per-column mean across all models (left unrounded).
accuracy_record.loc['avg'] = accuracy_record.mean()

F1_record = pd.DataFrame(
    {
        'base_dropna_f1': base_dropna_f1,
        'base_fillna_f1': base_fillna_f1,
        'feature_dropna_f1': feature_dropna_f1,
        'feature_fillna_f1': feature_fillna_f1,
    },
    index=model_index,
)
# Per-model mean across the four dataset variants.
F1_record['F1_mean'] = F1_record.mean(axis=1).round(2)
# Extra 'avg' row: per-column mean across all models (left unrounded).
F1_record.loc['avg'] = F1_record.mean()

In [61]:
accuracy_record.head(10)

Unnamed: 0_level_0,base_dropna_acc,base_fillna_acc,feature_dropna_acc,feature_fillna_acc,acc_mean
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
KNN,68.36,68.42,68.36,68.42,68.39
LR,63.28,61.18,63.28,61.18,62.23
DT,88.25,87.62,71.64,71.01,79.63
RF,89.76,89.74,78.32,78.1,83.98
GBM,90.09,89.83,77.72,77.42,83.76
Ada Boost,88.31,87.89,71.09,71.09,79.6
avg,81.341667,80.78,71.735,71.203333,76.265


In [62]:
F1_record.head(10)

Unnamed: 0_level_0,base_dropna_f1,base_fillna_f1,feature_dropna_f1,feature_fillna_f1,F1_mean
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
KNN,68.36,68.42,68.36,68.42,68.39
LR,62.38,59.9,62.38,59.9,61.14
DT,88.25,87.62,71.64,71.01,79.63
RF,89.76,89.74,78.3,78.06,83.96
GBM,90.09,89.83,77.72,77.41,83.76
Ada Boost,88.31,87.89,71.09,71.09,79.6
avg,81.191667,80.566667,71.581667,70.981667,76.08


## Import training data - Prune highly correlated columns

In [63]:
# Columns excluded in this run (described as highly correlated in the
# section heading).
drop_columns = ['DisbursementDate','ApprovalFY','GrAppv','SBA_Appv','RetainedJob']

# Keyword arguments shared by all four training-set variants; only `dropna`
# and `feature_split` differ between them. `le` is the label encoder created
# earlier in the notebook.
train_options = dict(
    le=le,
    type='train',
    get_dummy=True,
    values_only=True,
    drop_columns=drop_columns,
)

base_dropna = get_data(dropna=True, feature_split=False, **train_options)
base_fillna = get_data(dropna=False, feature_split=False, **train_options)
feature_dropna = get_data(dropna=True, feature_split=True, **train_options)
feature_fillna = get_data(dropna=False, feature_split=True, **train_options)

## Training dataset information

In [64]:
base_dropna.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49808 entries, 0 to 49999
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               49808 non-null  int32  
 1   City               49808 non-null  int32  
 2   State              49808 non-null  int32  
 3   Zip                49808 non-null  int32  
 4   Bank               49808 non-null  int32  
 5   BankState          49808 non-null  int32  
 6   NAICS              49808 non-null  int32  
 7   ApprovalDate       49808 non-null  int64  
 8   Term               49808 non-null  int64  
 9   NoEmp              49808 non-null  int64  
 10  CreateJob          49808 non-null  int64  
 11  FranchiseCode      49808 non-null  int32  
 12  DisbursementGross  49808 non-null  float32
 13  ChargeOff          49808 non-null  int64  
 14  NewExist_1         49808 non-null  uint8  
 15  NewExist_2         49808 non-null  uint8  
 16  UrbanRural_0       498

In [65]:
base_fillna.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               50000 non-null  int32  
 1   City               50000 non-null  int32  
 2   State              50000 non-null  int32  
 3   Zip                50000 non-null  int32  
 4   Bank               50000 non-null  int32  
 5   BankState          50000 non-null  int32  
 6   NAICS              50000 non-null  int32  
 7   ApprovalDate       50000 non-null  int64  
 8   Term               50000 non-null  int64  
 9   NoEmp              50000 non-null  int64  
 10  CreateJob          50000 non-null  int64  
 11  FranchiseCode      50000 non-null  int32  
 12  DisbursementGross  50000 non-null  float32
 13  ChargeOff          50000 non-null  int64  
 14  NewExist_1         50000 non-null  uint8  
 15  NewExist_2         50000 non-null  uint8  
 16  UrbanRural_0       500

In [66]:
feature_dropna.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49808 entries, 0 to 49999
Data columns (total 29 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               49808 non-null  int32  
 1   City               49808 non-null  int32  
 2   State              49808 non-null  int32  
 3   Zip                49808 non-null  int32  
 4   Bank               49808 non-null  int32  
 5   BankState          49808 non-null  int32  
 6   NAICS              49808 non-null  int32  
 7   ApprovalDate       49808 non-null  int64  
 8   CreateJob          49808 non-null  int64  
 9   FranchiseCode      49808 non-null  int32  
 10  DisbursementGross  49808 non-null  float32
 11  ChargeOff          49808 non-null  int64  
 12  NewExist_1         49808 non-null  uint8  
 13  NewExist_2         49808 non-null  uint8  
 14  UrbanRural_0       49808 non-null  uint8  
 15  UrbanRural_1       49808 non-null  uint8  
 16  UrbanRural_2       498

In [67]:
feature_fillna.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 29 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Name               50000 non-null  int32  
 1   City               50000 non-null  int32  
 2   State              50000 non-null  int32  
 3   Zip                50000 non-null  int32  
 4   Bank               50000 non-null  int32  
 5   BankState          50000 non-null  int32  
 6   NAICS              50000 non-null  int32  
 7   ApprovalDate       50000 non-null  int64  
 8   CreateJob          50000 non-null  int64  
 9   FranchiseCode      50000 non-null  int32  
 10  DisbursementGross  50000 non-null  float32
 11  ChargeOff          50000 non-null  int64  
 12  NewExist_1         50000 non-null  uint8  
 13  NewExist_2         50000 non-null  uint8  
 14  UrbanRural_0       50000 non-null  uint8  
 15  UrbanRural_1       50000 non-null  uint8  
 16  UrbanRural_2       500

## Import test data

In [68]:
# Keyword arguments shared by both test-set variants; only `feature_split`
# differs. The label encoder and drop list are the ones used for training.
test_options = dict(
    le=le,
    type='test',
    dropna=False,
    get_dummy=True,
    values_only=True,
    drop_columns=drop_columns,
)

feature_test = get_data(feature_split=True, **test_options)
base_test = get_data(feature_split=False, **test_options)

## Test dataset information

In [69]:
feature_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 28 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Name               100000 non-null  int32  
 1   City               100000 non-null  int32  
 2   State              100000 non-null  int32  
 3   Zip                100000 non-null  int32  
 4   Bank               100000 non-null  int32  
 5   BankState          100000 non-null  int32  
 6   NAICS              100000 non-null  int32  
 7   ApprovalDate       100000 non-null  int64  
 8   CreateJob          100000 non-null  int64  
 9   FranchiseCode      100000 non-null  int32  
 10  DisbursementGross  100000 non-null  float32
 11  NewExist_1         100000 non-null  uint8  
 12  NewExist_2         100000 non-null  uint8  
 13  UrbanRural_0       100000 non-null  uint8  
 14  UrbanRural_1       100000 non-null  uint8  
 15  UrbanRural_2       100000 non-null  uint8  
 16  Rev

In [70]:
base_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 22 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   Name               100000 non-null  int32  
 1   City               100000 non-null  int32  
 2   State              100000 non-null  int32  
 3   Zip                100000 non-null  int32  
 4   Bank               100000 non-null  int32  
 5   BankState          100000 non-null  int32  
 6   NAICS              100000 non-null  int32  
 7   ApprovalDate       100000 non-null  int64  
 8   Term               100000 non-null  int64  
 9   NoEmp              100000 non-null  int64  
 10  CreateJob          100000 non-null  int64  
 11  FranchiseCode      100000 non-null  int32  
 12  DisbursementGross  100000 non-null  float32
 13  NewExist_1         100000 non-null  uint8  
 14  NewExist_2         100000 non-null  uint8  
 15  UrbanRural_0       100000 non-null  uint8  
 16  Urb

## Data and model selection

In [71]:
# Display labels for the benchmarked classifiers, listed in the order in
# which train_model() evaluates them.
model_names = ['KNN', 'LR', 'DT', 'RF', 'GBM', 'Ada Boost']

# Fresh (weighted F1, accuracy) score lists for this run, one pair per
# dataset variant; train_model() appends one entry per classifier.
base_dropna_f1, base_dropna_acc = [], []
base_fillna_f1, base_fillna_acc = [], []
feature_dropna_f1, feature_dropna_acc = [], []
feature_fillna_f1, feature_fillna_acc = [], []

# NOTE(review): this duplicates the definition in the first "Data and model
# selection" cell; defining it once would be enough.
def calculate_acc_and_f1(classifier, x_train, y_train, x_test, y_test):
    """Fit ``classifier`` on the training split and score it on the test split.

    Args:
        classifier: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is
            fitted in place.
        x_train, y_train: Features and labels used for fitting.
        x_test, y_test: Held-out features and labels used for scoring.

    Returns:
        A ``(f1, acc)`` tuple: the weighted-average F1 score and the accuracy,
        each expressed as a percentage rounded to two decimals.
    """
    def as_percent(score):
        # 0..1 score -> percentage with two decimals.
        return round(score * 100, 2)

    classifier.fit(x_train, y_train)
    predicted = classifier.predict(x_test)

    weighted_f1 = as_percent(f1_score(y_test, predicted, average='weighted'))
    accuracy = as_percent(accuracy_score(y_test, predicted))
    return weighted_f1, accuracy

    
# NOTE(review): this duplicates the definition in the first "Data and model
# selection" cell; defining it once would be enough.
def train_single_classifier(classifier, df_in, f1_list, acc_list):
    """Evaluate one classifier on a fixed 75/25 split of ``df_in``.

    Args:
        classifier: Unfitted estimator to train and score.
        df_in: DataFrame holding the features plus the ``ChargeOff`` target.
        f1_list: List that receives the weighted F1 score (percent).
        acc_list: List that receives the accuracy (percent).

    Returns:
        None. The scores are appended to ``f1_list`` and ``acc_list``.
    """
    target = df_in['ChargeOff']
    features = df_in.drop(columns='ChargeOff')

    # Fixed seed: every classifier is scored on the same held-out rows.
    split = train_test_split(features, target, test_size=0.25, random_state=0)
    x_train, x_test, y_train, y_test = split

    f1_value, acc_value = calculate_acc_and_f1(classifier, x_train, y_train, x_test, y_test)
    f1_list.append(f1_value)
    acc_list.append(acc_value)
    

# NOTE(review): this duplicates the definition in the first "Data and model
# selection" cell; defining it once would be enough.
def train_model(df_in, f1_list, acc_list):
    """Benchmark the six candidate classifiers on ``df_in``.

    The classifiers are evaluated in the order KNN, logistic regression,
    decision tree, random forest, gradient boosting, AdaBoost, so the appended
    scores line up with the ``model_names`` labels.

    Args:
        df_in: DataFrame holding the features plus the ``ChargeOff`` target.
        f1_list: List that receives one weighted F1 score per classifier.
        acc_list: List that receives one accuracy per classifier.

    Returns:
        None. Results are appended to ``f1_list`` and ``acc_list``.
    """
    classifiers = [
        KNeighborsClassifier(),
        LogisticRegression(random_state=1234),
        DecisionTreeClassifier(random_state=1234),
        RandomForestClassifier(random_state=1234),
        GradientBoostingClassifier(random_state=1234),
        # The base learner is passed positionally on purpose: scikit-learn
        # renamed the keyword from `base_estimator` to `estimator` in 1.2 and
        # removed the old name in 1.4, but it is the first positional
        # parameter under either name.
        AdaBoostClassifier(DecisionTreeClassifier(random_state=1234), random_state=1234),
    ]
    for classifier in classifiers:
        train_single_classifier(classifier, df_in, f1_list, acc_list)

    

# Benchmark every classifier on each of the four dataset variants, filling
# the matching pair of score lists.
for frame, f1_scores, acc_scores in (
    (base_dropna, base_dropna_f1, base_dropna_acc),
    (base_fillna, base_fillna_f1, base_fillna_acc),
    (feature_dropna, feature_dropna_f1, feature_dropna_acc),
    (feature_fillna, feature_fillna_f1, feature_fillna_acc),
):
    train_model(frame, f1_scores, acc_scores)


In [72]:
# Index both tables by model name from the start so that every column is
# numeric. Averaging while the text 'Model' column is still a regular column
# relies on pandas silently dropping it, which raises a TypeError on
# pandas >= 2.0.
model_index = pd.Index(model_names, name='Model')

accuracy_record = pd.DataFrame(
    {
        'base_dropna_acc': base_dropna_acc,
        'base_fillna_acc': base_fillna_acc,
        'feature_dropna_acc': feature_dropna_acc,
        'feature_fillna_acc': feature_fillna_acc,
    },
    index=model_index,
)
# Per-model mean across the four dataset variants.
accuracy_record['acc_mean'] = accuracy_record.mean(axis=1).round(2)
# Extra 'avg' row: per-column mean across all models (left unrounded).
accuracy_record.loc['avg'] = accuracy_record.mean()

F1_record = pd.DataFrame(
    {
        'base_dropna_f1': base_dropna_f1,
        'base_fillna_f1': base_fillna_f1,
        'feature_dropna_f1': feature_dropna_f1,
        'feature_fillna_f1': feature_fillna_f1,
    },
    index=model_index,
)
# Per-model mean across the four dataset variants.
F1_record['F1_mean'] = F1_record.mean(axis=1).round(2)
# Extra 'avg' row: per-column mean across all models (left unrounded).
F1_record.loc['avg'] = F1_record.mean()

In [73]:
accuracy_record.head(10)

Unnamed: 0_level_0,base_dropna_acc,base_fillna_acc,feature_dropna_acc,feature_fillna_acc,acc_mean
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
KNN,66.41,66.78,66.41,66.78,66.6
LR,59.58,59.52,59.58,59.52,59.55
DT,88.11,87.83,71.28,71.45,79.67
RF,90.04,89.7,79.2,79.09,84.51
GBM,89.96,89.6,77.5,77.18,83.56
Ada Boost,88.21,87.71,71.51,71.56,79.75
avg,80.385,80.19,70.913333,70.93,75.606667


In [74]:
F1_record.head(10)

Unnamed: 0_level_0,base_dropna_f1,base_fillna_f1,feature_dropna_f1,feature_fillna_f1,F1_mean
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
KNN,66.4,66.76,66.4,66.76,66.58
LR,57.77,58.07,57.77,58.07,57.92
DT,88.12,87.83,71.28,71.45,79.67
RF,90.04,89.69,79.17,79.04,84.49
GBM,89.96,89.6,77.5,77.17,83.56
Ada Boost,88.21,87.71,71.51,71.56,79.75
avg,80.083333,79.943333,70.605,70.675,75.328333
