# CS 5228


In [1]:
# All Imports
from utils import *
import pandas as pd
import locale
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline

# model training
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# model evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# classifiers
from sklearn.naive_bayes import GaussianNB # naive bayes
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.linear_model import LogisticRegression # logistic regression
from sklearn.tree import DecisionTreeClassifier # decision Tree
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import GradientBoostingClassifier

# ignore warnings
import warnings
# Suppresses every warning category for the rest of the kernel session, so any
# warnings emitted by later cells will not be shown.
warnings.filterwarnings('ignore')
# An empty locale string selects the user's default locale from the environment.
# NOTE(review): presumably required by helpers imported from utils — confirm.
locale.setlocale(locale.LC_ALL,'')
# Never truncate columns when displaying wide DataFrames.
pd.set_option('display.max_columns', None)

In [2]:
# Process Training Data
# Alternative that was tried: drop_columns = ['CreateJob','RetainedJob','City','Name','Zip','BankState']
drop_columns = []

le = generate_labels()

# Keyword arguments shared by all four training variants.
train_options = dict(le=le, type='train', get_dummy=True, values_only=True, drop_columns=drop_columns)

# Variants differ only in two flags:
#   dropna        -> True for *_dropna, False for *_fillna
#                    (presumably missing values are filled when False — see utils.get_data)
#   feature_split -> False for base_*, True for feature_*
base_dropna = get_data(dropna=True, feature_split=False, **train_options)
base_fillna = get_data(dropna=False, feature_split=False, **train_options)
feature_dropna = get_data(dropna=True, feature_split=True, **train_options)
feature_fillna = get_data(dropna=False, feature_split=True, **train_options)

In [3]:
# Summarise column names, non-null counts and dtypes of the base_dropna training frame.
base_dropna.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49808 entries, 0 to 49999
Data columns (total 28 columns):
Name                 49808 non-null int64
City                 49808 non-null int64
State                49808 non-null int64
Zip                  49808 non-null int32
Bank                 49808 non-null int64
BankState            49808 non-null int64
NAICS                49808 non-null int32
ApprovalDate         49808 non-null int64
ApprovalFY           49808 non-null int16
Term                 49808 non-null int64
NoEmp                49808 non-null int64
CreateJob            49808 non-null int64
RetainedJob          49808 non-null int64
FranchiseCode        49808 non-null int32
DisbursementDate     49808 non-null int64
DisbursementGross    49808 non-null float32
GrAppv               49808 non-null float32
SBA_Appv             49808 non-null float32
ChargeOff            49808 non-null int64
NewExist_1           49808 non-null uint8
NewExist_2           49808 non-null uint8
Urb

In [4]:
# Process Test Data
# Both test frames reuse the label encoders (le) created for the training data
# and are loaded with dropna=False; they differ only in feature_split.
test_options = dict(le=le, type='test', dropna=False, get_dummy=True, values_only=True, drop_columns=drop_columns)

feature_test = get_data(feature_split=True, **test_options)
base_test = get_data(feature_split=False, **test_options)

In [5]:
# NOTE(review): this repeats the base_dropna.info() call from cell 3 and yields
# the same output; since it follows the test-data cell it was presumably meant
# to inspect base_test or feature_test instead — confirm.
base_dropna.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49808 entries, 0 to 49999
Data columns (total 28 columns):
Name                 49808 non-null int64
City                 49808 non-null int64
State                49808 non-null int64
Zip                  49808 non-null int32
Bank                 49808 non-null int64
BankState            49808 non-null int64
NAICS                49808 non-null int32
ApprovalDate         49808 non-null int64
ApprovalFY           49808 non-null int16
Term                 49808 non-null int64
NoEmp                49808 non-null int64
CreateJob            49808 non-null int64
RetainedJob          49808 non-null int64
FranchiseCode        49808 non-null int32
DisbursementDate     49808 non-null int64
DisbursementGross    49808 non-null float32
GrAppv               49808 non-null float32
SBA_Appv             49808 non-null float32
ChargeOff            49808 non-null int64
NewExist_1           49808 non-null uint8
NewExist_2           49808 non-null uint8
Urb

In [6]:
# Summary statistics for every column of the feature-split test frame
# (include='all' asks pandas to describe all columns, not only numeric ones).
feature_test.describe(include='all')

Unnamed: 0,Name,City,State,Zip,Bank,BankState,NAICS,ApprovalDate,ApprovalFY,CreateJob,RetainedJob,FranchiseCode,DisbursementDate,DisbursementGross,GrAppv,SBA_Appv,NewExist_1,NewExist_2,UrbanRural_0,UrbanRural_1,UrbanRural_2,RevLineCr_N,RevLineCr_Y,LowDoc_N,LowDoc_Y,NoEmp_Micro,NoEmp_Small,NoEmp_Medium,NoEmp_Large,Term_Short,Term_Intermediate,Term_Long,Term_Extra Long
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,71715.68828,7599.84097,24.31122,53789.02551,1872.72635,26.45968,421790.95676,1023636000.0,2002.42659,11.16305,13.67696,2595.41773,1033001000.0,175057.9,164705.1,124496.9,0.70894,0.2899,0.27528,0.60408,0.12064,0.7376,0.2624,0.89246,0.09944,0.76272,0.21032,0.02585,0.00111,0.05224,0.12843,0.78692,0.03241
std,41341.874871,4465.893862,15.134917,30960.871596,1325.081943,15.146882,250716.950871,187730100.0,6.208961,285.11886,285.472884,12362.965686,186578200.0,269164.2,265248.0,209626.2,0.454253,0.453718,0.446657,0.48905,0.32571,0.439941,0.439941,0.3098,0.299253,0.425417,0.407538,0.158688,0.033298,0.222512,0.33457,0.409486,0.177087
min,2.0,0.0,0.0,0.0,0.0,0.0,0.0,-59788800.0,1968.0,0.0,0.0,0.0,-56419200.0,0.0,400.0,200.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,35775.5,3661.0,10.0,28273.0,574.0,12.0,238220.0,912556800.0,1999.0,0.0,0.0,0.0,925430400.0,35000.0,25000.0,16000.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
50%,71865.0,7696.0,24.0,55304.0,1892.5,28.0,448210.0,1085443000.0,2004.0,0.0,1.0,0.0,1093910000.0,81000.0,65000.0,43000.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,107419.25,11526.0,37.0,83814.0,3017.0,41.0,561790.0,1159942000.0,2007.0,1.0,4.0,0.0,1164845000.0,194000.0,175000.0,133000.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,143193.0,15726.0,51.0,99999.0,4015.0,52.0,928120.0,1399853000.0,2017.0,8800.0,8800.0,91350.0,1483229000.0,8995000.0,5000000.0,4500000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
# Row labels for the result tables; train_model fits the classifiers in this order.
model_names = ['KNN', 'LR', 'DT', 'RF', 'GBM']

# One (weighted F1, accuracy) pair of accumulator lists per dataset variant.
# Re-created on every run of this cell so repeated execution does not stack results.
base_dropna_f1, base_dropna_acc = [], []
base_fillna_f1, base_fillna_acc = [], []
feature_dropna_f1, feature_dropna_acc = [], []
feature_fillna_f1, feature_fillna_acc = [], []

def calculate_acc_and_f1(classifier, x_train, y_train, x_test, y_test):
    """Fit ``classifier`` on the training split and score it on the test split.

    Parameters
    ----------
    classifier : estimator exposing ``fit(X, y)`` and ``predict(X)``.
    x_train, y_train : features and labels used for fitting.
    x_test, y_test : features and labels used for scoring.

    Returns
    -------
    tuple
        ``(f1, acc)``: the weighted F1 score and the accuracy on the test
        split, each expressed as a percentage rounded to two decimals.
    """
    classifier.fit(x_train, y_train)
    predicted = classifier.predict(x_test)

    weighted_f1 = f1_score(y_test, predicted, average='weighted')
    accuracy = accuracy_score(y_test, predicted)
    return round(weighted_f1 * 100, 2), round(accuracy * 100, 2)

    
def train_single_classifier(classifier, df_in, f1_list, acc_list):
    """Evaluate one classifier on a 75/25 split of ``df_in`` and record its scores.

    Parameters
    ----------
    classifier : estimator passed on to ``calculate_acc_and_f1``.
    df_in : DataFrame containing the feature columns plus the ``ChargeOff`` target.
    f1_list : list that receives the weighted F1 percentage (appended in place).
    acc_list : list that receives the accuracy percentage (appended in place).
    """
    target = df_in['ChargeOff']
    features = df_in.drop(columns='ChargeOff')

    # random_state=0 keeps the split identical for every classifier and dataset variant.
    x_train, x_test, y_train, y_test = train_test_split(
        features, target, test_size=0.25, random_state=0
    )

    weighted_f1, accuracy = calculate_acc_and_f1(classifier, x_train, y_train, x_test, y_test)
    f1_list.append(weighted_f1)
    acc_list.append(accuracy)
    

def train_model(df_in, f1_list, acc_list):
    """Evaluate the five baseline classifiers on ``df_in``.

    The classifiers are fitted in the order KNN, logistic regression,
    decision tree, random forest, gradient boosting. That order must stay
    aligned with ``model_names``, because the appended scores are later
    labelled positionally.

    Parameters
    ----------
    df_in : DataFrame containing the feature columns plus the ``ChargeOff`` target.
    f1_list : list extended in place with one weighted F1 percentage per classifier.
    acc_list : list extended in place with one accuracy percentage per classifier.
    """
    # Estimators with internal randomness get a fixed seed (the same value used
    # for the train/test split) so the comparison table is reproducible across
    # kernel restarts. KNeighborsClassifier has no random_state parameter.
    classifiers = (
        KNeighborsClassifier(),
        LogisticRegression(random_state=0),
        DecisionTreeClassifier(random_state=0),
        RandomForestClassifier(random_state=0),
        GradientBoostingClassifier(random_state=0),
    )
    for classifier in classifiers:
        train_single_classifier(classifier, df_in, f1_list, acc_list)
    

# Run the baseline comparison on every dataset variant; each variant writes
# into its own pair of score lists.
for experiment_df, experiment_f1, experiment_acc in (
    (base_dropna, base_dropna_f1, base_dropna_acc),
    (base_fillna, base_fillna_f1, base_fillna_acc),
    (feature_dropna, feature_dropna_f1, feature_dropna_acc),
    (feature_fillna, feature_fillna_f1, feature_fillna_acc),
):
    train_model(experiment_df, experiment_f1, experiment_acc)


In [8]:
# Build both tables with the model names as the index from the start, so the
# row-wise mean only sees numeric score columns. Calling mean(axis=1) while the
# string 'Model' column is still a regular column relies on pandas silently
# dropping non-numeric columns, which raises TypeError on pandas >= 2.0.
# To compare only the dropna variants, omit the *_fillna_* columns below.
model_index = pd.Index(model_names, name='Model')

accuracy_record = pd.DataFrame(
    {
        'base_dropna_acc': base_dropna_acc,
        'base_fillna_acc': base_fillna_acc,
        'feature_dropna_acc': feature_dropna_acc,
        'feature_fillna_acc': feature_fillna_acc,
    },
    index=model_index,
)
# Per-model mean across the dataset variants.
accuracy_record['acc_mean'] = accuracy_record.mean(axis=1).round(2)
# Per-column mean across the models (also averages the acc_mean column).
accuracy_record.loc['avg'] = accuracy_record.mean()

F1_record = pd.DataFrame(
    {
        'base_dropna_f1': base_dropna_f1,
        'base_fillna_f1': base_fillna_f1,
        'feature_dropna_f1': feature_dropna_f1,
        'feature_fillna_f1': feature_fillna_f1,
    },
    index=model_index,
)
# Per-model mean across the dataset variants.
F1_record['F1_mean'] = F1_record.mean(axis=1).round(2)
# Per-column mean across the models (also averages the F1_mean column).
F1_record.loc['avg'] = F1_record.mean()

print(accuracy_record)
print('\n')
print(F1_record)

       base_dropna_acc  base_fillna_acc  feature_dropna_acc  \
Model                                                         
KNN              68.50           68.460              68.500   
LR               64.68           63.790              64.680   
DT               87.89           87.250              71.710   
RF               89.13           88.390              77.340   
GBM              90.10           89.600              77.690   
avg              80.06           79.498              71.984   

       feature_fillna_acc  acc_mean  
Model                                
KNN                68.460    68.480  
LR                 63.790    64.240  
DT                 71.940    79.700  
RF                 77.030    82.970  
GBM                77.870    83.820  
avg                71.818    75.842  


       base_dropna_f1  base_fillna_f1  feature_dropna_f1  feature_fillna_f1  \
Model                                                                         
KNN            68.490          

In [10]:
# Fit the submission model on the complete base_dropna training frame (no
# hold-out) and write its test-set predictions to y_pred.csv.
# A fixed random_state makes the fitted model, and therefore the submission
# file, reproducible across kernel restarts.
model = GradientBoostingClassifier(random_state=0)
base_dropna_x = base_dropna.drop(columns='ChargeOff')
base_dropna_y = base_dropna['ChargeOff']
model.fit(base_dropna_x, base_dropna_y)

# NOTE(review): assumes base_test has exactly the feature columns of
# base_dropna_x, in the same order — confirm against utils.get_data.
test_pred = model.predict(base_test)

# One 'ChargeOff' column; the positional row index is written as 'Id'.
pd.DataFrame(test_pred).to_csv('y_pred.csv',header=['ChargeOff'],index_label="Id")



In [None]:
# Hyper-parameter search for gradient boosting on a 75/25 split of base_dropna.
x_train, x_test, y_train, y_test = train_test_split(base_dropna_x, base_dropna_y, test_size = 0.25, random_state=0)

# Fixed seed: with subsample < 1 each boosting stage draws a random subset of
# rows, so an unseeded estimator makes candidate scores vary between runs.
clf = GradientBoostingClassifier(random_state=0)

# Wider grid considered earlier (kept for reference, not used):
# c = np.append(np.logspace(0, 4, 20),[0.001,.009,0.01,.09,1,5,10,25,100])
# param_grid = {'loss': ['deviance', 'exponential'],
#               'learning_rate': [0.001,0.05,0.1,0.2,0.5],
#               'n_estimators':[50,100,200,500,1000],
#               'subsample':[0.9,1],
#               'criterion':['friedman_mse', 'mse', 'mae'],
#               'min_samples_split':[2,5,10]
#              }

# NOTE(review): 'deviance', 'mse' and 'mae' are option names from older
# scikit-learn releases; newer releases renamed or removed them — confirm
# against the installed version before re-running.
param_grid = {'loss': ['deviance', 'exponential'],
              'learning_rate': [0.05,0.1,0.3],
              'n_estimators':[50,100,150],
              'subsample':[0.9,1],
              'criterion':['friedman_mse', 'mse', 'mae'],
              # Further candidates, currently disabled:
              # 'min_samples_split':[2,5],
              # 'max_depth':[3,5,7],
              # 'max_features':['sqrt','log2', None],
             }
print('param_grid: \n',param_grid)

# cv=3 is stated explicitly: the recorded run used 3 folds, but the library
# default changed to 5 in later scikit-learn releases.
# The name lr_cv is kept because a later cell predicts with it.
lr_cv = GridSearchCV(clf, param_grid, scoring='accuracy', cv=3, verbose=10, n_jobs=-1)
lr_cv.fit(x_train, y_train)

# best_score_ is the mean cross-validated accuracy of the best candidate.
print("Best Parameters",lr_cv.best_params_)
print("Best Accuracy :",lr_cv.best_score_)

# Score the refitted best estimator on the held-out 25% split.
y_pred= lr_cv.predict(x_test)
print("Accuracy: ",round(accuracy_score(y_test, y_pred) * 100, 2))
print('Weighted F1 Mesure: ',round(f1_score(y_test, y_pred, average='weighted') * 100, 2))



param_grid: 
 {'loss': ['deviance', 'exponential'], 'learning_rate': [0.05, 0.1, 0.3], 'n_estimators': [50, 100, 150], 'subsample': [0.9, 1], 'criterion': ['friedman_mse', 'mse', 'mae']}
Fitting 3 folds for each of 108 candidates, totalling 324 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    5.7s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:    8.2s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   12.7s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:   16.9s
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:   20.3s
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:   26.7s
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:   30.3s
[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed:   35.9s
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:   42.7s
[Parallel(n_jobs=-1)]: Done 121 tasks      | elapsed:   50.0s
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:   54.9s
[Parallel(n_jobs=-1)]: Done 157 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 197 tasks      | elapsed:  

In [None]:
# Predict the test set with the best estimator found by the grid search and
# write a submission file: an 'Id' index column plus a 'ChargeOff' column.
test_pred = lr_cv.predict(base_test)
pd.DataFrame({'ChargeOff': test_pred}).to_csv('y_pred_grid_search.csv', index_label='Id')