# CS 5228


In [32]:
# All Imports
from utils import *
import pandas as pd
import locale
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import preprocessing

# model training
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# model evaluation
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score

# classifiers
from sklearn.naive_bayes import GaussianNB # naive bayes
from sklearn.neighbors import KNeighborsClassifier # KNN
from sklearn.linear_model import LogisticRegression # logistic regression
from sklearn.tree import DecisionTreeClassifier # decision Tree
from sklearn.ensemble import RandomForestClassifier 
from sklearn.ensemble import *

# ignore warnings
import warnings
warnings.filterwarnings('ignore')
locale.setlocale(locale.LC_ALL,'')
pd.set_option('display.max_columns', None)

In [26]:
# Process Training Data
# drop_columns = ['CreateJob','RetainedJob','City','Name','Zip','BankState']

# drop_columns = ['CreateJob','ApprovalFY','ApprovalDate','DisbursementGross','SBA_Appv']


# drop_columns = []

le = generate_labels()

base_dropna = get_data(le=le,type='train', dropna=True, get_dummy=True, feature_split=False, values_only=True,drop_columns=drop_columns)
base_fillna = get_data(le=le,type='train', dropna=False, get_dummy=True, feature_split=False, values_only=True,drop_columns=drop_columns)
feature_dropna = get_data(le=le,type='train', dropna=True, get_dummy=True, feature_split=True, values_only=True,drop_columns=drop_columns)
feature_fillna = get_data(le=le,type='train', dropna=False, get_dummy=True, feature_split=True, values_only=True,drop_columns=drop_columns)

In [3]:
base_dropna.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 49808 entries, 0 to 49999
Data columns (total 28 columns):
Name                 49808 non-null int64
City                 49808 non-null int64
State                49808 non-null int64
Zip                  49808 non-null int32
Bank                 49808 non-null int64
BankState            49808 non-null int64
NAICS                49808 non-null int32
ApprovalDate         49808 non-null int64
ApprovalFY           49808 non-null int16
Term                 49808 non-null int64
NoEmp                49808 non-null int64
CreateJob            49808 non-null int64
RetainedJob          49808 non-null int64
FranchiseCode        49808 non-null int32
DisbursementDate     49808 non-null int64
DisbursementGross    49808 non-null float32
GrAppv               49808 non-null float32
SBA_Appv             49808 non-null float32
ChargeOff            49808 non-null int64
NewExist_1           49808 non-null uint8
NewExist_2           49808 non-null uint8
Urb

In [27]:
# Process Test Data
# feature_test = get_data(le=le,type='test', dropna=False, get_dummy=True, feature_split=True, values_only=True,drop_columns=drop_columns)
base_test = get_data(le=le,type='test', dropna=False, get_dummy=True, feature_split=False, values_only=True,drop_columns=drop_columns)

In [None]:
base_dropna.info()

In [None]:
feature_test.describe(include='all')

In [None]:
model_names = ['KNN', 'LR', 'DT', 'RF', 'GBM']
base_dropna_f1 = []
base_dropna_acc = []
base_fillna_f1 = []
base_fillna_acc = []
feature_dropna_f1 = []
feature_dropna_acc = []
feature_fillna_f1 = []
feature_fillna_acc = []

def calculate_acc_and_f1(classifier, x_train, y_train, x_test, y_test):
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    f1 = round(f1_score(y_test, y_pred, average='weighted') * 100, 2)
    acc = round(accuracy_score(y_test, y_pred) * 100, 2)
    return f1, acc

    
def train_single_classifier(classifier, df_in, f1_list, acc_list):
    df_x = df_in.drop(columns='ChargeOff')
    df_y = df_in['ChargeOff']
    x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size = 0.25, random_state=0)
    f1, acc = calculate_acc_and_f1(classifier, x_train, y_train, x_test, y_test)
    f1_list.append(f1)
    acc_list.append(acc)
    

def train_model(df_in, f1_list, acc_list):
    train_single_classifier(KNeighborsClassifier(), df_in, f1_list, acc_list)
    train_single_classifier(LogisticRegression(), df_in, f1_list, acc_list)
    train_single_classifier(DecisionTreeClassifier(), df_in, f1_list, acc_list)
    train_single_classifier(RandomForestClassifier(), df_in, f1_list, acc_list)
    train_single_classifier(GradientBoostingClassifier(), df_in, f1_list, acc_list)
    

train_model(base_dropna, base_dropna_f1, base_dropna_acc)
train_model(base_fillna, base_fillna_f1, base_fillna_acc)
train_model(feature_dropna, feature_dropna_f1, feature_dropna_acc)
train_model(feature_fillna, feature_fillna_f1, feature_fillna_acc)


In [None]:
accuracy_record = pd.DataFrame({'Model': model_names, 'base_dropna_acc': base_dropna_acc, 'base_fillna_acc': base_fillna_acc, 'feature_dropna_acc': feature_dropna_acc, 'feature_fillna_acc': feature_fillna_acc})
# accuracy_record = pd.DataFrame({'Model': model_names, 'base_dropna_acc': base_dropna_acc, 'feature_dropna_acc': feature_dropna_acc})
accuracy_record['acc_mean'] = accuracy_record.mean(axis=1).round(2)
accuracy_record.set_index('Model', inplace=True)
accuracy_record.loc['avg'] = accuracy_record.mean()

F1_record = pd.DataFrame({'Model': model_names, 'base_dropna_f1': base_dropna_f1, 'base_fillna_f1': base_fillna_f1, 'feature_dropna_f1': feature_dropna_f1, 'feature_fillna_f1': feature_fillna_f1})
# F1_record = pd.DataFrame({'Model': model_names, 'base_dropna_f1': base_dropna_f1, 'feature_dropna_f1': feature_dropna_f1})
F1_record['F1_mean'] = F1_record.mean(axis=1).round(2)
F1_record.set_index('Model', inplace=True)
F1_record.loc['avg'] = F1_record.mean()

print(accuracy_record)
print('\n')
print(F1_record)

In [None]:
model = GradientBoostingClassifier()
base_dropna_x = base_dropna.drop(columns='ChargeOff')
base_dropna_y = base_dropna['ChargeOff']
model.fit(base_dropna_x, base_dropna_y)
test_pred = model.predict(base_test)
pd.DataFrame(test_pred).to_csv('y_pred.csv',header=['ChargeOff'],index_label="Id")



In [33]:
x_train, x_test, y_train, y_test = train_test_split(base_dropna_x, base_dropna_y, test_size = 0.25, random_state=0)


clf = GradientBoostingClassifier()
# c = np.append(np.logspace(0, 4, 20),[0.001,.009,0.01,.09,1,5,10,25,100])
# param_grid = {'loss': ['deviance', 'exponential'],
#               'learning_rate': [0.001,0.05,0.1,0.2,0.5],
#               'n_estimators':[50,100,200,500,1000],
#               'subsample':[0.9,1],
#               'criterion':['friedman_mse', 'mse', 'mae'],
#               'min_samples_split':[2,5,10]
#              }

param_grid = {'loss': ['deviance', 'exponential'],
              'learning_rate': [0.05,0.1,0.3],
              'n_estimators':[50,100,150],
              'subsample':[0.9,1],
              'criterion':['friedman_mse', 'mse', 'mae'],
#               'min_samples_split':[2,5],
#               'max_depth':[3,5,7],
#               'max_features':['sqrt','log2', None],
              
             }
print('param_grid: \n',param_grid)

lr_cv = GridSearchCV(clf, param_grid,scoring = 'accuracy',verbose=10,n_jobs=-1)
lr_cv.fit(x_train, y_train)

#Predict values based on new parameters
# y_pred_acc = lr_cv.predict(x_test)

print("Best Parameters",lr_cv.best_params_)
print("Best Accuracy :",lr_cv.best_score_)

y_pred= lr_cv.predict(x_test)
print("Accuracy: ",round(accuracy_score(y_test, y_pred) * 100, 2))
print('Weighted F1 Mesure: ',round(f1_score(y_test, y_pred, average='weighted') * 100, 2))



param_grid: 
 {'loss': ['deviance', 'exponential'], 'learning_rate': [0.05, 0.1, 0.3], 'n_estimators': [50, 100, 150], 'subsample': [0.9, 1], 'criterion': ['friedman_mse', 'mse', 'mae']}
Fitting 5 folds for each of 108 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:    4.8s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:    5.1s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   12.1s


KeyboardInterrupt: 

In [None]:
test_pred = lr_cv.predict(base_test)
pd.DataFrame(test_pred).to_csv('y_pred_grid_search.csv',header=['ChargeOff'],index_label="Id")

In [28]:
# Normalize + K-fold
base_dropna_x = base_dropna.drop(columns='ChargeOff')
base_dropna_y = base_dropna['ChargeOff']

min_max_scaler_x = preprocessing.MinMaxScaler()

min_max_scaler_x.fit(base_dropna_x)


base_dropna_x_scaled = min_max_scaler_x.transform(base_dropna_x)
base_dropna_x_normalized = pd.DataFrame(base_dropna_x_scaled)



base_dropna_x_normalized.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
0,0.645099,0.714894,0.18,0.330273,0.858709,0.156863,0.360423,0.321782,0.0025,0.0,0.0,0.91194,0.399976,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
1,0.498425,0.778173,0.88,0.840948,1.0,0.882353,0.254401,0.081683,0.0006,0.000341,0.0,0.906513,0.00336,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
2,0.288509,0.489379,0.08,0.900229,0.077249,0.529412,0.874273,0.148515,0.0001,0.000114,0.0,0.741188,0.00336,1.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0
3,0.76694,0.485436,0.96,0.535555,0.064291,0.960784,0.0,0.39604,0.0001,0.0,0.0,0.561105,0.027361,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0
4,0.028081,0.470936,0.48,0.655367,0.232993,0.470588,0.669321,0.25,0.0002,0.000227,0.0,0.820954,0.00496,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
5,0.200377,0.66726,0.1,0.811478,0.431597,0.098039,0.479798,0.673267,0.0003,0.000227,0.0,0.869252,0.099964,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
6,0.637809,0.466675,0.66,0.891169,0.974084,0.823529,0.874143,0.742574,0.0005,0.000568,0.0,0.744908,0.054962,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
7,0.780104,0.256042,0.4,0.210432,0.527037,0.392157,0.456773,0.262376,0.0001,0.000114,0.0,0.906513,0.014961,0.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0
8,0.587171,0.161346,0.08,0.945189,0.077249,0.666667,0.669321,0.742574,0.0005,0.000568,0.0,0.68179,0.139826,1.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0
9,0.422661,0.028937,0.7,0.453045,0.154996,0.078431,0.521615,0.024752,0.0015,0.001705,0.0,0.839615,0.019961,1.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0


In [42]:


# x_train, x_test, y_train, y_test = train_test_split(base_dropna_x_normalized, base_dropna_y, test_size = 0.25, random_state=0)

# Train Model
clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=3))
param_grid = {'learning_rate': [0.5],
#               'loss': ['exponential'],
#               'max_depth':[8],
#               'max_features':[None],
              'n_estimators':[50,100,200,300],
#               'min_samples_split':[2],
              
             }

# param_grid ={}
model = GridSearchCV(clf, param_grid,scoring = 'accuracy', cv=10,n_jobs=-1)
model.fit(base_dropna_x_normalized, base_dropna_y)

# # Validation
# test_pred = model.predict(x_test)
# f1 = round(f1_score(y_test, test_pred, average='weighted') * 100, 2)
# acc = round(accuracy_score(y_test, test_pred) * 100, 2)
# print("f1=", f1, "acc=", acc)

print("Best Accuracy :",model.best_score_)
print("Best Parameters",model.best_params_)


# Prediction
x_scaled = min_max_scaler_x.transform(base_test)
test_normalized = pd.DataFrame(x_scaled)

test_pred = model.predict(test_normalized)
pd.DataFrame(test_pred).to_csv('y_pred.csv',header=['ChargeOff'],index_label="Id")

Best Accuracy : 0.9231450475662941
Best Parameters {'learning_rate': 0.5, 'n_estimators': 300}


Best Parameters {'learning_rate': 0.5, 'loss': 'exponential', 'max_depth': 8, 'max_features': None, 'min_samples_split': 4, 'n_estimators': 310}


In [6]:
full_x.shape

(149808, 27)