In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 1000)
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
# Load the cleaned modelling table.
df = pd.read_csv('../data/clean_df.csv')

# Work on a copy so the frame read from disk stays untouched.
df_copy = df.copy()

# Features: every column except the target and the client identifier.
X = df_copy.drop(columns=['NEXT_MONTH_DEFAULT', 'Client_ID'])
# Target: the NEXT_MONTH_DEFAULT column.
y = df_copy["NEXT_MONTH_DEFAULT"]

# 70% training / 30% test. The split is seeded so that a
# Restart & Run All reproduces the same partition, and therefore the same
# scores, instead of drawing a fresh random split on every execution.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0)

In [4]:
# Candidate models compared on the held-out split.
# `names` and `classifiers` are parallel lists: entry i of one labels entry i
# of the other. GaussianNB and QuadraticDiscriminantAnalysis are listed so that
# the "Naive Bayes" and "QDA" labels have a model behind them; without them
# zip() below would silently stop after the fifth entry.
# Stochastic estimators are seeded so the printed scores are reproducible.
classifiers = [
    KNeighborsClassifier(2),
    DecisionTreeClassifier(max_depth=5, random_state=0),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1,
                           random_state=0),
    MLPClassifier(alpha=1, max_iter=1000, random_state=0),
    AdaBoostClassifier(random_state=0),
    GaussianNB(),
    QuadraticDiscriminantAnalysis()]

names = ["Nearest Neighbors",
         "Decision Tree", "Random Forest", "Neural Net", "AdaBoost",
         "Naive Bayes", "QDA"]

# Fail loudly if the two lists drift apart again instead of letting zip()
# drop the unmatched tail.
assert len(names) == len(classifiers), "names and classifiers are out of sync"

# Fit each model on the training split and print its mean accuracy on the
# test split (what `score` returns for classifiers).
for name, clf in zip(names, classifiers):
    clf.fit(X_train, y_train)
    score = clf.score(X_test, y_test)
    print(name)
    print(score)

Nearest Neighbors
0.7636286201022147
Decision Tree
0.8144520159000568
Random Forest
0.7890403180011357




Neural Net
0.7323963657013061
AdaBoost
0.8164395229982965
Naive Bayes
0.4034639409426462
QDA
0.33588869960249856




In [5]:
# Create an AdaBoostClassifier and fit it on the training split.
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)

# Keep the per-feature importances under a name, labelled by column and
# sorted, so they can be inspected later. A bare `clf.feature_importances_`
# in the middle of a cell is evaluated and then thrown away.
feature_importances = pd.Series(
    clf.feature_importances_, index=X_train.columns
).sort_values(ascending=False)

# Predictions for the held-out split.
y_pred = clf.predict(X_test)

# Model accuracy: fraction of test rows classified correctly.
# Computed once and reused for the printout.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Confusion matrix: rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
print(cm)

Accuracy: 0.816297558205565
[[5245  235]
 [1059  505]]


In [6]:
# Read the file that predictions are produced for.
Test = pd.read_csv('TestData.csv')

# Scoring frame: a copy of the test data without the client identifier.
Test_copy = Test.copy().drop(columns=['Client_ID'])

# Refit the already-configured model on the complete training set (X, y).
# NOTE(review): this refits `clf` in place and rebinds `y_pred`, so after this
# cell neither refers to the held-out evaluation any more.
clf.fit(X, y)

# Predict the target for every row of the test data.
y_pred = clf.predict(Test_copy)

# Pair each client id with its prediction.
submission = pd.DataFrame({
    'Client_ID': Test['Client_ID'],
    'NEXT_MONTH_DEFAULT': y_pred,
})

# Write the predictions to a csv file that can be uploaded.
filename = 'Predictions AdaBooster  0.8162.csv'
submission.to_csv(filename, index=False)

print('Saved file: ' + filename)

Saved file: Predictions AdaBooster  0.8162.csv
