In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 1000)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [2]:
# Load the cleaned dataset and keep `df` untouched by working on a copy.
df = pd.read_csv('../data/clean_df.csv')
df_copy = df.copy()

# Features are every column except the target and the Client_ID column
# (presumably a row identifier with no predictive meaning -- confirm).
X = df_copy.drop(columns=['NEXT_MONTH_DEFAULT', 'Client_ID'])
y = df_copy["NEXT_MONTH_DEFAULT"]

# 70% training / 30% hold-out split.
# - random_state pins the shuffle so the metrics reported below are
#   reproducible under Restart & Run All.
# - stratify=y keeps the default / non-default ratio identical in both
#   halves, which matters because the target classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

In [3]:
# Fit a random forest on the training split and evaluate it on the hold-out split.
# - min_samples_leaf=25: every leaf must contain at least 25 training samples.
# - bootstrap=False: each tree is built on the whole training set rather than
#   on a bootstrap resample.
# - random_state=0: makes the fitted forest reproducible for a given split.
clf = RandomForestClassifier(random_state=0, min_samples_leaf=25, bootstrap=False)
clf.fit(X_train, y_train)

# Hard class predictions for the hold-out rows.
y_pred = clf.predict(X_test)

# Accuracy = share of hold-out rows classified correctly. Computed once and
# reused for the printout.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Confusion matrix: rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# Impurity-based feature importances, labelled with the feature names and
# sorted from most to least important.
feature_imp = pd.Series(clf.feature_importances_, index=X.columns).sort_values(ascending=False)
print(feature_imp)

Accuracy: 0.8121805792163543
[[5204  257]
 [1066  517]]
PAY_JULY            0.229366
Pay_Time            0.154885
PAY_AUG             0.101662
PAY_SEP             0.060055
PAY_NOV             0.038364
PAY_OCT             0.036622
PAID_AMT_JULY       0.036289
DUE_AMT_JULY        0.034746
PAID_AMT_AUG        0.030344
PAID_AMT_SEP        0.029080
PAY_DEC             0.028444
PAID_AMT_OCT        0.025748
DUE_AMT_AUG         0.024715
Balance_Limit_V1    0.022942
DUE_AMT_SEP         0.022817
DUE_AMT_NOV         0.021923
DUE_AMT_OCT         0.021870
PAID_AMT_DEC        0.021023
DUE_AMT_DEC         0.019869
PAID_AMT_NOV        0.018362
EDUCATION_STATUS    0.005744
MARITAL_STATUS      0.004474
Gender              0.004472
AGE                 0.004321
NO_PAY_DEC          0.001865
dtype: float64


In [11]:
# Score the competition test file with the model and write the submission CSV.
Test = pd.read_csv('TestData.csv')

# Build the feature matrix by selecting the training feature columns by name,
# in training order. This drops Client_ID (it is not in X), ignores any extra
# columns, and raises a KeyError naming the column if the test file is missing
# a feature, instead of relying on the file's column layout matching by chance.
Test_copy = Test[X.columns].copy()

# Refit the same estimator (same hyper-parameters as above) on ALL labelled
# rows so the final model also learns from the former hold-out split.
# NOTE(review): this refits `clf` in place, so `clf.feature_importances_` read
# after this cell describes the full-data model, not the one evaluated earlier.
clf.fit(X, y)

# Predicted class for every test row (overwrites the hold-out `y_pred`).
y_pred = clf.predict(Test_copy)

# Pair each client id with its prediction; both follow the row order of `Test`.
submission = pd.DataFrame({'Client_ID': Test['Client_ID'], 'NEXT_MONTH_DEFAULT': y_pred})

# Write the upload file without the DataFrame index column.
filename = 'DataStorm Predictions 1.csv'
submission.to_csv(filename, index=False)

print('Saved file: ' + filename)

Saved file: DataStorm Predictions 1.csv
