In [16]:
import pandas as pd
import numpy as np
pd.set_option('display.max_rows', 1000)
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import f1_score

from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from imblearn.combine import SMOTETomek
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

In [10]:
#importing data
df = pd.read_csv('TrainData.csv')

#making copy to work with
df_copy = df.copy()

X = df_copy.drop(['NEXT_MONTH_DEFAULT','Client_ID'], axis=1)
y = df_copy["NEXT_MONTH_DEFAULT"]

#adjusting sampling 
sme = SMOTEENN(random_state=42)
X, y = sme.fit_resample(X, y)

#splitting training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3) # 70% training and 30% test

In [18]:
#clf=RandomForestClassifier(min_samples_leaf = 25, bootstrap = False)
clf=RandomForestClassifier(random_state=0, bootstrap = False)

#Train the model using the training sets
clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)

# Model Accuracy, how often is the classifier correct?
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("F1:",f1_score(y_test, y_pred, zero_division=1))

cm = confusion_matrix(y_test, y_pred)
print(cm)

feature_imp = pd.Series(clf.feature_importances_,index=X.columns).sort_values(ascending=False)
print(feature_imp)

Accuracy: 0.912817375344142
F1: 0.9306231742940603
[[2145  325]
 [ 245 3823]]
PAY_JULY            0.067006
DUE_AMT_JULY        0.064324
PAID_AMT_JULY       0.063248
PAY_AUG             0.060174
Balance_Limit_V1    0.057685
PAID_AMT_AUG        0.052111
DUE_AMT_AUG         0.047138
PAID_AMT_OCT        0.044643
MARITAL_STATUS      0.044204
PAID_AMT_DEC        0.043902
DUE_AMT_SEP         0.043561
PAID_AMT_SEP        0.043021
DUE_AMT_OCT         0.041353
DUE_AMT_NOV         0.040978
DUE_AMT_DEC         0.039998
PAID_AMT_NOV        0.038779
PAY_SEP             0.038396
Pay_Time            0.036881
Gender              0.026435
AGE                 0.024100
PAY_OCT             0.023162
PAY_NOV             0.019134
EDUCATION_STATUS    0.018079
PAY_DEC             0.017396
NO_PAY_DEC          0.004291
dtype: float64


In [14]:
#importing test data
Test = pd.read_csv('TestData.csv')

#making copy to work with
Test_copy = Test.copy()

Test_copy= Test_copy.drop(['Client_ID'], axis=1)

#Train the model using the complete training set using the model parameters defined above
clf.fit(X, y)

#predicting values for test set
y_pred=clf.predict(Test_copy)

#Create a  DataFrame with the client ids and our prediction 
submission = pd.DataFrame({'Client_ID':Test['Client_ID'],'NEXT_MONTH_DEFAULT':y_pred})

#Convert DataFrame to a csv file that can be uploaded
filename = 'DataStorm Predictions 1.csv'

submission.to_csv(filename,index=False)

print('Saved file: ' + filename)

Saved file: DataStorm Predictions 1.csv
