In [41]:
#import package
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [54]:
#load data
hmeq = pd.read_csv("hmeq-clean.csv")
hmeq.head()

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
0,1,1700,30548,40320,HomeImp,Other,9,0,0,101.466002,1,8,37.113614
1,1,1800,28502,43034,HomeImp,Other,11,0,0,88.76603,0,8,36.884894
2,0,2300,102370,120953,HomeImp,Office,2,0,0,90.992533,0,13,31.588503
3,1,2400,34863,47471,HomeImp,Mgr,12,0,0,70.49108,1,21,38.263601
4,0,2400,98449,117195,HomeImp,Office,4,0,0,93.811775,0,13,29.681827


In [61]:
#transfomr data string to numpy integer
le = LabelEncoder()
hmeq["REASON"] = le.fit_transform(hmeq["REASON"])
hmeq["JOB"] =  le.fit_transform(hmeq["JOB"])
hmeq.head()

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
0,1,1700,30548,40320,1,2,9,0,0,101.466002,1,8,37.113614
1,1,1800,28502,43034,1,2,11,0,0,88.76603,0,8,36.884894
2,0,2300,102370,120953,1,1,2,0,0,90.992533,0,13,31.588503
3,1,2400,34863,47471,1,0,12,0,0,70.49108,1,21,38.263601
4,0,2400,98449,117195,1,1,4,0,0,93.811775,0,13,29.681827


In [62]:
#Divide the data into feature and label
X = hmeq.iloc[:,1:].values #feature
y = hmeq["BAD"].values #Label

In [63]:
#define dictionary of model
models = {
    "knn":KNeighborsClassifier(n_neighbors=1),
    "naive_bayes" : GaussianNB(),
    "logit":LogisticRegression(solver="lbfgs", multi_class="auto"),
    "svm":SVC(kernel="rbf", gamma="auto"),
    "decision_tree":DecisionTreeClassifier(),
    "random_forest":RandomForestClassifier(n_estimators=100),
    "mlp":MLPClassifier()
}

In [64]:
#Split data for training
# Ration 60:40
(X_train, X_test, y_train, y_test ) = train_test_split(X, y, random_state = 3, test_size=0.4)

In [84]:
#Train model using random forest
modelname = "random_forest"
print("[INFO] using '{}' model".format(modelname))
model = models[modelname]
model.fit(X_train, y_train)

[INFO] using 'random_forest' model


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [85]:
#Evaluate model
print("[INFO] evaluating...")
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))
print("Model accuracy : ", accuracy_score(y_test,predictions))

[INFO] evaluating...
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      1226
           1       0.94      0.40      0.56       119

   micro avg       0.94      0.94      0.94      1345
   macro avg       0.94      0.70      0.77      1345
weighted avg       0.94      0.94      0.93      1345

Model accuracy :  0.9449814126394052
