In [1]:
#import package
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

In [2]:
#load data
hmeq = pd.read_csv("hmeq-clean.csv")
hmeq.head()

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
0,1,1700,30548,40320,HomeImp,Other,9,0,0,101.466002,1,8,37.113614
1,1,1800,28502,43034,HomeImp,Other,11,0,0,88.76603,0,8,36.884894
2,0,2300,102370,120953,HomeImp,Office,2,0,0,90.992533,0,13,31.588503
3,1,2400,34863,47471,HomeImp,Mgr,12,0,0,70.49108,1,21,38.263601
4,0,2400,98449,117195,HomeImp,Office,4,0,0,93.811775,0,13,29.681827


In [3]:
#transfomr data string to numpy integer
le = LabelEncoder()
hmeq["REASON"] = le.fit_transform(hmeq["REASON"])
hmeq["JOB"] =  le.fit_transform(hmeq["JOB"])
hmeq.head()

Unnamed: 0,BAD,LOAN,MORTDUE,VALUE,REASON,JOB,YOJ,DEROG,DELINQ,CLAGE,NINQ,CLNO,DEBTINC
0,1,1700,30548,40320,1,2,9,0,0,101.466002,1,8,37.113614
1,1,1800,28502,43034,1,2,11,0,0,88.76603,0,8,36.884894
2,0,2300,102370,120953,1,1,2,0,0,90.992533,0,13,31.588503
3,1,2400,34863,47471,1,0,12,0,0,70.49108,1,21,38.263601
4,0,2400,98449,117195,1,1,4,0,0,93.811775,0,13,29.681827


In [4]:
#Divide the data into feature and label
X = hmeq.iloc[:,1:].values #feature
y = hmeq["BAD"].values #Label

In [5]:
#define dictionary of model
models = {
    "knn":KNeighborsClassifier(n_neighbors=1),
    "naive_bayes" : GaussianNB(),
    "logit":LogisticRegression(solver="lbfgs", multi_class="auto"),
    "svm":SVC(kernel="rbf", gamma="auto"),
    "decision_tree":DecisionTreeClassifier(),
    "random_forest":RandomForestClassifier(n_estimators=100),
    "mlp":MLPClassifier()
}

In [6]:
#Split data for training
# Ration 60:40
(X_train, X_test, y_train, y_test ) = train_test_split(X, y, random_state = 3, test_size=0.4)

In [7]:
#Train model using random forest
modelname = "random_forest"
print("[INFO] using '{}' model".format(modelname))
model = models[modelname]
model.fit(X_train, y_train)

[INFO] using 'random_forest' model


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [8]:
#Evaluate model
print("[INFO] evaluating...")
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))
print("Model accuracy : ", accuracy_score(y_test,predictions))

[INFO] evaluating...
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      1226
           1       0.96      0.39      0.56       119

   micro avg       0.94      0.94      0.94      1345
   macro avg       0.95      0.70      0.77      1345
weighted avg       0.95      0.94      0.93      1345

Model accuracy :  0.9449814126394052


In [9]:
#Extract Features
#Import decomposition
from sklearn import decomposition

In [43]:
#Extract features using pca method
pca = decomposition.PCA(n_components = 6) #make into 6 features
X_r = pca.fit(X).transform(X)

In [44]:
#Split data for training
# Ration 60:40
(X_train, X_test, y_train, y_test ) = train_test_split(X_r, y, random_state = 3, test_size=0.4)

In [55]:
#Train model using random forest
modelname = "random_forest"
print("[INFO] using '{}' model".format(modelname))
model = models[modelname]
model.fit(X_train, y_train)

[INFO] using 'random_forest' model


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [56]:
#Evaluate model
print("[INFO] evaluating PCA Features...")
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))
print("Model accuracy : ", accuracy_score(y_test,predictions))

[INFO] evaluating PCA Features...
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      1226
           1       0.90      0.24      0.37       119

   micro avg       0.93      0.93      0.93      1345
   macro avg       0.92      0.62      0.67      1345
weighted avg       0.93      0.93      0.91      1345

Model accuracy :  0.9301115241635688


In [62]:
#trying LDA for extraction
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
lda = LinearDiscriminantAnalysis(n_components=6)

In [63]:
X_s = lda.fit(X, y).transform(X)

In [64]:
#Split data for training
# Ration 60:40
(X_train, X_test, y_train, y_test ) = train_test_split(X_s, y, random_state = 3, test_size=0.4)

In [75]:
#Train model using random forest
modelname = "mlp"
print("[INFO] using '{}' model".format(modelname))
model = models[modelname]
model.fit(X_train, y_train)

[INFO] using 'mlp' model


MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [76]:
#Evaluate model
print("[INFO] evaluating LDA Features...")
predictions = model.predict(X_test)
print(classification_report(y_test, predictions))
print("Model accuracy : ", accuracy_score(y_test,predictions))

[INFO] evaluating LDA Features...
              precision    recall  f1-score   support

           0       0.93      1.00      0.96      1226
           1       0.82      0.24      0.37       119

   micro avg       0.93      0.93      0.93      1345
   macro avg       0.88      0.62      0.66      1345
weighted avg       0.92      0.93      0.91      1345

Model accuracy :  0.9278810408921933
