In [1]:
import numpy as np
import pandas as pd

#loading metrics packages
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, accuracy_score
import matplotlib.pyplot as plt
%matplotlib inline

#loading feature selection packages
from sklearn.feature_selection import RFECV

#loading model parameter selection packages
from sklearn.model_selection import validation_curve
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict
from sklearn.model_selection import ShuffleSplit

# loading algorithm packages
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier 
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

In [2]:
# Read the full data set from CSV.
data = pd.read_csv('defaulter_full.csv', sep=',')
# Drop the ID column (a column, not rows); presumably it is only a row
# identifier with no predictive value -- TODO confirm.
data = data.drop('ID', axis=1)

In [3]:
# Random 60/40 train/test split of the full data set. The seed is fixed so the
# split is repeatable; every row not drawn for training lands in the test set.
train = data.sample(frac=0.6, random_state=1)
in_train = data.index.isin(train.index)
test = data.loc[~in_train]

# Every column except the label is used as a feature.
columns = [c for c in data.columns.tolist() if c != "default.payment.next.month"]

def_n_train_X, def_n_test_X = train[columns], test[columns]
def_n_train_Y = train['default.payment.next.month']
def_n_test_Y = test['default.payment.next.month']

In [4]:
# Rank the features of the randomly sampled training set with recursive
# feature elimination, cross-validated (cv=5) around a logistic regression.
#
# The 5 is RFECV's `step` (how many features are removed per elimination
# round), not a target feature count. It is passed by keyword because recent
# scikit-learn releases reject positional arguments after `estimator`.
estimator = LogisticRegression()
selector = RFECV(estimator, step=5, cv=5)
selector = selector.fit(def_n_train_X, def_n_train_Y)

# Single-argument print() emits the same text on Python 2 and Python 3
# (the doubled space reproduces the separator the print statement inserted).
print("fearure ranking for random sampled data=  " + str(selector.ranking_))

[6 2 3 2 4 1 3 3 3 4 3 5 5 6 6 5 6 4 4 5 4 5 6]


In [5]:
# Build the feature-reduced view of the randomly sampled split.
# NOTE(review): the kept columns are hard-coded here rather than read from
# selector.ranking_ -- confirm they still match the ranking printed above.
columns = [
    c for c in def_n_train_X.columns.tolist()
    if c in ("SEX", "EDUCATION", "MARRIAGE", "AGE",
             "PAY_0", "PAY_2", "PAY_3", "PAY_5", "PAY_4")
]
def_f_n_train_X, def_f_n_test_X = def_n_train_X[columns], def_n_test_X[columns]

# Labels are unchanged by feature reduction; these are aliases.
def_f_n_train_Y, def_f_n_test_Y = def_n_train_Y, def_n_test_Y

In [8]:
# Load the under-sampled train and test sets from their own CSV files.
# This rebinds the global `train` / `test` names used by the random split.
train = pd.read_csv('defaulter_undersample_train.csv', sep=',')
test = pd.read_csv('defaulter_undersample_test.csv', sep=',')

# Every column except the label is used as a feature.
columns = [c for c in train.columns.tolist() if c != "default.payment.next.month"]

def_u_train_X, def_u_test_X = train[columns], test[columns]
def_u_train_Y = train['default.payment.next.month']
def_u_test_Y = test['default.payment.next.month']

In [10]:
# Rank the features of the under-sampled training set with recursive feature
# elimination, cross-validated (cv=5) around a logistic regression.
#
# The 5 is RFECV's `step` (how many features are removed per elimination
# round), not a target feature count. It is passed by keyword because recent
# scikit-learn releases reject positional arguments after `estimator`.
estimator = LogisticRegression()
selector = RFECV(estimator, step=5, cv=5)
selector = selector.fit(def_u_train_X, def_u_train_Y)

# Single-argument print() emits the same text on Python 2 and Python 3
# (the doubled space reproduces the separator the print statement inserted).
print("fearure ranking for under sampled data=  " + str(selector.ranking_))

# Build the feature-reduced view of the under-sampled split.
# NOTE(review): the kept columns are hard-coded rather than read from
# selector.ranking_ -- confirm they still match the ranking printed above.
columns = def_u_train_X.columns.tolist()
columns = [c for c in columns if c in ["SEX","EDUCATION","MARRIAGE",'AGE','PAY_0','PAY_2','PAY_3','PAY_5','PAY_4']]

# Subset the under-sampled frames themselves instead of the mutable globals
# `train` / `test`, which other cells rebind to a different data set.
def_f_u_train_X = def_u_train_X[columns]
def_f_u_test_X = def_u_test_X[columns]

# Labels are unchanged by feature reduction; these are aliases.
def_f_u_train_Y = def_u_train_Y
def_f_u_test_Y = def_u_test_Y

fearure ranking=  [6 4 3 2 4 1 3 2 3 3 3 5 5 6 6 6 5 4 4 5 4 6 5]


In [11]:
# Hyper-parameter grids handed to GridSearchCV, one per model family.

# Decision tree: depth 1..14 and both split criteria.
ID3_params = {'max_depth': list(range(1, 15)), 'criterion': ['gini', 'entropy']}

# Linear SVM: dual=False is required for the l1 penalty with squared hinge loss.
SVM_params = {'loss': ['squared_hinge'], 'penalty': ['l2', 'l1'], 'dual': [False]}

ada_params = {'n_estimators': [20, 40, 50], 'learning_rate': [0.5, 1.0]}

knn_params = {
    'n_neighbors': [5, 10, 15],
    'weights': ['uniform', 'distance'],
    'algorithm': ['ball_tree', 'kd_tree', 'brute'],
}

# max_samples must be the float 1.0 to mean "all samples"; the integer 1 is
# interpreted by BaggingClassifier as "draw exactly one sample per estimator".
bag_params = {
    'n_estimators': [10, 15, 20],
    'bootstrap_features': [True, False],
    'max_features': [0.5, 1.0],
    'max_samples': [1.0, 0.5, 0.75],
}

logit_params = {
    'max_iter': [100, 50, 200],
    'C': [0.25, 0.5, 0.75, 1.0],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag'],
    'warm_start': [True, False],
}

# NOTE(review): xg_params is not referenced by any cell visible in this
# notebook; the XGBoost cells hard-code their settings.
xg_params = {'learning_rate': [0.01, 0.1, 0.3, 0.5], 'n_estimators': [500, 600, 700, 1000]}

In [12]:
def bestparam(model, train_X, train_Y):
    """Fit a parameter-search object and report the winning parameters.

    Parameters
    ----------
    model : object
        Anything exposing ``fit(X, y)`` and, once fitted, a ``best_params_``
        attribute (the callers in this notebook pass a GridSearchCV).
    train_X, train_Y
        Training features and labels, forwarded unchanged to ``model.fit``.

    Returns
    -------
    The fitted model's ``best_params_``.
    """
    model.fit(train_X, train_Y)
    # Single-argument print() produces the same text on Python 2 and Python 3.
    print("Best paramters " + str(model.best_params_))
    print("")
    return model.best_params_

In [16]:
def calcmetrics(predicted, actual, ModelName, scores=None):
    """Print classification metrics for one model and draw its ROC curve.

    Prints a banner, the confusion matrix, the classification report, the
    accuracy and the AUC (both as percentages), then shows a ROC plot.

    Parameters
    ----------
    predicted : array-like
        Hard class labels produced by the model.
    actual : array-like
        Ground-truth labels aligned with ``predicted``.
    ModelName : str
        Text used in the printed banner and in the plot title.
    scores : array-like, optional
        Continuous scores (for example ``predict_proba(X)[:, 1]`` or
        ``decision_function(X)``) from which to build the ROC curve.
        When omitted, ``predicted`` is used, as before; note that a ROC
        built from hard labels has a single operating point, so the
        reported AUC is only a coarse summary in that case.

    Returns
    -------
    None
    """
    # Single-argument print() produces the same text on Python 2 and Python 3.
    print("************************** " + ModelName + " ****************************")
    print("Confusion Matrix")
    print(metrics.confusion_matrix(actual, predicted))
    print("")
    print(metrics.classification_report(actual, predicted))
    print("")
    accuracy_pct = round(accuracy_score(actual, predicted) * 100, 2)
    # Doubled space reproduces the separator the old print statement inserted.
    print("Accuracy =  " + "{0:.2f}".format(accuracy_pct))
    print("")

    roc_input = predicted if scores is None else scores
    false_positive_rate, true_positive_rate, thresholds = roc_curve(actual, roc_input)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    print("AUC = " + "{0:.2f}".format(round(roc_auc * 100, 2)))
    print("")

    plt.title('Receiver Operating Characteristic ' + ModelName)
    plt.plot(false_positive_rate, true_positive_rate, 'b',
             label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    # Dashed diagonal marks the chance-level classifier.
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([-0.1, 1.2])
    plt.ylim([-0.1, 1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

## Building decision tree models from under-sampled and randomly sampled data

In [None]:
# Grid-search a decision tree (cv=5) on the full-feature under-sampled
# training set.
ID3 = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=ID3_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(ID3, def_u_train_X, def_u_train_Y)

# Refit a fresh tree with the winning settings and score it on the test split.
ID3_best = DecisionTreeClassifier(
    criterion=b_param['criterion'],
    max_depth=b_param['max_depth'],
)
ID3_best.fit(def_u_train_X, def_u_train_Y)
ID3_predicted = ID3_best.predict(def_u_test_X)
calcmetrics(
    ID3_predicted,
    def_u_test_Y,
    "Decision Tree - Under sampling with out feature reduction",
)

In [None]:
# Grid-search a decision tree (cv=5) on the feature-reduced under-sampled
# training set.
ID3 = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=ID3_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(ID3, def_f_u_train_X, def_f_u_train_Y)

# Refit a fresh tree with the winning settings and score it on the test split.
ID3_best = DecisionTreeClassifier(
    criterion=b_param['criterion'],
    max_depth=b_param['max_depth'],
)
ID3_best.fit(def_f_u_train_X, def_f_u_train_Y)
ID3_predicted = ID3_best.predict(def_f_u_test_X)
calcmetrics(
    ID3_predicted,
    def_f_u_test_Y,
    "Decision Tree - Under sampling with feature reduction",
)

In [None]:
# Grid-search a decision tree (cv=5) on the full-feature randomly sampled
# training set.
ID3 = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=ID3_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(ID3, def_n_train_X, def_n_train_Y)

# Refit a fresh tree with the winning settings and score it on the test split.
ID3_best = DecisionTreeClassifier(
    criterion=b_param['criterion'],
    max_depth=b_param['max_depth'],
)
ID3_best.fit(def_n_train_X, def_n_train_Y)
ID3_predicted = ID3_best.predict(def_n_test_X)
calcmetrics(
    ID3_predicted,
    def_n_test_Y,
    "Decision Tree - Random sampling with out feature reduction",
)

In [None]:
# Grid-search a decision tree (cv=5) on the feature-reduced randomly sampled
# training set.
ID3 = GridSearchCV(DecisionTreeClassifier(), param_grid=ID3_params, n_jobs=-1, cv=5)
b_param = bestparam(ID3, def_f_n_train_X, def_f_n_train_Y)

# Refit a fresh tree with the winning settings and score it on the test split.
ID3_best = DecisionTreeClassifier(criterion=b_param['criterion'], max_depth=b_param['max_depth'])
ID3_best.fit(def_f_n_train_X, def_f_n_train_Y)
ID3_predicted = ID3_best.predict(def_f_n_test_X)

# The title previously said "Under sampling", but this cell evaluates the
# randomly sampled, feature-reduced data (def_f_n_*).
calcmetrics(ID3_predicted, def_f_n_test_Y, "Decision Tree - Random sampling with feature reduction")

## Building SVM models from under-sampled and randomly sampled data

In [None]:
# Grid-search a linear SVM (cv=5) on the full-feature under-sampled
# training set.
SVM = GridSearchCV(
    estimator=svm.LinearSVC(),
    param_grid=SVM_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(SVM, def_u_train_X, def_u_train_Y)

# Refit a fresh classifier with the winning settings and score it on the test split.
SVM_best = svm.LinearSVC(
    penalty=b_param['penalty'],
    loss=b_param['loss'],
    dual=b_param['dual'],
)
SVM_best.fit(def_u_train_X, def_u_train_Y)
SVM_predicted = SVM_best.predict(def_u_test_X)
calcmetrics(
    SVM_predicted,
    def_u_test_Y,
    "Linear SVM - Under sampling with out feature reduction",
)

In [None]:
# Grid-search a linear SVM (cv=5) on the feature-reduced under-sampled
# training set.
SVM = GridSearchCV(
    estimator=svm.LinearSVC(),
    param_grid=SVM_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(SVM, def_f_u_train_X, def_f_u_train_Y)

# Refit a fresh classifier with the winning settings and score it on the test split.
SVM_best = svm.LinearSVC(
    penalty=b_param['penalty'],
    loss=b_param['loss'],
    dual=b_param['dual'],
)
SVM_best.fit(def_f_u_train_X, def_f_u_train_Y)
SVM_predicted = SVM_best.predict(def_f_u_test_X)
calcmetrics(
    SVM_predicted,
    def_f_u_test_Y,
    "Linear SVM - Under sampling with feature reduction",
)

In [None]:
# Grid-search a linear SVM (cv=5) on the full-feature randomly sampled
# training set.
SVM = GridSearchCV(
    estimator=svm.LinearSVC(),
    param_grid=SVM_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(SVM, def_n_train_X, def_n_train_Y)

# Refit a fresh classifier with the winning settings and score it on the test split.
SVM_best = svm.LinearSVC(
    penalty=b_param['penalty'],
    loss=b_param['loss'],
    dual=b_param['dual'],
)
SVM_best.fit(def_n_train_X, def_n_train_Y)
SVM_predicted = SVM_best.predict(def_n_test_X)
calcmetrics(
    SVM_predicted,
    def_n_test_Y,
    "Linear SVM - Random sampling with out feature reduction ",
)

In [None]:
# Grid-search a linear SVM (cv=5) on the feature-reduced randomly sampled
# training set.
SVM = GridSearchCV(
    estimator=svm.LinearSVC(),
    param_grid=SVM_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(SVM, def_f_n_train_X, def_f_n_train_Y)

# Refit a fresh classifier with the winning settings and score it on the test split.
SVM_best = svm.LinearSVC(
    penalty=b_param['penalty'],
    loss=b_param['loss'],
    dual=b_param['dual'],
)
SVM_best.fit(def_f_n_train_X, def_f_n_train_Y)
SVM_predicted = SVM_best.predict(def_f_n_test_X)
calcmetrics(
    SVM_predicted,
    def_f_n_test_Y,
    "Linear SVM - Random sampling with feature reduction ",
)

## Building KNN models from under-sampled and randomly sampled data

In [None]:
# Grid-search a k-nearest-neighbours classifier (cv=5) on the full-feature
# under-sampled training set.
knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(knn, def_u_train_X, def_u_train_Y)

# Refit a fresh classifier with the winning settings and score it on the test split.
knn_best = KNeighborsClassifier(
    n_neighbors=b_param['n_neighbors'],
    weights=b_param['weights'],
    algorithm=b_param['algorithm'],
)
knn_best.fit(def_u_train_X, def_u_train_Y)
knn_predicted = knn_best.predict(def_u_test_X)
calcmetrics(
    knn_predicted,
    def_u_test_Y,
    "KNN- Under sampling with out feature reduction ",
)

In [None]:
# Grid-search a k-nearest-neighbours classifier (cv=5) on the feature-reduced
# under-sampled training set.
knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(knn, def_f_u_train_X, def_f_u_train_Y)

# Refit a fresh classifier with the winning settings and score it on the test split.
knn_best = KNeighborsClassifier(
    n_neighbors=b_param['n_neighbors'],
    weights=b_param['weights'],
    algorithm=b_param['algorithm'],
)
knn_best.fit(def_f_u_train_X, def_f_u_train_Y)
knn_predicted = knn_best.predict(def_f_u_test_X)
calcmetrics(
    knn_predicted,
    def_f_u_test_Y,
    "KNN- Under sampling with feature reduction ",
)

In [None]:
# Grid-search a k-nearest-neighbours classifier (cv=5) on the full-feature
# randomly sampled training set.
knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(knn, def_n_train_X, def_n_train_Y)

# Refit a fresh classifier with the winning settings and score it on the test split.
knn_best = KNeighborsClassifier(
    n_neighbors=b_param['n_neighbors'],
    weights=b_param['weights'],
    algorithm=b_param['algorithm'],
)
knn_best.fit(def_n_train_X, def_n_train_Y)
knn_predicted = knn_best.predict(def_n_test_X)
calcmetrics(
    knn_predicted,
    def_n_test_Y,
    "KNN- Random sampling with out feature reduction ",
)

In [None]:
# Grid-search a k-nearest-neighbours classifier (cv=5) on the feature-reduced
# randomly sampled training set.
knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(knn, def_f_n_train_X, def_f_n_train_Y)

# Refit a fresh classifier with the winning settings and score it on the test split.
knn_best = KNeighborsClassifier(
    n_neighbors=b_param['n_neighbors'],
    weights=b_param['weights'],
    algorithm=b_param['algorithm'],
)
knn_best.fit(def_f_n_train_X, def_f_n_train_Y)
knn_predicted = knn_best.predict(def_f_n_test_X)
calcmetrics(
    knn_predicted,
    def_f_n_test_Y,
    "KNN- Random sampling with feature reduction ",
)

## Building AdaBoost models from under-sampled and randomly sampled data

In [None]:
# Grid-search AdaBoost (cv=5) on the full-feature under-sampled training set.
# NOTE(review): ID3_best is not defined in this cell; it is the tree left
# behind by whichever decision-tree cell ran last -- confirm that is intended.
ada = GridSearchCV(
    estimator=AdaBoostClassifier(ID3_best),
    param_grid=ada_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(ada, def_u_train_X, def_u_train_Y)

# Refit a fresh ensemble with the winning settings and score it on the test split.
ada_best = AdaBoostClassifier(
    ID3_best,
    n_estimators=b_param['n_estimators'],
    learning_rate=b_param['learning_rate'],
)
ada_best.fit(def_u_train_X, def_u_train_Y)
ada_predicted = ada_best.predict(def_u_test_X)
calcmetrics(
    ada_predicted,
    def_u_test_Y,
    "Adaboost- Under sampling with out feature reduction",
)

In [None]:
# Grid-search AdaBoost (cv=5) on the feature-reduced under-sampled training set.
# NOTE(review): ID3_best is not defined in this cell; it is the tree left
# behind by whichever decision-tree cell ran last -- confirm that is intended.
ada = GridSearchCV(
    estimator=AdaBoostClassifier(ID3_best),
    param_grid=ada_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(ada, def_f_u_train_X, def_f_u_train_Y)

# Refit a fresh ensemble with the winning settings and score it on the test split.
ada_best = AdaBoostClassifier(
    ID3_best,
    n_estimators=b_param['n_estimators'],
    learning_rate=b_param['learning_rate'],
)
ada_best.fit(def_f_u_train_X, def_f_u_train_Y)
ada_predicted = ada_best.predict(def_f_u_test_X)
calcmetrics(
    ada_predicted,
    def_f_u_test_Y,
    "Adaboost- Under sampling with feature reduction",
)

In [None]:
# Grid-search AdaBoost (cv=5) on the full-feature randomly sampled training set.
# NOTE(review): ID3_best is not defined in this cell; it is the tree left
# behind by whichever decision-tree cell ran last -- confirm that is intended.
ada = GridSearchCV(
    estimator=AdaBoostClassifier(ID3_best),
    param_grid=ada_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(ada, def_n_train_X, def_n_train_Y)

# Refit a fresh ensemble with the winning settings and score it on the test split.
ada_best = AdaBoostClassifier(
    ID3_best,
    n_estimators=b_param['n_estimators'],
    learning_rate=b_param['learning_rate'],
)
ada_best.fit(def_n_train_X, def_n_train_Y)
ada_predicted = ada_best.predict(def_n_test_X)
calcmetrics(
    ada_predicted,
    def_n_test_Y,
    "Adaboost- Random sampling with out feature reduction",
)

In [None]:
# Grid-search AdaBoost (cv=5) on the feature-reduced randomly sampled training set.
# NOTE(review): ID3_best is not defined in this cell; it is the tree left
# behind by whichever decision-tree cell ran last -- confirm that is intended.
ada = GridSearchCV(
    estimator=AdaBoostClassifier(ID3_best),
    param_grid=ada_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(ada, def_f_n_train_X, def_f_n_train_Y)

# Refit a fresh ensemble with the winning settings and score it on the test split.
ada_best = AdaBoostClassifier(
    ID3_best,
    n_estimators=b_param['n_estimators'],
    learning_rate=b_param['learning_rate'],
)
ada_best.fit(def_f_n_train_X, def_f_n_train_Y)
ada_predicted = ada_best.predict(def_f_n_test_X)
calcmetrics(
    ada_predicted,
    def_f_n_test_Y,
    "Adaboost- Random sampling with feature reduction",
)

## Building bagging models from under-sampled and randomly sampled data

In [None]:
# Grid-search a bagging ensemble (cv=5) on the full-feature under-sampled
# training set.
# NOTE(review): ID3_best is not defined in this cell; it is the tree left
# behind by whichever decision-tree cell ran last -- confirm that is intended.
bag = GridSearchCV(
    estimator=BaggingClassifier(ID3_best),
    param_grid=bag_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(bag, def_u_train_X, def_u_train_Y)

# Refit a fresh ensemble with the winning settings and score it on the test split.
bag_best = BaggingClassifier(
    ID3_best,
    max_features=b_param['max_features'],
    max_samples=b_param['max_samples'],
    n_estimators=b_param['n_estimators'],
    bootstrap_features=b_param['bootstrap_features'],
)
bag_best.fit(def_u_train_X, def_u_train_Y)
bag_predicted = bag_best.predict(def_u_test_X)
calcmetrics(
    bag_predicted,
    def_u_test_Y,
    "Bagging- Under sampling with out feature reduction",
)

In [None]:
# Grid-search a bagging ensemble (cv=5) on the feature-reduced under-sampled
# training set.
# NOTE(review): ID3_best is not defined in this cell; it is the tree left
# behind by whichever decision-tree cell ran last -- confirm that is intended.
bag = GridSearchCV(
    estimator=BaggingClassifier(ID3_best),
    param_grid=bag_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(bag, def_f_u_train_X, def_f_u_train_Y)

# Refit a fresh ensemble with the winning settings and score it on the test split.
bag_best = BaggingClassifier(
    ID3_best,
    max_features=b_param['max_features'],
    max_samples=b_param['max_samples'],
    n_estimators=b_param['n_estimators'],
    bootstrap_features=b_param['bootstrap_features'],
)
bag_best.fit(def_f_u_train_X, def_f_u_train_Y)
bag_predicted = bag_best.predict(def_f_u_test_X)
calcmetrics(
    bag_predicted,
    def_f_u_test_Y,
    "Bagging- Under sampling with feature reduction",
)

In [None]:
# Grid-search a bagging ensemble (cv=5) on the full-feature randomly sampled
# training set.
# NOTE(review): ID3_best is not defined in this cell; it is the tree left
# behind by whichever decision-tree cell ran last -- confirm that is intended.
bag = GridSearchCV(
    estimator=BaggingClassifier(ID3_best),
    param_grid=bag_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(bag, def_n_train_X, def_n_train_Y)

# Refit a fresh ensemble with the winning settings and score it on the test split.
bag_best = BaggingClassifier(
    ID3_best,
    max_features=b_param['max_features'],
    max_samples=b_param['max_samples'],
    n_estimators=b_param['n_estimators'],
    bootstrap_features=b_param['bootstrap_features'],
)
bag_best.fit(def_n_train_X, def_n_train_Y)
bag_predicted = bag_best.predict(def_n_test_X)
calcmetrics(
    bag_predicted,
    def_n_test_Y,
    "Bagging- Random sampling with out feature reduction",
)

In [None]:
# Grid-search a bagging ensemble (cv=5) on the feature-reduced randomly
# sampled training set.
# NOTE(review): ID3_best is not defined in this cell; it is the tree left
# behind by whichever decision-tree cell ran last -- confirm that is intended.
bag = GridSearchCV(
    estimator=BaggingClassifier(ID3_best),
    param_grid=bag_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(bag, def_f_n_train_X, def_f_n_train_Y)

# Refit a fresh ensemble with the winning settings and score it on the test split.
bag_best = BaggingClassifier(
    ID3_best,
    max_features=b_param['max_features'],
    max_samples=b_param['max_samples'],
    n_estimators=b_param['n_estimators'],
    bootstrap_features=b_param['bootstrap_features'],
)
bag_best.fit(def_f_n_train_X, def_f_n_train_Y)
bag_predicted = bag_best.predict(def_f_n_test_X)
calcmetrics(
    bag_predicted,
    def_f_n_test_Y,
    "Bagging- Random sampling with feature reduction",
)

## Building logistic regression models from under-sampled and randomly sampled data

In [None]:
# Grid-search a logistic regression (cv=5) on the full-feature under-sampled
# training set.
logit = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=logit_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(logit, def_u_train_X, def_u_train_Y)

# Refit a fresh model with the winning settings and score it on the test split.
logit_best = LogisticRegression(
    warm_start=b_param['warm_start'],
    C=b_param['C'],
    max_iter=b_param['max_iter'],
    solver=b_param['solver'],
)
logit_best.fit(def_u_train_X, def_u_train_Y)
logit_predicted = logit_best.predict(def_u_test_X)
calcmetrics(
    logit_predicted,
    def_u_test_Y,
    "logistic Regression- Under sampling with out feature reduction",
)

In [None]:
# Grid-search a logistic regression (cv=5) on the feature-reduced
# under-sampled training set.
logit = GridSearchCV(LogisticRegression(), param_grid=logit_params, n_jobs=-1, cv=5)
b_param = bestparam(logit, def_f_u_train_X, def_f_u_train_Y)

# Refit a fresh model with the winning settings and score it on the test split.
logit_best = LogisticRegression(warm_start=b_param['warm_start'], C=b_param['C'], max_iter=b_param['max_iter'], solver=b_param['solver'])
logit_best.fit(def_f_u_train_X, def_f_u_train_Y)
logit_predicted = logit_best.predict(def_f_u_test_X)

# The title previously said "with out feature reduction", but this cell
# evaluates the feature-reduced data (def_f_u_*).
calcmetrics(logit_predicted, def_f_u_test_Y, "logistic Regression- Under sampling with feature reduction")

In [None]:
# Grid-search a logistic regression (cv=5) on the full-feature randomly
# sampled training set.
logit = GridSearchCV(LogisticRegression(), param_grid=logit_params, n_jobs=-1, cv=5)
b_param = bestparam(logit, def_n_train_X, def_n_train_Y)

# Refit a fresh model with the winning settings and score it on the test split.
logit_best = LogisticRegression(warm_start=b_param['warm_start'], C=b_param['C'], max_iter=b_param['max_iter'], solver=b_param['solver'])
logit_best.fit(def_n_train_X, def_n_train_Y)
logit_predicted = logit_best.predict(def_n_test_X)

# The title previously said "with feature reduction", but this cell
# evaluates the full-feature data (def_n_*).
calcmetrics(logit_predicted, def_n_test_Y, "logistic Regression-  Random sampling with out feature reduction")

In [None]:
# Grid-search a logistic regression (cv=5) on the feature-reduced randomly
# sampled training set.
logit = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=logit_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(logit, def_f_n_train_X, def_f_n_train_Y)

# Refit a fresh model with the winning settings and score it on the test split.
logit_best = LogisticRegression(
    warm_start=b_param['warm_start'],
    C=b_param['C'],
    max_iter=b_param['max_iter'],
    solver=b_param['solver'],
)
logit_best.fit(def_f_n_train_X, def_f_n_train_Y)
logit_predicted = logit_best.predict(def_f_n_test_X)
calcmetrics(
    logit_predicted,
    def_f_n_test_Y,
    "logistic Regression-  Random sampling with feature reduction",
)

## Building XGBoost models from under-sampled and randomly sampled data

In [None]:
# Train XGBoost with fixed (untuned) settings on the full-feature
# under-sampled training set, then score it on the test split.
Xgboost = xgb.XGBClassifier(
    max_depth=4,
    n_estimators=600,
    learning_rate=0.03,
)
Xgboost = Xgboost.fit(def_u_train_X, def_u_train_Y)
xgb_predicted = Xgboost.predict(def_u_test_X)
calcmetrics(
    xgb_predicted,
    def_u_test_Y,
    "XGBoost - Under sampling with out feature reduction",
)

In [None]:
# Train XGBoost with fixed (untuned) settings on the feature-reduced
# under-sampled training set, then score it on the test split.
Xgboost = xgb.XGBClassifier(
    max_depth=4,
    n_estimators=600,
    learning_rate=0.03,
)
Xgboost = Xgboost.fit(def_f_u_train_X, def_f_u_train_Y)
xgb_predicted = Xgboost.predict(def_f_u_test_X)
calcmetrics(
    xgb_predicted,
    def_f_u_test_Y,
    "XGBoost - Under sampling with feature reduction",
)

In [None]:
# Train XGBoost with fixed (untuned) settings on the full-feature randomly
# sampled training set, then score it on the test split.
Xgboost = xgb.XGBClassifier(max_depth=4, n_estimators=600, learning_rate=0.03).fit(def_n_train_X, def_n_train_Y)
xgb_predicted = Xgboost.predict(def_n_test_X)

# The title previously said "with feature reduction", but this cell
# evaluates the full-feature data (def_n_*).
calcmetrics(xgb_predicted, def_n_test_Y, "XGBoost - Random sampling with out feature reduction")

In [None]:
# Train XGBoost with fixed (untuned) settings on the feature-reduced randomly
# sampled training set, then score it on the test split.
Xgboost = xgb.XGBClassifier(max_depth=4, n_estimators=600, learning_rate=0.03).fit(def_f_n_train_X, def_f_n_train_Y)
xgb_predicted = Xgboost.predict(def_f_n_test_X)

# The title previously said "with out feature reduction", but this cell
# evaluates the feature-reduced data (def_f_n_*).
calcmetrics(xgb_predicted, def_f_n_test_Y, "XGBoost - Random sampling with feature reduction")