In [1]:
import numpy as np
import pandas as pd

#loading metrics packages
from sklearn import metrics
from sklearn.metrics import roc_curve, auc, accuracy_score
import matplotlib.pyplot as plt
%matplotlib inline

#loading feature selection packages
from sklearn.feature_selection import RFECV

#loading model parameter selection packages
from sklearn.model_selection import validation_curve
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict
from sklearn.model_selection import ShuffleSplit

# loading algorithm packages
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn import svm
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier 
from sklearn.neighbors import KNeighborsClassifier
import xgboost as xgb

In [2]:
# Load the under-sampled and randomly sampled fraud datasets, remove the
# 'Time' column and incomplete rows, then split each one into a feature
# frame (*_X) and a target series (*_Y).

def _load_fraud_csv(csv_path):
    """Read one fraud CSV; return it without 'Time' and without rows containing NA."""
    return pd.read_csv(csv_path, sep=',').drop('Time', axis=1).dropna()


def _feature_columns(frame):
    """Return every column name of `frame` except the 'Class' target."""
    return [c for c in frame.columns.tolist() if c != "Class"]


# loading fraud under sampled data
fraud_u_train = _load_fraud_csv('under_sampling_train.csv')
fraud_u_test = _load_fraud_csv('under_sampling_test.csv')

# loading fraud random sampled data
fraud_n_train = _load_fraud_csv('train3.csv')
fraud_n_test = _load_fraud_csv('test2.csv')

# splitting features and class (under-sampled data)
columns = _feature_columns(fraud_u_train)
fraud_u_train_X = pd.get_dummies(fraud_u_train[columns])
fraud_u_train_Y = fraud_u_train["Class"]
# get_dummies is applied to train and test independently, so the encoded test
# frame is re-indexed to the train columns: indicator columns missing from the
# test split are added as 0 and test-only ones are dropped. When both frames
# already share the same columns this is a no-op.
fraud_u_test_X = pd.get_dummies(fraud_u_test[columns]).reindex(
    columns=fraud_u_train_X.columns, fill_value=0)
fraud_u_test_Y = fraud_u_test["Class"]

# splitting features and class (random-sampled data); the feature list is
# taken from the random-sampled training frame itself rather than from the
# under-sampled one.
columns = _feature_columns(fraud_n_train)
fraud_n_train_X = pd.get_dummies(fraud_n_train[columns])
fraud_n_train_Y = fraud_n_train["Class"]
fraud_n_test_X = pd.get_dummies(fraud_n_test[columns]).reindex(
    columns=fraud_n_train_X.columns, fill_value=0)
fraud_n_test_Y = fraud_n_test["Class"]

In [3]:
# Hyper-parameter grids, one dict per model family. Keys are estimator
# constructor arguments and values are the candidate settings; the cells
# below hand these dicts to GridSearchCV as `param_grid`.

# Decision tree: depths 1..14 with both split criteria.
ID3_params = {'max_depth': list(range(1, 15)), 'criterion': ['gini', 'entropy']}

# LinearSVC: primal formulation (dual=False) with squared hinge loss.
SVM_params = {'loss': ['squared_hinge'], 'penalty': ['l2', 'l1'], 'dual': [False]}

# NOTE(review): SVM_2_params and SVM_3_params are identical and are not
# referenced by any cell visible in this notebook. 'auto' for class_weight is
# believed to be rejected by recent scikit-learn releases -- confirm before
# wiring these grids to an estimator.
SVM_2_params = {'class_weight': ['balanced', 'auto'], 'C': [0.25, 0.5, 0.75, 1.0], 'max_iter': [10, 15, -1], 'cache_size': [500.0]}
SVM_3_params = {'class_weight': ['balanced', 'auto'], 'C': [0.25, 0.5, 0.75, 1.0], 'max_iter': [10, 15, -1], 'cache_size': [500.0]}

ada_params = {'n_estimators': [20, 40, 50], 'learning_rate': [0.5, 1.0]}
knn_params = {'n_neighbors': [5, 10, 15], 'weights': ['uniform', 'distance'], 'algorithm': ['ball_tree', 'kd_tree', 'brute']}

# max_samples must be given as floats: BaggingClassifier reads an int as an
# absolute number of rows, so the former `1` meant "draw a single sample"
# instead of "use 100% of the rows" (compare max_features, which uses 1.0).
bag_params = {'n_estimators': [10, 15, 20], 'bootstrap_features': [True, False], 'max_features': [0.5, 1.0], 'max_samples': [1.0, 0.5, 0.75]}

logit_params = {'max_iter': [100, 50, 200], 'C': [0.25, 0.5, 0.75, 1.0], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag'], 'warm_start': [True, False]}

# Not referenced by the XGBoost cells visible in this notebook, which use
# fixed settings instead.
xg_params = {'learning_rate': [0.01, 0.1, 0.3, 0.5], 'n_estimators': [500, 600, 700, 1000]}

In [17]:
def bestparam(model, train_X, train_Y):
    """Fit a parameter-search object and report its best parameter set.

    Parameters
    ----------
    model : object
        Anything exposing ``fit(X, y)`` and, once fitted, a ``best_params_``
        attribute. The notebook passes ``GridSearchCV`` instances.
    train_X, train_Y
        Training features and target, forwarded unchanged to ``model.fit``.

    Returns
    -------
    The value of ``model.best_params_`` after fitting.
    """
    model.fit(train_X, train_Y)
    best = model.best_params_
    # Single-argument print() calls emit the same text on Python 2 and 3,
    # whereas the former print statements were a SyntaxError on Python 3.
    print("Best paramters {0}".format(best))
    print("")
    return best

In [34]:
def calcmetrics(predicted, actual, ModelName):
    """Print classification metrics for one model and plot its ROC curve.

    Prints a banner with the model name, the confusion matrix, the
    classification report, the accuracy and the AUC (both as percentages with
    two decimals), then shows a ROC plot titled with the model name.

    Parameters
    ----------
    predicted
        Model output for the evaluation rows.
    actual
        True labels for the same rows.
    ModelName : str
        Label used in the printed banner and the plot title.

    Returns
    -------
    None
    """
    # Single-argument print() calls emit the same text on Python 2 and 3,
    # whereas the former print statements were a SyntaxError on Python 3.
    print("****************************************** " + ModelName + " ****************************************")
    print("Confusion Matrix")
    print(metrics.confusion_matrix(actual, predicted))
    print("")
    print(metrics.classification_report(actual, predicted))
    print("")
    accuracy_pct = round(accuracy_score(actual, predicted) * 100, 2)
    # The two spaces after '=' reproduce the output of the original
    # `print "Accuracy = ", value` statement exactly.
    print("Accuracy =  " + "{0:.2f}".format(accuracy_pct))
    print("")

    # NOTE(review): every caller in this notebook passes hard labels from
    # .predict(), so the curve below has a single operating point. Pass
    # scores or probabilities if a full ROC curve is wanted -- confirm intent.
    false_positive_rate, true_positive_rate, thresholds = roc_curve(actual, predicted)
    roc_auc = auc(false_positive_rate, true_positive_rate)
    print("AUC = " + "{0:.2f}".format(round(roc_auc * 100, 2)))
    print("")

    plt.title('Receiver Operating Characteristic ' + ModelName)
    plt.plot(false_positive_rate, true_positive_rate, 'b',
             label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([-0.1, 1.2])
    plt.ylim([-0.1, 1.2])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

## Building decision tree models from the under-sampled and randomly sampled data

In [None]:
# Decision tree, under-sampled data: 5-fold grid search over ID3_params,
# then a fresh tree is built with the winning settings, fitted on the full
# under-sampled training set and scored on the under-sampled test set.
ID3 = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=ID3_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(model=ID3, train_X=fraud_u_train_X, train_Y=fraud_u_train_Y)
ID3_best = DecisionTreeClassifier(
    max_depth=b_param['max_depth'],
    criterion=b_param['criterion'],
).fit(fraud_u_train_X, fraud_u_train_Y)
ID3_predicted = ID3_best.predict(fraud_u_test_X)
calcmetrics(
    predicted=ID3_predicted,
    actual=fraud_u_test_Y,
    ModelName="Decision Tree - Under sampling",
)

In [None]:
# Decision tree, random-sampled data: same procedure as for the under-sampled
# data. Note that this cell overwrites ID3, b_param, ID3_best and
# ID3_predicted.
ID3 = GridSearchCV(
    estimator=DecisionTreeClassifier(),
    param_grid=ID3_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(model=ID3, train_X=fraud_n_train_X, train_Y=fraud_n_train_Y)
ID3_best = DecisionTreeClassifier(
    max_depth=b_param['max_depth'],
    criterion=b_param['criterion'],
).fit(fraud_n_train_X, fraud_n_train_Y)
ID3_predicted = ID3_best.predict(fraud_n_test_X)
calcmetrics(
    predicted=ID3_predicted,
    actual=fraud_n_test_Y,
    ModelName="Decision Tree - Random sampling",
)

## Building linear SVM models from the under-sampled and randomly sampled data

In [None]:
# Linear SVM, under-sampled data: 5-fold grid search over SVM_params, then a
# fresh LinearSVC with the winning settings is fitted and evaluated on the
# under-sampled test set.
SVM = GridSearchCV(
    estimator=svm.LinearSVC(),
    param_grid=SVM_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(model=SVM, train_X=fraud_u_train_X, train_Y=fraud_u_train_Y)
SVM_best = svm.LinearSVC(
    dual=b_param['dual'],
    loss=b_param['loss'],
    penalty=b_param['penalty'],
).fit(fraud_u_train_X, fraud_u_train_Y)
SVM_predicted = SVM_best.predict(fraud_u_test_X)
calcmetrics(
    predicted=SVM_predicted,
    actual=fraud_u_test_Y,
    ModelName="Linear SVM - Under sampling",
)

In [None]:
# Linear SVM, random-sampled data: 5-fold grid search over SVM_params, then a
# fresh LinearSVC with the winning settings is fitted and evaluated on the
# random-sampled test set.
SVM = GridSearchCV(
    estimator=svm.LinearSVC(),
    param_grid=SVM_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(model=SVM, train_X=fraud_n_train_X, train_Y=fraud_n_train_Y)
SVM_best = svm.LinearSVC(
    dual=b_param['dual'],
    loss=b_param['loss'],
    penalty=b_param['penalty'],
).fit(fraud_n_train_X, fraud_n_train_Y)
SVM_predicted = SVM_best.predict(fraud_n_test_X)
calcmetrics(
    predicted=SVM_predicted,
    actual=fraud_n_test_Y,
    ModelName="Linear SVM - Random sampling",
)

## Building KNN models from the under-sampled and randomly sampled data

In [None]:
# K-nearest neighbours, under-sampled data: 5-fold grid search over
# knn_params, then a fresh classifier with the winning settings is fitted and
# evaluated on the under-sampled test set.
knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(model=knn, train_X=fraud_u_train_X, train_Y=fraud_u_train_Y)
knn_best = KNeighborsClassifier(
    algorithm=b_param['algorithm'],
    weights=b_param['weights'],
    n_neighbors=b_param['n_neighbors'],
).fit(fraud_u_train_X, fraud_u_train_Y)
knn_predicted = knn_best.predict(fraud_u_test_X)
calcmetrics(
    predicted=knn_predicted,
    actual=fraud_u_test_Y,
    ModelName="KNN- Under sampling",
)

In [None]:
# K-nearest neighbours, random-sampled data: 5-fold grid search over
# knn_params, then a fresh classifier with the winning settings is fitted and
# evaluated on the random-sampled test set.
knn = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=knn_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(model=knn, train_X=fraud_n_train_X, train_Y=fraud_n_train_Y)
knn_best = KNeighborsClassifier(
    algorithm=b_param['algorithm'],
    weights=b_param['weights'],
    n_neighbors=b_param['n_neighbors'],
).fit(fraud_n_train_X, fraud_n_train_Y)
knn_predicted = knn_best.predict(fraud_n_test_X)
calcmetrics(
    predicted=knn_predicted,
    actual=fraud_n_test_Y,
    ModelName="KNN- Random sampling",
)

## Building AdaBoost models from the under-sampled and randomly sampled data

In [None]:
# AdaBoost, under-sampled data: 5-fold grid search over ada_params with
# ID3_best as the base estimator, then a fresh ensemble with the winning
# settings is fitted and evaluated on the under-sampled test set.
# NOTE(review): ID3_best is whatever the most recently executed decision-tree
# cell left in the kernel; in top-to-bottom order that is the tree tuned on
# the random-sampled data -- confirm this is the intended base estimator.
ada = GridSearchCV(
    estimator=AdaBoostClassifier(ID3_best),
    param_grid=ada_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(model=ada, train_X=fraud_u_train_X, train_Y=fraud_u_train_Y)
ada_best = AdaBoostClassifier(
    ID3_best,
    learning_rate=b_param['learning_rate'],
    n_estimators=b_param['n_estimators'],
).fit(fraud_u_train_X, fraud_u_train_Y)
ada_predicted = ada_best.predict(fraud_u_test_X)
calcmetrics(
    predicted=ada_predicted,
    actual=fraud_u_test_Y,
    ModelName="Adaboost- Under sampling",
)

In [None]:
# AdaBoost, random-sampled data: 5-fold grid search over ada_params with
# ID3_best as the base estimator, then a fresh ensemble with the winning
# settings is fitted and evaluated on the random-sampled test set.
# NOTE(review): ID3_best comes from whichever decision-tree cell ran last in
# the kernel -- confirm the execution order before trusting the result.
ada = GridSearchCV(
    estimator=AdaBoostClassifier(ID3_best),
    param_grid=ada_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(model=ada, train_X=fraud_n_train_X, train_Y=fraud_n_train_Y)
ada_best = AdaBoostClassifier(
    ID3_best,
    learning_rate=b_param['learning_rate'],
    n_estimators=b_param['n_estimators'],
).fit(fraud_n_train_X, fraud_n_train_Y)
ada_predicted = ada_best.predict(fraud_n_test_X)
calcmetrics(
    predicted=ada_predicted,
    actual=fraud_n_test_Y,
    ModelName="Adaboost- Random sampling",
)

## Building bagging models from the under-sampled and randomly sampled data

In [None]:
# Bagging, under-sampled data: 5-fold grid search over bag_params with
# ID3_best as the base estimator, then a fresh ensemble with the winning
# settings is fitted and evaluated on the under-sampled test set.
# NOTE(review): ID3_best is whatever the most recently executed decision-tree
# cell left in the kernel; in top-to-bottom order that is the tree tuned on
# the random-sampled data -- confirm this is the intended base estimator.
bag = GridSearchCV(
    estimator=BaggingClassifier(ID3_best),
    param_grid=bag_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(model=bag, train_X=fraud_u_train_X, train_Y=fraud_u_train_Y)
bag_best = BaggingClassifier(
    ID3_best,
    n_estimators=b_param['n_estimators'],
    max_samples=b_param['max_samples'],
    max_features=b_param['max_features'],
    bootstrap_features=b_param['bootstrap_features'],
).fit(fraud_u_train_X, fraud_u_train_Y)
bag_predicted = bag_best.predict(fraud_u_test_X)
calcmetrics(
    predicted=bag_predicted,
    actual=fraud_u_test_Y,
    ModelName="Bagging- Under sampling",
)

In [None]:
# Bagging, random-sampled data: 5-fold grid search over bag_params with
# ID3_best as the base estimator, then a fresh ensemble with the winning
# settings is fitted and evaluated on the random-sampled test set.
# NOTE(review): ID3_best comes from whichever decision-tree cell ran last in
# the kernel -- confirm the execution order before trusting the result.
bag = GridSearchCV(
    estimator=BaggingClassifier(ID3_best),
    param_grid=bag_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(model=bag, train_X=fraud_n_train_X, train_Y=fraud_n_train_Y)
bag_best = BaggingClassifier(
    ID3_best,
    n_estimators=b_param['n_estimators'],
    max_samples=b_param['max_samples'],
    max_features=b_param['max_features'],
    bootstrap_features=b_param['bootstrap_features'],
).fit(fraud_n_train_X, fraud_n_train_Y)
bag_predicted = bag_best.predict(fraud_n_test_X)
calcmetrics(
    predicted=bag_predicted,
    actual=fraud_n_test_Y,
    ModelName="Bagging- Random sampling",
)

## Building logistic regression models from the under-sampled and randomly sampled data

In [None]:
# Logistic regression, under-sampled data: 5-fold grid search over
# logit_params, then a fresh model with the winning settings is fitted and
# evaluated on the under-sampled test set.
logit = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=logit_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(model=logit, train_X=fraud_u_train_X, train_Y=fraud_u_train_Y)
logit_best = LogisticRegression(
    solver=b_param['solver'],
    max_iter=b_param['max_iter'],
    C=b_param['C'],
    warm_start=b_param['warm_start'],
).fit(fraud_u_train_X, fraud_u_train_Y)
logit_predicted = logit_best.predict(fraud_u_test_X)
calcmetrics(
    predicted=logit_predicted,
    actual=fraud_u_test_Y,
    ModelName="logistic Regression- Under sampling",
)

In [None]:
# Logistic regression, random-sampled data: 5-fold grid search over
# logit_params, then a fresh model with the winning settings is fitted and
# evaluated on the random-sampled test set.
logit = GridSearchCV(
    estimator=LogisticRegression(),
    param_grid=logit_params,
    cv=5,
    n_jobs=-1,
)
b_param = bestparam(model=logit, train_X=fraud_n_train_X, train_Y=fraud_n_train_Y)
logit_best = LogisticRegression(
    solver=b_param['solver'],
    max_iter=b_param['max_iter'],
    C=b_param['C'],
    warm_start=b_param['warm_start'],
).fit(fraud_n_train_X, fraud_n_train_Y)
logit_predicted = logit_best.predict(fraud_n_test_X)
calcmetrics(
    predicted=logit_predicted,
    actual=fraud_n_test_Y,
    ModelName="logistic Regression- Random sampling",
)

## Building XGBoost models from the under-sampled and randomly sampled data

In [None]:
# XGBoost, under-sampled data: fixed hyper-parameters (no grid search in this
# cell), fitted on the under-sampled training set and evaluated on the
# under-sampled test set.
Xgboost = xgb.XGBClassifier(
    learning_rate=0.03,
    n_estimators=600,
    max_depth=4,
)
Xgboost.fit(fraud_u_train_X, fraud_u_train_Y)
xgb_predicted = Xgboost.predict(fraud_u_test_X)
calcmetrics(
    predicted=xgb_predicted,
    actual=fraud_u_test_Y,
    ModelName="XGBoost - Under sampling",
)

In [None]:
# XGBoost, random-sampled data: fixed hyper-parameters (no grid search in
# this cell), fitted on the random-sampled training set and evaluated on the
# random-sampled test set.
Xgboost = xgb.XGBClassifier(
    learning_rate=0.03,
    n_estimators=600,
    max_depth=4,
)
Xgboost.fit(fraud_n_train_X, fraud_n_train_Y)
xgb_predicted = Xgboost.predict(fraud_n_test_X)
calcmetrics(
    predicted=xgb_predicted,
    actual=fraud_n_test_Y,
    ModelName="XGBoost - Random sampling",
)