In [22]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE 
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import classification_report
from imblearn.over_sampling import smote


In [4]:
#Read the data into dataframe
car_df = pd.read_csv('newCardata.csv')
car_features = pd.read_csv('finalDataPreprocessBinary.csv')
car_label = car_df['FraudFound']



In [5]:
#change the label of the data
labelNo = LabelEncoder()
car_df['FraudFound'] = labelNo.fit_transform(car_df['FraudFound'].astype('str'))
car_label = car_df['FraudFound']
print(car_label.shape)


(15419,)


In [6]:
#split the data into train and test
print(car_features.shape,car_label.shape)

X_train,X_test,y_train,y_test = train_test_split(car_features,car_label,random_state=3,test_size=0.25)
print('xtrain:',type(X_train))

(15419, 52) (15419,)
xtrain: <class 'pandas.core.frame.DataFrame'>


In [7]:
#model object
model = RandomForestClassifier(n_estimators=100)


In [8]:
#train the model

model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [9]:
print('Random forest classifier:')
predicted = model.predict(X_test)
print(type(predicted),predicted.shape,predicted)
print('Accuracy is ',round(accuracy_score(y_test,model.predict(X_test)) * 100,2))


Random forest classifier:
<class 'numpy.ndarray'> (3855,) [0 0 0 ..., 0 0 0]
Accuracy is  93.64


In [10]:
# calculating specifity and sensitivity
# 0  := Negative
# 1 := Positive
cm = confusion_matrix(y_test,predicted)
print("Confusion Matrix:\n",cm)

Confusion Matrix:
 [[3607    2]
 [ 243    3]]


In [11]:
TN, FP, FN, TP = cm.ravel()
print("TN:",TN)
print("FP:",FP)
print("FN:",FN)
print("TP:",TP)

print("Accuracy:",(TP+TN)/(TP+FP+FN+TN)*100)
print("Sensitivity:",TP/(TP+FN)*100)
print("Specificity:",TN/(TN+FP)*100)

TN: 3607
FP: 2
FN: 243
TP: 3
Accuracy: 93.64461738
Sensitivity: 1.21951219512
Specificity: 99.944582987


In [12]:

#converting pandas.core.series.Series to numpy.ndarray
print(type(y_test),type(predicted))
ytest = pd.Series(y_test).values
print((ytest.shape[0]))

<class 'pandas.core.series.Series'> <class 'numpy.ndarray'>
3855


In [13]:
#find the index where both are 1.
count = 0
fault = 0
predictedfault = 0
for i in range(predicted.shape[0]):
    if((predicted[i] == 1) and (ytest[i] == 1)):
        count += 1
    if(ytest[i] == 1):
        fault += 1
    if(predicted[i]==1):
        predictedfault += 1
print("No of fraud cases:",fault)
print("No of fauld cases predicted by model:",predictedfault)
print("No of fraud correctly predicted  as fault:",count)

No of fraud cases: 246
No of fauld cases predicted by model: 5
No of fraud correctly predicted  as fault: 3


In [14]:
#with Adabost
from sklearn.ensemble import AdaBoostClassifier


In [15]:
#model object
X_train,X_test,y_train,y_test = train_test_split(car_features,car_label,random_state=43,test_size=0.25)
def adaboost(X_train, X_test, y_train):
    model = AdaBoostClassifier(n_estimators=100, random_state=42)
    model.fit(X_train,y_train)
    y_pred = model.predict(X_test) 
    return y_pred

# AdaBoost
y_baseline = adaboost(X_train, X_test, y_train)
print('Accuracy is ',round(accuracy_score(y_test,y_baseline) * 100,2))


Accuracy is  92.68


In [16]:
cm = confusion_matrix(y_test,y_baseline)
print("Confusion Matrix:\n",cm)
TN, FP, FN, TP = cm.ravel()

print("Accuracy:",(TP+TN)/(TP+FP+FN+TN)*100)
print("Sensitivity:",TP/(TP+FN)*100)
print("Specificity:",TN/(TN+FP)*100)

Confusion Matrix:
 [[3571   22]
 [ 260    2]]
Accuracy: 92.6848249027
Sensitivity: 0.763358778626
Specificity: 99.3876983023


In [17]:
#converting pandas.core.series.Series to numpy.ndarray
ytest = pd.Series(y_test).values
print((ytest.shape[0]))
print(predicted.shape[0])
print(ytest)
print()

3855
3855
[0 0 0 ..., 0 0 1]



In [18]:
#find the index where both are 1.
count = 0
fault = 0
predictedfault = 0
for i in range(predicted.shape[0]):
    
    if((predicted[i] == 1) and (ytest[i] == 1)):
        count += 1
    if(ytest[i] == 1):
        fault += 1
    if(predicted[i]==1):
        predictedfault += 1
print("No of fraud cases:",fault)
print("No of fauld cases predicted by model:",predictedfault)
print("No of fraud correctly predicted  as fault:",count)

No of fraud cases: 262
No of fauld cases predicted by model: 5
No of fraud correctly predicted  as fault: 0


In [24]:
# Adabost after SMOTE
sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_sample(X_train, y_train)
y_smote = adaboost(X_train_sm, X_test, y_train_sm)
target_names = ['0', '1']

#predict accuracy
print('Accuracy is ',round(accuracy_score(y_test,y_smote) * 100,2))
cm = confusion_matrix(y_test,y_baseline)
print("Confusion Matrix:\n",cm)
TN, FP, FN, TP = cm.ravel()

print("Accuracy:",(TP+TN)/(TP+FP+FN+TN)*100)
print("Sensitivity:",TP/(TP+FN)*100)
print("Specificity:",TN/(TN+FP)*100)
print(classification_report(y_test, y_smote,target_names=target_names))

Accuracy is  88.17
Confusion Matrix:
 [[3571   22]
 [ 260    2]]
Accuracy: 92.6848249027
Sensitivity: 0.763358778626
Specificity: 99.3876983023
             precision    recall  f1-score   support

          0       0.94      0.93      0.94      3593
          1       0.17      0.20      0.19       262

avg / total       0.89      0.88      0.89      3855



In [25]:
#Random underSampling(RUS)
from sklearn.utils import resample
X_full = X_train.copy()
X_full['target'] = y_train
X_maj = X_full[X_full.target==0]
X_min = X_full[X_full.target==1]
X_maj_rus = resample(X_maj,replace=False,n_samples=len(X_min),random_state=44)
X_rus = pd.concat([X_maj_rus, X_min])
X_train_rus = X_rus.drop(['target'], axis=1)
y_train_rus = X_rus.target
y_rus = adaboost(X_train_rus, X_test, y_train_rus)

#predict accuracy
#predict accuracy
print('Accuracy is ',round(accuracy_score(y_test,y_smote) * 100,2))
cm = confusion_matrix(y_test,y_baseline)
print("Confusion Matrix:\n",cm)
TN, FP, FN, TP = cm.ravel()

print("Accuracy:",(TP+TN)/(TP+FP+FN+TN)*100)
print("Sensitivity:",TP/(TP+FN)*100)
print("Specificity:",TN/(TN+FP)*100)

target_names = ['0', '1']
print(classification_report(y_test, y_rus,target_names=target_names))


Accuracy is  88.17
Confusion Matrix:
 [[3571   22]
 [ 260    2]]
Accuracy: 92.6848249027
Sensitivity: 0.763358778626
Specificity: 99.3876983023
             precision    recall  f1-score   support

          0       0.98      0.65      0.78      3593
          1       0.14      0.78      0.23       262

avg / total       0.92      0.65      0.74      3855



In [23]:
#SMOTEAdaBoost and RUS Bost


for algorithm in [smote.SMOTEBoost(n_estimators=100, n_samples=300)]:
    algorithm.fit(X_train, y_train)
    y_pred = algorithm.predict(X_test)
    print()
    print(str(algorithm))
    print()
    print(classification_report(y_test, y_pred,target_names=target_names))

AttributeError: module 'imblearn.over_sampling.smote' has no attribute 'SMOTEBoost'

In [25]:
from sklearn.metrics import precision_recall_fscore_support

precision_recall_fscore_support(y_test, predicted, average='micro')
precision_recall_fscore_support(y_test, predicted, average='macro')
precision_recall_fscore_support(y_test, predicted, average='weighted')


(0.97006233744130022, 0.9685430463576159, 0.96851786252311811, None)