In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve, auc
import math
from imblearn.over_sampling import SMOTE

In [33]:
#Read the data into dataframe
car_df = pd.read_csv('newCardata.csv')
car_features = pd.read_csv('finalDataPreprocess.csv')

#encode the label as 1's or 0's
label_Number = LabelEncoder()
car_df['FraudFound'] = label_Number.fit_transform(car_df['FraudFound'].astype('str'))
car_label = car_df['FraudFound']
print(car_features.shape)

(15419, 97)


In [34]:
#divide the data into train and test set

X_train,X_test,y_train,y_test = train_test_split(car_features,car_label,random_state=3,test_size=0.25)
print(X_train.shape,X_test.shape)


(11564, 97) (3855, 97)


In [35]:
#build the model
clf= RandomForestClassifier(n_estimators=100) 

#train the model
model= clf.fit(X_train,y_train)

#test the model and find model's performance metric
predicted= model.predict(X_test)

# calculating specifity and sensitivity
# 0  := Negative(FraudNotFound)
# 1 := Positive (FraudFound)
cm = confusion_matrix(y_test,predicted)
print("Confusion Matrix:\n",cm)
TN, FP, FN, TP = cm.ravel()

print("TN:",TN)
print("FP:",FP)
print("FN:",FN)
print("TP:",TP)

print("Accuracy:",(TP+TN)/(TP+FP+FN+TN)*100)
print("Sensitivity:",TP/(TP+FN)*100)
print("Specificity:",TN/(TN+FP)*100)


Confusion Matrix:
 [[3608    1]
 [ 240    6]]
TN: 3608
FP: 1
FN: 240
TP: 6
Accuracy: 93.7483787289
Sensitivity: 2.43902439024
Specificity: 99.9722914935


In [36]:
#With SMOTE 

#apply the Smote
sm= SMOTE()
features,labels= sm.fit_sample(car_features,car_label)
print(features.shape,labels.shape)

#split the smote generated data into the train and test set
X_train,X_test,y_train,y_test= train_test_split(features,labels,random_state=10,test_size=0.30)
print(X_train.shape,X_test.shape)



(28992, 97) (28992,)
(20294, 97) (8698, 97)


In [37]:
#build the model
clf= RandomForestClassifier(n_estimators=100) 

#train the model
model= clf.fit(X_train,y_train)

#test the model and find model's performance metric
predicted= model.predict(X_test)

# calculating specifity and sensitivity
# 0  := Negative(FraudNotFound)
# 1 := Positive (FraudFound)
cm = confusion_matrix(y_test,predicted)
print("Confusion Matrix:\n",cm)
TN, FP, FN, TP = cm.ravel()

print("TN:",TN)
print("FP:",FP)
print("FN:",FN)
print("TP:",TP)

print("Accuracy:",(TP+TN)/(TP+FP+FN+TN)*100)
print("Sensitivity:",TP/(TP+FN)*100)
print("Specificity:",TN/(TN+FP)*100)

Confusion Matrix:
 [[4223   69]
 [ 171 4235]]
TN: 4223
FP: 69
FN: 171
TP: 4235
Accuracy: 97.2407449989
Sensitivity: 96.1189287335
Specificity: 98.3923578751


In [38]:
#apply the Smote
sm= SMOTE()
features,labels= sm.fit_sample(car_features,car_label)
print(features.shape,labels.shape)

#split the smote generated data into the train and test set
X_train,X_test,y_train,y_test= train_test_split(features,labels,random_state=3,test_size=0.25)
print(X_train.shape,X_test.shape)



(28992, 97) (28992,)
(21744, 97) (7248, 97)


In [39]:
print(type(X_test))

<class 'numpy.ndarray'>


In [40]:
#find the total number of the fraudulent claims in the smote generated data
#find the all the fradulent claims in the dataset
total_fraud= 0
total_Nonfraud= 0
print("Total no of data in smote generated data: ",labels.shape[0])
for i in range(labels.shape[0]):
    if labels[i]==1:
        total_fraud += 1
    else:
        total_Nonfraud += 1
print("Total fraud in smote generated data: ", total_fraud)
print("Total Non-fraud in smote generated data: ", total_Nonfraud)

Total no of data in smote generated data:  28992
Total fraud in smote generated data:  14496
Total Non-fraud in smote generated data:  14496


In [41]:
#testing only on the original test data
from numpy import linalg as LA

#print the original size of the smote generated test set
print( "Original X_test  and y_test", (X_test.shape, y_test.shape))

#find the all the fradulent claims in the dataset
indexOFfraudulent= []
columnfeatures= list(car_features.columns.values)
fraudulentClaims= pd.DataFrame(columns=columnfeatures)
fraudLabel= pd.DataFrame(columns=['label'])
j= 0
for i in range(car_label.shape[0]):
    if car_label[i]==1:
        fraudulentClaims.loc[j]= car_features.loc[i]
        fraudLabel.loc[i]= 1
        j += 1
print("Total number of fraudulent claims in original data:",fraudulentClaims.shape[0])
#print(y_test[1000])

Original X_test  and y_test ((7248, 97), (7248,))
Total number of fraudulent claims in original data: 923


In [27]:

#remove the smote generated sample from the test set.
k= 0
index= []


#make empty dataframes for storing the new test data
new_X_test= pd.DataFrame(columns=columnfeatures)
new_y_test= pd.DataFrame(columns=['label'])

#convert numpy array to pd.DataFrame
X_test= pd.DataFrame(X_test)

print(type(X_test))

for i in range(X_test.shape[0]):
#for i in range(1000):
    element= X_test.loc[i]
    assert(False)
    if (int(y_test[i])==1):
        #print("Inside condition y_test== 1, y_test[i] is:",y_test[i])
        for j in range(fraudulentClaims.shape[0]):
            data = pd.Series(element).values
            fraudClaims= fraudulentClaims.loc[j]
            fraudClaims= pd.Series(fraudClaims).values
            euclidean_distance= LA.norm(data-fraudClaims,2)   #calculate the euclidean distance

            if (euclidean_distance == 0):
                index.append(i)
                new_X_test.loc[k]= [element[elm] for elm in range(element.shape[0])]
                #print("After euclidean distance==0, y_test[i] is:",y_test[i])
                new_y_test.loc[k]= y_test[i]
                break
    else:
        new_X_test.loc[k]= [element[elm] for elm in range(element.shape[0])]
        #print("Else condition, y_test[i]:",y_test[i])
        new_y_test.loc[k]= y_test[i]
    k += 1


<class 'pandas.core.frame.DataFrame'>


AssertionError: 

In [23]:
print("New X_test and y_test shape:",new_X_test.shape,new_y_test.shape)


New X_test and y_test shape: (3855, 97) (3855, 1)


In [24]:
#find the original test set from smote generated dataset
print(new_X_test.shape)
print(new_y_test.shape)

#change the dataframe into numpy array
mod_X_test= new_X_test.values
mod_y_test= new_y_test.values

fraud= 0
for i in range(mod_y_test.shape[0]):
    if(mod_y_test[i]==1):
        fraud += 1
print("Total Number of fraud in modified test set:",fraud)



(3855, 97)
(3855, 1)
Total Number of fraud in modified test set: 232


In [25]:
#train the model with new modified test set
#build the model
clf= RandomForestClassifier(n_estimators=100) 

#train the model
model= clf.fit(X_train,y_train)

#test the model and find model's performance metric
#predicted= model.predict(mod_X_test)
predicted= model.predict(mod_X_test)

# calculating specifity and sensitivity
# 0  := Negative(FraudNotFound)
# 1 := Positive (FraudFound)
#cm = confusion_matrix(mod_y_test,predicted)
cm = confusion_matrix(mod_y_test,predicted)

print("Confusion Matrix:\n",cm)
TN, FP, FN, TP = cm.ravel()

print("TN:",TN)
print("FP:",FP)
print("FN:",FN)
print("TP:",TP)

print("Accuracy:",(TP+TN)/(TP+FP+FN+TN)*100)
print("Sensitivity:",TP/(TP+FN)*100)
print("Specificity:",TN/(TN+FP)*100)

Confusion Matrix:
 [[3560   63]
 [ 142   90]]
TN: 3560
FP: 63
FN: 142
TP: 90
Accuracy: 94.682230869
Sensitivity: 38.7931034483
Specificity: 98.2611095777


In [42]:
#test on original fraudulent data
org_test= fraudulentClaims.values
org_label= fraudLabel.values
predicted= model.predict(org_test)

# calculating specifity and sensitivity
# 0  := Negative(FraudNotFound)
# 1 := Positive (FraudFound)
#cm = confusion_matrix(mod_y_test,predicted)
cm = confusion_matrix(org_label,predicted)

print("Confusion Matrix:\n",cm)
TN, FP, FN, TP = cm.ravel()

print("TN:",TN)
print("FP:",FP)
print("FN:",FN)
print("TP:",TP)

print("Accuracy:",(TP+TN)/(TP+FP+FN+TN)*100)
print("Sensitivity:",TP/(TP+FN)*100)
print("Specificity:",TN/(TN+FP)*100)

Confusion Matrix:
 [[  0   0]
 [163 760]]
TN: 0
FP: 0
FN: 163
TP: 760
Accuracy: 82.3401950163
Sensitivity: 82.3401950163
Specificity: nan
