In [38]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from imblearn.under_sampling import RandomUnderSampler 
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_curve, auc
from collections import Counter

In [39]:
#Read the data into dataframe
car_df = pd.read_csv('newCardata.csv')
car_features = pd.read_csv('finalDataPreprocessBinary.csv')
car_label = car_df['FraudFound']



In [50]:
#change the label of the data
labelNo = LabelEncoder()
car_df['FraudFound'] = labelNo.fit_transform(car_df['FraudFound'].astype('str'))
car_label = car_df['FraudFound']
print(car_label.shape)

#split the data into train and test
X_train,X_test,y_train,y_test = train_test_split(car_features,car_label,random_state=3,test_size=0.25)
print('xtrain:',type(X_train))

print('Original dataset shape {}'.format(Counter(car_label)))

#find the count of majority and minority
maj_count1 = 0
min_count1 = 0
maj_count2 = 0
min_count2 = 0
print(y_train.shape)
print(y_test.shape)

y_train = pd.Series(y_train).values
y_test = pd.Series(y_test).values
print(type(y_train))

for i in range(y_train.shape[0]):
    if(y_train[i]==0):
        maj_count1 = maj_count1 + 1
    else:
        min_count1 = min_count1 + 1

for j in range(y_test.shape[0]):
    if(y_test[j]==0):
        maj_count2 += 1
    else:
        min_count2 += 1

print("Majority Count1: ",maj_count1)
print("Minority Count1: ",min_count1)
print("Majority Count2: ",maj_count2)
print("Minority Count2: ",min_count2)

    


(15419,)
xtrain: <class 'pandas.core.frame.DataFrame'>
Original dataset shape Counter({0: 14496, 1: 923})
(11564,)
(3855,)
<class 'numpy.ndarray'>
Majority Count1:  10887
Minority Count1:  677
Majority Count2:  3609
Minority Count2:  246


In [19]:
#model object
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train,y_train)
print('Random forest classifier:')
predicted = model.predict(X_test)
print(type(predicted),predicted.shape,predicted)
print('Accuracy is ',round(accuracy_score(y_test,model.predict(X_test)) * 100,2))


Random forest classifier:
<class 'numpy.ndarray'> (3855,) [0 0 0 ..., 0 0 0]
Accuracy is  93.7


In [20]:
# calculating specifity and sensitivity
# 0  := Negative
# 1 := Positive
cm = confusion_matrix(y_test,predicted)
print("Confusion Matrix:\n",cm)
TN, FP, FN, TP = cm.ravel()
print("TN:",TN)
print("FP:",FP)
print("FN:",FN)
print("TP:",TP)

print("Accuracy:",(TP+TN)/(TP+FP+FN+TN)*100)
print("Sensitivity:",TP/(TP+FN)*100)
print("Specificity:",TN/(TN+FP)*100)

Confusion Matrix:
 [[3606    3]
 [ 240    6]]
TN: 3606
FP: 3
FN: 240
TP: 6
Accuracy: 93.6964980545
Sensitivity: 2.43902439024
Specificity: 99.9168744805


In [21]:
#converting pandas.core.series.Series to numpy.ndarray
print(type(y_test),type(predicted))
ytest = pd.Series(y_test).values
print((ytest.shape[0]))

#find the index where both are 1.
count = 0
fault = 0
predictedfault = 0
for i in range(predicted.shape[0]):
    if((predicted[i] == 1) and (ytest[i] == 1)):
        count += 1
    if(ytest[i] == 1):
        fault += 1
    if(predicted[i]==1):
        predictedfault += 1
print("No of fraud cases:",fault)
print("No of fauld cases predicted by model:",predictedfault)
print("No of fraud correctly predicted  as fault:",count)


<class 'pandas.core.series.Series'> <class 'numpy.ndarray'>
3855
No of fraud cases: 246
No of fauld cases predicted by model: 9
No of fraud correctly predicted  as fault: 6


In [22]:
#with Random Under Sampling
#rus = RandomUnderSampler(ratio={0:14496,0:923},random_state=42)

print('Original dataset shape {}'.format(Counter(car_label)))

rus = RandomUnderSampler(random_state=42,replacement=False)
X_resampled,Y_resampled = rus.fit_sample(car_features,car_label)
print('Resampled dataset shape {}'.format(Counter(Y_resampled)))

maj_count = 0
min_count = 0
for i in range(Y_resampled.shape[0]):
    if(Y_resampled[i]==0):
        maj_count += 1
    else:
        min_count += 1
print("Majority Count: ",maj_count)
print("Minority Count: ",min_count)

Original dataset shape Counter({0: 14496, 1: 923})
Resampled dataset shape Counter({0: 923, 1: 923})
Majority Count:  923
Minority Count:  923


In [51]:
#divide the trainin set and test set
X_train,X_test,y_train,y_test = train_test_split(X_resampled,Y_resampled,test_size=0.3)

print(type(y_train))
#find the count of majority and minority
maj_count1 = 0
min_count1 = 0
maj_count2 = 0
min_count2 = 0
print(y_train.shape)
print(y_test.shape)
for i in range(y_train.shape[0]):
    if(y_train[i]==0):
        maj_count1 += 1
    else:
        min_count1 += 1
for j in range(y_test.shape[0]):
    if(y_test[j]==0):
        maj_count2 += 1
    else:
        min_count2 += 1

print("Majority Count1: ",maj_count1)
print("Minority Count1: ",min_count1)
print("Majority Count2: ",maj_count2)
print("Minority Count2: ",min_count2)

    


<class 'numpy.ndarray'>
(1292,)
(554,)
Majority Count1:  638
Minority Count1:  654
Majority Count2:  285
Minority Count2:  269


In [52]:
#model object
model = RandomForestClassifier(n_estimators=100)
model.fit(X_train,y_train)
print('Random forest classifier:')
predicted = model.predict(X_test)
print('Accuracy is ',round(accuracy_score(y_test,model.predict(X_test)) * 100,2))


Random forest classifier:
Accuracy is  76.71


In [53]:
cm = confusion_matrix(y_test,predicted)
print("Confusion Matrix:\n",cm)
TN, FP, FN, TP = cm.ravel()

print("Accuracy:",(TP+TN)/(TP+FP+FN+TN)*100)
print("Sensitivity:",TP/(TP+FN)*100)
print("Specificity:",TN/(TN+FP)*100)

Confusion Matrix:
 [[189  96]
 [ 33 236]]
Accuracy: 76.714801444
Sensitivity: 87.7323420074
Specificity: 66.3157894737


In [54]:
#converting pandas.core.series.Series to numpy.ndarray
ytest = pd.Series(y_test).values
#find the index where both are 1.
count = 0
fault = 0
predictedfault = 0
for i in range(predicted.shape[0]):
    
    if((predicted[i] == 1) and (ytest[i] == 1)):
        count += 1
    if(ytest[i] == 1):
        fault += 1
    if(predicted[i]==1):
        predictedfault += 1
print("No of fraud cases:",fault)
print("No of fauld cases predicted by model:",predictedfault)
print("No of fraud correctly predicted  as fault:",count)

No of fraud cases: 269
No of fauld cases predicted by model: 332
No of fraud correctly predicted  as fault: 236


In [55]:
from sklearn.metrics import precision_recall_fscore_support

precision_recall_fscore_support(y_test, predicted, average='micro')
precision_recall_fscore_support(y_test, predicted, average='macro')
precision_recall_fscore_support(y_test, predicted, average='weighted')


(0.7831263584928061, 0.76714801444043323, 0.76488526783716559, None)

In [56]:
#cross-validation 10-FOLD cross validation
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn import model_selection
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score



print('Random forest classifier:')

rus = RandomUnderSampler(random_state=42,replacement=False)
X_resampled,Y_resampled = rus.fit_sample(car_features,car_label)
print('Resampled dataset shape {}'.format(Counter(Y_resampled)))

#convert np.array to dataframe 
df_features = pd.DataFrame(X_resampled)
df_lables = pd.DataFrame(Y_resampled)



Random forest classifier:
Resampled dataset shape Counter({0: 923, 1: 923})


In [57]:
#add the both the labels and features
df_features['FraudFound']= df_lables

#shuffle the featrues
new_combine_features = df_features.set_index(np.random.permutation(df_features.index))


new_label = new_combine_features['FraudFound']

#drop the FraudFound attribute
new_combine_features.drop(['FraudFound'],inplace=True,axis=1)

scoring = {'accuracy' : make_scorer(accuracy_score), 
           'precision' : make_scorer(precision_score),
           'recall' : make_scorer(recall_score), 
           'f1_score' : make_scorer(f1_score)}

kfold = model_selection.KFold(n_splits=10, random_state=10)
model=RandomForestClassifier(n_estimators=100) 

results = model_selection.cross_validate(estimator=model,X=new_combine_features,y=new_label,cv=kfold,scoring=scoring)

print(np.mean(results['test_accuracy']))
print(np.mean(results['test_precision']))
print(np.mean(results['test_recall']))
print(np.mean(results['test_f1_score']))



  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


0.690846063455
0.503278688525
0.488683901293
0.443461683387
