In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE 
from sklearn.metrics import confusion_matrix

In [2]:
#Read the data into dataframe
car_df = pd.read_csv('newCardata.csv')
car_features = pd.read_csv('finalCarData.csv')


label_Number = LabelEncoder()
car_df['FraudFound'] = label_Number.fit_transform(car_df['FraudFound'].astype('str'))
car_label = car_df['FraudFound']


In [3]:
#split the data into train and test
print(car_features.shape,car_label.shape)

X_train,X_test,y_train,y_test = train_test_split(car_features,car_label,random_state=3,test_size=0.25)
print('xtrain:',type(X_train))

(15419, 97) (15419,)
xtrain: <class 'pandas.core.frame.DataFrame'>


In [4]:
#model object
model = DecisionTreeClassifier(random_state=0)


In [5]:
#train the model
model.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [6]:
#test the model and print the accuracy.
print('Decision classifier:')
predicted = model.predict(X_test)
print(type(predicted),predicted.shape,predicted)
print('Accuracy is ',round(accuracy_score(y_test,model.predict(X_test)) * 100,2))


Decision classifier:
<class 'numpy.ndarray'> (3855,) [0 0 0 ..., 0 0 0]
Accuracy is  88.46


In [7]:
# calculating specifity and sensitivity
# 0  := Negative
# 1 := Positive
cm = confusion_matrix(y_test,predicted)
print("Confusion Matrix:\n",cm)
TN, FP, FN, TP = cm.ravel()
print("TN:",TN)
print("FP:",FP)
print("FN:",FN)
print("TP:",TP)

print("Accuracy:",(TP+TN)/(TP+FP+FN+TN)*100)
print("Sensitivity:",TP/(TP+FN)*100)
print("Specificity:",TN/(TN+FP)*100)

Confusion Matrix:
 [[3370  239]
 [ 206   40]]
TN: 3370
FP: 239
FN: 206
TP: 40
Accuracy: 88.4565499351
Sensitivity: 16.2601626016
Specificity: 93.3776669438


In [8]:
#converting pandas.core.series.Series to numpy.ndarray
print(type(y_test),type(predicted))
ytest = pd.Series(y_test).values
print((ytest.shape[0]))

<class 'pandas.core.series.Series'> <class 'numpy.ndarray'>
3855


In [9]:
#find the index where both are 1.
count = 0
fault = 0
predictedfault = 0
for i in range(predicted.shape[0]):
    if((predicted[i] == 1) and (ytest[i] == 1)):
        count += 1
    if(ytest[i] == 1):
        fault += 1
    if(predicted[i]==1):
        predictedfault += 1
print("No of fraud cases:",fault)
print("No of fauld cases predicted by model:",predictedfault)
print("No of fraud cases correctly predicted  as fault:",count)

No of fraud cases: 246
No of fauld cases predicted by model: 279
No of fraud cases correctly predicted  as fault: 40


In [47]:
#get a decision path:
#decision_path(X, check_input=True):= Return the decision path in the tree
# X:= The input samples.
# check_input:= boolean, (default=True),Allow to bypass several input checking. 
#                                       Don’t use this parameter unless you know what you do.
#Returns:indicator : sparse csr array, shape = [n_samples, n_nodes]
# Return a node indicator matrix where non zero elements indicates that the samples goes through the nodes.

#take a sample from the test.
#s = X_test.iloc[1].reshape(-1,97)
#s = pd.DataFrame(sample) #converting np.ndarray to dataframe
#print(sample.shape)
#print(type(sample))
#s.astype('float32') #converting the type to float32
#print(type(sample))
#s = pd.Series(s).values
#print(type(sample))
#path[i] = model.decision_path(sample,check_input=False)

In [10]:
#with smote.
sm = SMOTE()
features,labels = sm.fit_sample(car_features,car_label)
print(features.shape,labels.shape)
X_train,X_test,y_train,y_test = train_test_split(features,labels,random_state=3,test_size=0.25)


(28992, 97) (28992,)


In [11]:
#model object
model = DecisionTreeClassifier(random_state=0)


In [12]:
#train the model
model.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=0,
            splitter='best')

In [13]:
#test the model and print the accuracy.
print('Decision classifier:')
predicted = model.predict(X_test)
print(type(predicted),predicted.shape,predicted)
print('Accuracy is ',round(accuracy_score(y_test,model.predict(X_test)) * 100,2))


Decision classifier:
<class 'numpy.ndarray'> (7248,) [1 1 0 ..., 0 0 1]
Accuracy is  93.56


In [14]:
# calculating specifity and sensitivity
# 0  := Negative
# 1 := Positive
cm = confusion_matrix(y_test,predicted)
print("Confusion Matrix:\n",cm)
TN, FP, FN, TP = cm.ravel()
print("TN:",TN)
print("FP:",FP)
print("FN:",FN)
print("TP:",TP)

print("Accuracy:",(TP+TN)/(TP+FP+FN+TN)*100)
print("Sensitivity:",TP/(TP+FN)*100)
print("Specificity:",TN/(TN+FP)*100)

Confusion Matrix:
 [[3351  272]
 [ 195 3430]]
TN: 3351
FP: 272
FN: 195
TP: 3430
Accuracy: 93.5568432671
Sensitivity: 94.6206896552
Specificity: 92.4924096053


In [15]:
#converting pandas.core.series.Series to numpy.ndarray
ytest = pd.Series(y_test).values
print((ytest.shape[0]))

7248


In [16]:
#find the index where both are 1.
count = 0
fault = 0
predictedfault = 0
for i in range(predicted.shape[0]):
    if((predicted[i] == 1) and (ytest[i] == 1)):
        count += 1
    if(ytest[i] == 1):
        fault += 1
    if(predicted[i]==1):
        predictedfault += 1
print("No of fraud cases:",fault)
print("No of fauld cases predicted by model:",predictedfault)
print("No of fraud cases correctly predicted  as fault:",count)

No of fraud cases: 3625
No of fauld cases predicted by model: 3702
No of fraud cases correctly predicted  as fault: 3430
