In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
dataset=pd.read_csv('fraudDetection.csv', encoding= 'unicode_escape', engine='python')
print(dataset.shape)

(14, 6)


In [3]:
X=dataset.iloc[ : , 1:5].values
y=dataset.iloc[ : , -1].values

In [4]:
X

array([['Male ', 'Middle age', 'Unmarried', 'HSC'],
       ['Female', 'Youth', 'Married', 'UG'],
       ['Male ', 'Middle age', 'Divorced', 'PG'],
       ['Female', 'Youth', 'Unmarried', 'HSC'],
       ['Male', 'Senior', 'Married', 'PG'],
       ['Female', 'Youth', 'Divorced', 'UG'],
       ['Male', 'Youth', 'Divorced', 'HSC'],
       ['Female', 'Middle age', 'Married', 'HSC'],
       ['Male', 'Senior', 'Married', 'UG'],
       ['Male', 'Youth', 'Unmarried', 'PG'],
       ['Female', 'Youth', 'Unmarried', 'UG'],
       ['Male', 'Youth', 'Unmarried', 'PG'],
       ['Male', 'Senior', 'Married', 'UG'],
       ['Female', 'Middle age', 'Married', 'HSC']], dtype=object)

In [5]:
y

array(['YES', 'YES', 'NO', 'NO', 'YES', 'NO', 'NO', 'YES', 'YES', 'YES',
       'NO', 'YES', 'YES', 'YES'], dtype=object)

In [6]:
dataset.head(10)

Unnamed: 0,S. No,Gender,Age,Marital Status,Education,Fraud
0,1,Male,Middle age,Unmarried,HSC,YES
1,2,Female,Youth,Married,UG,YES
2,3,Male,Middle age,Divorced,PG,NO
3,4,Female,Youth,Unmarried,HSC,NO
4,5,Male,Senior,Married,PG,YES
5,6,Female,Youth,Divorced,UG,NO
6,7,Male,Youth,Divorced,HSC,NO
7,8,Female,Middle age,Married,HSC,YES
8,9,Male,Senior,Married,UG,YES
9,10,Male,Youth,Unmarried,PG,YES


In [7]:
# DATA PREPROCESSING 

# 1. checking and handling missing values
print("No. of missing values in data:",dataset.isnull().sum()) #gives the number of np.NaN values in each column
#checks for missing values in the dataset--if any, replace with mean of the column

No. of missing values in data: S. No             0
Gender            0
Age               0
Marital Status    0
Education         0
Fraud             0
dtype: int64


In [8]:
# ordinal encode the  dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
# ordinal encode input variables
ordinal_encoder = OrdinalEncoder()
X = ordinal_encoder.fit_transform(X)
# ordinal encode target variable
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)
# summarize the transformed data
print('Input', X.shape)
print(X[:5, :])
print('Output', y.shape)
print(y[:5])

Input (14, 4)
[[2. 0. 2. 0.]
 [0. 2. 1. 2.]
 [2. 0. 0. 1.]
 [0. 2. 2. 0.]
 [1. 1. 1. 1.]]
Output (14,)
[1 1 0 0 1]


In [9]:
#SPLITTING DATASET INTO TRAINING AND TEST SET
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test= train_test_split(X,y,test_size=0.2,random_state=0)

In [10]:
# APPLYING DECISION TREE CLASSIFICATION MODEL
# import DecisionTreeClassifier from sklearn
from sklearn.tree import DecisionTreeClassifier
classifier=DecisionTreeClassifier( criterion='entropy')  #entropy is the measure of randomness and we split in such a way to bring order into the classifications after split and thus reduce entropy
classifier.fit(X_train,y_train)  

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [11]:
# PREDICTING TEST SET RESULTS
# prediction with sklearn
y_pred=classifier.predict(X_test)

In [14]:
# COMPARING PREDICTED OUTPUT VALUES y_pred WITH ACTUAL TARGET TESTING SET VALUES y_test
dataset = pd.DataFrame({'Actual': y_test.flatten(), 'Predicted': y_pred.flatten()})
print(dataset.head(6))
print(dataset.tail(2))

   Actual  Predicted
0       1          1
1       0          0
2       1          1
   Actual  Predicted
1       0          0
2       1          1


In [13]:
from sklearn.metrics import confusion_matrix
cm= confusion_matrix(y_test,y_pred)
''' [ TP FN
      FP TN]'''
print("Confusion Matrix:",cm)
print('Training score:',classifier.score(X_train,y_train)*100)
print('Testing score:',classifier.score(X_test,y_test)*100)
print("Importance of each feature:" ,classifier.feature_importances_)

Confusion Matrix: [[1 0]
 [0 2]]
Training score: 100.0
Testing score: 100.0
Importance of each feature: [0.20186621 0.19448533 0.60364846 0.        ]
