Import Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,accuracy_score

Load The Data Set

In [2]:
data = pd.read_csv("Fraud_check.csv")

In [3]:
data.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO


In [4]:
conditions = [(data["Taxable.Income"]<=30000),(data["Taxable.Income"]>30000)]
categories = ("Risky","Good")

In [5]:
data["Taxable.Income"] = np.select(conditions,categories)

In [6]:
data.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,Good,50047,10,YES
1,YES,Divorced,Good,134075,18,YES
2,NO,Married,Good,160205,30,YES
3,YES,Single,Good,193264,15,YES
4,NO,Married,Good,27533,28,NO


In [7]:
data.shape

(600, 6)

Missing values

In [8]:
data.isnull().sum()

Undergrad          0
Marital.Status     0
Taxable.Income     0
City.Population    0
Work.Experience    0
Urban              0
dtype: int64

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Undergrad        600 non-null    object
 1   Marital.Status   600 non-null    object
 2   Taxable.Income   600 non-null    object
 3   City.Population  600 non-null    int64 
 4   Work.Experience  600 non-null    int64 
 5   Urban            600 non-null    object
dtypes: int64(2), object(4)
memory usage: 28.3+ KB


In [10]:
data.describe()

Unnamed: 0,City.Population,Work.Experience
count,600.0,600.0
mean,108747.368333,15.558333
std,49850.075134,8.842147
min,25779.0,0.0
25%,66966.75,8.0
50%,106493.5,15.0
75%,150114.25,24.0
max,199778.0,30.0


Handelling the Categorical Columns

Label Encoded

In [11]:
label_encode = LabelEncoder()

In [12]:
data.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,Good,50047,10,YES
1,YES,Divorced,Good,134075,18,YES
2,NO,Married,Good,160205,30,YES
3,YES,Single,Good,193264,15,YES
4,NO,Married,Good,27533,28,NO


In [13]:
data["Undergrad"] = label_encode.fit_transform(data["Undergrad"])
data["Marital.Status"] = label_encode.fit_transform(data["Marital.Status"])
data["Taxable.Income"] = label_encode.fit_transform(data["Taxable.Income"])
data["Urban"]  = label_encode.fit_transform(data["Urban"])

In [14]:
data.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,0,2,0,50047,10,1
1,1,0,0,134075,18,1
2,0,1,0,160205,30,1
3,1,2,0,193264,15,1
4,0,1,0,27533,28,0


Spliting the dataset into Features and Target

In [15]:
x = data.drop(columns="Taxable.Income",axis=1)
y = data["Taxable.Income"]

In [16]:
x.head()

Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban
0,0,2,50047,10,1
1,1,0,134075,18,1
2,0,1,160205,30,1
3,1,2,193264,15,1
4,0,1,27533,28,0


In [17]:
y.head()

0    0
1    0
2    0
3    0
4    0
Name: Taxable.Income, dtype: int32

Spliting the dataset into train test split

In [18]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [19]:
print(x.shape,x_train.shape,x_test.shape)

(600, 5) (480, 5) (120, 5)


In [20]:
model = RandomForestClassifier(criterion="gini")
print("Model is Loaded")

Model is Loaded


In [21]:
model_train = model.fit(x_train,y_train)
print("Model is Trained")

Model is Trained


In [22]:
pred = model_train.predict(x_test)

In [23]:
pred

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [24]:
acc = accuracy_score(pred,y_test)
acc

0.7416666666666667

In [25]:
cf = confusion_matrix(pred,y_test)
cf

array([[89, 26],
       [ 5,  0]], dtype=int64)