In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier


In [2]:
# Read the fraud-check dataset into a DataFrame.
# NOTE(review): this is an absolute path on one specific Windows machine;
# point it at your own copy of Fraud_check.csv before running elsewhere.
data = pd.read_csv(
    "C:\\Users\\DELL\\Downloads\\Fraud_check.csv",
)
# Preview the first five rows.
data.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO


In [3]:
from sklearn import preprocessing

# A single LabelEncoder is re-fitted on each categorical column in turn,
# replacing the text labels with integer codes. After the loop the encoder
# is left fitted on 'Urban', the last column processed.
label_encoder = preprocessing.LabelEncoder()
for column in ('Undergrad', 'Marital.Status', 'Urban'):
    data[column] = label_encoder.fit_transform(data[column])

In [4]:
# Give the income column a name without a dot so it can be accessed as an
# attribute (data.Tax_Income) in the binning step.
data.rename(columns={'Taxable.Income': "Tax_Income"}, inplace=True)
data.head()

Unnamed: 0,Undergrad,Marital.Status,Tax_Income,City.Population,Work.Experience,Urban
0,0,2,68833,50047,10,1
1,1,0,33700,134075,18,1
2,0,1,36925,160205,30,1
3,1,2,50190,193264,15,1
4,0,1,81002,27533,28,0


In [5]:
# Bucket taxable income into the two target classes: income up to and
# including 30000 is "risky", anything above that is "Not_risky".
#
# The lowest edge is made inclusive and the top edge open-ended so that an
# income of exactly 0, or one above 100000, still receives a label. With
# the closed bins [0, 30000, 100000] such rows became NaN, and the later
# astype(str) step would silently turn them into a third, bogus "nan" class.
bins=[0,30000,float("inf")]
name=["risky","Not_risky"]
data["Tax"]=pd.cut(data.Tax_Income,bins,labels=name,include_lowest=True)
data.head()

Unnamed: 0,Undergrad,Marital.Status,Tax_Income,City.Population,Work.Experience,Urban,Tax
0,0,2,68833,50047,10,1,Not_risky
1,1,0,33700,134075,18,1,Not_risky
2,0,1,36925,160205,30,1,Not_risky
3,1,2,50190,193264,15,1,Not_risky
4,0,1,81002,27533,28,0,Not_risky


In [6]:
# The raw income must not remain as a feature: the target was derived
# directly from it.
data.drop(columns="Tax_Income", inplace=True)

# Convert the categorical target to plain strings, then to integer codes.
# LabelEncoder numbers classes in sorted order, so "Not_risky" becomes 0
# (as the preview below shows) and "risky" becomes 1.
tax_as_text = data['Tax'].astype(str)
data['Tax'] = label_encoder.fit_transform(tax_as_text)
data.head()

Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban,Tax
0,0,2,50047,10,1,0
1,1,0,134075,18,1,0
2,0,1,160205,30,1,0
3,1,2,193264,15,1,0
4,0,1,27533,28,0,0


In [7]:
# Positional split: the first five columns are the features, the sixth
# column (Tax) is the target.
X, Y = data.iloc[:, :5], data.iloc[:, 5]

In [8]:
# Display the feature matrix for a visual sanity check.
X

Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban
0,0,2,50047,10,1
1,1,0,134075,18,1
2,0,1,160205,30,1
3,1,2,193264,15,1
4,0,1,27533,28,0
...,...,...,...,...,...
595,1,0,39492,7,1
596,1,0,55369,2,1
597,0,0,154058,0,1
598,1,1,180083,17,0


In [9]:
# Display the encoded target column for a visual sanity check.
Y

0      0
1      0
2      0
3      0
4      0
      ..
595    0
596    0
597    0
598    0
599    0
Name: Tax, Length: 600, dtype: int32

# RANDOM FOREST 

In [10]:
# Mean 10-fold cross-validated accuracy of a random forest.
num_trees = 100
max_features = 4

# No random_state here: the folds are not shuffled, so a seed has no effect,
# and recent scikit-learn releases raise ValueError when random_state is
# given together with shuffle=False. Plain KFold(n_splits=10) produces the
# same contiguous splits the original call relied on.
kfold = KFold(n_splits=10)

# Seed the forest itself so the reported score is reproducible on re-run.
model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features, random_state=7)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())



0.7433333333333333


### Random forest gives an accuracy of 74.33% with max_features=4

# RANDOM FOREST USING BAGGING ENSEMBLE TECHNIQUE

In [11]:
from sklearn.ensemble import BaggingClassifier
seed = 7

# No random_state on KFold: the folds are not shuffled, so a seed has no
# effect, and recent scikit-learn releases raise ValueError when it is
# combined with shuffle=False.
kfold = KFold(n_splits=10)

# Bag 100 random forests and report the mean 10-fold accuracy.
RF = RandomForestClassifier()
num_trees = 100
# The base estimator is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` across scikit-learn releases; the
# positional form works with both.
model = BaggingClassifier(RF, n_estimators=num_trees, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())



0.7833333333333334


### We get an improved accuracy of 78.33% by using the bagging ensemble technique

# ADABOOST CLASSIFICATION

In [12]:
from sklearn.ensemble import AdaBoostClassifier

# Rebuild X and Y as NumPy arrays: the first five columns are the features,
# the sixth column is the target.
array = data.values

X = array[:,0:5]
Y = array[:,5]

num_trees = 10
seed=7
# No random_state on KFold: the folds are not shuffled, so a seed has no
# effect, and recent scikit-learn releases raise ValueError when it is
# combined with shuffle=False.
kfold = KFold(n_splits=10)
model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold)
# Mean accuracy over the 10 folds.
print(results.mean())




0.7916666666666666


### AdaBoost gives the highest accuracy so far: 79.17%

# VOTING ENSEMBLE TECHNIQUE (VotingClassifier)

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

In [14]:
# Features are the first five columns, the target is the sixth.
array = data.values
X = array[:,0:5]
Y = array[:,5]

# No random_state on KFold: the folds are not shuffled, so a seed has no
# effect, and recent scikit-learn releases raise ValueError when it is
# combined with shuffle=False.
kfold = KFold(n_splits=10)

# create the sub models
# Each entry is a (name, estimator) pair, the format VotingClassifier expects.
estimators = []
model1 = LogisticRegression(max_iter=500)
estimators.append(('logistic', model1))
# Seed the forest so the ensemble's score is reproducible on re-run.
model2 = RandomForestClassifier(random_state=7)
estimators.append(('RF', model2))
model3 = SVC()
estimators.append(('svm', model3))





In [15]:
# Combine the sub-models into a voting classifier (the default is hard,
# i.e. majority, voting) and report its mean cross-validated accuracy.
ensemble = VotingClassifier(estimators=estimators)
results = cross_val_score(estimator=ensemble, X=X, y=Y, cv=kfold)
print(np.mean(results))

0.7933333333333332


### Voting ensemble (VotingClassifier) accuracy = 79.33%