In [1]:
# Bagged Decision Trees for Classification
import pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import preprocessing

In [2]:
df=pd.read_csv("Fraud_Check.csv")
df

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


In [3]:
df['Target']=pd.cut(df['Taxable.Income'],bins=[10000,30000,99602],labels=['Risky','Good'])
df

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban,Target
0,NO,Single,68833,50047,10,YES,Good
1,YES,Divorced,33700,134075,18,YES,Good
2,NO,Married,36925,160205,30,YES,Good
3,YES,Single,50190,193264,15,YES,Good
4,NO,Married,81002,27533,28,NO,Good
...,...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES,Good
596,YES,Divorced,69967,55369,2,YES,Good
597,NO,Divorced,47334,154058,0,YES,Good
598,YES,Married,98592,180083,17,NO,Good


In [4]:
df['Target'].value_counts()

Good     475
Risky    124
Name: Target, dtype: int64

# Labbel Encoding

In [5]:
label_encoder = preprocessing.LabelEncoder()
df['Urban']= label_encoder.fit_transform(df['Urban'],)
df['Undergrad']= label_encoder.fit_transform(df['Undergrad'],)
df['Marital.Status']= label_encoder.fit_transform(df['Marital.Status'],)
df['Target']=label_encoder.fit_transform(df['Target'])

In [6]:
df

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban,Target
0,0,2,68833,50047,10,1,0
1,1,0,33700,134075,18,1,0
2,0,1,36925,160205,30,1,0
3,1,2,50190,193264,15,1,0
4,0,1,81002,27533,28,0,0
...,...,...,...,...,...,...,...
595,1,0,76340,39492,7,1,0
596,1,0,69967,55369,2,1,0
597,0,0,47334,154058,0,1,0
598,1,1,98592,180083,17,0,0


In [7]:
x=df.iloc[:,0:6]
y=df.iloc[:,6]

In [8]:
x.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,0,2,68833,50047,10,1
1,1,0,33700,134075,18,1
2,0,1,36925,160205,30,1
3,1,2,50190,193264,15,1
4,0,1,81002,27533,28,0


In [9]:
y

0      0
1      0
2      0
3      0
4      0
      ..
595    0
596    0
597    0
598    0
599    0
Name: Target, Length: 600, dtype: int32

In [10]:
# As the seed no. increases accuracy of  the model increases
seed=10
kfold= KFold(n_splits=10,random_state=seed,shuffle=True)

In [11]:
cart=DecisionTreeClassifier()
num_trees=40
model=BaggingClassifier(base_estimator=cart,
                       n_estimators=num_trees,
                       random_state=seed)

In [12]:
results=cross_val_score(model,x,y,cv=kfold)
results

array([0.98333333, 1.        , 1.        , 1.        , 1.        ,
       1.        , 0.98333333, 1.        , 1.        , 1.        ])

In [13]:
print(results.mean())

0.9966666666666667


# Random Forest Classification

In [14]:
num_trees=40
model=RandomForestClassifier(n_estimators=num_trees,
                            max_depth=3)

In [15]:
results=cross_val_score(model,x,y,cv=kfold)
results

array([0.98333333, 1.        , 1.        , 1.        , 1.        ,
       1.        , 0.98333333, 1.        , 1.        , 1.        ])

In [16]:
print(results.mean())

0.9966666666666667


# Adaboost Classification

In [17]:
num_trees=40
seed=8
model1=AdaBoostClassifier(n_estimators=num_trees,
                         random_state=seed)

In [18]:
result=cross_val_score(model1,x,y,cv=kfold)

In [19]:
results

array([0.98333333, 1.        , 1.        , 1.        , 1.        ,
       1.        , 0.98333333, 1.        , 1.        , 1.        ])

In [20]:
print(results.mean())

0.9966666666666667


# Stacking Ensemble for Classification

In [21]:
# create the sub models
estimators = []

model1 = LogisticRegression(max_iter=500)
estimators.append(('logistic', model1))

model2 = DecisionTreeClassifier()
estimators.append(('cart', model2))

model3 = SVC()
estimators.append(('svm', model3))

# create the ensemble model
ensemble = VotingClassifier(estimators)

In [22]:
estimators

[('logistic', LogisticRegression(max_iter=500)),
 ('cart', DecisionTreeClassifier()),
 ('svm', SVC())]

In [23]:
results = cross_val_score(ensemble, x, y, cv = kfold)
print(results.mean())

0.9833333333333334


# 0.98