
1. SMOTE
2. Dummy Variables
3. Train test split 


In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn import tree
from sklearn import metrics

In [2]:
# Load the raw fraud-check dataset from the working directory.
fc_df = pd.read_csv(filepath_or_buffer="Fraud_check.csv")

In [3]:
# Peek at the first five raw rows.
fc_df.head(n=5)

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO


In [4]:
# Data Pre-processing

In [5]:
# Swap the dots in the column names for underscores so the columns are
# valid Python identifiers (attribute access such as fc_df.Taxable_Income).
dotted_to_snake = {
    'Marital.Status': 'Marital_Status',
    'Taxable.Income': 'Taxable_Income',
    'City.Population': 'City_Population',
    'Work.Experience': 'Work_Experience',
}
fc_df = fc_df.rename(columns=dotted_to_snake)

In [6]:
# Converting categorical data to numerical
# The same encoder object is re-fitted for every column, so after the loop it
# only remembers the mapping of the last column ('Urban').
label_encoder = preprocessing.LabelEncoder()
for col in ('Undergrad', 'Marital_Status', 'Urban'):
    fc_df[col] = label_encoder.fit_transform(fc_df[col])

In [7]:
# Check the integer-encoded categorical columns.
fc_df.head(n=5)

Unnamed: 0,Undergrad,Marital_Status,Taxable_Income,City_Population,Work_Experience,Urban
0,0,2,68833,50047,10,1
1,1,0,33700,134075,18,1
2,0,1,36925,160205,30,1
3,1,2,50190,193264,15,1
4,0,1,81002,27533,28,0


In [8]:
# Creating dummy variables
# One indicator column per encoded level; the source columns are replaced by
# <column>_<code> columns appended after the untouched numeric columns.
categorical_cols = ['Undergrad', 'Marital_Status', 'Urban']
fc_df = pd.get_dummies(fc_df, columns=categorical_cols)

In [9]:
# Check the one-hot encoded frame.
fc_df.head(n=5)

Unnamed: 0,Taxable_Income,City_Population,Work_Experience,Undergrad_0,Undergrad_1,Marital_Status_0,Marital_Status_1,Marital_Status_2,Urban_0,Urban_1
0,68833,50047,10,1,0,0,0,1,0,1
1,33700,134075,18,0,1,1,0,0,0,1
2,36925,160205,30,1,0,0,1,0,0,1
3,50190,193264,15,0,1,0,0,1,0,1
4,81002,27533,28,1,0,0,1,0,1,0


In [None]:
# Converting Taxable income
# Bin the numeric income into the two target classes:
#   income <= 30000 -> 'Risky',  income > 30000 -> 'Good'.
# The outer edges are open-ended (-inf / +inf) instead of 0 / 100000: pd.cut
# returns NaN for values outside the outermost edges, and a NaN in the target
# would break the resampling and model fitting further down. Values inside
# (0, 100000] are binned exactly as before.
fc_df['Taxable_Income'] = pd.cut(fc_df['Taxable_Income'],
                                 bins=[-np.inf, 30000, np.inf],
                                 labels=['Risky', 'Good'])

In [None]:
# Check that Taxable_Income now holds the class labels.
fc_df.head(n=5)

In [None]:
# Class balance of the target before any resampling.
fc_df.Taxable_Income.value_counts()

In [None]:
# The classes are imbalanced, so we oversample the minority class with SMOTE.

# SMOTE 

In [None]:
#pip install imbalanced-learn

In [None]:
import imblearn 

from imblearn.over_sampling import SMOTE


In [None]:
# Separate predictors from the target by column NAME rather than by position,
# so a change in column order (e.g. an extra index column in the CSV) cannot
# silently turn a different column into the target.
y = fc_df['Taxable_Income']
x = fc_df.drop(columns='Taxable_Income')

In [None]:
# Oversample with SMOTE (fixed seed) over the complete x / y.
# A SMOTENC variant with explicit categorical column indices was considered
# by the author but is not used here.
# NOTE(review): x_sm / y_sm are resampled from the FULL dataset; a hold-out
# set drawn from them shares synthetic neighbours with the training rows, so
# resample only the training fold when measuring model performance.
sm = SMOTE(random_state=100)
resampled = sm.fit_resample(x, y)
x_sm, y_sm = resampled

In [None]:
# Class balance after SMOTE.
y_sm.value_counts(normalize=False)

In [None]:
# Train-test split

In [None]:
# Split the ORIGINAL (un-resampled) data first, then oversample the training
# fold only. Splitting after SMOTE leaks information: synthetic points are
# interpolated between real rows, so rows that end up in the test set would
# have synthetic near-copies in the training set (and the test set itself
# would contain synthetic rows), inflating every reported metric.
# stratify=y keeps the original class ratio in the small hold-out set.
x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    train_size=0.8,
    shuffle=True,
    random_state=40,
    stratify=y,
)
# Balance the classes in the training data only; x_test / y_test stay real.
x_train, y_train = SMOTE(random_state=100).fit_resample(x_train, y_train)

In [None]:
# Class balance of the hold-out labels.
y_test.value_counts(normalize=False)

# M1 : Decision Tree using gini

In [None]:
# Decision tree split on Gini impurity: depth capped at 5, a node needs at
# least 10 samples to be split, fixed seed for repeatable results.
gini_params = dict(criterion='gini',
                   max_depth=5,
                   min_samples_split=10,
                   random_state=40)
m_dt_gini = DecisionTreeClassifier(**gini_params)
m_dt_gini.fit(x_train, y_train)
                                    

In [None]:
# Draw the fitted Gini tree; the trailing ';' hides the list of artists.
tree.plot_tree(decision_tree=m_dt_gini);

In [None]:
# Disabled draft of a labelled tree plot, kept as a bare string literal so it
# is never executed (the cell only evaluates to the string itself).
# NOTE(review): `fn` lists 5 pre-encoding column names, but the model is fit on
# the dummy-encoded predictors (9 columns per the head() output above), so
# feature_names would not line up with the tree. Presumably class_names must
# also follow the classifier's own class order -- verify both before re-enabling.
'''fn=['Undergrad','Marital_Status','City_Population','Work_Experience','Urban']
cn=['Risky','Good']
fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (4,4), dpi=300)
tree.plot_tree(m_dt_gini,
               feature_names = fn, 
               class_names=cn,
               filled = True);
               '''

In [None]:
# Predicting on test data
preds_gini = m_dt_gini.predict(x_test)
# How many hold-out rows the Gini tree assigns to each class.
pd.Series(data=preds_gini).value_counts()

In [None]:
# Per-class precision / recall / F1 of the Gini tree on the hold-out set.
gini_report = classification_report(y_true=y_test, y_pred=preds_gini)
print(gini_report)

# M2 : Decision Tree using Entropy

In [None]:
# Decision tree split on entropy (information gain): depth capped at 6, a node
# needs at least 10 samples to be split, fixed seed for repeatable results.
entropy_params = dict(criterion='entropy',
                      max_depth=6,
                      min_samples_split=10,
                      random_state=40)
m_dt_entropy = DecisionTreeClassifier(**entropy_params)
m_dt_entropy.fit(x_train, y_train)

In [None]:
# Predicting on test data
preds_entropy = m_dt_entropy.predict(x_test)
# How many hold-out rows the entropy tree assigns to each class.
pd.Series(data=preds_entropy).value_counts()

In [None]:
# Per-class precision / recall / F1 of the entropy tree on the hold-out set.
entropy_report = classification_report(y_true=y_test, y_pred=preds_entropy)
print(entropy_report)

# M3:  Bagging

In [None]:
from sklearn.ensemble import BaggingClassifier

In [None]:
# Ensemble sizes to sweep: start, (exclusive) stop and increment for range().
min_trees, max_trees, step_size = 50, 500, 50

In [None]:
# Sweep the bagging ensemble size and record hold-out accuracy and F1 for
# every setting; acc_list / f1_list end up with one entry per ensemble size.
acc_list = []
f1_list = []
cart = DecisionTreeClassifier()
# range() excludes its stop value, so max_trees itself is never tried; the
# plotting cells rebuild the same range for their x axis.
for num_trees in range(min_trees, max_trees, step_size):
    # The base learner is passed positionally: its keyword was renamed from
    # `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
    # removed in 1.4, so the positional form runs on old and new releases.
    m_dt_bagging = BaggingClassifier(cart,
                                     n_estimators=num_trees,
                                     max_samples=0.75,
                                     random_state=8)
    m_dt_bagging.fit(x_train, y_train)
    preds_bagging = m_dt_bagging.predict(x_test)
    acc_list.append(metrics.accuracy_score(y_test, preds_bagging))
    # F1 is computed with "Good" as the positive class.
    f1_list.append(metrics.f1_score(y_test, preds_bagging, pos_label="Good"))
  

In [None]:
# Hold-out accuracy of the bagging ensemble versus number of trees.
tree_counts = range(min_trees, max_trees, step_size)
sns.lineplot(x=tree_counts, y=acc_list)

In [None]:
# Hold-out F1 ("Good" as positive class) of the bagging ensemble versus number of trees.
tree_counts = range(min_trees, max_trees, step_size)
sns.lineplot(x=tree_counts, y=f1_list)

# M4:  Random Forest 

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Forest sizes to sweep: start, (exclusive) stop and increment for range().
min_trees, max_trees, step_size = 50, 500, 50
# Column count of fc_df, which still contains the target column; it is used
# as the exclusive upper bound of the max_features sweep below.
max_featu = fc_df.shape[1]

In [None]:
# Grid search over forest size (outer loop) and max_features (inner loop).
# acc_scores / f1_scores hold one inner list per forest size, with one score
# per max_features value in that list.
acc_scores, f1_scores = [], []

for num_trees in range(min_trees, max_trees, step_size):
    c_acc, c_f1 = [], []

    for max_features in range(2, max_featu):
        m_dt_randomForest = RandomForestClassifier(
            n_estimators=num_trees,
            max_features=max_features,
            random_state=8,
        )
        m_dt_randomForest.fit(x_train, y_train)
        preds_rf = m_dt_randomForest.predict(x_test)
        c_acc += [metrics.accuracy_score(y_test, preds_rf)]
        # F1 is computed with 'Good' as the positive class.
        c_f1 += [metrics.f1_score(y_test, preds_rf, pos_label='Good')]

    acc_scores += [c_acc]
    f1_scores += [c_f1]
   

In [None]:
# For every forest size, report the best accuracy reached over all
# max_features settings.
trees = min_trees
for per_size_acc in acc_scores:
    print("Max accuracy for :" , trees," is ",max(per_size_acc))
    trees = trees + step_size
        

In [None]:
#for i in range(len(acc_scores[0])):
#    sns.lineplot(x=range(2,max_features+1),y=acc_scores[i])