# Random Forest

Fraud Check dataset

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot

In [3]:
fraud = pd.read_csv("Fraud_check.csv")
fraud

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


In [6]:
#Converting the Taxable income variable to bucketing. 
fraud["income"]="<=30000"
fraud.loc[fraud["Taxable.Income"]>=30000,"income"]="Good"
fraud.loc[fraud["Taxable.Income"]<=30000,"income"]="Risky"

In [7]:
#Droping the Taxable income variable
fraud.drop(["Taxable.Income"],axis=1,inplace=True)

In [8]:
fraud.rename(columns={"Undergrad":"undergrad","Marital.Status":"marital","City.Population":"population","Work.Experience":"experience","Urban":"urban"},inplace=True)

In [9]:
from sklearn import preprocessing
le=preprocessing.LabelEncoder()
for column_name in fraud.columns:
    if fraud[column_name].dtype == object:
        fraud[column_name] = le.fit_transform(fraud[column_name])
    else:
        pass

In [10]:
##Splitting the data into featuers and labels
features = fraud.iloc[:,0:5]
labels = fraud.iloc[:,5]


In [11]:
colnames = list(fraud.columns)

In [12]:
predictors = colnames[0:5]

In [13]:
target = colnames[5]

In [14]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(features,labels,test_size = 0.2,stratify = labels)

In [15]:
#building model
from sklearn.ensemble import RandomForestClassifier as RF
model = RF(n_jobs = 3,n_estimators = 15, oob_score = True, criterion = "entropy")
model.fit(x_train,y_train)

RandomForestClassifier(criterion='entropy', n_estimators=15, n_jobs=3,
                       oob_score=True)

In [16]:
model.estimators_

[DecisionTreeClassifier(criterion='entropy', max_features='auto',
                        random_state=1089163367),
 DecisionTreeClassifier(criterion='entropy', max_features='auto',
                        random_state=973999099),
 DecisionTreeClassifier(criterion='entropy', max_features='auto',
                        random_state=992678292),
 DecisionTreeClassifier(criterion='entropy', max_features='auto',
                        random_state=724729204),
 DecisionTreeClassifier(criterion='entropy', max_features='auto',
                        random_state=1392352681),
 DecisionTreeClassifier(criterion='entropy', max_features='auto',
                        random_state=2038380548),
 DecisionTreeClassifier(criterion='entropy', max_features='auto',
                        random_state=1278492257),
 DecisionTreeClassifier(criterion='entropy', max_features='auto',
                        random_state=1844597614),
 DecisionTreeClassifier(criterion='entropy', max_features='auto',
         

In [18]:
model.classes_

array([0, 1])

In [19]:
model.n_features_

5

In [20]:
model.n_classes_

2

In [22]:
model.n_outputs_

1

In [23]:
model.oob_score_

0.7291666666666666

In [24]:
prediction = model.predict(x_train)

In [26]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_train,prediction)
accuracy

0.9854166666666667

In [27]:
np.mean(prediction == y_train)

0.9854166666666667

In [28]:
#Confusion matrix
from sklearn.metrics import confusion_matrix
confusion = confusion_matrix(y_train,prediction)

In [29]:
#Prediction on test data
pred_test = model.predict(x_test)
pred_test

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0])

In [30]:
#Accuracy
acc_test =accuracy_score(y_test,pred_test)
acc_test

0.725

In [32]:
from sklearn.tree import export_graphviz
import pydotplus
from six import StringIO

In [33]:
tree = model.estimators_[5]
tree

DecisionTreeClassifier(criterion='entropy', max_features='auto',
                       random_state=2038380548)

In [34]:
dot_data = StringIO()
export_graphviz(tree,out_file = dot_data, filled = True,rounded = True, feature_names = predictors ,class_names = target,impurity =False)

In [38]:
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph

<pydotplus.graphviz.Dot at 0x1f1255d95e0>