# Random Forest(Fraud Check)

Use Random Forest to prepare a model on fraud data 
treating those who have taxable_income <= 30000 as "Risky" and others are "Good"

In [77]:
#importing necessary libraries
import pandas as pd
import matplotlib.pyplot as plt

In [78]:
df = pd.read_csv("Fraud_check.csv")

In [79]:
#Viewing top 5 rows of dataframe
df.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO


In [80]:
df.shape

(600, 6)

In [81]:
df.isnull().sum()

Undergrad          0
Marital.Status     0
Taxable.Income     0
City.Population    0
Work.Experience    0
Urban              0
dtype: int64

In [82]:
df['Undergrad'] = df['Undergrad'].apply(lambda x: 1 if x=='YES' else 0)
df['Urban'] = df['Urban'].apply(lambda x: 1 if x=='YES' else 0)
df['Taxable.Income'] = df['Taxable.Income'].apply(lambda x: 'Risky' if x>30000  else 'Good')

In [83]:
df['Marital.Status'].value_counts()

Single      217
Married     194
Divorced    189
Name: Marital.Status, dtype: int64

In [84]:
df['Marital.Status'] = df['Marital.Status'].map({'Single':1, 'Married':2, 'Divorced':3})

In [85]:
df.head()

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,0,1,Risky,50047,10,1
1,1,3,Risky,134075,18,1
2,0,2,Risky,160205,30,1
3,1,1,Risky,193264,15,1
4,0,2,Risky,27533,28,0


In [86]:
df['Marital.Status'].value_counts()

1    217
2    194
3    189
Name: Marital.Status, dtype: int64

In [87]:
import seaborn as sns

In [88]:
## riskey = 0 , good = 1
df['Taxable.Income'] = df['Taxable.Income'].apply(lambda x: 0 if x== 'Risky'  else 1)

In [89]:
## Train test the data
train = df.drop('Taxable.Income',axis=1)
test = df['Taxable.Income']

In [90]:
## train test split
from sklearn.model_selection import train_test_split

In [91]:
X_train, X_test, y_train, y_test = train_test_split(train, test, test_size = 0.3,random_state = 1)

# Standardize the Data

In [92]:
from sklearn.preprocessing import StandardScaler

In [93]:
scaler = StandardScaler()

In [94]:
col = X_train.columns
X_train = scaler.fit_transform(X_train)
X_test = scaler.fit_transform(X_test)

In [95]:
X_train = pd.DataFrame(X_train, columns=col)
X_test = pd.DataFrame(X_test, columns=col)

In [96]:
X_train.shape

(420, 5)

In [97]:
X_test.shape

(180, 5)

# Build random forest

In [98]:
from sklearn.ensemble import RandomForestClassifier

In [99]:
df.head(2)

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,0,1,0,50047,10,1
1,1,3,0,134075,18,1


In [100]:
rf_model = RandomForestClassifier(n_estimators= 100)

In [101]:
rf_model.fit(X_train, y_train)

RandomForestClassifier()

In [102]:
rf_model_pred = rf_model.predict(X_test)

In [103]:
rf_model_actual_predict = pd.DataFrame({'Actual': y_test, 'Predict': rf_model_pred})

In [104]:
rf_model_actual_predict[20:40]

Unnamed: 0,Actual,Predict
13,0,0
510,0,0
268,0,0
358,0,0
289,0,1
483,1,0
429,0,0
47,0,0
223,0,0
185,0,0


In [105]:
rf_model.score(X_test, y_test)*100

78.33333333333333

# Apply Baging

In [106]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [107]:
bg = BaggingClassifier(base_estimator= DecisionTreeClassifier(), max_samples = 0.5, max_features =1.0, n_estimators = 100, random_state=8)

In [108]:
bg.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=0.5,
                  n_estimators=100, random_state=8)

In [109]:
bg_prid = bg.predict(X_test)

In [110]:
bg_actual_prid = pd.DataFrame({'Actual':y_test, 'Pridict': bg_prid})

In [111]:
bg_actual_prid[:10]

Unnamed: 0,Actual,Pridict
446,1,0
404,0,0
509,1,0
455,0,0
201,0,0
23,0,0
415,0,0
548,0,0
66,0,0
165,0,0


In [112]:
bg.score(X_test, y_test)

0.7888888888888889

# Ada Boosting

In [113]:
from sklearn.ensemble import AdaBoostClassifier

In [114]:
boost = AdaBoostClassifier(base_estimator= DecisionTreeClassifier(), n_estimators=100, learning_rate=1)

In [115]:
boost.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(), learning_rate=1,
                   n_estimators=100)

In [116]:
boost_pred = boost.predict(X_test)

In [117]:
boost_actual_predict = pd.DataFrame({"Actual": y_test, 'Predict': boost_pred})

In [118]:
boost_actual_predict[:10]

Unnamed: 0,Actual,Predict
446,1,1
404,0,1
509,1,0
455,0,0
201,0,0
23,0,0
415,0,1
548,0,1
66,0,0
165,0,1


In [119]:
boost.score(X_test, y_test)

0.6388888888888888

# Build the Random Forest model using kfold

In [120]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier

In [121]:
kfold = KFold(n_splits=10)

In [122]:
num_trees = 100

In [123]:
max_feature = 3

In [124]:
rfc = RandomForestClassifier(n_estimators=num_trees, max_features= max_feature)

In [125]:
rfc = cross_val_score(rfc, train , test, cv=kfold)

In [126]:
rfc

array([0.78333333, 0.76666667, 0.68333333, 0.71666667, 0.76666667,
       0.63333333, 0.85      , 0.7       , 0.73333333, 0.85      ])

In [127]:
rfc.mean()*100

74.83333333333333

In [128]:
rfc.std()*100

6.601767440112787