# FraudCheck with Risk Category

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import KFold, cross_val_score
from sklearn.ensemble import BaggingClassifier,RandomForestClassifier, AdaBoostClassifier, StackingClassifier

## problem statement

Use RandomForsest to prepare a model on fraud data treating those who have taxable_income <= 30000 as "Risky" and others are "Good"

## Data Description :

    Undergrad           :   person is under graduated or not
    Marital.Status      :   marital status of a person
    Taxable.Income      :   Taxable income is the amount of how much tax an individual owes to the government 
    Work Experience     :   Work experience of an individual person
    Urban               :   Whether that person belongs to urban area or not

## EDA Phase

In [2]:
rawData = pd.read_csv('Fraud_Check.csv')
rawData

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
...,...,...,...,...,...,...
595,YES,Divorced,76340,39492,7,YES
596,YES,Divorced,69967,55369,2,YES
597,NO,Divorced,47334,154058,0,YES
598,YES,Married,98592,180083,17,NO


In [3]:
rawData.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Undergrad        600 non-null    object
 1   Marital.Status   600 non-null    object
 2   Taxable.Income   600 non-null    int64 
 3   City.Population  600 non-null    int64 
 4   Work.Experience  600 non-null    int64 
 5   Urban            600 non-null    object
dtypes: int64(3), object(3)
memory usage: 28.2+ KB


In [4]:
X = rawData.drop('Taxable.Income',axis=1).copy(deep=True)
X

Unnamed: 0,Undergrad,Marital.Status,City.Population,Work.Experience,Urban
0,NO,Single,50047,10,YES
1,YES,Divorced,134075,18,YES
2,NO,Married,160205,30,YES
3,YES,Single,193264,15,YES
4,NO,Married,27533,28,NO
...,...,...,...,...,...
595,YES,Divorced,39492,7,YES
596,YES,Divorced,55369,2,YES
597,NO,Divorced,154058,0,YES
598,YES,Married,180083,17,NO


In [5]:
Y = rawData['Taxable.Income']
Y

0      68833
1      33700
2      36925
3      50190
4      81002
       ...  
595    76340
596    69967
597    47334
598    98592
599    96519
Name: Taxable.Income, Length: 600, dtype: int64

In [6]:
Y = Y.apply(lambda Y:'Risky' if Y <= 30000 else 'Good')
Y

0      Good
1      Good
2      Good
3      Good
4      Good
       ... 
595    Good
596    Good
597    Good
598    Good
599    Good
Name: Taxable.Income, Length: 600, dtype: object

In [7]:
Y.value_counts()

Good     476
Risky    124
Name: Taxable.Income, dtype: int64

In [8]:
for i in range(len(X.columns)):
#     print(X[X.columns[i]])
    if X[X.columns[i]].dtypes == 'object':
        print(X[X.columns[i]].value_counts())
    else:
        pass

YES    312
NO     288
Name: Undergrad, dtype: int64
Single      217
Married     194
Divorced    189
Name: Marital.Status, dtype: int64
YES    302
NO     298
Name: Urban, dtype: int64


the above code can be directly written as
for i in X.columns:
    if X[i].dtypes == object:
        print(X[i].value_counts())

In [9]:
X = pd.get_dummies(X)
X.head()

Unnamed: 0,City.Population,Work.Experience,Undergrad_NO,Undergrad_YES,Marital.Status_Divorced,Marital.Status_Married,Marital.Status_Single,Urban_NO,Urban_YES
0,50047,10,1,0,0,0,1,0,1
1,134075,18,0,1,1,0,0,0,1
2,160205,30,1,0,0,1,0,0,1
3,193264,15,0,1,0,0,1,0,1
4,27533,28,1,0,0,1,0,1,0


In [10]:
X2 = X.drop(['Undergrad_NO', 'Marital.Status_Single', 'Urban_NO'],axis=1)
X2

Unnamed: 0,City.Population,Work.Experience,Undergrad_YES,Marital.Status_Divorced,Marital.Status_Married,Urban_YES
0,50047,10,0,0,0,1
1,134075,18,1,1,0,1
2,160205,30,0,0,1,1
3,193264,15,1,0,0,1
4,27533,28,0,0,1,0
...,...,...,...,...,...,...
595,39492,7,1,1,0,1
596,55369,2,1,1,0,1
597,154058,0,0,1,0,1
598,180083,17,1,0,1,0


## Feature Engineering

### PPSCORE

In [11]:
import ppscore as pps

In [12]:
pps.matrix(rawData,sorted=True)

Unnamed: 0,x,y,ppscore,case,is_valid_score,metric,baseline_score,model_score,model
0,Undergrad,Undergrad,1.0,predict_itself,True,,0.0,1.0,
1,Marital.Status,Marital.Status,1.0,predict_itself,True,,0.0,1.0,
2,Taxable.Income,Taxable.Income,1.0,predict_itself,True,,0.0,1.0,
3,City.Population,City.Population,1.0,predict_itself,True,,0.0,1.0,
4,Work.Experience,Work.Experience,1.0,predict_itself,True,,0.0,1.0,
5,Urban,Urban,1.0,predict_itself,True,,0.0,1.0,
6,Taxable.Income,Urban,0.069812,classification,True,weighted F1,0.493333,0.528705,DecisionTreeClassifier()
7,Work.Experience,Urban,0.054289,classification,True,weighted F1,0.493333,0.52084,DecisionTreeClassifier()
8,Taxable.Income,Marital.Status,0.03429,classification,True,weighted F1,0.333333,0.356193,DecisionTreeClassifier()
9,Marital.Status,Taxable.Income,0.007396,regression,True,mean absolute error,22771.961667,22603.545807,DecisionTreeRegressor()


In [13]:
pps.matrix(rawData,sorted=True)[pps.matrix(rawData,sorted=True).y == 'Taxable.Income']

Unnamed: 0,x,y,ppscore,case,is_valid_score,metric,baseline_score,model_score,model
2,Taxable.Income,Taxable.Income,1.0,predict_itself,True,,0.0,1.0,
9,Marital.Status,Taxable.Income,0.007396,regression,True,mean absolute error,22771.961667,22603.545807,DecisionTreeRegressor()
13,Urban,Taxable.Income,0.000138,regression,True,mean absolute error,22771.961667,22768.829023,DecisionTreeRegressor()
15,Undergrad,Taxable.Income,0.0,regression,True,mean absolute error,22771.961667,22779.973615,DecisionTreeRegressor()
26,City.Population,Taxable.Income,0.0,regression,True,mean absolute error,22771.961667,29853.625,DecisionTreeRegressor()
31,Work.Experience,Taxable.Income,0.0,regression,True,mean absolute error,22771.961667,24035.718747,DecisionTreeRegressor()


**Based on PPS scores. There are no good features that independently predict the Taxable income in the given dataset. let us check how do they perform using RFE technique.**

## Feature Engineering
### RecursiveFeatureElemination(RFE)

In [14]:
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier

In [15]:
DTC = DecisionTreeClassifier()
rfe = RFE(DTC,n_features_to_select=1,verbose=1)
fit = rfe.fit(X2,Y)
fit.support_

Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.
Fitting estimator with 2 features.


array([ True, False, False, False, False, False])

In [16]:
fit.ranking_,X2.columns

(array([1, 2, 3, 6, 5, 4]),
 Index(['City.Population', 'Work.Experience', 'Undergrad_YES',
        'Marital.Status_Divorced', 'Marital.Status_Married', 'Urban_YES'],
       dtype='object'))

In [17]:
confusion_matrix(Y,fit.predict(X2))

array([[476,   0],
       [  1, 123]], dtype=int64)

In [18]:
DTC = DecisionTreeClassifier()
rfe = RFE(DTC,n_features_to_select=2,verbose=3)
fit = rfe.fit(X2,Y)
fit.ranking_

Fitting estimator with 6 features.
Fitting estimator with 5 features.
Fitting estimator with 4 features.
Fitting estimator with 3 features.


array([1, 1, 4, 5, 2, 3])

In [19]:
confusion_matrix(Y,fit.predict(X2)) # this accuracy is not reliable as all the decision trees are built to pure region.

array([[476,   0],
       [  0, 124]], dtype=int64)

**The Recursive Elemination process suggests that 'City.Population' and 'Work.Experience' together are good to predict y variable. let us prceed to build our Random forest with cross validation.**

### TrainTestSplit for model Training and Validation

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
X_train,X_test,Y_train,Y_test = train_test_split(X2,Y,test_size=.5,random_state=42)
print(X_train.shape,X_test.shape)
print(Y_train.shape,Y_test.shape)
Y_train.value_counts(),Y_test.value_counts()

(300, 6) (300, 6)
(300,) (300,)


(Good     242
 Risky     58
 Name: Taxable.Income, dtype: int64,
 Good     234
 Risky     66
 Name: Taxable.Income, dtype: int64)

# Model Building / Validation
## Random Forest Classifier

In [22]:
kfold = KFold(n_splits=10)
RFT = RandomForestClassifier(random_state=0)
result = cross_val_score(RFT,X_train,Y_train,cv=kfold)

In [23]:
result.mean(),result.std()

(0.76, 0.085374989832438)

In [24]:
RFT.fit(X_train,Y_train)
RFT.score(X_train,Y_train)

1.0

In [25]:
confusion_matrix(Y_train,RFT.predict(X_train))

array([[242,   0],
       [  0,  58]], dtype=int64)

In [26]:
print(classification_report(Y_train,RFT.predict(X_train)))

              precision    recall  f1-score   support

        Good       1.00      1.00      1.00       242
       Risky       1.00      1.00      1.00        58

    accuracy                           1.00       300
   macro avg       1.00      1.00      1.00       300
weighted avg       1.00      1.00      1.00       300



In [27]:
RFT.predict(X_test)

array(['Good', 'Good', 'Good', 'Risky', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Risky', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Risky', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Risky', 'Good', 'Good', 'Good', 'Risky', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Risky', 'Good', 'Good', 'Good', 'Good',
       'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good', 'Good',


In [28]:
RFT.score(X_test,Y_test)

0.7466666666666667

In [29]:
confusion_matrix(Y_test,RFT.predict(X_test))

array([[222,  12],
       [ 64,   2]], dtype=int64)

In [30]:
print(classification_report(Y_test,RFT.predict(X_test)))

              precision    recall  f1-score   support

        Good       0.78      0.95      0.85       234
       Risky       0.14      0.03      0.05        66

    accuracy                           0.75       300
   macro avg       0.46      0.49      0.45       300
weighted avg       0.64      0.75      0.68       300



**There seems to be a problem of overfitting**

**As expected the model does not fare well in identifying the risky data. eventhough the overall accuracy is ok at 75%, the precision for 'risky' category is as low as 14%, further specificity is even low @ 3%. As stated in the PPS score that there are no good features to predict the Y variable, our model suffers in performance.**

**May be we need to look at the source of the data and rework on the selecting and collecting/sourcing right features for prediction.**