In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')


## Reading Data

In [2]:
creditcard = pd.read_csv('creditcard_sampledata_2.csv')

In [3]:
creditcard.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7300 entries, 0 to 7299
Data columns (total 31 columns):
Unnamed: 0    7300 non-null int64
V1            7300 non-null float64
V2            7300 non-null float64
V3            7300 non-null float64
V4            7300 non-null float64
V5            7300 non-null float64
V6            7300 non-null float64
V7            7300 non-null float64
V8            7300 non-null float64
V9            7300 non-null float64
V10           7300 non-null float64
V11           7300 non-null float64
V12           7300 non-null float64
V13           7300 non-null float64
V14           7300 non-null float64
V15           7300 non-null float64
V16           7300 non-null float64
V17           7300 non-null float64
V18           7300 non-null float64
V19           7300 non-null float64
V20           7300 non-null float64
V21           7300 non-null float64
V22           7300 non-null float64
V23           7300 non-null float64
V24           7300 non-null flo

## preparing the Data into X and y

In [4]:
def prep_data(dataframe):
    y = dataframe.Class.values
    X = dataframe.drop(['Unnamed: 0' , 'V1', 'Class'] , axis = 1).values
    return X,y

In [5]:
X,y = prep_data(creditcard)

### Checking Baseline Accuracy

In [6]:
total_len =len(y)

In [7]:
non_fraud_count = len([i for i in y if i==0])

In [8]:
base_percent=non_fraud_count/total_len*100

In [9]:
base_percent

95.8904109589041

we need to perform better than this

## Creating the BaseLine Model

In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [11]:
model = RandomForestClassifier(random_state = 5)

In [12]:
X_train,X_test,y_train,y_test = train_test_split(X,y, test_size = 0.3, random_state = 0)

In [13]:
model.fit(X_train,y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=5, verbose=0, warm_start=False)

In [14]:
y_pred = model.predict(X_test)

## Evaluating Performance of BaseLine Model

In [15]:
from sklearn.metrics import confusion_matrix, classification_report , roc_auc_score, accuracy_score

In [16]:
confusion_matrix(y_test,y_pred)

array([[2096,    3],
       [  18,   73]])

In [17]:
print(classification_report(y_pred, y_test))

              precision    recall  f1-score   support

           0       1.00      0.99      1.00      2114
           1       0.80      0.96      0.87        76

   micro avg       0.99      0.99      0.99      2190
   macro avg       0.90      0.98      0.93      2190
weighted avg       0.99      0.99      0.99      2190



In [18]:
roc_auc_score(y_test, model.predict_proba(X_test)[:,1])

0.9338879319822626

In [19]:
accuracy_score(y_test,y_pred)


0.9904109589041096

## Adjusting Parameters of Model

adjusting Wieght to deal with Heavily Imbalanced Data

class_weight=
**{balanced mode:** Uses value of `y` to automatically adjust wieghts inversely proportional to class frequencies in input data,

**balanced_subsample:** same as balanced except weigths are calculated `again` at each iteration of a growing tree in a random forest}

Number of trees: `n_estimators` <br>
number of Features considered for splitting at each leaf node : `max_features` <br>
shape of the Tree: leaf_size<br>
length of tree: max_depth


### Hyper Parameter Tuning using GridSearchCV 
example:
<br>
from sklearn.model_selection import `GridSearchCV`

param_grid = {<br>
                'max_depth':[90,90,100]<br>
                'max_features':[2,3]<br>
                'n_estimators':[100,200]<br>
                }
                
                
                
model = `RandomForestRegressor()` <br>
grid_search_model = GridSearchCV(estimator = model, param_grid = `param_grid1` , cv = 5 , n_jobs = -1 , scoring = 'f1')

**Fitting the Grid object on data**
grid_search_model.fit(X_train, y_train) <br>
grid_search_model.best_params_ <br>
grid_search_model.best_estimator_<br>
grid_search_model.best_score_<br>

### Let's Create Model2 with class_weight = 'balanced_subsample' option

In [20]:
model2 = RandomForestClassifier(class_weight='balanced_subsample', random_state = 5)
model2.fit(X_train,y_train)
y_pred=model2.predict(X_test)

In [21]:
confusion_matrix(y_test,y_pred)

array([[2098,    1],
       [  19,   72]])

In [22]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2099
           1       0.99      0.79      0.88        91

   micro avg       0.99      0.99      0.99      2190
   macro avg       0.99      0.90      0.94      2190
weighted avg       0.99      0.99      0.99      2190



In [23]:
roc_auc_score(y_test, model.predict_proba(X_test)[:,1])

0.9338879319822626

In [24]:
accuracy_score(y_test,y_pred)

0.9908675799086758

## Let's Write a function to Print Model Score

In [68]:
def get_score(model):
    model.fit(X_train,y_train)
    y_pred=model.predict(X_test)
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print(roc_auc_score(y_test,model.predict_proba(X_test)[:,1]))

### Let's create One more Model by adjusting other parameters as well

In [69]:
model3  = RandomForestClassifier(bootstrap = True , class_weight = {0:1 , 1:12 } , criterion = 'entropy' , 
                                #depth of Model
                                max_depth = 10,
                                #minimum number of samples at a leaf
                                min_samples_leaf=10 ,  
                                #number of trees 
                                n_estimators=20 , n_jobs = -1 , random_state = 5)

In [70]:
get_score(model3)

[[2097    2]
 [  14   77]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2099
           1       0.97      0.85      0.91        91

   micro avg       0.99      0.99      0.99      2190
   macro avg       0.98      0.92      0.95      2190
weighted avg       0.99      0.99      0.99      2190

0.9609651901219315


In [71]:
accuracy_score(y_test,y_pred)

0.9908675799086758

As we can see the ModelScore has improved.
<br>Now let's Use GridSearchCV to find optimal Parameters and create final Classifier 

### Implimenting GridSearchCV 

In [72]:
from sklearn.model_selection import GridSearchCV

In [73]:
params = { 'n_estimators':[1,30],
          'max_features': ['auto', 'log2'] , 
          'max_depth' : [4,8] , 
          'criterion' : ['gini', 'entropy']
            }

sample_model = RandomForestClassifier(random_state = 5)

In [74]:
cv_model = GridSearchCV(estimator= sample_model, param_grid=params,cv = 5 , scoring='recall' , n_jobs = -1)

In [75]:
cv_model.fit(X_train,y_train)

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators='warn', n_jobs=None,
            oob_score=False, random_state=5, verbose=0, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'n_estimators': [1, 30], 'max_features': ['auto', 'log2'], 'max_depth': [4, 8], 'criterion': ['gini', 'entropy']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='recall', verbose=0)

In [76]:
cv_model.best_params_

{'criterion': 'gini',
 'max_depth': 8,
 'max_features': 'log2',
 'n_estimators': 30}

### Let's Create the Final Model using parameter's suggested by GridSearchCV

In [77]:
model_4 = RandomForestClassifier(random_state = 5 , criterion='gini', max_depth = 8 ,max_features = 'log2' , n_estimators =30 ,
                                 class_weight = {0:1 , 1 :12} , n_jobs = -1, min_samples_leaf =10
                                )

In [78]:
get_score(model_4)

[[2095    4]
 [  15   76]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2099
           1       0.95      0.84      0.89        91

   micro avg       0.99      0.99      0.99      2190
   macro avg       0.97      0.92      0.94      2190
weighted avg       0.99      0.99      0.99      2190

0.9749697658225529


In [79]:
accuracy_score(y_test,y_pred)

0.9908675799086758

## Let's Proceed with Using Ensemble Method to create ML Classifier

#### Ensemble Methods Combine the result of Mutiple Models to predict the result
Infact Random Forest is also an ensemble of Decision tree and thus known as Bootstrap Aggrgation or Bagging Ensemble 

`Bagging Ensemble` : Models are Trained on Random subset of Data, and aggregate result by taking average prediction of each model. <br>
`Stacking Ensemble` : Models are trained on Full set of Data,results are predicted by a Voting rule

Ensemble Methods are:
    - Robust
    - Avoid Overfitting
    - Improve prediction Performance

Tutorial: <br>

from sklearn.ensemble import VotingClassifier <br>
clf1 = LogisticRegression(random_state = 5) <br>
clf2 = RandomForestClassifier(random_state = 5 ) <br>
clf3 = GaussianNB() <br>

ens_model = VotingClassifier(estimators= [('lr' , clf1) , ('rf' , clf2) , ('nb' , clf3)] , voting = 'hard') <br>
ens_model.fit(X_train, y_train) <br>
ens_model.predict(X_test)
    

### Voting Rule Parameter 
`Hard` : Uses predicted class Labels and takes **Majority Votes**. <br>
`Soft` : uses **Average probability** by combining probablities of individual models.

We can Also assign weights to model prediction when we know that one of the models is outperforming. <br>
`ens_model` = VotingClassifier( estimators = [('lr' , clf1) ,( 'rf', clf2),( 'nb', clf3) ], voting = 'hard' , weights = [1,2,1] ]

## Let's impelment this on our Dataset

In [80]:
from sklearn.linear_model import LogisticRegression
lr_model = LogisticRegression(random_state = 5 , class_weight = {0:1 , 1 :15})
get_score(lr_model)

[[2052   47]
 [  11   80]]
              precision    recall  f1-score   support

           0       0.99      0.98      0.99      2099
           1       0.63      0.88      0.73        91

   micro avg       0.97      0.97      0.97      2190
   macro avg       0.81      0.93      0.86      2190
weighted avg       0.98      0.97      0.98      2190

0.9722054981702433


In [81]:
accuracy_score(y_test,y_pred)

0.9908675799086758

### Ensembling by stacking

In [82]:
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier

clf1 = LogisticRegression(random_state = 5 , class_weight = {0:1 , 1 :15})
clf2 = RandomForestClassifier( random_state = 5 , criterion='gini', max_depth = 8 ,max_features = 'log2' , n_estimators =30 ,
                                 class_weight = {0:1 , 1 :12} , n_jobs = -1, min_samples_leaf =10   )
clf3 = DecisionTreeClassifier(random_state = 5 , class_weight = 'balanced')


In [83]:
ensemble_model = VotingClassifier(estimators = [('lr' , clf1) , ('rf' , clf2), ('dt' , clf3)], voting ='hard')

In [84]:
def GetScore_hard(ens_model):
    ens_model.fit(X_train,y_train)
    y_pred = ens_model.predict(X_test)
    print(confusion_matrix(y_test,y_pred))
    print(classification_report(y_test,y_pred))
    print(accuracy_score(y_test,y_pred))

In [86]:
GetScore_hard(ensemble_model)

[[2090    9]
 [  13   78]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      2099
           1       0.90      0.86      0.88        91

   micro avg       0.99      0.99      0.99      2190
   macro avg       0.95      0.93      0.94      2190
weighted avg       0.99      0.99      0.99      2190

0.9899543378995433


## Let's add Weights to Ensemble Model

In [87]:
ensemble_model_weighed = VotingClassifier(estimators  = [('lr', clf1) , ('rf' , clf2) , ('dt', clf3)], weights = [1,4,1], voting = 'hard',  flatten_transform = True)

In [90]:
GetScore_hard(ensemble_model_weighed)

[[2095    4]
 [  15   76]]
              precision    recall  f1-score   support

           0       0.99      1.00      1.00      2099
           1       0.95      0.84      0.89        91

   micro avg       0.99      0.99      0.99      2190
   macro avg       0.97      0.92      0.94      2190
weighted avg       0.99      0.99      0.99      2190

0.991324200913242
