# Predict Credit Card Fraud using SVM, KNN, and Naive Bayes



Credit card fraud happens when someone — a fraudster or a thief — uses your stolen credit card or the information from that card to make unauthorized purchases in your name or take out cash advances using your account.

### Problem Statement:

Credit card companies such as **Citibank**, **HSBC**, and **American Express** need to recognize fraudulent credit card transactions so that customers are not charged for items that they did not purchase.

### Aim:

In this demo, you have to build a classification model to identify fraudulent credit card transactions

### Dataset Description
The datasets contains transactions made by credit cards in September 2013 by european cardholders. 

Presents transactions that occurred in two days, where we have **492** frauds out of **284,807** transactions. 

- **Time** - Number of seconds elapsed between this transaction and the first transaction in the dataset
- **V1-V28** - Encrpted attributes (or columns) to protect user identities and sensitive features (v1-v28)
- **Amount** - Transaction Amount
- **Class** - **1** for fraudulent transactions, **0** otherwise

In [None]:
# After Insalling Pandas Profiling, restart the kernel and run from the top excluding the installation cell
#It's better to install all th libraries at the top

# !pip install pandas-profiling==2.7.1 
# !pip install pycaret
# !pip install shap

### Import the required libraries and load the data


In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
# from pandas_profiling import ProfileReport
sns.set()

import warnings
warnings.filterwarnings("ignore")    
import os

[**Click Here!**](https://www.dropbox.com/s/oey6vxdcc4fiemv/creditcard.csv?dl=0) to download the data set

In [2]:
!wget https://www.dropbox.com/s/oey6vxdcc4fiemv/creditcard.csv

--2023-02-26 02:12:52--  https://www.dropbox.com/s/oey6vxdcc4fiemv/creditcard.csv
Resolving www.dropbox.com (www.dropbox.com)... 162.125.1.18, 2620:100:6031:18::a27d:5112
Connecting to www.dropbox.com (www.dropbox.com)|162.125.1.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /s/raw/oey6vxdcc4fiemv/creditcard.csv [following]
--2023-02-26 02:12:53--  https://www.dropbox.com/s/raw/oey6vxdcc4fiemv/creditcard.csv
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://ucbfcd0e0f940763e5163782da13.dl.dropboxusercontent.com/cd/0/inline/B3NfvQ2lqwJPQYOX_jeKnHwCHUcynaT0BniL_bAVgNTPrUw7usMvvuUV8MI8VuAqobc3unskMpJJyfA6drTCzZ_lQlUnTnKaGZh8JQjxOalZaglBAoKRKOlNHGhcsjpO1Lef5ikdyah-5JzAqZGFGc6Kk9scaAAYFWiRPHWQICMhcA/file# [following]
--2023-02-26 02:12:53--  https://ucbfcd0e0f940763e5163782da13.dl.dropboxusercontent.com/cd/0/inline/B3NfvQ2lqwJPQYOX_jeKnHwCHUcynaT0BniL_bAVgNTPrUw7usMvvuUV8MI8VuAqobc3unskM

In [3]:
df = pd.read_csv('/content/creditcard.csv')
print(df.shape)
df.head()

(284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [None]:
df.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.01448,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.05508,2.03503,-0.738589,0.868229,1.058415,0.02433,0.294869,0.5848,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.24964,-0.557828,2.630515,3.03126,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.24044,0.530483,0.70251,0.689799,-0.377961,0.623708,-0.68618,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.0,0
284806,172792.0,-0.533413,-0.189733,0.703337,-0.506271,-0.012546,-0.649617,1.577006,-0.41465,0.48618,...,0.261057,0.643078,0.376777,0.008797,-0.473649,-0.818267,-0.002415,0.013649,217.0,0


In [None]:
df.Amount.min(), df.Amount.max()

(0.0, 25691.16)

In [None]:
df.Class.value_counts()

0    284315
1       492
Name: Class, dtype: int64

### Exploratory Data Analysis

#### Pandas Profiling

In [None]:
import pandas_profiling
from pandas_profiling import ProfileReport
prof = ProfileReport(df)
prof.to_file(output_file='output.html') #Generating a Data Report

# This will take a few minutes to run

Please refer to the HTML file created by the name of **output.html**

In [None]:
ProfileReport(df).to_notebook_iframe()

___
**Observations:**

- There are **31** variables or features in the dataframe and the total number of instances or rows are **2,84,807**
- We have **30** Numeric and **1** Boolean variable
- There are no missing cells in the dataset which is a big relief
- There are **1081** duplicate rows in the data set which accounts for **0.4%** of the entire data set
___

**Note:** Answer the following questions:

- What columns seems to have **outliers** based on **min**, **max** and **percentile values**, **IQR range** along with the **standard deviation** and **mean absolute deviation**?
- What columns have missing values? (Check the **Missing Values** section in **Pandas Profiling**)
- What columns have high amount of zeros/NaN

- What columns have **high variance** and **standard deviation**?
- Comment on the distribution of the continuous values **(Real Number: ℝ≥0)**
- Do you see any alarming trends in the extreme values (minimum 5 and maximum 5)?
- How many booleans columns are there in the data set and out of those how many are imbalanced?
- Check for **duplicate records** across all columns (**Check Warning Section**)

- How many columns are categorical?
  - Are those categories in sync with the domain categories?
  - Check if all the categories are unique and they represent distinct information
  - Is there any imbalance in the categorical columns?

Based on the above questions and your observations, chart out a plan for **Data Pre-processing** and feature engineering

**Note:** Feature Engineering (Feature Selection and Feature Creation)

- From the **Interaction Tab**, write at least 3 observations that may be very crucial for prediction. Make sure that they are in story format

**For Example:** Av monthly hours vs Satisfaction Level..

- Check **Pearson** and **Spearman** tab in the **correlation** section and note down the columns which are highly correlated (Postive and Negative Correlation). Create two bands of thresholds. (Consider 60 (0.6) to 80 (0.8) or 80 to 100 as high) 


#### Univariate Distribution
A **Univariate distribution** is a probability distribution of only one random variable

**Note:** You have already seen this in Pandas Profiling. Still, if you want to write the code, you can do so.

What is the distribution of the **Time** & **Amount** columns in the data set?

In [None]:
fig = px.histogram(df, x = 'Time')
fig.show()

In [None]:
fig = px.histogram(df, x = 'Amount')
fig.show()

In [None]:
round((df.Class.value_counts()/df.shape[0]),5)*100

In [None]:
df.Class.value_counts()

In [None]:
%matplotlib inline

# plt.figure(figsize=(12,8))
# ax = sns.countplot(df["Class"], color='green')
# for p in ax.patches:
#     x = p.get_bbox().get_points()[:,0]
    
#     y = p.get_bbox().get_points()[1,1]
    
#     ax.annotate('{:.2g}%'.format(100.*y/len(df)), (x.mean(), y), ha='center', va='bottom')
# plt.show()

___
**Observations:**

The data set is **Highly Unbalanced** with only **0.17%** of transactions being classified as **Fraudulent**. 

Several ways to approach this Imbalance Classification problem:

- **Acquire More Data** (Not Possible in our case)
- **Changing the performance metric:**
 - Use the **Confusion Matrix**
 - **F1-Score** (Weighted Average of **Precision** & **Recall**)
 - **ROC Curves**

- **Re-sampling the dataset:** Essentially this is a method that will process the data to have an approximate 50-50 ratio.

 - **Over-sampling**, which is adding copies of the under-represented class (better when you have little data)

 - **Under-sampling**, which deletes instances from the over-represented class (better when he have lot's of data)

**NOTE:** We will use the 2nd Method first and then, use **SMOTE** while using the 3rd approach
___

### Data pre-processing and splitting

**RobustScaler:** Unlike the previous scalers, the 
centering and scaling statistics of RobustScaler is based on percentiles and are therefore not influenced by a few number of very large marginal outliers. Consequently, the resulting range of the transformed feature values is larger than for the previous scalers and, more importantly, are approximately similar: for both features most of the transformed values lie in a [-2, 3] range

In [None]:
print(type(df['Amount']))
print(type(df['Amount'].values))
print(df['Amount'].values.shape)
print(df['Amount'].values.reshape(-1, 1).shape)

<class 'pandas.core.series.Series'>
<class 'numpy.ndarray'>
(284807,)
(284807, 1)


In [4]:
from sklearn.preprocessing import RobustScaler
rs = RobustScaler()

#Fit_Transform the scaled_amount and scaled_time columns in the data set and dropping the Original Time and Amount Column from the data set

df['scaled_amount'] = rs.fit_transform(df['Amount'].values.reshape(-1,1))
df['scaled_time'] = rs.fit_transform(df['Time'].values.reshape(-1,1))
df.drop(['Time', 'Amount'], axis=1, inplace=True) #Dropping the Original Time and Amount Column from the data set

In [None]:
df.columns

Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
       'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Class',
       'scaled_amount', 'scaled_time'],
      dtype='object')

In [5]:
df = df[['scaled_amount', 'scaled_time','V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
       'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Class']]
df.head()

Unnamed: 0,scaled_amount,scaled_time,V1,V2,V3,V4,V5,V6,V7,V8,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Class
0,1.783274,-0.994983,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,...,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,0
1,-0.269825,-0.994983,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,...,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,0
2,4.983721,-0.994972,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,...,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,0
3,1.418291,-0.994972,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,...,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,0
4,0.670579,-0.99496,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,...,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,0


In [None]:
# from sklearn.model_selection import train_test_split as tts
# x = np.array(df.iloc[:, df.columns != 'Class']) #Predictors 
# y = np.array(df.iloc[:, df.columns == 'Class']) #Target Column
# x_train, x_test, y_train, y_test = tts(x, y, test_size=0.2, random_state=0)

In [6]:
# x = np.array(df[['scaled_amount', 'scaled_time','V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
#        'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
#        'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28']])
from sklearn.model_selection import train_test_split
X = df.drop('Class', axis=1)
y = df.Class
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [7]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape, X.shape, y.shape

((227845, 30), (56962, 30), (227845,), (56962,), (284807, 30), (284807,))

### SVM, KNN, Naive Bayes

#### SVM


In [None]:
from sklearn import svm
# Create and train a Linear SVM model
msvm = svm.LinearSVC(random_state=15)
msvm.fit(X_train, y_train)
y_pred = msvm.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[56850,    11],
       [   26,    75]])

In [None]:
from sklearn.metrics import  accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
# Metrics
svm_roc=roc_auc_score(y_test, y_pred)
svm_acc = accuracy_score(y_test, y_pred)
svm_prec = precision_score(y_test, y_pred)
svm_rec = recall_score(y_test, y_pred)
svm_f1 = f1_score(y_test, y_pred)
print('roc:',svm_roc)
print('accuracy:',svm_acc)
print('precision:',svm_prec)
print('recall:',svm_rec)
print('f1:',svm_f1)

roc: 0.8711904016064186
accuracy: 0.9993504441557529
precision: 0.872093023255814
recall: 0.7425742574257426
f1: 0.8021390374331551


#### KNN

In [None]:
df.head()

In [None]:
from sklearn.neighbors import KNeighborsClassifier
mknn=KNeighborsClassifier(n_neighbors=3,metric='euclidean')  # here k=3
mknn.fit(X_train,y_train)
y_pred2=mknn.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,y_pred2))

[[56854     7]
 [   19    82]]


In [None]:
from sklearn.metrics import  accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, classification_report

knn_roc=roc_auc_score(y_test, y_pred2)
knn_acc = accuracy_score(y_test, y_pred2)
knn_prec = precision_score(y_test, y_pred2)
knn_rec = recall_score(y_test, y_pred2)
knn_f1 = f1_score(y_test, y_pred2)
print('roc:',knn_roc)
print('accuracy:',knn_acc)
print('precision:',knn_prec)
print('recall:',knn_rec)
print('f1:',knn_f1)

roc: 0.9058790404462089
accuracy: 0.9995435553526912
precision: 0.9213483146067416
recall: 0.8118811881188119
f1: 0.8631578947368421


In [None]:
print(classification_report(y_test,y_pred2))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56861
           1       0.92      0.81      0.86       101

    accuracy                           1.00     56962
   macro avg       0.96      0.91      0.93     56962
weighted avg       1.00      1.00      1.00     56962



- **macro avg**
 - This function computes f1 for each label, and returns the average without considering the proportion for each label in the dataset
 
- **weighted avg**
 - This function computes f1 for each label, and returns the average considering the proportion for each label in the dataset

**How to choose the Value of K in KNN?**

In [None]:
for i in range(1,12,2):
  print(i)

In [None]:
#This cell will take around 30 to 45 mins to execute

error_rate = []
for i in range(1,12,2):
 knn = KNeighborsClassifier(n_neighbors=i)
 knn.fit(X_train,y_train)
 pred_i = knn.predict(X_test)
 error_rate.append(np.mean(pred_i != y_test))

plt.figure(figsize=(10,6))
plt.plot(range(1,40),error_rate)

plt.title('Error Rate vs. K Value')
plt.xlabel('K')
plt.ylabel('Error Rate')

print("Minimum error:-",min(error_rate),"at K =",error_rate.index(min(error_rate)))

**Re-train the KNN Model with the optimal value of K**

In [None]:
# from above using k=7
model=KNeighborsClassifier(n_neighbors=7,metric='euclidean')  # here k=7
model.fit(X_train,y_train)

y_pred2=model.predict(X_test)

print(classification_report(y_test,y_pred2))

#### Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB

#Create and Train a Gaussian Classifier
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred3 = gnb.predict(X_test)

In [None]:
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,y_pred3))

[[55636  1225]
 [   15    86]]


In [None]:
from sklearn.metrics import  accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

roc=roc_auc_score(y_test, y_pred3)
acc = accuracy_score(y_test, y_pred3)
prec = precision_score(y_test, y_pred3)
rec = recall_score(y_test, y_pred3)
f1 = f1_score(y_test, y_pred3)
print('roc:',roc)
print('accuracy:',acc)
print('precision:',prec)
print('recall:',rec)
print('f1:',f1)

roc: 0.9149706919479341
accuracy: 0.9782311014360451
precision: 0.06559877955758962
recall: 0.8514851485148515
f1: 0.12181303116147309


### Model Evaluation

- **Precision:**
  - What percebtage of positive predictions made were correct? This is **Precision**
  - No. of True Positives divided by the no. of True Positives plus the No. of False Positives
 
- **Recall:** Ratio of True Positives to all the positives in your Dataset

- **When to use Precision & Recall:** 
 - In the credit card fraud detection task, lets say we modify the model slightly, and identify a single transaction correctly as fraud. 

 - Now, our precision will be 1.0 (no false positives) but our recall will be very low because we will still have many false negatives. 

 - If we go to the other extreme and classify all transactions as fraud, we will have a recall of 1.0 — we’ll catch every fraud transaction — but our precision will be very low and we’ll misclassify many legit transactions. In other words, as we increase precision we decrease recall and vice-versa.

- **F1-Score:**
 F1 Score is the weighted average of Precision and Recall. F1 is usually more useful than accuracy, especially when we have an uneven class distribution

 - **When to use F1-Score:** 
   - Useful when you have data with imbalance classes
   - Let us say, we have a model with a precision of 1, and recall of 0 which gives a simple average as 0.5 and an F1 score of 0
   - If one of the parameters is low, the second one no longer matters in the F1 score 
   - The F1 score favors classifiers that have similar precision and recall
   - F1 score is a better measure to use if you are seeking a balance between Precision and Recall

- **roc_auc_score**
 - roc_auc_score always runs from 0 to 1, and is sorting predictive possibilities. 0.5 is the baseline for random guessing
 - This metric shows how good at ranking predictions our model is
  
   When to/ not to use it?
    - Should not use it when your data is heavily imbalanced
    - Should use it when you care equally about positive and negative classes



In [None]:
results = pd.DataFrame([['Naive Bayes Classifier', acc,prec,rec,f1,roc],
                        ['Support Vector Machine', svm_acc, svm_prec, svm_rec, svm_f1, svm_roc],
                        ['K- Nearest Neigbor', knn_acc, knn_prec, knn_rec, knn_f1, knn_roc]
                        ],
               columns = ['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score','ROC'])
results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC
0,Naive Bayes Classifier,0.978231,0.065599,0.851485,0.121813,0.914971
1,Support Vector Machine,0.99935,0.872093,0.742574,0.802139,0.87119
2,K- Nearest Neigbor,0.999544,0.921348,0.811881,0.863158,0.905879


observation:



### Model Optimization: GridSearchCV

**SVM Hyperparameters:**

- **Gamma**
 - Used with non-linear SVM. Commonly used non-linear kernel is the Radial Basis Function (RBF)
 - Gamma parameter of RBF controls the distance of the influence of a single training point
 - Low values of gamma indicate a large similarity radius which results in more points being grouped together
 - For high values of gamma, the points need to be very close to each other in order to be considered in the same group (or class)
 - Models with very large gamma values tend to overfit.
 
- **C**
 - Adds a penalty for each misclassified data point
 - If c is small, the penalty for misclassified points is low so a decision boundary with a large margin is chosen at the expense of a greater number of misclassifications
 - If c is large, SVM tries to minimize the number of misclassified examples due to high penalty which results in a decision boundary with a smaller margin
 - Penalty is not same for all misclassified examples
 - It is directly proportional to the distance to decision boundary

**NOTE:** If you want to learn more about SVM Hyper-parameters, [**Click Here!**](http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html)

In [None]:
#GridSearchCV
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# defining parameter range 
# param_grid = {'C': [0.1, 1, 10, 100, 1000], 
# 			'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 
# 			'kernel': ['rbf']} 
param_grid = {'C': [0.1, 1, 10], 
			'gamma': ['scale', 'auto'], 
			'kernel': ['rbf']} 

svc_grid = GridSearchCV(SVC(), param_grid, refit = True, verbose = 3, cv=2) 

svc_grid.fit(X_train, y_train) 


Fitting 2 folds for each of 6 candidates, totalling 12 fits
[CV 1/2] END ....C=0.1, gamma=scale, kernel=rbf;, score=0.999 total time=  40.8s
[CV 2/2] END ....C=0.1, gamma=scale, kernel=rbf;, score=0.999 total time=  45.0s
[CV 1/2] END .....C=0.1, gamma=auto, kernel=rbf;, score=0.998 total time= 1.2min
[CV 2/2] END .....C=0.1, gamma=auto, kernel=rbf;, score=0.998 total time= 4.1min
[CV 1/2] END ......C=1, gamma=scale, kernel=rbf;, score=0.999 total time=  52.1s
[CV 2/2] END ......C=1, gamma=scale, kernel=rbf;, score=0.999 total time=  49.7s
[CV 1/2] END .......C=1, gamma=auto, kernel=rbf;, score=0.999 total time= 1.3min
[CV 2/2] END .......C=1, gamma=auto, kernel=rbf;, score=0.999 total time= 1.2min
[CV 1/2] END .....C=10, gamma=scale, kernel=rbf;, score=0.999 total time=  39.3s
[CV 2/2] END .....C=10, gamma=scale, kernel=rbf;, score=0.999 total time=  38.8s
[CV 1/2] END ......C=10, gamma=auto, kernel=rbf;, score=0.999 total time=  53.2s
[CV 2/2] END ......C=10, gamma=auto, kernel=rbf;,

GridSearchCV(cv=2, estimator=SVC(),
             param_grid={'C': [0.1, 1, 10], 'gamma': ['scale', 'auto'],
                         'kernel': ['rbf']},
             verbose=3)

In [None]:
print('Best Parameters:',svc_grid.best_params_, 'Best Estimator:',svc_grid.best_estimator_) 

Best Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'} Best Estimator: SVC(C=10)


In [None]:
grid_predictions = svc_grid.predict(X_test) 
from sklearn.metrics import confusion_matrix
print(confusion_matrix(y_test,grid_predictions))
#print(classification_report(y_test, grid_predictions)) 

[[56857     4]
 [   27    74]]


In [None]:
grid_roc=roc_auc_score(y_test, grid_predictions)
grid_acc = accuracy_score(y_test, grid_predictions)
grid_prec = precision_score(y_test, grid_predictions)
grid_rec = recall_score(y_test, grid_predictions)
grid_f1 = f1_score(y_test, grid_predictions)
print('roc:',grid_roc)
print('accuracy:',grid_acc)
print('precision:',grid_prec)
print('recall:',grid_rec)
print('f1:',grid_f1)

roc: 0.8663014601701109
accuracy: 0.9994557775359011
precision: 0.9487179487179487
recall: 0.7326732673267327
f1: 0.8268156424581006


In [None]:
df2 = {'Model': 'SVC With GridSearchCV', 'Accuracy': grid_acc, 'Precision': grid_prec, 'Recall': grid_rec, 'F1 Score': grid_f1, 'ROC': grid_roc} 
results = results.append(df2, ignore_index = True)
results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC
0,Naive Bayes Classifier,0.978231,0.065599,0.851485,0.121813,0.914971
1,Support Vector Machine,0.99935,0.872093,0.742574,0.802139,0.87119
2,K- Nearest Neigbor,0.999544,0.921348,0.811881,0.863158,0.905879
3,SVC With GridSearchCV,0.999456,0.948718,0.732673,0.826816,0.866301


In [None]:
results

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score,ROC
0,Naive Bayes Classifier,0.978231,0.065599,0.851485,0.121813,0.914971
1,Support Vector Machine,0.99935,0.872093,0.742574,0.802139,0.87119
2,K- Nearest Neigbor,0.999544,0.921348,0.811881,0.863158,0.905879
3,SVC With GridSearchCV,0.999456,0.948718,0.732673,0.826816,0.866301


### Model Boosting

**Gradient Boosting Classifier**

Parameters

- **n_estimators:** Represents the number of trees in the forest
- **learning_rate:** Shrinks the contribution of each tree by learning_rate.
- **max_features:** Represents the number of features to consider when looking for the best split 
- **max_depth:** Indicates how deep the built tree can be
- **random_state:** Random state ensures that the splits that you generate are reproducible. Used as a seed to the random number generator. This ensures that the random numbers are generated in the same order


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, classification_report
#from sklearn.metrics import  accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

model_gb = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_features=2, max_depth=2, random_state=0)
model_gb.fit(X_train, y_train)

print("Accuracy score (training set): {0:.3f}".format(model_gb.score(X_train, y_train)))
print("Accuracy score (testing set): {0:.3f}".format(model_gb.score(X_test, y_test)))

predictions = model_gb.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))

print("Classification Report")
print(classification_report(y_test, predictions))

Accuracy score (training set): 0.999
Accuracy score (testing set): 0.999
Confusion Matrix:
[[56835    26]
 [   48    53]]
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56861
           1       0.67      0.52      0.59       101

    accuracy                           1.00     56962
   macro avg       0.84      0.76      0.79     56962
weighted avg       1.00      1.00      1.00     56962



**Let us try different learning rates to compare the performance of the classifier's performance at different learning rates**

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import  accuracy_score, f1_score, precision_score, recall_score, roc_auc_score
lr_rate_list = [0.05, 0.075, 0.1, 0.25, 0.5, 0.75, 1]

for i in lr_rate_list:
    model_gb = GradientBoostingClassifier(n_estimators=20, learning_rate=i, max_features=2, max_depth=2, random_state=0)
    model_gb.fit(X_train, y_train)
#   y_pred (using model_gb, x_test)
    print("Learning rate: ", i)
    print("Accuracy score (training set): {0:.3f}".format(model_gb.score(X_train, y_train)))
    print("Accuracy score (testing set): {0:.3f}".format(model_gb.score(X_test, y_test)))

    #print("f1 score (testing set): {0:.3f}".format(f1score(y_pred, y_test)))
   

**From above, we can see that a learning rate of 0.05 gives us the best performance on the testing set and good performance on the training set**

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

gb_clf2 = GradientBoostingClassifier(n_estimators=20, learning_rate=0.05, max_features=2, max_depth=2, random_state=0)
gb_clf2.fit(X_train, y_train)
predictions = gb_clf2.predict(X_test)

print("Confusion Matrix:")
print(confusion_matrix(y_test, predictions))

print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix:
[[56812    49]
 [   34    67]]
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     56861
           1       0.58      0.66      0.62       101

    accuracy                           1.00     56962
   macro avg       0.79      0.83      0.81     56962
weighted avg       1.00      1.00      1.00     56962



___
**Observations:**
- From above, you can see that the Precision and Recall Score is 0.58 and 0.66 
respectively. But, we know that the accuracy is close to 99 which can be a misleading factor in this scenario
___

**XGBoost**

Parameters
- **n_estimators:** Represents the number of trees in the forest
- **learning_rate:** Shrinks the contribution of each tree by learning_rate.
- **max_depth:** Indicates how deep the built tree can be


In [None]:
# !pip install xgboost (required only on your local machine...colab has already xgboost installted)
import xgboost as xgb

In [None]:
import xgboost as xgb
# model = xgb.XGBClassifier(n_estimators = 5000, max_depth = 30, learning_rate = 0.01)
model = xgb.XGBClassifier(n_estimators = 1000, max_depth = 5, learning_rate = 0.01)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

KeyboardInterrupt: ignored

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

print(classification_report(y_test, y_pred))

In [None]:
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

### Dealing with Imbalanced Classes: Re-sampling the data set


Two methods to create a balanced dataset out of an imbalanced one are:

- **Under-Sampling:**  
 - Balances the data by reducing the size of the majority class
 - Used when quantity of data is sufficient
 - By keeping all samples in the minority class and randomly selecting an equal number of samples in the majority class, a balanced new dataset can be made

- **Over-Sampling:**
 - Balances the data by increasing the size of the minority class 
 - New minority class samples are generated by using **SMOTE** (**Synthetic Minority Over-Sampling Technique**)

**Oversampling:**
Using the resampling module from Scikit-Learn to randomly replicate samples from the minority class

In [8]:
df = pd.read_csv('/content/creditcard.csv')

In [9]:
from sklearn.utils import resample
from sklearn.model_selection import train_test_split

not_fraud = df[df.Class==0]
fraud = df[df.Class==1]
not_fraud.shape, fraud.shape

((284315, 31), (492, 31))

In [23]:
fraud_upsampled = resample(fraud,
                          replace=True, # sample with replacement
                          n_samples=len(not_fraud), # match number in majority class
                          random_state=27) # reproducible results

fraud_upsampled.Class.value_counts()


1    284315
Name: Class, dtype: int64

In [12]:
y = df.Class
X = df.drop('Class', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print('x_train:',X_train.shape)
print('y_train:',y_train.shape)
print('x_test:',X_test.shape)
print('y_test:',y_test.shape)
X = pd.concat([X_train, y_train], axis=1)

x_train: (227845, 30)
y_train: (227845,)
x_test: (56962, 30)
y_test: (56962,)


In [24]:
upsampled = pd.concat([not_fraud, fraud_upsampled])
print(upsampled.shape)
upsampled.Class.value_counts()

(568630, 31)


0    284315
1    284315
Name: Class, dtype: int64

In [None]:
  
# %matplotlib inline

# plt.figure(figsize=(12,8))
# ax = sns.countplot(upsampled["Class"], color='green')
# for p in ax.patches:
#     x = p.get_bbox().get_points()[:,0]
    
#     y = p.get_bbox().get_points()[1,1]
    
#     ax.annotate('{:.2g}%'.format(100.*y/len(upsampled)), (x.mean(), y), ha='center', va='bottom')
# plt.show()

**SVM**

In [None]:
from sklearn import svm
from sklearn.metrics import accuracy_score


y_train = upsampled.Class
X_train = upsampled.drop('Class', axis=1)

upsampled = svm.LinearSVC(random_state=20).fit(X_train, y_train)


# Create Linear SVM object
#support = svm.LinearSVC(random_state=20)

# Train the model using the training sets 
#support.fit(x_train, y_train)


In [15]:
predicted = upsampled.predict(X_test)

# Checking the Model Accuracy
score=accuracy_score(y_test,predicted)
print("Your Model Accuracy is", score)

Your Model Accuracy is 0.8768477230434325


In [17]:
# Checking accuracy
print('Accuracy Score:',accuracy_score(y_test, predicted))
from sklearn.metrics import  accuracy_score, f1_score, precision_score, recall_score, roc_auc_score, classification_report
print(classification_report(y_test, predicted))

Accuracy Score: 0.8768477230434325
              precision    recall  f1-score   support

           0       1.00      0.88      0.93     56864
           1       0.01      0.87      0.02        98

    accuracy                           0.88     56962
   macro avg       0.51      0.87      0.48     56962
weighted avg       1.00      0.88      0.93     56962



In [None]:
print(confusion_matrix(y_test, predicted))

**Under-sampling**

In [25]:
# Downsample Majority

not_fraud_downsampled = resample(not_fraud,
                                replace = False, # sample without replacement
                                n_samples = len(fraud), # match minority n
                                random_state = 42) # reproducible results
print(not_fraud_downsampled.shape)
# Combine Minority and Downsampled Majority

downsampled = pd.concat([not_fraud_downsampled, fraud])

downsampled.Class.value_counts()

(492, 31)


0    492
1    492
Name: Class, dtype: int64

**SVM**

In [19]:
y_train = downsampled.Class
X_train = downsampled.drop('Class', axis=1)

undersampled = svm.LinearSVC(random_state=20).fit(X_train, y_train)

undersampled_pred = undersampled.predict(X_test)

# Checking accuracy
print('Accuracy Score:',accuracy_score(y_test, undersampled_pred))

print(classification_report(y_test, undersampled_pred))


Accuracy Score: 0.005249113444050419
              precision    recall  f1-score   support

           0       1.00      0.00      0.01     56864
           1       0.00      1.00      0.00        98

    accuracy                           0.01     56962
   macro avg       0.50      0.50      0.01     56962
weighted avg       1.00      0.01      0.01     56962



**Ovsersampling Using [imblearn's](https://imbalanced-learn.readthedocs.io/en/stable/index.html) SMOTE**

**SMOTE**(Synthetic Minority Oversampling Technique)
- Most commonly used oversampling methods to solve the imbalance problem
- Balances class distribution by randomly increasing minority class examples by replicating them
- Synthesises new minority instances between existing minority instances
- Generates the virtual training records by linear interpolation for the minority class
- These synthetic training records are generated by randomly selecting one or more of the k-nearest neighbors for each example in the minority class
- After the oversampling process, the data is reconstructed and several classification models can be applied for the processed data

**NOTE:**If you want to learn more about **SMOTE**, [**Click Here!**](https://imbalanced-learn.readthedocs.io/en/stable/generated/imblearn.over_sampling.SMOTE.html)


In [21]:
from imblearn.over_sampling import SMOTE

X = df.drop('Class', axis=1)
y = df['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=27)

sm = SMOTE(random_state=27)
X_train, y_train = sm.fit_resample(X_train, y_train)

print('x_train:',X_train.shape)
print('y_train:',y_train.shape)
print('x_test:',X_test.shape)
print('y_test:',y_test.shape)


x_train: (426490, 30)
y_train: (426490,)
x_test: (71202, 30)
y_test: (71202,)


In [22]:
smote_svm = svm.LinearSVC(random_state=20).fit(X_train, y_train)

smote_pred = smote_svm.predict(X_test)

# Checking Accuracy, Recall, and F1-Score
print('Accuracy Score:',accuracy_score(y_test, smote_pred))
print(classification_report(y_test, smote_pred))



Accuracy Score: 0.9643689783994832
              precision    recall  f1-score   support

           0       1.00      0.96      0.98     71070
           1       0.04      0.83      0.08       132

    accuracy                           0.96     71202
   macro avg       0.52      0.90      0.53     71202
weighted avg       1.00      0.96      0.98     71202



___
**Observations:**
- SMOTE outperformed other re-sampling techniques (over-sampling and under-sampling)
- Achieved a **Recall Score** of **0.81** (Best Value: **1** and Worst Value: **0**)
___

###**Model Intrepretation: Eli5**

In [None]:
!pip install eli5

In [None]:
import eli5 as eli

**show_weights:** Shows the features and their weights


In [None]:
eli.show_weights(smote_svm)

Let us clearly define feature names

In [None]:
df.columns

In [None]:
class_names = ['0' , '1']
feature_names = ['Time', 'V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10',
       'V11', 'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20',
       'V21', 'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount']

In [None]:
eli.show_weights(smote_svm, feature_names = feature_names, target_names = class_names)

___


In [None]:
#Show Explanation for a Single Prediction

eli.show_prediction(smote_svm, np.array(x_test)[1], feature_names = feature_names, target_names = class_names )

#eli5 shows us the contribution of each feature in predicting the output

###**PyCaret**


Use **PyCaret** to find the best model and perform Automatic Hyperparameter tuning

**PyCaret** is an open source, low-code machine learning library in **Python** that allows you to go from preparing your data to deploying your model within minutes in your choice of notebook environment

[**Click Here!**](https://pycaret.org/) to learn more about **PyCaret**

**Installing PyCaret**

- !pip install pycaret

####**Tasks to be performed**

- Import PyCaret and load the data set
- Initialize or setup the environment 
- Compare Multiple Models and their Accuracy Metrics
- Create the model
- Tune the model
- Evaluate the model


####**Import PyCaret and load the data set**

In [None]:
import pycaret.classification as pc
#dir(pc)

In [None]:
#Loading the dataset

df = pd.read_csv('/content/creditcard.csv')

df.head() #Printing the first 5 rows of dataframe

####**Initialize or setup the environment**

In [None]:
pc.setup(df, target='Class')

___
**Observations:**
- The target type (Serial No. 2) is **Binary** because we have two values in **Class** column i.e., **0** and **1**
___

####**Compare Multiple Models and their Accuracy Metrics**

In [None]:
pc.compare_models()

**Note:** Don't worry about the models. You are gonna learn most of them in the upcoming modules

####**Create the Model**



In [None]:
rf_model = pc.create_model('rf') #Performs K-Fold (10) CV for the selected model

####**Tune the Model**

In [None]:
tuned_rf = pc.tune_model(rf_model)

In [None]:
print(rf_model)

In [None]:
print(tuned_rf)

See the difference between the original model (**rf_model**) and the tuned model (**tuned_rf**)

####**Evaluate the Model**

In [None]:
tuned_rf_eval = pc.evaluate_model(tuned_rf)

###**Deploy a Web App to predict Fraudulent Transactions Using Streamlit**

- **Save the next cell as a .py file**
- **Run it in your local system** (streamlit run filename.py)

In [None]:
import streamlit as st
import pandas as pd
import numpy as np

#Model Building
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

#Data Pre-processing
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split

#Model Evaluation
from sklearn.metrics import plot_confusion_matrix, plot_roc_curve, plot_precision_recall_curve
from sklearn.metrics import precision_score, recall_score

def main():
    st.title("Binary Classification Web App")
    st.sidebar.title("Binary Classification Web App")
    st.markdown("Are your transactions fraud or legit?")
    st.sidebar.markdown("Are your transactions fraud or legit?")

    @st.cache(persist=True)
    def load_data():
        df = pd.read_csv("creditcard.csv")

        rs = RobustScaler()
        df['scaled_amount'] = rs.fit_transform(df['Amount'].values.reshape(-1,1))
        df['scaled_time'] = rs.fit_transform(df['Time'].values.reshape(-1,1))
        df.drop(['Time', 'Amount'], axis=1, inplace=True)

        scaled_amount = df['scaled_amount']
        scaled_time = df['scaled_time']
        df.drop(['scaled_amount', 'scaled_time'], axis=1, inplace=True)
        df.insert(0, 'scaled_amount', scaled_amount)
        df.insert(0, 'scaled_time', scaled_time)

        return df
        
    @st.cache(persist=True)
    def split(df):
        from sklearn.model_selection import train_test_split as holdout
        x = np.array(df.iloc[:, df.columns != 'Class'])
        y = np.array(df.iloc[:, df.columns == 'Class'])
        
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)
        return x_train, x_test, y_train, y_test
    
    def plot_metrics(metrics_list):
        if 'Confusion Matrix' in metrics_list:
            st.subheader("Confusion Matrix")
            plot_confusion_matrix(model, x_test, y_test, display_labels=class_names)
            st.pyplot()

        if 'ROC Curve' in metrics_list:
            st.subheader("ROC Curve")
            plot_roc_curve(model, x_test, y_test)
            st.pyplot()
        
        if 'Precision-Recall Curve' in metrics_list:
            st.subheader('Precision-Recall Curve')
            plot_precision_recall_curve(model, x_test, y_test)
            st.pyplot()

    df = load_data()
    class_names = [0, 1]
    
    x_train, x_test, y_train, y_test = split(df)

    st.sidebar.subheader("Choose Classifier")
    classifier = st.sidebar.selectbox("Classifier", ("Support Vector Machine (SVM)", "KNN", "Naive Bayes"))

    if classifier == 'Support Vector Machine (SVM)':
        st.sidebar.subheader("Model Hyperparameters")
        #choose parameters
        C = st.sidebar.number_input("C (Regularization parameter)", 0.01, 10.0, step=0.01, key='C_SVM')
        kernel = st.sidebar.radio("Kernel", ("rbf", "linear"), key='kernel')
        gamma = st.sidebar.radio("Gamma (Kernel Coefficient)", ("scale", "auto"), key='gamma')

        metrics = st.sidebar.multiselect("What metrics to plot?", ('Confusion Matrix', 'ROC Curve', 'Precision-Recall Curve'))
        
        if st.sidebar.button("Classify", key='classify'):
            st.subheader("Support Vector Machine (SVM) Results")
            model = SVC(C=C, kernel=kernel, gamma=gamma)
            model.fit(x_train, y_train)
            accuracy = model.score(x_test, y_test)
            y_pred = model.predict(x_test)
            st.write("Accuracy: ", accuracy.round(2))
            st.write("Precision: ", precision_score(y_test, y_pred, labels=class_names).round(2))
            st.write("Recall: ", recall_score(y_test, y_pred, labels=class_names).round(2))
            plot_metrics(metrics)
    
    if classifier == 'KNN':
        metrics = st.sidebar.multiselect("What metrics to plot?", ('Confusion Matrix', 'ROC Curve', 'Precision-Recall Curve'))

        if st.sidebar.button("Classify", key='classify'):
            st.subheader("KNN Results")
            model = KNeighborsClassifier(n_neighbors=3,metric='euclidean')
            model.fit(x_train, y_train)
            accuracy = model.score(x_test, y_test)
            y_pred = model.predict(x_test)
            st.write("Accuracy: ", accuracy.round(2))
            st.write("Precision: ", precision_score(y_test, y_pred, labels=class_names).round(2))
            st.write("Recall: ", recall_score(y_test, y_pred, labels=class_names).round(2))
            plot_metrics(metrics)
    
    if classifier == 'Naive Bayes':
        metrics = st.sidebar.multiselect("What metrics to plot?", ('Confusion Matrix', 'ROC Curve', 'Precision-Recall Curve'))

        if st.sidebar.button("Classify", key='classify'):
            st.subheader("Naive Bayes Results")
            model = GaussianNB()

            model.fit(x_train, y_train)
            accuracy = model.score(x_test, y_test)
            y_pred = model.predict(x_test)
            st.write("Accuracy: ", accuracy.round(2))
            st.write("Precision: ", precision_score(y_test, y_pred, labels=class_names).round(2))
            st.write("Recall: ", recall_score(y_test, y_pred, labels=class_names).round(2))
            plot_metrics(metrics)

    if st.sidebar.checkbox("Show raw data", False):
        st.subheader("Credit Card Data Set (Classification)")
        st.write(df)
        
if __name__ == '__main__':
    main()


