# Capstone project - HealthCare Fraud Detection : Support Vector Machine Models

In [9]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearnex import patch_sklearn 
patch_sklearn()
from sklearn import preprocessing
from sklearn.svm import LinearSVC,SVC
from sklearn.model_selection import train_test_split,\
StratifiedKFold, cross_val_score, RandomizedSearchCV
from yellowbrick.classifier import confusion_matrix, classification_report, ROCAUC
from yellowbrick.model_selection import CVScores
from cp_clean_helper import show_values
from LGR_helper import std_num_cols, rb_scale_cols, model_results, get_confusion_matrix
# Use the ggplot look for every matplotlib figure drawn in this notebook.
plt.style.use('ggplot')
import warnings
# Suppress all Python warnings for the rest of the session.
# NOTE(review): a blanket "ignore" would also hide any solver convergence
# warnings raised while fitting the models below — consider narrowing the filter.
warnings.filterwarnings("ignore")
# Let pandas display up to 100 columns before truncating a frame.
pd.set_option('display.max_columns', 100)

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


### Loading datasets and looking at shapes

In [2]:
# Load the pre-balanced 70:30 train/test splits (SMOTE and BorderlineSMOTE).
# The same columns were dropped from these files as for the LGR model.

def _read_split(csv_path):
    """Read one saved split, using the file's first column as the index."""
    return pd.read_csv(csv_path, index_col=0)


trainX_SM, trainY_SM = _read_split("trainX_SM.csv"), _read_split("trainY_SM.csv")
testX_SM, testY_SM = _read_split("testX_SM.csv"), _read_split("testY_SM.csv")

trainX_BSM, trainY_BSM = _read_split("trainX_BSM.csv"), _read_split("trainY_BSM.csv")
testX_BSM, testY_BSM = _read_split("testX_BSM.csv"), _read_split("testY_BSM.csv")

# Dataset shapes: one (label, frame) pair per line of the report

_shape_report = [
    ('Shape of SMOTE balanced trainX data :', trainX_SM),
    ('Shape of SMOTE balanced trainY data :', trainY_SM),
    ('Shape of SMOTE balanced testX data :', testX_SM),
    ('Shape of SMOTE balanced testY data :', testY_SM),
    ('Shape of Borderline SMOTE balanced trainX data :', trainX_BSM),
    ('Shape of Borderline SMOTE balanced trainY data :', trainY_BSM),
    ('Shape of Borderline SMOTE balanced testX data :', testX_BSM),
    ('Shape of Borderline SMOTE balanced testY data :', testY_BSM),
]

print('\n')
for _label, _frame in _shape_report:
    print(_label, _frame.shape)
print('\n')
print("*"*60)

# Class ratios: percentage of rows per target class in each label file

_ratio_report = [
    ('Class ratio - Fraud/Non-Fraud (trainY_SM) :', trainY_SM),
    ('Class ratio - Fraud/Non-Fraud (testY_SM) :', testY_SM),
    ('Class ratio - Fraud/Non-Fraud (trainY_BSM) :', trainY_BSM),
    ('Class ratio - Fraud/Non-Fraud (testY_BSM) :', testY_BSM),
]

print('\n')
for _label, _target in _ratio_report:
    print(_label, _target.value_counts(normalize=True)*100)
print('\n')
print("*"*60)



Shape of SMOTE balanced trainX data : (483580, 44)
Shape of SMOTE balanced trainY data : (483580, 1)
Shape of SMOTE balanced testX data : (207250, 44)
Shape of SMOTE balanced testY data : (207250, 1)
Shape of Borderline SMOTE balanced trainX data : (483580, 44)
Shape of Borderline SMOTE balanced trainY data : (483580, 1)
Shape of Borderline SMOTE balanced testX data : (207250, 44)
Shape of Borderline SMOTE balanced testY data : (207250, 1)


************************************************************


Class ratio - Fraud/Non-Fraud (trainY_SM) : PotentialFraud
0                 50.0
1                 50.0
dtype: float64
Class ratio - Fraud/Non-Fraud (testY_SM) : PotentialFraud
0                 50.0
1                 50.0
dtype: float64
Class ratio - Fraud/Non-Fraud (trainY_BSM) : PotentialFraud
0                 50.0
1                 50.0
dtype: float64
Class ratio - Fraud/Non-Fraud (testY_BSM) : PotentialFraud
0                 50.0
1                 50.0
dtype: float64


*******

### Standardizing the train/test features for balanced datasets:

In [3]:
# Standardize the feature frames: the SMOTE pair first, then the
# BorderlineSMOTE pair, train before test within each pair.
# NOTE(review): std_num_cols is called only for its side effect (no return
# value is kept), so it presumably scales each frame in place — confirm in
# LGR_helper. If it derives its statistics from the frame it is handed, the
# test frames are scaled with their own statistics rather than the training
# ones — verify.
for _features in (trainX_SM, testX_SM, trainX_BSM, testX_BSM):
    std_num_cols(_features)

### Baseline Linear Support Vector Classifier Model - Standardized

In [4]:
# Baseline linear SVC on the SMOTE-balanced split (standardized features).
# The estimator keeps library defaults apart from a fixed seed and solver logging.
svm_std = LinearSVC(verbose=1, random_state=0)
model_results(
    trainX_SM, trainY_SM,
    testX_SM, testY_SM,
    svm_std,
    show=True,
)

[LibLinear]The Model Results for  LinearSVC(random_state=0, verbose=2)
************************************************************
Train Accuracy is equal to 0.516
Test Accuracy is equal to 0.513
The Precision score is 0.520
The Average Precision score is 0.507
The Recall score is 0.340
The F1 score is 0.411
The AUC/ROC score is 0.513
True-Positive: 35201.000
True-Negative: 71106.000
False-Positive: 32519.000
False-Negative: 68424.000
Correctly Classified: 106307.000
Incorrectly Classified: 100943.000


[0.5158691426444435,
 0.512940892641737,
 0.5198021264028352,
 0.33969601930036186]

In [5]:
# Baseline linear SVC on the BorderlineSMOTE-balanced split (standardized features).
# The estimator keeps library defaults apart from a fixed seed and solver logging.
svm_std = LinearSVC(verbose=1, random_state=0)
model_results(
    trainX_BSM, trainY_BSM,
    testX_BSM, testY_BSM,
    svm_std,
    show=True,
)

[LibLinear]The Model Results for  LinearSVC(random_state=0, verbose=2)
************************************************************
Train Accuracy is equal to 0.504
Test Accuracy is equal to 0.505
The Precision score is 0.504
The Average Precision score is 0.503
The Recall score is 0.716
The F1 score is 0.591
The AUC/ROC score is 0.505
True-Positive: 74240.000
True-Negative: 30452.000
False-Positive: 73173.000
False-Negative: 29385.000
Correctly Classified: 104692.000
Incorrectly Classified: 102558.000


[0.5044522105959717,
 0.5051483715319662,
 0.5036190837985794,
 0.7164294330518697]

### Robust scaling the train/test features for balanced datasets:

In [10]:
# Robust-scale the feature frames: the SMOTE pair first, then the
# BorderlineSMOTE pair, train before test within each pair.
# NOTE(review): these are the same frame objects that were already passed to
# std_num_cols earlier in the notebook. If both helpers modify their argument
# in place, robust scaling is applied on top of the standardized values rather
# than the raw ones — confirm in LGR_helper.
for _features in (trainX_SM, testX_SM, trainX_BSM, testX_BSM):
    rb_scale_cols(_features)

### Baseline Linear Support Vector Classifier Model - Robust Scaled

In [15]:
# SMOTE Balanced
# NOTE(review): this cell is fully commented out, so it does nothing on
# Restart & Run All. The output recorded below it names
# LinearSVC(loss='hinge', random_state=0, verbose=1), which does not match the
# disabled call here — the output is stale. Restore the intended call or drop
# the cell together with its output.

# svm_rb = LinearSVC(random_state=0,verbose=1)
# model_results(trainX_SM, trainY_SM, testX_SM, testY_SM, svm_rb, show = True)

[LibLinear]The Model Results for  LinearSVC(loss='hinge', random_state=0, verbose=1)
************************************************************
Train Accuracy is equal to 0.516
Test Accuracy is equal to 0.513
The Precision score is 0.520
The Average Precision score is 0.507
The Recall score is 0.340
The F1 score is 0.411
The AUC/ROC score is 0.513
True-Positive: 35201.000
True-Negative: 71106.000
False-Positive: 32519.000
False-Negative: 68424.000
Correctly Classified: 106307.000
Incorrectly Classified: 100943.000


[0.5158691426444435,
 0.512940892641737,
 0.5198021264028352,
 0.33969601930036186]

In [16]:
# BorderlineSMOTE Balanced
# NOTE(review): this cell is fully commented out, so it does nothing on
# Restart & Run All. The output recorded below it names
# LinearSVC(loss='hinge', random_state=0, verbose=1), which does not match the
# disabled call here — the output is stale. Restore the intended call or drop
# the cell together with its output.

# svm_rb = LinearSVC(random_state=0,verbose=1)
# model_results(trainX_BSM, trainY_BSM, testX_BSM, testY_BSM, svm_rb, show = True)

[LibLinear]The Model Results for  LinearSVC(loss='hinge', random_state=0, verbose=1)
************************************************************
Train Accuracy is equal to 0.504
Test Accuracy is equal to 0.505
The Precision score is 0.504
The Average Precision score is 0.503
The Recall score is 0.716
The F1 score is 0.591
The AUC/ROC score is 0.505
True-Positive: 74240.000
True-Negative: 30452.000
False-Positive: 73173.000
False-Negative: 29385.000
Correctly Classified: 104692.000
Incorrectly Classified: 102558.000


[0.5044522105959717,
 0.5051483715319662,
 0.5036190837985794,
 0.7164294330518697]

## Using RandomizedSearchCV to select best parameters (SM):

In [18]:
# Training features and labels for the SMOTE-balanced search
x1, y1 = trainX_SM, trainY_SM

# Base linear SVC; only its regularisation strength C is tuned below
svm_base_sm = LinearSVC(verbose=1, random_state=0)

# Candidate values for C, two decades apart from 1e-4 to 1e8
grid_para_svm = [{"C": [0.0001, 0.01, 1, 100, 10000, 1000000, 100000000]}]

# Search over C for the linear SVC with 5-fold cross-validation, ranking
# candidates by weighted F1 and keeping the training-fold scores as well
grid_search_svm = RandomizedSearchCV(
    estimator=svm_base_sm,
    param_distributions=grid_para_svm,
    scoring='f1_weighted',
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)

grid_search_svm.fit(x1, y1)

[LibLinear]

RandomizedSearchCV(cv=5, estimator=LinearSVC(random_state=0, verbose=1),
                   n_jobs=-1,
                   param_distributions=[{'C': [0.0001, 0.01, 1, 100, 10000,
                                               1000000, 100000000]}],
                   return_train_score=True, scoring='f1_weighted')

In [19]:
# The best parameters: the C value selected by the SMOTE search fitted above

grid_search_svm.best_params_

{'C': 1}

In [20]:
# The best score: cross-validated 'f1_weighted' score of the selected
# candidate (the scoring configured for the search)

grid_search_svm.best_score_

0.4351845046284975

In [22]:
# Error rate (1 - score) of the refit best estimator on the SMOTE
# training split and on the held-out SMOTE test split
_best_svm_sm = grid_search_svm.best_estimator_

_train_error_sm = 1 - _best_svm_sm.score(x1, y1)
print("The training error is: %.5f" % _train_error_sm)

_test_error_sm = 1 - _best_svm_sm.score(testX_SM, testY_SM)
print("The test     error is: %.5f" % _test_error_sm)

The training error is: 0.48413
The test     error is: 0.48706


## Using RandomizedSearchCV to select best parameters (BSM):

In [None]:
# Setting x and y variables (BorderlineSMOTE-balanced training split)

x2 = trainX_BSM 
y2 = trainY_BSM

# Setting model required parameters

svm_base_bsm = LinearSVC(random_state=0,verbose=1)

# Setting the parameter grid: candidate values for the regularisation strength C

grid_para_svm = [{
    "C": [0.0001, 0.01, 1, 100, 10000, 1000000, 100000000]}]

# Running the linear SVC through the randomized search.
# The search is built on svm_base_bsm, the estimator defined in this cell;
# the original passed svm_base_sm from the SMOTE section, leaving
# svm_base_bsm unused and tying this cell to state from an earlier one.
# Note: this rebinds grid_search_svm, so from here on it refers to the
# BorderlineSMOTE search rather than the SMOTE one.

grid_search_svm = RandomizedSearchCV(svm_base_bsm, grid_para_svm,\
                                     cv=5, scoring='f1_weighted',\
                                     return_train_score = True, n_jobs=-1)

grid_search_svm.fit(x2, y2)

In [None]:
# The best parameters: the C value selected by the BorderlineSMOTE search
# (grid_search_svm was rebound to that search in the cell above)

grid_search_svm.best_params_

In [None]:
# The best score: cross-validated 'f1_weighted' score of the selected
# candidate from the BorderlineSMOTE search

grid_search_svm.best_score_

In [None]:
# Best estimator training/test errors (BorderlineSMOTE search).
# Uses grid_search_svm, the search fitted on (x2, y2) above; the original
# referenced grid_search_logit, a name that is never defined in this notebook
# and would raise NameError.

print("The training error is: %.5f" % (1 - grid_search_svm.best_estimator_.score(x2, y2)))
print("The test     error is: %.5f" % (1 - grid_search_svm.best_estimator_.score(testX_BSM, testY_BSM)))

In [None]:
from sklearn.kernel_approximation import Nystroem

# Approximate a kernel feature map with 300 Nystroem components.
# Each map is fitted on the TRAINING features only and then applied unchanged
# to the matching test features. The original called fit_transform on the test
# sets too, which refits the map on test data: train and test columns then come
# from different landmark samples, so a model trained on one cannot be
# meaningfully evaluated on the other.

# SMOTE-balanced split
feature_map_nystroem = Nystroem(gamma=.2,random_state=1,n_components=300)
trainX_SM_tr = feature_map_nystroem.fit_transform(trainX_SM)
testX_SM_tr = feature_map_nystroem.transform(testX_SM)

# BorderlineSMOTE-balanced split: a separate map with the same settings, so the
# SMOTE map stays available in its fitted state
feature_map_nystroem_bsm = Nystroem(gamma=.2,random_state=1,n_components=300)
trainX_BSM_tr = feature_map_nystroem_bsm.fit_transform(trainX_BSM)
testX_BSM_tr = feature_map_nystroem_bsm.transform(testX_BSM)

In [None]:
# Linear SVC on the Nystroem-transformed SMOTE split, with C fixed at 1
svm_rb = LinearSVC(
    C=1,
    verbose=1,
    random_state=0,
)
model_results(
    trainX_SM_tr, trainY_SM,
    testX_SM_tr, testY_SM,
    svm_rb,
    show=True,
)

In [None]:
# Linear SVC on the Nystroem-transformed BorderlineSMOTE split.
# The original left `C=` empty, which is a SyntaxError. When the notebook runs
# top to bottom, grid_search_svm at this point is the search fitted on the
# BorderlineSMOTE training data (x2, y2), so its selected C is used here.
svm_rb = LinearSVC(random_state=0, verbose=1, C=grid_search_svm.best_params_["C"])
model_results(trainX_BSM_tr, trainY_BSM, testX_BSM_tr, testY_BSM, svm_rb, show = True)