In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler,LabelEncoder
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,\
ConfusionMatrixDisplay,roc_curve,roc_auc_score,precision_score,recall_score,f1_score

In [2]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
df = pd.read_csv("/content/drive/MyDrive/AIML_IITG/Datasets/wine_fraud.csv")
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,type
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,Legit,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,Legit,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,Legit,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,Legit,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,Legit,red


In [4]:
df["quality"].value_counts()

Unnamed: 0_level_0,count
quality,Unnamed: 1_level_1
Legit,6251
Fraud,246


In [5]:
df["quality"]=df["quality"].map({"Legit":0,"Fraud":1})

In [7]:
df["type"].value_counts()

Unnamed: 0_level_0,count
type,Unnamed: 1_level_1
white,4898
red,1599


In [8]:
df["type"]=df["type"].map({"white":0,"red":1})

In [9]:
X = df.drop("quality",axis = 1)
y = df["quality"]
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.1,random_state=101)

In [10]:
sc = StandardScaler()
x_train_scaled = sc.fit_transform(x_train)
x_test_scaled = sc.transform(x_test)

# Create a parameter grid with:
 # C values: [0.001, 0.01, 0.1, 0.5, 0.8, 1, 10]
# gamma values: ['scale', 'auto']
# kernel values: ['rbf', 'linear']

In [11]:
param = {
    'C': [0.001, 0.01, 0.1, 0.5, 0.8, 1, 10],
    'gamma': ['scale', 'auto'],
    'kernel': ['rbf', 'linear']
    }

In [13]:
svm = SVC(class_weight="balanced")

# Perform GridSearchCV with 5-fold cross-validation

In [18]:
grid = GridSearchCV(svm,param_grid=param,cv=5,scoring="accuracy")
grid.fit(x_train_scaled,y_train)


In [19]:
grid

# Display best parameters and best cross-validation score

In [20]:
grid.best_params_

{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}

In [22]:
score = pd.DataFrame(grid.cv_results_)


In [23]:
final_score = score[["params","mean_test_score"]]
final_score

Unnamed: 0,params,mean_test_score
0,"{'C': 0.001, 'gamma': 'scale', 'kernel': 'rbf'}",0.037455
1,"{'C': 0.001, 'gamma': 'scale', 'kernel': 'line...",0.805544
2,"{'C': 0.001, 'gamma': 'auto', 'kernel': 'rbf'}",0.037455
3,"{'C': 0.001, 'gamma': 'auto', 'kernel': 'linear'}",0.805544
4,"{'C': 0.01, 'gamma': 'scale', 'kernel': 'rbf'}",0.724303
5,"{'C': 0.01, 'gamma': 'scale', 'kernel': 'linear'}",0.79186
6,"{'C': 0.01, 'gamma': 'auto', 'kernel': 'rbf'}",0.724303
7,"{'C': 0.01, 'gamma': 'auto', 'kernel': 'linear'}",0.79186
8,"{'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}",0.819224
9,"{'C': 0.1, 'gamma': 'scale', 'kernel': 'linear'}",0.781257


In [25]:
final_score.sort_values(by="mean_test_score",ascending=False)

Unnamed: 0,params,mean_test_score
24,"{'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}",0.896529
26,"{'C': 10, 'gamma': 'auto', 'kernel': 'rbf'}",0.896358
22,"{'C': 1, 'gamma': 'auto', 'kernel': 'rbf'}",0.854629
20,"{'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}",0.854287
18,"{'C': 0.8, 'gamma': 'auto', 'kernel': 'rbf'}",0.851379
16,"{'C': 0.8, 'gamma': 'scale', 'kernel': 'rbf'}",0.851208
14,"{'C': 0.5, 'gamma': 'auto', 'kernel': 'rbf'}",0.842998
12,"{'C': 0.5, 'gamma': 'scale', 'kernel': 'rbf'}",0.842656
8,"{'C': 0.1, 'gamma': 'scale', 'kernel': 'rbf'}",0.819224
10,"{'C': 0.1, 'gamma': 'auto', 'kernel': 'rbf'}",0.819053


# Train final model with best parameters and evaluate on test set

In [26]:
svm = SVC(class_weight="balanced",kernel='rbf',C=10,gamma="scale")
svm.fit(x_train_scaled,y_train)
svm

In [27]:
y_pred = svm.predict(x_test_scaled)

# Compare performance before and after tuning

In [28]:
conf = confusion_matrix(y_test,y_pred)
conf

array([[557,  66],
       [ 15,  12]])

In [29]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.97      0.89      0.93       623
           1       0.15      0.44      0.23        27

    accuracy                           0.88       650
   macro avg       0.56      0.67      0.58       650
weighted avg       0.94      0.88      0.90       650

