In [58]:
import numpy as np
import pandas as pd 
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix,f1_score
from imblearn.over_sampling import SMOTE
import graphviz

## Loading the data set

In [7]:
df=pd.read_csv("C:\\Users\\ritik\\Downloads\\winequality_red.csv")

In [8]:
df

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
1,7.8,0.880,0.00,2.6,0.098,25.0,67.0,0.99680,3.20,0.68,9.8,5
2,7.8,0.760,0.04,2.3,0.092,15.0,54.0,0.99700,3.26,0.65,9.8,5
3,11.2,0.280,0.56,1.9,0.075,17.0,60.0,0.99800,3.16,0.58,9.8,6
4,7.4,0.700,0.00,1.9,0.076,11.0,34.0,0.99780,3.51,0.56,9.4,5
...,...,...,...,...,...,...,...,...,...,...,...,...
1594,6.2,0.600,0.08,2.0,0.090,32.0,44.0,0.99490,3.45,0.58,10.5,5
1595,5.9,0.550,0.10,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.510,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5


The data set consists following variables :
  1. fixed acidity 
  2. volitile acidity
  3. Critic acid 
  4. residual sugar 
  5. clorides
  6. free sulfur dioxide
  7. total sulfue dioaxide
  8. density
  9. pH
  10. Sulphate 
  11. alcohol
  12. quality 
    

## Checking the shape 

In [9]:
df.shape

(1599, 12)

## Seggregate the feature and target


In [10]:
x=df.drop(columns=["quality"])
y=df["quality"]

## Do scaling of feature 

In [11]:
from sklearn.preprocessing import StandardScaler
scaler=StandardScaler()
new_data=scaler.fit_transform(x)

In [12]:
new_data

array([[-0.52835961,  0.96187667, -1.39147228, ...,  1.28864292,
        -0.57920652, -0.96024611],
       [-0.29854743,  1.96744245, -1.39147228, ..., -0.7199333 ,
         0.1289504 , -0.58477711],
       [-0.29854743,  1.29706527, -1.18607043, ..., -0.33117661,
        -0.04808883, -0.58477711],
       ...,
       [-1.1603431 , -0.09955388, -0.72391627, ...,  0.70550789,
         0.54204194,  0.54162988],
       [-1.39015528,  0.65462046, -0.77526673, ...,  1.6773996 ,
         0.30598963, -0.20930812],
       [-1.33270223, -1.21684919,  1.02199944, ...,  0.51112954,
         0.01092425,  0.54162988]])

## Checking imbalance data

In [13]:
df["quality"].value_counts()

5    681
6    638
7    199
4     53
8     18
3     10
Name: quality, dtype: int64

here the class imbalance so here handle with smote

In [17]:
smote=SMOTE()
x_smote,y_smote=smote.fit_resample(new_data,y)

In [18]:
from collections import Counter
Counter(y_smote)

Counter({5: 681, 6: 681, 7: 681, 4: 681, 8: 681, 3: 681})

In [19]:
print(x_smote.shape)
print(y_smote.shape)

(4086, 11)
(4086,)


In [20]:
x_train,x_test,y_train,y_test=train_test_split(x_smote,y_smote,test_size=0.25,random_state=3)

In [21]:
print(x_train.shape)
print(y_train.shape)

(3064, 11)
(3064,)


In [22]:
print(x_test.shape)
print(y_test.shape)

(1022, 11)
(1022,)


## SVM classifier

In [23]:
model=SVC()
model.fit(x_train,y_train)

SVC()

In [24]:
y_pred=model.predict(x_test)

In [25]:
acc=accuracy_score(y_test,y_pred)
acc

0.7661448140900196

In [29]:
y_pred_train=model.predict(x_train)
acc=accuracy_score(y_train,y_pred_train)
acc

0.7966710182767625

In [30]:
c=confusion_matrix(y_test,y_pred)
c

array([[170,   0,   0,   0,   0,   0],
       [  7, 138,  18,   1,   0,   0],
       [  6,  25,  95,  33,   6,   0],
       [  2,  16,  38,  73,  37,   7],
       [  0,   2,   5,  19, 133,  17],
       [  0,   0,   0,   0,   0, 174]], dtype=int64)

In [31]:
f1=f1_score(y_test,y_pred,average="weighted")
f1

0.7553126837051535

In [40]:
model_1=SVC(kernel="linear")
model_1.fit(x_train,y_train)
y_pred=model_1.predict(x_test)
y_pred_train=model_1.predict(x_train)
acc_t=accuracy_score(y_test,y_pred)
acc_tr=accuracy_score(y_train,y_pred_train)

In [41]:
print("accuracy of train",acc_tr)
print("accuracy of test",acc_t)

accuracy of train 0.6377284595300261
accuracy of test 0.6301369863013698


In [63]:
param_grid={"gamma":[0.1,1,10,20,30,40], 'C': [1,0.5,0.1,1.5,2,2.5]}

In [64]:
grid= GridSearchCV(SVC(),param_grid,verbose=3,n_jobs =-1)

In [65]:
grid.fit(x_train,y_train)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


GridSearchCV(estimator=SVC(), n_jobs=-1,
             param_grid={'C': [1, 0.5, 0.1, 1.5, 2, 2.5],
                         'gamma': [0.1, 1, 10, 20, 30, 40]},
             verbose=3)

In [68]:
grid.best_params_

{'C': 1.5, 'gamma': 1}

## New ,Fit the model using optical parameters of c and gamma

In [69]:
model_new=SVC(C=1.5,gamma=1)
model_new.fit(x_train,y_train)

SVC(C=1.5, gamma=1)

In [70]:
accuracy_score(y_test,model_new.predict(x_test))


0.8796477495107632