In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to every matplotlib figure drawn below.
sns.set_theme()

***Loading the dataset***

In [None]:
# Location of the red-wine CSV, relative to the notebook's working directory.
csv_path = '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
data = pd.read_csv(csv_path)

# Preview the first rows to confirm the file parsed as expected.
data.head()

In [None]:
# Summary statistics, transposed so each column of `data` becomes one row.
data.describe().T

In [None]:
# (number of rows, number of columns) of the raw frame.
data.shape

***Checking for null values***

In [None]:
# Count of missing values in each column.
data.isnull().sum()

In [None]:
# Distribution of the `quality` column; discrete=True gives each distinct value
# its own bar and shrink=0.8 narrows the bars so they do not touch.
sns.displot(data['quality'],discrete=True,shrink=0.8)

***Checking for correlations***

In [None]:
# Pairwise correlation between all columns. A correlation matrix is symmetric,
# so the transpose leaves the values unchanged.
corrMat = data.corr().T
corrMat

***Visualizing the correlation matrix***

In [None]:
# plt.subplots returns a (figure, axes) pair; unpack it so `fig` really is the
# figure, and draw the heatmap on the axes created here instead of relying on
# pyplot's implicit "current axes".
fig, ax = plt.subplots(figsize=(14, 14))

# Annotate each cell with its correlation, formatted to one significant digit.
sns.heatmap(data.corr(), annot=True, fmt='.1g', ax=ax)

In [None]:
# Pairwise plots of every column against every other, coloured by `quality`.
sns.pairplot(data=data,hue='quality',palette="viridis")


***Checking distribution of some predictors***

In [None]:
# One figure-level histogram (with a KDE overlay) per predictor. Looping keeps
# the column list in one place instead of six copy-pasted calls; the figures
# are produced in the same order as before.
for column in ['sulphates', 'fixed acidity', 'alcohol', 'residual sugar',
               'total sulfur dioxide', 'chlorides']:
    sns.displot(data[column], kde=True)
 
 
 
 

***Removing outliers via the quantile method***

In [None]:
def drop_upper_tail(frame, column, upper=0.98):
    """Return the rows of ``frame`` whose ``column`` is strictly below its ``upper`` quantile.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the column whose upper tail is trimmed.
    upper : float, default 0.98
        Quantile of ``frame[column]`` used as the exclusive cut-off.

    Returns
    -------
    pandas.DataFrame
        The filtered rows with their original index. Rows exactly equal to the
        cut-off are dropped as well, because the comparison is a strict ``<``.
    """
    cutoff = frame[column].quantile(upper)
    return frame[frame[column] < cutoff]


# The filters are applied one after another, so each cut-off is computed on the
# rows that survived the previous step (the order therefore matters).
data1 = drop_upper_tail(data, 'alcohol')
data2 = drop_upper_tail(data1, 'residual sugar')
data3 = drop_upper_tail(data2, 'fixed acidity')
data4 = drop_upper_tail(data3, 'pH')
data5 = drop_upper_tail(data4, 'chlorides')
data6 = drop_upper_tail(data5, 'density')

# NOTE(review): 'quality' takes only a handful of distinct values, so a strict
# `<` against its 0.98 quantile removes every row at or above that score and may
# drop whole quality levels - confirm that this is intended.
data7 = drop_upper_tail(data6, 'quality')

# Only the last expression of a cell is displayed, so summarise the final frame.
data7.describe(include='all')

 

In [None]:
# Renumber the rows 0..n-1 after filtering; drop=True discards the old index
# instead of keeping it as a column.
data_clean = data7.reset_index(drop=True)
# Summary statistics of the cleaned frame.
data_clean.describe(include='all')

***Dropping correlated variables***

In [None]:
# Predictors: every column except the target and the four excluded features.
X = data_clean.drop(
    columns=['quality', 'citric acid', 'free sulfur dioxide', 'volatile acidity', 'density']
)

# Target: the quality score.
y = data_clean['quality']
y.shape

***Train-test split and scaling***

In [None]:
from sklearn.model_selection import train_test_split

from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

# Split BEFORE scaling. Fitting the scaler on the full data set would let the
# mean/variance of the test rows leak into the training features.
# `X` itself is left untouched, so this cell can be re-run safely.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Learn the scaling parameters from the training rows only, then apply the
# same transformation to the held-out rows.
sc = StandardScaler()
X_train = sc.fit_transform(X_train_raw)
X_test = sc.transform(X_test_raw)

print("Shape of X_train: ",X_train.shape)
print("Shape of X_test: ", X_test.shape)
print("Shape of y_train: ",y_train.shape)
print("Shape of y_test",y_test.shape)


***SVM predictor***

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

# Support-vector classifier with an RBF kernel, trained on the training split.
clf = SVC(kernel='rbf', random_state=42)
clf.fit(X_train, y_train)

# Fraction of held-out rows whose predicted quality matches the true value.
y_pred = clf.predict(X_test)
svm_acc = accuracy_score(y_test, y_pred)
svm_acc