***Importing Libraries***

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import missingno as msno
import seaborn as sb
import plotly.express as px
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBRegressor
from sklearn.metrics import accuracy_score,classification_report
import warnings
# Silence all warnings for the rest of the session (this also hides deprecation
# and convergence warnings raised by later cells).
warnings.simplefilter(action='ignore')

# Draw every matplotlib / seaborn figure with the dark theme.
plt.style.use('dark_background')

In [None]:
# Read the red-wine quality CSV from the Kaggle input directory and preview
# the first ten rows.
df = pd.read_csv(
    filepath_or_buffer='../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
)
df.head(n=10)

In [None]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

In [None]:
# Summary statistics, one row per column, with visual cues:
#   - in-cell bars on the mean,
#   - a red gradient on the standard deviation,
#   - a purple/orange gradient on the median.
(
    df.describe()
    .T
    .style
    .bar(subset=['mean'], color='#205fA2')
    .background_gradient(subset=['std'], cmap='Reds')
    .background_gradient(subset=['50%'], cmap='PuOr')
)

In [None]:
# (rows, columns) of the raw frame, before any outlier removal.
df.shape

In [None]:
# Number of missing values in each column.
df.isna().sum()

***Exploratory Data Analysis***

In [None]:
# Visual overview of missing values across all rows and columns.
msno.matrix(df,figsize=(12,8))

In [None]:
# Number of wines per quality score.
plt.figure(figsize=(12, 8))
quality_ax = sb.countplot(x="quality", data=df, color='w')
quality_ax.set_title("Quality Counts", fontsize=15)
quality_ax.set_xlabel("Quality", fontsize=14)
quality_ax.set_ylabel("Counts", fontsize=14)
quality_ax.grid()

In [None]:
# Distribution of fixed acidity within each quality score.
px.violin(
    df,
    x="quality",
    y="fixed acidity",
    orientation="v",
    title="Fixed Acidity against Quality",
    template="plotly_dark",
    width=1000,
    height=600,
)

In [None]:
# Individual citric-acid readings grouped by quality score.
px.strip(
    df,
    x="quality",
    y="citric acid",
    orientation="v",
    title="Citric Acid against Quality",
    template="plotly_dark",
    width=1000,
    height=600,
)

In [None]:
# Residual sugar of every wine plotted against its quality score.
px.scatter(
    df,
    x="quality",
    y="residual sugar",
    title="Residual Sugar against Quality",
    template="plotly_dark",
    width=1000,
    height=600,
)

In [None]:
# Chloride content of every wine plotted against its quality score.
px.scatter(
    df,
    x="quality",
    y="chlorides",
    title="Chlorides against Quality",
    template="plotly_dark",
    width=1000,
    height=600,
)

In [None]:
# Distribution of density within each quality score.
px.violin(
    df,
    x="quality",
    y="density",
    title="Density against Quality",
    template="plotly_dark",
    width=1000,
    height=600,
)

In [None]:
# Mean pH per quality score.
# Passing the raw frame to px.bar draws one stacked segment per row, so each
# bar's height is the *sum* of pH values and mostly reflects how many wines
# have that score. Aggregating first makes the bars comparable.
ph_by_quality = df.groupby("quality", as_index=False)["pH"].mean()
px.bar(
    ph_by_quality,
    x="quality",
    y="pH",
    labels={"pH": "Mean pH"},
    title="pH against Quality",
    template="plotly_dark",
    width=1000,
    height=600,
)

In [None]:
# Alcohol content per quality score.
# The axes now match the title: quality on x (one box per score), alcohol on y.
# The previous call plotted pH against alcohol, which the title did not describe.
px.box(
    df,
    x="quality",
    y="alcohol",
    title="Alcohol against Quality",
    template="plotly_dark",
    width=1000,
    height=600,
)

In [None]:
plt.figure(figsize=(12,8))
plt.hist(df['quality'], density=True, color="white",orientation="horizontal")
plt.xlabel("Density",fontsize=15)
plt.ylabel("Quality",fontsize=15)
plt.title("Wine Quality", fontsize=15)
plt.grid()

In [None]:
# Histogram of pH values; displot is figure-level, so the labels are applied
# to the single Axes it creates.
ph_grid = sb.displot(df['pH'], height=8)
ph_ax = ph_grid.ax
ph_ax.set_title("pH Count", fontsize=15)
ph_ax.set_xlabel("pH", fontsize=14)
ph_ax.set_ylabel("Counts", fontsize=14)
ph_ax.grid()

In [None]:
# Pairwise scatter plots for every pair of columns, with per-column
# distributions on the diagonal.
sb.pairplot(df)

***Preprocessing***

In [None]:
# Annotated correlation matrix of all columns.
plt.figure(figsize=(14, 9))
corr_matrix = df.corr()
sb.heatmap(
    corr_matrix,
    annot=True,
    linewidths=1.5,
    cmap='cubehelix_r',
)

Here we see that the features with the most negative correlations are volatile acidity and pH.

In [None]:
# Side-by-side box plots of the two features highlighted above, per quality score.
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(14, 8))

# Left panel: volatile acidity. Right panel: pH.
sb.boxplot(x='quality', y='volatile acidity', data=df, ax=ax1)
sb.boxplot(x='quality', y='pH', data=df, ax=ax2)

title_style = dict(fontsize=10, fontweight="bold")
ax1.set_title("Negative Correlation between Volatile Acidity and Quality", **title_style)
ax2.set_title("Negative Correlation between pH and Quality", **title_style)

In [None]:
# Distribution of volatile acidity and pH, each with a KDE curve and a fitted
# normal density for comparison.
#
# sb.distplot is deprecated, and its fitted-normal line defaults to a near-black
# colour that disappears on the dark theme. histplot plus an explicit overlay
# draws the same information with a visible fit line.
f, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 8))

for ax, column, title in (
    (ax1, 'volatile acidity', "Volatile Acidity Distribution"),
    (ax2, 'pH', "pH Distribution"),
):
    values = df[column]
    # stat='density' puts the histogram on the same scale as the fitted pdf.
    sb.histplot(values, stat='density', kde=True, ax=ax)

    # Maximum-likelihood normal fit, drawn across the current x-range.
    mu, sigma = norm.fit(values)
    x_min, x_max = ax.get_xlim()
    grid = np.linspace(x_min, x_max, 200)
    ax.plot(grid, norm.pdf(grid, mu, sigma), color='orange', label='Normal fit')
    ax.set_xlim(x_min, x_max)

    ax.legend()
    ax.set_title(title, fontsize=14)

**Outlier Handling**

In [None]:
def drop_iqr_outliers(frame, column, label, whisker=1.5):
    """Report and remove rows whose ``column`` value is an IQR outlier.

    A value is an outlier when it lies below ``Q25 - whisker * IQR`` or above
    ``Q75 + whisker * IQR``. The quantiles, IQR, outlier count, outlier values
    and resulting shape are printed.

    Parameters
    ----------
    frame : pandas.DataFrame
        Data to filter; it is not modified.
    column : str
        Name of the numeric column to test.
    label : str
        Human-readable column name used in the printed report.
    whisker : float, default 1.5
        Multiplier applied to the IQR to obtain the cut-off distance.

    Returns
    -------
    pandas.DataFrame
        ``frame`` without the outlier rows.
    """
    values = frame[column]
    q25, q75 = np.percentile(values, [25, 75])
    print("Quantile 25 : {}\n".format(q25))
    print("Quantile 75 : {}\n".format(q75))

    iqr = q75 - q25
    print("IQR of {} :{}\n".format(label, iqr))

    cut_off = iqr * whisker
    lower, upper = q25 - cut_off, q75 + cut_off

    is_outlier = (values < lower) | (values > upper)
    outliers = values[is_outlier].tolist()
    print("Numbers of Outliers : {}\n".format(len(outliers)))
    print("Outliers : {}\n".format(outliers))

    trimmed = frame.loc[~is_outlier]
    print(trimmed.shape, "\n")
    print("<.>" * 25)
    return trimmed


# The filters run in sequence: the pH bounds are computed on the rows that
# survive the volatile-acidity filter.
# Note: `df` is reassigned from itself, so re-running this cell filters the
# already-filtered frame again.
df = drop_iqr_outliers(df, 'volatile acidity', "Volatile Acidity")
df = drop_iqr_outliers(df, 'pH', "pH")


In [None]:
# Scaler that standardizes each column it is fitted on.
scale = StandardScaler()

In [None]:
# Fit the scaler on every column of df and store the standardized values.
# NOTE(review): this includes the `quality` target column.
# NOTE(review): df_scaled is not referenced by any later cell; x and y below
# are taken from df itself.
df_scaled = scale.fit_transform(df)

In [None]:
# Feature matrix and target.
# The target is removed by name rather than by position, so the target cannot
# end up among the features if the column order of the CSV ever changes.
x = df.drop(columns='quality')
y = df['quality']

*Splitting*

In [None]:
# Hold out 15% of the rows for evaluation; the fixed seed keeps the split
# reproducible.
xtrain, xtest, ytrain, ytest = train_test_split(x, y, train_size=.85, random_state=17)

# Standardize the features before modelling (SVM and logistic regression are
# sensitive to feature scale). `scale` is the StandardScaler created earlier;
# it is fitted on the training rows only and the same transform is applied to
# the test rows, so no test-set statistics leak into training.
xtrain = scale.fit_transform(xtrain)
xtest = scale.transform(xtest)

***SVM***

In [None]:
# Support-vector classifier with scikit-learn's default settings.
svc = SVC()
svc.fit(xtrain, ytrain)
svm_ypred = svc.predict(xtest)

svm_as = accuracy_score(ytest, svm_ypred)
print("Accuracy Score of SVM : ",svm_as)
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision and recall in the report.
print("\n",classification_report(ytest, svm_ypred))

***GNB***

In [None]:
# Gaussian Naive Bayes with scikit-learn's default settings.
gnb = GaussianNB()
gnb.fit(xtrain, ytrain)
gnb_ypred = gnb.predict(xtest)

gnb_as = accuracy_score(ytest, gnb_ypred)
print("Accuracy Score of Gaussian Naive Bayes : ",gnb_as)
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision and recall in the report.
print("\n",classification_report(ytest, gnb_ypred))

***Random Forest***

In [None]:
# Random forest classifier. The seed is fixed (same value as the train/test
# split) so the reported accuracy is reproducible across runs.
rfc = RandomForestClassifier(random_state=17)
rfc.fit(xtrain, ytrain)
rfc_ypred = rfc.predict(xtest)

rfc_as = accuracy_score(ytest, rfc_ypred)
print("Accuracy Score of Random Forest Classifier : ",rfc_as)
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision and recall in the report.
print("\n",classification_report(ytest, rfc_ypred))

***Logistic Regression***

In [None]:
# Multinomial logistic regression. The default of 100 solver iterations is
# often too few to converge, and the resulting ConvergenceWarning would be
# hidden by the global warnings filter, so the limit is raised explicitly.
lg = LogisticRegression(max_iter=1000)
lg.fit(xtrain, ytrain)
lg_ypred = lg.predict(xtest)

lr_as = accuracy_score(ytest, lg_ypred)
print("Accuracy Score of Logistic Regression : ",lr_as)
# classification_report expects (y_true, y_pred); passing them the other way
# round swaps precision and recall in the report.
print("\n",classification_report(ytest, lg_ypred))

***Result***

In [None]:
# Horizontal bar chart comparing the test accuracy of the four classifiers.
plt.figure(figsize=(9,6))
# Axis labels, in the same order as the scores below
# (fixes the misspelt 'Loistic Regression' label).
models = ['SVM','GNB','Random Forest','Logistic Regression']
conc = [svm_as, gnb_as, rfc_as, lr_as]
sb.barplot(x=conc, y=models, palette='cubehelix')
plt.title("Models Result", fontsize=15)
plt.xlabel("Accuracy", fontsize=14)
plt.ylabel("Models", fontsize=14)

***If you like my kernel, please upvote!***