In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

In [None]:
# Read the red-wine quality CSV into a DataFrame and preview the first rows.
df = pd.read_csv(
    '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
)
df.head()

In [None]:
# Print the frame's structural summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# Show descriptive statistics (count, mean, std, quartiles, min/max) per column.
df.describe()

For `density`, the mean, 75th percentile, and max are all approximately 1 and the standard deviation is tiny, so the column is nearly constant. Let's drop the `density` column.

In [None]:
# Remove the 'density' column by rebinding `df` instead of mutating in place.
# errors='ignore' keeps this cell safe to re-run: once the column is gone, a
# second execution is a no-op rather than a KeyError.
df = df.drop(columns=['density'], errors='ignore')
# Count missing values in each remaining column.
df.isnull().sum()

# Univariate analysis

In [None]:
# Draw a 10-bin histogram for every column on one large figure.
df.hist(figsize=(15, 12), bins=10)
plt.show()

# Multivariate analysis

Let's plot a heat map to find the correlated data columns

In [None]:
# Pairwise correlation matrix of all columns, rendered as an annotated heatmap.
corr = df.corr()
plt.figure(figsize=(15, 10))
sns.heatmap(data=corr, annot=True, cmap='viridis')

In [None]:
# Scan the lower triangle of the correlation matrix, skipping the last
# column of `df`, and record the row's column name for every pair whose
# absolute correlation is at least 0.7.
n_scanned = len(df.columns) - 1
corr_cols = [
    corr.columns[row]
    for row in range(n_scanned)
    for col in range(row)
    if abs(corr.iloc[row, col]) >= .7
]
print(corr_cols)

No pair of feature columns is strongly correlated (absolute correlation of at least 0.7), so we keep all of the columns.

Let's categorise the wine quality into three classes: Bad, Average and Good. From the histogram plotted above, it is clear that quality is roughly normally distributed, with the majority of values falling between 5 and 6. So let's treat values of 5 and 6 as average quality.

In [None]:
def quality(x):
    """Bucket a numeric wine-quality score into one of three classes.

    Parameters
    ----------
    x : int or float
        Raw quality score.

    Returns
    -------
    int
        0 (bad) when ``x < 5``, 1 (average) when ``5 <= x <= 6``,
        2 (good) otherwise.
    """
    if x < 5:
        return 0  # bad
    # Range check rather than ``x == 5 or x == 6`` so that a fractional score
    # between 5 and 6 is still "average" instead of falling through to "good".
    if x <= 6:
        return 1  # average
    return 2  # good
# Replace the raw scores in 'quality' with the class labels from quality().
# NOTE: this overwrites the column, so the cell is not idempotent. Running it
# a second time feeds the already-encoded 0/1/2 labels (all < 5) back through
# quality() and collapses every row to 0. Re-run from the data-loading cell.
df['quality']=df['quality'].apply(quality)

In [None]:
# Features are every column except the last; the last column is the target.
x, y = df.iloc[:, :-1], df.iloc[:, -1]
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
xTrain, xTest, yTrain, yTest = train_test_split(
    x, y, test_size=0.2, random_state=1
)

# Scale the Data

In [None]:
# Fit the min-max scaler on the training split only, then apply that same
# fitted transform to both splits.
sc = MinMaxScaler()
sc.fit(xTrain)
xTrain = sc.transform(xTrain)
xTest = sc.transform(xTest)

# KNN Classifier

In [None]:
# Pick the number of neighbours by 5-fold cross-validation on the TRAINING
# split only. Scoring each candidate k on the test split (as was done before)
# leaks the test data into model selection and makes the reported test
# accuracy optimistically biased.
acc = 0  # best mean cross-validated accuracy seen so far
k = 0    # value of n_neighbors that achieved `acc`
for i in range(1, 40):
    knn = KNeighborsClassifier(n_neighbors=i)
    score = cross_val_score(
        knn, xTrain, yTrain, cv=5, scoring='accuracy'
    ).mean()
    # Strict '>' keeps the smallest k among ties.
    if score > acc:
        acc = score
        k = i
print(acc, k)

In [None]:
# Refit KNN with the selected k and report its accuracy on the test split.
knn = KNeighborsClassifier(n_neighbors=k).fit(xTrain, yTrain)
pred = knn.predict(xTest)
print(accuracy_score(yTest, pred))

# Random Forest

In [None]:
# Random forest with default hyperparameters. The seed is fixed (matching the
# seed used for the train/test split) so the reported accuracy, and the model
# comparison that relies on it, is reproducible across runs.
classifier = RandomForestClassifier(random_state=1)
classifier.fit(xTrain, yTrain)
yPred = classifier.predict(xTest)
print(accuracy_score(yTest, yPred))

# XGBOOST

In [None]:
# XGBoost classifier with library-default settings, scored on the test split.
model = xgb.XGBClassifier()
yPred = model.fit(xTrain, yTrain).predict(xTest)
print(accuracy_score(yTest, yPred))

# Naive Bayes

In [None]:
# Gaussian Naive Bayes, fitted on the training split and scored on the test split.
mod_nb = GaussianNB().fit(xTrain, yTrain)
yPred = mod_nb.predict(xTest)
print(accuracy_score(y_true=yTest, y_pred=yPred))

# Support Vector Classifier

In [None]:
# Support vector classifier with an RBF kernel, scored on the test split.
mod = SVC(kernel='rbf')
yPred = mod.fit(xTrain, yTrain).predict(xTest)
print(accuracy_score(yTest, yPred))

Random Forest classifier had the highest accuracy. Upvote if you like the kernel.