**Basic Imports**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

**Get The Data**

In [None]:
# Load the red-wine quality table from the relative input directory.
df = pd.read_csv(filepath_or_buffer="../input/winequality-red.csv")
# Preview the first ten rows.
df.head(n=10)

**Classify The Quality**

In [None]:
# Map each quality score to a coarse label:
#   below 5 -> "Bad", above 6 -> "Good", anything else -> "Medium".
quality = df["quality"].values
category = [
    "Bad" if score < 5 else ("Good" if score > 6 else "Medium")
    for score in quality
]

**Classifying alcohol for Faceting**

In [None]:
# Split wines into two alcohol brackets at 10.5 (used later as the facet column):
#   below 10.5 -> "Considerable", otherwise -> "Excessive".
alcohol = df["alcohol"].values
bracket = ["Considerable" if level < 10.5 else "Excessive" for level in alcohol]

In [None]:
# Assemble the working frame: the original columns plus the two derived label
# columns, minus the numeric columns those labels were derived from.
bracket = pd.DataFrame(data=bracket, columns=["bracket"])
category = pd.DataFrame(data=category, columns=["category"])
data = pd.concat([df, bracket, category], axis="columns").drop(
    columns=["alcohol", "quality"]
)

In [None]:
# Show the first ten rows of the assembled frame.
data.head(n=10)

## Exploratory Data Analysis


In [None]:
# Class balance of the derived quality labels.
# The column is passed by keyword (x=..., data=...): newer seaborn releases
# treat the first positional argument as `data`, so passing the Series
# positionally triggers a warning or a wrong/failed plot depending on version.
plt.figure(figsize=(10, 6))
sns.countplot(x="category", data=data, palette="pastel")
# Last expression of the cell: the exact counts behind the bars.
data["category"].value_counts()

In [None]:
# Pairwise correlations of the original columns, drawn as an annotated heatmap.
correlations = df.corr()
plt.figure(figsize=(12, 6))
sns.heatmap(correlations, annot=True, cmap="YlOrRd")

**According to the heatmap, we can focus on the alcohol–quality, density–alcohol and volatile acidity–quality relationships for a more meaningful exploration**

In [None]:
# Alcohol level aggregated per quality score (seaborn's default bar estimator).
plt.figure(figsize=(12, 6))
sns.barplot(x="quality", y="alcohol", data=df, palette="colorblind")

In [None]:
# Density against alcohol with marginal distributions.
# jointplot is a figure-level function that creates its own (square) figure,
# so a preceding plt.figure(figsize=...) would only leave an empty extra
# figure in the output. The size is controlled through `height` instead.
sns.jointplot(x="alcohol", y="density", data=df, kind="scatter", height=6)

**Facet plot**

In [None]:
# One scatter panel per alcohol bracket, coloured by quality category.
facet = sns.FacetGrid(data, col="bracket", hue="category", height=4, palette="deep")
facet.map(plt.scatter, "fixed acidity", "volatile acidity")
# Use the grid's own legend: plt.legend() only attaches to the last panel and
# lists just the categories drawn there, whereas add_legend() builds a single
# legend for the whole grid covering every hue level.
facet.add_legend()

**Setting features and labels, and encoding the categorical data**

**(bad = 0, good = 1, medium = 2)**

In [None]:
# The bracket column was only needed for the facet plot; remove it before modelling.
# Rebinding (instead of inplace=True) together with errors="ignore" keeps this
# cell safe to re-run: a second execution no longer raises KeyError.
data = data.drop(columns=["bracket"], errors="ignore")
data.head(10)

In [None]:
# Separate the feature matrix from the label vector.
# Columns are selected by name rather than by position, so the split does not
# silently change if the column order of `data` ever does.
X = data.drop(columns=["category"]).values
y = data["category"].values

**We use LabelEncoder to convert the text-based categorical labels into numerical values that the models can work with**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the string labels, then replace y with its integer codes.
labelencoder_y = LabelEncoder().fit(y)
y = labelencoder_y.transform(y)

## Training and Testing Data
**Now that we've explored the data a bit, let's go ahead and split the data into training and testing sets.**

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state is fixed at 0.
split_80_20 = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_80_20

**Scaling the data to optimise predictions**

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling parameters from the training rows only, then apply the
# same transformation to both the training and the test rows.
sc_X = StandardScaler()
sc_X.fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

## Training the Model and Predicting the Test Data 

## Support Vector Machine

In [None]:
from sklearn.svm import SVC

# Support vector classifier with default settings, trained on the 80% split.
svc = SVC().fit(X_train, y_train)
pred_svc = svc.predict(X_test)

In [None]:
from sklearn.metrics import classification_report, accuracy_score

# Per-class precision / recall / F1 for the SVC predictions on the 20% test set.
svc_report = classification_report(y_test, pred_svc)
print(svc_report)

## Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 250 trees trained on the 80% split.
# random_state is fixed (matching the seed used for the train/test split) so
# the reported metrics are reproducible across runs; without it every
# execution grows a different forest and can produce different scores.
rfc = RandomForestClassifier(n_estimators=250, random_state=0)
rfc.fit(X_train, y_train)
pred_rfc = rfc.predict(X_test)
print(classification_report(y_test, pred_rfc))

## K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# K-nearest-neighbours classifier with default settings on the 80% split.
knn = KNeighborsClassifier().fit(X_train, y_train)
pred_knn = knn.predict(X_test)
knn_report = classification_report(y_test, pred_knn)
print(knn_report)

**Testing with a 70-30 split for training and testing dataset**

In [None]:
from sklearn.model_selection import train_test_split

# Second experiment: hold out 30% of the rows, same fixed random_state.
X_train30, X_test30, y_train30, y_test30 = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

In [None]:
# Fresh scaler for the 70/30 split: fitted on its training rows only.
# Note that this rebinds sc_X, replacing the scaler fitted for the 80/20 split.
sc_X = StandardScaler().fit(X_train30)
X_train30 = sc_X.transform(X_train30)
X_test30 = sc_X.transform(X_test30)

In [None]:
# Support vector classifier on the 70/30 split (rebinds `svc`).
svc = SVC().fit(X_train30, y_train30)
pred_svc30 = svc.predict(X_test30)

svc_report30 = classification_report(y_test30, pred_svc30)
print(svc_report30)

In [None]:
# Random forest of 250 trees on the 70/30 split (rebinds `rfc`).
# random_state is fixed so the scores compared in the conclusion are
# reproducible; an unseeded forest can give different results on every run.
rfc = RandomForestClassifier(n_estimators=250, random_state=0)
rfc.fit(X_train30, y_train30)
pred_rfc30 = rfc.predict(X_test30)
print(classification_report(y_test30, pred_rfc30))

In [None]:
# K-nearest-neighbours classifier on the 70/30 split (rebinds `knn`).
knn = KNeighborsClassifier().fit(X_train30, y_train30)
pred_knn30 = knn.predict(X_test30)
knn_report30 = classification_report(y_test30, pred_knn30)
print(knn_report30)

## Conclusion

In [None]:
# Side-by-side test accuracy of the three models for both split ratios.
model_names = ["SVC", "Random Forest", "KNN"]
preds_20 = (pred_svc, pred_rfc, pred_knn)
preds_30 = (pred_svc30, pred_rfc30, pred_knn30)
results = pd.DataFrame({
    'models': model_names,
    'accuracies20': [accuracy_score(y_test, pred) for pred in preds_20],
    'accuracies30': [accuracy_score(y_test30, pred) for pred in preds_30],
})
results

**As a result, we can see that the Random Forest model has the best accuracy for predicting our wine quality when dividing the dataset into a 70-30 training/testing split**

