# Predict Red Wine Quality Using KNN Classification.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from pandas_profiling import ProfileReport

## 1. Importing Dataset

In [None]:
# Load the red-wine quality CSV into the working DataFrame.
df = pd.read_csv(
    filepath_or_buffer="../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv"
)

In [None]:
# Preview the first five rows of the raw data.
df.head(n=5)

## 2. Data Quality Check

In [None]:
# Count the missing values in each column.
df.isna().sum()

__We can see that there are no null values in this dataset.__

## 3. Exploratory Data Analysis

In [None]:
# Generate the automated profiling report for the DataFrame.
ProfileReport(df=df)

__From the report above we can see that all 12 variables are numeric and that the dataset has no missing or null values.__

In [None]:
# Annotated heatmap of the pairwise column correlations.
plt.figure(figsize=(14, 9))
sns.heatmap(data=df.corr(), annot=True)

__The heatmap above shows the correlation between the variables in the dataset. It shows how strongly the variables depend on each other and the nature of that dependency (directly or inversely related).__
__Looking at the heatmap, we can see that the top 5 features that affect the quality of wine are alcohol, sulphates, citric acid, fixed acidity and volatile acidity.__

In [None]:
# Bar plot of alcohol content against quality score.
# x and y are passed by keyword: seaborn 0.11 deprecates the positional form
# and seaborn >= 0.12 rejects it with a TypeError.
sns.barplot(x="quality", y="alcohol", data=df)

In [None]:
# Bar plot of volatile acidity against quality score.
# x and y are passed by keyword: seaborn 0.11 deprecates the positional form
# and seaborn >= 0.12 rejects it with a TypeError.
sns.barplot(x="quality", y="volatile acidity", data=df)

In [None]:
# Number of samples for each quality score.
sns.countplot(data=df, x="quality")

__The values are not uniformly distributed: a large number of wine samples have a quality of 5 or 6. This may create a problem when building the model, so we will group the quality scores into classes.__
__We will do so by dividing the wine quality into "Good" (quality above 5) and "Bad" (quality of 5 or below) categories.__

In [None]:
# Label every wine as "Good" (quality above 5) or "Bad" (quality 5 or below).
quality = df["quality"].values
category = ["Good" if score > 5 else "Bad" for score in quality]

In [None]:
# Replace the numeric "quality" column with the "category" label column.
category = pd.DataFrame({"category": category})
data = pd.concat([df, category], axis="columns")
data = data.drop(columns="quality")

In [None]:
# Preview the first five rows of the relabelled data.
data.head(n=5)

In [None]:
# Number of samples in each category.
sns.countplot(data=data, x="category")

__Now we can see that the samples are much more evenly distributed across the two categories, which we require for accuracy in the model.__

### 3a. Univariate Analysis.

In [None]:
# Distribution of alcohol, split into 20 bins.
plt.hist(df["alcohol"], bins=20)
plt.show()

In [None]:
# Distribution of sulphates, split into 20 bins.
plt.hist(df["sulphates"], bins=20)
plt.show()

In [None]:
# Distribution of citric acid, split into 20 bins.
plt.hist(df["citric acid"], bins=20)
plt.show()

In [None]:
# Distribution of fixed acidity, split into 20 bins.
plt.hist(df["fixed acidity"], bins=20)
plt.show()

In [None]:
# Distribution of volatile acidity, split into 20 bins.
plt.hist(df["volatile acidity"], bins=20)
plt.show()

In [None]:
# Collect the names of all columns whose dtype is not object.
numeric_values = [name for name, dtype in df.dtypes.items() if dtype != 'O']
numeric_values

In [None]:
# Draw one box plot per numeric column, each on its own figure.
for feature in numeric_values:
    plt.figure(figsize=[8, 5])
    # Pass the column as `x` explicitly: seaborn 0.11 reads the first
    # positional argument as `x` (with a deprecation warning), while
    # seaborn >= 0.12 reads it as `data`, which changes how it is plotted.
    sns.boxplot(x=df[feature], palette="spring_r")

### 3b. Bivariate Analysis.

In [None]:
# Hexbin joint distribution of alcohol and density.
sns.jointplot(data=df, x="alcohol", y="density", kind="hex")

In [None]:
# Density against alcohol, colored by quality category.
plt.figure(figsize=[10, 8])
# x and y are passed by keyword: seaborn 0.11 deprecates the positional form
# and seaborn >= 0.12 rejects it with a TypeError.
sns.scatterplot(x="alcohol", y="density", hue="category", data=data)

In [None]:
# Sulphates against chlorides, colored by quality category.
plt.figure(figsize=[10, 8])
# x and y are passed by keyword: seaborn 0.11 deprecates the positional form
# and seaborn >= 0.12 rejects it with a TypeError.
sns.scatterplot(x="chlorides", y="sulphates", hue="category", data=data)

In [None]:
# Bar plot of citric acid for each quality category.
# x and y are passed by keyword: seaborn 0.11 deprecates the positional form
# and seaborn >= 0.12 rejects it with a TypeError.
sns.barplot(x="category", y="citric acid", data=data)

In [None]:
# Bar plot of volatile acidity for each quality category.
# x and y are passed by keyword: seaborn 0.11 deprecates the positional form
# and seaborn >= 0.12 rejects it with a TypeError.
sns.barplot(x="category", y="volatile acidity", data=data)

## 4. Modelling.

### 4a. Split the dataset using “train-test-split” function.

In [None]:
# X holds every column except the last (the features); y holds the last
# column (the target).
X, y = data.iloc[:, :-1].values, data.iloc[:, -1].values

In [None]:
from sklearn.preprocessing import LabelEncoder
# Instantiate a label encoder. None of the cells visible here call fit or
# transform on it — NOTE(review): confirm whether it is still needed.
label_quality = LabelEncoder()

In [None]:
# Hold out 20% of the rows as the test set; random_state=0 keeps the split
# reproducible.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Standardize the features. The scaler is fitted on the training split only
# and then applied to both splits.
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler().fit(X_train)
X_train = sc_X.transform(X_train)
X_test = sc_X.transform(X_test)

### 4b. Apply KNN classification on “quality” column of the dataset. Select the appropriate features.

In [None]:
# Fit a k-nearest-neighbours classifier with its default settings on the
# training split.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X=X_train, y=y_train)

### 4c. Predict on the test set.

In [None]:
# Predict the category of every sample in the test split.
pred_knn = knn.predict(X=X_test)

### 4d. Find out the accuracy.

In [None]:
# Print the per-class metrics report for the test predictions.
from sklearn.metrics import classification_report, accuracy_score
print(classification_report(y_true=y_test, y_pred=pred_knn))

In [None]:
# Report the overall test accuracy as a percentage.
print(
    "The accuracy of this model is ",
    accuracy_score(y_true=y_test, y_pred=pred_knn) * 100,
    " %",
)