# Red Wine Quality Prediction with Machine Learning

![](https://img.evbuc.com/https%3A%2F%2Fcdn.evbuc.com%2Fimages%2F39574914%2F97849063167%2F1%2Foriginal.jpg?s=157d92df50986f10581ddf3b300075d8)

## Attribute Information:

Input variables (based on physicochemical tests):

1. fixed acidity
2. volatile acidity
3. citric acid
4. residual sugar
5. chlorides
6. free sulfur dioxide
7. total sulfur dioxide
8. density
9. pH
10. sulphates
11. alcohol

Output variable (based on sensory data):

12. quality (score between 0 and 10)

In [None]:
import pandas as pd

import seaborn as sb
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import accuracy_score

In [None]:
# Relative path to the red-wine quality CSV.
csv_path = '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
# Load the whole file into the DataFrame that the cells below work on.
data = pd.read_csv(csv_path)

In [None]:
# Print each column's name, non-null count and dtype (a quick check for
# missing values and unexpected types).
data.info()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
data.describe()

In [None]:
# Pairwise correlation between columns (pandas defaults to Pearson).
data.corr()

### Visualizing trends between the features and wine quality

In [None]:
# Distribution of alcohol content for each quality score; every violin
# carries a small box plot inside it.
sb.set_theme(style='whitegrid')
violin_ax = sb.violinplot(data=data, x='quality', y='alcohol', inner='box')
violin_ax.set_title('alcohol vs quality violin plot')
plt.show()

In [None]:
# Bivariate kernel density estimate of quality against total sulfur dioxide.
sb.set_theme(style='whitegrid')
kde_ax = sb.kdeplot(
    data=data,
    x='quality',
    y='total sulfur dioxide',
    cmap='mako',
)
kde_ax.set_title('quality vs total sulfur dioxide')
plt.show()

In [None]:
# Conditional density of volatile acidity: at each acidity value the filled
# bands show the share belonging to each quality score. The density is
# clipped at 0 on the left.
sb.displot(
    data=data,
    x='volatile acidity',
    hue='quality',
    kind='kde',
    multiple="fill",
    clip=(0, None),
    height=6,
    palette="ch:rot=-.25,hue=1,light=.75",
)
plt.title('Conditional kernel density estimate of volatile acidity')
plt.show()

In [None]:
# One box per column on a single wide canvas, to compare value ranges and
# spot outliers.
box_fig, box_ax = plt.subplots(figsize=(40, 20))
sb.boxplot(data=data, ax=box_ax)
plt.show()

In [None]:
# Pairwise relationships between all columns, coloured by quality score.
sb.pairplot(data, hue='quality')
# pairplot draws a grid of subplots, so plt.title would label only a single
# subplot (the current axes). suptitle titles the figure as a whole; y > 1
# lifts the text above the top row so it does not overlap the plots.
plt.suptitle('Pair wise plot', y=1.02)
plt.show()

## identifying features and labels

In [None]:
# Columns used as model inputs (everything except the 'quality' target).
features = [
    'volatile acidity',
    'citric acid',
    'sulphates',
    'alcohol',
    'fixed acidity',
    'chlorides',
    'total sulfur dioxide',
    'density',
    'residual sugar',
    'free sulfur dioxide',
    'pH',
]
# Feature matrix, with columns in the order listed above.
x = data[features]

In [None]:
# Summary statistics for the selected feature columns only.
x.describe()

In [None]:
# Binary target: 1 where the quality score is 7 or higher, 0 otherwise.
is_high_quality = data['quality'] >= 7
y = is_high_quality.astype('int64')

In [None]:
# Number of samples in each target class (0 and 1), i.e. the class balance.
y.value_counts()

In [None]:
# Hold out 25% of the rows for evaluation, with a fixed seed for repeatability.
# stratify=y keeps the 0/1 class proportions the same in the train and test
# sets. Without it, a thresholded target like this one (presumably imbalanced,
# check y.value_counts()) can end up with a different share of positives in
# each split, which distorts the reported scores.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=3,
    stratify=y,
)

# logistic regression

In [None]:
# Fit logistic regression on the training split (max_iter raised to 1000)
# and score its predictions on the held-out split.
logi = LogisticRegression(max_iter=1000).fit(x_train, y_train)
logi_pred = logi.predict(x_test)

logi_balanced_acc = balanced_accuracy_score(y_test, logi_pred)
logi_acc = accuracy_score(y_test, logi_pred)
print('balanced accuracy score for LOGISTIC REGRESSION : ', logi_balanced_acc)
print('accuracy score for LOGISTIC REGRESSION : ', logi_acc)

In [None]:
# Count of predicted classes. distplot is deprecated in seaborn, and its KDE
# overlay smooths between 0 and 1 even though predictions take only those two
# values. histplot with discrete=True draws one bar centred on each class.
sb.histplot(x=logi_pred, discrete=True)
plt.title('logistic predictions')
plt.show()

In [None]:
# True labels and predictions are both 0/1, so a scatter/regression plot
# collapses every sample onto at most four points and hides how many fall on
# each. Tabulate actual vs predicted classes and show the counts instead.
logi_confusion = pd.crosstab(
    y_test.to_numpy(),
    logi_pred,
    rownames=['actual'],
    colnames=['predicted'],
)
sb.heatmap(logi_confusion, annot=True, fmt='d', cbar=False)
plt.title('logistic regression: actual vs predicted')
plt.show()

# random forest classifier

In [None]:
# Fit a random forest on the training split and score it on the held-out
# split. random_state pins the bootstrap sampling and feature selection so the
# printed scores are the same on every run (3 matches the seed used for the
# train/test split).
ran = RandomForestClassifier(random_state=3)
ran.fit(x_train,y_train)
ran_pred = ran.predict(x_test)
print('balanced accuracy score for RANDOM FOREST CLASSIFIER : ',balanced_accuracy_score(y_test,ran_pred))
print('accuracy score for RANDOM FOREST CLASSIFIER : ',accuracy_score(y_test,ran_pred))

In [None]:
# Count of predicted classes. distplot is deprecated in seaborn, and its KDE
# overlay smooths between 0 and 1 even though predictions take only those two
# values. histplot with discrete=True draws one bar centred on each class.
sb.histplot(x=ran_pred, discrete=True)
plt.title('random forest predictions')
plt.show()

In [None]:
# True labels and predictions are both 0/1, so a scatter/regression plot
# collapses every sample onto at most four points and hides how many fall on
# each. Tabulate actual vs predicted classes and show the counts instead.
ran_confusion = pd.crosstab(
    y_test.to_numpy(),
    ran_pred,
    rownames=['actual'],
    colnames=['predicted'],
)
sb.heatmap(ran_confusion, annot=True, fmt='d', cbar=False)
plt.title('random forest: actual vs predicted')
plt.show()

### The random forest classifier gives the best accuracy of the two models, at about 93%.