# Wine Quality

## This notebook builds a predictive model that classifies red wine as good (quality score of at least 5.5) or not good.

In [None]:
import pandas as pd

# Read the red-wine quality CSV from the notebook's input directory.
# pandas is imported here because this cell runs before the main imports cell.
df_data_1 = pd.read_csv(
    "../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv"
)


### Loading the packages

In [None]:
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns; sns.set_theme(style = "dark")

from sklearn import tree
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFECV
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

### Data Visualization

In [None]:
# Alias the loaded frame under a shorter, domain-specific name (no copy is made,
# so `wine` and `df_data_1` refer to the same DataFrame).
wine = df_data_1

In [None]:
# Kernel-density estimate of the `alcohol` column, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(18, 10))
sns.kdeplot(data=wine, x="alcohol", color="red", ax=ax)
ax.set_xlabel("Alcohol content")
ax.set_ylabel("Density")
ax.set_title("A distribution of Alcoholic content", fontsize=15)
plt.show();

In [None]:
# Kernel-density estimate of the `quality` column.
# The axis label and title previously described alcohol content (copied from
# the cell above); they now describe what is actually plotted.
plt.figure(figsize = (18,10))
sns.kdeplot(x = "quality", data = wine, color = "purple")
plt.xlabel("Quality score")
plt.ylabel("Density")
plt.title("A distribution of wine quality", fontsize = 15)
plt.show();

### Building Predictive Model

In [None]:
# Every column except the last is a feature; the last column is the target.
x_wine, y_wine = wine.iloc[:, :-1], wine.iloc[:, -1]

# Binarise the target: 1 when the score is at least 5.5, otherwise 0.
y_wine_bi = y_wine.ge(5.5).astype(int)

In [None]:
# Hold out 20% of the rows for final testing; the seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_wine,
    y_wine_bi,
    train_size=0.8,
    random_state=42,
)

In [None]:
# Pairwise Pearson correlation between all columns of the full dataset
# (Pearson is the pandas default, spelled out here for the reader).
cor = wine.corr(method="pearson")

In [None]:
# Annotated heatmap of the correlation matrix computed above.
fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(cor, annot=True, cmap="viridis", ax=ax)
plt.show();

#### Decision Tree

In [None]:
# Hand-picked subset of columns used by the decision tree.
tree_features = ["volatile acidity", "citric acid", "chlorides", "pH", "sulphates", "alcohol"]

# max_features is tied to the column list instead of a separate magic number,
# and random_state makes the fitted tree (and its score) reproducible on re-run.
clf = DecisionTreeClassifier(
    max_depth = 4,
    max_features = len(tree_features),
    random_state = 42,
)
x_train_new = x_train[tree_features]
clf.fit(x_train_new, y_train);

In [None]:
# Select the same six columns the tree was trained on, then report
# mean accuracy on the held-out split.
x_test_new = x_test.loc[
    :,
    ["volatile acidity", "citric acid", "chlorides", "pH", "sulphates", "alcohol"],
]
clf.score(X=x_test_new, y=y_test)

In [None]:
! conda install graphviz

In [None]:
# Imported here (not in the top imports cell) because the package is installed
# by the cell directly above.
import graphviz

# Export the fitted tree as DOT source and let graphviz render it.
# No matplotlib figure is created: graphviz does not draw through matplotlib,
# so the former plt.figure(...) call only produced an empty figure.
data = tree.export_graphviz(
    clf,
    out_file = None,
    feature_names = list(x_train_new.columns),
    class_names = ["not good", "good"],
    rounded = True,
    filled = True,
)
graph = graphviz.Source(data)
graph

In [None]:
# Development-time help lookup (`?tree.export_graphviz`) removed from the final
# notebook; see the scikit-learn documentation for `sklearn.tree.export_graphviz`.

#### Random Forest Classifier

In [None]:
# Two-step pipeline:
#   1. "rfe"   - recursive feature elimination with 5-fold CV, ranking features
#                by the coefficients of a Ridge regressor fitted on the 0/1 target.
#   2. "trees" - random forest trained on the surviving features.
# random_state makes the forest (and every metric derived from it) reproducible.
pipeline = Pipeline(
    [
        ("rfe", RFECV(Ridge(alpha = 1.0, solver = "cholesky"), cv = 5)),
        (
            "trees",
            RandomForestClassifier(
                max_features = 5,
                max_depth = 10,
                n_estimators = 19,
                random_state = 42,
            ),
        ),
    ]
)

In [None]:
# Alias only: `final_model` and `pipeline` refer to the same Pipeline object.
final_model = pipeline

In [None]:
# Fit both pipeline steps (feature selection, then the forest) on the training split.
final_model.fit(x_train, y_train)

In [None]:
# The forest only sees the columns kept by the RFECV step, so positions in
# feature_importances_ do not line up with the original column order.
# Label each importance with the surviving column name and sort for readability.
selected_features = x_train.columns[final_model["rfe"].support_]
pd.Series(
    final_model["trees"].feature_importances_,
    index = selected_features,
    name = "importance",
).sort_values(ascending = False)

In [None]:
# Take the feature names from the training frame itself so they cannot drift
# out of sync with the data the selector was fitted on.
feature = x_train.columns.tolist()

# RFECV ranking per input column (rank 1 = selected).
feature_rank = pd.DataFrame({"feature": feature, "rank": final_model["rfe"].ranking_})
feature_rank

#### Metrics

In [None]:
# Out-of-fold class predictions for every training row (5-fold CV).
y_train_pred = cross_val_predict(
    estimator=final_model,
    X=x_train,
    y=y_train,
    cv=5,
)

In [None]:
# Side-by-side view of the features, the binarised label and the
# cross-validated prediction. `assign` returns a new frame, so x_train is untouched.
new_train = x_train.assign(
    **{
        "quality": y_train,
        "predicted quality": y_train_pred,
    }
)
new_train.head()

In [None]:
# Precision of the cross-validated training predictions.
precision_score(y_true=y_train, y_pred=y_train_pred)

In [None]:
# Recall of the cross-validated training predictions.
recall_score(y_true=y_train, y_pred=y_train_pred)

In [None]:
# Accuracy of the cross-validated training predictions.
# accuracy_score is imported with the other metrics in the imports cell.
accuracy_score(y_train, y_train_pred)

In [None]:
# F1 score of the cross-validated training predictions.
f1_score(y_true=y_train, y_pred=y_train_pred)

In [None]:
# Confusion matrix (rows: true class, columns: predicted class) for the
# cross-validated training predictions.
conf = confusion_matrix(y_true=y_train, y_pred=y_train_pred)

In [None]:
# fmt="d" prints the integer counts as-is; seaborn's default ".2g" format
# would show counts of 100 or more in scientific notation (e.g. 5.2e+02).
plt.figure(figsize = (10,10))
sns.heatmap(conf, annot = True, fmt = "d", cmap = "RdBu")
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion matrix (cross-validated training predictions)")
plt.show();

In [None]:
# Per-fold score of the full pipeline under 5-fold cross-validation.
cross_val_score(
    estimator=final_model,
    X=x_train,
    y=y_train,
    cv=5,
)

In [None]:
precisions, recall, threshold = precision_recall_curve(y_train, y_train_pred)

In [None]:
sns.set_theme(style="dark")
##plt.figure(figsize = (20,8))
plt.plot(threshold, precisions[:-1],'b--',label = "precision")
plt.plot(threshold, recall[:-1], 'g-', label = "recall")
plt.title("Precision vs Recall")
plt.xlabel("Threshold")
plt.ylabel("Scores")
plt.legend()
plt.show();

In [None]:
##plt.figure(figsize = (20,8))
plt.plot(recall[:-1], precisions[:-1], 'b--')
plt.title("Precision vs Recall")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.show();

In [None]:
fpr, tpr, threshold = roc_curve(y_train, y_train_pred)

In [None]:
##plt.figure(figsize = (20,8))
plt.plot(fpr, tpr, "b-", linewidth = 2)
plt.plot([0,1], [0,1], "k-")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.show();

In [None]:
roc_auc_score(y_train, y_train_pred)

### Testing dataset

In [None]:
# Class predictions of the fitted pipeline on the held-out test split.
y_pred = final_model.predict(X=x_test)

In [None]:
# Precision on the held-out test split.
precision_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Recall on the held-out test split.
recall_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Accuracy on the held-out test split.
accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# F1 score on the held-out test split.
f1_score(y_true=y_test, y_pred=y_pred)

In [None]:
# AUC must be computed from continuous scores; hard 0/1 predictions reduce it
# to a single operating point. Use the predicted probability of the positive class.
roc_auc_score(y_test, final_model.predict_proba(x_test)[:, 1])

In [None]:
# Build the ROC curve from positive-class probabilities so it sweeps over
# thresholds, rather than from hard 0/1 predictions (which yield one point).
fpr2, tpr2, threshold2 = roc_curve(y_test, final_model.predict_proba(x_test)[:, 1])

In [None]:
# ROC curve for the test split, with the chance diagonal for reference.
fig, ax = plt.subplots()
ax.plot(fpr2, tpr2, "b-", linewidth=2)
ax.plot([0, 1], [0, 1], "k-")
ax.set_xlabel("FPR")
ax.set_ylabel("TPR")
plt.show();

In [None]:
# Confusion matrix (rows: true class, columns: predicted class) on the test split.
conf2 = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [None]:
# fmt="d" prints the integer counts as-is; seaborn's default ".2g" format
# would show counts of 100 or more in scientific notation (e.g. 1.3e+02).
plt.figure(figsize = (10,10))
sns.heatmap(conf2, annot = True, fmt = "d", cmap = "RdBu")
plt.xlabel("Predicted label")
plt.ylabel("True label")
plt.title("Confusion matrix (test set)")
plt.show();