In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

### Using Random Forest with the Red Wine dataset. Model evaluation with ROC curve.

In [None]:
# Load the red wine quality dataset into a DataFrame.
wine_csv_path = '/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
df = pd.read_csv(wine_csv_path)

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Print a summary of the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [None]:
# Pairwise scatter plots of every column, coloured by the quality score.
# The trailing semicolon suppresses the PairGrid repr in the cell output.
sns.pairplot(data=df, hue="quality");

In [None]:
# Annotated correlation heatmap of all columns, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(10, 10))
correlations = df.corr()
sns.heatmap(correlations, annot=True, linewidths=0.2, cmap='plasma', ax=ax);

In [None]:
# Class distribution of the target.
# Previously the counts were computed mid-cell (so never displayed) and the
# plot was wrapped in print(), which only printed the Axes repr.
quality_counts = df.quality.value_counts()

ax = quality_counts.plot(kind='bar')
ax.set(xlabel='quality', ylabel='number of wines', title='Wine quality distribution')

# Last expression of the cell: rendered as the cell output next to the chart.
quality_counts

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler 

In [None]:
# Features are every column except the target; the target is the raw quality score.
X = df.drop(columns=['quality'])
y = df['quality']

# 80/20 split, stratified on the quality score, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=38,
    stratify=y,
)

In [None]:
# Names of the numeric feature columns fed to the scaler.
# The target is dropped by name instead of relying on it being the last column,
# and no summary statistics are computed just to obtain column labels.
numeric_features = df.drop(columns='quality').select_dtypes(include='number').columns

In [None]:
# Define preprocessing for numeric columns (standardize them so they're on the same scale)
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Combine preprocessing steps
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features)])

# Create preprocessing and training pipeline.
# random_state pins the forest's bootstrap and feature sampling so that the
# metrics reported below are reproducible under Restart & Run All.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('randomforest', RandomForestClassifier(max_depth=15,
                                                                   n_estimators=400,
                                                                   random_state=38))])

# fit the pipeline to train a random forest classifier model on the training set
rfc = pipeline.fit(X_train, y_train)

# Hard class predictions (original quality scores) for the held-out set
predictions = rfc.predict(X_test)

In [None]:
# The raw confusion matrix used to be computed and silently discarded because
# only the last expression of a cell is displayed; print it explicitly.
print(confusion_matrix(y_test,predictions))

# Labelled version of the same table (rows: actual quality, columns: predicted).
pd.crosstab(y_test, predictions, rownames = ['Actual'], colnames =['Predicted'])

In [None]:
# Per-class precision / recall / F1 on the held-out set.
report = classification_report(y_test, predictions)
print(report)

## Comment: 
* The model is unable to predict qualities 3, 4 and 8 because the classes are highly imbalanced and there is too little training data for them.

ref: https://www.wine-searcher.com/wine-scores

Score	Explanation

95–100	Classic: a great wine

90–94	Outstanding: a wine of superior character and style

85–89	Very good: a wine with special qualities 9-10

80–84	Good: a solid, well-made wine 6-8

75–79	Mediocre: a drinkable wine that may have minor flaws 3-5

50–74	Not recommended  0-2

Using the guide above, here's the classification we will use for this dataset
* Bad - 0-2
* Mediocre - 3-5
* Good - 6
* Very Good - 7-8
* Outstanding - 9-10

### create 3 bins

In [None]:
# Independent (deep) copy of the raw data for the 3-bin experiment.
df2 = df.copy(deep=True)

In [None]:
# create 3 bins for mediocre, medium and good wine
# Intervals are right-closed: (2, 5] -> mediocre, (5, 6] -> medium, (6, inf) -> good.

bins = [2,5,6,np.inf]
labels = ["mediocre", "medium", "good"]

# Cut the untouched numeric scores in `df` (df2 is a copy of df, so the index
# matches) rather than df2's own column. This keeps the cell idempotent:
# re-running it no longer tries to cut an already categorical column.
df2['quality'] = pd.cut(df['quality'],bins=bins, labels=labels)

In [None]:
# Number of wines in each of the three quality bins.
df2['quality'].value_counts()

In [None]:
# Encode the three quality bins as integers.
from sklearn.preprocessing import LabelEncoder

# Fit first, then transform, so the encoder keeps the name -> code mapping
# in `label.classes_` for later use.
label = LabelEncoder()
label.fit(df2['quality'])
df2['quality'] = label.transform(df2['quality'])

In [None]:
# Class names learned by the encoder; the position of a name is its integer code.
label.classes_

In [None]:
# Feature matrix and integer-encoded 3-class target for the binned data.
X = df2.drop(columns=['quality'])
y = df2['quality']

In [None]:
from itertools import cycle
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_auc_score

# Binarize the output into a one-vs-rest indicator matrix (one column per class).
# It is stored under a new name so `y` stays 1-D and re-running this cell gives
# the same result instead of binarizing an already binarized array.
y_bin = label_binarize(y, classes=[0,1,2])
n_classes = y_bin.shape[1]

# shuffle and split training and test sets (stratified on the indicator rows)
X_train,X_test,y_train,y_test = train_test_split(X,y_bin,test_size=0.2,random_state=38,stratify=y_bin)


# Define preprocessing for numeric columns (standardize them so they're on the same scale)
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Combine preprocessing steps
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features)])

# Create preprocessing and training pipeline.
# random_state makes the forest (and therefore every metric below) reproducible.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('randomforest', RandomForestClassifier(random_state=38))])


# fit the pipeline to train a random forest classifier model on the training set
rfc = pipeline.fit(X_train, y_train)

# Hard 0/1 predictions per class column, used only for the text report
predictions = rfc.predict(X_test)

print(classification_report(y_test,predictions))

# ROC analysis needs continuous scores: with hard 0/1 predictions every curve
# collapses to a single operating point. For a multi-output target the forest's
# predict_proba returns one array per output column; column 1 of each is
# P(label == 1). Assumes both 0 and 1 occur in every training column, which the
# stratified split should guarantee here.
y_score = np.column_stack([class_proba[:, 1] for class_proba in rfc.predict_proba(X_test)])

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Compute micro-average ROC curve and ROC area (all class decisions pooled)
fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])

In [None]:
# First aggregate all false positive rates
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))

# Then interpolate all ROC curves at these points
mean_tpr = np.zeros_like(all_fpr)
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])

# Finally average it and compute AUC
mean_tpr /= n_classes

fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])

# Plot all ROC curves on a single explicit Axes.
# (Calling plt.figure() and then plt.subplots() created an extra, empty figure.)
fig, ax = plt.subplots(figsize=(15,10))
ax.plot(fpr["micro"], tpr["micro"],
        label='micro-average ROC curve (area = {0:0.2f})'
              ''.format(roc_auc["micro"]),
        color='deeppink', linestyle=':', linewidth=4)

ax.plot(fpr["macro"], tpr["macro"],
        label='macro-average ROC curve (area = {0:0.2f})'
              ''.format(roc_auc["macro"]),
        color='navy', linestyle=':', linewidth=4)

# One curve per class, labelled with the class name stored in the encoder
colors = cycle(['aqua', 'darkorange', 'cornflowerblue','green','blue','red'])
for i, color in zip(range(n_classes), colors):
    ax.plot(fpr[i], tpr[i], color=color, lw=2,
            label='ROC curve for  {0} wine quality (area = {1:0.2f})'
            ''.format(label.classes_[i], roc_auc[i]))

# Chance diagonal for reference
ax.plot([0, 1], [0, 1], 'k--', lw=2)
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('Some extension of Receiver operating characteristic to multi-class')
ax.legend(loc="lower right")
plt.show()

### create 2 bins

In [None]:
# Independent (deep) copy of the raw data for the 2-bin experiment.
df3 = df.copy(deep=True)

In [None]:
# create 2 bins for good and bad wine
# Intervals are right-closed: (2, 6.5] -> bad, (6.5, 8] -> good.

bins = [2, 6.5, 8]
labels = ["bad", "good"]

# Cut the untouched numeric scores in `df` (df3 is a copy of df, so the index
# matches) rather than df3's own column. This keeps the cell idempotent:
# re-running it no longer tries to cut an already categorical column.
df3['quality'] = pd.cut(df['quality'],bins=bins, labels=labels)

In [None]:
# Number of wines in each of the two quality bins.
df3.quality.value_counts()

In [None]:
# Encode the two quality bins as integers; the encoder keeps the
# name -> code mapping in `label.classes_`.
label = LabelEncoder()
label.fit(df3['quality'])
df3['quality'] = label.transform(df3['quality'])

In [None]:
# Class names learned by the encoder; the position of a name is its integer code.
label.classes_

In [None]:
# Feature matrix and integer-encoded binary target for the 2-bin data.
X = df3.drop(columns=['quality'])
y = df3['quality']

In [None]:
# shuffle and split training and test sets (80/20, stratified on the binary target)
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2,random_state=38,stratify=y)


# Define preprocessing for numeric columns (standardize them so they're on the same scale)
numeric_transformer = Pipeline(steps=[('scaler', StandardScaler())])

# Combine preprocessing steps
preprocessor = ColumnTransformer(transformers=[('num', numeric_transformer, numeric_features)])

# Create preprocessing and training pipeline.
# random_state pins the forest's bootstrap and feature sampling so that the
# metrics reported below are reproducible under Restart & Run All.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('randomforest', RandomForestClassifier(max_depth=15, random_state=38))])


# fit the pipeline to train a random forest classifier model on the training set
rfc = pipeline.fit(X_train, y_train)

# Hard class predictions at the classifier's default decision rule
predictions = rfc.predict(X_test)

In [None]:
# The raw confusion matrix used to be computed and silently discarded because
# only the last expression of a cell is displayed; print it explicitly.
print(confusion_matrix(y_test,predictions))

# Labelled version of the same table (rows: actual class, columns: predicted).
pd.crosstab(y_test, predictions, rownames = ['Actual'], colnames =['Predicted'])

In [None]:
# Per-class precision / recall / F1 on the held-out set.
report = classification_report(y_test, predictions)
print(report)

In [None]:
# ROC AUC is defined over a continuous score. Feeding it hard 0/1 predictions
# reduces the curve to a single operating point, so score with the predicted
# probability of the positive class (column 1) instead.
roc_auc_score(y_test, rfc.predict_proba(X_test)[:, 1])

In [None]:
def draw_roc_curve(y_pred_prob, y_true=None):
    """Plot the ROC curve of a binary classifier.

    Parameters
    ----------
    y_pred_prob : array-like of shape (n_samples,)
        Scores for the positive class. Pass probabilities to get a full
        curve; hard 0/1 labels yield only a single operating point.
    y_true : array-like of shape (n_samples,), optional
        Ground-truth binary labels. Defaults to the notebook-level `y_test`
        so existing single-argument calls keep working.
    """
    if y_true is None:
        y_true = y_test

    fpr, tpr, _ = roc_curve(y_true=y_true, y_score=y_pred_prob)

    fig, ax = plt.subplots(figsize=(15,10))
    ax.plot([0, 1], [0, 1], 'k--')  # chance diagonal
    ax.plot(fpr, tpr, label='Random Forest Classifier')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Random Forest Classifier ROC Curve')
    # The curve label was set but never shown because legend() was missing.
    ax.legend(loc='lower right')
    plt.show()

In [None]:
# Predicted probability of the positive class (column 1 of predict_proba),
# then the ROC curve built from those scores.
class_probabilities = rfc.predict_proba(X_test)
y_pred_prob = class_probabilities[:, 1]
draw_roc_curve(y_pred_prob)

In [None]:
# set threshold to 0.35

threshold = 0.35

from sklearn.preprocessing import binarize

# binarize() expects a 2-D array. Reshape a temporary view instead of
# overwriting y_pred_prob, so the score vector keeps its shape for any later
# use (a (1, n) array is rejected by roc_curve).
y_pred_class = binarize(np.reshape(y_pred_prob, (1, -1)), threshold=threshold)[0]

y_pred_class = y_pred_class.astype('int')

print(confusion_matrix(y_test,y_pred_class))

print(classification_report(y_test,y_pred_class))

# NOTE: computed from hard labels, so this value depends on the chosen
# threshold and is not the threshold-independent AUC of the probability scores.
print(f'roc_auc_score : {roc_auc_score(y_test, y_pred_class)}')

# With hard labels the curve shows the single operating point at this threshold.
draw_roc_curve(y_pred_class)

In [None]:
# set threshold to 0.15

threshold = 0.15

# binarize() expects a 2-D array. Reshape a temporary view instead of
# overwriting y_pred_prob, so the score vector keeps its shape for any later
# use (a (1, n) array is rejected by roc_curve).
y_pred_class = binarize(np.reshape(y_pred_prob, (1, -1)), threshold=threshold)[0]

y_pred_class = y_pred_class.astype('int')

print(confusion_matrix(y_test,y_pred_class))

print(classification_report(y_test,y_pred_class))

# NOTE: computed from hard labels, so this value depends on the chosen
# threshold and is not the threshold-independent AUC of the probability scores.
print(f'roc_auc_score : {roc_auc_score(y_test, y_pred_class)}')

# With hard labels the curve shows the single operating point at this threshold.
draw_roc_curve(y_pred_class)

### Comment:

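* With 2 bins and a lower threshold (i.e. 0.15), we get a good roc_auc_score of 0.85. Note that this score is computed from the thresholded class labels, so it depends on the threshold chosen.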