# Importing libraries for exploratory data analysis, and the dataset for white wines

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
# Let pandas show up to 24 columns when a frame is displayed.
pd.set_option('display.max_columns', 24)

# The source CSV uses ';' as its field separator.
df = pd.read_csv('../input/white-wine-quality/winequality-white.csv', delimiter=';')


def _quality_to_label(score):
    """Bucket a quality score: at most 5 -> 'low', at most 7 -> 'medium', else 'high'."""
    if score <= 5:
        return 'low'
    if score <= 7:
        return 'medium'
    return 'high'


# Derive the three-level target from the numeric quality score.
df['quality_label'] = df['quality'].apply(_quality_to_label)

In [None]:
# Preview the first rows of the loaded wine data.
df.head()

# Selecting continuous features for predictions

In [None]:
# Every column except the last two. 'quality_label' was appended last when the
# data was loaded; presumably 'quality' is the CSV's final column - verify.
attributes = df.columns[:-2]


In [None]:
# Number of wines in each quality_label class (shows the class imbalance).
df.quality_label.value_counts()

# Running Tukey's honestly significant difference (HSD) test on features to locate statistically significant features of wine, and plotting the results
* p-values and alpha are inverted, so pairs whose $p^{-1}$ value is greater than $20$ $(= 0.05^{-1})$ are statistically significant at a confidence level of 95%

In [None]:
# Run the Tukey HSD test once per feature, grouping the values by quality_label
# at alpha = 0.05, and keep each result keyed by the feature name.
tukey_results = {
    feature_name: pairwise_tukeyhsd(df[feature_name], df['quality_label'], alpha=0.05)
    for feature_name in attributes
}

In [None]:
# Flatten the per-feature Tukey HSD results into one long table with one row
# per (feature, label pair). 'inv_p_val' stores 1/p, so larger values mean a
# more significant difference.
tukey_columns = ['pair','meandiff', 'inv_p_val','feature']
# NOTE(review): these names assume each result lists its comparisons in the
# order (high, low), (high, medium), (low, medium) - confirm against the
# summary table of one of the results.
pair_names = ['low-high', 'medium-high', 'medium-low']
tukey_frames = [
    pd.DataFrame({
        'pair':pair_names,
        'meandiff':val.meandiffs,
        'inv_p_val':val.pvalues**(-1),
        'feature':[key]*len(pair_names)})
    for key,val in tukey_results.items()
]
# DataFrame.append was removed in pandas 2.0, so the table is assembled with a
# single concat. With no results, fall back to an empty frame that still has
# the expected columns (pd.concat raises on an empty list).
if tukey_frames:
    tukey_df = pd.concat(tukey_frames, ignore_index = True)
else:
    tukey_df = pd.DataFrame(columns = tukey_columns)

In [None]:
# One bar chart per feature: 1/p for each label pair, with the significance
# threshold 1/0.05 = 20 drawn as a dashed red line. The y axis is capped at 50.
for feature in attributes:
    local_df= tukey_df[tukey_df['feature'] == feature]
    fig, ax = plt.subplots(nrows=1, ncols = 1,figsize = (8,8))
    fig.patch.set_facecolor('white')
    sns.barplot(data = local_df,x='pair',y='inv_p_val', hue='pair', ax = ax)
    ax.set_ylim(0,50)
    ax.axhline(y = 0.05**(-1), c = 'r', linestyle = '--')
    tit =f' rejection values for {feature}'
    ax.set_title(tit)
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
    # Visibility is passed positionally: the `b` keyword was renamed to
    # `visible` in Matplotlib 3.5 and later removed, so `b=True` fails there.
    ax.grid(True, which='both', axis='y')

# Using boxplots to visualize the IQR 25%-75% of different features

In [None]:
# One box plot per feature, split by quality label.
for feature in attributes:
    fig, ax = plt.subplots(nrows=1, ncols = 1,figsize = (4,4))
    fig.patch.set_facecolor('white')
    sns.boxplot(data = df,x='quality_label',y=f'{feature}', hue='quality_label',palette = 'PuOr', ax = ax)
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
    # Visibility is passed positionally: the `b` keyword was renamed to
    # `visible` in Matplotlib 3.5 and later removed, so `b=True` fails there.
    ax.grid(True, which='both', axis='y')

# using histograms to visualize distributions of datapoints in features

In [None]:
# Histograms of the DataFrame's columns: 15 bins, red bars with black edges,
# small tick labels and no grid lines.
hist_style = dict(
    bins=15,
    color='red',
    edgecolor='black',
    linewidth=1.0,
    xlabelsize=8,
    ylabelsize=8,
    grid=False,
)
df.hist(**hist_style)

# The layout rectangle deliberately extends to 1.2 in both directions.
layout_rect = (0, 0, 1.2, 1.2)
plt.tight_layout(rect=layout_rect)

# EDA Summary:
* Except for free sulfur dioxide and citric acid, all other features show a statistically significant difference between at least one pair of the 3 quality labels.
* The following features have an approximately normal distribution of values: fixed acidity, chlorides, pH.
* The following features are skewed towards higher values: volatile acidity, residual sugar, free sulfur dioxide, total sulfur dioxide, density, sulphates, alcohol.

# Selecting classification models:
* Different models are compared using different sampling and scaling methods. The goal is to select a method with low bias towards each quality label - a model in which the f1 scores for each label are the closest to the average f1 score.

In [None]:
from sklearn.metrics import classification_report, plot_confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.naive_bayes import GaussianNB
from imblearn.pipeline import make_pipeline, Pipeline
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE

In [None]:
# Encoder used to turn the string quality labels into integer codes.
label_quality = LabelEncoder()

In [None]:
# Fit the encoder on the label column and overwrite that column with the
# encoded values. Presumably codes follow the sorted label names
# (high, low, medium) - verify with label_quality.classes_.
df['quality_label']=label_quality.fit_transform(df.quality_label)

* `create_models_comparison` compares the selected models for a given sampler and scaler, and summarizes the classification reports from each model in a table to be visualized.

In [None]:
def create_model_report(model,X,y, pipeline = False, sampler = None, scaler = None):
    """
    Fit one classifier on an 80/20 train/test split and flatten its
    classification report into a single row.

    assumes
        model is a (model_name, estimator) tuple; model_name is the str
            representation of the model, estimator is an sklearn predictor
        X is the feature matrix
        y, encoded target values
        pipeline bool; when True, sampler and scaler are applied inside this
            function, otherwise X is used as given
        sampler sampler object (SMOTE,under_sampler,etc...), used when pipeline
        scaler scikit scaler, used when pipeline
    return
        list of values - [model_name, precision/recall/f1-score/support for
        low, medium and high, accuracy, then the four macro avg and the four
        weighted avg values], every number rounded to 3 decimals
    """
    def reporting(model_name, report):
        """
        assumes
            report dict classification report
        returns
            list starting with model_name followed by the report values
        """
        row = [model_name]
        for key, item in report.items():
            if key == 'accuracy':
                # accuracy is a bare float, every other entry is a dict
                row.append(round(item,3))
            else:
                row.extend(round(value,3) for value in item.values())
        return row

    model_name, model_obj = model
    # The report columns are named low, medium, high, so the labels are passed
    # in exactly that order instead of the order they first occur in df.
    wine_labels = list(label_quality.transform(['low', 'medium', 'high']))
    # Split first: resampling or scaling before the split would let information
    # from the test rows leak into training.
    X_train,X_test, y_train,y_test = train_test_split(X, y, test_size = 0.2, random_state=42)
    if pipeline:
        # Resample the training rows only; the test set keeps the real class mix.
        X_train,y_train = sampler.fit_resample(X_train,y_train)
        # Fit the scaler on training data and reuse its statistics for the test set.
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
    model_obj.fit(X_train,y_train)
    report = classification_report(y_test, model_obj.predict(X_test), labels = wine_labels, output_dict = True)
    return reporting(model_name, report)

In [None]:
def create_models_comparison(scaler, models, model_names, sampler = None):
    """
    Build one report row per model for a given scaler / sampler combination.

    assumes:
        scaler a scikit learn scaler object
        models list of scikit-learn estimators
        model_names list of string corresponding to models, in the same order
        sampler a sampler object default None (no resampling)
    returns:
        pandas DataFrame object, one row per model, with the columns in `cols`
    raises:
        ValueError when there are fewer names than models
    """
    cols = ['model_name','precision_low', 'recall_low', 'f1-score_low', 'support_low','precision_medium','recall_medium', 'f1-score_medium', 'support_medium','precision_high', 'recall_high', 'f1-score_high', 'support_high','accuracy','mac_avg_precision','mac_avg_recall','mac_avg_f1-score','mac_avg_support','w_avg_precision','w_avg_recall','w_avg_f1-score','w_avg_support']
    if len(model_names) < len(models):
        raise ValueError('model_names must provide a name for every model')
    # Pair each estimator with its name; surplus names are ignored.
    models_tup = list(zip(model_names, models))
    # Select the features by dropping the two target columns, so only features
    # are ever scaled and no column count is hard-coded.
    features = df.drop(['quality','quality_label'],axis =1,inplace = False)
    y=df['quality_label'].to_numpy()
    if sampler is not None:
        # Raw features: create_model_report applies the sampler and scaler itself.
        X=features.to_numpy()
        reports = [create_model_report(model,X,y, pipeline = True, sampler=sampler, scaler=scaler)
                   for model in models_tup]
    else:
        X=scaler.fit_transform(features)
        reports = [create_model_report(model,X,y) for model in models_tup]
    return pd.DataFrame(reports,columns = cols )
        
        

In [None]:
# Estimators to compare and their display names. The two lists are paired by
# position, so they must have the same length and order; KNeighborsClassifier
# was named but missing from `models`, which labelled GaussianNB's results as
# 'KNeighborsClassifier'.
models = [RandomForestClassifier(),LogisticRegression(),LinearSVC(), SVC(),KNeighborsClassifier(),GaussianNB()]
models_names = ['RandomForestClassifier','LogisticRegression','LinearSVC', 'SVC','KNeighborsClassifier','GaussianNB']

# Comparing models with Standard scaler and no sampling

In [None]:
# Baseline comparison: StandardScaler and no resampling. The bare name on the
# last line shows the resulting table.
models_report_StnSc = create_models_comparison(StandardScaler(),models,models_names)
models_report_StnSc

In [None]:
# One bar chart per report column (everything after 'model_name'), comparing
# the models for the StandardScaler / no-resampling run.
benchmarks = models_report_StnSc.columns[1:]
for benchmark in benchmarks:
    fig, ax = plt.subplots(nrows=1, ncols = 1,figsize = (4,4))
    fig.patch.set_facecolor('white')
    sns.barplot(data = models_report_StnSc,x='model_name',y=f'{benchmark}', hue='model_name', ax = ax)
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
    # Visibility is passed positionally: the `b` keyword was renamed to
    # `visible` in Matplotlib 3.5 and later removed, so `b=True` fails there.
    ax.grid(True, which='both', axis='y')

# Comparing models with random undersampling and MinMaxScaler

In [None]:
# Comparison with MinMaxScaler and seeded random undersampling; the bare name
# on the last line shows the resulting table.
models_report_under = create_models_comparison(MinMaxScaler(),models,models_names, sampler = RandomUnderSampler(random_state = 42))
models_report_under

In [None]:
# One bar chart per report column (everything after 'model_name'), comparing
# the models for the MinMaxScaler / undersampling run.
benchmarks = models_report_under.columns[1:]
for benchmark in benchmarks:
    fig, ax = plt.subplots(nrows=1, ncols = 1,figsize = (4,4))
    fig.patch.set_facecolor('white')
    sns.barplot(data = models_report_under,x='model_name',y=f'{benchmark}', hue='model_name', ax = ax)
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
    # Visibility is passed positionally: the `b` keyword was renamed to
    # `visible` in Matplotlib 3.5 and later removed, so `b=True` fails there.
    ax.grid(True, which='both', axis='y')

# Comparing models with random SMOTE and StandardScaler

In [None]:
# Comparison with StandardScaler and seeded SMOTE resampling; the bare name on
# the last line shows the resulting table.
models_report_SMOTE = create_models_comparison(StandardScaler(),models,models_names, sampler = SMOTE(random_state = 42))
models_report_SMOTE

In [None]:

# One bar chart per report column (everything after 'model_name'), comparing
# the models for the StandardScaler / SMOTE run.
benchmarks = models_report_SMOTE.columns[1:]
for benchmark in benchmarks:
    fig, ax = plt.subplots(nrows=1, ncols = 1,figsize = (4,4))
    fig.patch.set_facecolor('white')
    sns.barplot(data = models_report_SMOTE,x='model_name',y=f'{benchmark}', hue='model_name', ax = ax)
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
    # Visibility is passed positionally: the `b` keyword was renamed to
    # `visible` in Matplotlib 3.5 and later removed, so `b=True` fails there.
    ax.grid(True, which='both', axis='y')

# Comparing models using Robust scaler without resampling

In [None]:
# Comparison with RobustScaler and no resampling; the bare name on the last
# line shows the resulting table.
from sklearn.preprocessing import RobustScaler
models_report_robscale = create_models_comparison(RobustScaler(),models,models_names)
models_report_robscale

In [None]:

# One bar chart per report column (everything after 'model_name'), comparing
# the models for the RobustScaler / no-resampling run.
benchmarks = models_report_robscale.columns[1:]
for benchmark in benchmarks:
    fig, ax = plt.subplots(nrows=1, ncols = 1,figsize = (4,4))
    fig.patch.set_facecolor('white')
    sns.barplot(data = models_report_robscale,x='model_name',y=f'{benchmark}', hue='model_name', ax = ax)
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
    # Visibility is passed positionally: the `b` keyword was renamed to
    # `visible` in Matplotlib 3.5 and later removed, so `b=True` fails there.
    ax.grid(True, which='both', axis='y')

# Comparing models using robust scaler with SMOTE

In [None]:
# Comparison with RobustScaler and seeded SMOTE resampling; the bare name on
# the last line shows the resulting table.
models_report_robSMOTE = create_models_comparison(RobustScaler(),models,models_names, sampler = SMOTE(random_state=42))
models_report_robSMOTE


In [None]:
# One bar chart per report column (everything after 'model_name'), comparing
# the models for the RobustScaler / SMOTE run.
benchmarks = models_report_robSMOTE.columns[1:]
for benchmark in benchmarks:
    fig, ax = plt.subplots(nrows=1, ncols = 1,figsize = (4,4))
    fig.patch.set_facecolor('white')
    sns.barplot(data = models_report_robSMOTE,x='model_name',y=f'{benchmark}', hue='model_name', ax = ax)
    ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', borderaxespad=0.)
    # Visibility is passed positionally: the `b` keyword was renamed to
    # `visible` in Matplotlib 3.5 and later removed, so `b=True` fails there.
    ax.grid(True, which='both', axis='y')

In [None]:
# Trial run of the naming scheme on the baseline report: tag it, preview the
# combined "<model> <sampler> <scaler>" name, then remove the helper columns.
scales = {'StandardScaler':'StnSc','MinMaxScaler':'MinMax','RobustScaler':'RobSc'}
samplers = {'RandomUnderSampler':'RanUnd', 'SMOTE':'SMOTE','None':'None'}
models_report_StnSc['scale'] = 'StandardScaler'
models_report_StnSc['sampler'] = 'None'

# Both tag columns hold a single value, so look each one up once.
scale_key = models_report_StnSc.scale.unique()[0]
display(scale_key)
sampler_key = models_report_StnSc.sampler.unique()[0]
name_suffix = f'{samplers[sampler_key]} {scales[scale_key]}'

models_report_StnSc['new_name'] = models_report_StnSc.model_name.apply(
    lambda base_name: f'{base_name} {name_suffix}'
)
display(models_report_StnSc)

# Leave the report as it was before this cell ran.
models_report_StnSc.drop(['scale', 'new_name', 'sampler'],inplace = True, axis = 1)

In [None]:
# Tag every report with the scaler and sampler it was produced with, rename its
# models to "<model> <sampler> <scaler>", and stack all reports into one table.
scales = {'StandardScaler':'StnSc','MinMaxScaler':'MinMax','RobustScaler':'RobSc'}
samplers = {'RandomUnderSampler':'RanUnd', 'SMOTE':'SMOTE','None':'None'}
models_report_StnSc['scale'] = 'StandardScaler'
models_report_StnSc['sampler'] = 'None'
models_report_under['sampler'] = 'RandomUnderSampler'
models_report_under['scale'] = 'MinMaxScaler'
models_report_SMOTE['scale'] = 'StandardScaler'
models_report_SMOTE['sampler'] = 'SMOTE'
models_report_robscale['scale'] = 'RobustScaler'
models_report_robscale['sampler'] = 'None'
models_report_robSMOTE['scale'] = 'RobustScaler'
models_report_robSMOTE['sampler'] = 'SMOTE'
model_reports = [
    models_report_StnSc,
    models_report_under,
    models_report_SMOTE,
    models_report_robscale,
    models_report_robSMOTE]
for model in model_reports:
    # Each report carries a single sampler/scale value, so build the suffix once.
    suffix = f'{samplers[model.sampler.unique()[0]]} {scales[model.scale.unique()[0]]}'
    # The rename is done in place (later cells look models up by the new name);
    # names that already end with the suffix are left alone so that re-running
    # this cell does not append it a second time.
    model['model_name'] = model.model_name.apply(
        lambda x: x if x.endswith(f' {suffix}') else f'{x} {suffix}')
# DataFrame.append was removed in pandas 2.0; concat stacks the reports in the
# same order and keeps their original row indices.
df_model_reports_combined = pd.concat(model_reports)

In [None]:
# Five rows of the combined report with the highest accuracy.
df_model_reports_combined.sort_values(['accuracy'], ascending = False).head(5)

# summary from models comparison:
* Both standard scaling and robust (median-IQR) scaling are biased against quality_label high due to the small number of data points (18 high compared to 744 low and 834 medium).
* Undersampling with the MinMax scaler is biased in favor of quality_label high, perhaps due to immense data loss (11 data points in support with MinMax compared to 320 data points in support with standard scaling without resampling).
* The random forest classifier achieved the highest accuracy and balanced f1 and precision across labels, both with the standard scaler and with the robust scaler.
### Next step: comparing the results of the random forest classifier with SMOTE sampling, using the standard scaler against the robust scaler:

In [None]:
# Pull the random-forest rows out of the two SMOTE reports and stack them.
# .copy() makes each selection an independent frame, so adding the 'scaler'
# column below does not write into a slice of the original report.
rfc_ss_df = models_report_SMOTE[models_report_SMOTE.model_name == 'RandomForestClassifier SMOTE StnSc'].copy()
rfc_rs_df = models_report_robSMOTE[models_report_robSMOTE.model_name == 'RandomForestClassifier SMOTE RobSc'].copy()
rfc_ss_df['scaler'] = 'StandardScaler'
rfc_rs_df['scaler'] = 'RobustScaler'
# DataFrame.append was removed in pandas 2.0; concat keeps the same row order.
rfc_df = pd.concat([rfc_ss_df, rfc_rs_df])
rfc_df.drop('model_name',axis = 1, inplace = True)

In [None]:
# Show the random-forest row of the StandardScaler + SMOTE report.
models_report_SMOTE[models_report_SMOTE.model_name == 'RandomForestClassifier SMOTE StnSc']

In [None]:
# Show the stacked random-forest comparison table.
# NOTE(review): the first bare expression presumably shows nothing, since a
# notebook cell only displays its last expression - confirm and remove.
rfc_df
rfc_df.head()

In [None]:
def _metric_values(report_row, metric):
    """Return the low/medium/high values of one metric as a 2-D numpy array.

    Selects the three '<metric>_<label>' columns by name instead of dropping
    every other column, so extra columns in the report cannot break it.
    """
    metric_cols = [f'{metric}_{label}' for label in ('low', 'medium', 'high')]
    return report_row[metric_cols].to_numpy()


# Per-label precision, recall and f1 of the random forest with SMOTE, for the
# StandardScaler (ss_*) and RobustScaler (rs_*) runs.
ss_precision = _metric_values(rfc_ss_df, 'precision')
ss_recall = _metric_values(rfc_ss_df, 'recall')
ss_f1 = _metric_values(rfc_ss_df, 'f1-score')
rs_precision = _metric_values(rfc_rs_df, 'precision')
rs_recall = _metric_values(rfc_rs_df, 'recall')
rs_f1 = _metric_values(rfc_rs_df, 'f1-score')

In [None]:
# Inspect the per-label precision values of the StandardScaler run.
ss_precision

In [None]:
# Spread (standard deviation) of each metric across the three quality labels,
# one row per scaler; a smaller spread means more even treatment of the labels.
metric_pairs = {
    'std_Precision': (ss_precision, rs_precision),
    'std_Recall': (ss_recall, rs_recall),
    'std_f1-score': (ss_f1, rs_f1),
}
data = {'Scaler': ['StandardScaler', 'RobustScaler']}
for column_name, (standard_values, robust_values) in metric_pairs.items():
    data[column_name] = [np.std(standard_values), np.std(robust_values)]
std_df = pd.DataFrame(data)
std_df

* The standard scaler has a lower deviation for every metric.
* Therefore, the selected model for prediction is the random forest classifier, with SMOTE sampling and the standard scaler.
* Next, the model is to be optimized with the aid of GridSearchCV.

In [None]:
# Prepare the data for the final model: features are every column except the
# two targets, the label is the encoded quality_label.
X=df.drop(['quality','quality_label'],axis =1,inplace = False).to_numpy()
y=df['quality_label'].to_numpy()
# Split before any resampling or scaling, so the test set holds only real,
# untouched wines and no information from it reaches training.
X_train_raw,X_test_raw,y_train_raw,y_test=train_test_split(X,y,test_size = 0.2, random_state = 42)
# Seeded like the SMOTE instances used in the model comparison.
smote = SMOTE(random_state = 42)
stan_scaler = StandardScaler()
# Oversample the training rows only.
X_resamp, y_resamp = smote.fit_resample(X_train_raw,y_train_raw)
# Fit the scaler on the training data and reuse its statistics for the test set.
X_train = stan_scaler.fit_transform(X_resamp)
y_train = y_resamp
X_test = stan_scaler.transform(X_test_raw)

In [None]:
## Using Cohen's Kappa to select the optimal parameters for the classifier

In [None]:
# Grid-search the random forest hyper-parameters with 10-fold cross-validation,
# scoring each candidate by Cohen's kappa.
from sklearn.metrics import cohen_kappa_score, make_scorer
cohen_kappa = make_scorer(cohen_kappa_score)
# 'auto' is no longer listed for max_features: for RandomForestClassifier it
# was an alias of 'sqrt' (so it only duplicated work) and scikit-learn 1.3
# removed it, which makes every fit with that value fail.
params_dict={'n_estimators':[50,100,500,1000],'criterion':['gini','entropy'],'class_weight':['balanced','balanced_subsample'],'random_state':[None,42],'max_features':['sqrt','log2']}
clf_rf=GridSearchCV(estimator=RandomForestClassifier(n_jobs=-1),param_grid=params_dict,scoring=cohen_kappa,cv=10)
clf_rf.fit(X_train,y_train)
print(clf_rf.best_params_,clf_rf.best_score_)

In [None]:
# Train the selected random forest and evaluate it on the held-out test set.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# max_features='sqrt' is what 'auto' meant for a classifier; 'auto' itself was
# removed in scikit-learn 1.3.
white_rf2 = RandomForestClassifier(class_weight = 'balanced', criterion = 'entropy', max_features = 'sqrt', n_estimators = 1000)
white_rf2.fit(X_train, y_train)
# Predict once and reuse the predictions for both the report and the matrix.
white_rf2_pred = white_rf2.predict(X_test)
print(classification_report(y_test, white_rf2_pred,labels = df.quality_label.unique()))
# plot_confusion_matrix was removed in scikit-learn 1.2; build the display
# from an explicitly computed confusion matrix instead.
ConfusionMatrixDisplay(confusion_matrix(y_test, white_rf2_pred, labels = white_rf2.classes_),
                       display_labels = white_rf2.classes_).plot()

## Comparing to a dummy classifier

In [None]:
# Baseline for comparison: a dummy classifier with the 'stratified' strategy,
# fitted and evaluated on the same train/test data as the random forest.
from sklearn.dummy import DummyClassifier
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

dumc = DummyClassifier(strategy = 'stratified', random_state=42)
dumc.fit(X_train, y_train)
# Predict once and reuse the predictions for both the report and the matrix.
dumc_pred = dumc.predict(X_test)
print(classification_report(y_test,dumc_pred, labels = df.quality_label.unique()))
# plot_confusion_matrix was removed in scikit-learn 1.2; build the display
# from an explicitly computed confusion matrix instead.
ConfusionMatrixDisplay(confusion_matrix(y_test, dumc_pred, labels = dumc.classes_),
                       display_labels = dumc.classes_).plot()