In [None]:
from warnings import filterwarnings
filterwarnings('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from tqdm import tqdm

from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score, RandomizedSearchCV, KFold, GridSearchCV
from sklearn.metrics import SCORERS, confusion_matrix, classification_report, accuracy_score, roc_curve, roc_auc_score

from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier, RidgeClassifier, SGDClassifier, Perceptron
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis 
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, ExtraTreesClassifier, GradientBoostingClassifier, RandomForestClassifier

from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

In [None]:
# Load the red-wine quality CSV from the Kaggle-style input directory into a DataFrame.
data = pd.read_csv("./../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv")

In [None]:
# Preview the first rows of the raw data.
data.head()

In [None]:
# (rows, columns) of the loaded frame.
data.shape

In [None]:
# Share of each quality grade, expressed as a percentage of all rows.
data["quality"].value_counts(normalize=True).mul(100)

In [None]:
# Number of missing values per column.
data.isna().sum()

### Deep dive into fixed acidity

In [None]:
col = "fixed acidity"
# Mean of the feature for each quality grade, largest mean first.
data_gby_fixed_acidity = (
    data[['quality', col]]
    .groupby(['quality'])
    .mean()
    .reset_index()
    .sort_values(by=col, ascending=False)
)

# Keep an explicit Axes handle instead of relying on pyplot's current figure.
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(x="quality", y=col, data=data_gby_fixed_acidity, ax=ax)

# Label every bar with its height, centred on the bar whatever its width.
for p in ax.patches:
    ax.annotate('{:.2f}'.format(p.get_height()),
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha='center', va='bottom',
                color='black',
                fontsize=12)

ax.set_ylim(0, 10)
ax.set_title(f"Quality vs mean {col}", fontsize=16)
ax.set_ylabel(f"mean {col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

In [None]:
# Box plot: spread of the feature within each quality grade.
col = 'fixed acidity'
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=data, x="quality", y=col, ax=ax)

ax.set_ylim(0, 20)
ax.set_title(f"Quality vs {col}", fontsize=16)
ax.set_ylabel(f"{col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

In [None]:
# Violin plot: distribution shape of the feature within each quality grade.
col = "fixed acidity"
fig, ax = plt.subplots(figsize=(15, 8))
sns.violinplot(data=data, x="quality", y=col, ax=ax)

ax.set_ylim(0, 20)
ax.set_title(f"Quality vs {col}", fontsize=16)
ax.set_ylabel(f"{col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

### Deep dive into volatile acidity

In [None]:
col = "volatile acidity"
# Mean of the feature for each quality grade, largest mean first.
mean_by_quality = (
    data[['quality', col]]
    .groupby(['quality'])
    .mean()
    .reset_index()
    .sort_values(by=col, ascending=False)
)

# Keep an explicit Axes handle instead of relying on pyplot's current figure.
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(x="quality", y=col, data=mean_by_quality, ax=ax)

# Label every bar with its height, centred on the bar whatever its width.
for p in ax.patches:
    ax.annotate('{:.2f}'.format(p.get_height()),
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha='center', va='bottom',
                color='black',
                fontsize=12)

ax.set_ylim(0, 1)
ax.set_title(f"Quality vs mean {col}", fontsize=16)
ax.set_ylabel(f"mean {col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

In [None]:
# Box plot: spread of the feature within each quality grade.
col = 'volatile acidity'
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=data, x="quality", y=col, ax=ax)

ax.set_ylim(0, 2)
ax.set_title(f"Quality vs {col}", fontsize=16)
ax.set_ylabel(f"{col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

In [None]:
# Violin plot: distribution shape of the feature within each quality grade.
col = "volatile acidity"
fig, ax = plt.subplots(figsize=(15, 8))
sns.violinplot(data=data, x="quality", y=col, ax=ax)

ax.set_ylim(0, 3)
ax.set_title(f"Quality vs {col}", fontsize=16)
ax.set_ylabel(f"{col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

### Deep dive into citric acid

In [None]:
col = "citric acid"
# Mean of the feature for each quality grade, largest mean first.
mean_by_quality = (
    data[['quality', col]]
    .groupby(['quality'])
    .mean()
    .reset_index()
    .sort_values(by=col, ascending=False)
)

# Keep an explicit Axes handle instead of relying on pyplot's current figure.
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(x="quality", y=col, data=mean_by_quality, ax=ax)

# Label every bar with its height, centred on the bar whatever its width.
for p in ax.patches:
    ax.annotate('{:.2f}'.format(p.get_height()),
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha='center', va='bottom',
                color='black',
                fontsize=12)

ax.set_ylim(0, .5)
ax.set_title(f"Quality vs mean {col}", fontsize=16)
ax.set_ylabel(f"mean {col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

In [None]:
# Box plot: spread of the feature within each quality grade.
col = 'citric acid'
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=data, x="quality", y=col, ax=ax)

ax.set_ylim(-.5, 1)
ax.set_title(f"Quality vs {col}", fontsize=16)
ax.set_ylabel(f"{col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

In [None]:
# Violin plot: distribution shape of the feature within each quality grade.
col = "citric acid"
fig, ax = plt.subplots(figsize=(15, 8))
sns.violinplot(data=data, x="quality", y=col, ax=ax)

ax.set_ylim(-.5, 2)
ax.set_title(f"Quality vs {col}", fontsize=16)
ax.set_ylabel(f"{col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

### Deep dive into residual sugar

In [None]:
col = "residual sugar"
# Mean of the feature for each quality grade, largest mean first.
mean_by_quality = (
    data[['quality', col]]
    .groupby(['quality'])
    .mean()
    .reset_index()
    .sort_values(by=col, ascending=False)
)

# Keep an explicit Axes handle instead of relying on pyplot's current figure.
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(x="quality", y=col, data=mean_by_quality, ax=ax)

# Label every bar with its height, centred on the bar whatever its width.
for p in ax.patches:
    ax.annotate('{:.2f}'.format(p.get_height()),
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha='center', va='bottom',
                color='black',
                fontsize=12)

ax.set_ylim(0, 4)
ax.set_title(f"Quality vs mean {col}", fontsize=16)
ax.set_ylabel(f"mean {col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

In [None]:
# Box plot: spread of the feature within each quality grade (y-axis auto-scaled).
col = 'residual sugar'
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=data, x="quality", y=col, ax=ax)

ax.set_title(f"Quality vs {col}", fontsize=16)
ax.set_ylabel(f"{col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

In [None]:
# Violin plot: distribution shape of the feature within each quality grade (y-axis auto-scaled).
col = "residual sugar"
fig, ax = plt.subplots(figsize=(15, 8))
sns.violinplot(data=data, x="quality", y=col, ax=ax)

ax.set_title(f"Quality vs {col}", fontsize=16)
ax.set_ylabel(f"{col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

### Deep dive into chlorides

In [None]:
col = "chlorides"
# Mean of the feature for each quality grade, largest mean first.
mean_by_quality = (
    data[['quality', col]]
    .groupby(['quality'])
    .mean()
    .reset_index()
    .sort_values(by=col, ascending=False)
)

# Keep an explicit Axes handle instead of relying on pyplot's current figure.
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(x="quality", y=col, data=mean_by_quality, ax=ax)

# Label every bar with its height, centred on the bar whatever its width.
# Three decimals: with two, neighbouring grades can round to the same label.
for p in ax.patches:
    ax.annotate('{:.3f}'.format(p.get_height()),
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha='center', va='bottom',
                color='black',
                fontsize=12)

ax.set_title(f"Quality vs mean {col}", fontsize=16)
ax.set_ylabel(f"mean {col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

In [None]:
# Box plot: spread of the feature within each quality grade.
col = 'chlorides'
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=data, x="quality", y=col, ax=ax)

ax.set_title(f"Quality vs {col}", fontsize=16)
ax.set_ylabel(f"{col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

In [None]:
# Violin plot: distribution shape of the feature within each quality grade.
col = "chlorides"
fig, ax = plt.subplots(figsize=(15, 8))
sns.violinplot(data=data, x="quality", y=col, ax=ax)

ax.set_title(f"Quality vs {col}", fontsize=16)
ax.set_ylabel(f"{col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

### Deep dive into free sulfur dioxide

In [None]:
col = "free sulfur dioxide"
# Mean of the feature for each quality grade, largest mean first.
mean_by_quality = (
    data[['quality', col]]
    .groupby(['quality'])
    .mean()
    .reset_index()
    .sort_values(by=col, ascending=False)
)

# Keep an explicit Axes handle instead of relying on pyplot's current figure.
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(x="quality", y=col, data=mean_by_quality, ax=ax)

# Label every bar with its height, centred on the bar whatever its width.
for p in ax.patches:
    ax.annotate('{:.2f}'.format(p.get_height()),
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha='center', va='bottom',
                color='black',
                fontsize=12)

ax.set_ylim((0, 18))
ax.set_title(f"Quality vs mean {col}", fontsize=16)
ax.set_ylabel(f"mean {col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

In [None]:
# Box plot: spread of the feature within each quality grade.
# `col` is set here so the cell does not depend on which cell ran before it.
col = "free sulfur dioxide"
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=data, x="quality", y=col, ax=ax)

ax.set_title(f"Quality vs {col}", fontsize=16)
ax.set_ylabel(f"{col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

In [None]:
# Violin plot: distribution shape of the feature within each quality grade.
# `col` is set here so the cell does not depend on which cell ran before it.
col = "free sulfur dioxide"
fig, ax = plt.subplots(figsize=(15, 8))
sns.violinplot(data=data, x="quality", y=col, ax=ax)

ax.set_title(f"Quality vs {col}", fontsize=16)
ax.set_ylabel(f"{col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

### Deep dive into total sulfur dioxide

In [None]:
col = "total sulfur dioxide"
# Mean of the feature for each quality grade, largest mean first.
mean_by_quality = (
    data[['quality', col]]
    .groupby(['quality'])
    .mean()
    .reset_index()
    .sort_values(by=col, ascending=False)
)

# Keep an explicit Axes handle instead of relying on pyplot's current figure.
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(x="quality", y=col, data=mean_by_quality, ax=ax)

# Label every bar with its height, centred on the bar whatever its width.
for p in ax.patches:
    ax.annotate('{:.2f}'.format(p.get_height()),
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha='center', va='bottom',
                color='black',
                fontsize=12)

ax.set_ylim((0, 60))
ax.set_title(f"Quality vs mean {col}", fontsize=16)
ax.set_ylabel(f"mean {col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

In [None]:
# Box plot: spread of the feature within each quality grade.
# `col` is set here so the cell does not depend on which cell ran before it.
col = "total sulfur dioxide"
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=data, x="quality", y=col, ax=ax)

ax.set_title(f"Quality vs {col}", fontsize=16)
ax.set_ylabel(f"{col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

In [None]:
# Violin plot: distribution shape of the feature within each quality grade.
# `col` is set here so the cell does not depend on which cell ran before it.
col = "total sulfur dioxide"
fig, ax = plt.subplots(figsize=(15, 8))
sns.violinplot(data=data, x="quality", y=col, ax=ax)

ax.set_title(f"Quality vs {col}", fontsize=16)
ax.set_ylabel(f"{col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

### Deep dive into density

In [None]:
col = "density"
# Mean of the feature for each quality grade, largest mean first.
mean_by_quality = (
    data[['quality', col]]
    .groupby(['quality'])
    .mean()
    .reset_index()
    .sort_values(by=col, ascending=False)
)

# Keep an explicit Axes handle instead of relying on pyplot's current figure.
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(x="quality", y=col, data=mean_by_quality, ax=ax)

# Label every bar with its height, centred on the bar whatever its width.
# Four decimals: density means are all close to 1, so two decimals would
# print the same label on every bar.
for p in ax.patches:
    ax.annotate('{:.4f}'.format(p.get_height()),
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha='center', va='bottom',
                color='black',
                fontsize=12)

ax.set_title(f"Quality vs mean {col}", fontsize=16)
ax.set_ylabel(f"mean {col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

In [None]:
# Box plot: spread of the feature within each quality grade.
# `col` is set here so the cell does not depend on which cell ran before it.
col = "density"
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=data, x="quality", y=col, ax=ax)

ax.set_title(f"Quality vs {col}", fontsize=16)
ax.set_ylabel(f"{col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

In [None]:
# Violin plot: distribution shape of the feature within each quality grade.
# `col` is set here so the cell does not depend on which cell ran before it.
col = "density"
fig, ax = plt.subplots(figsize=(15, 8))
sns.violinplot(data=data, x="quality", y=col, ax=ax)

ax.set_title(f"Quality vs {col}", fontsize=16)
ax.set_ylabel(f"{col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

### Deep dive into pH

In [None]:
col = "pH"
# Mean of the feature for each quality grade, largest mean first.
mean_by_quality = (
    data[['quality', col]]
    .groupby(['quality'])
    .mean()
    .reset_index()
    .sort_values(by=col, ascending=False)
)

# Keep an explicit Axes handle instead of relying on pyplot's current figure.
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(x="quality", y=col, data=mean_by_quality, ax=ax)

# Label every bar with its height, centred on the bar whatever its width.
for p in ax.patches:
    ax.annotate('{:.2f}'.format(p.get_height()),
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha='center', va='bottom',
                color='black',
                fontsize=12)

ax.set_ylim((0, 4))
ax.set_title(f"Quality vs mean {col}", fontsize=16)
ax.set_ylabel(f"mean {col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

In [None]:
# Box plot: spread of the feature within each quality grade.
# `col` is set here so the cell does not depend on which cell ran before it.
col = "pH"
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=data, x="quality", y=col, ax=ax)

ax.set_title(f"Quality vs {col}", fontsize=16)
ax.set_ylabel(f"{col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

In [None]:
# Violin plot: distribution shape of the feature within each quality grade.
# `col` is set here so the cell does not depend on which cell ran before it.
col = "pH"
fig, ax = plt.subplots(figsize=(15, 8))
sns.violinplot(data=data, x="quality", y=col, ax=ax)

ax.set_title(f"Quality vs {col}", fontsize=16)
ax.set_ylabel(f"{col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

### Deep dive into sulphates

In [None]:
col = "sulphates"
# Mean of the feature for each quality grade, largest mean first.
mean_by_quality = (
    data[['quality', col]]
    .groupby(['quality'])
    .mean()
    .reset_index()
    .sort_values(by=col, ascending=False)
)

# Keep an explicit Axes handle instead of relying on pyplot's current figure.
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(x="quality", y=col, data=mean_by_quality, ax=ax)

# Label every bar with its height, centred on the bar whatever its width.
for p in ax.patches:
    ax.annotate('{:.2f}'.format(p.get_height()),
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha='center', va='bottom',
                color='black',
                fontsize=12)

ax.set_ylim((0, 1))
ax.set_title(f"Quality vs mean {col}", fontsize=16)
ax.set_ylabel(f"mean {col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

In [None]:
# Box plot: spread of the feature within each quality grade.
# `col` is set here so the cell does not depend on which cell ran before it.
col = "sulphates"
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=data, x="quality", y=col, ax=ax)

ax.set_title(f"Quality vs {col}", fontsize=16)
ax.set_ylabel(f"{col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

In [None]:
# Violin plot: distribution shape of the feature within each quality grade.
# `col` is set here so the cell does not depend on which cell ran before it.
col = "sulphates"
fig, ax = plt.subplots(figsize=(15, 8))
sns.violinplot(data=data, x="quality", y=col, ax=ax)

ax.set_title(f"Quality vs {col}", fontsize=16)
ax.set_ylabel(f"{col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

### Deep dive into alcohol

In [None]:
col = "alcohol"
# Mean of the feature for each quality grade, largest mean first.
mean_by_quality = (
    data[['quality', col]]
    .groupby(['quality'])
    .mean()
    .reset_index()
    .sort_values(by=col, ascending=False)
)

# Keep an explicit Axes handle instead of relying on pyplot's current figure.
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(x="quality", y=col, data=mean_by_quality, ax=ax)

# Label every bar with its height, centred on the bar whatever its width.
for p in ax.patches:
    ax.annotate('{:.2f}'.format(p.get_height()),
                (p.get_x() + p.get_width() / 2, p.get_height()),
                ha='center', va='bottom',
                color='black',
                fontsize=12)

ax.set_ylim((0, 13))
ax.set_title(f"Quality vs mean {col}", fontsize=16)
ax.set_ylabel(f"mean {col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

In [None]:
# Box plot: spread of the feature within each quality grade.
# `col` is set here so the cell does not depend on which cell ran before it.
col = "alcohol"
fig, ax = plt.subplots(figsize=(15, 8))
sns.boxplot(data=data, x="quality", y=col, ax=ax)

ax.set_title(f"Quality vs {col}", fontsize=16)
ax.set_ylabel(f"{col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

In [None]:
# Violin plot: distribution shape of the feature within each quality grade.
# `col` is set here so the cell does not depend on which cell ran before it.
col = "alcohol"
fig, ax = plt.subplots(figsize=(15, 8))
sns.violinplot(data=data, x="quality", y=col, ax=ax)

ax.set_title(f"Quality vs {col}", fontsize=16)
ax.set_ylabel(f"{col}", fontsize=14)
ax.set_xlabel("Quality", fontsize=14)
plt.show()

In [None]:
# Annotated heatmap of the pairwise correlations between all columns.
fig, ax = plt.subplots(figsize=(19, 9))
sns.heatmap(data.corr(), annot=True, fmt='.2f', ax=ax)
plt.show()

In [None]:
# Make the response variable binary: wines with quality in (2, 6.5] are
# 'bad', wines with quality in (6.5, 8] are 'good'.
bins = (2, 6.5, 8)
group_names = ['bad', 'good']
quality_binned = pd.cut(data['quality'], bins=bins, labels=group_names)

# pd.cut yields NaN for any value outside the bins. Validate before
# overwriting the column so that an unexpected score (or re-running this cell
# on an already-transformed column) fails loudly instead of silently
# corrupting the target.
if quality_binned.isna().any():
    raise ValueError(
        "'quality' contains values outside the bins "
        f"{bins}; reload the data before re-running this cell."
    )
data['quality'] = quality_binned

In [None]:
# Preview the first rows after 'quality' has been binned.
data.head()

In [None]:
# Turn the quality labels into integer codes.
label_quality = LabelEncoder()
label_quality.fit(data['quality'])
# Classes are ordered alphabetically, so 'bad' -> 0 and 'good' -> 1.
data['quality'] = label_quality.transform(data['quality'])
# Class balance after encoding.
data['quality'].value_counts()

In [None]:
# Target vector: the encoded quality label.
y = data['quality']
# Feature matrix: every remaining column.
X = data.drop('quality', axis=1)

In [None]:
# Stratified hold-out split (default test size). The fixed random_state makes
# the split, and therefore every score computed below, reproducible across runs.
X_train, X_valid, y_train, y_valid = train_test_split(
    X, y, stratify=y, shuffle=True, random_state=42
)

In [None]:
def model_name(key):
    """Return the classifier class name for a three-letter model key.

    Parameters
    ----------
    key : str
        Short model code such as ``'lgb'`` or ``'rfc'``.

    Returns
    -------
    str
        The name of the corresponding classifier class.

    Raises
    ------
    KeyError
        If ``key`` is not one of the known model codes.
    """
    model_dict = {
        'lrc': "LogisticRegression",
        'pac': "PassiveAggressiveClassifier",
        'rdc': "RidgeClassifier",
        'sgc': "SGDClassifier",
        'pcn': "Perceptron",
        'gpc': "GaussianProcessClassifier",
        'lda': "LinearDiscriminantAnalysis",
        'qda': "QuadraticDiscriminantAnalysis",
        'knn': "KNeighborsClassifier",
        'dtc': "DecisionTreeClassifier",
        'etc': "ExtraTreesClassifier",
        'gnb': "GaussianNB",
        'bnb': "BernoulliNB",
        'rfc': "RandomForestClassifier",
        'svc': "SVC",
        'lvc': "LinearSVC",
        'nvc': "NuSVC",
        'bgc': "BaggingClassifier",
        'abc': "AdaBoostClassifier",
        'gbc': "GradientBoostingClassifier",
        'lgb': "LGBMClassifier",
        'cgb': "CatBoostClassifier",
        'xgb': "XGBClassifier"
    }
    try:
        return model_dict[key]
    except KeyError:
        # Same exception type as a plain lookup, but list the valid keys.
        raise KeyError(
            f"Unknown model key {key!r}; expected one of {sorted(model_dict)}"
        ) from None



def _plot_model_scores(result, value_col, title, ylabel, ylim):
    """Draw one annotated bar chart of ``value_col`` for every model in ``result``."""
    fig, ax = plt.subplots(figsize=(20, 8))
    sns.barplot(ax=ax, x='model', y=value_col, data=result)
    # Put each bar's value above it, centred on the bar whatever its width.
    for p in ax.patches:
        ax.annotate('{:.4f}'.format(p.get_height()),
                    (p.get_x() + p.get_width() / 2, p.get_height()),
                    ha='center', va='bottom',
                    color='black', fontsize=9)
    ax.set_ylim(ylim)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
    ax.set_xlabel("Model")
    plt.show()


def init(X, y, scoring='accuracy'):
    """Cross-validate a set of default classifiers and chart their scores.

    Every classifier is scored with 10-fold stratified cross-validation.
    Two bar charts are shown: the mean score per model and the standard
    deviation of the score per model.

    Parameters
    ----------
    X, y
        Features and target, passed unchanged to ``cross_val_score``.
    scoring : str, default 'accuracy'
        Scoring name passed to ``cross_val_score``; also used to name the
        result columns and the chart labels.

    Returns
    -------
    pandas.DataFrame
        One row per model with columns ``'model'``, ``f'mean-{scoring}'`` and
        ``f'std-{scoring}'``, sorted by mean score in descending order.
    """
    mean_col = f'mean-{scoring}'
    std_col = f'std-{scoring}'

    skf = StratifiedKFold(n_splits=10)
    model_dict = {
        'lrc': LogisticRegression(),
        'pac': PassiveAggressiveClassifier(),
        'rdc': RidgeClassifier(),
        'sgc': SGDClassifier(),
        'pcn': Perceptron(),
        'gpc': GaussianProcessClassifier(),
        'lda': LinearDiscriminantAnalysis(),
        'qda': QuadraticDiscriminantAnalysis(),
        'knn': KNeighborsClassifier(),
        'dtc': DecisionTreeClassifier(),
        'etc': ExtraTreesClassifier(),
        'gnb': GaussianNB(),
        'bnb': BernoulliNB(),
        'rfc': RandomForestClassifier(),
        'svc': SVC(),
        'lvc': LinearSVC(),
        'nvc': NuSVC(),
        'bgc': BaggingClassifier(),
        'abc': AdaBoostClassifier(),
        'gbc': GradientBoostingClassifier(),
        'lgb': LGBMClassifier(),
        'cgb': CatBoostClassifier(verbose=0),
        'xgb': XGBClassifier()
    }

    # Collect plain records and build the frame once. This keeps the score
    # columns numeric and gives every row its own index label, unlike
    # concatenating one-row frames onto an empty frame inside the loop.
    records = []
    for key, model in tqdm(model_dict.items()):
        res = cross_val_score(model, X, y=y, scoring=scoring, cv=skf, n_jobs=-1, verbose=0)
        records.append({'model': key, mean_col: res.mean(), std_col: res.std()})

    result = (
        pd.DataFrame(records, columns=['model', mean_col, std_col])
        .sort_values(by=[mean_col], ascending=False)
        .reset_index(drop=True)
    )

    _plot_model_scores(
        result, mean_col,
        title=f"Model vs mean {scoring}",
        ylabel=f"Mean of {scoring}",
        ylim=(0, 1),
    )

    # Leave 10% headroom above the tallest bar so its label fits.
    max_std = result[std_col].max()
    _plot_model_scores(
        result, std_col,
        title=f"Model vs {scoring} standard deviation",
        ylabel=f"Standard deviation of  {scoring}",
        ylim=(0, max_std + max_std * .1),
    )

    return result

In [None]:
# Cross-validate every candidate classifier on the training split, scored by accuracy.
res = init(X_train, y_train, scoring='accuracy')

In [None]:
# Cross-validation summary table returned by init().
res

In [None]:
# Resolve the short key 'lgb' to its classifier class name.
model_name('lgb')

In [None]:
%%time
# Fit a default LightGBM classifier on the training split.
clf = LGBMClassifier(verbose=0)
clf.fit(X_train, y_train)

# Predict once per split and reuse the predictions below.
train_pred = clf.predict(X_train)
valid_pred = clf.predict(X_valid)

# Train vs. hold-out accuracy side by side.
print(clf.score(X_train, y_train), clf.score(X_valid, y_valid))
print("================================")

print("accuracy_score")
print(accuracy_score(y_train, train_pred))

print("================================")
print("classification_report train")
# A single print: wrapping it in a second print() also printed "None".
print(classification_report(y_train, train_pred))

print("================================")
print("classification_report test")
print(classification_report(y_valid, valid_pred))