In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib.colors import ListedColormap
from sklearn.metrics import plot_confusion_matrix
from scipy.stats import norm, boxcox
from collections import Counter
from scipy import stats
from pandas_profiling import ProfileReport
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Loading Dataset

In [None]:
# Location of the red-wine quality CSV, relative to the notebook's working directory.
DATA_PATH = '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
dataset = pd.read_csv(DATA_PATH)

# Exploratory Data Analysis


## 1) Using Manual Methods

In [None]:
# Preview the first rows to sanity-check that the CSV was parsed as expected.
dataset.head()

In [None]:
# (number of rows, number of columns) of the loaded frame.
dataset.shape

In [None]:
# Summary statistics for every numeric column.
dataset.describe()

In [None]:
# Column dtypes and non-null counts.
dataset.info()

In [None]:
# True if at least one cell anywhere in the frame is null.
dataset.isnull().values.any()

There are no Null Values in the Dataset

## Plotting Count for Qualities

In [None]:
# Bar chart of how many wines received each quality score.
sns.set_style('whitegrid')
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=dataset, x="quality", palette='husl', ax=ax);

## Finding Correlation among the variables

In [None]:
# Compute the correlation matrix once and reuse it for the plot.
corr = dataset.corr()
# Mask the upper triangle (diagonal included) by *position*. Building the mask
# from the coefficient values themselves would leave any coefficient that is
# exactly 0 unmasked.
matrix = np.triu(np.ones(corr.shape, dtype=bool))
plt.figure(figsize=(20, 17))
# `linewidths` is the documented heatmap keyword for the cell separator width.
sns.heatmap(corr, annot=True,
            linewidths=.8, mask=matrix, cmap="rocket");

## Visualising Numerical Data

In [None]:
# One box plot per listed feature, each split by the quality score.
boxplot_features = [
    "fixed acidity",
    "volatile acidity",
    "citric acid",
    "residual sugar",
    "chlorides",
    "density",
    "pH",
    "sulphates",
    "alcohol",
]
for feature in boxplot_features:
    sns.catplot(x="quality", y=feature, data=dataset, kind="box")

## Acidity Type with Different Qualities of Wine

In [None]:
# Frequency of every distinct "fixed acidity" value.
# The column names are pinned explicitly ("index" = acidity value,
# "fixed acidity" = how often it occurs) because the names produced by a bare
# value_counts().reset_index() differ between pandas 1.x and pandas 2.x.
acidity_count = (
    dataset["fixed acidity"]
    .value_counts()
    .rename_axis("index")
    .reset_index(name="fixed acidity")
)
acidity_count

In [None]:
# Bar chart of the acidity frequency table: x = acidity value, y = occurrences.
plt.figure(figsize=(30, 10))
plt.style.use("ggplot")
acidity_ax = sns.barplot(x=acidity_count["index"],
                         y=acidity_count["fixed acidity"])
acidity_ax.set_title("TYPE OF ACIDITY WITH QUALITY", fontsize=20)
acidity_ax.set_xlabel("ACIDITY", fontsize=20)
acidity_ax.set_ylabel("COUNT", fontsize=20)
plt.show()

## Distribution of pH with Different Qualities of Wine

In [None]:
# Histogram of the pH column; displot creates its own figure with a single axes.
plt.style.use("ggplot")
ph_grid = sns.displot(dataset["pH"])
ph_ax = ph_grid.ax
ph_ax.set_title("DISTRIBUTION OF pH FOR DIFFERENT QUALITIES", fontsize=18)
ph_ax.set_xlabel("pH", fontsize=20)
ph_ax.set_ylabel("COUNT", fontsize=20)
plt.show()

# Skewness Correction

Here we will try to correct the skewness in some independent variables of our dataset.

In [None]:
def _plotSkewDiagnostics(columnName, titleSuffix):
    """Plot the column's histogram with a fitted normal curve beside its normal probability plot."""
    plt.figure(figsize=(20, 10))
    plt.subplot(1, 2, 1)
    # NOTE(review): distplot is deprecated in newer seaborn releases; kept so
    # the figures stay as they were.
    sns.distplot(dataset[columnName], fit=norm, color="orange")
    plt.title(columnName.upper() + titleSuffix, color="black")
    plt.subplot(1, 2, 2)
    stats.probplot(dataset[columnName], plot=plt)
    plt.show()


def skewnessCorrector(columnName):
    """Box-Cox transform ``dataset[columnName]`` in place and show before/after diagnostics.

    For the named column this prints the mean and standard deviation of a
    fitted normal distribution and draws a histogram plus probability plot,
    then overwrites the column in the global ``dataset`` with its Box-Cox
    transform and repeats the report on the transformed values.

    Parameters
    ----------
    columnName : str
        Name of a column of the global ``dataset``. ``scipy.stats.boxcox``
        only accepts strictly positive data.

    Returns
    -------
    None
        The function works through side effects: it prints, plots and
        mutates ``dataset``. Calling it twice on the same column transforms
        the already transformed values again.
    """
    print('''Before Correcting''')
    (mu, sigma) = norm.fit(dataset[columnName])
    print("Mu before correcting {} : {}, Sigma before correcting {} : {}".format(
        columnName.upper(), mu, columnName.upper(), sigma))
    _plotSkewDiagnostics(columnName, " Distplot before Skewness Correction")

    # boxcox returns (transformed values, fitted lambda); the lambda is unused.
    dataset[columnName], _fittedLambda = boxcox(dataset[columnName])

    print('''After Correcting''')
    # Re-fit on the transformed column; without this the "after" line would
    # repeat the pre-transform mu and sigma.
    (mu, sigma) = norm.fit(dataset[columnName])
    print("Mu after correcting {} : {}, Sigma after correcting {} : {}".format(
        columnName.upper(), mu, columnName.upper(), sigma))
    _plotSkewDiagnostics(columnName, " Distplot After Skewness Correction")


In [None]:
# Columns that are passed through the Box-Cox skewness correction, one by one.
skewColumnList = [
    'fixed acidity',
    'residual sugar',
    'free sulfur dioxide',
    'total sulfur dioxide',
    'sulphates',
]
for skewedColumn in skewColumnList:
    skewnessCorrector(skewedColumn)


# Outlier Correction


We have detected several outliers in our dataset; here we will try to correct them.


In [None]:
def detect_outliers(columns, data=None, min_outlier_columns=2, iqr_factor=1.5):
    """Return the index labels of rows that are IQR outliers in several columns.

    A value is an outlier for a column when it lies below
    ``Q1 - iqr_factor * IQR`` or above ``Q3 + iqr_factor * IQR`` of that
    column. A row is reported when it is an outlier in at least
    ``min_outlier_columns`` of the inspected columns.

    Parameters
    ----------
    columns : iterable of str
        Column names to inspect.
    data : pandas.DataFrame, optional
        Frame to inspect. Defaults to the global ``dataset``.
    min_outlier_columns : int, optional
        Minimum number of columns in which a row must be an outlier to be
        reported. The default of 2 matches the previous hard-coded
        ``count > 1.5`` test on integer counts.
    iqr_factor : float, optional
        Multiplier applied to the inter-quartile range to build the fences.

    Returns
    -------
    list
        Index labels of the reported rows, in order of first detection.
    """
    frame = dataset if data is None else data
    outlier_hits = Counter()

    for column in columns:
        values = frame[column]
        # First and third quartile in a single call.
        q1, q3 = np.percentile(values, [25, 75])
        outlier_step = (q3 - q1) * iqr_factor
        is_outlier = (values < q1 - outlier_step) | (values > q3 + outlier_step)
        # Count one hit per column in which the row falls outside the fences.
        outlier_hits.update(frame.index[is_outlier.to_numpy()])

    return [label for label, hits in outlier_hits.items()
            if hits >= min_outlier_columns]


In [None]:
# Rows flagged as multi-column outliers; the last column is left out of the check.
outlier_rows = dataset.loc[detect_outliers(dataset.columns[:-1])]
print("number of outliers detected --> ", len(outlier_rows))

In [None]:
# Show the rows flagged by detect_outliers over every column except the last one.
dataset.loc[detect_outliers(dataset.columns[:-1])]

Dropping Outliers

In [None]:
# Remove the flagged rows and renumber the index from zero.
outlier_labels = detect_outliers(dataset.columns[:-1])
dataset = dataset.drop(index=outlier_labels).reset_index(drop=True)

## 2) Using Pandas Profiling

In [None]:
!pip install pandas_profiling

In [None]:
# Build a profiling report for `dataset` as it stands at this point of the notebook.
ProfileReport(dataset)

# Data Preprocessing

- If the quality value is less than or equal to 6, the wine is assigned to class 0
- If the quality value is greater than 6, the wine is assigned to class 1

In [None]:
# Binarise the target: scores above 6 become 1, everything else becomes 0.
is_high_quality = dataset['quality'] > 6
dataset['quality'] = is_high_quality.astype(int)
dataset['quality'].value_counts()

In [None]:
# Every column but the last is a feature; the last column is the target.
feature_frame = dataset.iloc[:, :-1]
target_series = dataset.iloc[:, -1]
X = feature_frame.to_numpy()
y = target_series.to_numpy()

In [None]:
# Display the feature matrix.
X

In [None]:
# Display the target vector.
y

In [None]:
# Dimensions of the feature matrix.
X.shape

In [None]:
# Dimensions of the target vector.
y.shape

## Splitting Dataset into Training Set and Test Set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20 % of the rows as the test set; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1)


## Standardizing Independent Variables

In [None]:
from sklearn.preprocessing import StandardScaler
# Learn the scaling statistics from the training split only, then apply the
# same scaling to both splits.
sc = StandardScaler().fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)


# Training Classifiers on Training Set and drawing Inference

In [None]:
accuracy_scores = {}

# Human-readable name for every classifier key accepted by ``predictor``.
_CLASSIFIER_LABELS = {
    'lr': 'Logistic Regression',
    'svm': 'Support Vector Machine',
    'knn': 'K-Nearest Neighbours',
    'dt': 'Decision Tree Classifier',
    'nb': 'Naive Bayes Classifier',
    'rfc': 'Random Forest Classifier',
}


def _build_classifier(name, params):
    """Return an unfitted scikit-learn estimator for the key ``name``, built with ``params``."""
    if name not in _CLASSIFIER_LABELS:
        raise ValueError("Unknown predictor {!r}; expected one of {}".format(
            name, sorted(_CLASSIFIER_LABELS)))
    # The key is validated first so an unknown key fails before any import.
    if name == 'lr':
        from sklearn.linear_model import LogisticRegression as estimator_cls
    elif name == 'svm':
        from sklearn.svm import SVC as estimator_cls
    elif name == 'knn':
        from sklearn.neighbors import KNeighborsClassifier as estimator_cls
    elif name == 'dt':
        from sklearn.tree import DecisionTreeClassifier as estimator_cls
    elif name == 'nb':
        from sklearn.naive_bayes import GaussianNB as estimator_cls
    else:  # 'rfc'
        from sklearn.ensemble import RandomForestClassifier as estimator_cls
    return estimator_cls(**params)


def predictor(predictor, params):
    """Train one classifier on the training split and print its evaluation.

    Uses the notebook globals ``X_train``, ``y_train``, ``X_test``, ``y_test``
    and the fitted scaler ``sc``. The mean 10-fold cross-validation accuracy
    (in percent) is stored in the global ``accuracy_scores`` dict, keyed by
    the fitted estimator object.

    Parameters
    ----------
    predictor : str
        One of 'lr', 'svm', 'knn', 'dt', 'nb', 'rfc'.
    params : dict
        Keyword arguments forwarded to the estimator's constructor.

    Raises
    ------
    ValueError
        If ``predictor`` is not one of the supported keys.
    """
    from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
    from sklearn.model_selection import cross_val_score

    classifier = _build_classifier(predictor, params)
    print('Training {} on Training Set'.format(_CLASSIFIER_LABELS[predictor]))
    classifier.fit(X_train, y_train)

    print('''Predicting Single Cell Result''')
    # NOTE(review): these look like raw measurements; they only go through the
    # scaler here, while several training columns were Box-Cox transformed
    # before scaling. Confirm whether this sample needs the same transform.
    sample = [[7.4, 0.7, 0.0, 1.9, 0.076, 11.0, 34.0, 0.9978, 3.51, 0.56, 9.4]]
    single_predict = classifier.predict(sc.transform(sample))
    if single_predict[0] > 0:
        print('High Quality Wine')
    else:
        print('Low Quality Wine')

    print('''Prediciting Test Set Result''')
    y_pred = classifier.predict(X_test)
    # Predictions and ground truth side by side: column 0 = predicted, column 1 = actual.
    result = np.column_stack((y_pred, y_test))
    print(result, '\n')

    print('''Making Confusion Matrix''')
    # labels=[0, 1] guarantees a 2x2 matrix even if a class is missing.
    cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
    print(cm, '\n')
    # NOTE(review): plot_confusion_matrix is no longer available in newer
    # scikit-learn releases; confirm the installed version still provides it.
    plot_confusion_matrix(classifier, X_test, y_test, cmap="pink")
    # Rows are true classes and columns are predicted classes, so with class 1
    # as the positive class the flattened order is TN, FP, FN, TP.
    tn, fp, fn, tp = cm.ravel()
    print('True Positives :', tp)
    print('False Positives :', fp)
    print('False Negatives :', fn)
    print('True Negatives :', tn, '\n')

    print('''Classification Report''')
    print(classification_report(y_test, y_pred, labels=[0, 1],
          target_names=['0', '1'], zero_division=1))

    print('''Evaluating Model Performance''')
    accuracy = accuracy_score(y_test, y_pred)
    print(accuracy, '\n')

    print('''Applying K-Fold Cross validation''')
    accuracies = cross_val_score(
        estimator=classifier, X=X_train, y=y_train, cv=10)
    print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
    accuracy_scores[classifier] = accuracies.mean()*100
    print("Standard Deviation: {:.2f} %".format(accuracies.std()*100), '\n')


## Training Logistic Regression on Training Set

In [None]:
# Logistic regression with an L1 penalty, using the liblinear solver.
predictor('lr', {'penalty': 'l1', 'solver': 'liblinear'})


## Training SVM on Training Set

In [None]:
# Linear-kernel SVC.
# NOTE(review): gamma is presumably ignored by the linear kernel — confirm
# against the SVC documentation before tuning it.
predictor('svm', {'C': .5, 'gamma': 0.8,
          'kernel': 'linear', 'random_state': 0})


## Training Kernel SVM on Training Set

In [None]:
# RBF-kernel SVC.
predictor('svm', {'C': .25, 'gamma': 0.1, 'kernel': 'rbf', 'random_state': 0})


## Training K-Nearest Neighbours on Training Set

In [None]:
# K-nearest neighbours with 8 neighbours and distance-based weighting.
predictor('knn', {'algorithm': 'auto', 'n_jobs': 1,
          'n_neighbors': 8, 'weights': 'distance'})


## Training Decision Tree on Training Set

In [None]:
# Entropy-based decision tree. 'sqrt' is what the deprecated 'auto' alias
# stood for in tree classifiers, and it is accepted by current scikit-learn.
predictor('dt', {'criterion': 'entropy', 'max_features': 'sqrt',
          'splitter': 'best', 'random_state': 0})


## Training Naive Bayes on Training Set

In [None]:
# Gaussian naive Bayes with the estimator's default settings (no parameters passed).
predictor('nb', {})


## Training Random Forest Classifier on Training Set

In [None]:
# Random forest of 100 Gini trees, with log2 feature sub-sampling.
predictor('rfc', {'criterion': 'gini', 'max_features': 'log2', 'n_estimators': 100,'random_state':0})


### Finding which Classifier performed best

In [None]:
# The estimator whose mean cross-validation accuracy is the highest.
maxKey = max(accuracy_scores, key=accuracy_scores.get)
bestScore = accuracy_scores[maxKey]
print('The model with highest K-Fold Validation Accuracy score is  {0} with an accuracy of  {1:.2f}'.format(
    maxKey, bestScore))


## Plotting Bar Chart for Accuracies of different classifiers

In [None]:
model_accuracies = list(accuracy_scores.values())
model_names = ['LogisticRegression', 'SVC',
               'K-SVC', 'KNN', 'Decisiontree', 'GaussianNB', 'RandomForest']
# The labels are matched to the scores purely by position. Each predictor()
# call adds one entry to accuracy_scores, so re-running a training cell
# breaks the pairing; fail with a clear message instead.
assert len(model_accuracies) == len(model_names), (
    "accuracy_scores holds {} entries but {} labels are defined; re-run the "
    "predictor definition cell and each training cell exactly once".format(
        len(model_accuracies), len(model_names)))
plt.figure(figsize=(12, 6))
accuracy_ax = sns.barplot(x=model_accuracies, y=model_names, palette='mako')
accuracy_ax.set(xlabel='Mean 10-fold CV accuracy (%)', ylabel='Classifier');


# Summary
- Random Forest Classifier performed best on this data set with an accuracy of 90.81%
- K-Nearest Neighbours Classifier was just behind with an accuracy of 90.56%

# **Please give your feedback by commenting below.**