# Red Wine Quality

## Importing the libraries

In [None]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from matplotlib import style
import missingno as msno
import seaborn as sns

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso, RidgeCV
from sklearn.metrics import mean_squared_log_error, mean_squared_error, r2_score, mean_absolute_error  # for regression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingRegressor, VotingClassifier
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.svm import LinearSVC, SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression

import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

## Importing the dataset

In [None]:
# Load the red-wine quality CSV from the Kaggle input directory into a DataFrame.
df = pd.read_csv('/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv')

In [None]:
# (rows, columns) of the loaded data.
df.shape

In [None]:
# Column labels of the dataset.
df.columns

In [None]:
# Per-column dtype and non-null count summary.
df.info()

In [None]:
# Count null/NaN entries per column (the notebook author reports there are none).
df.isnull().sum()

In [None]:
# Visual completeness check with missingno's matrix plot
# (the notebook author reports no missing values).
msno.matrix(df, color=(0, 0, 0))

## Visualize the distribution of variables, i.e. univariate analysis

In [None]:
df.describe(include='all')

In [None]:
sns.factorplot(data=df, kind='box', size=10, aspect=2.5)

In [None]:
df.hist(bins=10, figsize=(20, 20))
plt.show()

## Correlation between different features

In [None]:
# Correlation matrix of all columns.
cor_mat = df.corr()

# Explicit boolean mask hiding the strict upper triangle, so each pair is
# shown once (lower triangle plus diagonal). The previous mask reused the
# correlation values as truth values, which would un-hide any cell whose
# correlation happened to be exactly 0.
mask = np.triu(np.ones(cor_mat.shape, dtype=bool), k=1)

fig = plt.gcf()
fig.set_size_inches(30, 12)
sns.heatmap(data=cor_mat, mask=mask, square=True, annot=True, cbar=True)

**Inferences from the heat map**

* The quality of wine is highly related to volatile acidity (negative correlation).
* Also, the quality of wine is highly correlated with alcohol (positive correlation).
* pH and citric acid / fixed acidity are strongly inversely related, since more acidic solutions have lower pH values.


## How quality varies with different numeric features

In [None]:
def plot(feature_x, target='quality'):
    """Draw bar, violin and swarm plots of one feature grouped by the target.

    Reads the module-level ``df`` DataFrame and creates three separate
    seaborn figures, one per plot kind.

    Parameters
    ----------
    feature_x : str
        Column of ``df`` plotted on the y-axis.
    target : str, default 'quality'
        Column of ``df`` used for grouping on the x-axis.

    Returns
    -------
    None
    """
    # `factorplot(size=...)` was renamed to `catplot(height=...)` in
    # seaborn 0.9 and the old names were removed in later releases.
    for kind in ('bar', 'violin', 'swarm'):
        sns.catplot(x=target, y=feature_x, data=df,
                    kind=kind, height=5, aspect=1)

In [None]:
# Bar, violin and swarm plots of fixed acidity for each quality score.
plot('fixed acidity', 'quality')

In [None]:
# The same three plots for alcohol content.
plot('alcohol', 'quality')

## Modelling the data

In [None]:
bins = (2, 6.5, 8)
group_names = ['bad', 'good']
df['quality'] = pd.cut(df['quality'], bins=bins, labels=group_names)

In [None]:
label_quality = LabelEncoder()

In [None]:
df.quality = label_quality.fit_transform(df.quality)

In [None]:
X = df.drop('quality', axis=1)
y = df.quality

In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42)

In [None]:
# Candidate classifiers with default hyper-parameters, and their display
# names in the same order.
models = [LinearSVC(), SVC(kernel='rbf'), KNeighborsClassifier(), RandomForestClassifier(),
          DecisionTreeClassifier(), GradientBoostingClassifier(), GaussianNB()]
model_names = ['LinearSVM', 'rbfSVM', 'KNearestNeighbors', 'RandomForestClassifier', 'DecisionTree',
               'GradientBoostingClassifier', 'GaussianNB']

# Fit each classifier on the unscaled training features and record its
# accuracy on the held-out test set.
acc = []

for clf in models:
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    # scikit-learn metrics take (y_true, y_pred) in that order.
    acc.append(accuracy_score(y_test, pred))

# `models` is rebound from the estimator list to the column mapping that the
# next cell turns into a DataFrame.
models = {'Modelling Algo': model_names, 'Accuracy': acc}

In [None]:
models_df = pd.DataFrame(models)

In [None]:
models_df

In [None]:
sns.barplot(y='Modelling Algo', x='Accuracy', data=models_df)

## Feature Scaling

In [None]:
def feature_scaling(X_train, X_test, y_train, y_test, name_scaler):
    """Evaluate the default classifiers on pre-scaled data and store the accuracies.

    Fits a fresh instance of each of the seven classifiers on the given
    training split, scores it on the given test split, and writes the
    resulting accuracies into a new column of the module-level ``models_df``.
    The classifier order matches the ``model_names`` list used to build
    ``models_df``.

    Parameters
    ----------
    X_train, X_test : array-like
        Already-scaled training and test features.
    y_train, y_test : array-like
        Training and test targets.
    name_scaler : str
        Name of the ``models_df`` column that receives the accuracies.

    Returns
    -------
    None
        The result is stored as a side effect on ``models_df``.
    """
    models = [LinearSVC(), SVC(kernel='rbf'), KNeighborsClassifier(), RandomForestClassifier(),
              DecisionTreeClassifier(), GradientBoostingClassifier(), GaussianNB()]
    acc_sc = []
    for clf in models:
        clf.fit(X_train, y_train)
        pred = clf.predict(X_test)
        # scikit-learn metrics take (y_true, y_pred) in that order.
        acc_sc.append(accuracy_score(y_test, pred))
    models_df[name_scaler] = np.array(acc_sc)

In [None]:
# Compare the classifiers under each scaler.
# The data is split first and each scaler is fitted on the training features
# only, so neither the test rows nor the target column influence the scaling
# parameters (fitting on the whole frame leaked test information).
scalers = [MinMaxScaler(), StandardScaler()]
names = ['Acc_Min_Max_Scaler', 'Acc_Standard_Scaler']

# Select features by name instead of a hard-coded column slice.
X = df.drop('quality', axis=1)
y = df.quality
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42)

for scaler, name in zip(scalers, names):
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)
    feature_scaling(X_train, X_test, y_train, y_test, name)

In [None]:
models_df

In [None]:
sns.barplot(y='Modelling Algo', x='Accuracy', data=models_df)

In [None]:
sns.barplot(y='Modelling Algo', x='Acc_Min_Max_Scaler', data=models_df)

In [None]:
sns.barplot(y='Modelling Algo', x='Acc_Standard_Scaler', data=models_df)

In [None]:
# preparing the features by using a StandardScaler as it gave better results.
scaler = StandardScaler()
X = df.drop('quality', axis=1)
y = df.quality

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42)

In [None]:
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

## Parameter Tuning and Model Selection

In [None]:
# Accumulators for the final comparison: model label and its test-set accuracy.
model = []
test_accuracy = []

### K-Nearest Neighbors (KNN)

In [None]:
params_dict = {'n_neighbors': [i+1 for i in range(50)], 'n_jobs': [-1]}
knn_clf = GridSearchCV(estimator=KNeighborsClassifier(),
                       param_grid=params_dict, scoring='accuracy', cv=10)

knn_clf.fit(X_train, y_train)

In [None]:
knn_clf.best_params_

In [None]:
# the best accuracy obtained by Grid search on the train set.
knn_clf.best_score_

In [None]:
pred = knn_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred)

model.append('KNN')
test_accuracy.append(accuracy)
print("KNN Accuracy:", accuracy)

### SVM

In [None]:
params_dict = {'C': [0.98, 1.0, 1.2, 1.5, 2.0, 5.0], 'gamma': [
    0.50, 0.60, 0.70, 0.80, 0.90, 1.00], 'kernel': ['linear', 'rbf']}
svm_clf = GridSearchCV(
    estimator=SVC(), param_grid=params_dict, scoring='accuracy', cv=10)

svm_clf.fit(X_train, y_train)

In [None]:
svm_clf.best_params_

In [None]:
svm_clf.best_score_

In [None]:
pred = svm_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred)

model.append('SVM')
test_accuracy.append(accuracy)
print("SVM Accuracy:", accuracy)

### Decision Tree

In [None]:
param_dict = {'criterion': ['gini', 'entropy'], 'max_depth': [
    4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 20, 30, 40, 50, 70, 90, 120, 150]}
dt_clf = GridSearchCV(DecisionTreeClassifier(),
                      param_grid=param_dict, scoring='accuracy', cv=10)

dt_clf.fit(X_train, y_train)

In [None]:
dt_clf.best_params_

In [None]:
dt_clf.best_score_

In [None]:
pred = dt_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred)

model.append('Decision Tree')
test_accuracy.append(accuracy)
print("Decision Tree Accuracy:", accuracy)

### Random Forest

In [None]:
params_dict = {'n_estimators': [100, 200, 300, 400,
                                500], 'max_features': ['auto', 'sqrt', 'log2']}
rf_clf = GridSearchCV(estimator=RandomForestClassifier(
    n_jobs=-1), param_grid=params_dict, scoring='accuracy', cv=10)

rf_clf.fit(X_train, y_train)

In [None]:
rf_clf.best_params_

In [None]:
rf_clf.best_score_

In [None]:
pred = rf_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred)

model.append('Random Forest')
test_accuracy.append(accuracy)
print("Random Forest Accuracy:", accuracy)

### Gradient Boosting

In [None]:
gb_clf = GridSearchCV(estimator=GradientBoostingClassifier(),
                      cv=10, param_grid=dict({'n_estimators': [100, 200, 300, 400, 500, 600, 700]}))

gb_clf.fit(X_train, y_train)

In [None]:
gb_clf.best_params_

In [None]:
gb_clf.best_score_

In [None]:
pred = gb_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred)

model.append('Gradient Boosting')
test_accuracy.append(accuracy)
print("Gradient Boosting Accuracy:", accuracy)

### Adaboost Classifier

In [None]:
param_dict = {'n_estimators': list(range(1, 201, 20))}

adaboost_clf = GridSearchCV(AdaBoostClassifier(DecisionTreeClassifier(
    criterion='gini', max_depth=1000),  algorithm='SAMME.R'),
    param_grid=param_dict)

adaboost_clf.fit(X_train, y_train)

In [None]:
adaboost_clf.best_params_

In [None]:
adaboost_clf.best_score_

In [None]:
pred = adaboost_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred)

model.append('Adaboost')
test_accuracy.append(accuracy)
print("Adaboost Accuracy:", accuracy)

### Voting Classifier

#### Hard Voting

In [None]:
h_voting_clf = VotingClassifier([
    ('logistic regression', LogisticRegression(C=0.1, penalty='l2')),
    ('knn', KNeighborsClassifier(n_jobs=-1, n_neighbors=4)),
    ('svm', SVC(C=2.0, gamma=0.7, kernel='rbf')),
    ('random forests', RandomForestClassifier(
        max_features='auto', n_estimators=300)),
    ('gradient boosting', GradientBoostingClassifier(n_estimators=600)),
], voting='hard')

In [None]:
h_voting_clf.fit(X_train, y_train)

In [None]:
h_voting_clf.score(X_train, y_train)

In [None]:
pred = h_voting_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred)

model.append('Hard Voting')
test_accuracy.append(accuracy)
print("Hard Voting Accuracy:", accuracy)

#### Soft Voting

In [None]:
s_voting_clf = VotingClassifier([
    ('logistic regression', LogisticRegression(C=0.1, penalty='l2')),
    ('knn', KNeighborsClassifier(n_jobs=-1, n_neighbors=4)),
    ('svm', SVC(C=2.0, gamma=0.7, kernel='rbf', probability=True)),
    ('random forests', RandomForestClassifier(
        max_features='auto', n_estimators=300)),
    ('gradient boosting', GradientBoostingClassifier(n_estimators=600)),
], voting='soft')

In [None]:
s_voting_clf.fit(X_train, y_train)

In [None]:
s_voting_clf.score(X_train, y_train)

In [None]:
pred = s_voting_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred)

model.append('Soft Voting')
test_accuracy.append(accuracy)
print("Soft Voting Accuracy:", accuracy)

## Final Evaluation

In [None]:
final = pd.DataFrame({
    'Model': model,
    'Test Accuracy': test_accuracy
})

In [None]:
final

## So finally we select *Hard Voting Classifier*!