# Load data & basic checks

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the red-wine quality table and preview its first rows.
csv_path = "../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv"
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Column names, dtypes and non-null counts of the loaded table
df.info()

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) per numeric column
df.describe()

In [None]:
# Distinct values present in the target column
df["quality"].unique()

# Data exploration

In [None]:
import matplotlib.pylab as plt
import seaborn as sns

In [None]:
from sklearn.preprocessing import LabelEncoder

# Map the raw quality grades onto consecutive integer class labels (0 .. n_classes-1)
le = LabelEncoder().fit(df["quality"])
df["quality"] = le.transform(df["quality"])

In [None]:
# Separate the predictors from the target; df itself is left untouched
y = df["quality"].copy()
X = df.drop(columns=["quality"])

## Distribution of target variable (Quality) - Oversampling

In [None]:
# Number of wines in each (label-encoded) quality class
sns.countplot(x="quality", data=df)

Because the classes are heavily imbalanced and the dataset is small, we oversample the minority classes.

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority classes with SMOTE.
# Keys are the label-encoded quality classes; values are the number of
# samples each of those classes should have after resampling.
strategy = {0: 200, 1: 300, 4: 400, 5: 250}

# SMOTE draws random neighbours when it synthesises rows. Without a seed the
# resampled data (and every score computed from it) changes on each run, so
# pin it to the same seed the train/test split uses.
oversample = SMOTE(sampling_strategy=strategy, random_state=29)
X, y = oversample.fit_resample(X, y)

# NOTE(review): resampling is done on the full dataset, i.e. before any
# train/test split. Synthetic rows interpolated from future test rows can end
# up in the training set, which tends to inflate scores. Consider resampling
# the training portion only.

# Rebuild df so the plots below show the resampled data
df = pd.concat([X, y], axis=1)

In [None]:
# Class counts again, now on the resampled df
sns.countplot(x="quality", data=df)

## Mutual information classification

In [None]:
from sklearn.feature_selection import mutual_info_classif as MIC

def make_mi_scores(X, y, random_state=0):
    """Compute, plot and return mutual-information scores of features vs. target.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table; its column names label the scores.
    y : array-like
        Class labels, one per row of ``X``.
    random_state : int or None, default 0
        Seed forwarded to ``mutual_info_classif``. The estimator perturbs
        continuous features with a little random noise, so unseeded scores
        differ from call to call and rankings cannot be compared between
        cells. Pass ``None`` to get the unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        One column named "MI Scores", indexed by feature name and sorted
        from highest to lowest score.
    """
    scores = MIC(X, y, random_state=random_state)
    ranked = (
        pd.Series(scores, index=X.columns, name="MI Scores")
        .sort_values(ascending=False)
        .to_frame()
    )

    # Horizontal bar chart, one bar per feature in ranked order
    sns.barplot(x="MI Scores", y=ranked.index, data=ranked)
    plt.show()

    return ranked

In [None]:
# Rank the current feature columns by mutual information with quality
make_mi_scores(X, y)

* Features with high MI scores such as alcohol, volatile acidity and sulphates are worth visualizing

## Alcohol -- Quality

In [None]:
# Alcohol value of every wine, grouped by quality class
sns.stripplot(x="quality", y="alcohol", data=df)

## Sulphates -- Quality

In [None]:
# Sulphates value of every wine, grouped by quality class
sns.stripplot(x="quality", y="sulphates", data=df)

## Volatile acidity -- Quality

In [None]:
# Volatile acidity value of every wine, grouped by quality class
sns.stripplot(x="quality", y="volatile acidity", data=df)

# Feature engineering

**Domain knowledge**: Consider the following components: 
* acidity, 
* sulfur dioxide and sulphates
* sugar/sweetness, 
* alcohol

## Principal Component Analysis

In [None]:
from sklearn.decomposition import PCA

def apply_pca(X, suffix="", standardize=True):
    """Fit a full PCA on a DataFrame and return the fit, scores and loadings.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature table.
    suffix : str, default ""
        Text appended to every component name ("PC1_<suffix>", "PC2_<suffix>", ...).
    standardize : bool, default True
        If True, centre every column on its mean and divide by its sample
        standard deviation before fitting.

    Returns
    -------
    pca : PCA
        The fitted estimator.
    X_pca : pandas.DataFrame
        Component scores, one column per component, carrying the same row
        index as ``X`` so they can be joined back onto it.
    loadings : pandas.DataFrame
        Rows are the original columns, columns are the components.
    """
    if standardize:
        # A constant column has zero std; dividing by it would turn the whole
        # column into NaN and make the PCA fit fail. Scale such columns by 1
        # instead, which leaves them all-zero after centring.
        scale = X.std(axis=0).replace(0, 1)
        X = (X - X.mean(axis=0)) / scale

    pca = PCA()
    scores = pca.fit_transform(X)

    component_names = [f"PC{i+1}_{suffix}" for i in range(scores.shape[1])]

    # Reuse X's index: a fresh RangeIndex would misalign the scores with X
    # whenever X is a filtered or shuffled frame.
    X_pca = pd.DataFrame(scores, columns=component_names, index=X.index)

    # components_ is (n_components, n_features); transpose to feature rows
    loadings = pd.DataFrame(
        pca.components_.T,
        columns=component_names,
        index=X.columns,
    )

    return pca, X_pca, loadings

def plot_variance(pca, width=8, dpi=100):
    """Plot per-component and cumulative explained variance of a fitted PCA.

    Parameters
    ----------
    pca : fitted PCA
        Must expose ``n_components_`` and ``explained_variance_ratio_``.
    width : float, default 8
        Figure width in inches.
    dpi : int, default 100
        Figure resolution.

    Returns
    -------
    axs : sequence of two Axes
        ``axs[0]`` is the bar chart of explained variance per component,
        ``axs[1]`` the cumulative curve.
    """
    fig, axs = plt.subplots(1, 2)
    n = pca.n_components_
    grid = np.arange(1, n + 1)

    # Explained variance per component
    evr = pca.explained_variance_ratio_
    axs[0].bar(grid, evr)
    axs[0].set(
        xlabel="Component", title="% Explained Variance", ylim=(0.0, 1.0)
    )

    # Cumulative variance; a leading (0, 0) point anchors the curve at the origin
    cv = np.cumsum(evr)
    axs[1].plot(np.r_[0, grid], np.r_[0, cv], "o-")
    axs[1].set(
        xlabel="Component", title="% Cumulative Variance", ylim=(0.0, 1.0)
    )

    # Honour the caller's size and resolution (previously hard-coded to 8 / 100)
    fig.set(figwidth=width, dpi=dpi)

    return axs

### PCA with Acidity measures

In [None]:
# Columns treated as the acidity group
features = ["fixed acidity", "volatile acidity", "pH", "citric acid"]

# PCA on those columns only (standardised by default); component names end in "_acidity"
pca, X_pca_acidity, loadings = apply_pca(X[features], suffix="acidity")

# Per-component and cumulative explained variance
plot_variance(pca)

In [None]:
# Weight of each acidity column in each principal component
loadings

In [None]:
# Candidate features built as sums of column pairs that, per the loadings
# table, share a sign within a component.
# NOTE(review): the sums use the raw columns, while the loadings come from
# standardised data -- confirm the differing scales are acceptable.

# PC1, fixed acidity and citric acid - same sign
X["acidity_1"] = X["fixed acidity"] + X["citric acid"]

# PC1, volatile acidity and pH - same sign
X["acidity_2"] = X["volatile acidity"] + X["pH"]

# PC2, fixed acidity and volatile acidity - same sign
X["acidity_3"] = X["fixed acidity"] + X["volatile acidity"]

# Re-rank all columns, now including the three new ones
make_mi_scores(X, y)

According to the MI Scores chart, we will be using acidity_2 as a feature

In [None]:
# acidity_2 per quality class; X and y are joined on the fly because df does not hold the engineered columns
sns.stripplot(x="quality", y="acidity_2", data=pd.concat([X, y], axis=1))

### PCA with sulfur dioxide and sulphates

In [None]:
# Columns treated as the sulfur group
features = ["total sulfur dioxide", "free sulfur dioxide", "sulphates"]

# PCA on those columns only (standardised by default); component names end in "_sulfur"
pca, X_pca_sulfur, loadings = apply_pca(X[features], suffix="sulfur")

# Per-component and cumulative explained variance
plot_variance(pca)

In [None]:
# Weight of each sulfur column in each principal component
loadings

In [None]:
# in PC1, total and free sulfur dioxide, same sign -> combine them as a sum
X["sulfur_1"] = X["total sulfur dioxide"] + X["free sulfur dioxide"]

# Re-rank all columns, now including sulfur_1
make_mi_scores(X, y)

In [None]:
# sulfur_1 per quality class; X and y are joined on the fly because df does not hold the engineered columns
sns.stripplot(x="quality", y="sulfur_1", data=pd.concat([X, y], axis=1))

### PCA with alcohol and sugar

In [None]:
# Columns treated as the alcohol/sugar group
features = ["density", "alcohol", "residual sugar", "chlorides"]

# PCA on those columns only (standardised by default); component names end in "_alcohol_sugar"
pca, X_pca_alcohol_and_sugar, loadings = apply_pca(X[features], suffix="alcohol_sugar")

# Per-component and cumulative explained variance
plot_variance(pca)

In [None]:
# Weight of each alcohol/sugar column in each principal component
loadings

In [None]:
# Ratio of density to alcohol
X["feature_1"] = X["density"] / X["alcohol"]
# Sum of residual sugar and chlorides (raw, unstandardised values)
X["feature_2"] = X["residual sugar"] + X["chlorides"]
# Re-rank all columns, now including feature_1 and feature_2
make_mi_scores(X, y)

# Modeling

## Feature selection

In [None]:
# MI ranking over the original plus all engineered columns (same call as the one that closes the previous code cell)
make_mi_scores(X, y)

## Train test split

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
# NOTE(review): X and y were oversampled with SMOTE before this split, so
# synthetic rows interpolated from held-out rows may sit in the training set
# -- confirm this is acceptable for the reported scores.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=29)

## Modeling

In [None]:
from xgboost import XGBClassifier as XGBC
from sklearn.model_selection import GridSearchCV

# XGBoost classifier wrapped in a grid search with 5-fold cross-validation.
# The grid currently holds a single parameter combination.
param_grid = {"learning_rate": [0.05], "n_estimators": [500]}
xgb_model = XGBC(use_label_encoder=False)
grid = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=5)

grid.fit(X_train, y_train)

# Mean cross-validated score of the best combination (features: original + engineered)
grid.best_score_

In [None]:
# Baseline data: df holds the oversampled rows without the engineered columns,
# so rebuilding X and y from it drops every feature added above.
y = df["quality"].copy()
X = df.drop(columns=["quality"])

# Same split settings as before so the two runs are comparable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=29
)

In [None]:
# Refit the same grid search on the baseline features; compare this score with the one from the engineered-feature run
grid.fit(X_train, y_train)
grid.best_score_