In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt 
import seaborn as sns
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        input_file_path = os.path.join(dirname, filename)
        print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Location of the red-wine quality CSV, relative to the notebook's working directory.
WINE_CSV_PATH = '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'

# Load the raw dataset into a DataFrame used by all following cells.
df_wine = pd.read_csv(WINE_CSV_PATH)

In [None]:
# Quick structural overview of the loaded frame: info() prints its summary,
# and describe() is kept as the cell's last expression so the notebook
# renders the summary-statistics table as the cell output.
df_wine.info()
df_wine.describe()

In [None]:
# Preview the first rows of the loaded frame to sanity-check columns and values.
df_wine.head()

In [None]:
### plot the distribution of each feature to identify features that might need scaling and/or normalization

for col in df_wine.columns:
    # One figure per column. The column name is attached to the figure as its
    # title and x-label (instead of being printed separately) so every plot
    # is self-describing when the notebook is skimmed.
    fig, ax = plt.subplots()
    ax.hist(df_wine[col])
    ax.set(title=col, xlabel=col, ylabel='count')
    plt.show()
    # Release the figure explicitly so figures do not accumulate across the loop.
    plt.close(fig)

In [None]:
### imbalanced dependent feature quality
# Count how many samples fall into each quality grade and print the tally.
quality_counts = df_wine['quality'].value_counts()
print(quality_counts)

In [None]:
### check correlation (spearman due to ordinal scale of quality)
# Pairwise Spearman correlation between all columns; corr_sp is reused by
# the later feature-selection cell.
corr_sp = df_wine.corr(method='spearman')

# Annotated heatmap of the correlation matrix on a 10x10 inch canvas.
plt.figure(figsize=(10, 10))
sns.heatmap(corr_sp, annot=True)

In [None]:
### visualize relation between dependent feature and independent features further
# Features to plot against quality, listed in the order the panels appear
# from top to bottom.
boxplot_features = [
    'alcohol',
    'sulphates',
    'pH',
    'density',
    'total sulfur dioxide',
    'free sulfur dioxide',
    'chlorides',
    'residual sugar',
    'citric acid',
    'volatile acidity',
    'fixed acidity',
]

# One stacked panel per feature.
fig, axs = plt.subplots(nrows=len(boxplot_features), figsize=(10, 20))

# Draw one boxplot of each feature grouped by quality grade, replacing the
# eleven hand-written calls with a single loop.
for ax, feature in zip(axs, boxplot_features):
    sns.boxplot(data=df_wine, x='quality', y=feature, ax=ax)

# Prevent axis labels of neighbouring panels from overlapping.
fig.tight_layout()

In [None]:
### drop features with low correlation to quality
# Features whose absolute Spearman correlation with quality is below this
# threshold are removed.
LOW_CORR_THRESHOLD = 0.1

# Select every column of the correlation matrix whose correlation with
# quality lies strictly inside (-threshold, threshold). NaN correlations
# compare as False and are therefore kept, matching the original loop.
quality_corr = corr_sp['quality']
low_corr_features = quality_corr.index[quality_corr.abs() < LOW_CORR_THRESHOLD]

# Drop all selected columns in one call. errors='ignore' keeps the cell
# re-runnable: corr_sp still lists columns that a previous run already
# removed from df_wine, which would otherwise raise a KeyError.
df_wine = df_wine.drop(columns=low_corr_features, errors='ignore')

In [None]:
### check for missing values
# Per-column count of missing entries; left as the final expression so the
# notebook displays the resulting Series.
df_wine.isna().sum()

In [None]:
### separate the target (quality) from the feature columns ahead of the train / test split
target_col = 'quality'
X = df_wine.drop(columns=[target_col])
y = df_wine[target_col]

In [None]:
# Hold out 33% of the samples for testing. The quality classes are imbalanced,
# so the split is stratified on y to keep every grade represented in both
# partitions in roughly its original proportion.
# NOTE(review): stratify requires at least two samples per class — confirm
# this holds for the rarest quality grades.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=44,
    shuffle=True,
    stratify=y,
)

In [None]:
### use SMOTE to balance classes
# Only the training split is resampled here; X_test / y_test are not touched
# by this cell. random_state is fixed so the resampling is repeatable.
# NOTE(review): resampling runs on the unscaled features (StandardScaler is
# applied in the following cell). SMOTE's neighbour search is presumably
# distance-based, so confirm that resampling before scaling is intended.
sm = SMOTE(random_state=42)
X_train_smote, y_train_smote = sm.fit_resample(X_train, y_train)

In [None]:
### standardize features
# Fit the scaler on the SMOTE-resampled training features, then apply it to
# that same data; the fitted scaler stays available under the name `scaler`.
scaler = StandardScaler().fit(X_train_smote)
scaled_X_train_smote = scaler.transform(X_train_smote)