In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input mount,
# then print the paths one per line.
input_files = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file in input_files:
    print(input_file)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

## Load The Dataset

After loading the dataset we see that it has 12 columns: 11 chemical parameters of the wine plus the **quality** score. In this project I try to classify the quality of the wine. A basic analysis shows that there are no missing values in the dataset. All columns have a float data type except the **quality** column. Then I continue to the EDA.

In [None]:
# Read the red-wine CSV into a DataFrame and display it as the cell output.
path = '../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
rawData = pd.read_csv(filepath_or_buffer=path)
rawData

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
rawData.describe()

In [None]:
# Number of missing values per column.
rawData.isnull().sum()

In [None]:
# Column dtypes, non-null counts and memory usage of the raw frame.
rawData.info()

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Work on copies so the raw frame loaded above is never modified.
data = rawData.copy()
data1 = data.drop('quality', axis=1)  # predictors only

# Rescale every column to the [0, 1] range.
# NOTE(review): the scaler is fit on all rows before the train/test split that
# follows, so the test rows influence the scaling. Fitting on the training rows
# only would require splitting first.
scaler = MinMaxScaler()

# Pass the column Index itself. Wrapping it in a list (`[rawData.columns]`)
# makes pandas build a one-level MultiIndex, which turns every column label
# into a tuple such as ('alcohol',) in later loops, plots and printed output.
dataScaled = pd.DataFrame(
    scaler.fit_transform(data),
    columns=rawData.columns,
    index=rawData.index,  # keep rows aligned with rawData['quality']
)

# Scaled predictors: everything except the target column.
dataST = dataScaled.drop('quality', axis=1)
dataST

In [None]:
# Predictors are the scaled features; the target is the original quality score.
X, y = dataST, rawData['quality']

# Hold out 20% of the rows as the test set.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Split a further 20% of the training rows off as a validation set.
xTrain, xVal, yTrain, yVal = train_test_split(
    x_train, y_train, test_size=0.2, random_state=0
)

## Exploratory Data Analysis

I broke the exploratory data analysis down into two parts: univariate analysis and bivariate analysis.

##  Univariate Analysis

In the univariate analysis I try to understand the distribution of each feature, both visually and with a statistical test. I visualize each feature with a histogram to see the pattern of the data. At a glance the data look normally distributed, but I need to validate this visual impression with a statistical test. I use the Shapiro–Wilk test to check the distribution of each feature. The Shapiro–Wilk test shows that none of the features is normally distributed.



In [None]:
def histPlot(col, data=None):
    """Draw and show a histogram of one column.

    Parameters
    ----------
    col : str
        Name of the column to plot; also used as the x-axis label.
    data : pandas.DataFrame, optional
        Frame that contains ``col``. When omitted, the notebook-level
        ``rawData`` frame is used, which matches the original behaviour.
    """
    if data is None:
        # Looked up at call time so existing calls `histPlot(col)` keep working.
        data = rawData
    ax = sns.histplot(x=col, data=data)
    ax.set_xlabel(col)
    ax.set_ylabel('Frequency')
    plt.show()

In [None]:
# One histogram per column of the raw frame (iterating a DataFrame yields its column labels).
for col in rawData:
    histPlot(col)

In [None]:
# One boxplot per column of the raw frame, laid out on a 3x5 grid.
colName = list(rawData.columns)
fig, ax = plt.subplots(3, 5, figsize=(25, 15))

# `ax.flat` walks the grid row by row, so no hand-built (row, col) index list
# and no hard-coded column count are needed.
for axis, name in zip(ax.flat, colName):
    sns.boxplot(ax=axis, y=name, data=rawData)

# Hide the grid cells that have no column to show instead of leaving empty axes.
for axis in ax.flat[len(colName):]:
    axis.set_visible(False)
    

In [None]:
# Count how many wines received each quality score and draw the counts as bars.
xquality = rawData['quality'].value_counts()
sns.barplot(x=xquality.index, y=xquality)

In [None]:
## Shapiro-Wilk test

from scipy import stats

# Run the test on every scaled predictor and report the verdict at the 0.05 level.
for i in dataST.columns:
    stat, p = stats.shapiro(dataST[i])
    template = (
        '{} feature has normal distribution (p ={})'
        if p > 0.05
        else '{} feature has not normal distribution (p = {})'
    )
    print(template.format(i, p))

## Bivariate Analysis

In the bivariate analysis I try to understand the pattern and correlation between each predictor and the target variable (**quality**). From the boxplots we can see that some predictors, such as **alcohol, sulphates, and citric acid**, have a positive correlation with the target variable. From the literature we know that **sulphate and alcohol** appear in the fermentation process, so it makes sense that **sulphate and alcohol** have a positive correlation with the quality of the wine. The sulphate and alcohol values indicate the age of the wine.

The heatmap shows the correlation between each pair of features. Some features are noticeably correlated with one another, but none of these correlations is very strong.

In [None]:
# One boxplot per scaled predictor, grouped by quality score, on a 3x5 grid.
colName1 = list(dataST.columns)

# Read the target straight from the raw frame rather than from the notebook-level
# `y`: a later cell rebinds `y` to a binary label, so re-running this cell after
# that point would otherwise group by the wrong variable.
qualityScore = rawData['quality']

fig, ax1 = plt.subplots(3, 5, figsize=(25, 15))

# `ax1.flat` walks the grid row by row, so no hand-built (row, col) index list
# and no hard-coded predictor count are needed.
for axis, name in zip(ax1.flat, colName1):
    sns.boxplot(ax=axis, y=dataST[name], x=qualityScore)

# Hide the grid cells that have no predictor to show.
for axis in ax1.flat[len(colName1):]:
    axis.set_visible(False)

In [None]:
# Pairwise correlation between the scaled predictors, drawn as an annotated heatmap.
corr = dataST.corr()
plt.figure(figsize=(10, 8))
sns.heatmap(data=corr, annot=True)

## Feature Selection

I use the **feature importances** from an extra-trees model to select the features.

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Extra-trees is a randomized ensemble. Fix the seed so the importance ranking,
# and therefore the five features picked from it, is the same on every run.
model = ExtraTreesClassifier(random_state=0)
model.fit(x_train, y_train)

# Label each importance with its feature name and plot the five largest.
featImportance = pd.Series(model.feature_importances_, index=x_train.columns)
featImportance.nlargest(5).plot(kind='barh')

In [None]:
# Binarize the target: quality >= 7 becomes 1, everything else becomes 0.
# Derive it from the raw column instead of from `y` itself so the cell is
# idempotent: re-applying the threshold to an already-binary `y` would turn
# every label into 0.
y = (rawData['quality'] >= 7).astype(int)
y.value_counts()

In [None]:
# Keep only the five predictors chosen in the feature-selection step.
X = dataST[['alcohol', 'total sulfur dioxide', 'sulphates', 'volatile acidity', 'density']]

# The binary target is imbalanced, so stratify the split to keep the same
# class proportions in the training and test sets.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)



## Oversampling

Because the dataset we use is imbalanced, we need to balance it to avoid bias when predicting the label. I use oversampling with SMOTE to balance the data. It generates synthetic samples, which lets us avoid that bias.

In [None]:
from imblearn.over_sampling import SMOTE

# SMOTE draws its synthetic samples at random. Fix the seed so the resampled
# training set, and every score computed from it, is reproducible.
oversample = SMOTE(random_state=0)

# Only the training rows are resampled; the test set is left untouched.
x_train, y_train = oversample.fit_resample(x_train, y_train)
y_train.value_counts()

##  Building Model

In this process I build three different models: **KNN, Logistic Regression and SVM**. From the model-building results, we can conclude that **KNN** has the best performance among the models.

In [None]:
from sklearn.model_selection import GridSearchCV , cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.neighbors import KNeighborsClassifier

# Baseline: k-nearest neighbours with the library's default settings,
# fitted on the training set and scored on the held-out test set.
knn = KNeighborsClassifier()
knn_model = knn.fit(x_train, y_train)
y_pred = knn_model.predict(x_test)
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'accuracy score of KNN :{acc}')

In [None]:
# Candidate neighbourhood sizes, searched with 10-fold cross-validation.
# NOTE(review): x_train was replaced by the SMOTE-resampled data in an earlier
# cell, so the folds presumably contain synthetic samples and the CV scores may
# be optimistic. Confirm whether resampling inside each fold is wanted.
knn_params = {'n_neighbors': [2, 3, 5, 7, 9]}
knn_cv = GridSearchCV(estimator=knn, param_grid=knn_params, cv=10)
knn_cv.fit(x_train, y_train)

In [None]:
# Show the parameter combination that scored best in the grid search.
print('Best Parameters:', knn_cv.best_params_)

In [None]:
# Refit KNN with the hyper-parameters the grid search actually selected rather
# than a hard-coded n_neighbors=2, so this cell stays in sync with the search
# result if the data or the candidate grid changes.
knn = KNeighborsClassifier(**knn_cv.best_params_)
opt_knn = knn.fit(x_train, y_train)
y_predopt = opt_knn.predict(x_test)
accuracy_score(y_test, y_predopt)

In [None]:
# Per-class precision, recall and F1 of the tuned KNN on the test set.
print(classification_report(y_test, y_predopt))

In [None]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with default settings; `fit` returns the estimator itself,
# so construction and training are chained.
modelLog = LogisticRegression().fit(x_train, y_train)
predLog = modelLog.predict(x_test)
accuracylog = accuracy_score(y_true=y_test, y_pred=predLog)
accuracylog

In [None]:
# Per-class precision, recall and F1 of the logistic-regression model on the test set.
print(classification_report(y_test, predLog))

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with a polynomial kernel; `fit` returns the
# estimator itself, so construction and training are chained.
modelsvm = SVC(kernel='poly').fit(x_train, y_train)
predsvm = modelsvm.predict(x_test)
accsvm = accuracy_score(y_true=y_test, y_pred=predsvm)
accsvm

In [None]:
# Per-class precision, recall and F1 of the SVM on the test set.
print(classification_report(y_test, predsvm))