In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively list every file Kaggle mounted under the input directory,
# printing one full path per line.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

1. Overview

2. Data Preprocessing

    - 2.1 Defining "Quality" Logic
    - 2.2 Encoding categorical variables
    - 2.3 Standardisation

3. Classification Model

    - 3.1 Sample Model
    - 3.2 Tuning model

    


# 1. Overview

*Input Variables:*

**fixed acidity**: most acids involved with wine are fixed or nonvolatile

**volatile acidity**: the amount of acetic acid in wine

**citric acid**: found in small quantities, citric acid can add 'freshness' and flavor to wines

**residual sugar**: the amount of sugar remaining after fermentation stops

**chlorides**: the amount of salt in the wine

**free sulfur dioxide**: the free form of SO2 exists in equilibrium between molecular SO2 (as a dissolved gas) and bisulfite ion

**total sulfur dioxide**: amount of free and bound forms of SO2

**density**: the density of wine is close to that of water, depending on the percent alcohol and sugar content

**pH**: describes how acidic or basic a wine is on a scale from 0 (very acidic) to 14 (very basic)

**sulphates**: a wine additive which can contribute to sulfur dioxide gas (SO2) levels

**alcohol**: the percent alcohol content of the wine


*Output Variable:*

**quality**: target variable (a score between 0 and 10; the expected model output is 'good' / 'bad')

# 2. Data Preprocessing

In [None]:
import numpy as np
import pandas as pd

# Location of the red-wine CSV inside the Kaggle input directory
WINE_CSV_PATH = "../input/red-wine-quality-cortez-et-al-2009/winequality-red.csv"

# Read the dataset into a DataFrame and preview its first rows
df = pd.read_csv(WINE_CSV_PATH)
df.head()

Understanding the data

In [None]:
# Show the dtype pandas assigned to each column
df.dtypes

In [None]:
# Print a concise summary of the frame (columns, non-null counts, dtypes)
df.info()

In [None]:
# (number of rows, number of columns)
df.shape

In [None]:
# Number of missing (null) values in each column
df.isnull().sum()

In [None]:
# Boolean flag per column label: True where the label itself is missing
df.columns.isna()


In [None]:
# Per-column count of cells equal to the literal string ' ?'
# NOTE(review): presumably ' ?' is meant as a missing-value placeholder — confirm this dataset actually uses it
df.isin([' ?']).sum()

# 2.1 Defining "Quality" Logic

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set()

# Columns to inspect; add names here to draw further box plots side by side.
cols = ['quality']

# One panel per column, laid out in a single row and sized per panel, instead of
# placing plots by a manual counter into a fixed 4x3 grid on a 15x20 canvas.
# squeeze=False keeps `axes` two-dimensional even when there is only one column.
fig, axes = plt.subplots(1, len(cols), figsize=(5 * len(cols), 5), squeeze=False)
for ax, col in zip(axes.ravel(), cols):
    sns.boxplot(data=df, y=col, ax=ax)
    ax.set_title(f"Distribution of {col}")
plt.show()

Based on this box plot, we take scores at or above the 75th percentile as the definition of 'good', so

Assuming:

-quality >= 7.0 is GOOD

-quality < 7.0 is BAD

In [None]:
# Lowest score that is labelled 'good'; anything below it is labelled 'bad'.
GOOD_QUALITY_THRESHOLD = 7

# Convert only while the column still holds numeric scores. Without this guard,
# re-running the cell right after the conversion would compare the 'good'/'bad'
# strings against an integer and raise a TypeError.
if pd.api.types.is_numeric_dtype(df['quality']):
    df['quality'] = [
        'good' if score >= GOOD_QUALITY_THRESHOLD else 'bad'
        for score in df['quality']
    ]


Checking all data types again

In [None]:
# Re-print the frame summary after `quality` was replaced by 'good' / 'bad' labels
df.info()

# 2.2 Encoding categorical variables

We need to convert features which contain strings to numerical values. 

This is required by most model algorithms.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Isolate the object-dtype (text) columns; these are the ones that need encoding.
categorical_df = df.select_dtypes(include=['object'])

# Replace each text column with integer codes. The same encoder object is
# re-fitted on every column in turn.
# NOTE(review): LabelEncoder presumably numbers labels in sorted order, which
# would give 'bad' -> 0 and 'good' -> 1 — verify with enc.classes_.
enc = LabelEncoder()
categorical_df = categorical_df.apply(enc.fit_transform)

# Swap the original text columns for their encoded versions and preview the result.
df = pd.concat([df.drop(categorical_df.columns, axis=1), categorical_df], axis=1)
df.head()



# 2.3 Standardisation

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Separate the feature columns from the target column.
X = df.drop('quality', axis=1)
y = df['quality']

# Fit the scaler on the training rows only, so that the mean and standard
# deviation of the held-out test rows do not leak into preprocessing.
# test_size and random_state must mirror the train_test_split call in
# section 3.1: for a given row count and seed the shuffle picks the same rows,
# so `train_rows` holds exactly the rows that later become X_train.
train_rows, _held_out_rows = train_test_split(
    np.arange(len(X)), test_size=0.3, random_state=1
)

scaler = StandardScaler()
scaler.fit(X.iloc[train_rows])

# Apply the training-set statistics to every row; X becomes the scaled array.
X = scaler.transform(X)

# 3.  Classification Model

# 3.1 Sample Model

Split Data

In [None]:
from sklearn.model_selection import (
    GridSearchCV,
    StratifiedKFold,
    cross_val_score,
    learning_curve,
    train_test_split,
)

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=1, test_size=0.3
)

# Five-fold stratified splitter, referenced by the grid-search code in section 3.2.
kfold = StratifiedKFold(n_splits=5)


Using default model:

1. Logistic Regression

2. SVM

3. Decision Tree

4. Random Forest

Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Baseline 1: logistic regression with the library's default settings.
def_lr = LogisticRegression()
def_lr.fit(X_train, y_train)

# Score the model on the held-out test rows.
lr_pred = def_lr.predict(X_test)
lr_accuracy = accuracy_score(y_test, lr_pred)
print("Logistic Regression accuracy: ", lr_accuracy)


SVM

In [None]:
from sklearn import svm
from sklearn.metrics import accuracy_score

# Baseline 2: support vector classifier with the library's default settings.
def_svm = svm.SVC()
def_svm.fit(X_train, y_train)

# Score the model on the held-out test rows.
svm_pred = def_svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_pred)
print("SVM accuracy: ", svm_accuracy)


Decision Tree

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Baseline 3: decision tree with default hyper-parameters.
# The seed is fixed (same value as the train/test split) because tree
# construction involves randomness; without it the accuracy used to compare
# the models can differ from one run to the next.
def_dt = DecisionTreeClassifier(random_state=1)
def_dt.fit(X_train, y_train)

# Score the model on the held-out test rows.
dt_pred = def_dt.predict(X_test)
print("Decision Tree accuracy", accuracy_score(y_test, dt_pred))
  
    


Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Baseline 4: random forest with default hyper-parameters.
# The seed is fixed (same value as the train/test split) because forest
# training is randomised; without it the accuracy used to pick the best model
# is not reproducible across runs.
def_rf = RandomForestClassifier(random_state=1)
def_rf.fit(X_train, y_train)

# Score the model on the held-out test rows.
rf_pred = def_rf.predict(X_test)
print("Random Forests accuracy", accuracy_score(y_test, rf_pred))


# 3.2 Tuning model

Random Forest achieved the best accuracy, so we will tune the Random Forest model.

In [None]:
# The exhaustive search below is slow (375 parameter combinations, each
# cross-validated), so it is switched off by default. Set the flag to True to
# re-run it; the parameters used for the final model are hard-coded in the
# next cell.
RUN_GRID_SEARCH = False

# Hyper-parameter grid explored for the random forest.
gs_grid = {"max_depth": [None],
           "max_features": [1, 3, 6, 8, 10],
           "min_samples_split": [2, 3, 6, 8, 10],
           "min_samples_leaf": [1, 3, 6, 8, 10],
           "bootstrap": [False],
           "n_estimators": [100, 300, 500],
           "criterion": ["gini"]}

if RUN_GRID_SEARCH:
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import GridSearchCV

    # Fixed seed so repeated searches rank the candidates the same way.
    rf = RandomForestClassifier(random_state=1)

    # Accuracy-scored search using the stratified folds defined in section 3.1.
    rf_CV = GridSearchCV(estimator=rf, param_grid=gs_grid, cv=kfold,
                         scoring="accuracy", n_jobs=4, verbose=1)

    result = rf_CV.fit(X_train, y_train)

    print(result.best_params_)
    print(result.best_score_)

Input the param into model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forest configured with the tuned hyper-parameters.
# The seed is fixed (same value as the train/test split) so that the reported
# accuracy is reproducible and can be compared with the default models.
final_rf = RandomForestClassifier(
    bootstrap=False,
    criterion='gini',
    max_features=3,
    min_samples_leaf=1,
    min_samples_split=6,
    n_estimators=100,
    random_state=1,
)
final_rf.fit(X_train, y_train)

# Score the tuned model on the held-out test rows.
final_rf_pred = final_rf.predict(X_test)
print("Random Forests accuracy", accuracy_score(y_test, final_rf_pred))


Comparing the four models' accuracy scores:

I will choose **Random Forest** as Final Model