In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily build the full path of every file below the Kaggle input directory,
# then print the paths one per line in the order os.walk yields them.
input_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Importing dataset into dataframe**

In [None]:
# Read the red-wine quality CSV into a DataFrame and preview its first rows.
csv_path = '/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
df = pd.read_csv(csv_path)
df.head()

In [None]:
# Print a summary of the DataFrame (columns, non-null counts, dtypes) to check for missing values.
df.info()

There are no missing values in the dataset. Following the data description, the numeric quality score is split into two categories: scores up to 6.5 are labelled 'bad' quality wine and scores above 6.5 are labelled 'good' quality wine.

In [None]:
# Bucket the numeric quality score into two classes:
#   [0, 6.5]  -> 'bad'
#   (6.5, 10] -> 'good'
# The edges cover the whole 0-10 scoring scale (include_lowest closes the left
# edge at 0).  The previous edges (2, 6.5, 8) turned any score <= 2 or > 8 into
# NaN; scores between 3 and 8 receive exactly the same label as before.
# NOTE: this overwrites the numeric column, so reload `df` before re-running
# this cell.
bins_ = (0, 6.5, 10)
labels_ = ['bad', 'good']
df['quality'] = pd.cut(df['quality'], bins=bins_, labels=labels_, include_lowest=True)
print(df['quality'])

Comparing the number of 'good' and 'bad' quality wines to check for bias in the dataset.

In [None]:
# Bar chart of how many wines fall in each quality class.
# The column is named through `x=`: seaborn >= 0.12 accepts only `data` as a
# positional argument, so `countplot(df['quality'], data=df)` raises a
# TypeError (multiple values for `data`) on current versions.
sns.countplot(x='quality', data=df)

In [None]:
# Share of 'bad' wines among all wines labelled either 'bad' or 'good'.
quality_labels = df['quality']
x1 = quality_labels[quality_labels == 'bad'].count()    # number of 'bad' rows
x2 = quality_labels[quality_labels == 'good'].count()   # number of 'good' rows
ratio = x1 / (x1 + x2)
print("Percentage of Bad Quality data in the dataset:",ratio*100)

About 86% of the wines are 'bad' quality and only about 14% are 'good' quality, which shows that the dataset is strongly imbalanced. We have to counter this class imbalance before building the model.

In [None]:
# Annotated correlation heatmap of the numeric feature columns.
# `quality` is a categorical column at this point, and pandas >= 2.0 no longer
# drops non-numeric columns inside DataFrame.corr() (it raises instead), so the
# numeric columns are selected explicitly before computing the correlations.
plt.figure(figsize=(10,5))
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)

There are no strong dependencies between the covariates. Now, before building the model, we have to encode the categorical variable (`quality`) as numbers.

In [None]:
from sklearn.preprocessing import StandardScaler,LabelEncoder

# Replace the 'bad'/'good' labels with integer codes.  LabelEncoder numbers the
# classes in sorted order, so 'bad' becomes 0 and 'good' becomes 1.
label_enc = LabelEncoder().fit(df['quality'])
df['quality'] = label_enc.transform(df['quality'])
df.head(10)

In [None]:
# Separate the feature matrix (every column except `quality`) from the target.
X = df.drop(columns=['quality'])
y = df['quality']

This dataset is strongly imbalanced. Although SMOTE is often considered the best practice for handling imbalanced data, I used random undersampling to counter the imbalance in this notebook.

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Balance the classes by randomly discarding rows of the majority class.
# - Rows are drawn WITHOUT replacement so that no majority row is duplicated.
#   With replacement=True, copies of one row could land on both sides of the
#   later train/test split and inflate the test scores.
# - fit_resample() fits the sampler itself, so no separate rus.fit(X, y) call
#   is needed beforehand.
rus = RandomUnderSampler(random_state=0, replacement=False)
X_resampled, y_resampled = rus.fit_resample(X, y)

I have used 85% of the resampled data for the training set (which is a bit high) because a lot of data was lost when undersampling the 'bad' quality wines.

In [None]:
from sklearn.model_selection import train_test_split

# Keep 85% of the resampled rows for training and hold out the rest for
# testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_resampled,
    y_resampled,
    train_size=0.85,
    random_state=0,
)

Standardizing the data is important for algorithms that rely on distances between samples. So, before trying different algorithms, it is good practice to standardize the data.

In [None]:
# Learn the scaling statistics from the training rows only, then apply the same
# fitted scaler to both the training and the test features.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

I have tried two models: RandomForestClassifier and LogisticRegressionCV.

In [None]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix,classification_report,mean_absolute_error,accuracy_score

# Train a 500-tree random forest and score it on the held-out test set.
# random_state is fixed, like in the sampler, the split and the logistic model,
# so the reported scores are the same on every run; the forest was previously
# unseeded and its accuracy varied between runs.
model1 = RandomForestClassifier(n_estimators=500, random_state=0)
model1.fit(X_train, y_train)
pred1 = model1.predict(X_test)
# With 0/1 class labels the mean absolute error is the misclassification rate,
# i.e. 1 - accuracy.
print(mean_absolute_error(y_test, pred1))
print(accuracy_score(y_test, pred1))

In [None]:
# Per-class precision, recall and F1 of the random forest on the test set.
print(classification_report(y_true=y_test, y_pred=pred1))

In [None]:
# Confusion matrix of the random forest (rows: true class, columns: predicted class).
print(confusion_matrix(y_true=y_test, y_pred=pred1))

We get about 94% accuracy using RandomForestClassifier. Next, we will try Logistic Regression with 5-fold cross-validation.

In [None]:
# Fit a logistic regression with 5-fold cross-validation on the training data,
# then score its predictions on the held-out test set.
model2 = LogisticRegressionCV(random_state=0, cv=5).fit(X_train, y_train)
pred2 = model2.predict(X_test)
print(mean_absolute_error(y_true=y_test, y_pred=pred2))
print(accuracy_score(y_true=y_test, y_pred=pred2))

In [None]:
# Per-class precision, recall and F1 of the logistic regression on the test set.
print(classification_report(y_true=y_test, y_pred=pred2))

In [None]:
# Confusion matrix of the logistic regression (rows: true class, columns: predicted class).
print(confusion_matrix(y_true=y_test, y_pred=pred2))

We got about 88% accuracy using Logistic Regression.
So, comparing Logistic Regression and the Random Forest Classifier, the Random Forest Classifier outperforms Logistic Regression by about 6 percentage points on this dataset. Any suggestions or feedback are most welcome!!