DATA LOADING AND CLEANING

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from pandas import DataFrame

sns.set()
%matplotlib inline

In [2]:
%cd C:\Users\Peter\Documents\Machine Learning Projects\Wine Quality
%ls

C:\Users\Peter\Documents\Machine Learning Projects\Wine Quality
 Volume in drive C has no label.
 Volume Serial Number is C41F-BCB2

 Directory of C:\Users\Peter\Documents\Machine Learning Projects\Wine Quality

10/05/2018  09:36 AM    <DIR>          .
10/05/2018  09:36 AM    <DIR>          ..
10/05/2018  09:24 AM            84,199 winequality-red.csv
10/05/2018  09:24 AM           264,426 winequality-white.csv
               2 File(s)        348,625 bytes
               2 Dir(s)  231,112,712,192 bytes free


In [3]:
red = pd.read_csv('winequality-red.csv',delimiter=';')
white = pd.read_csv('winequality-white.csv',delimiter=';')

In [4]:
red.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [5]:
white.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [6]:
#combining both the white and red wine datasets into one

wines = pd.concat([white,red])
wines.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [7]:
wines.shape

(6497, 12)

In [8]:
#showing the data type of each feature and if there is any missing value

wines.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6497 entries, 0 to 1598
Data columns (total 12 columns):
fixed acidity           6497 non-null float64
volatile acidity        6497 non-null float64
citric acid             6497 non-null float64
residual sugar          6497 non-null float64
chlorides               6497 non-null float64
free sulfur dioxide     6497 non-null float64
total sulfur dioxide    6497 non-null float64
density                 6497 non-null float64
pH                      6497 non-null float64
sulphates               6497 non-null float64
alcohol                 6497 non-null float64
quality                 6497 non-null int64
dtypes: float64(11), int64(1)
memory usage: 659.9 KB


In [9]:
#Selecting all features for the base model

X = wines.iloc[:,:11]
y = wines['quality']

print(X.shape)
print(y.shape)

(6497, 11)
(6497,)


In [10]:
#How better can we do with automatic feature selection using Recursive Feature Elimination wrapper method?
logreg = LogisticRegression()

selector = RFE(logreg)

selector = selector.fit(X,y)


In [11]:
#the best features according to RFE have a ranking of 1, so we'll create a second model with those features.

selected_features = DataFrame({'Feature':list(X.columns),'Ranking':selector.ranking_})
selected_features.sort_values(by='Ranking')

Unnamed: 0,Feature,Ranking
1,volatile acidity,1
4,chlorides,1
7,density,1
8,pH,1
9,sulphates,1
10,alcohol,2
2,citric acid,3
0,fixed acidity,4
3,residual sugar,5
5,free sulfur dioxide,6


In [12]:
X = wines[['volatile acidity','chlorides','density','pH','sulphates']]
y = wines['quality']

print(X.shape)
print(y.shape)

(6497, 5)
(6497,)


In [13]:
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=30)

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(4872, 5)
(1625, 5)
(4872,)
(1625,)


In [14]:
logreg = LogisticRegression()

y_pred = logreg.fit(X_train,y_train).predict(X_test)

print('The Accuracy of the model is {:.1f}%'.format(accuracy_score(y_test,y_pred)*100))

The Accuracy of the model is 46.1%
