In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import os

# Wine Quality
Wine quality control Analysis

This database was created to identify a high quality or a low quality wine, based upon properties of the wine. The dataset consists of 12 properties of 6497 samples collected from the north of Portugal. Due to privacy and logistic issues, only physicochemical (inputs) and sensory (the output) variables are available (e.g. there is no data about grape types, wine brand, wine selling price, etc.). The data were collected from May/2004 to February/2007 using only protected designation of origin samples that were tested at the official certification entity (CVRVV). 

## The Dataset
The following properties of each kind of wine are measured and included within the CSV:

* fixed acidity  (g(tartaric acid)/dm3)
* volatile acidity(g(acetic acid)/dm3)
* citric acid(g/dm3)
* residual sugar(g/dm3)	
* chlorides(g(sodium chloride)/dm3)
* free sulfur dioxide(mg/dm3)
* total sulfur dioxide (mg/dm3)
* density(g/cm3)
* pH
* sulphates(g(potassium sulphate)/dm3)
* alcohol(vol.%)
* quality(score between 0 and 10)
* color: red(0) or white(1)
* quality high/low: quality high(1) or quality low(0)

Reference:
P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.
Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.

In [2]:
wine = pd.read_csv(os.path.join('..', 'Resources', 'winequality.csv'))
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,color,quality high/low
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,0,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0,0


In [6]:
# Assign X (data) and y (target)
X = wine.drop(["quality high/low","quality"], axis=1)
y = wine["quality high/low"]
print(X.shape, y.shape)

(6497, 12) (6497,)


Split our data into training and testing

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,test_size=0.5,stratify=y)

Create a Logistic Regression Model

In [8]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

Fit (train) or model using the training data

In [9]:
classifier.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

Validate the model using the test data

In [10]:
print(f"Training Data Score: {classifier.score(X_train, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test, y_test)}")

Training Data Score: 0.8180418719211823
Testing Data Score: 0.811942136041859


Make predictions

In [11]:
predictions = classifier.predict(X_test)
print(f"First 30 Predictions:   {predictions[:30]}")
print(f"First 30 Actual labels: {y_test[:30].tolist()}")

First 30 Predictions:   [0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
First 30 Actual labels: [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]


In [12]:
pd.DataFrame({"Prediction": predictions, "Actual": y_test}).reset_index(drop=True)

Unnamed: 0,Prediction,Actual
0,0,0
1,0,0
2,0,0
3,0,0
4,0,0
...,...,...
3244,0,0
3245,1,0
3246,0,1
3247,0,0
