In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of every file in it.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Goal:** We want to classify a given wine as "good" or "bad" based on a variety of factors. Wines with a quality score of 7 or higher will be considered "good" wines, while wines with a quality score of less than 7 will be considered "bad" wines.

**Methods:** A logistic regression will be performed over this dataset.

# Import Data 

In [None]:
# Read the red-wine quality CSV from the read-only Kaggle input directory into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/red-wine-quality-cortez-et-al-2009/winequality-red.csv'
)

# Pre-processing the Data

Let's take a peek into the data and see what we are dealing with. 
1. Firstly, let's take a look at the first few rows of our data.
2. Secondly, we will generate a heatmap to numerically represent how each component correlates with the others and with overall quality.

*Looking at Our Data*

Let's see what our column headings are and what kind of values each column holds.

In [None]:
# Print the first ten rows (all columns) as plain text.
print(data.head(10))

*Making a Heatmap*

So, we have 12 columns: 11 hold float values and 1 holds integer values (quality). Let's see how these columns all correlate with each other and, most importantly, with quality.

In doing this, I want to see which aspects of a wine affect its quality score most, whether that be negatively or positively.

In [None]:
# Draw the pairwise column correlations of `data` as an annotated 12x12-inch heatmap.
plt.figure(figsize=(12, 12))
correlations = data.corr()
# Colour scale is anchored at -1; the trailing semicolon hides the Axes repr in the cell output.
sns.heatmap(correlations, vmin=-1, cmap='coolwarm', annot=True);

*So what can we see here?*
* As volatile acidity decreases, quality increases
* As citric acid increases, so too does quality
* As sulphates increase, so too does quality
* As alcohol increases, so too does quality

# Perform the Logistic Regression

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Wines scoring at or above this quality are labelled "good" (1); all others are "bad" (0).
GOOD_QUALITY_THRESHOLD = 7

# Derive the binary target WITHOUT writing back into `data`.
# The previous version assigned through `data['quality'].values`, which
#   * depends on `.values` being a writable view (it is read-only under pandas Copy-on-Write), and
#   * was not idempotent: re-running the cell turned every label into 0, because the
#     already-binarised 0/1 values are all below the threshold.
# Leaving `data` untouched also keeps the earlier preview and heatmap cells truthful.
is_good = (data['quality'] >= GOOD_QUALITY_THRESHOLD).astype('int64')

# Features: every column except the target. Target: kept as an (n, 1) column array,
# the same shape the original cell produced.
x = data.drop(columns='quality').to_numpy()
y = is_good.to_numpy().reshape(-1, 1)

# Guard against a degenerate target (e.g. `data` was already binarised elsewhere).
assert np.unique(y).size == 2, "expected both 'bad' (0) and 'good' (1) wines in the target"

# Hold out 25% of the rows for evaluation; the fixed seed makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.25, random_state=0)

# max_iter is set far above scikit-learn's default -- presumably because the
# unscaled features make the solver slow to converge (TODO confirm).
logreg = LogisticRegression(max_iter=7000)
logreg.fit(x_train, np.ravel(y_train))  # flatten the (n, 1) target to 1-D for fit
y_pred = logreg.predict(x_test)

# Visualize Results

In [None]:
from sklearn.metrics import accuracy_score,confusion_matrix
from sklearn.metrics import classification_report

# Rows are the true class and columns the predicted class. Pinning `labels`
# guarantees index 0 is "bad" and index 1 is "good" even if a class is absent
# from the test split or the predictions.
class_names = ['bad (0)', 'good (1)']
confusion = pd.DataFrame(
    confusion_matrix(y_test, y_pred, labels=[0, 1]),
    index=class_names,
    columns=class_names,
)

# fmt='d' prints the counts as plain integers; seaborn's default '.2g' would
# show three-digit counts in scientific notation (e.g. 355 -> 3.6e+02).
ax = sns.heatmap(confusion, annot=True, fmt='d', cmap="gist_earth")
ax.set_title('Confusion Matrix for Wine Quality')
ax.set_xlabel('Predicted label')
ax.set_ylabel('True label')
plt.show()

print('\nClassification Report\n')
print(classification_report(y_test, y_pred))

# Interpretation

In our case, 1 represents our "good" quality wines and 0 represents our "bad" quality wines.

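The rows of the confusion matrix are the true classes and the columns are the predicted classes, with "good" (1) treated as the positive class. The top left quadrant therefore represents true negatives, meaning bad quality wines ('0') that were correctly predicted as bad. The bottom right quadrant represents true positives, in this case meaning good quality wines ('1') that were correctly predicted as good.

The bottom left quadrant represents false negatives, so good quality wines that were mistakenly predicted as bad quality. The top right quadrant represents false positives, so bad quality wines that were mistakenly predicted as good quality.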

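The logistic regression performed well for this data set, accurately predicting whether each wine was "good" or "bad". Note that the model predicts this binary label rather than the original quality score, so the precision and recall reported for class 1 in the classification report show how well the "good" wines in particular are identified.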