# Build a wine quality prediction model using K-NN
## Determine efficacy of scaling

In [27]:
# Imports....the usual suspects
import postgresql
import pandas
import numpy as np
import sklearn
import matplotlib
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error

## 1. Download the Wine Quality Dataset from the UCI
http://archive.ics.uci.edu/ml/datasets/Wine+Quality
Look for the 'Data Folder' link near top

In [13]:
df = pandas.read_csv("http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv", delimiter=";")

In [14]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [16]:
df.dtypes

fixed acidity           float64
volatile acidity        float64
citric acid             float64
residual sugar          float64
chlorides               float64
free sulfur dioxide     float64
total sulfur dioxide    float64
density                 float64
pH                      float64
sulphates               float64
alcohol                 float64
quality                   int64
dtype: object

In [20]:
x_df = df.drop(columns='quality')
y_df = df['quality']

## 2. Split the Red Wine Dataset into training and testing

In [23]:
X_train, X_test, y_train, y_test = train_test_split(x_df, y_df)

## 3. Build a K-NN Regression Model to predict the continuous quality variable in the Red Wine Dataset.  Predict for your test set (you're estimating quality column)

Hint, you may want to drop the last element of your test set if it's a giant array: red_y_test[:-1]

In [24]:
clf = KNeighborsRegressor()
clf.fit(X_train, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                    weights='uniform')

## 4. What is the mean of the absolute value of the differences between the Y actuals (quality column) and your rounded predictions?

In [26]:
res = clf.predict(X_test)

In [28]:
mean_absolute_error(res, y_test)

0.6214999999999999

## 5. Repeat steps 2-5 after scaling the data.  Try at least two types of scaling.  

In [36]:
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
x_df_scaled = scaler.fit_transform(x_df)
X_train, X_test, y_train, y_test = train_test_split(x_df_scaled, y_df)

clf = KNeighborsRegressor(n_neighbors=4)
clf.fit(X_train, y_train)

pred = clf.predict(X_test)
mean_absolute_error(pred, y_test)

0.494375

In [37]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_df_scaled = scaler.fit_transform(x_df)
X_train, X_test, y_train, y_test = train_test_split(x_df_scaled, y_df)

clf = KNeighborsRegressor(n_neighbors=4)
clf.fit(X_train, y_train)

pred = clf.predict(X_test)
mean_absolute_error(pred, y_test)

0.545625

## 6. Did Scaling help?

In [38]:
# Yes !! We can see from above two cells by looking at the value of MAE