# Part A: Build a wine quality prediction model using K-NN
## Determine efficacy of scaling

In [2]:
# Imports....the usual suspects
import postgresql
import pandas
import sklearn
import matplotlib
%matplotlib inline
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsRegressor

## 1. Download the Wine Quality Dataset from the UCI
http://archive.ics.uci.edu/ml/datasets/Wine+Quality
Look for the 'Data Folder' link near top

In [3]:
red_df = pandas.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv', delimiter=';')
red_df.tail()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
1594,6.2,0.6,0.08,2.0,0.09,32.0,44.0,0.9949,3.45,0.58,10.5,5
1595,5.9,0.55,0.1,2.2,0.062,39.0,51.0,0.99512,3.52,0.76,11.2,6
1596,6.3,0.51,0.13,2.3,0.076,29.0,40.0,0.99574,3.42,0.75,11.0,6
1597,5.9,0.645,0.12,2.0,0.075,32.0,44.0,0.99547,3.57,0.71,10.2,5
1598,6.0,0.31,0.47,3.6,0.067,18.0,42.0,0.99549,3.39,0.66,11.0,6


## 2. Split the Red Wine Dataset into training and testing

In [4]:
red_x_train, red_x_test, red_y_train, red_y_test = train_test_split(red_df.drop(columns=['quality']), red_df['quality'], random_state=42)

## 3. Build a K-NN Regression Model to predict the continuous quality variable in the Red Wine Dataset.  Predict for your test set (you're estimating quality column)

Hint, you may want to drop the last element of your test set if it's a giant array: red_y_test[:-1]

In [68]:
clf = KNeighborsRegressor(n_neighbors=13)
clf.fit(red_x_train, red_y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=13, p=2,
                    weights='uniform')

## 4. What is the mean of the differences between the Y actuals (quality column) and your rounded predictions?

In [69]:
red_y_results = pandas.DataFrame(red_y_test)
red_y_results['prediction'] = clf.predict(red_x_test)
red_y_results['pred_rounded'] = red_y_results['prediction'].round().astype(int)
red_y_results['error'] = red_y_results['pred_rounded'] - red_y_results['quality']
red_y_results.tail()

Unnamed: 0,quality,prediction,pred_rounded,error
882,6,6.461538,6,0
250,6,5.692308,6,0
1122,6,5.923077,6,0
877,6,5.846154,6,0
1414,5,5.384615,5,0


In [70]:
red_y_results['error'].abs().mean()

0.525

## 5. Repeat steps 2-5 after scaling the data.  Try at least two types of scaling.  

In [71]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
x_df_scaled = scaler.fit_transform(red_df.drop(columns=['quality']))
x_train, x_test, y_train, y_test = train_test_split(x_df_scaled, red_df['quality'])

In [72]:
clf = KNeighborsRegressor(n_neighbors=13)
clf.fit(x_train, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=13, p=2,
                    weights='uniform')

In [73]:
red_y_results = pandas.DataFrame(y_test)
red_y_results['prediction'] = clf.predict(x_test)
red_y_results['pred_rounded'] = red_y_results['prediction'].round().astype(int)
red_y_results['error'] = red_y_results['pred_rounded'] - red_y_results['quality']
red_y_results.tail()

Unnamed: 0,quality,prediction,pred_rounded,error
79,4,5.153846,5,1
1589,5,5.307692,5,0
302,5,5.692308,6,1
1539,5,5.769231,6,1
798,6,5.615385,6,0


In [74]:
red_y_results['error'].abs().mean()

0.4575

In [75]:
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
x_df_scaled = scaler.fit_transform(red_df.drop(columns=['quality']))
x_train, x_test, y_train, y_test = train_test_split(x_df_scaled, red_df['quality'])

clf = KNeighborsRegressor(n_neighbors=13)
clf.fit(x_train, y_train)

red_y_results = pandas.DataFrame(y_test)
red_y_results['prediction'] = clf.predict(x_test)
red_y_results['pred_rounded'] = red_y_results['prediction'].round().astype(int)
red_y_results['error'] = red_y_results['pred_rounded'] - red_y_results['quality']

red_y_results['error'].abs().mean()

0.46

In [76]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
x_df_scaled = scaler.fit_transform(red_df.drop(columns=['quality']))
x_train, x_test, y_train, y_test = train_test_split(x_df_scaled, red_df['quality'])

clf = KNeighborsRegressor(n_neighbors=13)
clf.fit(x_train, y_train)

red_y_results = pandas.DataFrame(y_test)
red_y_results['prediction'] = clf.predict(x_test)
red_y_results['pred_rounded'] = red_y_results['prediction'].round().astype(int)
red_y_results['error'] = red_y_results['pred_rounded'] - red_y_results['quality']

red_y_results['error'].abs().mean()

0.4225

## 6. Did Scaling help?

Yes!  However, running it a few times yielded a wide range of results.