In [9]:
# Linear Regression - wine quality data

In [10]:
import pandas as pd

# Load the white and red wine datasets (both semicolon-delimited) and stack them into one frame.
wine_quality_white = pd.read_csv("winequality-white.csv", sep=";")
wine_quality_red = pd.read_csv("winequality-red.csv", sep=";")
# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it the
# red-wine rows keep their own 0-based labels, which duplicate labels already used by
# the white-wine rows and make label-based lookups / index alignment ambiguous.
wine_quality = pd.concat([wine_quality_white, wine_quality_red], ignore_index=True)
wine_quality.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


## Making Predictions

In [6]:
from numpy import cov

def calc_slope(x, y):
    """Return the least-squares slope of ``y`` regressed on ``x``.

    The slope is ``cov(x, y) / var(x)``. Both terms are taken from the same
    covariance matrix so they always share one normalisation. Dividing by
    ``x.var()`` is only consistent with ``cov`` when ``x`` is a pandas Series
    (whose ``var`` defaults to ddof=1); for a numpy array ``var`` defaults to
    ddof=0, which inflates the slope by n / (n - 1).

    Parameters
    ----------
    x, y : array-like
        Equal-length sequences of numbers (pandas Series, numpy arrays or lists).

    Returns
    -------
    The slope of the best-fit line.
    """
    covariance = cov(x, y)
    # covariance[0, 0] is var(x) and covariance[0, 1] is cov(x, y), same ddof for both
    return covariance[0, 1] / covariance[0, 0]

# Derive the intercept from the x column, the y column, and an already-computed slope
def calc_intercept(x, y, slope):
    """Return the intercept of the line with the given slope that passes through (mean(x), mean(y))."""
    mean_x = x.mean()
    mean_y = y.mean()
    return mean_y - (slope * mean_x)

# Fit quality against density: the slope comes first because the intercept depends on it
x, y = wine_quality["density"], wine_quality["quality"]
m = calc_slope(x, y)
b = calc_intercept(x, y, slope=m)

def calc_predicted_y(x):
    """Evaluate the fitted line at ``x`` using the module-level slope ``m`` and intercept ``b``."""
    scaled = m * x
    return scaled + b

# The prediction is plain arithmetic, so evaluate it on the whole density column at once
# instead of calling the function once per row through .apply(); the values are the same.
predicted_quality = calc_predicted_y(wine_quality["density"])

In [7]:
# Preview the first few predicted quality values
predicted_quality.head()

0    5.256936
1    5.880427
2    5.782450
3    5.737915
4    5.737915
Name: density, dtype: float64

## Finding Error

In [14]:
from scipy.stats import linregress

# r_value has come up before; p_value and stderr_slope are explained later, so they can be ignored for now.
slope, intercept, r_value, p_value, stderr_slope = linregress(
    wine_quality["density"],
    wine_quality["quality"],
)

# These match the slope and intercept computed by hand earlier (apart from slight rounding differences)
print(slope, intercept, sep="\n")

# NOTE: this rebinds calc_predicted_y from the "Making Predictions" cell; from here on it
# uses the linregress `slope` / `intercept` rather than the hand-computed `m` / `b`.
def calc_predicted_y(x):
    """Evaluate the linregress-fitted line at ``x`` using the module-level ``slope`` and ``intercept``."""
    scaled = slope * x
    return scaled + intercept

# Predict quality for every wine in one vectorized call rather than row by row via .apply()
predicted_y = calc_predicted_y(wine_quality["density"])

# Residual sum of squares: squared gaps between predicted and actual quality, added up
sq_residuals = (predicted_y - wine_quality["quality"]) ** 2
rss = sq_residuals.sum()

print(rss)

-89.0700748242
94.4160813155
4490.27306663


## Standard Error

In [15]:
from scipy.stats import linregress
import numpy as np

# Fit the regression line.
# Note: stderr_slope is the standard error of the slope estimate only, not the standard
# error of the fitted equation as a whole, so the latter is computed by hand below.
slope, intercept, r_value, p_value, stderr_slope = linregress(wine_quality["density"], wine_quality["quality"])

# Vectorized prediction over the whole density column; kept as a plain ndarray so that
# positional indexing (predicted_y[i]) keeps working.
predicted_y = np.asarray(slope * wine_quality["density"] + intercept)
# NOTE: despite the name, `residuals` holds the *squared* residuals.
residuals = (wine_quality["quality"] - predicted_y) ** 2
rss = residuals.sum()

n = len(wine_quality["quality"])

# Standard error of the estimate: sqrt(RSS / (n - 2)). The exponent is written as the
# float 0.5 so the result does not depend on how `1/2` divides.
standard_error = (rss / (n - 2)) ** 0.5

# Absolute prediction error of every wine, expressed in units of the standard error.
# This is computed on the whole column at once: a row-by-row loop using `y` as its loop
# variable would overwrite the `y` quality column defined in the "Making Predictions" cell
# and leave stray loop variables behind in the notebook namespace.
scaled_errors = (wine_quality["quality"] - predicted_y).abs() / standard_error

# Fraction of wines whose error is within 1, 2 and 3 standard errors.
# int(count) / n keeps the results as plain Python floats.
within_one = int((scaled_errors <= 1).sum()) / n
within_two = int((scaled_errors <= 2).sum()) / n
within_three = int((scaled_errors <= 3).sum()) / n


In [17]:
# Fractions of wines whose prediction error is within 1, 2 and 3 standard errors
within_one, within_two, within_three

(0.7074034169616746, 0.9398183777127905, 0.9935354779128829)