# In [None]:
# Linear Regression with Vectorization

import numpy as np
import pandas as pd
import random

def shuffle_and_split(data, train_fraction=0.8):
    """Shuffle the rows of *data* in place and split it into train/test sets.

    The last column of *data* is treated as the target and every other
    column as a feature.

    Args:
        data: 2-D NumPy array with one sample per row. It is shuffled in
            place, and the returned arrays are views/reshapes of it.
        train_fraction: Fraction of rows (taken from the top after
            shuffling) that go into the training set.

    Returns:
        A tuple ``(X_train, y_train, X_test, y_test)`` where the ``y`` arrays
        are column vectors of shape ``(rows, 1)``.
    """
    # random.shuffle must NOT be used here: it swaps elements with
    # ``x[i], x[j] = x[j], x[i]``, and for a 2-D array those elements are row
    # *views*, so rows get overwritten and duplicated instead of permuted.
    # np.random.shuffle permutes the rows along the first axis correctly.
    np.random.shuffle(data)

    split = int(train_fraction * data.shape[0])

    X_train = data[:split, :-1]
    # The target slice is 1-D, so convert it to an m by 1 column vector.
    y_train = data[:split, -1].reshape(X_train.shape[0], 1)

    X_test = data[split:, :-1]
    y_test = data[split:, -1].reshape(X_test.shape[0], 1)

    return X_train, y_train, X_test, y_test


# The guard keeps the download and training from running on a plain import;
# executing the file as a script or as a notebook cell still runs everything
# and leaves the same variables defined at module level.
if __name__ == "__main__":
    # Load the wine data
    url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv'
    df = np.array(pd.read_csv(url, delimiter=';'))

    # Add bias to the data: prepend a column of ones as the first feature
    m_total = df.shape[0]
    df = np.append(np.ones((m_total, 1)), df, axis=1)

    # Shuffle the data, then split it (80% training and 20% test)
    X_train, y_train, X_test, y_test = shuffle_and_split(df, 0.8)

    # m = the number of training data; n = the number of features (including bias)
    m = X_train.shape[0]
    n = X_train.shape[1]

    alpha = 0.0001      # learning rate
    iterations = 1000   # number of gradient-descent steps
    theta = np.ones((n, 1))

    for iteration in range(iterations):

        # Predictions for every training sample in one matrix product
        h = np.dot(X_train, theta)

        # Halved mean squared error; summing over axis 0 keeps a 1-element array
        J = 0.5*(1/m)*((h-y_train)**2).sum(axis=0)

        # Report only the last nine iterations (991..999 for 1000 iterations)
        if iteration > iterations - 10:

            print('Iteration: ' + str(iteration))
            print('Cost: ' + str(J))
            print('Theta: ' + str(theta))

        # Vectorized batch gradient-descent update, applied in place
        theta -= (alpha/m)*( np.dot( np.transpose(X_train),(h - y_train) ) )

