In [1]:
### IMPORT NECESSARY LIBRARIES AND MODULES

import pandas as pd
import numpy as np

In [2]:
### LOAD TRAINING DATA

df = pd.read_csv('usa_housing_training.csv')

# Input features
x_train = df[['avg_area_income', 'area_population']]

# Output
y_train = df[['house_price']]

# Need to use Numpy arrays
x_train = x_train.to_numpy()
y_train = y_train.to_numpy()

# Feature scaling (see the previous assignment)
x_train = x_train / np.max(abs(x_train), axis=0)

# Append the bias column x0 = 1 to the left side of the input matrix (see lecture)
x_train = np.append(np.ones((len(x_train), 1)), x_train, axis=1)

x_train

array([[1.        , 0.73857166, 0.33160345],
       [1.        , 0.73581575, 0.5770193 ],
       [1.        , 0.56904431, 0.52975081],
       ...,
       [1.        , 0.58857621, 0.4778128 ],
       [1.        , 0.63138558, 0.61224607],
       [1.        , 0.60825922, 0.66791352]])

In [10]:
### THE COST FUNCTION (Mean Squared Error)

def cost_mse(actual, predicted):
    M = len(actual) # = len(predicted)
    
    cost = (1/(2*M)) * sum((predicted - actual)**2)
    
    return cost

In [11]:
############ Exercise 1: IMPLEMENT GRADIENT DESCENT ALGORITHM FOR MULTIPLE VARIABLES ############

# This function returns an array of parameter phi0, phi1, ..., phiN
# that are needed to construct the fitting hyperplane.
# x: input features
# y: actual output
# learningRate: used to control the descent's speed
# threshold: used to check convergence
# maxIters: maximum number of iterations to run
def grad_descent(x, y, learningRate, threshold, maxIters):
    # First, parameters phi0, phi1,... phiN are initialized (e.g. to 0)
    # so we create a Numpy array of 0s
    # Assume that the input x already has the bias column (remind that bias values x0 = 1)
    phi = np.zeros((x.shape[1], 1)) # x.shape[1] = number of x's columns i.e. number of input features
    
    ##################### YOUR CODE HERE ######################
    # HINT 1: You may want to use the Numpy function: dot() to do
    # the sum product of the input features and the parameters
    # yi_hat = phi0 * xi_0 + phi1 * xi_1 + ... + phiN * xi_N (i --> training data index)
    y_hat = np.dot(x, phi)
    
    # HINT 2: To convert a 1d array into a 2d array, just call reshape(): e.g. reshape(-1, 1) --> M rows, 1 column
    
    ###########################################################
    
    return phi # Return an array of computed parameters

In [12]:
### FIND THE PARAMETERS

learningRate = 0.5
threshold = 0.001
maxIters = 10000

# Run gradient descent
phi = grad_descent(x_train, y_train, learningRate, threshold, maxIters)

# If your implementation is correct, you will find
# phi = [[-782515.45192716] [2315296.12541882] [1042161.60848601]] after running 3853 iterations

In [13]:
############# Exercise 2: PREDICT TEST DATA ###############
# In practice, the dataset used for training should be different from the dataset used for testing
# in order to make sure that your predictive model has a good generalization (avoid bias)

# Load test data from the .csv file
df_test = pd.read_csv('usa_housing_test.csv')
df_test

Unnamed: 0,avg_area_income,avg_house_age,avg_nb_rooms,avg_nb_bathrooms,area_population,house_price,address
0,64698.463428,6.025336,8.147760,3.41,60828.249085,1.502056e+06,"4759 Daniel Shoals Suite 442\nNguyenburgh, CO ..."
1,78394.339278,6.989780,6.620478,2.42,36516.358972,1.573937e+06,"972 Joyce Viaduct\nLake William, TN 17778-6483"
2,39033.809237,7.671755,7.250029,3.10,39220.361467,1.042814e+06,"209 Natasha Stream Suite 961\nHuffmanland, NE ..."
3,79706.963058,5.067890,8.219771,3.12,39717.813576,1.556787e+06,"064 Hayley Unions\nNicholsborough, HI 44161-1887"
4,61929.077018,4.788550,5.097010,4.30,24595.901498,5.284852e+05,"5498 Rachel Locks\nNew Gregoryshire, PW 54755"
...,...,...,...,...,...,...,...
961,65729.222330,6.237787,6.860475,3.12,25573.854289,1.197073e+06,"641 Lisa Parkways Suite 552\nWest Amandaside, ..."
962,52723.876555,5.452237,8.124571,6.39,14802.088438,4.795006e+05,"86727 Kelly Plaza\nLake Veronica, IL 04474"
963,74102.191890,5.657841,7.683993,3.13,24041.270592,1.263721e+06,"2871 John Lodge\nAmychester, GU 61734-5597"
964,69639.140896,5.007510,7.778375,6.05,54056.128430,1.381831e+06,"5259 David Causeway Apt. 975\nSouth Alexstad, ..."


In [17]:
x_test = df_test[['avg_area_income', 'area_population']].to_numpy()
y_test = df_test[['house_price']].to_numpy()

############### YOUR CODE HERE ################
# 1. Scale feature values of x_test
# 2. Add the bias column to x_test
# 3. Calculate the predicted output y_hat by using the parameters phi calculated previously
# 4. Compute the prediction cost between y_hat and y_test

# If your implementation is correct, you should
# get cost = 34937094596.78
###############################################

array([[1502055.81737441],
       [1573936.56447772],
       [1042814.09782009],
       [1556786.60019477],
       [ 528485.2467306 ],
       [1030591.42921161],
       [ 743999.8191602 ],
       [ 895737.13338351],
       [1170720.89365429],
       [1744932.21099093],
       [ 980177.30513713],
       [ 948788.27570945],
       [ 772111.9721021 ],
       [1200961.82092263],
       [1520234.22937746],
       [1204598.03746313],
       [1132522.90129471],
       [1173474.37883732],
       [ 201898.0865725 ],
       [1124635.93162859],
       [1169944.24775875],
       [1302933.24756928],
       [1252391.17984217],
       [1809154.28936317],
       [1300265.21041547],
       [2152959.40894309],
       [1246246.82775976],
       [1393746.76046743],
       [1276448.79204297],
       [1161742.67744636],
       [1604920.97345841],
       [1352135.96726418],
       [ 734827.50801229],
       [1213382.22263647],
       [1182459.77166367],
       [1453381.62430311],
       [1195986.29883487],
 