In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Data source: the Kaggle "House Prices: Advanced Regression Techniques" competition,
#   https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data
# Download it and place 'train.csv' and 'test.csv' in the 'input' folder
# (the path below is relative to the current working directory).
data = pd.read_csv(filepath_or_buffer='input/train.csv')

In [None]:
# Preview only the first rows instead of printing the whole frame. As the last
# expression of the cell, the result is rendered with the notebook's rich
# (HTML table) display rather than as plain text.
data.head()

In [None]:
# Show the two columns used below: the predictor first, then the target.
for column_name in ('GrLivArea', 'SalePrice'):
    print(data[column_name])

In [None]:
# Select the predictor (living area) and the target (sale price) columns.
x_, y = data['GrLivArea'], data['SalePrice']

# Standardise the predictor (subtract its mean, divide by its sample standard
# deviation), then prepend a column of ones so that theta[0] multiplies a
# constant and acts as the intercept during gradient descent.
x_scaled = (x_ - x_.mean()) / x_.std()
x = np.c_[np.ones(len(x_scaled)), x_scaled]

In [None]:
# Hyper-parameters and starting point for the gradient-descent fit.
np.random.seed(123)        # fix the RNG so the initial guess is reproducible
theta = np.random.rand(2)  # random initial values for the two parameters
m = len(y)                 # number of data points
iterations = 2000          # number of update steps to run
alpha = 0.01               # step size (learning rate)

# Batch gradient descent for least-squares linear regression.
def gradient_descent(x, y, theta, iterations, alpha):
    """Fit linear-regression parameters with batch gradient descent.

    Each iteration evaluates the cost J(theta) = 1/(2m) * sum((x @ theta - y)^2)
    and then moves theta one step of size ``alpha`` against the gradient
    (1/m) * x.T @ (x @ theta - y).

    Parameters
    ----------
    x : array-like, shape (m, n)
        Design matrix; include a column of ones if an intercept is wanted.
    y : array-like, shape (m,)
        Target values, one per row of ``x``.
    theta : array-like, shape (n,)
        Initial parameter vector.
    iterations : int
        Number of update steps to perform.
    alpha : float
        Step size (learning rate).

    Returns
    -------
    past_thetas : list of ndarray
        ``iterations + 1`` parameter vectors: the initial one followed by the
        value after every update.
    past_costs : list of float
        ``iterations`` cost values, each computed *before* the corresponding
        update.

    Raises
    ------
    ValueError
        If ``y`` is empty (the cost would divide by zero).
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    theta = np.asarray(theta, dtype=float)

    # Derive the sample count from the inputs rather than from a notebook-level
    # global, so the function is correct for any data set it is called with.
    m = y.size
    if m == 0:
        raise ValueError("gradient_descent requires at least one data point")

    past_costs = []
    past_thetas = [theta]
    for _ in range(iterations):
        # x=[[1,1],[1,2],[1,3]], theta=[0.1, 0.2] -> [0.1+1*0.2, 0.1+2*0.2, 0.1+3*0.2]
        prediction = np.dot(x, theta)
        error = prediction - y
        # error=[1,2,3] -> dot(error, error) = 1^2 + 2^2 + 3^2
        cost = 1/(2*m) * np.dot(error.T, error)
        past_costs.append(cost)
        theta = theta - (alpha * (1/m) * np.dot(x.T, error))
        past_thetas.append(theta)

    return past_thetas, past_costs

# Run the optimiser; the last entry of the parameter history is the fitted theta.
past_thetas, past_costs = gradient_descent(
    x=x, y=y, theta=theta, iterations=iterations, alpha=alpha
)
theta = past_thetas[-1]

# Report both fitted parameters to two decimals.
print("Gradient Descent: {:.2f}, {:.2f}".format(*theta))

In [None]:
# Learning curve: the cost J recorded at every gradient-descent iteration.
fig, ax = plt.subplots()
ax.plot(past_costs)
ax.set_title('Cost Function J')
ax.set_xlabel('No. of iterations')
ax.set_ylabel('Cost')
plt.show()

In [None]:
# Scatter the training data and overlay the fitted line at several stages of
# gradient descent, so the convergence of the fit is visible.
fig, ax = plt.subplots()
ax.set_title('Sale Price vs Living Area')
ax.set_xlabel('Living Area in square feet (normalised)')
ax.set_ylabel('Sale Price ($)')
ax.scatter(x[:,1], y, color='red', label='training data')

# (iteration, matplotlib format string) for each line that is drawn; the last
# entry always refers to the final parameters.
snapshot_styles = [(0, 'b.'), (20, 'g.'), (100, 'm.'), (len(past_thetas) - 1, 'k.')]

x1 = np.linspace(-3, 10, 1000)
for step, style in snapshot_styles:
    if step >= len(past_thetas):
        # Fewer iterations were run than this snapshot asks for; skip it
        # instead of failing with an IndexError.
        continue
    intercept, slope = past_thetas[step]
    ax.plot(x1, slope*x1 + intercept, style, label='iteration {}'.format(step))

ax.legend()
plt.show()
plt.close(fig)

In [None]:
# Predict sale prices for the Kaggle test set with the fitted parameters.
# Distinct names are used here so the training x / y are not overwritten and
# earlier cells can still be re-run.
data_ = pd.read_csv('input/test.csv')
x_test_raw = data_['GrLivArea']

# Standardise with the TRAINING mean and standard deviation: theta was fitted
# on features scaled by those statistics, so re-deriving them from the test
# set would shift and rescale every prediction.
train_area = data['GrLivArea']
x_test_scaled = (x_test_raw - train_area.mean()) / train_area.std()
x_test = np.c_[np.ones(x_test_scaled.shape[0]), x_test_scaled]
y_pred = np.dot(x_test, theta)

fig, ax = plt.subplots()
ax.set_title('Sale Price vs Living Area')
ax.set_xlabel('Living Area in square feet (normalised)')
ax.set_ylabel('Sale Price ($)')
ax.scatter(x_test[:,1], y_pred, color='red')
plt.show()
plt.close(fig)