In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib as mp
import matplotlib.pyplot as plt

from pandas import read_csv

from matplotlib.animation import FuncAnimation

# NOTE(review): load_boston is not called in the visible cells (the data is read from a CSV file).
# It was removed in scikit-learn 1.2, so this import may raise ImportError on newer
# environments -- confirm the installed version, or drop the import if nothing else uses it.
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
#print the full path of every file found anywhere below the Kaggle input directory
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        print(os.path.join(root, fname))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
#load the housing data: the file has no header row and its fields are separated by runs of whitespace,
#so the column names are supplied explicitly
column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'MEDV',
]
boston = pd.read_csv(
    '/kaggle/input/boston-house-prices/housing.csv',
    sep=r"\s+",
    header=None,
    names=column_names,
)
boston

In [None]:
#look at a single feature column (AGE) to check that it was parsed as expected
boston.loc[:, 'AGE']

In [None]:
#largest and smallest value of the MEDV column
#Series.max()/.min() are vectorised and skip NaN, whereas the builtin max()/min() loop over the
#values in Python and give order-dependent results if a NaN is present
print(boston['MEDV'].max())
print(boston['MEDV'].min())

In [None]:
#summary statistics of every column, rounded to two decimal places for readability
boston.describe().round(2)

In [None]:
#correlation between every pair of columns in the data
#using PEARSON CORRELATION
corr = boston.corr('pearson')

#absolute value of each column's correlation with MEDV
#(MEDV itself is part of the list, so its self-correlation appears as well)
corrs = [abs(corr[attr]['MEDV']) for attr in list(boston)]

#make a list of pairs [(corr, feature)] using zip
l = list(zip(corrs, list(boston)))

#sort the pairs in descending order,
#with the correlation value as the key for sorting
l.sort(key = lambda x : x[0], reverse=True)

#'UNZIP' the pairs into 2 tuples
#zip(*l) turns [(a, b), (c, d), (e, f)] into [(a, c, e), (b, d, f)]
#the star must be applied directly in the call: zip((*l)) is a SyntaxError
corrs, labels = list(zip(*l))

#plot the absolute correlation with MEDV as a bar graph, strongest first
index = np.arange(len(labels))
plt.figure(figsize=(15, 5))
plt.bar(index, corrs, width=0.5)
plt.xlabel('Attributes')
plt.ylabel('Correlation wrt MEDV Variables')
plt.xticks(index, labels)
plt.show()

In [None]:
#pick the single input feature (LSTAT) and the target (MEDV) as plain numpy arrays
X = boston['LSTAT'].to_numpy()
Y = boston['MEDV'].to_numpy()

In [None]:
#first five target values before normalisation
print(Y[0:5])

In [None]:
#normalising: min-max scale X and Y, each with its own scaler so the scaling can be undone later
#the scalers work on 2-D (n_samples, 1) input, so reshape going in and take the single column back out
#note: this overwrites X and Y with their scaled versions
x_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()

X_column = X.reshape(-1, 1)
X = x_scaler.fit_transform(X_column)[:, -1]

Y_column = Y.reshape(-1, 1)
Y = y_scaler.fit_transform(Y_column)[:, -1]

In [None]:
#the same first five target values after normalisation
print(Y[0:5])

In [None]:
#Half Mean Squared Error (the cost that gradient descent minimises)
def error(m, x, c, t):
    """Return the half mean squared error of the line ``m * x + c`` against ``t``.

    The value is ``sum(((m * x + c) - t) ** 2) / (2 * N)`` where ``N`` is the
    number of samples in ``x``.

    Parameters:
        m: slope of the line.
        x: input values (numpy array or any array-like of numbers).
        c: intercept of the line.
        t: target values, same length as ``x``.

    Returns:
        The half mean squared error as a scalar.

    Raises:
        ValueError: if ``x`` contains no samples (the mean would be undefined).
    """
    #accept plain lists as well as numpy arrays
    x = np.asarray(x, dtype=float)
    t = np.asarray(t, dtype=float)
    N = x.size
    if N == 0:
        raise ValueError('error() needs at least one sample')
    residual = (m * x + c) - t
    #np.sum reduces in compiled code; the builtin sum would loop over the array in Python
    return np.sum(residual ** 2) / (2 * N)

In [None]:
#test_size=0.2 holds out 20% of the samples, chosen at random, as the testing data
#random_state pins the shuffle so that a fresh "Run All" reproduces the same split and the same metrics
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
#update Function
def update(m, x, c, t, learning_rate):
    """Take one gradient-descent step for the line ``m * x + c``.

    The gradients are those of the summed squared error
    ``sum(((m * x + c) - t) ** 2)``. They are summed over the samples, not
    averaged, so the size of the step grows with the number of samples.

    Parameters:
        m: current slope.
        x: input values (numpy array or any array-like of numbers).
        c: current intercept.
        t: target values, same length as ``x``.
        learning_rate: factor applied to each gradient before it is subtracted.

    Returns:
        A tuple ``(m, c)`` with the updated slope and intercept.
    """
    #accept plain lists as well as numpy arrays
    x = np.asarray(x, dtype=float)
    t = np.asarray(t, dtype=float)
    #the residual is shared by both gradients, so compute it once
    residual = (m * x + c) - t
    grad_m = np.sum(2 * residual * x)
    grad_c = np.sum(2 * residual)
    m = m - grad_m * learning_rate
    c = c - grad_c * learning_rate
    return m, c

In [None]:
#Gradient Descent Function
def gradient_descent(init_m, init_c, x, t, learning_rate, iterations, error_threshold):
    """Fit the line ``m * x + c`` to ``t`` by repeated calls to ``update``.

    Each iteration first measures the current error with ``error``. If it is
    below ``error_threshold`` a message is printed and the loop stops without
    recording that error. Otherwise the error is recorded, one ``update`` step
    is taken and the new ``(m, c)`` pair is recorded.

    Parameters:
        init_m: starting slope.
        init_c: starting intercept.
        x: input values.
        t: target values.
        learning_rate: step size handed to ``update``.
        iterations: maximum number of update steps.
        error_threshold: stop early once the error falls below this value.

    Returns:
        A tuple ``(m, c, error_values, mc_values)``: the final slope and
        intercept, the list of errors measured before each step, and the list
        of ``(m, c)`` pairs produced by each step.
    """
    slope, intercept = init_m, init_c
    error_values = []
    mc_values = []
    for _ in range(iterations):
        current_error = error(slope, x, intercept, t)
        if current_error < error_threshold:
            print('Error less than the threshold. Stopping Gradient Descent.')
            break
        error_values.append(current_error)
        slope, intercept = update(slope, x, intercept, t, learning_rate)
        mc_values.append((slope, intercept))
    return slope, intercept, error_values, mc_values

In [None]:
%%time   
#time taken for computing the given number of iterations

init_m = 0.9
init_c = 0
learning_rate = 0.001
iterations = 250
error_threshold = 0.001


m, c, error_value, mc_values = gradient_descent(init_m, init_c, xtrain, ytrain, learning_rate, iterations, error_threshold)

In [None]:
#as the number of iterations increases, the change in the line becomes less noticeable,
#so keep only every 5th (m, c) pair to reduce the processing time for the animation
#slicing to the end (rather than to a hard-coded 250) keeps this correct if `iterations` is changed
mc_values_anim = mc_values[::5]

In [None]:
#scatter plot of the train dataset with the fitted line drawn on top
plt.scatter(xtrain, ytrain, color='b', label='training data')
plt.plot(xtrain, ((m * xtrain) + c), color='r', label='fitted line')
#label the figure so it can be read on its own; both axes are in normalised units
plt.xlabel('LSTAT (normalised)')
plt.ylabel('MEDV (normalised)')
plt.title('Fitted line on the training set')
plt.legend()
#plt.show() renders the figure and keeps the Line2D repr out of the cell output
plt.show()

In [None]:
#error recorded before each gradient-descent step, plotted against the step number
plt.plot(range(len(error_value)), error_value)
plt.xlabel('Iterations')
plt.ylabel('Error')

In [None]:
#predict on the test set with the fitted line, as one vectorized operation
predicted = m * xtest + c

In [None]:
#mean squared error between the test targets and the predictions (both still normalised)
mean_squared_error(y_true=ytest, y_pred=predicted)

In [None]:
#collect xtest, ytest and the predictions row by row in one data frame so that the
#predicted values can be read next to the testing set
p = pd.DataFrame(
    [*zip(xtest, ytest, predicted)],
    columns=['X', 'Target Y', 'Predicted Y'],
)
p.head()

In [None]:
#scatter plot of the test dataset with the predicted values drawn as a line
plt.scatter(xtest, ytest, color='b', label='test data')
plt.plot(xtest, predicted, color='r', label='predicted')
#label the figure so it can be read on its own; both axes are in normalised units
plt.xlabel('LSTAT (normalised)')
plt.ylabel('MEDV (normalised)')
plt.title('Predictions on the test set')
plt.legend()
#plt.show() renders the figure and keeps the Line2D repr out of the cell output
plt.show()

In [None]:
#reshape to the (n_samples, 1) column shape that is required by the scalers
predicted = predicted.reshape(-1, 1)
xtest = xtest.reshape(-1, 1)
ytest = ytest.reshape(-1, 1)

#undo the min-max scaling so the values are back in the units of the original columns
#(despite the *_scaled names, these hold the un-normalised values)
xtest_scaled = x_scaler.inverse_transform(xtest)
ytest_scaled = y_scaler.inverse_transform(ytest)
predicted_scaled = y_scaler.inverse_transform(predicted)


#this is to remove extra dimensions

xtest_scaled = xtest_scaled[:, -1]
ytest_scaled = ytest_scaled[:, -1]
predicted_scaled = predicted_scaled[:, -1]

p = pd.DataFrame(list(zip(xtest_scaled, ytest_scaled, predicted_scaled)), columns =['X', 'Target Y', 'Predicted Y'])
#DataFrame.round returns a new frame instead of modifying p, so a bare p.round(...) statement
#has no visible effect; chain it into the displayed expression to actually show rounded values
p.round(decimals = 2).head()