# STEP 1: Data Preprocessing

### Importing the dataset

In [1]:
# Importing the libraries 
import pandas as pd
import numpy as np

import matplotlib as mpl
import matplotlib.pyplot as plt

from matplotlib.animation import FuncAnimation

from IPython.display import HTML

In [None]:
# Importing the libraries for Boston Housing dataset
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

In [None]:
# Load the Boston Housing dataset object; later cells read its
# .data, .target, .feature_names and .DESCR attributes.
# NOTE(review): load_boston has been deprecated and then removed in recent
# scikit-learn releases (1.2+, as far as I know) — confirm the pinned version.
boston = load_boston()

# Print the description text that ships with the dataset
print(boston.DESCR)

In [None]:
# Build the feature DataFrame: one column per name in boston.feature_names
features = pd.DataFrame(boston.data, columns=boston.feature_names)
# Preview the first rows only, rather than rendering the whole frame
features.head()

In [None]:
# Put the target values in their own single-column DataFrame
# (it is joined to the features in a later cell)
target = pd.DataFrame(boston.target, columns=['target'])
# Preview the first rows only, rather than rendering the whole frame
target.head()

In [None]:
# Largest target value (vectorized Series reduction; skips NaN instead of
# giving an order-dependent result like the builtin max would)
target['target'].max()

In [None]:
# Smallest target value (vectorized Series reduction; skips NaN instead of
# giving an order-dependent result like the builtin min would)
target['target'].min()

In [None]:
# Join features and target side by side into one DataFrame
# (axis=1 concatenates column-wise, aligning rows on the index)
df = pd.concat([features, target], axis=1)
# Preview the first rows only, rather than rendering the whole frame
df.head()

### Data Visualization

In [None]:
# Summary statistics for every column of df (features plus target)
df.describe()

In [None]:
# Count missing entries per column (isna is the same check as isnull)
df.isna().sum()

In [None]:
# Show only the rows that contain at least one missing value
has_missing = df.isna().any(axis=1)
df.loc[has_missing]

### Correlation between target and attributes

In [None]:
# Pearson correlation between every pair of columns in df
corr = df.corr('pearson')

# Absolute correlation of each feature with the target, in feature-column order
corrs = [abs(corr[attr]['target']) for attr in list(features)]

# Pair each correlation with its feature name: [(corr, feature), ...]
l = list(zip(corrs, list(features)))

# Sort the pairs in descending order of correlation (strongest first)
l.sort(key=lambda x: x[0], reverse=True)

# "Unzip" the sorted pairs back into two parallel tuples:
# zip(*l) turns [(c1, f1), (c2, f2), ...] into (c1, c2, ...), (f1, f2, ...).
# The star must be applied directly inside the call; wrapping it in an extra
# pair of parentheses, zip((*l)), is a SyntaxError.
corrs, labels = zip(*l)

# Bar chart of the absolute correlations, one bar per feature
index = np.arange(len(labels))
plt.figure(figsize=(15,5))
plt.bar(index, corrs, width=0.5)
plt.xlabel('Attributes')
plt.ylabel('Correlation with the target variable')
plt.xticks(index, labels)
plt.show()

### Normalization of Data

In [None]:
# Use LSTAT as the single input feature and the target column as the output
X = df['LSTAT'].values
Y = df['target'].values

# Scale each variable to [0, 1] with its own MinMaxScaler. The fitted scalers
# are kept because their inverse_transform maps values back to original units.
# The scaler works on 2-D input, hence reshape(-1, 1); [:, -1] takes the single
# column back out as a 1-D array.
x_scalar = MinMaxScaler()
X = x_scalar.fit_transform(X.reshape(-1,1))
X = X[:, -1]

# Fit the target scaler on Y itself. Fitting it on X would replace the target
# with a copy of the scaled feature and make the regression trivial.
y_scalar = MinMaxScaler()
Y = y_scalar.fit_transform(Y.reshape(-1,1))
Y = Y[:, -1]

# STEP 2: Defining the Error

#### MSE - Mean Squared Error

In [None]:
# defining the error function
def error(m,x,c,t):
    """Half mean squared error of the line ``m*x + c`` against targets ``t``.

    Parameters:
        m: slope of the line.
        x: array of inputs; ``x.size`` is used as the sample count N.
        c: intercept of the line.
        t: array of target values, matched element-wise with ``x``.

    Returns:
        sum((m*x + c - t)**2) / (2*N)
    """
    sample_count = x.size
    residuals = (m*x + c) - t
    squared_total = sum(residuals**2)
    return squared_total*1/(2*sample_count)

# STEP 3: Splitting the Data

#### Splitting data into fixed sets

In [None]:
# Hold out 20% of the samples for testing. random_state is fixed so the split,
# and every result computed from it, is the same on each Restart & Run All.
xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size = 0.2, random_state=42)

# STEP 4: Linear Regression / Training the Model

In [None]:
# defining the update function for gradient descent
def update(m,x,c,t,learning_rate):
    """Take one gradient-descent step for the line ``m*x + c``.

    Both gradients are computed from the current (m, c) before either is
    changed, and they are summed (not averaged) over the samples.

    Parameters:
        m, c: current slope and intercept.
        x, t: arrays of inputs and targets.
        learning_rate: factor applied to each summed gradient.

    Returns:
        The updated ``(m, c)`` pair.
    """
    residuals = (m*x + c) - t
    grad_m = sum(2*residuals*x)
    grad_c = sum(2*residuals)
    return m - grad_m*learning_rate, c - grad_c*learning_rate

In [None]:
# defining the gradient descent function
def gradient_descent(init_m, init_c,x,t,learning_rate, iterations, error_threshold):
    """Fit ``m*x + c`` to ``t`` by repeatedly calling ``update``.

    Each iteration first measures the current error. If it is below
    ``error_threshold`` a message is printed and the loop stops without
    recording that error; otherwise the error is recorded, one update step is
    taken, and the new (m, c) pair is recorded.

    Parameters:
        init_m, init_c: starting slope and intercept.
        x, t: arrays of inputs and targets.
        learning_rate: step size forwarded to ``update``.
        iterations: maximum number of update steps.
        error_threshold: early-stopping bound on ``error``.

    Returns:
        ``(m, c, error_values, mc_values)``: the final parameters, the list of
        errors measured before each step, and the list of (m, c) after each step.
    """
    slope, intercept = init_m, init_c
    error_history = []
    param_history = []
    for _ in range(iterations):
        current_error = error(slope, x, intercept, t)
        if current_error < error_threshold:
            print('Error less than the threshold. Stopping Gradient Descent')
            break
        error_history.append(current_error)
        slope, intercept = update(slope, x, intercept, t, learning_rate)
        param_history.append((slope, intercept))
    return slope, intercept, error_history, param_history

In [None]:
%%time
# Hyperparameters for the training run
init_m=0.9  # starting slope
init_c=0  # starting intercept
learning_rate=0.001  # step size passed through to update()
iterations=250  # maximum number of gradient-descent steps
error_threshold=0.00001  # stop early once error() falls below this

# Fit on the training split; returns the final (m, c) plus the recorded
# per-iteration error values and (m, c) pairs
m,c,error_values, mc_values=gradient_descent(init_m, init_c,xtrain,ytrain,learning_rate, iterations, error_threshold)


#### Visualization of the Training Model

In [None]:
# Keep every 5th recorded (m, c) pair as an animation frame. Slicing the whole
# list (instead of stopping at a hard-coded 250) keeps this correct if the
# number of training iterations is changed.
mc_values_anim=mc_values[::5]

In [None]:
# Figure with a single, initially empty, red line that each frame repositions
fig, ax = plt.subplots()
ln, = plt.plot([], [], 'ro-', animated=True)

def init():
    """Draw the static background: test points in green on [0, 1] x [0, 1] axes."""
    plt.scatter(xtest, ytest, color='g')
    ax.set_xlim(0, 1.0)
    ax.set_ylim(0, 1.0)
    return ln,

def update_frame(frame):
    """Move the line to the (m, c) pair stored at index ``frame``."""
    slope, intercept = mc_values_anim[frame]
    # Endpoints sit outside [0, 1] so the segment spans the whole visible x-range
    xs = [-0.5, 1.5]
    ys = [slope*-.5 + intercept, slope*1.5 + intercept]
    ln.set_data(xs, ys)
    return ln,

anim = FuncAnimation(
    fig,
    update_frame,
    frames=range(len(mc_values_anim)),
    init_func=init,
    blit=True,
)

# Last expression of the cell: embed the animation as an HTML5 video
HTML(anim.to_html5_video())

#### Visualization of the Learning Process

In [None]:
# Training points (blue) with the fitted line m*x + c (red)
plt.scatter(xtrain, ytrain, color='b')
plt.plot(xtrain, (m*xtrain+c), color='r')
# Label the axes so the figure can be read on its own
plt.xlabel('LSTAT (normalized)')
plt.ylabel('Target (normalized)')
plt.show()

In [None]:
# Error recorded before each gradient-descent step, against the step index
iteration_index = range(len(error_values))
plt.plot(iteration_index, error_values)
plt.ylabel('Error')
plt.xlabel('Iterations')

# STEP 5: Prediction

In [None]:
# Apply the fitted line (slope m, intercept c) to the held-out inputs
predicted=(m*xtest)+c

In [None]:
# Mean squared error between the held-out targets and the model's predictions
mean_squared_error(y_true=ytest, y_pred=predicted)

In [None]:
# Side-by-side table of input, true target and prediction, one row per test sample
p = pd.DataFrame(
    np.column_stack((xtest, ytest, predicted)),
    columns=['x','y_target','predicted_y'],
)
p.head()

#### Plotting the predicted values against the target values

In [None]:
# Held-out points (blue) with the model's predictions for them (red line)
plt.scatter(xtest, ytest, color='b')
plt.plot(xtest, predicted, color='r')
# Label the axes so the figure can be read on its own
plt.xlabel('LSTAT (normalized)')
plt.ylabel('Target (normalized)')
plt.show()

### Revert normalization to obtain the predicted price in $1000s

In [None]:
# The scalers are applied to 2-D input, so turn each vector into a single column
predicted = predicted.reshape(-1, 1)
xtest = xtest.reshape(-1, 1)
ytest = ytest.reshape(-1, 1)

# Undo the min-max scaling and take the single column back out as a 1-D array.
# Despite the "_scaled" suffix, these hold the inverse-transformed values.
xtest_scaled = x_scalar.inverse_transform(xtest)[:, -1]
ytest_scaled = y_scalar.inverse_transform(ytest)[:, -1]
predicted_scaled = y_scalar.inverse_transform(predicted)[:, -1]

# Table of input, true target and prediction, rounded to 2 decimals
p = pd.DataFrame({
    'x': xtest_scaled,
    'target_y': ytest_scaled,
    'predicted_y': predicted_scaled,
}).round(decimals=2)
p.head()