In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.metrics import r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

%matplotlib inline

In [None]:
class LinearRegression:
    """Linear regression trained with batch gradient descent.

    The instance stores the training hyper-parameters (maximum iteration
    count and convergence tolerance) together with the location of the CSV
    dataset. The fitted weights are kept in ``self.theta`` as a column vector
    whose first entry multiplies the bias column of ones.
    """

    def __init__(self,num_iter,strUrl,strDelim,tolerance):
        """Store the configuration; no I/O happens here.

        Parameters
        ----------
        num_iter : int
            Maximum number of gradient-descent iterations.
        strUrl : str
            Google Drive sharing URL; the file id is read from the
            second-to-last ``/``-separated segment.
        strDelim : str
            Field delimiter of the CSV file.
        tolerance : float
            Convergence threshold on the mean absolute weight change.
        """
        self.num_iter = num_iter
        self.strUrl = strUrl
        self.strDelim = strDelim
        self.tolerance = tolerance
        # Weights are created by gradient_descent(); None means "not fitted yet".
        self.theta = None

    @staticmethod
    def _as_column(values):
        """Return ``values`` as an ndarray, turning a 1-D vector into shape (n, 1).

        Without this, subtracting a 1-D target from the (n, 1) prediction
        broadcasts to an (n, n) matrix and silently corrupts costs/gradients.
        Inputs that are already 2-D are returned with their shape unchanged.
        """
        arr = np.asarray(values)
        if arr.ndim == 1:
            arr = arr.reshape(-1, 1)
        return arr

    def data_preprocessing(self):
        """Download, clean, split and standardise the dataset.

        Returns
        -------
        tuple
            ``(X_train, Y_train, X_test, Y_test)`` as numpy arrays. The X
            arrays carry a leading column of ones for the bias term.
        """
        #Read the CSV file from Google Drive and create the data frame
        file_id = self.strUrl.split('/')[-2]
        dataFrame = pd.read_csv('https://drive.google.com/uc?export=download&id=' + file_id, encoding = 'unicode_escape', delimiter = self.strDelim)

        # Drop rows with any missing value, drop duplicate rows, then rebuild
        # the index. `thresh` is deliberately not passed: pandas 2.x raises
        # TypeError when both `how` and `thresh` are supplied.
        dataFrame = (
            dataFrame
            .dropna(axis = 0, how = 'any')
            .drop_duplicates()
            .reset_index(drop = True)
        )

        # Convert Functioning Day value as 'Yes': 1, 'No' : 0
        dataFrame['Functioning Day'] = dataFrame['Functioning Day'].map({'Yes': 1, 'No' : 0})

        # Convert Holiday value as 'Holiday': 1, 'No Holiday' : 0
        dataFrame['Holiday'] = dataFrame['Holiday'].map({'Holiday': 1, 'No Holiday' : 0})

        # Convert Seasons values as 'Winter' : 1, 'Spring' : 2, 'Summer' : 3, 'Autumn' : 4
        dataFrame['Seasons'] = dataFrame['Seasons'].map({'Winter' : 1, 'Spring' : 2, 'Summer' : 3, 'Autumn' : 4})

        # Features are the columns at positions 2..8, the target is the column
        # at position 1.
        # NOTE(review): selection is positional; confirm against the CSV header.
        X = dataFrame.iloc[:, [2, 3, 4, 5, 6, 7, 8]]
        Y = dataFrame.iloc[:, [1]]

        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state=5)

        # Scaling the values. Each scaler is fitted on the training split only
        # and then applied to the test split, so test data is standardised
        # with the training statistics and does not leak into the scaling.
        x_scaler = StandardScaler()
        y_scaler = StandardScaler()
        X_train = x_scaler.fit_transform(X_train)
        Y_train = y_scaler.fit_transform(Y_train)

        X_test = x_scaler.transform(X_test)
        Y_test = y_scaler.transform(Y_test)

        #Adding a column of 1 in the beginning
        X_train = np.hstack((np.ones((len(X_train),1)),X_train))
        X_test = np.hstack((np.ones((len(X_test),1)),X_test))

        return X_train,Y_train,X_test,Y_test

    def compute_cost(self,X,y):
        """Return the halved mean squared error of the current weights.

        Err = [Sum((Hx(theta)-Y_expected)^2)]/(2*number_of_training_examples)
        """
        err = np.subtract(np.matmul(X,self.theta),self._as_column(y))
        return (np.sum(np.square(err)))/(2*len(X))

    def compute_variance(self, X_test, Y_test):
        """Return the variance of the residuals (prediction - target)."""
        Y_predict = np.matmul(X_test,self.theta)
        err = np.subtract(Y_predict,self._as_column(Y_test))
        return (np.var(err))

    def is_converged(self,oldTheta):
        """Return True when the mean absolute weight change is below tolerance."""
        diff = np.absolute(np.subtract(self.theta,oldTheta))
        meanDiff = np.mean(diff)
        return meanDiff < self.tolerance

    def gradient_descent(self,X_train,y_train,alpha):
        """Fit ``self.theta`` by batch gradient descent with learning rate ``alpha``.

        Weights start at zero on every call. Iteration stops after
        ``self.num_iter`` steps or as soon as ``is_converged`` reports True.

        Returns
        -------
        tuple
            ``(hist_cost, hist_theta, n_iter)``: the training cost after each
            update, the weights before each update, and the number of
            iterations actually performed.
        """
        X_train = np.asarray(X_train)
        y_train = self._as_column(y_train)

        hist_cost = []
        hist_theta = []

        #initializing theta values with 0 shape : (number_of_attributes+1,1)
        self.theta = np.zeros((X_train.shape[1],1))

        n_iter = 0

        #iterating at most num_iter times and adjusting weights
        for _ in range(self.num_iter):
            #saving old theta to check convergence and for the history
            oldTheta = self.theta
            hist_theta.append(oldTheta)
            #residual of the current hypothesis
            residual = np.matmul(X_train,self.theta) - y_train
            #calculating gradient
            gradient = np.matmul(X_train.transpose(),residual)/len(X_train)
            #updating weights (a new array, so oldTheta keeps the previous values)
            self.theta = self.theta - (alpha*gradient)
            #saving the costs for analysis
            hist_cost.append(self.compute_cost(X_train,y_train))
            n_iter = n_iter + 1
            #breaking the loop if converged
            if self.is_converged(oldTheta):
                break
        #returning historical cost and theta values
        return hist_cost,hist_theta,n_iter

    def eval_model(self,X_test,y_test):
        """Evaluate the fitted weights on a data set.

        Returns
        -------
        tuple
            ``(rmse, y_predict, variance_val)``: root mean squared error,
            predictions of shape (n, 1), and the residual variance.
        """
        #calculating root mean squared error
        cost_test = np.sqrt(2*self.compute_cost(X_test,y_test))
        #returning y_predicted
        y_predict = np.matmul(X_test,self.theta)
        variance_val = self.compute_variance(X_test,y_test)
        return cost_test,y_predict,variance_val

if __name__ == '__main__':
    # Learning-rate sweep: train once per candidate alpha, keep the alpha with
    # the lowest cost on the held-out split, then retrain with it and report
    # the metrics and learning curve.

    datasetUrl = 'https://drive.google.com/file/d/1M9KEyuwehbqOur2CKwN8wQL7DJvMhFTX/view?usp=sharing'
    delimeter = ','

    tolerance = 0.00001
    num_iter = 5000

    #creating an object which will help to find the best alpha
    model = LinearRegression(num_iter,datasetUrl,delimeter,tolerance)
    #preprocessing data
    X_train,y_train,X_test,y_test = model.data_preprocessing()
    #to store historical values
    hist_alpha = []
    hist_error = []
    #alpha with the minimum error
    res_alpha = 0.0
    #minimum error - declaring it as large value for better comparison
    min_error = 1e9

    #checking 299 candidate alphas: 0.001, 0.002, ..., 0.299
    # NOTE(review): alpha is selected using the test split, so the metrics
    # reported below are measured on the same data used for selection.
    for i in range(1, 300):
        #obtaining small alpha by dividing it by 1000
        alpha = i/1000
        #using gradient descent with the above specified alpha
        model.gradient_descent(X_train,y_train,alpha)
        #cost on the held-out split, computed once and reused below
        test_error = model.compute_cost(X_test,y_test)
        #saving alpha and error to plot later
        hist_alpha.append(alpha)
        hist_error.append(test_error)
        #if error computed is smaller than min_error, update both min_error and res_alpha
        if test_error < min_error:
            min_error = test_error
            res_alpha = alpha

    print("Min Alpha: ",res_alpha)
    print('\n')
    #plotting alpha vs all error values to see the trend
    plt.plot(hist_alpha,hist_error)
    plt.title('Alpha vs Error for Trial')
    plt.xlabel('Alpha')
    plt.ylabel('Error')
    plt.show()

    #performing gradient descent for res_alpha
    hist_cost,hist_theta,total_iter = model.gradient_descent(X_train,y_train,res_alpha)

    #evaluating model and fetching rmse and predicted output value
    rmse,y_predict,variance_val = model.eval_model(X_test,y_test)

    #calculating r2_value
    r2_value = r2_score(y_test,y_predict)

    #output
    print('\nRMSE: ',rmse)
    print('\nR2 Value: ',r2_value)
    print('\nVariance Value: ',variance_val)
    print('\n')
    print('Total Iteration: ',total_iter)
    print('\n')

    #training cost per iteration for the selected alpha
    plt.plot(hist_cost)
    plt.title('Error vs Iteration Alpha = ' + str(res_alpha))
    plt.xlabel('Iteration')
    plt.ylabel('Error')
    plt.show()

In [None]:
#Trials
# Runs gradient descent for paired (alpha, tolerance) settings and collects
# the iteration count, RMSE, R2 and residual variance of each run.

dis_alpha = [0.0001,
0.0002,
0.001,
0.002,
0.01,
0.1,
0.15,
0.25,
0.3,
0.8]

dis_tolerance = [0.0000001,
0.0000001,
0.000001,
0.00001,
0.0001,
0.001,
0.001,
0.001,
0.001,
0.001]

# The two lists are consumed pairwise, so they must have the same length.
assert len(dis_alpha) == len(dis_tolerance), "each alpha needs a matching tolerance"

datasetUrl = 'https://drive.google.com/file/d/1M9KEyuwehbqOur2CKwN8wQL7DJvMhFTX/view?usp=sharing'
delimeter = ','
max_iter = 10000

hist_total_iter = []
hist_rmse = []
hist_r2 = []
hist_variance = []

# Preprocessing reads only the URL and delimiter and splits with a fixed
# random_state, so it does not depend on alpha or tolerance. Do it once here
# instead of downloading and re-splitting the CSV on every trial.
X_train,y_train,X_test,y_test = LinearRegression(max_iter,datasetUrl,delimeter,dis_tolerance[0]).data_preprocessing()

for trial_alpha, trial_tolerance in zip(dis_alpha, dis_tolerance):
    model = LinearRegression(max_iter,datasetUrl,delimeter,trial_tolerance)
    hist_cost,hist_theta,total_iter = model.gradient_descent(X_train,y_train,trial_alpha)
    rmse,y_predict,variance_val = model.eval_model(X_test,y_test)
    r2_value = r2_score(y_test,y_predict)
    hist_total_iter.append(total_iter)
    hist_rmse.append(rmse)
    hist_r2.append(r2_value)
    hist_variance.append(variance_val)

print("\n RMSE Values : ")
print(hist_rmse)
print("\n R2 Values : ")
print(hist_r2)
print("\nIteration Values : ")
print(hist_total_iter)
print("\nVariance values : ")
print(hist_variance)