In [63]:
#############################################################################
# File: DS312_Project2_RachelNewman
# Author: Rachel Newman
# Purpose: Perform simple Linear Regression with multiple features. 
# Resources: Collaborated with Jerome Busquin for Question 2 and Question 3
#            for Project 2. The work for Question 1 and Question 4 is my
#            own.
############################################################################

In [15]:
# importing necessary libraries for homework
import os

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [13]:
## Location of the California Housing CSV. The default is the author's local
## copy; set the CALIFORNIA_HOUSING_CSV environment variable to point at your
## own copy so the notebook runs on another machine without editing the code.
HOUSING_CSV_PATH = os.environ.get(
    'CALIFORNIA_HOUSING_CSV',
    '/Users/rachi/OneDrive - Embry-Riddle Aeronautical University/JupyterNotebook/DS 312/Project 2/California_Housing_Data-1.csv',
)

## load the California Housing Data as a pandas dataframe
house = pd.read_csv(HOUSING_CSV_PATH)

## viewing the first 5 entries in the data frame
house.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,PRICE
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [19]:
## list the column names in the data: the 8 input features plus the target column 'PRICE'
house.columns

Index(['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup',
       'Latitude', 'Longitude', 'PRICE'],
      dtype='object')

In [23]:
## statistical summary of every column; the 'count' row of describe() counts the
## non-missing values per column, so it also shows whether any data is missing
house.describe()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,PRICE
count,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0,20640.0
mean,3.870671,28.639486,5.429,1.096675,1425.476744,3.070655,35.631861,-119.569704,2.068558
std,1.899822,12.585558,2.474173,0.473911,1132.462122,10.38605,2.135952,2.003532,1.153956
min,0.4999,1.0,0.846154,0.333333,3.0,0.692308,32.54,-124.35,0.14999
25%,2.5634,18.0,4.440716,1.006079,787.0,2.429741,33.93,-121.8,1.196
50%,3.5348,29.0,5.229129,1.04878,1166.0,2.818116,34.26,-118.49,1.797
75%,4.74325,37.0,6.052381,1.099526,1725.0,3.282261,37.71,-118.01,2.64725
max,15.0001,52.0,141.909091,34.066667,35682.0,1243.333333,41.95,-114.31,5.00001


In [29]:
## Separate the response from the predictors: PRICE is the value to predict and
## every other column is an input feature.
## NOTE(review): the task statement asks for the *first* 80% of the rows as the
## training set, while train_test_split shuffles rows by default -- confirm
## which split is intended.
target = house['PRICE'].to_numpy()
features = house.drop(columns='PRICE').to_numpy()

## 80:20 train/test split; the fixed random_state keeps the split identical on
## every run so the results below are repeatable
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=0,
)

In [31]:
## Standardize the features. The scaler's statistics are learned from the
## training split only and then reused unchanged for the test split, so no
## test-set information is used while fitting.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

# Question 1: Theoretical Foundations of Linear Regression

Please see the PDF submitted on Canvas for the solution to Question 1. I wrote out the solution by hand and submitted it that way!

# Question 2: Implement Simple Linear Regression Using Gradient Descent From Scratch

### Task Overview: You are given a CSV file with 9 columns. Your objective is to predict the "PRICE" (Median House Value), which is your target variable (y), using the other 8 columns as input features (x₁, x₂, ..., x₈). This dataset is part of the California Housing Price data. Use an 80:20 split for training and testing: the first 80% of the data will be used for training and the remaining 20% will be used as the test set.

#### 1. Manual Implementation: Create a simple linear regression model using gradient descent from scratch on the training data.

In [45]:
## creating a function for manual gradient descent calculation

n = 1000 # number of iterations
alpha = 0.01 # learning rate

def gradient_descent(x, y, alpha, n, log_every=100):
    """Fit a linear model y ~ x @ weight + bias with batch gradient descent.

    The loss being minimized is the mean squared error over all rows of ``x``.
    Weights and bias both start at zero and are updated once per iteration
    using the full data set.

    Parameters
    ----------
    x : array-like of shape (m, z)
        Feature matrix with m samples and z features.
    y : array-like holding m values
        Target values. A column vector of shape (m, 1) is flattened so that it
        cannot broadcast against the predictions into an (m, m) matrix.
    alpha : float
        Learning rate (step size of each update).
    n : int
        Number of gradient-descent iterations.
    log_every : int or None, default 100
        Print the MSE every ``log_every`` iterations (starting at iteration 0).
        Pass ``None`` or ``0`` to disable progress output.

    Returns
    -------
    weight : numpy.ndarray of shape (z,)
        Learned coefficient for each feature.
    bias : float
        Learned intercept.

    Raises
    ------
    ValueError
        If ``x`` is not 2-D, has no rows, or ``y`` does not have one value per
        row of ``x``.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float).ravel()
    if x.ndim != 2:
        raise ValueError(f'x must be 2-D (samples, features); got {x.ndim} dimension(s)')
    m, z = x.shape
    if m == 0:
        raise ValueError('x must contain at least one sample')
    if y.shape[0] != m:
        raise ValueError(f'y has {y.shape[0]} value(s) but x has {m} row(s)')

    weight = np.zeros(z) ## initializing weights (parameters) to 0
    bias = 0.0 ## setting bias to 0

    ## a dedicated loop variable keeps the parameter n (the iteration count) intact
    for iteration in range(n):
        ## residuals of the current model; reused by both gradients and the MSE
        y_pred = np.dot(x, weight) + bias
        residual = y - y_pred

        ## partial derivatives of the MSE with respect to w and b
        deriv_weight = -(2/m)*np.dot(x.T, residual)
        deriv_bias = -(2/m)*np.sum(residual)

        ## updating parameters w and b
        weight = weight - alpha*deriv_weight
        bias = bias - alpha*deriv_bias

        ## progress report: the MSE shown is the one measured *before* this update
        if log_every and iteration % log_every == 0:
            mse = np.mean(residual**2)
            print(f'Iteration # {iteration}: MSE = {mse}')
    return weight, bias

In [49]:
## fit the from-scratch model on the standardized training data
weight, bias = gradient_descent(x=x_train_scaled, y=y_train, alpha=alpha, n=n)

## report the fitted parameters, one per line
print(f'The learned weights are: {weight}', f'The learned bias is: {bias}', sep='\n')

Iteration # 0: MSE = 5.633623435203409
Iteration # 100: MSE = 0.707654556695311
Iteration # 200: MSE = 0.5953227827574167
Iteration # 300: MSE = 0.5752836766745015
Iteration # 400: MSE = 0.5617964918598384
Iteration # 500: MSE = 0.5519076263504196
Iteration # 600: MSE = 0.5446236282988052
Iteration # 700: MSE = 0.5392480453103402
Iteration # 800: MSE = 0.5352730428898466
Iteration # 900: MSE = 0.5323272953496105
The learned weights are: [ 0.83652297  0.14636836 -0.21640587  0.24153559  0.00229364 -0.03423635
 -0.68043857 -0.64926218]
The learned bias is: 2.072498955450924


#### 2. Prediction: Use this model to make predictions on the test set.

In [55]:
## apply the learned weights and bias to the standardized test features
y_pred_test = x_test_scaled @ weight + bias

#### 3. Calculate MAE: Evaluate the Mean Absolute Error (MAE) of your predictions on the test set.

In [60]:
## calculating MAE manually: the average of |actual - predicted| over the test set
mae_manual = np.mean(np.abs(y_test - y_pred_test))
print(f'The Mean Absolute Error on the test set is: {mae_manual}')

The Mean Absolute Error on the test set it: 0.5385695598072363


#### 4. Calculate R² Value: Compute the R² value on the training and test set.

In [72]:
## R^2 on the training set, computed by hand as 1 - (residual SS / total SS)
y_pred_train = x_train_scaled @ weight + bias

## residual sum of squares: squared errors of the model's predictions
train_res = np.square(y_train - y_pred_train).sum()
## total sum of squares: squared deviations of the targets from their mean
train_total = np.square(y_train - y_train.mean()).sum()

train_r2 = 1 - train_res / train_total
print(f'The R² on the training set is {train_r2}')

The R² on the training set is 0.6038924720454227


In [74]:
## R^2 on the test set, computed by hand as 1 - (residual SS / total SS)
## residual sum of squares: squared errors of the model's test predictions
test_res = np.square(y_test - y_pred_test).sum()
## total sum of squares: squared deviations of the test targets from their mean
test_total = np.square(y_test - y_test.mean()).sum()

test_r2 = 1 - test_res / test_total
print(f'The R² on the test set is {test_r2}')

The R² on the test set is 0.586569105392173


# Question 3: Linear Regression Using Scikit-Learn

#### 1. Library Implementation: Use the Scikit-Learn library to implement a simple linear regression model on the training data.

In [87]:
## build the scikit-learn linear regression model and fit it to the
## standardized training data in one step (fit returns the fitted estimator)
linear_regression = LinearRegression().fit(x_train_scaled, y_train)

## report the fitted coefficients and intercept, one per line
print(
    f'The learned weights (coefficients) are: {linear_regression.coef_}',
    f'The learned bias (intercept) is: {linear_regression.intercept_}',
    sep='\n',
)

The learned weights (coefficients) are: [ 0.82624793  0.1171006  -0.24891059  0.29038746 -0.00864349 -0.03056429
 -0.90042112 -0.87058566]
The learned bias (intercept) is: 2.072498958938836


#### 2. Predict and Compute MAE: Use the model to predict values on the test set and calculate the MAE.

In [90]:
## predictions from the scikit-learn model on the training and test splits
## (the error metrics are computed from these in the following cells)
y_pred_train_lr, y_pred_test_lr = (
    linear_regression.predict(split) for split in (x_train_scaled, x_test_scaled)
)

In [92]:
## MAE from sklearn.metrics for each split
MAE_train = mean_absolute_error(y_true=y_train, y_pred=y_pred_train_lr)
MAE_test = mean_absolute_error(y_true=y_test, y_pred=y_pred_test_lr)

## report training first, then test
print(f'The Mean Absolute Error (MAE) for the Training set is: {MAE_train}')
print(f'The Mean Absolute Error (MAE) for the Test set is: {MAE_test}')

The Mean Absolute Error (MAE) for the Training set is: 0.5308743544863755
The Mean Absolute Error (MAE) for the Test set is: 0.5351261336554506


#### 3. Calculate R² Value: Compute the R² value on the training and test set.

In [95]:
## R² from sklearn.metrics for each split
r2_train = r2_score(y_true=y_train, y_pred=y_pred_train_lr)
r2_test = r2_score(y_true=y_test, y_pred=y_pred_test_lr)

## report training first, then test
print(f'The R² value for the training set is: {r2_train}')
print(f'The R² value for the test set is: {r2_test}')

The R² value for the training set is: 0.6088968118672871
The R² value for the test set is: 0.5943232652466204


# Question 4: Comparative Analysis

#### 1. Compare the MAE and R² values obtained from the manual implementation (Question 2) and the scikit-learn implementation (Question 3).

In [110]:
## Side-by-side report of the Q2 (manual) and Q3 (scikit-learn) metrics.
## Each entry is printed on its own line; the ' \n' ending two of the entries
## leaves a blank line between the three groups.
print(
    ## MAE on the test data
    f'The MAE for the manual implementation is: {mae_manual}',
    f'The MAE for the scikit-learn implementation is: {MAE_test} \n',
    ## R² on the training data
    f'The R² value for the training manual implementation is: {train_r2}',
    f'The R² value for the training scikit-learn implementation is: {r2_train} \n',
    ## R² on the test data
    f'The R² value for the test manual implementation is: {test_r2}',
    f'The R² value for the test scikit-learn implementation is: {r2_test}',
    sep='\n',
)

The MAE for the manual implementation is: 0.5385695598072363
The MAE for the scikit-learn implementation is: 0.5351261336554506 

The R² value for the training manual implementation is: 0.6038924720454227
The R² value for the training scikit-learn implementation is: 0.6088968118672871 

The R² value for the test manual implementation is: 0.586569105392173
The R² value for the test scikit-learn implementation is: 0.5943232652466204


#### 2. Discuss which model achieves a lower MAE and a higher R² value, and provide reasons for these results

As seen in the comparison above, the scikit-learn model achieves the lower MAE on the test set: 0.5351, whereas the manual implementation of gradient descent has an MAE of 0.5386. The scikit-learn model also has the higher R² on both splits: 0.6089 versus 0.6039 on the training data, and 0.5943 versus 0.5866 on the test data. The reason is that scikit-learn's `LinearRegression` solves the Ordinary Least Squares problem directly, so it returns the weights that minimize the squared error on the training data, whereas the result of gradient descent depends on factors such as the learning rate and the number of iterations. Gradient descent may not reach the optimal solution if the learning rate and number of iterations are not properly tuned, and that is what happened here: with a learning rate of 0.01 and 1000 iterations, the training MSE was still decreasing at iteration 900, and the learned Latitude and Longitude weights (about -0.68 and -0.65) were still well short of the OLS values (about -0.90 and -0.87). Running more iterations, or using a larger well-tuned learning rate, would bring the manual model's MAE and R² closer to the scikit-learn results.