## Regression Tasks

**IMPORTANT: Make sure to set random_state=42 when using sklearn.model_selection.train_test_split**

**Task 1: Ordinary Least Squares (OLS)**

In [1]:
# Step 1: Load the necessary libraries and the Boston Housing Prices dataset
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the Boston Housing Prices dataset
import pandas as pd
import requests

def load_boston_5420():
    """Download the Boston Housing spreadsheet and package it as a dataset dict.

    The workbook is fetched once over HTTP and parsed from the in-memory
    response body, so pandas does not download the same file a second time.
    The last column of the sheet is treated as the target; every other column
    is treated as a feature.

    Returns:
        dict | None: On HTTP 200, a dict with the keys ``'data'`` (feature
        values), ``'target'`` (target values), ``'feature_names'``,
        ``'target_name'`` and ``'DESCR'``. For any other status code a message
        is printed and ``None`` is returned.
    """
    from io import BytesIO  # local import keeps this cell self-contained

    url = "https://faculty.tuck.dartmouth.edu/images/uploads/faculty/business-analytics/Boston_Housing.xlsx"

    # A timeout keeps the notebook from hanging forever on a stalled connection.
    response = requests.get(url, timeout=30)

    if response.status_code != 200:
        print("Failed to download the dataset.")
        return None

    # Parse the bytes already downloaded instead of handing the URL to pandas,
    # which would trigger a second request for the same file.
    df = pd.read_excel(BytesIO(response.content))

    feature_names = df.columns[:-1].tolist()
    target_name = df.columns[-1]

    dataset = {
        'data': df[feature_names].values,
        'target': df[target_name].values,
        'feature_names': feature_names,
        'target_name': target_name,
        'DESCR': 'Boston Housing dataset for CS5420 @ MST'
    }

    return dataset

# Fetch the dataset once, then pull out the feature matrix and the target vector
boston = load_boston_5420()
X = boston['data']
y = boston['target']

In [3]:
# Step 2: Data preprocessing
# Hold out 30% of the rows for testing (70% train); random_state=42 fixes the shuffle
# (5 points)
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)



In [4]:
# Step 3: Standardize the feature values
# NOTE: the scaler is fit on the training split only and then reused to
# transform the test split, so no test-set statistics leak into the scaling
# (5 points)
scalar_obj = StandardScaler().fit(x_train)
x_train_scalar = scalar_obj.transform(x_train)
x_test_scalar = scalar_obj.transform(x_test)


In [5]:
# Step 4: Initialize and train the OLS linear regression model
# (7.5 points)
from sklearn.linear_model import LinearRegression

# Ordinary least squares on the standardized training features
linear_reg_obj = LinearRegression()
linear_reg_obj.fit(X=x_train_scalar, y=y_train)



In [6]:
# Step 5: Make predictions on the testing set and print the r2 score
# (7.5 points)
# Score the fitted OLS model on the held-out, standardized test features
prediction = linear_reg_obj.predict(x_test_scalar)
Linear_R2 = r2_score(y_true=y_test, y_pred=prediction)
print(f"R2: {Linear_R2}")

R2: 0.7112260057484932


**Task 2: Ridge Regression**

In [7]:
# Step 1: Import Ridge regression and the California Housing Prices dataset
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import Ridge


In [8]:
# Load the California Housing Prices dataset
# (rebinds X and y, replacing the Boston arrays used in Task 1)
data = fetch_california_housing()
X = data.data
y = data.target

In [9]:
# Split the dataset into training and testing sets (70% train, 30% test) with random_state=42
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Standardize the feature values: fit the scaler on the training split only,
# then apply the same scaling to both splits
scalar_obj = StandardScaler().fit(x_train)
x_train_scalar = scalar_obj.transform(x_train)
x_test_scalar = scalar_obj.transform(x_test)


In [10]:
from hashlib import new  # NOTE(review): not used in any visible cell; looks like an accidental auto-import — confirm and remove
# Step 2: Train the Ridge Regression model with different alpha values [0.001, 0.01, 0.1, 1, 10, 100] and store the best alpha in a variable
# based on mean_absolute_error
# (15 points)
alpha_values = [0.001, 0.01, 0.1, 1, 10, 100]

# Start the incumbent at +inf instead of an arbitrary threshold: the first
# candidate always becomes the incumbent, so best_alpha can never silently keep
# a placeholder value when every error happens to exceed the threshold.
best_alpha = None
best_mean_abs_error = float("inf")

# NOTE(review): candidates are ranked by MAE on the test split, so the score
# reported afterwards for the chosen alpha is optimistically biased; a
# validation split or cross-validation would avoid that.
for val in alpha_values:
    new_ridge_model = Ridge(alpha=val)
    new_ridge_model.fit(x_train_scalar, y_train)
    prediction = new_ridge_model.predict(x_test_scalar)
    mean_abs_error = mean_absolute_error(y_test, prediction)
    # Strict '<' keeps the earliest (smallest) alpha when two candidates tie.
    if mean_abs_error < best_mean_abs_error:
        best_alpha = val
        best_mean_abs_error = mean_abs_error


In [11]:

# Step 4: Re-train the Ridge Regression model with the best alpha value and print r2_score
# (5 points)
# fit() returns the estimator itself, so construction and training can be chained
ridge_model_2 = Ridge(alpha=best_alpha).fit(x_train_scalar, y_train)
prediction_2 = ridge_model_2.predict(x_test_scalar)
ridge_r2 = r2_score(y_true=y_test, y_pred=prediction_2)
print(f"R2 Score: {ridge_r2}")



R2 Score: 0.595944060491304


**Task 3: Lasso Regression**

In [12]:
# Import Lasso Regression
from sklearn.linear_model import Lasso

In [13]:
# Load the California Housing Prices dataset
# (reloaded here so the Lasso task starts from freshly bound X and y)
data = fetch_california_housing()
X = data.data
y = data.target

In [14]:
# Split the dataset into training and testing sets (70% train, 30% test) with random_state=42
# The first target must be x_train: binding it to any other name leaves the
# scaler below reading an x_train left over from an earlier cell (or failing
# with NameError on a fresh kernel).
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Standardize the feature values: fit on the training split only, then apply
# the same scaling to the test split
scalar_obj = StandardScaler()
x_train_scalar = scalar_obj.fit_transform(x_train)
x_test_scalar = scalar_obj.transform(x_test)


In [15]:
# Step 3: Train the Lasso Regression model with different alpha values [0.001, 0.01, 0.1, 1, 10, 100] and store the best alpha in a variable
# based on mean_absolute_error
# (15 points)

alpha_values = [0.001, 0.01, 0.1, 1, 10, 100]

# Start the incumbent at +inf instead of an arbitrary threshold: the first
# candidate always becomes the incumbent, so best_alpha can never silently fall
# back to a placeholder alpha of 0 when every error exceeds the threshold.
best_alpha = None
best_mean_abs_error = float("inf")

# NOTE(review): candidates are ranked by MAE on the test split, so the score
# reported afterwards for the chosen alpha is optimistically biased; a
# validation split or cross-validation would avoid that.
for val in alpha_values:
    new_lasso_model = Lasso(alpha=val)
    new_lasso_model.fit(x_train_scalar, y_train)
    prediction = new_lasso_model.predict(x_test_scalar)
    mean_abs_error = mean_absolute_error(y_test, prediction)
    # Strict '<' keeps the earliest (smallest) alpha when two candidates tie.
    if mean_abs_error < best_mean_abs_error:
        best_alpha = val
        best_mean_abs_error = mean_abs_error



In [16]:
# Step 4: Re-train the Lasso Regression model with the best alpha value and print r2_score
# (5 points)
# fit() returns the estimator itself, so construction and training can be chained
lasso_model_2 = Lasso(alpha=best_alpha).fit(x_train_scalar, y_train)
prediction_2 = lasso_model_2.predict(x_test_scalar)
lasso_r2 = r2_score(y_true=y_test, y_pred=prediction_2)
print(f"R2 Score: {lasso_r2}")



R2 Score: 0.5963975777208825
