In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
import numpy as np


stock_prediction_data = pd.read_csv('/Users/shreyas/Desktop/ML/HW/hw3/stock_prediction_data.csv')
stock_price = pd.read_csv('/Users/shreyas/Desktop/ML/HW/hw3/stock_price.csv')

# Create a common index for merging
stock_prediction_data['Index'] = range(len(stock_prediction_data))
stock_price['Index'] = range(len(stock_price))


data = pd.merge(stock_prediction_data, stock_price, on='Index')
data.columns = data.columns.str.strip() 
X = data.drop(columns=[data.columns[-1]])
y = data[data.columns[-1]] 


X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)


scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)


linear_model = LinearRegression()
linear_model.fit(X_train_scaled, y_train)
y_val_pred_linear = linear_model.predict(X_val_scaled)
linear_mse = mean_squared_error(y_val, y_val_pred_linear)


poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train_scaled)
X_val_poly = poly.transform(X_val_scaled)

poly_model = LinearRegression()
poly_model.fit(X_train_poly, y_train)
y_val_pred_poly = poly_model.predict(X_val_poly)
poly_mse = mean_squared_error(y_val, y_val_pred_poly)


class LinearRegressionGD:
    def __init__(self, learning_rate=0.01, n_iterations=1000):
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations

    def fit(self, X, y):
        self.m, self.n = X.shape
        self.theta = np.zeros(self.n)
        self.bias = 0

        for _ in range(self.n_iterations):
            y_pred = np.dot(X, self.theta) + self.bias
            d_theta = (1 / self.m) * np.dot(X.T, (y_pred - y))
            d_bias = (1 / self.m) * np.sum(y_pred - y)

            self.theta -= self.learning_rate * d_theta
            self.bias -= self.learning_rate * d_bias

    def predict(self, X):
        return np.dot(X, self.theta) + self.bias

gd_model = LinearRegressionGD()
gd_model.fit(X_train_scaled, y_train)
y_val_pred_gd = gd_model.predict(X_val_scaled)
gd_mse = mean_squared_error(y_val, y_val_pred_gd)


X_train_b = np.c_[np.ones((X_train_scaled.shape[0], 1)), X_train_scaled]
theta_best = np.linalg.inv(X_train_b.T.dot(X_train_b)).dot(X_train_b.T).dot(y_train)
X_val_b = np.c_[np.ones((X_val_scaled.shape[0], 1)), X_val_scaled]
y_val_pred_closed = X_val_b.dot(theta_best)
closed_mse = mean_squared_error(y_val, y_val_pred_closed)


print(f"Linear Regression MSE (sklearn): {linear_mse}")
print(f"Polynomial Regression MSE (sklearn): {poly_mse}")
print(f"Gradient Descent MSE: {gd_mse}")
print(f"Closed Form Solution MSE: {closed_mse}")


if linear_mse < poly_mse:
    print("Linear regression is better for this situation.")
else:
    print("Polynomial regression is better for this situation.")


Linear Regression MSE (sklearn): 0.03607692317901041
Polynomial Regression MSE (sklearn): 0.07831879711096547
Gradient Descent MSE: 0.036133542359062056
Closed Form Solution MSE: 0.03607692317901008
Linear regression is better for this situation.
