Assignment 5

Question 1

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error

# Read the startup data, then separate the predictors from the target column.
# 'State' is one-hot encoded with its first level dropped, so the remaining
# dummy columns are measured relative to that omitted state.
df = pd.read_csv('50_Startups.csv')
predictors = df.drop(columns='Profit')
X = pd.get_dummies(predictors, columns=['State'], drop_first=True)
y = df['Profit']

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit ordinary least squares on the training portion only.
model = LinearRegression().fit(X_train, y_train)

# Score the fitted model on the held-out rows.
y_pred = model.predict(X_test)
r2, mse = r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)

# Collect the fitted parameters, one labelled row per feature.
intercept = model.intercept_
coefficients = pd.DataFrame({'Coefficient': model.coef_}, index=X.columns)

print(f"R-squared: {r2}")
print(f"Mean Squared Error: {mse}")
print(f"Intercept: {intercept}")
print(coefficients)

R-squared: 0.8987266414328637
Mean Squared Error: 82010363.04430099
Intercept: 54028.03959364581
                 Coefficient
R&D Spend           0.805630
Administration     -0.068788
Marketing Spend     0.029855
State_Florida     938.793006
State_New York      6.987760


Question 2

In [5]:
import numpy as np
import pandas as pd

# Load and Encode
# 'State' is one-hot encoded (first level dropped) and everything is cast to
# float so the frame can be handed to NumPy as a plain numeric matrix.
df = pd.read_csv('50_Startups.csv')
df_encoded = pd.get_dummies(df, columns=['State'], drop_first=True).astype(float)
X = df_encoded.drop('Profit', axis=1).values
# Column vector (m, 1) so it lines up with the (m, 1) predictions during training.
y = df_encoded['Profit'].values.reshape(-1, 1)

# Feature Scaling
# Standardise each column to zero mean and unit (population) standard deviation
# so a single learning rate suits every feature during gradient descent.
X_mean = np.mean(X, axis=0)
X_std = np.std(X, axis=0)
# A constant column has zero spread; dividing by it would give 0/0 = NaN and
# poison every later computation. Use a unit scale there so the centred column
# simply stays at zero.
X_std[X_std == 0] = 1.0
X_scaled = (X - X_mean) / X_std

# Gradient Descent Function
def train_linear_regression(X, y, learning_rate=0.1, epochs=2000):
    """Fit a linear model ``y ~ X @ weights + bias`` with batch gradient descent.

    Every epoch uses all rows to compute the gradient of the half mean squared
    error and takes one step of size ``learning_rate`` against it. Parameters
    start at zero.

    Parameters
    ----------
    X : array-like of shape (m, n)
        Feature matrix with one row per sample.
    y : array-like of shape (m,) or (m, 1)
        Target values. A flat vector is reshaped to a column internally.
    learning_rate : float, default 0.1
        Step size applied to both the weight and bias gradients.
    epochs : int, default 2000
        Number of full-batch update steps.

    Returns
    -------
    weights : numpy.ndarray of shape (n, 1)
        Learned coefficient for each feature column.
    bias : float
        Learned intercept.

    Raises
    ------
    ValueError
        If ``X`` is not two-dimensional or ``y`` does not hold exactly one
        target per row of ``X``.
    """
    X = np.asarray(X, dtype=float)
    if X.ndim != 2:
        raise ValueError(f"X must be 2-D (samples, features); got {X.ndim}-D input")

    # Force a column vector: with a flat y, `y_pred - y` would broadcast
    # (m, 1) against (m,) into an (m, m) matrix instead of per-sample errors.
    y = np.asarray(y, dtype=float).reshape(-1, 1)

    m, n = X.shape
    if y.shape[0] != m:
        raise ValueError(
            f"X has {m} samples but y has {y.shape[0]} target values"
        )

    weights = np.zeros((n, 1))
    bias = 0.0

    for _ in range(epochs):
        y_pred = np.dot(X, weights) + bias
        error = y_pred - y

        # Gradients of (1 / (2m)) * sum(error ** 2) w.r.t. weights and bias
        dw = (1 / m) * np.dot(X.T, error)
        db = (1 / m) * np.sum(error)

        # Parameter Updates
        weights -= learning_rate * dw
        bias -= learning_rate * db

    return weights, bias

# Fit the gradient-descent model on the standardised features
weights, bias = train_linear_regression(X_scaled, y)

# In-sample predictions: the model is scored on the same rows it was trained on
y_pred = np.dot(X_scaled, weights) + bias

# Every metric below is derived from the same residual vector
residuals = y - y_pred
squared_residuals = residuals**2

mse = np.mean(squared_residuals)
rmse = np.sqrt(mse)
mae = np.mean(np.abs(residuals))

# R^2 = 1 - (unexplained variation / total variation around the mean)
ss_res = np.sum(squared_residuals)
ss_tot = np.sum((y - np.mean(y))**2)
r2 = 1 - (ss_res / ss_tot)

# Report the performance metrics and the learned intercept
print(f"R-squared (R2 Score): {r2:,.4f}")
print(f"Mean Squared Error (MSE): {mse:,.4f}")
print(f"Root Mean Squared Error (RMSE): {rmse:,.4f}")
print(f"Mean Absolute Error (MAE): {mae:,.4f}")
print(f"Final Bias (Intercept): {bias:,.4f}")

R-squared (R2 Score): 0.9508
Mean Squared Error (MSE): 78,406,792.8880
Root Mean Squared Error (RMSE): 8,854.7610
Mean Absolute Error (MAE): 6,475.5007
Final Bias (Intercept): 112,012.6392


Question 3

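Intercept Comparison: Both the custom gradient descent and scikit-learn implementations yielded an identical intercept of approximately $112,012.6392$. Note that this comparison assumes scikit-learn's `LinearRegression` is fitted on the same standardized, full dataset used in Question 2 (with zero-mean features the intercept equals the mean profit); the Question 1 cell instead fits unscaled features on an 80/20 train/test split, which is why its printed intercept ($54,028.04$) and coefficients differ from the values quoted here.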

R&D Spend Coefficient: The learned weight for R&D Spend is $36,626.4283$ in both models, identifying it as the most significant predictor of profit.

Marketing Spend Coefficient: Both implementations calculated a positive weight of $3,266.2152$ for marketing expenditures.

Administration Coefficient: The models consistently assigned a slight negative weight of $-748.9975$ to administration costs.

State Feature Coefficients: The coefficients for State_Florida ($92.7302$) and State_New York ($-19.8422$) matched across both methods with a negligible difference of less than $10^{-11}$.

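Performance Alignment: Both approaches achieved an identical $R^{2}$ score of $0.9508$ and a Mean Squared Error of approximately $78,406,792.89$. These figures are computed on the full dataset used for training in Question 2, so they are not directly comparable to the held-out test-set $R^{2}$ of $0.8987$ reported in Question 1.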