In [18]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

In [10]:
# data preprocessing
test_df = pd.read_csv("test.csv")
train_df = pd.read_csv("train.csv")

cols_to_drop = ["id", "date", "zipcode"]

train_df = train_df.drop(columns=cols_to_drop, errors='ignore')
test_df = test_df.drop(columns=cols_to_drop, errors='ignore')

# scale price
train_df["price"] = train_df["price"] / 1000
test_df["price"] = test_df["price"] / 1000

X_train = train_df.drop(columns=["price"])
y_train = train_df["price"]

X_test = test_df.drop(columns=["price"])
y_test = test_df["price"]

# scale each feature so that the mean is 0, and the standard deviation is 1
scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [None]:
# training mlr
model = LinearRegression()
model.fit(X_train_scaled, y_train)

coef_table = pd.DataFrame({
    "Feature": X_train.columns,
    "Coefficient": model.coef_
})

print("\nFeature Coefficients:")
print(coef_table)




Feature Coefficients:
          Feature  Coefficient
0      Unnamed: 0     8.456024
1        bedrooms   -12.807339
2       bathrooms    18.456913
3     sqft_living    57.161582
4        sqft_lot    11.127338
5          floors     8.151038
6      waterfront    64.230911
7            view    47.610288
8       condition    12.647609
9           grade    92.511076
10     sqft_above    48.439051
11  sqft_basement    27.688812
12       yr_built   -68.043173
13   yr_renovated    17.341926
14            lat    78.129852
15           long    -1.437669
16  sqft_living15    45.479128
17     sqft_lot15   -12.906560

Training MSE: 31415.747916100863


In [15]:
y_train_pred = model.predict(X_train_scaled)

train_mse = mean_squared_error(y_train, y_train_pred)
print("\nTraining MSE:", train_mse)


Training MSE: 31415.747916100863


In [16]:
train_r2 = r2_score(y_train, y_train_pred)
print("Training R^2:", train_r2)

Training R^2: 0.7271450489303788


In [17]:
y_test_pred = model.predict(X_test_scaled)
test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print("\nTesting MSE:", test_mse)
print("Testing R^2:", test_r2)


Testing MSE: 58834.673978213956
Testing R^2: 0.6471195893437873


In [20]:
# add column of ones to design matrix 
X_train_np = X_train_scaled
y_train_np = y_train.values.reshape(-1, 1)
X_train_aug = np.hstack([np.ones((X_train_np.shape[0], 1)), X_train_np])

# closed form solution
theta = np.linalg.inv(X_train_aug.T @ X_train_aug) @ X_train_aug.T @ y_train_np


In [21]:
def predict(X_new_scaled, theta):
    """
    X_new_scaled : numpy array of shape (n_samples, n_features)
    theta        : numpy array of shape (n_features+1, 1)
    """
    # augment with col of 1s
    X_new_aug = np.hstack([np.ones((X_new_scaled.shape[0], 1)), X_new_scaled])
    return X_new_aug @ theta 


In [23]:
y_train_pred_manual = predict(X_train_scaled, theta)
y_test_pred_manual = predict(X_test_scaled, theta)

In [None]:
# compute metrics
train_mse_manual = mean_squared_error(y_train, y_train_pred_manual)
train_r2_manual = r2_score(y_train, y_train_pred_manual)

test_mse_manual = mean_squared_error(y_test, y_test_pred_manual)
test_r2_manual = r2_score(y_test, y_test_pred_manual)

print("Manual Linear Regression - Training MSE:", train_mse_manual)
print("Manual Linear Regression - Training R^2:", train_r2_manual)
print("Manual Linear Regression - Testing MSE:", test_mse_manual)
print("Manual Linear Regression - Testing R^2:", test_r2_manual)

Manual Linear Regression - Training MSE: 32380.706555494446
Manual Linear Regression - Training R^2: 0.7187641011637014
Manual Linear Regression - Testing MSE: 61130.916468816176
Manual Linear Regression - Testing R^2: 0.6333471157621372
