In [11]:
import pandas as pd
import numpy as np

# Load the data
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
df = pd.read_csv(url)

# Use only the specified columns
df_filtered = df[['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year', 'fuel_efficiency_mpg']].copy()

# Apply log transformation to the target variable (fuel_efficiency_mpg)
df_filtered['fuel_efficiency_mpg'] = np.log1p(df_filtered['fuel_efficiency_mpg'])

In [12]:
df_filtered.isnull().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

In [13]:
median_horsepower = df_filtered['horsepower'].median()
print(f"Median Horsepower: {median_horsepower}")

Median Horsepower: 149.0


In [14]:
# helper functions
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    XTX = X.T @ X
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv @ X.T @ y
    return w[0], w[1:]

def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])
    XTX = X.T @ X
    reg = r * np.eye(XTX.shape[0])
    XTX_reg = XTX + reg
    XTX_inv = np.linalg.inv(XTX_reg)
    w = XTX_inv @ X.T @ y
    return w[0], w[1:]

def rmse(y, y_pred):
    error = y - y_pred
    mse = (error ** 2).mean()
    return np.sqrt(mse)

# seed = 42
def split_data(df, seed):
    n = len(df)
    n_val = int(0.2 * n)
    n_test = int(0.2 * n)
    n_train = n - n_val - n_test

    np.random.seed(seed)
    idx = np.arange(n)
    np.random.shuffle(idx)

    df_shuffled = df.iloc[idx]
    df_train = df_shuffled.iloc[:n_train].copy()
    df_val = df_shuffled.iloc[n_train:n_train + n_val].copy()
    df_test = df_shuffled.iloc[n_train + n_val:].copy()
    
    y_train = df_train['fuel_efficiency_mpg'].values
    y_val = df_val['fuel_efficiency_mpg'].values
    y_test = df_test['fuel_efficiency_mpg'].values
    
    return df_train, df_val, df_test, y_train, y_val, y_test

# Perform initial split for Q3 and Q4
df_train, df_val, _, y_train, y_val, _ = split_data(df_filtered, seed=42)
features = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']

In [15]:
# fill with 0 
X_train_0 = df_train[features].fillna(0).values
X_val_0 = df_val[features].fillna(0).values

w0_0, w_0 = train_linear_regression(X_train_0, y_train)
y_pred_0 = w0_0 + X_val_0 @ w_0
rmse_0 = round(rmse(y_val, y_pred_0), 2)
print(f"RMSE with 0 fill: {rmse_0}")

# fill with mean
mean_hp = df_train['horsepower'].mean()

X_train_mean = df_train[features].fillna(mean_hp).values
X_val_mean = df_val[features].fillna(mean_hp).values

w0_mean, w_mean = train_linear_regression(X_train_mean, y_train)
y_pred_mean = w0_mean + X_val_mean @ w_mean
rmse_mean = round(rmse(y_val, y_pred_mean), 2)
print(f"RMSE with mean fill: {rmse_mean}")

RMSE with 0 fill: 0.04
RMSE with mean fill: 0.04


In [16]:
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
best_rmse = float('inf')
best_r = None

# testing the regularization values
for r in r_values:
    w0, w = train_linear_regression_reg(X_train_0, y_train, r=r)
    
    y_pred = w0 + X_val_0 @ w
    score = rmse(y_val, y_pred)
    
    rounded_score = round(score, 2)
    # print(f"r={r:5} | RMSE: {rounded_score}")
    
    if rounded_score <= round(best_rmse, 2):
        best_rmse = score
        best_r = r

print(f"\nBest r: {best_r}")


Best r: 100


In [17]:
seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
rmse_scores = []

for seed in seeds:
    df_train_s, df_val_s, _, y_train_s, y_val_s, _ = split_data(df_filtered, seed=seed)
    
    # Fill NAs with 0
    X_train_s = df_train_s[features].fillna(0).values
    X_val_s = df_val_s[features].fillna(0).values
    
    # Train (No regularization)
    w0, w = train_linear_regression(X_train_s, y_train_s)
    
    # Evaluate
    y_pred = w0 + X_val_s @ w
    score = rmse(y_val_s, y_pred)
    
    rmse_scores.append(score)

# Calculate standard deviation
std_rmse = np.std(rmse_scores)
print(f"Standard Deviation of RMSE scores: {round(std_rmse, 3)}")

Standard Deviation of RMSE scores: 0.001


In [18]:
# Split Data using seed 9
df_train_9, df_val_9, df_test_9, y_train_9, y_val_9, y_test_9 = split_data(df_filtered, seed=9)

# 
df_full_train = pd.concat([df_train_9, df_val_9])
y_full_train = np.concatenate([y_train_9, y_val_9])

# 
X_full_train = df_full_train[features].fillna(0).values
X_test = df_test_9[features].fillna(0).values

# train 
r_final = 0.001
w0, w = train_linear_regression_reg(X_full_train, y_full_train, r=r_final)

# eval 
y_pred_test = w0 + X_test @ w
rmse_test = rmse(y_test_9, y_pred_test)


print(f"RMSE on Test Set (r=0.001, seed=9): {round(rmse_test, 3)}")

RMSE on Test Set (r=0.001, seed=9): 0.039


In [19]:

y_test_original = np.expm1(y_test_9)


y_pred_log = w0 + X_test @ w
y_pred_original = np.expm1(y_pred_log)

rmse_test_final = rmse(y_test_original, y_pred_original)

print(f"Final RMSE (Original Scale): {round(rmse_test_final, 2)}")

Final RMSE (Original Scale): 0.61
