## Homework 2

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error

In [2]:
# Read the car fuel-efficiency CSV directly from its raw GitHub URL.
url = (
    "https://raw.githubusercontent.com/alexeygrigorev/datasets/"
    "master/car_fuel_efficiency.csv"
)
df = pd.read_csv(filepath_or_buffer=url)

In [4]:
# Keep only the four feature columns and the target column used below.
columns = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg',
]
df = df.loc[:, columns]

In [5]:
# Question 1: name of the first column that contains at least one missing value
null_counts = df.isna().sum()
columns_with_nulls = null_counts[null_counts > 0]
missing_col = columns_with_nulls.index[0]
print(f"Question 1: {missing_col}")

Question 1: horsepower


In [6]:
# Question 2: median horsepower, reported as the nearest answer choice
horsepower_median = df['horsepower'].median()
options = [49, 99, 149, 199]
# Distance of every choice from the median; the first smallest distance wins.
gaps = [abs(choice - horsepower_median) for choice in options]
closest_option = options[gaps.index(min(gaps))]
print(f"Question 2: {closest_option}")

Question 2: 149


In [7]:
# Prepare and split the dataset
def prepare_data(df, fill_method='zero', train_mean=None):
    """Return a copy of ``df`` with missing ``horsepower`` values imputed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a ``horsepower`` column. It is not modified.
    fill_method : {'zero', 'mean'}, default 'zero'
        ``'zero'`` replaces missing values with 0. ``'mean'`` replaces them
        with ``train_mean`` when it is given, otherwise with the mean of this
        frame's own ``horsepower`` column.
    train_mean : float, optional
        Pre-computed fill value for the ``'mean'`` strategy. Pass the mean of
        the training split when imputing validation/test frames so that their
        own statistics are not used for the fill.

    Returns
    -------
    pandas.DataFrame
        A new frame with the ``horsepower`` column imputed.

    Raises
    ------
    ValueError
        If ``fill_method`` is not ``'zero'`` or ``'mean'``. Previously an
        unknown method silently returned the data with NaNs still in place.
    """
    if fill_method == 'zero':
        fill_value = 0
    elif fill_method == 'mean':
        # Fall back to this frame's own mean only when no value was supplied.
        fill_value = train_mean if train_mean is not None else df['horsepower'].mean()
    else:
        raise ValueError(
            f"Unknown fill_method {fill_method!r}; expected 'zero' or 'mean'"
        )

    df_copy = df.copy()
    df_copy['horsepower'] = df_copy['horsepower'].fillna(fill_value)
    return df_copy

In [8]:
def split_data(df, seed=42):
    """Shuffle ``df`` and split it into train / validation / test frames.

    Validation and test each receive ``int(0.2 * len(df))`` rows; the
    training frame receives the remainder.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to split. It is not modified.
    seed : int, default 42
        Seed for the shuffle. The same seed always yields the same split.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(df_train, df_val, df_test)``, rows in shuffled order with their
        original index labels.
    """
    n = len(df)
    n_val = int(0.2 * n)
    n_test = int(0.2 * n)
    n_train = n - n_val - n_test

    # A private RandomState produces the same permutation as
    # np.random.seed(seed) + np.random.shuffle, but leaves the process-wide
    # NumPy RNG untouched, so calling this function no longer reseeds
    # unrelated random code in the notebook.
    rng = np.random.RandomState(seed)
    idx = np.arange(n)
    rng.shuffle(idx)

    df_train = df.iloc[idx[:n_train]]
    df_val = df.iloc[idx[n_train:n_train + n_val]]
    df_test = df.iloc[idx[n_train + n_val:]]

    return df_train, df_val, df_test

In [9]:
def prepare_X_y(df, target='fuel_efficiency_mpg'):
    """Split a frame into a feature matrix and a target vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and the ``target`` column.
    target : str, default 'fuel_efficiency_mpg'
        Name of the column to predict. Every other column is used as a
        feature, in the frame's column order.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X, y)`` where ``X`` holds all non-target columns and ``y`` holds
        the target column.
    """
    X = df.drop(target, axis=1).values
    y = df[target].values
    return X, y

In [10]:
# Shuffle with seed 42 and unpack the train / validation / test frames.
df_train, df_val, df_test = split_data(df=df, seed=42)

##### Question 3: Compare filling missing values with 0 vs mean

In [12]:
# Variant A: missing horsepower replaced by 0 in both train and validation.
df_train_zero = prepare_data(df_train, fill_method='zero')
df_val_zero = prepare_data(df_val, fill_method='zero')

# Feature matrices and targets for the zero-imputed frames.
X_train_zero, y_train_zero = prepare_X_y(df_train_zero)
X_val_zero, y_val_zero = prepare_X_y(df_val_zero)

In [13]:
# Variant B: missing horsepower replaced by the training-split mean.
# The same training mean is reused for the validation frame.
train_horsepower_mean = df_train['horsepower'].mean()
df_train_mean = prepare_data(df_train, fill_method='mean', train_mean=train_horsepower_mean)
df_val_mean = prepare_data(df_val, fill_method='mean', train_mean=train_horsepower_mean)

# Feature matrices and targets for the mean-imputed frames.
X_train_mean, y_train_mean = prepare_X_y(df_train_mean)
X_val_mean, y_val_mean = prepare_X_y(df_val_mean)

In [14]:
# Fit one plain linear regression per imputation variant and score each on
# its own validation set with RMSE.

# Zero-imputed variant.
model_zero = LinearRegression().fit(X_train_zero, y_train_zero)
y_pred_zero = model_zero.predict(X_val_zero)
mse_zero = mean_squared_error(y_val_zero, y_pred_zero)
rmse_zero = np.sqrt(mse_zero)

# Mean-imputed variant.
model_mean = LinearRegression().fit(X_train_mean, y_train_mean)
y_pred_mean = model_mean.predict(X_val_mean)
mse_mean = mean_squared_error(y_val_mean, y_pred_mean)
rmse_mean = np.sqrt(mse_mean)

In [15]:
# Zero-fill wins only when its RMSE is strictly lower; ties go to the mean.
answer_3 = "With 0" if rmse_zero < rmse_mean else "With mean"

print(f"Question 3: {answer_3}")

Question 3: With mean


##### Question 4: Regularized linear regression

In [17]:
# Regularization strengths to try, in increasing order.
r_values = [
    0,
    0.01,
    0.1,
    1,
    5,
    10,
    100,
]
# Validation RMSE collected for each strength.
rmse_scores = list()

In [18]:
# Re-create the accumulator in this cell so that re-running it cannot append a
# second batch of scores, which would push np.argmin past the end of r_values.
rmse_scores = []

# Fit a ridge model per regularization strength on the zero-imputed training
# data and record its validation RMSE, in the same order as r_values.
for r in r_values:
    # NOTE(review): scikit-learn advises against alpha=0 with Ridge for
    # numerical reasons (LinearRegression is the recommended equivalent);
    # kept as-is so the r=0 entry stays comparable with the others.
    model = Ridge(alpha=r)
    model.fit(X_train_zero, y_train_zero)
    y_pred = model.predict(X_val_zero)
    rmse = np.sqrt(mean_squared_error(y_val_zero, y_pred))
    rmse_scores.append(rmse)

In [19]:
# Position of the lowest validation RMSE (first one on ties) selects the r.
best_idx = int(np.argmin(rmse_scores))
best_r = r_values[best_idx]
print(f"Question 4: {best_r}")

Question 4: 0


##### Question 5: Test different seeds

In [20]:
# Shuffle seeds 0 through 9, one split per seed.
seeds = list(range(10))
# Validation RMSE collected for each seed.
rmse_scores_seeds = list()

In [21]:
# Re-create the accumulator in this cell so that re-running it does not stack
# a second set of scores on top of the first and distort the std below.
rmse_scores_seeds = []

# For every seed: re-split, zero-impute, fit an unregularized linear
# regression on the training part and record the validation RMSE.
for seed in seeds:
    df_train_seed, df_val_seed, df_test_seed = split_data(df, seed=seed)
    df_train_seed_zero = prepare_data(df_train_seed, 'zero')
    df_val_seed_zero = prepare_data(df_val_seed, 'zero')
    X_train_seed, y_train_seed = prepare_X_y(df_train_seed_zero)
    X_val_seed, y_val_seed = prepare_X_y(df_val_seed_zero)

    model_seed = LinearRegression()
    model_seed.fit(X_train_seed, y_train_seed)
    y_pred_seed = model_seed.predict(X_val_seed)
    rmse_seed = np.sqrt(mean_squared_error(y_val_seed, y_pred_seed))
    rmse_scores_seeds.append(rmse_seed)

In [22]:
# Spread of the validation RMSE across seeds, reported as the nearest choice.
std_rmse = np.std(rmse_scores_seeds)
std_options = [0.001, 0.006, 0.060, 0.600]
# Distance of every choice from the measured std; first smallest wins.
std_gaps = [abs(choice - std_rmse) for choice in std_options]
closest_std = std_options[std_gaps.index(min(std_gaps))]
print(f"Question 5: {closest_std}")

Question 5: 0.006


##### Question 6: Final test with r=0.001

In [23]:
# Split with seed 9, then merge train and validation into one fitting set.
df_train_q6, df_val_q6, df_test_q6 = split_data(df, seed=9)
df_combined = pd.concat((df_train_q6, df_val_q6), axis=0)

# Zero-impute the combined set and the test frame.
df_combined_zero = prepare_data(df_combined, fill_method='zero')
df_test_q6_zero = prepare_data(df_test_q6, fill_method='zero')

# Feature matrices and targets for fitting and for the final evaluation.
X_combined, y_combined = prepare_X_y(df_combined_zero)
X_test_q6, y_test_q6 = prepare_X_y(df_test_q6_zero)

In [24]:
# Ridge with alpha=0.001 fitted on train+validation, scored on the test split.
model_q6 = Ridge(alpha=0.001).fit(X_combined, y_combined)
y_pred_q6 = model_q6.predict(X_test_q6)
mse_q6 = mean_squared_error(y_test_q6, y_pred_q6)
rmse_q6 = np.sqrt(mse_q6)

In [25]:
# Report the test RMSE as the nearest of the offered answer choices.
q6_options = [0.15, 0.515, 5.15, 51.5]
# Distance of every choice from the measured RMSE; first smallest wins.
q6_gaps = [abs(choice - rmse_q6) for choice in q6_options]
closest_q6 = q6_options[q6_gaps.index(min(q6_gaps))]
print(f"Question 6: {closest_q6}")

Question 6: 0.515
