In [2]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.metrics import mean_squared_error

# Load local CSV (make sure the file name matches exactly)
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
df = pd.read_csv(url)

# Keep only required columns
cols = ['engine_displacement','horsepower','vehicle_weight','model_year','fuel_efficiency_mpg']
df = df[cols].copy()

# Convert '?' to NaN and numeric
df['horsepower'] = pd.to_numeric(df['horsepower'].replace('?', np.nan))

# === Q1 ===
print("Q1. Missing value counts:")
print(df.isna().sum())

# === Q2 ===
print("\nQ2. Median horsepower (50th percentile):", df['horsepower'].median())

# --- Helpers ---
def split60_20_20(df, seed):
    df_shuffled = df.sample(frac=1, random_state=seed).reset_index(drop=True)
    n = len(df_shuffled)
    n_train = int(0.6 * n)
    n_val = int(0.2 * n)
    train = df_shuffled.iloc[:n_train].reset_index(drop=True)
    val = df_shuffled.iloc[n_train:n_train+n_val].reset_index(drop=True)
    test = df_shuffled.iloc[n_train+n_val:].reset_index(drop=True)
    return train, val, test

def train_and_rmse(X_train, y_train, X_val, y_val, r=0.0):
    if r == 0:
        model = LinearRegression()
    else:
        model = Ridge(alpha=r)
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    return mean_squared_error(y_val, preds, squared=False)

features = ['engine_displacement','horsepower','vehicle_weight','model_year']
target = 'fuel_efficiency_mpg'

# === Q3 ===
train, val, test = split60_20_20(df, 42)

# fill 0
train0 = train.fillna({'horsepower': 0})
val0 = val.fillna({'horsepower': 0})
rmse0 = train_and_rmse(train0[features], train0[target], val0[features], val0[target], 0)

# fill mean (from train)
mean_hp = train['horsepower'].mean()
train_mean = train.fillna({'horsepower': mean_hp})
val_mean = val.fillna({'horsepower': mean_hp})
rmse_mean = train_and_rmse(train_mean[features], train_mean[target],
                           val_mean[features], val_mean[target], 0)

print(f"\nQ3. RMSE (fill 0) = {round(rmse0,2)}, RMSE (fill mean) = {round(rmse_mean,2)}")

# === Q4 ===
r_values = [0, 0.01, 0.1, 1, 5, 10, 100]
rmse_r = {}
train0 = train.fillna({'horsepower': 0})
val0 = val.fillna({'horsepower': 0})
for r in r_values:
    rmse_r[r] = train_and_rmse(train0[features], train0[target],
                               val0[features], val0[target], r)
    print(f"r={r:<5} → RMSE={round(rmse_r[r],2)}")

best_r = min(r_values, key=lambda x: (round(rmse_r[x],2), x))
print("Q4. Best r:", best_r)

# === Q5 ===
rmses = []
for s in range(10):
    tr, va, te = split60_20_20(df, s)
    tr0 = tr.fillna({'horsepower': 0})
    va0 = va.fillna({'horsepower': 0})
    rmses.append(train_and_rmse(tr0[features], tr0[target], va0[features], va0[target], 0))
std_rmse = np.std(rmses)
print(f"\nQ5. RMSEs for seeds 0–9: {[round(x,3) for x in rmses]}")
print("Standard deviation (rounded to 3 decimals):", round(std_rmse,3))

# === Q6 ===
tr, va, te = split60_20_20(df, 9)
combined = pd.concat([tr, va], ignore_index=True)
combined0 = combined.fillna({'horsepower': 0})
test0 = te.fillna({'horsepower': 0})
rmse_test = train_and_rmse(combined0[features], combined0[target], test0[features], test0[target], 0.001)
print("\nQ6. Test RMSE (r=0.001, seed=9):", round(rmse_test,3))


Q1. Missing value counts:
engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

Q2. Median horsepower (50th percentile): 149.0


TypeError: got an unexpected keyword argument 'squared'