In [None]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
df=pd.read_csv('car_fuel_efficiency.csv')

In [None]:
df.head()

In [None]:
plt.figure()
sns.histplot(df.fuel_efficiency_mpg)
plt.title('Fuel Efficiency')
plt.show()

In [None]:
df.isnull().sum()

In [None]:
df['horsepower'].median()

In [None]:
np.random.seed(42)

n=len(df)
n

In [None]:
n_val=int(0.2*n)
n_test=int(0.2*n)
n_train=n-n_val-n_test
n_train,n_val,n_test

In [None]:
idx=np.arange(n)
np.random.shuffle(idx)
df_shuffled=df.iloc[idx]
df_train=df_shuffled.iloc[:n_train].copy()
df_val=df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test=df_shuffled.iloc[n_train+n_val:].copy()

In [None]:
df_train.head()

In [None]:
df_val.head()

In [None]:
df_test.head()

In [None]:
def train_linear_regression(X, y):
    """Fit ordinary least squares through the normal equation.

    A column of ones is prepended to ``X`` so the first solved weight acts as
    the intercept.

    Args:
        X: feature matrix with one row per observation.
        y: target values, one per row of ``X``.

    Returns:
        ``(w0, w)`` where ``w0`` is the intercept and ``w`` holds the
        remaining weights, one per column of ``X``.

    Raises:
        numpy.linalg.LinAlgError: if the Gram matrix cannot be inverted.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])

    gram = design.T @ design
    weights = np.linalg.inv(gram) @ design.T @ y

    intercept, coefficients = weights[0], weights[1:]
    return intercept, coefficients

In [None]:
def rmse(y, y_pred):
    """Return the root mean squared error between targets and predictions.

    Args:
        y: array of true values.
        y_pred: array of predicted values, same shape as ``y``.

    Returns:
        Square root of the mean of the squared differences.
    """
    squared_residuals = (y_pred - y) ** 2
    return np.sqrt(squared_residuals.mean())

In [None]:
# Numeric columns used as model features by the prepare_X* helpers.
base = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
]
base

In [None]:
def prepare_X_mean(df):
    """Build the feature matrix with mean imputation learned from the training set.

    Every column listed in ``base`` is filled with that column's own mean taken
    from the notebook-level ``df_train``. Filling every column with the
    horsepower mean would put a horsepower-scale value into any other feature
    that happens to contain a gap. The statistics always come from the
    training split, including when ``df`` is the validation or test frame, so
    no information from those splits enters the imputation.

    Args:
        df: DataFrame containing at least the columns listed in ``base``.

    Returns:
        2-D numpy array with one column per entry of ``base``.
    """
    df_num = df[base]
    # fillna with a Series aligns on column names: one fill value per feature.
    train_means = df_train[base].mean()
    df_num = df_num.fillna(train_means)
    return df_num.values

In [None]:
def prepare_X(df):
    """Build the feature matrix from the ``base`` columns, filling gaps with 0.

    Args:
        df: DataFrame containing at least the columns listed in ``base``.

    Returns:
        2-D numpy array with one column per entry of ``base``.
    """
    return df[base].fillna(0).values

In [None]:
# Pull the target out of each split. pop() returns the column and removes it
# from the frame in one step, so after this cell the three frames no longer
# carry the label. Re-running the cell requires re-running the split first.
y_train = df_train.pop('fuel_efficiency_mpg').values
y_val = df_val.pop('fuel_efficiency_mpg').values
y_test = df_test.pop('fuel_efficiency_mpg').values

In [None]:
X_train = prepare_X(df_train)
X_val = prepare_X(df_val)
X_test = prepare_X(df_test)

In [None]:
w_0, w = train_linear_regression(X_train, y_train)
y_pred=w_0+X_train.dot(w)

In [None]:
#fillna(0)
rmse(y_train, y_pred)

In [None]:
#null replaced with mean
X_train = prepare_X_mean(df_train)
w_0, w = train_linear_regression(X_train, y_train)
y_pred=w_0+X_train.dot(w)
rmse(y_train, y_pred)

In [None]:
# Regularized linear regression
def train_linear_regression_reg(X, y, r=0.0):
    """Fit linear regression via the normal equation with a diagonal penalty.

    A column of ones is prepended to ``X`` and ``r`` is added to every
    diagonal entry of the Gram matrix (the intercept entry included) before
    it is inverted.

    Args:
        X: feature matrix with one row per observation.
        y: target values, one per row of ``X``.
        r: amount added to the Gram matrix diagonal; 0.0 means no penalty.

    Returns:
        ``(w0, w)`` where ``w0`` is the intercept and ``w`` holds the
        remaining weights, one per column of ``X``.
    """
    design = np.column_stack([np.ones(X.shape[0]), X])
    gram = design.T @ design + r * np.eye(design.shape[1])
    weights = np.linalg.inv(gram) @ design.T @ y
    return weights[0], weights[1:]

r_values = [0, 0.01, 0.1, 1, 5, 10, 100]

# Build the matrices here instead of inheriting whatever X_train the previous
# cell left behind. Zero-imputation matches the seed-stability and final test
# cells further down.
X_train = prepare_X(df_train)
X_val = prepare_X(df_val)

# Score every candidate on the validation split. Training RMSE cannot rank
# the candidates: the unpenalized fit (r=0) minimizes training error by
# construction, so it would always come out on top.
for r in r_values:
    w_0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = w_0 + X_val.dot(w)

    score = rmse(y_val, y_pred)
    print(f"r={r:>6}: RMSE={score:.3f}")


In [None]:
def validation_rmse_for_seed(seed):
    """Return the validation RMSE of the zero-imputed model for one shuffle seed.

    The notebook-level ``df`` is shuffled with ``seed``, split into
    train / validation / test parts (validation and test take int(20%) each),
    an unregularized model is fitted on the training part and scored on the
    validation part. All intermediate frames are local, so the notebook-level
    ``df_train`` / ``y_train`` / ``X_train`` are left untouched.

    Args:
        seed: value passed to ``np.random.seed`` before shuffling.

    Returns:
        RMSE of the model on the validation part.
    """
    n = len(df)
    n_val = int(0.2 * n)
    n_test = int(0.2 * n)
    n_train = n - (n_val + n_test)

    np.random.seed(seed)
    idx = np.arange(n)
    np.random.shuffle(idx)
    shuffled = df.iloc[idx]

    train_part = shuffled.iloc[:n_train]
    val_part = shuffled.iloc[n_train:n_train + n_val]

    # Selecting only the `base` columns keeps the target out of the features.
    features_train = train_part[base].fillna(0).values
    features_val = val_part[base].fillna(0).values
    target_train = train_part['fuel_efficiency_mpg'].values
    target_val = val_part['fuel_efficiency_mpg'].values

    bias, weights = train_linear_regression(features_train, target_train)
    return rmse(target_val, bias + features_val.dot(weights))


seeds = range(10)
rmse_scores = [validation_rmse_for_seed(seed) for seed in seeds]

# Spread of the validation score across seeds: a small value means the result
# does not depend much on the particular split.
std_rmse = round(np.std(rmse_scores), 3)
print("RMSE scores for each seed:", rmse_scores)
print("Standard deviation:", std_rmse)


In [None]:
# Final evaluation: reshuffle with seed 9, refit on train + validation with
# r=0.001 and score once on the held-out test split.
# Uses `base`, `train_linear_regression_reg` and `rmse` from the earlier cells
# instead of redefining them here, so there is a single definition of each.
np.random.seed(9)
idx = np.arange(len(df))
np.random.shuffle(idx)
df_shuffled = df.iloc[idx]

n = len(df)
n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - n_val - n_test

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

# Train and validation rows are merged for the final fit; the test rows stay
# unseen until the scoring step below.
df_train_val = pd.concat([df_train, df_val], axis=0)

y_train_val = df_train_val['fuel_efficiency_mpg'].values
y_test = df_test['fuel_efficiency_mpg'].values

# Selecting only the `base` columns keeps the target out of the features.
X_train_val = df_train_val[base].fillna(0).values
X_test = df_test[base].fillna(0).values

w_0, w = train_linear_regression_reg(X_train_val, y_train_val, r=0.001)

y_pred_test = w_0 + X_test.dot(w)
rmse_test = rmse(y_test, y_pred_test)

print("Test RMSE:", rmse_test)
