In [90]:
import pandas as pd
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

In [91]:
df = pd.read_csv("car_fuel_efficiency.csv")

In [95]:
base = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year'] #, 'fuel_efficiency_mpg']
# columns = list(df.columns)
# for column in columns:
#     if column not in base:
#         del df[column]

# df.head()

In [38]:
df.isnull().sum()

engine_displacement      0
horsepower             708
vehicle_weight           0
model_year               0
fuel_efficiency_mpg      0
dtype: int64

In [49]:
median = df['horsepower'].median()
mode = df['horsepower'].mode()[0]
mean = df['horsepower'].mean()
median, mode, mean

(149.0, 152.0, 149.65729212983547)

In [25]:
# Prepare and split the dataset
# Shuffle the dataset (the filtered one you created above), use seed 42.
# Split your data in train/val/test sets, with 60%/20%/20% distribution.

In [60]:
n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.seed(2)
np.random.shuffle(idx)

df_train = df.iloc[idx[:n_train]]
df_val = df.iloc[idx[n_train:n_train+n_val]]
df_test = df.iloc[idx[n_train+n_val:]]
df_train.head(5)

y_val = df_val.fuel_efficiency_mpg.values
y_test = df_test.fuel_efficiency_mpg.values
y_train = df_train.fuel_efficiency_mpg.values

In [82]:
def train_linear_regression_reg(X, y, r=0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X]).round()
    XTX = X.T.dot(X) # gram matrix
    XTX = XTX + r*np.eye(XTX.shape[0])
    XTX_inv = np.linalg.inv(XTX)
    XTX.dot(XTX_inv).round(1)
    w_full = XTX_inv.dot(X.T).dot(y)
    return w_full[0], w_full[1:] # bias, weights

def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X]).round()
    XTX = X.T.dot(X) # gram matrix
    XTX_inv = np.linalg.inv(XTX)
    XTX.dot(XTX_inv).round(1)
    w_full = XTX_inv.dot(X.T).dot(y)
    return w_full[0], w_full[1:] # bias, weights

def rmse(y, y_pred):
    se = (y - y_pred) ** 2
    mse = se.mean()
    return np.sqrt(mse)

def prepare_X(df, fillna=0):
    df = df.copy() # not to change original dataframe
    features = base.copy()
    df_num = df[features]
    df_num = df_num.fillna(fillna)
    X = df_num.values
    return X

In [28]:
# We need to deal with missing values for the column from Q1.
# We have two options: fill it with 0 or with the mean of this variable.
# Try both options. For each, train a linear regression model without regularization using the code from the lessons.
# For computing the mean, use the training only!
# Use the validation dataset to evaluate the models and compare the RMSE of each option.
# Round the RMSE scores to 2 decimal digits using round(score, 2)
# Which option gives better RMSE?

In [83]:
X_train = prepare_X(df_train, 0)
w0, w = train_linear_regression(X_train, y_train)
X_val = prepare_X(df_val, 0)
y_pred = w0 + X_val.dot(w)
rmse(y_val, y_pred)

0.1194741653014412

In [84]:
# fillna(mean)
X_train = prepare_X(df_train, mean)
w0, w = train_linear_regression(X_train, y_train)
X_val = prepare_X(df_val, mean)
y_pred = w0 + X_val.dot(w)
rmse(y_val, y_pred)

0.1194741653014412

In [87]:
scores = []
for r in [0, 0.01, 0.1, 1, 5, 10, 100]:
    X_train = prepare_X(df_train, 0)
    w0, w = train_linear_regression(X_train, y_train)
    X_val = prepare_X(df_val, 0)
    y_pred = w0 + X_val.dot(w)
    score = round(rmse(y_val, y_pred), 2)
    scores.append((r, w0, score))
    
print(sorted(scores, key=lambda sublist: sublist[2]))

[(0, 7.10626865176468, 0.12), (0.01, 7.10626865176468, 0.12), (0.1, 7.10626865176468, 0.12), (1, 7.10626865176468, 0.12), (5, 7.10626865176468, 0.12), (10, 7.10626865176468, 0.12), (100, 7.10626865176468, 0.12)]


In [71]:
We used seed 42 for splitting the data. Let's find out how selecting the seed influences our score.
Try different seed values: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9].
For each seed, do the train/validation/test split with 60%/20%/20% distribution.
Fill the missing values with 0 and train a model without regularization.
For each seed, evaluate the model on the validation dataset and collect the RMSE scores.
What's the standard deviation of all the scores? To compute the standard deviation, use np.std.
Round the result to 3 decimal digits (round(std, 3))

In [96]:
n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - (n_val + n_test)

idx = np.arange(n)

seed_scores = []

df_copy = df.copy().fillna(0)

for seed in [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]:
    np.random.seed(seed)
    np.random.shuffle(idx)
    df_train = df_copy.iloc[idx[:n_train]]
    df_val = df_copy.iloc[idx[n_train:n_train+n_val]]
    df_test = df_copy.iloc[idx[n_train+n_val:]]
    
    y_val = df_val.fuel_efficiency_mpg.values
    y_test = df_test.fuel_efficiency_mpg.values
    y_train = df_train.fuel_efficiency_mpg.values

    del df_val['fuel_efficiency_mpg']
    del df_test['fuel_efficiency_mpg']
    del df_train['fuel_efficiency_mpg']
    
    X_train = prepare_X(df_train, 0)
    w0, w = train_linear_regression(X_train, y_train)
    X_val = prepare_X(df_val, 0)
    y_pred = w0 + X_val.dot(w)
    score = rmse(y_val, y_pred)
    seed_scores.append([seed, score])

print(seed_scores)

seed_scores = np.array(seed_scores)
second_column = seed_scores[:, 1]
std_dev = np.std(second_column)
round(std_dev, 3)

[[0, 0.5206537286913236], [1, 0.5228476803845532], [2, 0.5168352470562333], [3, 0.525671620680411], [4, 0.5213723193310092], [5, 0.5251441954856986], [6, 0.5255570098952508], [7, 0.5068442927589892], [8, 0.5200615380985912], [9, 0.5140857705833753]]


0.006

In [97]:
# Split the dataset like previously, use seed 9.
# Combine train and validation datasets.
# Fill the missing values with 0 and train a model with r=0.001.
# What's the RMSE on the test dataset?

n = len(df)
n_val = int(n * 0.2)
n_test = int(n * 0.2)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.seed(9)
np.random.shuffle(idx)

df_copy = df.copy().fillna(0)
df_train = df_copy.iloc[idx[:n_train]]
df_val = df_copy.iloc[idx[n_train:n_train+n_val]]
df_test = df_copy.iloc[idx[n_train+n_val:]]

y_val = df_val.fuel_efficiency_mpg.values
y_test = df_test.fuel_efficiency_mpg.values
y_train = df_train.fuel_efficiency_mpg.values

df_full_train = pd.concat([df_train, df_val])
X_full_train = prepare_X(df_full_train)
y_full_train = np.concatenate([y_train, y_val])

w0, w = train_linear_regression_reg(X_full_train, y_full_train, 0.001)
X_test = prepare_X(df_test)
y_pred = w0 + X_test.dot(w)
score = rmse(y_test, y_pred)
score

0.5156256135782993