In [55]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [56]:
df = pd.read_csv('../car_fuel_efficiency.csv')
df

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.870990,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369
...,...,...,...,...,...,...,...,...,...,...,...
9699,140,5.0,164.0,2981.107371,17.3,2013,Europe,Diesel,Front-wheel drive,,15.101802
9700,180,,154.0,2439.525729,15.0,2004,USA,Gasoline,All-wheel drive,0.0,17.962326
9701,220,2.0,138.0,2583.471318,15.1,2008,USA,Diesel,All-wheel drive,-1.0,17.186587
9702,230,4.0,177.0,2905.527390,19.4,2011,USA,Diesel,Front-wheel drive,1.0,15.331551


In [7]:
dataset = df.loc[:, ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year', 'fuel_efficiency_mpg']]

Unnamed: 0,engine_displacement,horsepower,vehicle_weight,model_year,fuel_efficiency_mpg
0,170,159.0,3413.433759,2003,13.231729
1,130,97.0,3149.664934,2007,13.688217
2,170,78.0,3079.038997,2018,14.246341
3,220,,2542.392402,2009,16.912736
4,210,140.0,3460.870990,2009,12.488369
...,...,...,...,...,...
9699,140,164.0,2981.107371,2013,15.101802
9700,180,154.0,2439.525729,2004,17.962326
9701,220,138.0,2583.471318,2008,17.186587
9702,230,177.0,2905.527390,2011,15.331551


In [9]:
[col for col in dataset.columns if dataset[col].isnull().any()]

['horsepower']

In [16]:
dataset['horsepower'].dropna().median()

149.0

In [157]:
np.random.seed(42)

n = len(df)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

array([15.80435416, 14.27237387, 11.74844072, ..., 18.40443466,
       20.50246014, 16.81624056])

In [174]:
y_train = df_train.fuel_efficiency_mpg.values
y_val = df_val.fuel_efficiency_mpg.values
y_test = df_test.fuel_efficiency_mpg.values


del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']

In [175]:
def train_linear_regression(X, y):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [176]:
base = ['engine_displacement', 'horsepower', 'vehicle_weight', 'model_year']

In [177]:
def prepare_X(df):
    df_num = df[base]
    df_num = df_num.fillna(0)
    X = df_num.values
    return X

In [178]:
X_train = prepare_X(df_train)
w_0, w = train_linear_regression(X_train, y_train)

In [179]:
y_pred = w_0 + X_train.dot(w)

In [180]:
def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [181]:
rmse(y_train, y_pred)

np.float64(0.5219709782195826)

In [189]:
X_val = prepare_X(df_val)
y_pred_val = w_0 + X_val.dot(w)
round(rmse(y_val, y_pred_val), 2)

np.float64(0.51)

In [190]:
def prepare_X_mean(df, mean):
    df_num = df[base]
    df_num = df_num.fillna(mean)
    X = df_num.values
    return X

In [191]:
mean = df_train.horsepower.mean()
mean

np.float64(149.696587537092)

In [192]:
X_train_mean = prepare_X_mean(df_train, mean)
w_0_mean, w_mean = train_linear_regression(X_train_mean, y_train)

In [193]:
y_pred_mean = w_0_mean + X_train_mean.dot(w_mean)

In [194]:
rmse(y_train, y_pred_mean)

np.float64(0.4667165231665452)

In [195]:
X_val_mean = prepare_X_mean(df_val, mean)
y_pred_val_mean = w_0 + X_val_mean.dot(w)
round(rmse(y_val, y_pred_val_mean), 2)

np.float64(0.5)

Regularization

In [196]:
def train_linear_regression_reg(X, y, r=0.0):
    ones = np.ones(X.shape[0])
    X = np.column_stack([ones, X])

    XTX = X.T.dot(X)
    reg = r * np.eye(XTX.shape[0])
    XTX = XTX + reg

    XTX_inv = np.linalg.inv(XTX)
    w = XTX_inv.dot(X.T).dot(y)
    
    return w[0], w[1:]

In [197]:
X_train = prepare_X(df_train)
X_val = prepare_X(df_val)

for r in [0, 0.01, 0.1, 1, 5, 10, 100]:
    w_0, w = train_linear_regression_reg(X_train, y_train, r=r)
    y_pred = w_0 + X_val.dot(w)
    print('%6s' %r, round(rmse(y_val, y_pred),2))

     0 0.51
  0.01 0.51
   0.1 0.52
     1 0.52
     5 0.52
    10 0.52
   100 0.52


In [198]:
np.random.seed(9)

n = len(df)

n_val = int(0.2 * n)
n_test = int(0.2 * n)
n_train = n - (n_val + n_test)

idx = np.arange(n)
np.random.shuffle(idx)

df_shuffled = df.iloc[idx]

df_train = df_shuffled.iloc[:n_train].copy()
df_val = df_shuffled.iloc[n_train:n_train+n_val].copy()
df_test = df_shuffled.iloc[n_train+n_val:].copy()

y_test = df_test.fuel_efficiency_mpg.values

In [199]:
df_tv = pd.concat([df_train, df_val])

In [200]:
X_train = prepare_X(df_tv)
w_0, w = train_linear_regression_reg(X_train, y_tv, 0.001)

In [201]:
X_test = prepare_X(df_test)
y_pred_test = w_0 + X_test.dot(w)
rmse(y_test, y_pred_test)

np.float64(12.360086735370217)