# Fuel Efficiency Prediction

This dataset, auto-mpg, is a well known efficiency predictor benchmark dataset for cars. We will train a neural network to predict over this with a few simplistic assumptions.

In [9]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.functional import one_hot

In [3]:
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'
column_names = ['MPG', 'Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration', 'Model Year', 'Origin']
df =  pd.read_csv(url, names=column_names, na_values="?", comment='\t', sep=" ", skipinitialspace=True)
print(df.info())

df = df.dropna()
df = df.reset_index(drop=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MPG           398 non-null    float64
 1   Cylinders     398 non-null    int64  
 2   Displacement  398 non-null    float64
 3   Horsepower    392 non-null    float64
 4   Weight        398 non-null    float64
 5   Acceleration  398 non-null    float64
 6   Model Year    398 non-null    int64  
 7   Origin        398 non-null    int64  
dtypes: float64(5), int64(3)
memory usage: 25.0 KB
None


In [8]:
df_train, df_test = train_test_split(df, test_size=0.2)
train_stats = df_train.describe().transpose()
numeric_column_names = ['Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration']

df_train_std, df_test_std = df_train.copy(), df_test.copy();

for col_name in numeric_column_names:
    mean = train_stats.loc[col_name, 'mean']
    std = train_stats.loc[col_name, 'std']

    df_train_std.loc[:, col_name] = (df_train_std.loc[:, col_name] - mean) / std;
    df_test_std.loc[:, col_name] = (df_test_std.loc[:, col_name] - mean) / std;

print(df_train_std.tail())

      MPG  Cylinders  Displacement  ...  Acceleration  Model Year  Origin
36   18.0   0.295098      0.334553  ...     -0.002855          71       1
351  32.4  -0.859473     -0.822623  ...      0.461806          81       3
28    9.0   1.449668      1.006461  ...      1.069440          70       1
283  17.0   1.449668      1.015793  ...     -0.038598          79       1
164  13.0   1.449668      0.987797  ...     -1.253865          75       1

[5 rows x 8 columns]


 -0.8594726  -0.8594726  -0.8594726  -0.8594726  -0.8594726  -0.8594726
  1.44966838 -0.8594726  -0.8594726  -0.8594726   0.29509789 -0.8594726
  0.29509789  1.44966838  1.44966838  1.44966838 -0.8594726  -0.8594726
  0.29509789  1.44966838 -0.8594726   0.29509789  0.29509789 -0.8594726
  0.29509789  1.44966838  0.29509789  1.44966838 -0.8594726  -1.43675785
 -0.28218736  0.29509789  1.44966838  1.44966838  1.44966838  0.29509789
  0.29509789 -0.8594726  -0.8594726  -0.8594726   0.29509789  1.44966838
  1.44966838  1.44966838 -0.8594726  -0.8594726   1.44966838 -0.8594726
 -0.8594726  -0.8594726   0.29509789 -0.8594726   0.29509789  1.44966838
 -0.8594726   1.44966838  0.29509789 -0.8594726  -0.8594726   1.44966838
  0.29509789 -0.8594726   1.44966838  1.44966838  1.44966838 -0.8594726
  1.44966838 -0.8594726   0.29509789  1.44966838 -0.8594726   0.29509789
 -0.8594726   1.44966838  1.44966838  0.29509789 -0.8594726  -0.8594726
  0.29509789  1.44966838 -0.8594726  -0.8594726  -0.859472

In [16]:
# The years from the tail print are littered between the 70s and early 80s
# As an example, we can bucket them

boundaries = torch.tensor([72, 74, 76, 78, 80]) # The buckets are < 72, 72 -74, 74 - 76 ... >= 80. torch uses boundaries with (lower, upper] by default
v = torch.tensor(df_train_std['Model Year'].values)
# right=False here sets the buckets to be inclusive of the lower bound and exclusive of the upper bound i.e. [lower, upper)
df_train_std['Model Year Bucket'] = torch.bucketize(v, boundaries=boundaries, right=False)
v = torch.tensor(df_test_std['Model Year'].values)
df_test_std['Model Year Bucket'] = torch.bucketize(v, boundaries=boundaries, right=False)
numeric_column_names.append('Model Year Bucket')


# We can also convert inputs to a one-hot encoded format directly with torch for categorical data
total_origin = len(set(df_train_std['Origin']))
origin_encoded = one_hot(torch.from_numpy(df_train_std['Origin'].values) % total_origin)

x_train_numeric = torch.tensor(df_train_std[numeric_column_names].values)
x_train = torch.cat([x_train_numeric, origin_encoded], 1).float()

origin_encoded = one_hot(torch.from_numpy(df_test_std['Origin'].values) % total_origin)

x_test_numeric = torch.tensor(df_test_std[numeric_column_names].values)
x_test = torch.cat([x_test_numeric, origin_encoded], 1).float()


# Convert the results to torch tensors
y_train = torch.tensor(df_train_std['MPG'].values).float()
y_test = torch.tensor(df_test_std['MPG'].values).float()

In [17]:
train_ds = TensorDataset(x_train, y_train)
batch_size = 8
torch.manual_seed(42)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

In [22]:
# build the model

hidden_units = [8, 4]
input_size = x_train.shape[1]
all_layers = []

for hidden_unit in hidden_units:
    layer = nn.Linear(input_size, hidden_unit)
    all_layers.append(layer)
    all_layers.append(nn.ReLU())
    input_size = hidden_unit

all_layers.append(nn.Linear(hidden_units[-1], 1))

model = nn.Sequential(*all_layers)

loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

print(model)

Sequential(
  (0): Linear(in_features=10, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=4, bias=True)
  (3): ReLU()
  (4): Linear(in_features=4, out_features=1, bias=True)
)


In [None]:
torch.manual_seed(42)

num_epochs = 200
log_epochs = num_epochs // 10

# Training loop
for epoch in range(num_epochs):
    loss_hist_train = 0

    for x_batch, y_batch in train_dl:
        pred = model(x_batch)[:, 0]

        loss = loss_fn(pred, y_batch)
        loss.backward()

        optimizer.step()
        optimizer.zero_grad()

        loss_hist_train += loss.item()
    
    if epoch % log_epochs == 0:
        print(f'Epoch {epoch} Loss: {loss_hist_train / len(train_dl):.4f}')

Epoch 0 Loss: 225.0421
Epoch 20 Loss: 9.4168
Epoch 40 Loss: 8.4044
Epoch 60 Loss: 8.1083
Epoch 80 Loss: 8.6854
Epoch 100 Loss: 8.5295
Epoch 120 Loss: 8.2369
Epoch 140 Loss: 8.5553
Epoch 160 Loss: 8.5962
Epoch 180 Loss: 7.8838


In [None]:
# Now that the training is complete, we can test the model's predictions and see
# how well we can predict the MPG of a car based on the inputs we are given
with torch.no_grad():
    pred = model(x_test.float())[:, 0]
    loss = loss_fn(pred, y_test)
    print(f'Test MSE: {loss.item():.4f}')
    print(f'Test MAE: {nn.L1Loss()(pred, y_test).item():.4f}')

Test MSE: 13.1225
Test MAE: 2.6589
