In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch.nn as nn


In [2]:
url = 'http://archive.ics.uci.edu/ml/machine-learning-databases/auto-mpg/auto-mpg.data'

In [3]:
columns_names= ['MPG', 'Cylinders','Displacement','Horsepower','Weight','Acceleration','Model Year', 'Origin']
df = pd.read_csv(url,names=columns_names, na_values='?', comment='\t', sep=' ', skipinitialspace=True)
df.to_csv("Auto MPG Car fuel dataset.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MPG           398 non-null    float64
 1   Cylinders     398 non-null    int64  
 2   Displacement  398 non-null    float64
 3   Horsepower    392 non-null    float64
 4   Weight        398 non-null    float64
 5   Acceleration  398 non-null    float64
 6   Model Year    398 non-null    int64  
 7   Origin        398 non-null    int64  
dtypes: float64(5), int64(3)
memory usage: 25.0 KB


In [4]:
# drop the NA rows
df = df.dropna()
df = df.reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392 entries, 0 to 391
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   MPG           392 non-null    float64
 1   Cylinders     392 non-null    int64  
 2   Displacement  392 non-null    float64
 3   Horsepower    392 non-null    float64
 4   Weight        392 non-null    float64
 5   Acceleration  392 non-null    float64
 6   Model Year    392 non-null    int64  
 7   Origin        392 non-null    int64  
dtypes: float64(5), int64(3)
memory usage: 24.6 KB


In [5]:
df.sample(15)

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
319,32.2,4,108.0,75.0,2265.0,15.2,80,3
159,16.0,6,250.0,105.0,3897.0,18.5,75,1
389,32.0,4,135.0,84.0,2295.0,11.6,82,1
228,15.5,8,350.0,170.0,4165.0,11.4,77,1
43,13.0,8,400.0,175.0,5140.0,12.0,71,1
211,13.0,8,350.0,145.0,4055.0,12.0,76,1
217,25.5,4,122.0,96.0,2300.0,15.5,77,1
138,14.0,8,304.0,150.0,4257.0,15.5,74,1
66,11.0,8,429.0,208.0,4633.0,11.0,72,1
156,16.0,8,318.0,150.0,4498.0,14.5,75,1


In [6]:
# train test split
from sklearn.model_selection import train_test_split
df_train, df_test = train_test_split(df, train_size=0.8, random_state=1)
train_stats = df_train.describe().transpose()
train_stats
    


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MPG,313.0,23.404153,7.666909,9.0,17.5,23.0,29.0,46.6
Cylinders,313.0,5.402556,1.701506,3.0,4.0,4.0,8.0,8.0
Displacement,313.0,189.51278,102.675646,68.0,104.0,140.0,260.0,455.0
Horsepower,313.0,102.929712,37.919046,46.0,75.0,92.0,120.0,230.0
Weight,313.0,2961.198083,848.602146,1613.0,2219.0,2755.0,3574.0,5140.0
Acceleration,313.0,15.704473,2.725399,8.5,14.0,15.5,17.3,24.8
Model Year,313.0,75.929712,3.675305,70.0,73.0,76.0,79.0,82.0
Origin,313.0,1.591054,0.807923,1.0,1.0,1.0,2.0,3.0


In [7]:
numeric_column_names = [
    'Cylinders', 'Displacement', 'Horsepower', 'Weight', 'Acceleration'
]
df_train_norm, df_test_norm = df_train.copy(), df_test.copy()

for col_name in numeric_column_names:
    mean = train_stats.loc[col_name, 'mean']
    std = train_stats.loc[col_name, 'std']
    df_train_norm.loc[:, col_name] = (df_train_norm.loc[:, col_name]-mean)/std
    df_test_norm.loc[:, col_name] = (df_test_norm.loc[:, col_name]-mean)/std

df_train_norm.tail()

  0.3511267  -0.8243028  -0.8243028  -0.8243028   0.3511267  -0.8243028
  0.3511267   1.52655621  1.52655621  1.52655621  0.3511267   1.52655621
 -0.8243028   0.3511267   1.52655621 -0.8243028  -0.8243028   0.3511267
 -0.8243028  -0.8243028  -0.8243028   0.3511267  -0.8243028   1.52655621
  0.3511267  -0.8243028   0.3511267  -0.8243028  -0.8243028   1.52655621
 -0.8243028   1.52655621  1.52655621 -0.8243028  -0.8243028  -0.8243028
 -0.8243028   0.3511267  -0.8243028   1.52655621 -0.8243028  -0.8243028
  1.52655621 -0.8243028  -0.8243028  -0.8243028   1.52655621  1.52655621
  0.3511267   0.3511267   1.52655621 -0.8243028  -0.8243028   1.52655621
  1.52655621 -0.8243028  -0.8243028   0.3511267   1.52655621 -0.8243028
  0.3511267  -0.8243028   1.52655621  1.52655621 -0.8243028  -0.8243028
 -1.41201755  1.52655621  0.3511267   1.52655621 -0.8243028  -0.8243028
 -0.8243028   1.52655621  1.52655621  0.3511267   0.3511267   1.52655621
 -0.8243028   1.52655621 -0.23658805 -0.8243028  -0.824302

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin
203,28.0,-0.824303,-0.90102,-0.736562,-0.950031,0.255202,76,3
255,19.4,0.351127,0.4138,-0.340982,0.29319,0.548737,78,1
72,13.0,1.526556,1.144256,0.713897,1.339617,-0.625403,72,1
235,30.5,-0.824303,-0.89128,-1.053025,-1.072585,0.475353,77,1
37,14.0,1.526556,1.563051,1.636916,1.47042,-1.35924,71,1


In [8]:
#  Bucketing: if year<73 bucket0 elif year>= 73 and <76 bucket1 elif year>=79 and <79 bucket2 elif bucket3
import torch
from torch import nn as nn
boundaries = torch.tensor([73, 76, 79])
v = torch.tensor(df_train_norm['Model Year'].values)
df_train_norm['Model Year Bucketed'] = torch.bucketize(v, boundaries=boundaries, right=True)
v = torch.tensor(df_test_norm['Model Year'].values)
df_test_norm['Model Year Bucketed'] = torch.bucketize(v, boundaries, right=True)
numeric_column_names.append('Model Year Bucketed')

In [9]:
df_test_norm.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin,Model Year Bucketed
81,23.0,-0.824303,-0.677013,-0.156378,-0.536409,-0.441944,72,3,0
165,29.0,-0.824303,-0.90102,-0.736562,-0.931176,0.108434,75,3,1
351,32.4,-0.824303,-0.793886,-0.736562,-0.720241,0.401969,81,3,3
119,19.0,-0.824303,-0.667274,0.239201,-0.109825,-0.075025,73,2,1
379,38.0,-0.824303,-0.959456,-0.947537,-1.138576,0.181818,82,3,3


In [10]:
df_train_norm.head()

Unnamed: 0,MPG,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model Year,Origin,Model Year Bucketed
334,27.2,-0.824303,-0.530922,-0.499214,-0.555264,-0.001641,81,1,3
258,18.6,0.351127,0.345625,0.186457,0.776338,1.099115,78,1,2
139,29.0,-0.824303,-0.89128,-0.525586,-0.874613,0.291894,74,2,1
310,37.2,-0.824303,-1.008153,-1.000281,-1.110294,0.255202,80,3,3
349,33.0,-0.824303,-0.823104,-0.762934,-0.908786,-0.552019,81,2,3


In [11]:
# Preprocessing with unordered categorical feature. There are two ways to work with categorical datasets
# 1, One hot encoding
# 2. Embedding

# Here, using one hot encoding
from torch.nn.functional import one_hot
total_origin = len(set(df_train_norm['Origin']))
origin_encoded = one_hot(torch.from_numpy(df_train_norm['Origin'].values)%total_origin)
x_train_numeric = torch.tensor(df_train_norm[numeric_column_names].values)
x_train = torch.cat([x_train_numeric, origin_encoded],1).float()
origin_encoded = one_hot(torch.from_numpy(df_test_norm['Origin'].values)%total_origin)
x_test_numeric = torch.tensor(df_test_norm[numeric_column_names].values)
x_test = torch.cat([x_test_numeric, origin_encoded], 1).float()



In [12]:
x_train[:5],x_test[:5] 

(tensor([[-8.2430e-01, -5.3092e-01, -4.9921e-01, -5.5526e-01, -1.6412e-03,
           3.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00],
         [ 3.5113e-01,  3.4562e-01,  1.8646e-01,  7.7634e-01,  1.0991e+00,
           2.0000e+00,  0.0000e+00,  1.0000e+00,  0.0000e+00],
         [-8.2430e-01, -8.9128e-01, -5.2559e-01, -8.7461e-01,  2.9189e-01,
           1.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00],
         [-8.2430e-01, -1.0082e+00, -1.0003e+00, -1.1103e+00,  2.5520e-01,
           3.0000e+00,  1.0000e+00,  0.0000e+00,  0.0000e+00],
         [-8.2430e-01, -8.2310e-01, -7.6293e-01, -9.0879e-01, -5.5202e-01,
           3.0000e+00,  0.0000e+00,  0.0000e+00,  1.0000e+00]]),
 tensor([[-0.8243, -0.6770, -0.1564, -0.5364, -0.4419,  0.0000,  1.0000,  0.0000,
           0.0000],
         [-0.8243, -0.9010, -0.7366, -0.9312,  0.1084,  1.0000,  1.0000,  0.0000,
           0.0000],
         [-0.8243, -0.7939, -0.7366, -0.7202,  0.4020,  3.0000,  1.0000,  0.0000,
           0.0000],
  

In [13]:
y_train = torch.tensor(df_train_norm['MPG'].values).float()
y_test = torch.tensor(df_test_norm['MPG'].values).float()

In [14]:
y_train[:15], y_test[:15]

(tensor([27.2000, 18.6000, 29.0000, 37.2000, 33.0000, 13.0000, 22.4000, 44.6000,
         27.4000, 31.0000, 24.0000, 28.0000, 17.6000, 15.0000, 11.0000]),
 tensor([23.0000, 29.0000, 32.4000, 19.0000, 38.0000, 33.5000, 26.0000, 14.0000,
         28.0000, 32.4000, 21.5000, 41.5000, 17.0000, 33.5000, 29.0000]))

### Training a DNN regression Model

In [15]:
from torch.utils.data import DataLoader, TensorDataset
train_ds = TensorDataset(x_train, y_train)
batch_size = 8
train_dl = DataLoader(train_ds, batch_size, shuffle=True)

In [16]:
# Model with 2 fully connected layers
hidden_units = [8,4]
input_size = x_train.shape[1]
all_layers = []
for hidden_unit in hidden_units:
    layer = nn.Linear(input_size, hidden_unit)
    all_layers.append(layer)
    all_layers.append(nn.ReLU())
    input_size = hidden_unit

all_layers.append(nn.Linear(hidden_units[-1], 1))

In [17]:
model = nn.Sequential(*all_layers)
model

Sequential(
  (0): Linear(in_features=9, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=4, bias=True)
  (3): ReLU()
  (4): Linear(in_features=4, out_features=1, bias=True)
)

In [18]:
# Defining MSE loss function
loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

num_epochs = 200
log_epochs=20
for epoch in range(num_epochs):
    loss_hist_train=0
    for x_batch, y_batch in train_dl:
        pred = model(x_batch)[:,0]
        loss = loss_fn(pred, y_batch)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        loss_hist_train += loss.item()
    if epoch % log_epochs == 0:
        print(f'Epoch {epoch} Loss ' f"{loss_hist_train/len(train_dl): .4f}")



Epoch 0 Loss  551.3687
Epoch 20 Loss  8.5176
Epoch 40 Loss  7.9879
Epoch 60 Loss  8.1616
Epoch 80 Loss  7.7914
Epoch 100 Loss  8.1319
Epoch 120 Loss  7.1195
Epoch 140 Loss  7.0347
Epoch 160 Loss  6.9103
Epoch 180 Loss  6.4259


In [19]:
with torch.no_grad():
    pred = model(x_test.float())[:,0]
    loss = loss_fn(pred,y_test)
    print(f'Test MSE: {loss.item():.4f}')
    print(f'Test MAE: {nn.L1Loss()(pred,y_test).item():.4f}')

Test MSE: 9.1150
Test MAE: 1.9561
