In [1]:
!pip install ucimlrepo
import pandas as pd
from ucimlrepo import fetch_ucirepo
auto_mpg = fetch_ucirepo(name="Auto MPG")
#url = "https://archive.ics.uci.edu/ml/machine-learning-datasets/auto-mpg.data"
column_names = ['Displacement','Cylinders','Horsepower','Weight','Acceleration','Model Year', 'Origin']

X = auto_mpg.data.features
y = auto_mpg.data.targets

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.3-py3-none-any.whl (7.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.3


In [2]:
print(X.shape)
print(y.shape)
print(y.head())
print(X.head())

(398, 7)
(398, 1)
    mpg
0  18.0
1  15.0
2  18.0
3  16.0
4  17.0
   displacement  cylinders  horsepower  weight  acceleration  model_year  \
0         307.0          8       130.0    3504          12.0          70   
1         350.0          8       165.0    3693          11.5          70   
2         318.0          8       150.0    3436          11.0          70   
3         304.0          8       150.0    3433          12.0          70   
4         302.0          8       140.0    3449          10.5          70   

   origin  
0       1  
1       1  
2       1  
3       1  
4       1  


In [3]:
df = pd.concat([X,y], axis=1)
print(df)

     displacement  cylinders  horsepower  weight  acceleration  model_year  \
0           307.0          8       130.0    3504          12.0          70   
1           350.0          8       165.0    3693          11.5          70   
2           318.0          8       150.0    3436          11.0          70   
3           304.0          8       150.0    3433          12.0          70   
4           302.0          8       140.0    3449          10.5          70   
..            ...        ...         ...     ...           ...         ...   
393         140.0          4        86.0    2790          15.6          82   
394          97.0          4        52.0    2130          24.6          82   
395         135.0          4        84.0    2295          11.6          82   
396         120.0          4        79.0    2625          18.6          82   
397         119.0          4        82.0    2720          19.4          82   

     origin   mpg  
0         1  18.0  
1         1  15.0  
2  

In [4]:
df_copy = df.copy()

In [5]:
df_copy.isna().sum()

displacement    0
cylinders       0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
mpg             0
dtype: int64

In [6]:
df_copy = df_copy.dropna()
df_copy  = df_copy.reset_index(drop=True)

#train/test splits
import sklearn
import sklearn.model_selection
df_train , df_test = sklearn.model_selection.train_test_split(df_copy,train_size=0.8,random_state=1)
train_stats = df_train.describe().transpose()
numeric_columns_names = ['cylinders',"displacement","horsepower","weight","acceleration"]
df_train_norm, df_test_norm = df_train.copy(), df_test.copy()

print(train_stats)

              count         mean         std     min     25%     50%     75%  \
displacement  313.0   189.512780  102.675646    68.0   104.0   140.0   260.0   
cylinders     313.0     5.402556    1.701506     3.0     4.0     4.0     8.0   
horsepower    313.0   102.929712   37.919046    46.0    75.0    92.0   120.0   
weight        313.0  2961.198083  848.602146  1613.0  2219.0  2755.0  3574.0   
acceleration  313.0    15.704473    2.725399     8.5    14.0    15.5    17.3   
model_year    313.0    75.929712    3.675305    70.0    73.0    76.0    79.0   
origin        313.0     1.591054    0.807923     1.0     1.0     1.0     2.0   
mpg           313.0    23.404153    7.666909     9.0    17.5    23.0    29.0   

                 max  
displacement   455.0  
cylinders        8.0  
horsepower     230.0  
weight        5140.0  
acceleration    24.8  
model_year      82.0  
origin           3.0  
mpg             46.6  


In [7]:
for col_name in numeric_columns_names:
  mean = train_stats.loc[col_name, 'mean']
  std = train_stats.loc[col_name, 'std']
  df_train_norm.loc[:,col_name] = (df_train_norm.loc[:,col_name]- mean)/ std
  df_test_norm.loc[:,col_name] = (df_test_norm.loc[:,col_name] - mean)/std
df_train_norm.tail()

Unnamed: 0,displacement,cylinders,horsepower,weight,acceleration,model_year,origin,mpg
203,-0.90102,-0.824303,-0.736562,-0.950031,0.255202,76,3,28.0
255,0.4138,0.351127,-0.340982,0.29319,0.548737,78,1,19.4
72,1.144256,1.526556,0.713897,1.339617,-0.625403,72,1,13.0
235,-0.89128,-0.824303,-1.053025,-1.072585,0.475353,77,1,30.5
37,1.563051,1.526556,1.636916,1.47042,-1.35924,71,1,14.0


In [14]:
import torch
import torch.nn as nn

In [9]:
boundaries = torch.tensor([73,76,79])
v = torch.tensor(df_train_norm['model_year'].values)
df_train_norm['model_year_bucketed'] = torch.bucketize(v,boundaries,right=True)
v = torch.tensor(df_test_norm['model_year'].values)
df_test_norm['model_year_bucketed'] = torch.bucketize(v,boundaries,right=True)
numeric_columns_names.append('model_year_bucketed')

In [10]:
df_train_norm.head()

Unnamed: 0,displacement,cylinders,horsepower,weight,acceleration,model_year,origin,mpg,model_year_bucketed
334,-0.530922,-0.824303,-0.499214,-0.555264,-0.001641,81,1,27.2,3
258,0.345625,0.351127,0.186457,0.776338,1.099115,78,1,18.6,2
139,-0.89128,-0.824303,-0.525586,-0.874613,0.291894,74,2,29.0,1
310,-1.008153,-0.824303,-1.000281,-1.110294,0.255202,80,3,37.2,3
349,-0.823104,-0.824303,-0.762934,-0.908786,-0.552019,81,2,33.0,3


In [11]:
from torch.nn.functional import one_hot
total_origin = len(set(df_train_norm['origin']))
origin_encoded = one_hot(torch.from_numpy(df_train_norm['origin'].values) % total_origin)
x_train_numeric = torch.tensor(df_train_norm[numeric_columns_names].values)
x_train = torch.cat([x_train_numeric,origin_encoded], 1).float()
origin_encoded = one_hot(torch.from_numpy(df_test_norm['origin'].values) % total_origin)
x_test_numeric = torch.tensor(df_test_norm[numeric_columns_names].values)
x_test = torch.cat([x_test_numeric,origin_encoded],1).float()

y_train = torch.tensor(df_train_norm['mpg'].values).float()
y_test = torch.tensor(df_test_norm['mpg'].values).float()




In [12]:
from torch.utils.data import TensorDataset, DataLoader
train_ds = TensorDataset(x_train,y_train)
batch_size = 8
torch.manual_seed(1)
train_dl = DataLoader(train_ds,batch_size,shuffle=True)

In [16]:
hidden_units=[8,4]
input_size = x_train.shape[1]
all_layers = []

for hidden_unit in hidden_units:
  layer = nn.Linear(input_size,hidden_unit)
  all_layers.append(layer)
  all_layers.append(nn.ReLU())
  input_size = hidden_unit
all_layers.append(nn.Linear(hidden_units[-1],1))
model = nn.Sequential(*all_layers)
model

Sequential(
  (0): Linear(in_features=9, out_features=8, bias=True)
  (1): ReLU()
  (2): Linear(in_features=8, out_features=4, bias=True)
  (3): ReLU()
  (4): Linear(in_features=4, out_features=1, bias=True)
)

In [17]:
loss_fn = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(),lr=0.001)

In [20]:
torch.manual_seed(1)
num_epochs = 200
log_epochs = 20

for epoch in range(num_epochs):
  loss_hist_train = 0
  for x_batch, y_batch in train_dl:
    pred= model(x_batch)[:,0]
    loss = loss_fn(pred,y_batch)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    loss_hist_train += loss.item()
  if epoch % log_epochs == 0 :
    print(f'Epoch {epoch} Loss: {loss_hist_train/len(train_dl):.4f}')

Epoch 0 Loss: 27.8739
Epoch 20 Loss: 8.3590
Epoch 40 Loss: 7.9827
Epoch 60 Loss: 7.7648
Epoch 80 Loss: 7.1286
Epoch 100 Loss: 7.0069
Epoch 120 Loss: 6.5584
Epoch 140 Loss: 7.0826
Epoch 160 Loss: 7.2687
Epoch 180 Loss: 6.2733


In [21]:
with torch.no_grad():
  pred = model(x_test.float())[:,0]
  loss = loss_fn(pred,y_test)
  print(f'Test MSE: {loss.item():.4f}')
  print(f'Test MAE: {nn.L1Loss()(pred,y_test).item():.4f}')

Test MSE: 9.6432
Test MAE: 2.1133
