## Import necessary dependencies

In [29]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Activation
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
import pandas as pd
import io
import os
import requests
import numpy as np
from sklearn import metrics

## Read the dataset

In [5]:
df = pd.read_csv('auto-mpg.csv', na_values=['NA', '?'])# Strings NA and ? will be recognised as NA

## Have some insights about the dataset

In [9]:
df.shape

(398, 9)

In [6]:
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,year,origin,name
0,18.0,8,307.0,130.0,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449,10.5,70,1,ford torino


In [7]:
df.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
year            0
origin          0
name            0
dtype: int64

horsepower has 6 missing values. Let see what we can impute it with

In [8]:
df['horsepower'].describe()

count    392.000000
mean     104.469388
std       38.491160
min       46.000000
25%       75.000000
50%       93.500000
75%      126.000000
max      230.000000
Name: horsepower, dtype: float64

The median and average for horsepower are very close. Lets impute with median

In [10]:
df['horsepower'] = df['horsepower'].fillna(df['horsepower'].median())

In [11]:
df.isnull().sum()

mpg             0
cylinders       0
displacement    0
horsepower      0
weight          0
acceleration    0
year            0
origin          0
name            0
dtype: int64

Now we have taken care of all missing values

## Seperate X and y

In [12]:
X = df[['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'year', 'origin']].values
y = df['mpg'].values

## Train test split
train ~ 0.75, test ~ 0.25

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=100)

## Define the model

In [24]:
model = Sequential()
model.add(Dense(25, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(10, activation='relu'))
model.add(Dense(1))

## Compile the model
Tell the model how it should learn

In [25]:
model.compile(loss='mean_squared_error', optimizer='adam')

## Create call backs

In [26]:
monitor = EarlyStopping(monitor='val_loss', min_delta=1e-3, patience=5, verbose=1, mode='auto', restore_best_weights=True)

## Train the model

In [27]:
model.fit(X_train, y_train, validation_data=(X_test, y_test), callbacks=[monitor], verbose=2, epochs=1000)

Train on 298 samples, validate on 100 samples
Epoch 1/1000
298/298 - 1s - loss: 37123.6468 - val_loss: 10558.8647
Epoch 2/1000
298/298 - 0s - loss: 2918.1732 - val_loss: 473.9703
Epoch 3/1000
298/298 - 0s - loss: 1640.6323 - val_loss: 2216.2693
Epoch 4/1000
298/298 - 0s - loss: 1200.6486 - val_loss: 185.9029
Epoch 5/1000
298/298 - 0s - loss: 125.8582 - val_loss: 280.9105
Epoch 6/1000
298/298 - 0s - loss: 233.6279 - val_loss: 147.8307
Epoch 7/1000
298/298 - 0s - loss: 85.6960 - val_loss: 88.8621
Epoch 8/1000
298/298 - 0s - loss: 91.2784 - val_loss: 90.4729
Epoch 9/1000
298/298 - 0s - loss: 76.4589 - val_loss: 82.9003
Epoch 10/1000
298/298 - 0s - loss: 73.6453 - val_loss: 82.3471
Epoch 11/1000
298/298 - 0s - loss: 72.0985 - val_loss: 79.4452
Epoch 12/1000
298/298 - 0s - loss: 70.4748 - val_loss: 78.7086
Epoch 13/1000
298/298 - 0s - loss: 70.0037 - val_loss: 77.9744
Epoch 14/1000
298/298 - 0s - loss: 69.3092 - val_loss: 78.3367
Epoch 15/1000
298/298 - 0s - loss: 69.2428 - val_loss: 76.683

Epoch 130/1000
298/298 - 0s - loss: 21.6884 - val_loss: 22.8155
Epoch 131/1000
298/298 - 0s - loss: 21.5833 - val_loss: 22.7211
Epoch 132/1000
298/298 - 0s - loss: 21.4141 - val_loss: 23.5733
Epoch 133/1000
298/298 - 0s - loss: 21.8298 - val_loss: 22.1631
Epoch 134/1000
298/298 - 0s - loss: 21.1146 - val_loss: 22.1043
Epoch 135/1000
298/298 - 0s - loss: 20.9721 - val_loss: 21.8802
Epoch 136/1000
298/298 - 0s - loss: 20.8474 - val_loss: 21.7793
Epoch 137/1000
298/298 - 0s - loss: 21.1277 - val_loss: 21.9891
Epoch 138/1000
298/298 - 0s - loss: 20.9117 - val_loss: 21.5339
Epoch 139/1000
298/298 - 0s - loss: 20.7547 - val_loss: 21.4954
Epoch 140/1000
298/298 - 0s - loss: 20.4865 - val_loss: 21.3302
Epoch 141/1000
298/298 - 0s - loss: 20.5589 - val_loss: 21.5921
Epoch 142/1000
298/298 - 0s - loss: 20.4301 - val_loss: 21.0840
Epoch 143/1000
298/298 - 0s - loss: 20.2338 - val_loss: 21.0991
Epoch 144/1000
298/298 - 0s - loss: 19.9895 - val_loss: 20.9827
Epoch 145/1000
298/298 - 0s - loss: 20.0

<tensorflow.python.keras.callbacks.History at 0x7fa6b0467450>

## Validate on the test set

In [30]:
prediction = model.predict(X_test)
score = np.sqrt(metrics.mean_squared_error(prediction, y_test))
print(f'After training the test score is: {score}')

After training the test score is: 4.038248609429321
