# Predicting Miles Per Gallon (MPG) using Neural Network with TensorFlow/Keras

In [1]:
from ucimlrepo import fetch_ucirepo 
  
# fetch dataset 
auto_mpg = fetch_ucirepo(id=9) 
  
# data (as pandas dataframes) 
X = auto_mpg.data.features 
y = auto_mpg.data.targets 
  
# metadata 
print(auto_mpg.metadata) 
  
# variable information 
print(auto_mpg.variables) 


{'uci_id': 9, 'name': 'Auto MPG', 'repository_url': 'https://archive.ics.uci.edu/dataset/9/auto+mpg', 'data_url': 'https://archive.ics.uci.edu/static/public/9/data.csv', 'abstract': 'Revised from CMU StatLib library, data concerns city-cycle fuel consumption', 'area': 'Other', 'tasks': ['Regression'], 'characteristics': ['Multivariate'], 'num_instances': 398, 'num_features': 7, 'feature_types': ['Real', 'Categorical', 'Integer'], 'demographics': [], 'target_col': ['mpg'], 'index_col': ['car_name'], 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1993, 'last_updated': 'Thu Aug 10 2023', 'dataset_doi': '10.24432/C5859H', 'creators': ['R. Quinlan'], 'intro_paper': None, 'additional_info': {'summary': 'This dataset is a slightly modified version of the dataset provided in the StatLib library.  In line with the use by Ross Quinlan (1993) in predicting the attribute "mpg", 8 of the original instances were removed because they had unknown values for th

In [2]:
auto_mpg.keys()

dict_keys(['data', 'metadata', 'variables'])

In [3]:
auto_mpg.data

{'ids':                       car_name
 0    chevrolet,chevelle,malibu
 1            buick,skylark,320
 2           plymouth,satellite
 3                amc,rebel,sst
 4                  ford,torino
 ..                         ...
 393            ford,mustang,gl
 394                  vw,pickup
 395              dodge,rampage
 396                ford,ranger
 397                 chevy,s-10
 
 [398 rows x 1 columns],
 'features':      displacement  cylinders  horsepower  weight  acceleration  model_year  \
 0           307.0          8       130.0    3504          12.0          70   
 1           350.0          8       165.0    3693          11.5          70   
 2           318.0          8       150.0    3436          11.0          70   
 3           304.0          8       150.0    3433          12.0          70   
 4           302.0          8       140.0    3449          10.5          70   
 ..            ...        ...         ...     ...           ...         ...   
 393         140.

In [4]:
# Create a DataFrame from X and y
import pandas as pd

df = pd.DataFrame(X, columns=auto_mpg.data.headers)
df['mpg'] = y
df['car_name'] = auto_mpg.data.ids

In [5]:
df.head()

Unnamed: 0,car_name,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,mpg
0,"chevrolet,chevelle,malibu",8,307.0,130.0,3504,12.0,70,1,18.0
1,"buick,skylark,320",8,350.0,165.0,3693,11.5,70,1,15.0
2,"plymouth,satellite",8,318.0,150.0,3436,11.0,70,1,18.0
3,"amc,rebel,sst",8,304.0,150.0,3433,12.0,70,1,16.0
4,"ford,torino",8,302.0,140.0,3449,10.5,70,1,17.0


In [6]:
# Check for missing values
df.isnull().sum()

car_name        0
cylinders       0
displacement    0
horsepower      6
weight          0
acceleration    0
model_year      0
origin          0
mpg             0
dtype: int64

In [7]:
# Drop rows with missing values
df = df.dropna()

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 392 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   car_name      392 non-null    object 
 1   cylinders     392 non-null    int64  
 2   displacement  392 non-null    float64
 3   horsepower    392 non-null    float64
 4   weight        392 non-null    int64  
 5   acceleration  392 non-null    float64
 6   model_year    392 non-null    int64  
 7   origin        392 non-null    int64  
 8   mpg           392 non-null    float64
dtypes: float64(4), int64(4), object(1)
memory usage: 30.6+ KB


In [9]:
df.describe()

Unnamed: 0,cylinders,displacement,horsepower,weight,acceleration,model_year,origin,mpg
count,392.0,392.0,392.0,392.0,392.0,392.0,392.0,392.0
mean,5.471939,194.41199,104.469388,2977.584184,15.541327,75.979592,1.576531,23.445918
std,1.705783,104.644004,38.49116,849.40256,2.758864,3.683737,0.805518,7.805007
min,3.0,68.0,46.0,1613.0,8.0,70.0,1.0,9.0
25%,4.0,105.0,75.0,2225.25,13.775,73.0,1.0,17.0
50%,4.0,151.0,93.5,2803.5,15.5,76.0,1.0,22.75
75%,8.0,275.75,126.0,3614.75,17.025,79.0,2.0,29.0
max,8.0,455.0,230.0,5140.0,24.8,82.0,3.0,46.6


In [10]:
# Drop car name since it is not a feature or target
df = df.drop('car_name', axis=1)

In [11]:
# Split the data into training, validation and testing sets
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [12]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [13]:
import tensorflow as tf

model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(1)
])

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [14]:
model.summary()

In [None]:
model