## Import all the required libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import sklearn
import seaborn as sns

import torch
from torch.autograd import Variable
import torch.utils.data as Data
import torch.nn as nn

## Explore and prepare the dataset

In [None]:
# Load the raw autos MPG table; the path is relative to the current working directory.
df = pd.read_csv("autos mpg.csv")

In [None]:
# Column dtypes and non-null counts of the freshly loaded frame.
df.info()

In [None]:
# Preview the first rows (head() returns five rows by default).
df.head()

In [None]:
# Python type of the HP value stored under row label 0.
type(df.loc[0, 'HP'])

**The HP column is stored as strings (`str`), so we need to convert it to a numeric (`int` or `float`) format.**

In [None]:
# errors='coerce' turns entries that cannot be parsed as numbers into NaN
# instead of raising, so they can be dropped in a later step.
df['HP'] = pd.to_numeric(df['HP'], errors = 'coerce')

In [None]:
# Re-check dtypes and non-null counts after the numeric conversion of HP.
df.info()

**The HP column has 6 missing values, so we drop the rows that contain them. The other columns are complete and have no null values.**

In [None]:
# Drop every row that has at least one missing value
# (dropna defaults: axis=0, how="any").
df = df.dropna()

In [None]:
# Confirm that no null values remain after dropping the incomplete rows.
df.info()

**MPG is our target value (the label): the number that we want to predict with our neural network. The NAME column contains the name of every car as a string. We will therefore separate the MPG column into its own Series, and explore the NAME column to see how many unique values it has and whether it can be used as a feature.**

In [None]:
# Target vector: the MPG column, kept separately from the features.
y = df.loc[:, 'MPG']
y.head()

In [None]:
# Descriptive statistics of the target: range first, then centre and spread.
mpg_min, mpg_max = y.min(), y.max()
mpg_avg, mpg_std = y.mean(), y.std()

In [None]:
# One-column summary table of the MPG statistics, indexed by statistic name.
stat_labels = ['Min', 'Max', 'Avg', 'Std']
stat_values = [mpg_min, mpg_max, mpg_avg, mpg_std]
summary = {'Statistics of MPG': stat_values}
stat_mpg = pd.DataFrame(summary, index=stat_labels)
stat_mpg

In [None]:
# Bar chart of the four MPG summary statistics.
stat_mpg.plot.bar(color='goldenrod')

In [None]:
# One standard deviation below and above the mean MPG.
lower_bound = mpg_avg - mpg_std
upper_bound = mpg_avg + mpg_std
print(lower_bound, ' - ', mpg_avg, ' + ', upper_bound)

**This is important information to take into consideration because miles per gallon (MPG) is our target value. The average MPG of all the cars in the dataset is 23.45, with a standard deviation of 7.81. In practice, a typical car achieves between 15.64 and 31.25 miles per gallon (one standard deviation around the mean). Data points far outside this range can be considered outliers.**

In [None]:
# Frequency of each distinct car name, to judge whether NAME is usable as a feature.
df['NAME'].value_counts()

In [None]:
# Remove the free-text NAME column from the frame.
df = df.drop(columns='NAME')

**There are 301 unique categorical values in the NAME column, which is 77% of the 392 records. We therefore drop it and do not use it as a feature to train our neural network. Let's look at the correlation between the other variables and the MPG column so we can select the best features!**

In [None]:
# Pairwise correlations of the remaining columns, drawn as an annotated heatmap.
corr_matrix = df.corr()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')

**We can see that each of the seven features (cylinders, displacement, HP, weight, acceleration, year, and origin) has a high level of correlation with the MPG variable. As a result, we will use these seven features to train our model.**

In [None]:
# Remove the target column so that df holds only the input features.
df = df.drop(columns='MPG')

**Let's use min-max normalization to normalize all seven features, transforming their values onto a scale between 0 and 1.**

In [None]:
def normalize(dataset):
    """Min-max scale `dataset` to the [0, 1] range.

    For a DataFrame the minimum and maximum are taken per column, so each
    column is scaled independently. For a Series a single minimum and
    maximum are used.

    A constant column (max == min) has a zero range. Dividing by it would
    turn the whole column into NaN, so such columns are mapped to 0 instead.

    Parameters
    ----------
    dataset : pandas.DataFrame or pandas.Series
        Numeric data to scale.

    Returns
    -------
    Same type as `dataset`, with values computed as
    (x - min) / (max - min).
    """
    lowest = dataset.min()
    span = dataset.max() - lowest
    if isinstance(span, pd.Series):
        # Per-column ranges: swap zero ranges for 1 so (x - min) / 1 == 0.
        span = span.where(span != 0, 1)
    elif span == 0:
        span = 1
    data_normalized = (dataset - lowest) / span
    return data_normalized

In [None]:
# Min-max scale every column left in df (NAME and MPG were dropped earlier).
# NOTE(review): the min/max are computed over all rows, before the train/test
# split further down, so test rows influence the scaling of the training data.
features = normalize(df)
features

## Pandas series to Numpy arrays

In [None]:
# Feature matrix as a NumPy array, with the column order fixed explicitly.
feature_columns = [
    'CYLINDERS',
    'DISPLACEMENT',
    'HP',
    'WEIGHT',
    'ACCELERATION',
    'YEAR',
    'ORIGIN',
]
numpy_X = features[feature_columns].to_numpy()
numpy_X[:5]

In [None]:
# Targets as a NumPy array; show the first five values as a sanity check.
numpy_y = y.to_numpy()
numpy_y[:5]

## Create a train (80%) and a test (20%) set 

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
train_X, test_X, train_y, test_y = train_test_split(
    numpy_X,
    numpy_y,
    test_size=0.2,
    random_state=0,
)

## Numpy arrays to PyTorch tensors (training set)

In [None]:
# Training split converted to float32 tensors.
tensor_X = torch.from_numpy(train_X).to(torch.float32)
tensor_y = torch.from_numpy(train_y).to(torch.float32)
print(tensor_X.shape, tensor_y.shape)

In [None]:
# Give the targets an explicit column dimension, (N,) -> (N, 1), so they match
# the model output shape. reshape(-1, 1) is used instead of unsqueeze(1) so the
# cell is idempotent: re-running it keeps (N, 1) rather than adding yet another
# dimension.
tensor_y = tensor_y.reshape(-1, 1)
print(tensor_X.shape, tensor_y.shape)

In [None]:
# Pair each training feature row with its target so they are batched together.
torch_dataset = Data.TensorDataset(tensor_X, tensor_y)

## Define a dataloader to load it in batches

In [None]:
batch = 45 # mini-batch size. Assuming 313 training rows, 313 / 45 rounds up to
           # 7 iterations per epoch: six full batches and one final batch of 43.

In [None]:
# Mini-batch iterator over the training set: reshuffled every epoch and loaded
# by two worker subprocesses.
loader = Data.DataLoader(
    torch_dataset,
    batch_size=batch,
    shuffle=True,
    num_workers=2,
)

## Define a nn, optimizer, and loss function

In [None]:
# Fully connected regressor: 7 inputs -> three hidden layers of 10 units with
# ReLU -> 1 output.
layer_sizes = [7, 10, 10, 10, 1]
layers = []
for n_in, n_out in zip(layer_sizes, layer_sizes[1:]):
    layers.append(nn.Linear(n_in, n_out))
    layers.append(nn.ReLU())
# Drop the trailing ReLU: the output layer is purely linear.
model = nn.Sequential(*layers[:-1])

In [None]:
# Mean squared error as the base criterion; Adam updates all model parameters.
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=5e-3)

## Train our Artificial Neural Network

In [None]:
# Train for 100 epochs, optimising the RMSE of each mini-batch.
loss_lst = []  # RMSE of every mini-batch, in training order
for epoch in range(1, 101):
    print("Epoch", epoch)
    # Number the batches with enumerate(start=1) instead of reusing `batch`:
    # that name holds the DataLoader batch size and must not be overwritten,
    # otherwise re-running the loader cell would silently use the wrong size.
    for batch_number, (batch_x, batch_y) in enumerate(loader, start=1):
        # Tensors are passed to the model directly; the autograd.Variable
        # wrapper is deprecated and no longer needed.
        prediction = model(batch_x)
        rmse_loss = torch.sqrt(loss_function(prediction, batch_y))

        optimizer.zero_grad()
        rmse_loss.backward()
        optimizer.step()

        # .item() extracts a plain Python float, so the log shows the number
        # rather than a tensor repr with its grad_fn.
        batch_rmse = rmse_loss.item()
        loss_lst.append(batch_rmse)
        print("Batch: ", batch_number, ", loss: ", batch_rmse)
    print("Result of the last epoch's batch: ", batch_rmse)
    print(' ')

## Plot the loss per batch during training

In [None]:
# loss_lst holds one RMSE value per mini-batch (appended inside the inner
# training loop), so the x-axis counts batches across all epochs, not epochs.
plt.figure(figsize=(18,3))
plt.plot(loss_lst, c='red')
plt.xlabel("Training batch (all epochs, in order)")
plt.ylabel("Batch RMSE")
plt.title("Training loss per mini-batch")
plt.show()

## Test our Artificial Neural Network

**Convert Numpy arrays to Pytorch tensors for testing set.**

In [None]:
# Test split converted to float32 tensors.
tensor_X_test = torch.from_numpy(test_X).to(torch.float32)
tensor_y_test = torch.from_numpy(test_y).to(torch.float32)
print(tensor_X_test.shape, tensor_y_test.shape)

# Add a column dimension to the targets so they line up with the model output.
tensor_y_test = tensor_y_test.unsqueeze(dim=1)
print(tensor_X_test.shape, tensor_y_test.shape)

In [None]:
# torch.autograd.Variable is deprecated: tensors support autograd themselves,
# so the test tensors are used as they are. The var_* names are kept because
# the following cells refer to them.
var_X_test = tensor_X_test
var_y_test = tensor_y_test

**Make predictions on the testing set and calculate the RMSE.**

In [None]:
# Inference only: no_grad() skips building the autograd graph, which saves
# memory and yields plain tensors without a grad_fn.
with torch.no_grad():
    predictions_test = model(var_X_test)

In [None]:
# Test-set RMSE: square root of the mean squared error over all test rows.
test_mse = loss_function(predictions_test, var_y_test)
rmse_loss = test_mse.sqrt()

In [None]:
# Display the test-set RMSE computed in the previous cell.
rmse_loss

**The Root Mean Square Error of the testing set is 2.8, and since it is similar to the RMSE of the training set, we can conclude that there is no overfitting, perfect! An RMSE of 2.8 means that our model's predictions are typically off by about ± 2.8 miles per gallon. This is considered a satisfying result because 2.8 is about 12% of the average MPG of our complete dataset and because 2.8 is lower than the MPG standard deviation of our whole dataset.**

## Plot the predictions against target values

In [None]:
# Nested Python list: the model's last layer has one output, so each row is a
# one-element list.
y_pred = predictions_test.tolist()

In [None]:
# Targets as a nested list too (the test targets were unsqueezed to one column).
y_real = var_y_test.tolist()

In [None]:
# Overlay predicted and true MPG for every test row, in test-set order.
plt.figure(figsize=(18, 3))
for values, colour, name in ((y_pred, 'blue', "predictions"), (y_real, 'red', "target")):
    plt.plot(values, c=colour, label=name)
plt.legend(loc="upper right")
plt.show()

## Predict JUST one data point with our model

In [None]:
# First row of the test features, used as a single input example.
data_x = var_X_test[0]
data_x

In [None]:
# Inference only: no_grad() avoids tracking gradients for this single
# prediction, so the result is a plain tensor without a grad_fn.
with torch.no_grad():
    prediction = model(data_x)

In [None]:
# Compare the true MPG of the first test row with the model's output for it.
print("Target value: ", var_y_test[0])
print("Prediction: ", prediction)

**When predicting a single data point, the first one of the testing set, it can be seen that the target (real) value is 28 MPG and that the prediction is 26.7. The prediction is really close: it has an absolute error of 1.3 MPG, or a percentage error of 4.6%.**