In [31]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torch.utils.data import Dataset, random_split
from tqdm.notebook import tqdm #Pretty progress bar
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler #For normalization

# 1. Load and explore the data

In [24]:
# Fetch the "mpg" example dataset through seaborn, then print a quick
# overview: its dimensions, its column labels, and the leading rows.
df = sns.load_dataset("mpg")
overview = (
    ("Shape:", df.shape),
    ("Column values:", df.columns),
    ("First 5 rows:", df.head()),  # head() returns the first 5 rows by default
)
for label, value in overview:
    print(label, value)

Shape: (398, 9)
Column values: Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model_year', 'origin', 'name'],
      dtype='object')
First 5 rows:     mpg  cylinders  displacement  horsepower  weight  acceleration  \
0  18.0          8         307.0       130.0    3504          12.0   
1  15.0          8         350.0       165.0    3693          11.5   
2  18.0          8         318.0       150.0    3436          11.0   
3  16.0          8         304.0       150.0    3433          12.0   
4  17.0          8         302.0       140.0    3449          10.5   

   model_year origin                       name  
0          70    usa  chevrolet chevelle malibu  
1          70    usa          buick skylark 320  
2          70    usa         plymouth satellite  
3          70    usa              amc rebel sst  
4          70    usa                ford torino  


# 2. Preprocess the Data
* Handle missing values if there are any.
* Normalize or standardize numerical features if needed.
* Encode categorical variables (if any exist).
* Split the data into training and testing sets (e.g., 80/20 split).

In [27]:
#Handle missing values
#True when at least one cell anywhere in the frame is missing
has_missing = df.isna().to_numpy().any()

if has_missing:
    #Replace the gaps in 'horsepower' with the mean of that column
    horsepower_mean = df['horsepower'].mean()
    df['horsepower'] = df['horsepower'].fillna(horsepower_mean)

In [49]:
#Normalize the data
scaler = MinMaxScaler()

#Collect only the numeric columns; the remaining columns are left out of the
#scaled frame built below.
to_normalize = [
    column for column in df.columns
    if pd.api.types.is_numeric_dtype(df[column])
]

#Scale the numeric columns. fit_transform returns a NumPy array, so it is kept
#under its own name (df_norm) instead of overwriting df part-way through the
#cell -- the next statement needs df_norm to exist on a fresh kernel.
#NOTE(review): the scaler is fitted on every row, before the train/test split
#further down -- confirm that this is intended.
df_norm = scaler.fit_transform(df[to_normalize])

#Convert back to df from np; df now holds only the columns in to_normalize
df = pd.DataFrame(df_norm, columns=to_normalize)
print(df.head(5))

        mpg  cylinders  displacement  horsepower    weight  acceleration  \
0  0.239362        1.0      0.617571    0.456522  0.536150      0.238095   
1  0.159574        1.0      0.728682    0.646739  0.589736      0.208333   
2  0.239362        1.0      0.645995    0.565217  0.516870      0.178571   
3  0.186170        1.0      0.609819    0.565217  0.516019      0.238095   
4  0.212766        1.0      0.604651    0.510870  0.520556      0.148810   

   model_year  
0         0.0  
1         0.0  
2         0.0  
3         0.0  
4         0.0  


In [55]:
#Split the data 80/20
SPLIT_SEED = 42  #Fixed seed so the same rows land in each split on every run

#random_split hands out integer positions, and a DataFrame treats df[int] as a
#column lookup, so indexing the resulting subsets would raise KeyError. Copy the
#rows into a float32 tensor, which supports positional row indexing.
#This requires every column of df to be numeric at this point.
df_tensor = torch.as_tensor(np.asarray(df, dtype=np.float32))

split80 = int(len(df_tensor) * 0.8)
train_data, test_data = random_split(
    df_tensor,
    [split80, len(df_tensor) - split80],
    #Seeded generator makes the shuffle reproducible across kernel restarts
    generator=torch.Generator().manual_seed(SPLIT_SEED),
)