In [32]:
import numpy as np
import pandas as pd
import os

In [33]:
dataset_name = 'auto_mpg'

In [34]:
input_dir = './data'
inp_fname = 'auto-mpg.data'
output_dir = f'./../../processed/{dataset_name}/'
outp_fname = os.path.join(output_dir, f'{dataset_name}.csv')

# Read Data

In [35]:
cols = [
    "mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration" , "model year", "origin", "car name"
]

In [36]:
data = pd.read_csv(os.path.join(input_dir, inp_fname), header=None, names=cols, sep="\s+")
print(data.shape)
data.head()

(398, 9)


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [37]:
print(data.shape)
data.head()

(398, 9)


Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin,car name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [38]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   mpg           398 non-null    float64
 1   cylinders     398 non-null    int64  
 2   displacement  398 non-null    float64
 3   horsepower    398 non-null    object 
 4   weight        398 non-null    float64
 5   acceleration  398 non-null    float64
 6   model year    398 non-null    int64  
 7   origin        398 non-null    int64  
 8   car name      398 non-null    object 
dtypes: float64(4), int64(3), object(2)
memory usage: 28.1+ KB


In [39]:
id_col = "id"
target_col = "mpg"

# Insert Id Column

In [40]:
# insert Id column 
if id_col not in data.columns:
    N = data.shape[0]
    data.insert(0, id_col, np.arange(N))
    print(data.head())
data[id_col] = data[id_col].astype(str)

   id   mpg  cylinders  displacement horsepower  weight  acceleration  \
0   0  18.0          8         307.0      130.0  3504.0          12.0   
1   1  15.0          8         350.0      165.0  3693.0          11.5   
2   2  18.0          8         318.0      150.0  3436.0          11.0   
3   3  16.0          8         304.0      150.0  3433.0          12.0   
4   4  17.0          8         302.0      140.0  3449.0          10.5   

   model year  origin                   car name  
0          70       1  chevrolet chevelle malibu  
1          70       1          buick skylark 320  
2          70       1         plymouth satellite  
3          70       1              amc rebel sst  
4          70       1                ford torino  


# Replace "?" with np.nan

In [41]:
data.replace("?", np.nan, inplace=True)

In [42]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            398 non-null    object 
 1   mpg           398 non-null    float64
 2   cylinders     398 non-null    int64  
 3   displacement  398 non-null    float64
 4   horsepower    392 non-null    object 
 5   weight        398 non-null    float64
 6   acceleration  398 non-null    float64
 7   model year    398 non-null    int64  
 8   origin        398 non-null    int64  
 9   car name      398 non-null    object 
dtypes: float64(4), int64(3), object(3)
memory usage: 31.2+ KB


# Delete Car Name

In [43]:
data['car name'].nunique()

305

In [44]:
# car names are not unique, so cannot be used as identifier
# but there are too many to be used as a categorical feature

In [45]:
del data['car name']

In [46]:
print(data.shape)
data.head()

(398, 9)


Unnamed: 0,id,mpg,cylinders,displacement,horsepower,weight,acceleration,model year,origin
0,0,18.0,8,307.0,130.0,3504.0,12.0,70,1
1,1,15.0,8,350.0,165.0,3693.0,11.5,70,1
2,2,18.0,8,318.0,150.0,3436.0,11.0,70,1
3,3,16.0,8,304.0,150.0,3433.0,12.0,70,1
4,4,17.0,8,302.0,140.0,3449.0,10.5,70,1


# Save Main Data File

In [48]:
data.to_csv(outp_fname, index=False)