In [1]:
import pandas as pd
import numpy as np

In [2]:
# Switch off Jedi for IPython's tab completer in this session.
%config Completer.use_jedi = False

In [13]:
# TODO (26.04.21): move the preprocessing into a separate module to keep this notebook clean

In [3]:
# Field names for the data file, in the order they are assigned to the
# parsed columns (26 entries).
columns = [
    "symboling", "normalized-losses", "make", "fuel-type", "aspiration",
    "num-of-doors", "body-style", "drive-wheels", "engine-location",
    "wheel-base", "length", "width", "height", "curb-weight",
    "engine-type", "num-of-cylinders", "engine-size", "fuel-system",
    "bore", "stroke", "compression-ratio", "horsepower", "peak-rpm",
    "city-mpg", "highway-mpg", "price",
]

In [4]:
# Read the raw records; no header row is parsed from the file, so the
# field names are supplied through `columns`.
df = pd.read_csv(
    "data/imports-85.data",
    names=columns,
    header=None,
)

In [5]:
# Preprocessing (per the task instructions)
# - '?' marks a missing value
# - 'symboling' is ignored
# - rows without a 'normalized-losses' value (the target) are skipped
#
# The steps form one non-mutating chain so this cell can be re-run:
# errors="ignore" keeps the drop from raising a KeyError once 'symboling'
# has already been removed by a previous run.
df = (
    df.replace("?", np.nan)
    .drop(columns="symboling", errors="ignore")
    .dropna(subset=["normalized-losses"])
)

df.head(3)

Unnamed: 0,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
3,164,audi,gas,std,four,sedan,fwd,front,99.8,176.6,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,164,audi,gas,std,four,sedan,4wd,front,99.4,176.6,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450
6,158,audi,gas,std,four,sedan,fwd,front,105.8,192.7,...,136,mpfi,3.19,3.4,8.5,110,5500,19,25,17710


In [6]:
# Number of rows currently in the frame
df.shape[0]

164

In [7]:
# Dtypes of the first five columns
df.dtypes.head(5)

normalized-losses    object
make                 object
fuel-type            object
aspiration           object
num-of-doors         object
dtype: object

In [8]:
# Correcting data types: columns are split into those to be treated as
# numeric and those kept as objects
# (see "Attribute Information" in `data/imports-85.names`).
numerics = [
    "normalized-losses", "wheel-base", "length", "width", "height",
    "curb-weight", "engine-size", "bore", "stroke", "compression-ratio",
    "horsepower", "peak-rpm", "city-mpg", "highway-mpg", "price",
]

objects = [
    "make", "fuel-type", "aspiration", "num-of-doors", "body-style",
    "drive-wheels", "engine-location", "engine-type", "num-of-cylinders",
    "fuel-system",
]

# Show how many columns fall into each group.
print(f"{len(numerics)} {len(objects)}")

15 10


In [9]:
# The categorical columns are already stored as objects, so only the
# numeric columns that are still object-typed have to be converted.
to_cast = [
    name
    for name, kind in df.dtypes.items()
    if name in numerics and kind == 'object'
]
df = df.astype({name: float for name in to_cast})

In [10]:
# Cleaned DataFrame: preview the first three rows after the dtype conversion
df.head(3)

Unnamed: 0,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
3,164.0,audi,gas,std,four,sedan,fwd,front,99.8,176.6,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,164.0,audi,gas,std,four,sedan,4wd,front,99.4,176.6,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0
6,158.0,audi,gas,std,four,sedan,fwd,front,105.8,192.7,...,136,mpfi,3.19,3.4,8.5,110.0,5500.0,19,25,17710.0


In [12]:
# Summary statistics (count, mean, std, min, quartiles, max) per column
df.describe()

Unnamed: 0,normalized-losses,wheel-base,length,width,height,curb-weight,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
count,164.0,164.0,164.0,164.0,164.0,164.0,164.0,160.0,160.0,164.0,164.0,164.0,164.0,164.0,164.0
mean,122.0,98.164024,172.238415,65.59878,53.77439,2458.27439,117.957317,3.298437,3.237312,10.126951,96.207317,5137.804878,26.268293,31.847561,11466.518293
std,35.442168,5.120198,11.417833,1.923028,2.343942,475.087068,30.896294,0.267348,0.29421,3.836306,30.408563,479.459113,6.193305,6.514349,5803.490319
min,65.0,86.6,141.1,60.3,49.4,1488.0,61.0,2.54,2.07,7.0,48.0,4150.0,15.0,18.0,5118.0
25%,94.0,94.5,165.675,64.0,52.0,2090.75,97.0,3.05,3.1075,8.7,69.0,4800.0,22.0,28.0,7446.0
50%,115.0,96.55,172.0,65.4,54.1,2367.5,109.0,3.27,3.27,9.0,91.0,5200.0,26.0,32.0,9268.5
75%,150.0,100.4,177.8,66.5,55.5,2785.5,131.75,3.55,3.41,9.4,114.0,5500.0,31.0,37.0,14559.25
max,256.0,115.6,202.6,71.7,59.8,4066.0,258.0,3.94,4.17,23.0,200.0,6600.0,49.0,54.0,35056.0
