In [25]:
import pandas as pd
import numpy as np

# Define the headers since the data does not have any
headers = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
           "num_doors", "body_style", "drive_wheels", "engine_location",
           "wheel_base", "length", "width", "height", "curb_weight",
           "engine_type", "num_cylinders", "engine_size", "fuel_system",
           "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
           "city_mpg", "highway_mpg", "price"]

# Read in the CSV file and convert "?" to NaN
df = pd.read_csv("https://archive.ics.uci.edu/ml/machine-learning-databases/autos/imports-85.data",
                  header=None, names=headers, na_values="?" )
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [26]:
df.dtypes

symboling              int64
normalized_losses    float64
make                  object
fuel_type             object
aspiration            object
num_doors             object
body_style            object
drive_wheels          object
engine_location       object
wheel_base           float64
length               float64
width                float64
height               float64
curb_weight            int64
engine_type           object
num_cylinders         object
engine_size            int64
fuel_system           object
bore                 float64
stroke               float64
compression_ratio    float64
horsepower           float64
peak_rpm             float64
city_mpg               int64
highway_mpg            int64
price                float64
dtype: object

In [27]:
obj_df = df.select_dtypes(include=['object']).copy()
obj_df.head()

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
1,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
2,alfa-romero,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi
3,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi
4,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi


In [28]:
obj_df[obj_df.isnull().any(axis=1)]

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
27,dodge,gas,turbo,,sedan,fwd,front,ohc,four,mpfi
63,mazda,diesel,std,,sedan,fwd,front,ohc,four,idi


In [29]:
obj_df['num_doors'].value_counts()

num_doors
four    114
two      89
Name: count, dtype: int64

In [30]:
obj_df = obj_df.fillna({'num_doors':'four'})

### Approach #1 Find and Replace

In [31]:
obj_df['num_cylinders'].value_counts()

num_cylinders
four      159
six        24
five       11
eight       5
two         4
three       1
twelve      1
Name: count, dtype: int64

In [32]:
cleanup_nums = {'num_doors':{'four': 4, 'two': 2},
                'num_cylinders':{'four': 4, 'six': 6, 'five': 5, 'eight': 8,
                                'two': 2, 'twelve':12, 'three': 3}}

In [33]:
obj_df = obj_df.replace(cleanup_nums)
obj_df.head()

  obj_df = obj_df.replace(cleanup_nums)


Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,alfa-romero,gas,std,2,convertible,rwd,front,dohc,4,mpfi
1,alfa-romero,gas,std,2,convertible,rwd,front,dohc,4,mpfi
2,alfa-romero,gas,std,2,hatchback,rwd,front,ohcv,6,mpfi
3,audi,gas,std,4,sedan,fwd,front,ohc,4,mpfi
4,audi,gas,std,4,sedan,4wd,front,ohc,5,mpfi


### Approach #2 Label Encoding

In [11]:
obj_df['body_style'] = obj_df['body_style'].astype('category')
obj_df.dtypes

make                 object
fuel_type            object
aspiration           object
num_doors             int64
body_style         category
drive_wheels         object
engine_location      object
engine_type          object
num_cylinders         int64
fuel_system          object
dtype: object

In [12]:
obj_df['body_style'] = obj_df['body_style'].cat.codes
obj_df.head()

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,alfa-romero,gas,std,2,0,rwd,front,dohc,4,mpfi
1,alfa-romero,gas,std,2,0,rwd,front,dohc,4,mpfi
2,alfa-romero,gas,std,2,2,rwd,front,ohcv,6,mpfi
3,audi,gas,std,4,3,fwd,front,ohc,4,mpfi
4,audi,gas,std,4,3,4wd,front,ohc,5,mpfi


### Approach #3 One Hot Encoding

In [13]:
pd.get_dummies(obj_df, columns=['drive_wheels']).head()

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,engine_location,engine_type,num_cylinders,fuel_system,drive_wheels_4wd,drive_wheels_fwd,drive_wheels_rwd
0,alfa-romero,gas,std,2,0,front,dohc,4,mpfi,False,False,True
1,alfa-romero,gas,std,2,0,front,dohc,4,mpfi,False,False,True
2,alfa-romero,gas,std,2,2,front,ohcv,6,mpfi,False,False,True
3,audi,gas,std,4,3,front,ohc,4,mpfi,False,True,False
4,audi,gas,std,4,3,front,ohc,5,mpfi,True,False,False


In [14]:
pd.get_dummies(obj_df, columns=["body_style", "drive_wheels"], prefix=["body", "drive"]).head()

Unnamed: 0,make,fuel_type,aspiration,num_doors,engine_location,engine_type,num_cylinders,fuel_system,body_0,body_1,body_2,body_3,body_4,drive_4wd,drive_fwd,drive_rwd
0,alfa-romero,gas,std,2,front,dohc,4,mpfi,True,False,False,False,False,False,False,True
1,alfa-romero,gas,std,2,front,dohc,4,mpfi,True,False,False,False,False,False,False,True
2,alfa-romero,gas,std,2,front,ohcv,6,mpfi,False,False,True,False,False,False,False,True
3,audi,gas,std,4,front,ohc,4,mpfi,False,False,False,True,False,False,True,False
4,audi,gas,std,4,front,ohc,5,mpfi,False,False,False,True,False,True,False,False


### Custom Binary Encoding

In [15]:
obj_df['engine_type'].value_counts()

engine_type
ohc      148
ohcf      15
ohcv      13
dohc      12
l         12
rotor      4
dohcv      1
Name: count, dtype: int64

In [16]:
obj_df['OHC_Code'] = np.where(obj_df['engine_type'].str.contains('ohc'), 1, 0)

In [17]:
obj_df[['make', 'engine_type', 'OHC_Code']].head()

Unnamed: 0,make,engine_type,OHC_Code
0,alfa-romero,dohc,1
1,alfa-romero,dohc,1
2,alfa-romero,ohcv,1
3,audi,ohc,1
4,audi,ohc,1


### Scikit Learn

### OrdinalEncoder - Label Encode

In [22]:
from sklearn.preprocessing import OrdinalEncoder

ord_enc = OrdinalEncoder()
obj_df['make_code'] = ord_enc.fit_transform(obj_df[['make']])
obj_df[['make', 'make_code']].head()

Unnamed: 0,make,make_code
0,alfa-romero,0.0
1,alfa-romero,0.0
2,alfa-romero,0.0
3,audi,1.0
4,audi,1.0


### OneHotEncoder

In [39]:
from sklearn.preprocessing import OneHotEncoder

oe_style = OneHotEncoder()
oe_results = oe_style.fit_transform(obj_df[['body_style']])
pd.DataFrame(oe_results.toarray(), columns=oe_style.categories_).head()

Unnamed: 0,convertible,hardtop,hatchback,sedan,wagon
0,1.0,0.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0


In [41]:
obj_df = obj_df.join(pd.DataFrame(oe_results.toarray(), columns=oe_style.categories_))

In [42]:
obj_df.head()

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system,"(convertible,)","(hardtop,)","(hatchback,)","(sedan,)","(wagon,)"
0,alfa-romero,gas,std,2,convertible,rwd,front,dohc,4,mpfi,1.0,0.0,0.0,0.0,0.0
1,alfa-romero,gas,std,2,convertible,rwd,front,dohc,4,mpfi,1.0,0.0,0.0,0.0,0.0
2,alfa-romero,gas,std,2,hatchback,rwd,front,ohcv,6,mpfi,0.0,0.0,1.0,0.0,0.0
3,audi,gas,std,4,sedan,fwd,front,ohc,4,mpfi,0.0,0.0,0.0,1.0,0.0
4,audi,gas,std,4,sedan,4wd,front,ohc,5,mpfi,0.0,0.0,0.0,1.0,0.0


### Advanced Approaches

### Backward Difference Encoding

In [45]:
!pip install category_encoders

Collecting category_encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl.metadata (8.0 kB)
Collecting statsmodels>=0.9.0 (from category_encoders)
  Downloading statsmodels-0.14.2-cp312-cp312-win_amd64.whl.metadata (9.5 kB)
Collecting patsy>=0.5.1 (from category_encoders)
  Downloading patsy-0.5.6-py2.py3-none-any.whl.metadata (3.5 kB)
Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
   ---------------------------------------- 0.0/81.9 kB ? eta -:--:--
   ----- ---------------------------------- 10.2/81.9 kB ? eta -:--:--
   --------------- ------------------------ 30.7/81.9 kB 435.7 kB/s eta 0:00:01
   ---------------------------------------- 81.9/81.9 kB 651.1 kB/s eta 0:00:00
Downloading patsy-0.5.6-py2.py3-none-any.whl (233 kB)
   ---------------------------------------- 0.0/233.9 kB ? eta -:--:--
   -------------- ------------------------- 81.9/233.9 kB 2.3 MB/s eta 0:00:01
   --------------- ------------------------ 92.2/233.9 kB 1.7 MB/s eta 0:00:01

In [46]:
import category_encoders as ce

In [49]:
obj_df = df.select_dtypes(include=['object']).copy()

encoder = ce.BackwardDifferenceEncoder(cols=['engine_type'])
encoder.fit_transform(obj_df, verbose=1).iloc[:, 8:14].head()



Unnamed: 0,engine_type_0,engine_type_1,engine_type_2,engine_type_3,engine_type_4,engine_type_5
0,-0.857143,-0.714286,-0.571429,-0.428571,-0.285714,-0.142857
1,-0.857143,-0.714286,-0.571429,-0.428571,-0.285714,-0.142857
2,0.142857,-0.714286,-0.571429,-0.428571,-0.285714,-0.142857
3,0.142857,0.285714,-0.571429,-0.428571,-0.285714,-0.142857
4,0.142857,0.285714,-0.571429,-0.428571,-0.285714,-0.142857


### Polynomial Encoding

In [50]:
encoder = ce.PolynomialEncoder(cols=['engine_type'])
encoder.fit_transform(obj_df, verbose=1).iloc[:, 8:14].head()



Unnamed: 0,engine_type_0,engine_type_1,engine_type_2,engine_type_3,engine_type_4,engine_type_5
0,-0.566947,0.545545,-0.408248,0.241747,-0.109109,0.032898
1,-0.566947,0.545545,-0.408248,0.241747,-0.109109,0.032898
2,-0.377964,0.0,0.408248,-0.564076,0.436436,-0.197386
3,-0.188982,-0.327327,0.408248,0.080582,-0.545545,0.493464
4,-0.188982,-0.327327,0.408248,0.080582,-0.545545,0.493464
