<a href="https://colab.research.google.com/github/sabiipoks/blog-posts/blob/master/Feature_Preprocessing_for_Categorical_and_Ordinal_Features_The_Most_Important_Step.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Feature Preprocessing for Categorical and Ordinal Features: The Most Important Step



The dataset used here is the Automobile data set (http://archive.ics.uci.edu/ml/datasets/Automobile), obtained from the UCI Machine Learning Repository.


Dua, D. and Graff, C. (2019). UCI Machine Learning Repository [http://archive.ics.uci.edu/ml]. Irvine, CA: University of California, School of Information and Computer Science.

In [0]:
# import pandas
import pandas as pd

# The file being imported carries no header row, so the column names are
# supplied explicitly, in file order.
columns = [
    'symboling', 'normalized-losses', 'make', 'fuel-type', 'aspiration',
    'num-of-doors', 'body-style', 'drive-wheels', 'engine-location',
    'wheel-base', 'length', 'width', 'height', 'curb-weight', 'engine-type',
    'num-of-cylinders', 'engine-size', 'fuel-system', 'bore', 'stroke',
    'compression-ratio', 'horsepower', 'peak-rpm', 'city-mpg', 'highway-mpg',
    'price',
]

# Load the dataset; header=None tells pandas the first row is data, not names.
df = pd.read_csv(
    'https://query.data.world/s/sdhzwzf6n2ivkvgabugicfo6oxiais',
    names=columns,
    header=None,
)

In [128]:
# preview the first five rows of the full dataset
df.head(n=5)

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,height,curb-weight,engine-type,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,168.8,64.1,48.8,2548,dohc,four,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,171.2,65.5,52.4,2823,ohcv,six,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,176.6,66.2,54.3,2337,ohc,four,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,176.6,66.4,54.3,2824,ohc,five,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [129]:
# The dataset has many features; keep only a handful of categorical ones
# for the purpose of demonstration.
select_columns = [
    'fuel-type',
    'engine-location',
    'num-of-cylinders',
]

# all rows, only the selected columns
df = df.loc[:, select_columns]
df.head()

Unnamed: 0,fuel-type,engine-location,num-of-cylinders
0,gas,front,four
1,gas,front,four
2,gas,front,six
3,gas,front,four
4,gas,front,five


In [130]:
# Print the distinct values of each selected feature, one array per line,
# in the order: fuel-type, engine-location, num-of-cylinders.
for feature in ('fuel-type', 'engine-location', 'num-of-cylinders'):
    print(df[feature].unique())

['gas' 'diesel']
['front' 'rear']
['four' 'six' 'five' 'three' 'twelve' 'two' 'eight']


In [131]:
# import Label encoder
from sklearn.preprocessing import LabelEncoder

# create a copy of dataset so the categorical frame `df` is left untouched
df_le = df.copy()

# categorical features to encode; the originals are dropped once encoded
columns_to_drop = ['fuel-type','engine-location','num-of-cylinders']

# Fit a separate label encoder for every feature and keep each one, keyed by
# the feature name. Refitting a single shared encoder would overwrite the
# mapping learned for the previous feature, so its codes could no longer be
# translated back with inverse_transform.
# NOTE: the integer codes follow the sorted order of the category strings
# (e.g. 'five' -> 1, 'four' -> 2, 'six' -> 3), not any real-world ordering.
# NOTE(review): scikit-learn documents LabelEncoder as intended for target
# labels; it is used on features here purely for demonstration.
label_encoders = {}
for column in columns_to_drop:
    # create label encoder
    label_encoder = LabelEncoder()

    # fit the label encoder and transform the labels to create a new label encoded feature
    df_le['enc-' + column] = label_encoder.fit_transform(df_le[column])
    label_encoders[column] = label_encoder

# drop original categorical features
df_le = df_le.drop(columns=columns_to_drop)

df_le.head()

Unnamed: 0,enc-fuel-type,enc-engine-location,enc-num-of-cylinders
0,1,0,2
1,1,0,2
2,1,0,3
3,1,0,2
4,1,0,1


In [132]:
# import OneHotEncoder
from sklearn.preprocessing import OneHotEncoder

# create one hot encoder
one_hot_encoder = OneHotEncoder()

# create a copy of the dataset
df_ohe = df.copy()

# Fit on the underlying array rather than the DataFrame: without column names
# the encoder labels its outputs positionally (x0_..., x1_..., x2_...) on
# every scikit-learn version, which keeps the resulting column names stable.
ohe_input = df_ohe.to_numpy()

# fit one hot encoder
one_hot_encoder = one_hot_encoder.fit(ohe_input)

# transform dataset (the encoder returns a sparse matrix; convert it to a dense array)
ohelabels = one_hot_encoder.transform(ohe_input).toarray()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(one_hot_encoder, 'get_feature_names_out'):
    ohe_columns = one_hot_encoder.get_feature_names_out()
else:
    ohe_columns = one_hot_encoder.get_feature_names()

df_ohe = pd.DataFrame(ohelabels, columns=ohe_columns)

df_ohe.head()


Unnamed: 0,x0_diesel,x0_gas,x1_front,x1_rear,x2_eight,x2_five,x2_four,x2_six,x2_three,x2_twelve,x2_two
0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,0.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
