# Exercise 13

This particular Automobile Data Set includes a good mix of categorical values as well as continuous values and serves as a useful example that is relatively easy to understand. Since domain understanding is an important aspect when deciding how to encode various categorical values - this data set makes a good case study.

Read the data into Pandas

In [468]:
import pandas as pd

# Define the headers since the data does not have any
headers = ["symboling", "normalized_losses", "make", "fuel_type", "aspiration",
           "num_doors", "body_style", "drive_wheels", "engine_location",
           "wheel_base", "length", "width", "height", "curb_weight",
           "engine_type", "num_cylinders", "engine_size", "fuel_system",
           "bore", "stroke", "compression_ratio", "horsepower", "peak_rpm",
           "city_mpg", "highway_mpg", "price"]

# Read in the CSV file and convert "?" to NaN
df = pd.read_csv("http://mlr.cs.umass.edu/ml/machine-learning-databases/autos/imports-85.data",
                  header=None, names=headers, na_values="?" )
df.head()

Unnamed: 0,symboling,normalized_losses,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,wheel_base,...,engine_size,fuel_system,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [469]:
df.shape

(205, 26)

In [470]:
df.dtypes

symboling              int64
normalized_losses    float64
make                  object
fuel_type             object
aspiration            object
num_doors             object
body_style            object
drive_wheels          object
engine_location       object
wheel_base           float64
length               float64
width                float64
height               float64
curb_weight            int64
engine_type           object
num_cylinders         object
engine_size            int64
fuel_system           object
bore                 float64
stroke               float64
compression_ratio    float64
horsepower           float64
peak_rpm             float64
city_mpg               int64
highway_mpg            int64
price                float64
dtype: object

In [471]:
obj_df = df.select_dtypes(include=['object']).copy()
obj_df.head()

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
1,alfa-romero,gas,std,two,convertible,rwd,front,dohc,four,mpfi
2,alfa-romero,gas,std,two,hatchback,rwd,front,ohcv,six,mpfi
3,audi,gas,std,four,sedan,fwd,front,ohc,four,mpfi
4,audi,gas,std,four,sedan,4wd,front,ohc,five,mpfi


# Exercise 13.1

Does the database contain missing values? If so, replace them using one of the methods explained in class

In [472]:
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
import category_encoders as ce

In [473]:
df.isnull().sum()

symboling             0
normalized_losses    41
make                  0
fuel_type             0
aspiration            0
num_doors             2
body_style            0
drive_wheels          0
engine_location       0
wheel_base            0
length                0
width                 0
height                0
curb_weight           0
engine_type           0
num_cylinders         0
engine_size           0
fuel_system           0
bore                  4
stroke                4
compression_ratio     0
horsepower            2
peak_rpm              2
city_mpg              0
highway_mpg           0
price                 4
dtype: int64

In [474]:
df.dropna().shape

(159, 26)

In [475]:
df.normalized_losses.fillna(df.normalized_losses.mean(), inplace=True)
df.bore.fillna(df.bore.mean(), inplace=True)
df.stroke.fillna(df.stroke.mean(), inplace=True)
df.peak_rpm.fillna(df.peak_rpm.median(), inplace=True)

In [476]:
df.isnull().sum()

symboling            0
normalized_losses    0
make                 0
fuel_type            0
aspiration           0
num_doors            2
body_style           0
drive_wheels         0
engine_location      0
wheel_base           0
length               0
width                0
height               0
curb_weight          0
engine_type          0
num_cylinders        0
engine_size          0
fuel_system          0
bore                 0
stroke               0
compression_ratio    0
horsepower           2
peak_rpm             0
city_mpg             0
highway_mpg          0
price                4
dtype: int64

In [477]:
df.dropna().shape

(197, 26)

In [478]:
df.dropna(inplace=True)
df.shape

(197, 26)

In [479]:
df.isnull().sum()

symboling            0
normalized_losses    0
make                 0
fuel_type            0
aspiration           0
num_doors            0
body_style           0
drive_wheels         0
engine_location      0
wheel_base           0
length               0
width                0
height               0
curb_weight          0
engine_type          0
num_cylinders        0
engine_size          0
fuel_system          0
bore                 0
stroke               0
compression_ratio    0
horsepower           0
peak_rpm             0
city_mpg             0
highway_mpg          0
price                0
dtype: int64

# Exercise 13.2

Split the data into training and testing sets

Train a Random Forest Regressor to predict the price of a car using the nominal features

In [480]:
obj_df = df.select_dtypes(include=['object']).copy()
obj_df1=obj_df.copy()

In [481]:
obj_df1["fuel_type"]=pd.DataFrame(obj_df1.fuel_type.astype("category").cat.codes)
obj_df1["make"]=pd.DataFrame(obj_df1.make.astype("category").cat.codes)
obj_df1["aspiration"]=pd.DataFrame(obj_df1.aspiration.astype("category").cat.codes)
obj_df1["num_doors"]=pd.DataFrame(obj_df1.num_doors.astype("category").cat.codes)
obj_df1["body_style"]=pd.DataFrame(obj_df1.body_style.astype("category").cat.codes)
obj_df1["drive_wheels"]=pd.DataFrame(obj_df1.drive_wheels.astype("category").cat.codes)
obj_df1["engine_location"]=pd.DataFrame(obj_df1.engine_location.astype("category").cat.codes)
obj_df1["engine_type"]=pd.DataFrame(obj_df1.engine_type.astype("category").cat.codes)
obj_df1["num_cylinders"]=pd.DataFrame(obj_df1.num_cylinders.astype("category").cat.codes)
obj_df1["fuel_system"]=pd.DataFrame(obj_df1.fuel_system.astype("category").cat.codes)

In [482]:
obj_df1.head()

Unnamed: 0,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,0,1,0,1,0,2,0,0,2,5
1,0,1,0,1,0,2,0,0,2,5
2,0,1,0,1,2,2,0,4,3,5
3,1,1,0,0,3,1,0,2,2,5
4,1,1,0,0,3,0,0,2,1,5


In [483]:
new_df = df.select_dtypes(include=['float64','int64']).copy()
data = pd.concat([new_df, obj_df1], axis=1)

In [484]:
data.head()

Unnamed: 0,symboling,normalized_losses,wheel_base,length,width,height,curb_weight,engine_size,bore,stroke,...,make,fuel_type,aspiration,num_doors,body_style,drive_wheels,engine_location,engine_type,num_cylinders,fuel_system
0,3,122.0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,...,0,1,0,1,0,2,0,0,2,5
1,3,122.0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,...,0,1,0,1,0,2,0,0,2,5
2,1,122.0,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,...,0,1,0,1,2,2,0,4,3,5
3,2,164.0,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,...,1,1,0,0,3,1,0,2,2,5
4,2,164.0,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,...,1,1,0,0,3,0,0,2,1,5


In [485]:
data.isnull().sum()

symboling            0
normalized_losses    0
wheel_base           0
length               0
width                0
height               0
curb_weight          0
engine_size          0
bore                 0
stroke               0
compression_ratio    0
horsepower           0
peak_rpm             0
city_mpg             0
highway_mpg          0
price                0
make                 0
fuel_type            0
aspiration           0
num_doors            0
body_style           0
drive_wheels         0
engine_location      0
engine_type          0
num_cylinders        0
fuel_system          0
dtype: int64

In [486]:
X = data.drop(['price'], axis=1)
y = data['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

In [487]:
rfr1 = RandomForestRegressor(n_estimators= 500, random_state=42)
rfr1.fit(X_train, y_train);
y_pred = rfr1.predict(X_test)

In [488]:
RMSE=(np.sqrt(metrics.mean_squared_error(y_pred, y_test)))
print("RMSE =",RMSE)

RMSE = 2219.150348011949


# Exercise 13.3

Create dummy variables for the categorical features

Train a Random Forest Regressor and compare

In [489]:
obj_df2= obj_df.copy()
obj_df2= pd.get_dummies(obj_df2, columns = ["fuel_type"], drop_first = True)
obj_df2= pd.get_dummies(obj_df2, columns = ["make"], drop_first = True)
obj_df2= pd.get_dummies(obj_df2, columns = ["aspiration"], drop_first = True)
obj_df2= pd.get_dummies(obj_df2, columns = ["num_doors"], drop_first = True)
obj_df2= pd.get_dummies(obj_df2, columns = ["body_style"], drop_first = True)
obj_df2= pd.get_dummies(obj_df2, columns = ["drive_wheels"], drop_first = True)
obj_df2= pd.get_dummies(obj_df2, columns = ["engine_location"], drop_first = True)
obj_df2= pd.get_dummies(obj_df2, columns = ["engine_type"], drop_first = True)
obj_df2= pd.get_dummies(obj_df2, columns = ["num_cylinders"], drop_first = True)
obj_df2= pd.get_dummies(obj_df2, columns = ["fuel_system"], drop_first = True)
obj_df2.shape

(197, 48)

In [490]:
data1 = pd.concat([new_df, obj_df2], axis=1)

In [491]:
data1.head()

Unnamed: 0,symboling,normalized_losses,wheel_base,length,width,height,curb_weight,engine_size,bore,stroke,...,num_cylinders_three,num_cylinders_twelve,num_cylinders_two,fuel_system_2bbl,fuel_system_4bbl,fuel_system_idi,fuel_system_mfi,fuel_system_mpfi,fuel_system_spdi,fuel_system_spfi
0,3,122.0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,...,0,0,0,0,0,0,0,1,0,0
1,3,122.0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,...,0,0,0,0,0,0,0,1,0,0
2,1,122.0,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,...,0,0,0,0,0,0,0,1,0,0
3,2,164.0,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,...,0,0,0,0,0,0,0,1,0,0
4,2,164.0,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,...,0,0,0,0,0,0,0,1,0,0


In [492]:
X1 = data1.drop(['price'], axis=1)
y1 = data1['price']

X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.3, random_state=1)

In [493]:
rfr2 = RandomForestRegressor(n_estimators= 500, random_state=42)
rfr2.fit(X1_train, y1_train);
y_pred2 = rfr2.predict(X1_test)

In [494]:
RMSE1=(np.sqrt(metrics.mean_squared_error(y_pred2, y1_test)))
print("RMSE =",RMSE1)

RMSE = 2201.920395402495


# Exercise 13.4

Apply two other methods of categorical encoding

compare the results

### Polynomial Coding

In [495]:
obj_df2 = ce.PolynomialEncoder().fit_transform(obj_df)
obj_df2.dropna().shape

(189, 49)

In [496]:
data2 = pd.concat([new_df, obj_df2], axis=1)
data2.dropna(inplace=True)

In [497]:
data2.head()

Unnamed: 0,symboling,normalized_losses,wheel_base,length,width,height,curb_weight,engine_size,bore,stroke,...,num_cylinders_3,num_cylinders_4,num_cylinders_5,fuel_system_0,fuel_system_1,fuel_system_2,fuel_system_3,fuel_system_4,fuel_system_5,fuel_system_6
0,3,122.0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,...,0.241747,-0.109109,0.032898,-0.540062,0.540062,-0.43082,0.282038,-0.149786,0.061546,-0.01707
1,3,122.0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,...,0.241747,-0.109109,0.032898,-0.540062,0.540062,-0.43082,0.282038,-0.149786,0.061546,-0.01707
2,1,122.0,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,...,-0.564076,0.436436,-0.197386,-0.540062,0.540062,-0.43082,0.282038,-0.149786,0.061546,-0.01707
3,2,164.0,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,...,0.241747,-0.109109,0.032898,-0.540062,0.540062,-0.43082,0.282038,-0.149786,0.061546,-0.01707
4,2,164.0,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,...,0.080582,-0.545545,0.493464,-0.540062,0.540062,-0.43082,0.282038,-0.149786,0.061546,-0.01707


In [498]:
X2 = data2.drop(['price'], axis=1)
y2 = data2['price']

X2_train, X2_test, y2_train, y2_test = train_test_split(X2, y2, test_size=0.3, random_state=1)

In [499]:
rfr3 = RandomForestRegressor(n_estimators= 500, random_state=42)
rfr3.fit(X2_train, y2_train);
y_pred3 = rfr3.predict(X2_test)

In [500]:
RMSE2=(np.sqrt(metrics.mean_squared_error(y_pred3, y2_test)))
print("RMSE =",RMSE2)

RMSE = 1864.3916975849727


### Helmert Coding

In [501]:
obj_df3 = ce.HelmertEncoder().fit_transform(obj_df)
obj_df3.dropna().shape

(189, 49)

In [502]:
data3 = pd.concat([new_df, obj_df3], axis=1)
data3.dropna(inplace=True)

In [503]:
data3.head()

Unnamed: 0,symboling,normalized_losses,wheel_base,length,width,height,curb_weight,engine_size,bore,stroke,...,num_cylinders_3,num_cylinders_4,num_cylinders_5,fuel_system_0,fuel_system_1,fuel_system_2,fuel_system_3,fuel_system_4,fuel_system_5,fuel_system_6
0,3,122.0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,3,122.0,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,1,122.0,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,2,164.0,99.8,176.6,66.2,54.3,2337,109,3.19,3.4,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,2,164.0,99.4,176.6,66.4,54.3,2824,136,3.19,3.4,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [504]:
X3 = data3.drop(['price'], axis=1)
y3 = data3['price']

X3_train, X3_test, y3_train, y3_test = train_test_split(X3, y3, test_size=0.3, random_state=1)

In [505]:
rfr4 = RandomForestRegressor(n_estimators= 500, random_state=42)
rfr4.fit(X3_train, y3_train);
y_pred4 = rfr4.predict(X3_test)

In [506]:
RMSE3=(np.sqrt(metrics.mean_squared_error(y_pred4, y3_test)))
print("RMSE =",RMSE3)

RMSE = 1893.1676036595547


### Resumen de resultados

In [507]:
print("RMSE Nominal features  =",RMSE)
print("RMSE Categorical features  =",RMSE1)
print("RMSE Polynomial Coding  =",RMSE2)
print("RMSE Helmert Coding  =",RMSE3)

RMSE Nominal features  = 2219.150348011949
RMSE Categorical features  = 2201.920395402495
RMSE Polynomial Coding  = 1864.3916975849727
RMSE Helmert Coding  = 1893.1676036595547


Observamos que el mejor (menor) *Error Cuadrático Medio o RMSE (Root Mean Squared Error)* se presenta en la predicción con Random Forest para las características categóricas *"Polynomial Coding"* con un RMSE de 1.864,39.