In [40]:
from __future__ import print_function
import os
data_path = ['./']

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, LeaveOneOut
from sklearn.linear_model import LinearRegression, LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler

In [2]:
import pandas as pd
import numpy as np

In [3]:
filepath = os.sep.join(data_path + ['CarPrice_Assignment.csv'])
data = pd.read_csv(filepath, sep=',')
print(data.shape)

(205, 26)


In [4]:
data.dtypes.value_counts()

object     10
int64       8
float64     8
dtype: int64

In [5]:
# parse Brand name from CarName column and drop original CarName column
data['BrandName'] = data['CarName'].apply(lambda x: x.split()[0])
data = data.drop('CarName', axis=1)
data

Unnamed: 0,car_ID,symboling,fueltype,aspiration,doornumber,carbody,drivewheel,enginelocation,wheelbase,carlength,...,fuelsystem,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price,BrandName
0,1,3,gas,std,two,convertible,rwd,front,88.6,168.8,...,mpfi,3.47,2.68,9.0,111,5000,21,27,13495.0,alfa-romero
1,2,3,gas,std,two,convertible,rwd,front,88.6,168.8,...,mpfi,3.47,2.68,9.0,111,5000,21,27,16500.0,alfa-romero
2,3,1,gas,std,two,hatchback,rwd,front,94.5,171.2,...,mpfi,2.68,3.47,9.0,154,5000,19,26,16500.0,alfa-romero
3,4,2,gas,std,four,sedan,fwd,front,99.8,176.6,...,mpfi,3.19,3.40,10.0,102,5500,24,30,13950.0,audi
4,5,2,gas,std,four,sedan,4wd,front,99.4,176.6,...,mpfi,3.19,3.40,8.0,115,5500,18,22,17450.0,audi
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,201,-1,gas,std,four,sedan,rwd,front,109.1,188.8,...,mpfi,3.78,3.15,9.5,114,5400,23,28,16845.0,volvo
201,202,-1,gas,turbo,four,sedan,rwd,front,109.1,188.8,...,mpfi,3.78,3.15,8.7,160,5300,19,25,19045.0,volvo
202,203,-1,gas,std,four,sedan,rwd,front,109.1,188.8,...,mpfi,3.58,2.87,8.8,134,5500,18,23,21485.0,volvo
203,204,-1,diesel,turbo,four,sedan,rwd,front,109.1,188.8,...,idi,3.01,3.40,23.0,106,4800,26,27,22470.0,volvo


In [6]:
# Select the object (string) columns
mask = data.dtypes == object
categorical_cols = data.columns[mask]
categorical_cols

Index(['fueltype', 'aspiration', 'doornumber', 'carbody', 'drivewheel',
       'enginelocation', 'enginetype', 'cylindernumber', 'fuelsystem',
       'BrandName'],
      dtype='object')

In [11]:
# Determine how many extra columns would be created
num_ohc_cols = (data[categorical_cols]
                .apply(lambda x: x.nunique())
                .sort_values(ascending=False))


# No need to encode if there is only one value
small_num_ohc_cols = num_ohc_cols.loc[num_ohc_cols>1]

# Number of one-hot columns is one less than the number of categories
small_num_ohc_cols -= 1

print(small_num_ohc_cols)

# assuming the original ones are dropped. 
small_num_ohc_cols.sum()

BrandName         27
fuelsystem         7
enginetype         6
cylindernumber     6
carbody            4
drivewheel         2
fueltype           1
aspiration         1
doornumber         1
enginelocation     1
dtype: int64


56

In [12]:
# Copy of the data
data_ohc = data.copy()

# The encoders
le = LabelEncoder()
ohc = OneHotEncoder()

for col in num_ohc_cols.index:
    
    # Integer encode the string categories
    dat = le.fit_transform(data_ohc[col]).astype(int)
    
    # Remove the original column from the dataframe
    data_ohc = data_ohc.drop(col, axis=1)

    # One hot encode the data--this returns a sparse array
    new_dat = ohc.fit_transform(dat.reshape(-1,1))

    # Create unique column names
    n_cols = new_dat.shape[1]
    col_names = ['_'.join([col, str(x)]) for x in range(n_cols)]

    # Create the new dataframe
    new_df = pd.DataFrame(new_dat.toarray(), 
                          index=data_ohc.index, 
                          columns=col_names)

    # Append the new data to the dataframe
    data_ohc = pd.concat([data_ohc, new_df], axis=1)

In [13]:
data_ohc

Unnamed: 0,car_ID,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,...,drivewheel_1,drivewheel_2,fueltype_0,fueltype_1,aspiration_0,aspiration_1,doornumber_0,doornumber_1,enginelocation_0,enginelocation_1
0,1,3,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
1,2,3,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
2,3,1,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,...,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
3,4,2,99.8,176.6,66.2,54.3,2337,109,3.19,3.40,...,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
4,5,2,99.4,176.6,66.4,54.3,2824,136,3.19,3.40,...,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,201,-1,109.1,188.8,68.9,55.5,2952,141,3.78,3.15,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
201,202,-1,109.1,188.8,68.8,55.5,3049,141,3.78,3.15,...,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0
202,203,-1,109.1,188.8,68.9,55.5,3012,173,3.58,2.87,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0
203,204,-1,109.1,188.8,68.9,55.5,3217,145,3.01,3.40,...,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0


In [20]:
data_drop = data.drop(num_ohc_cols.index, axis=1)
data_drop

Unnamed: 0,car_ID,symboling,wheelbase,carlength,carwidth,carheight,curbweight,enginesize,boreratio,stroke,compressionratio,horsepower,peakrpm,citympg,highwaympg,price
0,1,3,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27,13495.0
1,2,3,88.6,168.8,64.1,48.8,2548,130,3.47,2.68,9.0,111,5000,21,27,16500.0
2,3,1,94.5,171.2,65.5,52.4,2823,152,2.68,3.47,9.0,154,5000,19,26,16500.0
3,4,2,99.8,176.6,66.2,54.3,2337,109,3.19,3.40,10.0,102,5500,24,30,13950.0
4,5,2,99.4,176.6,66.4,54.3,2824,136,3.19,3.40,8.0,115,5500,18,22,17450.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
200,201,-1,109.1,188.8,68.9,55.5,2952,141,3.78,3.15,9.5,114,5400,23,28,16845.0
201,202,-1,109.1,188.8,68.8,55.5,3049,141,3.78,3.15,8.7,160,5300,19,25,19045.0
202,203,-1,109.1,188.8,68.9,55.5,3012,173,3.58,2.87,8.8,134,5500,18,23,21485.0
203,204,-1,109.1,188.8,68.9,55.5,3217,145,3.01,3.40,23.0,106,4800,26,27,22470.0


In [25]:
y_col = 'price'

# Split the data that is not one-hot encoded
feature_cols = [x for x in data_drop.columns if x != y_col]
X_data_drop = data_drop[feature_cols]
y_data_drop = data_drop[y_col]

X_train_drop, X_test_drop, y_train_drop, y_test_drop = train_test_split(X_data_drop, y_data_drop, 
                                                    test_size=0.3, random_state=42)

# Split the data that is one-hot encoded
feature_cols = [x for x in data_ohc.columns if x != y_col]
X_data_ohc = data_ohc[feature_cols]
y_data_ohc = data_ohc[y_col]

X_train_ohc, X_test_ohc, y_train_ohc, y_test_ohc = train_test_split(X_data_ohc, y_data_ohc, 
                                                                    test_size=0.3, random_state=42)

In [26]:
error_lr = list()

lr = LinearRegression()

lr = lr.fit(X_train_drop, y_train_drop)
y_train_drop_pred = lr.predict(X_train_drop)
y_test_drop_pred = lr.predict(X_test_drop)

error_lr.append(pd.Series({'train': mean_squared_error(y_train_drop, y_train_drop_pred),
                           'test' : mean_squared_error(y_test_drop,  y_test_drop_pred)},
                           name='no enc'))

lr = LinearRegression()

lr = lr.fit(X_train_ohc, y_train_ohc)
y_train_ohc_pred = lr.predict(X_train_ohc)
y_test_ohc_pred = lr.predict(X_test_ohc)

error_lr.append(pd.Series({'train': mean_squared_error(y_train_ohc, y_train_ohc_pred),
                           'test' : mean_squared_error(y_test_ohc,  y_test_ohc_pred)},
                          name='one-hot enc'))

error_lr = pd.concat(error_lr, axis=1)
error_lr

Unnamed: 0,no enc,one-hot enc
train,7652381.0,1243309.0
test,11843650.0,9724301.0


In [28]:
scalers = {'standard': StandardScaler(),
           'minmax': MinMaxScaler(),
           'maxabs': MaxAbsScaler()}

training_test_sets = {
    'not_encoded': (X_train_drop, y_train_drop, X_test_drop, y_test_drop),
    'one_hot_encoded': (X_train_ohc, y_train_ohc, X_test_ohc, y_test_ohc)}

In [34]:
mask = X_train_drop.dtypes == float
float_columns = X_train_drop.columns[mask]

In [38]:
float_columns

Index(['wheelbase', 'carlength', 'carwidth', 'carheight', 'boreratio',
       'stroke', 'compressionratio'],
      dtype='object')

In [37]:
skew_limit = 0.75
skew_vals = X_train_drop[float_columns].skew()

skew_cols = (skew_vals
             .sort_values(ascending=False)
             .to_frame()
             .rename(columns={0:'Skew'})
             .query('abs(Skew) > {0}'.format(skew_limit)))

skew_cols

Unnamed: 0,Skew
compressionratio,2.893401
carwidth,1.00834
wheelbase,1.005296


In [32]:
mask = X_train_drop.dtypes == float
float_columns = X_train_drop.columns[mask]

# iterate over all possible combinations and get the errors
errors = {}
for encoding_label, (_X_train, _y_train, _X_test, _y_test) in training_test_sets.items():
    for scaler_label, scaler in scalers.items():
        lr = LinearRegression()
        trainingset = _X_train.copy()  # copy because we dont want to scale this more than once.
        testset = _X_test.copy()
        trainingset[float_columns] = scaler.fit_transform(trainingset[float_columns])
        testset[float_columns] = scaler.transform(testset[float_columns])
        lr.fit(trainingset, _y_train)
        predictions = lr.predict(testset)
        key = encoding_label + ' - ' + scaler_label + 'scaling'
        errors[key] = mean_squared_error(_y_test, predictions)

errors = pd.Series(errors)
for key, error_val in errors.items():
    print(key, error_val)

not_encoded - standardscaling 11843650.222782062
not_encoded - minmaxscaling 11843650.222782016
not_encoded - maxabsscaling 11843650.222782869
one_hot_encoded - standardscaling 9724301.025178172
one_hot_encoded - minmaxscaling 9724301.025168572
one_hot_encoded - maxabsscaling 9724301.025177369


In [45]:
import warnings
warnings.filterwarnings('ignore', module='sklearn')

In [43]:
alphas = np.array([0.00001, 0.0001, 0.001, 0.01, 0.1, 1])

lassoCV = LassoCV(alphas=alphas,
                  max_iter=10000,
                  cv=LeaveOneOut()).fit(X_train_drop, y_train_drop)

lassoCV_mse = mean_squared_error(y_test_drop, lassoCV.predict(X_test_drop))

print(lassoCV.alpha_, lassoCV_mse)  # Lasso is slower

1.0 11845231.642826049


In [47]:
alphas = np.array([0.0001, 0.001, 0.01, 0.1, 1])

lassoCV = LassoCV(alphas=alphas,
                  max_iter=10000,
                  cv=LeaveOneOut()).fit(X_train_ohc, y_train_ohc)

lassoCV_mse = mean_squared_error(y_test_ohc, lassoCV.predict(X_test_ohc))

print(lassoCV.alpha_, lassoCV_mse)  # Lasso is slower

1.0 7125759.860192333


In [50]:
errors = {}
for encoding_label, (_X_train, _y_train, _X_test, _y_test) in training_test_sets.items():
    for scaler_label, scaler in scalers.items():
        lassocv = LassoCV(alphas=alphas, max_iter=10000, cv=LeaveOneOut())
        trainingset = _X_train.copy()  # copy because we dont want to scale this more than once.
        testset = _X_test.copy()
        trainingset[float_columns] = scaler.fit_transform(trainingset[float_columns])
        testset[float_columns] = scaler.transform(testset[float_columns])
        lassocv.fit(trainingset, _y_train)
        predictions = lassocv.predict(testset)
        key = encoding_label + ' - ' + scaler_label + 'scaling' + str(lassocv.alpha_)
        errors[key] = mean_squared_error(_y_test, predictions)

In [51]:
errors = pd.Series(errors)
for key, error_val in errors.items():
    print(key, error_val)

not_encoded - standardscaling1.0 11843625.048696605
not_encoded - minmaxscaling1.0 11842658.778776003
not_encoded - maxabsscaling1.0 11798513.894788014
one_hot_encoded - standardscaling1.0 7135900.509251606
one_hot_encoded - minmaxscaling1.0 7214231.277464152
one_hot_encoded - maxabsscaling1.0 7208921.973776866


In [52]:
degree = 20
pf = PolynomialFeatures(degree)

NameError: name 'PolynomialFeatures' is not defined