# Preprocessing and Training Dataset

In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [16]:
df_cars = pd.read_csv('new_cars_cleaned.csv', low_memory = False)
df_cars.head()

Unnamed: 0,MSRP,EPA Fuel Economy Est - City (MPG),Propulsion Source,Drivetrain,Passenger Capacity,Passenger Doors,Body Style,Base Curb Weight (lbs),Passenger Volume (ft³),Wheelbase (in),...,Roadside Assistance Years,Roadside Assistance Miles/km,Model Year,Manufacturer,Car Model,Engine Type,Front Tire Width,Front Tire Aspect Ratio,Front Wheel Diameter (in),Engine Compression
0,40600.0,22.0,Gas,FWD,5,4,SUV,3790.0,104.0,108.3,...,4.0,50000,2019,Acura,RDX,I4,235.0,55.0,19.0,Turbocharged
1,45500.0,22.0,Gas,FWD,5,4,SUV,3829.0,104.0,108.3,...,4.0,50000,2019,Acura,RDX,I4,235.0,55.0,19.0,Turbocharged
2,43600.0,22.0,Gas,FWD,5,4,SUV,3821.0,104.0,108.3,...,4.0,50000,2019,Acura,RDX,I4,255.0,45.0,20.0,Turbocharged
3,37400.0,22.0,Gas,FWD,5,4,SUV,3783.0,104.0,108.3,...,4.0,50000,2019,Acura,RDX,I4,235.0,55.0,19.0,Turbocharged
4,42600.0,21.0,Gas,AWD,5,4,SUV,4026.0,104.0,108.3,...,4.0,50000,2019,Acura,RDX,I4,235.0,55.0,19.0,Turbocharged


In [7]:
df_cars.dtypes

MSRP                                    float64
EPA Fuel Economy Est - City (MPG)       float64
Propulsion Source                        object
Drivetrain                               object
Passenger Capacity                        int64
Passenger Doors                           int64
Body Style                               object
Base Curb Weight (lbs)                  float64
Passenger Volume (ft³)                  float64
Wheelbase (in)                          float64
Track Width, Front (in)                 float64
Height, Overall (in)                    float64
Fuel Tank Capacity, Approx (gal)        float64
Max Torque                              float64
Fuel System                              object
Max Horsepower                          float64
Displacement                            float64
Transmission Type                        object
Transmission Gears                      float64
Suspension Type - Front                  object
Suspension Type - Rear                  

As we can see above there are a lot of non-numerical categorical features still left in our dataset.  So to address this we will use pandas get_dummies function to convert these to dummy/indicator variables (convert them to numerical). Some of the categorical features (such as 'Parking Aid' or 'Fog Lamps' are simply binary yes/no features while others (like 'Engine Type' or 'Body Style') have several different values.  For those that are yes/no we will use dummy variables and for the others we will one hot encode (n features vs n - 1 features where n is the number of unique values in the feature).

In [8]:
# I am going to make a copy of our dataset and test out pandas get dummies method on a few of the categorical features
df_cars_test = df_cars.copy()
df_cars['Parking Aid'].head(20)

0     Yes
1     Yes
2     Yes
3      No
4     Yes
5     Yes
6     Yes
7      No
8      No
9      No
10    Yes
11     No
12     No
13     No
14     No
15    Yes
16     No
17     No
18     No
19     No
Name: Parking Aid, dtype: object

In [10]:
df_cars_test = pd.get_dummies(data = df_cars_test, columns = ['Parking Aid', 'Back-Up Camera'], drop_first = True)

In [11]:
# Lets see if it works as intended
df_cars_test.dtypes

MSRP                                    float64
EPA Fuel Economy Est - City (MPG)       float64
Propulsion Source                        object
Drivetrain                               object
Passenger Capacity                        int64
Passenger Doors                           int64
Body Style                               object
Base Curb Weight (lbs)                  float64
Passenger Volume (ft³)                  float64
Wheelbase (in)                          float64
Track Width, Front (in)                 float64
Height, Overall (in)                    float64
Fuel Tank Capacity, Approx (gal)        float64
Max Torque                              float64
Fuel System                              object
Max Horsepower                          float64
Displacement                            float64
Transmission Type                        object
Transmission Gears                      float64
Suspension Type - Front                  object
Suspension Type - Rear                  

In [12]:
df_cars_test.head()

Unnamed: 0,MSRP,EPA Fuel Economy Est - City (MPG),Propulsion Source,Drivetrain,Passenger Capacity,Passenger Doors,Body Style,Base Curb Weight (lbs),Passenger Volume (ft³),Wheelbase (in),...,Model Year,Manufacturer,Car Model,Engine Type,Front Tire Width,Front Tire Aspect Ratio,Front Wheel Diameter (in),Engine Compression,Parking Aid_Yes,Back-Up Camera_Yes
0,40600.0,22.0,Gas,FWD,5,4,SUV,3790.0,104.0,108.3,...,2019,Acura,RDX,I4,235.0,55.0,19.0,Turbocharged,1,1
1,45500.0,22.0,Gas,FWD,5,4,SUV,3829.0,104.0,108.3,...,2019,Acura,RDX,I4,235.0,55.0,19.0,Turbocharged,1,1
2,43600.0,22.0,Gas,FWD,5,4,SUV,3821.0,104.0,108.3,...,2019,Acura,RDX,I4,255.0,45.0,20.0,Turbocharged,1,1
3,37400.0,22.0,Gas,FWD,5,4,SUV,3783.0,104.0,108.3,...,2019,Acura,RDX,I4,235.0,55.0,19.0,Turbocharged,0,1
4,42600.0,21.0,Gas,AWD,5,4,SUV,4026.0,104.0,108.3,...,2019,Acura,RDX,I4,235.0,55.0,19.0,Turbocharged,1,1


In [17]:
# With it working perfectly lets convert all the yes/no features into binary 1/0 features using the pandas get dummies method 
dummy_cols = ['Air Bag-Frontal-Driver', 'Air Bag-Frontal-Passenger', 'Air Bag-Passenger Switch (On/Off)', 'Air Bag-Side Body-Front', 
              'Air Bag-Side Body-Rear', 'Air Bag-Side Head-Front', 'Air Bag-Side Head-Rear', 'Brakes-ABS', 'Child Safety Rear Door Locks',
              'Daytime Running Lights', 'Daytime Running Lights', 'Traction Control', 'Night Vision', 'Rollover Protection Bars',
              'Fog Lamps', 'Tire Pressure Monitor', 'Stability Control', 'Back-Up Camera', 'Parking Aid']

df_cars = pd.get_dummies(data=df_cars, columns = dummy_cols, drop_first = True)
df_cars.head()

Unnamed: 0,MSRP,EPA Fuel Economy Est - City (MPG),Propulsion Source,Drivetrain,Passenger Capacity,Passenger Doors,Body Style,Base Curb Weight (lbs),Passenger Volume (ft³),Wheelbase (in),...,Daytime Running Lights_Yes,Daytime Running Lights_Yes.1,Traction Control_Yes,Night Vision_Yes,Rollover Protection Bars_Yes,Fog Lamps_Yes,Tire Pressure Monitor_Yes,Stability Control_Yes,Back-Up Camera_Yes,Parking Aid_Yes
0,40600.0,22.0,Gas,FWD,5,4,SUV,3790.0,104.0,108.3,...,1,1,1,0,0,0,1,1,1,1
1,45500.0,22.0,Gas,FWD,5,4,SUV,3829.0,104.0,108.3,...,1,1,1,0,0,1,1,1,1,1
2,43600.0,22.0,Gas,FWD,5,4,SUV,3821.0,104.0,108.3,...,1,1,1,0,0,1,1,1,1,1
3,37400.0,22.0,Gas,FWD,5,4,SUV,3783.0,104.0,108.3,...,1,1,1,0,0,0,1,1,1,0
4,42600.0,21.0,Gas,AWD,5,4,SUV,4026.0,104.0,108.3,...,1,1,1,0,0,0,1,1,1,1


In [18]:
df_cars.dtypes

MSRP                                     float64
EPA Fuel Economy Est - City (MPG)        float64
Propulsion Source                         object
Drivetrain                                object
Passenger Capacity                         int64
Passenger Doors                            int64
Body Style                                object
Base Curb Weight (lbs)                   float64
Passenger Volume (ft³)                   float64
Wheelbase (in)                           float64
Track Width, Front (in)                  float64
Height, Overall (in)                     float64
Fuel Tank Capacity, Approx (gal)         float64
Max Torque                               float64
Fuel System                               object
Max Horsepower                           float64
Displacement                             float64
Transmission Type                         object
Transmission Gears                       float64
Suspension Type - Front                   object
Suspension Type - Re

In [20]:
# Now we still have a lot of categorical features left, these we will One hot encode also using get dummies but without 
# drop first = True. Also since I am not entirely sure about the 'Car Model' feature, I am going to make a copy of our dataframe
# here to potentially make a model without that feature down the line

df_cars_drop_model = df_cars.copy()
df_cars_drop_model = df_cars_drop_model.drop('Car Model', axis = 1)
df_cars = pd.get_dummies(data = df_cars)

In [29]:
df_cars.select_dtypes(include='object')

0
1
2
3
4
...
32257
32258
32259
32260
32261


In [21]:
df_cars.shape

(32262, 538)

#  Train/Test Split

Now we are ready to separate our data into our target feature y ('MSRP') and all our other features into X and then perform a train test split.  I will split the data first before scaling to prevent any data leakage.  If we scale before splitting then our eventual train data will be scaled also using our eventual test data, thus influencing it.

In [22]:
X = df_cars.drop(columns = 'MSRP')
y = df_cars['MSRP']

In [23]:
# Perform the train/test split on our X and y data arrays, and then do a quick check over them

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)

In [24]:
X_train.shape, X_test.shape

((22583, 537), (9679, 537))

In [25]:
y_train.shape, y_test.shape

((22583,), (9679,))

In [27]:
X_train.head()

Unnamed: 0,EPA Fuel Economy Est - City (MPG),Passenger Capacity,Passenger Doors,Base Curb Weight (lbs),Passenger Volume (ft³),Wheelbase (in),"Track Width, Front (in)","Height, Overall (in)","Fuel Tank Capacity, Approx (gal)",Max Torque,...,Engine Type_Missing,Engine Type_V10,Engine Type_V12,Engine Type_V6,Engine Type_V8,Engine Type_W12,Engine Compression_Missing,Engine Compression_NA/other,Engine Compression_Supercharged,Engine Compression_Turbocharged
30209,30.0,5,4,2970.0,94.7,105.7,60.7,57.4,13.2,184.0,...,0,0,0,0,0,0,0,0,0,1
2450,18.0,0,0,3500.5,97.5,111.5,61.6,65.5,19.0,260.0,...,1,0,0,0,0,0,1,0,0,0
6793,11.0,12,3,3500.5,97.5,155.0,61.6,82.8,31.0,373.0,...,0,0,0,0,1,0,0,1,0,0
1504,20.0,2,2,3131.0,58.9,95.4,60.0,53.0,14.5,173.0,...,0,0,0,0,0,0,0,1,0,0
28338,17.0,5,2,3500.5,97.5,121.9,61.6,67.1,18.0,220.0,...,0,0,0,1,0,0,0,1,0,0


# Scale our X_train and X_test data

In [30]:
# Now we just need to scale all our X data, but we need to be sure to only fit on the train data

scaler = StandardScaler()

scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

Our data is split into train and test sets and the independent features have been scaled. Now we finally should be set and ready to procede to modeling.

In [39]:
# Save our train/test data for use in our next step Modeling

np.savetxt('x_train_scaled.txt', X_train_scaled, delimiter = ',')
np.savetxt('x_test_scaled.txt', X_test_scaled, delimiter = ',')
np.savetxt('y_train.txt', y_train, delimiter = ',')
np.savetxt('y_test.txt', y_test, delimiter = ',')