In [1]:
import numpy as np
import pandas as pd

#importing scikit-learn modules
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

In [2]:
car_sales_missing = pd.read_csv("https://raw.githubusercontent.com/ramiraza/zero-to-mastery-ml/master/data/car-sales-extended-missing-data.csv")

In [3]:
car_sales_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [6]:
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [10]:
car_sales_missing.dropna(subset=["Price"], inplace=True)
car_sales_missing.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [50]:
from sklearn.model_selection import train_test_split

X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

#creating a split between the data for training & testing
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [51]:
X_train.isna().sum()

Make             35
Colour           38
Odometer (KM)    36
Doors            38
dtype: int64

In [52]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# defining columns
cat_features = ["Make", "Colour"]
door_features = ["Doors"]
num_features = ["Odometer (KM)"]

# fill categorical data with missing & numerical (mean) values
cat_imputer  = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
num_imputer  = SimpleImputer(strategy="mean")

# applying the imputer
imputer = ColumnTransformer([
    ("cat_imputer", cat_imputer, cat_features),
    ("door_imputer", door_imputer, door_features),
    ("num_imputer", num_imputer, num_features)
])

X_train_imputed = imputer.fit_transform(X_train)
X_test_imputed  = imputer.transform(X_test)

In [53]:
X_train_imputed

array([['Honda', 'White', 4.0, 71934.0],
       ['Toyota', 'Red', 4.0, 162665.0],
       ['Honda', 'White', 4.0, 42844.0],
       ...,
       ['Toyota', 'White', 4.0, 196225.0],
       ['Honda', 'Blue', 4.0, 133117.0],
       ['Honda', 'missing', 4.0, 150582.0]], dtype=object)

In [54]:
# adding data into a dataframe
car_sales_filled_train = pd.DataFrame(X_train_imputed, columns=["Make", "Colour", "Odometer (KM)", "Doors"])
car_sales_filled_test = pd.DataFrame(X_test_imputed, columns=["Make", "Colour", "Odometer (KM)", "Doors"])

car_sales_filled_test.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
dtype: int64

In [55]:
car_sales_filled_train.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
dtype: int64

No more missing values within the data as demonstrated above

In [110]:
# Import OneHotEncoder class from sklearn
from sklearn.preprocessing import OneHotEncoder

# Now let's one hot encode the features with the same code as before 
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder(handle_unknown="ignore")
transformer = ColumnTransformer([("one_hot", 
                                 one_hot, 
                                 categorical_features)],
                                 remainder="passthrough")

# Fill train and test values separately
transformed_X_train = transformer.fit_transform(car_sales_filled_train) # fit and transform the training data
transformed_X_test = transformer.transform(car_sales_filled_test) # transform the test data

In [111]:
# Check transformed and filled X_train
transformed_X_train.toarray()

array([[0., 1., 0., ..., 0., 0., 4.],
       [0., 0., 0., ..., 0., 0., 4.],
       [0., 1., 0., ..., 0., 0., 4.],
       ...,
       [0., 0., 0., ..., 0., 0., 4.],
       [0., 1., 0., ..., 0., 0., 4.],
       [0., 1., 0., ..., 0., 0., 4.]])

#### Fitting to the model
Since the data has been transformed, now the fitting and scoring can be done to ensure the model works as intended

In [112]:
np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=100)
model.fit(transformed_X_train, y_train);

In [113]:
model.score(transformed_X_test, y_test)

-0.04799525690647011

In [106]:
import sklearn

In [107]:
print(sklearn.__version__)

0.24.2
