In [1]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

In [2]:
missing_data =  pd.read_csv("car-sales-extended-missing-data.csv")

missing_data.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [3]:
missing_data.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [4]:
missing_data.dropna(subset=["Price"],inplace=True)

missing_data.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [5]:
X = missing_data.drop("Price",axis=1)

Y = missing_data["Price"]

In [6]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size=0.2)
                                                    
                                                    

# Imputation

In [7]:
cat_impute = SimpleImputer(strategy="constant",fill_value="missing")
var_impute = SimpleImputer(strategy="constant",fill_value=4)
num_impute = SimpleImputer(strategy="mean")

cat_feature = ["Make","Colour"]
var_feature = ["Doors"]
num_feature = ["Odometer (KM)"]

imputer = ColumnTransformer([("cat_impute",cat_impute,cat_feature),
                             ("var_impute",var_impute,var_feature),
                             ("num_impute",num_impute,num_feature)
                            ])

filled_X_train = imputer.fit_transform(X_train)
filled_X_test = imputer.fit_transform(X_test)

filled_X_train

array([['BMW', 'White', 5.0, 195419.0],
       ['Toyota', 'Blue', 4.0, 103909.0],
       ['Toyota', 'White', 4.0, 53765.0],
       ...,
       ['Toyota', 'White', 4.0, 28809.0],
       ['Nissan', 'White', 4.0, 192747.0],
       ['Nissan', 'Blue', 3.0, 195105.0]], dtype=object)

In [8]:
car_sales_filled_train = pd.DataFrame(filled_X_train,columns=["Make", "Colour", "Doors", "Odometer (KM)"])

car_sales_filled_test = pd.DataFrame(filled_X_test,columns=["Make", "Colour", "Doors", "Odometer (KM)"])

car_sales_filled_train.isna().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [9]:
categorical_features = ["Make", "Colour", "Doors"]

one_hot = OneHotEncoder()

transformer = ColumnTransformer([("one_hot",one_hot,categorical_features)],
                                   
                                remainder="passthrough")

transformed_X_train = transformer.fit_transform(car_sales_filled_train)

transformed_X_test = transformer.transform(car_sales_filled_test)

transformed_X_train.toarray()

array([[1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.95419e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.03909e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 5.37650e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.88090e+04],
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.92747e+05],
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 1.95105e+05]])

In [10]:
model = RandomForestRegressor()

model.fit(transformed_X_train,Y_train)

model.score(transformed_X_test,Y_test)

0.30088100222599945