# 1.2 Dealing with missing Data
    1. Fill with some values (Imputation)
    2. Remove the samples with missing data.

In [131]:
# importing libraries

import numpy as np
import pandas as pd

In [132]:
car_sales_missing = pd.read_csv("car-sales-extended-missing-data.csv")
car_sales_missing

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [133]:
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [134]:
car_sales_missing["Doors"].value_counts()

4.0    811
5.0     75
3.0     64
Name: Doors, dtype: int64

# Option 1: Fill with missing value

In [135]:
# fill "Make" column
car_sales_missing["Make"].fillna("missing", inplace=True)
# fill "Colour" column
car_sales_missing["Colour"].fillna("missing", inplace=True)
# fill "Odometer (KM)" column
car_sales_missing["Odometer (KM)"].fillna(car_sales_missing["Odometer (KM)"].mean(), inplace=True)
# fill "Make" column
car_sales_missing["Doors"].fillna(4, inplace=True)

In [136]:
car_sales_missing.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [137]:
car_sales_missing.dropna(inplace=True)

In [138]:
car_sales_missing.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [139]:
car_sales_missing.shape

(950, 5)

In [140]:
len(car_sales_missing)

950

In [141]:
# X and y 
X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

In [142]:
# Onehot encoding

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_feature = ["Make", "Colour", "Doors"]

one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_feature)], remainder= "passthrough")

transform_X = transformer.fit_transform(car_sales_missing)



In [143]:
from sklearn.ensemble import RandomForestRegressor
np.random.seed(6)
X_train, X_test, y_train, y_test = train_test_split(transform_X, y, test_size=0.2)
model = RandomForestRegressor()
model.fit(X_train, y_train)

RandomForestRegressor()

In [144]:
model.score(X_test,y_test)

0.9994322694319474

In [145]:
model.score(X_train,y_train)

0.9999363829115532

# Option 2: Fill missing value with sklearn

In [100]:
car_sales_missing_2 = pd.read_csv("car-sales-extended-missing-data.csv")
car_sales_missing_2

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [101]:
car_sales_missing_2.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [102]:
car_sales_missing_2.dropna(subset="Price", inplace=True)
car_sales_missing_2.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [103]:
# x and y separation
X1 = car_sales_missing_2.drop("Price", axis=1)
y1 = car_sales_missing_2["Price"]

In [104]:
np.random.seed(42)
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.2)

In [105]:
# Fill the missing value with Scikit-learn

from sklearn.impute import SimpleImputer

# Fill the categorical data with "missing" and numerical values with mean
categorical_imputer = SimpleImputer(strategy= "constant", fill_value= "missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
numerical_imputer = SimpleImputer(strategy="mean")

# define the columns
categorical_feature = ["Make", "Colour"]
door_feature = ["Doors"]
numerical_feature = ["Odometer (KM)"]

# Transform the column according to the imputer

imputer = ColumnTransformer([("categorical_imputer", categorical_imputer, categorical_feature),
                             ("numerical_imputer", numerical_imputer, numerical_feature),
                             ("door_imputer", door_imputer, door_feature)
                            ])

filled_X1_train = imputer.fit_transform(X1_train)
filled_X1_test = imputer.fit_transform(X1_test)

In [106]:
filled_X1_train

array([['Honda', 'White', 71934.0, 4.0],
       ['Toyota', 'Red', 162665.0, 4.0],
       ['Honda', 'White', 42844.0, 4.0],
       ...,
       ['Toyota', 'White', 196225.0, 4.0],
       ['Honda', 'Blue', 133117.0, 4.0],
       ['Honda', 'missing', 150582.0, 4.0]], dtype=object)

In [107]:
car_sales_missing_2.isna().sum()

Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [118]:
car_sales_filled_X1_train = pd.DataFrame(filled_X1_train, columns=["Make", "Colour", "Odometer (KM)", "Doors"])
car_sales_filled_X1_test = pd.DataFrame(filled_X1_test, columns=["Make", "Colour", "Odometer (KM)", "Doors"])

In [119]:
car_sales_filled_X1_train

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,71934.0,4.0
1,Toyota,Red,162665.0,4.0
2,Honda,White,42844.0,4.0
3,Honda,White,195829.0,4.0
4,Honda,Blue,219217.0,4.0
...,...,...,...,...
755,Toyota,missing,218803.0,4.0
756,BMW,Blue,245427.0,5.0
757,Toyota,White,196225.0,4.0
758,Honda,Blue,133117.0,4.0


In [124]:
# Onehot encoding

from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_feature = ["Make", "Colour", "Doors"]

one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_feature)], remainder= "passthrough")

transform_X1_train = transformer.fit_transform(car_sales_filled_X1_train)
transform_X1_test = transformer.fit_transform(car_sales_filled_X1_test)

transform_X1_train.toarray()

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 7.19340e+04],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.62665e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 4.28440e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.96225e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.33117e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.50582e+05]])

In [128]:
from sklearn.ensemble import RandomForestRegressor
np.random.seed(42)
clf = RandomForestRegressor()
clf.fit(transform_X1_train, y1_train)

RandomForestRegressor()

In [129]:
clf.score(transform_X1_test, y1_test)

0.25366332156443805

In [130]:
clf.score(transform_X1_train, y1_train)

0.8815306236443183