In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt

In [2]:
# It can be filled with some values
# Or we can remove the rows that has null values

In [3]:
cars_sales_missing = pd.read_csv('Data/car-sales-extended-missing-data.csv')

In [4]:
cars_sales_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [5]:
# cars_sales_missing.isna().head()

In [6]:
# So here are some missing values there
cars_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [7]:
# Lets convert our data into X and Y

X = cars_sales_missing.drop("Price", axis=1)
y = cars_sales_missing.Price

In [8]:
X.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431.0,4.0
1,BMW,Blue,192714.0,5.0
2,Honda,White,84714.0,4.0
3,Toyota,White,154365.0,4.0
4,Nissan,Blue,181577.0,3.0


In [9]:
y.head()

0    15323.0
1    19943.0
2    28343.0
3    13434.0
4    14043.0
Name: Price, dtype: float64

In [10]:
# Now we can see that the data in the X has some categorical data 
# For example Make Colour and then Doors are all the data files where the data is in the text form or categorical form
# We can convert the data into numbers and then it can be used by ML algorithm

In [11]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]

one_hot = OneHotEncoder()

transformer = ColumnTransformer(
[
    ("one_hot",
    one_hot, 
    categorical_features)
], 
    remainder="passthrough"
)

transformed_X = transformer.fit_transform(X)

transformed_X

<1000x16 sparse matrix of type '<class 'numpy.float64'>'
	with 4000 stored elements in Compressed Sparse Row format>

In [12]:
transformed_X = pd.DataFrame(transformed_X)

In [13]:
transformed_X.head()

Unnamed: 0,0
0,"(0, 1)\t1.0\n (0, 9)\t1.0\n (0, 12)\t1.0\n..."
1,"(0, 0)\t1.0\n (0, 6)\t1.0\n (0, 13)\t1.0\n..."
2,"(0, 1)\t1.0\n (0, 9)\t1.0\n (0, 12)\t1.0\n..."
3,"(0, 3)\t1.0\n (0, 9)\t1.0\n (0, 12)\t1.0\n..."
4,"(0, 2)\t1.0\n (0, 6)\t1.0\n (0, 11)\t1.0\n..."


In [14]:
# here when we were transforming the data we might have got the error as there are some missing values in the dataset
# But main problem over here is that there is update in the OneHotEncoder which has made is possible to handle the missing values

In [15]:
# Lets deal with missing data

# Fill missing data with pandas

In [16]:
cars_sales_missing["Make"].fillna("missing", inplace=True)

In [17]:
cars_sales_missing["Make"].isna().sum()

0

In [18]:
cars_sales_missing["Colour"].fillna("missing", inplace=True)

In [19]:
cars_sales_missing["Colour"].isna().sum()

0

In [20]:
cars_sales_missing["Odometer (KM)"].fillna(cars_sales_missing["Odometer (KM)"].mean(), inplace=True)

In [21]:
cars_sales_missing.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors            50
Price            50
dtype: int64

In [22]:
cars_sales_missing["Doors"].fillna(4, inplace=True)

In [25]:
cars_sales_missing["Price"].dropna(inplace=True)

In [26]:
cars_sales_missing.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0


In [27]:
cars_sales_missing.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [28]:
X = cars_sales_missing.drop("Price", axis=1)

In [29]:
X.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431.0,4.0
1,BMW,Blue,192714.0,5.0
2,Honda,White,84714.0,4.0
3,Toyota,White,154365.0,4.0
4,Nissan,Blue,181577.0,3.0


In [30]:
y = cars_sales_missing.Price

In [31]:
y.head()

0    15323.0
1    19943.0
2    28343.0
3    13434.0
4    14043.0
Name: Price, dtype: float64

In [32]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

feature_removable = ["Make", "Colour", "Doors"]

one_hot = OneHotEncoder()

transformer = ColumnTransformer([
    ("one_hot", 
    one_hot, 
    feature_removable)
], 
remainder="passthrough"
)

In [33]:
new_X = transformer.fit_transform(X)

In [34]:
transformed_X = pd.DataFrame(new_X)

In [35]:
transformed_X

Unnamed: 0,0
0,"(0, 1)\t1.0\n (0, 9)\t1.0\n (0, 12)\t1.0\n..."
1,"(0, 0)\t1.0\n (0, 6)\t1.0\n (0, 13)\t1.0\n..."
2,"(0, 1)\t1.0\n (0, 9)\t1.0\n (0, 12)\t1.0\n..."
3,"(0, 3)\t1.0\n (0, 9)\t1.0\n (0, 12)\t1.0\n..."
4,"(0, 2)\t1.0\n (0, 6)\t1.0\n (0, 11)\t1.0\n..."
...,...
995,"(0, 3)\t1.0\n (0, 5)\t1.0\n (0, 12)\t1.0\n..."
996,"(0, 4)\t1.0\n (0, 9)\t1.0\n (0, 11)\t1.0\n..."
997,"(0, 2)\t1.0\n (0, 6)\t1.0\n (0, 12)\t1.0\n..."
998,"(0, 1)\t1.0\n (0, 9)\t1.0\n (0, 12)\t1.0\n..."
