## Working with missing data using sklearn

Previously we filled the missing values with pandas and only then used sklearn to split the data and train a model. What if we could do all of this in sklearn itself?

In [1]:
import pandas as pd
import sklearn
import numpy as np
import matplotlib.pyplot  as plt
%matplotlib inline
print("Imported!")

Imported!


In [2]:
# Load the car-sales dataset; several of its columns contain missing values.
file = pd.read_csv("../datasets/Car_sales_missing.csv")

# Give every row a label by filling gaps in the target column with its mean.
# NOTE(review): mean-filling the label invents target values; dropping rows with a
# missing price may be the safer choice — confirm this is intended.
price_column = "Price_in_thousands"
mean_price = file[price_column].mean()
file[price_column] = file[price_column].fillna(mean_price)
file.head(20)

Unnamed: 0,Manufacturer,Sales_in_thousands,__year_resale_value,Vehicle_type,Price_in_thousands,Engine_size,Horsepower,Wheelbase,Width,Length,Curb_weight,Fuel_capacity,Fuel_efficiency,Latest_Launch,Power_perf_factor
0,Acura,16.919,16.36,Passenger,21.5,1.8,140.0,101.2,67.3,172.4,2.639,13.2,28.0,02-02-2012,58.28015
1,Acura,39.384,19.875,Passenger,28.4,3.2,225.0,108.1,70.3,192.9,3.517,17.2,25.0,06-03-2011,91.370778
2,Acura,14.114,18.225,Passenger,27.472447,3.2,225.0,106.9,70.6,192.0,3.47,17.2,26.0,01-04-2012,
3,Acura,8.588,29.725,Passenger,42.0,3.5,210.0,114.6,71.4,196.6,3.85,18.0,22.0,03-10-2011,91.389779
4,Audi,20.397,22.255,Passenger,23.99,1.8,,102.6,68.2,178.0,2.998,16.4,27.0,10-08-2011,62.777639
5,Audi,18.78,23.555,Passenger,33.95,2.8,200.0,108.7,76.1,192.0,3.561,18.5,22.0,08-09-2011,84.565105
6,Audi,1.38,39.0,Passenger,62.0,4.2,310.0,113.0,74.0,198.2,3.902,23.7,21.0,2/27/2012,134.656858
7,BMW,19.747,,Passenger,26.99,2.5,170.0,107.3,68.4,176.0,3.179,16.6,26.0,6/28/2011,71.191207
8,BMW,9.231,28.675,Passenger,33.4,2.8,193.0,107.3,68.5,176.0,3.197,16.6,24.0,1/29/2012,81.877069
9,BMW,17.527,36.125,Passenger,38.9,2.8,193.0,111.4,70.9,188.0,3.472,18.5,25.0,04-04-2011,83.998724


In [3]:
# Separate the feature columns from the target column.
x = file.drop("Price_in_thousands", axis=1)
y = file["Price_in_thousands"]

# DataFrame.info() prints its report itself and returns None, so wrapping it in
# print() only appends a stray "None" line to the output.
x.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157 entries, 0 to 156
Data columns (total 14 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Manufacturer         157 non-null    object 
 1   Sales_in_thousands   155 non-null    float64
 2   __year_resale_value  117 non-null    float64
 3   Vehicle_type         156 non-null    object 
 4   Engine_size          153 non-null    float64
 5   Horsepower           153 non-null    float64
 6   Wheelbase            153 non-null    float64
 7   Width                154 non-null    float64
 8   Length               154 non-null    float64
 9   Curb_weight          152 non-null    float64
 10  Fuel_capacity        155 non-null    float64
 11  Fuel_efficiency      154 non-null    float64
 12  Latest_Launch        152 non-null    object 
 13  Power_perf_factor    144 non-null    float64
dtypes: float64(11), object(3)
memory usage: 17.3+ KB
None


## Now we need to fill the missing values using sklearn

In [10]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer  # filling missing values is also called imputing

# Categorical gaps are filled with the constant label "missing";
# numerical gaps are filled with the mean of their column.
category_imputor = SimpleImputer(strategy="constant", fill_value="missing")
num_imputer = SimpleImputer(strategy="mean")

# Columns handled by each imputer.
category_features = ["Manufacturer", "Vehicle_type", "Latest_Launch"]
numerical_features = [
    "Sales_in_thousands",
    "__year_resale_value",
    "Engine_size",
    "Horsepower",
    "Wheelbase",
    "Width",
    "Length",
    "Curb_weight",
    "Fuel_capacity",
    "Fuel_efficiency",
    "Power_perf_factor",
]

# Route each group of columns to its imputer.
imputation_steps = [
    ("category_imputor", category_imputor, category_features),
    ("num_imputer", num_imputer, numerical_features),
]
imputer = ColumnTransformer(imputation_steps)

# Fit the imputers and fill the feature frame in a single pass.
filled_x = imputer.fit_transform(x)
filled_x

array([['Acura', 'Passenger', '02-02-2012', ..., 13.2, 28.0, 58.28014952],
       ['Acura', 'Passenger', '06-03-2011', ..., 17.2, 25.0, 91.37077766],
       ['Acura', 'Passenger', '01-04-2012', ..., 17.2, 26.0,
        77.10568763923612],
       ...,
       ['Volvo', 'Passenger', '6/25/2011', ..., 17.9, 25.0, 71.1559776],
       ['Volvo', 'Passenger', '4/26/2011', ..., 18.5, 23.0,
        77.10568763923612],
       ['Volvo', 'Passenger', '11/14/2011', ..., 21.1, 24.0, 85.73565451]],
      dtype=object)

In [19]:
# Plain-text view of the imputed feature array.
print(filled_x)


[['Acura' 'Passenger' '02-02-2012' ... 13.2 28.0 58.28014952]
 ['Acura' 'Passenger' '06-03-2011' ... 17.2 25.0 91.37077766]
 ['Acura' 'Passenger' '01-04-2012' ... 17.2 26.0 77.10568763923612]
 ...
 ['Volvo' 'Passenger' '6/25/2011' ... 17.9 25.0 71.1559776]
 ['Volvo' 'Passenger' '4/26/2011' ... 18.5 23.0 77.10568763923612]
 ['Volvo' 'Passenger' '11/14/2011' ... 21.1 24.0 85.73565451]]


In [20]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer


categorical_features = ["Manufacturer", "Vehicle_type", "Latest_Launch"]

# Encode the *imputed* data rather than the raw `x`: fitting on `x` would let the
# NaNs pass straight through and the imputation step would never be used.
# `filled_x` is a bare object array whose columns follow the imputer's transformer
# order (categorical columns first, then numerical), so restore the column labels
# before selecting columns by name.
filled_x_df = pd.DataFrame(filled_x, columns=category_features + numerical_features)
# The object array erased the dtypes; make the numerical columns float again.
filled_x_df = filled_x_df.astype({column: float for column in numerical_features})

one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    [("one_hot", one_hot, categorical_features)],
    remainder="passthrough",  # keep the numerical columns untouched
)

transfromed_x = transformer.fit_transform(filled_x_df)
transfromed_x

<157x173 sparse matrix of type '<class 'numpy.float64'>'
	with 2198 stored elements in Compressed Sparse Row format>

In [21]:
from sklearn.ensemble import RandomForestRegressor

# A random forest is stochastic; fix the seed so the fitted model and its score
# are reproducible across reruns.
model = RandomForestRegressor(random_state=42)

In [25]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across reruns so scores are comparable.
x_train, x_test, y_train, y_test = train_test_split(
    transfromed_x, y, test_size=0.2, random_state=42
)


<125x173 sparse matrix of type '<class 'numpy.float64'>'
	with 1750 stored elements in Compressed Sparse Row format>

In [27]:
# Fit the model on the training split and report its score on the held-out split.
# NOTE(review): this step is disabled; presumably fitting failed or was deferred.
# Confirm that the features fed into the split contain no missing values before
# re-enabling it.
# model.fit(x_train,y_train)
# model.score(x_test,y_test)