### Missing data

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the Toyota dataset: the first CSV column becomes the index, and the
# placeholder strings '??' and '????' are read as missing values (NaN).
# NOTE(review): the Doors column mixes spellings (e.g. 'three' next to 3);
# that is not handled by na_values.
data = pd.read_csv(
    'Toyota.csv',
    index_col=0,
    na_values=['??', '????'],
)

In [3]:
# Peek at the first five rows to check the load and spot NaN placeholders.
data.head()

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,13500,23.0,46986.0,Diesel,90.0,1.0,0,2000,three,1165
1,13750,23.0,72937.0,Diesel,90.0,1.0,0,2000,3,1165
2,13950,24.0,41711.0,Diesel,90.0,,0,2000,3,1165
3,14950,26.0,48000.0,Diesel,90.0,0.0,0,2000,3,1165
4,13750,30.0,38500.0,Diesel,90.0,0.0,0,2000,3,1170


In [4]:
# copy() defaults to deep=True, so later edits to data1 leave `data` untouched.
data1 = data.copy()

In [5]:
# Second independent copy; this one is imputed column by column below.
data2 = data.copy()

In [6]:
# Count missing values per column (True counts as 1 when summed).
data2.isnull().sum()

Price          0
Age          100
KM            15
FuelType     100
HP             6
MetColor     150
Automatic      0
CC             0
Doors          0
Weight         0
dtype: int64

In [7]:
# isna() is an alias of isnull(); the per-column counts are identical.
data2.isna().sum()

Price          0
Age          100
KM            15
FuelType     100
HP             6
MetColor     150
Automatic      0
CC             0
Doors          0
Weight         0
dtype: int64

In [8]:
# Rows of data2 that have at least one missing value in any column.
missing = data2.loc[data2.isna().any(axis='columns')]
missing

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
2,13950,24.0,41711.0,Diesel,90.0,,0,2000,3,1165
6,16900,27.0,,Diesel,,,0,2000,3,1245
7,18600,30.0,75889.0,,90.0,1.0,0,2000,3,1245
9,12950,23.0,71138.0,Diesel,,,0,1900,3,1105
15,22000,28.0,18739.0,Petrol,,0.0,0,1800,3,1185
...,...,...,...,...,...,...,...,...,...,...
1428,8450,72.0,,Petrol,86.0,,0,1300,3,1015
1431,7500,,20544.0,Petrol,86.0,1.0,0,1300,3,1025
1432,10845,72.0,,Petrol,86.0,0.0,0,1300,3,1015
1433,8500,,17016.0,Petrol,86.0,0.0,0,1300,3,1015


In [9]:
# (rows, columns) of the subset: how many rows contain at least one NaN.
missing.shape

(340, 10)

In [10]:
# Summary statistics for the numeric columns; `count` excludes missing values,
# and mean vs. 50% (median) helps choose which statistic to impute with.
data2.describe()

Unnamed: 0,Price,Age,KM,HP,MetColor,Automatic,CC,Weight
count,1436.0,1336.0,1421.0,1430.0,1286.0,1436.0,1436.0,1436.0
mean,10730.824513,55.672156,68647.239972,101.478322,0.674961,0.05571,1566.827994,1072.45961
std,3626.964585,18.589804,37333.023589,14.768255,0.468572,0.229441,187.182436,52.64112
min,4350.0,1.0,1.0,69.0,0.0,0.0,1300.0,1000.0
25%,8450.0,43.0,43210.0,90.0,0.0,0.0,1400.0,1040.0
50%,9900.0,60.0,63634.0,110.0,1.0,0.0,1600.0,1070.0
75%,11950.0,70.0,87000.0,110.0,1.0,0.0,1600.0,1085.0
max,32500.0,80.0,243000.0,192.0,1.0,1.0,2000.0,1615.0


In [11]:
# Mean of Age, computed over the non-missing values only.
data2['Age'].mean()

55.67215568862275

In [27]:
# Fill missing ages with the column mean. mean() must be *called* so that a
# number is passed to fillna; the bare attribute `mean` is the bound method
# object, not the average. The result is assigned back to the column instead
# of using inplace=True on the selection data2['Age'], because a chained
# in-place fill is not guaranteed to update data2 itself.
data2['Age'] = data2['Age'].fillna(data2['Age'].mean())

In [22]:
# Median of KM, computed over the non-missing values only.
data2['KM'].median()

63634.0

In [23]:
# Fill missing KM readings with the column median. median() must be *called*;
# passing the bare `median` attribute would hand fillna the method object
# instead of the number. The result is assigned back rather than relying on
# inplace=True on the column selection, which may not update data2.
data2['KM'] = data2['KM'].fillna(data2['KM'].median())

In [24]:
# Mean of HP, computed over the non-missing values only.
data2['HP'].mean()

101.47832167832168

In [25]:
# Fill missing HP values with the column mean. The filled column is assigned
# back explicitly: inplace=True on the selection data2['HP'] is a chained
# in-place modification that is not guaranteed to propagate to data2.
data2['HP'] = data2['HP'].fillna(data2['HP'].mean())

In [28]:
# Re-check: Age, KM and HP should now be 0; FuelType and MetColor remain.
data2.isnull().sum()

Price          0
Age            0
KM             0
FuelType     100
HP             0
MetColor     150
Automatic      0
CC             0
Doors          0
Weight         0
dtype: int64

#### Imputing missing values for categorical variables

In [29]:
# Frequency of each fuel type (missing values are not counted).
data2['FuelType'].value_counts()

Petrol    1177
Diesel     144
CNG         15
Name: FuelType, dtype: int64

In [30]:
# value_counts() sorts by frequency in descending order, so the first index
# label is the most frequent category.
data2['FuelType'].value_counts().index[0]

'Petrol'

In [31]:
# Fill missing fuel types with the most frequent category. value_counts()
# is sorted by frequency, so index[0] is that category's label. The result is
# assigned back to the column; inplace=True on the selection
# data2['FuelType'] is not guaranteed to update data2.
data2['FuelType'] = data2['FuelType'].fillna(
    data2['FuelType'].value_counts().index[0]
)

In [32]:
# Re-check: FuelType should now be 0; only MetColor still has missing values.
data2.isnull().sum()

Price          0
Age            0
KM             0
FuelType       0
HP             0
MetColor     150
Automatic      0
CC             0
Doors          0
Weight         0
dtype: int64

In [33]:
# mode() returns a Series because ties are possible: the *values* are the
# modes, while the index (0, 1, ...) is only their position.
data2['MetColor'].mode()

0    1.0
dtype: float64

In [34]:
# Fill missing MetColor flags with the most frequent value. mode() returns a
# Series whose *values* are the modes, so the first value is taken with
# .iloc[0]. Using .index[0] would yield the positional label 0, not the mode
# (1.0 here), and would fill with the wrong category. The result is assigned
# back instead of using inplace=True on the column selection.
data2['MetColor'] = data2['MetColor'].fillna(data2['MetColor'].mode().iloc[0])

In [36]:
# Final check: every column of data2 should now report 0 missing values.
data2.isnull().sum()

Price        0
Age          0
KM           0
FuelType     0
HP           0
MetColor     0
Automatic    0
CC           0
Doors        0
Weight       0
dtype: int64

In [38]:
# data1 has not been modified since it was copied, so it still contains NaNs.
data1.head(25)

Unnamed: 0,Price,Age,KM,FuelType,HP,MetColor,Automatic,CC,Doors,Weight
0,13500,23.0,46986.0,Diesel,90.0,1.0,0,2000,three,1165
1,13750,23.0,72937.0,Diesel,90.0,1.0,0,2000,3,1165
2,13950,24.0,41711.0,Diesel,90.0,,0,2000,3,1165
3,14950,26.0,48000.0,Diesel,90.0,0.0,0,2000,3,1165
4,13750,30.0,38500.0,Diesel,90.0,0.0,0,2000,3,1170
5,12950,32.0,61000.0,Diesel,90.0,0.0,0,2000,3,1170
6,16900,27.0,,Diesel,,,0,2000,3,1245
7,18600,30.0,75889.0,,90.0,1.0,0,2000,3,1245
8,21500,27.0,19700.0,Petrol,192.0,0.0,0,1800,3,1185
9,12950,23.0,71138.0,Diesel,,,0,1900,3,1105


In [39]:
# Impute every column in one pass with a lambda applied column-wise:
# float columns are filled with their mean, all other columns with their
# most frequent value (first label of value_counts()).
# NOTE(review): MetColor is stored as float, so it is filled with its mean
# here rather than its mode — confirm that is intended for a 0/1 flag.
data1 = data1.apply(
    lambda col: col.fillna(col.mean())
    if col.dtype == 'float'
    else col.fillna(col.value_counts().index[0])
)

In [40]:
# Verify the one-pass imputation: every column of data1 should report 0.
data1.isnull().sum()

Price        0
Age          0
KM           0
FuelType     0
HP           0
MetColor     0
Automatic    0
CC           0
Doors        0
Weight       0
dtype: int64