In [53]:
import pandas as pd
import numpy as np

from sklearn.impute import SimpleImputer

In [54]:
# Load the dataset; 'Data.csv' is resolved relative to the current working directory.
df = pd.read_csv(filepath_or_buffer='Data.csv')

In [55]:
# Show the frame as loaded, so the missing entries are visible before any handling.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,Yes
1,Spain,27.0,48000.0,Yes
2,,30.0,54000.0,
3,Spain,38.0,61000.0,No
4,Germany,40.0,,Yes
5,France,35.0,58000.0,Yes
6,Spain,,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [22]:
df.iloc[:,1:3] # iloc[rows, columns]: ':' keeps every row, '1:3' selects the columns at positions 1 and 2 (Age, Salary)

Unnamed: 0,Age,Salary
0,44.0,72000.0
1,27.0,48000.0
2,30.0,54000.0
3,38.0,61000.0
4,40.0,
5,35.0,58000.0
6,,52000.0
7,48.0,79000.0
8,50.0,83000.0
9,37.0,67000.0


# Null Values handling - Numeric

### Approach 1: Removing the rows with null values

In [18]:
# Drop every row that contains at least one missing value.
# The result is only displayed; it is not assigned, so `df` itself keeps its gaps.
df.dropna(axis=0, how='any')

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,Yes
1,Spain,27.0,48000.0,Yes
3,Spain,38.0,61000.0,No
5,France,35.0,58000.0,Yes
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


### Approach 2: Filling the missing values with an imputer

In [39]:
# Imputer that replaces each NaN with the most frequent value of its column.
# NOTE(review): Age and Salary contain only distinct values, and the filled result
# uses 27 and 48000 (the smallest entry of each column) for the gaps. For numeric
# data 'mean' or 'median' is usually a better fit -- confirm the intended strategy.
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)

In [40]:
# Preview the NumPy array view of the Age/Salary columns that is handed to the imputer.
df.iloc[:,1:3].values

array([[4.4e+01, 7.2e+04],
       [2.7e+01, 4.8e+04],
       [3.0e+01, 5.4e+04],
       [3.8e+01, 6.1e+04],
       [4.0e+01,     nan],
       [3.5e+01, 5.8e+04],
       [    nan, 5.2e+04],
       [4.8e+01, 7.9e+04],
       [5.0e+01, 8.3e+04],
       [3.7e+01, 6.7e+04]])

In [41]:
# Learn the replacement value for each of the two numeric columns; no data is changed yet.
imputer.fit(X=df.iloc[:, 1:3].values)

In [42]:
# Apply the fitted imputer: `x` is a new array with the NaNs filled in (`df` is not modified here).
x = imputer.transform(X=df.iloc[:, 1:3].values)

In [43]:
# Inspect the imputed array before writing it back to the frame.
x

array([[4.4e+01, 7.2e+04],
       [2.7e+01, 4.8e+04],
       [3.0e+01, 5.4e+04],
       [3.8e+01, 6.1e+04],
       [4.0e+01, 4.8e+04],
       [3.5e+01, 5.8e+04],
       [2.7e+01, 5.2e+04],
       [4.8e+01, 7.9e+04],
       [5.0e+01, 8.3e+04],
       [3.7e+01, 6.7e+04]])

In [44]:
df.iloc[:,1:3] = x # write the imputed values back into the Age and Salary columns (modifies df in place)

In [45]:
# Frame after numeric imputation; the Country and Purchased gaps in row 2 are still present.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,Yes
1,Spain,27.0,48000.0,Yes
2,,30.0,54000.0,
3,Spain,38.0,61000.0,No
4,Germany,40.0,48000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,27.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


# Null Values handling - Categorical

### Filling with Imputer using most frequent value

#### For Country:

In [46]:
# Fresh imputer for the categorical column: NaN is replaced by the column's most frequent label.
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)

In [47]:
# Fit on the first column only (Country); `:1` keeps the selection 2-D, as the imputer expects.
imputer.fit(X=df.iloc[:, :1].values)

In [49]:
# Fill the missing Country entry and store the column back into the frame.
df.iloc[:, :1] = imputer.transform(X=df.iloc[:, :1].values)

In [50]:
# Frame after filling Country; only the Purchased gap in row 2 remains.
df


Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,Yes
1,Spain,27.0,48000.0,Yes
2,France,30.0,54000.0,
3,Spain,38.0,61000.0,No
4,Germany,40.0,48000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,27.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


#### For Purchased:

In [51]:
# Same most-frequent fill for the last column (Purchased), done in a single cell.
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
# fit() returns the estimator itself, so fitting and transforming can be chained.
df.iloc[:, 3:] = (
    imputer
    .fit(df.iloc[:, 3:].values)
    .transform(df.iloc[:, 3:].values)
)

In [52]:
# Frame with every column imputed step by step.
df

Unnamed: 0,Country,Age,Salary,Purchased
0,France,44.0,72000.0,Yes
1,Spain,27.0,48000.0,Yes
2,France,30.0,54000.0,Yes
3,Spain,38.0,61000.0,No
4,Germany,40.0,48000.0,Yes
5,France,35.0,58000.0,Yes
6,Spain,27.0,52000.0,No
7,France,48.0,79000.0,Yes
8,Germany,50.0,83000.0,No
9,France,37.0,67000.0,Yes


In [56]:
# Alternative: impute every column of the frame in one pass with a single imputer.
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
# NOTE(review): transform() returns an object-dtype NumPy array, so after this cell
# `df` is no longer a DataFrame (column labels and per-column dtypes are gone).
# If later cells need the labels, wrap the result back into a DataFrame -- confirm intent.
df = imputer.fit(df).transform(df)

In [57]:
# Show the fully imputed result (a NumPy array at this point, not a DataFrame).
df

array([['France', 44.0, 72000.0, 'Yes'],
       ['Spain', 27.0, 48000.0, 'Yes'],
       ['France', 30.0, 54000.0, 'Yes'],
       ['Spain', 38.0, 61000.0, 'No'],
       ['Germany', 40.0, 48000.0, 'Yes'],
       ['France', 35.0, 58000.0, 'Yes'],
       ['Spain', 27.0, 52000.0, 'No'],
       ['France', 48.0, 79000.0, 'Yes'],
       ['Germany', 50.0, 83000.0, 'No'],
       ['France', 37.0, 67000.0, 'Yes']], dtype=object)