## Code clarification for cleaning

This notebook walks through the logic used in the preprocessor object, using dummy data to show what each line of code is doing.

The method used is as follows. 
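1. We first calculate the medians based on the train data. (Note: in this dummy walkthrough `median_data` is computed from the full toy dataframe before the train/test split; with real data, compute it from the train split only to avoid leaking test information.)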
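2. We then left-merge the `Country` column of the train and test data with the median data to get dataframes in which every value is replaced with the country-level median ("df2" in this description; named `xtrain_imputed` and `xtest_imputed` in the cells below).
3. We then use the `.update()` method with `overwrite=False` to fill the null values in the original datasets with those in df2 (`xtrain_imputed` / `xtest_imputed`).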
4. Indices need to match for the .update() method, so we first reset indices, and later restore the original indices. 

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import warnings
# Suppress every Python warning for the rest of the notebook so the cell outputs stay clean.
# NOTE(review): this also hides any pandas warnings raised by the in-place edits below.
warnings.filterwarnings('ignore')

In [2]:
# Toy dataset: three rows for each of four countries, with missing values (NaN)
# scattered across the numeric columns so there is something to impute.
nan = np.nan
rows_by_country = {
    'A': [[10, 20, nan, nan, 50, 30], [2, 1, 5, nan, 34, 35], [13, 212, 3, 6, nan, 37]],
    'B': [[120, 230, 53, nan, 63, 23], [22, 115, 15, 61, 4, 15], [nan, 22, 12, nan, nan, 31]],
    'C': [[105, 120, nan, 22, 520, 3], [26, 11, 15, nan, 34, 3], [13, nan, 13, 234, nan, 10]],
    'D': [[101, 220, 654, 143, 634, 123], [32, 21, 61, 24, nan, 32], [11, 72, 23, nan, 534, 30]],
}

# Flatten to one list per row, country label first, keeping the A, B, C, D row order.
data = [
    [country, *values]
    for country, country_rows in rows_by_country.items()
    for values in country_rows
]
df = pd.DataFrame(
    data,
    columns=['Country', 'col1', 'col2', 'col3', 'col4', 'col5', 'col6'],
)

# One row per country holding the median of every numeric column (NaN values are skipped).
# as_index=False keeps 'Country' as a regular column so it can be used as a merge key later.
median_data = df.groupby('Country', as_index=False).median()

In [3]:
# Show the full dummy dataset; the empty cells are the NaN values the imputation has to fill.
df

Unnamed: 0,Country,col1,col2,col3,col4,col5,col6
0,A,10.0,20.0,,,50.0,30
1,A,2.0,1.0,5.0,,34.0,35
2,A,13.0,212.0,3.0,6.0,,37
3,B,120.0,230.0,53.0,,63.0,23
4,B,22.0,115.0,15.0,61.0,4.0,15
5,B,,22.0,12.0,,,31
6,C,105.0,120.0,,22.0,520.0,3
7,C,26.0,11.0,15.0,,34.0,3
8,C,13.0,,13.0,234.0,,10
9,D,101.0,220.0,654.0,143.0,634.0,123
10,D,32.0,21.0,61.0,24.0,,32
11,D,11.0,72.0,23.0,,534.0,30


In [4]:
# Show the per-country medians that will be used as fill values (one row per country).
median_data

Unnamed: 0,Country,col1,col2,col3,col4,col5,col6
0,A,10.0,20.0,4.0,6.0,42.0,35
1,B,71.0,115.0,15.0,61.0,33.5,23
2,C,26.0,65.5,14.0,128.0,277.0,3
3,D,32.0,72.0,61.0,83.5,584.0,32


In [5]:
# Hold out 25% of the rows as a test set.
# random_state pins the shuffle so the same rows land in train/test on every
# Restart & Run All (the saved outputs below come from an earlier, unseeded run and
# will show a different row selection until the notebook is re-executed).
# Note: `df` is passed as the feature frame with 'col6' still in it, and 'col6' is also the target.
xtrain, xtest, ytrain, ytest = train_test_split(
    df, df['col6'], test_size=0.25, random_state=42
)

In [6]:
# Move the original row labels into an 'index' column and give xtrain a fresh 0..n-1 index.
# The merged median frame built later also has a 0..n-1 index, and .update() aligns on
# index labels, so the two frames must share this index. The original labels are restored
# from the 'index' column after the update.
xtrain = xtrain.reset_index()
xtrain

Unnamed: 0,index,Country,col1,col2,col3,col4,col5,col6
0,1,A,2.0,1.0,5.0,,34.0,35
1,10,D,32.0,21.0,61.0,24.0,,32
2,2,A,13.0,212.0,3.0,6.0,,37
3,4,B,22.0,115.0,15.0,61.0,4.0,15
4,0,A,10.0,20.0,,,50.0,30
5,11,D,11.0,72.0,23.0,,534.0,30
6,9,D,101.0,220.0,654.0,143.0,634.0,123
7,7,C,26.0,11.0,15.0,,34.0,3
8,6,C,105.0,120.0,,22.0,520.0,3


In [7]:
# Move the original row labels of the test split into an 'index' column and give xtest
# a fresh 0..n-1 index, so it lines up with the merged median frame used by .update().
xtest = xtest.reset_index()
xtest

Unnamed: 0,index,Country,col1,col2,col3,col4,col5,col6
0,5,B,,22.0,12.0,,,31
1,3,B,120.0,230.0,53.0,,63.0,23
2,8,C,13.0,,13.0,234.0,,10


In [8]:
# Build a frame with one row per xtrain row in which every value column holds the median
# of that row's country. Only the 'Country' column of xtrain is used as the left side, so
# none of xtrain's own values end up in the result.
# how='left' keeps the rows in xtrain's order, so the resulting 0..n-1 index lines up
# with xtrain's reset index.
# validate='many_to_one' raises a MergeError if median_data ever contains a country more
# than once; duplicates would add extra rows and silently misalign the later .update().
xtrain_imputed = xtrain[['Country']].merge(
    median_data, on='Country', how='left', validate='many_to_one'
)
xtrain_imputed

Unnamed: 0,Country,col1,col2,col3,col4,col5,col6
0,A,10.0,20.0,4.0,6.0,42.0,35
1,D,32.0,72.0,61.0,83.5,584.0,32
2,A,10.0,20.0,4.0,6.0,42.0,35
3,B,71.0,115.0,15.0,61.0,33.5,23
4,A,10.0,20.0,4.0,6.0,42.0,35
5,D,32.0,72.0,61.0,83.5,584.0,32
6,D,32.0,72.0,61.0,83.5,584.0,32
7,C,26.0,65.5,14.0,128.0,277.0,3
8,C,26.0,65.5,14.0,128.0,277.0,3


In [9]:
# Build a frame with one row per xtest row in which every value column holds the median
# of that row's country. Only the 'Country' column of xtest is used as the left side.
# how='left' keeps the rows in xtest's order, so the resulting 0..n-1 index lines up
# with xtest's reset index.
# validate='many_to_one' raises a MergeError if median_data ever contains a country more
# than once; duplicates would add extra rows and silently misalign the later .update().
xtest_imputed = xtest[['Country']].merge(
    median_data, on='Country', how='left', validate='many_to_one'
)
xtest_imputed

Unnamed: 0,Country,col1,col2,col3,col4,col5,col6
0,B,71.0,115.0,15.0,61.0,33.5,23
1,B,71.0,115.0,15.0,61.0,33.5,23
2,C,26.0,65.5,14.0,128.0,277.0,3


In [10]:
# Fill the gaps in place: .update() aligns on index and column labels, and with
# overwrite=False it only writes into cells of xtrain that are currently NaN,
# taking the country median from xtrain_imputed. Existing values are left untouched.
xtrain.update(xtrain_imputed, overwrite=False)

# Put the original row labels (saved in the 'index' column) back as the index,
# then clear the leftover index name so the frame looks like it did after the split.
xtrain = xtrain.set_index('index')
xtrain.index.name = None
xtrain

Unnamed: 0,Country,col1,col2,col3,col4,col5,col6
1,A,2.0,1.0,5.0,6.0,34.0,35
10,D,32.0,21.0,61.0,24.0,584.0,32
2,A,13.0,212.0,3.0,6.0,42.0,37
4,B,22.0,115.0,15.0,61.0,4.0,15
0,A,10.0,20.0,4.0,6.0,50.0,30
11,D,11.0,72.0,23.0,83.5,534.0,30
9,D,101.0,220.0,654.0,143.0,634.0,123
7,C,26.0,11.0,15.0,128.0,34.0,3
6,C,105.0,120.0,14.0,22.0,520.0,3


In [11]:
# Fill the gaps in place: .update() aligns on index and column labels, and with
# overwrite=False it only writes into cells of xtest that are currently NaN,
# taking the country median from xtest_imputed. Existing values are left untouched.
xtest.update(xtest_imputed, overwrite=False)

# Put the original row labels (saved in the 'index' column) back as the index,
# then clear the leftover index name so the frame looks like it did after the split.
xtest = xtest.set_index('index')
xtest.index.name = None
xtest

Unnamed: 0,Country,col1,col2,col3,col4,col5,col6
5,B,71.0,22.0,12.0,61.0,33.5,31
3,B,120.0,230.0,53.0,61.0,63.0,23
8,C,13.0,65.5,13.0,234.0,277.0,10
