In [1]:
import numpy as np
import pandas as pd

# Load (or reload) IPython's autoreload extension and select mode 2, which
# re-imports modules before each cell executes so that edits to library code
# are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

from ds_discovery.transition.cleaners import ColumnCleaners as clean
from ds_discovery.transition.discovery import DataDiscovery as discover
from ds_behavioral import DataBuilderTools

# Print the installed ds_discovery version so the outputs in this notebook
# can be tied to a specific library release.
import ds_discovery
print('DTU: {}'.format(ds_discovery.__version__))

DTU: 1.07.038


# Missing Values
#### As part of the feature engineering this demo shows how you can replace missing values with representative distributions of values

In [2]:
from ds_discovery.feature.engineer import FeatureBuilderTools as fbt

### Create a synthetic dataset
* the `quantity` parameter is the approximate proportion of entries that receive a generated value; the remaining entries are left as NaN or an empty string
* each number type is represented but integers will be floats as they contain NaN

In [35]:
# number of rows generated for every synthetic column
sample_size = 1000

# Build all columns in one pass. The dict literal is evaluated top to bottom,
# so the generators are called in the same order as the columns are listed.
# `quantity` is the approximate share of entries that are populated.
df = pd.DataFrame({
    'num': DataBuilderTools.get_number(20, 100, quantity=0.7, size=sample_size),
    'str': DataBuilderTools.get_category(list('ABCDE'), quantity=0.6, size=sample_size),
    'cat': DataBuilderTools.get_category(list('ABCDE'), quantity=0.5, size=sample_size),
    'float': DataBuilderTools.get_number(-1.0, 1.0, quantity=0.6, size=sample_size),
    'date': DataBuilderTools.get_datetime('01/01/2000', '12/31/2018', quantity=0.7, size=sample_size),
})

# type the categorical column in place; the return value is discarded so the
# cell produces no output
_ = clean.to_category_type(df, headers='cat', inplace=True)

In [36]:
# Summarise the freshly generated frame (type, % nulls, count, unique values
# per attribute) as the baseline before any missing values are replaced.
discover.data_dictionary(df)

Unnamed: 0,Attribute,Type,% Nulls,Count,Unique,Observations
0,cat,category,0.5,1000,6,|A|B|C|D|E
1,date,datetime64[ns],0.3,700,700,max=2018-12-19 10:07:47.605986688 | min=2000-01-17 08:51:12.257505860 |...
2,float,float64,0.4,600,522,max=0.999 | min=-0.998 | mean=0.04
3,num,float64,0.3,700,81,max=100.0 | min=20.0 | mean=59.37
4,str,object,0.4,1000,6,Sample: A | C | E


## Replace NaN values
* Call the `replace_missing()` method passing it the DataFrame

In [37]:
# Replace missing values using the default settings (no custom nulls list).
# NOTE(review): `df` is overwritten here, so the frame that still contains the
# NaN values is no longer available after this cell has run.
df = fbt.replace_missing(df)

In [38]:
# Re-run the summary to check which attributes still report nulls after the
# default replacement.
discover.data_dictionary(df)

Unnamed: 0,Attribute,Type,% Nulls,Count,Unique,Observations
0,cat,category,0.5,1000,6,|A|B|C|D|E
1,date,datetime64[ns],0.0,1000,1000,max=2018-12-19 10:07:47.605986688 | min=2000-01-17 08:51:12.257505860 |...
2,float,float64,0.0,1000,524,max=1.0 | min=-1.0 | mean=0.05
3,num,float64,0.0,1000,81,max=100.0 | min=20.0 | mean=60.2
4,str,object,0.4,1000,6,Sample: C | B | A


## Replace alternative null values
As we see from the above data dictionary, both the category and string object have not been cleaned. We can also note that the usable count is the full 1000 values. This is because our null fields are blank text fields and not NaNs

To also replace alternative null values, we can pass a custom nulls list. Note that this list is in addition to np.NaN.

In [39]:
# Run the replacement again, this time passing the empty string in
# `nulls_list` so that blank text entries are also treated as missing.
df = fbt.replace_missing(df, nulls_list=[''])

In [40]:
# Final summary: confirm that the `cat` and `str` attributes no longer report
# nulls now that empty strings have been replaced as well.
discover.data_dictionary(df)

Unnamed: 0,Attribute,Type,% Nulls,Count,Unique,Observations
0,cat,category,0.0,1000,5,|A|B|C|D|E
1,date,datetime64[ns],0.0,1000,1000,max=2018-12-19 10:07:47.605986688 | min=2000-01-17 08:51:12.257505860 |...
2,float,float64,0.0,1000,524,max=1.0 | min=-1.0 | mean=0.05
3,num,float64,0.0,1000,81,max=100.0 | min=20.0 | mean=60.2
4,str,object,0.0,1000,5,Sample: D | B | E
