In [138]:
import numpy as np
import pandas as pd

%reload_ext autoreload
%autoreload 2

from ds_behavioral import DataBuilderTools
# `clean` is called by the auto_remove_columns cells further down but was never
# imported, so the notebook only ran with leftover kernel state.
# NOTE(review): module path assumed from the ds_discovery 1.x package layout — confirm.
from ds_discovery.transition.cleaners import ColumnCleaners as clean
from ds_discovery.transition.discovery import DataDiscovery as discover

import ds_discovery
# Record the library version so the outputs below can be tied to a release.
version_banner = 'DTU: {}'.format(ds_discovery.__version__)
print(version_banner)

DTU: 1.07.038


# Quasi-constant features

Quasi-constant features are those that show the same value for the great majority of the observations of the dataset.

During the transitioning process, quasi-constant removal can be applied as part of automatically removing columns.


#### Create the Dataframe

In [139]:
# Every synthetic column shares the same length and seed so the frame is
# reproducible from a fresh kernel.
SAMPLE_SIZE = 100
SEED = 31

# One row per column: (column name, generator, positional args, extra options).
# Per the data dictionary shown in the next cell: single_* hold one distinct
# value, two_* have ~10% nulls, weight_* are dominated by one value,
# null_num is entirely null and normal_* are the varied controls.
column_specs = [
    # numeric columns
    ('single_num', DataBuilderTools.get_number, (1, 1), {}),
    ('two_num', DataBuilderTools.get_number, (1, 2), {'quantity': 0.9}),
    ('weight_num', DataBuilderTools.get_number, (1, 2), {'weight_pattern': [90, 1]}),
    ('null_num', DataBuilderTools.get_number, (1, 100), {'quantity': 0}),
    ('normal_num', DataBuilderTools.get_number, (1, 100), {}),
    # categorical columns
    ('single_cat', DataBuilderTools.get_category, (['A'],), {}),
    ('two_cat', DataBuilderTools.get_category, (['A', 'B'],), {'quantity': 0.9}),
    ('weight_cat', DataBuilderTools.get_category, (['A', 'B', 'C'],), {'weight_pattern': [80, 1, 1]}),
    ('normal_cat', DataBuilderTools.get_category, (list('ABCDE'),), {}),
]

df = pd.DataFrame()
# Columns are added one at a time, in spec order, to keep the original column order.
for column_name, generator, positional, options in column_specs:
    df[column_name] = generator(*positional, size=SAMPLE_SIZE, seed=SEED, **options)


In [140]:
# Summarise every column (type, % nulls, count, unique values, sample observations)
# to confirm the synthetic frame has the intended constant / skewed / normal columns.
discover.data_dictionary(df)

Unnamed: 0,Attribute,Type,% Nulls,Count,Unique,Observations
0,normal_cat,object,0.0,100,5,Sample: E | A | B
1,normal_num,int64,0.0,100,62,max=99 | min=2 | mean=45.35
2,null_num,object,1.0,0,0,Sample: Null Values
3,single_cat,object,0.0,100,1,Sample: A
4,single_num,int64,0.0,100,1,max=1 | min=1 | mean=1.0
5,two_cat,object,0.1,100,3,Sample: B | A |
6,two_num,float64,0.1,90,2,max=2.0 | min=1.0 | mean=1.47
7,weight_cat,object,0.0,100,3,Sample: A | B | C
8,weight_num,int64,0.0,100,2,max=2 | min=1 | mean=1.01


## Auto Remove Columns
With its default settings, auto remove drops the obvious columns: `single_num`, `null_num` and `single_cat`.

In [141]:
# Baseline pass with the default settings. inplace=True is used so that later
# cells work on the already-reduced df (the displayed result is the list of
# removed headers, not a new frame).
clean.auto_remove_columns(
    df,
    inplace=True,
)

{'remove': {'headers': ['single_num', 'null_num', 'single_cat'],
  'drop': False,
  'exclude': False}}

## Quasi-Constant Auto Remove
As we can see, the value count in `weight_cat` is predominantly the value `A`:

In [142]:
# Frequency of each category; a single dominant value is what makes the
# column quasi-constant.
df['weight_cat'].value_counts()

A    98
B     1
C     1
Name: weight_cat, dtype: int64

In [143]:
# Same check for the numeric column: how often each distinct value occurs.
df['weight_num'].value_counts()

1    99
2     1
Name: weight_num, dtype: int64

------
So we want to remove these columns, and any others whose most frequent value accounts for more than a chosen threshold of the observations.

We do this by passing the `predominant_max` parameter to the method; with it set, `weight_num` and `weight_cat` are also removed.

In [144]:
# Threshold passed as predominant_max. With 0.9 the run below removes weight_num
# and weight_cat, whose top values cover 99% and 98% of rows respectively.
predominant_threshold = 0.9
clean.auto_remove_columns(df, predominant_max=predominant_threshold, inplace=True)

{'remove': {'headers': ['weight_num', 'weight_cat'],
  'drop': False,
  'exclude': False}}