# Dataframe filter
> A simple dataframe filter: interactively filter columns by value, one column at a time

In [1]:
# default_exp df_filter

In [2]:
# export
import pandas as pd
import numpy as np

In [3]:
def get_cal_housing():
    """
    Load the California housing features as a dataframe.

    Returns a `pd.DataFrame` built from the dataset's `data` array, with
    one column per entry in the dataset's `feature_names`.
    """
    # Use the public loader: the `sklearn.datasets.california_housing`
    # submodule was private API and is no longer importable in current
    # scikit-learn releases.
    from sklearn.datasets import fetch_california_housing
    data = fetch_california_housing()
    df = pd.DataFrame(data['data'], columns=data['feature_names'])
    return df

In [4]:
# Load the California housing feature table used as demo data below
df = get_cal_housing()

In [5]:
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


## A tool to filter data

In [8]:
# export
from ipywidgets import VBox, Button,\
    FloatSlider, IntSlider,Dropdown, Label,Checkbox,\
    interact

import plotly.express as px

def detect_number_column(df):
    """
    Summarise the dtype of every column in a dataframe.

    df: input dataframe

    Returns a dataframe with one row per column of `df` and two columns:
    "cols" (the column label) and "dtypes" (the dtype name, e.g. "float64").
    No filtering is done here; the caller can pick out the numeric columns
    from the "dtypes" values.
    """
    # Read dtypes positionally from `df.dtypes` rather than via `df[col]`,
    # which returns a DataFrame (no `.dtype`) when a label is duplicated.
    dtype_names = [dtype.name for dtype in df.dtypes]
    return pd.DataFrame({"cols": df.columns, "dtypes": dtype_names})

class DataFilter:
    """
    Single column number filter

    The current (possibly filtered) dataframe is kept in `self.df`.
    Each filter run replaces `self.df` with a new dataframe instead of
    modifying the existing one in place.
    """
    def __init__(self, df: pd.DataFrame):
        """
        df: input dataframe
        
        data_filter = DataFilter(df)
        
        # start filtering
        data_filter()
        """
        self.df = df
        
    def show_distribution(self, col_name):
        """
        Build a plotly histogram of `col_name` over the current dataframe.

        Returns the figure; the caller decides when to show it.
        """
        fig = px.histogram(self.df, x=col_name, height=300, width=800)
        return fig

    @staticmethod
    def _slider_step(lo, hi, is_int):
        """
        Step size that gives roughly 100 slider positions between lo and hi.

        Integer sliders get a whole-number step of at least 1; float sliders
        fall back to 1.0 when lo == hi so the step is never zero.
        """
        if is_int:
            return max(1, (hi - lo) // 100)
        step = (hi - lo) / 100
        return step if step > 0 else 1.0
    
    def create_filter(self, field: str) -> None:
        """
        Display the filter controls (condition, threshold slider,
        "Remove NaN" checkbox and run button) for one column.

        field: name of the column to filter on

        Only float and integer columns are supported; for any other dtype
        a message is printed and no widget is created.
        """
        column = self.df[field]
        dtype = column.dtype

        # Check the dtype itself rather than a substring of its name, so
        # e.g. "interval[int64]" is not mistaken for an integer column.
        if pd.api.types.is_float_dtype(dtype):
            slider, cast = FloatSlider, float
        elif pd.api.types.is_integer_dtype(dtype):
            slider, cast = IntSlider, int
        else:
            print(f"filter of {dtype.name} not supported")
            # Without a slider class there is nothing to build.
            return

        print(f"NaN count: {(self.df[field].isna()).sum()}")

        lo, hi = column.min(), column.max()
        if pd.isna(lo) or pd.isna(hi):
            # Empty or all-NaN column: a slider has no valid range.
            print(f"{field} has no non-NaN values to filter on")
            return
        # Plain Python numbers avoid fixed-width overflow in `hi - lo`.
        lo, hi = cast(lo), cast(hi)

        btn = Button(description="Run Filter")
        btn.on_click(self.execute_filter)

        widget = VBox([
                    Label(f"Range for {field}"),
                    Dropdown(options=["Larger Than or equal to", "Smaller Than or equal to"]),
                    slider(
                        min = lo,
                        max = hi,
                        step = self._slider_step(lo, hi, slider is IntSlider)),
                    Checkbox(description="Remove NaN", value=True),
                    btn
                ])
        # execute_filter reads the column name and control values back
        # from this widget when the button is clicked.
        self.widget = widget
        widget.original_name = field

        # NOTE(review): `display` is the notebook-provided builtin; confirm
        # it is in scope if this class is used from an exported module.
        display(widget)
    
    def execute_filter(
        self, _) -> None:
        """
        This function will be used as a callback
        for ipywidgets.Button.on_click

        Reads the condition, threshold and "Remove NaN" choice from
        `self.widget`, then replaces `self.df` with the rows that satisfy
        the condition (index reset). Rows whose value is NaN never satisfy
        the comparison, so they are dropped whether or not "Remove NaN"
        is checked.
        """
        original_name = self.widget.original_name
        _label, condi_, value_, remove_na_, _btn = self.widget.children
        value, drop_na = value_.value, remove_na_.value
        larger = condi_.value == 'Larger Than or equal to'
        condi = ">=" if larger else "<="
        expression = f"({original_name} {condi} {value})"
        
        if drop_na:
            self.remove_na(original_name)
            
        print(f"Filter with query expression: {expression}")
        before = len(self.df)
        # A boolean mask instead of DataFrame.query: same rows selected, but
        # it also works for column names that are not valid identifiers.
        series = self.df[original_name]
        mask = (series >= value) if larger else (series <= value)
        # Nullable dtypes yield <NA> for missing values; count them as False.
        mask = mask.fillna(False)
        self.df = self.df[mask].reset_index(drop=True)
        after = len(self.df)
        print(f"[Before]: {before}, [After]: {after}")
        
    def remove_na(self, field):
        """
        Remove nan value in a dataframe

        field: column whose NaN rows are dropped from `self.df`.
        The index is not reset here.
        """
        before = len(self.df)
        self.df = self.df[self.df[field].notna()]
        after = len(self.df)
        print(f"Remove NA on {field} [Before]: {before}, [After]: {after}")
        
    def __call__(self):
        """
        Execute an interact to filter things column by column
        """
        @interact
        def select_field(field = list(self.df.columns)):
            # visualize histogram
            self.show_distribution(field).show()
            
            # create a filter execution interactive
            self.create_filter(field)

In [9]:
df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [10]:
# Wrap the dataframe in a filter object; the filtered result is kept on it as `.df`
data_filter = DataFilter(df)

In [11]:
# Calling the instance starts the interactive column picker (histogram plus filter controls)
data_filter()

interactive(children=(Dropdown(description='field', options=('MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'P…

## Extract out the dataframe afterwards

In [12]:
# The current (filtered) dataframe is read back from the filter object
data_filter.df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32
