# Filtering Outlier POI from Patterns

In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
import sys
import os

# Put the parent of the current working directory on the import path (once),
# so that the project's `src` package can be imported from this notebook.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

In [8]:
import pandas as pd
import numpy as np
from src.safegraph_eval.preprocessing import preprocessing

In [10]:
# Ingest sample Monthly Patterns data for Lowe's, 2021 Jan to Sep, downloaded from the SafeGraph shop:
# https://shop.safegraph.com/?brands=SG_BRAND_4f8a39fb928268edb1ab256922e436d7&countries=US&poi=ALL&tab=datasets&cart=sg_p-202100-202108
#
# There is one CSV per month (01..09). Read them all first and concatenate once:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# re-copies the accumulated rows on every iteration.
monthly_frames = [
    pd.read_csv(f"data/Lowes-PATTERNS-2021_0{month}-2021-10-28/patterns.csv")
    for month in range(1, 10)
]
# ignore_index=True gives the combined frame a fresh 0..n-1 index, as before.
patterns = pd.concat(monthly_frames, ignore_index=True)

In [33]:
# Preview the first rows: each row is the Patterns data for one Lowe's POI in one specific month.
display(patterns.head())

Unnamed: 0,placekey,parent_placekey,location_name,street_address,city,region,postal_code,safegraph_brand_ids,brands,date_range_start,...,visitor_daytime_cbgs,visitor_country_of_origin,distance_from_home,median_dwell,bucketed_dwell_times,related_same_day_brand,related_same_month_brand,popularity_by_hour,popularity_by_day,device_type
0,222-222@5wg-42f-5vf,,Lowe's,925 E 17th St,Idaho Falls,ID,83404,SG_BRAND_4f8a39fb928268edb1ab256922e436d7,Lowe's,2021-01-01T00:00:00-07:00,...,"{""160199705011"":39,""160199704031"":36,""16051960...","{""US"":1090}",13121.0,21.0,"{""<5"":36,""5-10"":434,""11-20"":500,""21-60"":731,""6...","{""The Home Depot"":15,""Walmart"":13,""Sam's Club""...","{""Walmart"":70,""The Home Depot"":39,""McDonald's""...","[21,19,15,14,10,13,17,71,139,168,233,304,319,3...","{""Monday"":211,""Tuesday"":196,""Wednesday"":205,""T...","{""android"":739,""ios"":384}"
1,222-222@5r8-dy2-vvf,,Lowe's,2205 SE Adams Blvd,Bartlesville,OK,74006,SG_BRAND_4f8a39fb928268edb1ab256922e436d7,Lowe's,2021-01-01T00:00:00-06:00,...,"{""401139400033"":8,""401139400041"":7,""4014700050...","{""US"":106}",13246.0,24.0,"{""<5"":5,""5-10"":38,""11-20"":34,""21-60"":59,""61-12...","{""Russell Cellular"":30,""Walmart"":13,""QuikTrip""...","{""Walmart"":72,""Russell Cellular"":65,""Sonic"":45...","[6,6,6,4,3,3,11,15,25,36,31,34,42,36,38,30,23,...","{""Monday"":27,""Tuesday"":28,""Wednesday"":31,""Thur...","{""android"":99,""ios"":12}"
2,zzw-222@8g7-nkk-zfz,,Lowe's,3341 Lexington Rd,Athens,GA,30605,SG_BRAND_4f8a39fb928268edb1ab256922e436d7,Lowe's,2021-01-01T00:00:00-05:00,...,"{""132219602023"":60,""132219602013"":54,""13221960...","{""US"":1295}",16266.0,21.0,"{""<5"":59,""5-10"":482,""11-20"":556,""21-60"":849,""6...","{""Walmart"":11,""Kroger"":5,""Goodwill Industries""...","{""Walmart"":59,""Chick-fil-A"":42,""Kroger"":41,""Mc...","[40,42,40,46,54,66,72,100,189,247,348,364,376,...","{""Monday"":271,""Tuesday"":255,""Wednesday"":275,""T...","{""android"":641,""ios"":692}"
3,zzw-222@3bt-by7-2c5,,Lowe's,333 E Tudor Rd,Anchorage,AK,99503,SG_BRAND_4f8a39fb928268edb1ab256922e436d7,Lowe's,2021-01-01T00:00:00-09:00,...,"{""020200023011"":12,""020200018024"":10,""02020000...","{""US"":459}",6470.0,21.0,"{""<5"":14,""5-10"":187,""11-20"":196,""21-60"":290,""6...","{""The Home Depot"":11,""Walmart"":8,""Holiday Stat...","{""Walmart"":48,""Costco"":48,""The Home Depot"":40,...","[5,6,6,7,7,13,34,41,72,74,91,113,154,146,113,1...","{""Monday"":93,""Tuesday"":100,""Wednesday"":120,""Th...","{""android"":226,""ios"":250}"
4,222-222@8sz-9cw-bc5,,Lowe's,605 SW H K Dodgen Loop,Temple,TX,76502,SG_BRAND_4f8a39fb928268edb1ab256922e436d7,Lowe's,2021-01-01T00:00:00-06:00,...,"{""480270213031"":94,""480270203002"":84,""48027020...","{""US"":1961}",13110.0,22.0,"{""<5"":75,""5-10"":786,""11-20"":817,""21-60"":1404,""...","{""Walmart"":13,""H-E-B"":8,""The Home Depot"":8,""CE...","{""Walmart"":69,""H-E-B"":61,""CEFCO"":44,""McDonald'...","[58,58,64,62,65,90,110,175,299,416,563,642,696...","{""Monday"":479,""Tuesday"":409,""Wednesday"":392,""T...","{""android"":1062,""ios"":972}"


In [20]:
# Count the distinct Lowe's POIs (placekeys) that have Patterns rows in this period.
# dropna=False keeps a missing placekey, if any, counted as one value.
patterns["placekey"].nunique(dropna=False)

1706

In [32]:
# Print the docstring of the outlier-filtering helper to review its parameters and defaults.
print(preprocessing.label_and_remove_outliers_iqr.__doc__)


    Label outliers in Patterns using k*IQR filtering, by group, and optionally remove them. 
    If remove, POIs with values in any month that are determined as outliers are removed.
    Returns a DataFrame.

    Parameters:
        df_ (pandas DataFrame): DataFrame containing SafeGraph Patterns data.
        group_column (str): The SafeGraph column that will be used to group the data (e.g., safegraph_brand_ids). Outliers are determined based in the distribution of values in each group.
        column_to_filter (str): The SafeGraph column on which perform the outlier filtering (e.g., raw_visit_counts).
    Optional Parameters:
        k (float): Value that will be multiplied by the interquartile range to determine outliers. Defaults to 1.5.
        remove (bool): Whether or not to remove the outlier rows from the returned DataFrame. Defaults to True.
        verbose (bool): If True, prints how many POI were removed as outliers. Defaults to True.
        brand_whitelist (list): List of

In [31]:
# Filter outliers on raw_visit_counts within each brand, leaving the optional
# parameters at their defaults, then preview the filtered result.
patterns_filtered_default_params = preprocessing.label_and_remove_outliers_iqr(
    patterns,
    group_column="brands",
    column_to_filter="raw_visit_counts",
)
display(patterns_filtered_default_params.head())

135 POI out of 1706 were removed as outliers.



Unnamed: 0,placekey,parent_placekey,location_name,street_address,city,region,postal_code,safegraph_brand_ids,brands,date_range_start,...,visitor_daytime_cbgs,visitor_country_of_origin,distance_from_home,median_dwell,bucketed_dwell_times,related_same_day_brand,related_same_month_brand,popularity_by_hour,popularity_by_day,device_type
0,222-222@5wg-42f-5vf,,Lowe's,925 E 17th St,Idaho Falls,ID,83404,SG_BRAND_4f8a39fb928268edb1ab256922e436d7,Lowe's,2021-01-01T00:00:00-07:00,...,"{""160199705011"":39,""160199704031"":36,""16051960...","{""US"":1090}",13121.0,21.0,"{""<5"":36,""5-10"":434,""11-20"":500,""21-60"":731,""6...","{""The Home Depot"":15,""Walmart"":13,""Sam's Club""...","{""Walmart"":70,""The Home Depot"":39,""McDonald's""...","[21,19,15,14,10,13,17,71,139,168,233,304,319,3...","{""Monday"":211,""Tuesday"":196,""Wednesday"":205,""T...","{""android"":739,""ios"":384}"
1,222-222@5r8-dy2-vvf,,Lowe's,2205 SE Adams Blvd,Bartlesville,OK,74006,SG_BRAND_4f8a39fb928268edb1ab256922e436d7,Lowe's,2021-01-01T00:00:00-06:00,...,"{""401139400033"":8,""401139400041"":7,""4014700050...","{""US"":106}",13246.0,24.0,"{""<5"":5,""5-10"":38,""11-20"":34,""21-60"":59,""61-12...","{""Russell Cellular"":30,""Walmart"":13,""QuikTrip""...","{""Walmart"":72,""Russell Cellular"":65,""Sonic"":45...","[6,6,6,4,3,3,11,15,25,36,31,34,42,36,38,30,23,...","{""Monday"":27,""Tuesday"":28,""Wednesday"":31,""Thur...","{""android"":99,""ios"":12}"
2,zzw-222@8g7-nkk-zfz,,Lowe's,3341 Lexington Rd,Athens,GA,30605,SG_BRAND_4f8a39fb928268edb1ab256922e436d7,Lowe's,2021-01-01T00:00:00-05:00,...,"{""132219602023"":60,""132219602013"":54,""13221960...","{""US"":1295}",16266.0,21.0,"{""<5"":59,""5-10"":482,""11-20"":556,""21-60"":849,""6...","{""Walmart"":11,""Kroger"":5,""Goodwill Industries""...","{""Walmart"":59,""Chick-fil-A"":42,""Kroger"":41,""Mc...","[40,42,40,46,54,66,72,100,189,247,348,364,376,...","{""Monday"":271,""Tuesday"":255,""Wednesday"":275,""T...","{""android"":641,""ios"":692}"
3,zzw-222@3bt-by7-2c5,,Lowe's,333 E Tudor Rd,Anchorage,AK,99503,SG_BRAND_4f8a39fb928268edb1ab256922e436d7,Lowe's,2021-01-01T00:00:00-09:00,...,"{""020200023011"":12,""020200018024"":10,""02020000...","{""US"":459}",6470.0,21.0,"{""<5"":14,""5-10"":187,""11-20"":196,""21-60"":290,""6...","{""The Home Depot"":11,""Walmart"":8,""Holiday Stat...","{""Walmart"":48,""Costco"":48,""The Home Depot"":40,...","[5,6,6,7,7,13,34,41,72,74,91,113,154,146,113,1...","{""Monday"":93,""Tuesday"":100,""Wednesday"":120,""Th...","{""android"":226,""ios"":250}"
5,222-222@5z4-qrt-wff,zzw-222@5z4-qrt-w49,Lowe's,29335 Central Ave,Lake Elsinore,CA,92532,SG_BRAND_4f8a39fb928268edb1ab256922e436d7,Lowe's,2021-01-01T00:00:00-08:00,...,"{""060650427153"":71,""060650430011"":43,""06065042...","{""US"":947}",8548.0,20.0,"{""<5"":40,""5-10"":362,""11-20"":393,""21-60"":457,""6...","{""Costco"":17,""The Home Depot"":10,""Starbucks"":6...","{""Costco"":52,""Starbucks"":45,""The Home Depot"":4...","[7,8,8,4,6,23,47,64,113,155,238,257,258,282,27...","{""Monday"":193,""Tuesday"":184,""Wednesday"":148,""T...","{""android"":614,""ios"":368}"


In [34]:
# Count the distinct Lowe's POIs (placekeys) that remain after default filtering.
# dropna=False keeps a missing placekey, if any, counted as one value.
patterns_filtered_default_params["placekey"].nunique(dropna=False)

1571

In [46]:
# Which placekeys were removed? (present in the raw data, absent after filtering)
removed_pks_default_params = set(patterns["placekey"]) - set(patterns_filtered_default_params["placekey"])
print(removed_pks_default_params)

{'zzw-222@5pm-hvv-wx5', 'zzw-222@5pt-529-9xq', 'zzw-222@8fc-f7w-xyv', '222-222@8f6-sk4-2c5', 'zzw-222@8gk-tg5-fpv', '222-223@5pn-r25-rkz', '222-222@5pn-4bm-hkf', '222-222@8gm-hvf-ygk', 'zzy-222@8f3-nxy-2p9', '222-222@5qy-dxp-8jv', 'zzw-222@5r2-mxn-hbk', '222-222@8f5-7pp-kfz', 'zzw-222@8gg-394-nwk', 'zzw-222@5qw-jwm-4d9', 'zzw-222@8fb-5n8-bzf', 'zzw-222@8gg-d7p-6c5', '222-223@8gg-smy-syv', '222-222@5pm-h48-nqz', 'zzw-222@5qx-pc9-bkz', '222-222@8gf-fw9-k75', '222-222@8f3-jdq-bc5', 'zzw-222@8f2-2wy-y5f', '223-222@5qs-w4b-t9z', 'zzw-222@5pq-wk3-tvz', '222-222@8gj-gmc-9mk', '222-222@8sy-smk-vxq', '222-222@8gh-tnj-kxq', '222-222@63j-2vy-3yv', '222-222@8gm-mqz-ch5', '222-222@5qs-vx6-249', '222-222@5py-jk7-ygk', 'zzw-223@8gg-vp7-hdv', '222-222@8dy-eh9-xnq', '222-222@8st-wwv-tvz', '222-222@8f2-qp6-cbk', 'zzw-222@8gf-xdg-kj9', '222-222@8gm-zd5-99f', '222-222@8dy-rzk-bp9', 'zzw-222@8dj-ds7-c3q', 'zzw-222@5pn-xqw-hyv', '222-222@8gn-9rx-fmk', '222-222@5r7-chn-9cq', 'zzw-222@5r4-2t7-swk', 'zzw-222@8

In [53]:
# Let's look at the monthly rows for one of these POIs.
# Raw visit counts are in the 1000s, so it was filtered out for being too high
# relative to the distribution of visits at other Lowe's.
patterns.loc[
    patterns["placekey"].eq("222-222@5p2-2tw-vs5"),
    ["date_range_start", "street_address", "city", "region", "raw_visit_counts"],
]

Unnamed: 0,date_range_start,street_address,city,region,raw_visit_counts
464,2021-01-01T00:00:00-06:00,18375 Wright St,Omaha,NE,2550
2360,2021-02-01T00:00:00-06:00,18375 Wright St,Omaha,NE,2048
3898,2021-03-01T00:00:00-06:00,18375 Wright St,Omaha,NE,3199
5446,2021-04-01T00:00:00-05:00,18375 Wright St,Omaha,NE,3934
7122,2021-05-01T00:00:00-05:00,18375 Wright St,Omaha,NE,5143
8785,2021-06-01T00:00:00-05:00,18375 Wright St,Omaha,NE,3588
10340,2021-07-01T00:00:00-05:00,18375 Wright St,Omaha,NE,3161
12116,2021-08-01T00:00:00-05:00,18375 Wright St,Omaha,NE,2810
13924,2021-09-01T00:00:00-05:00,18375 Wright St,Omaha,NE,2615


In [44]:
# Suppose we think these POIs are valid and want to widen the bound of what is considered an "outlier".
# We can do that by raising the k parameter of the filtering function above its default of 1.5.
patterns_filtered_higher_k = preprocessing.label_and_remove_outliers_iqr(
    patterns,
    group_column="brands",
    column_to_filter="raw_visit_counts",
    k=2,
)

60 POI out of 1706 were removed as outliers.



In [48]:
# Which placekeys were removed this time? (present in the raw data, absent after filtering with k = 2)
removed_pks_higher_k = set(patterns["placekey"]) - set(patterns_filtered_higher_k["placekey"])
print(removed_pks_higher_k)

{'zzw-222@5pq-928-w6k', 'zzw-222@8sr-858-n3q', 'zzw-222@8gg-sm2-w49', '222-222@8gm-mqz-ch5', '222-222@8fc-mpn-3kf', '222-222@5qs-vx6-249', 'zzw-222@8gk-tbc-y5f', 'zzw-222@5pm-hvv-wx5', '222-222@5pn-mqv-hqz', '222-222@8fc-r4x-dqf', '222-222@647-2wc-whq', '222-222@8f2-rtk-zj9', 'zzw-222@5qw-5f6-7t9', '222-222@8dy-eh9-xnq', 'zzw-222@5pn-6qr-m6k', '222-222@8f2-qp6-cbk', '222-222@5pm-fpw-ch5', '222-222@8f6-sk4-2c5', '222-222@8gm-zd5-99f', '222-222@8f2-55m-r6k', 'zzw-222@5ps-78z-j35', '222-223@5pn-r25-rkz', '222-222@8g8-7v2-389', '222-222@5pn-4bm-hkf', 'zzy-222@8f3-nxy-2p9', '222-222@8dy-ssp-h5z', 'zzw-222@8f3-fwj-2ff', '222-222@5qy-dxp-8jv', 'zzw-222@8dj-ds7-c3q', '222-222@8gn-kkm-cbk', 'zzw-222@5r2-mxn-hbk', '222-222@5pv-9wp-9mk', 'zzw-222@5pn-xqw-hyv', '222-222@5r7-chn-9cq', '222-222@5r7-fr7-8sq', '222-222@5pt-4pw-y35', 'zzw-222@8g8-4jz-wtv', 'zzw-222@5r4-2t7-swk', '222-222@8gh-x7p-rkz', '222-222@8gg-r84-4y9', '222-222@8g9-693-w49', '222-222@8gk-yjz-ty9', '222-222@8gg-9tn-5xq', 'zzw-222@8

In [50]:
# Check whether the Omaha, NE placekey inspected earlier is still among the POIs removed with k = 2.
# It evaluates to False: that POI is no longer removed.
"222-222@5p2-2tw-vs5" in removed_pks_higher_k

False