# Setup

In [1]:
# Show the working directory: the data paths used below ('../data/pois/...')
# are relative, so they resolve against this location.
!pwd

/home/umni2/a/umnilab/users/verma99/mk/spr_4711/code


In [2]:
from mobilkit.umni import *

In [4]:
import osmnx
import yaml

In [7]:
# Start the session behind `SP` (presumably the project's Spark wrapper from
# mobilkit.umni -- confirm); later cells call SP.read_parquet on it.
SP.start()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/22 21:15:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/09/22 21:15:52 WARN SparkConf: Note that spark.local.dir will be overridden by the value set by the cluster manager (via SPARK_LOCAL_DIRS in mesos/standalone/kubernetes and LOCAL_DIRS in YARN).
23/09/22 21:15:53 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


# Prepare data

In [5]:
# Load the nested mapping {source: {category: [fclass, ...]}} from YAML.
with open('../data/pois/poi_categories.yml', 'rb') as f:
    poi_categories = yaml.safe_load(f)

# Flatten the nested mapping into one (source, category, fclass) row per
# feature class, then wrap it in a table and display it.
poi_classes = Pdf(
    [
        (source, cat, fclass)
        for source, data in poi_categories.items()
        for cat, fclasses in data.items()
        for fclass in fclasses
    ],
    columns=['source', 'category', 'fclass'],
).disp()

64 rows x 3 cols; Memory: 0.0 MiB


Unnamed: 0,source,category,fclass
,<object>,<object>,<object>
0.0,SafeGraph,Education,Elementary and Secondary Schools


## OSM

In [8]:
def get_osm_pois(geocode, categories: Pdf, save=True, overwrite=False):
    """Fetch OSM amenity POIs for a place and label them with a category.

    Args:
        geocode: Place query forwarded to ``osmnx.geometries_from_place``.
        categories: Table with 'source', 'category' and 'fclass' columns.
            Only rows whose source is "OSM" are used, both to pick the
            amenity tags to download and to assign categories.
        save: If true, write the freshly built table to
            '../data/pois/osm.parquet'.
        overwrite: If false and the parquet file already exists, the
            download is skipped and the cached file is returned instead.

    Returns:
        On a cache hit, whatever ``SP.read_parquet`` returns for the cached
        file. Otherwise the freshly built frame with columns
        ['id', 'name', 'category', 'fclass', 'geometry'], where the geometry
        is each feature's centroid converted to CRS_DEG.
        NOTE(review): the two branches return objects produced by different
        libraries -- confirm that callers can handle both.
    """
    outfile = Path('../data/pois/osm.parquet')
    if outfile.exists() and not overwrite:
        return SP.read_parquet(outfile)
    # Restrict the category table to OSM rows *before* merging. Merging on
    # 'fclass' against the full table would duplicate POIs (and attach
    # another source's category) whenever the same fclass string is listed
    # under a different source. This mirrors the filtering in get_sg_pois.
    osm_cats = categories.query('source == "OSM"').drop(columns='source')
    fclasses = osm_cats['fclass'].unique().tolist()
    df = osmnx.geometries_from_place(geocode, tags={'amenity': fclasses})
    # Take centroids while in CRS_M (presumably a metric/projected CRS --
    # confirm), then convert back to CRS_DEG after the merge.
    df = df.rename(columns={'amenity': 'fclass'}).to_crs(CRS_M)
    df.geometry = df.centroid
    # reset_index() exposes the index levels as columns so that 'osmid' can
    # be renamed to a string 'id'.
    df = df[['name', 'fclass', 'geometry']].reset_index()
    df = df.rename(columns={'osmid': 'id'}).astype({'id': str})
    df = df.merge(osm_cats, on='fclass').to_crs(CRS_DEG)
    df = df[['id', 'name', 'category', 'fclass', 'geometry']]
    if save:
        df.to_parquet(U.mkfile(outfile))
    return df

# Build the OSM POI table for Indiana, or reuse the cached parquet since
# overwrite is falsy. The trailing `t=` note is presumably the author's
# recorded runtime (m:ss) -- confirm.
osm_pois = get_osm_pois('Indiana', poi_classes, overwrite=0).disp() # t=2:11

                                                                                

+---------+------------+---------+------+--------------------+
|       id|        name| category|fclass|            geometry|
+---------+------------+---------+------+--------------------+
|358649475|Black School|Education|school|[01 01 00 00 00 D...|
+---------+------------+---------+------+--------------------+
only showing top 1 row



## SafeGraph

In [18]:
# Peek at one month of SafeGraph patterns to inspect the available columns
# (placekey, raw_visits, raw_visitors, visitor_home_cbgs, ...) that
# get_sg_pois selects from below.
df = SP.read_parquet(SAFEGRAPH / 'patterns/monthly/2021-04-01').disp()

+-------------------+-------------------+-------------------+----------+------------+--------------------+--------------------+------------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------+---+
|           placekey|            started|              ended|raw_visits|raw_visitors|       visits_by_day|   visitor_home_cbgs|dist_4m_home|median_dwell|         dwell_times|      brand_same_day|    brand_same_month|  popularity_by_hour|   popularity_by_day|visitor_daytime_cbgs|android|ios|
+-------------------+-------------------+-------------------+----------+------------+--------------------+--------------------+------------+------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+-------+---+
|222-222@8fy-fkc-wx5|2021-04-01 04:00:00|2021-05-01 04:00:00|       226|         173|[13, 7, 13, 7, 2,...|{120690313114 -> .

In [22]:
def get_sg_pois(poi_path, pat_path, categories, rgn_code='IN',
                save=True, overwrite=False):
    """Build the SafeGraph POI table, enriched with monthly visit patterns.

    Args:
        poi_path: Parquet location of the SafeGraph places table.
        pat_path: Parquet location of the SafeGraph monthly patterns table.
        categories: Table with 'source', 'category' and 'fclass' columns;
            only rows whose source is "SafeGraph" are used.
        rgn_code: If a string, places are filtered to rows whose 'region'
            equals it; any non-string value disables the filter.
        save: If true, write the result to
            '../data/pois/safegraph.parquet'.
        overwrite: If false and the parquet file already exists, return the
            cached file (via ``SP.read_parquet``) instead of rebuilding.

    Returns:
        On a cache hit, the result of ``SP.read_parquet``. Otherwise the
        output of ``mk.geo.pdf2gdf`` applied to the merged table
        (presumably a GeoDataFrame of lon/lat points in CRS_DEG -- confirm).
    """
    outfile = Path('../data/pois/safegraph.parquet')
    if not overwrite and outfile.exists():
        return SP.read_parquet(outfile)

    # Places: optional region filter, then keep/rename the needed columns.
    places = SP.read_parquet(poi_path)
    if isinstance(rgn_code, str):
        places = places.filter(f'region == "{rgn_code}"')
    places = places.select(
        F.col('placekey').alias('id'),
        'name',
        'lon',
        'lat',
        F.col('top_category').alias('fclass'),
        'area_sqft',
    )

    # Patterns: keep only the visit-related columns, under shorter names.
    pattern_renames = {
        'placekey': 'id',
        'raw_visits': 'visits',
        'raw_visitors': 'visitors',
        'visitor_home_cbgs': 'home_bgs',
    }
    patterns = SP.read_parquet(pat_path)
    patterns = patterns.select(
        *[F.col(old).alias(new) for old, new in pattern_renames.items()])

    # Left join keeps places that have no pattern row for this month.
    joined = places.join(patterns, on='id', how='left')

    # The inner merge drops places whose top category is not in the
    # SafeGraph category list; 'fclass' is then replaced by 'category'.
    sg_cats = categories.query('source == "SafeGraph"').drop(columns='source')
    table = joined.toPandas().merge(sg_cats, on='fclass')
    table = table.drop(columns='fclass')
    result = mk.geo.pdf2gdf(table, 'lon', 'lat', CRS_DEG)
    if save:
        result.to_parquet(U.mkfile(outfile))
    return result

# Inputs: a places snapshot and one month of patterns under SAFEGRAPH.
poi_path = SAFEGRAPH / 'pois/us/2020-11-06.parquet'
pat_path = SAFEGRAPH / 'patterns/monthly/2021-04-01'
# overwrite is truthy here, so the table is rebuilt and the cached parquet is
# replaced on every run. The trailing `t=` note is presumably the author's
# recorded runtime (m:ss) -- confirm.
sg_pois = get_sg_pois(poi_path, pat_path, poi_classes, overwrite=1).disp() # t=0:11

                                                                                

## Combine POIs

In [11]:
def combine_pois(save=True, overwrite=False):
    """Stack the saved OSM and SafeGraph POI tables into one table.

    Reads '../data/pois/osm.parquet' and '../data/pois/safegraph.parquet'
    (written by get_osm_pois / get_sg_pois), tags each with a 'source'
    column, and concatenates them under a fresh 0..n-1 index.

    Args:
        save: If true, write the result to '../data/pois/pois.parquet'.
        overwrite: If false and the combined parquet already exists, return
            the cached file (via ``SP.read_parquet``) instead of rebuilding.

    Returns:
        On a cache hit, the result of ``SP.read_parquet``; otherwise the
        concatenated frame.
    """
    outfile = Path('../data/pois/pois.parquet')
    if not overwrite and outfile.exists():
        return SP.read_parquet(outfile)
    # Label each table with its origin before stacking them.
    tagged = [
        gpd.read_parquet('../data/pois/osm.parquet').assign(source='OSM'),
        gpd.read_parquet('../data/pois/safegraph.parquet').assign(
            source='SafeGraph'),
    ]
    pois = pd.concat(tagged)
    pois = pois.reset_index(drop=True)
    if save:
        pois.to_parquet(U.mkfile(outfile))
    return pois

# Load the combined POI table from cache when present (overwrite is falsy);
# otherwise build it from the two per-source parquet files. The trailing `t=`
# note is presumably the author's recorded runtime (m:ss) -- confirm.
pois = combine_pois(overwrite=0).disp() # t=0:03

+---------+------------+---------+------+--------------------+------+----+----+---------+------+--------+
|       id|        name| category|fclass|            geometry|source| lon| lat|area_sqft|visits|visitors|
+---------+------------+---------+------+--------------------+------+----+----+---------+------+--------+
|358649475|Black School|Education|school|[01 01 00 00 00 D...|   OSM|null|null|     null|  null|    null|
+---------+------------+---------+------+--------------------+------+----+----+---------+------+--------+
only showing top 1 row



# Visualize