# 2. Initial Data Transformation (if applying for a Data Engineering and/or Science Position)
This notebook explores the data join process in order to determine the join threshold.

In [1]:
import pandas as pd
import geopandas as gpd
import helper.functions as hf
from datetime import datetime

In [2]:
# Record when the notebook started and create the logger used by the cells below.
start = datetime.now()  # wall-clock start timestamp; not read again in the visible cells
# presumably the argument is the logger / log-file name -- confirm in helper.functions
logger = hf.make_logger('2-initial_data_transformation_explore')

In [3]:
# Load the two raw inputs and log how long the load took.
logger.info('Loading files')
load_time = datetime.now()  # start timestamp; rebound to the elapsed duration below
# read service data
# presumably returns a GeoDataFrame with a 'geometry' column (the join cell calls
# sr.sjoin and later drops 'geometry') -- confirm in helper.functions
sr = hf.load_service_data('data/raw/sr.csv.gz')
# read geojson
geo = gpd.read_file('data/raw/city-hex-polygons-8.geojson')
load_time = datetime.now() - load_time  # from here on load_time is a timedelta, not a timestamp
logger.info(f'Files loaded: {load_time}')

In [4]:
# Attach the hex polygon index to every service record with a geospatial left join,
# log how long the join took, then drop the geometry column.
logger.info("Joining files on geometry")
join_time = datetime.now()  # start timestamp; rebound to the elapsed duration below
# Tag sr with the polygons' CRS so the join compares like with like.
# NOTE(review): this assumes sr's coordinates are already expressed in geo's CRS;
# assigning .crs labels the frame, it is not a reprojection -- confirm.
sr.crs = geo.crs
hex_matches = sr.sjoin(geo, how='left')['index']  # only want the index col
# A left sjoin yields one row per (record, polygon) match, so a point that touches
# more than one hexagon shows up several times under the same index label. Assigning
# a Series with duplicate labels to a column raises a reindex error, so keep the
# first match per record. When there are no duplicates this is a no-op.
hex_matches = hex_matches[~hex_matches.index.duplicated(keep='first')]
sr['h3_level8_index'] = hex_matches
join_time = datetime.now() - join_time  # now a timedelta: elapsed join time
logger.info(f"Finished join: {join_time}")
# Geometry is not needed downstream. NOTE: once dropped, this cell presumably cannot
# be re-run without reloading sr, because the join needs the geometry column.
sr = sr.drop(['geometry'], axis=1)

## What does the join data tell us about nulls?
About 23% of the records cannot be joined to the geo data.
A number of records are also missing a `reference_number`, which is odd.

In [5]:
# Report how many records have no hex index after the join, and what share that is.
# presumably count_na returns the number of missing values in the named column
# (the recorded output shows an integer count) -- confirm in helper.functions
fails = hf.count_na(sr, 'h3_level8_index')
df_set = sr.shape[0]  # total number of records
print(f"Records failed join: {fails}")
print(f"Records total: {df_set}")
# Scale to a percentage *before* rounding: round(ratio, 2) * 100 multiplies after
# rounding and re-introduces float noise (e.g. 0.07 * 100 == 7.000000000000001).
# An empty frame reports 0.0% instead of raising ZeroDivisionError.
join_error_pct = round(fails / df_set * 100, 0) if df_set else 0.0
print(f"Join error: {join_error_pct}%")

Records failed join: 212367
Records total: 941634
Join error: 23.0%
