# 2. Initial Data Transformation (if applying for a Data Engineering and/or Science Position)
This notebook explores the data join process and determines the join threshold.

In [2]:
import pandas as pd
import geopandas as gpd
from src.helper import count_na

In [3]:
# Load the service-request extract.
# Both identifier columns are read as strings; notification numbers carry
# leading zeros that a numeric dtype would strip.
col_types = dict.fromkeys(['notification_number', 'reference_number'], str)
date_cols = ['creation_timestamp', 'completion_timestamp']
sr = pd.read_csv(
    'data/raw/sr.csv.gz',
    compression='gzip',
    index_col=0,
    parse_dates=date_cols,
    dtype=col_types,
)
# Attach a point geometry built from each row's longitude/latitude pair
# (x = longitude, y = latitude). No CRS is assigned at this stage.
sr = gpd.GeoDataFrame(
    sr,
    geometry=gpd.points_from_xy(sr['longitude'], sr['latitude']),
)
# Load the hexagon polygons the requests will be matched against.
geo = gpd.read_file('data/raw/city-hex-polygons-8.geojson')

In [4]:
# join using geospatial join
# Rebuild the point layer from the coordinate columns rather than consuming
# sr's own geometry column: the cell then works whether or not `sr` still has
# a geometry column, so it can be re-run safely.
# The CRS is declared (not reprojected) to match the polygons; this is only
# valid if the polygon file uses longitude/latitude degrees — presumably
# EPSG:4326, TODO confirm.
sr_points = gpd.GeoDataFrame(
    index=sr.index,
    geometry=gpd.points_from_xy(sr['longitude'], sr['latitude']),
    crs=geo.crs,
)
# Only the hexagon id and its geometry are needed on the right-hand side.
hex_polygons = geo[['index', geo.geometry.name]]
hex_matches = sr_points.sjoin(hex_polygons, how='left')['index']
# A point lying exactly on a shared hexagon edge intersects both neighbours and
# comes back twice; keep the first match so the result aligns one-to-one with sr.
# NOTE(review): relies on sr's index (first CSV column) being unique — confirm.
hex_matches = hex_matches[~hex_matches.index.duplicated(keep='first')]
# The geometry column is not needed downstream; errors='ignore' keeps the cell
# re-runnable after it has already been dropped.
sr = sr.drop(columns=['geometry'], errors='ignore').assign(h3_level8_index=hex_matches)
# Show the requests that could not be matched to any hexagon.
sr[sr['h3_level8_index'].isna()]

Unnamed: 0,notification_number,reference_number,creation_timestamp,completion_timestamp,directorate,department,branch,section,code_group,code,cause_code_group,cause_code,official_suburb,latitude,longitude,h3_level8_index
13742,000400525315,9108302863,2020-01-23 13:28:51+02:00,2020-02-18 14:29:45+02:00,URBAN MOBILITY,Roads Infrastructure Management,RIM Area Central,District: Blaauwberg,TD Customer complaint groups,"RequestNewRoadway painted, mounted signs",,,,,,
13743,000400527116,,2020-01-30 12:46:49+02:00,2020-02-12 11:49:55+02:00,URBAN MOBILITY,Roads Infrastructure Management,RIM Area North,District : Bellville,TD Customer complaint groups,"RequestNewRoadway painted, mounted signs",,,,,,
13744,000400528840,9108378958,2020-02-06 12:29:29+02:00,2020-03-25 11:35:58+02:00,URBAN MOBILITY,Roads Infrastructure Management,RIM Area Central,District: Blaauwberg,TD Customer complaint groups,"RequestNewRoadway painted, mounted signs",,,,,,
13745,000400530412,,2020-02-12 08:38:03+02:00,NaT,,,,,TD Customer complaint groups,"RequestNewRoadway painted, mounted signs",,,,,,
13746,000400530772,,2020-02-13 09:27:42+02:00,2020-06-11 15:45:58+02:00,,,,,TD Customer complaint groups,Paint Markings Lines&Signs,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
941619,001016508411,9109974490,2020-12-31 23:03:57+02:00,2021-01-12 11:57:59+02:00,ENERGY,Electricity Generation and Distribution,Electricity Retail Management,Customer Support Services and Rev Man,ELECTRICITY TECHNICAL COMPLAINTS,Street Lights - Single Light Out,,,,,,
941620,001016508412,9109974479,2020-12-31 22:25:58+02:00,2020-12-31 22:37:41+02:00,WATER AND SANITATION,Distribution Services,Reticulation,Reticulation Water Distribution,WATER,Burst Pipe,,,,,,
941622,001016508415,9109974482,2020-12-31 22:29:04+02:00,2021-01-05 13:43:00+02:00,URBAN WASTE MANAGEMENT,Solid Waste Management,Collections,Collections,SOLID WASTE,Non-Collection of 240L Bin,,,,,,
941628,001016508424,9109974515,2020-12-31 23:24:26+02:00,2021-01-01 08:56:05+02:00,WATER AND SANITATION,Distribution Services,Reticulation,Reticulation WW Conveyance,SEWER,Sewer: Blocked/Overflow,,,,,,


## What does the join data tell us about nulls?
About 23% of the records cannot be joined to the geo data; the unmatched rows shown above have no latitude/longitude.
There are also a number of records missing a `reference_number`, which is odd.

In [5]:
# Summarise how many service requests were left without a hexagon id.
# count_na is a project helper; presumably it returns the number of null
# values in the named column — verify against src.helper.
fails = count_na(sr, 'h3_level8_index')
df_set = sr.shape[0]
print(f"Records failed join: {fails}")
print(f"Records total: {df_set}")
# Scale to a percentage *before* rounding: round(ratio, 2) * 100 multiplies
# after rounding and re-introduces binary floating-point noise
# (e.g. round(0.07, 2) * 100 == 7.000000000000001).
print(f"Join error: {round(fails / df_set * 100, 0)}%")

Records failed join: 212367
Records total: 941634
Join error: 23.0%
