# Clean the data and make parquet files

 The files we were given for the northwest region from Replica have header rows embedded throughout, due to the way Google Cloud on their end stacks things.

 Here we clean the data: we convert every column to strings, remove those embedded header rows, and save the result to parquet files, which are easier to deal with.  



In [None]:
%load_ext autoreload
%autoreload 2
from replicaEVSE import datautils as du
import os
import dask.dataframe as dd
import pandas as pd

# Data directory (relative to this notebook) that the join cells below read
# their parquet/pickle inputs from and write their output to.
datadir = '../../data/'

### path to the data

In [None]:
# Directory that holds the raw Replica CSV exports.
datapath = '../../data/'

# Raw CSV file names: the population table plus one trip table per sampled day.
popfile, tripsatfile, tripthufile = (
    'northwest_2021_Q4_population.csv',
    'northwest_2021_Q4_saturday_trip.csv',
    'northwest_2021_Q4_thursday_trip.csv',
)

### load and convert all the data to strings

In [None]:
# Load the population table and both trip tables with the project loader,
# in the same order as before (population, saturday, thursday).
# NOTE(review): per the heading above, du.load_data presumably reads every
# column as a string - confirm against replicaEVSE.datautils.
pop_df, tripsat_df, tripthu_df = (
    du.load_data(os.path.join(datapath, csv_name))
    for csv_name in (popfile, tripsatfile, tripthufile)
)

### clean the data of bad rows (embedded headers in the data) and return the dask data frames for inspection.

In [None]:
# The population table has its own cleaner; both trip tables share one.
pop_df = du.clean_pop_data(pop_df)
tripsat_df, tripthu_df = (
    du.clean_trip_data(raw_trips) for raw_trips in (tripsat_df, tripthu_df)
)

### Or we can use our wrapper to clean and save the data in the parquet format to speed up analysis later

In [None]:
# Output names for the cleaned tables; they are written into the same
# directory as the raw CSVs (datapath).
popparquet = 'northwest_2021_Q4_population.parquet'
tripsatparquet = 'northwest_2021_Q4_saturday_trip.parquet'
tripthuparquet = 'northwest_2021_Q4_thursday_trip.parquet'

# Write each cleaned frame to parquet. The return value is deliberately NOT
# assigned back to the frame variables: to_parquet returns None when it writes
# to a path, so `pop_df = pop_df.to_parquet(...)` would replace the dataframes
# with None and make this cell fail with AttributeError if it is run again.
pop_df.to_parquet(os.path.join(datapath, popparquet))
tripsat_df.to_parquet(os.path.join(datapath, tripsatparquet))
tripthu_df.to_parquet(os.path.join(datapath, tripthuparquet))

In [None]:
# Parquet names produced by the cleaning step (the population name is set
# here as well, although only the thursday trips are opened in this cell).
popparquet, tripthuparquet = (
    'northwest_2021_Q4_population.parquet',
    'northwest_2021_Q4_thursday_trip.parquet',
)

# Open the cleaned thursday trips as a dask dataframe.
df = dd.read_parquet(os.path.join(datapath, tripthuparquet))

# Make a joined table and save as parquet

In [None]:
# Build one table of trips (saturday + thursday) that end in a known
# blockgroup, joined with the population attributes of the traveller, and
# save it as parquet.
#
# Paths are built with os.path.join so the trailing slash already present in
# `datadir` does not produce '//' in the middle of the path.

# get list of blockgroups
gdf = pd.read_pickle(os.path.join(datadir, 'blockgroup_boundaries.pkl'))
bgrp_list = list(gdf.GEOID.values)

trip_sat_ddf = dd.read_parquet(os.path.join(datadir, 'northwest_2021_Q4_saturday_trip.parquet'))
trip_thu_ddf = dd.read_parquet(os.path.join(datadir, 'northwest_2021_Q4_thursday_trip.parquet'))

# NOTE(review): dtype_dict and counties are not used by the active code in
# this cell; they only matter for the disabled county merge below. They are
# kept because other cells may still reference them.
# dtype_dict = {"person_id": str, "home_cty": "category", "home_st": "category"}
dtype_dict = {"person_id": str, "home_cty": str, "home_st": str}
counties = dd.read_parquet(os.path.join(datadir, 'population_counties_dataset.parquet'), engine='pyarrow')

pop_ddf = dd.read_parquet(os.path.join(datadir, 'northwest_2021_Q4_population.parquet'))
# pop_ddf = dd.merge(pop_ddf, counties, on='person_id', how='left')

# Tag every trip with the day it was sampled on so the two sources can still
# be told apart after stacking.
trip_sat_ddf['weekday'] = 'saturday'
trip_thu_ddf['weekday'] = 'thursday'

# stack the two dataframes
trips = dd.concat([trip_sat_ddf, trip_thu_ddf], axis=0, keys=["saturday", "thursday"])

# only trips that end in WA
trips_ddf = trips.loc[trips['destination_bgrp'].isin(bgrp_list)]

# Left join: every selected trip is kept, even if its person_id has no match
# in the population table.
merged_ddf = dd.merge(trips_ddf, pop_ddf, on='person_id', how='left')

# Create charge_type column from travel_purpose column: WORK and HOME are
# kept as they are, every other purpose becomes PUBLIC.
# (The original assigned the raw travel_purpose copy twice before this step;
# the redundant statement is removed.)
merged_ddf['charge_type'] = merged_ddf['travel_purpose'].where(
    merged_ddf['travel_purpose'].isin(['WORK', 'HOME']), 'PUBLIC')

merged_ddf.to_parquet(os.path.join(datadir, 'wa_pop_and_trips.parquet'))

In [None]:
### make sure the join worked
# Rebuild the inputs of the join (blockgroup list, both trip tables, the
# population table) and the filtered trips table, so the cells below can
# compare row and person counts.
#
# Paths are built with os.path.join so the trailing slash already present in
# `datadir` does not produce '//' in the middle of the path.

# get list of blockgroups
gdf = pd.read_pickle(os.path.join(datadir, 'blockgroup_boundaries.pkl'))
bgrp_list = list(gdf.GEOID.values)

trip_sat_ddf = dd.read_parquet(os.path.join(datadir, 'northwest_2021_Q4_saturday_trip.parquet'))
trip_thu_ddf = dd.read_parquet(os.path.join(datadir, 'northwest_2021_Q4_thursday_trip.parquet'))

# NOTE(review): dtype_dict and counties are not used by the active code in
# this cell; they only matter for the disabled county merge below.
# dtype_dict = {"person_id": str, "home_cty": "category", "home_st": "category"}
dtype_dict = {"person_id": str, "home_cty": str, "home_st": str}
counties = dd.read_parquet(os.path.join(datadir, 'population_counties_dataset.parquet'), engine='pyarrow')

pop_ddf = dd.read_parquet(os.path.join(datadir, 'northwest_2021_Q4_population.parquet'))
# pop_ddf = dd.merge(pop_ddf, counties, on='person_id', how='left')

# Tag every trip with the day it was sampled on.
trip_sat_ddf['weekday'] = 'saturday'
trip_thu_ddf['weekday'] = 'thursday'

# stack the two dataframes
trips = dd.concat([trip_sat_ddf, trip_thu_ddf], axis=0, keys=["saturday", "thursday"])

# only trips that end in WA
trips_ddf = trips.loc[trips['destination_bgrp'].isin(bgrp_list)]

In [None]:
# Number of rows in the filtered trips table. trips_ddf is a lazy dask
# dataframe, so len() forces the whole read/concat/filter pipeline to run.
trips_into_wa_len = len(trips_ddf)

In [None]:
# Hard-coded row count.
# NOTE(review): presumably the value obtained from an earlier run of
# len(trips_ddf), pinned here to avoid recomputing it - confirm it is still
# current whenever the input data changes.
trips_into_wa_len = 51_727_268

In [None]:
# trip persons who are also in population dataset
#
# dask's isin() does not accept another dask collection (it raises
# NotImplementedError), so the filter is written as a semi-join instead: an
# inner merge against the distinct population person_ids keeps exactly the
# trips whose person_id occurs in the population table, and adds no columns
# because the right-hand side carries only the join key.
# Row order and index of the result are not guaranteed to match trips_ddf.
pop_person_ids = pop_ddf[['person_id']].drop_duplicates()
trips_in_pop_ddf = dd.merge(trips_ddf, pop_person_ids, on='person_id', how='inner')

In [None]:
# Number of distinct person_id values in the population table; the unique
# values are materialised with .compute() and then counted.
len(pop_ddf['person_id'].unique().compute())

In [None]:
# Number of distinct person_id values among the filtered trips. There is no
# explicit .compute() here: len() on the lazy dask series triggers the
# computation itself.
len(trips_ddf['person_id'].unique())

In [None]:
# Scratch arithmetic: 8,538,399 expressed in millions (about 8.54).
# NOTE(review): presumably a person count copied from the output of one of
# the cells above - confirm which one.
8538399/1e6

In [None]:
# Number of distinct person_id values in the joined trips + population table.
# Requires merged_ddf from the join cell; len() triggers the dask computation.
len(merged_ddf['person_id'].unique())