# Clean the data and make parquet files

 The files we were given for the northwest region from Replica have header rows embedded throughout, due to the way the Google Cloud export on their end stacks things.

 Here we convert every column to strings, remove those embedded header rows, and save the data to Parquet files, which are easier to deal with.



In [2]:
%load_ext autoreload
%autoreload 2
from replicaEVSE import datautils as du
import os
import dask.dataframe as dd
import pandas as pd

# Relative path to the directory that holds the input and output data files.
# Keep the trailing slash: some later cells append a file name directly
# (e.g. datadir+'population_counties_dataset.parquet').
datadir = '../../data/'

### path to the data

In [3]:
# Directory holding the raw CSV exports (same location as `datadir`).
datapath = '../../data/'
# Raw CSV file names: the population table and the Saturday / Thursday
# trip tables (per the file names, northwest region, 2021 Q4).
popfile = 'northwest_2021_Q4_population.csv'
tripsatfile = 'northwest_2021_Q4_saturday_trip.csv'
tripthufile = 'northwest_2021_Q4_thursday_trip.csv'

### load and convert all the data to strings

In [None]:
# Load the three raw CSVs with the project loader. The results are unpacked
# in the same order as the file names: population, Saturday trips,
# Thursday trips.
pop_df, tripsat_df, tripthu_df = (
    du.load_data(os.path.join(datapath, csv_name))
    for csv_name in (popfile, tripsatfile, tripthufile)
)

### clean the data of bad rows (embedded headers in the data) and return the dask data frames for inspection.

In [None]:
# The population table has its own cleaner; both trip tables go through the
# same trip cleaner. Each name is rebound to its cleaned frame.
pop_df = du.clean_pop_data(pop_df)
tripsat_df, tripthu_df = (
    du.clean_trip_data(raw_trips) for raw_trips in (tripsat_df, tripthu_df)
)

### Or we can use our wrapper to clean and save the data in the Parquet format to speed up analysis later

In [None]:
# Output file names for the cleaned tables; they are written to `datapath`,
# next to the raw CSVs.
popparquet = 'northwest_2021_Q4_population.parquet'
tripsatparquet = 'northwest_2021_Q4_saturday_trip.parquet'
tripthuparquet = 'northwest_2021_Q4_thursday_trip.parquet'

# Write each cleaned frame to Parquet. The result of `to_parquet` is
# deliberately NOT assigned back to the dataframe names: with the default
# compute=True it returns None, so rebinding would replace the cleaned
# frames with None and make them unusable in later cells.
pop_df.to_parquet(os.path.join(datapath, popparquet))
tripsat_df.to_parquet(os.path.join(datapath, tripsatparquet))
tripthu_df.to_parquet(os.path.join(datapath, tripthuparquet))

In [None]:
# Parquet file names, declared again in this cell.
# Note: `popparquet` is defined here but not used in this cell.
popparquet = 'northwest_2021_Q4_population.parquet'
tripthuparquet = 'northwest_2021_Q4_thursday_trip.parquet'
# Read the Thursday trip table back from Parquet as a dask dataframe.
df = dd.read_parquet(os.path.join(datapath, tripthuparquet))

# Make a joined table and save as parquet

In [4]:
# Build the joined trips + population table for trips that end in WA.

# Block-group GEOIDs from the boundary file; used below to filter trips by
# destination.
# NOTE(review): read_pickle can execute arbitrary code while loading; only
# use it on a file from a trusted source.
gdf = pd.read_pickle(datadir+'/blockgroup_boundaries.pkl')
bgrp_list = list(gdf.GEOID.values)

# Cleaned trip and population tables written earlier in this notebook.
trip_sat_ddf = dd.read_parquet(datadir+'/northwest_2021_Q4_saturday_trip.parquet')
trip_thu_ddf = dd.read_parquet(datadir+'/northwest_2021_Q4_thursday_trip.parquet')

pop_ddf = dd.read_parquet(datadir+'/northwest_2021_Q4_population.parquet')
# pop_ddf = dd.merge(pop_ddf, counties, on='person_id', how='left')

# Tag every trip with the day it came from so the rows stay distinguishable
# once the two tables are stacked.
trip_sat_ddf['weekday'] = 'saturday'
trip_thu_ddf['weekday'] = 'thursday'

# stack the two dataframes
trips = dd.concat([trip_sat_ddf, trip_thu_ddf], axis=0, keys=["saturday", "thursday"])

# only trips that end in WA
trips_ddf = trips.loc[trips['destination_bgrp'].isin(bgrp_list)]

# Left join: every trip row is kept, with the person's population
# attributes attached where a matching person_id exists.
merged_ddf = dd.merge(trips_ddf, pop_ddf, on='person_id', how='left')

# Create charge_type column from travel_purpose column: WORK and HOME are
# kept as they are, every other purpose becomes PUBLIC.
merged_ddf['charge_type'] = merged_ddf['travel_purpose'].where(
    merged_ddf['travel_purpose'].isin(['WORK', 'HOME']), 'PUBLIC')



In [5]:
# Order the rows before writing. person_id is the key that matters most:
# the table is processed in chunks later, and a person's trips need to sit
# next to each other or the chunked results come out wrong.
merged_ddf = merged_ddf.sort_values(by=['person_id', 'start_time', 'weekday'])
# Discard the old index and replace it with a fresh one after the sort.
merged_ddf = merged_ddf.reset_index(drop=True)

: 

: 

In [None]:
# Save the merged table as Parquet. overwrite=True is passed so that an
# existing dataset at this path is replaced rather than added to.
merged_ddf.to_parquet(datadir+'/wa_pop_and_trips.parquet', overwrite=True)

In [20]:
# Convert the population dataset CSV to Parquet. The id and home
# county/state columns are read as strings.
dtype_dict = dict.fromkeys(("person_id", "home_cty", "home_st"), str)
counties = dd.read_csv(
    datadir+'replica-wa_tes-04_22_23-population_dataset.csv',
    dtype=dtype_dict,
)
counties.to_parquet(
    datadir+'population_counties_dataset.parquet',
    engine='pyarrow',
    overwrite=True,
)

### make a table of just trips into WA

In [3]:

# get list of blockgroups (GEOIDs from the boundary file, used to filter
# trips by destination below)
gdf = pd.read_pickle(datadir+'/blockgroup_boundaries.pkl')
bgrp_list = list(gdf.GEOID.values)

# Cleaned Saturday and Thursday trip tables.
trip_sat_ddf = dd.read_parquet(datadir+'/northwest_2021_Q4_saturday_trip.parquet')
trip_thu_ddf = dd.read_parquet(datadir+'/northwest_2021_Q4_thursday_trip.parquet')

# dtype_dict = {"person_id": str, "home_cty": "category", "home_st": "category"}
# Note: dtype_dict is defined here but is not passed to read_parquet, and
# `counties` is not used again in this cell.
dtype_dict = {"person_id": str, "home_cty": str, "home_st": str}
counties = dd.read_parquet(datadir+'/population_counties_dataset.parquet', engine='pyarrow')

# Tag each trip with the day it came from before stacking.
trip_sat_ddf['weekday'] = 'saturday'
trip_thu_ddf['weekday'] = 'thursday'

# stack the two dataframes
trips = dd.concat([trip_sat_ddf, trip_thu_ddf], axis=0, keys=["saturday", "thursday"])


# only trips that end in WA
trips_ddf = trips.loc[trips['destination_bgrp'].isin(bgrp_list)]

In [4]:
# Row count of the WA-bound trips; the value recorded from a previous run
# is kept in the trailing comment.
trips_into_wa_len = len(trips_ddf) # = 51727268

In [8]:
# Reload the saved merged table and show its row count. It is expected to
# equal the number of WA-bound trips, since the population merge was a left
# join on the trips.
merged_ddf = dd.read_parquet(datadir+'/wa_pop_and_trips.parquet')
len(merged_ddf)

51727268

In [9]:
# Number of distinct person_id values among the WA-bound trips.
len(trips_ddf['person_id'].unique()) # = 8538399

8538399

In [12]:
# Number of trips per charge_type value (PUBLIC / HOME / WORK).
merged_ddf['charge_type'].value_counts().compute()

PUBLIC    31224030
HOME      16301817
WORK       4201421
Name: charge_type, dtype: int64

In [30]:
# Thursday trips whose mode is COMMERCIAL.
trucks = trip_thu_ddf.loc[trip_thu_ddf['mode'] == 'COMMERCIAL']

In [31]:
# Preview the first rows of the commercial trips.
trucks.head()

Unnamed: 0,activity_id,person_id,mode,travel_purpose,previous_activity_type,start_time,end_time,distance_miles,vehicle_type,origin_bgrp,...,origin_building_use_l1,origin_building_use_l2,destination_land_use_l1,destination_land_use_l2,destination_building_use_l1,destination_building_use_l2,origin_lat,origin_lng,destination_lat,destination_lng
0,13222013121352548534,742346565412155034,COMMERCIAL,COMMERCIAL,COMMERCIAL,0 days 15:40:41,0 days 16:04:17,14.393738,MEDIUM_COMMERCIAL,530630132023,...,,,,,,,47.65845,-117.06962,47.61218,-117.23991
1,14464857373953879015,1131805974781405462,COMMERCIAL,COMMERCIAL,COMMERCIAL,0 days 14:32:58,0 days 14:36:41,1.588356,MEDIUM_COMMERCIAL,160439702001,...,,,,,,,44.07032,-111.41578,44.07426,-111.44187
2,3966168606325826669,13559326651888885763,COMMERCIAL,COMMERCIAL,COMMERCIAL,0 days 10:03:19,0 days 10:06:30,0.529648,MEDIUM_COMMERCIAL,530350921001,...,,,,,,,47.50375,-122.68775,47.50282,-122.67951
3,12375184329754153291,15084998859716916562,COMMERCIAL,COMMERCIAL,COMMERCIAL,0 days 17:58:18,0 days 18:33:05,39.021394,MEDIUM_COMMERCIAL,160399602001,...,,,,,,,43.14157,-115.66474,43.54904,-116.15638
4,11314222112736317103,7254577039155896706,COMMERCIAL,COMMERCIAL,COMMERCIAL,0 days 07:41:59,0 days 08:17:52,25.556345,MEDIUM_COMMERCIAL,530110402021,...,,,,,,,45.89144,-122.51884,45.56358,-122.54219


In [33]:
# Number of distinct person_id values among the commercial trips.
trucks['person_id'].nunique().compute()

2190600

In [25]:
# Reload the Thursday trips from disk (this rebinds trip_thu_ddf, so the
# reloaded frame does not carry the 'weekday' column added in memory) and
# count the trips per vehicle_type value.
trip_thu_ddf = dd.read_parquet(datadir+'/northwest_2021_Q4_thursday_trip.parquet')
trip_thu_ddf['vehicle_type'].value_counts().compute()

nan                  55409122
MEDIUM_COMMERCIAL     1946802
HEAVY_COMMERCIAL       243798
Name: vehicle_type, dtype: int64

In [34]:
# Total number of commercial Thursday trips.
len(trucks)

2190600

In [35]:
# Thursday trips whose mode is ON_DEMAND_AUTO.
ubers = trip_thu_ddf.loc[trip_thu_ddf['mode'] == 'ON_DEMAND_AUTO']

In [36]:
# Number of on-demand auto trips per person_id.
ubers['person_id'].value_counts().compute()

990158709039785955      16
9183962885016971228     15
7236881691111520416     14
17416661576243475932    11
14826177188316444853    11
                        ..
16069901924244521885     1
16069935096932885818     1
16069955311810168656     1
1607009112529347857      1
9999937766891715451      1
Name: person_id, Length: 310740, dtype: int64