# Run John's simulation using my datasets on a subsample of the data

In [None]:
import pandas as pd
import dask.dataframe as dd
import dask.array as da
import numpy as np
import replicaEVSE.load_curve as sim
import os
from dask.diagnostics import ProgressBar
import joblib

%reload_ext autoreload
%autoreload 2



# Do not truncate the column list when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

# Single root directory for every input file read in this notebook.
datadir = '../../data'

# Created in the EIA_data_download.ipynb notebook
existing_load = pd.read_csv(os.path.join(datadir, 'EIA_demand_summary.csv'))

merged_ddf = dd.read_parquet(os.path.join(datadir, 'wa_pop_and_trips.parquet'))

# To work on a small in-memory sample instead of the full table:
# df = merged_ddf.head(10000)
# ddf = dd.from_pandas(df, npartitions=4)

# right now, only look at private auto trips
ddf = merged_ddf.loc[merged_ddf['mode'] == 'PRIVATE_AUTO']

# NOTE: on a Dask DataFrame the fresh index restarts at 0 inside every
# partition, so index values are not unique across the whole frame.
ddf = ddf.reset_index(drop=True)

# Optional: sort on person_id and start_time
# ddf = ddf.sort_values(by=['person_id', 'start_time', 'weekday']).reset_index(drop=True)

# Optional: materialize the filtered trips as a pandas DataFrame
# df = ddf.compute()


In [None]:
# Start a local Dask cluster and make it the default scheduler for this session.
from dask.distributed import Client, LocalCluster
cluster = LocalCluster(n_workers=8)  # Launches a scheduler and 8 workers locally
client = Client(cluster)  # Connect to distributed cluster and override default

# Register a global progress bar for Dask computations.
# NOTE(review): dask.diagnostics.ProgressBar is documented for the local
# (single-machine) schedulers; with the distributed Client active it may not
# report anything -- confirm, or use the distributed dashboard instead.
pbar = ProgressBar()
pbar.register()

In [None]:
# Row count of merged_ddf (read from wa_pop_and_trips.parquet).
# This data set only includes trips made by people who are also in the
# population data set, which means no COMMERCIAL trips.
len(merged_ddf)

In [None]:
# Show a summary of the distributed client created above.
print(client)

### Run the simulation on a small dataset to test

In [None]:
# Small in-memory pandas sample of the private-auto trips for a quick test.
# `head` reads only the first partition by default, so it can return fewer
# than 10,000 rows if that partition is small.
df = ddf.head(10000)

# this takes ~1min for 10,000 trips
out = sim.simulate_person_load(df, existing_load, 'base', managed=False)

In [None]:
# Attempt to run the simulation once per Dask partition.
# This doesn't work.
# NOTE(review): likely causes, to be confirmed:
#   * simulate_person_load appears to return a dict with 'charges' and 'loads'
#     entries (see the restack cell), whereas map_partitions presumably needs a
#     DataFrame/Series-like result per partition;
#   * no `meta` is given, so Dask has to infer the output by calling the
#     function on dummy data.
out = ddf.map_partitions(sim.simulate_person_load, existing_load, 'base', managed=False, align_dataframes=False)
out.compute()

In [None]:
# Number of pieces the trip table is split into for the parallel run.
# (np.array_split takes the number of sections, not the rows per section.)
number_of_chunks = 10000

# The simulation expects pandas DataFrames (see the parallel run below), so
# materialize the Dask frame first.
# NOTE: this loads every row of `ddf` into memory.
_trips_pdf = ddf.compute()

# Split by row position into near-equal pieces; empty pieces (possible when
# there are fewer rows than chunks) are dropped.
# NOTE(review): rows are split purely by position, so one person's trips may
# land in different chunks -- confirm the simulation tolerates that.
df_list = [
    _trips_pdf.iloc[rows]
    for rows in np.array_split(np.arange(len(_trips_pdf)), number_of_chunks)
    if len(rows)
]

In [None]:
# Fan the simulation out over all chunks with joblib (n_jobs=-1: every core).
# Each element of df_list must be a pandas DataFrame.
_simulate_chunk = joblib.delayed(sim.simulate_person_load)
charge_sims = joblib.Parallel(n_jobs=-1, verbose=10)(
    _simulate_chunk(
        df=chunk,
        existing_load=existing_load,
        simulation_id='base',
        managed=False,
    )
    for chunk in df_list
)

In [None]:
# Gather the per-chunk results back into one frame per output kind.
# Every element of charge_sims is indexed by 'charges' and by 'loads'.
charges_list = [result['charges'] for result in charge_sims]
loads_list = [result['loads'] for result in charge_sims]

charges_df = dd.concat(charges_list)
loads_df = dd.concat(loads_list)

In [None]:
# Sampling fraction; not referenced by the cells visible in this notebook.
frac = 0.001  # ~2e4

# Raw inputs: one trip table per surveyed day, plus the population table.
thu = dd.read_parquet(os.path.join(datadir, 'northwest_2021_Q4_thursday_trip.parquet'))
sat = dd.read_parquet(os.path.join(datadir, 'northwest_2021_Q4_saturday_trip.parquet'))
pop = dd.read_parquet(os.path.join(datadir, 'northwest_2021_Q4_population.parquet'))

# All trips (Thursday + Saturday); the cells below filter this by `mode`.
trips_ddf = dd.concat([thu, sat])

# NOTE: this rebinds `ddf` to the full wa_pop_and_trips table, replacing the
# PRIVATE_AUTO-only frame built at the top of the notebook.
ddf = dd.read_parquet(os.path.join(datadir, 'wa_pop_and_trips.parquet'))

# Optional in-memory sample for inspection:
# df = ddf.head(10000000)
# ddf = dd.from_pandas(df, chunksize=1000)
# df['mode'].value_counts()

In [None]:
# Number of trips in trips_ddf whose mode is PRIVATE_AUTO.
len(trips_ddf.loc[trips_ddf['mode'] == 'PRIVATE_AUTO'])

In [None]:
# Row counts: (Saturday + Thursday trips, ddf, population).
len(sat) + len(thu), len(ddf), len(pop)

In [None]:
# Hand-recorded table sizes.
# NOTE(review): only trips_len is labelled "million"; presumably the other
# values use the same unit -- confirm.
pop_len = 58.222322
trips_len = 159.624453 # million
join_len = 51.727268 
other_len = 49.674863
# NOTE(review): this overwrites the pop_len assigned at the top of this cell;
# confirm which value is intended or give the two numbers distinct names.
pop_len = 14.889896

In [None]:
# Number of distinct person_id values in the population table.
unique_people_in_pop_len = len(pop['person_id'].unique())


In [None]:
# Trips whose mode is COMMERCIAL.
trucks = trips_ddf.loc[trips_ddf['mode'] == 'COMMERCIAL']
# Every trip that is not PRIVATE_AUTO (this still includes the COMMERCIAL trips).
other = trips_ddf.loc[trips_ddf['mode'] != 'PRIVATE_AUTO']

In [None]:
# Number of non-PRIVATE_AUTO trips.
len(other)

In [None]:
# Trips per person_id among the COMMERCIAL trips.
# NOTE(review): if `trucks` is a Dask object this expression is lazy and the
# cell shows only the collection's structure; append .compute() to see counts.
trucks['person_id'].value_counts()

In [None]:
# Pull just the two needed columns into pandas before plotting: `.hist` is a
# pandas plotting method and needs concrete values, not a lazy Dask selection.
truck_distances = trucks[['vehicle_type', 'distance_miles']].compute()

# Overlay the trip-distance distributions of medium and heavy commercial
# vehicles on the same axes (alpha=0.5 keeps both visible).
truck_distances.loc[
    truck_distances['vehicle_type'] == 'MEDIUM_COMMERCIAL', 'distance_miles'
].hist(bins=100, alpha=0.5)
truck_distances.loc[
    truck_distances['vehicle_type'] == 'HEAVY_COMMERCIAL', 'distance_miles'
].hist(bins=100, log=True, alpha=0.5)

In [None]:
# Thursday trip table (the same parquet file that is loaded as `thu` above).
trips = dd.read_parquet(os.path.join(datadir, 'northwest_2021_Q4_thursday_trip.parquet'))