# process_vehicle_registration_transactions.ipynb
**This notebook extracts vehicle registration counts by census tract and by month from the vehicle registration transactions data file (scraped by the `scrape_veh_registration_data.ipynb` file) and stores the resulting vehicle counts in `data/vehicles/vehicle_counts.csv`.**
TODO: update text below
+ For each month and for each census tract, counts the number of "Original Registration" and "Registration Renewal" entries within the past year (365 days).
+ Likewise, for each month and for each ZIP code, counts the number of unique EV models on the market in WA until that date (month). This is taken as an estimate for the EV model variety at each point in time.
+ Stores resulting dataframe (panel data by month (January 2017 to December 2021) and by ZIP code (all ZIP codes in WA)) in `data/vehicles/ev_counts.csv`.

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import addfips
af = addfips.AddFIPS()
from uszipcode import SearchEngine
search = SearchEngine()

import timeit

import utils as u
from config.GLOBAL import *



In [2]:
df_fips = pd.read_csv("config/fips2county_tab.tsv", sep='\t', header='infer', dtype=str, encoding='latin-1', index_col=["StateAbbr", "CountyName"])
# df_fips = pd.read_csv("config/fips2county_tab.tsv", sep='\t', header='infer', dtype=str, encoding='latin-1')
df_fips

Unnamed: 0_level_0,Unnamed: 1_level_0,StateFIPS,CountyFIPS_3,StateName,CountyFIPS,STATE_COUNTY
StateAbbr,CountyName,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
AL,Autauga,01,001,Alabama,01001,AL | AUTAUGA
AL,Baldwin,01,003,Alabama,01003,AL | BALDWIN
AL,Barbour,01,005,Alabama,01005,AL | BARBOUR
AL,Bibb,01,007,Alabama,01007,AL | BIBB
AL,Blount,01,009,Alabama,01009,AL | BLOUNT
...,...,...,...,...,...,...
WY,Sweetwater,56,037,Wyoming,56037,WY | SWEETWATER
WY,Teton,56,039,Wyoming,56039,WY | TETON
WY,Uinta,56,041,Wyoming,56041,WY | UINTA
WY,Washakie,56,043,Wyoming,56043,WY | WASHAKIE


In [3]:
#not needed
state_Abbr_to_FIPS = dict()
for StateAbbr in df_fips["StateAbbr"].unique():
    state_Abbr_to_FIPS[StateAbbr] = df_fips.loc[df_fips["StateAbbr"] == StateAbbr, "StateFIPS"].value_counts().index[0]
state_Abbr_to_FIPS

KeyError: 'StateAbbr'

In [3]:
def get_countyFIPS(row):
    try:
        countyFIPS = df_fips.loc[(row["state"], row["county"]), "CountyFIPS"]
    except KeyError:
        try:
            countyFIPS = df_fips.loc[(row["state"], search.by_zipcode(row["zip_code"]).county.replace(" County", "")), "CountyFIPS"]
        except:
            countyFIPS = None
    return countyFIPS

def get_census_tract(row):
    if type(row["_2020_census_tract"]) is str and row["county_FIPS_3"] is not None:
        if len(row["_2020_census_tract"])!=6:
            print(row["_2020_census_tract"])
            a
        census_tract = "{0:s}{1:s}".format(row["county_FIPS_3"].split(".")[0], row["_2020_census_tract"])
        try:
            census_tract = int(census_tract)
        except ValueError:
            census_tract = 0
    else:
        census_tract = 0
    return census_tract

In [41]:
# multiindex = pd.MultiIndex.from_product([times, zip_codes], names=["time", "zip"])
multiindex = pd.MultiIndex.from_product([times, census_tracts_2010], names=["time", "census_tract"])
columns = ["n_veh"]
columns = ["n_veh", "n_veh_new_sales", "n_veh_used_sales", "n_veh_renewals"]

In [None]:
# create final dataframe and add county and ZIP code column to it
final_df = pd.DataFrame(index=multiindex, columns=["county", "zip_code"] + columns)
# for zip_code in zip_codes:
#     final_df.loc[(slice(None), zip_code), "county"] = search.by_zipcode(zip_code).county.replace(" County", "")
for census_tract in census_tracts:
    if census_tract in tract_to_zip.index:
        zip_code = tract_to_zip.loc[census_tract, "zip_code"]
        final_df.loc[(slice(None), census_tract), "zip_code"] = zip_code
        final_df.loc[(slice(None), census_tract), "county"] = search.by_zipcode(zip_code).county.replace(" County", "")
final_df

In [10]:
chunksize = 1e4
n_rows = 18.9e6
n_chunks = n_rows/chunksize
print(n_chunks)

1890.0


In [11]:
#run for 2021 only for now
times = [u.get_last_day_of_month(dt.datetime(year=y, month=m, day=20)) for y in range(2021,2022) for m in range(1,13)]

In [None]:
#find no. of vehicles in each census tract for each time step
final_df["n_veh"] = 0
with pd.read_csv("data/vehicles/Vehicle_Registration_Transactions_by_Department_of_Licensing.csv", chunksize=chunksize, dtype={"_2020_census_tract": str}, usecols=["start_of_month", "state", "county", "zip_code", "transaction_type", "_2020_census_tract", "vehicle_record_count"]) as reader:
    #TODO: only read necessary columns + clean up code + print runtime for different steps and optimize the worst one(s)
    for i,chunk in enumerate(reader):
        start = timeit.default_timer()

        #print progress
        if np.round(i/n_chunks, 3) % 0.01 == 0 or True:
            print(np.round(i/n_chunks, 4), "", end="")

        # convert transaction date column to date format
        chunk["start_of_month"] = pd.to_datetime(chunk["start_of_month"]).dt.date #fast

        #only run for 2021 for now
        # chunk = chunk[(chunk["start_of_month"] >= pd.to_datetime("2020-01-01").date()) and (chunk["start_of_month"] <= pd.to_datetime("2021-12-31").date())]
        # chunk = chunk[chunk["start_of_month"] <= pd.to_datetime("2021-12-31").date()]

        #add complete census_tract column
        chunk["county_FIPS_3"] = chunk.apply(get_countyFIPS, axis=1) #1.5-2s
        chunk["census_tract"] = chunk.apply(get_census_tract, axis=1) #fast (0.5s)

        for time in times: #TODO: find out how fast this is when n_veh actually contains values
            #filter data to relevant time span and registration types (0.0055s*12*10000=11min)
            chunk_time = chunk.loc[(chunk["start_of_month"] > dt.datetime(time.year-1, time.month, time.day-1).date()) & (chunk["start_of_month"] < time)]
            chunk_time = chunk_time.loc[chunk_time["transaction_type"].isin(["Original Registration", "Registration Renewal"])]

            #vehicle count (0.0015s*12*10000=3min)
            n_veh = chunk_time.groupby("census_tract")["vehicle_record_count"].sum()

            """
            start_assign = timeit.default_timer()
            if len(n_veh) > 0:
                print(final_df["n_veh"].sum())
                for census_tract in census_tracts:
                    if census_tract in n_veh.index:
                        # start = timeit.default_timer()
                        final_df.loc[(time, census_tract),"n_veh"] += n_veh[census_tract]
                        # stop = timeit.default_timer()
                        # print("{0:.4f}assign_inner".format(stop-start), "", end="")
                print(final_df["n_veh"].sum())
            stop_assign = timeit.default_timer()
            print("{0:.4f}assign".format(stop_assign-start_assign), "", end="")
            #"""

            #"""
            if len(n_veh) > 0:
                n_veh = n_veh.filter(items=census_tracts, axis=0)
                final_df.loc[(time, slice(None)), "n_veh"] = final_df["n_veh"].add(n_veh, fill_value=0)
            #"""

        stop = timeit.default_timer()
        print("{0:.1f}s,".format(stop-start), "", end="")

        # if i > 2:
        #     break
        # continue

print("done")

0.0 2.3s, 0.0005 3.0s, 0.0011 2.9s, 0.0016 2.8s, 0.0021 3.0s, 0.0026 3.6s, 0.0032 2.7s, 0.0037 1.9s, 0.0042 5.3s, 0.0048 4.4s, 0.0053 3.2s, 0.0058 2.9s, 0.0063 3.2s, 0.0069 5.2s, 0.0074 3.0s, 0.0079 2.1s, 0.0085 2.2s, 0.009 2.6s, 0.0095 2.7s, 0.0101 2.6s, 0.0106 2.7s, 0.0111 2.8s, 0.0116 3.0s, 0.0122 2.2s, 0.0127 2.3s, 0.0132 2.3s, 0.0138 3.2s, 0.0143 2.7s, 0.0148 2.6s, 0.0153 2.7s, 0.0159 2.6s, 0.0164 2.4s, 0.0169 2.2s, 0.0175 2.1s, 0.018 2.8s, 0.0185 2.6s, 0.019 2.9s, 0.0196 2.6s, 0.0201 3.2s, 0.0206 2.6s, 0.0212 2.0s, 0.0217 2.0s, 0.0222 2.2s, 0.0228 2.8s, 0.0233 2.5s, 0.0238 2.6s, 0.0243 2.6s, 0.0249 3.2s, 0.0254 2.7s, 0.0259 2.0s, 0.0265 2.0s, 0.027 2.4s, 0.0275 2.8s, 0.028 2.7s, 0.0286 2.5s, 0.0291 2.7s, 0.0296 2.6s, 0.0302 2.4s, 0.0307 2.0s, 0.0312 1.9s, 0.0317 2.5s, 0.0323 2.7s, 0.0328 2.8s, 0.0333 2.7s, 0.0339 2.7s, 0.0344 3.1s, 0.0349 5.5s, 0.0354 5.0s, 0.036 2.8s, 0.0365 4.9s, 0.037 3.7s, 0.0376 2.7s, 0.0381 2.7s, 0.0386 2.2s, 0.0392 2.0s, 0.0397 2.3s, 0.0402 2.7s, 0.0407 2.

In [None]:
#output of the fast method is floats (in the n_veh column), thus can convert to int
res = final_df["n_veh"].astype(int)

In [124]:
print(chunk)

                 start_of_month     county state  zip_code  \
190000  2021-12-01T00:00:00.000       King    WA     98101   
190001  2021-12-01T00:00:00.000     Pierce    WA     98375   
190002  2021-12-01T00:00:00.000       King    WA     98027   
190003  2021-12-01T00:00:00.000       King    WA     98122   
190004  2021-12-01T00:00:00.000    Spokane    WA     99206   
...                         ...        ...   ...       ...   
199995  2020-01-01T00:00:00.000     Pierce    WA     98374   
199996  2020-01-01T00:00:00.000    Spokane    WA     99001   
199997  2020-01-01T00:00:00.000  Snohomish    WA     98037   
199998  2020-01-01T00:00:00.000      Clark    WA     98660   
199999  2020-01-01T00:00:00.000       King    WA     98146   

                        transaction_type _2020_census_tract  \
190000              Registration Renewal             007302   
190001              Registration Renewal             073126   
190002              Registration Renewal             031912   
190

In [21]:
(final_df["n_veh"]>0).sum()
# final_df.shape

21291

In [20]:
final_df

Unnamed: 0,time,census_tract_2020,county,zip_code,n_veh
0,2015-01-31,53001950100,Adams,99169.0,0.0
1,2015-01-31,53001950200,Adams,99371.0,0.0
2,2015-01-31,53001950301,,,0.0
3,2015-01-31,53001950302,,,0.0
4,2015-01-31,53001950303,,,0.0
...,...,...,...,...,...
165907,2022-09-30,53077940003,Yakima,98903.0,0.0
165908,2022-09-30,53077940005,Yakima,98948.0,0.0
165909,2022-09-30,53077940006,Yakima,98948.0,0.0
165910,2022-09-30,53077940007,,,0.0


In [51]:
name = "veh_counts2021"
number = 4
name = "veh_counts_file%d"%number
name = "veh_counts_file%d_"%number
name

'veh_counts_file4_'

In [52]:
final_df2 = pd.read_csv("data/vehicles/%s.csv"%name, usecols=["time", "census_tract"] + columns)
# final_df2 = pd.read_csv("data/vehicles/%s_2010tracts.csv"%name, usecols=["time", "census_tract_2010", "n_veh"])
final_df2["time"] = pd.to_datetime(final_df2["time"])
final_df2 = final_df2.rename({"census_tract": "census_tract_2020"}, axis=1)
final_df2 = final_df2.set_index(["time", "census_tract_2020"])
# final_df2.index = final_df2.index.set_names("census_tract_2020", level=1)
final_df2

Unnamed: 0_level_0,Unnamed: 1_level_0,n_veh,n_veh_new_sales,n_veh_used_sales,n_veh_renewals
time,census_tract_2020,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-01-31,53001950100,0.0,0.0,0.0,0.0
2015-01-31,53001950200,0.0,0.0,0.0,0.0
2015-01-31,53001950301,0.0,0.0,0.0,0.0
2015-01-31,53001950302,0.0,0.0,0.0,0.0
2015-01-31,53001950303,0.0,0.0,0.0,0.0
...,...,...,...,...,...
2022-09-30,53077940003,2230.0,9.0,24.0,139.0
2022-09-30,53077940005,4126.0,6.0,77.0,272.0
2022-09-30,53077940006,4079.0,10.0,81.0,276.0
2022-09-30,53077940007,3566.0,8.0,34.0,229.0


In [53]:
#express in terms of 2010 census tracts as the index
final_df_2010tracts = final_df2.reset_index()
final_df_2010tracts["census_tract_2010"] = final_df_2010tracts["census_tract_2020"].map(df_tract_20_10.to_dict())

final_df_2010tracts2 = final_df_2010tracts[["time", "census_tract_2010", "census_tract_2020"] + columns].groupby(["time", "census_tract_2010"]).sum()
final_df_2010tracts2 = final_df_2010tracts2.drop("census_tract_2020", axis=1)

final_df_2010tracts2.to_csv("data/vehicles/%s_2010tracts.csv"%name)
final_df_2010tracts2

Unnamed: 0_level_0,Unnamed: 1_level_0,n_veh,n_veh_new_sales,n_veh_used_sales,n_veh_renewals
time,census_tract_2010,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2015-01-31,53001950100,0.0,0.0,0.0,0.0
2015-01-31,53001950200,0.0,0.0,0.0,0.0
2015-01-31,53001950300,0.0,0.0,0.0,0.0
2015-01-31,53001950400,0.0,0.0,0.0,0.0
2015-01-31,53001950500,0.0,0.0,0.0,0.0
...,...,...,...,...,...
2022-09-30,53077940002,4101.0,18.0,59.0,247.0
2022-09-30,53077940003,2230.0,9.0,24.0,139.0
2022-09-30,53077940004,5452.0,12.0,66.0,366.0
2022-09-30,53077940005,4126.0,6.0,77.0,272.0


In [99]:
#create empty dataframe that's supposed to hold the data summed from all 4 files
# times_data = [u.get_last_day_of_month(dt.datetime(year=y, month=m, day=20)) for y in range(2017,2023) for m in range(1,13)][:-3]
df_veh_all_files = u.create_empty_df(times_data, census_tracts_2010)
df_veh_all_files[columns] = 0
df_veh_all_files

Unnamed: 0_level_0,Unnamed: 1_level_0,county,zip_code,countyFIPS,n_veh,n_veh_new_sales,n_veh_used_sales,n_veh_renewals
time,census_tract_2010,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-01-31,53001950100,Adams,99169,53001,0,0,0,0
2017-01-31,53001950200,Adams,99371,53001,0,0,0,0
2017-01-31,53001950300,Adams,99344,53001,0,0,0,0
2017-01-31,53001950400,Adams,99344,53001,0,0,0,0
2017-01-31,53001950500,Adams,99344,53001,0,0,0,0
...,...,...,...,...,...,...,...,...
2022-09-30,53077940002,Yakima,98948,53077,0,0,0,0
2022-09-30,53077940003,Yakima,98903,53077,0,0,0,0
2022-09-30,53077940004,Yakima,98951,53077,0,0,0,0
2022-09-30,53077940005,Yakima,98948,53077,0,0,0,0


In [100]:
df_veh_all_files_empty = df_veh_all_files.copy()

In [101]:
df_veh_all_files = df_veh_all_files_empty.copy()

In [102]:
df_veh_all_files

Unnamed: 0_level_0,Unnamed: 1_level_0,county,zip_code,countyFIPS,n_veh,n_veh_new_sales,n_veh_used_sales,n_veh_renewals
time,census_tract_2010,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-01-31,53001950100,Adams,99169,53001,0,0,0,0
2017-01-31,53001950200,Adams,99371,53001,0,0,0,0
2017-01-31,53001950300,Adams,99344,53001,0,0,0,0
2017-01-31,53001950400,Adams,99344,53001,0,0,0,0
2017-01-31,53001950500,Adams,99344,53001,0,0,0,0
...,...,...,...,...,...,...,...,...
2022-09-30,53077940002,Yakima,98948,53077,0,0,0,0
2022-09-30,53077940003,Yakima,98903,53077,0,0,0,0
2022-09-30,53077940004,Yakima,98951,53077,0,0,0,0
2022-09-30,53077940005,Yakima,98948,53077,0,0,0,0


In [103]:
#sum up vehicle counts derived from file 1, 2, and 3
filenames = []
filenames += ["data/vehicles/veh_counts_file1__2010tracts.csv"] #01/2017-02/2018
filenames += ["data/vehicles/veh_counts_file2__2010tracts.csv"] #02/2018-01/2019
filenames += ["data/vehicles/veh_counts_file3__2010tracts.csv"] #01/2019-12/2019
filenames += ["data/vehicles/veh_counts_file4__2010tracts.csv"] #01/2020-present

for filename in filenames:
    print("Adding %s..."%filename)

    df_veh = pd.read_csv(filename)
    df_veh["time"] = pd.to_datetime(df_veh["time"]).dt.date
    df_veh = df_veh.set_index(["time", "census_tract_2010"])

    df_veh_all_files[columns] += df_veh[columns]

Adding data/vehicles/veh_counts_file1__2010tracts.csv...
Adding data/vehicles/veh_counts_file2__2010tracts.csv...
Adding data/vehicles/veh_counts_file3__2010tracts.csv...
Adding data/vehicles/veh_counts_file4__2010tracts.csv...


In [104]:
df_veh_all_files

Unnamed: 0_level_0,Unnamed: 1_level_0,county,zip_code,countyFIPS,n_veh,n_veh_new_sales,n_veh_used_sales,n_veh_renewals
time,census_tract_2010,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2017-01-31,53001950100,Adams,99169,53001,147.0,14.0,14.0,119.0
2017-01-31,53001950200,Adams,99371,53001,122.0,15.0,9.0,98.0
2017-01-31,53001950300,Adams,99344,53001,388.0,24.0,48.0,316.0
2017-01-31,53001950400,Adams,99344,53001,145.0,13.0,17.0,115.0
2017-01-31,53001950500,Adams,99344,53001,262.0,24.0,37.0,201.0
...,...,...,...,...,...,...,...,...
2022-09-30,53077940002,Yakima,98948,53077,4101.0,18.0,59.0,247.0
2022-09-30,53077940003,Yakima,98903,53077,2230.0,9.0,24.0,139.0
2022-09-30,53077940004,Yakima,98951,53077,5452.0,12.0,66.0,366.0
2022-09-30,53077940005,Yakima,98948,53077,4126.0,6.0,77.0,272.0


In [106]:
df_veh_all_files.to_csv("data/vehicles/veh_counts__2010tracts.csv")

In [119]:
#use 2021 vehicle counts for all other years too
times_in_2021 = [time for time in times if time.year == 2021]
# print(times_in_2021)
final_df_all = final_df2.copy()
# print(final_df_all.index[0][0].dtype)
# for year in [2015, 2016, 2017, 2018, 2019, 2022]:
for time in times:
    if time.year != 2021:
        # times_in_year = [t for t in times if t.year == year]
        print(time)
        print(type(time))
        # print(times_in_year)
        print(final_df_all)
        # print(final_df_all.loc[(times_in_year, slice(None)), "n_veh"])
        # print(final_df_all.xs(times_in_year, level="time"))
        # final_df_all.loc[(times_in_year, slice(None)), "n_veh"] = final_df2.loc[(times_in_2021, slice(None)), "n_veh"]
        time_in_2021 = [t for t in times if time.month == t.month and t.year==2021][0]
        print(time_in_2021,"j")
        print(final_df_all.loc[str(time), 53001950100])
        # p
        # idx = pd.IndexSlice
        # print(final_df_all.loc[idx[time, :], "n_veh"])
        print(final_df_all.loc[(str(time), slice(None)), "n_veh"])
        print(final_df2.loc[(str(time_in_2021), slice(None)), "n_veh"])
        # k
        # final_df_all.loc[(str(time), slice(None)), "n_veh"] = final_df2.loc[(str(time_in_2021), slice(None)), "n_veh"]
        print(final_df_all.loc[(str(time), slice(None)), "n_veh"])

        # n_veh = n_veh.filter(items=census_tracts, axis=0)
        print(final_df_all.sum())
        # final_df_all.loc[(str(time), slice(None)), "n_veh"] = final_df_all.loc[(str(time), slice(None)), "n_veh"].add(final_df2.loc[(str(time_in_2021), slice(None)), "n_veh"], fill_value=0)

        for census_tract in census_tracts_2010:
            # final_df_all.loc[(str(time), census_tract), "n_veh"] = final_df2.loc[(str(time_in_2021), census_tract), "n_veh"]
            final_df.loc[time, census_tract]["n_veh"] = final_df2.loc[str(time_in_2021), census_tract]["n_veh"]
        print(final_df_all.sum())
        print(final_df_all.loc[(str(time), slice(None)), "n_veh"])
        l
final_df_all

2015-01-31
<class 'datetime.date'>
                              n_veh
time       census_tract_2010       
2015-01-31 5.300195e+10         0.0
           5.300195e+10         0.0
           5.300195e+10         0.0
           5.300195e+10         0.0
           5.300195e+10         0.0
...                             ...
2022-09-30 5.307794e+10         0.0
           5.307794e+10         0.0
           5.307794e+10         0.0
           5.307794e+10         0.0
           5.307794e+10         0.0

[135501 rows x 1 columns]
2021-01-31 j
n_veh    0.0
Name: (2015-01-31 00:00:00, 53001950100.0), dtype: float64
time        census_tract_2010
2015-01-31  5.300195e+10         0.0
            5.300195e+10         0.0
            5.300195e+10         0.0
            5.300195e+10         0.0
            5.300195e+10         0.0
                                ... 
            5.307794e+10         0.0
            5.307794e+10         0.0
            5.307794e+10         0.0
            5.307794e+

KeyError: datetime.date(2015, 1, 31)

In [85]:
final_df_all.loc[("2015-01-31", 53001950100)]
# final_df_all.index[0][0].dtype

n_veh    0.0
Name: (2015-01-31 00:00:00, 53001950100), dtype: float64

In [None]:
#express in terms of 2010 census tracts as the index (WRONG METHOD, DO NOT USE)
final_df = pd.read_csv("data/vehicles/%s.csv"%name)
final_df = final_df.rename(columns={"census_tract": "census_tract_2020"})
final_df_2010tracts = final_df.copy()
final_df_2010tracts["census_tract_2010"] = final_df_2010tracts["census_tract_2020"].map(df_tract_20_10.to_dict())
final_df_2010tracts = final_df_2010tracts.set_index(["time", "census_tract_2010"])
final_df_2010tracts.to_csv("data/vehicles/%s_2010tracts.csv"%name)
final_df_2010tracts

In [8]:
# save resulting dataframe to file
final_df.to_csv("data/vehicles/veh_counts.csv")
# final_df.to_csv("data/vehicles/veh_counts_slow.csv")
# final_df.to_csv("data/vehicles/veh_counts_fast.csv")

In [111]:
slow = pd.read_csv("data/vehicles/veh_counts_slow.csv", index_col=("time", "census_tract"))
fast = pd.read_csv("data/vehicles/veh_counts_fast.csv", index_col=("time", "census_tract"))

In [113]:
ratio = slow["n_veh"]/fast["n_veh"]
ratio.describe()

count    8228.0
mean        1.0
std         0.0
min         1.0
25%         1.0
50%         1.0
75%         1.0
max         1.0
Name: n_veh, dtype: float64

In [115]:
(slow == fast).all()

county      False
zip_code    False
n_veh        True
dtype: bool

In [120]:
(~slow["county"].isna() == ~fast["county"].isna()).all()

True

In [116]:
slow.shape,fast.shape

((165912, 3), (165912, 3))

This concludes the main functionality of this notebook.