# combine_data_sources.ipynb
**Combines the different data sources to create one dataframe containing all data (EV registrations, EV product variety, gas prices, EV charging locations, ...) by month and ZIP code and saves the resulting dataframe in `data/df.csv`.**

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from uszipcode import SearchEngine
search = SearchEngine()

import utils as u
from config.GLOBAL import *



In [5]:
times = [u.get_last_day_of_month(dt.datetime(year=y, month=m, day=20)) for y in range(2017,2023) for m in range(1,13)][:-3]
times = [u.get_last_day_of_month(dt.datetime(year=y, month=m, day=20)) for y in range(2018,2023) for m in range(1,13)][:-3]
# times

In [17]:
df_empty = u.create_empty_df(times, census_tracts_2010, "census_tract_2010")
df_empty

Unnamed: 0_level_0,Unnamed: 1_level_0,county,zip_code,countyFIPS
time,census_tract_2010,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-01-31,53001950100,Adams,99169,53001
2018-01-31,53001950200,Adams,99371,53001
2018-01-31,53001950300,Adams,99344,53001
2018-01-31,53001950400,Adams,99344,53001
2018-01-31,53001950500,Adams,99344,53001
...,...,...,...,...
2022-09-30,53077940002,Yakima,98948,53077
2022-09-30,53077940003,Yakima,98903,53077
2022-09-30,53077940004,Yakima,98951,53077
2022-09-30,53077940005,Yakima,98948,53077


In [18]:
df = df_empty.copy()

In [28]:
df.to_csv("data/index_test_slow.csv")

In [6]:
df.shape

(135594, 2)

In [19]:
# n_ev (number of registered EVs), m_ev (number of different EV make/models ["product variety"])
# df_ev = pd.read_csv("data/vehicles/ev_counts.csv")
# df_ev = pd.read_csv("data/vehicles/ev_counts_2010tracts.csv")
# df_ev = pd.read_csv("data/vehicles/ev_counts_2010tracts_to_2011.csv")
df_ev = pd.read_csv("data/vehicles/ev_counts__2010tracts.csv")
df_ev = df_ev.convert_dtypes()
df_ev["time"] = pd.to_datetime(df_ev["time"]).dt.date

df_ev = df_ev.set_index(["time", "census_tract_2010"])
df_ev.index.dtypes[1] = "int64"

# print(df)
# print(df_ev)
# h

##### df = df.merge(df_ev, on=["time", "zip"], how="inner", suffixes=(None,"_y"))
df = df.join(df_ev, on=["time", "census_tract_2010"], how="inner", rsuffix="_y")
# df = df.drop(columns="zip_code_y") #the county column appears twice, so removing the duplicate he
# df = df.drop(columns="county_y") #the county column appears twice, so removing the duplicate he

# alternative way (instead of using join):
# for time in times:
#     print(time)
#     for zip_code in zip_codes:
#         ev_counts = df_ev.loc[(time, zip_code), cols]
#         for col in cols:
#             if not np.isnan(ev_counts[col]):
#                 df.loc[(time, zip_code), col] = ev_counts[col]

print("done")

done


In [20]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,county,zip_code,countyFIPS,n_ev,n_bev,n_phev,n_ev_new_sales,n_bev_new_sales,n_phev_new_sales,n_ev_used_sales,n_bev_used_sales,n_phev_used_sales,n_ev_renewals,n_bev_renewals,n_phev_renewals,m_ev,m_bev,m_phev
time,census_tract_2010,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2018-01-31,53001950100,Adams,99169,53001,8,4,4,8,4,4,8,4,4,8,4,4,64,29,37
2018-01-31,53001950200,Adams,99371,53001,0,0,0,0,0,0,0,0,0,0,0,0,64,29,37
2018-01-31,53001950300,Adams,99344,53001,6,0,6,6,0,6,6,0,6,6,0,6,64,29,37
2018-01-31,53001950400,Adams,99344,53001,0,0,0,0,0,0,0,0,0,0,0,0,64,29,37
2018-01-31,53001950500,Adams,99344,53001,3,3,0,3,3,0,3,3,0,3,3,0,64,29,37
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-09-30,53077940002,Yakima,98948,53077,6,4,2,6,4,2,6,4,2,6,4,2,121,60,64
2022-09-30,53077940003,Yakima,98903,53077,6,2,4,6,2,4,6,2,4,6,2,4,121,60,64
2022-09-30,53077940004,Yakima,98951,53077,2,2,0,2,2,0,2,2,0,2,2,0,121,60,64
2022-09-30,53077940005,Yakima,98948,53077,2,2,0,2,2,0,2,2,0,2,2,0,121,60,64


In [21]:
#n_veh (number of registered light-duty vehicles)
# df_veh = pd.read_csv("data/vehicles/ev_counts.csv")
# df_veh = pd.read_csv("data/vehicles/veh_counts2021_2010tracts.csv")
# df_veh = pd.read_csv("data/vehicles/veh_counts2021_2010tracts_all_years.csv")
# df_veh = pd.read_csv("data/vehicles/veh_counts2021_2010tracts_all_years_to_2011.csv")
df_veh = pd.read_csv("data/vehicles/veh_counts__2010tracts.csv")
df_veh = df_veh.convert_dtypes()
df_veh["time"] = pd.to_datetime(df_veh["time"]).dt.date

df_veh = df_veh.set_index(["time", "census_tract_2010"])
df_veh.index.dtypes[1] = "int64"

##### df = df.merge(df_veh, on=["time", "zip"], how="inner", suffixes=(None,"_y"))
df = df.join(df_veh, on=["time", "census_tract_2010"], how="inner", rsuffix="_y")
# df = df.drop(columns="zip_code_y") #the county column appears twice, so removing the duplicate he
# df = df.drop(columns="county_y") #the county column appears twice, so removing the duplicate he

# alternative way (instead of using join):
# for time in times:
#     print(time)
#     for zip_code in zip_codes:
#         ev_counts = df_veh.loc[(time, zip_code), cols]
#         for col in cols:
#             if not np.isnan(ev_counts[col]):
#                 df.loc[(time, zip_code), col] = ev_counts[col]

print("done")

done


In [22]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,county,zip_code,countyFIPS,n_ev,n_bev,n_phev,n_ev_new_sales,n_bev_new_sales,n_phev_new_sales,n_ev_used_sales,...,m_ev,m_bev,m_phev,county_y,zip_code_y,countyFIPS_y,n_veh,n_veh_new_sales,n_veh_used_sales,n_veh_renewals
time,census_tract_2010,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2018-01-31,53001950100,Adams,99169,53001,8,4,4,8,4,4,8,...,64,29,37,Adams,99169,53001,2233,12,19,126
2018-01-31,53001950200,Adams,99371,53001,0,0,0,0,0,0,0,...,64,29,37,Adams,99371,53001,1452,4,10,85
2018-01-31,53001950300,Adams,99344,53001,6,0,6,6,0,6,6,...,64,29,37,Adams,99344,53001,6699,29,80,359
2018-01-31,53001950400,Adams,99344,53001,0,0,0,0,0,0,0,...,64,29,37,Adams,99344,53001,2337,11,22,128
2018-01-31,53001950500,Adams,99344,53001,3,3,0,3,3,0,3,...,64,29,37,Adams,99344,53001,4372,20,41,257
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-09-30,53077940002,Yakima,98948,53077,6,4,2,6,4,2,6,...,121,60,64,Yakima,98948,53077,4101,18,59,247
2022-09-30,53077940003,Yakima,98903,53077,6,2,4,6,2,4,6,...,121,60,64,Yakima,98903,53077,2230,9,24,139
2022-09-30,53077940004,Yakima,98951,53077,2,2,0,2,2,0,2,...,121,60,64,Yakima,98951,53077,5452,12,66,366
2022-09-30,53077940005,Yakima,98948,53077,2,2,0,2,2,0,2,...,121,60,64,Yakima,98948,53077,4126,6,77,272


In [23]:
# gas_price
# df_gas = pd.read_csv("data/gas/Weekly_Retail_Gasoline_and_Diesel_Prices.csv", header=6)#, parse_dates=["Month"])
df_gas = pd.read_excel("data/gas/PET_PRI_GND_DCUS_SWA_M.xls", sheet_name="Data 1", header=2)#, parse_dates=["Month"])
df_gas = df_gas.rename(columns={"Date" : "time"})

df_gas["time"] = pd.to_datetime(df_gas["time"])
df_gas["time"] = df_gas["time"].apply(u.get_last_day_of_month)
df_gas = df_gas.set_index("time")

for time in times:
    # gas_price = df_gas.loc[time, "Washington All Grades All Formulations Retail Gasoline Prices $/gal"]#.values[0]
    gas_price = df_gas.loc[time, "Washington All Grades All Formulations Retail Gasoline Prices (Dollars per Gallon)"]#.values[0]
    df.loc[(time, slice(None)), "gas_price"] = gas_price
df

Unnamed: 0_level_0,Unnamed: 1_level_0,county,zip_code,countyFIPS,n_ev,n_bev,n_phev,n_ev_new_sales,n_bev_new_sales,n_phev_new_sales,n_ev_used_sales,...,m_bev,m_phev,county_y,zip_code_y,countyFIPS_y,n_veh,n_veh_new_sales,n_veh_used_sales,n_veh_renewals,gas_price
time,census_tract_2010,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2018-01-31,53001950100,Adams,99169,53001,8,4,4,8,4,4,8,...,29,37,Adams,99169,53001,2233,12,19,126,2.966
2018-01-31,53001950200,Adams,99371,53001,0,0,0,0,0,0,0,...,29,37,Adams,99371,53001,1452,4,10,85,2.966
2018-01-31,53001950300,Adams,99344,53001,6,0,6,6,0,6,6,...,29,37,Adams,99344,53001,6699,29,80,359,2.966
2018-01-31,53001950400,Adams,99344,53001,0,0,0,0,0,0,0,...,29,37,Adams,99344,53001,2337,11,22,128,2.966
2018-01-31,53001950500,Adams,99344,53001,3,3,0,3,3,0,3,...,29,37,Adams,99344,53001,4372,20,41,257,2.966
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-09-30,53077940002,Yakima,98948,53077,6,4,2,6,4,2,6,...,60,64,Yakima,98948,53077,4101,18,59,247,4.644
2022-09-30,53077940003,Yakima,98903,53077,6,2,4,6,2,4,6,...,60,64,Yakima,98903,53077,2230,9,24,139,4.644
2022-09-30,53077940004,Yakima,98951,53077,2,2,0,2,2,0,2,...,60,64,Yakima,98951,53077,5452,12,66,366,4.644
2022-09-30,53077940005,Yakima,98948,53077,2,2,0,2,2,0,2,...,60,64,Yakima,98948,53077,4126,6,77,272,4.644


In [32]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,county,zip_code,n_ev,n_bev,n_phev,m_ev,m_bev,m_phev,n_veh,gas_price
time,census_tract_2010,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2011-01-31,53001950100,Adams,99169,0,0,0,7,6,1,2066,3.293
2011-01-31,53001950200,Adams,99371,0,0,0,7,6,1,1439,3.293
2011-01-31,53001950300,Adams,99344,0,0,0,7,6,1,6123,3.293
2011-01-31,53001950400,Adams,99344,0,0,0,7,6,1,2039,3.293
2011-01-31,53001950500,Adams,99344,0,0,0,7,6,1,4125,3.293
...,...,...,...,...,...,...,...,...,...,...,...
2022-09-30,53077940002,Yakima,98948,6,4,2,121,60,64,3548,4.644
2022-09-30,53077940003,Yakima,98903,4,1,3,121,60,64,2279,4.644
2022-09-30,53077940004,Yakima,98951,2,2,0,121,60,64,4483,4.644
2022-09-30,53077940005,Yakima,98948,1,1,0,121,60,64,3230,4.644


In [33]:
df.shape

(205437, 10)

In [24]:
#evse
#number of installed L2 and DCFC station locations by zip code and month
# df_evse = pd.read_csv("data/evse/EV_charging_stations_WA.csv")
# df_evse = pd.read_csv("data/evse/evse.csv")
df_evse = pd.read_csv("data/evse/evse_2010tracts.csv")
# df_evse_key = pd.read_csv("config/EV_charging_stations_key.csv")
# df_evse = df_evse.drop("Unnamed: 0", axis=1)
# print(df_evse.columns)
# df_evse.columns = df_evse_key["name"].to_list() + ["census_tract_2020"]
# df_evse.columns = df_evse_key["name"].to_list() + ["census_tract_2020", "census_tract_2010"]
# print(df_evse.columns)
# k

# print(df_evse["open_date"].min())
# df_evse = df_evse[df_evse["open_date"] != "0022-07-22"]
# df_evse["open_date"] = pd.to_datetime(df_evse["open_date"]).dt.date
# # j
# df_evse["l2_count"] = df_evse["l2_count"].fillna(0).astype(int)
# df_evse["dcfc_count"] = df_evse["dcfc_count"].fillna(0).astype(int)

# df_evse["has_l2_or_dcfc"] = df_evse["l2_count"] > 0 #boolean if location has at least one L2 charger
# df_evse["has_l2_or_dcfc"] = (df_evse["l2_count"] > 0) | (df_evse["dcfc_count"] > 0) #boolean if location has at least one L2 or DCFC charger

df_evse = df_evse.convert_dtypes()
df_evse["time"] = pd.to_datetime(df_evse["time"]).dt.date

df_evse = df_evse.set_index(["time", "census_tract_2010"])
df_evse.index.dtypes[1] = "int64"

##### df = df.merge(df_evse, on=["time", "zip"], how="inner", suffixes=(None,"_y"))
df = df.join(df_evse, on=["time", "census_tract_2010"], how="inner", rsuffix="_y")
# df = df.drop(columns="zip_code_y") #the county column appears twice, so removing the duplicate he
# df = df.drop(columns="county_y") #the county column appears twice, so removing the duplicate he

# print(df.head())

"""
df["n_evse"] = 0
for time in times:
    print(time, "", end="")
    # for zip_code in zip_codes:
    for census_tract in census_tracts_2010:
        # dff = df_evse[(df_evse["open_date"] < time) & (df_evse["zip"] == zip_code)] #create helper dataframe containing all stations built in this zip code until this time
        dff = df_evse[(df_evse["open_date"] < time) & (df_evse["census_tract_2010"] == census_tract)] #create helper dataframe containing all stations built in this census tract until this time
        n_evse = sum(dff["has_l2_or_dcfc"]) #counts number of locations with at least one charging station in that ZIP code and for that month
        df.loc[(time, zip_code), "n_evse"] = n_evse
#"""
print("done")

done


In [25]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,county,zip_code,countyFIPS,n_ev,n_bev,n_phev,n_ev_new_sales,n_bev_new_sales,n_phev_new_sales,n_ev_used_sales,...,m_phev,county_y,zip_code_y,countyFIPS_y,n_veh,n_veh_new_sales,n_veh_used_sales,n_veh_renewals,gas_price,n_evse
time,census_tract_2010,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2018-01-31,53001950100,Adams,99169,53001,8,4,4,8,4,4,8,...,37,Adams,99169,53001,2233,12,19,126,2.966,1
2018-01-31,53001950200,Adams,99371,53001,0,0,0,0,0,0,0,...,37,Adams,99371,53001,1452,4,10,85,2.966,0
2018-01-31,53001950300,Adams,99344,53001,6,0,6,6,0,6,6,...,37,Adams,99344,53001,6699,29,80,359,2.966,0
2018-01-31,53001950400,Adams,99344,53001,0,0,0,0,0,0,0,...,37,Adams,99344,53001,2337,11,22,128,2.966,0
2018-01-31,53001950500,Adams,99344,53001,3,3,0,3,3,0,3,...,37,Adams,99344,53001,4372,20,41,257,2.966,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-09-30,53077940002,Yakima,98948,53077,6,4,2,6,4,2,6,...,64,Yakima,98948,53077,4101,18,59,247,4.644,0
2022-09-30,53077940003,Yakima,98903,53077,6,2,4,6,2,4,6,...,64,Yakima,98903,53077,2230,9,24,139,4.644,0
2022-09-30,53077940004,Yakima,98951,53077,2,2,0,2,2,0,2,...,64,Yakima,98951,53077,5452,12,66,366,4.644,2
2022-09-30,53077940005,Yakima,98948,53077,2,2,0,2,2,0,2,...,64,Yakima,98948,53077,4126,6,77,272,4.644,0


In [36]:
df.shape

(205437, 11)

In [26]:
df_no_census = df.copy()

In [None]:
df = df_no_census.copy()

In [34]:
len(census_tracts_2010),df.index.levshape

(1458, (93, 1457))

In [13]:
df = empty_df.copy()
df.to_csv("data/census/empty_df.csv")

In [27]:
#census data (if file all_acs_2011_to_2022.csv is constructed already)
df_census = pd.read_csv("data/census/all_acs_2011_to_2022.csv")

# print(df_evse["open_date"].min())
# df_evse = df_evse[df_evse["open_date"] != "0022-07-22"]
# df_evse["open_date"] = pd.to_datetime(df_evse["open_date"]).dt.date
# # j
# df_evse["l2_count"] = df_evse["l2_count"].fillna(0).astype(int)
# df_evse["dcfc_count"] = df_evse["dcfc_count"].fillna(0).astype(int)

# df_evse["has_l2_or_dcfc"] = df_evse["l2_count"] > 0 #boolean if location has at least one L2 charger
# df_evse["has_l2_or_dcfc"] = (df_evse["l2_count"] > 0) | (df_evse["dcfc_count"] > 0) #boolean if location has at least one L2 or DCFC charger

df_census = df_census.convert_dtypes()
df_census["time"] = pd.to_datetime(df_census["time"]).dt.date

df_census = df_census.set_index(["time", "census_tract_2010"])
df_census.index.dtypes[1] = "int64"
# print(df_census)
# o

##### df = df.merge(df_evse, on=["time", "zip"], how="inner", suffixes=(None,"_y"))
df = df.join(df_census, on=["time", "census_tract_2010"], how="inner", rsuffix="_y")
df = df.drop(columns="zip_code_y") #the county column appears twice, so removing the duplicate he
df = df.drop(columns="county_y") #the county column appears twice, so removing the duplicate he

# print(df.head())

"""
df["n_evse"] = 0
for time in times:
    print(time, "", end="")
    # for zip_code in zip_codes:
    for census_tract in census_tracts_2010:
        # dff = df_evse[(df_evse["open_date"] < time) & (df_evse["zip"] == zip_code)] #create helper dataframe containing all stations built in this zip code until this time
        dff = df_evse[(df_evse["open_date"] < time) & (df_evse["census_tract_2010"] == census_tract)] #create helper dataframe containing all stations built in this census tract until this time
        n_evse = sum(dff["has_l2_or_dcfc"]) #counts number of locations with at least one charging station in that ZIP code and for that month
        df.loc[(time, zip_code), "n_evse"] = n_evse
#"""
print("done")

done


In [28]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,county,zip_code,countyFIPS,n_ev,n_bev,n_phev,n_ev_new_sales,n_bev_new_sales,n_phev_new_sales,n_ev_used_sales,...,n_evse,n_total_pop,n_white,n_bachelor,n_workers_16plus,n_drove_alone,median_hh_inc,n_units_tot,n_units_1detached,n_units_1attached
time,census_tract_2010,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2018-01-31,53001950100,Adams,99169,53001,8,4,4,8,4,4,8,...,1,2493,2226,286,1119,854,47813,1187,974,11
2018-01-31,53001950200,Adams,99371,53001,0,0,0,0,0,0,0,...,0,1705,1555,162,666,443,49261,807,640,5
2018-01-31,53001950300,Adams,99344,53001,6,0,6,6,0,6,6,...,0,6884,3913,252,2508,1820,44107,1949,811,12
2018-01-31,53001950400,Adams,99344,53001,0,0,0,0,0,0,0,...,0,3399,2347,173,1304,908,47036,1020,607,69
2018-01-31,53001950500,Adams,99344,53001,3,3,0,3,3,0,3,...,0,4971,3440,119,2073,1522,54223,1564,1121,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-09-30,53077940002,Yakima,98948,53077,6,4,2,6,4,2,6,...,0,4731,2076,146,1455,1266,51950,1323,1051,38
2022-09-30,53077940003,Yakima,98903,53077,6,2,4,6,2,4,6,...,0,3542,1637,283,1180,984,51151,1112,772,5
2022-09-30,53077940004,Yakima,98951,53077,2,2,0,2,2,0,2,...,2,6124,3289,111,2395,1759,43721,1786,1186,44
2022-09-30,53077940005,Yakima,98948,53077,2,2,0,2,2,0,2,...,0,4727,2843,155,1782,1351,54397,1412,1037,23


In [12]:
#census data
df = empty_df.copy()
a = [time.year for time in times for census_tract in census_tracts_2010]
# print(a)
# df["year"] = [time.year for time in times for census_tract in census_tracts_2010]

#create year column
df = df.reset_index()
df["time"] = pd.to_datetime(df["time"])#.dt.date
df["year"] = df["time"].dt.year
df["time"] = pd.to_datetime(df["time"]).dt.date
df = df.set_index(["time", "census_tract_2010"])
# l
vars = ["n_total_pop", "n_white", "n_bachelor", "n_workers_16plus", "n_drove_alone", "median_hh_inc", "n_units_tot", "n_units_1detached", "n_units_1attached"]
df[vars] = 0
# years = range(2015,2020)
years = range(2011,2020)
for year in years:
    print(year)
    df_census = pd.read_csv("data/census/all_acs_by_tract_%d.csv"%year)

    df_census = df_census.rename(columns={"GEOID": "census_tract_2010"})

    df_census = df_census.set_index("census_tract_2010")

    # print(df_census.columns)
    # print(vars)
    # print(df["n_total_pop"])
    # print(df.head())
    # df.loc[df["year"] == year, vars] = df.loc[df["year"] == year, vars].join(df_census, on="census_tract_2010", how="inner", rsuffix="_y")
    # print(df.head())

    ##### df = df.merge(df_ev, on=["time", "zip"], how="inner", suffixes=(None,"_y"))
    # df = df.join(df_ev, on=["time", "zip"], how="inner", rsuffix="_y")
    # df = df.join(df_ev, on=["time", "census_tract"], how="inner", rsuffix="_y")
    # df = df.drop(columns="county_y") #the county column appears twice, so removing the duplicate he

    # """
    # alternative way (instead of using join):
    for time in times:
        if time.year == year:
            print(time)
            # df.loc[df["year"] == year, vars] = df.loc[df["year"] == year, vars].join(df_census, on="census_tract_2010", how="inner", rsuffix="_y")
            print(df.loc[(time, slice(None))])
            df.loc[(time, slice(None))] = df.loc[(time, slice(None))].join(df_census, on="census_tract_2010", how="inner", rsuffix="_y")
            print(print(df.loc[(time, slice(None))]))
            # print(df.head())
            k

            # for census_tract in census_tracts_2010:
                # print(census_tract)
                # df.loc[(time, census_tract), vars] = df_census.loc[census_tract]
                # ev_counts = df_ev.loc[(time, zip_code), cols]
                # for col in cols:
                #     if not np.isnan(ev_counts[col]):
                #         df.loc[(time, zip_code), col] = ev_counts[col]
    #"""

print("done")

2011
2011-01-31
                   county zip_code  year  n_total_pop  n_white  n_bachelor  \
census_tract_2010                                                            
53001950100         Adams    99169  2011            0        0           0   
53001950200         Adams    99371  2011            0        0           0   
53001950300         Adams    99344  2011            0        0           0   
53001950400         Adams    99344  2011            0        0           0   
53001950500         Adams    99344  2011            0        0           0   
...                   ...      ...   ...          ...      ...         ...   
53077940002        Yakima    98948  2011            0        0           0   
53077940003        Yakima    98903  2011            0        0           0   
53077940004        Yakima    98951  2011            0        0           0   
53077940005        Yakima    98948  2011            0        0           0   
53077940006        Yakima    98948  2011        

NameError: name 'k' is not defined

In [29]:
df

Unnamed: 0_level_0,Unnamed: 1_level_0,county,zip_code,countyFIPS,n_ev,n_bev,n_phev,n_ev_new_sales,n_bev_new_sales,n_phev_new_sales,n_ev_used_sales,...,n_evse,n_total_pop,n_white,n_bachelor,n_workers_16plus,n_drove_alone,median_hh_inc,n_units_tot,n_units_1detached,n_units_1attached
time,census_tract_2010,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2018-01-31,53001950100,Adams,99169,53001,8,4,4,8,4,4,8,...,1,2493,2226,286,1119,854,47813,1187,974,11
2018-01-31,53001950200,Adams,99371,53001,0,0,0,0,0,0,0,...,0,1705,1555,162,666,443,49261,807,640,5
2018-01-31,53001950300,Adams,99344,53001,6,0,6,6,0,6,6,...,0,6884,3913,252,2508,1820,44107,1949,811,12
2018-01-31,53001950400,Adams,99344,53001,0,0,0,0,0,0,0,...,0,3399,2347,173,1304,908,47036,1020,607,69
2018-01-31,53001950500,Adams,99344,53001,3,3,0,3,3,0,3,...,0,4971,3440,119,2073,1522,54223,1564,1121,26
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-09-30,53077940002,Yakima,98948,53077,6,4,2,6,4,2,6,...,0,4731,2076,146,1455,1266,51950,1323,1051,38
2022-09-30,53077940003,Yakima,98903,53077,6,2,4,6,2,4,6,...,0,3542,1637,283,1180,984,51151,1112,772,5
2022-09-30,53077940004,Yakima,98951,53077,2,2,0,2,2,0,2,...,2,6124,3289,111,2395,1759,43721,1786,1186,44
2022-09-30,53077940005,Yakima,98948,53077,2,2,0,2,2,0,2,...,0,4727,2843,155,1782,1351,54397,1412,1037,23


In [30]:
#TODO: drop the one 2010 census tract that has not been mapped to
# df.drop(...)
#save resulting dataframe as csv file
# df.to_csv("data/data.csv")
# df.to_csv("data/data_no_census.csv")
# df.to_csv("data/data_2011_to_2022.csv")
df.to_csv("data/data__2018_to_2022.csv")

In [49]:
#investigations
df = pd.read_csv("data/data_2011_to_2022.csv")#, index_col=["time", "census_tract_2010"])
# df = pd.read_csv("data/data_2011_to_2022_R.csv")#, index_col=["time", "census_tract_2010"])
df

Unnamed: 0,time,census_tract_2010,county,zip_code,n_ev,n_bev,n_phev,m_ev,m_bev,m_phev,...,n_evse,n_total_pop,n_white,n_bachelor,n_workers_16plus,n_drove_alone,median_hh_inc,n_units_tot,n_units_1detached,n_units_1attached
0,2011-01-31,53001950100,Adams,99169.0,0,0,0,7,6,1,...,0,2554,2376,147,1005,698,43575.0,1132,884,37
1,2011-01-31,53001950200,Adams,99371.0,0,0,0,7,6,1,...,0,1644,1385,225,702,503,40395.0,828,656,0
2,2011-01-31,53001950300,Adams,99344.0,0,0,0,7,6,1,...,0,6704,3376,153,2434,1848,42393.0,1854,646,0
3,2011-01-31,53001950400,Adams,99344.0,0,0,0,7,6,1,...,0,2839,1829,158,1061,665,41821.0,870,514,28
4,2011-01-31,53001950500,Adams,99344.0,0,0,0,7,6,1,...,0,4548,2350,100,1629,1248,35305.0,1526,1082,27
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
205432,2022-09-30,53077940002,Yakima,98948.0,6,4,2,121,60,64,...,0,4731,2076,146,1455,1266,51950.0,1323,1051,38
205433,2022-09-30,53077940003,Yakima,98903.0,4,1,3,121,60,64,...,0,3542,1637,283,1180,984,51151.0,1112,772,5
205434,2022-09-30,53077940004,Yakima,98951.0,2,2,0,121,60,64,...,2,6124,3289,111,2395,1759,43721.0,1786,1186,44
205435,2022-09-30,53077940005,Yakima,98948.0,1,1,0,121,60,64,...,0,4727,2843,155,1782,1351,54397.0,1412,1037,23


In [31]:
to_exclude = set([*census_tracts_2010_0n_veh, *census_tracts_2010_less_than_100n_veh, *census_tracts_2010_missing_ACS_data])
census_tracts_2010_2 = [x for x in census_tracts_2010 if x not in to_exclude]
# census_tracts_2010_2 = [x for x in census_tracts_2010 if x not in census_tracts_2010_0n_veh]
# census_tracts_2010_2 = [x for x in census_tracts_2010 if x not in census_tracts_2010_0n_veh]
len(census_tracts_2010_2)

1442

In [51]:
c = 0
for census_tract_2010 in census_tracts_2010:
    times_for_tract = df.loc[df["census_tract_2010"] == census_tract_2010]
    n_times = len(times_for_tract)
    if n_times != 141:
        print(census_tract_2010, n_times)
        # print(n_times, "", end="")
        c += 1
    # print()
c

53009000200 0


1

In [50]:
len(df["census_tract_2010"].unique())
# len(census_tracts_2010)

1457

In [59]:
preds = pd.read_csv("results/preds.csv")
res = pd.read_csv("results/res.csv")
preds = preds.rename(columns={"Unnamed: 0" : "tract-time"})
res = res.rename(columns={"Unnamed: 0" : "tract-time"})
preds = preds.set_index("tract-time")
res = res.set_index("tract-time")

In [60]:
res

Unnamed: 0_level_0,x
tract-time,Unnamed: 1_level_1
53001950100-2011-01-31,1.884832
53001950100-2011-02-28,1.850987
53001950100-2011-03-31,1.822081
53001950100-2011-04-30,1.716856
53001950100-2011-05-31,1.536090
...,...
53077940006-2022-05-31,0.372868
53077940006-2022-06-30,0.362798
53077940006-2022-07-31,0.324819
53077940006-2022-08-31,0.292269


In [66]:
#print tract-times that are in preds but not in res
counter = 0
l = []
for i in preds.index:
    if i not in res.index:
        counter += 1
        l += [i]
        print(i)
counter

53033005302-2015-01-31
53033005302-2015-02-28
53033005302-2015-03-31
53033005302-2015-04-30
53033005302-2015-05-31
53033005302-2015-06-30
53033005302-2015-07-31
53033005302-2015-08-31
53033005302-2015-09-30
53033005302-2015-10-31
53033005302-2015-11-30
53033005302-2015-12-31
53033005302-2018-01-31
53033005302-2018-02-28
53033005302-2018-03-31
53033005302-2018-04-30
53033005302-2018-05-31
53033005302-2018-06-30
53033005302-2018-07-31
53033005302-2018-08-31
53033005302-2018-09-30
53033005302-2018-10-31
53033005302-2018-11-30
53033005302-2018-12-31
53033005302-2020-01-31
53033005302-2020-02-29
53033005302-2020-03-31
53033005302-2020-04-30
53033005302-2020-05-31
53033005302-2020-06-30
53033005302-2020-07-31
53033005302-2020-08-31
53033005302-2020-09-30
53033005302-2020-10-31
53033005302-2020-11-30
53033005302-2020-12-31
53033005302-2021-01-31
53033005302-2021-02-28
53033005302-2021-03-31
53033005302-2021-04-30
53033005302-2021-05-31
53033005302-2021-06-30
53033005302-2021-07-31
53033005302

216

In [70]:
l = [j.split("-")[0] for j in l]
l

['53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53033005302',
 '53035081400',
 '53035081400',
 '53035081400',
 '53035081400',
 '53035081400',
 '530350

In [69]:
[[x,l.count(x)] for x in set(l)]

[['53033005302', 57],
 ['53037975401', 33],
 ['53053072906', 57],
 ['53035081400', 69]]