# combine_data_sources.ipynb
**Combines the different data sources to create one dataframe containing all data (EV registrations, EV product variety, gas prices, EV charging locations, ...) by month and ZIP code and saves the resulting dataframe in `data/df.csv`.**

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
from uszipcode import SearchEngine
search = SearchEngine()

import utils as u
from config.counties_zips import zip_codes



In [2]:
# times = [(y, m) for y in range(2017,2022) for m in range(1,13)]
# times = [(y, m) for y in range(2017,2018) for m in range(1,3)]
times = [u.get_last_day_of_month(dt.datetime(year=y, month=m, day=20)) for y in range(2017,2022) for m in range(1,13)]
# times = [u.get_last_day_of_month(dt.datetime(year=y, month=m, day=20)) for y in range(2017,2018) for m in range(1,3)]
# zip_codes = df["zip"].unique()
# print(zip_codes, len(zip_codes))
print(len(zip_codes))

598


In [3]:
times_full = [time for time in times for _ in zip_codes]
zip_codes_full = [*zip_codes]*len(times)

df = pd.DataFrame()
df["time"] = times_full
df["zip"]  = zip_codes_full
df["county"] = ""
for zip_code in zip_codes:
    df.loc[df["zip"] == zip_code, "county"] = search.by_zipcode(zip_code).county
df.head()

Unnamed: 0,time,zip,county
0,2017-01-31,98001,King County
1,2017-01-31,98002,King County
2,2017-01-31,98003,King County
3,2017-01-31,98004,King County
4,2017-01-31,98005,King County


In [7]:
df.to_csv("data/index.csv")

In [4]:
# gas_price
df_gas = pd.read_csv("data/gas/Weekly_Retail_Gasoline_and_Diesel_Prices.csv", header=6, parse_dates=["Month"])
df_gas = df_gas.rename(columns={"Month" : "time"})

df_gas["time"] = df_gas["time"].apply(u.get_last_day_of_month)

for time in times:
    gas_price = df_gas.loc[df_gas["time"] == time, "Washington All Grades All Formulations Retail Gasoline Prices $/gal"].values[0]
    df.loc[df["time"] == time, "gas_price"] = gas_price

# print(df.tail())

In [9]:
def get_date_from_tuple(t):
    """
    Extract the last day of the month (as a datetime.datetime object) from a given string such as "(2017, 1)" or "(2021, 10)".
    """
    y = int(t[1:5])
    m = int(t[7:8]) if len(t)==9 else int(t[7:9])
    last_day_of_month = u.get_last_day_of_month(dt.datetime(year=y, month=m, day=20))
    return last_day_of_month

In [16]:
df.shape

(35880, 4)

In [5]:
# n_ev, m_ev
df_ev = pd.read_csv("data/vehicles/ev.csv")
df_ev["time"] = df_ev["time"].apply(u.get_date_from_tuple)
cols = ["n_ev", "n_bev", "n_phev", "m_ev", "m_bev", "m_phev"]
df[cols] = 0

# print(df.head())

for time in times:
    print(time)
    for zip_code in zip_codes:
        ev_counts = df_ev.loc[(df_ev["time"] == time) & (df_ev["zip"] == zip_code), cols]
        for col in cols:
#             print(df.loc[(df["time"] == time) & (df["zip"] == zip_code), col])
#             print(ev_counts[col])
#             print(ev_counts[col].values[0])
            if not ev_counts[col].empty:
                df.loc[(df["time"] == time) & (df["zip"] == zip_code), col] = ev_counts[col].values[0]
#             print(df.loc[(df["time"] == time) & (df["zip"] == zip_code), col])
#             print(ev_counts[col])
#         print(ev_counts)
#         print(df.loc[(df["time"] == time) & (df["zip"] == zip_code), cols])
#         df.loc[(df["time"] == time) & (df["zip"] == zip_code), cols] = ev_counts
print("done")

2017-01-31 00:00:00
2017-02-28 00:00:00
2017-03-31 00:00:00
2017-04-30 00:00:00
2017-05-31 00:00:00
2017-06-30 00:00:00
2017-07-31 00:00:00
2017-08-31 00:00:00
2017-09-30 00:00:00
2017-10-31 00:00:00
2017-11-30 00:00:00
2017-12-31 00:00:00
2018-01-31 00:00:00
2018-02-28 00:00:00
2018-03-31 00:00:00
2018-04-30 00:00:00
2018-05-31 00:00:00
2018-06-30 00:00:00
2018-07-31 00:00:00
2018-08-31 00:00:00
2018-09-30 00:00:00
2018-10-31 00:00:00
2018-11-30 00:00:00
2018-12-31 00:00:00
2019-01-31 00:00:00
2019-02-28 00:00:00
2019-03-31 00:00:00
2019-04-30 00:00:00
2019-05-31 00:00:00
2019-06-30 00:00:00
2019-07-31 00:00:00
2019-08-31 00:00:00
2019-09-30 00:00:00
2019-10-31 00:00:00
2019-11-30 00:00:00
2019-12-31 00:00:00
2020-01-31 00:00:00
2020-02-29 00:00:00
2020-03-31 00:00:00
2020-04-30 00:00:00
2020-05-31 00:00:00
2020-06-30 00:00:00
2020-07-31 00:00:00
2020-08-31 00:00:00
2020-09-30 00:00:00
2020-10-31 00:00:00
2020-11-30 00:00:00
2020-12-31 00:00:00
2021-01-31 00:00:00
2021-02-28 00:00:00


In [6]:
df.head()

Unnamed: 0,time,zip,county,gas_price,n_ev,n_bev,n_phev,m_ev,m_bev,m_phev
0,2017-01-31,98001,King County,2.743,98.0,57.0,41.0,117.0,28.0,23.0
1,2017-01-31,98002,King County,2.743,39.0,18.0,21.0,117.0,28.0,23.0
2,2017-01-31,98003,King County,2.743,79.0,40.0,39.0,117.0,28.0,23.0
3,2017-01-31,98004,King County,2.743,551.0,414.0,137.0,117.0,28.0,23.0
4,2017-01-31,98005,King County,2.743,299.0,219.0,80.0,117.0,28.0,23.0


In [7]:
df.shape

(35880, 10)

In [176]:
df.to_csv("data/df2.csv")

In [13]:
#evse
#number of installed L2 stations + DCFC stations by zip code and month
df_evse = pd.read_csv("data/evse/EV_charging_stations_WA.csv")
df_evse_key = pd.read_csv("config/EV_charging_stations_key.csv")
df_evse.columns = df_evse_key["name"]

df_evse["open_date"] = pd.to_datetime(df_evse["open_date"])

df_evse["l2_count"] = df_evse["l2_count"].fillna(0).astype(int)
df_evse["dcfc_count"] = df_evse["dcfc_count"].fillna(0).astype(int)

# df_evse["has_l2_or_dcfc"] = df_evse["l2_count"] > 0 #boolean if location has at least one L2 charger
df_evse["has_l2_or_dcfc"] = (df_evse["l2_count"] > 0) | (df_evse["dcfc_count"] > 0) #boolean if location has at least one L2 or DCFC charger

df["n_evse"] = 0
for t in times:
    print(t)
    for zip_code in zip_codes:
        dff = df_evse[(df_evse["open_date"] < t) & (df_evse["zip"] == zip_code)]
#         n_evse = dff["has_l2_or_dcfc"].value_counts()
        n_evse = sum(dff["has_l2_or_dcfc"]) #counts number of locations with at least one charging station in that ZIP code and for that month
        df.loc[(df["time"] == t) & (df["zip"] == zip_code), "n_evse"] = n_evse
    
#     df = zip_code_level_registrations.size()
#     m_ev = len(df["make_model"].unique())
#     for zip, count in zip_code_level_EV_counts.iteritems():
#         final_df.loc[(final_df["time"] == t) & (final_df["zip"] == zip), "n_ev"] = count
#         final_df.loc[(final_df["time"] == t) & (final_df["zip"] == zip), "m_ev"] = m_ev
    


2017-01-31 00:00:00
2017-02-28 00:00:00
2017-03-31 00:00:00
2017-04-30 00:00:00
2017-05-31 00:00:00
2017-06-30 00:00:00
2017-07-31 00:00:00
2017-08-31 00:00:00
2017-09-30 00:00:00
2017-10-31 00:00:00
2017-11-30 00:00:00
2017-12-31 00:00:00
2018-01-31 00:00:00
2018-02-28 00:00:00
2018-03-31 00:00:00
2018-04-30 00:00:00
2018-05-31 00:00:00
2018-06-30 00:00:00
2018-07-31 00:00:00
2018-08-31 00:00:00
2018-09-30 00:00:00
2018-10-31 00:00:00
2018-11-30 00:00:00
2018-12-31 00:00:00
2019-01-31 00:00:00
2019-02-28 00:00:00
2019-03-31 00:00:00
2019-04-30 00:00:00
2019-05-31 00:00:00
2019-06-30 00:00:00
2019-07-31 00:00:00
2019-08-31 00:00:00
2019-09-30 00:00:00
2019-10-31 00:00:00
2019-11-30 00:00:00
2019-12-31 00:00:00
2020-01-31 00:00:00
2020-02-29 00:00:00
2020-03-31 00:00:00
2020-04-30 00:00:00
2020-05-31 00:00:00
2020-06-30 00:00:00
2020-07-31 00:00:00
2020-08-31 00:00:00
2020-09-30 00:00:00
2020-10-31 00:00:00
2020-11-30 00:00:00
2020-12-31 00:00:00
2021-01-31 00:00:00
2021-02-28 00:00:00


In [14]:
df.head()

Unnamed: 0,time,zip,county,gas_price,n_ev,n_bev,n_phev,m_ev,m_bev,m_phev,n_evse
0,2017-01-31,98001,King County,2.743,98.0,57.0,41.0,117.0,28.0,23.0,0
1,2017-01-31,98002,King County,2.743,39.0,18.0,21.0,117.0,28.0,23.0,2
2,2017-01-31,98003,King County,2.743,79.0,40.0,39.0,117.0,28.0,23.0,1
3,2017-01-31,98004,King County,2.743,551.0,414.0,137.0,117.0,28.0,23.0,16
4,2017-01-31,98005,King County,2.743,299.0,219.0,80.0,117.0,28.0,23.0,1


In [15]:
df.shape

(35880, 11)

In [16]:
#save resulting dataframe as csv file
df.to_csv("data/df.csv")

In [228]:
from uszipcode import SearchEngine, SimpleZipcode, ComprehensiveZipcode
search = SearchEngine()
zipcode = search.by_zipcode(98119)
# zipcode.zipcode # access attributes
# zipcode.major_city
zipcode.county

'King County'