# scrape_veh_registration_data.ipynb
**This notebook downloads all relevant vehicle registration data (EVs and all vehicles) from Washington's general-purpose open data portal at [Data.WA.gov](https://data.wa.gov) and stores the outputs in the corresponding folder `data/vehicles/raw/`**
+ Each downloaded file is recorded with today's date in its filename, e.g. `Electric_Vehicle_Population_Size_History (as of 2022-02-28).csv`
+ This notebook also updates the most recent file for each of the datasets using the ``update_latest_file()`` function from the `utils.py` library, and stores it in a separate file, e.g. `Electric_Vehicle_Population_Size_History.csv` (without any date note)
+ Time required for scraping: less than a minute for all EV data, 10-20 minutes for data on all vehicles

### Datasets that can be downloaded with this notebook
+ **Electric Vehicle Population Data**
  + https://data.wa.gov/Transportation/Electric-Vehicle-Population-Data/f6w7-q2d2
  + Snapshot of currently active EV registrations (“BEVs + PHEVs that are currently registered through WA DOL”)
  + By ZIP code, model/make, EV type, range, …
  + ~87.7k rows, 15 MB
+ **Electric Vehicle Population Size History**
  + https://data.wa.gov/Transportation/Electric-Vehicle-Population-Size-History/d886-d5q2
  + Record of the total number of PHEVs and BEVs in WA
  + By EV type, month
  + January 2017 to December 2021
+ **Electric Vehicle Population Size History By County**
  + https://data.wa.gov/Transportation/Electric-Vehicle-Population-Size-History-By-County/3d5d-sdqb
  + Record of number of PHEVs, BEVs, and non-EVs in all states
  + By county, passenger/truck, EV type, month
  + January 2017 to November 2021
+ **Electric Vehicle Title and Registration Activity**
  + https://data.wa.gov/Transportation/Electric-Vehicle-Title-and-Registration-Activity/rpr4-cgyd
  + Record of EV title and registration events
  + By ZIP code, model/make, EV type, range, DOL transaction date, transaction type…
  + Transaction types:
    + Original Title
    + Transfer Title
    + Original Registration
    + Registration Renewal
    + Registration at time of Transfer
  + 2010-01-06 to 2021-12-30
  + 502k rows, 192 MB
+ **Vehicle Registration Transactions by Department of Licensing**
  + https://data.wa.gov/Transportation/Vehicle-Registration-Transactions-by-Department-of/brw6-jymh
  + Record of all vehicle registration transactions
  + By ZIP code, model/make, vehicle type, fuel type, electrification level, transaction type, transaction month
  + Transaction types:
    + Original Registration
    + Registration Renewal
    + Registration at time of Transfer
+ **Vehicle Title Transactions by Department of Licensing**
  + https://data.wa.gov/Transportation/Vehicle-Title-Transactions-by-Department-of-Licens/cdk6-5kdf
  + Record of all vehicle title transactions
  + By ZIP code, model/make, vehicle type, fuel type, electrification level, transaction type, transaction month
  + Transaction types:
    + Original Title
    + Transfer Title


In [1]:
import requests
import datetime as dt

from utils import scrape_from_api, update_latest_file

In [2]:
# Root directory for the vehicle data; raw downloads go into its "raw/" subfolder.
folder = "data/vehicles/"
# folder = "D:/WSDOT EV Forecasting - Task 2/data/vehicles/"

# Date stamp (YYYY-MM-DD) that is embedded in the name of every raw download.
today = dt.date.today().strftime("%Y-%m-%d")

# Dataset registry: maps a file name base to its [url, query params] pair.
dics = dict()

In [20]:
# Dataset: Electric Vehicle Population Data (snapshot of currently registered EVs)
fname_base = "Electric_Vehicle_Population_Data"
url = "https://data.wa.gov/resource/f6w7-q2d2.csv"

# The API returns 1000 rows by default; the dataset held ~87k rows in Feb. 2022,
# so the limit is set comfortably above that count.
params = {"$limit": 150_000}
# Optional column subset:
# params["$select"] = "vin_1_10, county, city, state, zip_code, model_year, make, model, ev_type, cafv_type, electric_range, base_msrp, legislative_district, dol_vehicle_id"

# Only register the dataset here; the download itself happens in the scraping loop below.
dics[fname_base] = [url, params]

In [21]:
# Dataset: Electric Vehicle Population Size History (monthly totals by EV type)
fname_base = "Electric_Vehicle_Population_Size_History"
url = "https://data.wa.gov/resource/d886-d5q2.csv"

# No "$limit" is set, so the API's default row limit applies to this request.
params = dict()
# Optional restrictions:
# params["$limit"] = 10
# params["$select"] = "date, battery_electric_vehicle"

# Only register the dataset here; the download itself happens in the scraping loop below.
dics[fname_base] = [url, params]

In [22]:
# Dataset: Electric Vehicle Population Size History By County
fname_base = "Electric_Vehicle_Population_Size_History_By_County"
url = "https://data.wa.gov/resource/3d5d-sdqb.csv"

# The API returns 1000 rows by default; the dataset held ~12k rows in Feb. 2022,
# so the limit is set comfortably above that count.
params = {"$limit": 17000}
# Optional column subset / row filter:
# params["$select"] = "date, state, battery_electric_vehicles_bevs_"
# params["state"] = "WA"

# Only register the dataset here; the download itself happens in the scraping loop below.
dics[fname_base] = [url, params]

In [23]:
# Dataset: Electric Vehicle Title and Registration Activity
fname_base = "Electric_Vehicle_Title_and_Registration_Activity"
url = "https://data.wa.gov/resource/rpr4-cgyd.csv"

# The API returns 1000 rows by default; the dataset held ~502k rows in Feb. 2022,
# so the limit is set comfortably above that count.
params = {"$limit": 700_000}
# Optional column subset / row filter:
# params["$select"] = "electric_vehicle_type, vin_1_10, model_year, make, model, new_or_used_vehicle, sale_price, transaction_date, transaction_type, transaction_year, county, city, zip"
# params["zip"] = "98119"

# Only register the dataset here; the download itself happens in the scraping loop below.
dics[fname_base] = [url, params]

In [7]:
# Dataset: Vehicle Registration Transactions by Department of Licensing
fname_base = "Vehicle_Registration_Transactions_by_Department_of_Licensing"
url = "https://data.wa.gov/resource/brw6-jymh.csv"

# The API returns 1000 rows by default. The dataset held ~13.7M rows in Feb. 2022.
# NOTE: the active limit of 1M rows is below that count, so only part of the dataset
# is requested; the commented-out 19M limit is the setting that exceeds the full row count.
params = {
    # "$limit": int(19e6),
    "$limit": 1_000_000,
    # Columns to download; an alternative selection is kept below for reference.
    "$select": "start_of_month, vehicle_type, electrification_level, county, state, zip_code, transaction_type, _2020_census_tract, vehicle_record_count",
}
# params["$select"] = "start_of_month, model, model_year, vehicle_type, vehicle_primary_use, fuel_type_primary, fuel_type_secondary, electrification_level, zip_code, transaction_type"
# params["zip"] = "98119"

# Only register the dataset here; the download itself happens in the scraping loop below.
dics[fname_base] = [url, params]

In [27]:
# Dataset: Vehicle Title Transactions by Department of Licensing
fname_base = "Vehicle_Title_Transactions_by_Department_of_Licensing"
url = "https://data.wa.gov/resource/cdk6-5kdf.csv"

# The API returns 1000 rows by default; the dataset held ~6.8M rows in Feb. 2022,
# so the limit is set comfortably above that count.
params = {"$limit": 9_000_000}
# Optional column subsets / row filter:
# params["$select"] = "start_of_month, model, model_year, vehicle_type, vehicle_primary_use, fuel_type_primary, fuel_type_secondary, electrification_level, zip_code, transaction_type"
# params["$select"] = "model, model_year, vehicle_primary_use, fuel_type_primary, electrification_level, county, state, zip_code, transaction_type, _2020_census_tract, vehicle_record_count"
# params["zip"] = "98119"

# Only register the dataset here; the download itself happens in the scraping loop below.
dics[fname_base] = [url, params]

In [8]:
# Display the dataset registry to check the url/params of every dataset registered so far
# (only dataset cells that were executed in this session contribute an entry).
dics

{'Vehicle_Registration_Transactions_by_Department_of_Licensing': ['https://data.wa.gov/resource/brw6-jymh.csv',
  {'$limit': 1000000,
   '$select': 'start_of_month, vehicle_type, electrification_level, county, state, zip_code, transaction_type, _2020_census_tract, vehicle_record_count'}]}

In [9]:
# Select the datasets to download: uncomment an entry to include it in the next run.
datasets = [
    # "Electric_Vehicle_Population_Data",
    # "Electric_Vehicle_Population_Size_History",
    # "Electric_Vehicle_Population_Size_History_By_County",
    # "Electric_Vehicle_Title_and_Registration_Activity",
    "Vehicle_Registration_Transactions_by_Department_of_Licensing",
    # "Vehicle_Title_Transactions_by_Department_of_Licensing",
]

In [10]:
# Download every registered dataset that was selected in `datasets`.
for fname_base, val in dics.items():
    # Registered but not selected for this run: nothing to do.
    if fname_base not in datasets:
        continue

    url, params = val
    print("Scraping {0:s}...".format(fname_base))
    print("\turl={0:s}, params={1:s}".format(url,str(params)))

    # Raw files carry the download date in their name: "<name> (as of YYYY-MM-DD).csv".
    filename = "".join([folder, "raw/", fname_base, " (as of {0:s}).csv".format(today)])

    scrape_from_api(url, params, filename)
    # Refreshing the undated "latest" copy is currently disabled:
    # update_latest_file(fname_base, folder)
print("done")

Scraping Vehicle_Registration_Transactions_by_Department_of_Licensing...
	url=https://data.wa.gov/resource/brw6-jymh.csv, params={'$limit': 1000000, '$select': 'start_of_month, vehicle_type, electrification_level, county, state, zip_code, transaction_type, _2020_census_tract, vehicle_record_count'}
done


In [None]:
"""
Here, I tried figuring out how to show a progress bar for the ongoing downloading process.

The problem is that the Socrata API doesn't return the file size, so I tried estimating 
the file size by counting the number of rows and number of columns and applying a scale factor.
This didn't work sufficiently well, so I discarded this for now.

For long downloading processes, the best is to monitor the downloaded file size in Windows Explorer 
(or similar) and know the estimated file size for the complete dataset. This helps in estimating 
how long the download will still take.
"""

import enlighten
import numpy as np
import math
from tqdm import tqdm
from urllib.request import urlopen

def scrape_from_api2(url, params=None, filename="scraped_data", verbose=False):
    """Download a CSV dataset from the API to ``filename`` while showing a rough progress bar.

    The response does not announce its size, so the progress bar total is only an
    estimate derived from the requested row limit and the number of columns.

    Parameters
    ----------
    url : str
        CSV endpoint of the dataset.
    params : dict, optional
        Query parameters sent with the request (e.g. ``"$limit"``, ``"$select"``).
        The caller's dict is never modified. Defaults to no parameters.
    filename : str
        Path of the file the downloaded bytes are written to.
    verbose : bool
        If True, print the size estimate and request diagnostics.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status; nothing is written to
        ``filename`` in that case.
    """
    # Work on a private copy: avoids the shared mutable default and keeps the caller's dict intact.
    params = {} if params is None else dict(params)

    # Upper bound for the number of rows; 1000 is the API default when no limit is given.
    nrows = int(params.get("$limit", 1000))

    # Request a single row only to read the header line and count the columns.
    header_params = dict(params)
    header_params["$limit"] = 1
    header_response = requests.get(url, params=header_params)
    header_response.raise_for_status()
    columns = header_response.text.split("\n")[0].split(",")
    ncols = len(columns)

    # Scale factors kept from the original experiment (12.5 per cell, 260000 per
    # progress step) -- presumably bytes per cell and bytes per received chunk; the
    # resulting total is known to be imprecise.
    dlen = int(ncols * nrows * 12.5)
    dlen = int(dlen / 260000)

    if verbose:
        print("rows requested:", nrows)
        print("columns:", columns, ncols)
        print("estimated progress steps:", dlen)
        print("params:", params)

    # The context manager releases the streamed connection even if writing fails.
    with requests.get(url, stream=True, params=params) as response:
        if verbose:
            print("status code:", response.status_code)
            print("full url:", response.url)
            print("dictio", response.request.headers)

        # Fail before opening the output file so an error body is never saved as CSV.
        response.raise_for_status()

        # used to monitor progress
        manager = enlighten.get_manager()
        with manager.counter(color='green', total=dlen, leave=False) as ctr, \
             open(filename, "wb") as fd:
            for chunk in response.iter_content(chunk_size=2**20):
                fd.write(chunk)
                ctr.update()

In [None]:
# Trial run of the progress-bar variant, reusing the url/params/filename
# left in the notebook namespace by the cells above.
scrape_from_api2(url, params, filename, verbose=True)