In [13]:
from legiscan import LegiScan
import legiscan

import os
import pandas as pd
import swifter
import zipfile
import base64
import io
import glob
import time
import json
import requests
import mimetypes
import tqdm 

# Force a reload of the legiscan module so that edits to its .py file are
# picked up without restarting the kernel.
from importlib import reload
reload(legiscan);  # trailing ';' keeps the module repr out of the cell output

In [None]:
# # make sure api key is saved as an environment variable
# # NOTE: this prints the key in plain text -- clear the cell output before sharing the notebook
# for key, value in os.environ.items():
#     if key == 'LEGISCAN_API_KEY': 
#         print(f'{key}: {value}')

### using pylegiscan to parse legiscan json -- test sample 

In [None]:
# I saved my API key as an environment variable.
# Create an API key here: https://legiscan.com/legiscan
# NOTE(review): os.environ.get returns None when LEGISCAN_API_KEY is unset, so a
# missing key is not reported on this line -- TODO confirm how LegiScan handles None.

api_key = os.environ.get('LEGISCAN_API_KEY')
legis = LegiScan(api_key)

In [None]:
# LegiScan publishes its bills as downloadable datasets (presumably one per
# session); fetch the list and download a single entry to use as a sample.
datasets = legis.get_dataset_list()
sample_entry = datasets[20]  # index 20 is the entry this notebook uses as its sample
dataset = legis.get_dataset(sample_entry['session_id'], sample_entry['access_key'])

# DO NOT DISPLAY dataset['zip'] ITSELF
#   - it holds a base64 encoded zip file ( a very very long string that requires conversion )
#   - showing only its length is safe
len(dataset['zip'])

In [None]:
# The dataset payload is a base64-encoded zip archive; decode it back to raw bytes.
z_bytes = base64.b64decode(dataset['zip'])

# Wrap the bytes in an in-memory stream (io.BytesIO) so the zipfile module can
# read them without a temporary file. The `with` block closes the archive once
# extraction is done instead of leaving it open for the rest of the session.
with zipfile.ZipFile(io.BytesIO(z_bytes)) as z:
    # extract all files in the zip file
    z.extractall("./sample-data")

In [None]:
# glob returns all file paths that match a pattern (`glob` is already imported
# in the first cell, so it is not imported again here).
# - The pattern has no `**`, so the former `recursive=True` flag had no effect.
# - glob returns matches in arbitrary order; sorting keeps the preview below
#   the same from run to run.
filenames = sorted(glob.glob("./sample-data/*/*/bill/*"))
filenames[:2]

In [None]:
# The bill data is stored in json format (`json` is already imported in the
# first cell). Open the file in a `with` block so the handle is closed after
# parsing, and read it as UTF-8 rather than the platform's default encoding.
with open("./sample-data/AL/2012-2012_1st_Special_Session/bill/HB1.json", encoding="utf-8") as bill_file:
    json_data = json.load(bill_file)
#json_data

# when we look at the json_data, we can see that the actual bill text (which we need!!) is not in the json files already
# after creating a database with the bills, we need to extract the bill text from the urls in the json

### download and extract final datasets

In [None]:
# Instantiate the LegiScan client again for the full download
# (same two statements as in the sample section earlier in the notebook).
api_key = os.environ.get('LEGISCAN_API_KEY')
legis = LegiScan(api_key)

In [None]:
# Fetch the dataset list from the API and show how many entries it has.
datasets = legis.get_dataset_list()
len(datasets)

In [None]:
# DO NOT RERUN THIS CELL -- the bill data is already saved in a local folder
# num_datasets = len(datasets) 

# for dataset in tqdm.tqdm_notebook(datasets): 
#     session_id = dataset['session_id'] #  ID corresponds to internal LegiScan legislative session numbering
#     access_key = dataset['access_key'] #  comes from the dataset list; get_dataset needs it to download the session's archive
    
#     # get all of the details for a particular dataset
#     dataset_details = legis.get_dataset(session_id, access_key)
    
#     # decode the dataset
#     zipfile_bytes = base64.b64decode(dataset_details['zip'])
#     z = zipfile.ZipFile(io.BytesIO(zipfile_bytes))
#     z.extractall("./bill_data")
    

In [16]:
# Gather the path of every bill JSON file found under bill_data/.
BILL_JSON_PATTERN = "bill_data/*/*/bill/*.json"
filenames = glob.glob(BILL_JSON_PATTERN)
len(filenames)

1753624

In [17]:
def process_json(filename):
    """Flatten one bill JSON file into a pandas Series.

    Parameters
    ----------
    filename : str
        Path to a JSON file whose top-level object has a 'bill' key.

    Returns
    -------
    pandas.Series
        Fields, in order: bill_id, code (the file name without its extension),
        bill_number, title, description, state, session (the session name),
        filename, status, status_date, url.
        url is the 'state_link' of the last entry in the bill's 'texts' list,
        or None when the bill has no such entry, so every row carries the
        same set of fields.

    Raises
    ------
    KeyError
        If the 'bill' object or one of its required fields is missing.
    """
    # Read as UTF-8 explicitly; the platform default encoding is not UTF-8
    # everywhere and would garble or reject non-ASCII titles.
    with open(filename, encoding='utf-8') as file:
        json_str = file.read()

    # "0000-00-00" is not a valid calendar date, so swap the placeholder for a
    # JSON null and let it load as None (presumably to keep later date parsing
    # from failing -- the original note here was cut off).
    content = json.loads(json_str.replace('"0000-00-00"', 'null'))['bill']

    bill_data = {
        'bill_id': content['bill_id'],
        'code': os.path.splitext(os.path.basename(filename))[0],
        'bill_number': content['bill_number'],
        'title': content['title'],
        'description': content['description'],
        'state': content['state'],
        'session': content['session']['session_name'],
        'filename': filename,
        'status': content['status'],
        'status_date': content['status_date'],
    }

    # A text link is optional. Catch only the lookup failures that mean
    # "no text available"; a bare except would also swallow KeyboardInterrupt
    # while looping over a large number of files.
    try:
        bill_data['url'] = content['texts'][-1]['state_link']
    except (KeyError, IndexError, TypeError):
        bill_data['url'] = None

    return pd.Series(bill_data)

In [None]:
# Run process_json over every bill path (swifter shows a progress bar while
# it works) and preview the first rows of the result.
bill_paths = pd.Series(filenames)
df = bill_paths.swifter.apply(process_json)
df.head()

Pandas Apply:   0%|          | 0/1753624 [00:00<?, ?it/s]