In [1]:
import pandas as pd
import numpy as np
import os
import json


Let's walk through the input folder and save every bill as its json contents.


In [2]:
# Collect every bill JSON document found under the LegiScan input folder.
bills = []       # one parsed JSON document per bill file
statesDict = {}  # state_id -> state abbreviation, as seen on the bills
for dirpath, _dirnames, filenames in os.walk(r"D:\Big Input Data Stuff\LegiScan\input"):
    # Only directories whose path mentions "bill" hold bill files.
    if "bill" not in dirpath:
        continue
    for filename in filenames:
        # Match on the extension rather than anywhere in the name.
        if not filename.endswith(".json"):
            continue
        # os.path.join keeps the path valid on any OS (a literal "\\" only works on Windows).
        # Decode explicitly as UTF-8: without it, open() falls back to the locale's
        # default encoding, which can garble or reject non-ASCII text.
        with open(os.path.join(dirpath, filename), "r", encoding="utf-8") as fh:
            record = json.load(fh)
        bills.append(record)
        statesDict[record["bill"]["state_id"]] = record["bill"]["state"]


`statesDict` maps every internal `state_id` to its state abbreviation. We'll use it later to store each person's state in a human-readable form.

In [3]:
# Show the state_id -> abbreviation mapping collected while loading the bills.
statesDict

{2: 'AK',
 1: 'AL',
 4: 'AR',
 3: 'AZ',
 5: 'CA',
 6: 'CO',
 7: 'CT',
 51: 'DC',
 8: 'DE',
 9: 'FL',
 10: 'GA',
 11: 'HI',
 15: 'IA',
 12: 'ID',
 13: 'IL',
 14: 'IN',
 16: 'KS',
 17: 'KY',
 18: 'LA',
 21: 'MA',
 20: 'MD',
 19: 'ME',
 22: 'MI',
 23: 'MN',
 25: 'MO',
 24: 'MS',
 26: 'MT',
 33: 'NC',
 34: 'ND',
 27: 'NE',
 29: 'NH',
 30: 'NJ',
 31: 'NM',
 28: 'NV',
 32: 'NY',
 35: 'OH',
 36: 'OK',
 37: 'OR',
 38: 'PA',
 39: 'RI',
 40: 'SC',
 41: 'SD',
 42: 'TN',
 43: 'TX',
 52: 'US',
 44: 'UT',
 46: 'VA',
 45: 'VT',
 47: 'WA',
 49: 'WI',
 48: 'WV',
 50: 'WY'}

Let's take a look at what one of our bill files looks like


In [4]:
# Pretty-print the first loaded bill to inspect its structure.
print(json.dumps(bills[0], indent=2))


{
  "bill": {
    "bill_id": 132957,
    "change_hash": "b5e975dbbea5c36b8138633cb342023d",
    "session_id": 58,
    "session": {
      "session_id": 58,
      "state_id": 2,
      "year_start": 2009,
      "year_end": 2010,
      "prefile": 0,
      "sine_die": 1,
      "prior": 1,
      "special": 0,
      "session_tag": "Regular Session",
      "session_title": "2009-2010 Regular Session",
      "session_name": "26th Legislature"
    },
    "url": "https://legiscan.com/AK/bill/HB1/2009",
    "state_link": "http://www.legis.state.ak.us/basis/get_bill.asp?session=26&bill=HB1",
    "completed": 0,
    "status": 1,
    "status_date": "2009-01-20",
    "progress": [
      {
        "date": "2009-01-20",
        "event": 1
      },
      {
        "date": "2009-01-20",
        "event": 9
      }
    ],
    "state": "AK",
    "state_id": 2,
    "bill_number": "HB1",
    "bill_type": "B",
    "bill_type_id": "1",
    "body": "H",
    "body_id": 13,
    "current_body": "H",
    "current_bod

Ok that's a lot of information, most of it useless. Let's take out what we care about.


In [8]:
# Human-readable labels, looked up by the integer "status" code stored on each bill.
statuses = [
    "N\\A",
    "Intro",
    "Engrossed",
    "Enrolled",
    "Passed",
    "Vetoed",
    "Failed",
    "Override",
    "Chaptered",
    "Refer",
    "Report Pass",
    "Report DNP",
    "Draft",
]

# Work through a single example (the first bill loaded) before processing them all.
first_bill = bills[0]["bill"]

bill_id = first_bill["bill_id"]
status = statuses[first_bill["status"]]
bill_name = first_bill["title"]
bill_number = first_bill["bill_number"]
bill_state = first_bill["state"]
bill_desc = first_bill["description"]
# Link of the last entry in "texts"; fall back to "" when the bill has no texts,
# the same way the pass over all bills handles that case.
text_link = first_bill["texts"][-1]["url"] if first_bill["texts"] else ""

# Keep only the identifiers / names needed from the nested lists.
sponsors = [person["people_id"] for person in first_bill["sponsors"]]
roll_call_ids = [vote["roll_call_id"] for vote in first_bill["votes"]]
topics = [topic["subject_name"] for topic in first_bill["subjects"]]

# The list-valued fields are stored as their string representation.
example_bill = {
    "ID": bill_id,
    "Name": bill_name,
    "Number": bill_number,
    "State": bill_state,
    "Status": status,
    "Description": bill_desc,
    "Text": text_link,
    "Topics": f"{topics}",
    "Sponsors": f"{sponsors}",
    "Votes": f"{roll_call_ids}"
}
example_bill


{'ID': 132957,
 'Name': 'Amend Amt Of Base Student Allocation',
 'Number': 'HB1',
 'State': 'AK',
 'Status': 'Intro',
 'Description': 'An Act relating to the base student allocation used in the formula for state funding of public education; providing for an effective date by amending the effective dates of secs. 9 and 10, ch. 9, SLA 2008; and providing for an effective date.',
 'Text': 'https://legiscan.com/AK/text/HB1/id/365510',
 'Topics': "['Education', 'Public Finance', 'School Districts', 'Schools', 'State Aid']",
 'Sponsors': '[6033, 6034]',
 'Votes': '[]'}

Much better, let's do that to every single one of our bills and add them to a DataFrame so we can save it out to a database later.


In [9]:
# Labels for the integer "status" code carried by each bill.
statuses = [
    "N\\A",
    "Intro",
    "Engrossed",
    "Enrolled",
    "Passed",
    "Vetoed",
    "Failed",
    "Override",
    "Chaptered",
    "Refer",
    "Report Pass",
    "Report DNP",
    "Draft",
]

# Reduce every loaded bill to a flat record holding only the fields we keep.
bill_list = []
for bill in bills:
    info = bill["bill"]

    bill_id = info["bill_id"]
    status = statuses[info["status"]]
    bill_name = info["title"]
    bill_number = info["bill_number"]
    bill_state = info["state"]
    bill_desc = info["description"]

    # Use the last text entry's URL, or an empty string when there are no texts.
    texts = info["texts"]
    text_link = texts[-1]["url"] if texts else ""

    sponsors = [member["people_id"] for member in info["sponsors"]]
    roll_call_ids = [roll_call["roll_call_id"] for roll_call in info["votes"]]
    topics = [subject["subject_name"] for subject in info["subjects"]]

    # List-valued fields are stored as their string representation.
    record = {
        "ID": bill_id,
        "Name": bill_name,
        "Number": bill_number,
        "State": bill_state,
        "Description": bill_desc,
        "Status": status,
        "Text": text_link,
        "Topics": str(topics),
        "Sponsors": str(sponsors),
        "Votes": str(roll_call_ids),
    }
    bill_list.append(record)


In [10]:
# Build the bills table in one expression, keyed by the bill "ID".
bill_df = pd.DataFrame.from_records(bill_list).set_index("ID")
bill_df


Unnamed: 0_level_0,Name,Number,State,Description,Status,Text,Topics,Sponsors,Votes
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
132957,Amend Amt Of Base Student Allocation,HB1,AK,An Act relating to the base student allocation...,Intro,https://legiscan.com/AK/text/HB1/id/365510,"['Education', 'Public Finance', 'School Distri...","[6033, 6034]",[]
132966,Municipal Tax Exemptions,HB10,AK,An Act relating to mandatory exemptions from m...,Passed,https://legiscan.com/AK/text/HB10/id/367679,"['Age Of Persons', 'Disabilities', 'Law Enforc...","[6033, 6054, 6053, 6052, 6051, 6050, 6046, 604...",[]
133056,"Joseph C Williams, Sr., Coastal Trail",HB100,AK,"An Act relating to the Joseph C. Williams, Sr....",Passed,https://legiscan.com/AK/text/HB100/id/392326,"['Highways', 'Place Names', 'Trails', 'Transpo...",[6067],[]
133057,Exemptions: Life Insurance; Annuities,HB101,AK,An Act increasing the value of life insurance ...,Passed,https://legiscan.com/AK/text/HB101/id/392605,"['Civil Procedure', 'Death', 'Debt', 'Insuranc...","[6034, 6061]",[]
133058,Uniform Commercial Code/liens/contracts,HB102,AK,An Act relating to the Uniform Commercial Code...,Passed,https://legiscan.com/AK/text/HB102/id/392839,"['Animals', 'Business', 'Court Rules', 'Credit...",[],[]
...,...,...,...,...,...,...,...,...,...
1664730,Right of health care access-constitutional ame...,SJ0010,WY,A JOINT RESOLUTION proposing to amend the Wyom...,Failed,https://legiscan.com/WY/text/SJ0010/id/2646499,[],"[22676, 20193, 8711, 14226, 8675, 10602]",[1239782]
1664641,Convention of states.,SJ0011,WY,A JOINT RESOLUTION requesting Congress to call...,Engrossed,https://legiscan.com/WY/text/SJ0011/id/2646418,[],"[24753, 23310, 24162, 23879, 18469, 23645, 234...","[1240828, 1243122, 1260993, 1263065]"
1666701,Constitutional amendment-property tax valuation.,SJ0012,WY,A JOINT RESOLUTION providing for the assessmen...,Failed,https://legiscan.com/WY/text/SJ0012/id/2648490,[],"[23643, 8679]",[1234868]
1674775,Legislative and executive authority-taxation a...,SJ0013,WY,A JOINT RESOLUTION to amend the Wyoming Consti...,Failed,https://legiscan.com/WY/text/SJ0013/id/2657100,[],"[22077, 16559, 8663]",[1239735]


Alright, now for the next part. We need to take votes for each bill and pull detailed information from the voting data provided by LegiScan.

First let's walk through all the files, like we did with the bills, and save out only the important parts.


In [11]:
# Collect every roll-call JSON document found under the LegiScan input folder.
votes = []
for dirpath, _dirnames, filenames in os.walk(r"D:\Big Input Data Stuff\LegiScan\input"):
    # Only directories whose path mentions "vote" hold vote files.
    if "vote" not in dirpath:
        continue
    for filename in filenames:
        # Match on the extension rather than anywhere in the name.
        if not filename.endswith(".json"):
            continue
        # os.path.join keeps the path valid on any OS (a literal "\\" only works on Windows).
        # Decode explicitly as UTF-8: without it, open() falls back to the locale's
        # default encoding, which can garble or reject non-ASCII text.
        with open(os.path.join(dirpath, filename), "r", encoding="utf-8") as fh:
            votes.append(json.load(fh))


What information does each vote contain?


In [12]:
# Inspect the "roll_call" payload of the first loaded vote file.
votes[0]["roll_call"]


{'roll_call_id': 306479,
 'bill_id': 454312,
 'date': '2013-01-15',
 'desc': 'Senate: <pre> SR 1 Final Passage',
 'yea': 20,
 'nay': 0,
 'nv': 0,
 'absent': 0,
 'total': 20,
 'passed': 1,
 'chamber': 'S',
 'chamber_id': 14,
 'votes': [{'people_id': 6044, 'vote_id': 1, 'vote_text': 'Yea'},
  {'people_id': 6061, 'vote_id': 1, 'vote_text': 'Yea'},
  {'people_id': 6064, 'vote_id': 1, 'vote_text': 'Yea'},
  {'people_id': 6075, 'vote_id': 1, 'vote_text': 'Yea'},
  {'people_id': 6076, 'vote_id': 1, 'vote_text': 'Yea'},
  {'people_id': 6079, 'vote_id': 1, 'vote_text': 'Yea'},
  {'people_id': 6083, 'vote_id': 1, 'vote_text': 'Yea'},
  {'people_id': 6084, 'vote_id': 1, 'vote_text': 'Yea'},
  {'people_id': 6085, 'vote_id': 1, 'vote_text': 'Yea'},
  {'people_id': 6087, 'vote_id': 1, 'vote_text': 'Yea'},
  {'people_id': 6088, 'vote_id': 1, 'vote_text': 'Yea'},
  {'people_id': 6089, 'vote_id': 1, 'vote_text': 'Yea'},
  {'people_id': 6091, 'vote_id': 1, 'vote_text': 'Yea'},
  {'people_id': 6093, 'vot

Unsurprisingly, there's a fair amount of stuff we don't care about here. Let's clean it up and grab only the important parts. We'll do the same thing we did with bill information above.


In [13]:
# Flatten every roll call into one record with just the fields we keep.
vote_list = []
for record in votes:
    roll_call = record["roll_call"]
    vote_list.append(
        {
            "ID": roll_call["roll_call_id"],
            "Bill ID": roll_call["bill_id"],
            "Description": roll_call["desc"],
            # "passed" arrives as an integer flag; store it as a real boolean.
            "Passed": bool(roll_call["passed"]),
            # Each individual vote becomes a (people_id, vote_text) pair;
            # the whole list is stored as its string representation.
            "Votes": str(
                [(ballot["people_id"], ballot["vote_text"]) for ballot in roll_call["votes"]]
            ),
        }
    )



In [14]:
# Build the votes table in one expression, keyed by the roll call "ID".
vote_df = pd.DataFrame.from_records(vote_list).set_index("ID")
vote_df

Unnamed: 0_level_0,Bill ID,Description,Passed,Votes
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
306479,454312,Senate: <pre> SR 1 Final Passage,True,"[(6044, 'Yea'), (6061, 'Yea'), (6064, 'Yea'), ..."
306480,472178,Senate: CSHB 84(FIN)(efd am S) Third Reading -...,True,"[(6044, 'Yea'), (6061, 'Yea'), (6064, 'Yea'), ..."
306481,472178,Senate: CSHB 84(FIN)(efd am S) Third Reading -...,True,"[(6044, 'Yea'), (6061, 'Yea'), (6064, 'Yea'), ..."
306482,472178,House: Concur,True,"[(6034, 'Yea'), (6035, 'Yea'), (6037, 'Yea'), ..."
306483,545632,House: Special Order of Business,True,"[(6034, 'Yea'), (6035, 'Yea'), (6037, 'Yea'), ..."
...,...,...,...,...
1268431,1673024,Line Item Veto Override 27-3-1-0-0,True,"[(8641, 'Yea'), (8663, 'Yea'), (8675, 'Yea'), ..."
1268432,1673024,Line Item Veto Override 29-1-1-0-0,True,"[(8641, 'Yea'), (8663, 'Yea'), (8675, 'Yea'), ..."
1268433,1673024,Line Item Veto Override 27-3-1-0-0,True,"[(8641, 'Yea'), (8663, 'Yea'), (8675, 'Yea'), ..."
1268434,1673024,Line Item Veto Override 23-7-1-0-0,True,"[(8641, 'Nay'), (8663, 'Nay'), (8675, 'Yea'), ..."


Amazing, now let's round it out by doing the same thing to the people data.

In [15]:
# Collect every person JSON document found under the LegiScan input folder.
people = []
for dirpath, _dirnames, filenames in os.walk(r"D:\Big Input Data Stuff\LegiScan\input"):
    # Only directories whose path mentions "people" hold people files.
    if "people" not in dirpath:
        continue
    for filename in filenames:
        # Match on the extension rather than anywhere in the name.
        if not filename.endswith(".json"):
            continue
        # os.path.join keeps the path valid on any OS (a literal "\\" only works on Windows).
        # Decode explicitly as UTF-8: without it, open() falls back to the locale's
        # default encoding, which can garble or reject non-ASCII text (e.g. accented names).
        with open(os.path.join(dirpath, filename), "r", encoding="utf-8") as fh:
            people.append(json.load(fh))

In [16]:
# Inspect the "person" payload of the first loaded people file.
people[0]['person']

{'people_id': 6033,
 'person_hash': 'j9sj4uas',
 'party_id': '2',
 'state_id': 2,
 'party': 'R',
 'role_id': 1,
 'role': 'Rep',
 'name': 'Carl Gatto',
 'first_name': 'Carl',
 'middle_name': 'J.',
 'last_name': 'Gatto',
 'suffix': '',
 'nickname': '',
 'district': 'HD-013',
 'ftm_eid': 6604168,
 'votesmart_id': 27296,
 'opensecrets_id': '',
 'knowwho_pid': 211774,
 'ballotpedia': 'Carl_Gatto',
 'committee_sponsor': 0,
 'committee_id': 0,
 'state_federal': 0}

In [17]:
# Flatten every person into one record with just the fields we keep.
people_list = []
for entry in people:
    info = entry["person"]
    people_list.append(
        {
            "ID": info["people_id"],
            "Name": info["name"],
            "Party": info["party"],
            "Role": info["role"],
            # Translate the numeric state_id into its abbreviation via statesDict.
            "State": statesDict[info["state_id"]],
            "District": info["district"],
        }
    )



In [18]:
# Build the people table in one expression, keyed by the person "ID".
people_df = pd.DataFrame.from_records(people_list).set_index("ID")
people_df

Unnamed: 0_level_0,Name,Party,Role,State,District
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
6033,Carl Gatto,R,Rep,AK,HD-013
6034,Robert Lynn,R,Rep,AK,HD-026
6035,Max Gruenberg,D,Rep,AK,HD-016
6036,Nancy Dahlstrom,R,Rep,AK,HD-018
6037,Wes Keller,R,Rep,AK,HD-010
...,...,...,...,...,...
8706,Select School Facilities,,Jnt,WY,
8711,Dan Zwonitzer,R,Rep,WY,HD-043
8713,Bob Nicholas,R,Rep,WY,HD-008
8720,Management Council,,Jnt,WY,


Some states appear to be saving councils as people? Not sure why but let's just remove all of those real quick.

In [19]:
# Keep only the rows that have a non-empty "District" value.
has_district = people_df["District"].ne("")
people_df = people_df.loc[has_district]

Brilliant, now let's close it out by saving these to compressed pickles so we can analyze them later without having to do all the cleaning again.

In [20]:
# Make sure the output folder exists first: to_pickle does not create missing
# directories, so on a fresh checkout the save would otherwise fail.
os.makedirs("./cleaned_input", exist_ok=True)

# Save each cleaned frame as a pickle; pandas infers xz compression from the ".xz" suffix.
bill_df.to_pickle("./cleaned_input/bills.pkl.xz")
people_df.to_pickle("./cleaned_input/people.pkl.xz")
vote_df.to_pickle("./cleaned_input/votes.pkl.xz")