In [16]:
import psycopg2
import numpy as np
import pandas as pd

In [17]:
# libpq-style connection string for the local calaccess_processed database.
DSN = "dbname='calaccess_processed' user='postgres' host='localhost' password=''"

# A single connection and cursor; the filer-ID lookup functions further down
# read `cur` from the notebook namespace.
conn = psycopg2.connect(DSN)
cur = conn.cursor()

In [18]:
def get_office(string):
    """Map an office description to a one-letter chamber code.

    The match is a case-insensitive substring test; "SENATE" is checked
    before "ASSEMBLY".

    Args:
        string: Office description, e.g. ``'STATE SENATE 12'``.

    Returns:
        ``'S'`` if the text contains "SENATE", else ``'A'`` if it contains
        "ASSEMBLY".

    Raises:
        ValueError: If neither chamber name appears in ``string``.
    """
    office = string.upper()  # normalise once rather than once per branch
    if 'SENATE' in office:
        return 'S'
    if 'ASSEMBLY' in office:
        return 'A'
    # ValueError is a subclass of Exception, so any existing
    # `except Exception` handler keeps working; the message now names the
    # value that could not be classified.
    raise ValueError('Unrecognized office: {!r}'.format(string))

In [19]:
candidates = pd.read_csv('../data/scraped_candidates.csv', dtype=str)
# Keep only Senate and Assembly. na=False treats a missing office as a
# non-match instead of letting NaN leak into the boolean mask.
is_senate = candidates['office'].str.contains('STATE SENATE', na=False)
is_assembly = candidates['office'].str.contains('ASSEMBLY', na=False)
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the raw data (avoids SettingWithCopyWarning).
candidates = candidates[is_senate | is_assembly].copy()
# The district number is the last space-separated token of the office text
candidates['district'] = candidates['office'].apply(lambda s: int(s.split(' ')[-1]))
candidates['office'] = candidates['office'].apply(get_office)
# The first four characters of the election label are parsed as the year
candidates['year'] = candidates['election'].apply(lambda s: int(s[0:4]))
# We don't have 2000 vote share data
# (copied again because later cells assign to candidates['candidate_id'])
candidates = candidates[candidates['year'] > 2000].copy()

In [20]:
# Count the candidates that have no filer ID yet
print(candidates['candidate_id'].isnull().sum())

564


In [21]:
def candidate_filer_id(row, cursor=None):
    """Fill in a missing filer ID by exact name match in "F501_502_CD".

    A row that already has a ``candidate_id`` is passed through untouched.
    Otherwise ``candidate_name`` is split on commas; the first piece is used
    as the last name and the second as the first name (both stripped) and
    matched against "CAND_NAML" / "CAND_NAMF".

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with
            ``candidate_id`` and ``candidate_name`` entries.
        cursor: Optional DB-API cursor to run the query on. Defaults to the
            notebook-level ``cur``.

    Returns:
        The existing ID when present; the single distinct "FILER_ID" found
        for the name; ``None`` when the name is missing or contains no
        comma; otherwise the original (null) ``candidate_id`` when the
        lookup yields zero or several distinct IDs.
    """
    if pd.isnull(row['candidate_id']):
        name = row['candidate_name']
        # A missing name cannot be split or looked up
        if pd.isnull(name):
            return None
        names = name.split(',')
        if len(names) == 1:
            return None
        lname = names[0].strip()
        fname = names[1].strip()

        query ="""
        SELECT "FILER_ID"
        FROM "F501_502_CD"
        WHERE "CAND_NAML" = %s AND "CAND_NAMF" = %s;"""

        if cursor is None:
            cursor = cur  # fall back to the notebook-level cursor
        cursor.execute(query, (lname, fname))
        # The loop variable is deliberately not called `row`: a list
        # comprehension using that name shadows the argument and, on
        # Python 2, overwrites it before the final return.
        filer_ids = {record[0] for record in cursor.fetchall()}

        # Only accept an unambiguous match
        if len(filer_ids) == 1:
            return filer_ids.pop()

    return row['candidate_id']

In [22]:
# First pass: fill blank IDs row by row using candidate_filer_id
candidates['candidate_id'] = candidates.apply(candidate_filer_id, axis=1)
# Recount the rows that still have no ID
still_missing = candidates['candidate_id'].isnull()
print(still_missing.sum())

117


In [23]:
def candidate_filer_id_2(row, cursor=None):
    """Fill in a missing filer ID by exact name match in "FILERNAME_CD".

    A row that already has a ``candidate_id`` is passed through untouched.
    Otherwise ``candidate_name`` is split on commas; the first piece is used
    as the last name and the second as the first name (both stripped) and
    matched against "NAML" / "NAMF", restricted to rows whose "FILER_TYPE"
    is 'CANDIDATE/OFFICEHOLDER'.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with
            ``candidate_id`` and ``candidate_name`` entries.
        cursor: Optional DB-API cursor to run the query on. Defaults to the
            notebook-level ``cur``.

    Returns:
        The existing ID when present; the single distinct "FILER_ID" found
        for the name; ``None`` when the name is missing or contains no
        comma; otherwise the original (null) ``candidate_id`` when the
        lookup yields zero or several distinct IDs.
    """
    if pd.isnull(row['candidate_id']):
        name = row['candidate_name']
        # A missing name cannot be split or looked up
        if pd.isnull(name):
            return None
        names = name.split(',')
        if len(names) == 1:
            return None
        lname = names[0].strip()
        fname = names[1].strip()

        query ="""
        SELECT DISTINCT "FILER_ID"
        FROM "FILERNAME_CD"
        WHERE "NAML" = %s AND "NAMF" = %s AND "FILER_TYPE" = 'CANDIDATE/OFFICEHOLDER';"""

        if cursor is None:
            cursor = cur  # fall back to the notebook-level cursor
        cursor.execute(query, (lname, fname))
        # The loop variable is deliberately not called `row`: a list
        # comprehension using that name shadows the argument and, on
        # Python 2, overwrites it before the final return.
        filer_ids = {record[0] for record in cursor.fetchall()}

        # Only accept an unambiguous match
        if len(filer_ids) == 1:
            return filer_ids.pop()

    return row['candidate_id']

In [24]:
# Second pass: fill the IDs that are still blank using candidate_filer_id_2
candidates['candidate_id'] = candidates.apply(candidate_filer_id_2, axis=1)
# Recount the rows that still have no ID
still_missing = candidates['candidate_id'].isnull()
print(still_missing.sum())

11


In [25]:
# Write the rows that still lack an ID to a CSV so they can be completed by hand
missing_id = candidates['candidate_id'].isnull()
candidates[missing_id].to_csv('../data/missing_candidates.csv', index=False)

In [26]:
# Read the manually completed list back in, keeping every column as text
manual_candidates = pd.read_csv(
    filepath_or_buffer='../data/manual_candidates.csv',
    dtype='str',
)

In [27]:
# Combine the rows that already have an ID with the manually completed rows
# (the combined frame is written to disk in the next cell).
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and
# removed in 2.0; like append without ignore_index, it keeps the original
# row labels.
# NOTE(review): manual_candidates is read with every column as str, while
# 'district' and 'year' were converted to int above, so those columns may end
# up with mixed types after this step. Confirm whether that matters downstream.
candidates = pd.concat([candidates[candidates['candidate_id'].notnull()], manual_candidates])
# Now how many are missing an ID?
print(len(candidates[candidates['candidate_id'].isnull()]))

2


In [28]:
# Save the combined candidate list. The DataFrame index is written as the
# first column (index=True is the to_csv default, spelled out here).
candidates.to_csv('../data/candidates.csv', index=True)