In [1]:
import pandas as pd
import numpy as np
import random
import re

# Seed Python's `random` module once, up front, so the random.sample() draws
# made later in this notebook repeat on a fresh top-to-bottom run.
random.seed(123)

In [2]:
# Load the bioguide table; later cells read its birthDate, usCongressBioId
# and profileText columns.
df = pd.read_csv("results/bioguide.csv")

## Filtering Birth Year After 1900

In [3]:
# subsample of birth after 1900

# The year is taken from the first four characters of the (stripped) birthDate
# string; rows with a missing birthDate keep NaN as their birth year.
# The values are computed as a Series, so they stay aligned with df by index
# label instead of relying on the labels being 0..n-1 list positions.
birth_year_text = df["birthDate"].str.strip().str[:4]

# assign() overwrites an existing "birth year" column rather than appending a
# second one, so re-running this cell leaves df unchanged.
# to_numeric raises on a prefix that is not a number.
df = df.assign(**{"birth year": pd.to_numeric(birth_year_text)})

# NaN > 1900 is False, so members without a birth date are excluded here.
df_b1900 = df[df["birth year"] > 1900]

In [4]:
# Sanity check: (rows, columns) of the subset born after 1900.
df_b1900.shape

(3125, 22)

In [5]:
# Draw 100 ids at random (without replacement), then fetch the profile text of
# the first row matching each drawn id.
ids = df_b1900["usCongressBioId"].tolist()
sample_ids = random.sample(ids, 100)
sample_bios = [
    df_b1900.loc[df_b1900["usCongressBioId"] == bio_id, "profileText"].iloc[0]
    for bio_id in sample_ids
]

In [6]:
# Pair every sampled id with its bio: one row per sampled member.
df_sample_bio = pd.DataFrame({'id': sample_ids, 'bio': sample_bios})

In [7]:
# Display the sampled (id, bio) pairs for a quick visual check.
df_sample_bio

Unnamed: 0,id,bio
0,C000312,A Representative from Texas; born in Washingto...
1,B001259,"A Representative from Iowa; born in Grinnell, ..."
2,S000844,A Representative from Maryland; born in Glen R...
3,N000033,"A Senator from Wisconsin; born in Clear Lake, ..."
4,F000088,A Representative from New York; born in Newbur...
...,...,...
95,S001191,A Senator and a Representative from Arizona; b...
96,S000068,"A Senator from Tennessee; born in Memphis, She..."
97,P000555,"A Representative from Ohio; born in Warren, Tr..."
98,F000085,A Representative from New Mexico; born in Spri...


In [8]:
# Persist both results without the pandas index column:
# the full born-after-1900 subset ...
df_b1900.to_csv("results/bioguide_a1900.csv", index=False)
# ... and the 100-row random sample of (id, bio) pairs.
df_sample_bio.to_csv("results/sample_bios.csv", index=False)

## Filtering roll call after the 102nd Congress

### Small sample: House roll calls of the 102nd Congress

In [9]:
# Load the vote records of one chamber/congress; the cells below use its
# rollnumber and icpsr columns.
df_voteh102 = pd.read_csv("H102_votes.csv")

In [10]:
# Are the representatives the same for each roll call?
# Walk through the roll calls in group order and count how many times the set
# of icpsr ids differs from the set recorded on the previous roll call.

grouped = df_voteh102.groupby(df_voteh102.rollnumber)
member = set(grouped.get_group(1).icpsr)  # baseline: ids recorded on roll call 1
different_set = 0
for roll_id, roll_votes in grouped:
    present = set(roll_votes.icpsr)
    if present != member:
        different_set += 1
    member = present
print(different_set)
    

147


In [11]:
# find the union for all rolls
# (every icpsr id that shows up on at least one roll call)

attendence = set(grouped.get_group(1).icpsr)
for roll_id, roll_votes in grouped:
    attendence.update(roll_votes.icpsr)
len(attendence)

442

### Full sample of congresses after the 102nd (currently filtered to the 109th onward)

In [12]:
# Import the bulk vote data from Voteview (all congresses, both chambers);
# the cells below use its congress, chamber, rollnumber and icpsr columns.
df_vote = pd.read_csv("HSall_votes.csv")

In [13]:
# Keep the votes from the 109th congress onward (the 109th itself is included).
# Change the threshold below to move the cut-off.
df_vote = df_vote.loc[df_vote['congress'] >= 109]

In [14]:
# Peek at the first rows of the filtered vote table.
df_vote.head()

Unnamed: 0,congress,chamber,rollnumber,icpsr,cast_code,prob
19619937,109,House,1,2605.0,6.0,100.0
19619938,109,House,1,10713.0,6.0,100.0
19619939,109,House,1,12036.0,6.0,100.0
19619940,109,House,1,13035.0,6.0,100.0
19619941,109,House,1,13047.0,1.0,100.0


In [15]:
# This function finds the attendance of the congress members
# by finding the union of the congress members from the votes of all roll calls of that congress
def find_attendance(df):
    """Return the set of ``icpsr`` ids that appear on at least one roll call.

    Parameters
    ----------
    df : pandas.DataFrame
        Vote records with at least a ``rollnumber`` and an ``icpsr`` column.

    Returns
    -------
    set
        Every ``icpsr`` value found on a row whose ``rollnumber`` is not
        missing. The set is empty when ``df`` has no such rows.

    Notes
    -----
    The result is the union of the per-roll-call member sets. It does not
    require a roll call numbered 1 to exist, so it also works on frames whose
    roll numbers start elsewhere and on empty frames.
    """
    # Grouping by rollnumber skips rows with a missing roll number, so the
    # union over all groups equals the ids on rows that have a roll number.
    on_a_roll_call = df.loc[df["rollnumber"].notna(), "icpsr"]
    return set(on_a_roll_call)

In [16]:
# Apply find_attendance to every (congress, chamber) pair.
# Keys are the chamber name followed by the congress number, e.g. "House109".
congresses = df_vote.groupby(['congress', 'chamber'])
attendance_lists = {
    str(chamber) + str(congress_number): find_attendance(congress_votes)
    for (congress_number, chamber), congress_votes in congresses
}


In [17]:
# Flatten the {congress key: set of attendees} dictionary into a DataFrame
# with one (congress, attendee) row per member of each congress.
attendance_rows = [
    (congress_key, int(attendee))
    for congress_key, present in attendance_lists.items()
    for attendee in present
]

df_attendance = pd.DataFrame(attendance_rows, columns=['congress', 'attendee'])
df_attendance.head()

Unnamed: 0,congress,attendee
0,House109,20501
1,House109,20502
2,House109,20503
3,House109,20504
4,House109,20505


### Match ICPSR ID to Bioguide ID

#### Preparing crosswalk

In [18]:
# Import the member table from Voteview; it carries both the icpsr id and the
# bioguide_id used to build the crosswalk below.
members = pd.read_csv('HSall_members.csv')

In [19]:
# Keep only the two id columns needed to translate icpsr ids into bioguide ids.
crosswalk = members[['icpsr', 'bioguide_id']]
crosswalk.head()

Unnamed: 0,icpsr,bioguide_id
0,99869,
1,379,B000084
2,4854,J000017
3,6071,M000234
4,1538,C000187


In [20]:
# Collapse repeated (icpsr, bioguide_id) pairs so each distinct pair appears once.
crosswalk = crosswalk.drop_duplicates()

In [21]:
# Save the de-duplicated crosswalk (without the pandas index column).
crosswalk.to_csv('results/icpsr_bioguide_id_crosswalk.csv', index=False)

#### Match back to bioguide ID

In [22]:
# Look up each attendee's bioguide_id: the crosswalk is indexed by icpsr and
# joined onto the attendance table through its 'attendee' column.
# NOTE(review): the result is assigned back to df_attendance, so a second run
# of this cell joins onto a frame that already has a bioguide_id column --
# presumably that fails on the overlapping name; re-run from the cell that
# builds df_attendance instead. Verify.
df_attendance = df_attendance.join(crosswalk.set_index('icpsr'), on='attendee')

In [23]:
# Check that the bioguide_id column was attached.
df_attendance.head()

Unnamed: 0,congress,attendee,bioguide_id
0,House109,20501,C001059
1,House109,20502,S001158
2,House109,20503,M001155
3,House109,20504,W000797
4,House109,20505,P000591


### Using attendance to filter politicians

In [24]:
# Keep one attendance row per bioguide_id (the first one encountered), attach
# that member's bioguide record, and rename the id columns to their final names.
# The congress label is dropped because it only describes the row that was kept.
df_attendance = df_attendance.drop_duplicates(subset=['bioguide_id'])
df_filtered = (
    df_attendance
    .join(df.set_index('usCongressBioId'), on='bioguide_id')
    .rename(columns={'bioguide_id': 'usCongressBioId', 'attendee': 'icpsr_id'})
    .drop(columns=['congress'])
)

In [25]:
# Sanity check: (rows, columns) of the filtered politician table.
df_filtered.shape

(1312, 23)

In [26]:
# Save the filtered politician table (without the pandas index column).
# Note this happens before the HTML clean-up of profileText further down.
df_filtered.to_csv("results/bioguide_after109th.csv", index=False)

In [27]:
# Matches one tag-like span: '<', one or more characters other than '>', then '>'.
regex = re.compile(r'<[^>]+>')

def remove_html(string):
    """Strip tag-like ``<...>`` spans from a string.

    Parameters
    ----------
    string : object
        The value to clean. Normally a ``str``; anything else (for example a
        missing value such as NaN or None) is passed through untouched.

    Returns
    -------
    object
        ``string`` with every ``<...>`` span removed when it is a ``str``
        (including ``str`` subclasses); otherwise the input value itself.
    """
    # Non-strings are returned as-is so missing or non-text cells keep their
    # value instead of being turned into None.
    if not isinstance(string, str):
        return string
    return regex.sub('', string)

In [28]:
# Clean every bio element-wise with remove_html.
df_filtered['profileText'] = df_filtered['profileText'].map(remove_html)

In [29]:
# a particularly weird cell
# Manually overwrite the profileText stored under row label 5484 with a
# hand-written plain-text bio.
# NOTE(review): 5484 is an index label, not a member id -- presumably it was
# read off an earlier run. Confirm it still points at the intended row after
# any change to the upstream data or row order.
df_filtered.at[5484, 'profileText'] = 'a Senator from Nebraska; born in Nebraska City, Nebr., August 19, 1964; graduated Westside High School, Omaha, Nebr., 1982; B.A., biology, University of Chicago, 1986; M.B.A., marketing and finance, University of Chicago, 1991; stock trading company executive and board member; co-owner of Chicago Cubs baseball team; unsuccessful candidate for the United States Senate in 2006; governor of Nebraska 2015-2023; appointed as a Republican to the United States Senate on January 12, 2023, to fill the vacancy caused by the resignation of Benjamin Sasse; took the oath of office on January 23, 2023.'

In [30]:
# Reduce the table to the (id, bio) pair used by the exports below.
df_bio = (
    df_filtered
    .loc[:, ['usCongressBioId', 'profileText']]
    .rename(columns={'usCongressBioId': 'id', 'profileText': 'bio'})
)

In [31]:
# Save all cleaned (id, bio) pairs (without the pandas index column).
df_bio.to_csv('results/full_bios_after109th.csv', index=False)

In [32]:
# Draw 100 ids at random (without replacement) from the filtered politicians,
# fetch the cleaned bio of the first row matching each id, and tabulate them.
ids = df_filtered['usCongressBioId'].tolist()
sample_ids = random.sample(ids, 100)
sample_bios = [
    df_filtered.loc[df_filtered['usCongressBioId'] == bio_id, 'profileText'].iloc[0]
    for bio_id in sample_ids
]
df_sample_bio_filtered = pd.DataFrame({'id': sample_ids, 'bio': sample_bios})
df_sample_bio_filtered

Unnamed: 0,id,bio
0,E000293,A Representative from Connecticut; born in Oak...
1,K000394,A Representative from New Jersey; born in Bost...
2,T000478,A Representative from New York; born in New Ha...
3,M000523,A Representative from Georgia; born in Atlanta...
4,F000479,"a Senator from Pennsylvania; born in Reading, ..."
...,...,...
95,T000467,a Representative from Pennsylvania; born in Ho...
96,C001117,A Representative from Illinois; born in Dublin...
97,A000378,A Representative from Iowa; born in Grand Rapi...
98,M001210,a Representative from North Carolina; born in ...


In [33]:
# Save the 100-row random sample of cleaned (id, bio) pairs (without the pandas index column).
df_sample_bio_filtered.to_csv('results/sample_bios_after109th.csv', index=False)