In [1]:
import pandas as pd
import numpy as np
import random

# Seed Python's `random` module so the random.sample() draws further down are
# repeatable on a fresh top-to-bottom run (the second draw depends on the
# generator state left by the first).
random.seed(123)

In [2]:
# Load the bioguide table. Later cells read its `birthDate`, `usCongressBioId`
# and `profileText` columns.
df = pd.read_csv("results/bioguide.csv")

## Filtering Birth Year After 1900

In [3]:
# Subsample of members born after 1900.
#
# `birthDate` is read as a string whose first four characters (after stripping
# whitespace) are the year. Missing dates stay NaN and are excluded by the
# filter below, because NaN > 1900 is False. A non-numeric year still raises
# ValueError, as the explicit int() conversion did.
birthdates = df["birthDate"]
birthyear = pd.to_numeric(birthdates.str.strip().str[:4])

# Assign the column by name rather than concatenating a positional list:
#   * the values stay aligned with df's own index, whatever that index is;
#   * re-running the cell overwrites "birth year" instead of appending a second
#     column of the same name (which would turn df["birth year"] into a
#     DataFrame and make the comparison below mask cells instead of rows).
df = df.assign(**{"birth year": birthyear})
df_b1900 = df[df["birth year"] > 1900]

In [4]:
# (rows, columns) of the born-after-1900 subset.
df_b1900.shape

(3125, 22)

In [5]:
# Draw 100 ids at random, then look up the first bio recorded for each id.
ids = df_b1900["usCongressBioId"].tolist()
sample_ids = random.sample(ids, 100)

sample_bios = []
for bio_id in sample_ids:
    matching_rows = df_b1900[df_b1900["usCongressBioId"] == bio_id]
    sample_bios.append(matching_rows.iloc[0]["profileText"])

In [6]:
# Two-column table pairing each sampled id with its bio text.
df_sample_bio = pd.DataFrame({'id': sample_ids, 'bio': sample_bios})

In [7]:
# Preview the sampled (id, bio) pairs.
df_sample_bio

Unnamed: 0,id,bio
0,C000312,A Representative from Texas; born in Washingto...
1,B001259,"A Representative from Iowa; born in Grinnell, ..."
2,S000844,A Representative from Maryland; born in Glen R...
3,N000033,"A Senator from Wisconsin; born in Clear Lake, ..."
4,F000088,A Representative from New York; born in Newbur...
...,...,...
95,S001191,A Senator and a Representative from Arizona; b...
96,S000068,"A Senator from Tennessee; born in Memphis, She..."
97,P000555,"A Representative from Ohio; born in Warren, Tr..."
98,F000085,A Representative from New Mexico; born in Spri...


In [8]:
# Persist the born-after-1900 subset and the bio sample (row index not written).
for frame, out_path in (
    (df_b1900, "results/bioguide_a1900.csv"),
    (df_sample_bio, "results/sample_bios.csv"),
):
    frame.to_csv(out_path, index=False)

## Filtering roll calls from the 102nd Congress onward

### Small sample: House roll calls of the 102nd Congress

In [9]:
# Vote records for one congress; the cells below use its `rollnumber` and
# `icpsr` columns. Presumably the House of the 102nd Congress, going by the
# file name - confirm against the data source.
df_voteh102 = pd.read_csv("H102_votes.csv")

In [10]:
# Is the set of voting members identical from one roll call to the next?
# Walk the roll calls in order and count how often the member set (icpsr ids)
# differs from the one seen on the previous roll call.

grouped = df_voteh102.groupby(df_voteh102["rollnumber"])
member = set(grouped.get_group(1)["icpsr"])  # baseline: roll call number 1
different_set = 0
for _, roll_votes in grouped:
    current_members = set(roll_votes["icpsr"])
    if current_members != member:
        different_set += 1
    member = current_members
print(different_set)
    

147


In [11]:
# Union of member ids over every roll call, seeded with roll call number 1.

attendence = set(grouped.get_group(1).icpsr)
for _, roll_votes in grouped:
    attendence |= set(roll_votes.icpsr)
len(attendence)

442

### Full sample: all Congresses from the 102nd onward

In [12]:
# Load the complete vote table; it is narrowed to congress >= 102 in the next cell.
df_vote = pd.read_csv("HSall_votes.csv")

In [13]:
# Keep only votes cast in the 102nd Congress or later (102 itself is included).
df_vote = df_vote.loc[df_vote['congress'].ge(102)]

In [14]:
# First rows of the filtered vote table.
df_vote.head()

Unnamed: 0,congress,chamber,rollnumber,icpsr,cast_code,prob
15702267,102,House,1,633.0,1.0,99.5
15702268,102,House,1,1077.0,1.0,100.0
15702269,102,House,1,1087.0,6.0,99.6
15702270,102,House,1,2009.0,6.0,74.1
15702271,102,House,1,2605.0,1.0,100.0


In [15]:
def find_attendance(df):
    """Return the set of member ids that appear in at least one roll call.

    Parameters
    ----------
    df : pandas.DataFrame
        Vote records with at least a ``rollnumber`` and an ``icpsr`` column.

    Returns
    -------
    set
        Distinct ``icpsr`` values among rows that have both a roll number and
        a member id. Empty when ``df`` contains no such rows.
    """
    # The union over all roll calls is just the distinct ids of the rows that
    # belong to some roll call, so no per-group loop is needed and nothing
    # depends on a roll call numbered 1 being present.
    # Rows without a roll number are skipped (groupby would ignore them too);
    # rows without a member id are skipped because NaN is not a usable id.
    has_vote = df["rollnumber"].notna() & df["icpsr"].notna()
    return set(df.loc[has_vote, "icpsr"])

In [16]:
# One member set per (congress, chamber) pair, keyed like "House102".
congresses = df_vote.groupby(['congress', 'chamber'])
attendance_lists = {
    str(chamber) + str(congress_number): find_attendance(chamber_votes)
    for (congress_number, chamber), chamber_votes in congresses
}


In [17]:
# Flatten {congress key -> member set} into two parallel columns,
# one row per (congress, member) pair; member ids are cast to int.
congress_name = [
    key for key, roster in attendance_lists.items() for _ in roster
]
attendees = [
    int(member_id) for roster in attendance_lists.values() for member_id in roster
]

df_attendance = pd.DataFrame({'congress': congress_name, 'attendee': attendees})
df_attendance.head()

Unnamed: 0,congress,attendee
0,House102,14402
1,House102,14404
2,House102,14405
3,House102,14407
4,House102,14410


In [18]:
# Spot-check: the rows recorded under the "House102" key.
df_attendance[df_attendance['congress']=='House102']

Unnamed: 0,congress,attendee
0,House102,14402
1,House102,14404
2,House102,14405
3,House102,14407
4,House102,14410
...,...,...
437,House102,14274
438,House102,14277
439,House102,14280
440,House102,14288


### Match ICPSR ID to Bioguide ID

#### Preparing crosswalk

In [19]:
# Member table; only its `icpsr` and `bioguide_id` columns are used below.
members = pd.read_csv('HSall_members.csv')

In [20]:
# Keep just the two id columns needed to translate ICPSR ids into Bioguide ids.
crosswalk = members[['icpsr', 'bioguide_id']]
crosswalk.head()

Unnamed: 0,icpsr,bioguide_id
0,99869,
1,379,B000084
2,4854,J000017
3,6071,M000234
4,1538,C000187


In [21]:
# Remove exact repeats of an (icpsr, bioguide_id) pair. Rows whose bioguide_id
# is missing are kept.
# NOTE(review): this does not guarantee that `icpsr` is unique; if one icpsr maps
# to several bioguide ids, the later join on icpsr would repeat rows - confirm.
crosswalk = crosswalk.drop_duplicates()

In [22]:
# Save the crosswalk (row index not written).
# NOTE(review): the file name spells "crosswalk" as "corsswalk"; left unchanged
# here because other code may read this exact path - confirm before renaming.
crosswalk.to_csv('results/icpsr_bioguide_id_corsswalk.csv', index=False)

#### Match back to bioguide ID

In [23]:
# Attach each attendee's Bioguide id through the ICPSR -> Bioguide crosswalk
# (left join: attendees without a crosswalk entry get a missing bioguide_id).
# Selecting the two base columns first keeps the cell re-runnable: joining a
# frame that already carries `bioguide_id` would raise on the overlapping name.
df_attendance = df_attendance[['congress', 'attendee']].join(
    crosswalk.set_index('icpsr'), on='attendee'
)

In [24]:
# Check that the bioguide_id column was attached.
df_attendance.head()

Unnamed: 0,congress,attendee,bioguide_id
0,House102,14402,A000214
1,House102,14404,B000153
2,House102,14405,B000318
3,House102,14407,B000619
4,House102,14410,C000618


### Using attendance to filter politicians

In [25]:
# Reduce attendance to one row per legislator, then pull in the bioguide record.
# Attendees without a Bioguide id are dropped first: otherwise all of them
# collapse into a single NaN-keyed row that has no bio and would be exported
# as if it were a legislator.
df_attendance = (
    df_attendance
    .dropna(subset=['bioguide_id'])
    .drop_duplicates(subset=['bioguide_id'])
)

# Left join on the Bioguide id, then rename the id columns and drop the
# per-congress key, which no longer applies once rows are one-per-person.
df_after102nd = (
    df_attendance
    .join(df.set_index('usCongressBioId'), on='bioguide_id')
    .rename(columns={'bioguide_id': 'usCongressBioId', 'attendee': 'icpsr_id'})
    .drop(columns=['congress'])
)

In [26]:
# Save the legislators seen in roll calls from the 102nd Congress on (row index not written).
df_after102nd.to_csv("results/bioguide_after102nd.csv", index=False)

In [27]:
# Draw 100 legislators from the post-102nd table and collect their bios.
# Missing ids are removed before sampling: a sampled NaN never equals any row,
# so looking up its bio would fail. With no missing ids the draw is unchanged.
ids = df_after102nd['usCongressBioId'].dropna().tolist()
sample_ids = random.sample(ids, 100)

# First profileText recorded for each id, indexed by id for direct lookup.
bio_by_id = (
    df_after102nd
    .drop_duplicates(subset='usCongressBioId')
    .set_index('usCongressBioId')['profileText']
)
sample_bios = bio_by_id.loc[sample_ids].tolist()

df_sample_bio_102nd = pd.DataFrame(list(zip(sample_ids, sample_bios)), columns=['id', 'bio'])
df_sample_bio_102nd

Unnamed: 0,id,bio
0,S000250,"a Representative from Texas; born in Waco, McL..."
1,H001040,A Representative from Illinois; born in Galesb...
2,C001056,"A Senator from Texas; born in Houston, Texas, ..."
3,G000576,A Representative from Wisconsin; born in Milwa...
4,F000246,a Representative from Texas; born in Pittsfiel...
...,...,...
95,R000566,"A Representative from Kansas; born in Wichita,..."
96,G000408,(elected under the name Enid G. Waldholtz in t...
97,B001172,"A Representative from Texas; born in Asherton,..."
98,Y000062,A Representative from Kentucky; born in Louisv...


In [28]:
# Save the sampled (id, bio) pairs (row index not written).
df_sample_bio_102nd.to_csv('results/sample_bios_after102nd.csv', index=False)