In [34]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import unidecode

In [35]:
wiki_url = 'https://en.wikipedia.org/wiki/List_of_current_members_of_the_United_States_House_of_Representatives'
response = requests.get(wiki_url)
soup = BeautifulSoup(response.text,"lxml")

house_of_reps_text = soup.find('table',attrs={'class':"wikitable sortable plainrowheaders", 'id': "votingmembers"})

In [36]:
representatives = pd.read_html(str(house_of_reps_text))[0]
representatives = representatives.drop(['Party'], axis=1)
representatives = representatives.rename(columns={'Party.1': 'Party', 'Born[4]': 'Age'})


In [37]:
wiki_url = 'https://en.wikipedia.org/wiki/List_of_current_United_States_senators?fbclid=IwAR3fwBqo12plcIm741lp9eP2ALJ4G49Ntch2P9_Dxg8cOmiQKODTrPluMT8'
response = requests.get(wiki_url)
soup = BeautifulSoup(response.text,"lxml")

senators_text = soup.find('table',attrs={'class':"wikitable sortable", 'id': "senators"})
senators = pd.read_html(str(senators_text))[0]
senators = senators.drop(['Party', 'Portrait'], axis=1)
senators = senators.rename(columns={'Party.1': 'Party', 'Residence[5]': 'Residence', 'Born': 'Age'})
# save to csv

In [38]:
# Save to csv 
representatives.to_csv('usa_data/house.csv', index=False)
senators.to_csv('usa_data/senators.csv', index=False)

In [39]:
# Read the CSV file into a DataFrame
df = pd.read_csv('usa_data/house.csv')
df1 = pd.read_csv('usa_data/senators.csv')

# Define a function to clean the "Age" column
def clean_age(age_str):
    if isinstance(age_str, str):
        age_str = re.findall(r'\d+', age_str)  # Remove non-numeric characters
        if age_str:
            return int(age_str[0])  # Convert the cleaned string to an integer
    return None

# Apply the function to the "Age" column
df['Born[3]'] = df['Born[3]'].apply(clean_age)
df1['Age'] = df1['Age'].apply(clean_age)
# rem

# Save the cleaned DataFrame to a new CSV file
df.to_csv('usa_data/house.csv', index=False)
df1.to_csv('usa_data/senators.csv', index=False)

In [53]:
senators= pd.read_csv('usa_data/senators.csv')
representatives = pd.read_csv('usa_data/house.csv')
representatives = representatives[representatives["Member"] != "Vacant"]

In [41]:
def clean_name(name):
    name = name.replace('"', '')
    return unidecode(name)

In [42]:
house_names = representatives['Member'].tolist()
senators_names = senators['Senator'].tolist()

In [43]:
import os
def get_page_content(pol_name):
    params = {
        "action": "query",
        "prop": "extracts",
        "exlimit":"1",
        "explaintext": "1",
        "format": "json",
        "titles": pol_name  
    }
    baseurl = "https://en.wikipedia.org/w/api.php?"
    # requests version
    wikitext = requests.get(baseurl, params=params)
    wikijson = wikitext.json()

    # get page id
    pageid = list(wikijson['query']['pages'].keys())[0]
    #wikitext = wikijson['query']['pages'][pageid]['revisions'][0]['*']
    wikitext = wikijson['query']['pages'][pageid]['extract']
    # use regex to count words in wikitext

    return wikitext

if not os.path.isdir('usa_data/senate'):
    os.mkdir('usa_data/senate')

if not os.path.isdir('usa_data/house'):
    os.mkdir('usa_data/house')

In [44]:
import re
import requests
import string
from unidecode import unidecode

for name in senators_names:
    wikitext = get_page_content(name)
    name = re.sub(r'[".?"]','',name)
    name = clean_name(name)
    with open('usa_data/senate/{}.txt'.format(name), 'w') as f:
        f.write(wikitext)


for name in house_names:
    wikitext = get_page_content(name)
    name = re.sub(r'[".?"]','',name)
    name = clean_name(name)
    with open('usa_data/house/{}.txt'.format(name), 'w') as f:
        f.write(wikitext)

In [56]:
def getLinks(rapper_name):
    params = {
    "action": "query",
    "prop": "revisions",
    "rvprop": "content",
    "format": "json",
    "titles": rapper_name
    
    }
    baseurl = "https://en.wikipedia.org/w/api.php?"
    # requests version
    wikitext = requests.get(baseurl, params=params)
    wikijson = wikitext.json()


    pageid = list(wikijson['query']['pages'].keys())[0]
    wikitext = wikijson['query']['pages'][pageid]['revisions'][0]['*']
    links = re.findall(r'\[\[(.*?)\]\]', wikitext)
    links = [link.split('|')[0] for link in links]
    links = [link for link in links if link in senators_names or link in house_names]
    print(links)
    # keep only unique links
    links = list(set(links))
    return links

In [75]:
house_links = dict()
senate_links = dict()
counter = 0

def getPairs(dict1,Names):
    for pol in Names:
        links = getLinks(pol)
        links = [clean_name(link) for link in links]
        dict1[clean_name(pol)] = []
        dict1[clean_name(pol)] += links

getPairs(house_links,house_names)
getPairs(senate_links,senators_names)

['Kat Cammack', 'Andrew Clyde']
[]
[]
['Katie Britt', 'Kevin McCarthy', 'Steve Scalise', 'Earl Blumenauer']
['Kevin McCarthy', 'Emilia Sykes']
[]
['Kirsten Gillibrand', 'Nancy Pelosi']
['Jared Golden', 'Marie Gluesenkamp Perez', 'Jim Costa', 'Lisa Murkowski', 'Lisa Murkowski', 'Sharice Davids', 'Markwayne Mullin', 'Tom Cole', 'Jared Golden', 'Jim Costa', 'Jared Golden', 'Marie Gluesenkamp Perez', 'Brad Finstad']
['Martin Heinrich', 'Andy Biggs', 'Paul Gosar', 'Juan Ciscomani']
['Kevin McCarthy', 'Joyce Beatty', 'Jasmine Crockett']
['Eric Swalwell', 'Kyrsten Sinema', 'Kyrsten Sinema', 'Kyrsten Sinema', 'Kyrsten Sinema', 'Raúl Grijalva', 'Tom Emmer', 'Garret Graves']
['Kyrsten Sinema', 'Kyrsten Sinema', 'Kyrsten Sinema', 'Kyrsten Sinema', 'Paul Gosar', 'Paul Gosar', 'Abigail Spanberger', 'Pete Stauber']
['Ken Buck', 'Paul Gosar', 'Matt Gaetz', 'Paul Gosar', 'Kevin McCarthy', 'Paul Gosar', 'Jim Jordan', 'Kevin McCarthy', 'Ken Buck', 'Jack Bergman', 'Lisa Blunt Rochester']
['Kevin McCarthy

In [78]:
from unidecode import unidecode

def clean_name(name):
    name = name.replace('"', '')
    return unidecode(name)

house_links_copy = {}
senate_links_copy = {}
#copy congress_dict
# loop through the dictionary and clean the names
for key, value in house_links.items():
    key = clean_name(key)
    house_links_copy[clean_name(key)] = [clean_name(name) for name in value]

for key, value in senate_links.items():
    key = clean_name(key)
    senate_links_copy[clean_name(key)] = [clean_name(name) for name in value]


senate_links = senate_links_copy
house_links = house_links_copy

In [79]:
# save house names to json
import json

with open('usa_data/house_links.json', 'w') as f:
    json.dump(house_links, f)

with open('usa_data/senate_links.json', 'w') as f:
    json.dump(senate_links, f)

In [80]:
# create csv with name and house or senate
all_list = []

# delete file if exists
if os.path.exists('usa_data/all_list.csv'):
    os.remove('usa_data/all_list.csv')

for name in house_names:
    with open('usa_data/all_list.csv', 'a') as f:
        f.write(str(name + "," + "house"+ "\n"))
        f.close()


for name in senators_names:
    with open('usa_data/all_list.csv', 'a') as f:
        f.write(str(name + "," + "senate"+ "\n"))
        f.close()
