In [2]:
import requests
from bs4 import BeautifulSoup
import re
import pandas as pd 
import unidecode as uni


# Captures (member name, party abbreviation) pairs from link markup shaped like
# `title="...">Name</a> (R)`. Written as a raw string so the backslash escapes
# reach the regex engine unchanged instead of relying on Python passing invalid
# string escapes through (which warns on recent Python versions).
_MEMBER_PATTERN = re.compile(r'title\=.*\>(\w.*)\<\/a\> \((\w*)\)')


def _scrape_members(url):
    """Download a Congress page and extract its members.

    Parameters
    ----------
    url : str
        Address of the page to fetch.

    Returns
    -------
    tuple[list[tuple[str, str]], list[tuple[str, str]]]
        Two lists of ``(name, party)`` tuples, taken from the first and the
        second ``<table class="multicol">`` on the page respectively. The
        notebook treats the first as the Senate and the second as the House.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    IndexError
        If the page contains fewer than two ``multicol`` tables.
    """
    page = requests.get(url, timeout=30)
    # Fail loudly on 4xx/5xx instead of regex-scanning an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    results = soup.find_all('table', class_='multicol')
    senate = _MEMBER_PATTERN.findall(str(results[0]))
    house = _MEMBER_PATTERN.findall(str(results[1]))
    return senate, house


def _split_name(raw_name):
    """Normalise a scraped member name and return ``(first_name, last_name)``.

    The name is transliterated to ASCII, stripped of apostrophes, hyphens and
    the " Jr." / " III" suffixes, lower-cased, and then split on spaces; the
    first and the last token are returned (middle tokens are discarded).
    """
    name = (
        uni.unidecode(raw_name)
        .replace("'", "")
        .replace("-", "")
        .replace(" Jr.", "")
        .replace(" III", "")
        .lower()
    )
    tokens = name.split(" ")
    return tokens[0], tokens[-1]


URL = 'https://en.wikipedia.org/wiki/115th_United_States_Congress'
senate115, house115 = _scrape_members(URL)
congress115 = senate115 + house115


URL = 'https://en.wikipedia.org/wiki/116th_United_States_Congress'
senate116, house116 = _scrape_members(URL)


congress = senate115 + house115 + senate116 + house116


# Collect plain records first and build the DataFrame once: DataFrame.append
# was removed in pandas 2.0, and appending row by row copies the frame on
# every iteration.
congress_records = []

for member in congress:

    first_name, last_name = _split_name(member[0])

    # Fold the Minnesota "DFL" label into the plain Democratic "D" code.
    party = member[1].replace("DFL", "D")

    congress_records.append({'first_name': first_name, 'last_name': last_name, 'party': party})


# Passing `columns` keeps the expected schema even if nothing was scraped.
congress_df = pd.DataFrame(congress_records, columns = ['first_name','last_name','party'])

# Members appearing in both Congresses (or both chambers) collapse to one row;
# the surviving rows keep their original positional index.
congress_df = congress_df.drop_duplicates()
congress_df.to_csv("party_affiliation.csv", index = False)
print(congress_df)

     first_name last_name party
0          jeff  sessions     R
1        luther   strange     R
2          doug     jones     D
3       richard    shelby     R
4           dan  sullivan     R
...         ...       ...   ...
1089      carol    miller     R
1090      bryan     steil     R
1097        tom   tiffany     R
1102    michael   nicolas     D
1104  jenniffer  gonzalez     R

[666 rows x 3 columns]


In [12]:
# Combine csv files into one
csv_file_list = ['congress115-senate-accounts.csv', 'congress115-house-accounts.csv',
                 'congress116-senate-accounts.csv', 'congress116-house-accounts.csv']
combined_csv = pd.concat([pd.read_csv(f) for f in csv_file_list]).drop_duplicates()
combined_csv.to_csv("congress115-116-accounts.csv", index = False)

# Read twitter handles to list
twitter_handles = combined_csv['Token'].tolist()

# Materialise the (first_name, last_name, party) tuples once instead of
# re-iterating the DataFrame for every handle.
congress_members = list(
    congress_df[['first_name', 'last_name', 'party']].itertuples(index = False, name = None)
)

# Rows are collected in a list and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and is quadratic when used in a loop.
twitter_rows = []

# Loop through twitter handles
for twitter_handle in twitter_handles:

    # Matching is case-insensitive; lower-case the handle once per handle.
    handle_lower = twitter_handle.lower()

    matched_member = None       # (first_name, last_name, party) of the chosen member
    first_name_match = None     # last member whose first name alone was found
    last_name_match = None      # last member whose last name alone was found
    first_name_matches = 0
    last_name_matches = 0

    # Loop through congress names and party affiliations
    for first_name, last_name, party in congress_members:

        has_first = first_name in handle_lower
        has_last = last_name in handle_lower

        # Perfect match: both names occur in the handle, so stop searching
        if has_first and has_last:
            matched_member = (first_name, last_name, party)
            break
        # Count first name match if only first name in twitter handle
        elif has_first:
            first_name_match = (first_name, last_name, party)
            first_name_matches += 1
        # Count last name match if only last name in twitter handle
        elif has_last:
            last_name_match = (first_name, last_name, party)
            last_name_matches += 1

    # Run if no perfect match exists after looping through all names
    if matched_member is None:
        # Accept a partial match only when it is unambiguous; a unique
        # first-name match takes precedence over a unique last-name match
        if first_name_matches == 1:
            matched_member = first_name_match
        elif last_name_matches == 1:
            matched_member = last_name_match
        # Keep the handle without name or party info if multiple or no matches exist
        else:
            matched_member = ('', '', '')

    twitter_rows.append({'user.screen_name': twitter_handle,
                         'first_name': matched_member[0],
                         'last_name': matched_member[1],
                         'party': matched_member[2]})

# Passing `columns` fixes the column order and keeps the schema when there are no rows
twitter_df = pd.DataFrame(twitter_rows, columns = ['user.screen_name','first_name','last_name','party'])

# Write dataframe to csv file
twitter_df.to_csv("twitter_handles.csv", index = False)