In [None]:
import numpy as np
import pandas as pd
import json
import pickle

In [None]:
# Session-specific settings: edit BOTH values whenever this notebook is
# pointed at a different Congress.
congress_session: int = 118  # Congress number; also embedded in the output CSV file name
end_year: int = 2025  # year the session ends; House terms ending after it are ignored

In [2]:
# Load the floor-speech results; the mapping's keys are used as Bioguide IDs below.
# The context manager closes the file handle as soon as unpickling finishes.
# NOTE(review): pickle.load can execute arbitrary code -- only load this file
# if it was produced by a trusted run of the upstream pipeline.
with open('speeches_results/bid_to_tbip_floor_speeches.pkl', 'rb') as speeches_file:
    bid_to_tbip_speeches = pickle.load(speeches_file)
print(len(bid_to_tbip_speeches))

313


In [None]:
# Sorted Bioguide IDs that have a floor-speech score. Dictionary keys are
# already unique, so they can be sorted directly.
bioguide_ids_with_speech = sorted(bid_to_tbip_speeches)
print(len(bioguide_ids_with_speech))

313


In [None]:
# Every legislator with a speech score is kept. The optional caucus filter
# (membership in the union of all_caucus_data_115_ids and
# all_caucus_data_116_ids) is currently disabled, so this is a plain copy.
final_bids_to_consider = list(bioguide_ids_with_speech)
print(len(final_bids_to_consider))

313


### Loading and adding basic biographical information about legislators

In [5]:
# Index the bulk legislator records by Bioguide ID for direct lookup.
# The context manager closes the file handle once parsing is done, and the
# explicit encoding stops non-ASCII text from depending on the platform default.
with open('supporting_data_files/legislator-info-1990-current.json',
          encoding='utf-8') as legis_file:
    legis_info = json.load(legis_file)
legis_id_to_info = {record['id']['bioguide']: record for record in legis_info}
del legis_info  # drop the list itself; the records stay reachable through the dict

In [9]:
# Empty per-legislator lookup tables; a later cell fills them in.
bid_to_name = dict()
bid_to_gender = dict()
bid_to_party = dict()
bid_to_birth_year = dict()
bid_to_seniority = dict()  # count of terms served in the US House
bid_to_state_district = dict()

In [None]:
# Populate the per-legislator lookup tables from the bulk legislator records.
for bid in final_bids_to_consider:
    record = legis_id_to_info[bid]
    ids = record['id']

    # House ('rep') terms that ended by `end_year`, in the order they appear
    # in the record; the last one supplies the state and district.
    house_terms = [term for term in record['terms']
                   if term['type'] == 'rep' and int(term['end'][:4]) <= end_year]
    latest_term = house_terms[-1]

    # Resolve the district BEFORE writing to any lookup table, so a record
    # without one cannot leave the tables with different numbers of entries.
    try:
        district = latest_term['district']
    except KeyError:
        # Only a missing key is expected here; any other error should surface
        # instead of being swallowed by a bare `except`.
        print(bid)
        break

    # Display name: the 'ballotpedia' identifier when present, else 'wikipedia'.
    bid_to_name[bid] = ids['ballotpedia'] if 'ballotpedia' in ids else ids['wikipedia']
    # NOTE(review): party is read from the first term on record, whereas the
    # district comes from the latest House term -- confirm this is intended
    # for members who changed party.
    bid_to_party[bid] = record['terms'][0]['party']
    bid_to_gender[bid] = record['bio']['gender']
    bid_to_birth_year[bid] = record['bio']['birthday'][:4]  # first four characters of the birthday
    bid_to_seniority[bid] = len(house_terms)

    # District label: 'AL' for district 0, otherwise zero-padded to two digits.
    if district == 0:
        district_label = 'AL'
    elif district < 10:
        district_label = '0' + str(district)
    else:
        district_label = str(district)
    bid_to_state_district[bid] = latest_term['state'] + '-' + district_label

### Adding ideal point data

In [21]:
def standardize(x):
  """Center x on its mean and scale by its standard deviation, ignoring NaNs.

  NaN entries are skipped when computing the mean and the standard
  deviation, and stay NaN in the result.
  """
  center = np.nanmean(x)
  spread = np.nanstd(x)
  return (x - center) / spread

In [22]:
# Sign-flipped floor-speech score per legislator; NaN when a legislator has
# no entry in the loaded speech results.
bid_to_speech_tbip = {
    bid: (-1 * bid_to_tbip_speeches[bid] if bid in bid_to_tbip_speeches else np.nan)
    for bid in final_bids_to_consider
}

In [23]:
import os

In [24]:
# --- Vote-based ideal points (currently disabled, kept for reference) ---
# vote_source_dir = 'tbip/data/congs_118/'
# vote_data_dir = os.path.join(vote_source_dir, "clean")
# vote_param_dir = os.path.join(vote_source_dir, "fits/params")
# vote_ideal_points_1d = standardize(np.load(os.path.join(vote_param_dir,
#                                                         "ideal_point_loc.npy")))
# voting_reps_map = list(map(lambda x:x.rstrip(),
#                            open(os.path.join(vote_data_dir, 'rep_map.txt')).readlines()))

# bid_to_stan_vote_tbip = {}
# for bid in final_bids_to_consider:
#     bid_to_stan_vote_tbip[bid] = -1*vote_ideal_points_1d[voting_reps_map.index(bid)]

# Standardize the speech scores in the insertion order of bid_to_speech_tbip.
stan_speech_ideal_points = standardize([*bid_to_speech_tbip.values()])
# stan_tweet_ideal_points = standardize(list(bid_to_twitter_tbip.values()))

# The tweet table is created but stays empty while the tweet code is disabled.
bid_to_stan_tweet_tbip = {}
# Pair each standardized value with the legislator at the same position.
bid_to_stan_speech_tbip = {
    bid: stan_speech_ideal_points[position]
    for position, bid in enumerate(final_bids_to_consider)
    # bid_to_stan_tweet_tbip[bid] = stan_tweet_ideal_points[position]
}

# print(len(bid_to_stan_vote_tbip))
print(len(bid_to_stan_speech_tbip))
# print(len(bid_to_stan_tweet_tbip))

313


In [None]:
# Assemble the output table in one step. Each column takes the values of one
# lookup table in insertion order, so the tables must have been filled in the
# same legislator order.
final_df = pd.DataFrame({
    'Bioguide_ID': list(bid_to_name.keys()),
    'Name': list(bid_to_name.values()),
    'Gender': list(bid_to_gender.values()),
    'Party': list(bid_to_party.values()),
    'Born': list(bid_to_birth_year.values()),
    'Number_of_House_Terms': list(bid_to_seniority.values()),
    'District': list(bid_to_state_district.values()),
    # 'Present_Cong115': list(bid_to_cong_presence_115.values()),
    # 'Present_Cong116': list(bid_to_cong_presence_116.values()),
    'TBIP_Floor_Speeches': list(bid_to_speech_tbip.values()),
    # 'Standardized_Vote_Ideal_Point': list(bid_to_stan_vote_tbip.values()),
    'Standardized_Speech_Ideal_Point': list(bid_to_stan_speech_tbip.values()),
    # 'Standardized_Tweet_Ideal_Point': list(bid_to_stan_tweet_tbip.values()),
})


In [None]:
# Write the assembled table to a CSV named after the Congress number,
# leaving out the row index.
final_df.to_csv(
    path_or_buf=f'legislator_info_and_tbip_congresses_{congress_session}.csv',
    index=False,
)

In [32]:
# Sanity check: print the column names, non-null counts and dtypes of the final table.
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 313 entries, 0 to 312
Data columns (total 8 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Bioguide_ID                      313 non-null    object 
 1   Name                             313 non-null    object 
 2   Gender                           313 non-null    object 
 3   Party                            313 non-null    object 
 4   Born                             313 non-null    object 
 5   Number_of_House_Terms            313 non-null    int64  
 6   TBIP_Floor_Speeches              313 non-null    float64
 7   Standardized_Speech_Ideal_Point  313 non-null    float64
dtypes: float64(2), int64(1), object(5)
memory usage: 19.7+ KB
