In [1]:
import numpy as np
import os
from tqdm import tqdm
from scipy.stats import pearsonr, spearmanr
import scipy.sparse as sparse
from scipy.stats import bernoulli, poisson
import analysis_utils_mine as utils

import plotly.graph_objects as go
import chart_studio
import json
import pandas as pd
import ast
from datetime import datetime
import torch
import pandas as pd
from datetime import datetime, timedelta
import pickle

import matplotlib.pyplot as plt

In [5]:
# Make sure to update congress session value
# and the year that it ends

congress_session = 118

cong_sen = "congress" # "congress" or "senate"

annotation = "congs" if cong_sen == "congress" else "senate"
chamber = "house" if cong_sen == "congress" else "senate"

In [6]:
project_dir = os.path.abspath(f'../data/floor_speeches_{annotation}_{congress_session}/') 
fit_dir = os.path.join(project_dir, "mallet_fits_removed_procedural_speeches")

# Load TBIP data.
data_dir = os.path.join(project_dir, "clean_removing_procedural")
(counts, vocabulary, author_indices, 
 author_map) = utils.load_text_data(data_dir)

# Load TBIP parameters.
param_dir = os.path.join(project_dir, "tbip-pytorch-fits-og-rem-procedural-speeches-k50-init-mallet/params/")
(document_loc, document_scale, objective_topic_loc, objective_topic_scale, 
 ideological_topic_loc, ideological_topic_scale, ideal_point_loc, 
 ideal_point_scale) = utils.load_tbip_parameters(param_dir)

# Compute means from variational parameters
document_mean = np.exp(document_loc + document_scale ** 2 / 2)
objective_topic_mean = np.exp(objective_topic_loc + 
                              objective_topic_scale ** 2 / 2)
ideological_topic_mean = ideological_topic_loc
ideal_point_mean = ideal_point_loc

In [7]:
print(document_mean.shape)
print(objective_topic_mean.shape)
print(ideological_topic_mean.shape)
print(ideal_point_mean.shape)
#ideal_point_mean = -1*ideal_point_mean
#ideological_topic_loc = ideological_topic_loc * -1

(24691, 50)
(50, 5152)
(50, 5152)
(313,)


In [8]:
utils.print_topics(objective_topic_loc, 
                   objective_topic_scale, 
                   ideological_topic_loc, 
                   ideological_topic_scale, 
                   vocabulary, 
                   20)

['Ideal Point = -1.0, Topic = 0: branch, article, citizens, constitution, executive branch, executive, american, american citizens, law, powers, election, elections, constitutional, authority, republic, citizen, court, person, founders, citizenship'
 'Ideal Point = -0.5, Topic = 0: branch, constitution, citizens, executive, court, executive branch, law, elections, election, article, constitutional, american, state, powers, authority, american citizens, power, supreme, laws, citizen'
 'Ideal Point = 0.0, Topic = 0: court, voting, supreme, elections, constitution, state, supreme court, branch, election, law, citizens, executive, constitutional, federal, elected, power, laws, act, executive branch, district'
 'Ideal Point = 0.5, Topic = 0: court, voting, supreme, supreme court, democracy, voting rights, state, elected, residents, local, act, elections, federal, voters, district, decision, rights, election, laws, power'
 'Ideal Point = 1.0, Topic = 0: residents, voting rights, voting, demo

In [9]:
utils.print_topics(objective_topic_loc, 
                   objective_topic_scale, 
                   ideological_topic_loc, 
                   ideological_topic_scale, 
                   vocabulary, 
                   50)

['Ideal Point = -1.0, Topic = 0: branch, article, citizens, constitution, executive branch, executive, american, american citizens, law, powers, election, elections, constitutional, authority, republic, citizen, court, person, founders, citizenship, section, state, power, legislative branch, save, integrity, laws, restore, remove, safeguard, noncitizens, federal, separation, administrative, representatives, rolls, legislative, unconstitutional, body, american citizen, register, question, elected representatives, clearly, passed, supreme, capital, supreme court, government, judicial'
 'Ideal Point = -0.5, Topic = 0: branch, constitution, citizens, executive, court, executive branch, law, elections, election, article, constitutional, american, state, powers, authority, american citizens, power, supreme, laws, citizen, federal, supreme court, voting, legislative, act, citizenship, representatives, republic, elected, district, states, section, noncitizens, unconstitutional, government, fou

In [11]:
%%capture cap --no-stderr
utils.print_topics(objective_topic_loc, 
                   objective_topic_scale, 
                   ideological_topic_loc, 
                   ideological_topic_scale, 
                   vocabulary, 
                   20)
with open('../../speeches_results/topics.txt', 'w') as f:
    f.write(cap.stdout)

In [12]:
legis_info = json.load(open(os.path.join(project_dir, 'legislator-info-1990-current.json')))
print(len(legis_info))

1967


In [13]:
legis_id_to_info = {}
for x in legis_info:
    legis_id_to_info[x['id']['bioguide']] = x

In [14]:
parties = []
for a in author_map:
    x = legis_id_to_info[a]
    parties.append(x['terms'][-1]['party'])
print(len(parties))

313


In [15]:
names = []
for a in author_map:
    x = legis_id_to_info[a]
    names.append(x['id']['wikipedia'])
print(len(names))

313


In [16]:
colors = np.array(["steelblue" if p=='Democrat' else "crimson" for p in parties])
def get_ideological_topics(objective_topic_loc, 
                           objective_topic_scale,
                           ideological_topic_loc, 
                           ideological_topic_scale,
                           ideal_point):
    ideological_topic_mean = np.exp(objective_topic_loc +
                              ideal_point * ideological_topic_loc +
                              (objective_topic_scale ** 2 + 
                               ideal_point ** 2 * 
                               ideological_topic_scale ** 2) / 2)
    return ideological_topic_mean

In [17]:
ideal_points = ideal_point_mean
author_map = np.array(author_map)
print(author_map.shape)
ideal_points = np.array(ideal_points)
print(ideal_points.shape)

(313,)
(313,)


In [18]:
bid_to_tbip = {}
for i in range(len(author_map)):
    bid_to_tbip[author_map[i]] = ideal_points[i]
print(len(bid_to_tbip))

313


In [19]:
pickle.dump(bid_to_tbip, open('../../speeches_results/bid_to_tbip_floor_speeches.pkl', 'wb'))

In [18]:
name = "Liz Cheney"
print('Overall Idea Point of ' + name + ' = ' + str(bid_to_tbip[author_map[names.index(name)]]))


ValueError: 'Liz Cheney' is not in list

In [21]:
name = "Elise Stefanik"
print('Overall Idea Point of ' + name + ' = ' + str(bid_to_tbip[author_map[names.index(name)]]))


ValueError: 'Elise Stefanik' is not in list

In [20]:
import chart_studio
chart_studio.tools.set_credentials_file(username='pg96', api_key='NDcsdaKUnDc6rrurwLmg')
import chart_studio.plotly as py

In [22]:
def save_interactive(topic_number):
    topic_name = 'Topic ' + str(topic_number)

    fig = go.Figure(layout = go.Layout(
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)'))
    x_min = -1.0
    x_max = 1.0
    diff = 0.05
    offset = -x_min / diff
    scatterplot_location = -0.53

    ideal_point_dict = {}
    for i in range(int((x_max - x_min) / diff) + 1):
        ideal_point_dict[i] = (i - offset) * diff

    # Add black line.
    fig.add_trace(go.Scatter(x=[x_min, x_max], 
                             y = [scatterplot_location, scatterplot_location],
                             line=dict(color="black", width=1),
                             marker=dict(size=1),
                             hoverinfo='skip'))

    # Add trace for all representatives
    fig.add_trace(go.Scatter(
        mode="markers",
        x=ideal_points, 
        y=scatterplot_location * np.ones(len(ideal_points)), 
        text=[x for x in names], 
        hoverinfo="x+text",
        marker=dict(color=colors, size=8)))

    num_top_words = 10

    # Add bar plots, one for each ideal point
    for step in np.arange(len(ideal_point_dict.keys())):
        ideological_topic_mean = get_ideological_topics(objective_topic_loc, 
                                   objective_topic_scale,
                                   ideological_topic_loc, 
                                   ideological_topic_scale,
                                   ideal_point_dict[step])
        topic_intensities = ideological_topic_mean[topic_number]
        top_topic_intensities = np.sort(topic_intensities)[-num_top_words:]
        top_topic_words = vocabulary[np.argsort(-topic_intensities)[:num_top_words]][::-1]
        fig.add_trace(
            go.Bar(
                visible=False,
                x=top_topic_intensities * 5,
                orientation='h',
                text=top_topic_words,
                textposition='outside',
                marker_color='rgb(175,122,197)',
                y0=0.529,
                dy=0.214,
                base=-0.3,
                hoverinfo='skip',
            ))

    fig.update_layout(go.Layout(
    yaxis = {'showgrid': False,
             'showline': False,
             'zeroline': False,
            'showticklabels': False,
            'fixedrange': True}),
    xaxis = {'showgrid': False,
             'showline': False,
             'zeroline': False,
            'showticklabels': False,
            'fixedrange': True})
    fig.update_yaxes(range=[-0.60, 2.5])
    fig.update_xaxes(range=[x_min - 0.01, x_max + 0.02])

    # Set default trace
    fig.data[len(fig.data) // 2].visible = True

    # Create and add slider
    steps = []
    for i in range(2, len(fig.data)):
        step = dict(
            method="update",
            args=[{"visible": [True, True] + [False] * (len(fig.data) - 2)}],  
            label="{:.2f}".format(ideal_point_dict[i - 2]),
        )
        step["args"][0]["visible"][i] = True  # Toggle i'th trace to "visible"
        steps.append(step)

    sliders = [dict(
        active=(len(fig.data) // 2 - 1),
        currentvalue={"prefix": "Ideal Point: "},
        y=0.35,
        steps=steps)]

    fig.update_layout(sliders=sliders, showlegend=False, 
                      title={'text': "Word Usage as a Function of Ideal Point (Topic: {})".format(topic_name),
                             'x': 0.5,
                             'y': 0.9},
                      xaxis_title='Representative ideal points (hover to see names)',
                      annotations = [
                                    dict(xref='paper',
                                            yref='paper',
                                            x=0.5, y=0.255,
                                            font={'size': 14},
                                            showarrow=False,
                                            text ='Move slider to change ideal point')
                      ])
    #fig.show()
    fig.write_html("../../speeches_results/interactive_htmls/topic" + str(topic_ind) + "_interactive.html")

In [23]:
num_topics = objective_topic_mean.shape[0]
print(num_topics)

50


In [25]:
for topic_ind in range(num_topics):
    save_interactive(topic_ind)

In [26]:
print(document_mean.shape)

(24691, 50)


In [27]:

speeches_data = pd.read_csv(os.path.join(project_dir, f"finalized_tbip_speech_set_raw_original_data_floor_speeches_{chamber}_after_removing_procedural_speeches.csv"))
print(speeches_data.info())
speeches_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24691 entries, 0 to 24690
Data columns (total 5 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   Speaker_Bioguide_ID  24691 non-null  object
 1   Speaker_Name         24691 non-null  object
 2   Text                 24691 non-null  object
 3   Date                 24691 non-null  object
 4   Legislative Body     24691 non-null  object
dtypes: object(5)
memory usage: 964.6+ KB
None


Unnamed: 0,Speaker_Bioguide_ID,Speaker_Name,Text,Date,Legislative Body
0,D000216,Ms. DeLAURO,". Mr. Chair, I yield myself such time as I m...",2023-11-14,House
1,D000216,Ms. DeLAURO,". Mr. Chair, I yield 2 minutes to the gentle...",2023-11-14,House
2,W000797,Ms. WASSERMAN SCHULTZ,". Mr. Chairman, I thank the gentlewoman for ...",2023-11-14,House
3,S001214,Mr. STEUBE,". Mr. Chair, our Nation's foster care system...",2023-11-14,House
4,F000462,Ms. LOIS FRANKEL of Florida,". My, my, my. In the spending bill that \nmo...",2023-11-14,House


In [30]:
final_legis_info_and_tbip_data = pd.read_csv(f'../../legislator_info_and_tbip_{cong_sen}_{congress_session}.csv')
# NOTE: while above file is ultimately created using the ideal point estimates... we only create these other files
# as resulting files for legislators ultimately retained in the final file containing everything used for research.

included_bid_to_tbip = dict(zip(final_legis_info_and_tbip_data.Bioguide_ID, 
                       final_legis_info_and_tbip_data.TBIP_Floor_Speeches))
# relev_bids = []
# for bid in included_bid_to_tbip:
#     if np.isnan(included_bid_to_tbip[bid]):
#         continue
#     relev_bids.append(bid)
# print(len(relev_bids))

In [31]:
#final_legis_info_and_tbip_data = pd.read_csv('legislator_info_and_tbip_congresses_115_and_116.csv')
print(final_legis_info_and_tbip_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 313 entries, 0 to 312
Data columns (total 9 columns):
 #   Column                           Non-Null Count  Dtype  
---  ------                           --------------  -----  
 0   Bioguide_ID                      313 non-null    object 
 1   Name                             313 non-null    object 
 2   Gender                           313 non-null    object 
 3   Party                            313 non-null    object 
 4   Born                             313 non-null    int64  
 5   Number_of_House_Terms            313 non-null    int64  
 6   District                         313 non-null    object 
 7   TBIP_Floor_Speeches              313 non-null    float64
 8   Standardized_Speech_Ideal_Point  313 non-null    float64
dtypes: float64(2), int64(2), object(5)
memory usage: 22.1+ KB
None


In [32]:
speaker_ids_from_data = list(speeches_data['Speaker_Bioguide_ID'])
texts_from_data = list(speeches_data['Text'])
dates_from_data = list(speeches_data['Date'])

In [33]:
relev_bids = list(bid_to_tbip.keys())
print(len(relev_bids))

313


In [34]:
retain_inds = []
all_relev_ids, all_relev_texts, all_relev_dates = [], [], []
for i, bid in enumerate(speaker_ids_from_data):
    if bid in relev_bids:
        retain_inds.append(i)
        all_relev_ids.append(bid)
        all_relev_texts.append(texts_from_data[i])
        all_relev_dates.append(dates_from_data[i])
print(len(retain_inds))

24691


In [35]:
document_mean = document_mean[retain_inds]
document_mean = np.array(torch.softmax(torch.from_numpy(document_mean), 1))
print(document_mean.shape)

(24691, 50)


In [36]:
texts_topics = pd.DataFrame()
texts_topics["Bioguide_ID"] = all_relev_ids
texts_topics["Text"] = all_relev_texts
texts_topics["Date"] = all_relev_dates

In [37]:
for topic_ind in range(document_mean.shape[1]):
    texts_topics['Topic' + str(topic_ind)] = list(map(lambda x:np.float32(str(x)), list(document_mean[:, topic_ind])))
    

In [38]:
texts_topics.to_csv('../../speeches_results/texts_topic_proportions.csv', 
                    index=False)

In [39]:
relev_bid_to_inds = {}
for bid in relev_bids:
    relev_bid_to_inds[bid] = []
for i, idd in enumerate(all_relev_ids):
        relev_bid_to_inds[idd].append(i)

In [40]:
def get_mean_topic_props_author(X, bid, author_to_inds):
    return np.mean(X[author_to_inds[bid]], 0).reshape((1, num_topics))

In [41]:
relev_bid_avg_topic_props = []
for bid in relev_bids:
    relev_bid_avg_topic_props.append(get_mean_topic_props_author(document_mean, 
                                                                 bid,
                                                                 relev_bid_to_inds))
relev_bid_avg_topic_props = np.concatenate(relev_bid_avg_topic_props, 0)
print(relev_bid_avg_topic_props.shape)

(313, 50)


In [42]:
author_topic_props = pd.DataFrame()
author_topic_props["Bioguide_ID"] = relev_bids

In [43]:
for topic_ind in range(num_topics):
    author_topic_props['Topic' + str(topic_ind)] = list(relev_bid_avg_topic_props[:, topic_ind])
    

In [44]:
author_topic_props.to_csv('../../speeches_results/topic_proportions_per_author.csv',
                          index=False)