In [1]:
import numpy as np
import os
import scipy.sparse as sparse
from scipy.stats import bernoulli, poisson
import analysis_utils_mine as utils

import plotly.graph_objects as go
import chart_studio
import json
import pandas as pd
import ast
from datetime import datetime
import torch
import pandas as pd
from datetime import datetime, timedelta

### original TBIP on senate speeches

In [2]:
# Locate the TBIP checkout and the synthetic-corpus folder inside it.
# NOTE(review): this is an absolute, machine-specific path; adjust it when
# running the notebook elsewhere.
project_dir = os.path.abspath('/workspace/pranav/tbip/') 
source_dir = os.path.join(project_dir, "data/synthetic")

# Load TBIP data.
data_dir = os.path.join(source_dir, "clean")
(counts, vocabulary, author_indices, 
 author_map) = utils.load_text_data(data_dir)

# Load TBIP parameters.
# These come from the "tbip-pytorch-fits-og" fit (the original, single ideal
# point per author model; the shapes printed in the next cell show (6,) for
# the ideal points).
param_dir = os.path.join(source_dir, "tbip-pytorch-fits-og/params/")
(document_loc, document_scale, objective_topic_loc, objective_topic_scale, 
 ideological_topic_loc, ideological_topic_scale, ideal_point_loc, 
 ideal_point_scale) = utils.load_tbip_parameters(param_dir)

# Compute means from variational parameters
# exp(loc + scale**2 / 2) is the mean of a log-normal with those parameters
# (presumably the variational family used for these two quantities - confirm
# against the fitting code). The ideological topics and the ideal points use
# their location parameter directly.
document_mean = np.exp(document_loc + document_scale ** 2 / 2)
objective_topic_mean = np.exp(objective_topic_loc + 
                              objective_topic_scale ** 2 / 2)
ideological_topic_mean = ideological_topic_loc
ideal_point_mean = ideal_point_loc

In [3]:
# Sanity-check the shapes of the quantities computed in the previous cell.
print(document_mean.shape)
print(objective_topic_mean.shape)
print(ideological_topic_mean.shape)
print(ideal_point_mean.shape)

# Flip the sign of the ideal-point axis. The ideal points and the ideological
# topic locations are negated together, so every product
# ideal_point * ideological_topic_loc is unchanged and only the orientation of
# the axis changes.
#
# Both flipped values are derived from arrays that this cell never rebinds
# (`ideal_point_loc` as loaded, and `ideological_topic_mean`, which the
# previous cell bound to the loaded `ideological_topic_loc`). The earlier
# self-referential form (`x = -1 * x`) silently flipped the sign back every
# time the cell was re-run; this form is idempotent and gives the same values
# on the first run.
ideal_point_mean = -1 * ideal_point_loc
ideological_topic_loc = -1 * ideological_topic_mean

(45, 3)
(3, 60)
(3, 60)
(6,)


In [4]:
# Print the topic word lists for the original fit, using the sign-flipped
# `ideological_topic_loc` from the previous cell. The recorded output lists
# words per (ideal point, topic) pair; the final argument, 20, is presumably
# the number of words shown per list - confirm in analysis_utils_mine.
utils.print_topics(objective_topic_loc, 
                   objective_topic_scale, 
                   ideological_topic_loc, 
                   ideological_topic_scale, 
                   vocabulary, 
                   20)

['Ideal Point = -1.0, Topic = 0: solar, crisis, renewable, carbon_tax, energy_efficiency, wind, pollution, clean_energy, global_warming, climate_change, energy, jobs, mine_workers, oil, economy, coal, production, hoax, china, govt_overreach'
 'Ideal Point = -0.5, Topic = 0: pollution, global_warming, climate_change, clean_energy, carbon_tax, solar, crisis, renewable, energy_efficiency, wind, energy, jobs, oil, mine_workers, economy, coal, china, production, hoax, govt_overreach'
 'Ideal Point = 0.0, Topic = 0: energy, pollution, global_warming, jobs, climate_change, oil, mine_workers, clean_energy, economy, carbon_tax, coal, crisis, china, production, solar, renewable, energy_efficiency, wind, hoax, govt_overreach'
 'Ideal Point = 0.5, Topic = 0: energy, jobs, oil, mine_workers, economy, coal, production, china, pollution, govt_overreach, hoax, global_warming, climate_change, clean_energy, carbon_tax, renewable, energy_efficiency, crisis, solar, wind'
 'Ideal Point = 1.0, Topic = 0: ec

In [5]:
# One marker colour per author: names containing '(R)' are drawn in crimson,
# every other author in steelblue.
colors = np.array(
    ["steelblue" if '(R)' not in name else "crimson" for name in author_map]
)
def get_ideological_topics(objective_topic_loc, 
                           objective_topic_scale,
                           ideological_topic_loc, 
                           ideological_topic_scale,
                           ideal_point):
    """Return the expected word intensities of every topic at one ideal point.

    Computes, element-wise,
    ``exp(objective_loc + x * ideological_loc
          + (objective_scale**2 + x**2 * ideological_scale**2) / 2)``
    where ``x`` is ``ideal_point``.

    Args:
        objective_topic_loc: Location parameters of the objective topics.
        objective_topic_scale: Scale parameters of the objective topics.
        ideological_topic_loc: Location parameters of the ideological topics.
        ideological_topic_scale: Scale parameters of the ideological topics.
        ideal_point: Ideal point at which the topics are evaluated.

    Returns:
        The element-wise result, with the broadcast shape of the inputs.
    """
    log_location = objective_topic_loc + ideal_point * ideological_topic_loc
    variance = (objective_topic_scale ** 2 +
                ideal_point ** 2 * ideological_topic_scale ** 2)
    return np.exp(log_location + variance / 2)

In [6]:
# Convert the author names and the (sign-flipped) ideal points to NumPy arrays
# for plotting, then print both shapes, one per line.
author_map = np.array(author_map)
ideal_points = np.array(ideal_point_mean)
print(author_map.shape, ideal_points.shape, sep="\n")

(6,)
(6,)


In [7]:
def get_topic_name(topic_number):
    """Return the human-readable label for a topic index.

    Args:
        topic_number: Topic index. Indices 0, 1 and 2 have hand-assigned
            names.

    Returns:
        "Energy", "Immigration" or "Abortion" for indices 0, 1 and 2. Any
        other index yields the generic label ``"Topic <topic_number>"``;
        previously that case raised UnboundLocalError because ``topic_name``
        was never assigned.
    """
    topic_names = {0: "Energy", 1: "Immigration", 2: "Abortion"}
    return topic_names.get(topic_number, "Topic " + str(topic_number))

In [14]:
def show_interactive(topic_number):
    """Show an interactive Plotly figure of word usage versus ideal point.

    The figure contains a horizontal black line from -1 to 1, one marker per
    author placed at that author's ideal point, and one horizontal bar chart
    of the top words for each slider position (ideal points -1.00 to 1.00 in
    steps of 0.05). Moving the slider swaps which bar chart is visible.

    Relies on notebook-level names defined in earlier cells:
    ``ideal_points``, ``author_map``, ``colors``, ``vocabulary``,
    ``objective_topic_loc``, ``objective_topic_scale``,
    ``ideological_topic_loc``, ``ideological_topic_scale``,
    ``get_topic_name`` and ``get_ideological_topics``.

    Args:
        topic_number: Index of the topic. It selects the row of the per-topic
            word intensities and the name shown in the title.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    topic_name = get_topic_name(topic_number)

    fig = go.Figure(layout=go.Layout(
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)'))
    x_min = -1.0
    x_max = 1.0
    diff = 0.05
    offset = -x_min / diff
    scatterplot_location = -0.53
    num_top_words = 8
    # The axis line and the author markers are added first and stay visible
    # for every slider position; the per-step bar charts follow them.
    num_fixed_traces = 2

    # Ideal point represented by each slider step.
    num_steps = int((x_max - x_min) / diff) + 1
    slider_ideal_points = [(i - offset) * diff for i in range(num_steps)]

    # Add black line.
    fig.add_trace(go.Scatter(x=[x_min, x_max],
                             y=[scatterplot_location, scatterplot_location],
                             line=dict(color="black", width=1),
                             marker=dict(size=1),
                             hoverinfo='skip'))

    # Add trace for all representatives
    fig.add_trace(go.Scatter(
        mode="markers",
        x=ideal_points,
        y=scatterplot_location * np.ones(len(ideal_points)),
        text=[x for x in author_map],
        hoverinfo="x+text",
        marker=dict(color=colors, size=8)))

    # Add bar plots, one for each ideal point
    for ideal_point in slider_ideal_points:
        ideological_topic_mean = get_ideological_topics(objective_topic_loc,
                                                        objective_topic_scale,
                                                        ideological_topic_loc,
                                                        ideological_topic_scale,
                                                        ideal_point)
        # The /50 and *5 factors only scale the bar lengths for display.
        topic_intensities = ideological_topic_mean[topic_number] / 50.0
        # Ascending order, so the strongest word ends up as the last bar;
        # the word labels are reversed to match.
        top_topic_intensities = np.sort(topic_intensities)[-num_top_words:]
        top_topic_words = vocabulary[np.argsort(-topic_intensities)[:num_top_words]][::-1]
        fig.add_trace(
            go.Bar(
                visible=False,
                x=top_topic_intensities * 5,
                orientation='h',
                text=top_topic_words,
                textposition='outside',
                marker_color='rgb(175,122,197)',
                y0=0.529,
                dy=0.214,
                base=-0.3,
                hoverinfo='skip',
            ))

    # Hide all axis decoration and disable zooming on both axes.
    hidden_axis = {'showgrid': False,
                   'showline': False,
                   'zeroline': False,
                   'showticklabels': False,
                   'fixedrange': True}
    fig.update_layout(yaxis=dict(hidden_axis), xaxis=dict(hidden_axis))
    fig.update_yaxes(range=[-0.6, 2.5])
    fig.update_xaxes(range=[x_min - 0.02, x_max + 0.02])

    # Set default trace: the middle slider step. The same step index is used
    # for the slider's `active` position below. Previously the visible bar
    # chart was one step to the left of the value the slider displayed
    # (bars for -0.05 under a slider reading 0.00).
    default_step = num_steps // 2
    fig.data[num_fixed_traces + default_step].visible = True

    # Create and add slider
    steps = []
    for step_index, ideal_point in enumerate(slider_ideal_points):
        visible = [True] * num_fixed_traces + [False] * num_steps
        visible[num_fixed_traces + step_index] = True  # Show only this step's bars
        steps.append(dict(
            method="update",
            args=[{"visible": visible}],
            label="{:.2f}".format(ideal_point),
        ))

    sliders = [dict(
        active=default_step,
        currentvalue={"prefix": "Ideal Point: "},
        y=0.35,
        steps=steps)]

    fig.update_layout(sliders=sliders, showlegend=False,
                      title={'text': "Word Usage as a Function of Ideal Point (Topic: {})".format(topic_name),
                             'x': 0.5,
                             'y': 0.9},
                      xaxis_title='Representative ideal points (hover to see names)',
                      annotations=[
                          dict(xref='paper',
                               yref='paper',
                               x=0.5, y=0.255,
                               font={'size': 14},
                               showarrow=False,
                               text='Move slider to change ideal point')
                      ])
    fig.show()

In [15]:
# Render the interactive figure for topic 0 (named "Energy" by get_topic_name).
show_interactive(0)

In [16]:
# Render the interactive figure for topic 1 (named "Immigration" by get_topic_name).
show_interactive(1)

In [17]:
# Render the interactive figure for topic 2 (named "Abortion" by get_topic_name).
show_interactive(2)

In [None]:
# Notes on the six synthetic speakers: each one's stance per issue and the
# number of speeches per issue (presumably the design used to generate the
# synthetic corpus - confirm against the data generator).
speaker1 = 'Mark (R)' # Conservative - extremely conservative on abortion, moderate on the other two; 4 speeches on abortion, 3 on energy, 3 on immigration
speaker2 = 'Lauren (R)' # Conservative - extremely conservative on immigration, moderate on the other two; 3 speeches on abortion, 1 on energy, 4 on immigration
speaker3 = 'John (R)' # Conservative - moderately conservative on immigration, extreme on the other two; 1 speech on immigration, 3 on energy, 1 on abortion

speaker4 = 'Mona (D)' # Liberal - extremely liberal on abortion and energy, moderate on immigration; 3 speeches on abortion, 3 on energy, 3 on immigration
speaker5 = 'Justicia (D)' # Liberal - extremely liberal on abortion and immigration, moderate on energy; 3 speeches on abortion, 1 on energy, 3 on immigration
speaker6 = 'Alex (D)' # Liberal - moderate on all issues; 2 speeches on abortion, 2 on energy, 2 on immigration

## issue-specific TBIP

In [18]:
# Point at the same synthetic corpus as the first section, this time to load
# the issue-specific fit. Every name assigned in this cell rebinds the value
# loaded for the original fit above.
# NOTE(review): this is an absolute, machine-specific path; adjust it when
# running the notebook elsewhere.
project_dir = os.path.abspath('/workspace/pranav/tbip/data') 
source_dir = os.path.join(project_dir, "synthetic/")

# Load TBIP data.
data_dir = os.path.join(source_dir, "clean")
(counts, vocabulary, author_indices, 
 author_map) = utils.load_text_data(data_dir)

# Load TBIP parameters.
# These come from the "tbip-pytorch-fits-issue-specific" fit; the shapes
# printed in the next cell show one ideal point per (author, topic) pair,
# i.e. (6, 3).
param_dir = os.path.join(source_dir, "tbip-pytorch-fits-issue-specific/params/")
(document_loc, document_scale, objective_topic_loc, objective_topic_scale, 
 ideological_topic_loc, ideological_topic_scale, ideal_point_loc, 
 ideal_point_scale) = utils.load_tbip_parameters(param_dir)

# Compute means from variational parameters
# exp(loc + scale**2 / 2) is the mean of a log-normal with those parameters
# (presumably the variational family used for these two quantities - confirm
# against the fitting code). Unlike the first section, no sign flip is
# applied to the ideal points or ideological topics after loading.
document_mean = np.exp(document_loc + document_scale ** 2 / 2)
objective_topic_mean = np.exp(objective_topic_loc + 
                              objective_topic_scale ** 2 / 2)
ideological_topic_mean = ideological_topic_loc
ideal_point_mean = ideal_point_loc

In [19]:
# Sanity-check the shapes of the issue-specific quantities, one per line.
print(
    document_mean.shape,
    objective_topic_mean.shape,
    ideological_topic_mean.shape,
    ideal_point_mean.shape,
    sep="\n",
)

(45, 3)
(3, 60)
(3, 60)
(6, 3)


In [20]:
# Print the topic word lists for the issue-specific fit. The recorded output
# lists words per (ideal point, topic) pair; the final argument, 20, is
# presumably the number of words shown per list - confirm in
# analysis_utils_mine.
utils.print_topics(objective_topic_loc, 
                   objective_topic_scale, 
                   ideological_topic_loc, 
                   ideological_topic_scale, 
                   vocabulary, 
                   20)

['Ideal Point = -1.0, Topic = 0: clean_energy, solar, renewable, energy_efficiency, wind, crisis, pollution, global_warming, carbon_tax, climate_change, energy, jobs, oil, mine_workers, economy, hoax, coal, govt_overreach, production, china'
 'Ideal Point = -0.5, Topic = 0: pollution, global_warming, clean_energy, climate_change, energy, carbon_tax, solar, renewable, jobs, energy_efficiency, wind, crisis, mine_workers, oil, economy, hoax, production, coal, govt_overreach, china'
 'Ideal Point = 0.0, Topic = 0: energy, jobs, pollution, global_warming, mine_workers, oil, economy, climate_change, clean_energy, carbon_tax, production, hoax, coal, govt_overreach, china, renewable, solar, energy_efficiency, wind, crisis'
 'Ideal Point = 0.5, Topic = 0: energy, mine_workers, jobs, economy, oil, production, hoax, coal, govt_overreach, china, pollution, global_warming, climate_change, clean_energy, carbon_tax, renewable, energy_efficiency, solar, wind, crisis'
 'Ideal Point = 1.0, Topic = 0: pr

In [21]:
def get_topic_name(topic_number):
    """Return the human-readable label for a topic index.

    Args:
        topic_number: Topic index. Indices 0, 1 and 2 have hand-assigned
            names.

    Returns:
        "Energy", "Immigration" or "Abortion" for indices 0, 1 and 2. Any
        other index yields the generic label ``"Topic <topic_number>"``,
        the same fallback text the issue-specific ``show_interactive`` builds
        itself. Previously that case raised UnboundLocalError because
        ``topic_name`` was never assigned.
    """
    known_names = ("Energy", "Immigration", "Abortion")
    for index, name in enumerate(known_names):
        if topic_number == index:
            return name
    return "Topic " + str(topic_number)

In [22]:
# Largest ideal point over all authors and topics (bare expression, shown as the cell output).
np.max(ideal_point_mean)

1.0510294

In [23]:
# Smallest ideal point over all authors and topics (bare expression, shown as the cell output).
np.min(ideal_point_mean)

-1.1048443

In [24]:
# Marker colour for each author in the freshly reloaded `author_map`:
# crimson when the name contains '(R)', steelblue otherwise.
colors = np.array(
    ["steelblue" if '(R)' not in author else "crimson" for author in author_map]
)
def get_ideological_topics(objective_topic_loc, 
                           objective_topic_scale,
                           ideological_topic_loc, 
                           ideological_topic_scale,
                           ideal_point):
    """Evaluate the expected word intensities of all topics at an ideal point.

    With ``x = ideal_point`` the result is, element-wise,
    ``exp(objective_loc + x * ideological_loc
          + (objective_scale**2 + x**2 * ideological_scale**2) / 2)``.

    Args:
        objective_topic_loc: Location parameters of the objective topics.
        objective_topic_scale: Scale parameters of the objective topics.
        ideological_topic_loc: Location parameters of the ideological topics.
        ideological_topic_scale: Scale parameters of the ideological topics.
        ideal_point: Ideal point at which to evaluate the topics.

    Returns:
        The element-wise result, with the broadcast shape of the inputs.
    """
    exponent = objective_topic_loc + ideal_point * ideological_topic_loc
    half_variance = (objective_topic_scale ** 2 +
                     ideal_point ** 2 * ideological_topic_scale ** 2) / 2
    exponent = exponent + half_variance
    return np.exp(exponent)

In [25]:

# Convert the reloaded author names to a NumPy array and print its shape.
author_map = np.array(author_map)
print(author_map.shape)
# No global `ideal_points` is built in this section: the issue-specific
# show_interactive selects the column ideal_point_mean[:, topic_number] itself.
# ideal_points = np.array(ideal_points)
# print(ideal_points.shape)

(6,)


In [26]:
def get_agg_avg_matrix(X, dic_of_inds):
    """Average groups of rows of ``X`` and stack the averages into a matrix.

    Args:
        X: 2-D array whose rows are to be averaged.
        dic_of_inds: Mapping from a group key to the list of row indices of
            ``X`` that belong to that group.

    Returns:
        Array with one row per key of ``dic_of_inds`` (in the mapping's
        iteration order) and ``X.shape[1]`` columns; each row is the
        column-wise mean of that group's rows.
    """
    num_columns = X.shape[1]
    group_means = [
        np.mean(X[dic_of_inds[key]], 0).reshape((1, num_columns))
        for key in dic_of_inds
    ]
    return np.concatenate(group_means, 0)

In [27]:
# Map every author index to the list of document indices written by that
# author; authors without documents keep an empty list.
author_to_inds = {author: [] for author in range(len(author_map))}
for doc_index, doc_author in enumerate(author_indices):
    author_to_inds[doc_author].append(doc_index)
print(author_to_inds)

{0: [39, 40, 41, 42, 43, 44], 1: [18, 19, 20, 21, 22], 2: [32, 33, 34, 35, 36, 37, 38], 3: [10, 11, 12, 13, 14, 15, 16, 17], 4: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9], 5: [23, 24, 25, 26, 27, 28, 29, 30, 31]}


In [28]:
# Per-author topic proportions: softmax over dimension 1 of `document_mean`
# for each document, then the mean of those rows over each author's
# documents (one row per author, as the printed shape shows).
# NOTE(review): `document_mean` is already exponentiated when it is computed
# from the variational parameters, so this softmax exponentiates a second
# time. Confirm that softmax, rather than dividing each row by its sum, is
# the intended normalisation.
mean_topic_props = get_agg_avg_matrix(np.array(torch.softmax(torch.from_numpy(document_mean), 1)), author_to_inds)
print(mean_topic_props.shape)

(6, 3)


In [29]:
def show_interactive(topic_number):
    """Show the interactive word-usage figure for one topic of the issue-specific fit.

    The figure contains a horizontal black line, one marker per author placed
    at that author's ideal point for this topic (hover text: the author name
    followed by the author's average proportion of the topic in brackets),
    and one horizontal bar chart of the top words for each slider position.
    The slider range is symmetric around zero and wide enough to include the
    most extreme ideal point for the topic; steps are 0.05 apart.

    Relies on notebook-level names defined in earlier cells:
    ``ideal_point_mean``, ``mean_topic_props``, ``author_map``, ``colors``,
    ``vocabulary``, ``objective_topic_loc``, ``objective_topic_scale``,
    ``ideological_topic_loc``, ``ideological_topic_scale``,
    ``get_topic_name`` and ``get_ideological_topics``.

    Args:
        topic_number: Index of the topic. It selects the column of
            ``ideal_point_mean`` and ``mean_topic_props`` and the row of the
            per-topic word intensities.

    Returns:
        None. The figure is rendered with ``fig.show()``.
    """
    ideal_points = ideal_point_mean[:, topic_number]
    avg_topic_props = mean_topic_props[:, topic_number]
    try:
        topic_name = get_topic_name(topic_number)
    except Exception:
        # Best-effort fallback for topics without a hand-assigned name.
        # `Exception` (not a bare except) so KeyboardInterrupt and SystemExit
        # are no longer swallowed.
        topic_name = 'Topic ' + str(topic_number)
    fig = go.Figure(layout=go.Layout(
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)'))

    # Symmetric x-range around zero, set by the larger absolute extreme.
    x_min = np.min(ideal_points)
    x_max = np.max(ideal_points)
    if abs(x_min) > x_max:
        x_max = -1 * x_min
    else:
        x_min = -1 * x_max
    diff = 0.05
    offset = -x_min / diff
    scatterplot_location = -0.53
    num_top_words = 8
    # The axis line and the author markers are added first and stay visible
    # for every slider position; the per-step bar charts follow them.
    num_fixed_traces = 2

    # Ideal point represented by each slider step: x_min, x_min + diff, ...
    num_steps = int((x_max - x_min) / diff) + 1
    slider_ideal_points = [(i - offset) * diff for i in range(num_steps)]

    # Add black line.
    fig.add_trace(go.Scatter(x=[x_min, x_max],
                             y=[scatterplot_location, scatterplot_location],
                             line=dict(color="black", width=1),
                             marker=dict(size=1),
                             hoverinfo='skip'))

    # Add trace for all representatives
    fig.add_trace(go.Scatter(
        mode="markers",
        x=ideal_points,
        y=scatterplot_location * np.ones(len(ideal_points)),
        text=[x + ' [' + str(round(avg_topic_props[ind], 3)) + ']' for ind, x in enumerate(author_map)],
        hoverinfo="x+text",
        marker=dict(color=colors, size=8)))

    # Add bar plots, one for each ideal point
    for ideal_point in slider_ideal_points:
        ideological_topic_mean = get_ideological_topics(objective_topic_loc,
                                                        objective_topic_scale,
                                                        ideological_topic_loc,
                                                        ideological_topic_scale,
                                                        ideal_point)
        # The /50 and *5 factors only scale the bar lengths for display.
        topic_intensities = ideological_topic_mean[topic_number] / 50.0
        # Ascending order, so the strongest word ends up as the last bar;
        # the word labels are reversed to match.
        top_topic_intensities = np.sort(topic_intensities)[-num_top_words:]
        top_topic_words = vocabulary[np.argsort(-topic_intensities)[:num_top_words]][::-1]
        fig.add_trace(
            go.Bar(
                visible=False,
                x=top_topic_intensities * 5,
                orientation='h',
                text=top_topic_words,
                textposition='outside',
                marker_color='rgb(175,122,197)',
                y0=0.529,
                dy=0.214,
                base=-0.3,
                hoverinfo='skip',
            ))

    # Hide all axis decoration and disable zooming on both axes.
    hidden_axis = {'showgrid': False,
                   'showline': False,
                   'zeroline': False,
                   'showticklabels': False,
                   'fixedrange': True}
    fig.update_layout(yaxis=dict(hidden_axis), xaxis=dict(hidden_axis))
    fig.update_yaxes(range=[-0.60, 2.5])
    fig.update_xaxes(range=[x_min - 0.02, x_max + 0.02])

    # Set default trace: the middle slider step. The same step index is used
    # for the slider's `active` position below. Previously the visible bar
    # chart was one step to the left of the value the slider displayed.
    default_step = num_steps // 2
    fig.data[num_fixed_traces + default_step].visible = True

    # Create and add slider
    steps = []
    for step_index, ideal_point in enumerate(slider_ideal_points):
        visible = [True] * num_fixed_traces + [False] * num_steps
        visible[num_fixed_traces + step_index] = True  # Show only this step's bars
        steps.append(dict(
            method="update",
            args=[{"visible": visible}],
            label="{:.2f}".format(ideal_point),
        ))

    sliders = [dict(
        active=default_step,
        currentvalue={"prefix": "Ideal Point: "},
        y=0.35,
        steps=steps)]

    fig.update_layout(sliders=sliders, showlegend=False,
                      title={'text': "Word Usage as a Function of Ideal Point (Topic: {})".format(topic_name),
                             'x': 0.5,
                             'y': 0.9},
                      xaxis_title='Representative ideal points (hover to see names)',
                      annotations=[
                          dict(xref='paper',
                               yref='paper',
                               x=0.5, y=0.255,
                               font={'size': 14},
                               showarrow=False,
                               text='Move slider to change ideal point')
                      ])
    fig.show()

In [30]:
# Issue-specific fit: render the figure for topic 0 (named "Energy" by get_topic_name).
show_interactive(0)

In [31]:
# Issue-specific fit: render the figure for topic 1 (named "Immigration" by get_topic_name).
show_interactive(1)

In [32]:
# Issue-specific fit: render the figure for topic 2 (named "Abortion" by get_topic_name).
show_interactive(2)

In [None]:
# Notes on the six synthetic speakers: each one's stance per issue and the
# number of speeches per issue (presumably the design used to generate the
# synthetic corpus - confirm against the data generator).
speaker1 = 'Mark (R)' 
# Conservative - extremely conservative on abortion, moderate on the other two;
# 4 speeches on abortion, 3 on energy, 3 on immigration

speaker2 = 'Lauren (R)' 
# Conservative - extremely conservative on immigration, moderate on the other two;
# 3 speeches on abortion, 1 on energy, 4 on immigration

speaker3 = 'John (R)' 
# Conservative - moderately conservative on immigration, extreme on the other two;
# 1 speech on immigration, 3 on energy, 1 on abortion


speaker4 = 'Mona (D)' 
# Liberal - extremely liberal on abortion and energy, moderate on immigration;
# 3 speeches on abortion, 3 on energy, 3 on immigration

speaker5 = 'Justicia (D)' 
# Liberal - extremely liberal on abortion and immigration, moderate on energy;
# 3 speeches on abortion, 1 on energy, 3 on immigration

speaker6 = 'Alex (D)' 
# Liberal - moderate on all issues; 2 speeches on abortion, 2 on energy, 2 on immigration