In [50]:
import torch
import torch.nn as nn
import torch.nn.functional as F
torch.manual_seed(1)

<torch._C.Generator at 0x1151aaf70>

In [51]:
import pandas as pd 

bill_df = pd.read_csv("115th_clean.csv")

In [52]:
bill_df.columns

Index(['bill_id', 'bill_slug', 'bill_type', 'number', 'bill_uri', 'title',
       'short_title', 'sponsor_title', 'sponsor_id', 'sponsor_name',
       'sponsor_state', 'sponsor_party', 'sponsor_uri', 'gpo_pdf_uri',
       'congressdotgov_url', 'govtrack_url', 'introduced_date', 'active',
       'last_vote', 'house_passage', 'senate_passage', 'enacted', 'vetoed',
       'cosponsors', 'cosponsors_by_party', 'committees', 'committee_codes',
       'subcommittee_codes', 'primary_subject', 'summary', 'summary_short',
       'latest_major_action_date', 'latest_major_action', 'raw_text',
       'new_index', 'cleaned_text', 'date'],
      dtype='object')

In [53]:

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
import numpy as np
from torchtext.vocab import GloVe
glove = GloVe(name='6B')

In [54]:
def get_top_words_bills(bill):
    
    custom_stop_words = list(ENGLISH_STOP_WORDS.union({'additional', 'stopwords'}))
    vectorizer = TfidfVectorizer(stop_words=custom_stop_words)
    
    
    tfidf_matrix = vectorizer.fit_transform([bill])

    feature_array = vectorizer.get_feature_names_out()
    tfidf_sorting = np.argsort(tfidf_matrix.toarray().flatten())[::-1]

    n = 10

    top_n = feature_array[tfidf_sorting][:n].tolist()
    return top_n

In [55]:
get_top_words_bills(bill_df.loc[0]['cleaned_text'])

['section',
 'refugee',
 'act',
 'shall',
 'alien',
 'states',
 'resettlement',
 'united',
 'granted',
 'status']

In [56]:
bill_df['keywords'] = bill_df['cleaned_text'].apply(get_top_words_bills)

In [57]:
bill_df['introduced_date'] = pd.to_datetime(bill_df['introduced_date'])

In [62]:
start_date = '2017-01-01'
end_date = '2017-10-31'
bill_df = bill_df.loc[(bill_df['introduced_date'] >= start_date) & (bill_df['introduced_date'] <= end_date), :]

In [63]:
print(bill_df['keywords'])

3884    [violence, domestic, sexual, assault, stalking...
3885    [columbia, district, national, guard, assistan...
3886    [president, law, mr, states, spouse, constitut...
3887    [institution, funds, section, subsection, educ...
3888    [inserting, order, united, title, partner, sta...
                              ...                        
8871    [security, transportation, systems, transit, n...
8872    [isil, resolution, states, 2014, congress, uni...
8873    [johnson, judge, decision, alabama, montgomery...
8874    [house, senate, leader, representatives, congr...
8875    [representatives, house, clerk, elected, state...
Name: keywords, Length: 4992, dtype: object


In [64]:
topic_df = pd.read_json("final_topics_dict.json")

In [65]:
print(topic_df.head())

                                              201701  \
0  [obamacare, health, insurance, care, repeal, m...   
1  [0.008568612132015001, 0.008531346294839, 0.00...   
2                                           0.363346   

                                              201702  \
0  [obamacare, health, insurance, care, repeal, b...   
1  [0.008737137061722001, 0.008435196697133001, 0...   
2                                           0.363346   

                                              201703  \
0  [obamacare, health, insurance, care, bill, rep...   
1  [0.008436168128663001, 0.008163779397270001, 0...   
2                                           0.363346   

                                              201704  \
0  [de, que, la, en, el, una, los, un, las, del, ...   
1  [0.08658085063946801, 0.063443773607105, 0.060...   
2                                           0.363346   

                                              201705  \
0  [de, la, que, en, el, los, una, un, del, 

In [66]:
dates = []
topics_list = []
weights_list = []
summed_values = []

for date, data in topic_df.items():

    formatted_date = f"{str(date)[:4]}-{str(date)[4:]}"
    
    dates.append(formatted_date)

    topics_list.append(data[0])
    weights_list.append(data[1])

topic_df = pd.DataFrame({
    "date": dates,
    "topics": topics_list,
    "weights": weights_list,
})

print(topic_df)


      date                                             topics  \
0  2017-01  [obamacare, health, insurance, care, repeal, m...   
1  2017-02  [obamacare, health, insurance, care, repeal, b...   
2  2017-03  [obamacare, health, insurance, care, bill, rep...   
3  2017-04  [de, que, la, en, el, una, los, un, las, del, ...   
4  2017-05  [de, la, que, en, el, los, una, un, del, se, r...   
5  2017-06  [de, que, la, en, el, los, una, un, del, se, n...   
6  2017-07  [de, que, la, en, el, una, los, un, se, del, a...   
7  2017-08  [de, que, la, en, el, una, los, un, se, del, k...   
8  2017-09  [nationals, baseball, game, innings, yankees, ...   
9  2017-10  [tax, bill, taxes, cuts, republicans, plan, in...   

                                             weights  
0  [0.008568612132015001, 0.008531346294839, 0.00...  
1  [0.008737137061722001, 0.008435196697133001, 0...  
2  [0.008436168128663001, 0.008163779397270001, 0...  
3  [0.08658085063946801, 0.063443773607105, 0.060...  
4  [0.083

In [67]:
inspection_df = pd.read_excel('inspection.xlsx')
print(inspection_df.columns)

average_sentiments = []

for i in range(0, len(inspection_df), 50):
    subset = inspection_df.loc[i:i+49]
    average_sentiment = subset['mean_positive'].mean()
    average_sentiments.append(average_sentiment)

topic_df['average_sentiment'] = average_sentiments

Index(['Unnamed: 0', 'topics', 'topic_words', 'mean_positive', 'count',
       'words', 'weights', 'Window'],
      dtype='object')


In [69]:
def calculate_score(bill_keywords, topics_for_date, weights_for_date):

    matching_keywords = [keyword for keyword in bill_keywords if keyword in topics_for_date]
    score = sum(weights_for_date[topics_for_date.index(keyword)] for keyword in matching_keywords) / sum(weights_for_date) if sum(weights_for_date) > 0 else 0
    return score

In [86]:
def map_keywords_to_topics(bill_df, topics_df):
    
    bill_df['year_month'] = bill_df['introduced_date'].dt.to_period('M')
    
    mapped_scores = {}

    for _, row in bill_df.iterrows():
        year_month = str(row['year_month'])
        bill_keywords = row['keywords']
        
        if year_month in topics_df['date'].values:

            if year_month not in mapped_scores:
                mapped_scores[year_month] = []

            topics_row = topics_df[topics_df['date'] == year_month]
            topics_for_date = topics_row['topics'].iloc[0]
            weights_for_date = topics_row['weights'].iloc[0]
            sentiment_for_date = topics_row['average_sentiment'].iloc[0]

            score = calculate_score(bill_keywords, topics_for_date, weights_for_date)
            bill_df.loc[bill_df['date'] == row['date'], 'bow_similarity'] = score
            bill_df.loc[bill_df['date'] == row['date'], 'avg_media_sentiment'] = sentiment_for_date

            mapped_scores[year_month].append(score)

    return bill_df

In [87]:
map_keywords_to_topics(bill_df, topic_df)

Unnamed: 0,bill_id,bill_slug,bill_type,number,bill_uri,title,short_title,sponsor_title,sponsor_id,sponsor_name,...,latest_major_action_date,latest_major_action,raw_text,new_index,cleaned_text,date,keywords,year_month,bow_similarity,avg_media_sentiment
3884,hr4198-115,hr4198,hr,H.R.4198,https://api.propublica.org/congress/v1/115/bil...,To promote the economic security and safety of...,Security and Financial Empowerment Act of 2017,Rep.,R000486,Lucille Roybal-Allard,...,2017-10-31,Referred to the Committee on Education and the...,[Congressional Bills 115th Congress]\n[From th...,3884,115th congress 1st session h. r. 4198 to promo...,2017-10-31,"[violence, domestic, sexual, assault, stalking...",2017-10,0.002746,0.338246
3885,hr4194-115,hr4194,hr,H.R.4194,https://api.propublica.org/congress/v1/115/bil...,To direct the Mayor of the District of Columbi...,To direct the Mayor of the District of Columbi...,Del.,N000147,Eleanor Holmes Norton,...,2017-10-31,Referred to the House Committee on Oversight a...,[Congressional Bills 115th Congress]\n[From th...,3885,115th congress 1st session h. r. 4194 to direc...,2017-10-31,"[columbia, district, national, guard, assistan...",2017-10,0.002746,0.338246
3886,hjres120-115,hjres120,hjres,H.J.RES.120,https://api.propublica.org/congress/v1/115/bil...,Proposing an amendment to the Constitution of ...,Proposing an amendment to the Constitution of ...,Rep.,C001068,Steve Cohen,...,2017-11-02,Sponsor introductory remarks on measure. (CR H...,[Congressional Bills 115th Congress]\n[From th...,3886,115th congress 1st session h. j. res. 120 prop...,2017-10-31,"[president, law, mr, states, spouse, constitut...",2017-10,0.002746,0.338246
3887,hr4181-115,hr4181,hr,H.R.4181,https://api.propublica.org/congress/v1/115/bil...,To amend the Higher Education Act of 1965 rega...,POST Act of 2017,Rep.,C001068,Steve Cohen,...,2017-10-31,Referred to the House Committee on Education a...,[Congressional Bills 115th Congress]\n[From th...,3887,115th congress 1st session h. r. 4181 to amend...,2017-10-31,"[institution, funds, section, subsection, educ...",2017-10,0.002746,0.338246
3888,hr4186-115,hr4186,hr,H.R.4186,https://api.propublica.org/congress/v1/115/bil...,"To amend title 18, United States Code, to prot...",Lori Jackson Domestic Violence Survivor Protec...,Rep.,H001047,Jim Himes,...,2017-11-17,"Referred to the Subcommittee on Crime, Terrori...",[Congressional Bills 115th Congress]\n[From th...,3888,115th congress 1st session h. r. 4186 to amend...,2017-10-31,"[inserting, order, united, title, partner, sta...",2017-10,0.002746,0.338246
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8871,hres13-115,hres13,hres,H.RES.13,https://api.propublica.org/congress/v1/115/bil...,Expressing the sense of the House of Represent...,Expressing the sense of the House of Represent...,Rep.,J000032,Sheila Jackson Lee,...,2017-01-03,Referred to the House Committee on Homeland Se...,[Congressional Bills 115th Congress]\n[From th...,8871,115th congress 1st session h. res. 13 expressi...,2017-01-03,"[security, transportation, systems, transit, n...",2017-01,0.000865,0.376167
8872,hconres2-115,hconres2,hconres,H.CON.RES.2,https://api.propublica.org/congress/v1/115/bil...,To authorize the use of United States Armed Fo...,Authorization for Use of Military Force Agains...,Rep.,C001053,Tom Cole,...,2017-01-03,Referred to the House Committee on Foreign Aff...,[Congressional Bills 115th Congress]\n[From th...,8872,115th congress 1st session h. con. res. 2 to a...,2017-01-03,"[isil, resolution, states, 2014, congress, uni...",2017-01,0.000865,0.376167
8873,hconres3-115,hconres3,hconres,H.CON.RES.3,https://api.propublica.org/congress/v1/115/bil...,Recognizing former United States Federal Judge...,Recognizing former United States Federal Judge...,Rep.,G000553,Al Green,...,2017-01-11,Referred to the Subcommittee on the Constituti...,[Congressional Bills 115th Congress]\n[From th...,8873,115th congress 1st session h. con. res. 3 reco...,2017-01-03,"[johnson, judge, decision, alabama, montgomery...",2017-01,0.000865,0.376167
8874,hconres1-115,hconres1,hconres,H.CON.RES.1,https://api.propublica.org/congress/v1/115/bil...,Regarding consent to assemble outside the seat...,Regarding consent to assemble outside the seat...,Rep.,S000250,Pete Sessions,...,2017-01-04,Received in the Senate.,[Congressional Bills 115th Congress]\n[From th...,8874,115th congress 1st session h. con. res. 1 in t...,2017-01-03,"[house, senate, leader, representatives, congr...",2017-01,0.000865,0.376167


In [89]:
unique_similarities = set()

for similarity in bill_df["bow_similarity"]:
    if similarity > 0:
        unique_similarities.add(similarity)

print("Unique Similarities")
for unique_similarity in unique_similarities:
    print(unique_similarity)

Unique Similarities
0.005470064958203791
0.001557443365920231
0.003454156055004321
0.002955230222877248
0.001443802693447573
0.0025918117813801297
0.0014466578787375496
0.005950485553869017
0.0015356569417557362
0.0019155981000607111
0.0027044251411541096
0.0023762756152545654
0.00287391667307137
0.003313266662626786
0.002121402222404552
0.0039096232830005025
0.00498979133394564
0.0011555489158008659
0.002288083845512859
0.0036076837954892985
0.0017926757241706788
0.002948974905081326
0.006210950295886527
0.0033447359534324846
0.003165257308103184
0.0023325697143073403
0.002876725015122315
0.0014689800937406924
0.007599620685887322
0.0020349672532272226
0.002329539254954336
0.009080886244284003
0.0017146119759564146
0.002488486664207048
0.002400620910398682
0.0005059369524212546
0.00278621184349189
0.001647059758141354
0.0014441125621101542
0.0038858437085243724
0.003004564018227542
0.0008647175449415223
0.0023707857502231755
0.001025635602304929
0.005814922992645278
0.0011584411192783

In [90]:
print("Number of Bills with some Similarity:", len(bill_df["bow_similarity"] > 0))

Number of Bills with some Similarity: 4992


In [91]:
print("Minimum Similarity:", min(bill_df['bow_similarity']))
print("Maximum Similarity:", max(bill_df['bow_similarity']))

Minimum Similarity: 0.0
Maximum Similarity: 0.013790687912233549


In [83]:
bill_df.to_csv('bills_with_bow_similarity.csv', index=False)