Import Packages

In [None]:
# Import cell: pulls in the topic-modelling, clustering and tensor libraries
# used by the cells below. The two prints bracket the imports so the time
# spent loading the heavy packages is visible in the output.
print('starting...')
# Topic model class; loaded from disk and (optionally) constructed further down.
from bertopic import BERTopic

import pandas as pd
import scipy
# Clustering and dimensionality-reduction models passed to BERTopic below.
from hdbscan import HDBSCAN
from umap import UMAP
# NOTE(review): unicodedata, sys and SentenceTransformer are not referenced
# in the cells visible here - confirm before removing.
import unicodedata
import numpy as np
import sys

from sentence_transformers import SentenceTransformer
# lzma is used to open the xz-compressed pickle of bills.
import lzma
import torch
# True when torch detects a usable CUDA device; the next cell only prints it.
use_cuda = torch.cuda.is_available()

print('started')

In [None]:

# Show whether torch detected a CUDA device in the import cell.
print(use_cuda)

# Clustering hyperparameters: the UMAP neighbourhood size followed by the
# two HDBSCAN thresholds used when a topic model is constructed.
n_neighbors, min_cluster_size, min_samples = 10, 100, 100

# Dimensionality-reduction model configured with the neighbourhood size above.
umap_model = UMAP(n_neighbors=n_neighbors)

# Location of the xz-compressed pickle that is loaded into `df`.
file_path = "bills.pkl.xz"

# Decompress on the fly and let pandas unpickle straight from the stream.
# NOTE: unpickling can execute arbitrary code - only load trusted files.
with lzma.open(file_path, "rb") as compressed_file:
    df = pd.read_pickle(compressed_file)

print('done reading')

In [None]:

# Optional down-sampling for quick experiments (left disabled).
# df = df.sample(20000, random_state = 42)
print(len(df), flush=True)

# English stop-word list from NLTK.
from nltk.corpus import stopwords
stop = stopwords.words('english')

# Bill descriptions as an array of strings; this is the text given to the topic model.
docs = df['Description'].astype(str).values

# Load the previously saved BERTopic model from disk.
topic_model = BERTopic.load("legal_model")

# Run the loaded model over every description and keep the first element of
# the returned result as the per-bill topic column.
a = topic_model.transform(docs)
df['topic'] = a[0]

# Persist the topic overview table, then the bills with their topic column.
topic_model.get_topic_info().to_csv("legiscan_info.csv")
df.to_pickle("bills3.pkl.xz")

In [None]:

# Code that creates the clusters from scratch. The model has already been
# created and saved, so it normally does not need to be rebuilt. Per the
# original author's note, fitting is also non-deterministic, so a rebuild is
# unlikely to reproduce exactly the same groups.

# Export the topic overview of the model currently bound to `topic_model`.
topic_model.get_topic_info().to_csv("legiscan.csv")

# Rebuilding is opt-in. Unconditionally rebinding `topic_model` to a freshly
# constructed (unfitted) BERTopic would leave the plotting cell below without
# a fitted model, because the fit_transform call there is commented out.
RETRAIN_TOPIC_MODEL = False

# The component models are cheap to construct, so they are always defined.
umap_model = UMAP(n_neighbors = n_neighbors)
hdbscan_model = HDBSCAN(min_cluster_size = min_cluster_size, min_samples = min_samples, prediction_data=True)

if RETRAIN_TOPIC_MODEL:
    topic_model = BERTopic(umap_model = umap_model, hdbscan_model = hdbscan_model,
                          top_n_words = 3,
                          embedding_model = 'all-mpnet-base-v2',
                          calculate_probabilities = False, verbose = False)
    # Fit right away so every later cell sees a fitted model.
    topics, probs = topic_model.fit_transform(docs)



In [None]:

# Author's note: with 300+ groups most of these visualizations are hard to
# read. The topic map is the interesting one because it is interactive, and
# the bar chart is also useful because it only shows a subset of topics.
import plotly.io as pio
# topics, probs = topic_model.fit_transform(docs)
print('start plotting', flush=True)

# Topic map, written out as a PDF.
plot = topic_model.visualize_topics()
pio.write_image(fig=plot, file="topics.pdf")
print('next')

# Bar chart of topic words.
plot3 = topic_model.visualize_barchart()
pio.write_image(fig=plot3, file="bar.png")

# Topic heatmap.
plot4 = topic_model.visualize_heatmap()
pio.write_image(fig=plot4, file="heat.png")

# Document-level view built from the description texts.
plot2 = topic_model.visualize_documents(docs)
pio.write_image(fig=plot2, file="docs.png")

In [None]:
# Sentiment analysis setup.
#
# Much of this code is taken from the example at
# https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment

from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer
import numpy as np
from scipy.special import softmax
import csv
import urllib.request

# Task name selects both the pretrained checkpoint and its label mapping file.
task='sentiment'
MODEL = f"cardiffnlp/twitter-roberta-base-{task}"

tokenizer = AutoTokenizer.from_pretrained(MODEL)

# Download the label mapping: a tab-separated text file from which the
# second field of every non-empty row is kept as the label name.
labels=[]
mapping_link = f"https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/{task}/mapping.txt"
with urllib.request.urlopen(mapping_link) as f:
    html = f.read().decode('utf-8').split("\n")
    csvreader = csv.reader(html, delimiter='\t')
labels = [row[1] for row in csvreader if len(row) > 1]

# PyTorch version of the classifier.
model = AutoModelForSequenceClassification.from_pretrained(MODEL)

# Accumulators; `sent` is filled by the scoring loop in the next cell.
topic = []
sent = []

# The 'sentiment' column is only created by the scoring cell further down,
# so on a fresh run it may not exist yet; preview it only when present
# instead of failing with a KeyError.
if 'sentiment' in df.columns:
    print(df['sentiment'].head())
print(df['topic'].head())
print('done')


In [None]:



# The full dataset is too large to run sentiment analysis on, so score a
# reproducible random sample. The sample size is capped at the frame length
# because DataFrame.sample raises when asked for more rows than exist.
SENTIMENT_SAMPLE_SIZE = 20000
df = df.sample(min(SENTIMENT_SAMPLE_SIZE, len(df)), random_state = 42)

from tqdm import tqdm

# Start from an empty list every time this cell runs; otherwise a re-run
# keeps appending and the column assignment below fails on a length mismatch.
sent = []

# Inference only: disable autograd so no graph is built per document.
with torch.no_grad():
    for i in tqdm(range(len(df))):
        # Cast to str, matching the topic-model cell, so a non-string value
        # (e.g. NaN) does not crash the tokenizer.
        text = str(df['Description'].iloc[i])
        encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length = 512)
        output = model(**encoded_input)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)
        # Shift the arg-max class index down by one. Presumably classes 0/1/2
        # are negative/neutral/positive, giving -1/0/1 - confirm against `labels`.
        sent.append(int(np.argmax(scores)) - 1)

#df['topic'] = a[0]
df['sentiment'] = sent

df.to_pickle("bills2.pkl.xz")

print('done')