In [1]:
import pandas as pd
import ast  # To safely evaluate string representations of lists
from collections import OrderedDict # Creates a sorted dictionary (sorted by key)

In [2]:
# Read only the `topic_tags` column; usecols skips every other column in the CSV.
df = pd.read_csv(
    "parsed_bills_115-119_chunks_only_embedded.csv",
    usecols=["topic_tags"],
)
df

Unnamed: 0,topic_tags
0,"['Congress', 'Government Operations and Politi..."
1,"['Taxation', 'Social Welfare', 'Commemorations..."
2,"['Congress', 'Economics and Public Finance', '..."
3,"['Congress', 'Economics and Public Finance', '..."
4,"['Congress', 'Commemorations', 'Agriculture an..."
...,...
121233,"['Congress', 'Commemorations', 'Social Science..."
121234,"['International Affairs', 'Security and Intern..."
121235,"['International Affairs', 'Security and Intern..."
121236,"['Education', 'Government Operations and Polit..."


In [3]:
# Print a summary of the frame: index range, per-column non-null counts and
# dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 121238 entries, 0 to 121237
Data columns (total 1 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   topic_tags  121238 non-null  object
dtypes: object(1)
memory usage: 947.3+ KB


In [4]:
# Parse the stringified tag lists read from the CSV into real Python lists.
# ast.literal_eval only evaluates Python literals (it never executes code).
# Values that are no longer strings (i.e. this cell has already been run once)
# are copied through unchanged, so re-running the cell does not raise.
df["topic_tags"] = df["topic_tags"].apply(
    lambda x: list(ast.literal_eval(x)) if isinstance(x, str) else list(x)
)

In [5]:
# Count every occurrence of each tag string across all rows.
# A plain dict is used, so keys stay in the order they are first seen.
topic_counts = {}
for tags in df['topic_tags']:
    for topic in tags:
        # .get supplies 0 the first time a tag is encountered.
        topic_counts[topic] = topic_counts.get(topic, 0) + 1

In [6]:
# Display the tag -> occurrence-count mapping (dict order = first-seen order).
topic_counts

{'Congress': 82736,
 'Government Operations and Politics': 31005,
 'Economics and Public Finance': 37010,
 'House Operations and Politics': 205,
 'Commemorations': 46776,
 'Labor and Employment': 24599,
 'House and Senate': 406,
 'Senate and Congress': 239,
 'House and Senate Committees': 1,
 'House and Senate Affairs': 1,
 'Taxation': 45823,
 'Social Welfare': 48729,
 'Education': 30570,
 'Health': 46979,
 'Taxation.': 81,
 'Social welfare': 5801,
 'Taxations': 3284,
 'Finance and Financial Sector': 25900,
 'Agriculture and Food': 42207,
 'Transportation and Public Works': 40801,
 'Commerce': 45055,
 'Congress and other stakeholders': 7,
 'Law': 24107,
 'Agriculture and food': 12015,
 'Commemoration': 4592,
 'Congress,': 4955,
 'Arts, Culture, Religion': 13390,
 'Houses and Representatives': 26,
 'Capitol and State Security': 169,
 'Capitol and State Affairs': 4,
 'Houses and Minorities': 2,
 'Senate Minority Congress': 459,
 'Senate Minority Affairs': 309,
 'Senate Minority Committee

In [7]:
# Rank (tag, count) pairs from most to least frequent. Ties on count are broken
# by tag name, also descending, because reverse=True applies to the whole key.
sorted_items = list(topic_counts.items())
sorted_items.sort(key=lambda pair: (pair[1], pair[0]), reverse=True)
sorted_items

[('Congress', 82736),
 ('Social Welfare', 48729),
 ('Health', 46979),
 ('Commemorations', 46776),
 ('Taxation', 45823),
 ('Commerce', 45055),
 ('Agriculture and Food', 42207),
 ('Transportation and Public Works', 40801),
 ('Emergency Management', 39060),
 ('Economics and Public Finance', 37010),
 ('Crime and Law Enforcement', 33551),
 ('Government Operations and Politics', 31005),
 ('Education', 30570),
 ('International Affairs', 27330),
 ('Finance and Financial Sector', 25900),
 ('Social Sciences and History', 25287),
 ('Labor and Employment', 24599),
 ('Law', 24107),
 ('Immigration', 20359),
 ('Environmental Protection', 20080),
 ('Energy', 18607),
 ('Science, Technology, Communications', 16180),
 ('Arts, Culture, Religion', 13390),
 ('Families', 12971),
 ('Private Legislation', 12512),
 ('Agriculture and food', 12015),
 ('Congress, Congress', 10141),
 ('Water Resources Development', 9730),
 ('Crime, Law Enforcement', 9691),
 ('Animals', 9250),
 ('Foreign Trade and International Fina

In [8]:
# Number of distinct tag strings (one entry per key of topic_counts).
len(sorted_items)

5374

In [10]:
# Build a two-column frame from the ranked (tag, count) tuples.
# The plain DataFrame constructor is used because the input is a list of
# tuples; DataFrame.from_dict is documented for dict input only.
# Columns keep their default integer labels: 0 holds the tag, 1 the count.
df1 = pd.DataFrame(sorted_items)
df1

Unnamed: 0,0,1
0,Congress,82736
1,Social Welfare,48729
2,Health,46979
3,Commemorations,46776
4,Taxation,45823
...,...,...
5369,Africa and Pacific Conflict,1
5370,Africa and Oceans,1
5371,Afghanistan,1
5372,Aeronautica,1
