In [1]:
from bertopic import BERTopic
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from tqdm import tqdm
from umap import UMAP
from hdbscan import HDBSCAN
from bertopic.vectorizers import ClassTfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from bertopic.representation import PartOfSpeech

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# embedding_model = SentenceTransformer("Salesforce/SFR-Embedding-2_R")
# embedding_model = SentenceTransformer("mixedbread-ai/mxbai-embed-large-v1")

In [3]:
# df = pd.read_csv('twitter_dataset.csv')
# df.head()

In [4]:
from sklearn.datasets import fetch_20newsgroups

# Pull the raw document texts from the 20 Newsgroups corpus (subset="all"),
# asking scikit-learn to strip headers and footers from every post.
data = fetch_20newsgroups(
    subset="all",
    remove=("headers", "footers"),
).data

In [5]:
# Size of the raw corpus, before empty documents are filtered out.
len(data)

18846

In [6]:
# Keep only documents that still contain text once surrounding whitespace
# is stripped, then show how many documents remain.
data = [doc for doc in data if doc.strip()]
len(data)

18812

In [7]:
# embeddings = embedding_model.encode(data[:100], show_progress_bar=True)

In [8]:
# import voyageai

# vo = voyageai.Client()

# batch_size = 128
# embeddings = []

# # Use tqdm to show a progress bar
# for i in tqdm(range(0, len(data), batch_size), desc="Embedding Batches"):
#     # Embed the batch of data
#     batch_embeddings = vo.embed(
#         data[i : i + batch_size],
#         model="voyage-3",
#         input_type="document",
#     ).embeddings
    
#     # Append the embeddings to the list
#     embeddings.append(batch_embeddings)

# # Concatenate the embeddings into a single array
# embeddings = np.concatenate(embeddings, axis=0)

In [9]:
# np.save("embeddings.npy", embeddings)

In [10]:
# Load the cached document embeddings from disk instead of recomputing them.
embeddings = np.load("embeddings.npy")

# The cache is only valid if it has one row per document in the filtered
# corpus, in the same order. A count mismatch means the file was produced from
# a different version of `data` and must be regenerated; fail loudly here
# rather than letting the topic model silently pair texts with wrong vectors.
assert embeddings.shape[0] == len(data), (
    f"embeddings.npy has {embeddings.shape[0]} rows but data has "
    f"{len(data)} documents; regenerate the embeddings cache"
)

In [11]:
# Inspect the array dimensions: rows are documents, columns are embedding features.
embeddings.shape

(18812, 1024)

In [None]:
import importlib
import importlib.util
import subprocess
import sys

SPACY_MODEL = "en_core_web_lg"
RANDOM_SEED = 42

# Install the spaCy pipeline only when it is missing, so re-running the cell
# does not download ~400 MB again. `sys.executable` guarantees the package is
# installed into the interpreter that backs this kernel (a bare `!python` may
# resolve to a different environment).
if importlib.util.find_spec(SPACY_MODEL) is None:
    subprocess.run(
        [sys.executable, "-m", "spacy", "download", SPACY_MODEL],
        check=True,
    )
    # Make the freshly installed package visible to this running process.
    importlib.invalidate_caches()

# Topic words are drawn from unigrams and bigrams with English stop words removed.
vectorizer_model = CountVectorizer(stop_words="english", ngram_range=(1, 2))

# Refine the topic keywords with part-of-speech patterns from the spaCy pipeline.
representation_model = PartOfSpeech(model=SPACY_MODEL)

# Reduce the embeddings to 5 dimensions before clustering.
# UMAP is stochastic: a fixed seed makes the topics reproducible across runs.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    random_state=RANDOM_SEED,
)

# Density-based clustering of the reduced embeddings. `prediction_data=True`
# keeps the extra data HDBSCAN needs for soft cluster membership, which
# `calculate_probabilities=True` below relies on.
hdbscan_model = HDBSCAN(
    min_cluster_size=15,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
)

ctfidf_model = ClassTfidfTransformer()

# NOTE: `hdbscan_model` was previously constructed but never handed to
# BERTopic, so its settings had no effect. It is now passed explicitly.
# `min_topic_size` is dropped because BERTopic ignores it when a custom
# `hdbscan_model` is supplied; the minimum topic size is governed by
# `min_cluster_size` above (15, instead of the former effective value of 2).
model = BERTopic(
    verbose=True,
    ctfidf_model=ctfidf_model,
    calculate_probabilities=True,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    # embedding_model=embedding_model,
    vectorizer_model=vectorizer_model,
    representation_model=representation_model,
)

# Fit on the documents using the precomputed embeddings (no re-encoding).
topics, probs = model.fit_transform(data, embeddings)

Collecting en-core-web-lg==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.8.0/en_core_web_lg-3.8.0-py3-none-any.whl (400.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m400.7/400.7 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:10[0m
[?25hInstalling collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


2024-10-05 18:25:42,598 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.
2024-10-05 18:25:55,171 - BERTopic - Dimensionality - Completed ✓
2024-10-05 18:25:55,172 - BERTopic - Cluster - Start clustering the reduced embeddings


In [None]:
# Show BERTopic's built-in topic visualization for the fitted `model`; as the
# cell's last expression, whatever the call returns is rendered as the output.
model.visualize_topics()

In [None]:
# Few-shot prompt asking an LLM to label a topic with JSON of the form
# {"Themes": [...], "Overarching themes": [...]}.
# The literal contains two placeholders, [DOCUMENTS] and [KEYWORDS]
# (presumably filled in by a BERTopic LLM representation model; the consumer
# is not shown in the visible cells, so confirm before relying on this).
# The instruction paragraph is intentionally repeated after the placeholders.
# A stray line containing only "a" inside the worked example has been replaced
# by the blank paragraph separator used everywhere else in that example.
# NOTE(review): "place more focus on the dataset names" looks like a leftover
# from a different task; confirm it is wanted for newsgroup posts.
prompt = '''Based on the below information, extract and synthesize human-readable tags/keywords/themes from the text, capitalized first letters of words. What is the main human-readable theme or subject matter discussed in the provided texts? What is the overarching, high-level theme of the texts, e.g. "Music", "Sports", "Environment", etc.? Please provide overarching themes that tie the different pieces of information together. What is/are the overarching, highest level theme(s) that you could use as a keyword(s)? Prefer single word tags/keywords, e.g. "Tennis" rather than "Tennis Match", "Prison" rather than "Prison Time", etc.! Some examples of human-readable themes are   "Agriculture", "Astronomy", "Chemistry", "Computational Universe", "Computer Systems", "Climate and Environment", "Culture", "Demographics", "Earth Science", "Economics", "Education", "Engineering", "Finance", "Geography", "Government", "Games", "Health", "History", "Human Activities", "Images", "Language", "Law", "Life Science", "Machine Learning", "Manufacturing", "Mathematics", "Medicine", "Meteorology", "Physical Sciences", "Politics", "Social Media", "Sociology", "Statistics", "Text & Literature",  "Transportation". Also, don't give very similar tags/keywords, e.g. "Wine" and "Red Wine", just give one or the other in these cases. Avoid tags/keywords that are too specific, e.g. "Serine Threonine Protein Kinase". Good theme examples are: "Birds", "Species Migration", "Air Pollution", or "War", "Government", "International Relations", "Politics". Another important rule to obey - place more focus on the dataset names for theme extraction. And be concise in theme generation, e.g. instead of "Income Prediction", say "Income", instead of "Demographic Information", say "Demographics"! 
Also, extract the theme of the text, what it is about, instead of the type of problem it is, for instance we don't care about "Regression", "Numerical Features", "Data Analysis", "Data", "Outliers", "Subsampling" or things of that sort, but we care about the ESSENCE of the text! Say {"Themes": [...], "Overarching themes": [...]} and give your answer in JSON format.
For example, for this text:
Text 1: The Biden administration is preparing to roll out a sweeping border executive action as early as Tuesday, according to two sources familiar with the discussions, who cautioned that timing is fluid.

White House officials have begun reaching out to mayors who represent cities along the US southern border to potentially join President Joe Biden when he announces the order, two other sources familiar with those conversations said.

For weeks, administration officials have been working through an executive action that would dramatically limit migrants’ ability to seek asylum at the US southern border — part of a strategy to try to give Biden the upper hand on one of his Republican rival’s key campaign issues. The action is designed to potentially blunt Republican attacks on border security and preempt former President Donald Trump ahead of the first presidential debate, which will be held on June 27 on CNN.
---
Text 2: Now that a New York jury has convicted former President Donald Trump of all 34 felony charges of falsifying business records, the next obvious question is: Can a convicted felon run for president?

Definitely.

Trump meets all three requirements. There is, arguably, another criterion laid out in the 14th Amendment, where it states that no one who has previously taken an oath of office who engages in insurrection can be an officer of the US. But the US Supreme Court ruled earlier this year that Congress would have to pass a special law invoking this prohibition. That’s not happening any time soon.

Judge Juan Merchan has scheduled Trump’s sentencing for July 11, which happens to be four days before the start of the Republican National Convention that is scheduled to take place in Milwaukee.

It is technically possible, although perhaps unlikely for a first-time offender, that Trump could be sentenced to prison time.
---
This would be your answer:
{"Themes": ["Biden Administration", "Border", "Executive Action", "Asylum", "Immigration", "Trump", "Felony", "Business Records", "Presidential Campaign", "Republican", "Debate", "Former President", "Conviction", "Sentencing", "Prison", "14th Amendment", "Insurrection", "Supreme Court", "Republican National Convention"], "Overarching themes": ["Politics", "Government", "Law", "Justice", "Elections"]}
---
Now, the above was just an example. Now, do it for the following text(s), be concise!:
[DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]
---
Remember, extract and synthesize human-readable tags/keywords/themes from the text, capitalized first letters of words. What is the main human-readable theme or subject matter discussed in the provided texts? What is the overarching, high-level theme of the texts, e.g. "Music", "Sports", "Environment", etc.? Please provide overarching themes that tie the different pieces of information together. What is/are the overarching, highest level theme(s) that you could use as a keyword(s)? Prefer single word tags/keywords, e.g. "Tennis" rather than "Tennis Match", "Prison" rather than "Prison Time", etc.! Some examples of human-readable themes are   "Agriculture", "Astronomy", "Chemistry", "Computational Universe", "Computer Systems", "Climate and Environment", "Culture", "Demographics", "Earth Science", "Economics", "Education", "Engineering", "Finance", "Geography", "Government", "Games", "Health", "History", "Human Activities", "Images", "Language", "Law", "Life Science", "Machine Learning", "Manufacturing", "Mathematics", "Medicine", "Meteorology", "Physical Sciences", "Politics", "Social Media", "Sociology", "Statistics", "Text & Literature",  "Transportation". Also, don't give very similar tags/keywords, e.g. "Wine" and "Red Wine", just give one or the other in these cases. Avoid tags/keywords that are too specific, e.g. "Serine Threonine Protein Kinase". Good theme examples are: "Birds", "Species Migration", "Air Pollution", or "War", "Government", "International Relations", "Politics". Another important rule to obey - place more focus on the dataset names for theme extraction. And be concise in theme generation, e.g. instead of "Income Prediction", say "Income", instead of "Demographic Information", say "Demographics"! 
Also, extract the theme of the text, what it is about, instead of the type of problem it is, for instance we don't care about "Regression", "Numerical Features", "Data Analysis", "Data", "Outliers", "Subsampling" or things of that sort, but we care about the ESSENCE of the text! Say {"Themes": [...], "Overarching themes": [...]} and give your answer in JSON format.
'''

In [126]:
# # import cosine similarity
# from sklearn.metrics.pairwise import cosine_similarity

# # calculate cosine similarity between all sentences
# cosine_sim = cosine_similarity(embeddings, embeddings)
# cosine_sim

array([[1.        , 1.        , 0.93582286, 0.68280542, 0.53965799],
       [1.        , 1.        , 0.93582286, 0.68280542, 0.53965799],
       [0.93582286, 0.93582286, 1.        , 0.69075243, 0.5471694 ],
       [0.68280542, 0.68280542, 0.69075243, 1.        , 0.67244471],
       [0.53965799, 0.53965799, 0.5471694 , 0.67244471, 1.        ]])

In [34]:
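# import os
# import requests

# API_URL = "https://se1nsjdwu8nlsqwt.us-east-1.aws.endpoints.huggingface.cloud"
# headers = {
# 	"Accept" : "application/json",
# 	# Token redacted: read it from the environment instead of hardcoding it.
# 	# The token previously committed here must be treated as leaked and revoked.
# 	"Authorization": f"Bearer {os.environ['HF_TOKEN']}",
# 	"Content-Type": "application/json" 
# }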

# def query(payload):
# 	response = requests.post(API_URL, headers=headers, json=payload)
# 	return response.json()

# output = query({
# 	"inputs": newsgroup.data[:31],
# 	"parameters": {}
# })

# # same but for cycle for each individual sentence
# # outputs = []
# # for sentence in newsgroup.data[:32]:
# #     outputs.append(query({
# #         "inputs": sentence,
# #         "parameters": {}
# #     }))
# # same but in batches of 20