In [2]:
import tensorflow as tf
import tensorflow_datasets as tfds
import bertopic
from umap import UMAP
from hdbscan import HDBSCAN
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import CountVectorizer

from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired
from bertopic.vectorizers import ClassTfidfTransformer



In [3]:
ds = tfds.load('wikipedia/20200301.en',split='train')
for example in ds.take(10):
    print(example['text'].numpy().decode('utf-8'))

Joseph Harold Greenberg (May 28, 1915 – May 7, 2001) was an American linguist, known mainly for his work concerning linguistic typology and the genetic classification of languages.

Life

Early life and education 

Joseph Greenberg was born on May 28, 1915 to Jewish parents in Brooklyn, New York. His first great interest was music. At the age of 14, he gave a piano concert in Steinway Hall. He continued to play the piano frequently throughout his life.

After finishing high school, he decided to pursue a scholarly career rather than a musical one. He enrolled at Columbia University in New York. During his senior year, he attended a class taught by Franz Boas concerning American Indian languages. With references from Boas and Ruth Benedict, he was accepted as a graduate student by Melville J. Herskovits at Northwestern University in Chicago. During the course of his graduate studies, Greenberg did fieldwork among the Hausa people of Nigeria, where he learned the Hausa language. The subj

2025-02-08 17:07:36.581667: I tensorflow/core/kernels/data/tf_record_dataset_op.cc:376] The default buffer size is 262144, which is overridden by the user specified `buffer_size` of 8388608
2025-02-08 17:07:36.770427: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: OUT_OF_RANGE: End of sequence


In [10]:

docs = [i['text'].numpy().decode('utf-8') for i in ds.take(5000)]


In [11]:
# Step 1 - Extract embeddings
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

# Step 2 - Reduce dimensionality
umap_model = UMAP(n_neighbors=15, n_components=5, min_dist=0.0, metric='cosine')

# Step 3 - Cluster reduced embeddings
hdbscan_model = HDBSCAN(min_cluster_size=15, metric='euclidean', cluster_selection_method='eom', prediction_data=True)

# Step 4 - Tokenize topics
vectorizer_model = CountVectorizer(stop_words="english")

# Step 5 - Create topic representation
ctfidf_model = ClassTfidfTransformer(bm25_weighting=False, reduce_frequent_words=False)

# Step 6 - (Optional) Fine-tune topic representations with 
# a `bertopic.representation` model
representation_model = KeyBERTInspired()

# All steps together
topic_model = BERTopic(
  embedding_model=embedding_model,          # Step 1 - Extract embeddings
  umap_model=umap_model,                    # Step 2 - Reduce dimensionality
  hdbscan_model=hdbscan_model,              # Step 3 - Cluster reduced embeddings
  vectorizer_model=vectorizer_model,        # Step 4 - Tokenize topics
  ctfidf_model=ctfidf_model,                # Step 5 - Extract topic words
  representation_model=representation_model # Step 6 - (Optional) Fine-tune topic representations
)

In [12]:

topics, probs = topic_model.fit_transform(docs)
topics, probs

([-1,
  22,
  2,
  -1,
  1,
  -1,
  1,
  37,
  23,
  12,
  16,
  -1,
  0,
  52,
  50,
  0,
  44,
  11,
  -1,
  -1,
  1,
  11,
  2,
  -1,
  0,
  -1,
  19,
  -1,
  -1,
  3,
  -1,
  16,
  6,
  4,
  31,
  34,
  -1,
  8,
  20,
  -1,
  1,
  -1,
  5,
  -1,
  43,
  -1,
  -1,
  36,
  3,
  12,
  -1,
  12,
  10,
  9,
  -1,
  -1,
  0,
  44,
  3,
  4,
  -1,
  31,
  32,
  -1,
  8,
  38,
  -1,
  2,
  -1,
  3,
  1,
  47,
  21,
  7,
  5,
  38,
  24,
  4,
  29,
  24,
  21,
  42,
  58,
  43,
  15,
  25,
  10,
  0,
  15,
  9,
  0,
  36,
  11,
  29,
  1,
  -1,
  -1,
  14,
  9,
  28,
  -1,
  -1,
  -1,
  -1,
  6,
  13,
  -1,
  47,
  23,
  2,
  22,
  37,
  22,
  -1,
  0,
  32,
  12,
  3,
  41,
  10,
  -1,
  20,
  23,
  -1,
  38,
  -1,
  4,
  2,
  16,
  28,
  9,
  -1,
  -1,
  40,
  54,
  4,
  11,
  18,
  -1,
  0,
  36,
  11,
  15,
  -1,
  8,
  40,
  -1,
  47,
  40,
  18,
  15,
  -1,
  -1,
  -1,
  4,
  0,
  50,
  34,
  9,
  20,
  9,
  8,
  51,
  18,
  -1,
  39,
  42,
  29,
  47,
  3,
  8,
  -1,
  26,
  35,
  -1

In [13]:
topics, probs

([-1,
  22,
  2,
  -1,
  1,
  -1,
  1,
  37,
  23,
  12,
  16,
  -1,
  0,
  52,
  50,
  0,
  44,
  11,
  -1,
  -1,
  1,
  11,
  2,
  -1,
  0,
  -1,
  19,
  -1,
  -1,
  3,
  -1,
  16,
  6,
  4,
  31,
  34,
  -1,
  8,
  20,
  -1,
  1,
  -1,
  5,
  -1,
  43,
  -1,
  -1,
  36,
  3,
  12,
  -1,
  12,
  10,
  9,
  -1,
  -1,
  0,
  44,
  3,
  4,
  -1,
  31,
  32,
  -1,
  8,
  38,
  -1,
  2,
  -1,
  3,
  1,
  47,
  21,
  7,
  5,
  38,
  24,
  4,
  29,
  24,
  21,
  42,
  58,
  43,
  15,
  25,
  10,
  0,
  15,
  9,
  0,
  36,
  11,
  29,
  1,
  -1,
  -1,
  14,
  9,
  28,
  -1,
  -1,
  -1,
  -1,
  6,
  13,
  -1,
  47,
  23,
  2,
  22,
  37,
  22,
  -1,
  0,
  32,
  12,
  3,
  41,
  10,
  -1,
  20,
  23,
  -1,
  38,
  -1,
  4,
  2,
  16,
  28,
  9,
  -1,
  -1,
  40,
  54,
  4,
  11,
  18,
  -1,
  0,
  36,
  11,
  15,
  -1,
  8,
  40,
  -1,
  47,
  40,
  18,
  15,
  -1,
  -1,
  -1,
  4,
  0,
  50,
  34,
  9,
  20,
  9,
  8,
  51,
  18,
  -1,
  39,
  42,
  29,
  47,
  3,
  8,
  -1,
  26,
  35,
  -1

In [14]:
!pip install matplotlib

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting matplotlib
  Downloading matplotlib-3.9.4-cp39-cp39-macosx_11_0_arm64.whl.metadata (11 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Downloading contourpy-1.3.0-cp39-cp39-macosx_11_0_arm64.whl.metadata (5.4 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl.metadata (3.8 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Downloading fonttools-4.56.0-cp39-cp39-macosx_10_9_universal2.whl.metadata (101 kB)
Collecting kiwisolver>=1.3.1 (from matplotlib)
  Downloading kiwisolver-1.4.7-cp39-cp39-macosx_11_0_arm64.whl.metadata (6.3 kB)
Collecting pyparsing>=2.3.1 (from matplotlib)
  Using cached pyparsing-3.2.1-py3-none-any.whl.metadata (5.0 kB)
Downloading matplotlib-3.9.4-cp39-cp39-macosx_11_0_arm64.whl (7.8 MB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m14.2 MB/s[0m eta [36m0:00:00[0m31m14.4 MB/s[0m eta [36m0:00:01[0m
[?25hDownloading contourpy-1.3.0-cp39-cp39-macosx_11_0_

In [15]:
import tensorflow_datasets as tfds
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
from config import config

# 1. Load Wikipedia Data
def load_wikipedia_data(sample_size=config.sample_size):
    ds = tfds.load('wikipedia/20200301.en', split='train', shuffle_files=True)
    texts = []
    for example in ds.take(sample_size):  # Reduce for quick experimentation
        texts.append(example['text'].numpy().decode('utf-8'))
    return texts

# 2. Preprocess Data (simple version)
texts = load_wikipedia_data()


In [16]:

# 3. Create Document Embeddings using BERT
embedder = SentenceTransformer('sentence-transformers/multi-qa-MiniLM-L6-dot-v1', device='cuda')
embeddings = embedder.encode(texts, show_progress_bar=True)

# 4. Cluster Topics with BERTopic
topic_model = BERTopic(embedding_model=embedder, min_topic_size=15)
topics, _ = topic_model.fit_transform(texts, embeddings)

# 5. Create Knowledge Graph Nodes (Topics)
topic_info = topic_model.get_topic_info()
topic_embeddings = np.array([topic_model.topic_embeddings_[topic] 
                           for topic in topic_info['Topic'] if topic != -1])

# 6. Calculate Cosine Similarity for Edges
similarity_matrix = cosine_similarity(topic_embeddings)


AssertionError: Torch not compiled with CUDA enabled

In [23]:
!export CUDA_VISIBLE_DEVICES=1

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:

# 7. Build Graph
G = nx.Graph()
threshold = 0.65  # Similarity threshold for edges

# Add nodes with metadata
for idx, row in topic_info.iterrows():
    if row['Topic'] != -1:
        G.add_node(row['Topic'],
                   label=f"Topic {row['Topic']}",
                   keywords=", ".join([word[0] for word in topic_model.get_topic(row['Topic'])]),
                   size=row['Count'])

# Add edges based on similarity
for i in range(len(topic_embeddings)):
    for j in range(i+1, len(topic_embeddings)):
        if similarity_matrix[i][j] > threshold:
            G.add_edge(topic_info.iloc[i]['Topic'], 
                      topic_info.iloc[j]['Topic'],
                      weight=similarity_matrix[i][j])

# 8. Visualize Knowledge Graph
plt.figure(figsize=(20, 15))
pos = nx.spring_layout(G, k=0.5)

node_sizes = [G.nodes[node]['size']*10 for node in G.nodes]
edge_weights = [G.edges[edge]['weight']*2 for edge in G.edges]

nx.draw_networkx_nodes(G, pos, node_size=node_sizes, alpha=0.8)
nx.draw_networkx_edges(G, pos, width=edge_weights, alpha=0.2)
nx.draw_networkx_labels(G, pos, 
                        labels={node:G.nodes[node]['label'] for node in G.nodes},
                        font_size=8)

# Add keyword annotations
for node in G.nodes:
    plt.annotate(G.nodes[node]['keywords'], 
                 xy=pos[node], 
                 xytext=(10, -10),
                 textcoords='offset points',
                 fontsize=6,
                 alpha=0.7)

plt.title("Wikipedia Knowledge Graph (BERTopic + BERT Embeddings)")
plt.show()

In [15]:
!pip3 install matplotlib

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting matplotlib
  Using cached matplotlib-3.10.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.6 MB)
Collecting pyparsing>=2.3.1
  Downloading pyparsing-3.2.1-py3-none-any.whl (107 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m107.7/107.7 KB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Collecting contourpy>=1.0.1
  Using cached contourpy-1.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (324 kB)
Collecting kiwisolver>=1.3.1
  Using cached kiwisolver-1.4.8-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (1.6 MB)
Collecting fonttools>=4.22.0
  Downloading fonttools-4.56.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.6 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m
[?25hCollecting cycler>=0.10
  Using cached cycler-0.12.1-py3-none-any.whl (8.3 kB)
Installing collected package