Required Libraries

In [None]:
!pip install datasets
!pip install umap
!pip install umap-learn
!pip install nltk
!pip install sentence_transformers
!pip install HDBSCAN

In [None]:
from datasets import load_dataset

# Fetch the CMU Book Summaries dataset and keep only its "train" split.
dataset = load_dataset("textminr/cmu-book-summaries")["train"]

# The "summary" column: the raw texts used for topic modelling below.
summaries = dataset["summary"]

Extract canonical authors only:

In [None]:
import pandas as pd

df = pd.DataFrame(dataset)

# Authors regarded as "canonical" for this analysis. Each name is listed once;
# `Series.isin` only tests membership, so repeated entries add nothing.
#
# Names containing an apostrophe are listed with both the typographic (’) and
# the ASCII (') form, because the match against the `author` column is exact.
# NOTE(review): other spellings may also differ from the dataset
# (e.g. 'Emily Bronte', 'J.D. Salinger', 'S.E. Hinton') — verify against
# df['author'].unique().
target_authors = [
    'Louisa May Alcott',
    'Ray Bradbury',
    'Willa Cather',
    'Louise Erdrich',
    'Paul Fleischman',
    'Russell Freedman',
    'Laura Hillenbrand',
    'Sue Monk Kidd',
    'Robert Louis Stevenson',
    'Mark Twain',
    'Gloria Whelan',
    'Elie Wiesel',
    'Sarah Hopkins Bradford',
    'Emily Bronte',
    'Agatha Christie',
    'Charles Dickens',
    'F. Scott Fitzgerald',
    'William Golding',
    'Harper Lee',
    'Madeleine L’Engle',
    "Madeleine L'Engle",
    'Arthur Miller',
    'Bill O’Reilly',
    "Bill O'Reilly",
    'George Orwell',
    'Jack Schaefer',
    'William Shakespeare',
    'John Steinbeck',
    'Sabaa Tahir',
    'Jane Austen',
    'Nathaniel Hawthorne',
    'Ernest Hemingway',
    'J.D. Salinger',
    'Miguel de Cervantes',
    'Mary Shelley',
    'Geoffrey Chaucer',
    'Leo Tolstoy',
    'Albert Camus',
    'John Milton',
    'Homer',
    'William Wordsworth',
    'William Faulkner',
    'Zora Neale Hurston',
    'Maya Angelou',
    'Fyodor Dostoevsky',
    'S.E. Hinton',
    'Edgar Allan Poe',
]

# Keep only the rows whose author is exactly one of the names above.
filtered_df_canon = df[df['author'].isin(target_authors)]


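Tokenize the summaries into sentences and replace filler phrases (e.g. "the novel", "this book") with "it". These phrases carry no thematic meaning but would otherwise dominate the topic representations.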

In [None]:
import nltk
from nltk.tokenize import sent_tokenize

# `sent_tokenize` needs the Punkt sentence-tokenizer data. Recent NLTK releases
# load it from the "punkt_tab" resource rather than the older pickled "punkt"
# one, so fetch both: whichever NLTK version pip resolved will find its data.
# (An NLTK version that does not know a resource id just reports it and moves on.)
nltk.download('punkt')
nltk.download('punkt_tab')

def tokenize_summaries_into_sentences(summaries, phrases_to_replace):
    """Split summaries into sentences and neutralise boilerplate phrases.

    Every summary is split with NLTK's ``sent_tokenize``. In each resulting
    sentence, every occurrence of a phrase from ``phrases_to_replace`` is
    replaced by the word "it" (case-sensitive, plain substring matching).

    Phrases are applied longest first. Otherwise a short phrase such as
    "novel" would be substituted before a longer one containing it
    ("the novel"), which then could never match and would leave "the it".

    Args:
        summaries: Iterable of summary strings.
        phrases_to_replace: Iterable of phrases to replace with "it".

    Returns:
        A flat list of processed sentences, in the order they occur in
        ``summaries``.
    """
    # sorted() is stable, so phrases of equal length keep the caller's order.
    ordered_phrases = sorted(phrases_to_replace, key=len, reverse=True)

    tokenized_sentences = []
    for summary in summaries:
        for sentence in sent_tokenize(summary):
            for phrase in ordered_phrases:
                sentence = sentence.replace(phrase, "it")
            tokenized_sentences.append(sentence)
    return tokenized_sentences

# Filler phrases that are swapped for "it" in every sentence before modelling.
phrases_to_replace = [
    "this novel",
    "novel",
    "this book",
    "characters",
    "the story",
    "this story",
    "The story",
    "the novel",
    "narrator",
    "book",
    "chapter",
]

# NOTE(review): this processes every summary in the dataset; the
# canonical-author subset (`filtered_df_canon`) is not used here — confirm
# that is intended.
tokenized_sentences = tokenize_summaries_into_sentences(
    summaries,
    phrases_to_replace,
)


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Embed the Sentences

In [None]:
from sentence_transformers import SentenceTransformer

# Sentence encoder used for every document embedding in the pipeline.
embedding_model = SentenceTransformer("all-distilroberta-v1")

# Encode all sentences once up front; the resulting array is handed to
# BERTopic at fit time together with the sentences.
embeddings = embedding_model.encode(
    tokenized_sentences,
    show_progress_bar=True,
)

Dimensionality Reduction

In [None]:
from umap import UMAP

# Project the embeddings down to 5 components using cosine distance.
# The fixed random_state keeps the projection repeatable between runs.
umap_model = UMAP(
    n_neighbors=15,
    n_components=5,
    min_dist=0.0,
    metric='cosine',
    random_state=42,
)

Clustering

In [None]:
from hdbscan import HDBSCAN

# Density-based clustering of the reduced embeddings; a cluster needs at
# least 100 sentences. prediction_data=True is kept on so the fitted
# clusterer retains the extra data HDBSCAN generates for later prediction.
hdbscan_model = HDBSCAN(
    min_cluster_size=100,
    metric='euclidean',
    cluster_selection_method='eom',
    prediction_data=True,
)

Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Bag-of-n-grams (1 to 4 words) for the topic keywords: English stop words
# are dropped and terms seen in fewer than 5 documents are ignored.
vectorizer_model = CountVectorizer(
    stop_words="english",
    min_df=5,
    ngram_range=(1, 4),
)

Initialize GPT for Interpretable Topics

In [None]:
import os
from getpass import getpass

import openai
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech


# GPT-3.5
# [DOCUMENTS] and [KEYWORDS] are placeholders that BERTopic substitutes with
# each topic's representative documents and keywords.
prompt = """
I have a topic that contains the following documents: [DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short topic label of at most 3 words. It should be a theme.
topic: <theme>
"""

# Requires an API key. Never paste it into the notebook: it is read from the
# OPENAI_API_KEY environment variable, falling back to an interactive prompt.
client = openai.OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
)
openai_model = OpenAI(client, model="gpt-3.5-turbo", exponential_backoff=True, chat=True, prompt=prompt)

# Extra topic representations, keyed by the name under which BERTopic
# reports them.
representation_model = {
    "OpenAI": openai_model # If you don't have API, just comment this out.

}

Train and Fit the Model

In [None]:
from bertopic import BERTopic

# Assemble the pipeline from the components configured in the cells above.
topic_model = BERTopic(
    # Pipeline models
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    # Built in the "Initialize GPT" cell; without this argument that cell
    # has no effect on the fitted model.
    representation_model=representation_model,

    # Hyperparameters
    top_n_words=10,
    verbose=True,
    # Return the full document-topic probability matrix as `probs`.
    calculate_probabilities=True,
)

# The embeddings were computed earlier, so pass them in instead of letting
# BERTopic re-encode every sentence.
topics, probs = topic_model.fit_transform(tokenized_sentences, embeddings)

Get Topic List

In [None]:
# Show the fitted model's per-topic overview as the cell output.
topic_model.get_topic_info()

Cosine Similarity

In [None]:
# Render BERTopic's heatmap of pairwise topic similarity as the cell output.
topic_model.visualize_heatmap()

Outlier Reduction and Visualization

In [None]:
from collections import Counter
import matplotlib.pyplot as plt

# reduce_outliers needs the documents and their current topic assignments and
# RETURNS the new assignments. It does not change the fitted model; call
# topic_model.update_topics(tokenized_sentences, topics=new_topics) if the
# model's representations should reflect the reassignment.
new_topics = topic_model.reduce_outliers(tokenized_sentences, topics)

# Number of sentences per topic id before and after reassignment.
topic_counts_before = Counter(topics)
topic_counts_after = Counter(new_topics)

plt.figure(figsize=(10, 5))
plt.bar(topic_counts_before.keys(), topic_counts_before.values(), alpha=0.5, label='Before Reduction')
plt.bar(topic_counts_after.keys(), topic_counts_after.values(), alpha=0.5, label='After Reduction')
plt.xlabel('Topic ID')
plt.ylabel('Frequency')
plt.title('Topic Distribution Before and After Outlier Reduction')
plt.legend()
plt.show()


Perplexity:

In [None]:
import numpy as np

# `probs` is the document-topic probability matrix returned by
# `topic_model.fit_transform` in the training cell (calculate_probabilities=True),
# so the model does not have to be fitted a second time here.
doc_topic_mass = np.sum(probs, axis=1)

# A document with zero total probability mass would give log(0) = -inf and an
# infinite perplexity; clamp to the smallest positive float instead.
doc_topic_mass = np.clip(doc_topic_mass, np.finfo(float).tiny, None)

log_perplexity = -1 * np.mean(np.log(doc_topic_mass))
perplexity = np.exp(log_perplexity)
perplexity

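Multi-aspect representation modeling with part-of-speech (POS) keywords. To use it, replace the contents of the "Initialize GPT for Interpretable Topics" cell with the cell below, then re-run the training cell.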

In [None]:
import os
from getpass import getpass

import openai
from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance, OpenAI, PartOfSpeech


# GPT-3.5
# [DOCUMENTS] and [KEYWORDS] are placeholders that BERTopic substitutes with
# each topic's representative documents and keywords.
prompt = """
I have a topic that contains the following documents: [DOCUMENTS]
The topic is described by the following keywords: [KEYWORDS]

Based on the information above, extract a short topic label of at most 3 words. It should be a theme.
topic: <theme>
"""

# Requires an API key. Never paste it into the notebook: it is read from the
# OPENAI_API_KEY environment variable, falling back to an interactive prompt.
client = openai.OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
)
openai_model = OpenAI(client, model="gpt-3.5-turbo", exponential_backoff=True, chat=True, prompt=prompt)

# Part-of-speech based keyword representation. Needs spaCy and its small
# English pipeline: `python -m spacy download en_core_web_sm`.
pos_model = PartOfSpeech("en_core_web_sm")

# Extra topic representations, keyed by the name under which BERTopic
# reports them.
representation_model = {
    "OpenAI": openai_model, # If you don't have API, just comment this out.
    "POS": pos_model
}