In [None]:
import matplotlib.pyplot as plt

# Number of articles per date, smoothed with a 30-row rolling mean.
# The window counts rows (dates that have at least one article), not calendar days.
fig, ax = plt.subplots(figsize=(10, 6))
(
    articles.groupby("date")
    .size()
    .rolling(30)
    .mean()
    .plot(ax=ax)  # draw on the axes created above instead of relying on pyplot state
)
# Label the figure so it can be read on its own; the trailing ";" hides the repr
ax.set(
    title="Articles per date (30-point rolling mean)",
    xlabel="Date",
    ylabel="Articles",
);

In [None]:
def find_colon_splits(x):
    """Return the text before the first ": " in ``x``, or None if ``x`` has no ": "."""
    if ": " not in x:
        return None
    prefix, _, _ = x.partition(": ")
    return prefix
# Distinct colon prefixes found across all titles, ordered by how often they occur
(
    articles["title"]
    .apply(find_colon_splits)
    .value_counts()
    .index
    .tolist()
)

I'm clustering the articles on the titles, as titles should contain the story more succinctly than the full text. 

Many article titles, however, are in the format `$SECTION: $THE_ACTUAL_TITLE`, indicated by the presence of a colon.

This biases the embedding, as the vectorisation process takes into account phrases like "The Guardian View" or "Monday Briefing".

I find all instances of colon-split titles, and make some arbitrary decisions about which to trim. My general heuristic is to remove:

- Anything which is clearly a recurring feature for a paper (e.g., Wednesday briefing)
- Anything containing `$SOMETHING_ABOUT_THE_WAR latest|live|briefing`
- Journalistic tropes, e.g., "Revealed", "Analysis"

I leave in everything else, e.g. "Trump: I hate that Epstein guy".

The below cell identifies the parts to crop, tests to see if cropping leaves empty titles, and applies the cropping.

In [None]:
# Section prefixes to strip from titles. Each entry is matched as "<entry>:" at
# the very start of a title by the cropping code later in this cell, so entries
# are written without the trailing colon. Matching uses plain str.startswith and
# is therefore case-sensitive. Entries are cropped in list order.
bad_parts = [
    "First Thing",
    "Middle East crisis live",
    "Morning Mail",
    "Afternoon Update",
    "Watch",
    "Israel-Gaza war live",
    "Israel-Hamas war latest",
    "Letters",
    "Wednesday briefing",
    "Israel-Hamas war live",
    "Friday briefing",
    "Tuesday briefing",
    "Monday briefing",
    "Australia news live",
    "Thursday briefing",
    "Revealed",
    "Five Great Reads",
    "Wednesday evening news briefing",
    "Israel-Gaza latest news",
    "Battle Lines",
    "Friday evening news briefing",
    "Monday evening news briefing",
    "Thursday evening news briefing",
    "Tuesday evening news briefing",
    "Israel-Hamas war latest news",
    "TV tonight",
    "Politics latest news",
    "Israel briefing",
    "Israel-Palestine latest news",
    "Dining across the divide",
    "Pictured",
    "Digested week",
    "The Observer view",
    "The Daily T",
    "Australia politics live",
    "Guardian Essential poll",
    "Israel-Gaza war latest",
    "Middle East live",
    "The week in TV",
    "The Crunch",
    "On my radar",
    "Russia-Ukraine war live",
    "Israel-Iran war latest",
    "Analysis",
    # Considered but deliberately left out, because these didn't split:
    # "The week in parliament",
    # "The Guardian view on war in the Middle East",
]

# Remove "The Guardian view on ..." from titles
articles["title"] = articles["title"].str.replace(
    r"^The Guardian view on ", "", regex=True
)

# Collect (prefix, remainder) for every title that starts with "<prefix>:".
# The startswith filter guarantees the colon is present, so the split below
# always yields two pieces and cannot raise.
bad_titles = []
for part in bad_parts:
    matching_titles = articles.loc[
        articles["title"].str.startswith(part + ":"), "title"
    ].to_list()
    bad_titles.extend((part, title.split(":", 1)[1]) for title in matching_titles)

# Test that cropping never leaves an empty title: a title consisting only of
# its section prefix would otherwise be passed on as an empty string.
empty_after_crop = [
    (prefix, remainder) for prefix, remainder in bad_titles if not remainder.strip()
]
assert not empty_after_crop, (
    f"Cropping would leave {len(empty_after_crop)} empty title(s) for prefixes: "
    f"{sorted({prefix for prefix, _ in empty_after_crop})}"
)


def crop_part(part, title):
    """Strip a leading ``"<part>:"`` section prefix from ``title``.

    Args:
        part: Section name to remove, written without the trailing colon.
        title: Article title to inspect.

    Returns:
        The text after the ``"<part>:"`` prefix with surrounding whitespace
        stripped, or ``title`` unchanged when it does not start with that prefix.
    """
    prefix = part + ":"
    if not title.startswith(prefix):
        return title
    # Slice by prefix length rather than splitting on the first colon, so a
    # ``part`` that itself contains a colon is removed in full.
    return title[len(prefix):].strip()


# Strip every listed section prefix in turn, then write the result back once
cropped_titles = articles["title"]
for part in bad_parts:
    cropped_titles = cropped_titles.apply(
        lambda title, prefix=part: crop_part(prefix, title)
    )
articles["title"] = cropped_titles

In [None]:
# Guardian headlines often carry the writer's name after a pipe,
# e.g., "I had beans on toast for breakfast – and it was a mistake | Adrian Chiles"
# Only the text before the first " | " is kept


def find_pipe_splits(x):
    """Return ``x`` up to the first " | ", or ``x`` unchanged if there is none."""
    if " | " not in x:
        return x
    return x.split(" | ", 1)[0]
# Drop the trailing " | <author>" part from every title
titles_without_byline = articles["title"].apply(find_pipe_splits)
articles["title"] = titles_without_byline

In [None]:
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from sentence_transformers import SentenceTransformer

# Custom UMAP for dimensionality reduction: 15 neighbours, 6 output components,
# and a fixed random_state for reproducibility.
# NOTE(review): this comment previously described cosine distance and 20
# neighbours, but no `metric` is passed here (so UMAP's own default applies)
# and n_neighbors is 15 — confirm which was intended, and pass metric="cosine"
# explicitly if cosine distance is wanted.
umap_model = UMAP(n_neighbors=15, n_components=6, random_state=42)

# Picking a higher-quality model than the default "all-MiniLM-L6-v2"
embedding_model = SentenceTransformer("BAAI/bge-base-en-v1.5")

# Tweaking the hdbscan model to provide slightly larger, denser clusters
# (prediction_data=True is kept so the model can produce the per-topic
# probabilities requested via calculate_probabilities when BERTopic is built)
hdbscan_model = HDBSCAN(
    min_cluster_size=10,
    metric="euclidean",
    cluster_selection_method="eom",
    prediction_data=True,
    min_samples=5,
)

# Using a CountVectorizer to remove English stop words in topic representation
vectorizer_model = CountVectorizer(stop_words="english")


# Clustering on title, which should capture the main event for each article
docs = articles["title"].tolist()


# Creating the BERTopic model with the custom components
topic_model = BERTopic(
    embedding_model=embedding_model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    calculate_probabilities=True,  # needed by the "probabilities" outlier strategy below
    verbose=True,
)

topics, probs = topic_model.fit_transform(docs)

# HDBSCAN can produce a lot of outlier topics, I reduce these by reassigning
# outlier documents using the topic probabilities computed during fitting
new_topics = topic_model.reduce_outliers(
    docs,
    topics,
    probabilities=probs,
    strategy="probabilities",
)

# Update the model with the outlier-reduced topics. The vectorizer is passed
# again because update_topics() recomputes the topic representations and uses a
# default CountVectorizer when none is given, which would drop the English
# stop-word filtering from the topic words and labels.
topic_model.update_topics(
    docs,
    topics=new_topics,
    vectorizer_model=vectorizer_model,
)

# Building a hierarchy of topics
# NOTE(review): check the hierarchical_topics() signature of the installed
# BERTopic version — confirm the second positional argument is still a topics
# list there and is not being bound to a different parameter.
hierarchical_topics = topic_model.hierarchical_topics(docs, new_topics)

# Assigning topics back to the dataframe
articles["topic_n"] = new_topics

# Add in topic labels
articles["topic_label"] = articles["topic_n"].map(
    topic_model.get_topic_info().set_index("Topic")["Name"]
)

In [None]:
# Interactive document map coloured by topic hierarchy, with title hover text enabled
topic_model.visualize_hierarchical_documents(
    docs=articles["title"].tolist(),
    hierarchical_topics=hierarchical_topics,
    hide_document_hover=False,
)