# Data

In [1]:
# Import some common libraries
import numpy as np
import pandas as pd
import ast
import json
import os

In [2]:
# Read the CSV file containing the metadata for articles from January 1, 2000 to May 15, 2023
# df = pd.read_csv('nyt-metadata.csv', low_memory=False)
# Read the CSV file containing the metadata for articles from May 15, 2023 to May 14, 2024
df = pd.read_csv('nyt-metadata-2.csv', usecols=lambda column: column != 'Unnamed: 0', low_memory=False)

# Cleaning

In [3]:
# Drop columns that are not needed
columns_to_drop = ['web_url', 
                   'snippet', 
                   'lead_paragraph', 
                   'print_section', 
                   'print_page', 
                   'source', 
                   'multimedia', 
                   'news_desk',
                   'byline',
                   '_id',
                   'uri',
                   'subsection_name',
                   'word_count',
                   'keywords']
df.drop(columns=columns_to_drop, inplace=True)

# Drop rows with missing abstracts or pub_date
drop_rows = df[df['abstract'].isnull() | df['pub_date'].isnull()].index
df.drop(drop_rows, inplace=True)

# Change the date column to datetime
df['pub_date'] = pd.to_datetime(df['pub_date'])

# Change the abstract column to string
df['abstract'] = df['abstract'].astype(str)

In [4]:
# Function to extract the 'main' value from JSON-like strings
def extract_main(headline_str):
    try:
        # Safely evaluate the string to convert it to a dictionary
        json_dict = ast.literal_eval(headline_str)
        # Access and return the 'main' key
        return json_dict.get('main', None)
    except (ValueError, SyntaxError):
        return None

# Apply the function to the 'headline' column
df['headline'] = df['headline'].apply(extract_main)

In [5]:
df.head()

Unnamed: 0,abstract,headline,pub_date,document_type,section_name,type_of_material
0,"Economic hardship, climate change, political i...","Title 42 Is Gone, but Not the Conditions Drivi...",2023-05-15 01:24:42+00:00,article,U.S.,News
1,It’s election night in America. Stay away from...,"‘Succession’ Season 4, Episode 8 Recap: The Wi...",2023-05-15 02:01:05+00:00,article,Arts,News
2,"Tom is stressed in dress shoes, Shiv hides ben...","‘Succession’ Style, Episode 8: Some People Jus...",2023-05-15 02:15:04+00:00,article,Style,News
3,"No corrections appeared in print on Monday, Ma...","No Corrections: May 15, 2023",2023-05-15 03:55:48+00:00,article,Corrections,News
4,"Quotation of the Day for Monday, May 15, 2023.",Quotation of the Day: When Your Champions Leag...,2023-05-15 03:55:57+00:00,article,Corrections,News


In [6]:
# Extract the abstracts from the dataframe and convert them to a list for processing
abstracts = df['abstract']
docs = abstracts.to_list()

In [7]:
# Tokenize the documents.

from nltk.tokenize import RegexpTokenizer

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

In [8]:
# Lemmatize the documents.
# We use the WordNet lemmatizer from NLTK. 
# A lemmatizer is preferred over a stemmer in this case because it produces more readable words. 
# Output that is easy to read is very desirable in topic modelling.
import nltk
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer

# Lemmatize all words in documents.
lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/craigfranze/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [9]:
# Compute bigrams.

from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

In [10]:
# Remove rare and common tokens.

from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [11]:
# Remove stopwords
import nltk
from nltk.corpus import stopwords

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Define the stopwords
stop_words = set(stopwords.words('english'))

# Remove stopwords from each document
filtered_docs = [[word for word in doc if word not in stop_words] for doc in docs]

# Concatenate each entry into a single string after removing stopwords
cleaned_docs = [' '.join(doc) for doc in filtered_docs]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/craigfranze/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# BERTopic

In [12]:
from bertopic import BERTopic

In [13]:
# model = BERTopic(nr_topics=5) # Default number of topics is 30
model = BERTopic(verbose=True)

In [14]:
model.fit(cleaned_docs)

2024-05-30 00:48:20,934 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/1332 [00:00<?, ?it/s]

2024-05-30 00:54:28,033 - BERTopic - Embedding - Completed ✓
2024-05-30 00:54:28,033 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-30 00:55:16,253 - BERTopic - Dimensionality - Completed ✓
2024-05-30 00:55:16,255 - BERTopic - Cluster - Start clustering the reduced embeddings
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling paralle

<bertopic._bertopic.BERTopic at 0x7ffbd2328e50>

In [15]:
topics, probabilities = model.transform(cleaned_docs)

Batches:   0%|          | 0/1332 [00:00<?, ?it/s]

2024-05-30 01:02:28,823 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-05-30 01:02:28,946 - BERTopic - Dimensionality - Completed ✓
2024-05-30 01:02:28,947 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-05-30 01:02:31,582 - BERTopic - Cluster - Completed ✓


In [16]:
# Set pandas to view the full list of topics
pd.set_option('display.max_rows', None)

In [17]:
model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,20066,-1_wa_ha_said_president,"[wa, ha, said, president, new, year, people, o...",[former major league baseball star running sen...
1,0,1037,0_recipe_restaurant_chicken_dish,"[recipe, restaurant, chicken, dish, chef, food...",[shrimp pasta dumpling salad warm evening pota...
2,1,678,1_museum_art_artist_painting,"[museum, art, artist, painting, exhibition, ga...",[brooklyn museum exhibit art work collected mu...
3,2,567,2_china_chinese_beijing_xi,"[china, chinese, beijing, xi, taiwan, xijinpin...",[meeting henry kissinger china leader xi jinpi...
4,3,516,3_novel_book_newnovel_newbook,"[novel, book, newnovel, newbook, author, debut...",[irish author discus long island sequel novel ...
5,4,466,4_film_movie_director_oscar,"[film, movie, director, oscar, horror, academy...","[director wes ball narrates sequence film, dir..."
6,5,381,5_fashion_designer_clothes_wear,"[fashion, designer, clothes, wear, dress, bran...",[italian fashion designer died milan week nine...
7,6,369,6_puzzlewe_feelingstuck_ontoday_stuck,"[puzzlewe, feelingstuck, ontoday, stuck, canhe...",[feeling stuck today puzzle help feeling_stuck...
8,7,367,7_puzzlehelp_caseyou_needsome_need,"[puzzlehelp, caseyou, needsome, need, puzzle, ...",[case need puzzle help case_you need_some puzz...
9,8,325,8_ukraine_ukrainian_russian_kyiv,"[ukraine, ukrainian, russian, kyiv, russia, co...","[sign counteroffensive ukraine, stake high kyi..."


In [18]:
model.visualize_topics()

In [19]:
model.visualize_barchart()

In [20]:
# Make a dataframe from the topic report
report = model.get_topic_info()
report = pd.DataFrame(report)

# Topic Labeling with Ollama

In [21]:
import langchain

In [22]:
from langchain_community.llms import Ollama

llm = Ollama(model="llama2", temperature=0.01)

In [23]:
# Pass a test prompt to Ollama
print(llm.invoke('Tell me a joke. Do not be conversational.'))


Why was the math book sad?

Because it had too many problems.


In [24]:
# Define the prompt
system_prompt = 'You are a helpful, respectful and honest assistant for labeling topics.'
main_prompt_1 =  'I have a topic that contains the following documents: '
main_prompt_2 =  'The topic is described by the following keywords: '
main_prompt_3 =  'Based on the information about the topic above, please create a short label (no more than 10 words) of this topic.'
main_prompt_4 =  'Note that you only return the label and nothing more (no explanation or suggestions).'
main_prompt_5 =  'If you cant provide a label, return No Labels Available.'

In [25]:
# Ask Ollama to label the topics
topics_ollama = []
for i in range(len(report)):
    prompt = (system_prompt + 
              main_prompt_1 + 
              str(report['Representative_Docs'].iloc[i]) + 
              main_prompt_2 + str(report['Representation'].iloc[i]) + 
              main_prompt_3 + 
              main_prompt_4 + 
              main_prompt_5)
    topics_ollama.append(llm.invoke(prompt))

In [26]:
topics_ollama

['Label: Political News',
 'Label: Recipe for Chicken Dish',
 'Label: Art Exhibition in Brooklyn',
 'Label: China-US Relations',
 'Label: Novels by Irish Author',
 'Director Wes Ball Narrates Sequence Film',
 'Label: Fashion Designer Dies',
 'Feeling Stuck Today Puzzle Help',
 'Label: Puzzle Help Needed',
 '\nLabel: Ukraine-Russia War',
 'Label: Shootings and Police Encounters',
 'Heavy rain and flooding in southern California.',
 'Label: Marriage Perspectives',
 'Label: TV Series',
 'Moon Exploration',
 'Label: Fossil Fuels and Climate Change',
 'Label: Presidential Primary Candidates',
 'Label: Nytimes articles from past year',
 'Iranian commander killed in Israeli strike',
 'Gaza War & Hamas Attack',
 'Label: Donald Trump Fraud Trial',
 'Three-bedroom home in Los Angeles',
 'Label: Solving Weekend Puzzles',
 'Label: Dance in New York',
 'Label: Federal Reserve Interest Rate',
 'Label: Broadway Musical Revival',
 '\nLabel: Artificial Intelligence',
 'Sure! Based on the information pr

In [27]:
topics_ollama = [topic.replace('Label: ', '').replace('\nLabel: ', '') for topic in topics_ollama]

In [32]:
topics_ollama = [topic.strip() for topic in topics_ollama]

In [37]:
topics_ollama = [topic.replace('Sure! Based on the information provided, here is a short label for the topic:', '').strip() for topic in topics_ollama]

In [39]:
topics_ollama = [topic.replace('"', '') for topic in topics_ollama]

In [40]:
topics_ollama[28]

'Billboard Chart Toppers'

In [41]:
# Add the labels from Ollama to the report
report['Name'] = topics_ollama

In [42]:
report.head(30)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,20066,Political News,"[wa, ha, said, president, new, year, people, o...",[former major league baseball star running sen...
1,0,1037,Recipe for Chicken Dish,"[recipe, restaurant, chicken, dish, chef, food...",[shrimp pasta dumpling salad warm evening pota...
2,1,678,Art Exhibition in Brooklyn,"[museum, art, artist, painting, exhibition, ga...",[brooklyn museum exhibit art work collected mu...
3,2,567,China-US Relations,"[china, chinese, beijing, xi, taiwan, xijinpin...",[meeting henry kissinger china leader xi jinpi...
4,3,516,Novels by Irish Author,"[novel, book, newnovel, newbook, author, debut...",[irish author discus long island sequel novel ...
5,4,466,Director Wes Ball Narrates Sequence Film,"[film, movie, director, oscar, horror, academy...","[director wes ball narrates sequence film, dir..."
6,5,381,Fashion Designer Dies,"[fashion, designer, clothes, wear, dress, bran...",[italian fashion designer died milan week nine...
7,6,369,Feeling Stuck Today Puzzle Help,"[puzzlewe, feelingstuck, ontoday, stuck, canhe...",[feeling stuck today puzzle help feeling_stuck...
8,7,367,Puzzle Help Needed,"[puzzlehelp, caseyou, needsome, need, puzzle, ...",[case need puzzle help case_you need_some puzz...
9,8,325,Ukraine-Russia War,"[ukraine, ukrainian, russian, kyiv, russia, co...","[sign counteroffensive ukraine, stake high kyi..."


In [43]:
# Find the articles in the dataset that are representative of the topic
print(report['Name'].iloc[201] + '\n' + '---')
for i in range(len(report['Representative_Docs'].iloc[201])):
    search_string = report['Representative_Docs'].iloc[201][i]
    index = -1
    for j, doc in enumerate(cleaned_docs):
        if search_string in doc:
            index = j
            break
    print(abstracts.iloc[index]+ '\n' + '---')

TikTok Ban
---
Plus: Trump flips on a TikTok ban.
---
The U.S. House passes a bill that could ban TikTok.
---
Americans want to keep TikTok and want more privacy, and TikTok should not be banned.
---


# Recommender 

In [44]:
# Make a dictionary of the topic labels
topic_dict = dict(zip(report['Topic'], report['Name']))
# Set the topic labels from Ollama in the model
model.set_topic_labels(topic_dict)

In [45]:
# Get the topic for each document
model.get_document_info(cleaned_docs).head(5)

# A groupby could be used to get the top n documents for each topic
# This dataframe is the basis for a recommendation system

Unnamed: 0,Document,Topic,Name,CustomName,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,economic hardship climate change political ins...,-1,-1_wa_ha_said_president,Political News,"[wa, ha, said, president, new, year, people, o...",[former major league baseball star running sen...,wa - ha - said - president - new - year - peop...,0.0,False
1,election night america stay away bodega sushi ...,140,140_suozzi_tom_pilip_mazi,Suozzi vs Pilip in NY Special House Election,"[suozzi, tom, pilip, mazi, democrat, special, ...",[special house election new york pit mazi pili...,suozzi - tom - pilip - mazi - democrat - speci...,0.572976,False
2,tom stressed dress shoe shiv hide beneath laye...,5,5_fashion_designer_clothes_wear,Fashion Designer Dies,"[fashion, designer, clothes, wear, dress, bran...",[italian fashion designer died milan week nine...,fashion - designer - clothes - wear - dress - ...,0.495954,False
3,correction appeared print monday may no_correc...,112,112_nocorrection_print_printon_correction,Correction Appeared in Print on Monday,"[nocorrection, print, printon, correction, app...",[correction appeared print monday may no_corre...,nocorrection - print - printon - correction - ...,0.10719,True
4,quotation day monday may quotation_of,155,155_quotationof_quotation_monday_day,Quotation Day Monday,"[quotationof, quotation, monday, day, january,...","[quotation day monday may quotation_of, quotat...",quotationof - quotation - monday - day - janua...,1.0,True


# Dynamic Topic Modeling

In [50]:
# Get the timestamp for each document
timestamps = df.pub_date.to_list()

In [52]:
# Get the topics over time
topics_over_time = model.topics_over_time(cleaned_docs, 
                                          timestamps, datetime_format="%Y-%m-%d %H:%M:%S%z", 
                                          nr_bins=50)

50it [00:25,  1.93it/s]


In [83]:
# Graph the topics of interest over time
# model.visualize_topics_over_time(topics_over_time, top_n_topics=10, custom_labels=True)
model.visualize_topics_over_time(topics_over_time, topics=[18, 35, 84], custom_labels=True)

In [58]:
for index, topic in enumerate(topics_ollama, start=1):
    print(f"{index}: {topic}")

1: Political News
2: Recipe for Chicken Dish
3: Art Exhibition in Brooklyn
4: China-US Relations
5: Novels by Irish Author
6: Director Wes Ball Narrates Sequence Film
7: Fashion Designer Dies
8: Feeling Stuck Today Puzzle Help
9: Puzzle Help Needed
10: Ukraine-Russia War
11: Shootings and Police Encounters
12: Heavy rain and flooding in southern California.
13: Marriage Perspectives
14: TV Series
15: Moon Exploration
16: Fossil Fuels and Climate Change
17: Presidential Primary Candidates
18: Nytimes articles from past year
19: Iranian commander killed in Israeli strike
20: Gaza War & Hamas Attack
21: Donald Trump Fraud Trial
22: Three-bedroom home in Los Angeles
23: Solving Weekend Puzzles
24: Dance in New York
25: Federal Reserve Interest Rate
26: Broadway Musical Revival
27: Artificial Intelligence
29: Billboard Chart Toppers
30: Mayor Eric Adam
31: Social Media Platform
32: Housing Rent Regulation
33: Biden Addresses American Public
34: Asylum Seekers at Southern Border
35: Electric

In [93]:
model.visualize_topics_over_time(topics_over_time, topics=[25, 74, 384, 175], custom_labels=True)