# Data

In [99]:
import numpy as np
import pandas as pd
import ast
import json
import os
#os.environ['KMP_WARNINGS'] = '0'
#os.environ['OMP_NUM_THREADS'] = '1'

In [100]:
# Read the CSV file containing the metadata for articles from January 1, 2000 to May 15, 2023
#df = pd.read_csv('nyt-metadata.csv', low_memory=False)
# Read the CSV file containing the metadata for articles from May 15, 2023 to May 14, 2024
df = pd.read_csv('nyt-metadata-2.csv', usecols=lambda column: column != 'Unnamed: 0', low_memory=False)

In [101]:
df.head()

Unnamed: 0,abstract,web_url,snippet,lead_paragraph,print_section,print_page,source,multimedia,headline,keywords,pub_date,document_type,news_desk,section_name,subsection_name,byline,type_of_material,_id,word_count,uri
0,"Economic hardship, climate change, political i...",https://www.nytimes.com/2023/05/14/us/migrants...,"Economic hardship, climate change, political i...",Relative quiet has prevailed along the souther...,A,14.0,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","{'main': 'Title 42 Is Gone, but Not the Condit...","[{'name': 'subject', 'value': 'Illegal Immigra...",2023-05-15 01:24:42+00:00,article,National,U.S.,,"{'original': 'By Miriam Jordan', 'person': [{'...",News,nyt://article/3d95da14-0c64-59c6-bae2-02b151ad...,1217,nyt://article/3d95da14-0c64-59c6-bae2-02b151ad...
1,It’s election night in America. Stay away from...,https://www.nytimes.com/2023/05/14/arts/televi...,It’s election night in America. Stay away from...,"The day before Logan Roy died, he delivered a ...",,,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","{'main': '‘Succession’ Season 4, Episode 8 Rec...","[{'name': 'creative_works', 'value': 'Successi...",2023-05-15 02:01:05+00:00,article,Culture,Arts,Television,"{'original': 'By Noel Murray', 'person': [{'fi...",News,nyt://article/17f6f628-2939-541b-a0e8-5c503fa6...,1495,nyt://article/17f6f628-2939-541b-a0e8-5c503fa6...
2,"Tom is stressed in dress shoes, Shiv hides ben...",https://www.nytimes.com/2023/05/14/style/succe...,"Tom is stressed in dress shoes, Shiv hides ben...",This article contains spoilers for Episode 8 o...,,,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...","{'main': '‘Succession’ Style, Episode 8: Some ...",[],2023-05-15 02:15:04+00:00,article,Styles,Style,,"{'original': 'By The Styles Desk', 'person': [...",News,nyt://article/70773662-4815-5d40-8460-b438aa44...,665,nyt://article/70773662-4815-5d40-8460-b438aa44...
3,"No corrections appeared in print on Monday, Ma...",https://www.nytimes.com/2023/05/14/pageoneplus...,"No corrections appeared in print on Monday, Ma...",Errors are corrected during the press run when...,,,The New York Times,[],"{'main': 'No Corrections: May 15, 2023', 'kick...",[],2023-05-15 03:55:48+00:00,article,Corrections,Corrections,,"{'original': '', 'person': [], 'organization':...",News,nyt://article/199d026e-1372-51a3-adf0-c82abeeb...,52,nyt://article/199d026e-1372-51a3-adf0-c82abeeb...
4,"Quotation of the Day for Monday, May 15, 2023.",https://www.nytimes.com/2023/05/14/pageoneplus...,"Quotation of the Day for Monday, May 15, 2023.","“For me, it was time to give back the love the...",A,2.0,The New York Times,[],{'main': 'Quotation of the Day: When Your Cham...,[],2023-05-15 03:55:57+00:00,article,Summary,Corrections,,"{'original': '', 'person': [], 'organization':...",News,nyt://article/5f4b7ea7-88f4-5178-884f-ae28530b...,42,nyt://article/5f4b7ea7-88f4-5178-884f-ae28530b...


# Cleaning

In [102]:
# Drop columns that are not needed
columns_to_drop = ['web_url', 
                   'snippet', 
                   'lead_paragraph', 
                   'print_section', 
                   'print_page', 
                   'source', 
                   'multimedia', 
                   'news_desk',
                   'byline',
                   '_id',
                   'uri',
                   'subsection_name',
                   'word_count',
                   'keywords']
df.drop(columns=columns_to_drop, inplace=True)

# Drop rows with missing abstracts
drop_rows = df[df['abstract'].isnull()].index
df.drop(drop_rows, inplace=True)

# Change the date column to datetime
df['pub_date'] = pd.to_datetime(df['pub_date'])

# Change the abstract column to string
df['abstract'] = df['abstract'].astype(str)

In [103]:
# Function to extract the 'main' value from JSON-like strings
def extract_main(headline_str):
    try:
        # Safely evaluate the string to convert it to a dictionary
        json_dict = ast.literal_eval(headline_str)
        # Access and return the 'main' key
        return json_dict.get('main', None)
    except (ValueError, SyntaxError):
        return None

# Apply the function to the 'headline' column
df['headline'] = df['headline'].apply(extract_main)

In [104]:
df.head()

Unnamed: 0,abstract,headline,pub_date,document_type,section_name,type_of_material
0,"Economic hardship, climate change, political i...","Title 42 Is Gone, but Not the Conditions Drivi...",2023-05-15 01:24:42+00:00,article,U.S.,News
1,It’s election night in America. Stay away from...,"‘Succession’ Season 4, Episode 8 Recap: The Wi...",2023-05-15 02:01:05+00:00,article,Arts,News
2,"Tom is stressed in dress shoes, Shiv hides ben...","‘Succession’ Style, Episode 8: Some People Jus...",2023-05-15 02:15:04+00:00,article,Style,News
3,"No corrections appeared in print on Monday, Ma...","No Corrections: May 15, 2023",2023-05-15 03:55:48+00:00,article,Corrections,News
4,"Quotation of the Day for Monday, May 15, 2023.",Quotation of the Day: When Your Champions Leag...,2023-05-15 03:55:57+00:00,article,Corrections,News


In [111]:
# Use the first 2000 abstracts for the analysis
abstracts = df['abstract'][-6000:]
docs = abstracts.to_list()

In [112]:
# Tokenize the documents.

from nltk.tokenize import RegexpTokenizer

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

In [113]:
# Lemmatize the documents.
import nltk
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer

# Lemmatize all words in documents.
lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/craigfranze/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [114]:
# Compute bigrams.

from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

In [115]:
# Remove rare and common tokens.

from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [116]:
docs[3]

['ha', 'liberalism', 'found', 'coherent', 'sexual', 'ethic']

In [117]:
# Vectorize data.

# Bag-of-words representation of the documents.
#corpus = [dictionary.doc2bow(doc) for doc in docs]

In [118]:
import nltk
from nltk.corpus import stopwords

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Define the stopwords
stop_words = set(stopwords.words('english'))

# Remove stopwords from each document
filtered_docs = [[word for word in doc if word not in stop_words] for doc in docs]

# Concatenate each entry into a single string after removing stopwords
cleaned_docs = [' '.join(doc) for doc in filtered_docs]

# Example of how to print the filtered and concatenated result for the first document
print(cleaned_docs[3])

ha liberalism found coherent sexual ethic


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/craigfranze/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [119]:
cleaned_docs[1]

'patrick carfizzi vibrant performer supporting role ha grabbed attention new production verdi la forza del destino'

# BERTopic

In [120]:
from bertopic import BERTopic

In [121]:
# Concatenate each entry into a single string
#concatenated_docs = [' '.join(doc) for doc in docs]

In [122]:
# model = BERTopic(nr_topics=5) # Default number of topics is 30
model = BERTopic(verbose=True)

In [123]:
model.fit(cleaned_docs)

2024-05-27 10:46:00,586 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/188 [00:00<?, ?it/s]

2024-05-27 10:46:47,090 - BERTopic - Embedding - Completed ✓
2024-05-27 10:46:47,091 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-27 10:47:08,457 - BERTopic - Dimensionality - Completed ✓
2024-05-27 10:47:08,458 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-27 10:47:08,656 - BERTopic - Cluster - Completed ✓
2024-05-27 10:47:08,661 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-27 10:47:08,826 - BERTopic - Representation - Completed ✓


<bertopic._bertopic.BERTopic at 0x7f7f821720a0>

In [124]:
topics, probabilities = model.transform(cleaned_docs)

Batches:   0%|          | 0/188 [00:00<?, ?it/s]

2024-05-27 10:48:00,035 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-05-27 10:48:00,053 - BERTopic - Dimensionality - Completed ✓
2024-05-27 10:48:00,053 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-05-27 10:48:00,265 - BERTopic - Cluster - Completed ✓


In [125]:
model.get_topic_freq().head(10)

Unnamed: 0,Topic,Count
2,-1,2030
23,0,345
1,1,323
11,2,252
3,3,239
0,4,149
25,5,136
7,6,120
21,7,95
19,8,95


In [126]:
model.visualize_topics()

In [127]:
model.visualize_barchart()

In [128]:
model.visualize_heatmap()

In [129]:
model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2030,-1_wa_new_said_ha,"[wa, new, said, ha, year, one, time, also, sta...",[housing advocate even homeowner wealthier nei...
1,0,345,0_trump_donald_donaldtrump_trial,"[trump, donald, donaldtrump, trial, criminal, ...",[jury selection begin monday donald trump hush...
2,1,323,1_president_election_voter_party,"[president, election, voter, party, biden, tru...",[former president donald trump wa due hold cam...
3,2,252,2_art_museum_artist_fashion,"[art, museum, artist, fashion, work, designer,...",[special section new york time museum highligh...
4,3,239,3_film_movie_show_series,"[film, movie, show, series, netflix, star, com...",[look back george romero film one influential ...
...,...,...,...,...,...
79,78,11,78_orchestra_conductor_philharmonic_director,"[orchestra, conductor, philharmonic, director,...",[thursday richly talented year old maestro led...
80,79,11,79_economy_optimistic_bad_demographer,"[economy, optimistic, bad, demographer, tempti...",[economist explains lens make much optimistic ...
81,80,10,80_fossil_gas_drilling_fuel,"[fossil, gas, drilling, fuel, mining, say, nat...",[shell others say plan drill oil gas gulf mexi...
82,81,10,81_debut_crossword_york_newyork,"[debut, crossword, york, newyork, make, time, ...",[jake bunch make new york time crossword debut...


In [130]:
report = model.get_topic_info()

In [131]:
report = pd.DataFrame(report)

# Topic Labeling with Ollama

In [132]:
import langchain

In [133]:
from langchain_community.llms import Ollama

llm = Ollama(model="llama2", temperature=0.01)

In [134]:
print(llm.invoke('Tell me a joke. Do not be conversational.'))


Why was the math book sad?

Because it had too many problems.


In [135]:
system_prompt = 'You are a helpful, respectful and honest assistant for labeling topics.'
main_prompt_1 =  'I have a topic that contains the following documents: '
main_prompt_2 =  'The topic is described by the following keywords: '
main_prompt_3 =  'Based on the information about the topic above, please create a short label (no more than 10 words) of this topic.'
main_prompt_4 =  'Note that you only return the label and nothing more (no explanation or suggestions).'
main_prompt_5 =  'If you cant provide a label, return No Labels Available.'

In [136]:
# Concatenate the various parts into a single string, broken down for readability
prompt = (
    system_prompt + 
    main_prompt_1 + 
    str(report['Representative_Docs'].iloc[0]) + 
    main_prompt_2 + 
    str(report['Representation'].iloc[0]) + 
    main_prompt_3 + 
    main_prompt_4 + 
    main_prompt_5
)

In [137]:
topics = []
for i in range(len(report)):
    prompt = (system_prompt + 
              main_prompt_1 + 
              str(report['Representative_Docs'].iloc[i]) + 
              main_prompt_2 + str(report['Representation'].iloc[i]) + 
              main_prompt_3 + 
              main_prompt_4 + 
              main_prompt_5)
    topics.append(llm.invoke(prompt))

In [138]:
topics

['Label: New York Property Tax System',
 'Label: Trump Criminal Trial',
 'Label: Presidential Election',
 'Label: Art Exhibitions in New York and Denver',
 'Label: Horror Movie Revival',
 'Sure! Based on the information provided, here is a short label for the topic: "Pop Superstar\'s Latest Album Reigns at Billboard"',
 'Modern Love Miniature',
 'Label: China-US Trade Tensions',
 'In Gaza, Israel launches invasion; US withholds aid.',
 'Ukraine-Russia Conflict',
 'Palestinian Protests on Campus',
 'Easy Pasta & Chicken Dinner',
 'Label: Mental Health Interventions',
 'Israel strikes Iran',
 'Sure! Based on the information provided, here is a short label for the topic:\n\nPremier League Sports',
 'Abortion Policy and Rights in Florida and Arizona',
 'Certainly! Based on the information provided, here is a short label for the topic:\n\n"Columbia University protest"',
 'Label: Company Investment Deal',
 'Label: Inflation Rate Decision',
 'Label: Harvey Weinstein Sex Crime Conviction Overt

In [139]:
report['Name'] = topics

In [142]:
pd.set_option('display.max_rows', None)

In [143]:
report.head(84)

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,2030,Label: New York Property Tax System,"[wa, new, said, ha, year, one, time, also, sta...",[housing advocate even homeowner wealthier nei...
1,0,345,Label: Trump Criminal Trial,"[trump, donald, donaldtrump, trial, criminal, ...",[jury selection begin monday donald trump hush...
2,1,323,Label: Presidential Election,"[president, election, voter, party, biden, tru...",[former president donald trump wa due hold cam...
3,2,252,Label: Art Exhibitions in New York and Denver,"[art, museum, artist, fashion, work, designer,...",[special section new york time museum highligh...
4,3,239,Label: Horror Movie Revival,"[film, movie, show, series, netflix, star, com...",[look back george romero film one influential ...
5,4,149,"Sure! Based on the information provided, here ...","[album, track, music, singer, song, musician, ...",[beatles lp pop superstar reign atop billboard...
6,5,136,Modern Love Miniature,"[novel, book, author, story, love, life, tale,...",[modern love miniature featuring reader submit...
7,6,120,Label: China-US Trade Tensions,"[china, chinese, electric, beijing, vehicle, i...",[president biden aimed keep relation stable ca...
8,7,95,"In Gaza, Israel launches invasion; US withhold...","[gaza, israel, ingaza, aid, israeli, worker, h...",[president warned united state would withhold ...
9,8,95,Ukraine-Russia Conflict,"[ukraine, russia, russian, ukrainian, weapon, ...",[ukrainian force first time used longer range ...


In [167]:
# Find the articles in the dataset that are representative of the topic
print(report['Name'].iloc[22] + '\n' + '---')
for i in range(len(report['Representative_Docs'].iloc[22])):
    search_string = report['Representative_Docs'].iloc[22][i]
    index = -1
    for j, doc in enumerate(cleaned_docs):
        if search_string in doc:
            index = j
            break
    print(abstracts.iloc[index]+ '\n' + '---')

Total Solar Eclipse
---
Plus, is it eclipse weather?
---
As millions of Americans prepare to see a total solar eclipse, a retired astrophysicist known as “Mr. Eclipse,” discusses the celestial phenomenon.
---
We cover the solar eclipse that captivated the U.S.
---


In [179]:
# Get the topic for each document
model.get_document_info(cleaned_docs).head(5)
# A groupby could be used to get the top n documents for each topic

Unnamed: 0,Document,Topic,Name,Representation,Representative_Docs,Top_n_words,Probability,Representative_document
0,singer lizzo ha new swim line ha moved body po...,4,4_album_track_music_singer,"[album, track, music, singer, song, musician, ...",[beatles lp pop superstar reign atop billboard...,album - track - music - singer - song - musici...,0.969447,False
1,patrick carfizzi vibrant performer supporting ...,4,4_album_track_music_singer,"[album, track, music, singer, song, musician, ...",[beatles lp pop superstar reign atop billboard...,album - track - music - singer - song - musici...,0.769767,False
2,vote history data support polling showing bide...,1,1_president_election_voter_party,"[president, election, voter, party, biden, tru...",[former president donald trump wa due hold cam...,president - election - voter - party - biden -...,1.0,False
3,ha liberalism found coherent sexual ethic,-1,-1_wa_new_said_ha,"[wa, new, said, ha, year, one, time, also, sta...",[housing advocate even homeowner wealthier nei...,wa - new - said - ha - year - one - time - als...,0.0,False
4,need start asking better question kind work te...,-1,-1_wa_new_said_ha,"[wa, new, said, ha, year, one, time, also, sta...",[housing advocate even homeowner wealthier nei...,wa - new - said - ha - year - one - time - als...,0.0,False
