# Data

In [6]:
import numpy as np
import pandas as pd
import ast
import json
import os
#os.environ['KMP_WARNINGS'] = '0'
#os.environ['OMP_NUM_THREADS'] = '1'

In [7]:
# Read the CSV file
df = pd.read_csv('nyt-metadata.csv', low_memory=False)

# Cleaning

In [8]:
# Drop columns that are not needed
columns_to_drop = ['web_url', 
                   'snippet', 
                   'lead_paragraph', 
                   'print_section', 
                   'print_page', 
                   'source', 
                   'multimedia', 
                   'news_desk',
                   'byline',
                   '_id',
                   'uri',
                   'subsection_name',
                   'word_count',
                   'keywords']
df.drop(columns=columns_to_drop, inplace=True)

# Drop rows with missing abstracts
drop_rows = df[df['abstract'].isnull()].index
df.drop(drop_rows, inplace=True)

# Change the date column to datetime
df['pub_date'] = pd.to_datetime(df['pub_date'])

# Change the abstract column to string
df['abstract'] = df['abstract'].astype(str)

In [9]:
# Function to extract the 'main' value from JSON-like strings
def extract_main(headline_str):
    try:
        # Safely evaluate the string to convert it to a dictionary
        json_dict = ast.literal_eval(headline_str)
        # Access and return the 'main' key
        return json_dict.get('main', None)
    except (ValueError, SyntaxError):
        return None

# Apply the function to the 'headline' column
df['headline'] = df['headline'].apply(extract_main)

In [10]:
df.head()

Unnamed: 0,abstract,headline,pub_date,document_type,section_name,type_of_material
0,Article on upcoming New York Giants-Dallas Cow...,"Playoffs or No, Dallas Provides The Motivation",2000-01-01 05:00:00+00:00,article,Sports,News
1,Jeanne C Pond letter expresses hope that spiri...,"On This First Day, a Fanfare for the New Era; ...",2000-01-01 05:00:00+00:00,article,Opinion,Letter
2,Many experts on Y2K computer problem report th...,Internet's Cheering Squad Nervously Watches Clock,2000-01-01 05:00:00+00:00,article,U.S.,News
3,WILL the forces of globalism continue to push ...,Economic Thinking Finds a Free Market,2000-01-01 05:00:00+00:00,article,Archives,News
4,SPECIAL TODAY The Millennium Envisioning th...,INSIDE,2000-01-01 05:00:00+00:00,article,New York,Summary


In [11]:
# Use the first 2000 abstracts for the analysis
abstracts = df['abstract'][0:200]
docs = abstracts.to_list()

In [12]:
# Tokenize the documents.

from nltk.tokenize import RegexpTokenizer

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

In [13]:
# Lemmatize the documents.
import nltk
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer

# Lemmatize all words in documents.
lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/craigfranze/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [14]:
# Compute bigrams.

from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

In [15]:
# Remove rare and common tokens.

from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [16]:
docs[3]

['will',
 'the',
 'force',
 'of',
 'globalism',
 'continue',
 'to',
 'push',
 'the',
 'world',
 'toward',
 'american',
 'style',
 'capitalism',
 'a',
 'the',
 '21st',
 'century',
 'begin',
 'advocate',
 'of',
 'the',
 'free',
 'market',
 'have',
 'no',
 'doubt',
 'that',
 'they',
 'have',
 'won',
 'the',
 'economic',
 'argument',
 'socialism',
 'is',
 'dead',
 'moreover',
 'a',
 'mean',
 'of',
 'creating',
 'wealth',
 'and',
 'material',
 'progress',
 'american',
 'capitalism',
 'seems',
 'to',
 'be',
 'clearly',
 'superior',
 'to',
 'the',
 'asian',
 'variety',
 'with',
 'it',
 'greater',
 'level',
 'of',
 'government',
 'planning',
 'or',
 'the',
 'european',
 'version',
 'with',
 'it',
 'emphasis',
 'on',
 'social',
 'welfare',
 'and',
 'protection',
 'of',
 'worker',
 'from',
 'losing',
 'their',
 'job']

In [17]:
# Vectorize data.

# Bag-of-words representation of the documents.
#corpus = [dictionary.doc2bow(doc) for doc in docs]

In [18]:
import nltk
from nltk.corpus import stopwords

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Define the stopwords
stop_words = set(stopwords.words('english'))

# Remove stopwords from each document
filtered_docs = [[word for word in doc if word not in stop_words] for doc in docs]

# Concatenate each entry into a single string after removing stopwords
cleaned_docs = [' '.join(doc) for doc in filtered_docs]

# Example of how to print the filtered and concatenated result for the first document
print(cleaned_docs[3])

force globalism continue push world toward american style capitalism 21st century begin advocate free market doubt economic argument socialism dead moreover mean creating wealth material progress american capitalism seems clearly superior asian variety greater level government planning european version emphasis social welfare protection worker losing job


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/craigfranze/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [19]:
cleaned_docs[1]

'jeanne pond letter express hope spiritual development artistic knowledge skill self esteem flourish new century drawing'

# BERTopic

In [20]:
from bertopic import BERTopic

In [21]:
# Concatenate each entry into a single string
#concatenated_docs = [' '.join(doc) for doc in docs]

In [22]:
# model = BERTopic(nr_topics=5) # Default number of topics is 30
model = BERTopic(verbose=True)

In [23]:
model.fit(cleaned_docs)

2024-05-26 12:30:25,993 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/7 [00:00<?, ?it/s]

2024-05-26 12:30:33,355 - BERTopic - Embedding - Completed ✓
2024-05-26 12:30:33,357 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-26 12:30:40,047 - BERTopic - Dimensionality - Completed ✓
2024-05-26 12:30:40,048 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-26 12:30:40,060 - BERTopic - Cluster - Completed ✓
2024-05-26 12:30:40,068 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-26 12:30:40,100 - BERTopic - Representation - Completed ✓


<bertopic._bertopic.BERTopic at 0x7f8342a447c0>

In [24]:
topics, probabilities = model.transform(cleaned_docs)

Batches:   0%|          | 0/7 [00:00<?, ?it/s]

2024-05-26 12:30:48,061 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-05-26 12:30:48,066 - BERTopic - Dimensionality - Completed ✓
2024-05-26 12:30:48,067 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2024-05-26 12:30:48,076 - BERTopic - Cluster - Completed ✓


In [25]:
model.get_topic_freq().head(10)

Unnamed: 0,Topic,Count
2,-1,57
1,0,46
4,1,43
3,2,24
0,3,15
5,4,15


In [26]:
model.visualize_topics()

In [27]:
model.visualize_barchart()

In [52]:
model.visualize_heatmap()

In [29]:
model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,57,-1_photo_say_year_american,"[photo, say, year, american, new, computer, ti...",[people held hostage hijacked indian airline p...
1,0,46,0_century_drawing_letter_21st,"[century, drawing, letter, 21st, say, photo, a...",[steven lee myers article transformation warfa...
2,1,43,1_year_new_wa_history,"[year, new, wa, history, old, time, since, mil...",[lily yen year old retired teacher homemaker s...
3,2,24,2_grandchild_service_december_wife,"[grandchild, service, december, wife, david, m...",[rowley louis jr age shelton ct formerly port ...
4,3,15,3_article_photo_upcoming_bowl,"[article, photo, upcoming, bowl, note, footbal...",[article upcoming new york giant dallas cowboy...
5,4,15,4_president_yeltsin_said_putin,"[president, yeltsin, said, putin, east, russia...",[boris yeltsin unexpectedly resigns president ...


In [30]:
report = model.get_topic_info()

In [34]:
report = pd.DataFrame(report)

# Topic Labeling with Ollama

In [33]:
import langchain

In [35]:
from langchain_community.llms import Ollama

llm = Ollama(model="llama2", temperature=0.01)

In [36]:
print(llm.invoke('Tell me a joke. Do not be conversational.'))


Why was the math book sad?

Because it had too many problems.


In [37]:
system_prompt = 'You are a helpful, respectful and honest assistant for labeling topics.'
main_prompt_1 =  'I have a topic that contains the following documents: '
main_prompt_2 =  'The topic is described by the following keywords: '
main_prompt_3 =  'Based on the information about the topic above, please create a short label (no more than 10 words) of this topic.'
main_prompt_4 =  'Note that you only return the label and nothing more (no explanation or suggestions).'
main_prompt_5 =  'If you cant provide a label, return No Labels Available.'

In [45]:
# Concatenate the various parts into a single string, broken down for readability
prompt = (
    system_prompt + 
    main_prompt_1 + 
    str(report['Representative_Docs'].iloc[0]) + 
    main_prompt_2 + 
    str(report['Representation'].iloc[0]) + 
    main_prompt_3 + 
    main_prompt_4 + 
    main_prompt_5
)

In [47]:
topics = []
for i in range(len(report)):
    prompt = (system_prompt + 
              main_prompt_1 + 
              str(report['Representative_Docs'].iloc[i]) + 
              main_prompt_2 + str(report['Representation'].iloc[i]) + 
              main_prompt_3 + 
              main_prompt_4 + 
              main_prompt_5)
    topics.append(llm.invoke(prompt))

In [48]:
topics

['Indian Airline Hijacking',
 'Label: 21st Century Drawing Trends',
 'Retired Teacher',
 'Beloved Grandmother Memorial',
 'Upcoming Football Game',
 'President Yeltsin Resigns']

In [50]:
report['Name'] = topics

In [51]:
report

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,57,Indian Airline Hijacking,"[photo, say, year, american, new, computer, ti...",[people held hostage hijacked indian airline p...
1,0,46,Label: 21st Century Drawing Trends,"[century, drawing, letter, 21st, say, photo, a...",[steven lee myers article transformation warfa...
2,1,43,Retired Teacher,"[year, new, wa, history, old, time, since, mil...",[lily yen year old retired teacher homemaker s...
3,2,24,Beloved Grandmother Memorial,"[grandchild, service, december, wife, david, m...",[rowley louis jr age shelton ct formerly port ...
4,3,15,Upcoming Football Game,"[article, photo, upcoming, bowl, note, footbal...",[article upcoming new york giant dallas cowboy...
5,4,15,President Yeltsin Resigns,"[president, yeltsin, said, putin, east, russia...",[boris yeltsin unexpectedly resigns president ...


In [84]:
# Find the articles in the dataset that are representative of the topic
print(report['Name'].iloc[5])
for i in range(len(report['Representative_Docs'].iloc[5])):
    search_string = report['Representative_Docs'].iloc[5][i]
    index = -1
    for j, doc in enumerate(cleaned_docs):
        if search_string in doc:
            index = j
            break
    print(df.iloc[index])

President Yeltsin Resigns
abstract            Boris N Yeltsin unexpectedly resigns as presid...
headline            Yeltsin Resigns, Naming Putin as Acting Presid...
pub_date                                    2000-01-01 05:00:00+00:00
document_type                                                 article
section_name                                                    World
type_of_material                                                 News
Name: 194, dtype: object
abstract             INTERNATIONAL   A11, 13-19    Yeltsin Resigns...
headline                                                 NEWS SUMMARY
pub_date                                    2000-01-01 05:00:00+00:00
document_type                                                 article
section_name                                                 New York
type_of_material                                              Summary
Name: 105, dtype: object
abstract             EUROPE    FRANCE: MORE EXPLOSIVES LINKED TO B...
headline      