In [66]:
import numpy as np
import pandas as pd
import ast
import json
import os
import matplotlib.pyplot as plt
import plotly.express as px
from scipy.optimize import curve_fit
from sklearn.linear_model import LinearRegression
import seaborn as sns
import zipfile

In [67]:
# Read the CSV file
df = pd.read_csv('nyt-metadata.csv')

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


# Cleaning

In [68]:
# Drop columns that are not needed
columns_to_drop = ['web_url', 
                   'snippet', 
                   'lead_paragraph', 
                   'print_section', 
                   'print_page', 
                   'source', 
                   'multimedia', 
                   'news_desk',
                   'byline',
                   '_id',
                   'uri',
                   'subsection_name',
                   'word_count',
                   'keywords']
df.drop(columns=columns_to_drop, inplace=True)

# Drop rows with missing abstracts
drop_rows = df[df['abstract'].isnull()].index
df.drop(drop_rows, inplace=True)

# Change the date column to datetime
df['pub_date'] = pd.to_datetime(df['pub_date'])

# Change the abstract column to string
df['abstract'] = df['abstract'].astype(str)

In [69]:
# Function to extract the 'main' value from JSON-like strings
def extract_main(headline_str):
    try:
        # Safely evaluate the string to convert it to a dictionary
        json_dict = ast.literal_eval(headline_str)
        # Access and return the 'main' key
        return json_dict.get('main', None)
    except (ValueError, SyntaxError):
        return None

# Apply the function to the 'headline' column
df['headline'] = df['headline'].apply(extract_main)

In [70]:
df.head()

Unnamed: 0,abstract,headline,pub_date,document_type,section_name,type_of_material
0,Article on upcoming New York Giants-Dallas Cow...,"Playoffs or No, Dallas Provides The Motivation",2000-01-01 05:00:00+00:00,article,Sports,News
1,Jeanne C Pond letter expresses hope that spiri...,"On This First Day, a Fanfare for the New Era; ...",2000-01-01 05:00:00+00:00,article,Opinion,Letter
2,Many experts on Y2K computer problem report th...,Internet's Cheering Squad Nervously Watches Clock,2000-01-01 05:00:00+00:00,article,U.S.,News
3,WILL the forces of globalism continue to push ...,Economic Thinking Finds a Free Market,2000-01-01 05:00:00+00:00,article,Archives,News
4,SPECIAL TODAY The Millennium Envisioning th...,INSIDE,2000-01-01 05:00:00+00:00,article,New York,Summary


In [103]:
# Use the first 2000 abstracts for the analysis
abstracts = df['abstract'][0:2000]
docs = abstracts.to_list()

In [104]:
# Tokenize the documents.

from nltk.tokenize import RegexpTokenizer

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

In [108]:
# Lemmatize the documents.
import nltk
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer

# Lemmatize all words in documents.
lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/craigfranze/nltk_data...


In [111]:
# Compute bigrams.

from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

In [116]:
# Remove rare and common tokens.

from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [119]:
docs[3]

['will',
 'the',
 'force',
 'of',
 'globalism',
 'continue',
 'to',
 'push',
 'the',
 'world',
 'toward',
 'american',
 'style',
 'capitalism',
 'a',
 'the',
 '21st',
 'century',
 'begin',
 'advocate',
 'of',
 'the',
 'free',
 'market',
 'have',
 'no',
 'doubt',
 'that',
 'they',
 'have',
 'won',
 'the',
 'economic',
 'argument',
 'socialism',
 'is',
 'dead',
 'moreover',
 'a',
 'mean',
 'of',
 'creating',
 'wealth',
 'and',
 'material',
 'progress',
 'american',
 'capitalism',
 'seems',
 'to',
 'be',
 'clearly',
 'superior',
 'to',
 'the',
 'asian',
 'variety',
 'with',
 'it',
 'greater',
 'level',
 'of',
 'government',
 'planning',
 'or',
 'the',
 'european',
 'version',
 'with',
 'it',
 'emphasis',
 'on',
 'social',
 'welfare',
 'and',
 'protection',
 'of',
 'worker',
 'from',
 'losing',
 'their',
 'job',
 '21st_century']

# BERTopic

In [96]:
from bertopic import BERTopic

In [121]:
# Concatenate each entry into a single string
concatenated_docs = [' '.join(doc) for doc in docs]

In [122]:
# model = BERTopic(nr_topics=5) # Default number of topics is 30
model = BERTopic(verbose=True)
model.fit(concatenated_docs)
topics, probabilities = model.transform(concatenated_docs)

2024-05-21 21:08:55,567 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 63/63 [00:51<00:00,  1.24it/s]
2024-05-21 21:09:47,970 - BERTopic - Embedding - Completed ✓
2024-05-21 21:09:47,971 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-05-21 21:09:55,042 - BERTopic - Dimensionality - Completed ✓
2024-05-21 21:09:55,043 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-05-21 21:09:55,100 - BERTopic - Cluster - Completed ✓
2024-05-21 21:09:55,104 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-05-21 21:09:55,274 - BERTopic - Representation - Completed ✓
Batches: 100%|██████████| 63/63 [00:50<00:00,  1.24it/s]
2024-05-21 21:10:46,094 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2024-05-21 21:10:46,100 - BERTopic - Dimensionality - Completed ✓
2024-05-21 21:10:46,101 - BERTopic - Clustering - Approximating new points with

In [123]:
model.get_topic_freq().head(10)

Unnamed: 0,Topic,Count
3,0,409
0,-1,401
1,1,291
16,2,117
4,3,107
14,4,76
11,5,74
5,6,67
2,7,63
20,8,42


In [124]:
model.visualize_topics()

In [125]:
model.visualize_barchart()

In [126]:
model.visualize_heatmap()

In [127]:
model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,401,-1_the_to_of_in,"[the, to, of, in, and, that, for, on, is, with]",[in the wake of his first losing season a the ...
1,0,409,0_of_and_at_his,"[of, and, at, his, january, her, the, in, belo...",[fuller robert garfield of west palm beach fl ...
2,1,291,1_the_it_to_in,"[the, it, to, in, company, that, percent, of, ...",[economist have rather zen like view of stock ...
3,2,117,2_the_to_mr_for,"[the, to, mr, for, that, campaign, candidate, ...",[in lively almost raucous debate in new hampsh...
4,3,107,3_the_year_new_of,"[the, year, new, of, in, and, time, to, yearev...",[new york awoke on saturday morning to discove...
5,4,76,4_editor_the_to_frontpage,"[editor, the, to, frontpage, front, page, edit...","[to the editor, to the editor, to the editor]"
6,5,74,5_the_in_police_to,"[the, in, police, to, of, and, said, wa, court...",[europe ireland killer return blocked the high...
7,6,67,6_the_and_theater_at,"[the, and, theater, at, dance, new, in, film, ...",[doug varone ha long and impressive record a m...
8,7,63,7_century_drawing_21stcentury_21st,"[century, drawing, 21stcentury, 21st, say, the...",[expert discus trend in the kind of home ameri...
9,8,42,8_museum_art_gallery_and,"[museum, art, gallery, and, in, of, artist, ph...",[almost everyone who writes about cildo meirel...
