# Data

The following code has been modified from Craig and Ping's work.
References: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#3importpackages, https://markroxor.github.io/gensim/static/notebooks/lda_training_tips.html, and https://radimrehurek.com/gensim/models/ldamodel.html.

New code is from the first link so we can modify it later on. 

In [1]:
import numpy as np
import pandas as pd
import ast
import json
import os
#os.environ['KMP_WARNINGS'] = '0'
#os.environ['OMP_NUM_THREADS'] = '1'

In [2]:
# Read the CSV file
df = pd.read_csv('data/nyt_metadata.csv', low_memory=False)

# Cleaning

In [3]:
# Drop columns that are not needed
columns_to_drop = ['web_url', 
                   'snippet', 
                   'lead_paragraph', 
                   'print_section', 
                   'print_page', 
                   'source', 
                   'multimedia', 
                   'news_desk',
                   'byline',
                   '_id',
                   'uri',
                   'subsection_name',
                   'word_count',
                   'keywords']
df.drop(columns=columns_to_drop, inplace=True)

# Drop rows with missing abstracts
drop_rows = df[df['abstract'].isnull()].index
df.drop(drop_rows, inplace=True)

# Change the date column to datetime
df['pub_date'] = pd.to_datetime(df['pub_date'])

# Change the abstract column to string
df['abstract'] = df['abstract'].astype(str)

In [4]:
# Function to extract the 'main' value from JSON-like strings
def extract_main(headline_str):
    try:
        # Safely evaluate the string to convert it to a dictionary
        json_dict = ast.literal_eval(headline_str)
        # Access and return the 'main' key
        return json_dict.get('main', None)
    except (ValueError, SyntaxError):
        return None

# Apply the function to the 'headline' column
df['headline'] = df['headline'].apply(extract_main)

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,abstract,headline,pub_date,document_type,section_name,type_of_material
0,1813,"Economic hardship, climate change, political i...","Title 42 Is Gone, but Not the Conditions Drivi...",2023-05-15 01:24:42+00:00,article,U.S.,News
1,1814,It’s election night in America. Stay away from...,"‘Succession’ Season 4, Episode 8 Recap: The Wi...",2023-05-15 02:01:05+00:00,article,Arts,News
2,1815,"Tom is stressed in dress shoes, Shiv hides ben...","‘Succession’ Style, Episode 8: Some People Jus...",2023-05-15 02:15:04+00:00,article,Style,News
3,1816,"No corrections appeared in print on Monday, Ma...","No Corrections: May 15, 2023",2023-05-15 03:55:48+00:00,article,Corrections,News
4,1817,"Quotation of the Day for Monday, May 15, 2023.",Quotation of the Day: When Your Champions Leag...,2023-05-15 03:55:57+00:00,article,Corrections,News


In [6]:
# Use all of the abstracts for the analysis
abstracts = df['abstract']
docs = abstracts.to_list()

In [7]:
len(docs)

42623

In [8]:
# Tokenize the documents.

from nltk.tokenize import RegexpTokenizer

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

In [9]:
docs[:2]

[['economic',
  'hardship',
  'climate',
  'change',
  'political',
  'instability',
  'and',
  'gang',
  'violence',
  'will',
  'continue',
  'to',
  'spur',
  'emigration',
  'from',
  'many',
  'corners',
  'of',
  'the',
  'world'],
 ['it',
  'election',
  'night',
  'in',
  'america',
  'stay',
  'away',
  'from',
  'the',
  'bodega',
  'sushi']]

In [9]:
import nltk
from nltk.corpus import stopwords

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Define the stopwords
stop_words = set(stopwords.words('english'))

# Remove stopwords from each document
filtered_docs = [[word for word in doc if word not in stop_words] for doc in docs]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/schinella/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [10]:
filtered_docs[:1]

[['economic',
  'hardship',
  'climate',
  'change',
  'political',
  'instability',
  'gang',
  'violence',
  'continue',
  'spur',
  'emigration',
  'many',
  'corners',
  'world']]

In [16]:
# Compute bigrams.

from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
# bigram[filtered_docs[0]]

for idx in range(len(docs)):
    filtered_docs[idx] = bigram[filtered_docs[idx]]

In [None]:
# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
# bigram = Phrases(docs, min_count=20)
# for idx in range(len(docs)):
#    for token in bigram[docs[idx]]:
#        if '_' in token:
#            # Token is a bigram, add to document.
#            filtered_docs[idx].append(token)

In [20]:
filtered_docs[:5]

[['economic',
  'hardship',
  'climate_change',
  'political',
  'instability',
  'gang',
  'violence',
  'continue',
  'spur',
  'emigration',
  'many',
  'corners',
  'world'],
 ['election', 'night', 'america', 'stay', 'away', 'bodega', 'sushi'],
 ['tom',
  'stressed',
  'dress',
  'shoes',
  'shiv',
  'hides',
  'beneath',
  'layers',
  'lies',
  'turtleneck',
  'willa',
  'cosplays',
  'first_lady',
  'hopeful'],
 ['corrections_appeared', 'print', 'monday', 'may'],
 ['quotation', 'day', 'monday', 'may']]

In [21]:
len(filtered_docs)

42623

In [22]:
# Lemmatize the documents.

import nltk
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/schinella/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [23]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/schinella/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [25]:
# Lemmatize all words in documents.
lemmatizer = WordNetLemmatizer()
fitlered_docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in filtered_docs]

In [26]:
len(filtered_docs)

42623

In [28]:
filtered_docs[:5]

[['economic',
  'hardship',
  'climate_change',
  'political',
  'instability',
  'gang',
  'violence',
  'continue',
  'spur',
  'emigration',
  'many',
  'corners',
  'world'],
 ['election', 'night', 'america', 'stay', 'away', 'bodega', 'sushi'],
 ['tom',
  'stressed',
  'dress',
  'shoes',
  'shiv',
  'hides',
  'beneath',
  'layers',
  'lies',
  'turtleneck',
  'willa',
  'cosplays',
  'first_lady',
  'hopeful'],
 ['corrections_appeared', 'print', 'monday', 'may'],
 ['quotation', 'day', 'monday', 'may']]

In [29]:
# Remove rare and common tokens.

from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(filtered_docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [30]:
filtered_docs[:1]

[['economic',
  'hardship',
  'climate_change',
  'political',
  'instability',
  'gang',
  'violence',
  'continue',
  'spur',
  'emigration',
  'many',
  'corners',
  'world']]

In [None]:
## Concatenate each entry into a single string after removing stopwords
#cleaned_docs = [' '.join(doc) for doc in filtered_docs]

## Example of how to print the filtered and concatenated result for the first document
#print(cleaned_docs[3])

In [31]:
# Vectorize data.

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in filtered_docs]

In [32]:
corpus[:2]

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1)],
 [(8, 1), (9, 1), (10, 1), (11, 1), (12, 1)]]

In [33]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [34]:
# Create Dictionary
id2word = corpora.Dictionary(filtered_docs)

# Create Corpus
texts = filtered_docs

In [35]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('climate_change', 1),
  ('continue', 1),
  ('corners', 1),
  ('economic', 1),
  ('emigration', 1),
  ('gang', 1),
  ('hardship', 1),
  ('instability', 1)]]

In [36]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [37]:
# Print the keyword in the 10 topics
from pprint import pprint
pprint(lda_model.print_topics(num_topics=20, num_words=10))
doc_lda = lda_model[corpus]

[(0,
  '0.228*"across" + 0.062*"brandon" + 0.051*"creditors" + 0.039*"nearly" + '
  '0.036*"corrections_appeared" + 0.032*"next" + 0.029*"explains" + '
  '0.027*"national" + 0.024*"largest" + 0.022*"selected"'),
 (1,
  '0.085*"bakhmut" + 0.083*"competition" + 0.081*"sense" + 0.075*"early" + '
  '0.058*"things" + 0.037*"beliefs" + 0.030*"island" + 0.030*"erdogan" + '
  '0.023*"fossil_fuels" + 0.023*"discovered"'),
 (2,
  '0.217*"memoir" + 0.096*"movement" + 0.086*"unions" + 0.044*"career" + '
  '0.030*"others" + 0.029*"turkey" + 0.027*"maddening" + 0.025*"boeing" + '
  '0.020*"threadgill" + 0.017*"protest"'),
 (3,
  '0.160*"vote" + 0.066*"system" + 0.057*"rockets" + 0.053*"spur" + '
  '0.052*"youth" + 0.049*"similarities" + 0.043*"russia" + 0.037*"thought" + '
  '0.033*"quickly" + 0.029*"unfolds"'),
 (4,
  '0.085*"vice" + 0.078*"expressions" + 0.055*"gang" + 0.052*"happening" + '
  '0.045*"hotels" + 0.039*"giants" + 0.037*"balm" + 0.034*"sherpas" + '
  '0.029*"regulators" + 0.024*"four"