# Data

The following code has been modified from Craig and Ping's work.
References: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#3importpackages, https://markroxor.github.io/gensim/static/notebooks/lda_training_tips.html, and https://radimrehurek.com/gensim/models/ldamodel.html.

New code is from the first link so we can modify it later on. 

In [1]:
import numpy as np
import pandas as pd
import ast
import json
import os
#os.environ['KMP_WARNINGS'] = '0'
#os.environ['OMP_NUM_THREADS'] = '1'

In [2]:
# Read the CSV file
df = pd.read_csv('data/nyt_metadata_cleaned.csv', low_memory=False)

# Cleaning

In [3]:
# Drop columns that are not needed
columns_to_drop = ['web_url', 
                   'snippet', 
                   'lead_paragraph', 
                   'print_section', 
                   'print_page', 
                   'source', 
                   'multimedia', 
                   'news_desk',
                   'byline',
                   '_id',
                   'uri',
                   'subsection_name',
                   'word_count',
                   'keywords']
df.drop(columns=columns_to_drop, inplace=True)

# Drop rows with missing abstracts
drop_rows = df[df['abstract'].isnull()].index
df.drop(drop_rows, inplace=True)

# Change the date column to datetime
df['pub_date'] = pd.to_datetime(df['pub_date'])

# Change the abstract column to string
df['abstract'] = df['abstract'].astype(str)

In [4]:
# Function to extract the 'main' value from JSON-like strings
def extract_main(headline_str):
    try:
        # Safely evaluate the string to convert it to a dictionary
        json_dict = ast.literal_eval(headline_str)
        # Access and return the 'main' key
        return json_dict.get('main', None)
    except (ValueError, SyntaxError):
        return None

# Apply the function to the 'headline' column
df['headline'] = df['headline'].apply(extract_main)

In [5]:
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,abstract,headline,pub_date,document_type,section_name,type_of_material,tokenized_abstract,tokenized_lead_par
0,0,1813,economic hardship climate change political ins...,"Title 42 Is Gone, but Not the Conditions Drivi...",2023-05-15 01:24:42+00:00,article,U.S.,News,"['economic', 'hardship', 'climate', 'change', ...","['relative', 'quiet', 'has', 'prevailed', 'alo..."
1,1,1814,election night america stay away from the bode...,"‘Succession’ Season 4, Episode 8 Recap: The Wi...",2023-05-15 02:01:05+00:00,article,Arts,News,"['election', 'night', 'america', 'stay', 'away...","['the', 'day', 'before', 'logan', 'roy', 'died..."
2,2,1815,tom stressed dress shoes shiv hides beneath la...,"‘Succession’ Style, Episode 8: Some People Jus...",2023-05-15 02:15:04+00:00,article,Style,News,"['tom', 'stressed', 'dress', 'shoes', 'shiv', ...","['this', 'article', 'contains', 'spoilers', 'f..."
3,3,1816,corrections appeared print monday may 2023,"No Corrections: May 15, 2023",2023-05-15 03:55:48+00:00,article,Corrections,News,"['corrections', 'appeared', 'print', 'monday',...","['errors', 'are', 'corrected', 'during', 'the'..."
4,5,1818,the year old french basketball star the most h...,Everybody Wants Victor Wembanyama. He Wants to...,2023-05-15 04:01:13+00:00,article,Sports,News,"['the', 'year', 'old', 'french', 'basketball',...","['boris', 'diaw', 'was', 'passing', 'through',..."


In [6]:
# Use 2000 of the abstracts for the analysis
abstracts = df['abstract'][:2000]
docs = abstracts.to_list()

In [7]:
len(docs)

2000

In [8]:
# Tokenize the documents.

from nltk.tokenize import RegexpTokenizer

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

In [9]:
docs[:2]

[['economic',
  'hardship',
  'climate',
  'change',
  'political',
  'instability',
  'and',
  'gang',
  'violence',
  'will',
  'continue',
  'spur',
  'emigration',
  'from',
  'many',
  'corners',
  'the',
  'world'],
 ['election',
  'night',
  'america',
  'stay',
  'away',
  'from',
  'the',
  'bodega',
  'sushi']]

In [10]:
# Lemmatize the documents.

import nltk
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /Users/ale/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
# Lemmatize all words in documents.
lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

In [12]:
len(docs)

2000

In [13]:
# Compute bigrams.

from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

In [14]:
docs[:1]

[['economic',
  'hardship',
  'climate',
  'change',
  'political',
  'instability',
  'and',
  'gang',
  'violence',
  'will',
  'continue',
  'spur',
  'emigration',
  'from',
  'many',
  'corner',
  'the',
  'world']]

In [15]:
# Remove rare and common tokens.

from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [16]:
docs[1]

['election',
 'night',
 'america',
 'stay',
 'away',
 'from',
 'the',
 'bodega',
 'sushi']

In [17]:
import nltk
from nltk.corpus import stopwords

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Define the stopwords
stop_words = set(stopwords.words('english'))

# Remove stopwords from each document
filtered_docs = [[word for word in doc if word not in stop_words] for doc in docs]

[nltk_data] Downloading package stopwords to /Users/ale/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
filtered_docs[:1]

[['economic',
  'hardship',
  'climate',
  'change',
  'political',
  'instability',
  'gang',
  'violence',
  'continue',
  'spur',
  'emigration',
  'many',
  'corner',
  'world']]

In [19]:
## Concatenate each entry into a single string after removing stopwords
#cleaned_docs = [' '.join(doc) for doc in filtered_docs]

## Example of how to print the filtered and concatenated result for the first document
#print(cleaned_docs[3])

In [20]:
# Vectorize data.

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [21]:
corpus[:2]

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1)], [(2, 1), (7, 1)]]

In [22]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [23]:
# Create Dictionary
id2word = corpora.Dictionary(filtered_docs)

# Create Corpus
texts = filtered_docs

In [24]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('change', 1),
  ('climate', 1),
  ('continue', 1),
  ('corner', 1),
  ('economic', 1),
  ('emigration', 1),
  ('gang', 1)]]

In [25]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [26]:
# Print the keyword in the 10 topics
from pprint import pprint
pprint(lda_model.print_topics(num_topics=20, num_words=10))
doc_lda = lda_model[corpus]

[(0,
  '0.099*"led" + 0.059*"month" + 0.051*"vice" + 0.050*"ability" + '
  '0.042*"trinidad" + 0.040*"struggled" + 0.040*"left" + 0.038*"made" + '
  '0.032*"nation" + 0.000*"drab"'),
 (1,
  '0.084*"article" + 0.081*"spur" + 0.049*"number" + 0.042*"gas" + '
  '0.042*"reality" + 0.041*"put" + 0.038*"ha_been" + 0.036*"liquefied" + '
  '0.031*"first" + 0.028*"many"'),
 (2,
  '0.242*"reflect" + 0.066*"drama" + 0.051*"massacre" + 0.044*"shoe" + '
  '0.036*"nine" + 0.034*"fx" + 0.027*"medium" + 0.026*"hillsong" + '
  '0.000*"visceral" + 0.000*"future"'),
 (3,
  '0.096*"instability" + 0.062*"african" + 0.058*"year" + 0.047*"new" + '
  '0.045*"layer" + 0.043*"new_york" + 0.042*"old" + 0.038*"principal" + '
  '0.034*"prospect" + 0.031*"pressure"'),
 (4,
  '0.100*"believed" + 0.052*"battle" + 0.000*"echo" + 0.000*"house" + '
  '0.000*"view" + 0.000*"michael" + 0.000*"inspire" + 0.000*"assault" + '
  '0.000*"four" + 0.000*"drab"'),
 (5,
  '0.098*"lebron" + 0.093*"basketball" + 0.056*"war" + 0.046*