# Data

The following code has been modified from Craig and Ping's work.
References: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#3importpackages, https://markroxor.github.io/gensim/static/notebooks/lda_training_tips.html, and https://radimrehurek.com/gensim/models/ldamodel.html.

New code is from the first link so we can modify it later on. 

In [1]:
import numpy as np
import pandas as pd
import ast
import json
import os
#os.environ['KMP_WARNINGS'] = '0'
#os.environ['OMP_NUM_THREADS'] = '1'

In [2]:
# Read the CSV file
df = pd.read_csv('data/nyt_metadata.csv', low_memory=False)

# Cleaning

In [3]:
# Drop columns that are not needed
columns_to_drop = ['web_url', 
                   'snippet', 
                   'lead_paragraph', 
                   'print_section', 
                   'print_page', 
                   'source', 
                   'multimedia', 
                   'news_desk',
                   'byline',
                   '_id',
                   'uri',
                   'subsection_name',
                   'word_count',
                   'keywords']
df.drop(columns=columns_to_drop, inplace=True)

# Drop rows with missing abstracts
drop_rows = df[df['abstract'].isnull()].index
df.drop(drop_rows, inplace=True)

# Change the date column to datetime
df['pub_date'] = pd.to_datetime(df['pub_date'])

# Change the abstract column to string
df['abstract'] = df['abstract'].astype(str)

In [4]:
# Function to extract the 'main' value from JSON-like strings
def extract_main(headline_str):
    try:
        # Safely evaluate the string to convert it to a dictionary
        json_dict = ast.literal_eval(headline_str)
        # Access and return the 'main' key
        return json_dict.get('main', None)
    except (ValueError, SyntaxError):
        return None

# Apply the function to the 'headline' column
df['headline'] = df['headline'].apply(extract_main)

In [5]:
df.head()

Unnamed: 0.1,Unnamed: 0,abstract,headline,pub_date,document_type,section_name,type_of_material
0,1813,"Economic hardship, climate change, political i...","Title 42 Is Gone, but Not the Conditions Drivi...",2023-05-15 01:24:42+00:00,article,U.S.,News
1,1814,It’s election night in America. Stay away from...,"‘Succession’ Season 4, Episode 8 Recap: The Wi...",2023-05-15 02:01:05+00:00,article,Arts,News
2,1815,"Tom is stressed in dress shoes, Shiv hides ben...","‘Succession’ Style, Episode 8: Some People Jus...",2023-05-15 02:15:04+00:00,article,Style,News
3,1816,"No corrections appeared in print on Monday, Ma...","No Corrections: May 15, 2023",2023-05-15 03:55:48+00:00,article,Corrections,News
4,1817,"Quotation of the Day for Monday, May 15, 2023.",Quotation of the Day: When Your Champions Leag...,2023-05-15 03:55:57+00:00,article,Corrections,News


In [6]:
# Use 2000 of the abstracts for the analysis
abstracts = df['abstract'][:2000]
docs = abstracts.to_list()

In [7]:
len(docs)

2000

In [8]:
# Tokenize the documents.

from nltk.tokenize import RegexpTokenizer

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

In [9]:
docs[:2]

[['economic',
  'hardship',
  'climate',
  'change',
  'political',
  'instability',
  'and',
  'gang',
  'violence',
  'will',
  'continue',
  'to',
  'spur',
  'emigration',
  'from',
  'many',
  'corners',
  'of',
  'the',
  'world'],
 ['it',
  'election',
  'night',
  'in',
  'america',
  'stay',
  'away',
  'from',
  'the',
  'bodega',
  'sushi']]

In [10]:
# Lemmatize the documents.

import nltk
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/schinella/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
# Lemmatize all words in documents.
lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

LookupError: 
**********************************************************************
  Resource [93momw-1.4[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('omw-1.4')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/omw-1.4[0m

  Searched in:
    - '/Users/schinella/nltk_data'
    - '/Users/schinella/opt/anaconda3/nltk_data'
    - '/Users/schinella/opt/anaconda3/share/nltk_data'
    - '/Users/schinella/opt/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


In [12]:
len(docs)

2000

In [13]:
# Compute bigrams.

from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

In [14]:
docs[:1]

[['economic',
  'hardship',
  'climate',
  'change',
  'political',
  'instability',
  'and',
  'gang',
  'violence',
  'will',
  'continue',
  'to',
  'spur',
  'emigration',
  'from',
  'many',
  'corners',
  'of',
  'the',
  'world']]

In [15]:
# Remove rare and common tokens.

from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [16]:
docs[1]

['it',
 'election',
 'night',
 'in',
 'america',
 'stay',
 'away',
 'from',
 'the',
 'bodega',
 'sushi']

In [17]:
import nltk
from nltk.corpus import stopwords

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Define the stopwords
stop_words = set(stopwords.words('english'))

# Remove stopwords from each document
filtered_docs = [[word for word in doc if word not in stop_words] for doc in docs]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/schinella/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
filtered_docs[:1]

[['economic',
  'hardship',
  'climate',
  'change',
  'political',
  'instability',
  'gang',
  'violence',
  'continue',
  'spur',
  'emigration',
  'many',
  'corners',
  'world']]

In [None]:
## Concatenate each entry into a single string after removing stopwords
#cleaned_docs = [' '.join(doc) for doc in filtered_docs]

## Example of how to print the filtered and concatenated result for the first document
#print(cleaned_docs[3])

In [19]:
# Vectorize data.

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [20]:
corpus[:2]

[[(0, 1), (1, 1), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)],
 [(2, 1), (9, 1), (10, 1), (11, 1)]]

In [21]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [22]:
# Create Dictionary
id2word = corpora.Dictionary(filtered_docs)

# Create Corpus
texts = filtered_docs

In [23]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('change', 1),
  ('climate', 1),
  ('continue', 1),
  ('corners', 1),
  ('economic', 1),
  ('emigration', 1),
  ('gang', 1),
  ('hardship', 1),
  ('instability', 1)]]

In [24]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=20, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [28]:
# Print the keyword in the 10 topics
from pprint import pprint
pprint(lda_model.print_topics(num_topics=20, num_words=10))
doc_lda = lda_model[corpus]

[(0,
  '0.120*"election" + 0.108*"giants" + 0.101*"days" + 0.042*"citizens" + '
  '0.032*"beneath" + 0.000*"bottom" + 0.000*"workers" + 0.000*"monrovia" + '
  '0.000*"nonpartisan" + 0.000*"things"'),
 (1,
  '0.062*"ability" + 0.000*"tones" + 0.000*"drab" + 0.000*"earth" + '
  '0.000*"house" + 0.000*"things" + 0.000*"inspire" + 0.000*"question" + '
  '0.000*"races" + 0.000*"recent"'),
 (2,
  '0.091*"coming" + 0.065*"believed" + 0.044*"protect" + 0.042*"battle" + '
  '0.035*"liquefied" + 0.031*"overseas" + 0.000*"country" + 0.000*"number" + '
  '0.000*"spruce" + 0.000*"reader"'),
 (3,
  '0.079*"bill" + 0.044*"nation" + 0.039*"ending" + 0.039*"governor" + '
  '0.036*"democratic" + 0.000*"nonpartisan" + 0.000*"monrovia" + '
  '0.000*"storybook" + 0.000*"country" + 0.000*"northwest"'),
 (4,
  '0.059*"several" + 0.055*"mostly" + 0.041*"showtime" + 0.040*"adjust" + '
  '0.034*"word" + 0.000*"bottom" + 0.000*"study" + 0.000*"meant" + '
  '0.000*"country" + 0.000*"important"'),
 (5,
  '0.110*"c