# Data

The following code has been modified from Craig and Ping's work.
References: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/#3importpackages, https://markroxor.github.io/gensim/static/notebooks/lda_training_tips.html, and https://radimrehurek.com/gensim/models/ldamodel.html.

New code is taken from the first link so that we can modify it later on.

In [None]:
import numpy as np
import pandas as pd
import ast
import json
import os
#os.environ['KMP_WARNINGS'] = '0'
#os.environ['OMP_NUM_THREADS'] = '1'

In [None]:
# Load the cleaned metadata CSV into a DataFrame.
# low_memory=False makes pandas parse the file in one go rather than in
# chunks, which avoids mixed-type inference within a column.
df = pd.read_csv(
    'data/nyt_metadata_cleaned.csv',
    low_memory=False,
)

# Cleaning

In [None]:
# Columns that play no part in the analysis below.
columns_to_drop = [
    'web_url',
    'snippet',
    'lead_paragraph',
    'print_section',
    'print_page',
    'source',
    'multimedia',
    'news_desk',
    'byline',
    '_id',
    'uri',
    'subsection_name',
    'word_count',
    'keywords',
]
df.drop(columns=columns_to_drop, inplace=True)

# Index labels of the rows that have no abstract; those rows are removed.
drop_rows = df.index[df['abstract'].isnull()]
df.drop(index=drop_rows, inplace=True)

# Parse the publication date into datetime values.
df['pub_date'] = pd.to_datetime(df['pub_date'])

# Make sure every remaining abstract is a plain string.
df['abstract'] = df['abstract'].astype(str)

In [None]:
# Function to extract the 'main' value from JSON-like strings
def extract_main(headline_str):
    """Return the 'main' entry of a stringified headline dictionary.

    Args:
        headline_str: Text holding a Python-literal dict, e.g.
            "{'main': 'Some title', 'kicker': None}".

    Returns:
        The value stored under 'main', or None when the input cannot be
        parsed, does not evaluate to a dict, or has no 'main' key.
    """
    try:
        # literal_eval only accepts literals, so it never executes code.
        parsed = ast.literal_eval(headline_str)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Every failure mode documented for literal_eval on malformed input.
        return None

    # A valid literal that is not a dict (list, number, ...) has no .get().
    if not isinstance(parsed, dict):
        return None
    return parsed.get('main')

# Replace each raw headline string with the value extract_main returns for it.
headlines = df['headline']
df['headline'] = headlines.apply(extract_main)

In [None]:
# Preview the first rows of the DataFrame.
df.head()

In [None]:
# The whole 'abstract' column is analysed; convert it to a plain Python list.
abstracts = df['abstract']
docs = abstracts.tolist()

In [None]:
# Number of documents (abstracts) to be processed.
len(docs)

In [None]:
# Tokenize the documents.

from nltk.tokenize import RegexpTokenizer

# A token is a maximal run of word characters (the regex \w+).
tokenizer = RegexpTokenizer(r'\w+')

# For every document: lowercase it, split it into tokens, then keep a token
# only if it is not purely numeric (words that merely contain digits stay)
# and is longer than one character.
docs = [
    [
        token
        for token in tokenizer.tokenize(text.lower())
        if not token.isnumeric() and len(token) > 1
    ]
    for text in docs
]

In [None]:
# Inspect the first two tokenized documents.
docs[:2]

In [None]:
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stopword corpus is available locally.
nltk.download('stopwords')

# English stopwords, stored in a set for fast membership checks.
stop_words = set(stopwords.words('english'))

# Build a stopword-free copy of each tokenized document; `docs` is not modified.
filtered_docs = []
for tokens in docs:
    filtered_docs.append([token for token in tokens if token not in stop_words])

In [None]:
# Inspect the first document after stopword removal.
filtered_docs[:1]

In [None]:
# Compute bigrams.

from gensim.models import Phrases

# Fit a phrase (bigram) model; min_count=20 ignores words and bigrams whose
# total count is below 20.
# NOTE(review): the model is fitted on `docs` (stopwords still present) but is
# applied to `filtered_docs` below — confirm this is intended.
bigram = Phrases(docs, min_count=20)

# Re-express every document with detected bigrams joined into single tokens.
filtered_docs = [bigram[tokens] for tokens in filtered_docs]

In [None]:
# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
# bigram = Phrases(docs, min_count=20)
# for idx in range(len(docs)):
#    for token in bigram[docs[idx]]:
#        if '_' in token:
#            # Token is a bigram, add to document.
#            filtered_docs[idx].append(token)

In [None]:
# Inspect the first five documents after the bigram step.
filtered_docs[:5]

In [None]:
# Document count; filtered_docs is derived one-to-one from docs.
len(filtered_docs)

In [None]:
# Lemmatize the documents.

import nltk
from nltk.stem.wordnet import WordNetLemmatizer

# Make sure the WordNet corpus used by the lemmatizer is available locally.
nltk.download('wordnet')

In [None]:
# Also fetch the 'omw-1.4' NLTK resource
# (presumably needed alongside 'wordnet' by the lemmatizer — TODO confirm).
nltk.download('omw-1.4')

In [None]:
# Lemmatize all words in documents.
lemmatizer = WordNetLemmatizer()

# The result must be bound to `filtered_docs`: the dictionary and the
# bag-of-words corpus further down are built from that exact name, so a
# differently spelled target would silently discard the lemmatization.
filtered_docs = [
    [lemmatizer.lemmatize(token) for token in doc]
    for doc in filtered_docs
]

In [None]:
# Document count again; lemmatizing works token by token, so it does not add or remove documents.
len(filtered_docs)

In [None]:
# Inspect the first five documents.
filtered_docs[:5]

In [None]:
# Remove rare and common tokens.

from gensim.corpora import Dictionary

# Map every unique token across the documents to an integer id.
dictionary = Dictionary(documents=filtered_docs)

# Prune tokens that occur in fewer than 20 documents or in more than 50% of
# all documents.
dictionary.filter_extremes(no_above=0.5, no_below=20)

In [None]:
# Inspect the first document (filter_extremes above prunes the dictionary only, not these token lists).
filtered_docs[:1]

In [None]:
## Concatenate each entry into a single string after removing stopwords
#cleaned_docs = [' '.join(doc) for doc in filtered_docs]

## Example of how to print the filtered and concatenated result for the first document
#print(cleaned_docs[3])

In [None]:
# Vectorize data.

# Encode each document as a bag of words: a list of (token_id, count) pairs
# using the ids held by `dictionary`.
corpus = list(map(dictionary.doc2bow, filtered_docs))

In [None]:
# Inspect the bag-of-words vectors of the first two documents.
corpus[:2]

In [None]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [None]:
# Create Dictionary
# Reuse the pruned `dictionary` as the id -> word mapping. `corpus` was encoded
# with that object after filter_extremes() had pruned it and reassigned token
# ids; a freshly built Dictionary would number tokens differently, so the model
# would label topic words with the wrong strings.
id2word = dictionary

# Create Corpus
# Keep the tokenized documents available under the name `texts`.
texts = filtered_docs

In [None]:
# Show the first document's bag of words with token ids replaced by the
# words they map to, as (word, frequency) pairs.
[[(id2word[token_id], count) for token_id, count in bow] for bow in corpus[:1]]

In [None]:
# Train the LDA topic model on the bag-of-words corpus.
lda_settings = {
    'num_topics': 40,          # number of topics to extract
    'random_state': 100,       # fixed seed
    'update_every': 1,
    'chunksize': 100,          # documents per training chunk
    'passes': 10,              # passes over the corpus
    'alpha': 'auto',           # learn the document-topic prior from the data
    'per_word_topics': True,
}
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    **lda_settings,
)

In [None]:
# Print every topic of the model with its 100 highest-weighted words.
from pprint import pprint

# Ask for exactly as many topics as the model holds, rather than a
# hard-coded number that can drift from the model's configuration.
pprint(lda_model.print_topics(num_topics=lda_model.num_topics, num_words=100))

# Apply the trained model to the corpus.
doc_lda = lda_model[corpus]