# Implementing LDA using Gensim package

In [1]:
import numpy as np
import pandas as pd
import ast
import json
import os
#os.environ['KMP_WARNINGS'] = '0'
#os.environ['OMP_NUM_THREADS'] = '1'

In [2]:
# Read the CSV file
df = pd.read_csv('NYTimes/data/nyt_metadata_cleaned.csv', low_memory=False)
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,abstract,web_url,snippet,lead_paragraph,print_section,print_page,source,multimedia,...,news_desk,section_name,subsection_name,byline,type_of_material,_id,word_count,uri,tokenized_abstract,tokenized_lead_par
0,0,1813,economic hardship climate change political ins...,https://www.nytimes.com/2023/05/14/us/migrants...,"Economic hardship, climate change, political i...",relative quiet has prevailed along the souther...,A,14.0,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",...,National,U.S.,,"{'original': 'By Miriam Jordan', 'person': [{'...",News,nyt://article/3d95da14-0c64-59c6-bae2-02b151ad...,1217,nyt://article/3d95da14-0c64-59c6-bae2-02b151ad...,"['economic', 'hardship', 'climate', 'change', ...","['relative', 'quiet', 'has', 'prevailed', 'alo..."
1,1,1814,election night america stay away from the bode...,https://www.nytimes.com/2023/05/14/arts/televi...,It’s election night in America. Stay away from...,the day before logan roy died delivered fiery ...,,,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",...,Culture,Arts,Television,"{'original': 'By Noel Murray', 'person': [{'fi...",News,nyt://article/17f6f628-2939-541b-a0e8-5c503fa6...,1495,nyt://article/17f6f628-2939-541b-a0e8-5c503fa6...,"['election', 'night', 'america', 'stay', 'away...","['the', 'day', 'before', 'logan', 'roy', 'died..."
2,2,1815,tom stressed dress shoes shiv hides beneath la...,https://www.nytimes.com/2023/05/14/style/succe...,"Tom is stressed in dress shoes, Shiv hides ben...",this article contains spoilers for episode the...,,,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",...,Styles,Style,,"{'original': 'By The Styles Desk', 'person': [...",News,nyt://article/70773662-4815-5d40-8460-b438aa44...,665,nyt://article/70773662-4815-5d40-8460-b438aa44...,"['tom', 'stressed', 'dress', 'shoes', 'shiv', ...","['this', 'article', 'contains', 'spoilers', 'f..."
3,3,1816,corrections appeared print monday may 2023,https://www.nytimes.com/2023/05/14/pageoneplus...,"No corrections appeared in print on Monday, Ma...",errors are corrected during the press run when...,,,The New York Times,[],...,Corrections,Corrections,,"{'original': '', 'person': [], 'organization':...",News,nyt://article/199d026e-1372-51a3-adf0-c82abeeb...,52,nyt://article/199d026e-1372-51a3-adf0-c82abeeb...,"['corrections', 'appeared', 'print', 'monday',...","['errors', 'are', 'corrected', 'during', 'the'..."
4,5,1818,the year old french basketball star the most h...,https://www.nytimes.com/2023/05/15/sports/bask...,The 19-year-old French basketball star is the ...,boris diaw was passing through paris late sept...,D,1.0,The New York Times,"[{'rank': 0, 'subtype': 'xlarge', 'caption': N...",...,Sports,Sports,Pro Basketball,"{'original': 'By Tania Ganguli', 'person': [{'...",News,nyt://article/26a856ce-3d2c-5df2-baa9-0bd6e29f...,1904,nyt://article/26a856ce-3d2c-5df2-baa9-0bd6e29f...,"['the', 'year', 'old', 'french', 'basketball',...","['boris', 'diaw', 'was', 'passing', 'through',..."


# Cleaning

In [3]:
# Drop columns that are not needed
columns_to_drop = ['web_url', 
                   'snippet', 
                   'lead_paragraph', 
                   'print_section', 
                   'print_page', 
                   'source', 
                   'multimedia', 
                   'news_desk',
                   'byline',
                   '_id',
                   'uri',
                   'subsection_name',
                   'word_count']
#                    'keywords'
df.drop(columns=columns_to_drop, inplace=True)

# Drop rows with missing abstracts
drop_rows = df[df['abstract'].isnull()].index
df.drop(drop_rows, inplace=True)

# Change the date column to datetime
df['pub_date'] = pd.to_datetime(df['pub_date'])

# Change the abstract column to string
df['abstract'] = df['abstract'].astype(str)

In [4]:
# Function to extract the 'main' value from JSON-like strings
def extract_main(headline_str):
    try:
        # Safely evaluate the string to convert it to a dictionary
        json_dict = ast.literal_eval(headline_str)
        # Access and return the 'main' key
        return json_dict.get('main', None)
    except (ValueError, SyntaxError):
        return None

# Apply the function to the 'headline' column
df['headline'] = df['headline'].apply(extract_main)

In [5]:
df.keywords[0]

"[{'name': 'subject', 'value': 'Illegal Immigration', 'rank': 1, 'major': 'N'}, {'name': 'subject', 'value': 'Border Barriers', 'rank': 2, 'major': 'N'}, {'name': 'subject', 'value': 'Asylum, Right of', 'rank': 3, 'major': 'N'}, {'name': 'subject', 'value': 'Refugees and Displaced Persons', 'rank': 4, 'major': 'N'}, {'name': 'subject', 'value': 'Immigration and Emigration', 'rank': 5, 'major': 'N'}, {'name': 'subject', 'value': 'Immigration Detention', 'rank': 6, 'major': 'N'}, {'name': 'organizations', 'value': 'Border Patrol (US)', 'rank': 7, 'major': 'N'}, {'name': 'glocations', 'value': 'United States', 'rank': 8, 'major': 'N'}, {'name': 'glocations', 'value': 'Mexico', 'rank': 9, 'major': 'N'}]"

In [6]:
# Use 2000 of the abstracts for the analysis
abstracts = df['abstract']
docs = abstracts.to_list()

In [7]:
len(docs)

41045

In [8]:
# Tokenize the documents.

from nltk.tokenize import RegexpTokenizer

# Split the documents into tokens.
tokenizer = RegexpTokenizer(r'\w+')
for idx in range(len(docs)):
    docs[idx] = docs[idx].lower()  # Convert to lowercase.
    docs[idx] = tokenizer.tokenize(docs[idx])  # Split into words.

# Remove numbers, but not words that contain numbers.
docs = [[token for token in doc if not token.isnumeric()] for doc in docs]

# Remove words that are only one character.
docs = [[token for token in doc if len(token) > 1] for doc in docs]

In [9]:
docs[:2]

[['economic',
  'hardship',
  'climate',
  'change',
  'political',
  'instability',
  'and',
  'gang',
  'violence',
  'will',
  'continue',
  'spur',
  'emigration',
  'from',
  'many',
  'corners',
  'the',
  'world'],
 ['election',
  'night',
  'america',
  'stay',
  'away',
  'from',
  'the',
  'bodega',
  'sushi']]

In [10]:
# Lemmatize the documents.

import nltk
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ravitripathi/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
# Lemmatize all words in documents.
lemmatizer = WordNetLemmatizer()
docs = [[lemmatizer.lemmatize(token) for token in doc] for doc in docs]

In [12]:
len(docs)

41045

In [13]:
# Compute bigrams.

from gensim.models import Phrases

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)

In [14]:
docs[1]

['election',
 'night',
 'america',
 'stay',
 'away',
 'from',
 'the',
 'bodega',
 'sushi',
 'away_from']

In [15]:
# Remove rare and common tokens.

from gensim.corpora import Dictionary

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur less than 20 documents, or more than 50% of the documents.
dictionary.filter_extremes(no_below=20, no_above=0.5)

In [16]:
docs[:1]

[['economic',
  'hardship',
  'climate',
  'change',
  'political',
  'instability',
  'and',
  'gang',
  'violence',
  'will',
  'continue',
  'spur',
  'emigration',
  'from',
  'many',
  'corner',
  'the',
  'world',
  'climate_change']]

In [17]:
import nltk
from nltk.corpus import stopwords

# Download stopwords if not already downloaded
nltk.download('stopwords')

# Define the stopwords
stop_words = set(stopwords.words('english'))

# Remove stopwords from each document
filtered_docs = [[word for word in doc if word not in stop_words] for doc in docs]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ravitripathi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
filtered_docs[:1]

[['economic',
  'hardship',
  'climate',
  'change',
  'political',
  'instability',
  'gang',
  'violence',
  'continue',
  'spur',
  'emigration',
  'many',
  'corner',
  'world',
  'climate_change']]

In [19]:
## Concatenate each entry into a single string after removing stopwords
#cleaned_docs = [' '.join(doc) for doc in filtered_docs]

## Example of how to print the filtered and concatenated result for the first document
#print(cleaned_docs[3])

In [20]:
# Vectorize data.

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

In [21]:
corpus[:2]

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1)],
 [(7, 1), (14, 1), (15, 1), (16, 1), (17, 1), (18, 1), (19, 1)]]

In [22]:
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

In [23]:
# Create Dictionary
id2word = corpora.Dictionary(filtered_docs)

# Create Corpus
texts = filtered_docs

In [24]:
# Human readable format of corpus (term-frequency)
[[(id2word[id], freq) for id, freq in cp] for cp in corpus[:1]]

[[('change', 1),
  ('climate', 1),
  ('climate_change', 1),
  ('continue', 1),
  ('corner', 1),
  ('economic', 1),
  ('emigration', 1),
  ('gang', 1),
  ('hardship', 1),
  ('instability', 1),
  ('many', 1),
  ('political', 1),
  ('spur', 1),
  ('violence', 1)]]

In [44]:
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                           id2word=id2word,
                                           num_topics=10, 
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha='auto',
                                           per_word_topics=True)

In [45]:
# Print the keyword in the 10 topics
from pprint import pprint
pprint(lda_model.print_topics(num_topics=10, num_words=30))
doc_lda = lda_model[corpus]

[(0,
  '0.042*"church" + 0.039*"ha" + 0.026*"violated" + 0.023*"method" + '
  '0.020*"play" + 0.019*"governor" + 0.017*"com" + 0.016*"reality" + '
  '0.016*"protection" + 0.015*"paid" + 0.014*"europe" + 0.014*"hansol" + '
  '0.014*"wooed" + 0.013*"democratic" + 0.013*"article_nytimes" + '
  '0.012*"federal_reserve" + 0.011*"ha_appeared" + 0.011*"biden" + '
  '0.011*"unanimous" + 0.010*"ambassador" + 0.010*"tobago" + 0.010*"nation" + '
  '0.009*"number" + 0.009*"access" + 0.009*"lebron" + 0.009*"car" + '
  '0.009*"giant" + 0.009*"layer" + 0.008*"attitude" + 0.008*"ultimate"'),
 (1,
  '0.086*"american" + 0.042*"light" + 0.042*"offered" + 0.030*"tilted" + '
  '0.024*"veteran" + 0.018*"post" + 0.017*"connor" + 0.015*"position" + '
  '0.014*"looking" + 0.014*"drab" + 0.013*"bed" + 0.013*"reintroduce" + '
  '0.013*"confront" + 0.012*"northwest" + 0.012*"print_monday" + '
  '0.011*"reader_wonder" + 0.010*"immunity" + 0.010*"green" + 0.010*"follows" '
  '+ 0.010*"microsoft" + 0.009*"selected" 

In [46]:
# Compute Perplexity
print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score


Perplexity:  -7.671147479058828


In [47]:
# Compute Coherence Score
coherence_model_lda = CoherenceModel(model=lda_model, texts=docs, dictionary=id2word, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print('\nCoherence Score: ', coherence_lda)


Coherence Score:  0.602715867706146


In [48]:
# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

In [49]:
# Visualize the topics
# Will not be visible if you view the notebook on GitHub
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word)
vis