In [10]:
import pandas as pd
import nltk
from gensim import corpora
from gensim.models import LdaModel
import pyLDAvis.gensim_models
import warnings
from nltk.corpus import stopwords
import string

# Setup
nltk.download('stopwords')
warnings.filterwarnings("ignore", category=DeprecationWarning)

stopwords_eng = set(stopwords.words('english'))

df = pd.read_csv("articles/articles1.csv")

# Assuming 'content' is the column with the article texts
documents = df['content'].dropna().astype(str).tolist()

def preprocess(text):
    tokens = text.lower().split()
    # Remove stopwords
    return [word for word in tokens if word not in stopwords_eng and word.isalpha()]

processed_docs = [preprocess(doc) for doc in documents]

# Create Dictionary and Corpus
dictionary = corpora.Dictionary(processed_docs)
dictionary.filter_extremes(no_below=5, no_above=0.5)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in processed_docs]

# LDA Modeling
lda_model = LdaModel(doc_term_matrix,
                     num_topics=10,  
                     id2word=dictionary,
                     passes=10,
                     random_state=0)

lda_model.save('lda_model.gensim')

for idx, topic in lda_model.print_topics(num_words=10):
    print(f"Topic {idx + 1}: {topic}")


[nltk_data] Downloading package stopwords to /home/toybot/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Topic 1: 0.014*"us" + 0.011*"president" + 0.007*"russian" + 0.007*"security" + 0.006*"intelligence" + 0.006*"military" + 0.006*"obama" + 0.006*"russia" + 0.006*"united" + 0.005*"foreign"
Topic 2: 0.033*"trump" + 0.023*"clinton" + 0.013*"republican" + 0.012*"campaign" + 0.010*"presidential" + 0.010*"hillary" + 0.008*"donald" + 0.008*"democratic" + 0.007*"party" + 0.007*"election"
Topic 3: 0.011*"house" + 0.011*"court" + 0.010*"federal" + 0.008*"law" + 0.008*"president" + 0.008*"senate" + 0.007*"state" + 0.007*"bill" + 0.006*"republicans" + 0.005*"could"
Topic 4: 0.010*"people" + 0.006*"black" + 0.006*"women" + 0.006*"many" + 0.004*"political" + 0.004*"social" + 0.004*"students" + 0.004*"like" + 0.004*"us" + 0.004*"even"
Topic 5: 0.008*"people" + 0.007*"health" + 0.005*"may" + 0.005*"according" + 0.005*"could" + 0.004*"found" + 0.004*"new" + 0.004*"medical" + 0.004*"study" + 0.004*"climate"
Topic 6: 0.010*"company" + 0.009*"business" + 0.009*"new" + 0.007*"million" + 0.005*"like" + 0.005

- Topic 1: Top keywords: us, president, russian, security, intelligence, military, obama, russia, united, foreign 
- Topic 2: Top keywords: trump, clinton, republican, campaign, presidential, hillary, donald, democratic, party, election
- Topic 3: Top keywords: house, court, federal, law, president, senate, state, bill, republicans, could
- Topic 4: Top keywords: people, black, women, many, political, social, students, like, us, even
- Topic 5: Top keywords: people, health, may, according, could, found, new, medical, study, climate
- Topic 6: Top keywords: company, business, new, million, like, companies, money, could, market, billion
- Topic 7: Top keywords: north, people, south, new, could, two, us, city, european, international
- Topic 8: Top keywords: like, first, get, time, new, people, two, going, told, know
- Topic 9: Top keywords: trump, donald, president, news, think, going, people, told, white, new
- Topic 10: Top keywords: police, told, according, two, killed, officers, people, man, isis, authorities

b. When we increase the number of topics from 5 to 10. Theres a improved separation. Toppic becomes more specific. Words are less mixed but still general and mixed.
Less topics will have broader themes but more topics will have more clarity


In [11]:
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
from gensim import corpora

dictionary = corpora.Dictionary(processed_docs)

dictionary.filter_extremes(no_below=5, no_above=0.5)

corpus = [dictionary.doc2bow(doc) for doc in processed_docs]

# Prepare the visualization
pyldavis_vis = gensimvis.prepare(lda_model, corpus, dictionary)


pyLDAvis.display(pyldavis_vis)

- topic 1 - U.S. - rusian  Intelligence and Security

Focuses on geopolitics, intelligence matters, and US–Russia relations—strongly related to international defense and foreign policy.
- topic 2 - 2016 Election Campaign 


U.S. election dynamics, distinguishing it from general Trump-
- topic 3 - Government & Law

Clear focus on government institutions and law-making processes
- topic 4 - Society & Politics

Focus on social groups , with themes around politics
- topic 5 - Health & Research

Uniquely tied to scientific studies and health/environmental research,

- topic 6 - Business and economy

Most financial topic which are centered on startups, corporate news, investments, and economic impact.
- topic 7 - Global Affairs

Reflects geopolitical coverage, possibly about Korea, EU affairs,

- topic 8 - General Public Speech

commonly spoken words suggest interviews, quotes, or subjective reporting
- topic 9 - Trump’s Media 

Unlike Topic 2 (campaign), this one captures ongoing news coverage and public perception of Trump post-election.
- topic 10 - Crime, Policing 

Highly specific to law enforcement incidents, violence, and terrorism