In [3]:
news_src_path="../newscrawler/data/usatoday_articles_20250531.json"

In [4]:
import json
from datetime import datetime

import pandas as pd
import spacy
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer

# 1. Load your JSON data
with open(news_src_path, 'r', encoding='utf-8') as f:
    articles = json.load(f)

# 2. Filter for USA Today
usadata = [a for a in articles if a['source_domain'] == 'usatoday.com']

# 3. Prepare text & dates
texts = [a['text'] for a in usadata]
dates = [datetime.fromisoformat(a['date_published'].replace('Z','')) for a in usadata]

# 4. Function to print top words per topic
def print_topics(model, feature_names, top_n=10):
    for idx, topic in enumerate(model.components_):
        top_features = [feature_names[i] for i in topic.argsort()[:-top_n - 1:-1]]
        print(f"Topic {idx}: {', '.join(top_features)}")



In [5]:
# ─────────────────────────────────────────────────────────
# 5. FULL-PERIOD LDA (all USA Today articles)
vectorizer = CountVectorizer(
    max_df=0.9,
    min_df=2,
    stop_words='english'
)
dtm = vectorizer.fit_transform(texts)
lda = LatentDirichletAllocation(
    n_components=10,
    random_state=42
)
lda.fit(dtm)

print("=== Overall Topics for USA Today (Oct ’23–May ’25) ===")
print_topics(lda, vectorizer.get_feature_names_out(), top_n=8)




=== Overall Topics for USA Today (Oct ’23–May ’25) ===
Topic 0: taylor, wilson, today, usa, trump, just, really, people
Topic 1: trump, said, trial, cohen, read, aysha, new, bagchi
Topic 2: israel, said, gaza, hamas, israeli, war, military, killed
Topic 3: students, university, said, jewish, campus, school, college, student
Topic 4: said, biden, house, president, israel, trump, ukraine, war
Topic 5: trump, said, president, biden, people, donald, just, like
Topic 6: 2024, news, year, tiktok, new, usa, time, story
Topic 7: trump, biden, said, harris, president, voters, democratic, campaign
Topic 8: carter, year, el, said, church, en, time, la
Topic 9: said, police, today, usa, post, people, new, according


In [6]:
# ─────────────────────────────────────────────────────────
# 7. NAMED ENTITY RECOGNITION (NER)
nlp = spacy.load('en_core_web_sm')  # install with: python -m spacy download en_core_web_sm

ner_data = []
for art in usadata:
    doc = nlp(art['text'])
    ents = [ent.text for ent in doc.ents if ent.label_ in ('PERSON','ORG','GPE')]
    ner_data.append({
        'title': art['title'],
        'entities': ents
    })

df_ner = pd.DataFrame(ner_data)
print("\n=== Sample NER Output ===")
print(df_ner.head())


=== Sample NER Output ===
                                               title  \
0  International Atomic Energy Agency report spel...   
1  Trump says he wants foreign students to study ...   
2  Oreo maker sues Aldi over alleged copycat cook...   
3  How to follow the Israel-Hamas war with USA TODAY   
4  US proposes 60-day ceasefire for Gaza; hostage...   

                                            entities  
0  [VIENNA, Reuters, Iran, U.N., Reuters, Board o...  
1  [WASHINGTON, Donald Trump, White House, the Un...  
2  [Oreos, Chips Ahoy, Ritz, Aldi, Mondelēz Inter...  
3  [USA TODAY's, Israel-Hamas War, Israel, Hamas,...  
4  [Reuters, U.S., Gaza, Reuters, U.S., Donald Tr...  


In [7]:
df_ner

Unnamed: 0,title,entities
0,International Atomic Energy Agency report spel...,"[VIENNA, Reuters, Iran, U.N., Reuters, Board o..."
1,Trump says he wants foreign students to study ...,"[WASHINGTON, Donald Trump, White House, the Un..."
2,Oreo maker sues Aldi over alleged copycat cook...,"[Oreos, Chips Ahoy, Ritz, Aldi, Mondelēz Inter..."
3,How to follow the Israel-Hamas war with USA TODAY,"[USA TODAY's, Israel-Hamas War, Israel, Hamas,..."
4,US proposes 60-day ceasefire for Gaza; hostage...,"[Reuters, U.S., Gaza, Reuters, U.S., Donald Tr..."
...,...,...
3166,Timeline of conflict: Why the 2023 Israeli-Pal...,"[Israel, Gaza, Hamas, Israel, Benjamin Netanya..."
3167,Simone Biles vault final shows athlete safety ...,"[Belgium, Simone Biles, Laurent Landi, Yurchen..."
3168,"Israel updates: 'We are at war,' Israel's Neta...","[Israel, Hamas, Israel, Israel, Benjamin Netan..."
3169,Mini-skirts and hijabs: After a rights crackdo...,"[Cadillac, Ray Ban, Iran, U.S., CIA, Iran, the..."


In [8]:
print(df_ner.loc[0, 'entities'])


['VIENNA', 'Reuters', 'Iran', 'U.N.', 'Reuters', 'Board of Governors', 'the United States', 'Britain', 'France', 'Germany', 'Iran', 'Iran', 'Tehran', 'Washington', 'IAEA', 'Iran', 'Iran', 'Tehran', 'the Board of Governors', 'Tehran', 'IAEA', 'Iran', 'IAEA', 'IAEA', 'IAEA', 'Iran', 'Turquzabad', 'Agency', 'Iran', 'Iran', 'Lavisan-Shian', 'Varamin', 'Turquzabad', 'Lavisan-Shian', 'Tehran', 'Iran', 'the U.N. Security Council', 'IAEA', 'Iran', 'the United States', 'IAEA', 'Iran', 'IAEA', 'IAEA', 'Israel', 'Iran', 'IAEA', 'Tehran', 'Benjamin Netanyahu', 'Iran', 'U.S.', 'IAEA', 'Iran', 'Iran', 'Abbas Araqchi', 'U.S.', 'Tehran', 'Washington', 'Tehran', 'Araqchi', 'Iran', 'U.S.', 'Iran', 'Washington', 'Tehran', 'Francois Murphy', 'Menna Alaa El-Din', 'Cairo', 'Parisa Hafezi', 'Dubai', 'Toby Chopra', 'Frances Kerry']


In [9]:
df_ner['entity_count'] = df_ner['entities'].str.len()
print(df_ner[['title','entity_count']].head())


                                               title  entity_count
0  International Atomic Energy Agency report spel...            72
1  Trump says he wants foreign students to study ...            15
2  Oreo maker sues Aldi over alleged copycat cook...            62
3  How to follow the Israel-Hamas war with USA TODAY            13
4  US proposes 60-day ceasefire for Gaza; hostage...            68


In [10]:
from collections import Counter
all_ents = [e for ents in df_ner['entities'] for e in ents]
freq = Counter(all_ents)
print(freq.most_common(1000))



[('Israel', 12511), ('Trump', 10374), ('Gaza', 7948), ('Hamas', 7172), ('Biden', 6692), ('U.S.', 5640), ('Taylor Wilson', 2992), ('Donald Trump', 2522), ('Harris', 2371), ('Iran', 2248), ('USA TODAY', 2009), ('Joe Biden', 1834), ('−', 1524), ('Netanyahu', 1381), ('Ukraine', 1373), ('House', 1367), ('Congress', 1312), ('the United States', 1279), ('Excerpt', 1191), ('America', 1174), ('Senate', 1144), ('Hezbollah', 1028), ('New York', 959), ('US', 925), ('GOP', 924), ('Reuters', 923), ('Washington', 899), ('the White House', 852), ('Kamala Harris', 851), ('Russia', 841), ('Lebanon', 839), ('Michigan', 809), ('Florida', 764), ('Rafah', 691), ('U.N.', 680), ('White House', 666), ('Benjamin Netanyahu', 634), ('Egypt', 619), ('Pennsylvania', 608), ('TikTok', 548), ('Syria', 538), ('Harvard', 531), ('California', 513), ('Haley', 509), ('Texas', 505), ('Carter', 499), ('China', 494), ('Columbia', 490), ('Georgia', 487), ('Chicago', 478), ('the Gaza Strip', 472), ('the West Bank', 453), ('John

In [11]:
# Pro-Israel entities
israel_entities = {
    "Israel",
    "Netanyahu",
    "Benjamin Netanyahu",
    "Jerusalem"
}

# Pro-Palestinian entities
palestine_entities = {
    "Gaza",
    "the Gaza Strip",
    "West Bank",
    "Palestine",
    "Hamas",
    "Hezbollah",
    "Rafah"
}

In [12]:
# Example bias function
def ner_bias(ents):
    ni = sum(e in israel_entities    for e in ents)
    np = sum(e in palestine_entities for e in ents)
    total = ni + np
    return 0.0 if total == 0 else (ni - np) / total

# Apply to your DataFrame
df_ner['entity_bias'] = df_ner['entities'].apply(ner_bias)

# Inspect a sample
print(df_ner[['title','entity_bias']])
print("Overall entity bias:", df_ner['entity_bias'].mean())


                                                  title  entity_bias
0     International Atomic Energy Agency report spel...     1.000000
1     Trump says he wants foreign students to study ...     0.000000
2     Oreo maker sues Aldi over alleged copycat cook...     0.000000
3     How to follow the Israel-Hamas war with USA TODAY    -0.200000
4     US proposes 60-day ceasefire for Gaza; hostage...    -0.277778
...                                                 ...          ...
3166  Timeline of conflict: Why the 2023 Israeli-Pal...     0.152941
3167  Simone Biles vault final shows athlete safety ...     1.000000
3168  Israel updates: 'We are at war,' Israel's Neta...     0.414634
3169  Mini-skirts and hijabs: After a rights crackdo...     1.000000
3170                  Horoscopes Today, October 7, 2023     0.000000

[3171 rows x 2 columns]
Overall entity bias: 0.010724540326024296


In [14]:
df_ner[['title','entities','entity_bias']].to_csv('articles_entities.csv', index=False)



In [None]:
print("\n=== May 2025 Topics ===")
for topic_idx, topic in enumerate(lda_may.components_):
    top_features = [feature_names[i] 
                    for i in topic.argsort()[:-n_top_words - 1:-1]]
    print(f"Topic {topic_idx}: {', '.join(top_features)}")



=== May 2025 Topics ===
Topic 0: trump, said, president, students, harvard, federal, administration, right
Topic 1: combs, said, fine, ventura, taylor, wilson, today, just
Topic 2: pope, francis, said, church, cardinals, leo, new, world
Topic 3: said, 500, rodriguez, museum, indianapolis, jewish, indy, shooting
Topic 4: said, israel, gaza, israeli, hamas, trump, war, people
