# In [None]:
# 🕵️ Misinformation Detection - Exploration Notebook

# --- Setup ---
import pandas as pd
from pymongo import MongoClient
import yaml
from collections import Counter
import matplotlib.pyplot as plt
from wordcloud import WordCloud

# --- Load Config ---
# Read the Mongo connection settings (mongo.uri, mongo.db_name) from the
# project's YAML config. The encoding is pinned so parsing does not depend on
# the platform's default locale.
with open("../config/config.yaml", "r", encoding="utf-8") as f:
    config = yaml.safe_load(f)

client = MongoClient(config["mongo"]["uri"])
db = client[config["mongo"]["db_name"]]

# MongoClient connects lazily, so constructing it proves nothing about the
# server. Ping first so the success message is only printed when the
# deployment is actually reachable; an unreachable server raises here.
client.admin.command("ping")
print("✅ MongoDB connected for exploration")

# --- Load Tweets ---
# Fetch every tweet document, projecting only the raw and cleaned text
# (_id is excluded so it does not become a DataFrame column).
tweet_projection = {"text": 1, "cleaned_text": 1, "_id": 0}
tweet_cursor = db.tweets.find({}, tweet_projection)
tweets = list(tweet_cursor)
df_tweets = pd.DataFrame(tweets)

print(f"Loaded {len(df_tweets)} tweets")
# Preview the first rows.
df_tweets.head()

# --- Load News ---
# Fetch every news document, keeping the title, description and cleaned text
# while leaving out _id.
news_projection = {
    "title": 1,
    "description": 1,
    "cleaned_text": 1,
    "_id": 0,
}
news_cursor = db.news.find({}, news_projection)
news = list(news_cursor)
df_news = pd.DataFrame(news)

print(f"Loaded {len(df_news)} news articles")
# Preview the first rows.
df_news.head()

# --- Word Frequency Analysis (Tweets) ---
# Count whitespace-separated tokens across all cleaned tweets and keep the 20
# most frequent ones.
# DataFrame.get falls back to an empty Series when the column is absent (for
# example when the tweets query returned no documents), so this cell yields an
# empty table instead of raising KeyError.
cleaned_tweets = df_tweets.get("cleaned_text", pd.Series(dtype="object")).dropna()
all_words = " ".join(cleaned_tweets)
word_freq = Counter(all_words.split())

top_words = pd.DataFrame(word_freq.most_common(20), columns=["Word", "Frequency"])
print(top_words)

# --- Plot Word Frequencies ---
# Bar chart of the most common tweet words, drawn through the object-oriented
# Matplotlib API; x labels are tilted so neighbouring words stay readable.
freq_fig, freq_ax = plt.subplots(figsize=(10, 5))
freq_ax.bar(top_words["Word"], top_words["Frequency"])
plt.setp(freq_ax.get_xticklabels(), rotation=45)
freq_ax.set_title("Top 20 Words in Tweets")
plt.show()

# --- Word Cloud for News ---
# Join all cleaned news text into one string and render it as a word cloud.
# DataFrame.get falls back to an empty Series when the column is absent (for
# example when the news query returned no documents).
cleaned_news = df_news.get("cleaned_text", pd.Series(dtype="object")).dropna()
all_words_news = " ".join(cleaned_news)

# WordCloud.generate raises ValueError when it is given no words, so only
# render when there is some text; otherwise report it and leave wordcloud unset.
wordcloud = None
if all_words_news.strip():
    wordcloud = WordCloud(width=800, height=400, background_color="white").generate(all_words_news)

    plt.figure(figsize=(12, 6))
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis("off")
    plt.title("News Articles Word Cloud", fontsize=16)
    plt.show()
else:
    print("⚠️ No cleaned news text available; skipping word cloud")
