In [None]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import matplotlib.pyplot as plt

In [None]:
# `os` and `sys` are needed by this cell and are not imported anywhere
# earlier in the notebook, so bring them into scope here.
import os
import sys

# Put the parent of the current working directory at the front of the
# module search path so that the `src` package imported below can be found.
rpath = os.path.abspath('..')
if rpath not in sys.path:
    sys.path.insert(0, rpath)

from src.loader import NewsDataLoader

# Load the news data only when this code runs as the main module
# (not when it is imported).
if __name__ == "__main__":
    # Data folder, given relative to the current working directory.
    data_directory = "../data"
    loader = NewsDataLoader(data_directory)

    # `merge_df` is the object the later preprocessing, topic-modelling and
    # plotting steps operate on.
    # NOTE(review): presumably a pandas DataFrame with at least the columns
    # 'content', 'source_name' and 'published_at' -- confirm against
    # NewsDataLoader.load_data.
    merge_df = loader.load_data()

In [None]:
import re


# English stop words held in a set; preprocess_text tests every token for
# membership in it.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# available locally -- confirm for fresh environments.
stop_words = set(stopwords.words('english'))
# Single Porter stemmer instance reused by every preprocess_text call.
ps = PorterStemmer()

def preprocess_text(text):
    """Normalise a document into a space-joined string of stemmed tokens.

    The text is lower-cased, every character that is not an ASCII letter,
    digit or whitespace is removed, the remainder is tokenised, tokens found
    in ``stop_words`` are dropped and each surviving token is stemmed with
    the module-level Porter stemmer.

    Args:
        text: Raw document text. Any non-string value (for example ``None``
            or a float ``NaN`` standing in for a missing cell) is treated as
            an empty document.

    Returns:
        The preprocessed text, or ``''`` when ``text`` is not a string.
    """
    # Missing article bodies may arrive as None/NaN; calling .lower() on
    # them would raise AttributeError and abort the whole Series.apply.
    if not isinstance(text, str):
        return ''

    text = text.lower()
    # Punctuation is stripped before tokenising, so tokens are plain
    # alphanumeric runs.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    tokens = word_tokenize(text)
    filtered_tokens = [ps.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(filtered_tokens)

# Clean every article body and store the result in its own column.
merge_df['preprocessed_content'] = merge_df['content'].apply(preprocess_text)

# Topic Modeling
# TF-IDF features over the cleaned text; the vectorizer additionally applies
# scikit-learn's built-in English stop-word list.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(merge_df['preprocessed_content'])

# Ten-topic LDA with a fixed seed for reproducibility. fit() returns the
# estimator itself, so construction and fitting are chained.
lda_model = LatentDirichletAllocation(n_components=10, random_state=42).fit(tfidf_matrix)

# Assign each article its dominant topic.
# The fitted model is never applied to the documents anywhere else, so the
# 'topic' column read below has to be created here. transform() yields one
# row of topic weights per document; argmax picks the strongest topic.
doc_topic_distribution = lda_model.transform(tfidf_matrix)
merge_df['topic'] = doc_topic_distribution.argmax(axis=1)

#  Analyzing Topic Diversity
# Number of distinct dominant topics covered by each news source.
topic_counts = merge_df.groupby('source_name')['topic'].nunique()

# Analyze Topic Trends
# 'topic_count' is used as the colour value of the scatter plot; it is
# defined here as the number of articles that share the row's topic.
merge_df['topic_count'] = merge_df.groupby('topic')['topic'].transform('count')

# Parse publication timestamps so the x-axis is a real time axis instead of
# one categorical tick per distinct string. Unparseable values become NaT;
# utc=True lets inputs with mixed UTC offsets be parsed together.
published_dates = pd.to_datetime(merge_df['published_at'], errors='coerce', utc=True)

# Plot 2D scatter plot
plt.scatter(published_dates, merge_df['topic'], c=merge_df['topic_count'], cmap='viridis')
plt.xlabel('Date')
plt.ylabel('Topics')
plt.title('Topic Trends Over Time')
plt.colorbar(label='Count')
plt.show()