In [None]:
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

In [None]:
# os and sys are needed for the path handling below and are not imported
# anywhere else in this file.
import os
import sys

# Put the parent directory on sys.path so that the `src` package imported
# just below can be resolved (presumably the project root -- confirm against
# the repository layout).
rpath = os.path.abspath('..')
if rpath not in sys.path:
    sys.path.insert(0, rpath)

from src.loader import NewsDataLoader

if __name__ == "__main__":
    data_directory = "../data"
    loader = NewsDataLoader(data_directory)

    # NOTE(review): later statements in this file use `merge_df`
    # unconditionally, so they rely on this guarded branch having run.
    merge_df = loader.load_data()

In [None]:

# Shared resources read by preprocess_text: one stemmer instance and the
# NLTK English stop-word list, held as a set for membership tests.
ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalise raw text into a space-joined string of stemmed tokens.

    Steps, in order: lowercase the text, delete every character that is not
    an ASCII letter, digit or whitespace, tokenise with ``word_tokenize``,
    drop tokens found in the module-level ``stop_words`` set, and stem the
    remaining tokens with the module-level ``ps`` stemmer.

    Args:
        text: The text to clean. Any non-string value (for example ``None``
            or a float NaN standing in for a missing value) is treated as
            empty text.

    Returns:
        The processed tokens joined by single spaces; ``''`` when ``text``
        is not a string.
    """
    # Missing values are not strings and have no .lower(); map them to an
    # empty document instead of raising AttributeError.
    if not isinstance(text, str):
        return ''

    lowered = text.lower()

    # Punctuation is deleted rather than replaced by a space, so "don't"
    # becomes "dont".
    cleaned = re.sub(r'[^a-zA-Z0-9\s]', '', lowered)

    tokens = word_tokenize(cleaned)
    # Stop words are filtered on the unstemmed token, then survivors are stemmed.
    filtered_tokens = [ps.stem(word) for word in tokens if word not in stop_words]
    return ' '.join(filtered_tokens)

# Clean every article body with preprocess_text and keep the result in a new column.
merge_df['preprocessed_content'] = merge_df['content'].apply(preprocess_text)

# Build the TF-IDF document-term matrix from the cleaned text.
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(merge_df['preprocessed_content'])

#  Clustering
# NOTE(review): 10 is a hand-picked value; nothing in this file validates it.
num_clusters = 10  # Choose an appropriate number of clusters
# n_init is passed explicitly because scikit-learn changed its default
# (10 before version 1.4, 'auto' from 1.4 on). Pinning it keeps the
# clustering obtained with random_state=42 the same across versions and
# avoids the FutureWarning emitted by 1.2/1.3.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
kmeans.fit(tfidf_matrix)

# One cluster label per row, in the same row order as the fitted matrix.
merge_df['cluster'] = kmeans.labels_

# Number of distinct cluster labels; each cluster is treated as one "event".
num_events = merge_df['cluster'].nunique()
print("Number of events:", num_events)

# Earliest `published_at` value for each `source_name`, sorted ascending.
# NOTE(review): the minimum is taken over all of a source's articles and does
# not look at the `cluster` column, so this ranks sources by their first
# article overall rather than per event -- confirm this is the intent.
earliest_reporting = (
    merge_df.groupby('source_name')['published_at']
    .min()
    .sort_values()
)
print("News sites reporting events earliest:\n", earliest_reporting)

# Number of articles assigned to each cluster label.
event_counts = merge_df['cluster'].value_counts()
print("Event reporting frequency:\n", event_counts)

# Bar chart of the number of articles per cluster label.
fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(event_counts.index, event_counts.values, color='skyblue')
ax.set_xlabel('Event Cluster')
ax.set_ylabel('Number of Articles')
ax.set_title('Event Reporting Frequency')
plt.show()