In [None]:
from gnews import GNews
import newspaper
from googlenewsdecoder import gnewsdecoder


# Publisher domains that should be left out of the search results.
excluded_sites = [
    'www.hindustantimes.com',
    'www.livemint.com',
    'www.reuters.com',
    'www.timesnownews.com',
]

# GNews client: English / US edition, period '1d', no explicit date range,
# at most 20 results per query.
google_news = GNews(
    language='en', country='US',
    period='1d', start_date=None, end_date=None,
    max_results=20,
    exclude_websites=excluded_sites,
)

# Headlines for the POLITICS topic; each entry is later read via its 'url' key.
top_news = google_news.get_news_by_topic("POLITICS")

In [None]:
# Run every headline's URL through gnewsdecoder and collect the ones that
# decode successfully. Failures are reported and skipped, never raised.
article_urls = []
for news in top_news:
    try:
        decoded_url = gnewsdecoder(news['url'])
        if not decoded_url.get("status"):
            # The decoder signalled failure; show its message and move on.
            print("Error:", decoded_url["message"])
            continue
        print("Decoded URL:", decoded_url["decoded_url"])
        article_urls.append(decoded_url['decoded_url'])
    except Exception as e:
        # Anything unexpected (missing key, decoder error, ...) is logged
        # so that one bad entry does not abort the whole loop.
        print(f"Error occurred: {e}")

In [None]:
from newspaper.mthreading import fetch_news 

# Fetch the articles behind the decoded URLs, asking fetch_news for 4 threads.
#
# `results` is initialised up front so that a failed fetch leaves it defined
# (as an empty list) instead of undefined or, on a re-run of this cell,
# silently holding the stale value from an earlier successful run.
results = []
try:
    results = fetch_news(article_urls, threads=4)
except Exception as exc:
    # Catch Exception rather than using a bare `except:` so Ctrl-C
    # (KeyboardInterrupt) still interrupts the cell, and report what actually
    # went wrong instead of an uninformative "ERROR".
    print(f"ERROR: {type(exc).__name__}: {exc}")

In [None]:
# Quick sanity check of what was fetched: for each article print its URL,
# its title, the first 300 characters of `text_cleaned`, and a divider line.
for article in results:
    snippet = article.text_cleaned[:300]
    print(article.url, article.title, snippet, '-' * 20, sep='\n')

In [None]:
# Titles of the fetched articles, index-aligned with `results`.
article_titles = [article.title for article in results]

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx
import numpy as np

# 1. Load the pretrained sentence-embedding model used for all encodings below
embedding_model_name = 'all-mpnet-base-v2'
model = SentenceTransformer(embedding_model_name)

# 2. Turn an article into an embedding
def get_article_embedding(article, max_chars=400):
    """Encode an article's title plus the start of its body.

    Args:
        article: Object exposing string attributes ``title`` and ``text``.
        max_chars: Number of leading characters of ``article.text`` to keep.

    Returns:
        Whatever the module-level ``model.encode`` returns for the string
        ``"<title> <first max_chars characters of text>"``.
    """
    body_start = article.text[:max_chars]
    # Title first, then the truncated body, separated by a single space.
    return model.encode(" ".join((article.title, body_start)))

# One embedding per fetched article, in the same order as `results`.
embeddings = [get_article_embedding(article) for article in results]

# 3. Compute cosine similarity matrix
# cosine_similarity raises ValueError when given no samples, so when nothing
# was fetched fall back to an empty 0x0 matrix; the rest of the cell then
# simply produces an empty graph and prints no clusters.
if embeddings:
    similarity_matrix = cosine_similarity(embeddings)
else:
    print("No articles to cluster.")
    similarity_matrix = np.zeros((0, 0))

# 4. Create graph based on similarity threshold
# NOTE(review): 0.655 looks like a hand-tuned cut-off - confirm before reuse.
threshold = 0.655
G = nx.Graph()

# Add one node per article (node id == index into `results`)
G.add_nodes_from(range(len(results)))

# Add an edge for every unordered pair (i < j) whose similarity exceeds the
# threshold; only the upper triangle is scanned, so each pair is visited once.
G.add_edges_from(
    (i, j)
    for i in range(len(results))
    for j in range(i + 1, len(results))
    if similarity_matrix[i][j] > threshold
)

# 5. Find connected components (clusters)
# Articles end up in the same cluster when a chain of above-threshold edges
# links them, even if the two are not directly similar to each other.
components = list(nx.connected_components(G))

# 6. Print clusters
for idx, cluster in enumerate(components):
    print(f"\nCluster {idx + 1}:")
    for i in sorted(cluster):
        print(f"  - {article_titles[i]}")


In [None]:
import matplotlib.pyplot as plt

# Rebuild the similarity graph, this time storing each pair's similarity as
# the edge attribute `weight`.
# NOTE(review): nx.spring_layout reads the `weight` edge attribute by default,
# so more similar articles should be drawn closer together - confirm against
# the installed networkx version.
G = nx.Graph()
G.add_nodes_from(range(len(results)))

for i in range(len(results)):
    for j in range(i + 1, len(results)):
        if similarity_matrix[i][j] > threshold:
            G.add_edge(i, j, weight=similarity_matrix[i][j])

# Find connected components (clusters) and map node -> cluster index
components = list(nx.connected_components(G))
node_colors = {}
for idx, comp in enumerate(components):
    for node in comp:
        node_colors[node] = idx

# Colour value per node, in node order. Every node belongs to exactly one
# connected component, so the lookup cannot miss.
color_list = [node_colors[i] for i in range(len(results))]

# Plot the graph (fixed seed keeps the layout reproducible between runs)
plt.figure(figsize=(12, 8))
pos = nx.spring_layout(G, seed=42, k=0.8)

nx.draw_networkx_nodes(G, pos, node_size=500, cmap=plt.cm.tab10,
                       node_color=color_list, alpha=0.9)
nx.draw_networkx_edges(G, pos, alpha=0.3)

# Short node labels: titles longer than `max_label_chars` are cut and marked
# with "..."; shorter titles are shown unchanged so the ellipsis only appears
# when text was actually dropped.
max_label_chars = 30
labels = {}
for node in G.nodes:
    title = article_titles[node]
    if len(title) > max_label_chars:
        labels[node] = title[:max_label_chars] + "..."
    else:
        labels[node] = title
nx.draw_networkx_labels(G, pos, labels=labels, font_size=8)

plt.title("News Article Similarity Graph", fontsize=14)
plt.axis('off')
plt.show()