In [81]:
import os

import numpy as np
import requests
from sentence_transformers import SentenceTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [82]:
# NewsAPI credential. It is read from the NEWSAPI_KEY environment variable so the
# real key never has to be stored in (or committed with) the notebook; the
# redacted placeholder is kept only as the fallback value.
API_KEY = os.environ.get("NEWSAPI_KEY", "***REMOVED***")

In [106]:
"""
step 1: fetch_news requests up to `page_size` (default 100) news articles matching `query` (default "AI") from the NewsAPI.
step 2: the language is fixed to English, and the results are sorted by publication date.
step 3: the response is received in JSON format.
step 4: the docs list is created by combining the title and description of each news article into a single string; the meta_info list holds the matching "source | publication date" string for each article.
step 5: when called, the function returns both lists as a tuple: (docs, meta_info).
"""
def fetch_news(query="AI", page_size=100):
    """Fetch English-language articles from NewsAPI's /v2/everything endpoint.

    Args:
        query: Search phrase sent as the ``q`` parameter.
        page_size: Number of articles requested via the ``pageSize`` parameter.

    Returns:
        A ``(docs, meta_info)`` tuple of equal-length, index-aligned lists.
        ``docs[i]`` is the article's title and description joined by a space;
        ``meta_info[i]`` is the matching ``"<source name> | <publishedAt>"``
        string. Both lists are empty when the response JSON carries no
        ``"articles"`` entry.

    Raises:
        requests.exceptions.RequestException: If the request fails or times out.
    """
    # Let requests build the query string so that spaces, "&" and other
    # reserved characters in `query` are URL-encoded instead of corrupting it.
    response = requests.get(
        "https://newsapi.org/v2/everything",
        params={
            "q": query,
            "language": "en",
            "pageSize": page_size,
            "sortBy": "publishedAt",
            "apiKey": API_KEY,
        },
        timeout=10,  # without a timeout a stalled connection blocks forever
    )
    data = response.json()

    docs = []
    meta_info = []  # "<source name> | <publishedAt>" per article, aligned with docs

    for article in data.get("articles") or []:
        # Treat missing or null fields as empty so they never raise and never
        # end up in the text as the literal string "None".
        title = article.get("title") or ""
        description = article.get("description") or ""
        docs.append(f"{title} {description}".strip())

        source = (article.get("source") or {}).get("name") or "Unknown source"
        published = article.get("publishedAt") or "Unknown date"
        meta_info.append(f"{source} | {published}")

    return docs, meta_info

In [101]:
# Download the articles once; docs and meta_info are index-aligned lists.
docs, meta_info = fetch_news(query="AI", page_size=100)
print(
    f"Fetched {len(docs)} news articles.",
    f"Fetched {len(meta_info)} news articles.",
    sep="\n",
)

Fetched 100 news articles.
Fetched 100 news articles.


In [85]:
# embedding - use TF-IDF Vs SBERT

In [51]:
#TF-IDF

In [89]:
# max_features=5000 caps the vocabulary at the 5000 terms with the highest
# frequency across the corpus, which bounds the vector width and saves memory.
# (The recorded run produced only 1309 features, so the cap was not reached.)
tfidf = TfidfVectorizer(max_features=5000)
# Learn the vocabulary from docs and convert each document into a TF-IDF row vector.
tfidf_vectors = tfidf.fit_transform(docs)
# Pairwise cosine similarity between all documents (one row/column per document).
tfidf_sim = cosine_similarity(tfidf_vectors)
print(f"TF-IDF vectors shape: {tfidf_vectors.shape}")

TF-IDF vectors shape: (100, 1309)


In [54]:
#SBERT

In [55]:
from sentence_transformers import SentenceTransformer

In [90]:
# Load the 'all-MiniLM-L6-v2' sentence-embedding model.
sbert_model = SentenceTransformer('all-MiniLM-L6-v2')
# Encode every document into a dense vector (the recorded run shows 384 dimensions each).
sbert_vectors = sbert_model.encode(docs, show_progress_bar=True)
# Pairwise cosine similarity between all documents (one row/column per document).
sbert_sim = cosine_similarity(sbert_vectors)
print(f"SBERT vectors shape: {sbert_vectors.shape}")

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

SBERT vectors shape: (100, 384)


In [57]:
"""
Keypoints :
TF-IDF

Represents each document as a sparse vector based on word occurrence frequency.

High dimensional (thousands of features) and word-focused, so it weakly captures sentence meaning.

SBERT

Dense vector based on sentence-level semantics (usually 384 dimensions).

Effectively captures sentence meaning and is efficient due to its lower dimensionality.
"""

'\nKeypoints :\nTF-IDF\n\nRepresents each document as a sparse vector based on word occurrence frequency.\n\nHigh dimensional (thousands of features) and word-focused, so it weakly captures sentence meaning.\n\nSBERT\n\nDense vector based on sentence-level semantics (usually 384 dimensions).\n\nEffectively captures sentence meaning and is efficient due to its lower dimensionality.\n'

In [66]:
# Comparison of the two similarity matrices: although both matrices have the same shape, the values inside them can differ significantly because the underlying representations differ.

In [91]:
# Top-left 5x5 corner of the TF-IDF similarity matrix, rounded to two decimals.
print(tfidf_sim[:5, :5].round(2))

[[1.   0.13 0.09 0.03 0.05]
 [0.13 1.   0.1  0.01 0.01]
 [0.09 0.1  1.   0.13 0.04]
 [0.03 0.01 0.13 1.   0.12]
 [0.05 0.01 0.04 0.12 1.  ]]


In [92]:
# Top-left 5x5 corner of the SBERT similarity matrix, rounded to two decimals.
print(sbert_sim[:5, :5].round(2))

[[ 1.    0.29  0.12  0.04  0.09]
 [ 0.29  1.    0.33  0.14 -0.  ]
 [ 0.12  0.33  1.    0.32  0.18]
 [ 0.04  0.14  0.32  1.    0.3 ]
 [ 0.09 -0.    0.18  0.3   1.  ]]


In [69]:
#Compare the similarity rankings of other sentences based on a single sentence

In [93]:
def recommend_similar_articles(sim_matrix, article_index, top_n=5):
    """
    Return the indices of the articles most similar to a target article.

    Args:
        sim_matrix: Pairwise similarity matrix (array-like, expected to be
            square) in which row i holds the similarity of article i to
            every article.
        article_index: Row index of the target article. Negative values count
            from the end, as in normal Python/NumPy indexing.
        top_n: Maximum number of indices to return.

    Returns:
        A 1-D NumPy array of article indices ordered by descending similarity.
        The target article itself is never included, and articles with equal
        scores keep their original relative order. The array is shorter than
        top_n when fewer other articles exist.
    """
    sim_scores = np.asarray(sim_matrix)[article_index]

    # Convert a negative index to its positive position; otherwise the
    # self-exclusion filter below would compare against e.g. -1, match
    # nothing, and the target would be recommended to itself.
    if article_index < 0:
        article_index += sim_scores.shape[0]

    # Sort in descending order; a stable sort keeps tied scores deterministic.
    similar_indices = np.argsort(-sim_scores, kind="stable")
    # Exclude the target article itself.
    similar_indices = similar_indices[similar_indices != article_index]
    return similar_indices[:top_n]


In [94]:
# Index into docs / meta_info of the article used as the query for the recommendations.
target_index = 0

In [107]:
# Show the query article together with its source and publication timestamp.
print(
    "Target Article:",
    f"- {docs[target_index]}",
    f"  ({meta_info[target_index]})",
    sep="\n",
)


Target Article:
- inspect-viz 0.2.7 Data visualization for Inspect AI large language model evalutions.
  (Pypi.org | 2025-07-18T12:05:04Z)


In [104]:
# Rank the other articles by TF-IDF cosine similarity to the target and list the top 5.
print("\nTF-IDF based Recommended Articles:")
tfidf_recommendations = recommend_similar_articles(tfidf_sim, target_index, top_n=5)
for idx in tfidf_recommendations:
    # One print per article: the text line followed by its indented metadata line.
    print(f"- {docs[idx]}\n  ({meta_info[idx]})")


TF-IDF 기반 추천 뉴스:
- 'Black Swan' author Nassim Taleb shares 4 life lessons — and reveals what keeps him awake at night "The Black Swan" author Nassim Taleb told BI that discipline, health, and building the right skills are key ingredients for a good life.
  (Business Insider | 2025-07-18T11:33:45Z)
- Beamr Reports Entering PoCs in Video Data Compression Solution for Autonomous Vehicle Herzliya, Israel, July 18, 2025 (GLOBE NEWSWIRE) -- Beamr Imaging Ltd. (NASDAQ: BMR), a leader in video optimization technology and solutions, today announced a further update on its progress of validating Beamr content-adaptive, GPU-accelerated technology to…
  (GlobeNewswire | 2025-07-18T11:21:00Z)
- South Korea Poised to Become Regional Hub for Smart Prefabricated Construction - South Korea Prefabricated Construction Market Intelligence and Future Growth Dynamics Databook South Korea's prefabricated construction market is projected to reach KRW 13 trillion by 2025, growing at a 4.7% annual rate. From 2

In [108]:
# Rank the other articles by SBERT cosine similarity to the target and list the top 5.
print("\nSBERT based Recommended Articles:")
sbert_recommendations = recommend_similar_articles(sbert_sim, target_index, top_n=5)
for idx in sbert_recommendations:
    # One print per article: the text line followed by its indented metadata line.
    print(f"- {docs[idx]}", f"  ({meta_info[idx]})", sep="\n")


SBERT based Recommended Articles:
- 'Black Swan' author Nassim Taleb shares 4 life lessons — and reveals what keeps him awake at night "The Black Swan" author Nassim Taleb told BI that discipline, health, and building the right skills are key ingredients for a good life.
  (Business Insider | 2025-07-18T11:33:45Z)
- Beamr Reports Entering PoCs in Video Data Compression Solution for Autonomous Vehicle Herzliya, Israel, July 18, 2025 (GLOBE NEWSWIRE) -- Beamr Imaging Ltd. (NASDAQ: BMR), a leader in video optimization technology and solutions, today announced a further update on its progress of validating Beamr content-adaptive, GPU-accelerated technology to…
  (GlobeNewswire | 2025-07-18T11:21:00Z)
- South Korea Poised to Become Regional Hub for Smart Prefabricated Construction - South Korea Prefabricated Construction Market Intelligence and Future Growth Dynamics Databook South Korea's prefabricated construction market is projected to reach KRW 13 trillion by 2025, growing at a 4.7% an