#Used Colab Workbook to consolidate weekly highlights (i.e. recurring themes) through the use of Machine Learning.

#In line with https://www.sigmacomputing.com/blog/tf-idf-definition, I selected Term Frequency-Inverse Document Frequency (TF-IDF) to identify recurring themes based on the frequency of certain phrases (i.e. recurring stories/news reports).

#Output: weekly_headlines.csv file

In [1]:
#Cell 1: Upload and validate space_records.csv data, which was pulled from VS Code / Python.

import pandas as pd
from google.colab import files

uploaded = files.upload()  # upload space_records.csv

df = pd.read_csv("space_records.csv") #upload aggregated data (generated via VS Code)

#Clean data
df["published_date"] = pd.to_datetime(df.get("published_date"), errors="coerce", utc=True)
df["title"] = df.get("title", "").fillna("").astype(str)
df["summary"] = df.get("summary", "").fillna("").astype(str)

df = df.dropna(subset=["published_date"]) #keep rows that can be interpreted
df = df[df["title"].str.len() > 0].copy()

print(f"Loaded {len(df):,} rows from {df}")
print("Date range:", df["published_date"].min(), "→", df["published_date"].max())

display(df[["published_date","source_api","source","event_type","title"]].head(5))

Saving space_records.csv to space_records.csv
Loaded 689 rows from                                                  title  \
0    When allies can’t count on U.S. ISR, commercia...   
1    Space operations will become more dynamic this...   
2    SkyFi raises $12.7 million to scale satellite ...   
3    China’s first launches of 2026 send Yaogan spa...   
4                Indian rocket hits snag during launch   
..                                                 ...   
684  Does Expanding U.S. Defense Work Reshape The B...   
685  X-Bow Completes Preliminary Design Review for ...   
686  SpaceX set for Falcon 9 rocket launch Monday f...   
687  Rocket Lab: We'll wait for Neutron rocket to p...   
688  Israeli forces kill 11 in Gaza hours after hit...   

                                               summary                source  \
0    \nThe global security environment is becoming ...  Spaceflight News API   
1    \nFrom civil science and exploration missions ...  Spaceflight News API

Unnamed: 0,published_date,source_api,source,event_type,title
0,2026-01-14 13:00:00+00:00,spaceflight_news,Spaceflight News API,security_event,"When allies can’t count on U.S. ISR, commercia..."
1,2026-01-14 12:00:00+00:00,spaceflight_news,Spaceflight News API,launch,Space operations will become more dynamic this...
2,2026-01-14 11:30:00+00:00,spaceflight_news,Spaceflight News API,satellite_deployment,SkyFi raises $12.7 million to scale satellite ...
3,2026-01-14 10:46:43+00:00,spaceflight_news,Spaceflight News API,launch,China’s first launches of 2026 send Yaogan spa...
4,2026-01-14 07:08:24+00:00,spaceflight_news,Spaceflight News API,launch,Indian rocket hits snag during launch


In [2]:
#define parameters (i.e. relevant reporting period for recent highlights)

DAYS = 7

period_end = df["published_date"].max()
period_start = period_end - pd.Timedelta(days=DAYS)

df_period = (
    df[(df["published_date"] >= period_start) & (df["published_date"] <= period_end)]
    .copy()
    .sort_values("published_date", ascending=False)
)

print(f"Period: {period_start.date()} → {period_end.date()}  |  Days: {DAYS}")
print(f"Rows in period: {len(df_period):,}")

display(df_period[["published_date","source","event_type","title"]].head(10))


Period: 2026-01-07 → 2026-01-14  |  Days: 7
Rows in period: 274


Unnamed: 0,published_date,source,event_type,title
0,2026-01-14 13:00:00+00:00,Spaceflight News API,security_event,"When allies can’t count on U.S. ISR, commercia..."
515,2026-01-14 13:00:00+00:00,GDELT DOC 2.0,satellite_deployment,SkyFi raises $12 . 7M to turn satellite images...
684,2026-01-14 12:35:53+00:00,Google News,launch,Does Expanding U.S. Defense Work Reshape The B...
516,2026-01-14 12:30:00+00:00,GDELT DOC 2.0,satellite_deployment,SkyFi raises $12 . 7M to turn satellite images...
1,2026-01-14 12:00:00+00:00,Spaceflight News API,launch,Space operations will become more dynamic this...
2,2026-01-14 11:30:00+00:00,Spaceflight News API,satellite_deployment,SkyFi raises $12.7 million to scale satellite ...
603,2026-01-14 11:30:00+00:00,Google News,satellite_deployment,SkyFi raises $12.7 million to scale satellite ...
625,2026-01-14 10:47:40+00:00,Google News,launch,Astroscale UK set for in-orbit satellite refur...
3,2026-01-14 10:46:43+00:00,Spaceflight News API,launch,China’s first launches of 2026 send Yaogan spa...
640,2026-01-14 10:44:25+00:00,Google News,security_event,"Hackers Launch Over 91,000 Attacks on AI Syste..."


In [3]:
# Cell 3 — Group similar articles into "story clusters"

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import AgglomerativeClustering

SIM_THRESHOLD = 0.50  #higher = stricter grouping (try 0.58–0.68)

df_period["ml_text"] = (df_period["title"].fillna("") + " " + df_period["summary"].fillna("")).str.strip() #one record/row

vectorizer = TfidfVectorizer(stop_words="english", max_features=5000)
X = vectorizer.fit_transform(df_period["ml_text"]) #text to TF-IDF

S = cosine_similarity(X)
D = 1 - S  #clustering articles by similarity (cosine is the distance)

clusterer = AgglomerativeClustering(
    metric="precomputed",
    linkage="average",
    distance_threshold=1 - SIM_THRESHOLD,
    n_clusters=None
)

df_period["story_cluster_id"] = clusterer.fit_predict(D)

print("Articles:", len(df_period))
print("Story clusters:", df_period["story_cluster_id"].nunique())
df_period["story_cluster_id"].value_counts().head(10)

Articles: 274
Story clusters: 237


Unnamed: 0_level_0,count
story_cluster_id,Unnamed: 1_level_1
6,5
16,4
26,3
12,3
14,3
5,2
27,2
53,2
1,2
8,2


In [4]:
# Cell 4 — One representative headline per cluster (simple)

# Pick the most recent article in each cluster as the "headline"
headlines = (
    df_period.sort_values("published_date", ascending=False)
    .groupby("story_cluster_id", as_index=False)
    .first()
)

# Add cluster size (how many articles were grouped into that story)
cluster_sizes = df_period["story_cluster_id"].value_counts().rename("article_count")
headlines = headlines.merge(cluster_sizes, left_on="story_cluster_id", right_index=True)

# Keep only the columns we need
headlines_out = headlines[[
    "story_cluster_id",
    "published_date",
    "source",
    "title",
    "raw_source",
    "event_type",
    "is_security_related",
    "article_count"
]].rename(columns={
    "published_date": "published_max",
    "title": "rep_title",
    "raw_source": "rep_url"
}).sort_values(["article_count","published_max"], ascending=False)

print("Headlines rows:", len(headlines_out))
display(headlines_out.head(30))

headlines_out.to_csv("weekly_headlines.csv", index=False)
print("Saved weekly_headlines.csv")


Headlines rows: 237


Unnamed: 0,story_cluster_id,published_max,source,rep_title,rep_url,event_type,is_security_related,article_count
6,6,2026-01-13 13:37:45+00:00,Google News,Eutelsat orders 340 new OneWeb LEO satellites ...,https://news.google.com/rss/articles/CBMioAFBV...,satellite_deployment,False,5
16,16,2026-01-08 09:35:42+00:00,Google News,NordSpace’s Atlantic Spaceport Complex (ASX) R...,https://news.google.com/rss/articles/CBMigAJBV...,launch,False,4
26,26,2026-01-14 00:35:35+00:00,Spaceflight News API,Live coverage: SpaceX to launch midweek Starli...,35353,launch,False,3
12,12,2026-01-09 00:14:13+00:00,Spaceflight News API,NASA Crew-11 Will Return Early Due to Medical ...,35233,launch,False,3
14,14,2026-01-08 14:48:45+00:00,Google News,Northrop Grumman Secures $94M Navy Contract fo...,https://news.google.com/rss/articles/CBMifkFVX...,launch,False,3
52,52,2026-01-14 13:00:00+00:00,GDELT DOC 2.0,SkyFi raises $12 . 7M to turn satellite images...,https://finance.yahoo.com/news/skyfi-raises-12...,satellite_deployment,False,2
17,17,2026-01-14 11:30:00+00:00,Spaceflight News API,SkyFi raises $12.7 million to scale satellite ...,35381,satellite_deployment,False,2
21,21,2026-01-14 08:39:59+00:00,Google News,Daily Report - Air & Space Forces Magazine,https://news.google.com/rss/articles/CBMiZ0FVX...,policy_or_corporate,False,2
69,69,2026-01-14 07:08:24+00:00,Spaceflight News API,NASA Back for Seconds with New Food System Des...,35372,launch,False,2
5,5,2026-01-13 23:13:09+00:00,Spaceflight News API,NASA to Provide Live Coverage of Crew-11 Retur...,35350,launch,False,2


Saved weekly_headlines.csv
