In [2]:
import pandas as pd

import os

cols = ["newsID", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"]

# Location of news.tsv. The default is the path this notebook was originally run
# with; set the NEWS_TSV_PATH environment variable to point at the file on another
# machine instead of editing this cell.
NEWS_TSV_PATH = os.environ.get(
    "NEWS_TSV_PATH",
    r"C:\Users\rshaw\Desktop\EC Utbildning - Data Science\Thesis\Agentic_AI_News_Editor project\agentic_ai_editor_project\train_data\news.tsv",
)

# The file is tab-separated and is read without a header row, so the column
# names are supplied explicitly.
news_df = pd.read_csv(NEWS_TSV_PATH, sep="\t", header=None, names=cols)

# Preview first few rows
display(news_df.head())

Unnamed: 0,newsID,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N88753,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N45436,news,newsscienceandtechnology,Walmart Slashes Prices on Last-Generation iPads,Apple's new iPad releases bring big deals on l...,https://assets.msn.com/labs/mind/AABmf2I.html,"[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ...","[{""Label"": ""IPad"", ""Type"": ""J"", ""WikidataId"": ..."
2,N23144,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
3,N86255,health,medical,Dispose of unwanted prescription drugs during ...,,https://assets.msn.com/labs/mind/AAISxPN.html,"[{""Label"": ""Drug Enforcement Administration"", ...",[]
4,N93187,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."


In [3]:
# Missing values per column in the raw frame
print(news_df.isna().sum())


newsID                  0
category                0
subcategory             0
title                   0
abstract             5415
url                     0
title_entities          3
abstract_entities       6
dtype: int64


In [4]:
# Drop rows with a missing abstract or a missing entity column.
# The explicit .copy() hands later cells an independent frame, so adding columns
# to it cannot trigger pandas' SettingWithCopyWarning.
news_df = news_df.dropna(subset=["abstract", "title_entities", "abstract_entities"]).copy()


In [5]:
# Re-check missing values after the drop
missing_per_column = news_df.isna().sum()
print(missing_per_column)


newsID               0
category             0
subcategory          0
title                0
abstract             0
url                  0
title_entities       0
abstract_entities    0
dtype: int64


In [6]:
# How long are the titles and abstracts?
def _word_count(text):
    """Number of whitespace-separated tokens in `text`."""
    return len(text.split())

for _source_col, _length_col in (("title", "title_length"), ("abstract", "abstract_length")):
    news_df[_length_col] = news_df[_source_col].map(_word_count)

length_stats = news_df[["title_length", "abstract_length"]].describe()
print(length_stats)


       title_length  abstract_length
count  96106.000000     96106.000000
mean      10.687512        38.282594
std        3.243870        26.192824
min        1.000000         1.000000
25%        9.000000        17.000000
50%       10.000000        27.000000
75%       13.000000        67.000000
max       48.000000       474.000000


In [7]:
category_col = news_df["category"]

# Absolute number of articles per category
counts = category_col.value_counts()

# Share of each category as a one-decimal percentage string
percs = (category_col.value_counts(normalize=True) * 100).round(1).astype(str) + '%'

category_summary = pd.concat([counts, percs], axis=1, keys=['count', 'percentage'])
display(category_summary)


Unnamed: 0_level_0,count,percentage
category,Unnamed: 1_level_1,Unnamed: 2_level_1
sports,29625,30.8%
news,29363,30.6%
finance,5777,6.0%
travel,4605,4.8%
video,4562,4.7%
foodanddrink,4319,4.5%
lifestyle,4255,4.4%
weather,3820,4.0%
health,2815,2.9%
autos,2756,2.9%


In [8]:
# Flag rows whose entity string is anything other than the literal "[]"
for _entity_col in ("title_entities", "abstract_entities"):
    news_df[f"has_{_entity_col}"] = news_df[_entity_col].ne("[]")

# The mean of a boolean column is the fraction of rows where the flag is True
entity_flag_cols = ["has_title_entities", "has_abstract_entities"]
display(news_df[entity_flag_cols].mean())

has_title_entities       0.743627
has_abstract_entities    0.791220
dtype: float64

In [9]:
# Look at a few non-empty rows.
# random_state pins the draw so the cell shows the same five rows on every run.
sample = news_df[news_df["title_entities"] != "[]"].sample(5, random_state=42)
print(sample["title_entities"].values)


['[{"Label": "Tesla Autopilot", "Type": "U", "WikidataId": "Q27150149", "Confidence": 1.0, "OccurrenceOffsets": [0], "SurfaceForms": ["Tesla Autopilot"]}]'
 '[{"Label": "Pittsburgh", "Type": "G", "WikidataId": "Q1342", "Confidence": 1.0, "OccurrenceOffsets": [0], "SurfaceForms": ["Pittsburgh"]}, {"Label": "Community development", "Type": "C", "WikidataId": "Q718998", "Confidence": 0.987, "OccurrenceOffsets": [39], "SurfaceForms": ["Community Development"]}]'
 '[{"Label": "Council Bluffs, Iowa", "Type": "G", "WikidataId": "Q695565", "Confidence": 1.0, "OccurrenceOffsets": [0], "SurfaceForms": ["Council Bluffs"]}]'
 '[{"Label": "Boston Bruins", "Type": "O", "WikidataId": "Q194121", "Confidence": 0.997, "OccurrenceOffsets": [0], "SurfaceForms": ["Bruins"]}]'
 '[{"Label": "New England Patriots", "Type": "O", "WikidataId": "Q193390", "Confidence": 1.0, "OccurrenceOffsets": [22], "SurfaceForms": ["Patriots"]}, {"Label": "Stephon Gilmore", "Type": "P", "WikidataId": "Q3973224", "Confidence": 

In [10]:
import ast

import json

# Look at just one non-empty row (random_state makes the pick repeatable)
sample = news_df[news_df["title_entities"] != "[]"]["title_entities"].sample(1, random_state=42).values[0]

# Parse it. The sampled values printed above are JSON arrays, so use the JSON
# parser; it also accepts null/true/false, which ast.literal_eval rejects.
parsed = json.loads(sample)

# Get all keys from one example entity
print(parsed[0].keys())

dict_keys(['Label', 'Type', 'WikidataId', 'Confidence', 'OccurrenceOffsets', 'SurfaceForms'])


3

In [11]:
def extract_all_types(entity_col):
    """Collect the distinct entity "Type" values found in a column of entity lists.

    Parameters
    ----------
    entity_col : pandas.Series
        Each value is expected to be a string holding a list of entity dicts,
        written either as JSON or as a Python literal. Missing values,
        non-string values, the empty list "[]" and strings that cannot be
        parsed are skipped.

    Returns
    -------
    set
        Every value stored under the "Type" key of any entity dict in the column.
    """
    import ast
    import json

    types = set()
    for item in entity_col.dropna():
        if not isinstance(item, str) or item.strip() == "[]":
            continue
        try:
            # Try JSON first: literal_eval raises on the JSON-only tokens
            # null/true/false, which would silently drop otherwise valid rows.
            parsed = json.loads(item)
        except ValueError:
            try:
                # Fall back to Python-literal syntax (e.g. single-quoted strings)
                parsed = ast.literal_eval(item)
            except (ValueError, SyntaxError):
                continue
        if not isinstance(parsed, list):
            continue
        for ent in parsed:
            if isinstance(ent, dict) and "Type" in ent:
                types.add(ent["Type"])
    return types

# Now run it on both entity columns
title_entity_types, abstract_entity_types = (
    extract_all_types(news_df[column])
    for column in ("title_entities", "abstract_entities")
)

print(f"Title entity types: {title_entity_types}")
print(f"Abstract entity types: {abstract_entity_types}")

Title entity types: {'S', 'G', 'Q', 'M', 'U', 'R', 'O', 'C', 'B', 'E', 'V', 'H', 'Y', 'W', 'L', 'J', 'P', 'N', 'I', 'A', 'F', 'K'}
Abstract entity types: {'S', 'G', 'Q', 'M', 'U', 'R', 'O', 'C', 'B', 'E', 'V', 'H', 'Y', 'W', 'L', 'J', 'P', 'N', 'I', 'A', 'F', 'K'}


In [39]:
# Normalise both text columns to lower case
for _text_col in ("title", "abstract"):
    news_df[_text_col] = news_df[_text_col].str.lower()


In [40]:
from bs4 import BeautifulSoup

def strip_html(text):
    """Return `text` with markup removed by BeautifulSoup's "html.parser".

    Missing values (as judged by pandas) yield an empty string.
    """
    if pd.isna(text):
        return ""
    soup = BeautifulSoup(text, "html.parser")
    return soup.get_text()

# Remove markup from both text columns
for _text_col in ("title", "abstract"):
    news_df[_text_col] = news_df[_text_col].apply(strip_html)


In [41]:
# Save cleaned dataset (only the columns listed here, without the row index)
clean_columns = ["title", "abstract", "title_entities", "category"]
news_df.loc[:, clean_columns].to_csv("articles_clean.csv", index=False)

print("✅ Cleaned data saved as 'articles_clean.csv'")

✅ Cleaned data saved as 'articles_clean.csv'


In [44]:
# --- 2. Build Embeddings with Checkpoints ---

import os
import numpy as np
from sentence_transformers import SentenceTransformer

# Load cleaned data
articles_df = pd.read_csv("articles_clean.csv")

# Prepare text for embedding (title + abstract combined).
# An empty string written to CSV is read back as NaN, and NaN + str stays NaN,
# so fill missing parts first: every entry handed to the model is then a string.
articles_df["text_for_embedding"] = (
    articles_df["title"].fillna("") + " " + articles_df["abstract"].fillna("")
)

# Load embedding model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Parameters for batching and checkpointing
texts = articles_df["text_for_embedding"].tolist()
batch_size = 128
checkpoint_every = 10000  # Save every 10,000 articles
save_dir = "embedding_checkpoints"
# Reuse chunks already on disk so an interrupted run can resume. Set to False
# (or delete save_dir) to force a full re-encode, e.g. after the data changed.
reuse_checkpoints = True

# Make checkpoint directory if not exists
os.makedirs(save_dir, exist_ok=True)

# Encode and save in chunks
n_total = len(texts)
all_embeddings = []

for start_idx in range(0, n_total, checkpoint_every):
    end_idx = min(start_idx + checkpoint_every, n_total)
    checkpoint_path = os.path.join(save_dir, f"embeddings_{start_idx}_{end_idx}.npy")

    batch_embeddings = None
    if reuse_checkpoints and os.path.exists(checkpoint_path):
        try:
            cached = np.load(checkpoint_path)
        except (OSError, ValueError):
            cached = None  # unreadable or partially written file: re-encode below
        # Only trust a chunk that holds exactly the rows this range expects
        if cached is not None and cached.shape[0] == end_idx - start_idx:
            print(f"Reusing checkpoint {checkpoint_path}")
            batch_embeddings = cached

    if batch_embeddings is None:
        batch_texts = texts[start_idx:end_idx]

        print(f"🔵 Encoding articles {start_idx} to {end_idx}...")

        batch_embeddings = model.encode(
            batch_texts,
            batch_size=batch_size,
            convert_to_numpy=True,
            show_progress_bar=True
        )

        # Save checkpoint immediately
        np.save(checkpoint_path, batch_embeddings)

    # Optionally collect in memory
    all_embeddings.append(batch_embeddings)

print("✅ All embeddings generated and checkpoints saved!")


🔵 Encoding articles 0 to 10000...


Batches: 100%|██████████| 79/79 [03:02<00:00,  2.31s/it]


🔵 Encoding articles 10000 to 20000...


Batches: 100%|██████████| 79/79 [03:42<00:00,  2.82s/it]


🔵 Encoding articles 20000 to 30000...


Batches: 100%|██████████| 79/79 [03:42<00:00,  2.81s/it]


🔵 Encoding articles 30000 to 40000...


Batches: 100%|██████████| 79/79 [03:53<00:00,  2.95s/it]


🔵 Encoding articles 40000 to 50000...


Batches: 100%|██████████| 79/79 [03:30<00:00,  2.67s/it]


🔵 Encoding articles 50000 to 60000...


Batches: 100%|██████████| 79/79 [03:13<00:00,  2.45s/it]


🔵 Encoding articles 60000 to 70000...


Batches: 100%|██████████| 79/79 [03:43<00:00,  2.83s/it]


🔵 Encoding articles 70000 to 80000...


Batches: 100%|██████████| 79/79 [03:27<00:00,  2.63s/it]


🔵 Encoding articles 80000 to 90000...


Batches: 100%|██████████| 79/79 [03:54<00:00,  2.96s/it]


🔵 Encoding articles 90000 to 96106...


Batches: 100%|██████████| 48/48 [02:38<00:00,  3.30s/it]

✅ All embeddings generated and checkpoints saved!





In [45]:
import glob

import os


def _checkpoint_bounds(path):
    """Return the (start, end) row range encoded in an 'embeddings_<start>_<end>.npy' name."""
    stem = os.path.splitext(os.path.basename(path))[0]
    _, start, end = stem.rsplit("_", 2)
    return int(start), int(end)


# Load all saved checkpoints in row order. A plain sorted() compares the names as
# text, which would put e.g. "embeddings_100000_..." before "embeddings_20000_..."
# and silently misalign embeddings with article rows, so sort on the numeric range.
embedding_files = sorted(
    glob.glob("embedding_checkpoints/embeddings_*.npy"), key=_checkpoint_bounds
)

all_embeddings = []
expected_start = 0
for f in embedding_files:
    start, end = _checkpoint_bounds(f)
    # Chunks must tile the rows without gaps or overlaps; anything else means the
    # directory mixes files from different runs.
    if start != expected_start:
        raise ValueError(
            f"Checkpoint {f} starts at row {start}, expected {expected_start}; "
            "remove stale files from 'embedding_checkpoints' and rebuild."
        )
    batch = np.load(f)
    if batch.shape[0] != end - start:
        raise ValueError(
            f"Checkpoint {f} holds {batch.shape[0]} rows, expected {end - start}."
        )
    all_embeddings.append(batch)
    expected_start = end

# Merge into one big array
final_embeddings = np.vstack(all_embeddings)

print("✅ Merged all embeddings:", final_embeddings.shape)


✅ Merged all embeddings: (96106, 384)


In [48]:
# --- 3. Create FAISS Index ---

import faiss

# Prepare embeddings as a C-contiguous float32 array before handing them to FAISS
embeddings_np = np.ascontiguousarray(final_embeddings, dtype="float32")

# Row i of the index must describe row i of articles_df, because search results
# are later looked up positionally with articles_df.iloc.
if embeddings_np.shape[0] != len(articles_df):
    raise ValueError(
        f"{embeddings_np.shape[0]} embeddings but {len(articles_df)} articles; "
        "the checkpoints do not match the current articles_df."
    )

# Build FAISS index
embedding_dim = embeddings_np.shape[1]  # 384 for MiniLM model
index = faiss.IndexFlatL2(embedding_dim)  # L2 = Euclidean distance
index.add(embeddings_np)

# Save the index
faiss.write_index(index, "articles_faiss.index")

# Save the article metadata in the same row order as the index
articles_df.to_csv("articles_with_embeddings.csv", index=False)

print("✅ FAISS index saved as 'articles_faiss.index' and article metadata saved as 'articles_with_embeddings.csv'")


✅ FAISS index saved as 'articles_faiss.index' and article metadata saved as 'articles_with_embeddings.csv'


In [None]:
# --- 4. Recommend Articles ---

import pandas as pd
import faiss
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import pipeline

# Stored index and the article table written alongside it
articles_df = pd.read_csv("articles_with_embeddings.csv")
index = faiss.read_index("articles_faiss.index")

# Embedding model used to encode the query
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Lightweight LLM used for rewriting
rewrite_pipeline = pipeline("text2text-generation", model="google/flan-t5-small")

# A sample editorial query, embedded as a one-item batch
editorial_query = "artificial intelligence innovation"
query_embedding = model.encode([editorial_query])

# Top-5 search: D holds the distances, I the matching row positions
search_result = index.search(np.array(query_embedding), k=5)
D, I = search_result

# Copy the matching rows so new columns can be added without SettingWithCopyWarning
top_positions = I[0]
recommended_articles = articles_df.iloc[top_positions].copy()

# Define rewriting function
def rewrite_headline(title, abstract):
    """Ask `rewrite_pipeline` for a more engaging, SEO-friendly headline.

    Builds a prompt from `title` and `abstract`, runs the pipeline without
    sampling (max_length=30) and returns the first result's generated text.
    """
    prompt = (
        "Rewrite the news headline to be more engaging and SEO-friendly:\n\n"
        f"Title: {title}\n\n"
        f"Abstract: {abstract}\n\n"
        "Rewritten Headline:"
    )
    outputs = rewrite_pipeline(prompt, max_length=30, do_sample=False)
    first_output = outputs[0]
    return first_output['generated_text']

# Apply headline rewriting row by row
def _rewritten_title_for(row):
    """Rewrite the headline of a single article row."""
    return rewrite_headline(row["title"], row["abstract"])

recommended_articles["rewritten_title"] = recommended_articles.apply(_rewritten_title_for, axis=1)

# Display the rewritten articles
print("\n Recommended Articles (with Rewritten Headlines):")
display_columns = ["title", "rewritten_title", "abstract", "category"]
display(recommended_articles[display_columns])


Device set to use cpu



🎯 Recommended Articles (with Rewritten Headlines):


Unnamed: 0,title,rewritten_title,abstract,category
49187,this robotic hand taught itself to solve a rub...,"The robots are a slick, slick, and slick, but ...",there are a number of very impressive examples...,news
47326,these researchers are using ai drones to more ...,The research is aimed at a new way to speed up...,researchers are looking to new advances in com...,news
72594,"with face scans, automated marking, singapore ...",The strategy is to create a niche for itself i...,singapore has unveiled an ambitious strategy t...,news
79083,openai published the tool that writes disturbi...,Openai published the full ai in april,"in february, openai announced that it had deve...",news
86899,david m. shribman: how artificial intelligence...,The sex of the sex of the sex of the sex of th...,montreal in a classic startup setting in a...,sports


### Load FAISS index, articles metadata, and embedding model

In [72]:
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer

# Artifacts written by the index-building cell
faiss_index_file = "articles_faiss.index"
articles_file = "articles_with_embeddings.csv"

# Load embedding model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

# Load articles metadata
articles_df = pd.read_csv(articles_file)

# Load FAISS index
index = faiss.read_index(faiss_index_file)

print("✅ Loaded FAISS index, articles, and MiniLM model.")


✅ Loaded FAISS index, articles, and MiniLM model.


### Define editorial queries

In [73]:
# Editorial topics: display name -> free-text query used for retrieval
editorial_queries = dict([
    ("Top Technology News", "latest breakthroughs in technology and innovation"),
    ("Inspiring Stories", "positive and uplifting news stories"),
    ("Global Politics", "latest news about world politics and diplomacy"),
    ("Climate and Environment", "climate change news and environment protection"),
    ("Health and Wellness", "advances in healthcare and medical discoveries"),
])


### Run retrieval for each topic

In [None]:
# Retrieve the nearest articles for every editorial topic.
# retrieved_articles maps each topic name to a DataFrame of its top matches;
# the rewriting and explanation cells add their columns to these frames.
top_k = 5  # same number of results as the single-query example
retrieved_articles = {}

for topic, query in editorial_queries.items():
    query_embedding = model.encode([query])
    _, positions = index.search(np.asarray(query_embedding, dtype="float32"), k=top_k)
    hits = positions[0]
    # FAISS fills unused result slots with -1; drop them so .iloc cannot wrap
    # around to the last row.
    hits = hits[hits >= 0]
    # .copy() so later cells can add columns without SettingWithCopyWarning
    retrieved_articles[topic] = articles_df.iloc[hits].copy()

display("Retrieved top articles for each editorial topic.")


'Retrieved top articles for each editorial topic.'

### LLM Headline Rewriter

In [None]:
def _rewrite_row(row):
    """Rewrite the headline of a single retrieved article row."""
    return rewrite_headline(row["title"], row["abstract"])

# Add a rewritten_title column to every topic's frame
for topic_frame in retrieved_articles.values():
    topic_frame["rewritten_title"] = topic_frame.apply(_rewrite_row, axis=1)
    

### LLM Editorial Explainer Generator

In [None]:
# Define the explanation generation function
def generate_explanation(title, abstract):
    """Ask `rewrite_pipeline` for a one-sentence note on why the article matters.

    Builds a prompt from `title` and `abstract`, runs the pipeline without
    sampling (max_length=40) and returns the first result's generated text.
    """
    prompt = (
        "Explain in one sentence why this news article is important to readers:\n\n"
        f"Title: {title}\n\n"
        f"Abstract: {abstract}\n\n"
        "Explanation:"
    )
    outputs = rewrite_pipeline(prompt, max_length=40, do_sample=False)
    first_output = outputs[0]
    return first_output['generated_text']

# Apply explanation generation to each topic's articles
def _explain_row(row):
    """Generate the explanation for a single retrieved article row."""
    return generate_explanation(row["title"], row["abstract"])

for topic_frame in retrieved_articles.values():
    topic_frame["explanation"] = topic_frame.apply(_explain_row, axis=1)


### Memory

In [77]:
import json

def save_topics(topics, filename="memory_topics.json"):
    """Write `topics` to `filename` as JSON, overwriting any existing file."""
    with open(filename, mode="w") as out_file:
        json.dump(topics, fp=out_file)

def load_topics(filename="memory_topics.json"):
    """Return the JSON content of `filename`, or an empty list if the file does not exist."""
    try:
        in_file = open(filename, "r")
    except FileNotFoundError:
        return []
    with in_file:
        return json.load(in_file)

# Example usage: topics remembered from the previous run
yesterday_topics = load_topics()

# Topics in today's queries that were not remembered
fresh_topics = list(filter(lambda name: name not in yesterday_topics, editorial_queries))

# Use fresh topics for retrieval
print(f"✅ Today's fresh topics: {fresh_topics}")

# After today's run, remember the current queries for next time
save_topics(editorial_queries)


✅ Today's fresh topics: []


In [None]:
import pandas as pd
from glob import glob

# Find all files starting with 'retrieved_' (sorted so the combined frame has a
# stable row order from run to run)
csv_files = sorted(glob("retrieved_*.csv"))
if not csv_files:
    # pd.concat on an empty list fails with an opaque message; say what is missing
    raise FileNotFoundError(
        "No 'retrieved_*.csv' files found in the working directory; "
        "export the per-topic results before running this cell."
    )


def _topic_from_filename(file):
    """Turn a name like 'retrieved_global_politics.csv' into 'Global Politics'."""
    return file.replace("retrieved_", "").replace(".csv", "").replace("_", " ").title()


# Combine all retrieved topics into one DataFrame, tagging each row with its topic
dfs = [pd.read_csv(file).assign(topic=_topic_from_filename(file)) for file in csv_files]
combined_df = pd.concat(dfs, ignore_index=True)

display(combined_df[["topic", "title", "rewritten_title", "explanation"]].head(20))
