System version: 3.10.14 | packaged by conda-forge | (main, Mar 20 2024, 12:45:18) [GCC 12.3.0]

In [1]:
!pip install langchain groq langchain_groq langchain_community --quiet

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-cloud-bigquery 2.34.4 requires packaging<22.0dev,>=14.3, but you have packaging 24.2 which is incompatible.
jupyterlab 4.3.1 requires jupyter-lsp>=2.0.0, but you have jupyter-lsp 1.5.1 which is incompatible.
jupyterlab-lsp 5.1.0 requires jupyter-lsp>=2.0.0, but you have jupyter-lsp 1.5.1 which is incompatible.
kfp 2.5.0 requires google-cloud-storage<3,>=2.2.1, but you have google-cloud-storage 1.44.0 which is incompatible.
kfp 2.5.0 requires requests-toolbelt<1,>=0.8.0, but you have requests-toolbelt 1.0.0 which is incompatible.
libpysal 4.9.2 requires shapely>=2.0.1, but you have shapely 1.8.5.post1 which is incompatible.
mlxtend 0.23.3 requires scikit-learn>=1.3.1, but you have scikit-learn 1.2.2 which is incompatible.
plotnine 0.14.3 requires matplotlib>=3.8.0, but you have matplotlib 3.7.5 which is

In [None]:
import getpass
import os
import pickle
import random
import re
import time

import numpy as np
import pandas as pd
from langchain import PromptTemplate, LLMChain
from langchain_groq import ChatGroq
from tqdm import tqdm


# Kaggle input locations of the MIND demo training split; both files are read
# as tab-separated values by read_user_interactions() and read_news().
trainfilename = "/kaggle/input/mind-demo/MIND Demo Dataset/train/behaviors.tsv"
newsfilename = "/kaggle/input/mind-demo/MIND Demo Dataset/train/news.tsv"
# Destination CSV for the generated headlines.
output_csv_path = "/kaggle/working/generated_headlines.csv"

In [None]:
# Groq-hosted model used for headline generation.
MODEL_ID = "deepseek-r1-distill-llama-70b"

# Never store the API key in the notebook itself. If GROQ_API_KEY is already
# present in the environment it is used as-is (and no longer overwritten);
# otherwise the key is requested interactively without echoing it.
if not os.environ.get("GROQ_API_KEY"):
    os.environ["GROQ_API_KEY"] = getpass.getpass("Groq API key: ")

In [None]:
# Upper bound on how many filtered users are sampled for headline generation.
NUM_USERS = 50
# Number of news articles sampled as rewrite targets.
NUM_ARTICLES = 10
# Number of users per slice in the outer generation loop. This only groups
# users for iteration/progress reporting; each LLM request is still issued
# one (user, article) pair at a time.
BATCH_SIZE = 10

In [None]:
def word_tokenize(sent):
    """Split text into lowercase word and punctuation tokens.

    Each run of word characters becomes one token, and each of the characters
    ``. , ! ? ; |`` becomes its own single-character token. Everything else
    (whitespace, other symbols) is discarded.

    Args:
        sent: The text to tokenize. Any non-string value (e.g. NaN or None)
            is accepted and treated as empty.

    Returns:
        list[str]: The tokens in order of appearance; ``[]`` for non-strings.
    """
    if not isinstance(sent, str):
        return []
    return re.findall(r'[\w]+|[.,!?;|]', sent.lower())

def read_news(filename):
    """Load the MIND news.tsv file into a dict of tokenized titles and abstracts.

    The file is read as headerless, tab-separated data with eight columns.
    Missing values are replaced by a single space before tokenizing, so a
    missing title or abstract yields an empty token list.

    Args:
        filename: Path to the news.tsv file.

    Returns:
        dict: Maps each news id to ``{"title": [...], "abstract": [...]}``,
        where both lists hold the tokens produced by ``word_tokenize``. If a
        news id occurs more than once, the last row wins.
    """
    column_names = [
        "news_id", "category", "subcategory", "title", "abstract", "url", "entity_title", "entity_abstract"
    ]
    frame = pd.read_csv(filename, sep='\t', header=None, names=column_names)
    frame = frame.fillna(value=" ")

    tokenized = {}
    for news_id, title, abstract in zip(frame["news_id"], frame["title"], frame["abstract"]):
        tokenized[news_id] = {
            "title": word_tokenize(title),
            "abstract": word_tokenize(abstract),
        }
    return tokenized

def read_user_interactions(filename):
    """Collect per-user clicked and non-clicked news ids from MIND behaviors.tsv.

    The file is read as headerless, tab-separated data with five columns.
    For every row, the whitespace-separated ids in the history column are
    added to the user's "positive" set. Impression entries of the exact form
    ``<news_id>-0`` are added to the user's "negative" set; entries with any
    other label or shape are ignored.

    Args:
        filename: Path to the behaviors.tsv file.

    Returns:
        dict: Maps each user id to ``{"positive": set, "negative": set}``,
        accumulated over all of that user's rows.
    """
    column_names = ["impression_id", "user_id", "timestamp", "history", "impressions"]
    behaviors = pd.read_csv(filename, sep='\t', header=None, names=column_names)

    def _split_field(value):
        # Empty cells are parsed as NaN (a float), which means "no entries".
        return value.split() if isinstance(value, str) else []

    per_user = {}
    rows = zip(behaviors["user_id"], behaviors["history"], behaviors["impressions"])
    for uid, history, shown in rows:
        record = per_user.setdefault(uid, {"positive": set(), "negative": set()})
        record["positive"].update(_split_field(history))

        for entry in _split_field(shown):
            pieces = entry.split('-')
            # Only well-formed "<id>-0" entries (label 0 = not clicked) count.
            if len(pieces) != 2 or pieces[1] != '0':
                continue
            record["negative"].add(pieces[0])

    return per_user

In [None]:
print("Loading and processing MIND dataset...")
news = read_news(newsfilename)
user_interactions = read_user_interactions(trainfilename)

# Keep only users whose click history holds strictly between 10 and 20 articles.
# filtered_users = {user: data for user, data in user_interactions.items() if len(data["positive"]) > 25}
filtered_users = {user: data for user, data in user_interactions.items() if 10 < len(data["positive"]) < 20}
filtered_users_list = [{"user_id": user, **data} for user, data in filtered_users.items()]

print(f"Total users: {len(user_interactions)}")
print(f"Users with <20 and >10 positive articles: {len(filtered_users_list)}")

# A dedicated, seeded generator makes the sampled users and articles identical
# on every Restart & Run All, without touching the global `random` state.
RANDOM_SEED = 42
sampler = random.Random(RANDOM_SEED)

selected_users = sampler.sample(filtered_users_list, min(NUM_USERS, len(filtered_users_list)))
# Clamp the article sample the same way as the user sample: random.sample
# raises ValueError when asked for more items than the population holds.
selected_articles = sampler.sample(list(news.keys()), min(NUM_ARTICLES, len(news)))

In [None]:
# Chat model client shared by every headline request in this notebook.
# The API key is taken from the GROQ_API_KEY environment variable.
llm = ChatGroq(
    api_key=os.environ.get("GROQ_API_KEY"),
    model=MODEL_ID,
    temperature=0.7,
    max_retries=2,
)

Prompt Variation:
"""
You are a professional news headline rewriter specializing in personalized content. Your task is to create a new, engaging headline for a news article that is tailored to a specific user's interests. It is critical that your headline uses simple, common language because our recommendation system relies on a fixed word dictionary; any rare or unknown words will be mapped to a default token (0), which can negatively impact the predicted click probability.

You are provided with the following information:

1. User's Positive Context:
   - A collection of headlines from news articles that the user has liked.
   - These headlines reflect the topics, style, and tone that resonate with the user.
   - **Important:** Directly incorporate the exact key words from these headlines into your new headline. Do not replace these words with synonyms; the exact wording is important.

2. User's Negative Context:
   - A collection of headlines from news articles that the user did not like.
   - Avoid using any language, tone, or topics similar to these headlines.

3. Target Article Information:
   - The original headline and the body of the target news article, which convey its main content and tone.

Instructions:
- Generate a completely new, personalized headline for the target article.
- Use the exact key words from the positive context as inspiration—do not modify them or use synonyms. Their precise form is essential.
- Do not include any phrasing or elements that appear in the negative context.
- Reflect the primary content and tone of the target article.
- Do not provide any explanation or commentary; output only the final rewritten headline.
- Ensure the headline is creative, distinctive, and clearly targeted to this specific user.

User's Positive Context:
{positive_context}

User's Negative Context:
{negative_context}

Target Article Headline:
{target_headline}

Target Article Body:
{target_body}

Rewritten Personalized Headline:
"""

In [None]:
def truncate_context(context, max_tokens=750):
    """Limit a context string to at most ``max_tokens`` whitespace-separated words.

    Args:
        context: The text to limit.
        max_tokens: Maximum number of words to keep. "Tokens" here are
            whitespace-delimited words, not model tokens.

    Returns:
        str: ``context`` unchanged (original spacing included) when it is
        within the limit; otherwise the first ``max_tokens`` words joined by
        single spaces.
    """
    words = context.split()
    if len(words) <= max_tokens:
        return context
    return " ".join(words[:max_tokens])

# Prompt text for the headline rewriter. The four {placeholders} are filled in
# per (user, article) pair; surrounding whitespace is stripped when the
# PromptTemplate is built. This is runtime text sent to the model, so any
# wording change alters the generated headlines.
prompt_template = """
You are a professional news headline rewriter with a deep understanding of user interests. Your task is to generate a new, personalized, and engaging headline for a news article.

You are provided with:
1. The user's positive news context: these are headlines of articles the user liked. Analyze the style, topics, and tone that the user prefers.
2. The user's negative news context: these are headlines of articles the user did not like. Avoid the language, tone, or topics present in these headlines.
3. The current headline and body of the target news article.

Instructions:
- Use the user's positive context as inspiration for style, tone, or topics, and ensure the headline feels uniquely tailored.
- Use KEYWORDS from the positive context that the user will be interested in.
- Steer clear of any phrasing or sentiment that appears in the negative context.
- Reflect the main points and tone of the target article, while injecting personalized elements.
- Do not provide any explanation or additional commentary; only output the final rewritten headline.
- Ensure the headline is creative, distinctive, and clearly differentiated for each user.
- The headline should be very specifically targeted to this user.

User's Positive News Context:
{positive_context}

User's Negative News Context:
{negative_context}

Target Article Headline:
{target_headline}

Target Article Body:
{target_body}

Generate a rewritten headline:
"""

prompt = PromptTemplate(
    input_variables=["positive_context", "negative_context", "target_headline", "target_body"],
    template=prompt_template.strip()
)

llm_chain = LLMChain(prompt=prompt, llm=llm)

# Limits on how much user history is placed into each prompt.
MAX_POSITIVE_TOKENS = 1000      # whitespace-separated words kept from liked headlines
MAX_NEGATIVE_HEADLINES = 5      # disliked news ids considered per user


def _headline_context(doc_ids):
    """Join the titles of the given news ids with ' | ', skipping ids missing from `news`."""
    return " | ".join(" ".join(news[doc]["title"]) for doc in doc_ids if doc in news)


def _build_user_contexts(user_context):
    """Return the (positive_context, negative_context) prompt strings for one user."""
    positive_context = _headline_context(user_context["positive"])
    if positive_context:
        positive_context = truncate_context(positive_context, max_tokens=MAX_POSITIVE_TOKENS)
    else:
        positive_context = "No positive news history available."

    # Built unconditionally. Previously this was nested under the `else:`
    # branch above, so a user without positive headlines either raised
    # NameError or silently reused the previous user's negative context.
    negative_context = _headline_context(list(user_context["negative"])[:MAX_NEGATIVE_HEADLINES])
    if not negative_context:
        negative_context = "No negative news history available."

    return positive_context, negative_context


results = []

print("\nGenerating personalized headlines for users and articles...\n")
for i in tqdm(range(0, len(selected_users), BATCH_SIZE), desc="User Batches"):
    batch_users = selected_users[i:i + BATCH_SIZE]
    # The contexts depend only on the user, so build them once per batch
    # instead of once per (user, article) pair.
    batch_contexts = [_build_user_contexts(user_context) for user_context in batch_users]

    for article_id in tqdm(selected_articles, desc="Articles", leave=False):
        if article_id not in news:
            continue

        article = news[article_id]
        target_headline = " ".join(article["title"])
        target_body = " ".join(article["abstract"])

        for user_context, (positive_context, negative_context) in zip(batch_users, batch_contexts):
            prompt_params = {
                "positive_context": positive_context,
                "negative_context": negative_context,
                "target_headline": target_headline,
                "target_body": target_body
            }

            generated = llm_chain.run(prompt_params).strip()
            # Drop any <think>...</think> block from the raw output so that
            # only the text outside it remains as the cleaned headline.
            cleaned_generated = re.sub(r'<think>.*?</think>', '', generated, flags=re.DOTALL).strip()
            results.append({
                "user_id": user_context["user_id"],
                "article_id": article_id,
                "original_headline": target_headline,
                "generated_headline": generated,
                "cleaned_headline": cleaned_generated,
                "positive_context": positive_context,
                "negative_context": negative_context
            })

In [None]:
# Persist one row per (user, article) pair: raw and cleaned headline plus the
# contexts that were placed in the prompt.
results_df = pd.DataFrame(data=results)
results_df.to_csv(path_or_buf=output_csv_path, index=False)
print(f"\nFinal results saved to {output_csv_path}")