## News Recommendation by Combining Embeddings with Learning-to-Rank Models
#### Objective

- Learn how to retrieve candidate articles for each user using embedding-based semantic similarity.
- Apply both baseline similarity ranking and advanced learning-to-rank models (e.g., LambdaMART) to sort the retrieved candidates.
- Evaluate recommendation quality using standard metrics such as Precision@k, Recall@k, and NDCG@k.

#### Prerequisites
- Load the OpenAI API key from a local YAML configuration file.

In [70]:
# %pip install tiktoken
# %pip install -U langchain-community
# %pip install weaviate-client
# %pip install lightgbm

In [71]:
import yaml
from openai import OpenAI
import os
import pandas as pd

import openai
import time

import logging
# Configure logging for the notebook: every record is printed as
# "<timestamp> - <level> - <message>".
# NOTE: the DEBUG threshold is applied through basicConfig, so debug records
# emitted by third-party libraries are shown as well (the run output at the
# end of this notebook contains DEBUG lines for huggingface.co HTTP requests).
logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(levelname)s - %(message)s")
# Logger named after this module, for the notebook's own messages.
logger = logging.getLogger(__name__)

from transformers import AutoTokenizer, AutoModel
import torch
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

# Read the YAML file that holds the API credentials.
with open('./../../../Curify/curify_api.yaml', 'r', encoding='utf-8') as yaml_file:
    data = yaml.safe_load(yaml_file)

# Look the key up defensively: an empty file makes safe_load return None and a
# missing 'openai' section makes .get() return None. Either case would
# otherwise surface as an opaque AttributeError/TypeError a few lines later.
openai_api_key = ((data or {}).get('openai') or {}).get('api_key')
if not openai_api_key:
    raise KeyError("Missing 'openai' -> 'api_key' entry in curify_api.yaml")

# Expose the key through the environment as well as passing it explicitly.
os.environ["OPENAI_API_KEY"] = openai_api_key

# Module-level client used by the labeling step further down.
client = OpenAI(api_key=openai_api_key)


#### Step 1: Data Preparation

In [72]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

# Directory that holds the news-recommendation TSV files
file_path = './../Data/news-recommendation/'
user_profiles_path = file_path + "user_profiles.tsv"
news_summary_path = file_path + "news_summary.tsv"

# User profiles: only the first 10 rows are kept
user_df = pd.read_csv(user_profiles_path, sep='\t')
user_df = user_df.head(10)

# Article summaries: rows that are exact duplicates are removed
item_df = pd.read_csv(news_summary_path, sep='\t')
item_df = item_df.drop_duplicates()

#### Step 2: Retrieval Set Generation

In [73]:
from sklearn.neighbors import NearestNeighbors

def compute_retrieval_set(user_embeddings, article_embeddings, k=10):
    """
    Build a candidate article list for every user with a cosine-metric
    nearest-neighbour search over the article embeddings.

    Args:
        user_embeddings (pd.DataFrame): One row per user; the index holds the user_id.
        article_embeddings (pd.DataFrame): One row per article; the index holds the item_id.
        k (int): Number of neighbours requested per user. It is capped at the
            number of available articles.

    Returns:
        dict: {user_id: [item_id, ...]} with up to k item_ids per user, in the
        order returned by the neighbour search.
    """
    # Never ask for more neighbours than there are articles to choose from
    n_neighbors = min(k, len(article_embeddings))

    knn = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine')
    knn.fit(article_embeddings.values)

    # Only the neighbour positions are needed; the distances are discarded
    _, neighbor_rows = knn.kneighbors(user_embeddings.values)

    # Positional lookup table: article row number -> item_id
    article_ids = article_embeddings.index.to_list()

    user_to_items = {}
    for user_id, row in zip(user_embeddings.index.to_list(), neighbor_rows):
        user_to_items[user_id] = [article_ids[pos] for pos in row]

    return user_to_items

#### Step 4: User-Item Relevance Labeling

In [74]:
def batch_label_user_interest(users, articles, user_to_article_map, model="gpt-4o-mini", batch_size=10):
    """
    Label user interest in candidate articles with an OpenAI chat model.

    The (user, article) pairs are sent in batches. The model is asked to answer
    with one `user_id, item_id, label` line per pair, where label is 1 for
    Interested and 0 for Not Interested. Only lines that refer to a pair that
    was actually part of the batch and that carry a 0/1 label are kept.

    The call goes through the module-level ``client`` object. A failing API
    call is reported and its batch is skipped (best effort), so the result may
    cover fewer pairs than were requested.

    Args:
        users (dict): User profiles, {user_id: profile text}.
        articles (dict): News articles, {item_id: article summary}.
        user_to_article_map (dict): {user_id: [item_id1, item_id2, ...]}.
        model (str): OpenAI model to use.
        batch_size (int): Number of user-article pairs per prompt (must be >= 1).

    Returns:
        dict: {user_id: {item_id: interest_label (0 or 1)}}. The keys are the
        original ids from ``user_to_article_map`` (not the strings echoed by
        the model), so they can be matched against the embedding indexes.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
        KeyError: If a user_id or item_id from ``user_to_article_map`` is
            missing from ``users`` / ``articles``.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    results = {}
    all_pairs = [
        (user_id, item_id)
        for user_id, item_ids in user_to_article_map.items()
        for item_id in item_ids
    ]

    # The instruction header is identical for every batch
    instructions = (
        "For each of the following user and article pairs, determine interest level.\n"
        "Respond only with a single line per pair, format:\n"
        "`user_id, item_id, 1` for Interested OR `user_id, item_id, 0` for Not Interested.\n"
        "Do NOT add extra explanations or formatting.\n\n"
    )

    for start in range(0, len(all_pairs), batch_size):
        batch = all_pairs[start:start + batch_size]

        # The model echoes ids as text. Map that text back to the original id
        # objects so the returned keys keep their type, and so that pairs the
        # model invents can be rejected.
        expected_pairs = {
            (str(user_id).strip(), str(item_id).strip()): (user_id, item_id)
            for user_id, item_id in batch
        }

        prompt = instructions + "".join(
            f"User ID: {user_id}\nProfile: {users[user_id]}\n"
            f"Article ID: {item_id}\nArticle: {articles[item_id]}\n\n"
            for user_id, item_id in batch
        )

        try:
            response = client.chat.completions.create(
                model=model,
                messages=[
                    {"role": "system", "content": "You are a helpful assistant."},
                    {"role": "user", "content": prompt}
                ],
                temperature=0.3
            )
            output = response.choices[0].message.content.strip()
        except Exception as e:
            # Best effort: report the failure and carry on with the next batch
            print(f"⚠️ Error calling OpenAI API: {str(e)}")
            continue

        for line in output.splitlines():
            # The format example in the prompt is wrapped in backticks, and the
            # model may copy them (or emit a code fence); drop them first.
            cleaned = line.strip().strip("`").strip()
            if not cleaned:
                continue

            parts = [part.strip() for part in cleaned.split(",")]
            if len(parts) != 3:
                print(f"⚠️ Skipping malformed line: {line}")
                continue

            uid, iid, raw_label = parts
            original_pair = expected_pairs.get((uid, iid))
            if original_pair is None:
                print(f"⚠️ Skipping unexpected pair in line: {line}")
                continue

            try:
                label = int(raw_label)
            except ValueError:
                print(f"⚠️ Invalid label in line: {line}")
                continue
            if label not in (0, 1):
                print(f"⚠️ Invalid label in line: {line}")
                continue

            user_id, item_id = original_pair
            results.setdefault(user_id, {})[item_id] = label

    return results

#### Step 5: Recommendation Approaches

**Direct Embedding Similarity**

In [75]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def recommend_top_k_items(user_embeddings_df, item_embeddings_df, top_k=5):
    """
    Rank all items for every user by cosine similarity of their embeddings and
    keep the best ``top_k``.

    Args:
        user_embeddings_df (pd.DataFrame): User embeddings, indexed by 'user_id'.
        item_embeddings_df (pd.DataFrame): Item embeddings, indexed by 'item_id'.
        top_k (int): Number of items to recommend per user.

    Returns:
        dict: {user_id: [item_id1, item_id2, ...]}, most similar item first.
    """
    # scores[u, i] is the similarity between user row u and item row i
    scores = cosine_similarity(user_embeddings_df.values, item_embeddings_df.values)

    item_ids = item_embeddings_df.index.tolist()

    recommendations = {}
    for user_row, user_id in zip(scores, user_embeddings_df.index.tolist()):
        # argsort is ascending, so reverse it to get the best-scoring columns first
        best_first = np.argsort(user_row)[::-1]
        recommendations[user_id] = [item_ids[col] for col in best_first[:top_k]]

    return recommendations


**State-of-the-Art Ranking Model (LambdaMART)**

In [80]:
import lightgbm as lgb
import numpy as np

def train_lambdamart_ranking(user_embeddings_df, item_embeddings_df, relevance_dict):
    """
    Train a LightGBM model with the 'lambdarank' objective on labeled
    user-item pairs.

    Each training row is the concatenation of the user embedding, the item
    embedding and their cosine similarity. The rows of one user form one query
    group. Users that are not in ``user_embeddings_df`` are ignored; items that
    are missing from ``item_embeddings_df`` or whose embedding length differs
    from the user's are reported and skipped.

    Args:
        user_embeddings_df (pd.DataFrame): Embeddings of the training users, indexed by user_id.
        item_embeddings_df (pd.DataFrame): Item embeddings, indexed by item_id.
        relevance_dict (dict): {user_id: {item_id: relevance_label}}.

    Returns:
        The trained model object returned by ``lgb.train``.

    Raises:
        ValueError: If no training row could be built.
    """
    known_users = set(user_embeddings_df.index)

    rows, targets, query_sizes = [], [], []

    for user_id, item_labels in relevance_dict.items():
        # Only users that have an embedding take part in training
        if user_id not in known_users:
            continue

        user_vec = user_embeddings_df.loc[user_id].values
        rows_before = len(rows)

        for item_id, label in item_labels.items():
            if item_id not in item_embeddings_df.index:
                print(f"Item ID {item_id} not in item_embeddings. Skipping.")
                continue

            item_vec = item_embeddings_df.loc[item_id].values

            if user_vec.shape[0] != item_vec.shape[0]:
                print(f"Dimension mismatch: user {user_id}, item {item_id}. Skipping this pair.")
                continue

            sim = cosine_similarity([user_vec], [item_vec])[0][0]
            rows.append(np.concatenate([user_vec, item_vec, [sim]]))
            targets.append(label)

        # A query group is recorded only when the user contributed rows
        rows_added = len(rows) - rows_before
        if rows_added > 0:
            query_sizes.append(rows_added)

    if not rows:
        raise ValueError("No valid training data could be constructed.")

    train_data = lgb.Dataset(np.array(rows), label=np.array(targets), group=query_sizes)

    params = {
        'objective': 'lambdarank',
        'metric': 'ndcg',
        'boosting_type': 'gbdt',
        'learning_rate': 0.05,
        'num_leaves': 32,
        'min_data_in_leaf': 10
    }

    return lgb.train(params, train_data, num_boost_round=100)


def predict_lambdamart_ranking(model, user_embeddings_df, item_embeddings_df, relevance_dict, top_k=10):
    """
    Predict top-k items per user using a trained LambdaMART model.

    The candidate items of a user are the keys of ``relevance_dict[user_id]``;
    the labels themselves are not used for scoring. Each candidate is described
    by the concatenation of the user embedding, the item embedding and their
    cosine similarity, i.e. the same feature layout that
    ``train_lambdamart_ranking`` builds.

    Users without an embedding are skipped and items without an embedding are
    dropped from the candidate list, mirroring the training function, instead
    of aborting the whole run with a KeyError.

    Args:
        model (lightgbm.Booster): Trained model.
        user_embeddings_df (pd.DataFrame): User embeddings indexed by user_id.
        item_embeddings_df (pd.DataFrame): Item embeddings indexed by item_id.
        relevance_dict (dict): {user_id: {item_id: relevance_label (0/1)}}
        top_k (int): Number of items to recommend per user.

    Returns:
        dict: {user_id: [item_id1, item_id2, ...]} with top-k ranked items,
        highest score first. A user whose candidates were all dropped maps to
        an empty list.
    """
    recommendations = {}

    for user_id, item_labels in relevance_dict.items():
        if user_id not in user_embeddings_df.index:
            print(f"User ID {user_id} not in user_embeddings. Skipping.")
            continue

        user_vec = user_embeddings_df.loc[user_id].values

        candidate_ids = []
        features = []
        for item_id in item_labels:
            if item_id not in item_embeddings_df.index:
                print(f"Item ID {item_id} not in item_embeddings. Skipping.")
                continue

            item_vec = item_embeddings_df.loc[item_id].values
            sim = cosine_similarity([user_vec], [item_vec])[0][0]
            features.append(np.concatenate([user_vec, item_vec, [sim]]))
            candidate_ids.append(item_id)

        if not features:
            # Nothing to score; calling predict on an empty matrix would fail
            recommendations[user_id] = []
            continue

        scores = np.asarray(model.predict(np.array(features)))

        # Sort on the score alone. A stable sort keeps tied candidates in their
        # original order rather than falling back to comparing item ids.
        order = np.argsort(-scores, kind="stable")[:top_k]
        recommendations[user_id] = [candidate_ids[pos] for pos in order]

    return recommendations


#### Step 6: Evaluation

In [86]:
from sklearn.metrics import ndcg_score
import numpy as np

def evaluate_ranking_metrics(recommendations, relevance_dict, k_list=(5, 10, 20)):
    """
    Compute Precision@k, Recall@k, and NDCG@k for multiple values of k.

    Users that are missing from ``relevance_dict`` or that have no item with a
    positive label are left out of the averages.

    Definitions used (binary relevance):
        Precision@k = hits in the top k / k
        Recall@k    = hits in the top k / number of relevant items of the user
        DCG@k       = sum over hit positions p (1-based) of 1 / log2(p + 1)
        IDCG@k      = DCG of an ideal list that places min(k, number of
                      relevant items) relevant items first
        NDCG@k      = DCG@k / IDCG@k

    NDCG is computed directly so that it is defined for k = 1 and for
    recommendation lists shorter than k; no error is silently swallowed.

    Args:
        recommendations (dict): {user_id: [ranked list of item_ids]}
        relevance_dict (dict): {user_id: {item_id: relevance (0/1)}}
        k_list (iterable): Cutoff values for evaluation metrics.

    Returns:
        dict: Mean Precision@k, Recall@k, and NDCG@k for each k across all
        evaluated users, keyed "Precision@k", "Recall@k" and "NDCG@k". A metric
        is 0.0 when no user could be evaluated.

    Raises:
        ValueError: If a cutoff in ``k_list`` is not positive.
    """
    results = {}

    for k in k_list:
        if k <= 0:
            raise ValueError(f"k must be a positive integer, got {k}")

        # Positional discounts 1 / log2(p + 1) for ranks p = 1..k
        discounts = 1.0 / np.log2(np.arange(2, k + 2))

        precision_scores = []
        recall_scores = []
        ndcg_scores = []

        for user_id, rec_items in recommendations.items():
            if user_id not in relevance_dict:
                continue

            # Relevant (positive) items for the user; a set gives O(1) lookups
            relevant_items = {item for item, rel in relevance_dict[user_id].items() if rel > 0}
            if not relevant_items:
                continue

            # Hit indicator for every recommended position inside the cutoff
            top_k_items = rec_items[:k]
            hits = np.array(
                [1.0 if item in relevant_items else 0.0 for item in top_k_items],
                dtype=float,
            )
            num_hits = float(hits.sum())

            precision_scores.append(num_hits / k)
            recall_scores.append(num_hits / len(relevant_items))

            # The list may be shorter than k, so trim the discounts to match it
            dcg = float(np.dot(hits, discounts[:len(hits)]))
            ideal_hits = min(k, len(relevant_items))
            idcg = float(discounts[:ideal_hits].sum())
            ndcg_scores.append(dcg / idcg)

        # Store average metrics for current k
        results[f"Precision@{k}"] = float(np.mean(precision_scores)) if precision_scores else 0.0
        results[f"Recall@{k}"] = float(np.mean(recall_scores)) if recall_scores else 0.0
        results[f"NDCG@{k}"] = float(np.mean(ndcg_scores)) if ndcg_scores else 0.0

    return results

In [88]:
import json

from sklearn.model_selection import train_test_split

num_retrieval = 50   # candidates retrieved (and labeled) per user
num_recommend = 20   # length of each recommendation list
num_eval = [1, 5, 10]  # cutoffs for Precision/Recall/NDCG
output_path = "personalized_ranking.json"

# Zip user_id and user_summary into key-value pairs
users_summary = dict(zip(user_df['user_id'], user_df['summary']))

# Zip item_id and item_summary into key-value pairs
items_summary = dict(zip(item_df['item_id'], item_df['summary']))

# Generate embeddings
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for users and items
user_emb_array = model.encode(user_df['summary'].tolist())
item_emb_array = model.encode(item_df['summary'].tolist())

# Convert embeddings into DataFrames with index
user_embeddings = pd.DataFrame(user_emb_array, index=user_df['user_id'])
item_embeddings = pd.DataFrame(item_emb_array, index=item_df['item_id'])

# Split users into train and test sets
train_users, test_users = train_test_split(user_df['user_id'], test_size=0.2, random_state=42)

# Extract corresponding embeddings
train_user_embeddings = user_embeddings.loc[train_users]
test_user_embeddings = user_embeddings.loc[test_users]

# Candidates and labels are produced for all users: the training users need
# them to fit the ranker, the test users need them for evaluation.
retrieval_set = compute_retrieval_set(user_embeddings, item_embeddings, k=num_retrieval)
relevance_label = batch_label_user_interest(users_summary, items_summary, retrieval_set)

# Labels of the held-out users only. Both approaches are evaluated on these
# users so that the ranker is never scored on the pairs it was trained on and
# the two sets of metrics stay comparable.
test_user_ids = set(test_users)
test_relevance_label = {
    user_id: labels
    for user_id, labels in relevance_label.items()
    if user_id in test_user_ids
}

recommend_similarity = recommend_top_k_items(test_user_embeddings, item_embeddings, top_k=num_recommend)

lambdamart = train_lambdamart_ranking(train_user_embeddings, item_embeddings, relevance_label)
recommend_lambdamart = predict_lambdamart_ranking(
    lambdamart, test_user_embeddings, item_embeddings, test_relevance_label, top_k=num_recommend
)

metrics_similarity = evaluate_ranking_metrics(recommend_similarity, test_relevance_label, num_eval)

metrics_lambdamart = evaluate_ranking_metrics(recommend_lambdamart, test_relevance_label, num_eval)

# Serialize both results
results_combined = {
    "similarity": metrics_similarity,
    "lambdamart": metrics_lambdamart
}

# Save to JSON file
with open(output_path, "w") as f:
    json.dump(results_combined, f, indent=4)

# Report the file that was actually written
print(f"Saved results to {output_path}")

2025-03-19 01:51:01,877 - INFO - Use pytorch device_name: cpu
2025-03-19 01:51:01,878 - INFO - Load pretrained SentenceTransformer: all-MiniLM-L6-v2
2025-03-19 01:51:01,886 - DEBUG - Resetting dropped connection: huggingface.co
2025-03-19 01:51:01,938 - DEBUG - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json HTTP/1.1" 200 0
2025-03-19 01:51:01,955 - DEBUG - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/config_sentence_transformers.json HTTP/1.1" 200 0
2025-03-19 01:51:01,972 - DEBUG - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/README.md HTTP/1.1" 200 0
2025-03-19 01:51:01,987 - DEBUG - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/modules.json HTTP/1.1" 200 0
2025-03-19 01:51:02,003 - DEBUG - https://huggingface.co:443 "HEAD /sentence-transformers/all-MiniLM-L6-v2/resolve/main/sentence_bert_config.json HTTP/1.1" 20