System version: 3.7.9 (tags/v3.7.9:13c94747c7, Aug 17 2020, 18:58:18) [MSC v.1900 64 bit (AMD64)] <br>
TensorFlow version: 2.6.1

In [None]:
import os
import sys
import numpy as np
import zipfile
from tqdm import tqdm
import pandas as pd
import pickle
import re
import random
from tempfile import TemporaryDirectory
import tensorflow as tf
tf.get_logger().setLevel('ERROR')

from recommenders.models.deeprec.deeprec_utils import download_deeprec_resources 
from recommenders.models.newsrec.newsrec_utils import prepare_hparams
from recommenders.models.newsrec.models.naml import NAMLModel
from recommenders.models.newsrec.io.mind_all_iterator import MINDAllIterator
from recommenders.models.newsrec.newsrec_utils import get_mind_data_set
from recommenders.utils.notebook_utils import store_metadata

In [None]:
# Root folder of the data set; every other path below is derived from it.
data_path = "..."

# Sub-folders holding the two splits and the shared resources.
_train_dir = os.path.join(data_path, "train")
_valid_dir = os.path.join(data_path, "valid")
_utils_dir = os.path.join(data_path, "utils")

# News catalogue and user behaviour logs, one pair per split.
train_news_file = os.path.join(_train_dir, "news.tsv")
train_behaviors_file = os.path.join(_train_dir, "behaviors.tsv")
valid_news_file = os.path.join(_valid_dir, "news.tsv")
valid_behaviors_file = os.path.join(_valid_dir, "behaviors.tsv")

# Embedding matrix, lookup dictionaries and model configuration.
wordEmb_file = os.path.join(_utils_dir, "embedding_all.npy")
userDict_file = os.path.join(_utils_dir, "uid2index.pkl")
wordDict_file = os.path.join(_utils_dir, "word_dict_all.pkl")
vertDict_file = os.path.join(_utils_dir, "vert_dict.pkl")
subvertDict_file = os.path.join(_utils_dir, "subvert_dict.pkl")
yaml_file = os.path.join(_utils_dir, "naml.yaml")


In [None]:
# Define model path (directory that contains the saved "naml_ckpt" weights)
model_path = "..."

# NOTE: pickle.load can execute arbitrary code -- only load dictionaries from a trusted source.
with open(subvertDict_file, "rb") as f:
    subvert_dict = pickle.load(f)

with open(vertDict_file, "rb") as f:
    vert_dict = pickle.load(f)

# The category / sub-category vocabulary sizes are derived from the largest index
# stored in the dictionaries (+1 because index 0 is also a valid id).
hparams = prepare_hparams(yaml_file,
                          vert_num=max(vert_dict.values()) + 1,
                          subvert_num=max(subvert_dict.values()) + 1,
                          batch_size=32,
                          epochs=5,
                          wordEmb_file=wordEmb_file,
                          wordDict_file=wordDict_file,
                          userDict_file=userDict_file,
                          vertDict_file=vertDict_file,
                          subvertDict_file=subvertDict_file)

iterator = MINDAllIterator
model = NAMLModel(hparams, iterator, seed=42)
model.model.load_weights(os.path.join(model_path, "naml_ckpt"))

# Use the scorer that was created together with `model.model` in the constructor:
# it shares its layers with the network whose weights were just restored.
# Calling `model._build_graph()` again would construct a *new* set of layers with
# freshly initialised weights, i.e. the checkpoint would be ignored when scoring.
scorer = model.scorer


In [None]:
# The analysis below runs on the training split (news catalogue + behaviour log).
news_file, behaviors_file = train_news_file, train_behaviors_file

In [None]:
# CSV with the generated headlines, located next to the notebook.
# os.path.join keeps this a valid relative path on every OS; a hard-coded ".\\" prefix
# is only a directory separator on Windows and becomes part of the file name elsewhere.
generated_csv = os.path.join(".", "generated_headlines_r1.csv")
results_df = pd.read_csv(generated_csv)

Updated CSV with cleaned headlines saved to .\generated_headlines_r1.csv


In [None]:
def _load_pickle(path):
    """Return the object stored in the pickle file at `path`."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Lookup tables used to turn words / categories / sub-categories into integer ids.
word_dict = _load_pickle(wordDict_file)
vert_dict = _load_pickle(vertDict_file)
subvert_dict = _load_pickle(subvertDict_file)

In [None]:
class HParams:
    """Fixed sequence lengths used when encoding inputs for the scorer.

    his_size   -- number of clicked articles kept per user history
    title_size -- number of token ids per title
    body_size  -- number of token ids per article body
    """

    his_size, title_size, body_size = 50, 30, 50


# NOTE: this rebinds the name `hparams`, replacing the object returned by
# prepare_hparams() earlier in the notebook; the sizes above must stay in sync
# with the configuration the model was built with.
hparams = HParams()

In [None]:
# Character clean-up applied before whitespace tokenisation: hyphens and apostrophes
# become spaces, the other listed characters are deleted.  The three escaped code points
# are "â", "€" and "™" (presumably the remains of a mis-decoded curly apostrophe).
_TOKENIZE_TABLE = str.maketrans("-'", "  ", '":,;.?\u00e2\u20ac\u2122')


def tokenize_text(text, max_length):
    """Encode `text` as a list of exactly `max_length` word ids.

    The text is stripped of a fixed set of punctuation characters, lower-cased and
    split on whitespace.  Each token is looked up in the global `word_dict`;
    unknown tokens map to 0.  The result is truncated to `max_length` tokens and
    right-padded with 0.

    Args:
        text: value to encode; non-string values are converted with `str()`.
            Missing values (None or a float NaN, as pandas uses for empty cells)
            are encoded as pure padding instead of the literal word "nan".
        max_length: number of ids to return.

    Returns:
        list[int]: word ids of length `max_length`.
    """
    # `text != text` is only true for NaN; without this guard str(NaN) == "nan"
    # would be tokenised as a real word.
    if text is None or (isinstance(text, float) and text != text):
        return [0] * max_length

    tokens = str(text).translate(_TOKENIZE_TABLE).lower().split()[:max_length]
    token_ids = [word_dict.get(token, 0) for token in tokens]
    return token_ids + [0] * (max_length - len(token_ids))

In [None]:
def convert_news(news_df):
    """Build a lookup from news id to the numeric features of that article.

    Each row of `news_df` is turned into a dict with the keys "title" and "body"
    (token ids produced by `tokenize_text`, sized by `hparams.title_size` and
    `hparams.body_size`) and "category" / "subcategory" (ids looked up in
    `vert_dict` / `subvert_dict`, 0 when the label is unknown).

    Returns:
        dict: news id -> feature dict; for repeated ids the last row wins.
    """

    def encode(record):
        # One article -> the four numeric fields.
        return {
            "title": tokenize_text(record["title"], hparams.title_size),
            "body": tokenize_text(record["abstract"], hparams.body_size),
            "category": vert_dict.get(record["category"], 0),
            "subcategory": subvert_dict.get(record["subcategory"], 0),
        }

    return {record["news_id"]: encode(record) for _, record in news_df.iterrows()}

In [None]:
# Column layout of the tab-separated news file (it has no header row).
_news_columns = [
    "news_id",
    "category",
    "subcategory",
    "title",
    "abstract",
    "url",
    "title_entities",
    "abstract_entities",
]

news_df = pd.read_csv(news_file, sep="\t", header=None, names=_news_columns)

In [None]:
# Numeric representation of every article, keyed by news id.
news_dict = convert_news(news_df)

# Column layout of the tab-separated behaviour log (it has no header row).
_behavior_columns = ["impression_id", "user_id", "timestamp", "history", "impressions"]
behaviors_df = pd.read_csv(behaviors_file, sep="\t", header=None, names=_behavior_columns)

In [None]:
def get_click_probability(user_id, news_title_tokens):
    """Score one candidate title for one user with the global `scorer`.

    The user's click history is taken from the first row of `behaviors_df` that
    matches `user_id` (empty when the user is unknown or the history is not a
    string).  Only the last `hparams.his_size` clicks are used, ids missing from
    `news_dict` are skipped, and the history is right-padded with all-zero
    articles up to `hparams.his_size`.

    Args:
        user_id: value compared against the "user_id" column of `behaviors_df`.
        news_title_tokens: `hparams.title_size` token ids of the candidate title.

    Returns:
        float: first element of the scorer's prediction.
    """
    his_size = hparams.his_size
    title_size = hparams.title_size
    body_size = hparams.body_size

    # Look up the raw click history of this user.
    history_ids = []
    matches = behaviors_df[behaviors_df["user_id"] == user_id]
    if not matches.empty:
        raw_history = matches["history"].values[0]
        if isinstance(raw_history, str):
            history_ids = raw_history.split()[-his_size:]

    # Resolve ids to encoded articles once, dropping unknown ids.
    known = [news_dict[news_id] for news_id in history_ids if news_id in news_dict]
    missing = his_size - len(known)  # <= 0 means no padding is needed

    titles = [article["title"] for article in known] + [[0] * title_size] * missing
    bodies = [article["body"] for article in known] + [[0] * body_size] * missing
    verts = [[article["category"]] for article in known] + [[0]] * missing
    subverts = [[article["subcategory"]] for article in known] + [[0]] * missing

    history_inputs = [
        np.array(titles).reshape(1, his_size, title_size),
        np.array(bodies).reshape(1, his_size, body_size),
        np.array(verts).reshape(1, his_size, 1),
        np.array(subverts).reshape(1, his_size, 1),
    ]

    # Only the title of the candidate is evaluated: body, category and
    # sub-category are fed as zero placeholders.
    candidate_inputs = [
        np.array(news_title_tokens).reshape(1, 1, title_size),
        np.zeros((1, 1, body_size)),
        np.zeros((1, 1, 1)),
        np.zeros((1, 1, 1)),
    ]

    prediction = scorer.predict(history_inputs + candidate_inputs)
    return float(prediction.flatten()[0])

In [None]:
# Score the original and the generated headline of every row for the row's user
# and store both click probabilities back into the same CSV file.
results_df = pd.read_csv(generated_csv)

original_probs = []
generated_probs = []

# Iterate over just the columns that are needed; this avoids building a Series per
# row (as iterrows does) and drops the previously unused article-id lookup.
for user_id, original_headline, cleaned_headline in zip(
    results_df["user_id"],
    results_df["original_headline"],
    results_df["cleaned_headline"],
):
    original_tokens = tokenize_text(original_headline, hparams.title_size)
    generated_tokens = tokenize_text(cleaned_headline, hparams.title_size)

    original_probs.append(get_click_probability(user_id, original_tokens))
    generated_probs.append(get_click_probability(user_id, generated_tokens))

results_df["original_click_prob"] = original_probs
results_df["generated_click_prob"] = generated_probs

# Overwrites the input file so later cells can reload the scores from disk.
results_df.to_csv(generated_csv, index=False)


In [None]:
# Overall comparison of generated vs. original headlines.
results_df = pd.read_csv(generated_csv)

_generated = results_df["generated_click_prob"]
_original = results_df["original_click_prob"]

# Absolute gain per row and its average.
results_df["diff"] = _generated - _original
mean_diff = results_df["diff"].mean()


def _percent_gain(row):
    """Relative gain of the generated score in percent; 0 when the original score is not positive."""
    baseline = row["original_click_prob"]
    if baseline > 0:
        return (row["generated_click_prob"] - baseline) / baseline * 100
    return 0


results_df["perc_improvement"] = results_df.apply(_percent_gain, axis=1)
percentage_improvement = results_df["perc_improvement"].mean()

# Share of rows (in percent) where the generated headline scores strictly higher.
win_rate = (_generated > _original).mean() * 100

print("Mean Click Probability Difference (Generated - Original):", mean_diff)
print("Average Percentage Improvement:", percentage_improvement, "%")
print("Win Rate (% of cases where generated headline wins):", win_rate, "%")


Mean Click Probability Difference (Generated - Original): 0.004471528778473536
Average Percentage Improvement: 0.859860333011391 %
Win Rate (% of cases where generated headline wins): 70.20833333333333 %


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Reload the scored headlines from disk for the similarity analysis.
csv_path = generated_csv
results_df = pd.read_csv(filepath_or_buffer=csv_path)

def compute_similarity(row):
    """Cosine similarity between the TF-IDF vectors of a row's two headlines.

    The vectorizer is fitted on just the two headlines of the row
    ("original_headline" and "cleaned_headline").

    Missing headlines (NaN / None) are treated as empty text, and a pair that
    yields no usable tokens at all gets similarity 0.0 instead of raising.

    Returns:
        float: similarity of the two headlines.
    """
    texts = [
        "" if pd.isna(value) else str(value)
        for value in (row["original_headline"], row["cleaned_headline"])
    ]
    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(texts)
    except ValueError as err:
        # scikit-learn raises "empty vocabulary" when neither text contains a token.
        if "empty vocabulary" in str(err):
            return 0.0
        raise
    return cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]

# Per-row similarity, then the average over all rows.
results_df["similarity"] = results_df.apply(compute_similarity, axis=1)
avg_similarity = results_df["similarity"].mean()

_similarity_message = "Average similarity between original and generated headlines: {:.4f}"
print(_similarity_message.format(avg_similarity))


Average similarity between original and generated headlines: 0.3146


In [None]:
# Break the headline comparison down by news category.
results_df = pd.read_csv(generated_csv)
news_tsv = news_file

# NOTE: this rebinds `news_df` (with slightly different names for the entity columns).
news_df = pd.read_csv(
    news_tsv,
    sep='\t',
    header=None,
    names=[
        "news_id", "category", "subcategory", "title", "abstract", "url", "entity_title", "entity_abstract"
    ],
)
news_df = news_df.fillna("")

# Attach the category of each scored article; rows without a match keep NaN.
merged_df = pd.merge(
    results_df,
    news_df[["news_id", "category"]],
    left_on="article_id",
    right_on="news_id",
    how="left",
)
merged_df["diff"] = merged_df["generated_click_prob"] - merged_df["original_click_prob"]


def _category_percent_gain(row):
    """Relative gain of the generated score in percent; 0 when the original score is not positive."""
    baseline = row["original_click_prob"]
    if baseline > 0:
        return (row["generated_click_prob"] - baseline) / baseline * 100
    return 0


merged_df["perc_improvement"] = merged_df.apply(_category_percent_gain, axis=1)

grouped = merged_df.groupby("category")

print("Metrics by Category:")
for category, group in grouped:
    mean_diff = group["diff"].mean()
    percentage_improvement = group["perc_improvement"].mean()
    win_rate = group["generated_click_prob"].gt(group["original_click_prob"]).mean() * 100

    print("Category:", category)
    print("  Mean Click Probability Difference (Generated - Original): {:.4f}".format(mean_diff))
    print("  Average Percentage Improvement: {:.2f}%".format(percentage_improvement))
    print("  Win Rate (% of cases where generated headline wins): {:.2f}%".format(win_rate))
