In [1]:
import ast
import json

import numpy as np
import pandas as pd
import plotly.express as px



In [10]:
# Read the two headerless, tab-separated training tables from the parent folder
behaviors_train, news_train = (
    pd.read_csv(tsv_path, sep='\t', header=None)
    for tsv_path in (
        '../Datasets/MIND-small/MINDsmall_train/behaviors.tsv',
        '../Datasets/MIND-small/MINDsmall_train/news.tsv',
    )
)

In [12]:
def clean_behaviors_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of a raw behaviors frame.

    The input must have exactly five columns, which are named
    ``impression_id``, ``user_id``, ``impression_time``, ``history`` and
    ``impressions`` (in that order).

    Transformations applied:
      * ``impression_time`` is parsed with ``pd.to_datetime``.
      * ``history`` becomes a list of whitespace-separated tokens
        (missing history -> empty list).
      * ``impressions`` becomes a list of ``(news_id, click_status)`` tuples,
        where each token is ``<news_id>-<int>`` (missing -> empty list).

    The caller's frame is NOT modified, and values that are already parsed
    (lists) are passed through, so re-running the cleaning cell on an
    already-cleaned frame does not crash.

    Args:
        df: Raw behaviors table with five positional columns.

    Returns:
        A new DataFrame with named columns and parsed values.

    Raises:
        ValueError: If the frame does not have five columns, or an impression
            token is not of the form ``<news_id>-<int>``.
    """
    # Copy first: renaming/reassigning columns on `df` itself would silently
    # alter the raw frame the caller still holds.
    cleaned = df.copy()
    cleaned.columns = ["impression_id", "user_id", "impression_time", "history", "impressions"]

    cleaned["impression_time"] = pd.to_datetime(cleaned["impression_time"])

    def parse_history(value):
        # "".split() is already [], so the empty string needs no special case.
        return value.split() if isinstance(value, str) else list(value)

    def parse_impressions(value):
        if not isinstance(value, str):
            return list(value)  # already parsed on a previous run
        pairs = []
        for token in value.split():
            # Split once, on the last dash, instead of splitting twice.
            news_id, clicked = token.rsplit("-", 1)
            pairs.append((news_id, int(clicked)))
        return pairs

    cleaned["history"] = cleaned["history"].fillna("").apply(parse_history)
    cleaned["impressions"] = cleaned["impressions"].fillna("").apply(parse_impressions)

    return cleaned


def clean_news_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of a raw news frame.

    The input must have exactly eight columns, which are named ``news_id``,
    ``category``, ``subcategory``, ``title``, ``abstract``, ``url``,
    ``title_entities`` and ``abstract_entities`` (in that order).

    Transformations applied:
      * ``title_entities`` / ``abstract_entities`` are decoded from their
        serialized text form into Python lists (missing -> empty list).
      * ``url`` is cast to ``str``.

    The caller's frame is NOT modified, and entity values that are already
    lists are passed through so the function can be re-applied safely.

    Args:
        df: Raw news table with eight positional columns.

    Returns:
        A new DataFrame with named columns and decoded entity lists.

    Raises:
        ValueError / SyntaxError: If an entity cell is neither valid JSON nor
            a plain Python literal.
    """

    def parse_entities(value):
        if not isinstance(value, str):
            return value  # already decoded on a previous run
        # SECURITY: these strings come from a data file, so they must never be
        # passed to eval(). Decode as JSON; fall back to literal_eval, which
        # accepts Python-literal syntax (e.g. single quotes) but cannot run code.
        try:
            return json.loads(value)
        except json.JSONDecodeError:
            return ast.literal_eval(value)

    # Copy first so the raw frame held by the caller is not altered.
    cleaned = df.copy()
    cleaned.columns = [
        "news_id", "category", "subcategory", "title",
        "abstract", "url", "title_entities", "abstract_entities"
    ]

    for column in ("title_entities", "abstract_entities"):
        cleaned[column] = cleaned[column].fillna("[]").apply(parse_entities)

    # Ensure the URL column holds strings
    cleaned["url"] = cleaned["url"].astype(str)

    return cleaned

In [13]:
# Normalize both raw tables, then preview the cleaned impression log
behaviors_train, news_train = (
    clean_behaviors_data(behaviors_train),
    clean_news_data(news_train),
)
behaviors_train.head(n=5)

Unnamed: 0,impression_id,user_id,impression_time,history,impressions
0,1,U13740,2019-11-11 09:05:58,"[N55189, N42782, N34694, N45794, N18445, N6330...","[(N55689, 1), (N35729, 0)]"
1,2,U91836,2019-11-12 18:11:30,"[N31739, N6072, N63045, N23979, N35656, N43353...","[(N20678, 0), (N39317, 0), (N58114, 0), (N2049..."
2,3,U73700,2019-11-14 07:01:48,"[N10732, N25792, N7563, N21087, N41087, N5445,...","[(N50014, 0), (N23877, 0), (N35389, 0), (N4971..."
3,4,U34670,2019-11-11 05:28:05,"[N45729, N2203, N871, N53880, N41375, N43142, ...","[(N35729, 0), (N33632, 0), (N49685, 1), (N2758..."
4,5,U8125,2019-11-12 16:11:21,"[N10078, N56514, N14904, N33740]","[(N39985, 0), (N36050, 0), (N16096, 0), (N8400..."


In [None]:
# Preview the first rows of the cleaned news table
news_train.head(n=5)


Unnamed: 0,news_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{'Label': 'Prince Philip, Duke of Edinburgh',...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{'Label': 'Adipose tissue', 'Type': 'C', 'Wik...","[{'Label': 'Adipose tissue', 'Type': 'C', 'Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{'Label': 'Ukraine', 'Type': 'G', 'WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{'Label': 'National Basketball Association', ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{'Label': 'Skin tag', 'Type': 'C', 'WikidataI...","[{'Label': 'Skin tag', 'Type': 'C', 'WikidataI..."


In [15]:
def load_vec_file(filepath: str) -> pd.DataFrame:
    """Load a whitespace-delimited ``.vec`` embedding file into a DataFrame.

    Each data line is ``<id> <v0> <v1> ...``. If the first line consists of
    exactly two integer tokens it is treated as a header and skipped. Blank
    lines anywhere in the file (e.g. a trailing empty line) are ignored
    instead of raising ``IndexError``.

    Args:
        filepath: Path to the UTF-8 encoded ``.vec`` file.

    Returns:
        DataFrame indexed by ID (index name ``"ID"``) with one float column
        per vector component. An empty file yields an empty DataFrame.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a vector component cannot be parsed as a float.
    """
    words = []
    vectors = []

    with open(filepath, 'r', encoding='utf-8') as file:
        for line_number, line in enumerate(file):
            parts = line.split()
            if not parts:
                continue  # blank / whitespace-only line carries no vector

            # Only the very first line can be the "<count> <dim>" header.
            is_header = (
                line_number == 0
                and len(parts) == 2
                and all(token.isdigit() for token in parts)
            )
            if is_header:
                continue

            words.append(parts[0])
            vectors.append(np.array(parts[1:], dtype=float))

    # Create DataFrame from vectors
    vector_df = pd.DataFrame(vectors, index=words)
    vector_df.index.name = "ID"

    return vector_df



In [16]:
# Read the relation and entity embedding tables shipped with the training split
relation_vectors_train, entity_vectors_train = map(
    load_vec_file,
    (
        '../Datasets/MIND-small/MINDsmall_train/relation_embedding.vec',
        '../Datasets/MIND-small/MINDsmall_train/entity_embedding.vec',
    ),
)

In [18]:
# Preview the first rows of the entity embedding table
entity_vectors_train.head(n=5)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Q41,-0.063388,-0.181451,0.057501,-0.091254,-0.076217,-0.052525,0.0505,-0.224871,-0.018145,0.030722,...,-0.051949,0.001861,0.124535,-0.151043,-0.263698,-0.103607,0.020007,-0.101157,-0.091567,0.035234
Q1860,0.060958,0.069934,0.015832,0.079471,-0.023362,-0.125007,-0.043618,0.134063,-0.121691,0.089166,...,-0.070713,-0.014287,0.013578,0.099977,0.012199,-0.141138,0.056129,-0.133727,0.025795,0.051448
Q39631,-0.093106,-0.052002,0.020556,-0.020801,0.04318,-0.072321,0.00091,0.028156,0.176303,0.035396,...,-0.124472,-0.08684,-0.078992,-0.062712,0.051117,-0.184307,0.127637,-0.144866,0.04469,0.013498
Q30,-0.115737,-0.179113,0.102739,-0.112469,-0.101853,-0.177516,0.01586,-0.092626,0.086708,0.05785,...,0.005893,0.080511,-8.5e-05,-0.089968,-0.083486,-0.149992,-0.053031,-0.136071,-0.029001,0.174155
Q60,-0.051036,-0.165637,0.132802,-0.089949,-0.146637,-0.142246,0.103853,-0.129651,0.096265,0.017288,...,-0.002713,0.078628,0.003711,-0.058953,-0.154067,-0.117159,-0.031614,-0.140451,0.001288,0.14035


In [None]:
def map_news_to_vectors(news_df: pd.DataFrame, entity_vectors_train: pd.DataFrame) -> pd.DataFrame:
    """
    Maps each news item to an aggregated vector based on entity embeddings.

    Two columns are added to ``news_df`` IN PLACE (the same object is also
    returned):
      * ``all_entities``: the ``"WikidataId"`` values found in
        ``title_entities`` followed by those found in ``abstract_entities``.
      * ``news_vector``: the mean of the embedding rows for those IDs that
        exist in ``entity_vectors_train``; a zero vector of the embedding
        width when none of them do.

    Args:
        news_df: Frame with ``title_entities`` and ``abstract_entities``
            columns, each cell holding a list of entity dicts.
        entity_vectors_train: Embedding table indexed by entity ID, one
            column per vector component.

    Returns:
        ``news_df`` itself, with the two columns added.
    """
    # Helper function to extract entity IDs from entity lists
    def extract_entity_ids(entity_list):
        return [entity["WikidataId"] for entity in entity_list if "WikidataId" in entity]

    # Extract and combine entity IDs from title and abstract
    news_df["all_entities"] = (
        news_df["title_entities"].apply(extract_entity_ids)
        + news_df["abstract_entities"].apply(extract_entity_ids)
    )

    # Build the ID -> vector lookup once. Per-entity `.loc` label lookups are
    # far slower and, when the index contains a duplicated ID, return a 2-D
    # block that breaks the pooled vector's shape. With a dict, a duplicated
    # ID simply resolves to its last row.
    embedding_dim = entity_vectors_train.shape[1]
    vector_by_id = dict(zip(entity_vectors_train.index, entity_vectors_train.to_numpy()))

    # Compute aggregated entity vectors (mean pooling)
    def get_entity_vectors(entity_ids):
        vectors = [vector_by_id[entity_id] for entity_id in entity_ids if entity_id in vector_by_id]
        return np.mean(vectors, axis=0) if vectors else np.zeros(embedding_dim)

    news_df["news_vector"] = news_df["all_entities"].apply(get_entity_vectors)

    return news_df
