In [1]:
# first import
import os
import random

import numpy as np
import pandas as pd

## Load data

In [2]:
#then have news and behavior data ready
# read news data
news = pd.read_csv(
    "C:/Users/ryan0/OneDrive/Obsidian_Lib/11_GWU/13_24FA/CSCI_6365_A_ML/20240908_AML_HW1/Dataset/archive/MINDsmall_train/news.tsv", 
    sep="\t",
    names=["itemId", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"]
)
news.head(2)

Unnamed: 0,itemId,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."


In [3]:
# read behaviours data
raw_behaviour = pd.read_csv(
    "C:/Users/ryan0/OneDrive/Obsidian_Lib/11_GWU/13_24FA/CSCI_6365_A_ML/20240908_AML_HW1/Dataset/archive/MINDsmall_train/behaviors.tsv", 
    sep="\t",
    names=["impressionId","userId","timestamp","click_history","impressions"])
raw_behaviour.head(2)

Unnamed: 0,impressionId,userId,timestamp,click_history,impressions
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...


In [4]:
# Drop unnecessary columns
news = news.drop(columns=["url", "title_entities", "abstract_entities", "abstract"])

# Check the first two rows after dropping columns
news.head(2)


Unnamed: 0,itemId,category,subcategory,title
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an..."
1,N19639,health,weightloss,50 Worst Habits For Belly Fat


In [5]:

# Map subcategories to unique IDs
unique_subcategories = news['subcategory'].unique()
subcategory_to_id = {subcategory: idx + 1 for idx, subcategory in enumerate(unique_subcategories)}
news['subcategoryId'] = news['subcategory'].map(subcategory_to_id)


subcatid_to_news = news.groupby('subcategoryId')['itemId'].apply(list).to_dict()
news_to_subcatid = news.set_index('itemId')['subcategoryId'].to_dict()


## Process behavior data

In [6]:
# Function to process impressions
def process_impression(s):
    list_of_strings = s.split(" ")
    itemid_rel_tuple = [l.split("-") for l in list_of_strings]
    noclicks = []
    clicks = []
    for entry in itemid_rel_tuple:
        if entry[1] == '0':
            noclicks.append(entry[0])
        elif entry[1] == '1':
            clicks.append(entry[0])
    return noclicks, clicks

# Apply the function to the 'impressions' column
raw_behaviour[['noclicks', 'clicks']] = raw_behaviour['impressions'].apply(lambda x: pd.Series(process_impression(x)))

# Process 'click_history' column
raw_behaviour['click_history'] = raw_behaviour['click_history'].apply(lambda x: x.split(" ") if pd.notna(x) else [])

# Display the dataset
print(f"The dataset consists of {len(raw_behaviour)} interactions after processing.")


The dataset consists of 156965 interactions after processing.


In [7]:
# Function to aggregate user preferences
def aggregate_user_preferences(group):
    click_history = sum(group['click_history'], [])
    clicks = sum(group['clicks'], [])
    noclicks = sum(group['noclicks'], [])
    return pd.Series({
        'history': click_history,
        'clicks': clicks,
        'noclicks': noclicks
    })

# Group by 'userId' and aggregate
preference = raw_behaviour.groupby('userId').apply(aggregate_user_preferences).reset_index()


preference.sample(5)#randomly sample 5 rows



  preference = raw_behaviour.groupby('userId').apply(aggregate_user_preferences).reset_index()


Unnamed: 0,userId,history,clicks,noclicks
4010,U16689,"[N1150, N55189, N941, N46091, N32098, N49262, ...","[N41387, N3894, N45428, N60272, N47576, N16439...","[N3841, N16931, N48416, N44698, N39683, N35094..."
29004,U59310,"[N20121, N848, N16715, N63554, N47020, N19679,...","[N33677, N20527, N55689, N53585, N38779, N43502]","[N33885, N58114, N22407, N6890, N42977, N20678..."
39200,U76422,"[N5373, N55189, N60209, N6233, N55743, N14607,...","[N20678, N23446, N4689, N38779]","[N64745, N31978, N36184, N7670, N6390, N14436,..."
48374,U91897,"[N55829, N30150, N2028, N51591, N38643, N54304...","[N29961, N11830]","[N64968, N13486, N14523, N33212, N43083, N3573..."
508,U10805,"[N42458, N20068, N42137, N477, N16617, N10919,...","[N50004, N41113]","[N42143, N59673, N5048, N47817, N47149, N59088..."


## Compute preference scores and apply softmax

In [8]:
# Softmax function
def softmax(x):
    exp_x = np.exp(x - np.max(x)) 
    return exp_x / exp_x.sum()

# preference score with softmax
def calculate_user_preference_score_with_softmax(row, news_to_subcatid):
    subcat_preference = {}
    for news_id in row['history']:
        subcat_id = news_to_subcatid.get(news_id)
        if subcat_id is not None:
            subcat_preference[subcat_id] = subcat_preference.get(subcat_id, 0) + 1
    # Apply softmax
    if subcat_preference:
        subcat_ids = list(subcat_preference.keys())
        scores = np.array(list(subcat_preference.values()))
        softmax_scores = softmax(scores)
        subcat_preference = dict(zip(subcat_ids, softmax_scores))
    return subcat_preference


preference['subcat_preference_score'] = preference.apply(
    lambda row: calculate_user_preference_score_with_softmax(row, news_to_subcatid), axis=1
)


preference[['userId', 'subcat_preference_score']].sample(5)


Unnamed: 0,userId,subcat_preference_score
36183,U71451,"{24: 0.125, 3: 0.125, 76: 0.125, 40: 0.125, 11..."
31024,U62744,"{6: 0.013430219925312731, 95: 0.01343021992531..."
15793,U36687,"{88: 5.622983918023546e-08, 17: 0.499664565505..."
8446,U24209,"{3: 0.0024424271470241486, 6: 0.00244242714702..."
24207,U51241,"{32: 0.9999999999921687, 2: 4.658886145066913e..."


## Content-based recommender system

In [9]:

def recommend_news_content_based(user_id, preference, news_to_subcatid, news, top_n=1000):
    # Get scores
    user_pref_row = preference[preference['userId'] == user_id]
    if user_pref_row.empty:
        return []
    user_preference = user_pref_row['subcat_preference_score'].values[0]
    # news already viewed
    user_viewed_news = set(user_pref_row['history'].values[0])
    unseen_news = news[~news['itemId'].isin(user_viewed_news)].copy()
    #unseen news by subcategory
    unseen_news['subcat_id'] = unseen_news['itemId'].map(news_to_subcatid)
    # news in preferred subcategories
    candidate_news = unseen_news[unseen_news['subcat_id'].isin(user_preference.keys())].copy()
    candidate_news['score'] = candidate_news['subcat_id'].map(user_preference)
    # Sort by score
    top_recommendations = candidate_news.sort_values(by='score', ascending=False).head(top_n)
    return list(zip(top_recommendations['itemId'], top_recommendations['score']))


input_userid = 'U2'  #test
top_content_recommendations = recommend_news_content_based(
    input_userid, preference, news_to_subcatid, news, top_n=1000
)


print(f"Top-{len(top_content_recommendations)} content-based recommended news for {input_userid}:")
for news_id, score in top_content_recommendations:
    print(f"News ID: {news_id}, Score: {score}")


Top-1000 content-based recommended news for U2:
News ID: N9292, Score: 0.9805555660504806
News ID: N51093, Score: 0.9805555660504806
News ID: N43994, Score: 0.9805555660504806
News ID: N24762, Score: 0.9805555660504806
News ID: N15509, Score: 0.9805555660504806
News ID: N20094, Score: 0.9805555660504806
News ID: N55273, Score: 0.9805555660504806
News ID: N45840, Score: 0.9805555660504806
News ID: N43928, Score: 0.9805555660504806
News ID: N39715, Score: 0.9805555660504806
News ID: N41097, Score: 0.9805555660504806
News ID: N8040, Score: 0.9805555660504806
News ID: N37086, Score: 0.9805555660504806
News ID: N55348, Score: 0.9805555660504806
News ID: N5775, Score: 0.9805555660504806
News ID: N44172, Score: 0.9805555660504806
News ID: N26252, Score: 0.9805555660504806
News ID: N36345, Score: 0.9805555660504806
News ID: N1502, Score: 0.9805555660504806
News ID: N59244, Score: 0.9805555660504806
News ID: N13743, Score: 0.9805555660504806
News ID: N32091, Score: 0.9805555660504806
News ID: N

## Randomly pick 100 users and calculate recall and precision

In [13]:
def evaluate_content_based_recommendations(selected_user_ids, preference, news_to_subcatid, news, top_n=100):
    recall_scores = []
    precision_scores = []

    for user_id in selected_user_ids:
        recommended_news = recommend_news_content_based(user_id, preference, news_to_subcatid, news, top_n=top_n)
        if not recommended_news:
            continue
        recommended_news_ids = set(news_id for news_id, score in recommended_news)
        
        # Retrieve actual clicks for the user
        user_data = preference[preference['userId'] == user_id]
        if user_data.empty:
            continue
        
        actual_clicked_news_ids = set(user_data['clicks'].values[0]) if user_data['clicks'].values[0] else set()
        if not actual_clicked_news_ids:
            continue
        
        # Calculate recall and precision
        num_relevant = len(recommended_news_ids & actual_clicked_news_ids)
        recall = num_relevant / len(actual_clicked_news_ids) if actual_clicked_news_ids else 0
        precision = num_relevant / len(recommended_news_ids) if recommended_news_ids else 0
        
        recall_scores.append(recall)
        precision_scores.append(precision)

    # Calculate average recall and precision
    average_recall = sum(recall_scores) / len(recall_scores) if recall_scores else 0
    average_precision = sum(precision_scores) / len(precision_scores) if precision_scores else 0

    print(f"Average Recall over {len(recall_scores)} users: {average_recall:.4f}")
    print(f"Average Precision over {len(precision_scores)} users: {average_precision:.4f}")
    
    return average_recall, average_precision

# Example usage
selected_user_ids = random.sample(list(preference['userId']), 100)
evaluate_content_based_recommendations(selected_user_ids, preference, news_to_subcatid, news, top_n=100)


Average Recall over 98 users: 0.0017
Average Precision over 98 users: 0.0001


(0.0017006802721088435, 0.00010204081632653062)