In [1]:
# first import the module
import os
import random
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# then have the news and behavior data ready
# read news data
# The dataset folder is machine-specific: set the MIND_TRAIN_DIR environment
# variable to your own MINDsmall_train directory. The fallback is the path the
# notebook was originally written against.
news_path = Path(os.environ.get(
    "MIND_TRAIN_DIR",
    "C:/Users/ryan0/OneDrive/Obsidian_Lib/11_GWU/13_24FA/CSCI_6365_A_ML/20240908_AML_HW1/Dataset/archive/MINDsmall_train",
)) / "news.tsv"
news = pd.read_csv(
    news_path,
    sep="\t",
    # Column names are supplied explicitly, so the first line of the file is read as data.
    names=["itemId", "category", "subcategory", "title", "abstract", "url", "title_entities", "abstract_entities"]
)
news.head(2)

Unnamed: 0,itemId,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."


In [3]:
# read behaviours data
# The dataset folder is machine-specific: set the MIND_TRAIN_DIR environment
# variable to your own MINDsmall_train directory. The fallback is the path the
# notebook was originally written against.
behaviors_path = Path(os.environ.get(
    "MIND_TRAIN_DIR",
    "C:/Users/ryan0/OneDrive/Obsidian_Lib/11_GWU/13_24FA/CSCI_6365_A_ML/20240908_AML_HW1/Dataset/archive/MINDsmall_train",
)) / "behaviors.tsv"
raw_behaviour = pd.read_csv(
    behaviors_path,
    sep="\t",
    # Column names are supplied explicitly, so the first line of the file is read as data.
    names=["impressionId","userId","timestamp","click_history","impressions"])
raw_behaviour.head(2)

Unnamed: 0,impressionId,userId,timestamp,click_history,impressions
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...


In [4]:
# Give every distinct subcategory a 1-based integer ID, numbered in the order
# the subcategories first appear in the news table.
unique_subcategories = news['subcategory'].unique()
subcategory_to_id = dict(
    zip(unique_subcategories, range(1, len(unique_subcategories) + 1))
)
news['subcategoryId'] = news['subcategory'].map(subcategory_to_id)

# Lookup tables in both directions:
#   news_to_subcatid: article ID     -> subcategory ID
#   subcatid_to_news: subcategory ID -> list of article IDs
news_to_subcatid = news.set_index('itemId')['subcategoryId'].to_dict()
subcatid_to_news = news.groupby('subcategoryId')['itemId'].apply(list).to_dict()


---
Next, process the behaviors data.


In [5]:
# Function to process impressions
def process_impression(s):
    """Split an impression string into non-clicked and clicked item IDs.

    Each whitespace-separated token is expected to look like
    ``<itemId>-<label>``; label ``'0'`` goes to the no-click list and label
    ``'1'`` to the click list.

    Parameters
    ----------
    s : str
        Raw impression string, e.g. ``"N55689-1 N35729-0"``.

    Returns
    -------
    tuple[list[str], list[str]]
        ``(noclicks, clicks)``, each in the order the tokens appear in ``s``.
        Non-string input (e.g. NaN for a missing value), empty strings,
        tokens without a ``-`` separator and tokens with any other label
        contribute nothing instead of raising.
    """
    noclicks = []
    clicks = []
    if not isinstance(s, str):
        return noclicks, clicks

    # split() without an argument ignores leading/trailing whitespace and
    # collapses repeated spaces, so no empty tokens are produced.
    for token in s.split():
        # The label is whatever follows the LAST dash.
        item_id, separator, label = token.rpartition("-")
        if not separator or not item_id:
            continue  # malformed token: nothing to record
        if label == '0':
            noclicks.append(item_id)
        elif label == '1':
            clicks.append(item_id)
    return noclicks, clicks

# Apply the function to the 'impressions' column
# Each row is parsed once into a (noclicks, clicks) tuple and the two columns
# are then read off the tuples; wrapping every row in its own pd.Series is
# much slower on a frame of this size.
parsed_impressions = raw_behaviour['impressions'].map(process_impression)
raw_behaviour['noclicks'] = parsed_impressions.map(lambda pair: pair[0])
raw_behaviour['clicks'] = parsed_impressions.map(lambda pair: pair[1])

# Process 'click_history' column
# Strings are split into ID lists and missing values become empty lists.
# Values that are already lists are kept as they are, so re-running this cell
# does not fail on its own output.
raw_behaviour['click_history'] = raw_behaviour['click_history'].apply(
    lambda x: x.split() if isinstance(x, str) else (x if isinstance(x, list) else [])
)

# Display the dataset
print(f"The dataset consists of {len(raw_behaviour)} interactions after processing.")


The dataset consists of 156965 interactions after processing.


In [6]:
# user preferences
def aggregate_user_preferences(group):
    """Merge all interaction rows of one user into a single record.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to one user. Must provide the columns
        ``click_history``, ``clicks`` and ``noclicks``, each holding a list
        of item IDs per row.

    Returns
    -------
    pandas.Series
        Entries ``history``, ``clicks`` and ``noclicks``; each is the
        row-order concatenation of the corresponding per-row lists.

    Notes
    -----
    NOTE(review): ``click_history`` is concatenated once per row, so an item
    is repeated in ``history`` whenever it appears on several rows of the
    same user. Confirm this weighting is intended.
    """
    def _flatten(column):
        # Single pass over the nested lists. sum(lists, []) produces the same
        # result but copies the growing list again for every row.
        return [item for items in group[column] for item in items]

    return pd.Series({
        'history': _flatten('click_history'),
        'clicks': _flatten('clicks'),
        'noclicks': _flatten('noclicks'),
    })

# Group by 'userId' and aggregate
# Only the three list columns are handed to the aggregator. Applying it to the
# whole frame also passes the grouping column, which newer pandas versions
# warn about (the warning is visible in this cell's saved output).
preference = (
    raw_behaviour
    .groupby('userId')[['click_history', 'clicks', 'noclicks']]
    .apply(aggregate_user_preferences)
    .reset_index()
)


preference.sample(5)#randomly sample 5 rows



  preference = raw_behaviour.groupby('userId').apply(aggregate_user_preferences).reset_index()


Unnamed: 0,userId,history,clicks,noclicks
27487,U56815,"[N46222, N3615, N46392, N19435, N52551, N53948...","[N9621, N60992, N33425]","[N38779, N23446, N16844, N46821, N36226, N1181..."
46723,U89064,"[N46133, N10059, N39117, N11177, N16715, N4573...","[N41020, N287, N64094]","[N57327, N30518, N38309, N34998, N41354, N6139..."
36651,U72218,"[N24356, N13137, N37047, N27154, N53033, N2805...","[N55689, N35729, N49180, N64632, N14592]","[N47035, N57090, N27581, N6693, N10121, N32544..."
30687,U62173,"[N8277, N8148, N46811, N1936, N33291, N18884, ...","[N12028, N63550, N5075, N42457, N17115, N64482]","[N15435, N35729, N15462, N13341, N16280, N5568..."
12818,U31619,"[N54416, N19084, N29523, N47143, N46392, N5629...","[N23446, N33576, N41020, N24176]","[N6477, N38779, N47098, N29212, N7494, N51195,..."


In [7]:
# softmax function
def softmax(x):
    """Return the softmax of ``x``, normalised over all of its elements.

    Parameters
    ----------
    x : array-like of numbers
        Input scores (list or NumPy array).

    Returns
    -------
    numpy.ndarray
        Float array of the same shape as ``x`` whose elements sum to 1.
        An empty input yields an empty array instead of raising.
    """
    x = np.asarray(x, dtype=float)
    if x.size == 0:
        # np.max raises on an empty array; there is nothing to normalise.
        return x
    # Subtracting the maximum keeps np.exp from overflowing on large scores
    # and does not change the result.
    exp_x = np.exp(x - np.max(x))
    return exp_x / exp_x.sum()

# preference score with softmax
def calculate_user_preference_score_with_softmax(row, news_to_subcatid):
    """Turn a user's reading history into softmax-weighted subcategory scores.

    Parameters
    ----------
    row : mapping
        Must provide ``row['history']``, an iterable of news IDs.
    news_to_subcatid : dict
        Maps a news ID to its subcategory ID. IDs missing from the mapping
        are ignored.

    Returns
    -------
    dict
        Subcategory ID -> softmax of that subcategory's occurrence count in
        the history. Empty when no history item could be mapped.
    """
    counts = {}
    for article_id in row['history']:
        category = news_to_subcatid.get(article_id)
        if category is None:
            continue  # article unknown to the mapping
        counts[category] = counts.get(category, 0) + 1

    if not counts:
        return counts

    # Softmax is applied to the raw counts; keys and values stay aligned
    # because both come from the same dict in insertion order.
    weights = softmax(np.array(list(counts.values())))
    return dict(zip(list(counts.keys()), weights))


# Score every user row; the news -> subcategory mapping is forwarded to the
# scoring function as its second positional argument.
preference['subcat_preference_score'] = preference.apply(
    calculate_user_preference_score_with_softmax,
    axis=1,
    args=(news_to_subcatid,),
)

print("User subcategory preferences with softmax-applied scores:")
display(preference[['userId', 'subcat_preference_score']].sample(5))


User subcategory preferences with softmax-applied scores:


Unnamed: 0,userId,subcat_preference_score
34666,U68914,"{1: 1.0, 90: 2.8625185805493937e-20, 5: 2.8625..."
17691,U39876,"{30: 0.12956251432964971, 40: 0.12956251432964..."
19997,U43966,"{34: 0.06478125716482484, 95: 0.17609371417587..."
40157,U78024,{}
25709,U53815,"{26: 0.09629992399711591, 13: 0.01303277748981..."


In [8]:

# Every subcategory ID that occurs in at least one user's preference dict, in ascending order
all_subcategories = sorted(set().union(*preference['subcat_preference_score']))
# Position of each subcategory ID inside the dense preference vector
subcat_id_to_index = dict(zip(all_subcategories, range(len(all_subcategories))))

# Convert preference scores to vectors
def preference_to_vector(subcat_preference_score):
    """Expand a sparse ``{subcategory ID: score}`` dict into a dense vector.

    The vector has one slot per entry of the module-level ``all_subcategories``
    list; slot positions come from ``subcat_id_to_index``. Subcategories absent
    from the dict stay at 0.
    """
    dense = np.zeros(len(all_subcategories))
    for category in subcat_preference_score:
        dense[subcat_id_to_index[category]] = subcat_preference_score[category]
    return dense

# One dense vector per user, stored as an object column
preference['preference_vector'] = preference['subcat_preference_score'].map(preference_to_vector)
preference['preference_vector'].sample(5)

33370    [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...
44148    [0.0, 0.0, 3.051381467639314e-07, 0.0, 0.0, 0....
32260    [1.2995380160172727e-24, 0.0, 1.29953801601727...
46089    [0.0, 0.0, 0.7112345942275939, 0.0, 0.0, 0.0, ...
29898    [0.0, 0.0, 0.03208504508582291, 0.0, 0.0320850...
Name: preference_vector, dtype: object

In [9]:

# Stack the per-user vectors into one 2-D array, one row per user in `preference` order
user_vectors = np.vstack(preference['preference_vector'].to_numpy())

# Compute similarity
# Pairwise cosine similarity between all rows; the result has one row and one
# column per user, so its size grows quadratically with the number of users.
user_similarity_matrix = cosine_similarity(user_vectors)

In [10]:
def find_top_k_similar_users(input_userid, preference, user_similarity_matrix, k=50000):
    """Return the ``k`` users most similar to ``input_userid``.

    Parameters
    ----------
    input_userid : str
        ID of the query user.
    preference : pandas.DataFrame
        Must contain a ``userId`` column. Row ``i`` (by position) must
        correspond to row/column ``i`` of ``user_similarity_matrix``.
    user_similarity_matrix : numpy.ndarray
        Square user-by-user similarity matrix.
    k : int, default 50000
        Maximum number of similar users to return.

    Returns
    -------
    pandas.DataFrame
        Columns ``userId`` and ``similarity``, ordered from most to least
        similar, never containing the query user. ``similarity`` is the
        softmax of the query user's similarity row (taken over all users,
        the query user included), which rescales the values but keeps their
        ranking. If the user is unknown a message is printed and an empty
        DataFrame is returned.
    """
    # Work with row POSITIONS rather than index labels: both the similarity
    # matrix and the .iloc lookup below are positional, so this stays correct
    # even when `preference` does not carry a default 0..n-1 index.
    matches = np.flatnonzero((preference['userId'] == input_userid).to_numpy())
    if matches.size == 0:
        print(f"User {input_userid} not found.")
        return pd.DataFrame()
    input_user_idx = matches[0]

    sim_scores = softmax(user_similarity_matrix[input_user_idx])

    # Rank everybody from most to least similar, then remove the query user by
    # position. Dropping the first ranked entry is not sufficient: when other
    # users tie with the query user (identical vectors, or an all-zero vector
    # whose similarities are all equal) the query user need not be ranked first.
    ranked = sim_scores.argsort()[::-1]
    similar_user_indices = ranked[ranked != input_user_idx][:max(k, 0)]

    similar_users = preference.iloc[similar_user_indices][['userId']].copy()
    similar_users['similarity'] = sim_scores[similar_user_indices]

    return similar_users.reset_index(drop=True)

input_userid = 'U2'  #test user
# Look up the 500 nearest neighbours of the test user
top_k_users = find_top_k_similar_users(
    input_userid,
    preference,
    user_similarity_matrix,
    k=500,
)

display(top_k_users)


Unnamed: 0,userId,similarity
0,U74198,0.000045
1,U84099,0.000045
2,U62685,0.000045
3,U8012,0.000045
4,U90262,0.000045
...,...,...
495,U29479,0.000045
496,U84068,0.000045
497,U29706,0.000045
498,U83331,0.000045


In [15]:
def recommend_news_user_based(input_userid, preference, top_k_users, top_n=1000):
    """Recommend news clicked by similar users that the target user has not read.

    Parameters
    ----------
    input_userid : str
        ID of the user to recommend for.
    preference : pandas.DataFrame
        Must contain the columns ``userId``, ``history`` and ``clicks``
        (``history`` and ``clicks`` hold lists of news IDs).
    top_k_users : pandas.DataFrame
        Neighbours of the target user with columns ``userId`` and
        ``similarity``. May be empty.
    top_n : int, default 1000
        Maximum number of recommendations to return.

    Returns
    -------
    list[tuple[str, float]]
        ``(news_id, score)`` pairs sorted by descending score. A news item's
        score is the sum of the similarities of the neighbours who clicked
        it, counted once per occurrence in their click lists. Items in the
        target user's ``history`` are excluded. Returns ``[]`` when there are
        no neighbours or the target user is not in ``preference``; neighbours
        missing from ``preference`` are skipped.
    """
    if top_k_users is None or top_k_users.empty:
        return []

    #news already viewed
    target_history = preference.loc[preference['userId'] == input_userid, 'history']
    if target_history.empty:
        return []
    user_viewed_news = set(target_history.iloc[0])

    # Build the neighbour -> clicks lookup once, rather than filtering the
    # whole frame again for every neighbour. setdefault keeps the first row if
    # a userId were ever duplicated.
    neighbour_rows = preference[preference['userId'].isin(top_k_users['userId'])]
    clicks_by_user = {}
    for user_id, clicks in zip(neighbour_rows['userId'], neighbour_rows['clicks']):
        clicks_by_user.setdefault(user_id, clicks)

    news_recommendation_score = {}
    for similar_user_id, similarity_score in zip(top_k_users['userId'], top_k_users['similarity']):
        for news_id in clicks_by_user.get(similar_user_id, ()):
            if news_id not in user_viewed_news:  # Exclude news already viewed
                news_recommendation_score[news_id] = news_recommendation_score.get(news_id, 0) + similarity_score

    # Sort recommendations by score and return top N
    # (the sort is stable, so equal scores keep first-seen order)
    top_recommendations = sorted(news_recommendation_score.items(), key=lambda x: x[1], reverse=True)[:top_n]

    return top_recommendations


# Score candidate news for the test user from its nearest neighbours
top_user_based_recommendations = recommend_news_user_based(
    input_userid,
    preference,
    top_k_users,
    top_n=1000,
)

# Display
# One header line, then one line per recommended item, best score first.
print(f"Top-{len(top_user_based_recommendations)} user-based recommended news for {input_userid}:")
for news_id, score in top_user_based_recommendations:
    print(f"News ID: {news_id}, Score: {score}")


Top-1000 user-based recommended news for U2:
News ID: N55689, Score: 0.004561663032244054
News ID: N7821, Score: 0.0017162695054389328
News ID: N33619, Score: 0.0014452793438828355
News ID: N40839, Score: 0.0014001144673420362
News ID: N35729, Score: 0.0013549493473560294
News ID: N52622, Score: 0.0012194545424283074
News ID: N53585, Score: 0.0011742894076828833
News ID: N62360, Score: 0.0009484646098137426
News ID: N56193, Score: 0.0009484645062373029
News ID: N38779, Score: 0.0009032995798982229
News ID: N58363, Score: 0.000903299561795117
News ID: N4642, Score: 0.0008129695909124944
News ID: N51048, Score: 0.0008129695601207655
News ID: N59685, Score: 0.0007678046014872998
News ID: N42977, Score: 0.0007678045898739665
News ID: N61768, Score: 0.0007226397744688305
News ID: N49685, Score: 0.0007226397261328133
News ID: N6477, Score: 0.0007226396280822154
News ID: N32544, Score: 0.0006323097679570951
News ID: N39317, Score: 0.0006323097627985197
News ID: N18708, Score: 0.00063230966090

In [14]:
def evaluate_user_based_recommendations(selected_user_ids, preference, user_similarity_matrix, top_n=100, k=100):
    """Average recall/precision of user-based recommendations over a set of users.

    For each user, the ``k`` most similar users are looked up, up to ``top_n``
    news items are recommended from them, and the recommendations are compared
    with the user's own ``clicks``.

    Parameters
    ----------
    selected_user_ids : iterable of str
        Users to evaluate.
    preference : pandas.DataFrame
        Must contain ``userId`` and ``clicks`` (plus whatever the neighbour
        search and the recommender read from it).
    user_similarity_matrix : numpy.ndarray
        User-by-user similarity matrix aligned with ``preference`` rows.
    top_n : int, default 100
        Maximum recommendations per user.
    k : int, default 100
        Number of neighbours per user.

    Returns
    -------
    tuple
        ``(average_recall, average_precision, average_num_recommendations)``.
        A user is skipped when it is unknown, has no clicks, or receives no
        recommendations. All three averages are taken over the same set of
        evaluated users; each is 0 when no user could be evaluated.
    """
    selected_user_ids = list(selected_user_ids)

    # userId -> clicked news, built once instead of filtering the frame for
    # every evaluated user. setdefault keeps the first row per userId.
    selected_rows = preference[preference['userId'].isin(selected_user_ids)]
    clicks_by_user = {}
    for uid, clicks in zip(selected_rows['userId'], selected_rows['clicks']):
        clicks_by_user.setdefault(uid, clicks)

    def _mean(values):
        # Plain average that returns 0 for an empty list
        return sum(values) / len(values) if values else 0

    recall_scores = []
    precision_scores = []
    num_recommendations_list = []

    for user_id in selected_user_ids:
        # Find top-k sim user (prints a message and yields an empty frame for unknown users)
        top_k_users = find_top_k_similar_users(user_id, preference, user_similarity_matrix, k=k)
        if top_k_users.empty:
            continue

        # Get actual clicked news by the user; without ground truth there is nothing to score
        actual_clicked_news_ids = set(clicks_by_user.get(user_id, ()))
        if not actual_clicked_news_ids:
            continue

        # Generate recommendations
        recommended_news = recommend_news_user_based(user_id, preference, top_k_users, top_n=top_n)
        if not recommended_news:
            continue

        # Convert recommended news to a set of IDs
        recommended_news_ids = {news_id for news_id, _ in recommended_news}

        # Calculate the number of relevant recommendations
        num_relevant = len(recommended_news_ids & actual_clicked_news_ids)

        # Both denominators are non-empty here thanks to the guards above
        recall_scores.append(num_relevant / len(actual_clicked_news_ids))
        precision_scores.append(num_relevant / len(recommended_news_ids))
        # Counted only for users that were actually scored, so all three
        # averages describe the same population
        num_recommendations_list.append(len(recommended_news))

    # Calculate average
    average_recall = _mean(recall_scores)
    average_precision = _mean(precision_scores)
    average_num_recommendations = _mean(num_recommendations_list)

    return average_recall, average_precision, average_num_recommendations

# Example 100 user
# The sample is drawn from a dedicated, seeded generator so the metrics below
# are the same on every run; the global `random` state is left untouched.
# The sample size is capped at the number of available users so that small
# datasets do not raise.
EVAL_SEED = 42
eval_user_pool = list(preference['userId'])
selected_user_ids = random.Random(EVAL_SEED).sample(eval_user_pool, min(100, len(eval_user_pool)))


average_recall, average_precision, average_num_recommendations = evaluate_user_based_recommendations(
    selected_user_ids, preference, user_similarity_matrix, top_n=100, k=100
)


print(f"Average Recall: {average_recall:.4f}")
print(f"Average Precision: {average_precision:.4f}")
print(f"Average number of news recommended per user: {average_num_recommendations:.2f}")


Average Recall: 0.4021
Average Precision: 0.0175
Average number of news recommended per user: 100.00
