# **Getting the all-user Data + Cleaning**

In [1]:
import pandas as pd
import numpy as np
from lightfm.cross_validation import random_train_test_split


from itertools import product
from lightfm import LightFM
from lightfm.evaluation import auc_score


from scipy.sparse import csr_matrix
from tqdm import tqdm
from lightfm.data import Dataset
from scipy.sparse import coo_matrix

import scipy





# **Getting the Data + Cleaning**

In [None]:
column_names = ['Impression ID', 'User ID', 'Time', 'History', 'Impressions']
behaviors_df = pd.read_csv("MINDsmall_train/behaviors.tsv", sep='\t', names=column_names)

behaviors_df['Impressions'] = behaviors_df['Impressions'].str.split(' ')
behaviors_df = behaviors_df.explode('Impressions').rename(columns={'Impressions': 'News_ID'})
behaviors_df[['News_ID', 'News_Subcategory']] = behaviors_df['News_ID'].str.split('-', n=1, expand=True)
behaviors_df['News_ID'] = behaviors_df['News_ID'].astype(str)


news_column_names = ['News_ID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'Title Entities', 'Abstract Entities']
news_df = pd.read_csv("MINDsmall_train/news.tsv", sep='\t', names=news_column_names)
news_df['News_ID'] = news_df['News_ID'].astype(str)

merged_impression_df = behaviors_df.merge(news_df, on='News_ID', how='left')


# *Preparing all-user Data for Modeling: category+subcategory predictors*

In [8]:
grouped_df = merged_impression_df.groupby(['User ID', 'News_ID']).agg({
    'News_Subcategory': 'sum', 
    'Category': 'first', 
    'SubCategory': 'first',
}).reset_index()

grouped_df['News_Subcategory'] = grouped_df['News_Subcategory'].astype(float)
merged_impression_df['History'] = merged_impression_df['History'].apply(lambda x: x if isinstance(x, str) and x.strip() else None)

valid_histories = merged_impression_df['History'].dropna().unique()
user_features_list = [f"history_{history}" for history in valid_histories]
user_features_data = [
    (user_id, [f"history_{history}"])
    for user_id, history in merged_impression_df.groupby('User ID')['History'].first().items()
    if history is not None
]

dataset = Dataset()
dataset.fit(
    users=grouped_df['User ID'].unique(),
    items=grouped_df['News_ID'].unique(),
    user_features=user_features_list,
    item_features=list(set(grouped_df['Category']) | set(grouped_df['SubCategory']))
)

item_features = dataset.build_item_features(
    tqdm([(row['News_ID'], [row['Category'], row['SubCategory']]) for _, row in grouped_df.iterrows()], desc="Building Item Features")
)
user_features = dataset.build_user_features(
    tqdm(user_features_data, desc="Building User Features")
)
interactions, weights = dataset.build_interactions(
    tqdm([(row['User ID'], row['News_ID'], float(row['News_Subcategory'])) for _, row in grouped_df.iterrows()], desc="Building Interactions")
)

interactions = interactions.astype('float32')
weights = weights.astype('float32')
item_features = coo_matrix(item_features)
user_features = coo_matrix(user_features)

train, test = random_train_test_split(interactions, test_percentage=0.2) 
train_weights = weights.multiply(train > 0)
train_weights = coo_matrix(train_weights)

Building Item Features: 100%|██████████| 4988584/4988584 [00:09<00:00, 539757.72it/s]
Building User Features: 100%|██████████| 49108/49108 [00:00<00:00, 369069.95it/s]
Building Interactions: 100%|██████████| 4988584/4988584 [00:05<00:00, 952879.07it/s] 


# *Finding optimal parameters: Use category+Subcategory predictors*

In [None]:
n_components_range = [10, 20, 30]
loss_range = ['warp', 'bpr', 'warp-kos']
epoch_range = [10, 20, 30]
num_threads = 4

results = []

for n_components, loss, epochs in product(n_components_range, loss_range, epoch_range):
    print(f"Training model with n_components={n_components}, loss={loss}, epochs={epochs}")

    model = LightFM(no_components=n_components, loss=loss)
    model.fit(
        train,
        item_features=item_features,
        epochs=epochs,
        num_threads=num_threads,
        verbose=True
    )
    
    train_auc = auc_score(model, train, item_features=item_features).mean()
    test_auc = auc_score(model, test, item_features=item_features).mean()
    
    # Save the results
    results.append({
        'n_components': n_components,
        'loss': loss,
        'epochs': epochs,
        'train_auc': train_auc,
        'test_auc': test_auc
    })

results = sorted(results, key=lambda x: x['test_auc'], reverse=True)
for res in results:
    print(res)

In [None]:
#fitting model
model = LightFM(no_components=30, loss='warp')  

model.fit(
    interactions=train, 
    user_features=user_features,  
    item_features=item_features, 
    epochs=30,  
    num_threads=4  
)

<lightfm.lightfm.LightFM at 0x1817a8910>

In [None]:
auc_train = auc_score(model, train, user_features=user_features, item_features=item_features, num_threads=4).mean()

0.96387655

In [None]:
auc_test = auc_score(model, test, user_features=user_features, item_features=item_features, num_threads=4).mean()

0.92696345

In [None]:
#calculating Mrr
def calculate_mrr(model, interactions, item_features=None):
    if not isinstance(interactions, csr_matrix):
        interactions = csr_matrix(interactions)

    mrr = []
    for user_id in range(interactions.shape[0]):  
        scores = model.predict(user_id, np.arange(interactions.shape[1]), item_features=item_features)
        true_items = interactions[user_id].indices  
        ranked_items = np.argsort(-scores)
        ranks = np.where(np.isin(ranked_items, true_items))[0]
        if len(ranks) > 0:
            mrr.append(1.0 / (ranks[0] + 1))  
    return np.mean(mrr)


train = csr_matrix(train) 
test = csr_matrix(test)
train_mrr = calculate_mrr(model, train, item_features=item_features)
test_mrr = calculate_mrr(model, test, item_features=item_features)

print(f"Train MRR: {train_mrr:.4f}")
print(f"Test MRR: {test_mrr:.4f}")


#Calculating ndcg@5 and ndcg@10
def calculate_ndcg(model, interactions, k, item_features=None):
    if not isinstance(interactions, scipy.sparse.csr_matrix):
        interactions = interactions.tocsr()
    
    ndcg = []
    for user_id in range(interactions.shape[0]):  
        scores = model.predict(user_id, np.arange(interactions.shape[1]), item_features=item_features)
        true_items = interactions[user_id].toarray().flatten()  
        ranked_items = np.argsort(-scores)[:k]
   
        dcg = sum(
            (true_items[item] / np.log2(rank + 2))
            for rank, item in enumerate(ranked_items)
            if true_items[item] > 0
        )
      
        sorted_true_items = np.sort(true_items)[::-1][:k]
        idcg = sum(
            (rel / np.log2(rank + 2))
            for rank, rel in enumerate(sorted_true_items)
            if rel > 0
        )
        ndcg.append(dcg / idcg if idcg > 0 else 0.0)
    return np.mean(ndcg)


ndcg_at_5_train = calculate_ndcg(model, train, k=5, item_features=item_features)
ndcg_at_10_train = calculate_ndcg(model, train, k=10, item_features=item_features)

ndcg_at_5_test = calculate_ndcg(model, test, k=5, item_features=item_features)
ndcg_at_10_test = calculate_ndcg(model, test, k=10, item_features=item_features)

print(f"Train nDCG@5: {ndcg_at_5_train:.4f}, Train nDCG@10: {ndcg_at_10_train:.4f}")
print(f"Test nDCG@5: {ndcg_at_5_test:.4f}, Test nDCG@10: {ndcg_at_10_test:.4f}")

# *Preparing all-user Data for Modeling: category+subcategory+Titles predictors*

In [3]:
grouped_df = merged_impression_df.groupby(['User ID', 'News_ID']).agg({
    'News_Subcategory': 'sum', 
    'Category': 'first',      
    'SubCategory': 'first',    
    'Title': 'first',          
}).reset_index()


grouped_df['News_Subcategory'] = grouped_df['News_Subcategory'].astype(float)
merged_impression_df['History'] = merged_impression_df['History'].apply(lambda x: x if isinstance(x, str) and x.strip() else None)
valid_histories = merged_impression_df['History'].dropna().unique()
user_features_list = [f"history_{history}" for history in valid_histories]

user_features_data = [
    (user_id, [f"history_{history}"])
    for user_id, history in merged_impression_df.groupby('User ID')['History'].first().items()
    if history is not None
]

dataset = Dataset()
dataset.fit(
    users=grouped_df['User ID'].unique(),
    items=grouped_df['News_ID'].unique(),
    user_features=user_features_list,
    item_features=list(set(grouped_df['Category']) | set(grouped_df['SubCategory']) | set(grouped_df['Title']))
)

item_features = dataset.build_item_features([
    (
        row['News_ID'],
        [row['Category'], row['SubCategory'], row['Title']]
    )
    for _, row in grouped_df.iterrows()
])


user_features = dataset.build_user_features(
    tqdm(user_features_data, desc="Building User Features")
)
interactions, weights = dataset.build_interactions(
    tqdm([(row['User ID'], row['News_ID'], float(row['News_Subcategory'])) for _, row in grouped_df.iterrows()], desc="Building Interactions")
)


interactions = interactions.astype('float32')
weights = weights.astype('float32')
item_features = coo_matrix(item_features)
user_features = coo_matrix(user_features)


train, test = random_train_test_split(interactions, test_percentage=0.2) 
train_weights = weights.multiply(train > 0)
train_weights = coo_matrix(train_weights)


Building User Features: 100%|██████████| 49108/49108 [00:00<00:00, 382169.85it/s]
Building Interactions: 100%|██████████| 4988584/4988584 [00:05<00:00, 907533.74it/s] 


In [4]:
#fitting model
model = LightFM(no_components=30, loss='warp')  

model.fit(
    interactions=train, 
    user_features=user_features,  
    item_features=item_features, 
    epochs=30,  
    num_threads=4  
)

<lightfm.lightfm.LightFM at 0x1ca9c9e50>

In [5]:
auc_train = auc_score(model, train,  user_features=user_features, item_features=item_features, num_threads=4).mean()

0.9943037

In [6]:
auc_test = auc_score(model, test, user_features=user_features, item_features=item_features, num_threads=4).mean()

0.9882977

In [None]:
#calculating Mrr
def calculate_mrr(model, interactions, item_features=None):
    if not isinstance(interactions, csr_matrix):
        interactions = csr_matrix(interactions)

    mrr = []
    for user_id in range(interactions.shape[0]):  
        scores = model.predict(user_id, np.arange(interactions.shape[1]), item_features=item_features)
        true_items = interactions[user_id].indices  
        ranked_items = np.argsort(-scores)
        ranks = np.where(np.isin(ranked_items, true_items))[0]
        if len(ranks) > 0:
            mrr.append(1.0 / (ranks[0] + 1))  
    return np.mean(mrr)


train = csr_matrix(train) 
test = csr_matrix(test)
train_mrr = calculate_mrr(model, train, item_features=item_features)
test_mrr = calculate_mrr(model, test, item_features=item_features)

print(f"Train MRR: {train_mrr:.4f}")
print(f"Test MRR: {test_mrr:.4f}")


#Calculating ndcg@5 and ndcg@10
def calculate_ndcg(model, interactions, k, item_features=None):
    if not isinstance(interactions, scipy.sparse.csr_matrix):
        interactions = interactions.tocsr()
    
    ndcg = []
    for user_id in range(interactions.shape[0]):  
        scores = model.predict(user_id, np.arange(interactions.shape[1]), item_features=item_features)
        true_items = interactions[user_id].toarray().flatten()  
        ranked_items = np.argsort(-scores)[:k]
   
        dcg = sum(
            (true_items[item] / np.log2(rank + 2))
            for rank, item in enumerate(ranked_items)
            if true_items[item] > 0
        )
      
        sorted_true_items = np.sort(true_items)[::-1][:k]
        idcg = sum(
            (rel / np.log2(rank + 2))
            for rank, rel in enumerate(sorted_true_items)
            if rel > 0
        )
        ndcg.append(dcg / idcg if idcg > 0 else 0.0)
    return np.mean(ndcg)


ndcg_at_5_train = calculate_ndcg(model, train, k=5, item_features=item_features)
ndcg_at_10_train = calculate_ndcg(model, train, k=10, item_features=item_features)

ndcg_at_5_test = calculate_ndcg(model, test, k=5, item_features=item_features)
ndcg_at_10_test = calculate_ndcg(model, test, k=10, item_features=item_features)

print(f"Train nDCG@5: {ndcg_at_5_train:.4f}, Train nDCG@10: {ndcg_at_10_train:.4f}")
print(f"Test nDCG@5: {ndcg_at_5_test:.4f}, Test nDCG@10: {ndcg_at_10_test:.4f}")

# No-User History: Cleaning and Splitting Data

In [2]:
column_names = ['Impression ID', 'User ID', 'Time', 'History', 'Impressions',]
behaviors_df = pd.read_csv("MINDsmall_train/behaviors.tsv", sep='\t', names=column_names)
behaviors_df['History'] = behaviors_df['History'].str.split(' ')
behaviors_df = behaviors_df.explode('History').rename(columns={'History': 'News_ID'})


news_column_names = ['News_ID', 'Category', 'SubCategory', 'Title', 'Abstract', 'URL', 'Title Entities', 'Abstract Entities']
news_df = pd.read_csv("MINDsmall_train/news.tsv", sep='\t', names=news_column_names)
merged_df = behaviors_df.merge(news_df, on='News_ID', how='left')
merged_df = merged_df.drop_duplicates(subset=["User ID",'News_ID'])



Nan_df = merged_df[merged_df["News_ID"].isna()]
Nan_df1 = Nan_df.assign(Impressions=Nan_df['Impressions'].str.split()).explode('Impressions')
Nan_df1['News_ID'] = Nan_df1['Impressions'].str.split('-').str[0]
Nan_df1['Impressions'] = Nan_df1['Impressions'].str.split('-').str[1]

merged_null_df = Nan_df1.merge(news_df, on='News_ID', how='left')
merged_null_df = merged_null_df.loc[:, ~merged_null_df.columns.str.endswith('_x')]
Nan_df = merged_df[merged_df["News_ID"].isna()]
Nan_df1 = Nan_df.assign(Impressions=Nan_df['Impressions'].str.split()).explode('Impressions')

Nan_df1['News_ID'] = Nan_df1['Impressions'].str.split('-').str[0]
Nan_df1['Impressions'] = Nan_df1['Impressions'].str.split('-').str[1]


merged_null_df = Nan_df1.merge(news_df, on='News_ID', how='left')
merged_null_df = merged_null_df.loc[:, ~merged_null_df.columns.str.endswith('_x')]

# *Preparing no history user Data for Modeling: category+subcategory predictors*

In [None]:
grouped_df = merged_null_df.groupby(['User ID', 'News_ID']).agg({
    'Impressions': 'sum',  
    'Category_y': 'first',  
    'SubCategory_y': 'first',
}).reset_index()

grouped_df['Impressions'] = grouped_df['Impressions'].astype(float)
dataset = Dataset()
dataset.fit(
    users=grouped_df['User ID'].unique(),
    items=grouped_df['News_ID'].unique(),
    user_features=None, 
    item_features=list(set(grouped_df['Category_y']) | set(grouped_df['SubCategory_y']))
)
item_features = dataset.build_item_features(
    [(row['News_ID'], [str(row['Category_y']), str(row['SubCategory_y'])]) for _, row in grouped_df.iterrows()]
)
interactions, weights = dataset.build_interactions(
    [(row['User ID'], row['News_ID'], float(row['Impressions'])) for _, row in grouped_df.iterrows()]
)


interactions = interactions.astype('float32')
weights = weights.astype('float32')
item_features = coo_matrix(item_features)


train, test = random_train_test_split(interactions, test_percentage=0.2) 
train_weights = weights.multiply(train > 0) 
train_weights = coo_matrix(train_weights)

In [None]:
#fitting model
n_components =10
loss = 'warp'
epoch = 30
num_thread = 4
model = LightFM(no_components=n_components, loss=loss)

model.fit(
    train,
    item_features=item_features,
    epochs=epoch,
    num_threads=num_thread,
    verbose=True
)

Epoch: 100%|██████████| 30/30 [00:00<00:00, 63.26it/s]


<lightfm.lightfm.LightFM at 0x185b23d90>

In [None]:
auc_train = auc_score(model, train, item_features=item_features, num_threads=4).mean()

0.89423525

In [None]:
auc_test = auc_score(model, test,item_features=item_features, num_threads=4).mean()

0.8077622

In [None]:
#Calculating mrr 
def calculate_mrr(model, interactions, item_features=None):
    if not isinstance(interactions, csr_matrix):
        interactions = csr_matrix(interactions)

    mrr = []
    for user_id in range(interactions.shape[0]):  
        scores = model.predict(user_id, np.arange(interactions.shape[1]), item_features=item_features)
        true_items = interactions[user_id].indices  
        ranked_items = np.argsort(-scores)
        ranks = np.where(np.isin(ranked_items, true_items))[0]
        if len(ranks) > 0:
            mrr.append(1.0 / (ranks[0] + 1))  
    return np.mean(mrr)


train = csr_matrix(train) 
test = csr_matrix(test)
train_mrr = calculate_mrr(model, train, item_features=item_features)
test_mrr = calculate_mrr(model, test, item_features=item_features)

print(f"Train MRR: {train_mrr:.4f}")
print(f"Test MRR: {test_mrr:.4f}")


#Calculating ndcg@5 and ndcg@10

def calculate_ndcg(model, interactions, k, item_features=None):
    if not isinstance(interactions, scipy.sparse.csr_matrix):
        interactions = interactions.tocsr()
    
    ndcg = []
    for user_id in range(interactions.shape[0]):  
        scores = model.predict(user_id, np.arange(interactions.shape[1]), item_features=item_features)
        true_items = interactions[user_id].toarray().flatten()  
        ranked_items = np.argsort(-scores)[:k]
   
        dcg = sum(
            (true_items[item] / np.log2(rank + 2))
            for rank, item in enumerate(ranked_items)
            if true_items[item] > 0
        )
      
        sorted_true_items = np.sort(true_items)[::-1][:k]
        idcg = sum(
            (rel / np.log2(rank + 2))
            for rank, rel in enumerate(sorted_true_items)
            if rel > 0
        )
        ndcg.append(dcg / idcg if idcg > 0 else 0.0)
    return np.mean(ndcg)


ndcg_at_5_train = calculate_ndcg(model, train, k=5, item_features=item_features)
ndcg_at_10_train = calculate_ndcg(model, train, k=10, item_features=item_features)

ndcg_at_5_test = calculate_ndcg(model, test, k=5, item_features=item_features)
ndcg_at_10_test = calculate_ndcg(model, test, k=10, item_features=item_features)

print(f"Train nDCG@5: {ndcg_at_5_train:.4f}, Train nDCG@10: {ndcg_at_10_train:.4f}")
print(f"Test nDCG@5: {ndcg_at_5_test:.4f}, Test nDCG@10: {ndcg_at_10_test:.4f}")

Train MRR: 0.3842
Test MRR: 0.0861
Train nDCG@5: 0.1895, Train nDCG@10: 0.1749
Test nDCG@5: 0.0290, Test nDCG@10: 0.0306


# *Preparing no history user Data for Modeling: category+subcategory+Titles predictors*

In [None]:
grouped_df = merged_null_df.groupby(['User ID', 'News_ID']).agg({
    'Impressions': 'sum',  
    'Category_y': 'first',        
    'SubCategory_y': 'first',    
    'Title_y': 'first',     
}).reset_index()
grouped_df['Impressions'] = grouped_df['Impressions'].astype(float)

dataset = Dataset()
dataset.fit(
    users=grouped_df['User ID'].unique(),
    items=grouped_df['News_ID'].unique(),
    item_features=list(set(grouped_df['Category_y']) | set(grouped_df['SubCategory_y']) | set(grouped_df['Title_y']))
)
item_features = dataset.build_item_features(
    tqdm([
        (
            row['News_ID'],
            [row['Category_y'], row['SubCategory_y'], row['Title_y']]
        )
        for _, row in grouped_df.iterrows()
    ], desc="Building Item Features")
)

interactions, weights = dataset.build_interactions(
    tqdm([(row['User ID'], row['News_ID'], float(row['Impressions'])) for _, row in grouped_df.iterrows()], desc="Building Interactions")
)
interactions = interactions.astype('float32')
weights = weights.astype('float32')
item_features = coo_matrix(item_features)

train, test = random_train_test_split(interactions, test_percentage=0.2) 
train_weights = weights.multiply(train > 0)  
train_weights = coo_matrix(train_weights)


Building Item Features: 100%|██████████| 33621/33621 [00:00<00:00, 449227.94it/s]
Building Interactions: 100%|██████████| 33621/33621 [00:00<00:00, 896488.18it/s]


In [None]:
#fitting model
n_components =30
loss = 'warp'
epoch = 30
num_thread = 4
model = LightFM(no_components=n_components, loss=loss)

model.fit(
    train,
    item_features=item_features,
    epochs=epoch,
    num_threads=num_thread,
    verbose=True
)

Epoch: 100%|██████████| 30/30 [00:01<00:00, 29.11it/s]


<lightfm.lightfm.LightFM at 0x1816e0590>

In [None]:
auc_train = auc_score(model, train, item_features=item_features, num_threads=4).mean()

0.9896907

In [None]:
auc_test = auc_score(model, test, item_features=item_features, num_threads=4).mean()

0.9497407

In [12]:
#calculating mrr
def calculate_mrr(model, interactions, item_features=None):
    if not isinstance(interactions, csr_matrix):
        interactions = csr_matrix(interactions)

    mrr = []
    for user_id in range(interactions.shape[0]):  
        scores = model.predict(user_id, np.arange(interactions.shape[1]), item_features=item_features)
        true_items = interactions[user_id].indices  
        ranked_items = np.argsort(-scores)
        ranks = np.where(np.isin(ranked_items, true_items))[0]
        if len(ranks) > 0:
            mrr.append(1.0 / (ranks[0] + 1))  
    return np.mean(mrr)


train = csr_matrix(train) 
test = csr_matrix(test)
train_mrr = calculate_mrr(model, train, item_features=item_features)
test_mrr = calculate_mrr(model, test, item_features=item_features)

print(f"Train MRR: {train_mrr:.4f}")
print(f"Test MRR: {test_mrr:.4f}")


#Calculating ndcg@5 and ndcg@10
def calculate_ndcg(model, interactions, k, item_features=None):
    if not isinstance(interactions, scipy.sparse.csr_matrix):
        interactions = interactions.tocsr()
    
    ndcg = []
    for user_id in range(interactions.shape[0]):  
        scores = model.predict(user_id, np.arange(interactions.shape[1]), item_features=item_features)
        true_items = interactions[user_id].toarray().flatten()  
        ranked_items = np.argsort(-scores)[:k]
   
        dcg = sum(
            (true_items[item] / np.log2(rank + 2))
            for rank, item in enumerate(ranked_items)
            if true_items[item] > 0
        )
      
        sorted_true_items = np.sort(true_items)[::-1][:k]
        idcg = sum(
            (rel / np.log2(rank + 2))
            for rank, rel in enumerate(sorted_true_items)
            if rel > 0
        )
        ndcg.append(dcg / idcg if idcg > 0 else 0.0)
    return np.mean(ndcg)


ndcg_at_5_train = calculate_ndcg(model, train, k=5, item_features=item_features)
ndcg_at_10_train = calculate_ndcg(model, train, k=10, item_features=item_features)

ndcg_at_5_test = calculate_ndcg(model, test, k=5, item_features=item_features)
ndcg_at_10_test = calculate_ndcg(model, test, k=10, item_features=item_features)

print(f"Train nDCG@5: {ndcg_at_5_train:.4f}, Train nDCG@10: {ndcg_at_10_train:.4f}")
print(f"Test nDCG@5: {ndcg_at_5_test:.4f}, Test nDCG@10: {ndcg_at_10_test:.4f}")

Train MRR: 0.8340
Test MRR: 0.2530
Train nDCG@5: 0.7002, Train nDCG@10: 0.6925
Test nDCG@5: 0.1158, Test nDCG@10: 0.1499
