<h1>PERSONALIZED NEWS RECOMMENDATION SYSTEM</h1>

In [214]:
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader
from surprise import SVD, KNNBasic
from gensim.models import Word2Vec
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, SimpleRNN, LSTM, Dense, Dot, Activation
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping
from surprise.model_selection import train_test_split as surprise_split
from surprise import accuracy
from sklearn.metrics import roc_auc_score, average_precision_score, ndcg_score

In [2]:
news_columns = [
    'news_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities'
]

In [3]:
news = pd.read_csv('news.tsv', sep='\t', names=news_columns)

In [4]:
news.head()

Unnamed: 0,news_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [5]:
news.tail()

Unnamed: 0,news_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
51277,N16909,weather,weathertopstories,"Adapting, Learning And Soul Searching: Reflect...",Woolsey Fire Anniversary: A community is forev...,https://assets.msn.com/labs/mind/BBWzQJK.html,"[{""Label"": ""Woolsey Fire"", ""Type"": ""N"", ""Wikid...","[{""Label"": ""Woolsey Fire"", ""Type"": ""N"", ""Wikid..."
51278,N47585,lifestyle,lifestylefamily,Family says 13-year-old Broadway star died fro...,,https://assets.msn.com/labs/mind/BBWzQYV.html,"[{""Label"": ""Broadway theatre"", ""Type"": ""F"", ""W...",[]
51279,N7482,sports,more_sports,St. Dominic soccer player tries to kick cancer...,"Sometimes, what happens on the sidelines can b...",https://assets.msn.com/labs/mind/BBWzQnK.html,[],[]
51280,N34418,sports,soccer_epl,How the Sounders won MLS Cup,"Mark, Jeremiah and Casey were so excited they ...",https://assets.msn.com/labs/mind/BBWzQuK.html,"[{""Label"": ""MLS Cup"", ""Type"": ""U"", ""WikidataId...",[]
51281,N44276,autos,autossports,Best Sports Car Deals for October,,https://assets.msn.com/labs/mind/BBy5rVe.html,"[{""Label"": ""Peugeot RCZ"", ""Type"": ""V"", ""Wikida...",[]


In [6]:
news.shape

(51282, 8)

In [7]:
news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51282 entries, 0 to 51281
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   news_id            51282 non-null  object
 1   category           51282 non-null  object
 2   subcategory        51282 non-null  object
 3   title              51282 non-null  object
 4   abstract           48616 non-null  object
 5   url                51282 non-null  object
 6   title_entities     51279 non-null  object
 7   abstract_entities  51278 non-null  object
dtypes: object(8)
memory usage: 3.1+ MB


In [8]:
behaviors_columns = [
    'impression_id', 'user_id', 'time', 'history', 'impressions'
]

In [9]:
behaviors = pd.read_csv('behaviors.tsv', sep='\t', names=behaviors_columns)

In [10]:
behaviors.head()

Unnamed: 0,impression_id,user_id,time,history,impressions
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...


In [11]:
behaviors.tail()

Unnamed: 0,impression_id,user_id,time,history,impressions
156960,156961,U21593,11/14/2019 10:24:05 PM,N7432 N58559 N1954 N43353 N14343 N13008 N28833...,N2235-0 N22975-0 N64037-0 N47652-0 N11378-0 N4...
156961,156962,U10123,11/13/2019 6:57:04 AM,N9803 N104 N24462 N57318 N55743 N40526 N31726 ...,N3841-0 N61571-0 N58813-0 N28213-0 N4428-0 N25...
156962,156963,U75630,11/14/2019 10:58:13 AM,N29898 N59704 N4408 N9803 N53644 N26103 N812 N...,N55913-0 N62318-0 N53515-0 N10960-0 N9135-0 N5...
156963,156964,U44625,11/13/2019 2:57:02 PM,N4118 N47297 N3164 N43295 N6056 N38747 N42973 ...,N6219-0 N3663-0 N31147-0 N58363-0 N4107-0 N457...
156964,156965,U64800,11/14/2019 3:25:49 PM,N22997 N48742,N61233-0 N33828-1 N19661-0 N41934-0


In [12]:
behaviors.shape

(156965, 5)

In [13]:
behaviors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156965 entries, 0 to 156964
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   impression_id  156965 non-null  int64 
 1   user_id        156965 non-null  object
 2   time           156965 non-null  object
 3   history        153727 non-null  object
 4   impressions    156965 non-null  object
dtypes: int64(1), object(4)
memory usage: 6.0+ MB


In [14]:
news.isnull().sum()

news_id                 0
category                0
subcategory             0
title                   0
abstract             2666
url                     0
title_entities          3
abstract_entities       4
dtype: int64

In [15]:
behaviors.isnull().sum()

impression_id       0
user_id             0
time                0
history          3238
impressions         0
dtype: int64

In [16]:
news['abstract'] = news['abstract'].fillna('')

In [17]:
behaviors['history'] = behaviors['history'].fillna('')

In [18]:
news['full_text'] = news['title'] + ' ' + news['abstract']

In [19]:
news['category'].value_counts()

category
news             15774
sports           14510
finance           3107
foodanddrink      2551
lifestyle         2479
travel            2350
video             2068
weather           2048
health            1885
autos             1639
tv                 889
music              769
movies             606
entertainment      587
kids                17
middleeast           2
northamerica         1
Name: count, dtype: int64

In [20]:
news['category'].nunique()

17

In [21]:
news['subcategory'].value_counts()

subcategory
newsus                    6564
football_nfl              5420
newspolitics              2826
newscrime                 2254
weathertopstories         2047
                          ... 
newsvideo                    1
travel                       1
lifestylehoroscopefish       1
baseball                     1
celebhub                     1
Name: count, Length: 264, dtype: int64

In [22]:
news['subcategory'].nunique()

264

In [23]:
behaviors['user_id'].value_counts()

user_id
U32146    62
U15740    44
U20833    41
U51286    40
U44201    40
          ..
U60416     1
U20588     1
U84385     1
U89164     1
U72015     1
Name: count, Length: 50000, dtype: int64

In [24]:
behaviors['user_id'].nunique()

50000

In [25]:
parsed_interactions = []
for index, row in tqdm(behaviors.iterrows(), total=behaviors.shape[0]):
    user_id = row['user_id']
    time = row['time']
    history = row['history']
    impression_list = row['impressions'].split()
    for impression in impression_list:
        parts = impression.split('-')
        news_id = parts[0]
        clicked = int(parts[1])
        parsed_interactions.append(
            {
                'user_id': user_id,
                'news_id': news_id,
                'clicked': clicked,
                'time': time,
                'history': history
            }
        )

100%|██████████| 156965/156965 [00:03<00:00, 43480.62it/s]


In [26]:
interacions = pd.DataFrame(parsed_interactions)

In [27]:
interacions.head()

Unnamed: 0,user_id,news_id,clicked,time,history
0,U13740,N55689,1,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...
1,U13740,N35729,0,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...
2,U91836,N20678,0,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...
3,U91836,N39317,0,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...
4,U91836,N58114,0,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...


In [28]:
interacions.shape

(5843444, 5)

In [29]:
interacions['time'] = pd.to_datetime(interacions['time'], format='%m/%d/%Y %I:%M:%S %p')

In [30]:
interacions.sort_values(by='time', inplace=True)

In [31]:
interacions.reset_index(drop=True, inplace=True)

In [32]:
interacions.head()

Unnamed: 0,user_id,news_id,clicked,time,history
0,U65916,N57099,0,2019-11-09 00:00:19,N51706 N40767 N12096 N9798 N38802 N54827 N5780...
1,U65916,N50329,0,2019-11-09 00:00:19,N51706 N40767 N12096 N9798 N38802 N54827 N5780...
2,U65916,N20602,0,2019-11-09 00:00:19,N51706 N40767 N12096 N9798 N38802 N54827 N5780...
3,U65916,N18546,0,2019-11-09 00:00:19,N51706 N40767 N12096 N9798 N38802 N54827 N5780...
4,U65916,N6868,0,2019-11-09 00:00:19,N51706 N40767 N12096 N9798 N38802 N54827 N5780...


In [33]:
interacions.tail()

Unnamed: 0,user_id,news_id,clicked,time,history
5843439,U82996,N46917,0,2019-11-14 23:59:13,N39556 N22279 N56461 N33393 N6233 N33617 N4943...
5843440,U82996,N27737,0,2019-11-14 23:59:13,N39556 N22279 N56461 N33393 N6233 N33617 N4943...
5843441,U82996,N6837,0,2019-11-14 23:59:13,N39556 N22279 N56461 N33393 N6233 N33617 N4943...
5843442,U82996,N61233,0,2019-11-14 23:59:13,N39556 N22279 N56461 N33393 N6233 N33617 N4943...
5843443,U82996,N14478,0,2019-11-14 23:59:13,N39556 N22279 N56461 N33393 N6233 N33617 N4943...


In [34]:
split = int(len(interacions)*0.8)

In [35]:
train = interacions.iloc[:split]
test = interacions.iloc[split:]

In [36]:
len(interacions)

5843444

In [37]:
len(train)

4674755

In [38]:
len(test)

1168689

In [39]:
train['clicked'].value_counts(normalize=True)

clicked
0    0.959084
1    0.040916
Name: proportion, dtype: float64

In [40]:
train_positives = train[train['clicked'] == 1]
train_negatives = train[train['clicked'] == 0]

In [41]:
num_positives = len(train_positives)

In [42]:
num_positives

191271

In [43]:
len(train_negatives)

4483484

In [44]:
train_negatives_sampled = train_negatives.sample(n=num_positives, random_state=42)

In [45]:
train_balanced = pd.concat([train_positives, train_negatives_sampled])
train_balanced = train_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

In [46]:
len(train_balanced)

382542

In [47]:
train = train_balanced.copy()

In [48]:
train['clicked'].value_counts(normalize=True)

clicked
0    0.5
1    0.5
Name: proportion, dtype: float64

## POPULARITY BASED RECOMMENDER (Baseline)

In [49]:
popular_articles = train[train['clicked'] == 1]['news_id'].value_counts()

In [50]:
popular_articles_data = popular_articles.reset_index()

In [51]:
popular_articles_data.columns = ['news_id', 'click_count']

In [52]:
popular_articles_data.head()

Unnamed: 0,news_id,click_count
0,N55689,4316
1,N35729,3346
2,N33619,3246
3,N53585,2835
4,N63970,2578


In [53]:
test_with_popularity = test.merge(popular_articles_data, on='news_id', how='left')

In [54]:
test_with_popularity['click_count'] = test_with_popularity['click_count'].fillna(0)

In [55]:
test_with_popularity[['user_id', 'news_id', 'clicked', 'click_count']].head()

Unnamed: 0,user_id,news_id,clicked,click_count
0,U47606,N50107,0,31.0
1,U47606,N60272,0,313.0
2,U47606,N58086,0,6.0
3,U47606,N38215,0,315.0
4,U47606,N61787,1,46.0


In [56]:
evaluation_results = {}

In [57]:
def calculate_metrics(group):
    if len(group['clicked'].unique()) < 2:
        return np.nan, np.nan, np.nan
    
    y_true = group['clicked'].values
    y_score = group['score'].values

    k_5 = min(5, len(y_true))
    k_10 = min(10, len(y_true))

    y_true_2d = [y_true]
    y_score_2d = [y_score]

    auc = roc_auc_score(y_true, y_score)
    map_score = average_precision_score(y_true, y_score)

    ndcg_at_5 = ndcg_score(y_true_2d, y_score_2d, k=k_5)
    ndcg_at_10 = ndcg_score(y_true_2d, y_score_2d, k=k_10)

    return auc, map_score, ndcg_at_5, ndcg_at_10

In [58]:
pop_test_df = test_with_popularity.rename(columns={'click_count': 'score'})

In [59]:
pop_metrics = pop_test_df.groupby(['user_id', 'time']).apply(calculate_metrics)

  pop_metrics = pop_test_df.groupby(['user_id', 'time']).apply(calculate_metrics)


In [60]:
pop_metrics.dropna(inplace=True)

In [61]:
pop_metrics_data = pd.DataFrame(pop_metrics.tolist(), columns=['AUC', 'MAP', 'NDCG@5', 'NDCG@10'])

In [62]:
evaluation_results['Popularity'] = pop_metrics_data.mean()

In [63]:
evaluation_results

{'Popularity': AUC        0.488288
 MAP        0.182673
 NDCG@5     0.198044
 NDCG@10    0.261494
 dtype: float64}

## CONTENT-BASED FILTERING

### TF-IDF

In [64]:
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=20000,
    min_df=5
)

In [65]:
tfidf_data = tfidf.fit_transform(news['full_text'])

In [66]:
tfidf_data.shape

(51282, 18646)

In [67]:
news_id_to_index = pd.Series(news.index, index=news['news_id'])

In [68]:
##sample_user_id = train['user_id'].iloc[0]

In [69]:
##user_history_ids = train[
##    (train['user_id'] == sample_user_id) & (train['clicked'] == 1)
##]['news_id'].unique()

In [70]:
##sample_user_id

In [71]:
'''if len(user_history_ids) > 0:
    last_liked_article_id = user_history_ids[-1]
    last_liked_article_index = news_id_to_index[last_liked_article_id]
    article_vector = tfidf_data[last_liked_article_index]
    cosine = cosine_similarity(article_vector, tfidf_data)

    similar_article_scores = list(enumerate(cosine[0]))
    sorted_scores = sorted(similar_article_scores, key=lambda x: x[1], reverse=True)[1:11]

    similar_article_indices = [i[0] for i in sorted_scores]
    recommended_news_ids = news.iloc[similar_article_indices]['news_id']

    print('Last liked article: ', last_liked_article_id)
    print('Recommendations:\n', recommended_news_ids)
else:
    print(f'User {sample_user_id} has no click history in train set.')
'''

"if len(user_history_ids) > 0:\n    last_liked_article_id = user_history_ids[-1]\n    last_liked_article_index = news_id_to_index[last_liked_article_id]\n    article_vector = tfidf_data[last_liked_article_index]\n    cosine = cosine_similarity(article_vector, tfidf_data)\n\n    similar_article_scores = list(enumerate(cosine[0]))\n    sorted_scores = sorted(similar_article_scores, key=lambda x: x[1], reverse=True)[1:11]\n\n    similar_article_indices = [i[0] for i in sorted_scores]\n    recommended_news_ids = news.iloc[similar_article_indices]['news_id']\n\n    print('Last liked article: ', last_liked_article_id)\n    print('Recommendations:\n', recommended_news_ids)\nelse:\n    print(f'User {sample_user_id} has no click history in train set.')\n"

In [72]:
tqdm.pandas(desc='Calculating Content Scores')

In [73]:
def calculate_content_score(row):
    history_ids = row['history'].split()
    candidate_id = row['news_id']
    if not history_ids:
        return 0.0
    history_indices = [news_id_to_index.get(nid) for nid in history_ids]
    history_indices = [i for i in history_indices if i is not None]
    if not history_indices:
        return 0.0
    history_vectors = tfidf_data[history_indices]
    user_profile = np.asarray(np.mean(history_vectors, axis=0))
    candidate_index = news_id_to_index.get(candidate_id)
    if candidate_index is None:
        return 0.0
    candidate_vector = tfidf_data[candidate_index]
    score = cosine_similarity(user_profile, candidate_vector)[0][0]
    return score

In [74]:
cb_test_data = test.copy()

In [75]:
cb_test_data['score'] = cb_test_data.progress_apply(calculate_content_score, axis=1)

Calculating Content Scores: 100%|██████████| 1168689/1168689 [05:39<00:00, 3441.94it/s]


In [76]:
cb_metrics = cb_test_data.groupby(['user_id', 'time']).apply(calculate_metrics)

  cb_metrics = cb_test_data.groupby(['user_id', 'time']).apply(calculate_metrics)


In [77]:
cb_metrics.dropna(inplace=True)

In [78]:
cb_metrics_data = pd.DataFrame(cb_metrics.tolist(), columns=['AUC', 'MAP', 'NDCG@5', 'NDCG@10'])

In [79]:
cb_mean_metrics = cb_metrics_data.mean()

In [80]:
cb_mean_metrics

AUC        0.593417
MAP        0.285850
NDCG@5     0.296274
NDCG@10    0.352968
dtype: float64

In [81]:
evaluation_results['Content-Based'] = cb_mean_metrics

In [82]:
evaluation_results

{'Popularity': AUC        0.488288
 MAP        0.182673
 NDCG@5     0.198044
 NDCG@10    0.261494
 dtype: float64,
 'Content-Based': AUC        0.593417
 MAP        0.285850
 NDCG@5     0.296274
 NDCG@10    0.352968
 dtype: float64}

### Word2Vec

In [83]:
def preprocess(text):
    text = text.lower()
    text = re.sub(r'\w\s', '', text)
    return text.split()

In [84]:
tokenized_text = news['full_text'].apply(preprocess)

In [85]:
vector_size = 100

In [86]:
word2vec_model = Word2Vec(
    sentences=tokenized_text,
    vector_size=vector_size,
    window=5,
    min_count=5,
    workers=4
)

In [87]:
def get_article_vector(tokens, model, vector_size):
    word_vectors = [model.wv[word] for word in tokens if word in model.wv]
    if not word_vectors:
        return np.zeros(vector_size)
    return np.mean(word_vectors, axis=0)

In [88]:
article_vectors_w2v = np.array(
    [get_article_vector(doc, word2vec_model, vector_size) for doc in tokenized_text]
)

In [89]:
def calculate_content_score_w2v(row):
    history_ids = row['history'].split()
    candidate_id = row['news_id']
    if not history_ids:
        return 0.0
    history_indices = [news_id_to_index.get(nid) for nid in history_ids]
    history_indices = [i for i in history_indices if i is not None]
    if not history_indices:
        return 0.0
    history_vectors = article_vectors_w2v[history_indices]
    user_profile = np.mean(history_vectors, axis=0)
    candidate_index = news_id_to_index.get(candidate_id)
    if candidate_index is None:
        return 0.0
    candidate_vector = article_vectors_w2v[candidate_index]
    score = cosine_similarity(user_profile.reshape(1, -1), candidate_vector.reshape(1, -1))[0][0]
    return score

In [90]:
cb_w2v_test = test.copy()

In [91]:
cb_w2v_test['score'] = cb_w2v_test.progress_apply(calculate_content_score_w2v, axis=1)

Calculating Content Scores:   0%|          | 0/1168689 [00:00<?, ?it/s]

Calculating Content Scores: 100%|██████████| 1168689/1168689 [02:32<00:00, 7652.16it/s]


In [92]:
cb_w2v_metrics = cb_w2v_test.groupby(['user_id', 'time']).apply(calculate_metrics)

  cb_w2v_metrics = cb_w2v_test.groupby(['user_id', 'time']).apply(calculate_metrics)


In [93]:
cb_w2v_metrics.dropna(inplace=True)

In [94]:
cb_w2v_metrics_data = pd.DataFrame(cb_w2v_metrics.tolist(), columns=['AUC', 'MAP', 'NDCG@5', 'NDCG@10'])

In [95]:
cb_w2v_mean_metrics = cb_w2v_metrics_data.mean()

In [96]:
cb_w2v_mean_metrics

AUC        0.504367
MAP        0.134679
NDCG@5     0.207777
NDCG@10    0.268099
dtype: float64

In [97]:
evaluation_results['Content-Based (W2V)'] = cb_w2v_mean_metrics

In [98]:
evaluation_results

{'Popularity': AUC        0.488288
 MAP        0.182673
 NDCG@5     0.198044
 NDCG@10    0.261494
 dtype: float64,
 'Content-Based': AUC        0.593417
 MAP        0.285850
 NDCG@5     0.296274
 NDCG@10    0.352968
 dtype: float64,
 'Content-Based (W2V)': AUC        0.504367
 MAP        0.134679
 NDCG@5     0.207777
 NDCG@10    0.268099
 dtype: float64}

## COLLABORATIVE FILTERING

### SVD

In [99]:
reader = Reader(rating_scale=(0, 1))

In [100]:
svd_train_data = Dataset.load_from_df(
    train[['user_id', 'news_id', 'clicked']],
    reader
)

In [101]:
trainset = svd_train_data.build_full_trainset()

In [102]:
model_svd = SVD(
    n_factors = 100,
    n_epochs = 20,
    lr_all = 0.005,
    reg_all = 0.02,
    verbose = True
)

In [103]:
model_svd.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x395e5e090>

In [104]:
sample_test_interaction = test.iloc[0]

In [105]:
user_id = sample_test_interaction['user_id']
news_id = sample_test_interaction['news_id']
true_click = sample_test_interaction['clicked']

In [106]:
prediction = model_svd.predict(uid=user_id, iid=news_id)

In [107]:
print(f'User: {user_id} on {news_id}: ')
print(f'Predicted click probability: {prediction}')
print(f'True Click: {true_click}')

User: U47606 on N50107: 
Predicted click probability: user: U47606     item: N50107     r_ui = None   est = 0.41   {'was_impossible': False}
True Click: 0


In [108]:
svd_scores = []
for index, row in tqdm(test.iterrows(), total=test.shape[0]):
    prediction = model_svd.predict(uid=row['user_id'], iid=row['news_id'])
    svd_scores.append(prediction.est)

100%|██████████| 1168689/1168689 [00:17<00:00, 68373.50it/s]


In [109]:
svd_test_data = test.copy()

In [110]:
svd_test_data['score'] = svd_scores

In [111]:
svd_metrics = svd_test_data.groupby(['user_id', 'time']).apply(calculate_metrics)

  svd_metrics = svd_test_data.groupby(['user_id', 'time']).apply(calculate_metrics)


In [112]:
svd_metrics.dropna(inplace=True)

In [113]:
svd_metrics_data = pd.DataFrame(svd_metrics.to_list(), columns=['AUC', 'MAP', 'NDCG@5', 'NDCG@10'])

In [114]:
svd_mean_metrics = svd_metrics_data.mean()

In [115]:
svd_mean_metrics

AUC        0.554899
MAP        0.204846
NDCG@5     0.229274
NDCG@10    0.291265
dtype: float64

In [116]:
evaluation_results['SVD'] = svd_mean_metrics

In [117]:
evaluation_results

{'Popularity': AUC        0.488288
 MAP        0.182673
 NDCG@5     0.198044
 NDCG@10    0.261494
 dtype: float64,
 'Content-Based': AUC        0.593417
 MAP        0.285850
 NDCG@5     0.296274
 NDCG@10    0.352968
 dtype: float64,
 'Content-Based (W2V)': AUC        0.504367
 MAP        0.134679
 NDCG@5     0.207777
 NDCG@10    0.268099
 dtype: float64,
 'SVD': AUC        0.554899
 MAP        0.204846
 NDCG@5     0.229274
 NDCG@10    0.291265
 dtype: float64}

In [118]:
pd.DataFrame(evaluation_results).T

Unnamed: 0,AUC,MAP,NDCG@5,NDCG@10
Popularity,0.488288,0.182673,0.198044,0.261494
Content-Based,0.593417,0.28585,0.296274,0.352968
Content-Based (W2V),0.504367,0.134679,0.207777,0.268099
SVD,0.554899,0.204846,0.229274,0.291265


## HYBRID NEURAL MODEL

In [119]:
vector_size = 100

In [120]:
max_history_length = 20

In [121]:
batch_size = 1024

In [122]:
original_w2v_matrix = article_vectors_w2v

In [123]:
num_articles = original_w2v_matrix.shape[0]

In [124]:
embedding_matrix = np.zeros((num_articles + 1, vector_size))

In [125]:
embedding_matrix[1:] = original_w2v_matrix

In [126]:
original_w2v_matrix.shape

(51282, 100)

In [127]:
embedding_matrix.shape

(51283, 100)

In [128]:
news_id_to_index_padded = {
    news_id: index + 1 for news_id, index in news_id_to_index.items()
}

In [129]:
def create_sequences(df, padded_index, max_len):
    histories = []
    candidates = []
    labels = []
    for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Creating Sequences"):
        history_ids = row['history'].split()
        history_indices = [padded_index.get(nid) for nid in history_ids if padded_index.get(nid) is not None]
        padded_history = pad_sequences([history_indices], maxlen=max_len, padding='pre', truncating='pre')[0]
        candidate_index = padded_index.get(row['news_id'])
        label = row['clicked']
        if candidate_index is not None:
            histories.append(padded_history)
            candidates.append(candidate_index)
            labels.append(label)
    return np.array(histories), np.array(candidates), np.array(labels)

In [130]:
X_train_hist, X_train_cand, y_train = create_sequences(
    train, 
    news_id_to_index_padded, 
    max_history_length
)

Creating Sequences: 100%|██████████| 382542/382542 [00:09<00:00, 40467.19it/s]


In [131]:
X_test_hist, X_test_cand, y_test = create_sequences(
    test, 
    news_id_to_index_padded, 
    max_history_length
)

Creating Sequences: 100%|██████████| 1168689/1168689 [00:25<00:00, 46144.92it/s]


In [132]:
train_dataset = tf.data.Dataset.from_tensor_slices(
    (
        {"History_Input": X_train_hist, "Candidate_Input": X_train_cand},
        y_train
    )
)

2025-11-14 00:04:19.975646: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M3
2025-11-14 00:04:19.975927: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 8.00 GB
2025-11-14 00:04:19.976335: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 2.67 GB
2025-11-14 00:04:19.976387: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2025-11-14 00:04:19.976777: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [133]:
num_train_samples = len(X_train_hist)

In [134]:
num_val_samples = int(num_train_samples * 0.1)

In [135]:
num_train_only_samples = num_train_samples - num_val_samples

In [136]:
train_dataset = train_dataset.shuffle(num_train_samples, reshuffle_each_iteration=False)

In [137]:
train_pipeline = (
    train_dataset.take(num_train_only_samples)
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

In [138]:
val_pipeline = (
    train_dataset.skip(num_train_only_samples)
    .batch(batch_size)
    .prefetch(tf.data.AUTOTUNE)
)

In [139]:
test_pipeline = tf.data.Dataset.from_tensor_slices(
    (
        {"History_Input": X_test_hist, "Candidate_Input": X_test_cand},
        y_test
    )
).batch(batch_size).prefetch(tf.data.AUTOTUNE)

In [140]:
num_train_only_samples

344288

In [141]:
num_val_samples

38254

In [142]:
tf.keras.backend.clear_session()

In [143]:
article_embedding_layer = Embedding(
    input_dim=embedding_matrix.shape[0],
    output_dim=embedding_matrix.shape[1],
    weights=[embedding_matrix],
    trainable=False,
    mask_zero=True
)

In [144]:
history_input = Input(shape=(max_history_length,), name="History_Input")

In [145]:
candidate_input = Input(shape=(1,), name="Candidate_Input")

In [146]:
history_vectors = article_embedding_layer(history_input)

In [147]:
candidate_vector_sequential = article_embedding_layer(candidate_input)

In [148]:
candidate_vector = tf.keras.layers.Flatten()(candidate_vector_sequential)



### Simple RNN

In [149]:
rnn_output = SimpleRNN(vector_size, name="RNN_Encoder")(history_vectors)

In [150]:
dot_product = Dot(axes=1, name="Dot_Product")([rnn_output, candidate_vector])

In [151]:
dot_product = tf.keras.layers.Flatten()(dot_product)

In [152]:
output = Activation('sigmoid', name="Click_Probability")(dot_product)

In [153]:
model = Model(inputs=[history_input, candidate_input], outputs=[output])

In [154]:
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['auc']
)

In [155]:
model.summary()

In [156]:
len(tf.config.list_physical_devices('GPU'))

1

In [157]:
history = model.fit(
    train_pipeline,
    epochs=5,
    validation_data = val_pipeline,
    verbose=1
)

Epoch 1/5


2025-11-14 00:04:21.449157: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 235ms/step - auc: 0.5252 - loss: 0.6909 - val_auc: 0.5264 - val_loss: 0.6903
Epoch 2/5
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 233ms/step - auc: 0.5274 - loss: 0.6904 - val_auc: 0.5286 - val_loss: 0.6903
Epoch 3/5
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 233ms/step - auc: 0.5291 - loss: 0.6904 - val_auc: 0.5284 - val_loss: 0.6903
Epoch 4/5
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 241ms/step - auc: 0.5289 - loss: 0.6904 - val_auc: 0.5285 - val_loss: 0.6903
Epoch 5/5
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m94s[0m 279ms/step - auc: 0.5289 - loss: 0.6903 - val_auc: 0.5286 - val_loss: 0.6903


In [158]:
rnn_scores = model.predict(
    test_pipeline.map(lambda x, y: x),
    verbose=1
)

[1m1142/1142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m53s[0m 47ms/step


In [159]:
test_indices_with_valid_candidates = [
    idx for idx, row in test.iterrows()
    if news_id_to_index_padded.get(row['news_id']) is not None
]

In [160]:
rnn_test_data = test.loc[test_indices_with_valid_candidates].copy()

In [161]:
rnn_test_data['score'] = rnn_scores.flatten()

In [162]:
rnn_metrics = rnn_test_data.groupby(['user_id', 'time']).apply(calculate_metrics)

  rnn_metrics = rnn_test_data.groupby(['user_id', 'time']).apply(calculate_metrics)


In [163]:
rnn_metrics.dropna(inplace=True)

In [164]:
rnn_metrics_data = pd.DataFrame(rnn_metrics.tolist(), columns=['AUC', 'MAP', 'NDCG@5', 'NDCG@10'])

In [165]:
rnn_mean_metrics = rnn_metrics_data.mean()

In [166]:
rnn_mean_metrics

AUC        0.504528
MAP        0.141370
NDCG@5     0.211127
NDCG@10    0.270496
dtype: float64

In [167]:
evaluation_results['Hybrid RNN'] = rnn_mean_metrics

In [168]:
evaluation_results

{'Popularity': AUC        0.488288
 MAP        0.182673
 NDCG@5     0.198044
 NDCG@10    0.261494
 dtype: float64,
 'Content-Based': AUC        0.593417
 MAP        0.285850
 NDCG@5     0.296274
 NDCG@10    0.352968
 dtype: float64,
 'Content-Based (W2V)': AUC        0.504367
 MAP        0.134679
 NDCG@5     0.207777
 NDCG@10    0.268099
 dtype: float64,
 'SVD': AUC        0.554899
 MAP        0.204846
 NDCG@5     0.229274
 NDCG@10    0.291265
 dtype: float64,
 'Hybrid RNN': AUC        0.504528
 MAP        0.141370
 NDCG@5     0.211127
 NDCG@10    0.270496
 dtype: float64}

### LSTM

In [169]:
tf.keras.backend.clear_session()

In [170]:
article_embedding_layer = Embedding(
    input_dim=embedding_matrix.shape[0],
    output_dim=embedding_matrix.shape[1],
    weights=[embedding_matrix],
    trainable=True,
    mask_zero=True
)

In [171]:
history_input = Input(shape=(max_history_length,), name="History_Input")
candidate_input = Input(shape=(1,), name="Candidate_Input")

In [172]:
history_vectors = article_embedding_layer(history_input)
candidate_vector_sequential = article_embedding_layer(candidate_input)
candidate_vector = tf.keras.layers.Flatten()(candidate_vector_sequential)



In [173]:
lstm_output = LSTM(vector_size, name="LSTM_Encoder", recurrent_dropout=0.001)(history_vectors)

In [174]:
dot_product = Dot(axes=1, name="Dot_Product")([lstm_output, candidate_vector])

In [175]:
dot_product = tf.keras.layers.Flatten()(dot_product)

In [176]:
output = Activation('sigmoid', name="Click_Probability")(dot_product)

In [177]:
model_lstm = Model(inputs=[history_input, candidate_input], outputs=[output])

In [178]:
model_lstm.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['auc']
)

In [179]:
model_lstm.summary()

In [180]:
history_lstm = model_lstm.fit(
    train_pipeline,
    epochs=5,
    validation_data=val_pipeline,
    verbose=1
)

Epoch 1/5


[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m255s[0m 749ms/step - auc: 0.6977 - loss: 0.6275 - val_auc: 0.7137 - val_loss: 0.6169
Epoch 2/5
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m247s[0m 731ms/step - auc: 0.7266 - loss: 0.6076 - val_auc: 0.7151 - val_loss: 0.6163
Epoch 3/5
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m246s[0m 728ms/step - auc: 0.7352 - loss: 0.5997 - val_auc: 0.7154 - val_loss: 0.6174
Epoch 4/5
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m248s[0m 734ms/step - auc: 0.7440 - loss: 0.5911 - val_auc: 0.7154 - val_loss: 0.6199
Epoch 5/5
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m256s[0m 759ms/step - auc: 0.7537 - loss: 0.5819 - val_auc: 0.7150 - val_loss: 0.6242


In [181]:
lstm_scores = model_lstm.predict(
    test_pipeline.map(lambda x, y: x),
    verbose=1
)

[1m1142/1142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 65ms/step


In [182]:
lstm_test_data = test.loc[test_indices_with_valid_candidates].copy()

In [183]:
lstm_test_data['score'] = lstm_scores.flatten()

In [184]:
lstm_metrics = lstm_test_data.groupby(['user_id', 'time']).apply(calculate_metrics)

  lstm_metrics = lstm_test_data.groupby(['user_id', 'time']).apply(calculate_metrics)


In [185]:
lstm_metrics.dropna(inplace=True)

In [186]:
lstm_metrics_data = pd.DataFrame(lstm_metrics.tolist(), columns=['AUC', 'MAP', 'NDCG@5', 'NDCG@10'])

In [187]:
lstm_mean_metrics = lstm_metrics_data.mean()

In [188]:
lstm_mean_metrics

AUC        0.565333
MAP        0.223510
NDCG@5     0.241490
NDCG@10    0.303544
dtype: float64

In [189]:
evaluation_results['Hybrid LSTM'] = lstm_mean_metrics

In [190]:
evaluation_results

{'Popularity': AUC        0.488288
 MAP        0.182673
 NDCG@5     0.198044
 NDCG@10    0.261494
 dtype: float64,
 'Content-Based': AUC        0.593417
 MAP        0.285850
 NDCG@5     0.296274
 NDCG@10    0.352968
 dtype: float64,
 'Content-Based (W2V)': AUC        0.504367
 MAP        0.134679
 NDCG@5     0.207777
 NDCG@10    0.268099
 dtype: float64,
 'SVD': AUC        0.554899
 MAP        0.204846
 NDCG@5     0.229274
 NDCG@10    0.291265
 dtype: float64,
 'Hybrid RNN': AUC        0.504528
 MAP        0.141370
 NDCG@5     0.211127
 NDCG@10    0.270496
 dtype: float64,
 'Hybrid LSTM': AUC        0.565333
 MAP        0.223510
 NDCG@5     0.241490
 NDCG@10    0.303544
 dtype: float64}

### BI-LSTM

In [215]:
tf.keras.backend.clear_session()

In [216]:
early_stopping = EarlyStopping(
    monitor='val_auc', 
    mode='max',
    patience=1, 
    restore_best_weights=True,
    verbose=1
)

In [217]:
article_embedding_layer = Embedding(
    input_dim=embedding_matrix.shape[0],
    output_dim=embedding_matrix.shape[1],
    weights=[embedding_matrix],
    trainable=True,
    mask_zero=True
)

In [218]:
history_input = Input(shape=(max_history_length,), name="History_Input")
candidate_input = Input(shape=(1,), name="Candidate_Input")

In [219]:
history_vectors = article_embedding_layer(history_input)
candidate_vector_sequential = article_embedding_layer(candidate_input)
candidate_vector = tf.keras.layers.Flatten()(candidate_vector_sequential)



In [220]:
bilstm_output = tf.keras.layers.Bidirectional(
    LSTM(vector_size, name="LSTM_Encoder", recurrent_dropout=0.001),
    merge_mode='sum',
    name="BiLSTM_Encoder"
)(history_vectors)

In [221]:
dot_product = Dot(axes=1, name="Dot_Product")([bilstm_output, candidate_vector])
dot_product = tf.keras.layers.Flatten()(dot_product)

In [222]:
output = Activation('sigmoid', name="Click_Probability")(dot_product)

In [223]:
model_bilstm = Model(inputs=[history_input, candidate_input], outputs=[output])

In [224]:
model_bilstm.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['auc']
)

In [225]:
model_bilstm.summary()

In [226]:
history_bilstm = model_bilstm.fit(
    train_pipeline,
    epochs=5,
    validation_data=val_pipeline,
    verbose=1,
    callbacks=[early_stopping]
)

Epoch 1/5




[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m829s[0m 2s/step - auc: 0.6992 - loss: 0.6264 - val_auc: 0.7134 - val_loss: 0.6167
Epoch 2/5
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m835s[0m 2s/step - auc: 0.7313 - loss: 0.6033 - val_auc: 0.7171 - val_loss: 0.6152
Epoch 3/5
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m881s[0m 3s/step - auc: 0.7500 - loss: 0.5872 - val_auc: 0.7176 - val_loss: 0.6186
Epoch 4/5
[1m337/337[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m896s[0m 3s/step - auc: 0.7678 - loss: 0.5706 - val_auc: 0.7148 - val_loss: 0.6238
Epoch 4: early stopping
Restoring model weights from the end of the best epoch: 3.


In [202]:
bilstm_scores = model_bilstm.predict(
    test_pipeline.map(lambda x, y: x),
    verbose=1
)

[1m1142/1142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m114s[0m 100ms/step


In [203]:
bilstm_test_data = test.loc[test_indices_with_valid_candidates].copy()

In [204]:
bilstm_test_data['score'] = bilstm_scores.flatten()

In [205]:
bilstm_metrics = bilstm_test_data.groupby(['user_id', 'time']).apply(calculate_metrics)

  bilstm_metrics = bilstm_test_data.groupby(['user_id', 'time']).apply(calculate_metrics)


In [206]:
bilstm_metrics.dropna(inplace=True)

In [207]:
bilstm_metrics_data = pd.DataFrame(bilstm_metrics.tolist(), columns=['AUC', 'MAP', 'NDCG@5', 'NDCG@10'])

In [208]:
bilstm_mean_metrics = bilstm_metrics_data.mean()

In [209]:
bilstm_mean_metrics

AUC        0.560424
MAP        0.222078
NDCG@5     0.239530
NDCG@10    0.301003
dtype: float64

In [210]:
evaluation_results['Hybrid Bi-LSTM'] = bilstm_mean_metrics

In [211]:
evaluation_results

{'Popularity': AUC        0.488288
 MAP        0.182673
 NDCG@5     0.198044
 NDCG@10    0.261494
 dtype: float64,
 'Content-Based': AUC        0.593417
 MAP        0.285850
 NDCG@5     0.296274
 NDCG@10    0.352968
 dtype: float64,
 'Content-Based (W2V)': AUC        0.504367
 MAP        0.134679
 NDCG@5     0.207777
 NDCG@10    0.268099
 dtype: float64,
 'SVD': AUC        0.554899
 MAP        0.204846
 NDCG@5     0.229274
 NDCG@10    0.291265
 dtype: float64,
 'Hybrid RNN': AUC        0.504528
 MAP        0.141370
 NDCG@5     0.211127
 NDCG@10    0.270496
 dtype: float64,
 'Hybrid LSTM': AUC        0.565333
 MAP        0.223510
 NDCG@5     0.241490
 NDCG@10    0.303544
 dtype: float64,
 'Hybrid Bi-LSTM': AUC        0.560424
 MAP        0.222078
 NDCG@5     0.239530
 NDCG@10    0.301003
 dtype: float64}

In [213]:
pd.DataFrame(evaluation_results).T

Unnamed: 0,AUC,MAP,NDCG@5,NDCG@10
Popularity,0.488288,0.182673,0.198044,0.261494
Content-Based,0.593417,0.28585,0.296274,0.352968
Content-Based (W2V),0.504367,0.134679,0.207777,0.268099
SVD,0.554899,0.204846,0.229274,0.291265
Hybrid RNN,0.504528,0.14137,0.211127,0.270496
Hybrid LSTM,0.565333,0.22351,0.24149,0.303544
Hybrid Bi-LSTM,0.560424,0.222078,0.23953,0.301003
