<h1>NEWS RECOMMENDATION SYSTEM</h1>

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from surprise import Dataset, Reader
from surprise import SVD
from surprise.model_selection import train_test_split as surprise_split
from surprise import accuracy
from sklearn.metrics import roc_auc_score, average_precision_score, ndcg_score

In [2]:
news_columns = [
    'news_id', 'category', 'subcategory', 'title', 'abstract', 'url', 'title_entities', 'abstract_entities'
]

In [3]:
news = pd.read_csv('news.tsv', sep='\t', names=news_columns)

In [4]:
news.head()

Unnamed: 0,news_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


In [5]:
news.tail()

Unnamed: 0,news_id,category,subcategory,title,abstract,url,title_entities,abstract_entities
51277,N16909,weather,weathertopstories,"Adapting, Learning And Soul Searching: Reflect...",Woolsey Fire Anniversary: A community is forev...,https://assets.msn.com/labs/mind/BBWzQJK.html,"[{""Label"": ""Woolsey Fire"", ""Type"": ""N"", ""Wikid...","[{""Label"": ""Woolsey Fire"", ""Type"": ""N"", ""Wikid..."
51278,N47585,lifestyle,lifestylefamily,Family says 13-year-old Broadway star died fro...,,https://assets.msn.com/labs/mind/BBWzQYV.html,"[{""Label"": ""Broadway theatre"", ""Type"": ""F"", ""W...",[]
51279,N7482,sports,more_sports,St. Dominic soccer player tries to kick cancer...,"Sometimes, what happens on the sidelines can b...",https://assets.msn.com/labs/mind/BBWzQnK.html,[],[]
51280,N34418,sports,soccer_epl,How the Sounders won MLS Cup,"Mark, Jeremiah and Casey were so excited they ...",https://assets.msn.com/labs/mind/BBWzQuK.html,"[{""Label"": ""MLS Cup"", ""Type"": ""U"", ""WikidataId...",[]
51281,N44276,autos,autossports,Best Sports Car Deals for October,,https://assets.msn.com/labs/mind/BBy5rVe.html,"[{""Label"": ""Peugeot RCZ"", ""Type"": ""V"", ""Wikida...",[]


In [6]:
news.shape

(51282, 8)

In [7]:
news.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 51282 entries, 0 to 51281
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   news_id            51282 non-null  object
 1   category           51282 non-null  object
 2   subcategory        51282 non-null  object
 3   title              51282 non-null  object
 4   abstract           48616 non-null  object
 5   url                51282 non-null  object
 6   title_entities     51279 non-null  object
 7   abstract_entities  51278 non-null  object
dtypes: object(8)
memory usage: 3.1+ MB


In [8]:
behaviors_columns = [
    'impression_id', 'user_id', 'time', 'history', 'impressions'
]

In [9]:
behaviors = pd.read_csv('behaviors.tsv', sep='\t', names=behaviors_columns)

In [10]:
behaviors.head()

Unnamed: 0,impression_id,user_id,time,history,impressions
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...


In [11]:
behaviors.tail()

Unnamed: 0,impression_id,user_id,time,history,impressions
156960,156961,U21593,11/14/2019 10:24:05 PM,N7432 N58559 N1954 N43353 N14343 N13008 N28833...,N2235-0 N22975-0 N64037-0 N47652-0 N11378-0 N4...
156961,156962,U10123,11/13/2019 6:57:04 AM,N9803 N104 N24462 N57318 N55743 N40526 N31726 ...,N3841-0 N61571-0 N58813-0 N28213-0 N4428-0 N25...
156962,156963,U75630,11/14/2019 10:58:13 AM,N29898 N59704 N4408 N9803 N53644 N26103 N812 N...,N55913-0 N62318-0 N53515-0 N10960-0 N9135-0 N5...
156963,156964,U44625,11/13/2019 2:57:02 PM,N4118 N47297 N3164 N43295 N6056 N38747 N42973 ...,N6219-0 N3663-0 N31147-0 N58363-0 N4107-0 N457...
156964,156965,U64800,11/14/2019 3:25:49 PM,N22997 N48742,N61233-0 N33828-1 N19661-0 N41934-0


In [12]:
behaviors.shape

(156965, 5)

In [13]:
behaviors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156965 entries, 0 to 156964
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   impression_id  156965 non-null  int64 
 1   user_id        156965 non-null  object
 2   time           156965 non-null  object
 3   history        153727 non-null  object
 4   impressions    156965 non-null  object
dtypes: int64(1), object(4)
memory usage: 6.0+ MB


In [14]:
news.isnull().sum()

news_id                 0
category                0
subcategory             0
title                   0
abstract             2666
url                     0
title_entities          3
abstract_entities       4
dtype: int64

In [15]:
behaviors.isnull().sum()

impression_id       0
user_id             0
time                0
history          3238
impressions         0
dtype: int64

In [16]:
news['abstract'] = news['abstract'].fillna('')

In [17]:
behaviors['history'] = behaviors['history'].fillna('')

In [18]:
news['full_text'] = news['title'] + ' ' + news['abstract']

In [19]:
news['category'].value_counts()

category
news             15774
sports           14510
finance           3107
foodanddrink      2551
lifestyle         2479
travel            2350
video             2068
weather           2048
health            1885
autos             1639
tv                 889
music              769
movies             606
entertainment      587
kids                17
middleeast           2
northamerica         1
Name: count, dtype: int64

In [20]:
news['category'].nunique()

17

In [21]:
news['subcategory'].value_counts()

subcategory
newsus                    6564
football_nfl              5420
newspolitics              2826
newscrime                 2254
weathertopstories         2047
                          ... 
newsvideo                    1
travel                       1
lifestylehoroscopefish       1
baseball                     1
celebhub                     1
Name: count, Length: 264, dtype: int64

In [22]:
news['subcategory'].nunique()

264

In [23]:
behaviors['user_id'].value_counts()

user_id
U32146    62
U15740    44
U20833    41
U51286    40
U44201    40
          ..
U60416     1
U20588     1
U84385     1
U89164     1
U72015     1
Name: count, Length: 50000, dtype: int64

In [24]:
behaviors['user_id'].nunique()

50000

In [25]:
parsed_interactions = []
for index, row in tqdm(behaviors.iterrows(), total=behaviors.shape[0]):
    user_id = row['user_id']
    time = row['time']
    history = row['history']
    impression_list = row['impressions'].split()
    for impression in impression_list:
        parts = impression.split('-')
        news_id = parts[0]
        clicked = int(parts[1])
        parsed_interactions.append(
            {
                'user_id': user_id,
                'news_id': news_id,
                'clicked': clicked,
                'time': time,
                'history': history
            }
        )

100%|██████████| 156965/156965 [00:03<00:00, 40155.65it/s]


In [26]:
interacions = pd.DataFrame(parsed_interactions)

In [27]:
interacions.head()

Unnamed: 0,user_id,news_id,clicked,time,history
0,U13740,N55689,1,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...
1,U13740,N35729,0,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...
2,U91836,N20678,0,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...
3,U91836,N39317,0,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...
4,U91836,N58114,0,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...


In [28]:
interacions.shape

(5843444, 5)

In [29]:
interacions['time'] = pd.to_datetime(interacions['time'], format='%m/%d/%Y %I:%M:%S %p')

In [30]:
interacions.sort_values(by='time', inplace=True)

In [31]:
interacions.reset_index(drop=True, inplace=True)

In [32]:
interacions.head()

Unnamed: 0,user_id,news_id,clicked,time,history
0,U65916,N57099,0,2019-11-09 00:00:19,N51706 N40767 N12096 N9798 N38802 N54827 N5780...
1,U65916,N50329,0,2019-11-09 00:00:19,N51706 N40767 N12096 N9798 N38802 N54827 N5780...
2,U65916,N20602,0,2019-11-09 00:00:19,N51706 N40767 N12096 N9798 N38802 N54827 N5780...
3,U65916,N18546,0,2019-11-09 00:00:19,N51706 N40767 N12096 N9798 N38802 N54827 N5780...
4,U65916,N6868,0,2019-11-09 00:00:19,N51706 N40767 N12096 N9798 N38802 N54827 N5780...


In [33]:
interacions.tail()

Unnamed: 0,user_id,news_id,clicked,time,history
5843439,U82996,N46917,0,2019-11-14 23:59:13,N39556 N22279 N56461 N33393 N6233 N33617 N4943...
5843440,U82996,N27737,0,2019-11-14 23:59:13,N39556 N22279 N56461 N33393 N6233 N33617 N4943...
5843441,U82996,N6837,0,2019-11-14 23:59:13,N39556 N22279 N56461 N33393 N6233 N33617 N4943...
5843442,U82996,N61233,0,2019-11-14 23:59:13,N39556 N22279 N56461 N33393 N6233 N33617 N4943...
5843443,U82996,N14478,0,2019-11-14 23:59:13,N39556 N22279 N56461 N33393 N6233 N33617 N4943...


In [34]:
split = int(len(interacions)*0.8)

In [35]:
train = interacions.iloc[:split]
test = interacions.iloc[split:]

In [36]:
len(interacions)

5843444

In [37]:
len(train)

4674755

In [38]:
len(test)

1168689

In [39]:
popular_articles = train[train['clicked'] == 1]['news_id'].value_counts()

In [40]:
popular_articles_data = popular_articles.reset_index()

In [41]:
popular_articles_data.columns = ['news_id', 'click_count']

In [42]:
popular_articles_data.head()

Unnamed: 0,news_id,click_count
0,N55689,4316
1,N35729,3346
2,N33619,3246
3,N53585,2835
4,N63970,2578


In [43]:
test_with_popularity = test.merge(popular_articles_data, on='news_id', how='left')

In [44]:
test_with_popularity['click_count'] = test_with_popularity['click_count'].fillna(0)

In [45]:
test_with_popularity[['user_id', 'news_id', 'clicked', 'click_count']].head()

Unnamed: 0,user_id,news_id,clicked,click_count
0,U47606,N50107,0,31.0
1,U47606,N60272,0,313.0
2,U47606,N58086,0,6.0
3,U47606,N38215,0,315.0
4,U47606,N61787,1,46.0


In [46]:
tfidf = TfidfVectorizer(
    stop_words='english',
    max_features=20000,
    min_df=5
)

In [47]:
tfidf_data = tfidf.fit_transform(news['full_text'])

In [48]:
tfidf_data.shape

(51282, 18646)

In [49]:
news_id_to_index = pd.Series(news.index, index=news['news_id'])

In [50]:
sample_user_id = train['user_id'].iloc[0]

In [51]:
user_history_ids = train[
    (train['user_id'] == sample_user_id) & (train['clicked'] == 1)
]['news_id'].unique()

In [52]:
sample_user_id

'U65916'

In [53]:
if len(user_history_ids) > 0:
    last_liked_article_id = user_history_ids[-1]
    last_liked_article_index = news_id_to_index[last_liked_article_id]
    article_vector = tfidf_data[last_liked_article_index]
    cosine = cosine_similarity(article_vector, tfidf_data)

    similar_article_scores = list(enumerate(cosine[0]))
    sorted_scores = sorted(similar_article_scores, key=lambda x: x[1], reverse=True)[1:11]

    similar_article_indices = [i[0] for i in sorted_scores]
    recommended_news_ids = news.iloc[similar_article_indices]['news_id']

    print('Last liked article: ', last_liked_article_id)
    print('Recommendations:\n', recommended_news_ids)
else:
    print(f'User {sample_user_id} has no click history in train set.')


Last liked article:  N41612
Recommendations:
 20082    N13136
17516    N55672
50928    N64530
35014    N33576
1000     N44796
19477     N4950
46491    N60364
976      N24064
40066    N36545
13032    N28365
Name: news_id, dtype: object


In [54]:
reader = Reader(rating_scale=(0, 1))

In [55]:
svd_train_data = Dataset.load_from_df(
    train[['user_id', 'news_id', 'clicked']],
    reader
)

In [56]:
trainset = svd_train_data.build_full_trainset()

In [57]:
model_svd = SVD(
    n_factors = 100,
    n_epochs = 20,
    lr_all = 0.005,
    reg_all = 0.02,
    verbose = True
)

In [58]:
model_svd.fit(trainset)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x30a42e510>

In [59]:
sample_test_interaction = test.iloc[0]

In [60]:
user_id = sample_test_interaction['user_id']
news_id = sample_test_interaction['news_id']
true_click = sample_test_interaction['clicked']

In [61]:
prediction = model_svd.predict(uid=user_id, iid=news_id)

In [62]:
print(f'User: {user_id} on {news_id}: ')
print(f'Predicted click probability: {prediction}')
print(f'True Click: {true_click}')

User: U47606 on N50107: 
Predicted click probability: user: U47606     item: N50107     r_ui = None   est = 0.01   {'was_impossible': False}
True Click: 0


In [63]:
evaluation_results = {}

In [64]:
def calculate_metrics(group):
    if len(group['clicked'].unique()) < 2:
        return np.nan, np.nan, np.nan
    
    y_true = group['clicked'].values
    y_score = group['score'].values

    k_5 = min(5, len(y_true))
    k_10 = min(10, len(y_true))

    y_true_2d = [y_true]
    y_score_2d = [y_score]

    auc = roc_auc_score(y_true, y_score)
    map_score = average_precision_score(y_true, y_score)

    ndcg_at_5 = ndcg_score(y_true_2d, y_score_2d, k=k_5)
    ndcg_at_10 = ndcg_score(y_true_2d, y_score_2d, k=k_10)

    return auc, map_score, ndcg_at_5, ndcg_at_10

In [65]:
pop_test_df = test_with_popularity.rename(columns={'click_count': 'score'})

In [66]:
pop_metrics = pop_test_df.groupby(['user_id', 'time']).apply(calculate_metrics)

  pop_metrics = pop_test_df.groupby(['user_id', 'time']).apply(calculate_metrics)


In [67]:
pop_metrics.dropna(inplace=True)

In [68]:
pop_metrics_data = pd.DataFrame(pop_metrics.tolist(), columns=['AUC', 'MAP', 'NDCG@5', 'NDCG@10'])

In [70]:
evaluation_results['Popularity'] = pop_metrics_data.mean()

In [72]:
evaluation_results

{'Popularity': AUC        0.488288
 MAP        0.182673
 NDCG@5     0.198044
 NDCG@10    0.261494
 dtype: float64}

In [73]:
svd_scores = []
for index, row in tqdm(test.iterrows(), total=test.shape[0]):
    prediction = model_svd.predict(uid=row['user_id'], iid=row['news_id'])
    svd_scores.append(prediction.est)

100%|██████████| 1168689/1168689 [00:17<00:00, 65128.97it/s]


In [74]:
svd_test_data = test.copy()

In [75]:
svd_test_data['score'] = svd_scores

In [76]:
svd_metrics = svd_test_data.groupby(['user_id', 'time']).apply(calculate_metrics)

  svd_metrics = svd_test_data.groupby(['user_id', 'time']).apply(calculate_metrics)


In [77]:
svd_metrics.dropna(inplace=True)

In [78]:
svd_metrics_data = pd.DataFrame(svd_metrics.to_list(), columns=['AUC', 'MAP', 'NDCG@5', 'NDCG@10'])

In [79]:
svd_mean_metrics = svd_metrics_data.mean()

In [80]:
svd_mean_metrics

AUC        0.545980
MAP        0.233836
NDCG@5     0.236017
NDCG@10    0.297342
dtype: float64

In [81]:
evaluation_results['SVD'] = svd_mean_metrics

In [82]:
results_summary = pd.DataFrame(evaluation_results).T

In [83]:
results_summary

Unnamed: 0,AUC,MAP,NDCG@5,NDCG@10
Popularity,0.488288,0.182673,0.198044,0.261494
SVD,0.54598,0.233836,0.236017,0.297342
