In [2]:
import numpy as np
import pandas as pd

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### Behavior Data

In [4]:
raw_behaviors = pd.read_csv('/content/drive/My Drive/UCB_MIDS_2023/w207/MINDsmall_train/behaviors.tsv', sep='\t', header=None, names=['impression_id', 'user_id', 'timestamp', 'click_history', 'impressions'])

print(f'The dataset consists of {len(raw_behaviors)} rows.')
raw_behaviors.head()

The dataset consists of 156965 rows.


Unnamed: 0,impression_id,user_id,timestamp,click_history,impressions
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...


In [5]:
raw_behaviors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 156965 entries, 0 to 156964
Data columns (total 5 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   impression_id  156965 non-null  int64 
 1   user_id        156965 non-null  object
 2   timestamp      156965 non-null  object
 3   click_history  153727 non-null  object
 4   impressions    156965 non-null  object
dtypes: int64(1), object(4)
memory usage: 6.0+ MB


### Data Preprocessing

In [6]:
unique_users = raw_behaviors['user_id'].unique()
print(f'There are {len(unique_users)} unique users.')

There are 50000 unique users.


In [7]:
# Allocate an index for each unique user, assign zeroth index as a UNK index
index_to_user = {idx+1: itemid for idx, itemid in enumerate(unique_users)}
user_to_index = {itemid : idx for idx, itemid in index_to_user.items()}

In [8]:
# Create new column for userID
raw_behaviors['user_index'] = raw_behaviors['user_id'].map(lambda x: user_to_index.get(x, 0))

In [9]:
raw_behaviors.head()

Unnamed: 0,impression_id,user_id,timestamp,click_history,impressions,user_index
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0,1
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...,2
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,3
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0,4
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...,5


### Article Data

In [10]:
news = pd.read_csv(
    "/content/drive/My Drive/UCB_MIDS_2023/w207/MINDsmall_train/news.tsv",
    sep="\t",
    names=["itemId","category","subcategory","title","abstract","url","title_entities","abstract_entities"])
news.head(2)

# Build index of items
ind_item = {idx +1: itemid for idx, itemid in enumerate(news['itemId'].values)}
item_ind = {itemid : idx for idx, itemid in ind_item.items()}

news.head()

Unnamed: 0,itemId,category,subcategory,title,abstract,url,title_entities,abstract_entities
0,N55528,lifestyle,lifestyleroyals,"The Brands Queen Elizabeth, Prince Charles, an...","Shop the notebooks, jackets, and more that the...",https://assets.msn.com/labs/mind/AAGH0ET.html,"[{""Label"": ""Prince Philip, Duke of Edinburgh"",...",[]
1,N19639,health,weightloss,50 Worst Habits For Belly Fat,These seemingly harmless habits are holding yo...,https://assets.msn.com/labs/mind/AAB19MK.html,"[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik...","[{""Label"": ""Adipose tissue"", ""Type"": ""C"", ""Wik..."
2,N61837,news,newsworld,The Cost of Trump's Aid Freeze in the Trenches...,Lt. Ivan Molchanets peeked over a parapet of s...,https://assets.msn.com/labs/mind/AAJgNsz.html,[],"[{""Label"": ""Ukraine"", ""Type"": ""G"", ""WikidataId..."
3,N53526,health,voices,I Was An NBA Wife. Here's How It Affected My M...,"I felt like I was a fraud, and being an NBA wi...",https://assets.msn.com/labs/mind/AACk2N6.html,[],"[{""Label"": ""National Basketball Association"", ..."
4,N38324,health,medical,"How to Get Rid of Skin Tags, According to a De...","They seem harmless, but there's a very good re...",https://assets.msn.com/labs/mind/AAAKEkt.html,"[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI...","[{""Label"": ""Skin tag"", ""Type"": ""C"", ""WikidataI..."


### Process User Interaction Data

In [13]:
# Index click history
def process_clicks(s):
  list_of_strings = str(s).split(' ')
  return [item_ind.get(l, 0) for l in list_of_strings]

raw_behaviors['click_history_idx'] = raw_behaviors.click_history.map(lambda s: process_clicks(s))
raw_behaviors.head()

Unnamed: 0,impression_id,user_id,timestamp,click_history,impressions,user_index,click_history_idx
0,1,U13740,11/11/2019 9:05:58 AM,N55189 N42782 N34694 N45794 N18445 N63302 N104...,N55689-1 N35729-0,1,"[6893, 10050, 15556, 21467, 26358, 4946, 14071..."
1,2,U91836,11/12/2019 6:11:30 PM,N31739 N6072 N63045 N23979 N35656 N43353 N8129...,N20678-0 N39317-0 N58114-0 N20495-0 N42977-0 N...,2,"[25816, 2334, 8524, 12087, 13463, 14202, 12733..."
2,3,U73700,11/14/2019 7:01:48 AM,N10732 N25792 N7563 N21087 N41087 N5445 N60384...,N50014-0 N23877-0 N35389-0 N49712-0 N16844-0 N...,3,"[5477, 4207, 11684, 7704, 8124, 23394, 22970, ..."
3,4,U34670,11/11/2019 5:28:05 AM,N45729 N2203 N871 N53880 N41375 N43142 N33013 ...,N35729-0 N33632-0 N49685-1 N27581-0,4,"[13827, 19085, 28506, 7024, 22910, 16667, 1559..."
4,5,U8125,11/12/2019 4:11:21 PM,N10078 N56514 N14904 N33740,N39985-0 N36050-0 N16096-0 N8400-1 N22407-0 N6...,5,"[23643, 4853, 27686, 31189]"


In [14]:
# collect one click and one no-click from impressions:
def process_impression(s):
    list_of_strings = s.split(" ")
    itemid_rel_tuple = [l.split("-") for l in list_of_strings]
    noclicks = []
    for entry in itemid_rel_tuple:
        if entry[1] =='0':
            noclicks.append(entry[0])
        if entry[1] =='1':
            click = entry[0]
    return noclicks, click

raw_behaviors['noclicks'], raw_behaviors['click'] = zip(*raw_behaviors['impressions'].map(process_impression))

# We can then indexize these two new columns:
raw_behaviors['noclicks'] = raw_behaviors['noclicks'].map(lambda list_of_strings: [item_ind.get(l, 0) for l in list_of_strings])
raw_behaviors['click'] = raw_behaviors['click'].map(lambda x: item_ind.get(x,0))

In [15]:
# convert timestamp value to hours since epoch
raw_behaviors['epochhrs'] = pd.to_datetime(raw_behaviors['timestamp']).values.astype(np.int64)/(1e6)/1000/3600
raw_behaviors['epochhrs'] = raw_behaviors['epochhrs'].round()

In [17]:
# Select the columns that we now want to use for further analysis
behavior = raw_behaviors[['epochhrs','user_id','click_history_idx','noclicks','click']]
behavior.head()

Unnamed: 0,epochhrs,user_id,click_history_idx,noclicks,click
0,437073.0,U13740,"[6893, 10050, 15556, 21467, 26358, 4946, 14071...",[50689],33900
1,437106.0,U91836,"[25816, 2334, 8524, 12087, 13463, 14202, 12733...","[37405, 41306, 34907, 35307, 44370, 37210, 439...",32187
2,437143.0,U73700,"[5477, 4207, 11684, 7704, 8124, 23394, 22970, ...","[39528, 33356, 38720, 43459, 794, 38061, 39830...",5767
3,437069.0,U34670,"[13827, 19085, 28506, 7024, 22910, 16667, 1559...","[50689, 50106, 50022]",50715
4,437104.0,U8125,"[23643, 4853, 27686, 31189]","[2006, 33272, 39220, 37210, 45683, 50113, 3663...",31475


### Outputs

- The behavior dataframe contains all relevant user interaction data.
- The ind_item dictionary maps the item indices given in behavior to the real item ID.
- The ind_user dictionary maps the user indices in behavior to the real user ID

### Modeling

In [21]:
news.isna().sum()

Unnamed: 0,0
itemId,0
category,0
subcategory,0
title,0
abstract,2666
url,0
title_entities,3
abstract_entities,4


In [22]:
# Replace NaN values with an empty string in the title and abstract columns
news['title'] = news['title'].fillna('')
news['abstract'] = news['abstract'].fillna('')

# Now you can apply the TF-IDF vectorizer
tfidf = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf.fit_transform(news['title'] + " " + news['abstract'])

In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
import tensorflow as tf
from tensorflow.keras.layers import Input, Embedding, Dot, Dense, Flatten
from tensorflow.keras.models import Model

# Content-based filtering with TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
tfidf_matrix = tfidf.fit_transform(news['title'] + " " + news['abstract'])

# Compute similarities between articles
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Collaborative Filtering with Matrix Factorization (SVD)
user_item_matrix = ...  # Create user-item interaction matrix
u, s, vt = svds(user_item_matrix, k=50)
user_factors = u @ np.diag(s)
item_factors = vt.T

# Neural Collaborative Filtering
user_input = Input(shape=(1,))
item_input = Input(shape=(1,))
user_embedding = Embedding(input_dim=len(user_to_index)+1, output_dim=50)(user_input)
item_embedding = Embedding(input_dim=len(item_ind)+1, output_dim=50)(item_input)
dot_product = Dot(axes=2)([user_embedding, item_embedding])
dot_product = Flatten()(dot_product)
output = Dense(1, activation='sigmoid')(dot_product)
model = Model([user_input, item_input], output)
model.compile(optimizer='adam', loss='binary_crossentropy')

# Train the model
model.fit([user_ids, item_ids], clicks, epochs=5, batch_size=256)

# Hybrid Approach (Weighted Average)
collab_predictions = user_factors @ item_factors.T  # Collaborative filtering scores
content_predictions = cosine_sim @ user_profile  # Content-based filtering scores

# Combine the predictions
final_scores = 0.5 * collab_predictions + 0.5 * content_predictions

# Rank and recommend items with the highest scores
top_recommendations = np.argsort(-final_scores)[:10]

