# Imports

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize
from datetime import datetime
import math
import warnings
warnings.filterwarnings('ignore')

# Hyperparameters

In [2]:
# Weights of the linear ensemble computed in recommend_for_user:
#   score = ALPHA*content + BETA*cf + GAMMA*interest + DELTA*recency + EPS*popularity
# The five weights below sum to 1.0.
ALPHA = 0.5   # weight of the content (TF-IDF/SVD cosine) similarity
BETA  = 0.3   # weight of the collaborative-filtering score
GAMMA = 0.1   # weight of the interest/tag overlap
DELTA = 0.05  # weight of the recency boost
EPS   = 0.05  # weight of the popularity score
N_COMPONENTS_SVD = 50  # upper bound on the SVD dimensionality of the post text embeddings
TOP_K = 3              # number of posts recommended per user

# Loading CSVs

In [3]:
# Read the three raw tables from the local `datasets` folder.
users = pd.read_csv('datasets/Users.csv')
posts = pd.read_csv('datasets/Posts.csv')
eng = pd.read_csv('datasets/Engagements.csv')

# Sanity check: report (rows, columns) of each table.
for label, frame in (("users", users), ("posts", posts), ("engagements", eng)):
    print(label, frame.shape)

users (50, 5)
posts (100, 4)
engagements (1000, 3)


# Building textual field for posts

In [4]:
def combine_post_text(row, text_columns=('title', 'body', 'tags')):
    """Join the available text fields of one post into a single string.

    Parameters
    ----------
    row : pandas.Series or dict
        One post record. Only the fields named in `text_columns` are read.
    text_columns : iterable of str, optional
        Field names to concatenate, in order. Fields that are absent from
        `row` or hold a missing value (None/NaN) are skipped.

    Returns
    -------
    str
        The present fields converted to `str` and joined by single spaces;
        an empty string when none of the fields is available.
    """
    # `row.get` returns None for an absent field, so the row itself tells us
    # what is available -- no lookup of a module-level DataFrame is needed.
    parts = []
    for column in text_columns:
        value = row.get(column)
        if pd.notna(value):
            parts.append(str(value))
    return ' '.join(parts)

# Row-wise apply: every post gets one combined string stored in a new `text` column.
posts['text'] = posts.apply(combine_post_text, axis=1)

# TF-IDF for posts

In [5]:
# Fit the TF-IDF vocabulary on the combined post text (missing text -> empty string).
tfidf = TfidfVectorizer(stop_words='english', max_features=10000)
post_tfidf = tfidf.fit_transform(posts['text'].fillna(''))

# Project to a dense low-rank space; the rank is capped one below the vocabulary size.
n_text_features = post_tfidf.shape[1]
svd = TruncatedSVD(n_components=min(N_COMPONENTS_SVD, n_text_features - 1), random_state=42)

# Row-normalised (L2, the `normalize` default) post embeddings.
post_emb = normalize(svd.fit_transform(post_tfidf))

# Building user interest vectors from profile interests (TF-IDF transform)

In [6]:
# The profile interests are stored in `top_3_interests`; `interests` is accepted
# as an alternative column name. Checking only for `interests` (while reading
# `top_3_interests`) left user_emb at None and zeroed the content-similarity term.
interest_col = next(
    (col for col in ('top_3_interests', 'interests') if col in users.columns),
    None,
)
if interest_col is not None:
    user_interest_text = users[interest_col].fillna('').astype(str)
    # Reuse the vectorizer and SVD fitted on posts so users and posts share one space.
    user_tfidf = tfidf.transform(user_interest_text)
    # Rows of user_emb follow the row order of `users`.
    user_emb = normalize(svd.transform(user_tfidf))
else:
    # No interest column: downstream scoring treats None as "no content signal"
    # and uses zeros for the content-similarity component.
    user_emb = None

# Collaborative matrix
Converting engagement types to weights

In [7]:
# Weights for categorical engagement events. They are applied only to rows whose
# `engagement` value is an event name rather than a number.
event_weight = {'view': 1, 'click': 1, 'like': 3, 'comment': 5, 'share': 4}
numeric_engagement = pd.to_numeric(eng['engagement'], errors='coerce')
mapped_engagement = eng['engagement'].astype(str).str.strip().str.lower().map(event_weight)
# Numeric values are kept as they are; event names are mapped through
# event_weight; anything else (missing/unknown) contributes zero weight.
eng['weight'] = numeric_engagement.fillna(mapped_engagement).fillna(0.0).astype(float)

# Building user-item matrix
user_ids = eng['user_id'].unique()
post_ids = posts['post_id'].unique()

# Mapping ids to row/column indices
user2idx = {u: i for i, u in enumerate(user_ids)}
post2idx = {p: i for i, p in enumerate(post_ids)}

n_users = len(user2idx)
n_posts = len(post2idx)

# Building dense matrix (if large, switch to sparse)
ui = np.zeros((n_users, n_posts), dtype=float)
user_pos = eng['user_id'].map(user2idx)
post_pos = eng['post_id'].map(post2idx)
# Engagements that reference a post missing from `posts` are skipped.
known = user_pos.notna() & post_pos.notna()
# np.add.at accumulates repeated (user, post) pairs, unlike `ui[rows, cols] += w`.
np.add.at(
    ui,
    (user_pos[known].astype(int).to_numpy(), post_pos[known].astype(int).to_numpy()),
    eng.loc[known, 'weight'].to_numpy(),
)

# Normalising rows
ui_norm = normalize(ui, norm='l2', axis=1)
# Computing low-rank SVD on user-item matrix to get latent factors
# (rank capped at 50 and at one below the smaller matrix dimension)
svd_ui = TruncatedSVD(n_components=min(50, min(ui_norm.shape)-1), random_state=42)
ui_factors = svd_ui.fit_transform(ui_norm)  # user latent
item_factors = svd_ui.components_.T        # post latent

# Helper scoring components
content_sim: cosine between user_emb and post_emb

In [8]:
def content_sim_for_user(user_idx):
    """Cosine similarity between one user's interest embedding and every post.

    `user_idx` is the row of the user inside `user_emb`. When no user
    embeddings were built (`user_emb` is None) a zero vector of length
    `n_posts` is returned instead.
    """
    if user_emb is None:
        return np.zeros(n_posts)
    profile_vec = user_emb[user_idx].reshape(1, -1)
    return cosine_similarity(profile_vec, post_emb).ravel()

# cf score via dot product of user latent and post latent
def cf_score_for_user(user_id):
    """Collaborative-filtering score of every post for `user_id`.

    The score is the dot product of the user's latent factors with each
    post's latent factors, min-max scaled over the posts. Users without an
    entry in `user2idx` (no recorded engagement) get a zero vector of
    length `n_posts`.
    """
    row = user2idx.get(user_id) if user_id in user2idx else None
    if row is None:
        return np.zeros(n_posts)
    raw = np.dot(item_factors, ui_factors[row])
    lowest = raw.min()
    spread = raw.max() - lowest
    # The small constant keeps the division defined when all scores are equal.
    return (raw - lowest) / (spread + 1e-9)

# interest overlap: Jaccard between user interest keywords and post tags
def _keyword_set(value):
    """Turn a comma-separated string (or a list of keywords) into a clean set."""
    if isinstance(value, (list, tuple, set, frozenset)):
        parts = value
    elif value is None or (not isinstance(value, str) and pd.isna(value)):
        return set()
    else:
        parts = str(value).split(',')
    # Strip whitespace and lowercase so "Art" and " art" compare equal;
    # drop empty tokens so blank fields never match each other.
    cleaned = (str(part).strip().lower() for part in parts)
    return {token for token in cleaned if token}


def interest_overlap(user_row, posts_df):
    """Jaccard overlap between a user's interest keywords and each post's tags.

    Parameters
    ----------
    user_row : pandas.Series or dict
        User record. The interests are read from `top_3_interests`, or from
        `interests` when the former is absent.
    posts_df : pandas.DataFrame
        Posts table; the `tags` column is read.

    Returns
    -------
    numpy.ndarray
        One value in [0, 1] per row of `posts_df`: |U ∩ T| / |U ∪ T|, and 0.0
        when both sets are empty. All zeros when the user record has no
        interest field or `posts_df` has no `tags` column.
    """
    interest_key = next(
        (key for key in ('top_3_interests', 'interests') if key in user_row),
        None,
    )
    if interest_key is None or 'tags' not in posts_df.columns:
        return np.zeros(len(posts_df))

    user_keywords = _keyword_set(user_row[interest_key])
    overlaps = []
    for tags in posts_df['tags']:
        post_keywords = _keyword_set(tags)
        union = user_keywords | post_keywords
        if not union:
            overlaps.append(0.0)
        else:
            overlaps.append(len(user_keywords & post_keywords) / len(union))
    return np.array(overlaps, dtype=float)

# Recency and popularity
post created_at assumed ISO or unix; computing age in days

In [9]:
# Parse creation timestamps when the column exists; otherwise mark every post as undated.
if 'created_at' not in posts.columns:
    posts['created_at'] = pd.NaT
else:
    posts['created_at'] = pd.to_datetime(posts['created_at'], errors='coerce')

today = pd.Timestamp.now()
post_age = today - posts['created_at']
# Undated/unparseable posts are treated as 9999 days old, i.e. effectively no boost.
posts['age_days'] = post_age.dt.days.fillna(9999)
# Exponential decay with a 30-day time constant (half-life of roughly 21 days).
posts['recency_boost'] = np.exp(-posts['age_days'] / 30.0)

# Popularity: summed engagement weight per post, min-max scaled; posts with no engagement get 0.
weight_per_post = eng.groupby('post_id')['weight'].sum()
post_eng_counts = weight_per_post.reindex(posts['post_id']).fillna(0).values
min_count = post_eng_counts.min()
max_count = post_eng_counts.max()
posts['popularity'] = (post_eng_counts - min_count) / (max_count - min_count + 1e-9)

# Full scoring and recommendation function

In [10]:
def recommend_for_user(user_row, user_id, top_k=None, engaged_penalty=0.5):
    """Score every post for one user and return the highest-scoring ones.

    The score is the linear blend
    ``ALPHA*content + BETA*cf + GAMMA*interest + DELTA*recency + EPS*popularity``.
    Posts that already appear in the user's engagement rows (whatever their
    weight) are deprioritised by multiplying their score by `engaged_penalty`.

    Parameters
    ----------
    user_row : pandas.Series
        The user's row from `users`; passed to `interest_overlap`.
    user_id
        Identifier looked up in `users['user_id']` and `user2idx`.
    top_k : int, optional
        Number of posts to return; defaults to the module constant `TOP_K`.
    engaged_penalty : float, optional
        Multiplier applied to the score of posts the user already engaged with.

    Returns
    -------
    (list, numpy.ndarray)
        The recommended post ids (best first) and their scores.
    """
    if top_k is None:
        top_k = TOP_K

    # user_emb rows follow the row order of `users`, so we need the user's
    # *position* in the frame, not its index label.
    user_positions = np.flatnonzero((users['user_id'] == user_id).to_numpy())
    user_pos = int(user_positions[0]) if user_positions.size else None

    # content similarity (zeros when the user or the embeddings are unavailable)
    if user_pos is not None and user_emb is not None:
        content_scores = cosine_similarity(user_emb[user_pos].reshape(1, -1), post_emb).ravel()
    else:
        content_scores = np.zeros(n_posts)

    # collaborative score and interest overlap
    cf_scores = cf_score_for_user(user_id)
    interest_scores = interest_overlap(user_row, posts)

    # recency and popularity
    recency = posts['recency_boost'].values
    popularity = posts['popularity'].values

    # combining
    score = (
        ALPHA * content_scores
        + BETA * cf_scores
        + GAMMA * interest_scores
        + DELTA * recency
        + EPS * popularity
    )

    # deprioritise posts the user already engaged with
    if user_id in user2idx:
        engaged_post_ids = eng.loc[eng['user_id'] == user_id, 'post_id'].unique()
        engaged_post_idxs = [post2idx[p] for p in engaged_post_ids if p in post2idx]
        if engaged_post_idxs:
            score[engaged_post_idxs] *= engaged_penalty

    # A stable sort makes the ranking of tied scores reproducible (post order).
    top_idx = np.argsort(-score, kind='stable')[:top_k]
    recommended_post_ids = posts.iloc[top_idx]['post_id'].tolist()
    return recommended_post_ids, score[top_idx]


# Producing recommendations for all users and create dataframe

In [11]:
# One output row per user: user_id followed by rec_1 .. rec_<TOP_K>.
recs = []
for _, urow in users.iterrows():
    user_id = urow['user_id']
    top_posts, scores = recommend_for_user(urow, user_id)
    rec_row = {'user_id': user_id}
    # Pad with None when fewer than TOP_K posts are available.
    for rank in range(TOP_K):
        rec_row[f'rec_{rank + 1}'] = top_posts[rank] if rank < len(top_posts) else None
    recs.append(rec_row)
recs_df = pd.DataFrame(recs)
# Saving to CSV
recs_df.to_csv('recommendations_top3.csv', index=False)
# Preview goes last so the notebook actually renders it.
recs_df.head()


# Evaluation helpers
Defining Precision@k, nDCG@k

In [12]:
def precision_at_k(recommended, actual, k=3):
    """Fraction of the first `k` recommendations that appear in `actual`.

    The denominator is always `k`, even when fewer than `k` items were
    recommended. An empty recommendation list scores 0.0.
    """
    if not len(recommended):
        return 0.0
    hits = set(actual).intersection(recommended[:k])
    return len(hits) / k

def dcg_at_k(recommended, actual, k):
    """Discounted cumulative gain of the first `k` recommendations.

    Relevance is binary (1 if the item is in `actual`, else 0) and the item
    at 1-based position p is discounted by log2(p + 1).
    """
    total = 0.0
    for position, item in enumerate(recommended[:k], start=1):
        if item in actual:
            total += 1.0 / math.log2(position + 1)
    return total

def ndcg_at_k(recommended, actual, k):
    """DCG@k divided by the ideal DCG@k.

    The ideal ranking places min(len(actual), k) relevant items at the top.
    Returns 0.0 when that ideal DCG is zero (no relevant items or k <= 0).
    """
    ideal_hits = min(len(actual), k)
    idcg = sum([1.0 / math.log2(rank + 1) for rank in range(1, ideal_hits + 1)])
    if not idcg:
        return 0.0
    return dcg_at_k(recommended, actual, k) / idcg



---

# Title: **Interest-based Content Recommendation — Approach & Evaluation**

1. **Problem statement**  
Recommend top-3 posts for each user using profile interests, past engagement, and post attributes.

2. **Data & assumptions**  
- `Users.csv`: `user_id`, `age`, `gender`, `top_3_interests`, `past_engagement_score`  
- `Posts.csv`: `post_id`, `creator_id`, `content_type`, `tags`  
- `Engagements.csv`: `user_id`, `post_id`, `engagement`  

3. **Approach**
- **Content modeling:** TF-IDF on post text (title/body/tags), reduced via TruncatedSVD → normalized embeddings.
- **Profile matching:** transform user interests through same TF-IDF+SVD pipeline to get user content vectors; compute cosine similarity against posts.
- **Collaborative signal:** build implicit user–item matrix with weighted engagements, normalize and compute low-rank SVD to obtain user/item latent factors; score by latent dot product.
- **Heuristics:** recency (exponential decay), popularity (total weighted engagements normalized), deprioritize already engaged posts optionally.
- **Ensemble scoring:** linear combination with tunable weights α..ε (set by hand in this notebook; intended to be optimized on a validation set).

4. **Evaluation**
- Metrics: **Precision@3**, **Recall@3**, **nDCG@3**, **MAP**.  
- Validation: Time-aware split (train on engagements up to T, validate on engagements after T) to mimic production.

5. **Extensions**
- Use pretrained sentence embeddings (SBERT / Universal Sentence Encoder) for better semantic matches.  
- Train a learning-to-rank model (e.g., LightGBM ranker or pairwise LambdaMART) with engineered features.  
- Add sequential models (RNN/Transformer) for session-based recency.  
- Use LightFM or implicit ALS for more robust collaborative filtering.  
- Online A/B testing, logged-in/offline evaluation and calibration.

6. **Notes on fairness & safety**
- Provide controls to avoid filter bubbles: introduce serendipity / diversify recommendations by content type or topic.