# Recommendation System Pipeline
A posts recommendation system pipeline based on users' interests and the posts they've engaged with.

In [None]:
from google.colab import files

uploaded = files.upload()

Saving Engagements.csv to Engagements.csv
Saving Users.csv to Users.csv
Saving Posts.csv to Posts.csv


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

users = pd.read_csv('Users.csv')
posts = pd.read_csv('Posts.csv')
eng = pd.read_csv('Engagements.csv')
print(users.shape, posts.shape, eng.shape)

(50, 5) (100, 4) (1000, 3)


In [None]:
users.head()

Unnamed: 0,user_id,age,gender,top_3_interests,past_engagement_score
0,U1,24,F,"sports, art, gaming",0.61
1,U2,32,F,"travel, food, fashion",0.93
2,U3,28,Other,"sports, travel, fashion",0.4
3,U4,25,M,"fashion, music, tech",0.53
4,U5,24,M,"fashion, food, fitness",0.8


In [None]:
posts.head()

Unnamed: 0,post_id,creator_id,content_type,tags
0,P1,U44,video,"sports, food"
1,P2,U26,video,"music, travel"
2,P3,U32,text,"sports, travel"
3,P4,U6,image,"music, gaming"
4,P5,U32,image,"food, fashion"


In [None]:
eng.head()

Unnamed: 0,user_id,post_id,engagement
0,U1,P52,1
1,U1,P44,0
2,U1,P1,1
3,U1,P4,1
4,U1,P65,0


In [None]:
posts['tags_clean'] = posts['tags'].fillna('').apply(
    lambda x: ' '.join([t.strip() for t in str(x).split(',')])
)

In [None]:
posts['text'] = posts['tags_clean']

In [43]:
posts.head()

Unnamed: 0,post_id,creator_id,content_type,tags,text,tags_clean
0,P1,U44,video,"sports, food",sports food,sports food
1,P2,U26,video,"music, travel",music travel,music travel
2,P3,U32,text,"sports, travel",sports travel,sports travel
3,P4,U6,image,"music, gaming",music gaming,music gaming
4,P5,U32,image,"food, fashion",food fashion,food fashion


In [None]:
tfv = TfidfVectorizer(max_features=5000, stop_words='english')
post_tfidf = tfv.fit_transform(posts['text'].fillna(''))

In [45]:
# cleaning users' defined interests as earlier where i cleaned tags
users['interests_clean'] = users['top_3_interests'].fillna('').apply(
    lambda x: ' '.join([t.strip() for t in str(x).split(',')])
)

#initialising each user's doc with their interests to start with
users_profile_docs = users['interests_clean'].tolist()

# adding the posts' tags in front of engagements
merged = eng.merge(posts[['post_id','text']], on='post_id', how='left')

In [46]:
merged

Unnamed: 0,user_id,post_id,engagement,text
0,U1,P52,1,music sports
1,U1,P44,0,music
2,U1,P1,1,sports food
3,U1,P4,1,music gaming
4,U1,P65,0,sports
...,...,...,...,...
995,U50,P57,0,sports literature
996,U50,P20,0,fitness
997,U50,P92,1,tech food
998,U50,P94,1,fitness


In [48]:
# user id : tag tag tag....
user_eng_texts = merged.groupby('user_id')['text'].agg(
    lambda x: ' '.join(x.dropna().astype(str))
).to_dict()

In [49]:
user_eng_texts

{'U1': 'music sports music sports food music gaming sports literature travel sports gaming fitness literature art literature music fashion art music food gaming tech sports literature literature music sports sports literature travel',
 'U10': 'tech music tech literature sports fitness literature art art music sports music fashion literature travel literature fitness literature food sports literature food fashion travel literature music food gaming gaming music travel fashion sports art',
 'U11': 'literature fitness sports food art food tech gaming literature music literature travel music fashion sports literature travel fashion sports literature music sports literature tech travel sports literature food travel sports music travel literature fitness',
 'U12': 'fitness fitness sports sports travel music gaming art food sports tech food literature music gaming tech travel fashion food fitness sports art art food sports travel gaming tech literature music music',
 'U13': 'fitness tech spor

In [80]:
# appending each user's engaged-post tags to their profile doc
for i, uid in enumerate(users['user_id']):
    users_profile_docs[i] += ' ' + user_eng_texts.get(uid, '')

In [51]:
users_profile_docs

['sports art gaming music sports music sports food music gaming sports literature travel sports gaming fitness literature art literature music fashion art music food gaming tech sports literature literature music sports sports literature travel music sports music sports food music gaming sports literature travel sports gaming fitness literature art literature music fashion art music food gaming tech sports literature literature music sports sports literature travel',
 'travel food fashion music fashion music gaming food music sports literature art travel music sports food sports art music sports fitness food fitness fitness literature gaming fitness art art food sports literature tech tech travel fashion music fashion music gaming food music sports literature art travel music sports food sports art music sports fitness food fitness fitness literature gaming fitness art art food sports literature tech tech travel fashion',
 'sports travel fashion music sports art gaming fitness gaming t

In [53]:
# user docs to tf-idf vectors
user_tfidf = tfv.transform(users_profile_docs)

In [55]:
# similarity matrix shape - #users X #posts
sim = cosine_similarity(user_tfidf, post_tfidf)

In [56]:
post_ids = posts['post_id'].tolist()
user_ids = users['user_id'].tolist()

# map post_id -> index so we can exclude seen posts
post_id_to_idx = {pid: i for i, pid in enumerate(post_ids)}

# user → posts they've already engaged with (excluding the engaged posts for recommendations)
user_to_engaged = {}
for uid, group in eng.groupby('user_id'):
    user_to_engaged[uid] = set(group['post_id'].tolist())

In [57]:
user_to_engaged

{'U1': {'P1',
  'P27',
  'P32',
  'P33',
  'P38',
  'P39',
  'P4',
  'P44',
  'P52',
  'P57',
  'P6',
  'P60',
  'P61',
  'P65',
  'P76',
  'P83',
  'P85',
  'P88',
  'P91',
  'P96'},
 'U10': {'P10',
  'P23',
  'P28',
  'P31',
  'P36',
  'P37',
  'P40',
  'P42',
  'P5',
  'P52',
  'P63',
  'P66',
  'P73',
  'P76',
  'P78',
  'P85',
  'P90',
  'P96',
  'P98',
  'P99'},
 'U11': {'P1',
  'P11',
  'P16',
  'P2',
  'P23',
  'P27',
  'P34',
  'P39',
  'P42',
  'P43',
  'P47',
  'P48',
  'P54',
  'P57',
  'P63',
  'P68',
  'P71',
  'P87',
  'P94',
  'P96'},
 'U12': {'P17',
  'P19',
  'P22',
  'P3',
  'P38',
  'P4',
  'P42',
  'P46',
  'P51',
  'P54',
  'P55',
  'P62',
  'P63',
  'P7',
  'P72',
  'P75',
  'P87',
  'P91',
  'P92',
  'P95'},
 'U13': {'P10',
  'P17',
  'P18',
  'P24',
  'P27',
  'P30',
  'P32',
  'P39',
  'P41',
  'P47',
  'P51',
  'P61',
  'P62',
  'P63',
  'P7',
  'P77',
  'P79',
  'P84',
  'P86',
  'P92'},
 'U14': {'P1',
  'P14',
  'P16',
  'P17',
  'P20',
  'P23',
  'P28',
  

In [58]:
# Generate recs
recs = []
TOP_N = 3

#iterating users in same order as user_tfidf rows
for u_idx, uid in enumerate(user_ids):
    scores = sim[u_idx].copy()  #  similarity scores between this user and all posts (array length is the number of posts)

    # excluding posts user already engaged with
    for pid in user_to_engaged.get(uid, set()):
        if pid in post_id_to_idx:
            scores[post_id_to_idx[pid]] = -1e9 # a very small value so that it never makes it to the top 3 recommendations

    # picking top-N scores
    top_idx = list(pd.Series(scores).nlargest(TOP_N).index)
    recs.append([uid] + [post_ids[i] for i in top_idx]) # user_id r1 r2 r3

In [59]:
rec_df = pd.DataFrame(recs, columns=['user_id'] + [f"rec_{i+1}" for i in range(TOP_N)])

In [60]:
rec_df

Unnamed: 0,user_id,rec_1,rec_2,rec_3
0,U1,P29,P30,P48
1,U2,P29,P30,P60
2,U3,P82,P85,P26
3,U4,P1,P46,P48
4,U5,P39,P97,P29
5,U6,P48,P23,P77
6,U7,P48,P57,P71
7,U8,P56,P40,P72
8,U9,P56,P17,P40
9,U10,P14,P54,P48


In [75]:
from sklearn.model_selection import train_test_split

# train-test-split
train_eng, test_eng = train_test_split(eng, test_size=0.2, random_state=42, stratify=eng['user_id'])

In [76]:
# merging train engagements with posts to get text
merged_train = train_eng.merge(posts[['post_id','text']], on='post_id', how='left')

# aggregating texts per user
user_eng_texts_train = merged_train.groupby('user_id')['text'].agg(lambda x: ' '.join(x.dropna().astype(str))).to_dict()

# starting with interests
users_profile_docs_train = users['interests_clean'].tolist()
for i, uid in enumerate(users['user_id']):
    users_profile_docs_train[i] += ' ' + user_eng_texts_train.get(uid, '')

# transforming into TF-IDF
user_tfidf_train = tfv.transform(users_profile_docs_train)

In [77]:
sim_train = cosine_similarity(user_tfidf_train, post_tfidf)

# mapping user -> engaged posts in train set
train_user_to_engaged = train_eng.groupby('user_id')['post_id'].agg(set).to_dict()

# generating top-3 recommendations per user, ignoring posts in train_eng
recs_train = []
for u_idx, uid in enumerate(users['user_id']):
    scores = sim_train[u_idx].copy()
    engaged = train_user_to_engaged.get(uid, set())
    for pid in engaged:
        if pid in post_id_to_idx:
            scores[post_id_to_idx[pid]] = -1e9  # excluding train posts
    top = list(pd.Series(scores).nlargest(3).index)
    recs_train.append([uid] + [post_ids[i] for i in top])

rec_df_train = pd.DataFrame(recs_train, columns=['user_id','rec_1','rec_2','rec_3'])

In [78]:
import numpy as np

def precision_at_k_split(recs_df, test_eng_df, K=3):
    precisions = []
    for _, row in recs_df.iterrows():
        uid = row['user_id']
        top_posts = row[['rec_1','rec_2','rec_3']][:K].tolist()
        # testing engagements for this user
        actual_eng = set(test_eng_df[test_eng_df['user_id']==uid].query('engagement==1')['post_id'])
        precisions.append(len(set(top_posts) & actual_eng)/K)
    return np.mean(precisions)

precision_val = precision_at_k_split(rec_df_train, test_eng, K=3)
print("Precision@3:", precision_val)

Precision@3: 0.03333333333333333


In [79]:
def ndcg_at_k_split(recs_df, test_eng_df, K=3):
    ndcgs = []
    for _, row in recs_df.iterrows():
        uid = row['user_id']
        top_posts = row[['rec_1','rec_2','rec_3']][:K].tolist()
        actual_eng = set(test_eng_df[test_eng_df['user_id']==uid].query('engagement==1')['post_id'])
        dcg = sum([1/np.log2(i+2) if p in actual_eng else 0 for i,p in enumerate(top_posts)])
        idcg = sum([1/np.log2(i+2) for i in range(min(len(actual_eng), K))])
        ndcgs.append(dcg/idcg if idcg > 0 else 0)
    return np.mean(ndcgs)

ndcg_val = ndcg_at_k_split(rec_df_train, test_eng, K=3)
print("NDCG@3:", ndcg_val)

NDCG@3: 0.0459216382193173
