In [102]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, ndcg_score
from scipy.sparse import csr_matrix
from google.colab import files
import time

In [103]:
# Set random seed for reproducibility
torch.manual_seed(42)
np.random.seed(42)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Upload and load data
print("Upload Users.csv, Posts.csv, Engagements.csv")
uploaded = files.upload()
users_df = pd.read_csv('Users.csv')
posts_df = pd.read_csv('Posts.csv')
engagements_df = pd.read_csv('Engagements.csv')

print(f"\nData loaded successfully!")
print(f"Users shape: {users_df.shape}")
print(f"Posts shape: {posts_df.shape}")
print(f"Engagements shape: {engagements_df.shape}")

print(f"\nUsers columns: {list(users_df.columns)}")
print(f"Posts columns: {list(posts_df.columns)}")
print(f"Engagements columns: {list(engagements_df.columns)}")

Upload Users.csv, Posts.csv, Engagements.csv


Saving Users.csv to Users (27).csv
Saving Engagements.csv to Engagements (27).csv
Saving Posts.csv to Posts (27).csv

Data loaded successfully!
Users shape: (50, 5)
Posts shape: (100, 4)
Engagements shape: (1000, 3)

Users columns: ['user_id', 'age', 'gender', 'top_3_interests', 'past_engagement_score']
Posts columns: ['post_id', 'creator_id', 'content_type', 'tags']
Engagements columns: ['user_id', 'post_id', 'engagement']


In [104]:
# Preprocess
users_df['interests_str'] = users_df['top_3_interests'].str.replace('"', '').str.replace(', ', ' ')
posts_df['tags_str'] = posts_df['tags'].str.replace('"', '').str.replace(', ', ' ')
scaler = MinMaxScaler()
users_df['age_norm'] = scaler.fit_transform(users_df[['age']])
users_df['past_engagement_norm'] = scaler.fit_transform(users_df[['past_engagement_score']])

# Encode user and post IDs
user_encoder = LabelEncoder()
post_encoder = LabelEncoder()
users_df['user_idx'] = user_encoder.fit_transform(users_df['user_id'])
posts_df['post_idx'] = post_encoder.fit_transform(posts_df['post_id'])

# TF-IDF for content features
vectorizer = TfidfVectorizer(max_features=200, sublinear_tf=True)
user_tfidf = vectorizer.fit_transform(users_df['interests_str']).toarray()
post_tfidf = vectorizer.transform(posts_df['tags_str']).toarray()
tfidf_size = user_tfidf.shape[1]

# Prepare training data
engagements_df = engagements_df.merge(users_df[['user_id', 'user_idx']], on='user_id')
engagements_df = engagements_df.merge(posts_df[['post_id', 'post_idx']], on='post_id')
X = engagements_df[['user_idx', 'post_idx']].values
y = engagements_df['engagement'].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Build interaction matrix for masking
num_users = len(users_df)
num_posts = len(posts_df)
rows = engagements_df['user_idx'].values
cols = engagements_df['post_idx'].values
values = engagements_df['engagement'].values
user_item_matrix = csr_matrix((values, (rows, cols)), shape=(num_users, num_posts))

# NCF Model
class NCF(nn.Module):
    def __init__(self, num_users, num_posts, embed_size=32, tfidf_size=None):
        super(NCF, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embed_size)
        self.post_embedding = nn.Embedding(num_posts, embed_size)
        self.tfidf_size = tfidf_size
        input_size = embed_size * 2 + tfidf_size * 2
        self.mlp = nn.Sequential(
            nn.Linear(input_size, 64),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, user_ids, post_ids, user_tfidf, post_tfidf, past_engagement):
        user_emb = self.user_embedding(user_ids)
        post_emb = self.post_embedding(post_ids)
        # Concatenate embeddings and TF-IDF features
        concat = torch.cat((user_emb, post_emb, user_tfidf, post_tfidf), dim=1)
        # Weight by past engagement
        weighted_input = concat * (1 + 0.5 * past_engagement.unsqueeze(1))
        output = self.mlp(weighted_input).squeeze()
        return torch.sigmoid(output)

# Initialize model
model = NCF(num_users, num_posts, tfidf_size=tfidf_size).to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

# Convert to tensors
X_train_tensor = torch.LongTensor(X_train).to(device)
y_train_tensor = torch.FloatTensor(y_train).to(device)
user_tfidf_tensor = torch.FloatTensor(user_tfidf[X_train[:, 0]]).to(device)
post_tfidf_tensor = torch.FloatTensor(post_tfidf[X_train[:, 1]]).to(device)
past_engagement_tensor = torch.FloatTensor(users_df['past_engagement_norm'].values[X_train[:, 0]]).to(device)

# Train
print("\n TRAINING NCF MODEL \n")
start_time = time.time()
num_epochs = 10
batch_size = 32

for epoch in range(num_epochs):
    model.train()
    epoch_loss = 0
    num_batches = 0

    for i in range(0, len(X_train), batch_size):
        batch_idx = slice(i, min(i + batch_size, len(X_train)))
        batch_X = X_train_tensor[batch_idx]
        batch_y = y_train_tensor[batch_idx]
        batch_user_tfidf = user_tfidf_tensor[batch_idx]
        batch_post_tfidf = post_tfidf_tensor[batch_idx]
        batch_past_engagement = past_engagement_tensor[batch_idx]

        optimizer.zero_grad()
        output = model(batch_X[:, 0], batch_X[:, 1], batch_user_tfidf, batch_post_tfidf, batch_past_engagement)
        loss = criterion(output, batch_y)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    avg_loss = epoch_loss / num_batches
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

elapsed_time = time.time() - start_time
print(f"\nTraining completed in {elapsed_time:.2f} seconds")

# Evaluation on test set
print("\n EVALUATING ON TEST SET \n")

model.eval()
with torch.no_grad():
    X_test_tensor = torch.LongTensor(X_test).to(device)
    user_tfidf_test = torch.FloatTensor(user_tfidf[X_test[:, 0]]).to(device)
    post_tfidf_test = torch.FloatTensor(post_tfidf[X_test[:, 1]]).to(device)
    past_engagement_test = torch.FloatTensor(users_df['past_engagement_norm'].values[X_test[:, 0]]).to(device)

    test_preds = model(X_test_tensor[:, 0], X_test_tensor[:, 1],
                       user_tfidf_test, post_tfidf_test, past_engagement_test).cpu().numpy()
    test_preds_binary = (test_preds > 0.5).astype(int)

    # Basic metrics
    precision = precision_score(y_test, test_preds_binary, zero_division=0)
    recall = recall_score(y_test, test_preds_binary, zero_division=0)

# Compute NDCG@3 per user (fixed version)
ndcg_scores = []
precision_at_3 = []
recall_at_3 = []

for user_idx in range(num_users):
    # Get test items for this user
    mask = X_test[:, 0] == user_idx
    if mask.sum() < 3:  # Need at least 3 test items
        continue

    user_test_posts = X_test[mask, 1]
    user_test_scores = test_preds[mask]
    user_test_labels = y_test[mask]

    # Get top-3 predictions
    top3_idx = np.argsort(-user_test_scores)[:3]
    top3_labels = user_test_labels[top3_idx]

    # Precision@3
    prec_at_3 = top3_labels.sum() / 3.0
    precision_at_3.append(prec_at_3)

    # Recall@3
    total_relevant = user_test_labels.sum()
    if total_relevant > 0:
        rec_at_3 = top3_labels.sum() / total_relevant
        recall_at_3.append(rec_at_3)

    # NDCG@3 (only if we have enough items)
    if len(user_test_labels) >= 3 and user_test_labels.sum() > 0:
        try:
            # Create relevance arrays padded to same length
            true_relevance = np.zeros(len(user_test_labels))
            true_relevance[user_test_labels == 1] = 1

            pred_relevance = np.zeros(len(user_test_labels))
            pred_relevance[top3_idx] = [3, 2, 1]  # Positional weights

            ndcg = ndcg_score([true_relevance], [pred_relevance], k=3)
            ndcg_scores.append(ndcg)
        except:
            pass

# Average metrics
avg_precision_at_3 = np.mean(precision_at_3) if precision_at_3 else 0
avg_recall_at_3 = np.mean(recall_at_3) if recall_at_3 else 0
avg_ndcg_at_3 = np.mean(ndcg_scores) if ndcg_scores else 0

print("\nRandom Baseline (Estimated):")
print(f"  Precision@3: ~0.0333")
print(f"  Recall@3:    ~0.0500")
print(f"  NDCG@3:      ~0.0400")

print("\nNCF Hybrid Model:")
print(f"  Binary Precision: {precision:.4f}")
print(f"  Binary Recall:    {recall:.4f}")
print(f"  Precision@3:      {avg_precision_at_3:.4f}")
print(f"  Recall@3:         {avg_recall_at_3:.4f}")
print(f"  NDCG@3:           {avg_ndcg_at_3:.4f}")

# Generate Final Recommendations
print("\n GENERATING FINAL RECOMMENDATIONS \n")

model.eval()
all_scores = np.zeros((num_users, num_posts))

with torch.no_grad():
    # Process in batches to avoid memory issues
    batch_size_pred = 1000

    for user_idx in range(num_users):
        user_tensor = torch.LongTensor([user_idx] * num_posts).to(device)
        post_tensor = torch.LongTensor(range(num_posts)).to(device)
        user_tfidf_tensor = torch.FloatTensor(user_tfidf[user_idx:user_idx+1]).repeat(num_posts, 1).to(device)
        post_tfidf_tensor = torch.FloatTensor(post_tfidf).to(device)
        past_eng_tensor = torch.FloatTensor([users_df['past_engagement_norm'].values[user_idx]] * num_posts).to(device)

        scores = model(user_tensor, post_tensor, user_tfidf_tensor, post_tfidf_tensor, past_eng_tensor)
        all_scores[user_idx] = scores.cpu().numpy()

# Mask seen items
seen_mask = user_item_matrix.toarray() > 0
all_scores[seen_mask] = -np.inf

# Get top-3 recommendations per user
final_recommendations = np.argsort(-all_scores, axis=1)[:, :3]

# Create recommendations DataFrame
recommendations_list = []
for user_idx in range(num_users):
    user_id = user_encoder.inverse_transform([user_idx])[0]
    top3_post_idx = final_recommendations[user_idx]
    top3_scores = all_scores[user_idx, top3_post_idx]

    for rank, (post_idx, score) in enumerate(zip(top3_post_idx, top3_scores), 1):
        post_id = post_encoder.inverse_transform([post_idx])[0]
        recommendations_list.append({
            'user_id': user_id,
            'rank': rank,
            'recommended_post_id': post_id,
            'score': float(score)
        })

recommendations_df = pd.DataFrame(recommendations_list)
recommendations_df.to_csv('recommendations.csv', index=False)
print("\n✓ Recommendations saved to 'recommendations.csv'")

print("\nSample recommendations (first 5 users):")
sample_users = user_encoder.inverse_transform(range(min(5, num_users)))
print(recommendations_df[recommendations_df['user_id'].isin(sample_users)])

total_time = time.time() - start_time
print(f"\n Total runtime: {total_time:.2f} seconds \n")

files.download('recommendations.csv')


 TRAINING NCF MODEL 

Epoch 1/10, Loss: 0.6950
Epoch 2/10, Loss: 0.6874
Epoch 3/10, Loss: 0.6803
Epoch 4/10, Loss: 0.6698
Epoch 5/10, Loss: 0.6590
Epoch 6/10, Loss: 0.6461
Epoch 7/10, Loss: 0.6258
Epoch 8/10, Loss: 0.6030
Epoch 9/10, Loss: 0.5836
Epoch 10/10, Loss: 0.5647

Training completed in 0.75 seconds

 EVALUATING ON TEST SET 


Random Baseline (Estimated):
  Precision@3: ~0.0333
  Recall@3:    ~0.0500
  NDCG@3:      ~0.0400

NCF Hybrid Model:
  Binary Precision: 0.5098
  Binary Recall:    0.5253
  Precision@3:      0.5447
  Recall@3:         0.7650
  NDCG@3:           0.7289

 GENERATING FINAL RECOMMENDATIONS 


✓ Recommendations saved to 'recommendations.csv'

Sample recommendations (first 5 users):
   user_id  rank recommended_post_id     score
0       U1     1                 P70  0.732961
1       U1     2                 P78  0.731526
2       U1     3                 P12  0.718602
3      U10     1                  P1  0.892070
4      U10     2                 P12  0.889899


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>