In [6]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import norm

# Load Data
users_df = pd.read_csv("users.csv")
posts_df = pd.read_csv("posts.csv")
interactions_df = pd.read_csv("interactions.csv")

# Fix Indexing: Map user_id and post_id to sequential indices
user_mapping = {id: idx for idx, id in enumerate(users_df["user_id"].unique())}
post_mapping = {id: idx for idx, id in enumerate(posts_df["post_id"].unique())}

interactions_df["user_id"] = interactions_df["user_id"].map(user_mapping)
interactions_df["post_id"] = interactions_df["post_id"].map(post_mapping)

# Ensure IDs are within range (avoid index errors)
n_users = len(user_mapping)
n_posts = len(post_mapping)

# Convert implicit feedback into ratings
interactions_df["rating"] = (
    interactions_df["upvotes"] * 1 + interactions_df["saves"] * 3 - interactions_df["downvotes"] * 2
)

# Define PyTorch Dataset
class InteractionDataset(Dataset):
    def __init__(self, df):
        self.users = torch.tensor(df["user_id"].values, dtype=torch.long)
        self.posts = torch.tensor(df["post_id"].values, dtype=torch.long)
        self.ratings = torch.tensor(df["rating"].values, dtype=torch.float32)

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        return self.users[idx], self.posts[idx], self.ratings[idx]

# Create DataLoader
dataset = InteractionDataset(interactions_df)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# Define NCF Model
class NCF(nn.Module):
    def __init__(self, n_users, n_posts, embedding_dim=32):
        super(NCF, self).__init__()
        self.user_embedding = nn.Embedding(n_users, embedding_dim)
        self.post_embedding = nn.Embedding(n_posts, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim * 2, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, user, post):
        user_embedded = self.user_embedding(user)
        post_embedded = self.post_embedding(post)
        x = torch.cat([user_embedded, post_embedded], dim=1)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x.squeeze()

# Initialize Model
model = NCF(n_users, n_posts)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

def train_model(model, dataloader, epochs=5):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for users, posts, ratings in dataloader:
            optimizer.zero_grad()
            predictions = model(users, posts)
            loss = criterion(predictions, ratings)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(dataloader):.4f}")

# Train Model
train_model(model, dataloader)

# Generate NCF-based recommendations
def get_ncf_recommendations(user_id, top_n=50):
    if user_id not in user_mapping:
        return []
    user_idx = user_mapping[user_id]
    post_ids = list(post_mapping.keys())
    predictions = [(post, model(torch.tensor([user_idx]), torch.tensor([post_mapping[post]])).item()) for post in post_ids]
    predictions.sort(key=lambda x: x[1], reverse=True)
    return [post for post, _ in predictions[:top_n]]

# Content-Based Filtering (TF-IDF / Similarity)
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(posts_df["content"])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

def get_content_based_recommendations(user_id, recommendations, top_n=30):
    sim_scores = cosine_sim[[post_mapping[post] for post in recommendations]].mean(axis=0)
    post_indices = np.argsort(sim_scores)[::-1][:top_n]
    return posts_df.iloc[post_indices]["post_id"].tolist()

# Industry & Work Profile Prioritization
def get_industry_recommendations(user_id, recommendations, top_n=20):
    user_industry = users_df[users_df["user_id"] == user_id]["industry"].values[0]
    industry_posts = posts_df[posts_df["industry"] == user_industry]["post_id"].tolist()
    prioritized = [post for post in recommendations if post in industry_posts]
    remaining = [post for post in recommendations if post not in industry_posts]
    return (prioritized + remaining)[:top_n]

# Bayesian Ranking (Wilson Score Interval)
def wilson_score(upvotes, downvotes, confidence=0.95):
    n = upvotes + downvotes
    if n == 0:
        return 0
    z = norm.ppf(1 - (1 - confidence) / 2)
    p = upvotes / n
    return (p + z**2 / (2 * n) - z * ((p * (1 - p) + z**2 / (4 * n)) / n)**0.5) / (1 + z**2 / n)

posts_df["wilson_score"] = posts_df.apply(lambda row: wilson_score(row["upvotes"], row["downvotes"]), axis=1)
posts_df = posts_df.sort_values(by="wilson_score", ascending=False)

def get_final_recommendations(user_id, top_n=10):
    ncf_recs = get_ncf_recommendations(user_id, top_n=50)
    content_recs = get_content_based_recommendations(user_id, ncf_recs, top_n=30)
    industry_recs = get_industry_recommendations(user_id, content_recs, top_n=20)
    final_recommendations = sorted(industry_recs, key=lambda post: posts_df[posts_df["post_id"] == post]["wilson_score"].values[0], reverse=True)
    return final_recommendations[:top_n]

user_id = 5
recommended_posts = get_final_recommendations(user_id, top_n=10)
print("Recommended Posts:", recommended_posts)


Epoch 1/5, Loss: 3.5551
Epoch 2/5, Loss: 3.4543
Epoch 3/5, Loss: 3.4358
Epoch 4/5, Loss: 3.4189
Epoch 5/5, Loss: 3.3650
Recommended Posts: [302, 4593, 4219, 2761, 3926, 2705, 4943, 1308, 2483, 3576]


In [7]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import norm

# Load Data
users_df = pd.read_csv("users.csv")
posts_df = pd.read_csv("posts.csv")
interactions_df = pd.read_csv("interactions.csv")

# Fix Indexing: Map user_id and post_id to sequential indices
user_mapping = {id: idx for idx, id in enumerate(users_df["user_id"].unique())}
post_mapping = {id: idx for idx, id in enumerate(posts_df["post_id"].unique())}

interactions_df["user_id"] = interactions_df["user_id"].map(user_mapping)
interactions_df["post_id"] = interactions_df["post_id"].map(post_mapping)

# Ensure IDs are within range (avoid index errors)
n_users = len(user_mapping)
n_posts = len(post_mapping)

# Convert implicit feedback into ratings
interactions_df["rating"] = (
    interactions_df["upvotes"] * 1 + interactions_df["saves"] * 3 - interactions_df["downvotes"] * 2
)

# Define PyTorch Dataset
class InteractionDataset(Dataset):
    def __init__(self, df):
        self.users = torch.tensor(df["user_id"].values, dtype=torch.long)
        self.posts = torch.tensor(df["post_id"].values, dtype=torch.long)
        self.ratings = torch.tensor(df["rating"].values, dtype=torch.float32)

    def __len__(self):
        return len(self.ratings)

    def __getitem__(self, idx):
        return self.users[idx], self.posts[idx], self.ratings[idx]

# Create DataLoader
dataset = InteractionDataset(interactions_df)
dataloader = DataLoader(dataset, batch_size=64, shuffle=True)

# Define NCF Model
class NCF(nn.Module):
    def __init__(self, n_users, n_posts, embedding_dim=32):
        super(NCF, self).__init__()
        self.user_embedding = nn.Embedding(n_users, embedding_dim)
        self.post_embedding = nn.Embedding(n_posts, embedding_dim)
        self.fc1 = nn.Linear(embedding_dim * 2, 64)
        self.fc2 = nn.Linear(64, 32)
        self.fc3 = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()

    def forward(self, user, post):
        user_embedded = self.user_embedding(user)
        post_embedded = self.post_embedding(post)
        x = torch.cat([user_embedded, post_embedded], dim=1)
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x.squeeze()

# Initialize Model
model = NCF(n_users, n_posts)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

def train_model(model, dataloader, epochs=5):
    model.train()
    for epoch in range(epochs):
        for users, posts, ratings in dataloader:
            optimizer.zero_grad()
            predictions = model(users, posts)
            loss = criterion(predictions, ratings)
            loss.backward()
            optimizer.step()

# Train Model
train_model(model, dataloader)

# Generate NCF-based recommendations
def get_ncf_recommendations(user_id, top_n=50):
    if user_id not in user_mapping:
        return []
    user_idx = user_mapping[user_id]
    post_ids = list(post_mapping.keys())
    predictions = [(post, model(torch.tensor([user_idx]), torch.tensor([post_mapping[post]])).item()) for post in post_ids]
    predictions.sort(key=lambda x: x[1], reverse=True)
    return [post for post, _ in predictions[:top_n]]

# Content-Based Filtering (TF-IDF / Similarity)
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(posts_df["content"])
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

def get_content_based_recommendations(user_id, recommendations, top_n=30):
    sim_scores = cosine_sim[[post_mapping[post] for post in recommendations]].mean(axis=0)
    post_indices = np.argsort(sim_scores)[::-1][:top_n]
    return posts_df.iloc[post_indices]["post_id"].tolist()

# Industry & Work Profile Prioritization
def get_industry_recommendations(user_id, recommendations, top_n=20):
    user_industry = users_df[users_df["user_id"] == user_id]["industry"].values[0]
    industry_posts = posts_df[posts_df["industry"] == user_industry]["post_id"].tolist()
    prioritized = [post for post in recommendations if post in industry_posts]
    remaining = [post for post in recommendations if post not in industry_posts]
    return (prioritized + remaining)[:top_n]

# Bayesian Ranking (Wilson Score Interval)
def wilson_score(upvotes, downvotes, confidence=0.95):
    n = upvotes + downvotes
    if n == 0:
        return 0
    z = norm.ppf(1 - (1 - confidence) / 2)
    p = upvotes / n
    return (p + z**2 / (2 * n) - z * ((p * (1 - p) + z**2 / (4 * n)) / n)**0.5) / (1 + z**2 / n)

posts_df["wilson_score"] = posts_df.apply(lambda row: wilson_score(row["upvotes"], row["downvotes"]), axis=1)
posts_df = posts_df.sort_values(by="wilson_score", ascending=False)

def get_final_recommendations(user_id, top_n=10):
    ncf_recs = get_ncf_recommendations(user_id, top_n=50)
    content_recs = get_content_based_recommendations(user_id, ncf_recs, top_n=30)
    industry_recs = get_industry_recommendations(user_id, content_recs, top_n=20)
    final_recommendations = sorted(industry_recs, key=lambda post: posts_df[posts_df["post_id"] == post]["wilson_score"].values[0], reverse=True)
    return final_recommendations[:top_n]

user_id = 5
recommended_posts = get_final_recommendations(user_id, top_n=10)
print("Recommended Posts:", recommended_posts)


Recommended Posts: [2080, 1502, 4852, 4595, 3908, 779, 3043, 4019, 4591, 4403]
