# Imports and Data Loading

In [6]:
import sys
import numpy as np
sys.path.append('../')
from src.models.baselines import PopularRecommender, SimpleJaccard, CosineKNN
import pandas as pd
from src.models.models import NetStatKNN, GNNHandler
from src.models.evaluator import Evaluator
import src.util.tigergraph_util as tgu
import tqdm
import torch
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
import warnings
warnings.filterwarnings('ignore')

This is the data the baselines use for training as they are non graph-based.

In [2]:
df = pd.read_csv('../data/out/user_subreddit.csv', header=None)

# Our user:
In this notebook, we will make some subreddit recommendations for user 'lowpass'.

In [3]:
user = 'lowpass'

# Evaluation:
Creating an evaluation helper:

In [4]:
evaluator = Evaluator('../data/out/test_interactions.csv', subset=500)
at_k = [1, 3, 5, 10, 25]

# Baseline recommender models:

## Popularity recommender:
Recommends popular subreddits user does not belong to

In [5]:
pop_rec = PopularRecommender(df)
pop_rec.recommend(user, n=5)

['politics', 'IAmA', 'trees', 'worldnews', 'science']

In [6]:
evaluator.precision_recall(pop_rec, at_k)

{'@1': (0.1, 0.09583333333333334),
 '@3': (0.09444444444444455, 0.27291666666666664),
 '@5': (0.07291666666666657, 0.35),
 '@10': (0.047500000000000105, 0.4583333333333333),
 '@25': (0.025166666666666504, 0.6072916666666667)}

precision: 0.02990719257540542

recall:    0.2524693404043752

## Jaccard similarity recommender:
Recommends subreddits by determining what similar users (determined by Jaccard similarity) belong to

In [7]:
jaccard_rec = SimpleJaccard(df)
jaccard_rec.recommend(user)

['learnprogramming']

In [8]:
evaluator.precision_recall(jaccard_rec, at_k)

{'@1': (0.025, 0.022916666666666665),
 '@3': (0.020138888888888887, 0.058333333333333334),
 '@5': (0.01958333333333333, 0.09479166666666666),
 '@10': (0.020416666666666628, 0.19791666666666666),
 '@25': (0.020499999999999928, 0.49270833333333336)}

precision: 0.009563066776587017

recall:    0.07598568418848715

## Cosine similarity KNN recommender:
Recommends subreddits by determining what similar users (determined by nearest-neighbor similarity in a KNN) belong to

In [9]:
knn_rec = CosineKNN(df)
knn_rec.recommend(user)

['beer']

In [10]:
evaluator.precision_recall(knn_rec, at_k)

{'@1': (0.06041666666666667, 0.06041666666666667),
 '@3': (0.0423611111111111, 0.12708333333333333),
 '@5': (0.03729166666666662, 0.17708333333333334),
 '@10': (0.03879464285714283, 0.26666666666666666),
 '@25': (0.040257181611571886, 0.3458333333333333)}

precision: 0.02346024330772864

recall:    0.12531798374749745

# Interaction graph recommender models:

In [11]:
conn = tgu.connection('../config/tigergraph.json')
df = conn.getVertexDataFrame('user', select='fastrp_embedding')
df = pd.concat([df['v_id'].to_frame(), df['fastrp_embedding'].apply(pd.Series)], axis=1)
df.columns = ['v_id', 'pagerank', 'louvain', 'label_prop', 'degree']
embeddings = pd.read_csv('../data/out/user.csv', header=None)
embeddings = embeddings.rename(columns={0:'v_id'})
embeddings['v_id'] = embeddings['v_id'].astype(str)
df['v_id'] = df['v_id'].astype(str)
user_data = df.merge(embeddings, on='v_id', how='inner')
subreddit_data = pd.read_csv('../data/out/subreddit.csv', header=None)
user_subreddit = pd.read_csv('../data/out/user_subreddit.csv', header=None)

reddit_graph = NetStatKNN(conn)

## Centrality KNN community recommender:
Recommends subreddits by determining what similar users (determined by nearest neighbors of the following centrality metrics: pagerank, louvain, label propagation, degree) belong to

In [12]:
reddit_graph.fit(user_data, subreddit_data, user_subreddit, 10)

In [17]:
evaluator.precision_recall(reddit_graph, at_k, subset_size=500)

{'@1': (0.0, 0.0),
 '@3': (0.02079141515761234, 0.027699530516431925),
 '@5': (0.0338028169014084, 0.05908995989478886),
 '@10': (0.07122736418511083, 0.2293605323336342),
 '@25': (0.07557344064386266, 0.4943317205836611)}

## Graph Convolution Network
Uses a graph convolution network to predict links between users and subreddits:

In [2]:
user = pd.read_csv('../data/out/user.csv', header=None)
subreddit = pd.read_csv('../data/out/subreddit.csv', header=None)
user_user = pd.read_csv('../data/out/user_user.csv', header=None)
user_user.dropna(inplace=True)
user_subreddit = pd.read_csv('../data/out/user_subreddit.csv', header=None)

unique_users = user[0].unique()
unique_subreddits = subreddit[0].unique()
user_map = {u:i for i, u in enumerate(unique_users)}
rev_user_map = {i:u for u, i in user_map.items()}
subreddit_map = {u:i for i, u in enumerate(unique_subreddits)}
rev_subreddit_map = {i:u for u, i in subreddit_map.items()}

user[0] = user[0].map(user_map)
subreddit[0] = subreddit[0].map(subreddit_map)
user_subreddit[0], user_subreddit[1] = user_subreddit[0].map(user_map), user_subreddit[1].map(subreddit_map)

data = HeteroData()
data['user'].node_id = torch.arange(len(unique_users))
data['subreddit'].node_id = torch.arange(len(unique_subreddits))
data['user'].x = torch.tensor(user.drop(columns=[0]).values)
data['subreddit'].x = torch.tensor(subreddit.drop(columns=[0]).values)
data['user', 'commented_in', 'subreddit'].edge_index = torch.tensor(user_subreddit.drop(columns=[2]).values).T
data = T.ToUndirected()(data)

In [3]:
gnn = torch.load('gnn.pt')
gnn_handler = GNNHandler(data)

In [56]:
gnn_handler.test_data

HeteroData(
  [1muser[0m={
    node_id=[243809],
    x=[243809, 1250]
  },
  [1msubreddit[0m={
    node_id=[4420],
    x=[4420, 1250]
  },
  [1m(user, commented_in, subreddit)[0m={
    edge_index=[2, 629841],
    edge_label=[209946],
    edge_label_index=[2, 209946]
  },
  [1m(subreddit, rev_commented_in, user)[0m={ edge_index=[2, 629841] }
)

In [61]:
preds = []
truth = []
for sample in tqdm.tqdm(gnn_handler.test_loader):
    preds.append(gnn(sample))
    truth.append(sample['user', 'commented_in', 'subreddit'].edge_label)

100%|██████████| 1641/1641 [02:32<00:00, 10.77it/s]


In [74]:
torch.cat(preds, dim=0) - torch.cat(truth, dim=0)

tensor([-27.1711,   3.0866, -67.8404,  ..., -88.2594,   1.4111, -27.4865],
       grad_fn=<SubBackward0>)

In [91]:
precision = []
k = 5
for pred, t in zip(preds, truth):
    recs = set(np.where(pred > 0)[:k][0])
    relevant = set(np.where(t > 0)[0])
    relevant_recs = relevant.intersection(recs)
    precision.append((len(relevant_recs) / len(recs)) / k)
sum(precision) / len(precision)

TypeError: iteration over a 0-d tensor

In [60]:
gnn_handler.test_data['user', 'commented_in', 'subreddit'].edge_label.to(torch.int16)

tensor([1, 1, 1,  ..., 0, 0, 0], dtype=torch.int16)

In [63]:
gnn.eval()
with torch.no_grad():
    for sample in gnn_handler.test_loader:
        # x = batch['user'].x # node features
        # edge_index = batch.edge_index # edge indices
        node_emb = gnn(sample)

user_emb = node_emb[0] # user embedding
subreddit_emb = node_emb[1:] # subreddit embeddings

# Calculate relevance scores using dot product
relevance_scores = (user_emb @ subreddit_emb.T).squeeze()
sorted_scores, sorted_indices = torch.sort(relevance_scores, descending=True)

# Recommend top-k subreddits
k = 10
topk_indices = sorted_indices[:k]
recommended_subreddits = [data.x[topk_index] for topk_index in topk_indices]

RuntimeError: both arguments to matmul need to be at least 1D, but they are 0D and 1D

In [69]:
preds[0]

tensor([-2.6171e+01,  5.0866e+00, -6.6840e+01, -3.1928e+01,  6.9193e+00,
        -1.9449e+01,  6.0662e+00, -1.4203e+02, -3.0919e+01,  7.9207e+00,
        -2.1436e+01,  2.6394e+00, -4.3705e+01,  4.2754e+00, -1.4541e+01,
        -6.0923e+01, -1.8664e+02, -1.0626e+02,  3.6333e+00, -1.0438e+02,
        -4.1458e+01, -9.6985e+00, -1.2371e+02, -6.4670e+01, -1.0691e+02,
        -1.1019e+02,  2.9756e+00, -2.0708e+01, -2.1019e+01,  2.9112e-01,
        -1.3068e+02, -1.1109e+02, -6.1581e+01, -7.0139e+00, -2.2070e+00,
        -1.5022e+01,  5.4217e+00, -6.6455e+00, -2.9591e+00, -2.8664e+01,
        -6.2047e+01,  5.3950e+00,  5.5349e-01, -1.5419e+01, -2.3033e+01,
        -1.3211e+01, -3.7338e+01, -1.2623e+02, -2.4077e+01, -7.6420e+01,
        -1.0495e+02, -1.3695e+02, -4.8100e+01, -4.1117e+01,  4.4821e+00,
        -3.5613e+01, -4.8283e+01, -1.9142e+02, -2.7311e+01, -1.4921e+01,
        -7.2238e+01, -1.4870e+01, -1.0926e+02,  1.8832e+00,  5.9559e+00,
        -1.4235e+02, -8.9788e+01, -5.0970e+01, -3.8