# Imports and Data Loading

In [None]:
import sys
import numpy as np
sys.path.append('../')
from src.models.baselines import PopularRecommender, SimpleJaccard, CosineKNN
import pandas as pd
import random
from src.models.models import NetStatKNN, GNNHandler
from src.models.evaluator import Evaluator
import src.util.tigergraph_util as tgu
import tqdm
import torch
from torch_geometric.data import HeteroData
import torch_geometric.transforms as T
import warnings
warnings.filterwarnings('ignore')

The baselines are not graph-based, so they train directly on the user–subreddit interaction table loaded below.

In [None]:
# Raw interaction table, one row per (user, subreddit, times) triple.
# `df` keeps its integer column labels (0, 1, 2) because it is handed to the
# baseline models as-is further down.
df = pd.read_csv('../data/out/user_subreddit.csv', header=None)

# Work on a copy rather than parsing the same CSV a second time; renaming the
# copy's columns leaves `df` untouched.
user_subreddits = df.copy()
user_subreddits.columns = ['user', 'subreddit', 'times']

# The 25 subreddits with the most interaction rows (one row per user/subreddit pair).
pop_subs = set(
    user_subreddits.groupby('subreddit')['times']
    .count()
    .sort_values(ascending=False)
    .head(25)
    .index
)

# Pair each subreddit with its interaction count as (times, subreddit) so that a
# plain tuple sort orders a user's subreddits by count first.
user_subreddits['subreddit times'] = list(
    zip(user_subreddits['times'], user_subreddits['subreddit'])
)
user_subreddits = user_subreddits.groupby('user')['subreddit times'].apply(list)

# user -> list of subreddit names, most-interacted first
# (ties are broken by descending subreddit name, a side effect of the tuple sort).
user_subreddit_map = user_subreddits.apply(
    lambda pairs: [sub for _, sub in sorted(pairs, reverse=True)]
).to_dict()

users = list(user_subreddit_map)
# Clamp the sample size: random.sample raises ValueError when asked for more
# items than the population holds (e.g. on a tiny debugging dataset).
rand_users = random.sample(users, min(5, len(users)))

# Our user:
In this notebook, we will make subreddit recommendations for the user 'lowpass'. Below are the subreddits they have interacted with, sorted from most to least interaction:

In [None]:
# Username that the recommendation cells below query for.
user = 'lowpass'
# Display this user's subreddits. The lists in `user_subreddit_map` are ordered by
# descending interaction count; a KeyError here means the user is not in the data.
user_subreddit_map[user]

# Evaluation:
Creating an evaluation helper:

In [None]:
# Build the evaluation helper from the held-out interactions file.
# NOTE(review): `subset=500` presumably limits evaluation to 500 test users —
# confirm against src/models/evaluator.py.
evaluator = Evaluator('../data/out/test_interactions.csv', subset=500)
# Cutoffs passed to `evaluator.precision_recall` (presumably precision/recall@k).
at_k = [1, 3, 5, 10, 25]

# Baseline recommender models:
The cells below instantiate and train each baseline, then make recommendations for our user:

## Popularity recommender:
Recommends popular subreddits that the user does not already belong to.

In [None]:
# Build the popularity baseline from `df`, the raw user/subreddit interaction
# table loaded in the data-loading cell.
pop_rec = PopularRecommender(df)
# Request 5 recommendations for `user`; the result is the cell's displayed output.
pop_rec.recommend(user, n=5)

In [None]:
#evaluator.precision_recall(pop_rec, at_k)

## Jaccard similarity recommender:
Recommends subreddits that similar users belong to, where user similarity is measured with Jaccard similarity.

In [None]:
# Build the Jaccard-similarity baseline from `df`, the raw user/subreddit
# interaction table loaded in the data-loading cell.
jaccard_rec = SimpleJaccard(df)
# Request 5 recommendations for `user`; the result is the cell's displayed output.
jaccard_rec.recommend(user, n=5)

In [None]:
#evaluator.precision_recall(jaccard_rec, at_k)

## Cosine similarity KNN recommender:
Recommends subreddits that similar users belong to, where similar users are the nearest neighbors found by a cosine-similarity KNN.

In [None]:
# Build the cosine-similarity KNN baseline from `df`, the raw user/subreddit
# interaction table loaded in the data-loading cell.
knn_rec = CosineKNN(df)
# Request 5 recommendations for `user`; the result is the cell's displayed output.
knn_rec.recommend(user, n=5)

In [None]:
# evaluator.precision_recall(knn_rec, at_k)

# Interaction graph recommender models:

In [None]:
# Connect to TigerGraph using the project's connection settings file.
conn = tgu.connection('../config/tigergraph.json')

# Pull the per-user graph statistics. A cell-specific name (`user_stats`) is used
# instead of `df` so the raw interaction table that the baseline cells are built
# from is not overwritten; those cells stay re-runnable after this one.
user_stats = conn.getVertexDataFrame('user', select='fastrp_embedding')
# Expand the (presumably list-valued) attribute into one column per component.
user_stats = pd.concat(
    [
        user_stats['v_id'].to_frame(),
        user_stats['fastrp_embedding'].apply(pd.Series),
    ],
    axis=1,
)
# NOTE(review): the vertex attribute is named `fastrp_embedding`, but its four
# components are labelled as network statistics here — confirm the schema.
user_stats.columns = ['v_id', 'pagerank', 'louvain', 'label_prop', 'degree']
# Normalise the join key to str on both sides so the merge below matches ids
# regardless of how each source typed them.
user_stats['v_id'] = user_stats['v_id'].astype(str)

# Per-user feature table from disk; its first column is the user id.
embeddings = pd.read_csv('../data/out/user.csv', header=None)
embeddings = embeddings.rename(columns={0: 'v_id'})
embeddings['v_id'] = embeddings['v_id'].astype(str)

# Inner join: only users present in both the graph and the feature file are kept.
user_data = user_stats.merge(embeddings, on='v_id', how='inner')

subreddit_data = pd.read_csv('../data/out/subreddit.csv', header=None)
user_subreddit = pd.read_csv('../data/out/user_subreddit.csv', header=None)

reddit_graph = NetStatKNN(conn)
# NOTE(review): the trailing 10 is presumably the number of neighbours —
# confirm against NetStatKNN.fit in src/models/models.py.
reddit_graph.fit(user_data, subreddit_data, user_subreddit, 10)

## Centrality KNN community recommender:
Recommends subreddits that similar users belong to, where similar users are the nearest neighbors computed over the following graph statistics: PageRank, Louvain community, label-propagation community, and degree.

In [None]:
# Request 5 recommendations for `user` from the NetStatKNN model fitted above.
# `user` must still be the username string set near the top of the notebook.
reddit_graph.recommend(user, n=5)

In [None]:
#evaluator.precision_recall(reddit_graph, at_k, subset_size=500)

## Graph Convolution Network
Uses a graph convolutional network (GCN) to predict links between users and subreddits:

In [None]:
# --- Load node and edge tables -------------------------------------------------
# The node frames are named `user_nodes` / `subreddit_nodes` rather than `user` /
# `subreddit` so the username string held in `user` (used by the recommend cells
# above) is not overwritten by a DataFrame.
user_nodes = pd.read_csv('../data/out/user.csv', header=None)
subreddit_nodes = pd.read_csv('../data/out/subreddit.csv', header=None)
# User-user edges are loaded and cleaned here but are not added to the graph below.
user_user = pd.read_csv('../data/out/user_user.csv', header=None)
user_user.dropna(inplace=True)
user_subreddit = pd.read_csv('../data/out/user_subreddit.csv', header=None)

# --- Map raw ids to contiguous integer node indices (and back) ------------------
unique_users = user_nodes[0].unique()
unique_subreddits = subreddit_nodes[0].unique()
user_map = {name: idx for idx, name in enumerate(unique_users)}
rev_user_map = {idx: name for name, idx in user_map.items()}
subreddit_map = {name: idx for idx, name in enumerate(unique_subreddits)}
rev_subreddit_map = {idx: name for name, idx in subreddit_map.items()}

user_nodes[0] = user_nodes[0].map(user_map)
subreddit_nodes[0] = subreddit_nodes[0].map(subreddit_map)
user_subreddit[0] = user_subreddit[0].map(user_map)
user_subreddit[1] = user_subreddit[1].map(subreddit_map)
# An interaction whose user or subreddit is missing from the node tables maps to
# NaN. Such an edge cannot exist in the graph, and keeping it would turn the
# whole edge index into floats containing NaN, so drop those rows.
user_subreddit = user_subreddit.dropna(subset=[0, 1])

# --- Assemble the heterogeneous graph --------------------------------------------
data = HeteroData()
data['user'].node_id = torch.arange(len(unique_users))
data['subreddit'].node_id = torch.arange(len(unique_subreddits))
# Node features are every column except the id column.
# NOTE(review): the feature tensors take whatever dtype pandas inferred
# (float64 for float columns) — confirm that GNNHandler casts as needed.
data['user'].x = torch.tensor(user_nodes.drop(columns=[0]).values)
data['subreddit'].x = torch.tensor(subreddit_nodes.drop(columns=[0]).values)
# Edge index has shape (2, num_edges): row 0 = user index, row 1 = subreddit index.
# Force int64 and a contiguous layout, which is what PyG expects of an edge index.
edge_index = torch.tensor(user_subreddit[[0, 1]].to_numpy(dtype='int64'))
data['user', 'commented_in', 'subreddit'].edge_index = edge_index.t().contiguous()
# Add the reverse edge type so messages can flow subreddit -> user as well.
data = T.ToUndirected()(data)

In [None]:
# Bundle the id<->index lookups in the order (user, reverse user, subreddit,
# reverse subreddit) and hand them to the handler together with the graph.
maps = (user_map, rev_user_map, subreddit_map, rev_subreddit_map)
gnn_handler = GNNHandler(data, maps)
# NOTE(review): presumably builds the GNN with 32 hidden channels — confirm in
# src/models/models.py whether this also trains or loads weights.
gnn_handler.set_model(hidden_channels=32)

In [None]:
# NOTE(review): presumably runs the model to produce the link predictions that the
# evaluation and recommendation cells below rely on — confirm in src/models/models.py.
gnn_handler.predict()

In [None]:
# Evaluate the GNN recommender at each cutoff in `at_k`.
evaluator.precision_recall(gnn_handler, at_k)

In [None]:
# Request 10 recommendations for a hard-coded username (not the `user` variable
# used by the earlier recommend cells).
gnn_handler.recommend('andrewsmith1986', n=10)