In [None]:
%load_ext autoreload
import os
from functools import partial
from dataclasses import dataclass
from typing import Optional, Callable
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix
from pmf import PoissonMF

# Load & Process Data

In [None]:
data_path = './data/'

# Notes table; the `summary` column is the text displayed in the result tables.
notes = pd.read_csv(os.path.join(data_path, 'notes-00000.tsv'), sep='\t')
# Fill missing summaries with '' BEFORE casting to str: casting first turns NaN
# into the literal string 'nan', which fillna can no longer detect.
notes['summary'] = notes['summary'].fillna('').astype(str).str.strip()

ratings = pd.read_csv(os.path.join(data_path, 'ratings-00000.tsv'), sep='\t')

# Drop rows with NaN in helpfulnessLevel column
ratings = ratings.dropna(subset=['helpfulnessLevel'])

In [None]:
# Activity thresholds (strictly greater-than) applied to notes and raters.
MIN_RATINGS_PER_NOTE = 5
MIN_RATINGS_PER_RATER = 10

# Print total number of ratings
print('Total number of ratings: {}'.format(len(ratings)))

# Print number of unique notes and raters
print('Number of unique notes: {}'.format(ratings['noteId'].nunique()))
print('Number of unique raters: {}'.format(ratings['raterParticipantId'].nunique()))

# Get list of notes with more than MIN_RATINGS_PER_NOTE ratings
note_rating_counts = ratings['noteId'].value_counts()
filtered_note_ids = note_rating_counts[note_rating_counts > MIN_RATINGS_PER_NOTE].index.tolist()
# Report the number of notes that pass the threshold (not the row count of `notes`).
print('Number of notes with more than {} ratings: {}'.format(MIN_RATINGS_PER_NOTE, len(filtered_note_ids)))

# Get list of raters with more than MIN_RATINGS_PER_RATER ratings
rater_counts = ratings['raterParticipantId'].value_counts()
filtered_rater_ids = rater_counts[rater_counts > MIN_RATINGS_PER_RATER].index.tolist()
print('Number of raters with more than {} ratings: {}'.format(MIN_RATINGS_PER_RATER, len(filtered_rater_ids)))

# Keep only ratings whose rater AND note both pass their thresholds.
# Both counts above are computed on the unfiltered ratings, so after this joint
# filter an individual note or rater may end up with fewer ratings than its threshold.
ratings = ratings[ratings['raterParticipantId'].isin(filtered_rater_ids) & ratings['noteId'].isin(filtered_note_ids)]
print('Number of ratings after filtering: {}'.format(len(ratings)))

In [None]:
# Turn the filtered ratings table into three aligned sequences:
# - rating_labels: 'helpfulnessLevel' as a number
#   (NOT_HELPFUL -> -1, SOMEWHAT_HELPFUL -> 0, HELPFUL -> 1)
# - user_idxs: 'raterParticipantId' encoded as consecutive integers
# - note_idxs: 'noteId' encoded as consecutive integers
helpfulness_to_score = {'NOT_HELPFUL': -1, 'SOMEWHAT_HELPFUL': 0, 'HELPFUL': 1}
rating_labels = ratings['helpfulnessLevel'].map(helpfulness_to_score)

# One label encoder per id column; the encoders are kept so indices can be
# mapped back to the original ids later.
user_encoder, note_encoder = LabelEncoder(), LabelEncoder()
user_idxs = user_encoder.fit_transform(ratings['raterParticipantId'])
note_idxs = note_encoder.fit_transform(ratings['noteId'])

n_users, n_notes = len(user_encoder.classes_), len(note_encoder.classes_)

# Sparse exposure matrix (did the user rate the note?): a one at every
# (user, note) position that appears in the ratings.
exposure_values = np.ones_like(rating_labels)
exp_matrix = csr_matrix(
    (exposure_values, (user_idxs, note_idxs)),
    shape=(n_users, n_notes),
)

In [None]:
# Display the problem size; note the tuple order is (notes, users).
n_notes, n_users

# Step 1a: Causal Inference, Exposure Model
Fit Poisson matrix factorization to the exposures/assignments (who rated what). We will then use the reconstructed exposures as substitute confounders.

In [None]:
# Exposure-model settings: one latent component, seeded random state, verbose
# output, and the same value for each of the a/b/c/d hyperparameters.
pmf_config = dict(
    n_components=1,
    random_state=42,
    verbose=True,
    a=0.3,
    b=0.3,
    c=0.3,
    d=0.3,
)
pf = PoissonMF(**pmf_config)
pf.fit(exp_matrix, user_idxs, note_idxs)

In [None]:
# Pull the latent representations out of the fitted Poisson MF model:
# `Eb` is used for the users, the transpose of `Et` for the items (notes).
exp_user_factors = pf.Eb
exp_item_factors = pf.Et.T

# Step 1b: Causal Inference, Outcome Model
Now estimate the outcome model, i.e., matrix factorization on the observed ratings while controlling for the substitute confounders estimated from Step 1a.

In [None]:
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch import nn
from mf import MatrixFactorizationModel, ModelData

In [None]:
# Outcome model: receives the exposure-model factors from Step 1a.
mf_model = MatrixFactorizationModel(
    n_users,
    n_notes,
    exp_user_factors=exp_user_factors,
    exp_item_factors=exp_item_factors,
    n_components=1,
)

# `rating_labels` is a pandas Series whose index still carries the row labels
# that survived filtering. Handing the Series to torch directly makes torch walk
# it as a generic sequence (obj[0], obj[1], ...), which pandas resolves by index
# LABEL -> KeyError or misaligned values. Convert through a NumPy array so the
# values are taken positionally.
rating_tensor = torch.as_tensor(np.asarray(rating_labels), dtype=torch.float32).to(mf_model.device)
user_idxs_tensor = torch.LongTensor(user_idxs).to(mf_model.device)
note_idxs_tensor = torch.LongTensor(note_idxs).to(mf_model.device)
# Every observed rating is an exposure of 1; ones_like already matches the
# dtype and device of `rating_tensor`.
exp_tensor = torch.ones_like(rating_tensor)

data = ModelData(rating_tensor, user_idxs_tensor, note_idxs_tensor, exp_tensor)

In [None]:
# Train the outcome model; `fit` returns the training and validation losses.
train_loss, val_loss = mf_model.fit(
    data,
    epochs=100,
    lr=0.1,
    print_interval=20,
    validate_fraction=0.1,
    print_loss=True,
)

# Step 2: Voting Aggregation
Calculate results for different voting aggregation rules.

In [None]:
# Allow up to 1000 characters per cell so note summaries are not truncated in the result tables.
pd.options.display.max_colwidth = 1000

In [None]:
# Restrict the notes table to the ids that passed the rating-count filter.
passes_note_filter = notes['noteId'].isin(filtered_note_ids)
filtered_notes = notes[passes_note_filter]

# Original note ids in encoded order: position i holds the id encoded as i.
encoded_positions = np.arange(n_notes)
note_ids = note_encoder.inverse_transform(encoded_positions)

## Mean

In [None]:
# Scores from the model's built-in majority-vote aggregation; used below as the
# 'noteScore' column (presumably one value per encoded note — TODO confirm in mf.py).
majority_votes = mf_model.forward_majority_vote()

In [None]:
# Pair each note id with its score, join onto the note metadata, and rank
# from highest to lowest score.
note_results = pd.DataFrame({'noteId': note_ids, 'noteScore': majority_votes})
scored_notes = (
    filtered_notes
    .merge(note_results, on='noteId')
    .sort_values(by='noteScore', ascending=False)
)
scored_notes[['noteId', 'summary', 'noteScore']]

## Quantile

In [None]:
# Aggregate with the 25th percentile: `partial` pins q=0.25 on torch.quantile,
# leaving the remaining arguments to be supplied by get_vote_scores
# (presumably the scores plus a `dim` keyword — TODO confirm in mf.py).
quantile = partial(torch.quantile, q=0.25)
quantile_scores = mf_model.get_vote_scores(quantile)

In [None]:
# Pair each note id with its quantile score, join onto the note metadata, and
# rank from highest to lowest score.
note_results = pd.DataFrame({'noteId': note_ids, 'noteScore': quantile_scores})
scored_notes = (
    filtered_notes
    .merge(note_results, on='noteId')
    .sort_values(by='noteScore', ascending=False)
)
scored_notes[['noteId', 'summary', 'noteScore']]

## Min

In [None]:
def min2(x, dim):
    """Return the minimum values of `x` along `dim`, discarding the argmin indices."""
    smallest, _ = torch.min(x, dim=dim)
    return smallest

# Score each note with the `min2` aggregation (minimum along the dim chosen by get_vote_scores).
min_scores = mf_model.get_vote_scores(min2)

In [None]:
# Pair each note id with its min score, join onto the note metadata, rank from
# highest to lowest score, and show the top 20.
note_results = pd.DataFrame({'noteId': note_ids, 'noteScore': min_scores})
scored_notes = (
    filtered_notes
    .merge(note_results, on='noteId')
    .sort_values(by='noteScore', ascending=False)
)
scored_notes[['noteId', 'summary', 'noteScore']].head(20)

## Max

In [None]:
def max2(x, dim):
    """Return the maximum values of `x` along `dim`, discarding the argmax indices."""
    largest, _ = torch.max(x, dim=dim)
    return largest

# Score each note with the `max2` aggregation (maximum along the dim chosen by get_vote_scores).
max_scores = mf_model.get_vote_scores(max2)

In [None]:
# Pair each note id with its max score, join onto the note metadata, rank from
# highest to lowest score, and show the top 20.
note_results = pd.DataFrame({'noteId': note_ids, 'noteScore': max_scores})
scored_notes = (
    filtered_notes
    .merge(note_results, on='noteId')
    .sort_values(by='noteScore', ascending=False)
)
scored_notes[['noteId', 'summary', 'noteScore']].head(20)

## Controversial (Variance)

In [None]:
# Score each note by variance. torch.var is passed unmodified, so it runs with
# its default settings plus whatever arguments get_vote_scores supplies.
var_scores = mf_model.get_vote_scores(torch.var)

In [None]:
# Pair each note id with its variance score, join onto the note metadata, rank
# from highest (most controversial) to lowest, and show the top 20.
note_results = pd.DataFrame({'noteId': note_ids, 'noteScore': var_scores})
scored_notes = (
    filtered_notes
    .merge(note_results, on='noteId')
    .sort_values(by='noteScore', ascending=False)
)
scored_notes[['noteId', 'summary', 'noteScore']].head(20)

## % approval
Rank by the % approval (where approval is defined as a score being above a particular threshold)

In [None]:
def approval(x, dim, threshold=0.7):
    """Return the fraction of entries along `dim` that are strictly greater than `threshold`."""
    approved = torch.gt(x, threshold)
    return approved.to(torch.float32).mean(dim=dim)

# Score each note with `approval` at its default threshold (0.7).
approval_scores = mf_model.get_vote_scores(approval)

In [None]:
# Pair each note id with its approval score, join onto the note metadata, rank
# from highest to lowest score, and show the top 20.
note_results = pd.DataFrame({'noteId': note_ids, 'noteScore': approval_scores})
scored_notes = (
    filtered_notes
    .merge(note_results, on='noteId')
    .sort_values(by='noteScore', ascending=False)
)
scored_notes[['noteId', 'summary', 'noteScore']].head(20)