In [1]:
%load_ext autoreload
import os
from functools import partial
from dataclasses import dataclass
from typing import Optional, Callable
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix
from pmf import PoissonMF

# Load & Process Data

In [2]:
data_path = './data/'
notes = pd.read_csv(os.path.join(data_path, 'notes-00000.tsv'), sep='\t')
# Convert NaN to empty string
notes['summary'] = notes['summary'].astype(str).fillna('').str.strip()

ratings = pd.read_csv(os.path.join(data_path, 'ratings-00000.tsv'), sep='\t')

# Drop rows with NaN in helpfulnessLevel column
ratings = ratings.dropna(subset=['helpfulnessLevel'])

In [3]:
# Print total number of ratings
print('Total number of ratings: {}'.format(len(ratings)))

# Print number of unique notes and raters
print('Number of unique notes: {}'.format(ratings['noteId'].nunique()))
print('Number of unique raters: {}'.format(ratings['raterParticipantId'].nunique()))

# Get list of notes with more than 5 ratings
note_rating_counts = ratings['noteId'].value_counts()
filtered_note_ids = note_rating_counts[note_rating_counts > 5].index.tolist()
print('Number of notes with more than 5 ratings: {}'.format(len(notes)))

# Get list of raters with more than 10 ratings
rater_counts = ratings['raterParticipantId'].value_counts()
filtered_rater_ids = rater_counts[rater_counts > 10].index.tolist()
print('Number of raters with more than 10 ratings: {}'.format(len(filtered_rater_ids)))

# Filter ratings to only include ratings rated by raters with more than 10 ratings and for notes with more than 5 ratings
ratings = ratings[ratings['raterParticipantId'].isin(filtered_rater_ids) & ratings['noteId'].isin(filtered_note_ids)]
print('Number of ratings after filtering: {}'.format(len(ratings)))

Total number of ratings: 5347044
Number of unique notes: 113682
Number of unique raters: 131866
Number of notes with more than 5 ratings: 122641
Number of raters with more than 10 ratings: 54350
Number of ratings after filtering: 5003405


In [4]:
# Convert the ratings matrix to three lists:
# - rating_labels, which is the 'helpfulnessLevel' column mapped to -1 for 'NOT_HELPFUL',
#   0 for 'SOMEWHAT_HELPFUL', and 1 for 'HELPFUL'
# - user_idxs, which is the 'raterParticipantId' column mapped to a unique integer
# - note_idxs, which is the 'noteId' column mapped to a unique integer
rating_labels = ratings['helpfulnessLevel'].map({'NOT_HELPFUL': -1, 'SOMEWHAT_HELPFUL': 0, 'HELPFUL': 1})
# Use a label encoder to map the user and note ids to unique integers
user_encoder = LabelEncoder()
note_encoder = LabelEncoder()
user_idxs = user_encoder.fit_transform(ratings['raterParticipantId'])
note_idxs = note_encoder.fit_transform(ratings['noteId'])

n_users = len(user_encoder.classes_)
n_notes = len(note_encoder.classes_)

# Sparse exposure matrix (did the user rate the note?)
exp_matrix = csr_matrix((np.ones_like(rating_labels), (user_idxs, note_idxs)), shape=(n_users, n_notes))

In [5]:
n_notes, n_users

(89064, 54350)

# Step 1a: Causal Inference, Exposure Model
Fit Poisson matrix factorization to the exposures/assignments (who rated what). We will then use the reconstructed exposures as substitute confounders.

In [6]:
pf = PoissonMF(n_components=1, random_state=42, verbose=True, a=0.3, b=0.3, c=0.3, d=0.3)
pf.fit(exp_matrix, user_idxs, note_idxs)

In [7]:
# Latent representations learned by Poisson MF
exp_user_factors, exp_item_factors = pf.Eb, pf.Et.T

# Step 1b: Causal Inference, Outcome Model
Now estimate the outcome model, i.e., matrix factorization on the observed ratings while controlling for the substitute confounders estimated from Step 1a.

In [8]:
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch import nn
from mf import MatrixFactorizationModel, ModelData

In [9]:
mf_model = MatrixFactorizationModel(n_users, n_notes, exp_user_factors, exp_item_factors, n_components=1)

rating_tensor = torch.FloatTensor(rating_labels).to(mf_model.device)
user_idxs_tensor = torch.LongTensor(user_idxs).to(mf_model.device)
note_idxs_tensor = torch.LongTensor(note_idxs).to(mf_model.device)
exp_tensor = torch.ones_like(rating_tensor).to(mf_model.device)

data = ModelData(rating_tensor, user_idxs_tensor, note_idxs_tensor, exp_tensor)

In [10]:
train_loss, val_loss = mf_model.fit(data, epochs=100, lr=0.1, print_interval=20, validate_fraction=0.1, print_loss=True)

Epoch 0: train L2-reg loss = 0.479, val L2-reg loss = 0.817
Epoch 0: train MSE = 0.810, val MSE = 0.813
Epoch 20: train L2-reg loss = 0.246, val L2-reg loss = 0.470
Epoch 20: train MSE = 0.387, val MSE = 0.433
Epoch 40: train L2-reg loss = 0.227, val L2-reg loss = 0.441
Epoch 40: train MSE = 0.352, val MSE = 0.400
Epoch 60: train L2-reg loss = 0.223, val L2-reg loss = 0.438
Epoch 60: train MSE = 0.347, val MSE = 0.395
Epoch 80: train L2-reg loss = 0.223, val L2-reg loss = 0.438
Epoch 80: train MSE = 0.346, val MSE = 0.396


# Step 2: Voting Aggregation
Calculate results for different voting aggregation rules.

In [11]:
pd.set_option('display.max_colwidth', 1000)

In [12]:
filtered_notes = notes[notes['noteId'].isin(filtered_note_ids)]
note_ids = note_encoder.inverse_transform(np.arange(n_notes))

## Mean

In [13]:
majority_votes = mf_model.forward_majority_vote()

In [14]:
note_results = pd.DataFrame({'noteId': note_ids, 'noteScore': majority_votes})
scored_notes = filtered_notes.merge(note_results, on='noteId')
scored_notes = scored_notes.sort_values(by='noteScore', ascending=False)
scored_notes.head(20)[['noteId', 'summary', 'noteScore']]

Unnamed: 0,noteId,summary,noteScore
73036,1653431562489286657,"This is not MH370. MH370 has never been found as of 2023, this photo is a picture of a underwater Lockheed Martin L1011 Tristar that was abandoned at an airport and sunken to attract diving tourism. https://www.cnn.com/travel/article/red-sea-underwater-abandoned-plane/index.html",0.888682
58370,1651791523552395266,Cannons are lethal. https://www.civilwarmed.org/effects-of-artillery/,0.874562
28252,1590463688397430784,The color of the like button has not changed. Twitter Changed The Color Of The Like Button is a frequently repeated joke and troll on Twitter leading other users to test whether the like button color has indeed changed and in the process give the original poster a like. https://knowyourmeme.com/memes/twitter-changed-the-color-of-the-like-button,0.858748
52872,1629730684364558336,"Home Equity Lines of Credit (HELOCs) do not allow someone to borrow from “themselves” rather than a bank. The money is loaned by a bank, with home equity as collateral for the loan. https://www.rocketmortgage.com/learn/home-equity-line-of-credit https://www.bankofamerica.com/mortgage/learn/what-is-a-home-equity-line-of-credit/ https://www.nerdwallet.com/article/mortgages/heloc-home-equity-line-of-credit https://www.wsj.com/buyside/personal-finance/what-is-a-home-equity-line-of-credit-01669668213",0.840181
69335,1666616867551674369,Twitch's Terms of Service still state that Twitch users/streamers may not insert embedded advertisements or banner ads https://www.twitch.tv/p/en/legal/terms-of-service/#12-advertisements (Section 12),0.833417
75279,1640026483438780416,This is an AI generated image. There are currently no reports of any communications between Mary Barra and Elon Musk. Mary Barra is married to Anthony Barra. https://www.nytimes.com/2013/12/11/business/gm-names-first-female-chief-executive.html https://wagcenter.com/corporate-wags/gms-mary-barras-husband-anthony-e-barra/ (Low quality source).,0.826717
50618,1651712018674483200,"The Act of emulation is not a crime as noted in the case of Sony Computer Entertainment America, Inc. v. Bleem, LLC. https://h2o.law.harvard.edu/cases/5198 The only illegal part of emulation is distributing copyrighted software and the BIOS files from consoles, the emulators themselves are not illegal.",0.823258
75511,1649126162742771733,"On December 22, 2022, President Biden signed a bill to block the railroad union from striking. https://www.reuters.com/world/us/biden-signs-bill-block-us-railroad-strike-2022-12-02/",0.817721
49530,1671683633692790786,"The person in the video is using multiple accessibility mode options, such as auto dodge and health regen, as well as only being on the literal prologue/introduction of the game. https://www.ign.com/articles/final-fantasy-16-has-accessibility-items-instead-of-difficulty-levels",0.813102
62839,1668230186897137664,"While the article is grounded and written by Ukraine experts, the headline is misleading. Chernobyl is already Ukraine's own Chernobyl, because it is a city in Ukraine. https://en.wikipedia.org/wiki/Chernobyl",0.809197


## Quantile

In [16]:
quantile = partial(torch.quantile, q=0.25)
quantile_scores = mf_model.get_vote_scores(quantile)

In [17]:
note_results = pd.DataFrame({'noteId': note_ids, 'noteScore': quantile_scores})
scored_notes = filtered_notes.merge(note_results, on='noteId')
scored_notes = scored_notes.sort_values(by='noteScore', ascending=False)
scored_notes.head(20)[['noteId', 'summary', 'noteScore']]

Unnamed: 0,noteId,summary,noteScore
58370,1651791523552395266,Cannons are lethal. https://www.civilwarmed.org/effects-of-artillery/,0.822283
73036,1653431562489286657,"This is not MH370. MH370 has never been found as of 2023, this photo is a picture of a underwater Lockheed Martin L1011 Tristar that was abandoned at an airport and sunken to attract diving tourism. https://www.cnn.com/travel/article/red-sea-underwater-abandoned-plane/index.html",0.803039
28252,1590463688397430784,The color of the like button has not changed. Twitter Changed The Color Of The Like Button is a frequently repeated joke and troll on Twitter leading other users to test whether the like button color has indeed changed and in the process give the original poster a like. https://knowyourmeme.com/memes/twitter-changed-the-color-of-the-like-button,0.767002
52872,1629730684364558336,"Home Equity Lines of Credit (HELOCs) do not allow someone to borrow from “themselves” rather than a bank. The money is loaned by a bank, with home equity as collateral for the loan. https://www.rocketmortgage.com/learn/home-equity-line-of-credit https://www.bankofamerica.com/mortgage/learn/what-is-a-home-equity-line-of-credit/ https://www.nerdwallet.com/article/mortgages/heloc-home-equity-line-of-credit https://www.wsj.com/buyside/personal-finance/what-is-a-home-equity-line-of-credit-01669668213",0.765462
69335,1666616867551674369,Twitch's Terms of Service still state that Twitch users/streamers may not insert embedded advertisements or banner ads https://www.twitch.tv/p/en/legal/terms-of-service/#12-advertisements (Section 12),0.756355
75279,1640026483438780416,This is an AI generated image. There are currently no reports of any communications between Mary Barra and Elon Musk. Mary Barra is married to Anthony Barra. https://www.nytimes.com/2013/12/11/business/gm-names-first-female-chief-executive.html https://wagcenter.com/corporate-wags/gms-mary-barras-husband-anthony-e-barra/ (Low quality source).,0.734186
50618,1651712018674483200,"The Act of emulation is not a crime as noted in the case of Sony Computer Entertainment America, Inc. v. Bleem, LLC. https://h2o.law.harvard.edu/cases/5198 The only illegal part of emulation is distributing copyrighted software and the BIOS files from consoles, the emulators themselves are not illegal.",0.727678
49530,1671683633692790786,"The person in the video is using multiple accessibility mode options, such as auto dodge and health regen, as well as only being on the literal prologue/introduction of the game. https://www.ign.com/articles/final-fantasy-16-has-accessibility-items-instead-of-difficulty-levels",0.727458
62839,1668230186897137664,"While the article is grounded and written by Ukraine experts, the headline is misleading. Chernobyl is already Ukraine's own Chernobyl, because it is a city in Ukraine. https://en.wikipedia.org/wiki/Chernobyl",0.726681
77410,1669987097506205696,"**CAUTION** This ad is a scam. The video is of a completely different drone and is not what you will receive. The drone in the video is the Autel EVO Lite and costs about $1,800 ($1,350 when on sale) https://shop.autelrobotics.com/collections/fathers-day-2023/products/drones-evo-lite",0.714123


## Min

In [18]:
def min2(x, dim):
  return torch.min(x, dim=dim).values

min_scores = mf_model.get_vote_scores(min2)

In [19]:
note_results = pd.DataFrame({'noteId': note_ids, 'noteScore': min_scores})
scored_notes = filtered_notes.merge(note_results, on='noteId')
scored_notes = scored_notes.sort_values(by='noteScore', ascending=False)
scored_notes.head(20)[['noteId', 'summary', 'noteScore']]

Unnamed: 0,noteId,summary,noteScore
58370,1651791523552395266,Cannons are lethal. https://www.civilwarmed.org/effects-of-artillery/,0.262883
73036,1653431562489286657,"This is not MH370. MH370 has never been found as of 2023, this photo is a picture of a underwater Lockheed Martin L1011 Tristar that was abandoned at an airport and sunken to attract diving tourism. https://www.cnn.com/travel/article/red-sea-underwater-abandoned-plane/index.html",0.21266
69335,1666616867551674369,Twitch's Terms of Service still state that Twitch users/streamers may not insert embedded advertisements or banner ads https://www.twitch.tv/p/en/legal/terms-of-service/#12-advertisements (Section 12),0.185582
28252,1590463688397430784,The color of the like button has not changed. Twitter Changed The Color Of The Like Button is a frequently repeated joke and troll on Twitter leading other users to test whether the like button color has indeed changed and in the process give the original poster a like. https://knowyourmeme.com/memes/twitter-changed-the-color-of-the-like-button,0.179882
75279,1640026483438780416,This is an AI generated image. There are currently no reports of any communications between Mary Barra and Elon Musk. Mary Barra is married to Anthony Barra. https://www.nytimes.com/2013/12/11/business/gm-names-first-female-chief-executive.html https://wagcenter.com/corporate-wags/gms-mary-barras-husband-anthony-e-barra/ (Low quality source).,0.162253
52872,1629730684364558336,"Home Equity Lines of Credit (HELOCs) do not allow someone to borrow from “themselves” rather than a bank. The money is loaned by a bank, with home equity as collateral for the loan. https://www.rocketmortgage.com/learn/home-equity-line-of-credit https://www.bankofamerica.com/mortgage/learn/what-is-a-home-equity-line-of-credit/ https://www.nerdwallet.com/article/mortgages/heloc-home-equity-line-of-credit https://www.wsj.com/buyside/personal-finance/what-is-a-home-equity-line-of-credit-01669668213",0.149927
62839,1668230186897137664,"While the article is grounded and written by Ukraine experts, the headline is misleading. Chernobyl is already Ukraine's own Chernobyl, because it is a city in Ukraine. https://en.wikipedia.org/wiki/Chernobyl",0.1328
67577,1618423074046578689,"This is a satirical tweet from a parody account. &quot;Anita B. Etin&quot; is not a real person. The name is a play on words that sounds out &quot;Anita be eating.&quot; There is no such thing as a PhD in &quot;Body Positive Medicine,&quot; and neither of the books listed in the account's bio exist. https://www.findaphd.com/phds/browsebysubject.aspx https://en.wikipedia.org/wiki/Doctor_of_Medicine",0.131342
25365,1661890286195204102,"Roger Waters is in-character as Pink Floyd, a rock star that overdoses and descends into madness, hallucinating he is a dictator at a fascist rally, and the audience are his supporters. It is a role famously played by Bob Geldof in the movie “Pink Floyd: The Wall” (1982). https://www.imdb.com/title/tt0084503/ https://en.m.wikipedia.org/wiki/The_Wall",0.128732
24718,1651618890370400256,"Some tweets with false claims may not have Community Notes. This system is limited in reach by participation, and is not impervious to errors or perpetuating common misconceptions. If you'd like to help us keep improving, consider joining us @CommunityNotes.",0.119169


## Max

In [20]:
def max2(x, dim):
  return torch.max(x, dim=dim).values

max_scores = mf_model.get_vote_scores(max2)

In [21]:
note_results = pd.DataFrame({'noteId': note_ids, 'noteScore': max_scores})
scored_notes = filtered_notes.merge(note_results, on='noteId')
scored_notes = scored_notes.sort_values(by='noteScore', ascending=False)
scored_notes.head(20)[['noteId', 'summary', 'noteScore']]

Unnamed: 0,noteId,summary,noteScore
41063,1641632910263431169,Ms. Pelosi mistakenly says that Trump can prove his innocence at trial. Law in the US assumes the innocence of a defendant and the prosecution must prove guilt for a conviction. https://www.law.cornell.edu/wex/presumption_of_innocence,35.69202
75511,1649126162742771733,"On December 22, 2022, President Biden signed a bill to block the railroad union from striking. https://www.reuters.com/world/us/biden-signs-bill-block-us-railroad-strike-2022-12-02/",32.796707
70960,1661796202554294297,"This incident took place in Milan, Italy on May 24. The transgender individual being subdued by police had been reported after exposing his penis to school children and threatening to infect people with HIV. He then violently attempted to evade arrest. https://reduxx.info/italy-trans-identified-male-arrested-after-stripping-in-front-of-school-threatening-people-with-hiv/ https://www.ilgiorno.it/milano/cronaca/donna-picchiata-polizia-locale-c1pw3u27 https://tg.la7.it/cronaca/pestaggio-trans-a-milano-le-indagini-della-procura-25-05-2023-184758",32.15287
23758,1659334653831000065,"1. Anyone can request removal of info from Wayback. https://help.archive.org/help/how-do-i-request-to-remove-something-from-archive-org/. 2. Brewster Kahle—founder and Board chair of The Internet Archive (a non-profit that runs The Wayback Machine)—has directly responded to &amp; refuted these claims. &quot;Taylor Lorenz is not my niece, nor have I met her.&quot; https://twitter.com/brewster_kahle/status/1659283393753006082 https://archive.org/about/bios.php#brewster",28.817575
73316,1670478147249405952,"Percentage of children living absent their biological fathers: Black children: 64% Hispanic children: 42% White children: 24% Asian children: 16% https://datacenter.aecf.org/data/tables/107-children-in-single-parent-families-by-race-and-ethnicity#detailed/1/any/false/2048,1729,37,871,870,573,869,36,868,867/10,11,9,12,1,185,13/432,431",25.922888
31366,1653027648539377664,"&quot;Gender Queer&quot;, the book shown in the photo, features sexually explicit material. This book contains visual depictions of oral sex, masturbation and adult sexual contact with a minor. https://en.m.wikipedia.org/wiki/Gender_Queer https://a.co/d/2NsDwZD",23.053812
2949,1652817551989305344,"In late 2022, Biden’s campaign team connected with young social media influencers, including Harry Sisson, to draw in voters ages 18-29. https://nypost.com/2023/04/09/biden-turning-to-social-media-influencers-to-tout-agenda-report/ https://www.axios.com/2023/04/09/bidens-digital-strategy-an-army-of-influencers?utm_source=twitter&amp;utm_campaign=editorial&amp;utm_medium=social&amp;utm_content=politics https://www.washingtonpost.com/technology/2022/10/27/tiktok-democrats-influencers-biden/ Sisson recently signed with Palette management. Palette received more than 200k from the DNC in the last 8 months. https://www.linkedin.com/in/harry-sisson https://www.fec.gov/data/disbursements/?two_year_transaction_period=2024&amp;min_date=01/01/2023&amp;max_date=12/31/2024",22.738953
79858,1669888265485271046,"Duane stabbed 14-year-old Karen Slattery 18 times, killing her, then raped her corpse. He committed murder again years later. https://www.cbsnews.com/news/florida-executes-duane-owen-1984-killings-karen-slattery-georgianna-worden/",22.728142
55360,1601567065323851776,President Trump explicitly undermined American faith in election results in the months leading up to the 2020 election. https://www.npr.org/2021/02/08/965342252/timeline-what-trump-told-supporters-for-months-before-they-attacked Enforcing Twitter's Terms of Service is not election interference. https://help.twitter.com/en/rules-and-policies/twitter-rules,22.72155
23065,1664452730138632194,"Commenting, retweeting, and quote tweeting and sharing are restricted/banned.",22.490393


## Controversial (Variance)

In [22]:
var_scores = mf_model.get_vote_scores(torch.var)

In [23]:
note_results = pd.DataFrame({'noteId': note_ids, 'noteScore': var_scores})
scored_notes = filtered_notes.merge(note_results, on='noteId')
scored_notes = scored_notes.sort_values(by='noteScore', ascending=False)
scored_notes.head(20)[['noteId', 'summary', 'noteScore']]

Unnamed: 0,noteId,summary,noteScore
70960,1661796202554294297,"This incident took place in Milan, Italy on May 24. The transgender individual being subdued by police had been reported after exposing his penis to school children and threatening to infect people with HIV. He then violently attempted to evade arrest. https://reduxx.info/italy-trans-identified-male-arrested-after-stripping-in-front-of-school-threatening-people-with-hiv/ https://www.ilgiorno.it/milano/cronaca/donna-picchiata-polizia-locale-c1pw3u27 https://tg.la7.it/cronaca/pestaggio-trans-a-milano-le-indagini-della-procura-25-05-2023-184758",0.507403
17310,1666825935805648896,都市公園法第二条の２ 第五項において「野球場、陸上競技場、水泳プールその他の運動施設で政令で定めるもの」が公園施設として定義されており、水着の撮影は都市公園法に定める運動施設における運動用具を着用している光景を撮影しているのみであり、同法第一条の解釈には該当しない。 https://elaws.e-gov.go.jp/document?lawid=331AC0000000079,0.500676
73316,1670478147249405952,"Percentage of children living absent their biological fathers: Black children: 64% Hispanic children: 42% White children: 24% Asian children: 16% https://datacenter.aecf.org/data/tables/107-children-in-single-parent-families-by-race-and-ethnicity#detailed/1/any/false/2048,1729,37,871,870,573,869,36,868,867/10,11,9,12,1,185,13/432,431",0.500399
55360,1601567065323851776,President Trump explicitly undermined American faith in election results in the months leading up to the 2020 election. https://www.npr.org/2021/02/08/965342252/timeline-what-trump-told-supporters-for-months-before-they-attacked Enforcing Twitter's Terms of Service is not election interference. https://help.twitter.com/en/rules-and-policies/twitter-rules,0.451404
287,1660232132822962177,&quot;政府側は、今回の議長国会見を30分間の予定で実施するとしていた。最初に首相が会見を終えようとした際、開始から約40分が経過していた。&quot; これがすべてだと思います https://mainichi.jp/articles/20230521/k00/00m/010/156000c,0.446571
24658,1670476435214188544,"As of 2021 data, 64% of &quot;Black or African American&quot; children lived in single-parent families. https://datacenter.aecf.org/data/tables/107-children-in-single-parent-families-by-race-and-ethnicity",0.434403
34729,1654863564946001925,This post is misleading. Elon Musk was born into a extremely wealthy family in South Africa. https://sites.imsa.edu/acronym/2021/03/04/elon-musk-is-not-an-entrepreneur-hes-a-rich-deceitful-hack/ https://www.independent.co.uk/space/elon-musk-made-money-rich-b2212599.html,0.429679
44934,1669986609662443520,"Manche sollten Kollektive Anmerkungen nicht für die Untermalung ihrer persönlichen Weltsicht missbrauchen. Tweet/ Artikel geht konkret auf eine Falschdarstellung ein, setzt sie in den Kontext und belegt es. Kein Kommentar notwendig. Plus: Correctiv sind seriös, arbeiten sauber.",0.414386
22567,1659599098247520256,"Health authorities in Europe are debating the benefits and concerns of gender interventions. “But in Finland, Sweden, France, Norway, and the U.K., scientists and public-health officials are warning that, for some young people, these interventions may do more harm than good.” https://www.theatlantic.com/health/archive/2023/04/gender-affirming-care-debate-europe-dutch-protocol/673890/",0.405526
64822,1663705081445855236,La foto está alterada digitalmente para hacer parecer que los ojos de la actriz luzcan más separados de lo que realmente están en los trailers https://youtu.be/kpGo2_d3oYE https://youtu.be/HrjIOgH6IiI,0.398216
