In [1]:
%load_ext autoreload
import os
from functools import partial
from dataclasses import dataclass
from typing import Optional, Callable
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix
from pmf import PoissonMF

# Load & Process Data

In [2]:
data_path = './data/'
notes = pd.read_csv(os.path.join(data_path, 'notes-00000.tsv'), sep='\t')
# Convert NaN to empty string
notes['summary'] = notes['summary'].astype(str).fillna('').str.strip()

ratings = pd.read_csv(os.path.join(data_path, 'ratings-00000.tsv'), sep='\t')

# Drop rows with NaN in helpfulnessLevel column
ratings = ratings.dropna(subset=['helpfulnessLevel'])

In [3]:
# Print total number of ratings
print('Total number of ratings: {}'.format(len(ratings)))

# Print number of unique notes and raters
print('Number of unique notes: {}'.format(ratings['noteId'].nunique()))
print('Number of unique raters: {}'.format(ratings['raterParticipantId'].nunique()))

# Get list of notes with more than 5 ratings
note_rating_counts = ratings['noteId'].value_counts()
filtered_note_ids = note_rating_counts[note_rating_counts > 5].index.tolist()
print('Number of notes with more than 5 ratings: {}'.format(len(notes)))

# Get list of raters with more than 10 ratings
rater_counts = ratings['raterParticipantId'].value_counts()
filtered_rater_ids = rater_counts[rater_counts > 10].index.tolist()
print('Number of raters with more than 10 ratings: {}'.format(len(filtered_rater_ids)))

# Filter ratings to only include ratings rated by raters with more than 10 ratings and for notes with more than 5 ratings
ratings = ratings[ratings['raterParticipantId'].isin(filtered_rater_ids) & ratings['noteId'].isin(filtered_note_ids)]
print('Number of ratings after filtering: {}'.format(len(ratings)))

Total number of ratings: 5347044
Number of unique notes: 113682
Number of unique raters: 131866
Number of notes with more than 5 ratings: 122641
Number of raters with more than 10 ratings: 54350
Number of ratings after filtering: 5003405


In [4]:
# Convert the ratings matrix to three lists:
# - rating_labels, which is the 'helpfulnessLevel' column mapped to -1 for 'NOT_HELPFUL',
#   0 for 'SOMEWHAT_HELPFUL', and 1 for 'HELPFUL'
# - user_idxs, which is the 'raterParticipantId' column mapped to a unique integer
# - note_idxs, which is the 'noteId' column mapped to a unique integer
rating_labels = ratings['helpfulnessLevel'].map({'NOT_HELPFUL': -1, 'SOMEWHAT_HELPFUL': 0, 'HELPFUL': 1})
# Use a label encoder to map the user and note ids to unique integers
user_encoder = LabelEncoder()
note_encoder = LabelEncoder()
user_idxs = user_encoder.fit_transform(ratings['raterParticipantId'])
note_idxs = note_encoder.fit_transform(ratings['noteId'])

n_users = len(user_encoder.classes_)
n_notes = len(note_encoder.classes_)

# Sparse exposure matrix (did the user rate the note?)
exp_matrix = csr_matrix((np.ones_like(rating_labels), (user_idxs, note_idxs)), shape=(n_users, n_notes))

In [5]:
n_notes, n_users

(89064, 54350)

# Step 1a: Causal Inference, Exposure Model
Fit Poisson matrix factorization to the exposures/assignments (who rated what). We will then use the reconstructed exposures as substitute confounders.

In [6]:
pf = PoissonMF(n_components=4, random_state=1, verbose=True, a=0.3, b=0.3, c=0.3, d=0.3)
pf.fit(exp_matrix, user_idxs, note_idxs)

In [7]:
# Latent representations learned by Poisson MF
exp_user_factors, exp_item_factors = pf.Eb, pf.Et.T

# Step 1b: Causal Inference, Outcome Model
Now estimate the outcome model, i.e., matrix factorization on the observed ratings while controlling for the substitute confounders estimated from Step 1a.

In [8]:
%load_ext autoreload
import torch
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch import nn
from mf import MatrixFactorizationModel, ModelData

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [25]:
# Our full model that deconfounds with the substitute confounder from step 1a
deconf_mf_model = MatrixFactorizationModel(
    n_users, n_notes, 
    exp_user_factors=exp_user_factors, exp_item_factors=exp_item_factors,
    n_components=4)

# Regular matrix factorization without deconfounding
mf_model = MatrixFactorizationModel(n_users, n_notes, n_components=4)

rating_tensor = torch.FloatTensor(rating_labels).to(deconf_mf_model.device)
user_idxs_tensor = torch.LongTensor(user_idxs).to(deconf_mf_model.device)
note_idxs_tensor = torch.LongTensor(note_idxs).to(deconf_mf_model.device)
exp_tensor = torch.ones_like(rating_tensor).to(deconf_mf_model.device)

data = ModelData(rating_tensor, user_idxs_tensor, note_idxs_tensor, exp_tensor)

In [26]:
train_loss, val_loss = deconf_mf_model.fit(data, epochs=150, lr=0.1, print_interval=20, validate_fraction=0.1, print_loss=True)

Epoch 0: train L2-reg loss = 0.479, val L2-reg loss = 0.920
Epoch 0: train MSE = 0.920, val MSE = 0.918
Epoch 20: train L2-reg loss = 0.221, val L2-reg loss = 0.470
Epoch 20: train MSE = 0.359, val MSE = 0.433
Epoch 40: train L2-reg loss = 0.191, val L2-reg loss = 0.420
Epoch 40: train MSE = 0.299, val MSE = 0.379
Epoch 60: train L2-reg loss = 0.185, val L2-reg loss = 0.408
Epoch 60: train MSE = 0.289, val MSE = 0.367
Epoch 80: train L2-reg loss = 0.183, val L2-reg loss = 0.405
Epoch 80: train MSE = 0.286, val MSE = 0.365
Epoch 100: train L2-reg loss = 0.183, val L2-reg loss = 0.406
Epoch 100: train MSE = 0.285, val MSE = 0.365
Epoch 120: train L2-reg loss = 0.183, val L2-reg loss = 0.406
Epoch 120: train MSE = 0.285, val MSE = 0.366
Epoch 140: train L2-reg loss = 0.183, val L2-reg loss = 0.407
Epoch 140: train MSE = 0.284, val MSE = 0.366


In [27]:
train_loss, val_loss = mf_model.fit(data, epochs=150, lr=0.1, print_interval=20, validate_fraction=0.1, print_loss=True)

Epoch 0: train L2-reg loss = 0.465, val L2-reg loss = 0.898
Epoch 0: train MSE = 0.898, val MSE = 0.897
Epoch 20: train L2-reg loss = 0.261, val L2-reg loss = 0.489
Epoch 20: train MSE = 0.417, val MSE = 0.467
Epoch 40: train L2-reg loss = 0.239, val L2-reg loss = 0.446
Epoch 40: train MSE = 0.372, val MSE = 0.423
Epoch 60: train L2-reg loss = 0.236, val L2-reg loss = 0.440
Epoch 60: train MSE = 0.368, val MSE = 0.417
Epoch 80: train L2-reg loss = 0.236, val L2-reg loss = 0.439
Epoch 80: train MSE = 0.368, val MSE = 0.417
Epoch 100: train L2-reg loss = 0.236, val L2-reg loss = 0.439
Epoch 100: train MSE = 0.368, val MSE = 0.416
Epoch 120: train L2-reg loss = 0.236, val L2-reg loss = 0.439
Epoch 120: train MSE = 0.368, val MSE = 0.417
Epoch 140: train L2-reg loss = 0.236, val L2-reg loss = 0.439
Epoch 140: train MSE = 0.368, val MSE = 0.417


# Step 2: Voting Aggregation
Calculate results for different voting aggregation rules.

In [31]:
pd.set_option('display.max_colwidth', 1000)

In [33]:
# Define aggregations
def approval(x, dim, threshold=0.7):
    return (x > threshold).float().mean(dim=dim)
quantile = partial(torch.quantile, q=0.25)

# Collect aggregations into dict
filtered_notes = notes[notes['noteId'].isin(filtered_note_ids)]
note_ids = note_encoder.inverse_transform(np.arange(n_notes))
aggs = {'noteId': note_ids}

# Aggregations with deconfounder model
aggs['mean'] = mf_model.forward_majority_vote()
aggs['approval'] = mf_model.get_vote_scores(approval)
#aggs['quantile'] = mf_model.get_vote_scores(quantile)
#aggs['var'] = mf_model.get_vote_scores(torch.var)

# Aggregations with deconfounder mf model
aggs['decon_mean'] = deconf_mf_model.forward_majority_vote()
aggs['decon_approval'] = deconf_mf_model.get_vote_scores(approval)

In [34]:
note_results = pd.DataFrame(aggs)
scored_notes = filtered_notes.merge(note_results, on='noteId')

In [36]:
scored_notes

Unnamed: 0,noteId,noteAuthorParticipantId,createdAtMillis,tweetId,classification,believable,harmful,validationDifficulty,misleadingOther,misleadingFactualError,...,notMisleadingOutdatedButNotWhenWritten,notMisleadingClearlySatire,notMisleadingPersonalOpinion,trustworthySources,summary,isMediaNote,mean,approval,decon_mean,decon_approval
0,1537145358521839617,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9DBDE2415CB32782EAEE,1655318986910,1536848327979016193,NOT_MISLEADING,,,,0,0,...,0,1,1,0,They are expressing a personal opinion in a straightforward manner. This should not need a note.,0,0.043137,0.002263,0.074394,0.050120
1,1537147343715282945,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9DBDE2415CB32782EAEE,1655319460217,1537080831751102467,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_MANY,LITTLE_HARM,EASY,0,0,...,0,0,0,1,Teslas purchased after 12/31/19 are not eligible for US Federal tax credits because they exceeded the initial 200K eligible cars allowed a credit. States like CA have their own programs that issue rebates or credits but many have ended as well due to higher Tesla prices. https://cleanvehiclerebate.org/en/faqs/can-i-apply-rebate-my-tesla-model-3-or-tesla-model-y,0,0.086366,0.003643,0.336637,0.137847
2,1540422295029551104,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9DBDE2415CB32782EAEE,1656100269455,1540087463099736065,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_MANY,CONSIDERABLE_HARM,EASY,0,1,...,0,0,0,1,"The Committee has been found by numerous courts to be constitutional &amp; is not losing the interest of the American people. CNN reports &lt; 20M viewers tuned in to Thursday's presentation about the Jan 6 attack, and it reached a far larger number through social and others. https://www.cnn.com/2022/06/10/media/ratings-january-6-hearings/index.html",0,0.059988,0.000055,0.228562,0.000386
3,1586769867381669889,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9DBDE2415CB32782EAEE,1667150391800,1586411168880807936,NOT_MISLEADING,,,,0,0,...,0,0,0,1,"Clinton is not alone in this claim, the F.B.I. has repeatedly said that extremist violence from right-wing actors is one of the biggest threats confronting the bureau. https://www.nytimes.com/2022/08/13/nyregion/right-wing-rhetoric-threats-violence.html Paul Pelosi’s attacker has written many racist, antisemitic and pro-Trump blog entries. https://www.washingtonpost.com/politics/2022/10/29/paul-pelosi-attack-republicans-target/",0,0.014804,0.012806,0.092647,0.173579
4,1599066819402162177,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9DBDE2415CB32782EAEE,1670082213627,1598827733072560129,NOT_MISLEADING,,,,0,0,...,0,0,1,0,"This tweet is a personal opinion, presumably about reporter Matt Taibbi’s release of previously confidential Twitter internal company communications as provided by Elon Musk.",0,0.203077,0.000846,0.365588,0.092806
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
82526,1667602286296834048,D6826ADE83D2F5EA39AB31E1597F58CA7B2C332B16610FC055D2F0145E3EF571,1686422342262,1667503722447134722,MISINFORMED_OR_POTENTIALLY_MISLEADING,,,,0,1,...,0,0,0,1,小笠原諸島の南洋踊りは大正期、小笠原諸島からミクロネシア諸島に出稼ぎに行った人びとが伝えたもの。「アイヌ以外にもヤマトではない部族が雑居していたことが分かるネタ」とはいえない。 ソース：南洋踊り保存会公式サイト https://nanyou-odori-hozonkai.amebaownd.com/pages/2590069/page_201902082122,0,0.090093,0.004103,0.475123,0.177130
82527,1670058820989779968,D6826ADE83D2F5EA39AB31E1597F58CA7B2C332B16610FC055D2F0145E3EF571,1687008025773,1669730333368152065,NOT_MISLEADING,,,,0,0,...,0,0,1,1,https://www.zenshoren.or.jp/invoice_qa https://newsdig.tbs.co.jp/articles/-/543571 インボイス制度が導入された場合、インボイスを発行できない年間売上高1000万円以下の事業者との取引について、仕入税額控除が認められなくなる。このため、零細事業者は従来受け取れていた消費税相当分の報酬を受け取ることが出来なくなる可能性があり、全国商工会連合会をはじめとする各種団体が「実質的増税である」としてこの制度に反対している。 インボイス制度が脱税防止を目的とする制度であることは事実としても、「増税である」という指摘がただちに間違っているとはいいがたい。,0,0.064581,0.000294,0.280342,0.018583
82528,1672622338729476096,D6826ADE83D2F5EA39AB31E1597F58CA7B2C332B16610FC055D2F0145E3EF571,1687619216033,1672614857009688581,MISINFORMED_OR_POTENTIALLY_MISLEADING,,,,0,0,...,0,0,0,0,&commat;Sputnik_Not はロシアの国営メディア・Sputnikを模したジョークアカウント。,0,0.049470,0.000000,0.179120,0.000000
82529,1670228739597148161,ABAA5534156E3FEE72AEF02888B876C7BDE329B6005776FBD031BFEFDA7DA8FC,1687048537528,1670174683466027008,MISINFORMED_OR_POTENTIALLY_MISLEADING,,,,0,0,...,0,0,0,0,There is a rising branch of climate experts that have clearly drawn enough data and evidence to stop the spread of unnecessary climate change alarms. While we have bigger issues due to plastic pollution.,0,-0.116277,0.000000,-0.410453,0.000074


In [37]:
scored_notes.to_csv('scored_notes.csv', index=False)