In [59]:
import pandas as pd
from pathlib import Path
from collections import Counter
import numpy as np

Step 1: Prepare Dataset

In [38]:
# Load the notes table from its tab-separated file, then print the column
# index and the first rows as a quick sanity check.
notes = pd.read_csv(
    "data/notes-00000.tsv",
    sep="\t",
    low_memory=False,
)
print(notes.columns)
print(notes.head())

Index(['noteId', 'noteAuthorParticipantId', 'createdAtMillis', 'tweetId',
       'classification', 'believable', 'harmful', 'validationDifficulty',
       'misleadingOther', 'misleadingFactualError',
       'misleadingManipulatedMedia', 'misleadingOutdatedInformation',
       'misleadingMissingImportantContext', 'misleadingUnverifiedClaimAsFact',
       'misleadingSatire', 'notMisleadingOther',
       'notMisleadingFactuallyCorrect',
       'notMisleadingOutdatedButNotWhenWritten', 'notMisleadingClearlySatire',
       'notMisleadingPersonalOpinion', 'trustworthySources', 'summary',
       'isMediaNote', 'isCollaborativeNote'],
      dtype='str')
                noteId                            noteAuthorParticipantId  \
0  1783179305159200982  C784F04F26E124F4D6EC01658D8F5565005D3092741FB3...   
1  1783181538789605871  C784F04F26E124F4D6EC01658D8F5565005D3092741FB3...   
2  1783182562279494134  C784F04F26E124F4D6EC01658D8F5565005D3092741FB3...   
3  1883711635770196070  C784F04F26E124

In [25]:
# Keep only the first 50000 notes; the copy lets columns be added later
# without touching `notes`.
less_notes = notes.head(50000).copy()
print(less_notes.shape)

(50000, 24)


In [60]:
# Read one ratings shard in full and print which columns it provides.
ratings = pd.read_csv(
    "data/ratings-00001.tsv",
    sep="\t",
    low_memory=False,
)
print(ratings.columns)

Index(['noteId', 'raterParticipantId', 'createdAtMillis', 'version', 'agree',
       'disagree', 'helpful', 'notHelpful', 'helpfulnessLevel', 'helpfulOther',
       'helpfulInformative', 'helpfulClear', 'helpfulEmpathetic',
       'helpfulGoodSources', 'helpfulUniqueContext', 'helpfulAddressesClaim',
       'helpfulImportantContext', 'helpfulUnbiasedLanguage', 'notHelpfulOther',
       'notHelpfulIncorrect', 'notHelpfulSourcesMissingOrUnreliable',
       'notHelpfulOpinionSpeculationOrBias', 'notHelpfulMissingKeyPoints',
       'notHelpfulOutdated', 'notHelpfulHardToUnderstand',
       'notHelpfulArgumentativeOrBiased', 'notHelpfulOffTopic',
       'notHelpfulSpamHarassmentOrAbuse', 'notHelpfulIrrelevantSources',
       'notHelpfulOpinionSpeculation', 'notHelpfulNoteNotNeeded',
       'ratedOnTweetId', 'ratingSourceBucketed'],
      dtype='str')


In [51]:
# IDs of the selected notes.
note_ids = set(less_notes["noteId"])

# Give every note its own, initially empty, list in a "ratings" column.
less_notes["ratings"] = [list() for _ in less_notes.index]

# Empty accumulators for rating labels and numeric scores.
labels_dict, scores_dict = dict(), dict()

# Numeric score assigned to each helpfulness label.
score_map = dict(
    [
        ("HELPFUL", 1.0),
        ("SOMEWHAT_HELPFUL", 0.5),
        ("NOT_HELPFUL", -1.0),
    ]
)

In [52]:
# Collect the helpfulness ratings of the selected notes from every ratings shard.
note_ids = set(less_notes["noteId"])
rating_files = sorted(Path("data").glob("ratings-*.tsv"))
usecols = ["noteId", "helpfulnessLevel"]

# Start from empty accumulators in this cell. They are otherwise created in a
# different cell, so re-running only this cell would append every rating a
# second time and inflate the per-note counts.
labels_dict = {}
scores_dict = {}

for fp in rating_files:
    # Stream each shard in chunks and load only the two columns that are used.
    for chunk in pd.read_csv(fp, sep="\t", usecols=usecols, chunksize=200_000):
        # Keep ratings that belong to a selected note and carry a label.
        chunk = chunk.loc[chunk["noteId"].isin(note_ids)].dropna(
            subset=["helpfulnessLevel"]
        )
        if chunk.empty:
            continue

        # Store the original labels, grouped per note.
        g_labels = chunk.groupby("noteId")["helpfulnessLevel"].apply(list)
        for nid, labs in g_labels.items():
            labels_dict.setdefault(nid, []).extend(labs)

        # Store numeric scores; labels missing from score_map map to NaN and
        # are dropped, so they appear in labels_dict but not in scores_dict.
        scored = chunk.assign(score=chunk["helpfulnessLevel"].map(score_map)).dropna(
            subset=["score"]
        )
        g_scores = scored.groupby("noteId")["score"].apply(list)
        for nid, sc in g_scores.items():
            scores_dict.setdefault(nid, []).extend(sc)

In [53]:
# Attach the collected labels and scores to each note; notes with no entry
# in the accumulators get an empty list.
less_notes["rating_labels"] = less_notes["noteId"].apply(
    lambda note_id: labels_dict.get(note_id, [])
)
less_notes["rating_scores"] = less_notes["noteId"].apply(
    lambda note_id: scores_dict.get(note_id, [])
)

# Number of stored labels per note.
less_notes["n_ratings"] = less_notes["rating_labels"].map(len)


In [58]:
# Preview the first five notes, including the new rating columns.
less_notes.head(n=5)

Unnamed: 0,noteId,noteAuthorParticipantId,createdAtMillis,tweetId,classification,believable,harmful,validationDifficulty,misleadingOther,misleadingFactualError,...,notMisleadingOutdatedButNotWhenWritten,notMisleadingClearlySatire,notMisleadingPersonalOpinion,trustworthySources,summary,isMediaNote,isCollaborativeNote,n_ratings,rating_labels,rating_scores
0,1783179305159200982,C784F04F26E124F4D6EC01658D8F5565005D3092741FB3...,1713978050878,1783159712986382830,MISINFORMED_OR_POTENTIALLY_MISLEADING,,,,0,0,...,0,0,0,1,The House failed to pass a border protection l...,0,0,16,"[NOT_HELPFUL, HELPFUL, NOT_HELPFUL, NOT_HELPFU...","[-1.0, 1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0..."
1,1783181538789605871,C784F04F26E124F4D6EC01658D8F5565005D3092741FB3...,1713978583415,1783171851818021181,MISINFORMED_OR_POTENTIALLY_MISLEADING,,,,0,1,...,0,0,0,1,The United States has 50 States https://da...,0,0,336,"[NOT_HELPFUL, HELPFUL, HELPFUL, HELPFUL, HELPF...","[-1.0, 1.0, 1.0, 1.0, 1.0, -1.0, 1.0, 1.0, -1...."
2,1783182562279494134,C784F04F26E124F4D6EC01658D8F5565005D3092741FB3...,1713978827435,1783154445682979015,MISINFORMED_OR_POTENTIALLY_MISLEADING,,,,0,0,...,0,0,0,1,TikTok only mentions “ban” and chooses to igno...,0,0,50,"[HELPFUL, HELPFUL, HELPFUL, NOT_HELPFUL, HELPF...","[1.0, 1.0, 1.0, -1.0, 1.0, -1.0, 1.0, 1.0, 1.0..."
3,1883711635770196070,C784F04F26E124F4D6EC01658D8F5565005D3092741FB3...,1737946826294,1883619411774345444,MISINFORMED_OR_POTENTIALLY_MISLEADING,,,,1,0,...,0,0,0,1,This could be considered a threat https://...,0,0,334,"[NOT_HELPFUL, NOT_HELPFUL, NOT_HELPFUL, NOT_HE...","[-1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1.0, -1...."
4,1537142913737428992,5684B38EB58FD8BE75ABA37F0BE040EC70380B002ADF9D...,1655318404027,1377030478167937024,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_MANY,CONSIDERABLE_HARM,EASY,0,1,...,0,0,0,1,Forbes has a good rundown of the investigation...,0,0,3,"[HELPFUL, HELPFUL, NOT_HELPFUL]","[1.0, 1.0, -1.0]"


In [61]:
# Per-note rating aggregates (mean / median / mode).
# Ordinal rank of each helpfulness label, plus the inverse lookup.
label_order = {
    label: rank
    for rank, label in enumerate(("NOT_HELPFUL", "SOMEWHAT_HELPFUL", "HELPFUL"))
}
order_to_label = {rank: label for label, rank in label_order.items()}

def mean_score(scores):
    """Return the arithmetic mean of ``scores`` as a float.

    An empty (falsy) input yields ``np.nan`` instead of a mean.
    """
    if not scores:
        return np.nan
    return float(np.mean(scores))

def median_label(labels):
    """Return the median helpfulness label of ``labels``.

    Labels that are not keys of ``label_order`` are ignored. The remaining
    labels are ranked via ``label_order`` and the element at index ``n // 2``
    of the sorted ranks is returned, so an even count resolves to the
    upper-middle label. Returns ``np.nan`` when no known label is present.
    """
    if labels:
        ranks = [label_order[name] for name in labels if name in label_order]
    else:
        ranks = []
    if not ranks:
        return np.nan
    ranks.sort()
    # Centre element; for an even count this is the upper of the two middles.
    return order_to_label[ranks[len(ranks) // 2]]

def mode_label(labels):
    """Return the most frequent non-missing label in ``labels``.

    Missing values (as judged by ``pd.notna``) are skipped. When several
    labels share the highest count, the one that occurs first in ``labels``
    is returned. Returns ``np.nan`` if nothing remains to count.
    """
    if not labels:
        return np.nan
    present = [lab for lab in labels if pd.notna(lab)]
    if not present:
        return np.nan
    counts = Counter(present)
    highest = max(counts.values())
    # Counter keeps first-seen order, so the first hit is the earliest tied label.
    return next(lab for lab, cnt in counts.items() if cnt == highest)

# Reduce each note's rating lists to one value per aggregate.
less_notes["mean_score"] = less_notes["rating_scores"].map(mean_score)
less_notes["median_label"] = less_notes["rating_labels"].map(median_label)
less_notes["mode_label"] = less_notes["rating_labels"].map(mode_label)


In [65]:
# Show the three aggregates for the note at position 1.
print(
    less_notes.iloc[1][
        ["mean_score", "median_label", "mode_label"]
    ]
)

mean_score      0.092262
median_label     HELPFUL
mode_label       HELPFUL
Name: 1, dtype: object
