In [1]:
import pandas as pd
from pathlib import Path
from collections import Counter
import numpy as np

Step 1: Prepare the dataset

In [11]:
# Load the first notes shard (tab-separated) and print its columns and first rows.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
notes = pd.read_csv("data/notes-00000.tsv", sep="\t", low_memory=False)
print(notes.columns)
print(notes.head())

Index(['noteId', 'noteAuthorParticipantId', 'createdAtMillis', 'tweetId',
       'classification', 'believable', 'harmful', 'validationDifficulty',
       'misleadingOther', 'misleadingFactualError',
       'misleadingManipulatedMedia', 'misleadingOutdatedInformation',
       'misleadingMissingImportantContext', 'misleadingUnverifiedClaimAsFact',
       'misleadingSatire', 'notMisleadingOther',
       'notMisleadingFactuallyCorrect',
       'notMisleadingOutdatedButNotWhenWritten', 'notMisleadingClearlySatire',
       'notMisleadingPersonalOpinion', 'trustworthySources', 'summary',
       'isMediaNote', 'isCollaborativeNote'],
      dtype='str')
                noteId                            noteAuthorParticipantId  \
0  1783179305159200982  C784F04F26E124F4D6EC01658D8F5565005D3092741FB3...   
1  1783181538789605871  C784F04F26E124F4D6EC01658D8F5565005D3092741FB3...   
2  1783182562279494134  C784F04F26E124F4D6EC01658D8F5565005D3092741FB3...   
3  1883711635770196070  C784F04F26E124

In [3]:
# Keep only the first 50,000 notes; .copy() makes the subset independent of `notes`.
less_notes = notes.iloc[:50000].copy()
print(less_notes.shape)

(50000, 24)


In [60]:
# Load a single ratings shard and print its column names to inspect the schema.
# NOTE(review): this reads the entire shard into memory only to print the header.
ratings = pd.read_csv("data/ratings-00001.tsv", sep="\t", low_memory=False)
print(ratings.columns)

Index(['noteId', 'raterParticipantId', 'createdAtMillis', 'version', 'agree',
       'disagree', 'helpful', 'notHelpful', 'helpfulnessLevel', 'helpfulOther',
       'helpfulInformative', 'helpfulClear', 'helpfulEmpathetic',
       'helpfulGoodSources', 'helpfulUniqueContext', 'helpfulAddressesClaim',
       'helpfulImportantContext', 'helpfulUnbiasedLanguage', 'notHelpfulOther',
       'notHelpfulIncorrect', 'notHelpfulSourcesMissingOrUnreliable',
       'notHelpfulOpinionSpeculationOrBias', 'notHelpfulMissingKeyPoints',
       'notHelpfulOutdated', 'notHelpfulHardToUnderstand',
       'notHelpfulArgumentativeOrBiased', 'notHelpfulOffTopic',
       'notHelpfulSpamHarassmentOrAbuse', 'notHelpfulIrrelevantSources',
       'notHelpfulOpinionSpeculation', 'notHelpfulNoteNotNeeded',
       'ratedOnTweetId', 'ratingSourceBucketed'],
      dtype='str')


In [5]:
# Collect every rating row that refers to one of the retained notes.
note_ids = set(less_notes["noteId"])
rating_files = sorted(Path("data").glob("ratings-*.tsv"))
if not rating_files:
    # Fail here with a clear message; otherwise pd.concat below raises the
    # opaque "No objects to concatenate".
    raise FileNotFoundError("No rating shards matching data/ratings-*.tsv were found")
kept = []

for fp in rating_files:
    # Read each shard in 200,000-row chunks so a whole shard never has to be
    # held in memory at once; only matching rows are retained.
    for chunk in pd.read_csv(fp, sep="\t", chunksize=200_000, low_memory=False):
        matched = chunk[chunk["noteId"].isin(note_ids)]
        if not matched.empty:
            kept.append(matched)

if not kept:
    raise ValueError("No rating rows reference a noteId present in less_notes")

ratings_subset = pd.concat(kept, ignore_index=True)

print("notes:", len(less_notes))
print("ratings (rows):", len(ratings_subset))

notes: 50000
ratings (rows): 4020238


In [6]:
# Preview the first rows of the filtered ratings.
ratings_subset.head()

Unnamed: 0,noteId,raterParticipantId,createdAtMillis,version,agree,disagree,helpful,notHelpful,helpfulnessLevel,helpfulOther,...,notHelpfulOutdated,notHelpfulHardToUnderstand,notHelpfulArgumentativeOrBiased,notHelpfulOffTopic,notHelpfulSpamHarassmentOrAbuse,notHelpfulIrrelevantSources,notHelpfulOpinionSpeculation,notHelpfulNoteNotNeeded,ratedOnTweetId,ratingSourceBucketed
0,1562604999355420673,EBE1D39152E086F5D771B669B9FCFC7709B404D15529CF...,1661470073955,2,0,0,0,0,NOT_HELPFUL,0,...,0,0,0,0,0,1,1,0,-1,DEFAULT
1,1564954605158629376,EBE1D39152E086F5D771B669B9FCFC7709B404D15529CF...,1661957250063,2,0,0,0,0,HELPFUL,0,...,0,0,0,0,0,0,0,0,-1,DEFAULT
2,1567623547434192896,EBE1D39152E086F5D771B669B9FCFC7709B404D15529CF...,1662585827945,2,0,0,0,0,HELPFUL,0,...,0,0,0,0,0,0,0,0,-1,DEFAULT
3,1650865349124927491,EBE1D39152E086F5D771B669B9FCFC7709B404D15529CF...,1682438733156,2,0,0,0,0,NOT_HELPFUL,0,...,0,0,0,0,0,0,0,1,1650801827728986112,DEFAULT
4,1908856470994559161,356C830BABBC6636C0167854450FF02866F1BE4C696248...,1743948599611,2,0,0,0,0,NOT_HELPFUL,0,...,0,0,0,0,0,1,0,0,1908520186031841399,DEFAULT


In [8]:
# Save the ratings subset. The file is tab-separated even though it carries a
# .csv extension, so readers must pass sep="\t".
# Create the output directory first so the write also works on a fresh checkout.
Path("processed_data").mkdir(parents=True, exist_ok=True)
ratings_subset.to_csv("processed_data/ratings_subset.csv", sep="\t", index=False)

In [10]:
# Save the notes subset. The file is tab-separated even though it carries a
# .csv extension, so readers must pass sep="\t".
# Create the output directory first so the write also works on a fresh checkout.
Path("processed_data").mkdir(parents=True, exist_ok=True)
less_notes.to_csv("processed_data/less_notes.csv", sep="\t", index=False)

Step 2: Check the status of all data files after running the CN algorithm

In [9]:

# Directory whose .tsv/.csv/.parquet files are profiled in this step.
DATA_DIR = Path("processed_data")

# Base names of the files created earlier in this notebook; their "summary"
# column is dropped before reporting (see drop_summary_if_needed).
HIDE_SUMMARY_FOR_BASES = {"less_notes", "ratings_subset"}

In [10]:
def stem_base(fp: Path) -> str:
    """Return the file name of ``fp`` with its last extension removed.

    Only the final dot-separated part is dropped (``x.tar.gz`` -> ``x.tar``);
    a name containing no dot is returned unchanged.
    """
    head, dot, _extension = fp.name.rpartition(".")
    return head if dot else fp.name

def sniff_sep(fp: Path) -> str:
    """Guess whether ``fp`` is tab- or comma-delimited from its first non-blank line.

    Returns ``"\\t"`` when that line has at least as many tabs as commas
    (ties go to tab), ``","`` otherwise. A file with no non-blank line
    yields ``","``.
    """
    with open(fp, "r", encoding="utf-8", errors="replace") as handle:
        # First line that is not empty or whitespace-only, or None if there is none.
        header = next((raw for raw in handle if raw.strip()), None)

    if header is None:
        return ","
    return "," if header.count(",") > header.count("\t") else "\t"

def load_file(fp: Path) -> pd.DataFrame:
    """Read ``fp`` into a DataFrame, choosing the reader from its suffix.

    ``.parquet`` goes through ``pd.read_parquet``. ``.tsv`` is read as
    tab-separated. ``.csv`` has its delimiter guessed by ``sniff_sep``.

    Raises:
        ValueError: if the suffix is none of ``.parquet``, ``.tsv``, ``.csv``.
    """
    extension = fp.suffix
    if extension == ".parquet":
        return pd.read_parquet(fp)

    if extension == ".tsv":
        delimiter = "\t"
    elif extension == ".csv":
        delimiter = sniff_sep(fp)
    else:
        raise ValueError(f"Unsupported file: {fp}")
    return pd.read_csv(fp, sep=delimiter, low_memory=False)

def drop_summary_if_needed(df: pd.DataFrame, base: str) -> pd.DataFrame:
    """Return ``df`` without its ``summary`` column when ``base`` is listed in
    ``HIDE_SUMMARY_FOR_BASES``; otherwise return ``df`` itself unchanged."""
    if "summary" not in df.columns or base not in HIDE_SUMMARY_FOR_BASES:
        return df
    return df.drop(columns=["summary"])

def safe_sort_for_compare(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` in a canonical row order with a fresh 0..n-1 index.

    Rows are sorted by every well-known key column present in ``df``, in the
    priority order listed below. Using all present keys (rather than only the
    first one found) means rows that tie on the leading key are ordered by the
    remaining keys, so two copies of the same data in different row orders
    sort to the same result. The sort is stable, so rows tying on every key
    keep their input order.

    If none of the key columns exist, only the index is reset.
    """
    candidate_keys = [
        "noteId",
        "tweetId",
        "raterParticipantId",
        "noteAuthorParticipantId",
        "createdAtMillis",
    ]
    present_keys = [key for key in candidate_keys if key in df.columns]
    if not present_keys:
        return df.reset_index(drop=True)
    # "mergesort" is pandas' stable algorithm for a single-key sort; a
    # multi-key sort is stable regardless of `kind`.
    return df.sort_values(by=present_keys, kind="mergesort").reset_index(drop=True)

def top10_signature(df: pd.DataFrame) -> pd.DataFrame:
    """Return the first ten rows of ``df`` with every column cast to pandas'
    ``string`` dtype, so values are compared as text. ``df`` is not modified."""
    leading_rows = df.iloc[:10]
    return leading_rows.astype("string")

def compare_top10(df_a: pd.DataFrame, df_b: pd.DataFrame):
    """Compare two frames by column list and by their first ten rows.

    Returns:
        A ``(same, reason)`` tuple. ``same`` is ``False`` with reason
        ``"Different columns"`` when the column lists (names and order)
        differ, and otherwise reflects whether the ``top10_signature`` of
        both frames are equal.
    """
    if list(df_a.columns) != list(df_b.columns):
        return False, "Different columns"

    rows_match = top10_signature(df_a).equals(top10_signature(df_b))
    if not rows_match:
        return False, "Top 10 rows differ"
    return True, "Top 10 rows match"

In [11]:

def basic_report(df: pd.DataFrame, title: str, max_cols_list: int = 30):
    """Print a plain-text profile of ``df`` and display its first three rows.

    The report covers, in order: shape, column names (abbreviated to the first
    and last names when there are more than ``max_cols_list``), a count of
    columns per dtype, missing-value totals with the ten columns missing the
    most values, the number of fully duplicated rows, distinct-value counts
    for well-known key columns, and a three-row preview.

    Args:
        df: Frame to describe.
        title: Heading printed between the separator rules.
        max_cols_list: Largest number of column names printed in full.

    Relies on ``display`` being available, as it is inside Jupyter/IPython.
    """
    n_rows, n_cols = df.shape

    print("=" * 90)
    print(title)
    print("-" * 90)
    print(f"Shape: {n_rows:,} rows x {n_cols:,} cols")

    column_names = df.columns.tolist()
    if len(column_names) > max_cols_list:
        leading = column_names[:max_cols_list // 2]
        # (-max_cols_list) // 2 floors toward minus infinity, so an odd limit
        # prints one more trailing name than leading names.
        trailing = column_names[(-max_cols_list) // 2:]
        print("Columns (first/last):", leading, "...", trailing)
    else:
        print("Columns:", column_names)

    per_dtype = df.dtypes.astype(str).value_counts()
    print("\nDtypes summary:")
    for dtype_name, n_with_dtype in per_dtype.items():
        print(f"  {dtype_name}: {n_with_dtype}")

    missing_per_col = df.isna().sum()
    cols_with_missing = missing_per_col[missing_per_col > 0].sort_values(ascending=False)
    if n_cols > 0:
        total_cells = n_rows * n_cols
        print(f"\nMissing cells: {int(missing_per_col.sum()):,} ({(missing_per_col.sum()/total_cells if total_cells else 0):.2%})")
    if cols_with_missing.empty:
        print("\nNo missing values detected.")
    else:
        print("Top missing columns:")
        for col_name, n_missing in cols_with_missing.head(10).items():
            print(f"  {col_name}: {n_missing:,} ({n_missing/n_rows:.2%} of rows)")

    # Counting duplicates can fail (e.g. unhashable cell values); the report
    # then notes the skip instead of aborting.
    try:
        n_duplicates = df.duplicated().sum()
        print(f"\nDuplicate rows: {n_duplicates:,}")
    except Exception:
        print("\nDuplicate rows: (skipped; unhashable columns likely)")

    # Distinct-value counts for the key columns that exist in this frame.
    print("\nKey column uniques (if present):")
    for key in ("noteId", "tweetId", "raterParticipantId", "noteAuthorParticipantId"):
        if key not in df.columns:
            continue
        print(f"  {key}: {df[key].nunique(dropna=True):,}")

    print("\nTop 3 rows (preview):")
    display(df.head(3))

In [None]:
# Every supported data file directly inside DATA_DIR, in sorted order.
files = sorted([fp for fp in DATA_DIR.iterdir() if fp.suffix in {".tsv", ".csv", ".parquet"}])

# group by base name, so different formats of the same table are reported together
groups = {}
for fp in files:
    base = stem_base(fp)
    groups.setdefault(base, []).append(fp)

# report each group
for base, fps in sorted(groups.items()):
    # prefer .tsv as the "canonical" if present, then .csv, then anything else
    fps_sorted = sorted(
        fps,
        key=lambda p: (0 if p.suffix == ".tsv" else 1 if p.suffix == ".csv" else 2, p.name)
    )

    # load all versions, normalised the same way so they can be compared
    loaded = []
    for fp in fps_sorted:
        df = load_file(fp)
        df = drop_summary_if_needed(df, base)
        df = safe_sort_for_compare(df)
        loaded.append((fp, df))

    # a table stored in a single format has nothing to compare against
    if len(loaded) == 1:
        fp, df = loaded[0]
        basic_report(df, f"{base} ({fp.suffix})")
        continue

    # compare the first (preferred) against others using top 10
    fp0, df0 = loaded[0]
    all_same = True
    reasons = []
    for fpi, dfi in loaded[1:]:
        same, reason = compare_top10(df0, dfi)
        all_same &= same
        reasons.append((fpi.name, same, reason))

    if all_same:
        # only show the preferred version (tsv if exists)
        # The title previously lacked its closing parenthesis.
        basic_report(df0, f"{base} (showing only {fp0.name}; other formats match on top 10)")
    else:
        print("=" * 90)
        print(f"{base}: formats differ → showing all versions")
        for name, same, reason in reasons:
            print(f"  vs {name}: {reason}")
        print("=" * 90)

        # show every version
        for fp, df in loaded:
            basic_report(df, f"{base} ({fp.name})")

aux_note_info (showing only aux_note_info.tsv; other formats match on top 10)
------------------------------------------------------------------------------------------
Shape: 2,543,490 rows x 67 cols
Columns (first/last): ['noteId', 'ratingWeight', 'createdAtMillis', 'noteAuthorParticipantId', 'awaitingBool', 'numRatingsLast28', 'numPopulationSampledRatings', 'currentStatus', 'crhBool', 'crnhBool', 'unlockedRatingStatus', 'preStabilizationRatingStatus', 'helpfulOther', 'helpfulInformative', 'helpfulClear'] ... ['notHelpfulOutdatedAdjustedRatio', 'notHelpfulHardToUnderstandAdjustedRatio', 'notHelpfulArgumentativeOrBiasedAdjustedRatio', 'notHelpfulOffTopicAdjustedRatio', 'notHelpfulSpamHarassmentOrAbuseAdjustedRatio', 'notHelpfulIrrelevantSourcesAdjustedRatio', 'notHelpfulOpinionSpeculationAdjustedRatio', 'notHelpfulNoteNotNeededAdjustedRatio', 'notHelpfulIncorrect_interval', 'p_incorrect_user_interval', 'num_voters_interval', 'tf_idf_incorrect_interval', 'lowDiligenceIntercept', 'coreN

Unnamed: 0,noteId,ratingWeight,createdAtMillis,noteAuthorParticipantId,awaitingBool,numRatingsLast28,numPopulationSampledRatings,currentStatus,crhBool,crnhBool,...,notHelpfulIrrelevantSourcesAdjustedRatio,notHelpfulOpinionSpeculationAdjustedRatio,notHelpfulNoteNotNeededAdjustedRatio,notHelpfulIncorrect_interval,p_incorrect_user_interval,num_voters_interval,tf_idf_incorrect_interval,lowDiligenceIntercept,coreNegFactor_populationSampledRatingCount,corePosFactor_populationSampledRatingCount
0,1352796878438424576,,1611366884227,B021517F4F8F9F372DCD70BB8F8A21A80C12D7EFDE8B2F...,True,0,0,NEEDS_MORE_RATINGS,False,False,...,,,,,,,,,0,0
1,1353415873227177985,,1611514464087,D4C9EDB464DF2D4A6D8F68CA8D1F86226B50EB9D3B0F33...,True,0,0,NEEDS_MORE_RATINGS,False,False,...,,,,,,,,,0,0
2,1354586938863443971,,1611793667892,C0AF45F4C4B2240E7AB31456957E5D770FF7AFA13627C7...,True,0,0,NEEDS_MORE_RATINGS,False,False,...,,,,,,,,,0,0


helpfulness_scores: formats differ → showing all versions
  vs helpfulness_scores.parquet: Top 10 rows differ
helpfulness_scores (helpfulness_scores.tsv)
------------------------------------------------------------------------------------------
Shape: 615,000 rows x 41 cols
Columns (first/last): ['raterParticipantId', 'coreRaterIntercept', 'coreRaterFactor1', 'crhCrnhRatioDifference', 'meanNoteScore', 'raterAgreeRatio', 'successfulRatingHelpfulCount', 'successfulRatingNotHelpfulCount', 'successfulRatingTotal', 'unsuccessfulRatingHelpfulCount', 'unsuccessfulRatingNotHelpfulCount', 'unsuccessfulRatingTotal', 'ratingsAwaitingMoreRatings', 'ratedAfterDecision', 'notesCurrentlyRatedHelpful'] ... ['groupRaterFactor1', 'modelingGroup', 'raterHelpfulnessReputation', 'numberOfTimesEarnedOut', 'expansionRaterIntercept', 'expansionRaterFactor1', 'expansionPlusRaterIntercept', 'expansionPlusRaterFactor1', 'multiGroupRaterIntercept', 'multiGroupRaterFactor1', 'modelingMultiGroup', 'coreWithTopicsRa

Unnamed: 0,raterParticipantId,coreRaterIntercept,coreRaterFactor1,crhCrnhRatioDifference,meanNoteScore,raterAgreeRatio,successfulRatingHelpfulCount,successfulRatingNotHelpfulCount,successfulRatingTotal,unsuccessfulRatingHelpfulCount,...,expansionRaterFactor1,expansionPlusRaterIntercept,expansionPlusRaterFactor1,multiGroupRaterIntercept,multiGroupRaterFactor1,modelingMultiGroup,coreWithTopicsRaterIntercept,coreWithTopicsRaterFactor1,coreFirstRoundRaterIntercept,coreFirstRoundRaterFactor1
0,000011269AD6F327AED0F4086A732B4052F9D28E8791E1...,,,,,,2.0,0.0,2.0,0.0,...,,,,,,,,,,
1,00002C7FD6E0080A69D0AB879C3D9BB704BEFCC078AD40...,,,,,,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,00003B703F86036C51F4F4B4C9F77B00C92D882421DA73...,,,,,,1.0,0.0,1.0,0.0,...,,,,,,1.0,,,,


helpfulness_scores (helpfulness_scores.parquet)
------------------------------------------------------------------------------------------
Shape: 615,000 rows x 41 cols
Columns (first/last): ['raterParticipantId', 'coreRaterIntercept', 'coreRaterFactor1', 'crhCrnhRatioDifference', 'meanNoteScore', 'raterAgreeRatio', 'successfulRatingHelpfulCount', 'successfulRatingNotHelpfulCount', 'successfulRatingTotal', 'unsuccessfulRatingHelpfulCount', 'unsuccessfulRatingNotHelpfulCount', 'unsuccessfulRatingTotal', 'ratingsAwaitingMoreRatings', 'ratedAfterDecision', 'notesCurrentlyRatedHelpful'] ... ['groupRaterFactor1', 'modelingGroup', 'raterHelpfulnessReputation', 'numberOfTimesEarnedOut', 'expansionRaterIntercept', 'expansionRaterFactor1', 'expansionPlusRaterIntercept', 'expansionPlusRaterFactor1', 'multiGroupRaterIntercept', 'multiGroupRaterFactor1', 'modelingMultiGroup', 'coreWithTopicsRaterIntercept', 'coreWithTopicsRaterFactor1', 'coreFirstRoundRaterIntercept', 'coreFirstRoundRaterFactor1']

Unnamed: 0,raterParticipantId,coreRaterIntercept,coreRaterFactor1,crhCrnhRatioDifference,meanNoteScore,raterAgreeRatio,successfulRatingHelpfulCount,successfulRatingNotHelpfulCount,successfulRatingTotal,unsuccessfulRatingHelpfulCount,...,expansionRaterFactor1,expansionPlusRaterIntercept,expansionPlusRaterFactor1,multiGroupRaterIntercept,multiGroupRaterFactor1,modelingMultiGroup,coreWithTopicsRaterIntercept,coreWithTopicsRaterFactor1,coreFirstRoundRaterIntercept,coreFirstRoundRaterFactor1
0,000011269AD6F327AED0F4086A732B4052F9D28E8791E1...,,,,,,2.0,0.0,2.0,0.0,...,,,,,,,,,,
1,00002C7FD6E0080A69D0AB879C3D9BB704BEFCC078AD40...,,,,,,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,00003B703F86036C51F4F4B4C9F77B00C92D882421DA73...,,,,,,1.0,0.0,1.0,0.0,...,,,,,,1.0,,,,


less_notes (.csv)
------------------------------------------------------------------------------------------
Shape: 50,000 rows x 23 cols
Columns: ['noteId', 'noteAuthorParticipantId', 'createdAtMillis', 'tweetId', 'classification', 'believable', 'harmful', 'validationDifficulty', 'misleadingOther', 'misleadingFactualError', 'misleadingManipulatedMedia', 'misleadingOutdatedInformation', 'misleadingMissingImportantContext', 'misleadingUnverifiedClaimAsFact', 'misleadingSatire', 'notMisleadingOther', 'notMisleadingFactuallyCorrect', 'notMisleadingOutdatedButNotWhenWritten', 'notMisleadingClearlySatire', 'notMisleadingPersonalOpinion', 'trustworthySources', 'isMediaNote', 'isCollaborativeNote']

Dtypes summary:
  int64: 18
  object: 5

Missing cells: 138,558 (12.05%)
Top missing columns:
  believable: 46,186 (92.37% of rows)
  harmful: 46,186 (92.37% of rows)
  validationDifficulty: 46,186 (92.37% of rows)

Duplicate rows: 0

Key column uniques (if present):
  noteId: 50,000
  tweetId: 48

Unnamed: 0,noteId,noteAuthorParticipantId,createdAtMillis,tweetId,classification,believable,harmful,validationDifficulty,misleadingOther,misleadingFactualError,...,misleadingUnverifiedClaimAsFact,misleadingSatire,notMisleadingOther,notMisleadingFactuallyCorrect,notMisleadingOutdatedButNotWhenWritten,notMisleadingClearlySatire,notMisleadingPersonalOpinion,trustworthySources,isMediaNote,isCollaborativeNote
0,1354586938863443971,C0AF45F4C4B2240E7AB31456957E5D770FF7AFA13627C7...,1611793667892,1354585812722085890,MISINFORMED_OR_POTENTIALLY_MISLEADING,BELIEVABLE_BY_FEW,LITTLE_HARM,EASY,0,1,...,0,0,0,0,0,0,0,1,0,0
1,1354595874106564617,09FA91E7E0E9D628076C37B5027CA3A195AF37B3EA0694...,1611795798218,1354584088594669572,NOT_MISLEADING,,,,0,0,...,0,0,0,1,0,0,0,0,0,0
2,1354599474719465476,8123E43A815D28934F68DB62EB58C57A1D78DC63E8BFBE...,1611796656673,1354587070543450116,NOT_MISLEADING,,,,0,0,...,0,0,1,0,0,0,0,1,0,0


note_status_history (showing only note_status_history.tsv; other formats match on top 10)
------------------------------------------------------------------------------------------
Shape: 2,543,490 rows x 23 cols
Columns: ['noteId', 'noteAuthorParticipantId', 'createdAtMillis', 'timestampMillisOfFirstNonNMRStatus', 'firstNonNMRStatus', 'timestampMillisOfCurrentStatus', 'currentStatus', 'timestampMillisOfLatestNonNMRStatus', 'mostRecentNonNMRStatus', 'timestampMillisOfStatusLock', 'lockedStatus', 'timestampMillisOfRetroLock', 'currentCoreStatus', 'currentExpansionStatus', 'currentGroupStatus', 'currentDecidedBy', 'currentModelingGroup', 'timestampMillisOfMostRecentStatusChange', 'timestampMillisOfNmrDueToMinStableCrhTime', 'currentMultiGroupStatus', 'currentModelingMultiGroup', 'timestampMinuteOfFinalScoringOutput', 'timestampMillisOfFirstNmrDueToMinStableCrhTime']

Dtypes summary:
  float64: 11
  object: 10
  int64: 2

Missing cells: 31,561,915 (53.95%)
Top missing columns:
  timestamp

Unnamed: 0,noteId,noteAuthorParticipantId,createdAtMillis,timestampMillisOfFirstNonNMRStatus,firstNonNMRStatus,timestampMillisOfCurrentStatus,currentStatus,timestampMillisOfLatestNonNMRStatus,mostRecentNonNMRStatus,timestampMillisOfStatusLock,...,currentExpansionStatus,currentGroupStatus,currentDecidedBy,currentModelingGroup,timestampMillisOfMostRecentStatusChange,timestampMillisOfNmrDueToMinStableCrhTime,currentMultiGroupStatus,currentModelingMultiGroup,timestampMinuteOfFinalScoringOutput,timestampMillisOfFirstNmrDueToMinStableCrhTime
0,1352796878438424576,B021517F4F8F9F372DCD70BB8F8A21A80C12D7EFDE8B2F...,1611366884227,,,1770185000000.0,NEEDS_MORE_RATINGS,,,1674003000000.0,...,,,ExpansionPlusModel (v1.1),,-1.0,-1.0,,,,
1,1353415873227177985,D4C9EDB464DF2D4A6D8F68CA8D1F86226B50EB9D3B0F33...,1611514464087,,,1770185000000.0,NEEDS_MORE_RATINGS,,,1674003000000.0,...,,,ExpansionPlusModel (v1.1),,-1.0,-1.0,,,,
2,1354586938863443971,C0AF45F4C4B2240E7AB31456957E5D770FF7AFA13627C7...,1611793667892,,,1770185000000.0,NEEDS_MORE_RATINGS,,,1674003000000.0,...,NEEDS_MORE_RATINGS,NEEDS_MORE_RATINGS,CoreModel (v1.1),6.0,-1.0,-1.0,,,,


ratings_subset (.csv)
------------------------------------------------------------------------------------------
Shape: 4,020,238 rows x 33 cols
Columns (first/last): ['noteId', 'raterParticipantId', 'createdAtMillis', 'version', 'agree', 'disagree', 'helpful', 'notHelpful', 'helpfulnessLevel', 'helpfulOther', 'helpfulInformative', 'helpfulClear', 'helpfulEmpathetic', 'helpfulGoodSources', 'helpfulUniqueContext'] ... ['notHelpfulOther', 'notHelpfulIncorrect', 'notHelpfulSourcesMissingOrUnreliable', 'notHelpfulOpinionSpeculationOrBias', 'notHelpfulMissingKeyPoints', 'notHelpfulOutdated', 'notHelpfulHardToUnderstand', 'notHelpfulArgumentativeOrBiased', 'notHelpfulOffTopic', 'notHelpfulSpamHarassmentOrAbuse', 'notHelpfulIrrelevantSources', 'notHelpfulOpinionSpeculation', 'notHelpfulNoteNotNeeded', 'ratedOnTweetId', 'ratingSourceBucketed']

Dtypes summary:
  int64: 30
  object: 3

Missing cells: 2,328 (0.00%)
Top missing columns:
  helpfulnessLevel: 2,328 (0.06% of rows)

Duplicate rows: 0

Unnamed: 0,noteId,raterParticipantId,createdAtMillis,version,agree,disagree,helpful,notHelpful,helpfulnessLevel,helpfulOther,...,notHelpfulOutdated,notHelpfulHardToUnderstand,notHelpfulArgumentativeOrBiased,notHelpfulOffTopic,notHelpfulSpamHarassmentOrAbuse,notHelpfulIrrelevantSources,notHelpfulOpinionSpeculation,notHelpfulNoteNotNeeded,ratedOnTweetId,ratingSourceBucketed
0,1354586938863443971,7D19A98867E54C01B2DE175587FB2C6D9BD78E67226E88...,1710971015961,2,0,0,0,0,SOMEWHAT_HELPFUL,0,...,0,1,0,0,0,0,0,0,1354585812722085890,DEFAULT
1,1354845486180683779,553EEE07DD808F0118BB54CF1A3BAC46E4B381176F9340...,1674186377447,2,0,0,0,0,HELPFUL,0,...,0,0,0,0,0,0,0,0,-1,DEFAULT
2,1354862521111470083,D0FB9A0D7BDF30D932CDFEB01AD21946387CD7C2A17956...,1611860848452,1,1,0,1,0,,0,...,0,0,0,0,0,0,0,0,-1,DEFAULT


scored_notes: formats differ → showing all versions
  vs scored_notes.parquet: Top 10 rows differ
scored_notes (scored_notes.tsv)
------------------------------------------------------------------------------------------
Shape: 2,543,490 rows x 88 cols
Columns (first/last): ['noteId', 'coreNoteIntercept', 'coreNoteFactor1', 'finalRatingStatus', 'firstTag', 'secondTag', 'coreActiveRules', 'activeFilterTags', 'classification', 'createdAtMillis', 'coreRatingStatus', 'metaScorerActiveRules', 'decidedBy', 'expansionNoteIntercept', 'expansionNoteFactor1'] ... ['coreNoteInterceptPopulationSampled', 'coreWithTopicsNoteInterceptNoCorrelated', 'expansionNoteInterceptNoCorrelated', 'expansionPlusNoteInterceptNoCorrelated', 'groupNoteInterceptNoCorrelated', 'multiGroupNoteInterceptNoCorrelated', 'topicNoteInterceptNoCorrelated', 'gaussianNoteIntercept', 'gaussianNoteFactor1', 'gaussianRatingStatus', 'gaussianActiveRules', 'gaussianNoteInterceptNoCorrelated', 'gaussianNoteInterceptNoHighVol', 'gaus

Unnamed: 0,noteId,coreNoteIntercept,coreNoteFactor1,finalRatingStatus,firstTag,secondTag,coreActiveRules,activeFilterTags,classification,createdAtMillis,...,multiGroupNoteInterceptNoCorrelated,topicNoteInterceptNoCorrelated,gaussianNoteIntercept,gaussianNoteFactor1,gaussianRatingStatus,gaussianActiveRules,gaussianNoteInterceptNoCorrelated,gaussianNoteInterceptNoHighVol,gaussianNoteInterceptPopulationSampled,gaussianNumFinalRoundRatings
0,1352796878438424576,,,NEEDS_MORE_RATINGS,,,,,,1611366884227,...,,,,,,,,,,
1,1353415873227177985,,,NEEDS_MORE_RATINGS,,,,,,1611514464087,...,,,,,,,,,,
2,1354586938863443971,,,NEEDS_MORE_RATINGS,,,InitialNMR (v1.0),,MISINFORMED_OR_POTENTIALLY_MISLEADING,1611793667892,...,,,,,NEEDS_MORE_RATINGS,InitialNMR (v1.0),,,,


scored_notes (scored_notes.parquet)
------------------------------------------------------------------------------------------
Shape: 2,543,490 rows x 88 cols
Columns (first/last): ['noteId', 'coreNoteIntercept', 'coreNoteFactor1', 'finalRatingStatus', 'firstTag', 'secondTag', 'coreActiveRules', 'activeFilterTags', 'classification', 'createdAtMillis', 'coreRatingStatus', 'metaScorerActiveRules', 'decidedBy', 'expansionNoteIntercept', 'expansionNoteFactor1'] ... ['coreNoteInterceptPopulationSampled', 'coreWithTopicsNoteInterceptNoCorrelated', 'expansionNoteInterceptNoCorrelated', 'expansionPlusNoteInterceptNoCorrelated', 'groupNoteInterceptNoCorrelated', 'multiGroupNoteInterceptNoCorrelated', 'topicNoteInterceptNoCorrelated', 'gaussianNoteIntercept', 'gaussianNoteFactor1', 'gaussianRatingStatus', 'gaussianActiveRules', 'gaussianNoteInterceptNoCorrelated', 'gaussianNoteInterceptNoHighVol', 'gaussianNoteInterceptPopulationSampled', 'gaussianNumFinalRoundRatings']

Dtypes summary:
  float6

Unnamed: 0,noteId,coreNoteIntercept,coreNoteFactor1,finalRatingStatus,firstTag,secondTag,coreActiveRules,activeFilterTags,classification,createdAtMillis,...,multiGroupNoteInterceptNoCorrelated,topicNoteInterceptNoCorrelated,gaussianNoteIntercept,gaussianNoteFactor1,gaussianRatingStatus,gaussianActiveRules,gaussianNoteInterceptNoCorrelated,gaussianNoteInterceptNoHighVol,gaussianNoteInterceptPopulationSampled,gaussianNumFinalRoundRatings
0,1352796878438424576,,,NEEDS_MORE_RATINGS,,,,,,1611366884227,...,,,,,,,,,,
1,1353415873227177985,,,NEEDS_MORE_RATINGS,,,,,,1611514464087,...,,,,,,,,,,
2,1354586938863443971,,,NEEDS_MORE_RATINGS,,,InitialNMR (v1.0),,MISINFORMED_OR_POTENTIALLY_MISLEADING,1611793667892,...,,,,,NEEDS_MORE_RATINGS,InitialNMR (v1.0),,,,


Step 3.1: Prescoring

In [3]:
# Load the helpfulness scores table (tab-separated) and print its columns and shape.
helpfulness_scores = pd.read_csv("processed_data/helpfulness_scores.tsv", sep="\t", low_memory=False)
print(helpfulness_scores.columns)
print(helpfulness_scores.shape)

Index(['raterParticipantId', 'coreRaterIntercept', 'coreRaterFactor1',
       'crhCrnhRatioDifference', 'meanNoteScore', 'raterAgreeRatio',
       'successfulRatingHelpfulCount', 'successfulRatingNotHelpfulCount',
       'successfulRatingTotal', 'unsuccessfulRatingHelpfulCount',
       'unsuccessfulRatingNotHelpfulCount', 'unsuccessfulRatingTotal',
       'ratingsAwaitingMoreRatings', 'ratedAfterDecision',
       'notesCurrentlyRatedHelpful', 'notesCurrentlyRatedNotHelpful',
       'notesAwaitingMoreRatings', 'enrollmentState',
       'successfulRatingNeededToEarnIn', 'authorTopNotHelpfulTagValues',
       'timestampOfLastStateChange', 'aboveHelpfulnessThreshold',
       'isEmergingWriter', 'aggregateRatingReceivedTotal',
       'timestampOfLastEarnOut', 'groupRaterIntercept', 'groupRaterFactor1',
       'modelingGroup', 'raterHelpfulnessReputation', 'numberOfTimesEarnedOut',
       'expansionRaterIntercept', 'expansionRaterFactor1',
       'expansionPlusRaterIntercept', 'expansionPlus

In [4]:
# Keep only raters with at least 10 successful ratings.
if "successfulRatingTotal" not in helpfulness_scores.columns:
    # Only successfulRatingTotal is used below, so the message names just that column.
    raise ValueError("Can't find successfulRatingTotal in columns.")

# A missing total is treated as zero successful ratings.
# Note: this adds a "ratingCount" column to helpfulness_scores in place.
helpfulness_scores["ratingCount"] = helpfulness_scores["successfulRatingTotal"].fillna(0)
keep = helpfulness_scores["ratingCount"] >= 10

rater_with_10 = helpfulness_scores[keep].copy()

print("Original raters:", len(helpfulness_scores))
print("Kept raters:", len(rater_with_10))

Original raters: 615000
Kept raters: 1534


In [5]:
# Reload the ratings subset saved earlier. The file is tab-separated despite
# its .csv extension, hence sep="\t".
ratings = pd.read_csv("processed_data/ratings_subset.csv", sep="\t", low_memory=False)
print(ratings.columns)
print(ratings.shape)

Index(['noteId', 'raterParticipantId', 'createdAtMillis', 'version', 'agree',
       'disagree', 'helpful', 'notHelpful', 'helpfulnessLevel', 'helpfulOther',
       'helpfulInformative', 'helpfulClear', 'helpfulEmpathetic',
       'helpfulGoodSources', 'helpfulUniqueContext', 'helpfulAddressesClaim',
       'helpfulImportantContext', 'helpfulUnbiasedLanguage', 'notHelpfulOther',
       'notHelpfulIncorrect', 'notHelpfulSourcesMissingOrUnreliable',
       'notHelpfulOpinionSpeculationOrBias', 'notHelpfulMissingKeyPoints',
       'notHelpfulOutdated', 'notHelpfulHardToUnderstand',
       'notHelpfulArgumentativeOrBiased', 'notHelpfulOffTopic',
       'notHelpfulSpamHarassmentOrAbuse', 'notHelpfulIrrelevantSources',
       'notHelpfulOpinionSpeculation', 'notHelpfulNoteNotNeeded',
       'ratedOnTweetId', 'ratingSourceBucketed'],
      dtype='object')
(4020238, 33)


In [6]:
# Discard rating rows that lack a note id or a rater id.
ratings = ratings.dropna(subset=["noteId", "raterParticipantId"])

# Number of distinct raters behind each note.
rater_counts = ratings.groupby("noteId")["raterParticipantId"].nunique()

# Ids of the notes rated by at least 5 distinct raters ...
keep_note_ids = rater_counts.index[rater_counts.ge(5).to_numpy()]
# ... and the rating rows that belong to those notes.
ratings_filt = ratings.loc[ratings["noteId"].isin(keep_note_ids)].copy()

# "Original" figures are taken after the dropna above.
print("Original ratings rows:", len(ratings))
print("Original unique notes:", ratings["noteId"].nunique())
print("Kept unique notes:", len(keep_note_ids))
print("Kept ratings rows:", len(ratings_filt))

Original ratings rows: 4020238
Original unique notes: 44497
Kept unique notes: 36620
Kept ratings rows: 4002286


In [7]:
# Restrict the note-filtered ratings to raters that passed the rating-count filter.
ratings_final = ratings_filt.loc[
    ratings_filt["raterParticipantId"].isin(rater_with_10["raterParticipantId"])
].copy()

print("=== Final filtered ratings (note filter + rater filter) ===")
print("Rows:", ratings_final.shape[0])
print("Unique notes:", ratings_final["noteId"].nunique())
print("Unique raters:", ratings_final["raterParticipantId"].nunique())

=== Final filtered ratings (note filter + rater filter) ===
Rows: 281877
Unique notes: 28853
Unique raters: 1534


In [2]:
# Load the aux_note_info table (tab-separated) and print its columns and shape.
aux_note_info = pd.read_csv("processed_data/aux_note_info.tsv", sep="\t", low_memory=False)
print(aux_note_info.columns)
print(aux_note_info.shape)

Index(['noteId', 'ratingWeight', 'createdAtMillis', 'noteAuthorParticipantId',
       'awaitingBool', 'numRatingsLast28', 'numPopulationSampledRatings',
       'currentStatus', 'crhBool', 'crnhBool', 'unlockedRatingStatus',
       'preStabilizationRatingStatus', 'helpfulOther', 'helpfulInformative',
       'helpfulClear', 'helpfulEmpathetic', 'helpfulGoodSources',
       'helpfulUniqueContext', 'helpfulAddressesClaim',
       'helpfulImportantContext', 'helpfulUnbiasedLanguage', 'notHelpfulOther',
       'notHelpfulIncorrect', 'notHelpfulSourcesMissingOrUnreliable',
       'notHelpfulOpinionSpeculationOrBias', 'notHelpfulMissingKeyPoints',
       'notHelpfulOutdated', 'notHelpfulHardToUnderstand',
       'notHelpfulArgumentativeOrBiased', 'notHelpfulOffTopic',
       'notHelpfulSpamHarassmentOrAbuse', 'notHelpfulIrrelevantSources',
       'notHelpfulOpinionSpeculation', 'notHelpfulNoteNotNeeded',
       'notHelpfulOtherAdjusted', 'notHelpfulIncorrectAdjusted',
       'notHelpfulSources

In [3]:
# Load the note_status_history table (tab-separated) and print its columns and shape.
note_status_history = pd.read_csv("processed_data/note_status_history.tsv", sep="\t", low_memory=False)
print(note_status_history.columns)
print(note_status_history.shape)

Index(['noteId', 'noteAuthorParticipantId', 'createdAtMillis',
       'timestampMillisOfFirstNonNMRStatus', 'firstNonNMRStatus',
       'timestampMillisOfCurrentStatus', 'currentStatus',
       'timestampMillisOfLatestNonNMRStatus', 'mostRecentNonNMRStatus',
       'timestampMillisOfStatusLock', 'lockedStatus',
       'timestampMillisOfRetroLock', 'currentCoreStatus',
       'currentExpansionStatus', 'currentGroupStatus', 'currentDecidedBy',
       'currentModelingGroup', 'timestampMillisOfMostRecentStatusChange',
       'timestampMillisOfNmrDueToMinStableCrhTime', 'currentMultiGroupStatus',
       'currentModelingMultiGroup', 'timestampMinuteOfFinalScoringOutput',
       'timestampMillisOfFirstNmrDueToMinStableCrhTime'],
      dtype='str')
(2543490, 23)


In [4]:
# Load the scored_notes table (tab-separated) and print its columns and shape.
scored_notes = pd.read_csv("processed_data/scored_notes.tsv", sep="\t", low_memory=False)
print(scored_notes.columns)
print(scored_notes.shape)

Index(['noteId', 'coreNoteIntercept', 'coreNoteFactor1', 'finalRatingStatus',
       'firstTag', 'secondTag', 'coreActiveRules', 'activeFilterTags',
       'classification', 'createdAtMillis', 'coreRatingStatus',
       'metaScorerActiveRules', 'decidedBy', 'expansionNoteIntercept',
       'expansionNoteFactor1', 'expansionRatingStatus',
       'coverageNoteIntercept', 'coverageNoteFactor1', 'coverageRatingStatus',
       'coreNoteInterceptMin', 'coreNoteInterceptMax',
       'expansionNoteInterceptMin', 'expansionNoteInterceptMax',
       'coverageNoteInterceptMin', 'coverageNoteInterceptMax',
       'groupNoteIntercept', 'groupNoteFactor1', 'groupRatingStatus',
       'groupNoteInterceptMax', 'groupNoteInterceptMin', 'modelingGroup',
       'numRatings', 'timestampMillisOfCurrentStatus',
       'expansionPlusNoteIntercept', 'expansionPlusNoteFactor1',
       'expansionPlusRatingStatus', 'topicNoteIntercept', 'topicNoteFactor1',
       'topicRatingStatus', 'noteTopic', 'topicNoteConfi