In [5]:
import os, duckdb

# --- Paths ---
# The data directories default to the original local checkout. Set the
# CN_AGGREGATOR_DATA_DIR / CN_USER_DATA_DIR environment variables to run this
# notebook on another machine without editing the cell.
base_aggregator = os.environ.get(
    "CN_AGGREGATOR_DATA_DIR",
    r"C:\Users\wongb\twitter-community-notes-time-series\twitter-community-notes-user-time-series\aggregator\data",
)
base_user_data = os.environ.get(
    "CN_USER_DATA_DIR",
    r"C:\Users\wongb\twitter-community-notes-time-series\twitter-community-notes-user-time-series\user_data_aggregating\data",
)

# Inputs: the per-user/per-period master table, the raw notes file, and the
# scored-notes file that is joined to the notes on noteId.
master_path = os.path.join(base_aggregator, "user_period_master_clean.parquet")
notes_path = os.path.join(base_aggregator, "notes-00000.parquet")
scored_notes_path = os.path.join(base_user_data, "scored_notes_2dim.parquet")
# Output: the master table with the authored-note score columns appended.
output_path = os.path.join(base_aggregator, "user_period_master_complete_with_authored_scores.parquet")

print("üöÄ Enriching master dataset with statistics of notes authored by users...\n")

def _sql_literal(path):
    """Return `path` as a single-quoted SQL string literal.

    Embedded single quotes are doubled so that a path containing an
    apostrophe cannot terminate the literal early and break the statement.
    """
    return "'" + str(path).replace("'", "''") + "'"


# Columns appended to the master table. This list is the single source for
# both the final SELECT list and the summary printed at the end; the names
# must match the aliases defined in the biweekly_authored_stats CTE below.
new_cols = [
    "authored_notes_with_intercept", "authored_notes_with_factor1",
    "avg_authored_note_intercept", "min_authored_note_intercept", "max_authored_note_intercept", "stddev_authored_note_intercept",
    "avg_authored_note_factor1", "min_authored_note_factor1", "max_authored_note_factor1", "stddev_authored_note_factor1"
]
authored_select = ",\n        ".join(f"a.{name}" for name in new_cols)

master_sql = _sql_literal(master_path)
notes_sql = _sql_literal(notes_path)
scored_notes_sql = _sql_literal(scored_notes_path)
output_sql = _sql_literal(output_path)

# Create enrichment query.
# Pipeline: (1) attach scores to notes created on/after 2023-01-01,
# (2) aggregate them per author into 14-day buckets anchored at 2023-01-01,
# (3) left-join the aggregates onto the master table by (userId, period_start)
# and write the result to output_path as parquet.
#
# NOTE(review): to_timestamp() returns TIMESTAMP WITH TIME ZONE in current
# DuckDB releases, so the 2023-01-01 cutoff and the bucket boundaries may
# follow the session time zone - confirm this matches how the master table's
# period_start was derived.
# NOTE(review): the notes/scores join presumably relies on noteId being unique
# in the scored-notes file; duplicates there would inflate the counts - verify.
query = f"""
COPY (
    WITH notes_with_scores AS (
        -- Join notes with their scores
        SELECT 
            n.noteId,
            n.noteAuthorParticipantId,
            n.createdAtMillis,
            to_timestamp(n.createdAtMillis / 1000) AS createdAt,
            s.coreNoteIntercept,
            s.coreNoteFactor1
        FROM read_parquet({notes_sql}) n
        LEFT JOIN read_parquet({scored_notes_sql}) s ON n.noteId = s.noteId
        WHERE n.createdAtMillis IS NOT NULL
          AND to_timestamp(n.createdAtMillis / 1000) >= '2023-01-01'::TIMESTAMP
    ),
    biweekly_authored_stats AS (
        -- Calculate biweekly stats for notes authored by each user
        SELECT
            noteAuthorParticipantId AS userId,
            -- Create 2-week periods starting from 2023-01-01
            DATE '2023-01-01' + (FLOOR(EPOCH(createdAt - TIMESTAMP '2023-01-01') / (14 * 86400)) * 14) * INTERVAL '1 day' AS period_start,
            
            -- Count of notes with scores (not total count since that already exists)
            COUNT(coreNoteIntercept) AS authored_notes_with_intercept,
            COUNT(coreNoteFactor1) AS authored_notes_with_factor1,
            
            -- Intercept statistics for authored notes
            ROUND(AVG(coreNoteIntercept), 4) AS avg_authored_note_intercept,
            ROUND(MIN(coreNoteIntercept), 4) AS min_authored_note_intercept,
            ROUND(MAX(coreNoteIntercept), 4) AS max_authored_note_intercept,
            ROUND(STDDEV(coreNoteIntercept), 4) AS stddev_authored_note_intercept,
            
            -- Factor1 statistics for authored notes
            ROUND(AVG(coreNoteFactor1), 4) AS avg_authored_note_factor1,
            ROUND(MIN(coreNoteFactor1), 4) AS min_authored_note_factor1,
            ROUND(MAX(coreNoteFactor1), 4) AS max_authored_note_factor1,
            ROUND(STDDEV(coreNoteFactor1), 4) AS stddev_authored_note_factor1
            
        FROM notes_with_scores
        GROUP BY noteAuthorParticipantId, period_start
    )
    SELECT 
        m.*,
        -- Add authored note statistics (excluding total_authored_notes since it already exists as total_notes_authored)
        {authored_select}
        
    FROM read_parquet({master_sql}) m
    LEFT JOIN biweekly_authored_stats a 
        ON m.userId = a.userId 
        AND m.period_start = a.period_start
    ORDER BY m.userId, m.period_start
) TO {output_sql} (FORMAT PARQUET);
"""

# Coverage summary read back from the file that was just written.
stats_query = f"""
    SELECT 
        COUNT(*) as total_user_periods,
        COUNT(DISTINCT userId) as unique_users,
        COUNT(total_notes_authored) as periods_with_notes_authored_original,
        COUNT(avg_authored_note_intercept) as periods_with_authored_intercept_stats,
        COUNT(avg_authored_note_factor1) as periods_with_authored_factor1_stats,
        ROUND(COUNT(avg_authored_note_intercept) * 100.0 / NULLIF(COUNT(total_notes_authored), 0), 1) as pct_authored_with_scores
    FROM {output_sql}
"""

# A few rows that actually received authored-note statistics.
preview_query = f"""
    SELECT 
        userId,
        period_start,
        total_notes_authored,
        authored_notes_with_intercept,
        avg_authored_note_intercept,
        min_authored_note_intercept,
        max_authored_note_intercept,
        stddev_authored_note_intercept,
        avg_authored_note_factor1,
        stddev_authored_note_factor1
    FROM {output_sql}
    WHERE avg_authored_note_intercept IS NOT NULL
    LIMIT 10
"""

con = duckdb.connect()
try:
    con.execute("PRAGMA memory_limit='8GB';")

    print("Executing enrichment query...")
    con.execute(query)

    print(f"\n‚úÖ Done! Enriched master dataset saved to:\n{output_path}")

    # Verification stats
    print("\nüìä Enrichment Verification:")
    stats = con.execute(stats_query).fetchdf()
    print(stats.to_string(index=False))

    # Get column count comparison
    original_schema = con.execute(f"DESCRIBE SELECT * FROM {master_sql}").fetchdf()
    enriched_schema = con.execute(f"DESCRIBE SELECT * FROM {output_sql}").fetchdf()

    print("\nüìã Schema Comparison:")
    print(f"Original columns: {len(original_schema)}")
    print(f"Enriched columns: {len(enriched_schema)}")
    print(f"Added: {len(enriched_schema) - len(original_schema)} new authored note score columns")

    # Preview enriched data
    print("\nüìù Preview of enriched dataset (users with authored note scores):")
    preview = con.execute(preview_query).fetchdf()
    print(preview.to_string(index=False))

    # Show the new columns added
    print("\nüìå New columns added:")
    for col in new_cols:
        print(f"  ‚Ä¢ {col}")
finally:
    # Release the connection even when a query fails, so a failed run does not
    # leave an open DuckDB handle behind in the kernel.
    con.close()

print("\n‚úÖ Master dataset enrichment with authored note scores complete!")
print(f"üìÅ Final enriched file: {output_path}")
print("üìù Note: Using existing 'total_notes_authored' column instead of creating duplicate")

üöÄ Enriching master dataset with statistics of notes authored by users...

Executing enrichment query...
Executing enrichment query...


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))


‚úÖ Done! Enriched master dataset saved to:
C:\Users\wongb\twitter-community-notes-time-series\twitter-community-notes-user-time-series\aggregator\data\user_period_master_complete_with_authored_scores.parquet

üìä Enrichment Verification:
 total_user_periods  unique_users  periods_with_notes_authored_original  periods_with_authored_intercept_stats  periods_with_authored_factor1_stats  pct_authored_with_scores
           21020118       1279178                               1030703                                 724627                               724627                      70.3

üìã Schema Comparison:
Original columns: 62
Enriched columns: 72
Added: 10 new authored note score columns

üìù Preview of enriched dataset (users with authored note scores):
                                                          userId period_start  total_notes_authored  authored_notes_with_intercept  avg_authored_note_intercept  min_authored_note_intercept  max_authored_note_intercept  stddev_authore

In [None]:
import os, duckdb

# Remove unique_tweets_requested column from enriched master dataset.
#
# The rewrite goes to a temporary file that is swapped in only after DuckDB has
# finished and released the source, so the query never reads from and writes to
# the same parquet file. The cell is also safe to re-run: when the column is
# already gone it reports that and leaves the file untouched.
#
# The data directory defaults to the original local checkout; set
# CN_AGGREGATOR_DATA_DIR to point somewhere else.
base_aggregator = os.environ.get(
    "CN_AGGREGATOR_DATA_DIR",
    r"C:\Users\wongb\twitter-community-notes-time-series\twitter-community-notes-user-time-series\aggregator\data",
)
file_path = os.path.join(base_aggregator, "user_period_master_complete_with_authored_scores.parquet")
tmp_path = file_path + ".rewrite_tmp"
column_to_drop = "unique_tweets_requested"

# Double embedded single quotes so the paths are valid SQL string literals.
file_sql = "'" + file_path.replace("'", "''") + "'"
tmp_sql = "'" + tmp_path.replace("'", "''") + "'"

print("üóëÔ∏è Removing unique_tweets_requested column...")

# Rewrite the file without the column, keeping the (userId, period_start) order.
query = f"""
COPY (
    SELECT * EXCLUDE ({column_to_drop})
    FROM read_parquet({file_sql})
    ORDER BY userId, period_start
) TO {tmp_sql} (FORMAT PARQUET);
"""

column_removed = False
con = duckdb.connect()
try:
    con.execute("PRAGMA memory_limit='8GB';")

    # DESCRIBE returns one row per column; the first field is the column name.
    existing_columns = {
        row[0]
        for row in con.execute(f"DESCRIBE SELECT * FROM read_parquet({file_sql})").fetchall()
    }
    if column_to_drop in existing_columns:
        con.execute(query)
        column_removed = True
finally:
    con.close()
    # Do not leave a partial or stale temporary file behind.
    if not column_removed and os.path.exists(tmp_path):
        os.remove(tmp_path)

if column_removed:
    # Swap the rewritten file in only after the connection has been closed.
    os.replace(tmp_path, file_path)
    print(f"‚úÖ Removed unique_tweets_requested column from {file_path}")
else:
    print(f"unique_tweets_requested column not present in {file_path}; nothing to remove")