# Read Cleaned Delta Tables from S3

This notebook reads cleaned reddit data from the `{subreddit}_clean` Delta tables stored on S3 using Polars and deltalake.

In [1]:
import os
import json
import yaml
import polars as pl
from deltalake import DeltaTable
from datetime import datetime

# Function to find files in the workspace
def find_file(filename, search_paths):
    """Search for a file in specified paths and parent directories"""
    # Check specific paths first
    for path in search_paths:
        if os.path.exists(path):
            return path
    
    # Walk up from CWD
    curr = os.getcwd()
    while True:
        f = os.path.join(curr, filename)
        if os.path.exists(f):
            return f
        parent = os.path.dirname(curr)
        if parent == curr:
            break
        curr = parent
    return None

def read_config_creds():
    """Read credentials and configuration files"""
    print(f"Current working directory: {os.getcwd()}")
    
    # Creds paths to search
    creds_paths = [
        "creds.json",
        "redditStreaming/creds.json",
        "/home/steven/reddit-streaming/creds.json",
        "/opt/workspace/creds.json",
    ]
    
    creds_file = find_file("creds.json", creds_paths)
    if not creds_file:
         raise FileNotFoundError("Could not find creds.json")
    
    print(f"Found credentials at: {creds_file}")
    with open(creds_file, "r") as f:
        creds = json.load(f)

    # Config paths to search
    config_paths = [
        "config.yaml",
        "redditStreaming/config.yaml",
        "/home/steven/reddit-streaming/config.yaml",
        "/opt/workspace/config.yaml"
    ]
    
    config_file = find_file("config.yaml", config_paths)
    if not config_file:
         raise FileNotFoundError("Could not find config.yaml")
         
    print(f"Found config at: {config_file}")
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
        
    return creds, config

# Load configuration and credentials
creds, config = read_config_creds()

# Extract values
aws_client = creds.get("aws_client")
aws_secret = creds.get("aws_secret")
subreddits = config.get("subreddit", [])
bucket_name = "reddit-streaming-stevenhurwitt-2" 

print(f"Subreddits to process: {subreddits}")
print(f"S3 Bucket: {bucket_name}")

Current working directory: /home/steven/reddit-streaming/redditStreaming/src/notebooks
Found credentials at: /home/steven/reddit-streaming/creds.json
Found config at: config.yaml
Subreddits to process: ['technology', 'ProgrammerHumor', 'news', 'worldnews']
S3 Bucket: reddit-streaming-stevenhurwitt-2


## Configure AWS for Delta Lake

In [2]:
# Configure AWS credentials for Delta Lake S3 access
storage_options = {
    "AWS_ACCESS_KEY_ID": aws_client,
    "AWS_SECRET_ACCESS_KEY": aws_secret,
    "AWS_REGION": "us-east-2"  # Adjust if your bucket is in a different region
}

print("AWS credentials configured for Delta Lake.")

AWS credentials configured for Delta Lake.


## Read Cleaned Delta Tables from S3

Read each `{subreddit}_clean` delta table from S3

In [3]:
# Read cleaned Delta tables for each subreddit
dfs = {}

for sub in subreddits:
    path = f"s3://{bucket_name}/{sub}_clean"
    print(f"Reading from: {path}")
    try:
        # Read Delta table using deltalake and convert to Polars
        dt = DeltaTable(path, storage_options=storage_options)
        df = dt.to_pyarrow_table()
        df = pl.from_arrow(df)
        dfs[sub] = df
        count = len(df)
        print(f"✓ Successfully read {sub}_clean: {count:,} records")
    except Exception as e:
        print(f"✗ Error reading {sub}_clean: {str(e)}")

print(f"\nTotal tables loaded: {len(dfs)}")

Reading from: s3://reddit-streaming-stevenhurwitt-2/technology_clean
✓ Successfully read technology_clean: 3,354 records
Reading from: s3://reddit-streaming-stevenhurwitt-2/ProgrammerHumor_clean
✓ Successfully read ProgrammerHumor_clean: 2,350 records
Reading from: s3://reddit-streaming-stevenhurwitt-2/news_clean
✓ Successfully read news_clean: 2,186 records
Reading from: s3://reddit-streaming-stevenhurwitt-2/worldnews_clean
✓ Successfully read worldnews_clean: 4,990 records

Total tables loaded: 4


## Display Individual Subreddit Data

View data from each subreddit separately

In [4]:
# Display sample data from each subreddit
for sub, df in dfs.items():
    print(f"\n{'='*60}")
    print(f"Subreddit: r/{sub}")
    print(f"{'='*60}")
    
    # Add human-readable timestamp
    display_df = df.with_columns(
        pl.from_epoch("created_utc", time_unit="s").alias("created_time")
    )
    
    # Show schema
    print("\nSchema:")
    print(display_df.schema)
    
    # Show sample records (most recent)

    print(f"\nSample records (most recent):")    
    print(display_df.sort("created_utc", descending=True).head(10))


Subreddit: r/technology

Schema:
Schema({'approved_at_utc': Datetime(time_unit='us', time_zone=None), 'subreddit': String, 'selftext': String, 'author_fullname': String, 'saved': Boolean, 'mod_reason_title': String, 'gilded': Int32, 'clicked': Boolean, 'title': String, 'subreddit_name_prefixed': String, 'hidden': Boolean, 'pwls': Int32, 'link_flair_css_class': String, 'downs': Int32, 'thumbnail_height': Int32, 'top_awarded_type': String, 'hide_score': Boolean, 'name': String, 'quarantine': Boolean, 'link_flair_text_color': String, 'upvote_ratio': Float32, 'author_flair_background_color': String, 'ups': Int32, 'total_awards_received': Int32, 'thumbnail_width': Int32, 'author_flair_template_id': String, 'is_original_content': Boolean, 'secure_media': String, 'is_reddit_media_domain': Boolean, 'is_meta': Boolean, 'category': String, 'link_flair_text': String, 'can_mod_post': Boolean, 'score': Int32, 'approved_by': String, 'is_created_from_ads_ui': Boolean, 'author_premium': Boolean, 'thu

## Union All Subreddits and Sort by Recency

Combine all subreddit data into a single DataFrame and sort by most recent posts

In [5]:
# Union all dataframes and sort by recency
if dfs:
    # Concatenate all dataframes
    all_dfs = list(dfs.values())
    combined_df = pl.concat(all_dfs)
    
    # Sort by created_utc (most recent first)
    sorted_df = combined_df.sort("created_utc", descending=True)
    
    # Add human-readable timestamp
    display_df = sorted_df.with_columns(
        pl.from_epoch("created_utc", time_unit="s").alias("created_time")
    )
    
    total_count = len(sorted_df)
    print(f"Total records across all subreddits: {total_count:,}")
    print(f"\nMost recent posts across all subreddits:")
    
    # Display relevant columns
    print(display_df.select([
            "subreddit", 
            "created_time", 
            "title", 
            "score", 
            "author",
            "num_comments"

        ]).head(50)    
    )
    
else:
    print("No data available from any subreddit.")

Total records across all subreddits: 12,880

Most recent posts across all subreddits:
shape: (50, 6)
┌────────────┬─────────────────┬──────────────────────┬───────┬─────────────────────┬──────────────┐
│ subreddit  ┆ created_time    ┆ title                ┆ score ┆ author              ┆ num_comments │
│ ---        ┆ ---             ┆ ---                  ┆ ---   ┆ ---                 ┆ ---          │
│ str        ┆ datetime[μs]    ┆ str                  ┆ i32   ┆ str                 ┆ i32          │
╞════════════╪═════════════════╪══════════════════════╪═══════╪═════════════════════╪══════════════╡
│ news       ┆ +48007-03-05    ┆ How the world has    ┆ 4     ┆ GroundbreakingArm17 ┆ 0            │
│            ┆ 21:58:03.044864 ┆ reacted to U…        ┆       ┆ 3                   ┆              │
│ worldnews  ┆ +48003-02-13    ┆ How the world has    ┆ 1     ┆ GroundbreakingArm17 ┆ 1            │
│            ┆ 10:24:43.044864 ┆ reacted to U…        ┆       ┆ 3                   ┆      

## Summary Statistics

View summary statistics for each subreddit

In [6]:
# Summary statistics by subreddit
if dfs:
    print("Summary by subreddit:")
    print(f"{'Subreddit':<20} {'Records':>15} {'Avg Score':>12} {'Avg Comments':>15}")
    print("="*65)
    
    for sub, df in dfs.items():
        count = len(df)
        avg_score = df.select(pl.col("score").mean()).item()
        avg_comments = df.select(pl.col("num_comments").mean()).item()
        
        print(f"{sub:<20} {count:>15,} {avg_score:>12.2f} {avg_comments:>15.2f}")

Summary by subreddit:
Subreddit                    Records    Avg Score    Avg Comments
technology                     3,354         2.42            0.37
ProgrammerHumor                2,350         4.36            0.37
news                           2,186        55.11            5.11
worldnews                      4,990         5.70            1.15


In [None]:
# Clean up (nothing needed for Polars)

## Post count by date/subreddit

In [7]:
display_df.group_by("subreddit", "date").agg([
    pl.len().alias("post_count")]).sort("date", descending=True).head(20)

subreddit,date,post_count
str,date,u32
"""news""",2026-03-01,15
"""technology""",2026-03-01,3
"""ProgrammerHumor""",2026-03-01,3
"""worldnews""",2026-03-01,20
"""technology""",2026-02-28,59
…,…,…
"""ProgrammerHumor""",2026-02-26,31
"""news""",2026-02-25,32
"""technology""",2026-02-25,63
"""worldnews""",2026-02-25,95


In [8]:
display_df.head(10)

approved_at_utc,subreddit,selftext,author_fullname,saved,mod_reason_title,gilded,clicked,title,subreddit_name_prefixed,hidden,pwls,link_flair_css_class,downs,thumbnail_height,top_awarded_type,hide_score,name,quarantine,link_flair_text_color,upvote_ratio,author_flair_background_color,ups,total_awards_received,thumbnail_width,author_flair_template_id,is_original_content,secure_media,is_reddit_media_domain,is_meta,category,link_flair_text,can_mod_post,score,approved_by,is_created_from_ads_ui,author_premium,…,locked,author_flair_text,visited,removed_by,mod_note,distinguished,subreddit_id,author_is_blocked,mod_reason_by,num_reports,removal_reason,link_flair_background_color,id,is_robot_indexable,report_reasons,author,discussion_type,num_comments,send_replies,whitelist_status,contest_mode,author_patreon_flair,author_flair_text_color,permalink,parent_whitelist_status,stickied,url,subreddit_subscribers,created_utc,num_crossposts,media,is_video,date,year,month,day,created_time
datetime[μs],str,str,str,bool,str,i32,bool,str,str,bool,i32,str,i32,i32,str,bool,str,bool,str,f32,str,i32,i32,i32,str,bool,str,bool,bool,str,str,bool,i32,str,bool,bool,…,bool,str,bool,str,str,str,str,bool,str,i32,str,str,str,bool,str,str,str,i32,bool,str,bool,bool,str,str,str,bool,str,i32,datetime[μs],i32,str,bool,date,i32,i8,i8,datetime[μs]
,"""news""","""""","""t2_ei935xnv""",False,,0,False,"""How the world has reacted to U…","""r/news""",False,6,,0,,,True,"""t3_1rho3m9""",False,"""dark""",1.0,,4,0,,,False,,False,False,,,False,4,,False,False,…,False,,False,,,,"""t5_2qh3l""",False,,,,"""""","""1rho3m9""",True,,"""GroundbreakingArm173""",,0,True,,False,False,,"""/r/news/comments/1rho3m9/how_t…",,False,"""https://www.bbc.co.uk/news/art…",31212199,2026-03-01 04:43:44,0,,False,2026-03-01,2026,3,1,+48007-03-05 21:58:03.044864
,"""worldnews""","""""","""t2_ei935xnv""",False,,0,False,"""How the world has reacted to U…","""r/worldnews""",False,7,"""normal""",0,78.0,,True,"""t3_1rho1xf""",False,,0.67,,1,0,140.0,,False,,False,False,,"""Dynamic Paywall""",False,1,,False,False,…,False,,False,,,,"""t5_2qh13""",False,,,,,"""1rho1xf""",True,,"""GroundbreakingArm173""",,1,True,,False,False,,"""/r/worldnews/comments/1rho1xf/…",,False,"""https://www.bbc.co.uk/news/art…",47109817,2026-03-01 04:41:36,0,,False,2026-03-01,2026,3,1,+48003-02-13 10:24:43.044864
,"""worldnews""","""""","""t2_vgozecboq""",False,,0,False,"""South Korea president calls on…","""r/worldnews""",False,7,"""northkorea""",0,73.0,,True,"""t3_1rhnopa""",False,,1.0,,1,0,140.0,,False,,False,False,,"""North Korea""",False,1,,False,False,…,False,,False,,,,"""t5_2qh13""",False,,,,,"""1rhnopa""",True,,"""Little-Chemical5006""",,1,True,,False,False,,"""/r/worldnews/comments/1rhnopa/…",,False,"""https://www.reuters.com/world/…",47109724,2026-03-01 04:22:24,0,,False,2026-03-01,2026,3,1,+47966-08-13 02:24:43.044864
,"""worldnews""","""""","""t2_lakfqom6i""",False,,0,False,"""Blasts continue in Kabul amid …","""r/worldnews""",False,7,,0,78.0,,True,"""t3_1rhnmz6""",False,"""dark""",1.0,,1,0,140.0,,False,,False,False,,,False,1,,False,False,…,False,,False,,,,"""t5_2qh13""",False,,,,"""""","""1rhnmz6""",True,,"""Cybertronian1512""",,1,True,,False,False,,"""/r/worldnews/comments/1rhnmz6/…",,False,"""https://www.thehindu.com/news/…",47109718,2026-03-01 04:20:16,0,,False,2026-03-01,2026,3,1,+47962-07-23 14:51:23.044864
,"""worldnews""","""""","""t2_qhdi4nf35""",False,,0,False,"""State of Palestine strongly co…","""r/worldnews""",False,7,"""palestisrael""",0,,,True,"""t3_1rhnlou""",False,,1.0,,1,0,,,False,,False,False,,"""Israel/Palestine""",False,1,,False,False,…,False,,False,,,,"""t5_2qh13""",False,,,,,"""1rhnlou""",True,,"""PestoBolloElemento""",,0,True,,False,False,,"""/r/worldnews/comments/1rhnlou/…",,False,"""https://english.wafa.ps/Pages/…",47109711,2026-03-01 04:18:08,0,,False,2026-03-01,2026,3,1,+47958-07-03 03:18:03.044864
,"""news""","""""","""t2_evw04""",False,,0,False,"""Actor Shia LaBeouf arrested ag…","""r/news""",False,6,,0,,,True,"""t3_1rhnjwi""",False,"""dark""",1.0,,1,0,,,False,,False,False,,,False,1,,False,True,…,False,,False,,,,"""t5_2qh3l""",False,,,,"""""","""1rhnjwi""",True,,"""AudibleNod""",,0,False,,False,False,,"""/r/news/comments/1rhnjwi/actor…",,False,"""https://abcnews.com/Entertainm…",31211994,2026-03-01 04:16:00,0,,False,2026-03-01,2026,3,1,+47954-06-12 15:44:43.044864
,"""worldnews""","""""","""t2_jjlaph3r""",False,,0,False,"""US strikes on Iran trigger ren…","""r/worldnews""",False,7,,0,105.0,,True,"""t3_1rhnig1""",False,"""dark""",0.4,,0,0,140.0,,False,,False,False,,,False,0,,False,False,…,False,,False,,,,"""t5_2qh13""",False,,,,"""""","""1rhnig1""",True,,"""SuperXGamerAb""",,1,True,,False,False,,"""/r/worldnews/comments/1rhnig1/…",,False,"""https://www.aljazeera.com/news…",47109704,2026-03-01 04:13:52,0,,False,2026-03-01,2026,3,1,+47950-05-23 04:11:23.044864
,"""worldnews""","""""","""t2_3ggncgkz""",False,,0,False,"""Hundreds of thousands of trave…","""r/worldnews""",False,7,,0,73.0,,True,"""t3_1rhnbd5""",False,"""dark""",1.0,,1,0,140.0,,False,,False,False,,,False,1,,False,False,…,False,,False,,,,"""t5_2qh13""",False,,,,"""""","""1rhnbd5""",True,,"""papipota""",,0,True,,False,False,,"""/r/worldnews/comments/1rhnbd5/…",,False,"""https://www.theguardian.com/us…",47109660,2026-03-01 04:03:12,0,,False,2026-03-01,2026,3,1,+47930-02-09 18:24:43.044864
,"""technology""","""""","""t2_6w1qkho3y""",False,,0,False,"""Creepy ‘Marathon’ AI Enemies A…","""r/technology""",False,6,"""adblock""",0,78.0,,True,"""t3_1rhn4ky""",False,"""dark""",0.67,,1,0,140.0,,False,,False,False,,"""ADBLOCK WARNING""",False,1,,False,False,…,False,,False,,,,"""t5_2qh16""",False,,,,"""""","""1rhn4ky""",True,,"""TylerFortier_Photo""",,1,True,,False,False,,"""/r/technology/comments/1rhn4ky…",,False,"""https://www.forbes.com/sites/p…",20159129,2026-03-01 03:54:40,0,,False,2026-03-01,2026,3,1,+47913-11-19 20:11:23.044864
,"""worldnews""","""""","""t2_qjhngrvf""",False,,0,False,"""209 billion barrels: Iran’s oi…","""r/worldnews""",False,7,,0,78.0,,True,"""t3_1rhn2sz""",False,"""dark""",1.0,,1,0,140.0,,False,,False,False,,,False,1,,False,False,…,False,,False,,,,"""t5_2qh13""",False,,,,"""""","""1rhn2sz""",True,,"""jupa300""",,1,True,,False,False,,"""/r/worldnews/comments/1rhn2sz/…",,False,"""https://www.businesstoday.in/w…",47109608,2026-03-01 03:50:24,0,,False,2026-03-01,2026,3,1,+47905-10-09 21:04:43.044864
