# Read Cleaned Delta Tables from S3

This notebook reads cleaned reddit data from the `{subreddit}_clean` Delta tables stored on S3 using Polars and deltalake.

In [None]:
! uv pip install polars deltalake

In [1]:
import os
import json
import yaml
import polars as pl
from deltalake import DeltaTable
from datetime import datetime

# Function to find files in the workspace
def find_file(filename, search_paths):
    """Search for a file in specified paths and parent directories"""
    # Check specific paths first
    for path in search_paths:
        if os.path.exists(path):
            return path
    
    # Walk up from CWD
    curr = os.getcwd()
    while True:
        f = os.path.join(curr, filename)
        if os.path.exists(f):
            return f
        parent = os.path.dirname(curr)
        if parent == curr:
            break
        curr = parent
    return None

def read_config_creds():
    """Read credentials and configuration files"""
    print(f"Current working directory: {os.getcwd()}")
    
    # Creds paths to search
    creds_paths = [
        "creds.json",
        "redditStreaming/creds.json",
        "/home/steven/reddit-streaming/creds.json",
        "/opt/workspace/creds.json",
    ]
    
    creds_file = find_file("creds.json", creds_paths)
    if not creds_file:
         raise FileNotFoundError("Could not find creds.json")
    
    print(f"Found credentials at: {creds_file}")
    with open(creds_file, "r") as f:
        creds = json.load(f)

    # Config paths to search
    config_paths = [
        "config.yaml",
        "redditStreaming/config.yaml",
        "/home/steven/reddit-streaming/config.yaml",
        "/opt/workspace/config.yaml"
    ]
    
    config_file = find_file("config.yaml", config_paths)
    if not config_file:
         raise FileNotFoundError("Could not find config.yaml")
         
    print(f"Found config at: {config_file}")
    with open(config_file, "r") as f:
        config = yaml.safe_load(f)
        
    return creds, config

# Load configuration and credentials
creds, config = read_config_creds()

# Extract values
aws_client = creds.get("aws_client")
aws_secret = creds.get("aws_secret")
subreddits = config.get("subreddit", [])
bucket_name = "reddit-streaming-stevenhurwitt-2" 

print(f"Subreddits to process: {subreddits}")
print(f"S3 Bucket: {bucket_name}")

Current working directory: /home/steven/reddit-streaming/redditStreaming/src/notebooks
Found credentials at: /home/steven/reddit-streaming/creds.json
Found config at: config.yaml
Subreddits to process: ['technology', 'ProgrammerHumor', 'news', 'worldnews']
S3 Bucket: reddit-streaming-stevenhurwitt-2


## Configure AWS for Delta Lake

In [2]:
# Configure AWS credentials for Delta Lake S3 access
storage_options = {
    "AWS_ACCESS_KEY_ID": aws_client,
    "AWS_SECRET_ACCESS_KEY": aws_secret,
    "AWS_REGION": "us-east-2"  # Adjust if your bucket is in a different region
}

print("AWS credentials configured for Delta Lake.")

AWS credentials configured for Delta Lake.


## Read Cleaned Delta Tables from S3

Read each `{subreddit}_clean` delta table from S3

In [3]:
# Read cleaned Delta tables for each subreddit
dfs = {}

for sub in subreddits:
    path = f"s3://{bucket_name}/{sub}_clean"
    print(f"Reading from: {path}")
    try:
        # Read Delta table using deltalake and convert to Polars
        dt = DeltaTable(path, storage_options=storage_options)
        df = dt.to_pyarrow_table()
        df = pl.from_arrow(df)
        dfs[sub] = df
        count = len(df)
        print(f"✓ Successfully read {sub}_clean: {count:,} records")
    except Exception as e:
        print(f"✗ Error reading {sub}_clean: {str(e)}")

print(f"\nTotal tables loaded: {len(dfs)}")

Reading from: s3://reddit-streaming-stevenhurwitt-2/technology_clean
✓ Successfully read technology_clean: 3,105 records
Reading from: s3://reddit-streaming-stevenhurwitt-2/ProgrammerHumor_clean
✓ Successfully read ProgrammerHumor_clean: 2,222 records
Reading from: s3://reddit-streaming-stevenhurwitt-2/news_clean
✓ Successfully read news_clean: 1,981 records
Reading from: s3://reddit-streaming-stevenhurwitt-2/worldnews_clean
✓ Successfully read worldnews_clean: 4,528 records

Total tables loaded: 4


## Display Individual Subreddit Data

View data from each subreddit separately

In [4]:
# Display sample data from each subreddit
for sub, df in dfs.items():
    print(f"\n{'='*60}")
    print(f"Subreddit: r/{sub}")
    print(f"{'='*60}")
    
    # Add human-readable timestamp
    display_df = df.with_columns(
        pl.from_epoch("created_utc", time_unit="s").alias("created_time")
    )
    
    # Show schema
    print("\nSchema:")
    print(display_df.schema)
    
    # Show sample records (most recent)

    print(f"\nSample records (most recent):")    
    print(display_df.sort("created_utc", descending=True).head(10))


Subreddit: r/technology

Schema:
Schema({'approved_at_utc': Datetime(time_unit='us', time_zone=None), 'subreddit': String, 'selftext': String, 'author_fullname': String, 'saved': Boolean, 'mod_reason_title': String, 'gilded': Int32, 'clicked': Boolean, 'title': String, 'subreddit_name_prefixed': String, 'hidden': Boolean, 'pwls': Int32, 'link_flair_css_class': String, 'downs': Int32, 'thumbnail_height': Int32, 'top_awarded_type': String, 'hide_score': Boolean, 'name': String, 'quarantine': Boolean, 'link_flair_text_color': String, 'upvote_ratio': Float32, 'author_flair_background_color': String, 'ups': Int32, 'total_awards_received': Int32, 'thumbnail_width': Int32, 'author_flair_template_id': String, 'is_original_content': Boolean, 'secure_media': String, 'is_reddit_media_domain': Boolean, 'is_meta': Boolean, 'category': String, 'link_flair_text': String, 'can_mod_post': Boolean, 'score': Int32, 'approved_by': String, 'is_created_from_ads_ui': Boolean, 'author_premium': Boolean, 'thu

## Union All Subreddits and Sort by Recency

Combine all subreddit data into a single DataFrame and sort by most recent posts

In [7]:
# Union all dataframes and sort by recency
if dfs:
    # Concatenate all dataframes
    all_dfs = list(dfs.values())
    combined_df = pl.concat(all_dfs)
    
    # Sort by created_utc (most recent first)
    sorted_df = combined_df.sort("created_utc", descending=True)
    
    # Add human-readable timestamp
    display_df = sorted_df.with_columns(
        pl.from_epoch("created_utc", time_unit="s").alias("created_time")
    )
    
    total_count = len(sorted_df)
    print(f"Total records across all subreddits: {total_count:,}")
    print(f"\nMost recent posts across all subreddits:")
    
    # Display relevant columns
    print(display_df.select([
            "subreddit", 
            "created_time", 
            "title", 
            "score", 
            "author",
            "num_comments"

        ]).head(50)    
    )
    
else:
    print("No data available from any subreddit.")

Total records across all subreddits: 11,836

Most recent posts across all subreddits:
shape: (50, 6)
┌────────────┬─────────────────┬──────────────────────┬───────┬─────────────────────┬──────────────┐
│ subreddit  ┆ created_time    ┆ title                ┆ score ┆ author              ┆ num_comments │
│ ---        ┆ ---             ┆ ---                  ┆ ---   ┆ ---                 ┆ ---          │
│ str        ┆ datetime[μs]    ┆ str                  ┆ i32   ┆ str                 ┆ i32          │
╞════════════╪═════════════════╪══════════════════════╪═══════╪═════════════════════╪══════════════╡
│ worldnews  ┆ +37055-07-19    ┆ Germany's Merz lands ┆ 1     ┆ Little-Chemical5006 ┆ 0            │
│            ┆ 21:58:03.044864 ┆ in Beijin…           ┆       ┆                     ┆              │
│ technology ┆ +37051-06-29    ┆ India’s AI boom      ┆ 1     ┆ Logical_Welder3467  ┆ 0            │
│            ┆ 10:24:43.044864 ┆ pushes firms t…      ┆       ┆                     ┆      

## Summary Statistics

View summary statistics for each subreddit

In [8]:
# Summary statistics by subreddit
if dfs:
    print("Summary by subreddit:")
    print(f"{'Subreddit':<20} {'Records':>15} {'Avg Score':>12} {'Avg Comments':>15}")
    print("="*65)
    
    for sub, df in dfs.items():
        count = len(df)
        avg_score = df.select(pl.col("score").mean()).item()
        avg_comments = df.select(pl.col("num_comments").mean()).item()
        
        print(f"{sub:<20} {count:>15,} {avg_score:>12.2f} {avg_comments:>15.2f}")

Summary by subreddit:
Subreddit                    Records    Avg Score    Avg Comments
technology                     3,105         2.31            0.36
ProgrammerHumor                2,222         3.53            0.33
news                           1,981        50.40            4.27
worldnews                      4,528         2.80            0.48


In [None]:
# Clean up (nothing needed for Polars)

## Post count by date/subreddit

In [14]:
display_df.group_by("subreddit", "date").agg([
    pl.len().alias("post_count"),
    pl.col("score").mean().alias("avg_score"),
    pl.col("num_comments").mean().alias("avg_comments")]).sort("date", descending=True).head(20)

subreddit,date,post_count,avg_score,avg_comments
str,date,u32,f64,f64
"""worldnews""",2026-02-25,16,1.3125,0.25
"""technology""",2026-02-25,12,1.25,0.333333
"""news""",2026-02-25,9,3.444444,0.555556
"""ProgrammerHumor""",2026-02-25,3,1.0,0.333333
"""technology""",2026-02-24,77,1.103896,0.064935
…,…,…,…,…
"""ProgrammerHumor""",2026-02-22,19,13.052632,1.052632
"""worldnews""",2026-02-21,71,2.816901,0.43662
"""technology""",2026-02-21,32,15.15625,2.40625
"""news""",2026-02-21,27,127.888889,7.814815
