# Reddit API Data Ingestion Notebook
--------------------------------------------------------------------------------
## Description

This notebook collects recent Reddit posts from popular cryptocurrency subreddits using the Reddit API. It keeps only posts whose titles contain relevant crypto keywords, structures the data, and writes the results as JSON files to a raw landing volume for downstream processing.

--------------------------------------------------------------------------------

### Key Features:

-  Uses the `praw` library to access Reddit's API  
-  Filters posts by matching their titles against a configurable list of crypto-related keywords  
-  Targets a curated list of popular crypto subreddits  
-  Normalizes keywords for consistent matching  
-  Writes raw JSON files to Databricks Volumes with timestamp-based unique filenames  
-  Designed as the upstream source for the Reddit Sentiment DLT pipeline  

In [0]:
# --------------------------------------------------------------------------------
# Install Required Libraries
# --------------------------------------------------------------------------------
# Install PRAW, the Reddit API client that the next cell imports as `praw`.
%pip install praw
# Restart the Python process so the freshly installed package can be imported.
# NOTE(review): a restart presumably discards Python state defined earlier in the
# session, so this cell should stay first in the notebook — confirm.
dbutils.library.restartPython()

In [0]:
# --------------------------------------------------------------------------------
# Import Libraries
# --------------------------------------------------------------------------------
import re
from datetime import datetime, timezone

import praw
from pyspark.sql.types import StructType, StructField, StringType, TimestampType


# --------------------------------------------------------------------------------
# Initialize Reddit API Client
# --------------------------------------------------------------------------------
# Note: For production, store these credentials securely using Databricks secrets.
# NOTE(review): the client ID and secret below are stored in plain text in this
# notebook. Anyone with read access to the notebook (or its history) can use them;
# consider rotating them and loading the replacements via
# dbutils.secrets.get(scope=..., key=...) instead of string literals.

client_id = "zpQTW6qW_vFnhZJxgqU3BA"
client_secret = "EUEH0gvL7hErfu890AQ2aRJA-cdHFA"
# Free-text identifier passed to praw.Reddit as the user agent.
user_agent = "Crypto Sentiment Tracker"

# Only application credentials are passed (no username/password), so this is
# presumably a read-only client — TODO confirm against the PRAW documentation.
reddit = praw.Reddit(client_id=client_id, client_secret=client_secret, user_agent=user_agent)


# --------------------------------------------------------------------------------
# Notebook Parameter: Post Limit Per Subreddit (Define notebook widget)
# --------------------------------------------------------------------------------
# Register a text widget with a default of "500", then read back its current
# value. The value is converted to an integer so it can be used as the
# per-subreddit fetch limit.
_limit_widget_name = "limit_reddit"
dbutils.widgets.text(
    _limit_widget_name,
    "500",
    "Limit per subreddit",
)
limit = int(dbutils.widgets.get(_limit_widget_name))



# --------------------------------------------------------------------------------
# Define Keywords and Target Subreddits
# --------------------------------------------------------------------------------
# One row per coin: full name followed by its ticker symbol. Every entry is
# lowercased so it can be compared against lowercased post titles.
keywords = list(map(str.lower, (
    "bitcoin", "btc",
    "ethereum", "eth",
    "binance", "bnb",
    "solana", "sol",
    "ripple", "xrp",
    "dogecoin", "doge",
    "cardano", "ada",
    "polkadot", "dot",
)))

# Subreddits to scan, in the order they are visited.
subreddits = [
    "CryptoCurrency",
    "Bitcoin",
    "CryptoMarkets",
    "Ethereum",
    "Dogecoin",
    "Altcoin",
    "DeFi",
    "BitcoinBeginners",
    "NFT",
    "CryptoMoonShots",
]



# --------------------------------------------------------------------------------
# Collect Reddit Posts Matching Keywords
# --------------------------------------------------------------------------------
# Keywords are matched as whole words (with an optional plural "s") rather than as
# raw substrings. Short tickers such as "eth", "sol", "ada" and "dot" would
# otherwise match unrelated words like "method", "solution", "canada" and
# "anecdotal", polluting the downstream sentiment data.
# Patterns are compiled once, outside the fetch loops, and kept in `keywords`
# order so the first listed keyword still wins when a title mentions several.
keyword_patterns = [
    (kw, re.compile(rf"\b{re.escape(kw)}s?\b"))
    for kw in keywords
]

posts = []
for sub in subreddits:
    # Walk the subreddit's newest submissions, capped at `limit` per subreddit.
    for submission in reddit.subreddit(sub).new(limit=limit):
        title_lower = submission.title.lower()
        # Single scan: first keyword (in list order) found in the title, else None.
        matched_kw = next(
            (kw for kw, pattern in keyword_patterns if pattern.search(title_lower)),
            None,
        )
        if matched_kw is None:
            continue  # title mentions none of the tracked keywords

        # Tuple order must match the DataFrame schema:
        # (post_id, text, created_utc, subreddit, keyword).
        posts.append((
            submission.id,
            submission.title,
            # Timezone-aware UTC datetime: utcfromtimestamp() is deprecated and
            # returns a naive value that Spark would interpret in the driver's
            # local timezone.
            datetime.fromtimestamp(submission.created_utc, tz=timezone.utc),
            sub,
            matched_kw,
        ))



# --------------------------------------------------------------------------------
# Create Spark DataFrame from Collected Posts
# --------------------------------------------------------------------------------
# Explicit schema for the collected tuples. Field order follows the order in
# which values were appended to `posts`.
schema = StructType([
    StructField(column_name, column_type)
    for column_name, column_type in (
        ("post_id", StringType()),
        ("text", StringType()),
        ("created_utc", TimestampType()),
        ("subreddit", StringType()),
        ("keyword", StringType()),
    )
])

# Passing the schema explicitly avoids type inference on the Python tuples.
spark_df = spark.createDataFrame(posts, schema=schema)



# --------------------------------------------------------------------------------
# Write Data to Raw Landing Volume (JSON Format)
# --------------------------------------------------------------------------------
# The output name embeds the current UTC time (second resolution) so each run
# lands at its own path. Spark treats `full_path` as a directory and writes the
# JSON part files inside it, despite the ".json" suffix.

volume_path = "/Volumes/tabular/dataexpert/sakethg/capstone/raw/reddit_sentiment"
# datetime.utcnow() is deprecated since Python 3.12; an aware "now" in UTC
# formats to exactly the same string.
timestamp_str = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
unique_filename = f"reddit_sentiment_{timestamp_str}.json"
full_path = f"{volume_path}/{unique_filename}"

# "overwrite" replaces anything already at full_path, which can only happen when
# two runs start within the same second.
spark_df.write.mode("overwrite").json(full_path)

print(f"Reddit sentiment data written to {full_path}")