In [7]:
import praw
import pandas as pd
import time
import os
from pathlib import Path

In [5]:
# ==============================
# PRAW setup
# ==============================
# Initialize PRAW with your credentials
reddit = praw.Reddit(client_id='jQqFEBqOyeJcGFBvU5N3Zw', #Change Client ID based on app
                     client_secret='miUWh8AmwnAx20CCLgjyxBrZ0rHsyQ', # Change Client Secret
                     user_agent='Opiates_Recovery_data_scraper by /u/Sajjad_Islam',
                     username='Sajjad_Islam',  # Your Reddit username
                     password='patuakhali'   # Your Reddit password
                    )

In [6]:
# Verify the connection
try:
    me = reddit.user.me()
    print(f"Authenticated as: {me}")
except Exception as e:
    print(f"Auth error: {e}")

Authenticated as: Sajjad_Islam


In [12]:
# ==============================
# Resolve base paths
# ==============================
try:
    # Works when running as a .py script
    script_dir = Path(__file__).resolve().parent
except NameError:
    # Fallback for Jupyter/interactive sessions
    script_dir = Path.cwd()

# Data_Process is one level up from reddit_data_proces
project_dir = script_dir.parent
OUTPUT_DIR = project_dir /"Data_Process"/ "Data_Lake"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

print(f"Output directory: {OUTPUT_DIR}")

Output directory: z:\Documents\Projects\PSS_XAI\Data_Process\Data_Lake


In [13]:
# ==============================
# Config
# ==============================
SUBREDDITS = ["ExplainLikeImFive", "TodayILearned"]
TARGET_PER_SUB = 2000
BATCH_SIZE = 100
PAUSE_SECONDS = 2


In [14]:
# ==============================
# Helper to fetch latest posts
# ==============================
def fetch_latest_posts(subreddit_name, target=2000, batch_size=100, pause=2):
    sr = reddit.subreddit(subreddit_name)
    collected = []
    seen_ids = set()
    after = None
    total_fetched = 0

    while len(collected) < target:
        submissions = list(sr.new(limit=batch_size, params={"after": after}))
        if not submissions:
            print(f"No more posts available for r/{subreddit_name}.")
            break

        for sub in submissions:
            if sub.id in seen_ids:
                continue
            seen_ids.add(sub.id)
            collected.append({
                "id": sub.id,
                "title": sub.title,
                "author": sub.author.name if sub.author else "[deleted]",
                "selftext": sub.selftext or "",
                "score": sub.score,
                "ups": sub.ups,
                "num_comments": sub.num_comments,
                "created_utc": sub.created_utc,
                "permalink": f"https://www.reddit.com{sub.permalink}",
                "url": sub.url,
                "subreddit": subreddit_name
            })
            if len(collected) >= target:
                break

        after = submissions[-1].fullname
        total_fetched += len(submissions)
        print(f"r/{subreddit_name}: collected {len(collected)} so far (fetched {total_fetched} raw).")
        time.sleep(pause)

    df = pd.DataFrame(collected)
    if not df.empty and "created_utc" in df.columns:
        df = df.sort_values("created_utc", ascending=False).reset_index(drop=True)
    return df


In [15]:
# ==============================
# Run and save
# ==============================
for sr_name in SUBREDDITS:
    print(f"\nFetching latest posts from r/{sr_name}...")
    df = fetch_latest_posts(sr_name, target=TARGET_PER_SUB, batch_size=BATCH_SIZE, pause=PAUSE_SECONDS)
    out_path = OUTPUT_DIR / f"{sr_name}_latest_posts.csv"
    df.to_csv(out_path, index=False, encoding="utf-8")
    print(f"Saved {len(df)} rows to {out_path}")

print("\nDone.")


Fetching latest posts from r/ExplainLikeImFive...
r/ExplainLikeImFive: collected 100 so far (fetched 100 raw).
r/ExplainLikeImFive: collected 200 so far (fetched 200 raw).
r/ExplainLikeImFive: collected 300 so far (fetched 300 raw).
r/ExplainLikeImFive: collected 400 so far (fetched 400 raw).
r/ExplainLikeImFive: collected 500 so far (fetched 500 raw).
r/ExplainLikeImFive: collected 600 so far (fetched 600 raw).
r/ExplainLikeImFive: collected 700 so far (fetched 700 raw).
r/ExplainLikeImFive: collected 800 so far (fetched 800 raw).
r/ExplainLikeImFive: collected 900 so far (fetched 900 raw).
r/ExplainLikeImFive: collected 924 so far (fetched 924 raw).
No more posts available for r/ExplainLikeImFive.
Saved 924 rows to z:\Documents\Projects\PSS_XAI\Data_Process\Data_Lake\ExplainLikeImFive_latest_posts.csv

Fetching latest posts from r/TodayILearned...
r/TodayILearned: collected 100 so far (fetched 100 raw).
r/TodayILearned: collected 200 so far (fetched 200 raw).
r/TodayILearned: collec

### Clean data for use

In [20]:
from pathlib import Path
import pandas as pd

# ============== locate Data_Process robustly ==============
def find_data_process(start: Path) -> Path:
    start = start.resolve()
    # case 1: you are already inside Data_Process
    if start.name == "Data_Process":
        return start
    # walk up and return the first folder that is named Data_Process
    for p in [start, *start.parents]:
        if p.name == "Data_Process":
            return p
        # also handle running from PSS_XAI or deeper
        candidate = p / "Data_Process"
        if candidate.exists() and candidate.is_dir():
            return candidate
    raise FileNotFoundError("Could not find a folder named Data_Process above the current path.")

# works in script and notebook
try:
    script_dir = Path(__file__).resolve().parent
except NameError:
    script_dir = Path.cwd()

DATA_PROCESS = find_data_process(script_dir)

input_dir  = DATA_PROCESS / "Data_Lake"
output_dir = DATA_PROCESS / "Data_Warehouse"
output_dir.mkdir(parents=True, exist_ok=True)

print("Resolved paths:")
print("  CWD:        ", Path.cwd())
print("  Data_Process:", DATA_PROCESS)
print("  Input dir:  ", input_dir)
print("  Output dir: ", output_dir)

# ============== load inputs ==============
files_expected = [
    input_dir / "ExplainLikeImFive_latest_posts.csv",
    input_dir / "TodayILearned_latest_posts.csv",
]

# if any expected file is missing, try to auto discover
existing = [f for f in files_expected if f.exists()]
if len(existing) < 2:
    discovered = sorted(input_dir.glob("*_latest_posts.csv"))
    print(f"Discovered in Data_Lake: {[p.name for p in discovered]}")
    if discovered:
        existing = discovered

if not existing:
    raise FileNotFoundError("No input CSVs found in Data_Lake. Make sure the crawl step saved files there.")

dfs = []
for f in existing:
    print(f"Reading {f}")
    df_src = pd.read_csv(f)
    if df_src.empty:
        print(f"Warning: {f.name} is empty. Skipping.")
        continue
    df_out = pd.DataFrame({
        "text": df_src.get("selftext", "").fillna(""),
        "label": "none",
        "sub-source": df_src.get("subreddit", "").fillna(""),
        "source": "Dataset_own_1",
    })
    dfs.append(df_out)

if not dfs:
    raise ValueError("All input CSVs were empty after loading. Nothing to merge.")

merged = pd.concat(dfs, ignore_index=True)

out_path = output_dir / "no_mental_condition_dataset_own.csv"
merged.to_csv(out_path, index=False, encoding="utf-8")
print(f"Saved {len(merged)} rows to {out_path}")


Resolved paths:
  CWD:         z:\Documents\Projects\PSS_XAI\reddit_data_proces
  Data_Process: \\venn.mscsnet.mu.edu\accounts\kislam\Documents\Projects\PSS_XAI\Data_Process
  Input dir:   \\venn.mscsnet.mu.edu\accounts\kislam\Documents\Projects\PSS_XAI\Data_Process\Data_Lake
  Output dir:  \\venn.mscsnet.mu.edu\accounts\kislam\Documents\Projects\PSS_XAI\Data_Process\Data_Warehouse
Reading \\venn.mscsnet.mu.edu\accounts\kislam\Documents\Projects\PSS_XAI\Data_Process\Data_Lake\ExplainLikeImFive_latest_posts.csv
Reading \\venn.mscsnet.mu.edu\accounts\kislam\Documents\Projects\PSS_XAI\Data_Process\Data_Lake\TodayILearned_latest_posts.csv
Saved 1858 rows to \\venn.mscsnet.mu.edu\accounts\kislam\Documents\Projects\PSS_XAI\Data_Process\Data_Warehouse\no_mental_condition_dataset_own.csv


In [4]:
from pathlib import Path
import pandas as pd

# ===== CONFIG =====
input_file = Path(r"D:\Sajjad-Workspace\PSS_XAI\Data_Process\Data_Warehouse\no_mental_condition_dataset_own.csv")
WORD_CAP = 400

def get_unique_path(base_path: Path) -> Path:
    """Return a unique path by adding _2, _3, ... if needed."""
    if not base_path.exists():
        return base_path
    stem, ext = base_path.stem, base_path.suffix
    i = 2
    while True:
        candidate = base_path.with_name(f"{stem}_{i}{ext}")
        if not candidate.exists():
            return candidate
        i += 1

# ===== load input =====
if not input_file.exists():
    raise FileNotFoundError(f"{input_file} not found")

df = pd.read_csv(input_file)
if "text" not in df.columns:
    raise ValueError("'text' column not found in input file")

# ===== filter =====
df["word_count"] = df["text"].apply(lambda x: len(str(x).split()))
kept = df[df["word_count"] <= WORD_CAP].drop(columns=["word_count"])
removed = len(df) - len(kept)

# ===== deduplicate =====
kept = kept.drop_duplicates().reset_index(drop=True)

# ===== save =====
out_file = get_unique_path(input_file.with_name("no_mental_condition_dataset_own_small.csv"))
kept.to_csv(out_file, index=False, encoding="utf-8")

# ===== report =====
print(f"Input file: {input_file}")
print(f"Total rows in input: {len(df)}")
print(f"Rows kept (<= {WORD_CAP} words): {len(kept)}")
print(f"Rows removed (> {WORD_CAP} words): {removed}")
print(f"Deduplicated rows in output: {len(kept)}")
print(f"Saved to: {out_file}")


Input file: D:\Sajjad-Workspace\PSS_XAI\Data_Process\Data_Warehouse\no_mental_condition_dataset_own.csv
Total rows in input: 1858
Rows kept (<= 400 words): 708
Rows removed (> 400 words): 1
Deduplicated rows in output: 708
Saved to: D:\Sajjad-Workspace\PSS_XAI\Data_Process\Data_Warehouse\no_mental_condition_dataset_own_small.csv
