In [1]:
import os
import sqlite3
import requests
import subprocess

# Setup paths: everything lives under ~/Reddit_Virality_Data, with downloaded
# media stored in the media_files subdirectory (created if missing).
DATA_PATH = os.path.expanduser('~/Reddit_Virality_Data')
media_dir = os.path.join(DATA_PATH, 'media_files')
os.makedirs(media_dir, exist_ok=True)

db_path = os.path.join(DATA_PATH, 'virality.db')

# Read every (id, url) pair up front so the database is not held open while
# the downloads run. Note: using a sqlite3 connection as a context manager
# only commits/rolls back the transaction -- it does NOT close the
# connection -- so close it explicitly once the rows are in memory.
conn = sqlite3.connect(db_path)
try:
    cursor = conn.cursor()
    cursor.execute("SELECT id, url FROM posts")
    rows = cursor.fetchall()
finally:
    conn.close()

# Supported image extensions
image_extensions = ('.jpg', '.jpeg', '.png', '.gif')
# Marker to identify v.redd.it video URLs
video_marker = "v.redd.it"
# Suffix of an unfinished download; such files must never count as "done"
partial_suffix = ".part"


def _download_image(url, file_path):
    """Stream the image at *url* to *file_path*.

    The data is written to a temporary ``.part`` file that is renamed onto
    *file_path* only after the whole body has been received, so a failed or
    interrupted transfer never leaves a truncated file that a later run would
    mistake for a finished download.

    Raises whatever ``requests`` or the file system raises; the temporary
    file is removed in that case.
    """
    tmp_path = file_path + partial_suffix
    try:
        # The context manager releases the streamed connection back to the pool.
        with requests.get(url, stream=True, timeout=10) as r:
            r.raise_for_status()
            with open(tmp_path, 'wb') as f:
                for chunk in r.iter_content(64 * 1024):
                    if chunk:
                        f.write(chunk)
        os.replace(tmp_path, file_path)
    finally:
        # After a successful rename the temp file no longer exists.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)


def _media_exists(post_key, directory):
    """Return True if *directory* holds a finished file named ``<post_key>.*``.

    The trailing dot in the prefix prevents one post id from matching another
    id that merely starts with the same characters; leftover ``.part`` files
    are ignored so an interrupted download is retried.
    """
    prefix = f"{post_key}."
    return any(
        name.startswith(prefix) and not name.endswith(partial_suffix)
        for name in os.listdir(directory)
    )


for post_id, url in rows:
    # A NULL/empty URL column has nothing to download.
    if not url:
        print(f"Skipped: post {post_id} has no URL.")
        continue

    url_lower = url.lower()

    # Download images directly
    if url_lower.endswith(image_extensions):
        file_extension = os.path.splitext(url)[1]
        file_name = f"{post_id}{file_extension}"
        file_path = os.path.join(media_dir, file_name)

        # Skip if file exists
        if os.path.exists(file_path):
            print(f"Skipped: {file_name} already exists.")
            continue

        # Best effort: one bad URL must not stop the rest of the batch.
        try:
            _download_image(url, file_path)
            print(f"Downloaded: {file_name}")
        except Exception as e:
            print(f"Failed to download {url}: {e}")

    # Download videos using yt-dlp if URL contains v.redd.it
    elif video_marker in url_lower:
        # Set output file template using post_id; yt-dlp fills in %(ext)s
        output_template = os.path.join(media_dir, f"{post_id}.%(ext)s")
        # Check if any file for this post_id exists in the media directory.
        # str() keeps the check working when the id column is an INTEGER.
        if _media_exists(str(post_id), media_dir):
            print(f"Skipped: video {post_id} already exists.")
            continue

        try:
            # "--" ends option parsing so a URL beginning with "-" cannot be
            # interpreted as a yt-dlp command-line option.
            subprocess.run(
                ["yt-dlp", "-o", output_template, "--", url],
                check=True,
            )
            print(f"Downloaded video: {post_id}")
        except Exception as e:
            print(f"Failed to download video {url}: {e}")

print("Download process complete.")


Skipped: 1jg4vxj.jpeg already exists.
Skipped: 1jg4wdy.jpeg already exists.
Skipped: 1jg4m25.png already exists.
Skipped: video 1jg4w0m already exists.
Failed to download https://i.redd.it/rzoe6welyxpe1.png: 404 Client Error: Not Found for url: https://i.redd.it/rzoe6welyxpe1.png
Failed to download https://i.redd.it/9f5dg2oo0ype1.png: 404 Client Error: Not Found for url: https://i.redd.it/9f5dg2oo0ype1.png
Skipped: 1jg5026.jpeg already exists.
Skipped: 1jg4rn4.jpeg already exists.
Skipped: 1jg4rka.jpeg already exists.
Skipped: video 1jg4qby already exists.
Skipped: 1jg50w6.png already exists.
Skipped: 1jg52n1.png already exists.
Skipped: 1jg54cf.jpeg already exists.
Skipped: 1jg4x4e.jpeg already exists.
Skipped: 1jg52wu.jpeg already exists.
Skipped: 1jg52na.jpeg already exists.
Failed to download https://i.redd.it/caddl5hwzxpe1.jpeg: 404 Client Error: Not Found for url: https://i.redd.it/caddl5hwzxpe1.jpeg
Skipped: 1jg57va.jpeg already exists.
Failed to download https://i.redd.it/7251p