In [4]:
import os
import sys
import time
from datetime import datetime, timezone

import pandas as pd
import praw
from prawcore.exceptions import RequestException, TooManyRequests


In [19]:
from dotenv import load_dotenv

# Read key/value pairs from the .env file one directory above the notebook
# into the process environment (presumably the REDDIT_* credentials that are
# read via os.getenv further down -- confirm against the actual .env file).
load_dotenv(dotenv_path='../.env')

True

In [21]:
# Pull the Reddit API credentials out of the process environment.
# Each name ends up as None when its variable is not set.
REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET, REDDIT_USER_AGENT = (
    os.environ.get(var_name)
    for var_name in ('REDDIT_CLIENT_ID', 'REDDIT_CLIENT_SECRET', 'REDDIT_USER_AGENT')
)

In [5]:
def initialize_reddit():
    """Create a new ``praw.Reddit`` client from the module-level credentials.

    Returns:
        The object returned by ``praw.Reddit`` when called with
        ``REDDIT_CLIENT_ID``, ``REDDIT_CLIENT_SECRET`` and
        ``REDDIT_USER_AGENT``.
    """
    credentials = {
        'client_id': REDDIT_CLIENT_ID,
        'client_secret': REDDIT_CLIENT_SECRET,
        'user_agent': REDDIT_USER_AGENT,
    }
    return praw.Reddit(**credentials)

In [6]:
# Column layouts of the two returned DataFrames. Passing them explicitly keeps
# the schema stable even when nothing could be scraped.
_POST_COLUMNS = ['id', 'title', 'body', 'score', 'created_utc', 'num_comments']
_COMMENT_COLUMNS = ['comment_id', 'post_id', 'body', 'score', 'created_utc', 'author']


def _to_utc_datetime(epoch_seconds):
    """Convert a Unix timestamp to a naive ``datetime`` expressed in UTC."""
    # datetime.fromtimestamp() without a tz returns *local* time, which would
    # make the `created_utc` columns depend on the machine's timezone. The
    # tzinfo is dropped again so the resulting column stays timezone-naive.
    return datetime.fromtimestamp(epoch_seconds, tz=timezone.utc).replace(tzinfo=None)


def _fetch_comments(post, rate_limit_wait, max_retries):
    """Return one record per comment of `post`, backing off when rate limited.

    Any other error is reported and whatever was collected so far is returned,
    so a single bad comment tree never aborts the whole scrape.
    """
    attempts = max(1, max_retries + 1)
    for attempt in range(1, attempts + 1):
        # Start from scratch on every attempt so a retry cannot duplicate rows.
        comments = []
        try:
            # Expand every "load more comments" stub before flattening the tree.
            post.comments.replace_more(limit=None)
            for comment in post.comments.list():
                comments.append({
                    'comment_id': comment.id,
                    'post_id': post.id,
                    'body': comment.body,
                    'score': comment.score,
                    'created_utc': _to_utc_datetime(comment.created_utc),
                    'author': str(comment.author)
                })
            return comments
        except TooManyRequests:
            # Must be caught before the generic handler below, otherwise the
            # rate limit is swallowed without any back-off.
            if attempt == attempts:
                print(f"Still rate limited for post {post.id}, giving up on its comments")
                return comments
            print(f"Hit rate limit, waiting {rate_limit_wait} seconds...")
            time.sleep(rate_limit_wait)
        except Exception as e:
            print(f"Error getting comments for post {post.id}: {str(e)}")
            return comments
    return []


def scrape_subreddit(SUBREDDIT, TIME_PERIOD, limit=1000, request_delay=0.5,
                     rate_limit_wait=60, max_retries=3):
    """
    Scrapes the top posts of a subreddit together with all of their comments.

    Args:
        SUBREDDIT: Name of the subreddit, passed to ``reddit.subreddit``.
        TIME_PERIOD: Value forwarded as ``time_filter`` to ``subreddit.top``.
        limit: Maximum number of posts requested from the listing.
        request_delay: Seconds to sleep before processing each post.
        rate_limit_wait: Seconds to sleep after a ``TooManyRequests`` error.
        max_retries: How many times the comments of one post are re-requested
            after a ``TooManyRequests`` error before giving up on them.

    Returns:
        A ``(posts_df, comments_df)`` tuple of DataFrames. ``posts_df`` has one
        row per post, ``comments_df`` one row per comment (linked through
        ``post_id``). Both ``created_utc`` columns hold timezone-naive UTC
        datetimes. When nothing could be scraped both frames are empty but
        still carry their columns.

    A network error or rate limit while iterating the listing stops the scrape
    early; the posts collected up to that point are still returned.
    """
    reddit = initialize_reddit()
    subreddit = reddit.subreddit(SUBREDDIT)

    posts_data = []
    comments_data = []

    try:
        for post in subreddit.top(time_filter=TIME_PERIOD, limit=limit):
            try:
                # Throttle ourselves between posts.
                time.sleep(request_delay)

                comments = _fetch_comments(post, rate_limit_wait, max_retries)

                posts_data.append({
                    'id': post.id,
                    'title': post.title,
                    'body': post.selftext,
                    'score': post.score,
                    'created_utc': _to_utc_datetime(post.created_utc),
                    'num_comments': post.num_comments
                })
                # Only keep the comments once the post itself was recorded, so
                # comments_df never references a post missing from posts_df.
                comments_data.extend(comments)

                print(f"Processed post {len(posts_data)}: {post.id}")

            except TooManyRequests:
                print(f"Hit rate limit, waiting {rate_limit_wait} seconds...")
                time.sleep(rate_limit_wait)
                continue
            except Exception as e:
                print(f"Error processing post: {str(e)}")
                continue

    except (RequestException, TooManyRequests) as e:
        # Errors raised while fetching the listing itself end up here because
        # the iterator advances outside the per-post try block.
        if isinstance(e, TooManyRequests):
            print(f"Rate limit hit while listing posts: {str(e)}")
        else:
            print(f"Network error: {str(e)}")
            print("Please check your internet connection and try again.")
        if posts_data:
            print("Returning partial data...")

    posts_df = pd.DataFrame(posts_data, columns=_POST_COLUMNS)
    comments_df = pd.DataFrame(comments_data, columns=_COMMENT_COLUMNS)

    return posts_df, comments_df

In [23]:
# Example run: request the top 10 posts of the past year from r/asianbeauty.
scrape_subreddit(SUBREDDIT='asianbeauty', TIME_PERIOD='year', limit=10)

Processed post 1: 1eit3j7
Processed post 2: 1emnbk1
Processed post 3: 1f1i3wd
Processed post 4: 1edz5c0
Processed post 5: 1f45ijs
Processed post 6: 1dg6qo5
Processed post 7: 1dxrs5x
Processed post 8: 1f6tsy0
Processed post 9: 1ffzqpf
Processed post 10: 1dy1ktr


(        id                                              title  \
 0  1eit3j7                                   Life is not fair   
 1  1emnbk1              Tsubaki hair mask 1yr before & after    
 2  1f1i3wd                                       Jelly nails?   
 3  1edz5c0                                   My Costco today…   
 4  1f45ijs   Romand New Lip Product: Sheer Jelly Tinted Stick   
 5  1dg6qo5                   my entire AB lippie collection 💗   
 6  1dxrs5x  Just fyi, the beauty of joseon sunscreen stick...   
 7  1f6tsy0  Randomly came across this store Teso Life in m...   
 8  1ffzqpf                                 guilty as charged…   
 9  1dy1ktr                      Received items I didn’t order   
 
                                                 body  score  \
 0                                                      6622   
 1  Unfortunately two embarrassing photos of me is...   4313   
 2  Is there any asian nail polish brands that cou...   2573   
 3  The Mediheal