In [64]:
import pandas as pd
import requests
import time
import os
from concurrent.futures import ThreadPoolExecutor

# Reddit API credentials
# SECURITY: these literals are stored in plain text in the source and should be
# treated as compromised -- rotate them. Each value can be supplied through an
# environment variable instead; the literal is kept only as a fallback so the
# script keeps working unchanged when the variables are not set.
client_id = os.environ.get('REDDIT_CLIENT_ID', 'Nrbc6n5KVQ6fEQ_7JUfH3A')
client_secret = os.environ.get('REDDIT_CLIENT_SECRET', 'Y0STrnfxdJh_r5h9TS5qxx2tAs5swg')
username = os.environ.get('REDDIT_USERNAME', 'Important_Trade_7759')
password = os.environ.get('REDDIT_PASSWORD', 'saima123')
user_agent = os.environ.get('REDDIT_USER_AGENT', 'data')

# Authentication with Reddit API
def get_access_token():
    """Request an OAuth token from Reddit and return ready-to-use request headers.

    Uses the password grant with the module-level ``client_id``,
    ``client_secret``, ``username``, ``password`` and ``user_agent``.

    Returns:
        dict: headers containing ``User-Agent`` and ``Authorization``.

    Raises:
        requests.RequestException: on network failure, timeout or a non-2xx
            response.
        RuntimeError: if the response body carries no ``access_token``
            (for example when the credentials are rejected).
    """
    auth = requests.auth.HTTPBasicAuth(client_id, client_secret)
    data = {'grant_type': 'password', 'username': username, 'password': password}
    headers = {'User-Agent': user_agent}
    # A timeout keeps a stalled connection from hanging the whole script.
    res = requests.post(
        'https://www.reddit.com/api/v1/access_token',
        auth=auth,
        data=data,
        headers=headers,
        timeout=30,
    )
    res.raise_for_status()

    payload = res.json()
    token = payload.get('access_token')
    if not token:
        # Report the server's own error field instead of failing with a bare KeyError.
        raise RuntimeError(
            f"Reddit did not return an access token: {payload.get('error', payload)}"
        )

    headers['Authorization'] = f'bearer {token}'
    return headers

# Authenticate once up front; fetch_data() reads this module-level `headers`
# dict on every request it makes.
headers = get_access_token()

# Generalized function to fetch data from Reddit with retry logic
def fetch_data(url, params=None, retries=3, delay=5):
    """GET ``url`` with the module-level ``headers`` and return the decoded JSON.

    Args:
        url: endpoint to request.
        params: optional query-string parameters.
        retries: maximum number of attempts.
        delay: seconds to wait between attempts after a network error, and the
            fallback wait when a 429 response has no usable ``Retry-After``.

    Returns:
        The parsed JSON body on HTTP 200, otherwise ``None`` (non-retryable
        status code, undecodable body, or all attempts used up).
    """
    for attempt in range(1, retries + 1):
        try:
            # The timeout stops a stalled connection from blocking a worker forever.
            response = requests.get(url, headers=headers, params=params, timeout=30)
        except requests.RequestException as e:
            print(f"Error fetching data from {url}: {str(e)}")
            if attempt < retries:
                print(f"Retrying... ({attempt}/{retries})")
                time.sleep(delay)
                continue
            print("Max retries reached. Moving on.")
            return None

        if response.status_code == 200:
            try:
                return response.json()
            except ValueError as e:
                # The body was not valid JSON; treat it like any other failed fetch.
                print(f"Error fetching data from {url}: {str(e)}")
                return None

        if response.status_code == 429:
            # Retry-After may be missing or not a plain integer; fall back to
            # `delay` rather than crashing on the conversion.
            try:
                retry_after = max(0, int(float(response.headers.get('Retry-After', delay))))
            except (TypeError, ValueError):
                retry_after = delay
            print(f"Rate limit exceeded. Retrying after {retry_after} seconds...")
            time.sleep(retry_after)
            continue

        print(f"Failed to fetch data from {url}, Status Code: {response.status_code}")
        return None

    # Only reached when every attempt was rate limited (or retries <= 0).
    print(f"Giving up on {url} after {retries} attempts.")
    return None

# Function to collect posts from a subreddit with pagination
def collect_posts_by_category(subreddit, category, limit=1000, time_range=None):
    """Collect up to ``limit`` posts, with their top-level comments, from one listing.

    Pages through ``/r/{subreddit}/{category}`` using the ``after`` token
    until ``limit`` posts are gathered, the listing runs out, or a fetch fails.

    Args:
        subreddit: subreddit name without the ``r/`` prefix.
        category: listing name used as the last URL segment.
        limit: maximum number of posts to return.
        time_range: value for the ``t`` query parameter; only sent when
            ``category`` is ``'top'`` or ``'controversial'``.

    Returns:
        list: one row per post, in the form
        ``[post_id, title, selftext, created_utc, subreddit, category, comments]``.
    """
    posts = []
    after = None  # Pagination token returned by the previous page
    url = f'https://oauth.reddit.com/r/{subreddit}/{category}'
    send_time_range = bool(time_range) and category in ('top', 'controversial')

    while len(posts) < limit:
        # Ask only for what is still missing (at most 100 per request), so the
        # last page does not fetch posts that would be thrown away.
        params = {'limit': min(100, limit - len(posts))}
        if after:
            params['after'] = after
        if send_time_range:
            params['t'] = time_range

        data = fetch_data(url, params)
        listing = data.get('data') if isinstance(data, dict) else None
        if not isinstance(listing, dict) or 'children' not in listing:
            print(f"No data returned or missing 'children' for subreddit: {subreddit} in category: {category}.")
            break

        for post in listing['children']:
            details = post['data']
            post_id = details['id']
            # One extra request per post to pull its top-level comments.
            comments = collect_comments(post_id)
            posts.append([
                post_id,
                details['title'],
                details['selftext'],
                details['created_utc'],
                subreddit,
                category,
                comments,
            ])
            # The server may return more items than requested; never exceed the limit.
            if len(posts) >= limit:
                break

        after = listing.get('after')
        if not after:  # No more posts to fetch
            print(f"No more posts available for subreddit: {subreddit} in category: {category}.")
            break

        # Log the 'after' value for debugging
        print(f"Pagination token (after): {after}")

        # Pause between pages to ease rate limiting, but not after the final page.
        if len(posts) < limit:
            time.sleep(2)

    return posts[:limit]

# Function to collect top-level comments for a post
def collect_comments(post_id):
    """Return the bodies of a post's comments joined with " | ".

    Fetches ``/comments/{post_id}`` and reads the children of the second
    element of the response. Entries without a ``body`` key are skipped, and
    replies nested under a comment are not visited. Returns an empty string
    when the fetch fails or the response lacks the expected shape.
    """
    payload = fetch_data(f'https://oauth.reddit.com/comments/{post_id}')

    # Guard clause: bail out unless the second element carries a 'data' entry.
    if not payload or len(payload) <= 1 or 'data' not in payload[1]:
        return ""

    bodies = [
        child['data']['body']
        for child in payload[1]['data']['children']
        if 'body' in child['data']
    ]
    return " | ".join(bodies)

# Function to collect posts for multiple subreddits and categories
def collect_posts_for_subreddits(subreddits, categories, limit_per_category=1000):
    """Run collect_posts_by_category for every subreddit/category pair on a thread pool.

    'top' and 'controversial' are expanded into one job per time period
    ('day', 'week', 'month', 'year', 'all'); every other category yields a
    single job. Results are concatenated in submission order.

    Returns:
        list: the post rows from all jobs, flattened into one list.
    """
    periods = ['day', 'week', 'month', 'year', 'all']

    # Work out the positional arguments of every job first, in the order they
    # will be submitted.
    jobs = []
    for sub in subreddits:
        for cat in categories:
            if cat in ('top', 'controversial'):
                # These listings take an extra 't' parameter selecting the period.
                jobs.extend((sub, cat, limit_per_category, period) for period in periods)
            else:
                jobs.append((sub, cat, limit_per_category))

    gathered = []
    with ThreadPoolExecutor(max_workers=5) as pool:  # Small pool to limit request pressure
        pending = [pool.submit(collect_posts_by_category, *job) for job in jobs]
        for task in pending:
            gathered.extend(task.result())

    return gathered

# List of subreddits and categories to collect data from
subreddits_to_collect = ['gadgets']
categories_to_collect = ['hot', 'new', 'top', 'rising', 'controversial']

# Collect data from specified subreddits and categories in an infinite loop every 6 hours
while True:
    # Refresh the token before collecting data
    headers = get_access_token()

    # Collect data
    all_posts = collect_posts_for_subreddits(subreddits_to_collect, categories_to_collect, limit_per_category=4000)

    # Create DataFrame
    columns = ['post_id', 'title', 'content', 'timestamp', 'subreddit', 'category', 'comments']
    df_combined = pd.DataFrame(all_posts, columns=columns)

    # Convert timestamp to datetime
    df_combined['timestamp'] = pd.to_datetime(df_combined['timestamp'], unit='s')

    # The same post can appear in several listings; keep one row per 'post_id'
    df_unique = df_combined.drop_duplicates(subset=['post_id'])

    # Define the file path
    file_path = './data/reddit_posts.csv'

    # Merge with what is already stored so each post appears only once
    if os.path.exists(file_path):
        # Read ids as strings so they compare equal to the freshly collected ones
        df_existing = pd.read_csv(file_path, dtype={'post_id': str})
        # Combine with new data and drop duplicates based on 'post_id'
        df_combined = pd.concat([df_existing, df_unique], ignore_index=True).drop_duplicates(subset=['post_id'])
    else:
        # First run: make sure the target directory exists before writing
        os.makedirs(os.path.dirname(file_path), exist_ok=True)
        df_combined = df_unique

    # df_combined already contains the existing rows, so the file must be
    # overwritten; appending here would write every stored row a second time.
    df_combined.to_csv(file_path, index=False)

    # Output the unique DataFrame (for checking)
    print(df_combined.head())
    print(f"Total unique posts: {df_combined.shape[0]}")

    # Sleep for 6 hours before running again
    time.sleep(21600)  # Sleep for 6 hours

No more posts available for subreddit: gadgets in category: top.
No more posts available for subreddit: gadgets in category: top.
Pagination token (after): t3_1fghjsd
Pagination token (after): t3_1fghjsd
Pagination token (after): t3_1fj7e4r
No more posts available for subreddit: gadgets in category: top.
No more posts available for subreddit: gadgets in category: rising.
No more posts available for subreddit: gadgets in category: controversial.
Pagination token (after): t3_179wp34
No more posts available for subreddit: gadgets in category: controversial.
Pagination token (after): t3_1eih47s
Pagination token (after): t3_1eiknn3
Pagination token (after): t3_at793k
Rate limit exceeded. Retrying after 5 seconds...
Rate limit exceeded. Retrying after 5 seconds...
Rate limit exceeded. Retrying after 5 seconds...
Rate limit exceeded. Retrying after 5 seconds...
Rate limit exceeded. Retrying after 5 seconds...
Rate limit exceeded. Retrying after 5 seconds...
Rate limit exceeded. Retrying after

KeyboardInterrupt: 