In [40]:
import logging
import datetime
import praw
import csv
import os
import time
import prawcore
import praw.exceptions  # Import PRAW exceptions

# Configure logging. At DEBUG level the request-level messages emitted by
# prawcore and urllib3 are shown as well.
logging.basicConfig(level=logging.DEBUG)

# Record when this run started.
current_time = datetime.datetime.now()
logging.debug(f"Script started at {current_time}")

# Reddit credentials and setup.
# The REDDIT_CLIENT_ID / REDDIT_CLIENT_SECRET environment variables take
# precedence when they are set.
# NOTE(review): the literal fallbacks below are credentials stored in source;
# rotate them and delete the fallbacks once the environment variables are
# configured wherever this script runs.
user_agent = 'Scraper 1.0 by u/Impossible_Boat825'
reddit = praw.Reddit(
    client_id=os.environ.get('REDDIT_CLIENT_ID', 'ZPOlwpi8u-wk-jnOOLArmg'),
    client_secret=os.environ.get('REDDIT_CLIENT_SECRET', 'LeBC2BvoVAlyYxBKpI6eMU-tV6_Yzg'),
    user_agent=user_agent,
)

# Subreddits to scrape. dict.fromkeys() drops repeated names while keeping
# the original order, so a city listed twice is not scraped (and later
# counted) twice.
subreddits = list(dict.fromkeys([
    'seattle', 'austin', 'dallas', 'houston', 'chicago', 'sandiego', 'nyc',
    'LosAngeles', 'chicago', 'phoenix', 'philadelphia', 'Columbus', 'Denver',
    'Nashville', 'LasVegas', 'Detroit', 'Portland', 'Jacksonville', 'Boston',
    'SanFrancisco',
]))

# Upper bound on the number of posts requested from each subreddit.
# NOTE(review): Reddit listings are commonly capped at roughly 1000 items, so
# fewer posts than this may come back - confirm against the API docs.
num_of_posts = 6000

# Specify the output file path
output_file_path = r"C:\Users\Kimble\Downloads\reddit_posts.csv"

# Number of comment columns written for every post.
_COMMENTS_PER_POST = 5


def _retry_on_rate_limit(action, wait_seconds=60):
    """Call ``action()`` and return its result, retrying while rate limited.

    A ``RedditAPIException`` carrying a ``RATELIMIT`` item, or a
    ``prawcore`` ``TooManyRequests`` error, is logged as a warning and the
    call is repeated after ``wait_seconds`` seconds. Any other exception is
    logged as an error and re-raised unchanged.
    """
    while True:
        try:
            return action()
        except praw.exceptions.RedditAPIException as e:
            if not any(error.error_type == 'RATELIMIT' for error in e.items):
                logging.error(f"An error occurred: {e}")
                raise
            logging.warning(f"Rate limit exceeded. Waiting for {wait_seconds} seconds.")
        except prawcore.exceptions.TooManyRequests as e:
            logging.warning(f"Rate limit exceeded. Waiting for {wait_seconds} seconds. Details: {e}")
        except Exception as e:
            logging.error(f"An unexpected error occurred: {e}")
            raise
        # Only reached after a rate-limit response: back off, then retry.
        time.sleep(wait_seconds)


def _single_line(text):
    """Return ``text`` with CR and LF characters replaced by spaces."""
    return text.replace('\n', ' ').replace('\r', ' ')


def _first_comments(post, count=_COMMENTS_PER_POST):
    """Return up to ``count`` leading top-level comments of ``post``."""
    # limit=0 removes the "load more comments" placeholders without
    # resolving them, leaving only real comments in the forest.
    post.comments.replace_more(limit=0)
    return post.comments[:count]


# Open a CSV file to write data
with open(output_file_path, mode='w', encoding='utf-8', newline='') as csv_file:
    comment_fields = [f'Comment {n}' for n in range(1, _COMMENTS_PER_POST + 1)]
    fieldnames = ['Subreddit', 'Title', 'Body', 'Score', 'URL', *comment_fields]
    writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
    writer.writeheader()

    for subreddit_name in subreddits:
        subreddit = reddit.subreddit(subreddit_name)
        logging.debug(f"Fetching up to {num_of_posts} posts from r/{subreddit_name}")

        # subreddit.hot() only builds a lazy listing; the HTTP requests are
        # issued while it is iterated. Materialising it inside the retry
        # wrapper is what lets rate-limit errors actually be caught.
        posts = _retry_on_rate_limit(lambda: list(subreddit.hot(limit=num_of_posts)))

        for idx, post in enumerate(posts, start=1):
            # Only text posts with a non-blank body are exported.
            if not (post.is_self and post.selftext.strip()):
                logging.debug(f"Skipping post #{idx}: {post.title} (No text in body)")
                continue

            logging.debug(f"Processing post #{idx}: {post.title}")

            comments = _retry_on_rate_limit(lambda post=post: _first_comments(post))

            # Flatten each comment to one line and pad with empty strings so
            # every row fills all comment columns.
            comments_text = [_single_line(comment.body) for comment in comments]
            comments_text += [''] * (_COMMENTS_PER_POST - len(comments_text))

            row = {
                'Subreddit': subreddit_name,
                'Title': _single_line(post.title),
                'Body': _single_line(post.selftext),
                'Score': post.score,
                'URL': post.url,
            }
            row.update(zip(comment_fields, comments_text))
            writer.writerow(row)

logging.debug("Script finished successfully")
print(f"CSV file has been saved to {output_file_path}")


DEBUG:root:Script started at 2024-10-14 22:20:53.470291
DEBUG:root:Fetching up to 6000 posts from r/austincirclejerk
DEBUG:prawcore:Fetching: GET https://oauth.reddit.com/r/austincirclejerk/hot
DEBUG:prawcore:Data: None
DEBUG:prawcore:Params: {'limit': 6000, 'raw_json': 1}
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.reddit.com:443
DEBUG:urllib3.connectionpool:https://www.reddit.com:443 "POST /api/v1/access_token HTTP/1.1" 200 658
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): oauth.reddit.com:443
DEBUG:urllib3.connectionpool:https://oauth.reddit.com:443 "GET /r/austincirclejerk/hot?limit=6000&raw_json=1 HTTP/1.1" 200 58099
DEBUG:prawcore:Response: 200 (58099 bytes)
DEBUG:root:Skipping post #1: Outjerked again (No text in body)
DEBUG:root:Skipping post #2: General of r/Austin. I salute you!! (No text in body)
DEBUG:root:Processing post #3: How to know if your child is throwing lobsters at HEB:
DEBUG:prawcore:Fetching: GET https://oauth.reddit.com

CSV file has been saved to C:\Users\Kimble\Downloads\reddit_posts.csv


In [41]:
import pandas as pd
from collections import Counter
import re

# Read the scraped posts CSV into a DataFrame.
reddit_posts_path = "C:\\Users\\Kimble\\Downloads\\reddit_posts.csv"
reddit_posts = pd.read_csv(reddit_posts_path)

def tokenize(text):
    """Lower-case ``text`` and return its maximal runs of word characters.

    Anything that is not a regex word character (letter, digit, underscore)
    acts as a separator, so punctuation and whitespace never appear in the
    returned list.
    """
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

# Columns whose text is merged into one document per post.
columns_to_combine = ['Title', 'Body', 'Comment 1', 'Comment 2', 'Comment 3', 'Comment 4', 'Comment 5']

# Per-city (subreddit) counters: raw occurrences, and occurrences multiplied
# by the post's score.
word_counts_per_city = {}
weighted_word_counts_per_city = {}

# Counters across all cities. These feed the "total" DataFrames below and
# must exist before those are built.
total_word_counts = Counter()
total_weighted_word_counts = Counter()

# Process each row in the dataframe
for _, row in reddit_posts.iterrows():
    subreddit = row['Subreddit']
    # str() guards against cells pandas parsed as numbers (e.g. a title that
    # is just "2024"); NaN cells (empty in the CSV) are skipped.
    combined_text = ' '.join(str(row[col]) for col in columns_to_combine if pd.notna(row[col]))
    tokens = tokenize(combined_text)
    score = row['Score']

    # Count once per post and reuse for both the plain and weighted tallies.
    token_counts = Counter(tokens)
    weighted_counts = {word: count * score for word, count in token_counts.items()}

    # Counter.update() adds to existing counts and, unlike the + operator,
    # keeps zero and negative totals that score weighting can produce.
    word_counts_per_city.setdefault(subreddit, Counter()).update(token_counts)
    weighted_word_counts_per_city.setdefault(subreddit, Counter()).update(weighted_counts)
    total_word_counts.update(token_counts)
    total_weighted_word_counts.update(weighted_counts)

# Create DataFrames for each type of word count
# Unweighted total word counts
total_word_counts_df = pd.DataFrame(list(total_word_counts.items()), columns=['Word', 'Count'])

# Unweighted word counts by city: one row per word, one column per city,
# with 0 where a word never appeared for that city.
unweighted_counts_df = pd.DataFrame.from_dict(word_counts_per_city, orient='index').fillna(0).astype(int).T

# Weighted total word counts
total_weighted_word_counts_df = pd.DataFrame(list(total_weighted_word_counts.items()), columns=['Word', 'Weighted Count'])

# Weighted word counts by city: built in long form, then pivoted to the same
# word-by-city layout as the unweighted table.
weighted_counts_data = [
    (city, word, count)
    for city, words in weighted_word_counts_per_city.items()
    for word, count in words.items()
]
weighted_counts_df = pd.DataFrame(weighted_counts_data, columns=['City', 'Word', 'Weighted Count'])
weighted_counts_pivot_df = weighted_counts_df.pivot_table(index='Word', columns='City', values='Weighted Count', fill_value=0)

# Save all CSV files
total_word_counts_df.to_csv('total_word_counts.csv', index=False)
unweighted_counts_df.to_csv('total_word_counts_by_city.csv')
total_weighted_word_counts_df.to_csv('total_weighted_word_counts.csv', index=False)
weighted_counts_pivot_df.to_csv('weighted_word_counts_by_city.csv')



