# Reddit Data Analysis Project

**Author:** Rebecca Anselmetti 
**Date:** November 14, 2024  
**Goal:** Analyze posts and comments from subreddits relevant to AI companions

## Project Overview
This notebook will help us:
- Connect to Reddit API
- Collect posts from subreddits
- Analyze engagement patterns
- Visualize the results

In [1]:
# Import necessary libraries
import praw
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

# Set up plotting style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

print("✓ All libraries imported successfully!")
print(f"✓ Pandas version: {pd.__version__}")

✓ All libraries imported successfully!
✓ Pandas version: 2.3.3


In [7]:
import requests
import time

def get_reddit_posts(subreddit, limit=100, sort='hot'):
    """
    Get posts from a subreddit using Reddit's public JSON API
    No authentication required!
    """
    url = f"https://www.reddit.com/r/{subreddit}/{sort}.json"
    headers = {'User-Agent': 'reddit-analysis-project/1.0'}
    params = {'limit': limit}
    
    response = requests.get(url, headers=headers, params=params)
    
    if response.status_code == 200:
        data = response.json()
        posts = []
        
        for post in data['data']['children']:
            post_data = post['data']
            posts.append({
                'title': post_data['title'],
                'score': post_data['score'],
                'num_comments': post_data['num_comments'],
                'created_utc': datetime.fromtimestamp(post_data['created_utc']),
                'author': post_data['author'],
                'url': post_data['url'],
                'selftext': post_data.get('selftext', '')[:200]
            })
        
        return posts
    else:
        print(f"Error: {response.status_code}")
        return []

# Test it out
subreddit_name = "MyBoyfriendIsAI"
print(f"Fetching posts from r/{subreddit_name}...")

posts_data = get_reddit_posts(subreddit_name, limit=100, sort='hot')
df = pd.DataFrame(posts_data)

print(f"✓ Collected {len(df)} posts")
df.head()

Fetching posts from r/MyBoyfriendIsAI...
Error: 403
✓ Collected 0 posts


In [None]:
# Basic statistics about the posts
print("=" * 50)
print("BASIC STATISTICS")
print("=" * 50)
print(f"\nTotal posts analyzed: {len(df)}")
print(f"Average score: {df['score'].mean():.2f}")
print(f"Average comments: {df['num_comments'].mean():.2f}")
print(f"\nTop 5 most upvoted posts:")
print(df.nlargest(5, 'score')[['title', 'score']])

In [None]:
python# Plot score distribution
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
plt.hist(df['score'], bins=30, color='skyblue', edgecolor='black')
plt.xlabel('Score')
plt.ylabel('Frequency')
plt.title(f'Distribution of Post Scores in r/{subreddit_name}')

plt.subplot(1, 2, 2)
plt.scatter(df['score'], df['num_comments'], alpha=0.5, color='coral')
plt.xlabel('Score')
plt.ylabel('Number of Comments')
plt.title('Score vs Comments')

plt.tight_layout()
plt.show()

In [None]:
# Analyze posting times
df['hour'] = df['created_utc'].dt.hour
df['day_of_week'] = df['created_utc'].dt.day_name()

# Posts by hour
posts_by_hour = df['hour'].value_counts().sort_index()

plt.figure(figsize=(12, 5))
posts_by_hour.plot(kind='bar', color='mediumpurple')
plt.xlabel('Hour of Day (UTC)')
plt.ylabel('Number of Posts')
plt.title(f'Posting Activity by Hour in r/{subreddit_name}')
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

In [None]:
# Save results to CSV
output_filename = f'reddit_{subreddit_name}_analysis.csv'
df.to_csv(output_filename, index=False)
print(f"✓ Data saved to {output_filename}")

# Summary report
print("\n" + "=" * 50)
print("ANALYSIS COMPLETE")
print("=" * 50)
print(f"\nDataset: r/{subreddit_name}")
print(f"Posts analyzed: {len(df)}")
print(f"Date range: {df['created_utc'].min()} to {df['created_utc'].max()}")
print(f"Output file: {output_filename}")