In [26]:
import requests
import pandas as pd

API_KEY = "8mcrnuLv_pAWcwLUPdRpGDAg1gHYqlOh"

print("Fetching SVB stock data: Jan 2021 - March 10, 2023")

all_data = []

# 2021
print("\nFetching 2021...")
url = f"https://api.polygon.io/v2/aggs/ticker/SIVB/range/15/minute/2021-01-01/2021-12-31?adjusted=true&sort=asc&limit=50000&apiKey={API_KEY}"
response = requests.get(url)
data = response.json()
if 'results' in data:
    all_data.extend(data['results'])
    print(f"  ✓ {len(data['results'])} bars")

# 2022
print("Fetching 2022...")
url = f"https://api.polygon.io/v2/aggs/ticker/SIVB/range/15/minute/2022-01-01/2022-12-31?adjusted=true&sort=asc&limit=50000&apiKey={API_KEY}"
response = requests.get(url)
data = response.json()
if 'results' in data:
    all_data.extend(data['results'])
    print(f"  ✓ {len(data['results'])} bars")

# 2023 (Jan 1 - March 10)
print("Fetching Jan-March 10, 2023...")
url = f"https://api.polygon.io/v2/aggs/ticker/SIVB/range/15/minute/2023-01-01/2023-03-10?adjusted=true&sort=asc&limit=50000&apiKey={API_KEY}"
response = requests.get(url)
data = response.json()
if 'results' in data:
    all_data.extend(data['results'])
    print(f"  ✓ {len(data['results'])} bars")

# Convert to DataFrame
df = pd.DataFrame(all_data)
df['timestamp'] = pd.to_datetime(df['t'], unit='ms')
df = df.rename(columns={'o': 'Open', 'h': 'High', 'l': 'Low', 'c': 'Close', 'v': 'Volume'})
df = df[['timestamp', 'Open', 'High', 'Low', 'Close', 'Volume']]
df = df.sort_values('timestamp').drop_duplicates()

df.to_csv('svb_stock_jan2021_mar2023.csv', index=False)

print(f"\n✓ SAVED: svb_stock_jan2021_mar2023.csv")
print(f"  Total bars: {len(df)}")
print(f"  Date range: {df['timestamp'].min()} to {df['timestamp'].max()}")

Fetching SVB stock data: Jan 2021 - March 10, 2023

Fetching 2021...
  ✓ 5056 bars
Fetching 2022...
  ✓ 4711 bars
Fetching Jan-March 10, 2023...
  ✓ 1391 bars

✓ SAVED: svb_stock_jan2021_mar2023.csv
  Total bars: 11158
  Date range: 2021-01-04 09:30:00 to 2023-03-10 13:30:00


In [28]:
import requests
import pandas as pd
import time
from datetime import datetime

SUBREDDITS = [
    'wallstreetbets', 'stocks', 'investing', 'technology',
    'news', 'bayarea', 'finance', 'economics'
]

SEARCH_TERMS = ['SVB', 'Silicon Valley Bank', 'SIVB', 'bank run']

# Feb 10 - March 10, 2023 (4 weeks)
AFTER = int(datetime(2023, 2, 10, 0, 0).timestamp())
BEFORE = int(datetime(2023, 3, 10, 23, 59).timestamp())

print("Fetching Reddit data: Feb 10 - March 10, 2023 (4 weeks)")

all_posts = []

for subreddit in SUBREDDITS:
    for term in SEARCH_TERMS:
        print(f"  Searching r/{subreddit} for '{term}'...")
        
        url = "https://api.pullpush.io/reddit/search/submission"
        params = {
            'subreddit': subreddit,
            'q': term,
            'after': AFTER,
            'before': BEFORE,
            'size': 100
        }
        
        try:
            response = requests.get(url, params=params, timeout=30)
            data = response.json()
            posts = data.get('data', [])
            all_posts.extend(posts)
            print(f"    Found {len(posts)} posts")
            time.sleep(1)
        except Exception as e:
            print(f"    Error: {e}")

# Remove duplicates
unique_posts = {post['id']: post for post in all_posts}
all_posts = list(unique_posts.values())

print(f"\n✓ Total unique posts: {len(all_posts)}")

# Convert to DataFrame
posts_df = pd.DataFrame([
    {
        'id': post.get('id'),
        'timestamp': datetime.fromtimestamp(post.get('created_utc', 0)),
        'subreddit': post.get('subreddit'),
        'author': post.get('author'),
        'title': post.get('title', ''),
        'text': post.get('selftext', ''),
        'score': post.get('score', 0),
        'num_comments': post.get('num_comments', 0),
        'full_text': (post.get('title', '') + ' ' + post.get('selftext', '')).strip()
    }
    for post in all_posts
])

# ============================================
# DATA CLEANING - POSTS
# ============================================

print("\nCleaning posts data...")
initial_posts = len(posts_df)

# Remove deleted/removed authors
posts_df = posts_df[~posts_df['author'].isin(['[deleted]', '[removed]', 'AutoModerator'])]

# Remove empty or too-short text (< 10 characters)
posts_df = posts_df[posts_df['full_text'].str.len() >= 10]

# Remove posts with deleted/removed content
posts_df = posts_df[~posts_df['text'].isin(['[deleted]', '[removed]'])]
posts_df = posts_df[~posts_df['title'].isin(['[deleted]', '[removed]'])]

# Remove duplicates
posts_df = posts_df.drop_duplicates(subset=['full_text'])

# Remove bots (common bot names)
bot_keywords = ['bot', 'Bot', 'BOT', 'AutoMod']
posts_df = posts_df[~posts_df['author'].str.contains('|'.join(bot_keywords), na=False)]

# Remove null timestamps
posts_df = posts_df.dropna(subset=['timestamp'])

print(f"  Removed {initial_posts - len(posts_df)} dirty posts")
print(f"  Clean posts: {len(posts_df)}")

posts_df = posts_df.sort_values('timestamp')
posts_df.to_csv('svb_reddit_posts_4weeks.csv', index=False)

print(f"✓ SAVED: svb_reddit_posts_4weeks.csv ({len(posts_df)} posts)")
print(f"  Date range: {posts_df['timestamp'].min()} to {posts_df['timestamp'].max()}")

# ============================================
# GET COMMENTS
# ============================================

print("\nFetching comments from top 30 posts...")

top_posts = posts_df.nlargest(30, 'score')
all_comments = []

for idx, post in top_posts.iterrows():
    url = "https://api.pullpush.io/reddit/search/comment"
    params = {'link_id': post['id'], 'size': 100}
    
    try:
        response = requests.get(url, params=params, timeout=30)
        data = response.json()
        comments = data.get('data', [])
        all_comments.extend(comments)
        print(f"  Post {post['id']}: {len(comments)} comments")
        time.sleep(1)
    except:
        pass

comments_df = pd.DataFrame([
    {
        'id': c.get('id'),
        'post_id': c.get('link_id', '').replace('t3_', ''),
        'timestamp': datetime.fromtimestamp(c.get('created_utc', 0)),
        'author': c.get('author'),
        'text': c.get('body', ''),
        'score': c.get('score', 0),
        'subreddit': c.get('subreddit')
    }
    for c in all_comments
])

# ============================================
# DATA CLEANING - COMMENTS
# ============================================

print("\nCleaning comments data...")
initial_comments = len(comments_df)

# Remove deleted/removed authors
comments_df = comments_df[~comments_df['author'].isin(['[deleted]', '[removed]', 'AutoModerator'])]

# Remove empty or too-short text (< 5 characters)
comments_df = comments_df[comments_df['text'].str.len() >= 5]

# Remove deleted/removed content
comments_df = comments_df[~comments_df['text'].isin(['[deleted]', '[removed]'])]

# Remove duplicates
comments_df = comments_df.drop_duplicates(subset=['text'])

# Remove bots
comments_df = comments_df[~comments_df['author'].str.contains('|'.join(bot_keywords), na=False)]

# Remove null timestamps
comments_df = comments_df.dropna(subset=['timestamp'])

print(f"  Removed {initial_comments - len(comments_df)} dirty comments")
print(f"  Clean comments: {len(comments_df)}")

comments_df = comments_df.sort_values('timestamp')
comments_df.to_csv('svb_reddit_comments_4weeks.csv', index=False)

print(f"✓ SAVED: svb_reddit_comments_4weeks.csv ({len(comments_df)} comments)")

# ============================================
# FINAL SUMMARY
# ============================================

print("\n" + "="*60)
print("DATA COLLECTION COMPLETE")
print("="*60)
print(f"Clean posts: {len(posts_df)}")
print(f"Clean comments: {len(comments_df)}")
print(f"Total texts for GoEmotions: {len(posts_df) + len(comments_df)}")
print("\n✓ READY FOR GOEMOTIONS IN COLAB")

Fetching Reddit data: Feb 10 - March 10, 2023 (4 weeks)
  Searching r/wallstreetbets for 'SVB'...
    Found 89 posts
  Searching r/wallstreetbets for 'Silicon Valley Bank'...
    Found 36 posts
  Searching r/wallstreetbets for 'SIVB'...
    Found 51 posts
  Searching r/wallstreetbets for 'bank run'...
    Found 21 posts
  Searching r/stocks for 'SVB'...
    Found 14 posts
  Searching r/stocks for 'Silicon Valley Bank'...
    Found 13 posts
  Searching r/stocks for 'SIVB'...
    Found 6 posts
  Searching r/stocks for 'bank run'...
    Found 10 posts
  Searching r/investing for 'SVB'...
    Found 14 posts
  Searching r/investing for 'Silicon Valley Bank'...
    Found 9 posts
  Searching r/investing for 'SIVB'...
    Found 8 posts
  Searching r/investing for 'bank run'...
    Found 9 posts
  Searching r/technology for 'SVB'...
    Found 1 posts
  Searching r/technology for 'Silicon Valley Bank'...
    Found 6 posts
  Searching r/technology for 'SIVB'...
    Found 0 posts
  Searching r/tec

In [19]:
print(f"Number of comments: {len(comments_df)}")
print(f"Number of posts: {len(posts_df)}")

Number of comments: 1973
Number of posts: 295
