# Reddit

In [1]:
import praw
import pandas as pd
import requests
import os


In [None]:
CLIENT_ID = ""
CLIENT_SECRET = ""
USER_AGENT = ""


In [None]:
def get_reddit_posts(subreddit_name, num_limit=500):
    reddit = praw.Reddit(
        client_id=CLIENT_ID,
        client_secret=CLIENT_SECRET,
        user_agent=USER_AGENT
    )
    subreddit = reddit.subreddit(subreddit_name)
    figure_dir = f"Figures/reddit_images/{subreddit_name}"
    os.makedirs(figure_dir, exist_ok=True)

    posts_data = []
    for post in subreddit.new(limit=num_limit):
        image_url = None
        if post.url.endswith((".jpg", ".jpeg", ".png")):
            image_url = post.url
            # Download image
            img_data = requests.get(image_url).content
            filename = f"{figure_dir}/{post.id}.jpg"
            with open(filename, "wb") as f:
                f.write(img_data)

        posts_data.append({
            "created_utc": pd.to_datetime(post.created_utc, unit='s'),
            "title": post.title,
            "author": str(post.author),
            "score": post.score,
            "text": post.selftext,
            "url": post.url,
            "image_url": image_url,
            "permalink": f"https://www.reddit.com{post.permalink}"
        })

    print(f"Get {len(posts_data)} posts from {subreddit_name}")
    posts_df = pd.DataFrame(posts_data)
    posts_df.to_csv(f"Posts/reddit_{subreddit_name}_post.csv", index=False)
    return pd.DataFrame(posts_df)

In [54]:
posts_MicrocapStocks = get_reddit_posts("MicrocapStocks", num_limit=500)
posts_MicrocapStocks.head()

Get 102 posts from MicrocapStocks


Unnamed: 0,created_utc,title,author,score,text,url,image_url,permalink
0,2025-04-02 16:46:58,CarParts.com (NASDAQ: PRTS) Special Situation,Leather-Moment1068,1,**Summary**\n\n[CarParts.com](http://CarParts....,https://www.reddit.com/r/MicrocapStocks/commen...,,https://www.reddit.com/r/MicrocapStocks/commen...
1,2024-12-02 21:42:41,$WFLD - Realistic Assessment of Wellfield Tech...,TradeToday,1,,/r/pennystocks/comments/1h318ox/wfld_realistic...,,https://www.reddit.com/r/MicrocapStocks/commen...
2,2024-10-03 19:25:41,$WFLD Positive Developments / Resumption of Tr...,TradeToday,1,"Dear fellow Wellfield Technologies investors,\...",https://www.reddit.com/r/MicrocapStocks/commen...,,https://www.reddit.com/r/MicrocapStocks/commen...
3,2024-09-06 15:37:54,$WFLD Good News / Reverse Take-Over (RTO) of T...,TradeToday,3,"Dear Wellfield Technologies Investors,\n\nI wa...",https://www.reddit.com/r/MicrocapStocks/commen...,,https://www.reddit.com/r/MicrocapStocks/commen...
4,2024-08-14 17:34:52,William Hogan Joins Nepra Foods as Chief Execu...,DigitalMan358,2,"VANCOUVER, BC / ACCESSWIRE / August 12, 2024 /...",https://www.reddit.com/r/MicrocapStocks/commen...,,https://www.reddit.com/r/MicrocapStocks/commen...


In [55]:
posts_wallstreetbets = get_reddit_posts("wallstreetbets", num_limit=500)
posts_wallstreetbets.head()

Get 500 posts from wallstreetbets


Unnamed: 0,created_utc,title,author,score,text,url,image_url,permalink
0,2025-04-03 20:30:11,"Started options last week, nuked half my portf...",Blymin,1,I have no idea what I’m doing.\nThis is genuin...,https://www.reddit.com/gallery/1jqtc5a,,https://www.reddit.com/r/wallstreetbets/commen...
1,2025-04-03 20:26:28,Craziest 48 Hours of my Life,StocksAtNight2,8,I have been on an ultra regarded gay bear winn...,https://www.reddit.com/gallery/1jqt8px,,https://www.reddit.com/r/wallstreetbets/commen...
2,2025-04-03 20:25:33,That crash was a let down,Spare-Animal5126,7,I wanted like 20% down on everything just cuz ...,https://www.reddit.com/r/wallstreetbets/commen...,,https://www.reddit.com/r/wallstreetbets/commen...
3,2025-04-03 20:25:05,Is anyone tired of winning yet?,Dizzy-Concert15,3,Asking for a friend,https://www.reddit.com/r/wallstreetbets/commen...,,https://www.reddit.com/r/wallstreetbets/commen...
4,2025-04-03 20:25:00,The Big Dump,Soft-Presentation-66,1,You all really thought the 1% run up on libera...,https://www.reddit.com/gallery/1jqt7f4,,https://www.reddit.com/r/wallstreetbets/commen...


In [57]:
posts_hedgefund = get_reddit_posts("hedgefund", num_limit=500)
posts_hedgefund.head()

Get 500 posts from hedgefund


Unnamed: 0,created_utc,title,author,score,text,url,image_url,permalink
0,2025-04-03 14:42:43,Career outlook for controllers at inv manageme...,paul_reuben,1,Does anyone have thoughts on the market for co...,https://www.reddit.com/r/hedgefund/comments/1j...,,https://www.reddit.com/r/hedgefund/comments/1j...
1,2025-04-02 02:01:38,What's the best way to network as a high schoo...,Sad-Software9263,1,,https://www.reddit.com/r/hedgefund/comments/1j...,,https://www.reddit.com/r/hedgefund/comments/1j...
2,2025-04-01 21:31:09,Fully Dynamic Data Extraction,fin_antics,2,I'm the CEO of a stealth startup building a HF...,https://www.reddit.com/r/hedgefund/comments/1j...,,https://www.reddit.com/r/hedgefund/comments/1j...
3,2025-04-01 16:55:39,Daily Traded Volumes Feeds for Derivatives,FuturesandOptionsFOW,1,Hi everyone! This is a corporate account for a...,https://www.reddit.com/r/hedgefund/comments/1j...,,https://www.reddit.com/r/hedgefund/comments/1j...
4,2025-04-01 02:12:35,Can you break into a hedgefund from a non targ...,Sad-Software9263,15,"Hi, I had a question on whether you could brea...",https://www.reddit.com/r/hedgefund/comments/1j...,,https://www.reddit.com/r/hedgefund/comments/1j...


In [58]:
posts_fintech = get_reddit_posts("fintech", num_limit=500)
posts_fintech.head()

Get 429 posts from fintech


Unnamed: 0,created_utc,title,author,score,text,url,image_url,permalink
0,2025-04-03 13:44:38,Mémoire,Emotional-Bath-5682,1,"Dear Sirs,\nAs part of our master's thesis pre...",https://www.reddit.com/r/fintech/comments/1jqi...,,https://www.reddit.com/r/fintech/comments/1jqi...
1,2025-04-03 12:24:47,Launched a Free API That Converts Raw Credit R...,creditparsepro,1,Hello r/fintech!\n\n​If you're looking to enha...,https://www.reddit.com/r/fintech/comments/1jqh...,,https://www.reddit.com/r/fintech/comments/1jqh...
2,2025-04-03 06:33:47,Which Open Banking API Is Best for Your Fintec...,Apprehensive-Bag5639,1,,https://www.fintegrationfs.com/post/plaid-vs-t...,,https://www.reddit.com/r/fintech/comments/1jqb...
3,2025-04-03 03:49:42,Trying to find Product Market Fit,RicoFleer,0,What services that your bank or any other fint...,https://www.reddit.com/r/fintech/comments/1jq8...,,https://www.reddit.com/r/fintech/comments/1jq8...
4,2025-04-02 20:13:22,Question for finance pros: what’s the hardest ...,Pixelated-Paradox,1,"Hi all,\nI’m working on a tool to help finance...",https://www.reddit.com/r/fintech/comments/1jpy...,,https://www.reddit.com/r/fintech/comments/1jpy...


In [4]:
posts_MicrocapStocks = get_reddit_posts("investing", num_limit=500)
posts_MicrocapStocks.head()

Get 500 posts from investing


Unnamed: 0,created_utc,title,author,score,text,url,image_url,permalink
0,2025-04-10 19:21:12,FZDXX question for future house down payment,InvestmentAdvice2024,1,I am looking for feedback or thoughts on what ...,https://www.reddit.com/r/investing/comments/1j...,,https://www.reddit.com/r/investing/comments/1j...
1,2025-04-10 19:08:38,Anyone else adjust their investing strategies?,dezahpp,1,"Ever since Trump took office, I've lost a bunc...",https://www.reddit.com/r/investing/comments/1j...,,https://www.reddit.com/r/investing/comments/1j...
2,2025-04-10 18:44:04,Final-Lock789 SCAM sigh... I was so excited,username48378645,0,"Oh man, I'm so disappointed. I thought I've fi...",https://www.reddit.com/r/investing/comments/1j...,,https://www.reddit.com/r/investing/comments/1j...
3,2025-04-10 18:38:28,Looking to get back in eventually... But when?,jonwb1,0,Hi... Before my SWTSX Index Fund lost half its...,https://www.reddit.com/r/investing/comments/1j...,,https://www.reddit.com/r/investing/comments/1j...
4,2025-04-10 18:31:15,How the hell are people saying their 401k are ...,Over-Concern3522,149,I’ve been seeing so much fear across the board...,https://www.reddit.com/r/investing/comments/1j...,,https://www.reddit.com/r/investing/comments/1j...


# X

In [5]:
import tweepy
import requests
import pandas as pd
import os

In [None]:
BEARER_TOKEN = ""

In [None]:
def get_X_posts(community_name):
    client = tweepy.Client(bearer_token=BEARER_TOKEN, wait_on_rate_limit=True)
    query = f"{community_name} -is:retweet"

    # Search tweets (recent 7 days for free tier)
    response = client.search_recent_tweets(
        query=query,
        max_results=100,
        tweet_fields=["created_at", "author_id", "text", "attachments"],
        expansions=["attachments.media_keys"],
        media_fields=["url", "type"]
    )

    # Preplare image URLs
    media_lookup = {}
    if "media" in response.includes:
        for media in response.includes["media"]:
            if media.type == "photo":
                media_lookup[media.media_key] = media.url

    # Parse tweets
    data = []
    for tweet in response.data:
        image_urls = []

        if tweet.attachments and "media_keys" in tweet.attachments:
            for key in tweet.attachments["media_keys"]:
                if key in media_lookup:
                    url = media_lookup[key]
                    image_urls.append(url)

                    # Download the image
                    img_data = requests.get(url).content
                    filename = f"Figures/tweet_images/{tweet.id}_{key}.jpg"
                    with open(filename, "wb") as f:
                        f.write(img_data)

        data.append({
            "tweet_id": tweet.id,
            "author_id": tweet.author_id,
            "created_at": tweet.created_at,
            "text": tweet.text,
            "image_urls": ", ".join(image_urls) if image_urls else ""
        })

    df = pd.DataFrame(data)
    df.to_csv(f"Posts/X_{community_name}_posts_.csv", index=False)



In [None]:
get_X_posts("MicroCapClub")

In [9]:
get_X_posts("WSJ Markets")

# Combine datasets

In [None]:
r_fintech = pd.read_csv("Posts/reddit_fintech_post.csv")
r_hedgefund = pd.read_csv("Posts/reddit_hedgefund_post.csv")
r_investing = pd.read_csv("Posts/reddit_investing_post.csv")
r_wallstreetbets = pd.read_csv("Posts/reddit_wallstreetbets_post.csv")
r_MicrocapStocks = pd.read_csv("Posts/reddit_MicrocapStocks_post.csv")
x_microcapclub = pd.read_csv("X_MicroCapClub_posts_.csv")
x_wsjmarkets = pd.read_csv("X_WSJ Markets_posts_.csv")

In [15]:
r_fintech.columns

Index(['time', 'title', 'author', 'score', 'text', 'url', 'image_urls',
       'permalink'],
      dtype='object')

In [None]:
r_fintech['text'] = r_fintech['title'] + ' ' + r_fintech['text']
r_hedgefund['text'] = r_hedgefund['title'] + ' ' + r_hedgefund['text']


In [13]:
x_microcapclub.columns

Index(['tweet_id', 'author_id', 'created_at', 'text', 'image_urls'], dtype='object')

In [16]:
r_rename_columns = {
    "created_utc": "time",
    "image_url": "image_urls",
}

r_fintech.rename(columns=r_rename_columns, inplace=True)
r_hedgefund.rename(columns=r_rename_columns, inplace=True)
r_investing.rename(columns=r_rename_columns, inplace=True)
r_wallstreetbets.rename(columns=r_rename_columns, inplace=True)
r_MicrocapStocks.rename(columns=r_rename_columns, inplace=True)

In [19]:
r_fintech['text'] = r_fintech['title'] + ' ' + r_fintech['text']
r_hedgefund['text'] = r_hedgefund['title'] + ' ' + r_hedgefund['text']
r_investing['text'] = r_investing['title'] + ' ' + r_investing['text']
r_wallstreetbets['text'] = r_wallstreetbets['title'] + ' ' + r_wallstreetbets['text']
r_MicrocapStocks['text'] = r_MicrocapStocks['title'] + ' ' + r_MicrocapStocks['text']

In [25]:
r_fintech = r_fintech[['time', 'text', 'image_urls']]
r_hedgefund = r_hedgefund[['time', 'text', 'image_urls']]
r_investing = r_investing[['time', 'text', 'image_urls']]
r_wallstreetbets = r_wallstreetbets[['time', 'text', 'image_urls']]
r_MicrocapStocks = r_MicrocapStocks[['time', 'text', 'image_urls']]

In [17]:
x_renmae_columns = {
    "created_at": "time",
    "author_id": "author",
    "image_urls": "image_urls",
}

x_wsjmarkets.rename(columns=x_renmae_columns, inplace=True)
x_microcapclub.rename(columns=x_renmae_columns, inplace=True)

In [18]:
x_wsjmarkets.columns

Index(['tweet_id', 'author', 'time', 'text', 'image_urls'], dtype='object')

In [26]:
x_wsjmarkets = x_wsjmarkets[['time', 'text', 'image_urls']]
x_microcapclub = x_microcapclub[['time', 'text', 'image_urls']]

In [27]:
r_fintech['source'] = 'reddit/fintech'
r_hedgefund['source'] = 'reddit/hedgefund'
r_investing['source'] = 'reddit/investing'
r_wallstreetbets['source'] = 'reddit/wallstreetbets'
r_MicrocapStocks['source'] = 'reddit/MicrocapStocks'
x_microcapclub['source'] = 'X/MicroCapClub'
x_wsjmarkets['source'] = 'X/WSJ Markets'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  r_fintech['source'] = 'reddit/fintech'


In [29]:
combined_df = pd.concat([r_fintech, r_hedgefund, r_investing, r_wallstreetbets, r_MicrocapStocks, x_microcapclub, x_wsjmarkets], ignore_index=True)

In [32]:
combined_df.to_csv("combined_posts.csv", index=False)

# Content Analysis

In [3]:
from openai import OpenAI
import pandas as pd

In [4]:
combined_df = pd.read_csv("combined_posts.csv")

In [5]:
posts_content = combined_df[['text','source']]

In [6]:
posts_json = posts_content.to_json(orient="records", lines=True)

In [None]:
def get_response(posts):
    client = OpenAI(api_key="")
    prompt = f"""
        Your job is to read the following social media posts about finance. 
        Based on the posts, please summarize the general topics discussed in the posts.
        Please only give less than or equal to 5 general topics, such as Idea, Econmoic News and Opinions, Market Trends and Analysis, Regulary and Policy Discussions, Technology, and so on.
        Notice that the topics should be general and not too specific.
        Don't give any other information. Don't give any introduction or conclusion.

        Posts:
        {posts}
        """
    response = client.chat.completions.create(
                        model="gpt-4o",
                        messages=[{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": prompt}],)
                    
    responses = response.choices[0].message.content
    return responses

In [10]:
import math

In [18]:
def chunk_dataframe(df, chunk_size=100):
    num_chunks = math.ceil(len(df) / chunk_size)
    return [df[i * chunk_size:(i + 1) * chunk_size] for i in range(num_chunks)]


In [19]:
chunks = chunk_dataframe(posts_content)


In [20]:
for i, chunk in enumerate(chunks):
    chunk_json = chunk.to_json(orient="records", lines=True)
    print(f"Processing chunk {i + 1}/{len(chunks)}...")
    response = get_response(chunk_json) 
    print(response)
    with open("keywords.txt", "a") as file:
        file.write(response)



Processing chunk 1/23...
1. Technology
2. Market Trends and Analysis
3. Regulation and Policy Discussions
4. Startups and Entrepreneurship
5. Financial Services and Solutions
Processing chunk 2/23...
1. Technology and Innovation: Discussions around AI, digital wallets, OCR tools, blockchain, and no-code solutions for fintech applications.
2. Career and Job Market: Advice and inquiries about career paths in fintech, product management, sales, compliance roles, and opportunities for students and recent graduates.
3. Entrepreneurship and Startups: Challenges and ideas for fintech startups, including securing partnerships, navigating regulations, and finding niche solutions.
4. Regulatory and Compliance: Navigating legal requirements, compliance challenges, and managing regulatory updates in the fintech industry.
5. Financial Services and Products: Insights on financial data APIs, payment systems, cross-border remittances, and emerging fintech trends such as digital lending and investment 

1. Market Trends, Economic News, and Analysis
2. Investment
3. Technology and Innovation in Finance
4. Regulatory, Compliance, and Policy Discussions
5. Startups, Entrepreneurship, and Business Development
6. Careers, Education, and Professional Development

# Get Sample Questions

In [None]:
from openai import OpenAI
import pandas as pd
import re

In [None]:
combined_df = pd.read_csv("combined_posts.csv")

In [None]:
categories1 = ["Fintech", 
              "AI", 
              "Blockchain", 
              "Quantum Tech", 
              "Payments", 
              "Trading Tools", 
              "Digital Wallets",
              "Market Trends",
              "Stock Trading",
              "Options Trading",
              "Hedge Funds",
              "Economic News",
              "Risk Management",
              "Asset Allocation",
              "Investment Strategy",
              "Compliance",
              "Licensing",
              "Financial Policy",
              "Tariffs",
              "Regulation",
              "Personal Finance",
              "Retirement",
              "Tax Strategy",
              "Financial Products"
              "Remittances",
              "Company News",
              "Sector Trends",
              "Earnings",
              "Industry Events",
              ]

categories2 = ["3D Printing",
               "AdTech",
               "Agtech",
               "Artificial Intelligence",
               "Augmented Reality (AR)",
               "Big Data",
               "Blockchain",
               "Cannabis/Medical Marijuana",
               "Clean Technology",
               "Cloud Computing",
               "E-commerce",
               "EdTech",
               "Electric & Hybrid Vehicles",
               "FinTech",
               "HealthTech",
               "Infrastructure",
               "InsureTech",
               "IoT (Internet of Things)",
               "Manufacturing",
               "Mobile Apps",
               "Nanotechnology",
               "Real Estate Tech",
               "RegTech",
               "Research (Non-Medical)",
               "Robotics",
               "Social Media",
               "Urban Planning",
               "Virtual Reality (VR)",
               "Wearables & Quantified Self"
            ]

In [None]:
def classify_post(post):
    try:
        client = OpenAI(api_key="")
        prompt = f"""
            Your job is to classify the following post into a category. 
            Please choose the most relevant category from the following list:
            {categories2}
            Please only give one category and its name. Don't give any other information. Don't give any introduction or conclusion.
            
            Post:
            {post}
            """
        response = client.chat.completions.create(
                            model="gpt-4o",
                            messages=[{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": prompt}],)
                        
        responses = response.choices[0].message.content
        for category in categories2:
            if category in responses:
                return category
        return "Unknown"
    except Exception as e:
        print(f"Error processing post: {post}")
        print(f"Error: {e}")
        return "Error"

In [None]:
combined_df['category2'] = combined_df['text'].apply(classify_post)

In [None]:
def get_sample_questions(posts, category):
    client = OpenAI(api_key="")
    prompt = f"""
        Your job is to analyze the following posts related to the topic of {category}.
        Please provide a list of sample questions that can be asked about this topic. These questions can be related to the posts but better not to specific posts.
        Please only give the questions. Don't give any other information. Don't give any introduction or conclusion.
        
        Post:
        {posts}
        """
    response = client.chat.completions.create(
                        model="gpt-4o",
                        messages=[{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": prompt}],)
                    
    responses = response.choices[0].message.content
    return responses

In [None]:
def process_posts_by_category(df, category_label):
    grouped = df.groupby(category_label)

    for category, group in grouped:
        print(f"Processing category: {category}")
        with open("sample_questions.txt", "a") as file:
            file.write("\n====================================\n")
            file.write(f"Category: {category}\n")
            
        posts = group['text'].tolist()

        chunk_size = 60
        for i in range(0, len(posts), chunk_size):
            chunk = posts[i:i + chunk_size]
            try:
                chunk = [str(post) for post in chunk if pd.notnull(post)]
                chunk_text = "\n".join(chunk)
                chunk_questions = get_sample_questions(chunk_text, category)
                with open("sample_questions.txt", "a") as file:
                    file.write(chunk_questions)
            except Exception as e:
                print(f"Error processing category {category}, chunk {i // chunk_size + 1}: {e}")



In [None]:
process_posts_by_category(combined_df, "category2")


In [None]:
def process_questions(file_name):
    file_path = f"{file_name}.txt"
    categories = []
    questions = []
    with open(file_path, "r") as file:
        current_category = None
        for line in file:
            line = line.strip()
            if line.startswith("="):
                continue
            elif line.startswith("Category:"):
                current_category = line.replace("Category:", "").strip()
            elif line and current_category:
                question = re.sub(r"^\d+[\.\-]?\s*", "", line)
                categories.append(current_category)
                questions.append(question)

    df = pd.DataFrame({"category": categories, "questions": questions})
    output_csv_path = f"{file_name}.csv"
    df.to_csv(output_csv_path, index=False)


In [None]:
process_questions("sample_questions_2")

In [None]:
def select_questions_prompt(quesitons, category):
    client = OpenAI(api_key="")
    prompt = f"""
        Your job is to select or generate 10 best or most frequent questions related to the topic of {category} from the following questions.
        Please provide a list of 10 sample questions from the following questions.
        Please only give the questions. Don't give any other information. Don't give any introduction or conclusion.
        
        Post:
        {quesitons}
        """
    response = client.chat.completions.create(
                        model="gpt-4o",
                        messages=[{"role": "system", "content": "You are a helpful assistant."}, {"role": "user", "content": prompt}],)
                    
    responses = response.choices[0].message.content
    return responses

def select_questions(file_path, category_label):
    df = pd.read_csv(f"sample_questions/{file_path}")
    grouped = df.groupby("category")

    for category, group in grouped:
        print(f"Processing category: {category}")
        with open(f"sample_questions_{file_path}_filtered.txt", "a") as file:
            file.write("\n====================================\n")
            file.write(f"Category: {category}\n")
            
        questions = group['questions'].tolist()

        select_questions = select_questions_prompt(questions, category)
        with open(f"sample_questions_{file_path}_filtered.txt", "a") as file:
            file.write(select_questions)
            file.write("\n")

    process_questions(f"sample_questions_{file_path}_filtered")