# Scraping stock market discussions from Reddit with PRAW

Set Up PRAW

In [None]:
pip install praw

Create a Reddit Instance

In [None]:
import os

import praw

# Credentials are looked up in environment variables so that real secrets do
# not have to be saved inside the notebook. When a variable is not set, the
# original placeholder string is used, so the cell behaves as before.
reddit = praw.Reddit(
    client_id=os.environ.get("REDDIT_CLIENT_ID", "your_client_id"),
    client_secret=os.environ.get("REDDIT_CLIENT_SECRET", "your_client_secret"),
    username=os.environ.get("REDDIT_USERNAME", "your_reddit_username"),
    password=os.environ.get("REDDIT_PASSWORD", "your_reddit_password"),
    user_agent=os.environ.get("REDDIT_USER_AGENT", "your_user_agent"),
)

Choose the subreddits you want to scrape data from, such as "wallstreetbets", "stocks", "investing", and "StockMarket".

In [None]:
# Subreddit names visited by the scraping cells, in this order.
subreddits_to_parse = [
    "wallstreetbets",
    "stocks",
    "investing",
    "StockMarket",
]

In [None]:
import re
from collections import Counter

# Helpers used when filtering candidate stock tickers
def clean_word(word):
    """Strip punctuation from ``word`` and return the result in upper case.

    Every character that is neither a word character nor whitespace is
    removed before the string is upper-cased.
    """
    return re.sub(r'[^\w\s]', '', word).upper()


def containsNumber(text):
    """Return True when ``text`` contains at least one digit character."""
    return any(ch.isdigit() for ch in text)


# Extract data: print each post title and its comments, and collect the
# whitespace-separated words of every comment for the ticker count below.
word_collection = []
for sub in subreddits_to_parse:
    subreddit_instance = reddit.subreddit(sub)
    submissions = subreddit_instance.hot(limit=50)  # up to 50 posts from the "hot" listing
    for submission in submissions:
        print(f"Submission title: {submission.title}")
        # Comments are read in the same pass as the titles: the listing is
        # consumed as it is iterated, so a separate loop over `submissions`
        # afterwards would find nothing left (and would only ever refer to
        # the last subreddit's listing).
        submission.comments.replace_more(limit=0)  # drop unresolved "load more comments" placeholders
        comments = submission.comments.list()  # flattened list of the loaded comments
        for comment in comments:
            print(f"Comment: {comment.body}")
            word_collection.extend(comment.body.split())

# Filter stock tickers
potential_stock_symbols = []
for word in word_collection:
    cleaned_word = clean_word(word)
    # str.isupper() is False for strings without any letters, so tokens that
    # were pure punctuation (now empty) are skipped here; tokens containing
    # digits are skipped by the second condition.
    # NOTE(review): clean_word upper-cases every token, so ordinary words pass
    # this filter too - consider checking against a list of known tickers.
    if cleaned_word.isupper() and not containsNumber(cleaned_word):
        potential_stock_symbols.append(cleaned_word)

# Count frequency
cnt = Counter(potential_stock_symbols)
trending_tickers = [k for k, v in cnt.items() if v > 50]  # keep symbols mentioned more than 50 times
print("Trending Stock Tickers:", trending_tickers)

# Step 1: Data Scraping from Reddit

In [None]:
import pandas as pd

# One record per scraped comment is accumulated here
data = []

# Walk the hot listing of every subreddit and record each comment together
# with the subreddit name and the title / creation time of its post
for sub in subreddits_to_parse:
    hot_posts = reddit.subreddit(sub).hot(limit=50)  # up to 50 posts from the "hot" listing
    for post in hot_posts:
        post.comments.replace_more(limit=0)  # drop unresolved "load more comments" placeholders
        data.extend(
            {
                'subreddit': sub,
                'title': post.title,
                'comment': reply.body,
                'created_utc': post.created_utc,  # timestamp of the post, not of the comment
            }
            for reply in post.comments.list()  # flattened list of the loaded comments
        )

# Turn the records into a table and write it to disk without the row index
df = pd.DataFrame(data)
df.to_csv('reddit_stock_data.csv', index=False)

# Step 2: Data Cleaning and Preprocessing

In [None]:
# Load the scraped comments written by the previous step
df = pd.read_csv('reddit_stock_data.csv')

# Remove rows that repeat the same (title, comment) pair, then rows with no comment text
df = df.drop_duplicates(subset=['title', 'comment']).dropna(subset=['comment'])

# Further cleaning.
# regex=True is required: without it pandas treats the pattern as a literal
# string, so no URL, mention or '#' would be removed.
url_pattern = r'http\S+|www\S+|https\S+'
mention_or_hash_pattern = r'\@\w+|\#'  # "@name" mentions and bare '#' characters
df['comment'] = df['comment'].str.replace(url_pattern, '', case=False, regex=True)
df['comment'] = df['comment'].str.replace(mention_or_hash_pattern, '', case=False, regex=True)

# Step 3: Sentiment Analysis

In [None]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

# Fetch the 'vader_lexicon' resource through NLTK's downloader
nltk.download('vader_lexicon')

# A single analyzer instance is reused for every comment
sia = SentimentIntensityAnalyzer()


def _compound_score(text):
    """Return the 'compound' entry of the analyzer's polarity scores for ``text``."""
    return sia.polarity_scores(text)['compound']


# Score each comment and store the result in a new 'sentiment' column
df['sentiment'] = df['comment'].apply(_compound_score)

# Step 4: Feature Extraction

In [None]:

# Per-comment features: length of the comment text and a flag that is True
# when the sentiment score is strictly positive
df['length'] = df['comment'].apply(len)
df['is_positive'] = df['sentiment'].gt(0)

# Interpret 'created_utc' as seconds since the epoch and make it the index,
# which the time-based resampling below relies on
timestamps = pd.to_datetime(df['created_utc'], unit='s')
df['created_utc'] = timestamps
df.set_index('created_utc', inplace=True)

# One row per calendar day: mean sentiment, mean length, number of positive
# comments; missing values produced by the aggregation are replaced with 0
daily_aggregations = {
    'sentiment': 'mean',
    'length': 'mean',
    'is_positive': 'sum',
}
daily_data = df.resample('D').agg(daily_aggregations).fillna(0)

# Persist the daily table (the date index is written as the first column)
daily_data.to_csv('daily_stock_sentiment.csv')

# Step 5: Prediction Model

In [None]:
# required_libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

# Placeholder target: the labels are drawn at random (0 = Down, 1 = Up), they
# are not derived from market data. A locally seeded generator is used so the
# draw - and every metric computed from it below - is the same on each run.
target_rng = np.random.default_rng(42)
daily_data['price_movement'] = target_rng.integers(0, 2, size=len(daily_data))  # upper bound is exclusive

# Features and Target
X = daily_data[['sentiment', 'length', 'is_positive']]
y = daily_data['price_movement']

# Split data: 20% of the rows are held out for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Candidate classifiers, keyed by the label shown in the results table
models = {
    'Logistic Regression': LogisticRegression(),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Support Vector Machine': SVC(kernel='linear', random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
}

# One dict of metrics per model, in the iteration order of `models`
results = []

for model_name, model in models.items():
    # Fit on the training split, then predict the held-out rows
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Score the predictions; zero_division=0 is passed to precision, recall
    # and F1 exactly as in the individual metric calls
    results.append({
        'Model': model_name,
        'Accuracy': accuracy_score(y_test, y_pred),
        'Precision': precision_score(y_test, y_pred, zero_division=0),
        'Recall': recall_score(y_test, y_pred, zero_division=0),
        'F1 Score': f1_score(y_test, y_pred, zero_division=0),
    })

# Tabulate the per-model metrics and show them
results_df = pd.DataFrame(results)
print("Model Comparison:")
print(results_df)

# The best model is the one in the first row after sorting by F1 score, highest first
ranked_results = results_df.sort_values('F1 Score', ascending=False)
best_model_name = ranked_results.iloc[0]['Model']
best_model = models[best_model_name]

print(f"\nBest Model: {best_model_name}")

# Bar chart of the four metrics, one group of bars per model
metric_columns = ['Accuracy', 'Precision', 'Recall', 'F1 Score']
metrics_by_model = results_df.set_index('Model')[metric_columns]
metrics_by_model.plot(kind='bar', figsize=(10, 6))
plt.title('Model Comparison Metrics')
plt.ylabel('Score')
plt.xlabel('Model')
plt.show()
