DATA SCRAPING

In [None]:
pip install praw

In [None]:
import os

import praw

# Credentials come from the environment so they never get committed with the
# notebook. Set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before running.
# NOTE(review): the key pair that used to be hardcoded here should be treated
# as leaked and revoked in the Reddit app settings.
_missing_vars = [
    var for var in ('REDDIT_CLIENT_ID', 'REDDIT_CLIENT_SECRET')
    if not os.environ.get(var)
]
if _missing_vars:
    raise RuntimeError(
        "Missing Reddit API credentials; set environment variable(s): "
        + ", ".join(_missing_vars)
    )

reddit = praw.Reddit(client_id=os.environ['REDDIT_CLIENT_ID'],
                     client_secret=os.environ['REDDIT_CLIENT_SECRET'],
                     user_agent='StockDataScraper v1.0')

subreddits = ['stocks', 'investing', 'wallstreetbets', 'stockmarket', 'financialindependence']

# Print the top posts of each subreddit (up to 500 requested per subreddit).
for subreddit_name in subreddits:
    subreddit = reddit.subreddit(subreddit_name)

    print(f"Fetching posts from subreddit: {subreddit_name}")

    top_posts = subreddit.top(limit=500)

    for post in top_posts:
        print(f"Title: {post.title}")
        print(f"Text: {post.selftext}")
        print(f"Score: {post.score}")
        print(f"URL: {post.url}")
        print('-' * 80)


In [None]:
# Search every subreddit for "stock" and collect one flat list of post records.
subreddits = ['stocks', 'investing', 'StockMarket', 'wallstreetbets', 'finance']

all_posts = []

for subreddit in subreddits:
    print(f"Fetching posts from r/{subreddit}...")
    search_hits = reddit.subreddit(subreddit).search('stock', limit=2000)
    all_posts.extend(
        {
            'Title': hit.title,
            # Posts without an author object are recorded as 'N/A'.
            'Author': hit.author.name if hit.author else 'N/A',
            'Upvotes': hit.score,
            'Comments': hit.num_comments,
            'Created': hit.created_utc,
            'URL': hit.url,
        }
        for hit in search_hits
    )


In [None]:

import pandas as pd

# Turn the scraped records into a table and persist it, so later cells can
# reload the data from disk instead of re-scraping.
df = pd.DataFrame(all_posts)
df.to_csv('reddit_combined_posts.csv', index=False)

post_count = len(df)
print(f"Collected {post_count} posts across all subreddits.")


Collected 1157 posts across all subreddits.


In [None]:
import pandas as pd

# Read the file from the same relative path the scraping cell wrote it to and
# the later cells read it from; the previous absolute '/content/...' path only
# resolved on Colab.
data = pd.read_csv('reddit_combined_posts.csv')
data.head()

EXPLORATORY DATA ANALYSIS

In [None]:
# Number of missing values in each column of the loaded posts.
data.isnull().sum()

Unnamed: 0,0
Title,0
Author,0
Upvotes,0
Comments,0
Created,0
URL,0


In [None]:
import pandas as pd

df = pd.read_csv('reddit_combined_posts.csv')

# Inspect the raw table: first rows, column dtypes, and null counts per column.
print(df.head())
print(df.info())
print(df.isnull().sum())

# Remove exact duplicate rows, then rows that have no title.
df = df.drop_duplicates().dropna(subset=['Title'])

remaining_rows = len(df)
print(f"Dataset after cleaning has {remaining_rows} entries.")


APPLYING PREPROCESSING TO TITLE COLUMN

In [None]:
import pandas as pd
import spacy

# spaCy pipeline used by preprocess_text_spacy to tokenize and lemmatize titles.
nlp = spacy.load("en_core_web_sm")

# Reload the scraped posts and discard rows without a title.
df = pd.read_csv('reddit_combined_posts.csv').dropna(subset=['Title'])

def preprocess_text_spacy(text):
    """Return the lemmas of `text`'s alphabetic, non-stop-word tokens, space-joined.

    The text is run through the module-level spaCy pipeline `nlp`; tokens that
    are stop words or contain non-alphabetic characters are dropped.
    """
    kept_lemmas = []
    for tok in nlp(text):
        if tok.is_stop or not tok.is_alpha:
            continue
        kept_lemmas.append(tok.lemma_)
    return " ".join(kept_lemmas)

# Store the preprocessed version of every title in a new column.
df['Cleaned_Title'] = df['Title'].apply(preprocess_text_spacy)

# Show original and cleaned titles side by side for a quick visual check.
print(df[['Title', 'Cleaned_Title']].head())


SENTIMENT ANALYSIS

In [None]:
from textblob import TextBlob


def _title_polarity(cleaned_title):
    """TextBlob sentiment polarity of one cleaned title."""
    return TextBlob(cleaned_title).sentiment.polarity


df['Sentiment_Polarity'] = df['Cleaned_Title'].apply(_title_polarity)

print(df[['Cleaned_Title', 'Sentiment_Polarity']].head())

import matplotlib.pyplot as plt

# Histogram of the polarity scores across all titles.
fig, ax = plt.subplots()
ax.hist(df['Sentiment_Polarity'], bins=20, edgecolor='k')
ax.set_title('Distribution of Sentiment Polarity')
ax.set_xlabel('Sentiment Polarity')
ax.set_ylabel('Frequency')
plt.show()


EXTRACTING FEATURES RELEVANT TO STOCK MARKET

In [None]:
# Number of whitespace-separated tokens in each cleaned title.
df['Word_Count'] = df['Cleaned_Title'].apply(lambda title: len(title.split()))

# Z-score of the upvote counts (mean-centred, scaled by the standard deviation).
upvotes = df['Upvotes']
df['Normalized_Upvotes'] = (upvotes - upvotes.mean()) / upvotes.std()

# Comments per upvote; the +1 avoids dividing by zero when a post has 0 upvotes.
df['Comment_Upvote_Ratio'] = df['Comments'] / (upvotes + 1)

preview_columns = ['Cleaned_Title', 'Word_Count', 'Normalized_Upvotes', 'Comment_Upvote_Ratio']
print(df[preview_columns].head())

import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations between the numeric features.
numeric_features = ['Sentiment_Polarity', 'Word_Count', 'Normalized_Upvotes', 'Comment_Upvote_Ratio']
feature_corr = df[numeric_features].corr()

plt.figure(figsize=(8, 6))
sns.heatmap(feature_corr, annot=True, cmap='coolwarm')
plt.title("Feature Correlation Heatmap")
plt.show()


ADDING LABEL COLUMN

In [None]:
def create_labels(row):
    """Map a row's sentiment polarity to a three-way class label.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide a numeric 'Sentiment_Polarity' entry.

    Returns
    -------
    int
        1 when polarity > 0.1 (positive), -1 when polarity < -0.1 (negative),
        and 0 otherwise (neutral).
    """
    polarity = row['Sentiment_Polarity']
    if polarity > 0.1:
        return 1  # Positive label
    if polarity < -0.1:
        return -1  # Negative label
    # The neutral return used to be nested under the negative branch, after its
    # own return, so it never ran and neutral rows came back as None.
    return 0  # Neutral label

# Label every row by passing it to create_labels (axis=1 applies row-wise).
df['Label'] = df.apply(create_labels, axis=1)
print("Label column created successfully!")
print(df[['Title', 'Sentiment_Polarity', 'Label']].head())


Label column created successfully!
                                               Title  Sentiment_Polarity  \
0             Which stock is hidding in plain sight?           -0.214286   
1  I think Alphabet (GOOGL) is the most undervalu...            0.285714   
2                What’s your favorite stock and why?            0.500000   
3  Starbucks giving incoming CEO Niccol $85M in c...            0.000000   
4  Microsoft announces $60 billion stock buyback ...            0.000000   

   Label  
0     -1  
1      1  
2      1  
3      0  
4      0  


PREPARING THE MODEL AND CONFUSION MATRIX

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from sklearn.model_selection import train_test_split

features = ['Sentiment_Polarity', 'Word_Count', 'Normalized_Upvotes', 'Comment_Upvote_Ratio']
target = 'Label'

# Fixed class order shared by the report and the confusion matrix. Passing it
# explicitly keeps names and rows aligned even when a class happens to be
# absent from the test split (otherwise the three display names are paired
# with however many classes appear, which misaligns or raises).
class_labels = [-1, 0, 1]
class_names = ['Negative', 'Neutral', 'Positive']

# NOTE(review): 'Label' is produced by thresholding 'Sentiment_Polarity', and
# that same column is used as a feature here, so the reported accuracy mostly
# reflects the model recovering those thresholds. Consider dropping the
# feature or labelling from an independent signal.
X = df[features]
y = df[target]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=class_labels,
                            target_names=class_names, zero_division=0))

# Rows are true classes, columns are predicted classes, both in class_labels order.
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names)
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()


SAVING THE MODEL

In [None]:
import pickle

# Serialize the trained classifier. The context manager closes the file even
# if pickling fails, so the handle is never leaked and the data is flushed.
with open('Stock_model.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)

print("model successfully saved")



TESTING ON HISTORICAL DATA

In [None]:
!pip install yfinance pandas matplotlib scikit-learn
import yfinance as yf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report


In [None]:
import yfinance as yf

# Price history for one ticker over the study window.
ticker = "AAPL"
start_date = "2010-01-01"
end_date = "2024-12-31"

# NOTE(review): some yfinance releases may return MultiIndex columns even for a
# single ticker; confirm stock_data['Close'] is a single column before use.
stock_data = yf.download(ticker, start=start_date, end=end_date)

# Report the first and last dates actually present in the download.
first_day = stock_data.index.min()
last_day = stock_data.index.max()
print(first_day, "to", last_day)


In [None]:
import pandas as pd

# Reload the scraped posts into a separate frame for the stock-price analysis.
reddit_data = pd.read_csv('reddit_combined_posts.csv')


# Preview the first rows.
print(reddit_data.head())


CONVERTING THE UNIX TIMESTAMP TO DATETIME

In [None]:
# Interpret 'Created' as Unix seconds, then truncate each timestamp to midnight
# so posts can be matched against date-indexed stock rows.
created_timestamps = pd.to_datetime(reddit_data['Created'], unit='s')
reddit_data['Created_Date'] = created_timestamps.dt.normalize()

print(reddit_data[['Created_Date']].head())


In [None]:
# Truncate every index timestamp to midnight, mirroring the post dates.
stock_data.index = stock_data.index.normalize()

# Preview the first ten index entries.
print(stock_data.index[:10])


CHECKING WHETHER THE REDDIT POSTS ARE NORMALIZED TOO

In [None]:
# Preview the normalized post dates.
print(reddit_data['Created_Date'].head())


CHECKING STOCK DATA IS IN DATETIME FORMAT

In [None]:
# Coerce the index to datetimes (a no-op if it already is one) and truncate to midnight.
stock_data.index = pd.to_datetime(stock_data.index).normalize()

# Preview the first ten index entries.
print(stock_data.index[:10])


In [None]:
# Preview the post dates once more next to the stock index shown above.
print(reddit_data['Created_Date'].head())


In [None]:
def get_closest_stock_price(date, stock_data):
    """Return the 'Close' value of the row whose index entry is nearest to `date`.

    Parameters
    ----------
    date : timestamp-like
        Must support subtraction from the entries of `stock_data.index`.
    stock_data : pandas.DataFrame
        Frame with a datetime index and a 'Close' column.

    Returns
    -------
    The 'Close' entry at the nearest index position. When two index entries
    are equally distant, the one that appears first in the index wins.

    Raises
    ------
    ValueError
        If `stock_data` has no rows.
    """
    if len(stock_data.index) == 0:
        raise ValueError("stock_data is empty; no closing price to look up")
    # One vectorized subtraction replaces a Python-level min() over every
    # timestamp, which matters because this is called once per post.
    distances = abs(stock_data.index - date)
    # argmin returns the first minimum, preserving the original tie-breaking.
    nearest_pos = distances.argmin()
    return stock_data['Close'].iloc[nearest_pos]


In [None]:
# Make both sides timezone-naive so post dates and stock index entries can be
# subtracted from and compared with each other.
reddit_data['Created_Date'] = reddit_data['Created_Date'].dt.tz_localize(None)

stock_data.index = stock_data.index.tz_localize(None)

def get_closest_stock_price(date, stock_data):
    """Return the 'Close' value of the row whose index entry is nearest to `date`.

    Parameters
    ----------
    date : timestamp-like
        Must support subtraction from the entries of `stock_data.index`.
    stock_data : pandas.DataFrame
        Frame with a datetime index and a 'Close' column.

    Returns
    -------
    The 'Close' entry at the nearest index position. When two index entries
    are equally distant, the one that appears first in the index wins.

    Raises
    ------
    ValueError
        If `stock_data` has no rows.
    """
    if len(stock_data.index) == 0:
        raise ValueError("stock_data is empty; no closing price to look up")
    # One vectorized subtraction replaces a Python-level min() over every
    # timestamp, which matters because this is called once per post.
    distances = abs(stock_data.index - date)
    # argmin returns the first minimum, preserving the original tie-breaking.
    nearest_pos = distances.argmin()
    return stock_data['Close'].iloc[nearest_pos]

def _price_for(post_date):
    """Close price for one post date: exact index hit if present, else nearest date."""
    if post_date in stock_data.index:
        return stock_data.loc[post_date, 'Close']
    return get_closest_stock_price(post_date, stock_data)


reddit_data['Stock_Price'] = reddit_data['Created_Date'].apply(_price_for)

print(reddit_data[['Created_Date', 'Stock_Price']].head())


In [None]:
from textblob import TextBlob

# Sentiment polarity of each title, in [-1, 1]. This column used to be filled
# with unseeded random numbers, which made the feature pure noise and every
# run different.
reddit_data['Sentiment_Score'] = reddit_data['Title'].apply(
    lambda title: TextBlob(title).sentiment.polarity
)

reddit_data['Word_Count'] = reddit_data['Title'].apply(lambda x: len(x.split()))

# Scale engagement counts by the largest value in each column.
reddit_data['Normalized_Upvotes'] = reddit_data['Upvotes'] / reddit_data['Upvotes'].max()
reddit_data['Normalized_Comments'] = reddit_data['Comments'] / reddit_data['Comments'].max()


def _encode_movement(price_change):
    """Encode a price difference: 1 = up, 0 = down, 2 = unchanged or unknown."""
    if price_change > 0:
        return 1
    if price_change < 0:
        return 0
    # Zero change, or NaN for the first row, which has no previous price.
    return 2


# Drop rows without a price first so differences are only taken between valid
# prices; .copy() makes the column assignments below write to an independent
# frame rather than a possible view.
reddit_data = reddit_data.dropna(subset=['Stock_Price']).copy()

# NOTE(review): the difference is taken between consecutive rows in file
# order, not chronological order; confirm that is the intended comparison.
reddit_data['Stock_Movement'] = reddit_data['Stock_Price'].diff().apply(_encode_movement)

# 'Stock_Movement' is already in the final encoding. Re-mapping it (as was done
# before) turned "unchanged" into 1 (up) and "down" into 2 (neutral), so class
# 0 never appeared in the labels.
reddit_data['Stock_Movement_Label'] = reddit_data['Stock_Movement']

print(reddit_data.head())


In [None]:
from sklearn.model_selection import train_test_split

# Model inputs and the movement label to predict.
feature_columns = ['Sentiment_Score', 'Word_Count', 'Normalized_Upvotes', 'Normalized_Comments', 'Stock_Price']
X = reddit_data[feature_columns]
y = reddit_data['Stock_Movement_Label']

# Hold out 20% of the rows for evaluation; the seed fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

print(f"Training data size: {X_train.shape}")
print(f"Test data size: {X_test.shape}")


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Fit a 100-tree random forest on the training split (fit returns the estimator).
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out split.
y_pred = rf_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_report = classification_report(y_test, y_pred)

print(f"Accuracy: {test_accuracy}")
print("Classification Report:")
print(test_report)


In [None]:
# One hand-written example with the same feature columns the model was fitted on.
new_reddit_post = {
    'Sentiment_Score': 0.3,
    'Word_Count': 120,
    'Normalized_Upvotes': 0.7,
    'Normalized_Comments': 0.5,
    'Stock_Price': 150
}
new_df = pd.DataFrame([new_reddit_post])

predicted_movement = rf_model.predict(new_df)

# Translate the single predicted class code into a message; any code other
# than 1 or 0 is reported as neutral.
message_by_code = {
    1: "Predicted Stock Movement: Up",
    0: "Predicted Stock Movement: Down",
}
print(message_by_code.get(predicted_movement[0], "Predicted Stock Movement: Neutral"))


In [None]:
import matplotlib.pyplot as plt

# The rows are in scrape order (several subreddit searches appended one after
# another), so sort by date first; otherwise the line zig-zags back and forth
# across the x-axis.
plot_data = reddit_data.sort_values('Created_Date')

fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(plot_data['Created_Date'], plot_data['Stock_Price'], label='Stock Price', color='blue')

# The points are coloured by 'Stock_Movement_Label', i.e. the labels derived
# from price differences, not by model output, so the legend and title say so.
ax.scatter(plot_data['Created_Date'], plot_data['Stock_Price'],
           c=plot_data['Stock_Movement_Label'], cmap='coolwarm',
           label='Stock Movement Label')

ax.set_title('Stock Prices and Stock Movement Labels')
ax.set_xlabel('Date')
ax.set_ylabel('Stock Price')
ax.legend()
plt.show()
