In [None]:
pip install praw

***Data Scraping***

In [None]:
import os

import praw

# Reddit API credentials are read from the environment instead of being stored
# in the notebook. Set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before running
# this cell; a missing variable raises KeyError right here.
# NOTE(review): any credentials that were ever committed in this notebook
# should be considered leaked and revoked in the Reddit app settings.
reddit = praw.Reddit(client_id=os.environ['REDDIT_CLIENT_ID'],
                     client_secret=os.environ['REDDIT_CLIENT_SECRET'],
                     user_agent='StockDataScraper v1.0')

# Handle for r/stocks, used for the quick sanity check below.
subreddit = reddit.subreddit('stocks')

# Request the ten top posts of the subreddit.
top_posts = subreddit.top(limit=10)

# Print a short summary of each post to confirm that the API access works.
for post in top_posts:
    print(f"Title: {post.title}")
    print(f"Text: {post.selftext}")
    print(f"Score: {post.score}")
    print(f"URL: {post.url}")
    print('-' * 80)


In [None]:
# Subreddits that are searched for the query 'stock'.
subreddits = ['stocks', 'investing', 'StockMarket', 'wallstreetbets', 'finance']

# One dict per submission; turned into a DataFrame by the save cell.
all_posts = []

# The loop variable is deliberately not named `subreddit`, so the Subreddit
# object bound to that name by the first scraping cell is not overwritten
# with a plain string.
for subreddit_name in subreddits:
    print(f"Fetching posts from r/{subreddit_name}...")
    # Ask for at most 400 search results per subreddit.
    posts = reddit.subreddit(subreddit_name).search('stock', limit=400)
    for post in posts:
        all_posts.append({
            'Title': post.title,
            # post.author can be falsy (e.g. missing); fall back to 'N/A'.
            'Author': post.author.name if post.author else 'N/A',
            'Upvotes': post.score,
            'Comments': post.num_comments,
            'Created': post.created_utc,
            'URL': post.url
        })


SAVING THE CSV

In [None]:

import pandas as pd

# Turn the list of scraped post dicts into a table.
df = pd.DataFrame(all_posts)

# Write the table to disk; the row index is left out of the file.
csv_name = 'reddit_combined_posts.csv'
df.to_csv(csv_name, index=False)

n_collected = len(df)
print(f"Collected {n_collected} posts across all subreddits.")


In [None]:
import pandas as pd

# Read the scraped posts back from the same relative path that the save cell
# writes to, rather than an absolute '/content/...' path that only resolves
# when the working directory happens to be /content.
data = pd.read_csv('reddit_combined_posts.csv')
data.head()

In [6]:
# Number of missing values in each column of the reloaded table.
data.isna().sum()

Unnamed: 0,0
Title,0
Author,0
Upvotes,0
Comments,0
Created,0
URL,0


DATA PREPROCESSING

In [None]:
import pandas as pd

# Load the scraped posts written by the save cell.
df = pd.read_csv('reddit_combined_posts.csv')

print(df.head())

# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in print() would add a stray "None" line.
df.info()

print(df.isnull().sum())

# Remove rows that are exact duplicates of an earlier row.
df = df.drop_duplicates()

# A post without a title cannot be used for the text features.
df = df.dropna(subset=['Title'])

print(f"Dataset after cleaning has {len(df)} entries.")


In [None]:
import pandas as pd
import spacy

# Load the spaCy pipeline used by the title preprocessing below.
nlp = spacy.load("en_core_web_sm")

# The CSV is read again here, which throws away any cleaning done on `df`
# before this cell. The same cleaning steps are therefore applied again so
# duplicate rows do not reappear in the modelling data.
df = pd.read_csv('reddit_combined_posts.csv')
df = df.drop_duplicates()
df = df.dropna(subset=['Title'])

def preprocess_text_spacy(text):
    """Return the lemmas of the alphabetic, non-stop-word tokens of ``text``.

    The text is run through the module-level ``nlp`` pipeline. Tokens that are
    flagged as stop words or that are not purely alphabetic are dropped; the
    lemmas of the remaining tokens are joined with single spaces.
    """
    kept_lemmas = []
    for tok in nlp(text):
        if tok.is_alpha and not tok.is_stop:
            kept_lemmas.append(tok.lemma_)
    return " ".join(kept_lemmas)

# Preprocess every title and store the result next to the raw text.
cleaned_titles = df['Title'].apply(preprocess_text_spacy)
df['Cleaned_Title'] = cleaned_titles

# Side-by-side preview of the raw and the cleaned titles.
title_preview = df[['Title', 'Cleaned_Title']].head()
print(title_preview)


CREATING A TARGET COLUMN

In [None]:
def classify_upvotes(upvotes, high_threshold=1000, mid_threshold=500):
    """Bucket an upvote count into one of three class labels.

    Args:
        upvotes: Upvote count of a post.
        high_threshold: Counts strictly greater than this are labelled 1.
            Defaults to 1000.
        mid_threshold: Counts strictly greater than this, but not greater
            than ``high_threshold``, are labelled 0. Defaults to 500.

    Returns:
        1 (positive), 0 (neutral) or -1 (negative). Any value that is not
        greater than ``mid_threshold`` ends up as -1.
    """
    if upvotes > high_threshold:
        return 1  # Positive
    if upvotes > mid_threshold:
        return 0  # Neutral
    return -1  # Negative

# Derive the class label of every post from its upvote count.
upvote_labels = df['Upvotes'].apply(classify_upvotes)
df['Label'] = upvote_labels

# Quick look at how titles, upvotes and labels line up.
label_preview = df.loc[:, ['Title', 'Upvotes', 'Label']].head()
print(label_preview)


In [None]:
# Display the column names currently present in df.
df.columns

HANDLING NULL VALUES IN THE TARGET COLUMN

In [None]:
print("Missing values in each column:")
null_counts = df.isnull().sum()
print(null_counts)

target_column = 'Label'

# Stop early when the label column is missing; otherwise drop the rows that
# have no label.
if target_column not in df.columns:
    raise ValueError(f"Target column '{target_column}' not found in the dataset.")
df = df.dropna(subset=[target_column])

print(f"Dataset shape after dropping null values: {df.shape}")

selected_features = ['Cleaned_Title']
selected_target = target_column

# Text feature plus label, kept in a separate frame for inspection.
model_columns = [*selected_features, selected_target]
df_model = df[model_columns]

print("Data prepared for modeling:")
print(df_model.head())


FEATURE SELECTION

KEEPING ONLY THE NEEDED COLUMNS

In [None]:
# Columns carried forward into feature construction.
selected_columns = ['Cleaned_Title', 'Upvotes', 'Comments', 'Label']
df = df.loc[:, selected_columns]

print("Dataset preview:")
dataset_preview = df.head()
print(dataset_preview)


TEXT PREPROCESSING AND VECTORIZATION

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer settings: at most 5000 features, unigrams and bigrams, English
# stop-word list.
tfidf_options = {
    'max_features': 5000,
    'ngram_range': (1, 2),
    'stop_words': 'english',
}
vectorizer = TfidfVectorizer(**tfidf_options)

# Fit on the cleaned titles and transform them in one step.
cleaned_titles_column = df['Cleaned_Title']
X_text = vectorizer.fit_transform(cleaned_titles_column)

print("TF-IDF feature shape:", X_text.shape)


COMBINING TEXT WITH NUMERICAL VALUES

In [None]:
from scipy.sparse import hstack

# 'Upvotes' is intentionally NOT used as a feature: 'Label' is computed
# directly from 'Upvotes' by classify_upvotes, so feeding it to the model
# would leak the target into the inputs and make the evaluation meaningless.
numerical_features = df[['Comments']].values

# Append the numeric column to the right of the TF-IDF columns. CSR output is
# requested so the combined matrix supports row indexing.
X_combined = hstack([X_text, numerical_features], format='csr')

# Target vector, aligned row by row with X_combined.
y = df['Label']

print("Combined feature shape:", X_combined.shape)


TRAIN-TEST SPLIT

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the seed is fixed at 42.
split_parts = train_test_split(X_combined, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

train_shape = X_train.shape
test_shape = X_test.shape
print("Training set size:", train_shape)
print("Testing set size:", test_shape)


TRAINING THE MODEL AND CONFUSION MATRIX

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Fixed seed (same value as the train/test split) so that re-running the
# notebook rebuilds the same forest and reports the same metrics.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predictions for the held-out rows.
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Rows are true labels and columns are predicted labels, both in the order
# -1, 0, 1 (the three values produced by classify_upvotes).
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred, labels=[-1, 0, 1]))


SAVING THE MODEL

In [None]:
import joblib

# Persist the trained classifier.
joblib.dump(model, 'random_forest_model.pkl')

# Persist the fitted TF-IDF vectorizer as well: the classifier was trained on
# its output columns, so new text can only be turned into compatible features
# with this same fitted vectorizer.
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')


In [None]:
!pip install streamlit

In [None]:

!pip install pyngrok


CREATING THE APP

In [None]:
%%writefile app.py
import streamlit as st
import joblib
import numpy as np
import pandas as pd
import yfinance as yf
from textblob import TextBlob

import pickle

# Load the classifier from the pickle file in the app's working directory.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code; only load model files that come from a trusted source.
model=joblib.load("random_forest_model.pkl")



# Page-level settings come first, followed by the title and the intro text.
st.set_page_config(layout="wide", page_title="Stock Movement Prediction")

st.title("Stock Price Movement Prediction")

intro_text = """
This interactive web application predicts stock price movements based on **user-generated content** and **historical stock data**.
It combines sentiment analysis with market trends to forecast price directions.
"""
st.markdown(intro_text)

# All input widgets live in the sidebar; keep a short alias for it.
sidebar = st.sidebar

# Ticker symbol.
sidebar.header("User Inputs")
stock_symbol = sidebar.text_input("Stock Symbol (e.g., AAPL, TSLA, etc.)", value="AAPL")

# Date range for the price history.
sidebar.header("Select Date Range")
default_start = pd.to_datetime("2020-01-01")
default_end = pd.to_datetime("2024-01-01")
start_date = sidebar.date_input("Start Date", value=default_start)
end_date = sidebar.date_input("End Date", value=default_end)

# Free-text snippet plus the button that triggers its analysis.
sidebar.header("Analyze Custom Sentiment")
user_text = sidebar.text_area("Enter a snippet of text (e.g., a tweet or discussion):", "")
analyze_sentiment_button = sidebar.button("Analyze Sentiment")

# Manual sentiment choice.
sidebar.header("Overall Sentiment")
sentiment_choices = ("Positive", "Negative", "Neutral")
user_sentiment = sidebar.radio("Sentiment on Social Media", sentiment_choices)

# sentiment_score ends up as 1, -1 or 0. It comes from the TextBlob polarity of
# the entered text when the button was pressed and text is present, and from
# the radio selection otherwise.
if not (analyze_sentiment_button and user_text):
    sentiment_score = {"Positive": 1, "Negative": -1}.get(user_sentiment, 0)
else:
    blob = TextBlob(user_text)
    sentiment_polarity = blob.sentiment.polarity
    if sentiment_polarity > 0:
        sentiment_score = 1
        st.sidebar.success(f"Sentiment Analysis Result: Positive (Score: {sentiment_polarity:.2f})")
    elif sentiment_polarity < 0:
        sentiment_score = -1
        st.sidebar.error(f"Sentiment Analysis Result: Negative (Score: {sentiment_polarity:.2f})")
    else:
        sentiment_score = 0
        st.sidebar.info(f"Sentiment Analysis Result: Neutral (Score: {sentiment_polarity:.2f})")

st.header(f"Stock Data for {stock_symbol}")
try:
    # Price history for the chosen symbol and date range.
    stock_data = yf.download(stock_symbol, start=start_date, end=end_date)
    if stock_data.empty:
        st.warning("No stock data found for the given date range.")
    else:
        # Show the last rows of the table and a chart of the closing prices.
        st.write(stock_data.tail())
        st.line_chart(stock_data["Close"], use_container_width=True)
except Exception as fetch_error:
    st.error(f"Error fetching data: {fetch_error}")

st.header("Stock Movement Prediction")
if st.button("Predict Stock Movement"):
    try:
        # The app only has a single input column: the sentiment score.
        features = np.array([[sentiment_score]])

        # Fail with an explanatory message when the loaded model was fitted on
        # a different number of input columns than this app provides, instead
        # of surfacing a low-level shape error from the estimator.
        expected_features = getattr(model, "n_features_in_", None)
        if expected_features is not None and expected_features != features.shape[1]:
            raise ValueError(
                f"the loaded model expects {expected_features} input features but the app "
                f"provides {features.shape[1]}. Retrain the model on the sentiment score "
                "alone, or build the same features that were used for training."
            )

        prediction = model.predict(features)
        # Confidence is the highest class probability of the single input row.
        confidence = max(model.predict_proba(features)[0])

        # Class 1 is reported as UP and class 0 as no clear direction; every
        # other class is reported as DOWN.
        if prediction[0] == 1:
            st.success("Prediction: The stock price is likely to go **UP**.")
        elif prediction[0] == 0:
            st.info("Prediction: No clear direction for the stock price (**NEUTRAL**).")
        else:
            st.error("Prediction: The stock price is likely to go **DOWN**.")

        st.markdown(f"**Model Confidence:** {confidence * 100:.2f}%")

    except Exception as e:
        st.error(f"Error making prediction: {e}")


In [None]:
!ngrok config add-authtoken 2pGIJtOG0dHmK9SQAigPpHztPzi_28Fb1k2WQF5ujVspb4TGz


  RUNNING THE APP

In [None]:
from pyngrok import ngrok

public_url = ngrok.connect(addr="8501", proto="http")
print(f"Access your Streamlit app here: {public_url}")

!streamlit run app.py &>/dev/null &
