In [None]:
# NOTE(review): `data` is not defined in any cell visible in this notebook;
# presumably it is a DataFrame left over from an earlier session. Confirm, or
# this cell raises NameError on a fresh kernel (the later cells use `df`).
# If `data` is a pandas DataFrame, this displays the missing-value count per column.
data.isnull().sum()

In [None]:
import pandas as pd

# Load the Reddit posts CSV into a DataFrame.
df = pd.read_csv('reddit_combined_posts.csv')

# First rows, for a quick sanity check of the contents.
print(df.head())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
df.info()

# Missing-value count per column.
print(df.isnull().sum())

# Drop fully duplicated rows, then rows with no title, since the later cells
# process the 'Title' column.
df = df.drop_duplicates()
df = df.dropna(subset=['Title'])

print(f"Dataset after cleaning has {len(df)} entries.")


In [None]:
import spacy

# spaCy pipeline used by preprocess_text_spacy below.
nlp = spacy.load("en_core_web_sm")

# Reuse the `df` built by the cleaning cell above instead of re-reading the CSV:
# reloading here silently discarded that cell's drop_duplicates() step, so
# duplicate posts flowed into every later feature and the model.
# Both cleaning steps are re-applied (no-ops on already-clean data) so this cell
# stays idempotent and guarantees 'Title' has no missing values.
df = df.drop_duplicates().dropna(subset=['Title'])

def preprocess_text_spacy(text):
    """Return the lemmas of the alphabetic, non-stop-word tokens in `text`.

    The text is run through the module-level `nlp` pipeline; tokens flagged as
    stop words or as non-alphabetic are skipped, and the lemmas of the rest are
    joined with single spaces.
    """
    kept_lemmas = []
    for tok in nlp(text):
        if tok.is_alpha and not tok.is_stop:
            kept_lemmas.append(tok.lemma_)
    return " ".join(kept_lemmas)

# Run every title through preprocess_text_spacy and keep the result next to
# the original text.
df['Cleaned_Title'] = df['Title'].map(preprocess_text_spacy)

# Show raw vs. cleaned titles side by side for the first rows.
print(df.loc[:, ['Title', 'Cleaned_Title']].head())


SENTIMENT ANALYSIS

In [None]:
from textblob import TextBlob

# Score each cleaned title with TextBlob and store its polarity value.
df['Sentiment_Polarity'] = df['Cleaned_Title'].map(
    lambda title: TextBlob(title).sentiment.polarity
)

print(df.loc[:, ['Cleaned_Title', 'Sentiment_Polarity']].head())

import matplotlib.pyplot as plt
# Histogram of the polarity scores: 20 bins, black bar outlines.
plt.hist(x=df['Sentiment_Polarity'], edgecolor='k', bins=20)
plt.gca().set(
    title='Distribution of Sentiment Polarity',
    xlabel='Sentiment Polarity',
    ylabel='Frequency',
)
plt.show()


EXTRACTING FEATURES RELEVANT TO STOCK MARKET

In [None]:
# Number of whitespace-separated words in each cleaned title.
df['Word_Count'] = df['Cleaned_Title'].map(lambda title: len(title.split()))

# Z-score of the upvote counts: subtract the column mean, divide by its std.
df['Normalized_Upvotes'] = df['Upvotes'].sub(df['Upvotes'].mean()).div(df['Upvotes'].std())

# Comments per upvote; the +1 in the denominator avoids dividing by zero
# for posts with 0 upvotes.
df['Comment_Upvote_Ratio'] = df['Comments'].div(df['Upvotes'].add(1))

print(
    df.loc[
        :, ['Cleaned_Title', 'Word_Count', 'Normalized_Upvotes', 'Comment_Upvote_Ratio']
    ].head()
)

import seaborn as sns
import matplotlib.pyplot as plt

# Annotated heatmap of the correlation matrix of the four numeric features.
plt.figure(figsize=(8, 6))
sns.heatmap(
    data=df[
        ['Sentiment_Polarity', 'Word_Count', 'Normalized_Upvotes', 'Comment_Upvote_Ratio']
    ].corr(),
    cmap='coolwarm',
    annot=True,
)
plt.title("Feature Correlation Heatmap")
plt.show()


ADDING LABEL COLUMN

In [None]:
def create_labels(row):
    """Map a row's sentiment polarity to a three-class label.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``)
        Must provide a numeric value under the key 'Sentiment_Polarity'.

    Returns
    -------
    int
        1 if polarity > 0.1 (positive), -1 if polarity < -0.1 (negative),
        otherwise 0 (neutral). The boundaries +/-0.1 themselves are neutral,
        and a NaN polarity also falls through to 0 because both comparisons
        are False.
    """
    polarity = row['Sentiment_Polarity']
    if polarity > 0.1:
        return 1  # Positive
    if polarity < -0.1:
        return -1  # Negative
    # This return used to sit inside the `elif` branch after `return -1`, where
    # it was unreachable, so neutral rows were labelled None instead of 0.
    return 0  # Neutral

# Label every row by passing it (row-wise) through create_labels.
df['Label'] = df.apply(create_labels, axis='columns')
print("Label column created successfully!")
print(df.loc[:, ['Title', 'Sentiment_Polarity', 'Label']].head())


PREPARING THE MODEL AND GENERATING THE CONFUSION MATRIX

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import seaborn as sns

# Model inputs and the column to predict.
# NOTE(review): 'Label' is computed purely from thresholds on
# 'Sentiment_Polarity' (see create_labels), and that same column is used as a
# feature here, so the classifier can recover the label directly from it.
# Confirm this is intended before reading much into the accuracy below.
features = [
    'Sentiment_Polarity',
    'Word_Count',
    'Normalized_Upvotes',
    'Comment_Upvote_Ratio',
]
target = 'Label'

X, y = df[features], df[target]

# Hold out 20% of the rows for evaluation; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Random forest with 100 trees, seeded for reproducibility.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

y_pred = rf_model.predict(X_test)

accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy * 100:.2f}%")

# Fixed class order shared by the report and the confusion matrix.
# Without an explicit `labels=` argument scikit-learn infers the classes from
# the values present in y_test / y_pred; if one class is missing from the test
# split, classification_report rejects the three target_names and the
# confusion matrix shrinks, so the three tick labels no longer line up.
class_labels = [-1, 0, 1]
class_names = ['Negative', 'Neutral', 'Positive']

print("\nClassification Report:")
print(classification_report(y_test, y_pred, labels=class_labels, target_names=class_names))

# Rows are true classes, columns are predicted classes, both in class_labels order.
conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
plt.figure(figsize=(6, 4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.title("Confusion Matrix")
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.show()


SAVING THE MODEL

In [None]:
import pickle
# Persist the trained classifier. The context manager closes (and flushes) the
# file even if pickling raises; the previous bare open(...) never closed it.
# Only load this file back from a trusted location: unpickling can execute
# arbitrary code.
with open('Stock_model.pkl', 'wb') as model_file:
    pickle.dump(rf_model, model_file)

print("model successfully saved")

