In [1]:
# Sentiment Analysis on Reddit Posts
# This notebook demonstrates data scraping, text preprocessing, sentiment scoring, and model training and evaluation for sentiment analysis of Reddit posts.

In [None]:
# Import Libraries
import praw
import pandas as pd
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import config

In [2]:
## 1. Initialize Reddit API
# We use the `praw` library to connect to Reddit and scrape data.

In [None]:
# Create the PRAW client from the credential values exposed by the imported
# `config` module.
_reddit_credentials = {
    "client_id": config.REDDIT_CLIENT_ID,
    "client_secret": config.REDDIT_CLIENT_SECRET,
    "user_agent": config.REDDIT_USER_AGENT,
}
reddit = praw.Reddit(**_reddit_credentials)

In [None]:
## 2. Data Scraping
Define a function to scrape posts from a subreddit based on a keyword. Extract relevant details and store them in a DataFrame.

In [None]:
# Scraping function
def scrape_reddit(subreddit_name, keyword, limit=100):
    """Search a subreddit for `keyword` and return the matching posts as a DataFrame.

    Parameters
    ----------
    subreddit_name
        Name handed to `reddit.subreddit` to select the subreddit.
    keyword
        Query handed to `subreddit.search`.
    limit : int, default 100
        Maximum number of search results to request.

    Returns
    -------
    pandas.DataFrame
        One row per post with the columns `title`, `selftext`, `created_utc`,
        `score` and `comments` (the post's `num_comments`). The columns are
        always present, even when the search yields no posts, so downstream
        column access does not raise `KeyError` on an empty result.
    """
    columns = ["title", "selftext", "created_utc", "score", "comments"]
    subreddit = reddit.subreddit(subreddit_name)
    posts = [
        {
            "title": post.title,
            "selftext": post.selftext,
            "created_utc": post.created_utc,
            "score": post.score,
            "comments": post.num_comments,
        }
        for post in subreddit.search(keyword, limit=limit)
    ]
    # Passing `columns` fixes the schema; without it an empty list produces a
    # DataFrame that has no columns at all.
    return pd.DataFrame(posts, columns=columns)

# Pull up to 500 "stocks" search results from r/wallstreetbets, then add a
# datetime column derived from the epoch-seconds `created_utc` values.
df = scrape_reddit("wallstreetbets", "stocks", limit=500)
df = df.assign(created_date=pd.to_datetime(df['created_utc'], unit='s'))

# Keep a copy of the raw scrape on disk and preview the first rows
df.to_csv("reddit_data.csv", index=False)
df.head()

In [None]:
## 3. Preprocessing and Sentiment Analysis
Perform the following steps:
1. Preprocess text data.
2. Apply sentiment analysis using `SentimentIntensityAnalyzer`.
3. Categorize sentiments into positive, negative, or neutral.

In [None]:
# Preprocessing
# Download every NLTK resource this notebook uses:
#   - "vader_lexicon": lexicon loaded by SentimentIntensityAnalyzer
#   - "punkt" / "punkt_tab": tokenizer data needed by nltk.word_tokenize,
#     which preprocess_text calls (newer NLTK releases look up "punkt_tab",
#     older ones "punkt"); without it word_tokenize raises LookupError.
for _resource in ("vader_lexicon", "punkt", "punkt_tab"):
    nltk.download(_resource)
sia = SentimentIntensityAnalyzer()

def preprocess_text(text):
    """Lowercase `text`, tokenize it, and keep only the alphanumeric tokens.

    Returns the surviving tokens joined by single spaces. Any input that is
    not a `str` yields an empty string.
    """
    if not isinstance(text, str):
        return ""
    tokens = nltk.word_tokenize(text.lower())
    alnum_tokens = (token for token in tokens if token.isalnum())
    return " ".join(alnum_tokens)

# Combine title and body. fillna("") keeps a post that is missing one of the
# two fields from turning the whole concatenation into NaN.
combined_text = df['title'].fillna("") + " " + df['selftext'].fillna("")
df['cleaned_text'] = combined_text.apply(preprocess_text)

# Sentiment analysis
# Score the original text rather than `cleaned_text`: preprocess_text drops
# every non-alphanumeric token, including the "n't" token that word_tokenize
# produces for contractions, so "don't like" would be scored as "do like".
# VADER also uses capitalization and punctuation as intensity cues, which the
# cleaning step removes.
df['sentiment'] = combined_text.apply(lambda text: sia.polarity_scores(text)['compound'])

# VADER's documented cut-offs are inclusive: compound >= 0.05 is positive,
# compound <= -0.05 is negative, anything in between is neutral.
df['sentiment_category'] = df['sentiment'].apply(
    lambda score: 'positive' if score >= 0.05 else ('negative' if score <= -0.05 else 'neutral')
)

# Save processed data
df.to_csv("processed_reddit_data.csv", index=False)
df.head()

In [None]:
## 4. Feature Engineering
Prepare features for model training and encode sentiment categories as numerical labels.

In [None]:
# Feature engineering
# Encode the three sentiment categories as integer class labels.
df['sentiment_label'] = df['sentiment_category'].map({'negative': 0, 'neutral': 1, 'positive': 2})

# `sentiment_label` is a pure threshold of the `sentiment` score, so keeping
# `sentiment` among the features leaks the target: the classifier only has to
# relearn the two cut-offs and the reported accuracy is meaningless.
# Train on the engagement features only.
X = df[['score', 'comments']]
y = df['sentiment_label']

In [None]:
## 5. Model Training and Evaluation
Split the data, train a Random Forest model, and evaluate its performance.

In [None]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit a 100-tree random forest on the training portion
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X_train, y_train)

# Predict the held-out rows
y_pred = model.predict(X_test)

# Report per-class metrics followed by overall accuracy
print("Classification Report:", classification_report(y_test, y_pred), sep="\n")
print("Accuracy:", accuracy_score(y_test, y_pred))

In [None]:
## 6. Visualizations
### 6.1 Sentiment Distribution
Plot the distribution of sentiment categories.

In [None]:
# Sentiment distribution
# Name the column via `x=`: seaborn's first positional parameter is `data`,
# so a Series passed positionally is not interpreted as the x variable.
# `order` keeps the bars in a fixed negative -> neutral -> positive sequence.
ax = sns.countplot(
    data=df,
    x='sentiment_category',
    order=['negative', 'neutral', 'positive'],
)
ax.set(xlabel="Sentiment category", ylabel="Number of posts")
plt.title("Sentiment Distribution")
plt.show()

In [None]:
### 6.2 Feature Importance
Visualize the importance of each feature in the Random Forest model.

In [None]:
# Pair each training column with the importance the fitted forest assigned to it
feature_names = X.columns
importances = model.feature_importances_

# Horizontal bars: importance on the x-axis, feature name on the y-axis
sns.barplot(y=feature_names, x=importances)
plt.title("Feature Importance")
plt.show()