### Install dependencies, import libraries, and set up project paths

In [21]:
pip install vaderSentiment

Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
Installing collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2
Note: you may need to restart the kernel to use updated packages.


In [7]:
# Import the tools we need
import pandas as pd
import os
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# --- Setup file paths ---
# The project root defaults to the original checkout location, but it can be
# overridden by setting the SMART_PRODUCT_PULSE_ROOT environment variable so the
# notebook runs on another machine without editing this cell.
project_root = os.environ.get(
    "SMART_PRODUCT_PULSE_ROOT",
    "/Users/pranithapadala/Documents/SmartProductPulse",
)
RAW_DATA_DIR = os.path.join(project_root, "data", "raw")
PROCESSED_DATA_DIR = os.path.join(project_root, "data", "processed")

# Input file for this notebook: the raw Reddit posts
REDDIT_FILE = os.path.join(RAW_DATA_DIR, "raw_reddit_posts.csv")

# Output file for this notebook: Reddit data with sentiment scores
PROCESSED_REDDIT_FILE = os.path.join(PROCESSED_DATA_DIR, "processed_reddit_posts.csv")

# Make sure the output folder exists (no error if it is already there)
os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)

print("Setup Complete.")

Setup Complete.


### Load the raw data

In [8]:
# Read the raw Reddit posts CSV into a DataFrame, then report how many rows it holds
df_reddit = pd.read_csv(filepath_or_buffer=REDDIT_FILE)
print(f"Loaded {len(df_reddit)} raw Reddit posts.")

Loaded 115 raw Reddit posts.


### Prepare Text for VADER and perform sentiment analysis

In [9]:
print("--- Starting Reddit Data Processing ---")

# 1. Build the text that VADER will score. Capitalisation and punctuation are
#    kept as-is; only URLs (http://, https:// or www. prefixes) are stripped,
#    then surrounding whitespace is trimmed.
df_reddit['sentiment_text'] = (
    df_reddit['text']
    .astype(str)
    .str.replace(r'https?://\S+|www\.\S+', '', regex=True)
    .str.strip()
)

# 2. Score every post with VADER, keeping only the 'compound' value
analyzer = SentimentIntensityAnalyzer()
df_reddit['sentiment_score'] = df_reddit['sentiment_text'].apply(
    lambda post: analyzer.polarity_scores(post)['compound']
)

# 3. Create a simple text label ('Positive', 'Negative', 'Neutral')
def get_sentiment_label(score, threshold=0.05):
    """Map a sentiment score to a 'Positive', 'Negative' or 'Neutral' label.

    Args:
        score: Numeric sentiment score (here, VADER's compound score).
        threshold: Non-negative cutoff. Scores >= threshold are 'Positive',
            scores <= -threshold are 'Negative', everything in between is
            'Neutral'. Defaults to 0.05, the cutoff previously hard-coded here.

    Returns:
        One of the strings 'Positive', 'Negative' or 'Neutral'.

    Raises:
        ValueError: If ``threshold`` is negative.

    Note:
        A NaN score fails both comparisons and is therefore labelled 'Neutral'.
    """
    if threshold < 0:
        raise ValueError(f"threshold must be non-negative, got {threshold!r}")
    if score >= threshold:
        return 'Positive'
    if score <= -threshold:
        return 'Negative'
    return 'Neutral'
# Translate each sentiment score into its text label
df_reddit['sentiment_label'] = df_reddit['sentiment_score'].apply(get_sentiment_label)

# 4. Build 'cleaned_text', a normalised form of the post used for duplicate
#    checking: lowercase, drop every character that is not a-z, 0-9 or
#    whitespace, collapse whitespace runs to one space, and trim both ends.
df_reddit['cleaned_text'] = (
    df_reddit['text']
    .astype(str)
    .str.lower()
    .str.replace(r'[^a-z0-9\s]', '', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

print("Sentiment analysis and text cleaning complete.")


--- Starting Reddit Data Processing ---
Sentiment analysis and text cleaning complete.


### Remove duplicates and save the final processed file

In [10]:
# Remove duplicate posts. A row is a duplicate only when the same cleaned text
# appears again for the SAME product: the saved output is keyed by product_id,
# so de-duplicating on text alone would discard a product's sentiment row
# whenever an identical post was also collected for a different product.
# Rebinding (instead of inplace=True) avoids mutating the frame in place.
initial_rows = len(df_reddit)
df_reddit = df_reddit.drop_duplicates(subset=['product_id', 'cleaned_text'], keep='first')
final_rows = len(df_reddit)
print(f"Removed {initial_rows - final_rows} duplicate posts.")

# Select only the columns we need for the next stage of the project.
# .copy() makes the result an independent frame rather than a slice of df_reddit.
output_columns = ['product_id', 'sentiment_score', 'sentiment_label']
df_processed_reddit = df_reddit[output_columns].copy()

# --- Save the clean Reddit data to a new file ---
# index=False keeps the DataFrame's row index out of the CSV
df_processed_reddit.to_csv(PROCESSED_REDDIT_FILE, index=False)

print("\n--- SUCCESS! ---")
print(f"Processed Reddit data has been saved to:\n{PROCESSED_REDDIT_FILE}")
print("This notebook's job is done. The next notebook will handle the dashboard.")

Removed 0 duplicate posts.

--- SUCCESS! ---
Processed Reddit data has been saved to:
/Users/pranithapadala/Documents/SmartProductPulse/data/processed/processed_reddit_posts.csv
This notebook's job is done. The next notebook will handle the dashboard.
