In [7]:
import requests
import requests.auth
import pandas as pd
import time
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from prophet import Prophet

# Reddit API credentials
Client_Id = "Wkd1xI3snG5zIigFeGAIrg"
Secret_Key = "75oRnclrvfEfPS-oMW2rMGoGRM9plw"

# Authentication
Auth = requests.auth.HTTPBasicAuth(Client_Id, Secret_Key)

# Data for token request
Data = {"grant_type": "client_credentials"}

# Headers
Headers = {
    "User-Agent": "MyRedditApp/0.0.1",
    "Content-Type": "application/x-www-form-urlencoded"
}

# Request access token
res = requests.post("https://www.reddit.com/api/v1/access_token", auth=Auth, data=Data, headers=Headers)

if res.status_code == 200:
    TOKEN = res.json().get("access_token")  # Get access token
    Headers["Authorization"] = f"Bearer {TOKEN}"  # Add Authorization header

    all_posts = []  # Store all posts
    total_fetched = 0  # Counter for posts
    sort_methods = ["hot", "top", "new", "rising"]  # Fetch from multiple categories

    while total_fetched < 20000:  # Loop until 20,000 posts are collected
        for sort in sort_methods:
            after = None  # Reset pagination for each sort method

            while total_fetched < 20000:
                response = requests.get(f"https://oauth.reddit.com/r/Python/{sort}", headers=Headers, params={"limit": 100, "after": after, "t": "all"})
                
                if response.status_code == 200:
                    data = response.json()
                    posts = data["data"]["children"]
                    
                    if not posts:
                        break

                    for post in posts:
                        post_data = post["data"]
                        all_posts.append({
                            "title": post_data.get("title", "N/A"),
                            "ups": post_data.get("ups", 0),
                            "downs": post_data.get("downs", 0),
                            "score": post_data.get("score", 0),
                            "subreddit": post_data.get("subreddit", "Unknown"),
                            "selftext": post_data.get("selftext", ""),
                            "upvote_ratio": post_data.get("upvote_ratio", 0.0),
                            "post_hint": post_data.get("post_hint", "N/A"),
                            "num_comments": post_data.get("num_comments", 0),
                            "created_utc": pd.to_datetime(post_data.get("created_utc", None), unit='s'),
                            "subreddit_subscribers": post_data.get("subreddit_subscribers", 0),
                            "is_self": post_data.get("is_self", False),
                            "is_video": post_data.get("is_video", False),
                            "domain": post_data.get("domain", "Unknown"),
                            "permalink": post_data.get("permalink", "N/A"),
                            "num_crossposts": post_data.get("num_crossposts", 0),
                            "author": post_data.get("author", "Unknown"),
                            "author_premium": post_data.get("author_premium", False),
                            "author_flair_text": post_data.get("author_flair_text", "N/A"),
                            "media_only": post_data.get("media_only", False),
                            "over_18": post_data.get("over_18", False),
                            "is_crosspostable": post_data.get("is_crosspostable", False),
                            "link_flair_text": post_data.get("link_flair_text", "N/A"),
                            "all_awardings": len(post_data.get("all_awardings", [])),
                            "gildings": post_data.get("gildings", {}),
                        })

                        total_fetched += 1
                        if total_fetched >= 20000:
                            break  # Stop fetching once we reach 20,000 rows

                    after = data["data"].get("after")
                    
                    if not after:
                        break  # Stop if no more pages available
                    
                    time.sleep(1)  # Respect API rate limit
                else:
                    print(f"Error fetching {sort} posts:", response.status_code, response.text)
                    break

        if total_fetched >= 20000:
            break  # Ensure we stop the outer loop once we reach 20,000

    # Convert to DataFrame
    df = pd.DataFrame(all_posts)
    
    # Sentiment Analysis
    df['sentiment'] = df['title'].apply(lambda x: "Positive" if TextBlob(x).sentiment.polarity > 0 else ("Negative" if TextBlob(x).sentiment.polarity < 0 else "Neutral"))
    
    # Topic Modeling
    vectorizer = CountVectorizer(stop_words='english')
    X = vectorizer.fit_transform(df['title'])
    lda = LatentDirichletAllocation(n_components=5, random_state=42)
    topics = lda.fit_transform(X)
    df['topic'] = topics.argmax(axis=1)
    
    # Trend Prediction using Prophet
    trend_df = df[['created_utc', 'num_comments']].dropna()
    trend_df = trend_df.rename(columns={'created_utc': 'ds', 'num_comments': 'y'})
    model = Prophet()
    model.fit(trend_df)
    future = model.make_future_dataframe(periods=30)
    forecast = model.predict(future)
    
    # Save DataFrame to CSV
    csv_filename = "Reddit_Posts_20k.csv"
    df.to_csv(csv_filename, index=False)

    print(f"Successfully saved {len(df)} posts in '{csv_filename}'")
else:
    print("Error:", res.status_code, res.text)


  from .autonotebook import tqdm as notebook_tqdm
Importing plotly failed. Interactive plots will not work.
15:26:40 - cmdstanpy - INFO - Chain [1] start processing
15:26:59 - cmdstanpy - INFO - Chain [1] done processing


Successfully saved 20000 posts in 'Reddit_Posts_20k.csv'


In [10]:
pip install langdetect


Defaulting to user installation because normal site-packages is not writeableNote: you may need to restart the kernel to use updated packages.

Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
     ---------------------------------------- 0.0/981.5 kB ? eta -:--:--
     ---------------------------------------- 0.0/981.5 kB ? eta -:--:--
     ---------- ----------------------------- 262.1/981.5 kB ? eta -:--:--
     ------------------------------ ------- 786.4/981.5 kB 1.9 MB/s eta 0:00:01
     -------------------------------------- 981.5/981.5 kB 2.0 MB/s eta 0:00:00
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Building wheels for collected packages: langdetect
  Building wheel for lang

In [11]:
import pandas as pd
import re
from langdetect import detect, DetectorFactory
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Set random seed for consistent language detection
DetectorFactory.seed = 42

# Load dataset
df = pd.read_csv("Reddit_Posts_20k.csv")

# 1️⃣ **Remove Duplicates**
df = df.drop_duplicates()

# 2️⃣ **Detect and Remove Non-English Posts**
def detect_lang(text):
    try:
        return detect(text)
    except:
        return "unknown"

df['language'] = df['title'].apply(lambda x: detect_lang(str(x)) if pd.notnull(x) else "unknown")
df = df[df['language'] == 'en']  # Keep only English posts

# 3️⃣ **Handle Missing Values**
df = df.fillna("Unknown")  # Replace missing values with "Unknown"

# 4️⃣ **Normalize Data**  
# Extract hashtags from the title
df['hashtags'] = df['title'].apply(lambda x: re.findall(r"#\w+", str(x)))

# Extract mentions from selftext
df['mentions'] = df['selftext'].apply(lambda x: re.findall(r"@\w+", str(x)))

# Convert timestamps
df['created_utc'] = pd.to_datetime(df['created_utc'], errors='coerce')

# 5️⃣ **Sentiment Analysis**  
df['sentiment'] = df['title'].apply(lambda x: "Positive" if TextBlob(x).sentiment.polarity > 0 else ("Negative" if TextBlob(x).sentiment.polarity < 0 else "Neutral"))

# 6️⃣ **Topic Modeling** (Latent Dirichlet Allocation)
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['title'])
lda = LatentDirichletAllocation(n_components=5, random_state=42)
topics = lda.fit_transform(X)
df['topic'] = topics.argmax(axis=1)

# Save cleaned dataset
df.to_csv("Reddit_Cleaned.csv", index=False)

print(f"✅ Data Cleaning Complete! Saved as 'Reddit_Cleaned.csv'.")


✅ Data Cleaning Complete! Saved as 'Reddit_Cleaned.csv'.


In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Load dataset
# df = pd.read_csv('Reddit_Posts_20k.csv')
df = pd.read_csv('Reddit_Cleaned.csv')

# Drop rows with missing values in relevant columns
df = df.dropna(subset=['title', 'selftext', 'sentiment'])

# Combine title and selftext for better context
df['text'] = df['title'] + ' ' + df['selftext']

# Function to clean text
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)  # Remove special characters
    text = ' '.join([word for word in text.split() if word not in stopwords.words('english')])
    return text

# Apply cleaning
df['text'] = df['text'].apply(clean_text)

# Vectorization (Convert text to numerical format)
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['text'])
y = df['sentiment']

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training (Naïve Bayes classifier)
model = MultinomialNB()
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

# Function to predict sentiment for new text
def predict_sentiment(text):
    text = clean_text(text)
    text_vectorized = vectorizer.transform([text])
    prediction = model.predict(text_vectorized)
    return prediction[0]

# Example prediction
example_text = "This is the best post I've seen!"
print("Predicted Sentiment:", predict_sentiment(example_text))



Accuracy: 0.8729664243682935
Classification Report:
               precision    recall  f1-score   support

    Negative       1.00      0.55      0.71       326
     Neutral       0.84      0.98      0.91      1662
    Positive       0.93      0.79      0.85       901

    accuracy                           0.87      2889
   macro avg       0.92      0.77      0.82      2889
weighted avg       0.89      0.87      0.87      2889

Predicted Sentiment: Positive
