In [None]:
#------ Real-time Data Collection with the time, pandas, and VADER Libraries ------#

import praw
import pandas as pd
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import os
import time
import random

# Initialize Reddit API client.
# Credentials are read from environment variables so that no secret is stored
# in the notebook source, its saved outputs, or version control.
#   REDDIT_CLIENT_ID      (required) - a missing variable raises KeyError
#   REDDIT_CLIENT_SECRET  (required) - a missing variable raises KeyError
#   REDDIT_USER_AGENT     (optional) - falls back to the original user agent
# NOTE(review): any secret that was ever hard-coded in a shared notebook
# should be treated as leaked and rotated in the Reddit app settings.
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent=os.environ.get("REDDIT_USER_AGENT", "East-Assumption691"),
)

# Initialize VADER sentiment analyzer (shared by classify_sentiment)
analyzer = SentimentIntensityAnalyzer()

# Search phrases sent to the subreddit search, one per line so that
# additions and removals show up as single-line diffs.
queries = [
    "global warming",
    "greenhouse gases",
    "carbon emissions",
    "extreme weather",
    "carbon intensity",
    "biodiversity",
    "carbon footprint",
    "climate crisis",
    "carbon market",
    "cutting down forests",
    "powering buildings",
    "manufacturing goods",
    "fossil fuels",
]

# Sort orders that fetch_data rotates through to vary the results
sort_options = [
    "top",
    "comments",
    "relevance",
]

# Function to classify sentiment
def classify_sentiment(text):
    """Label a piece of text as "Positive", "Negative" or "Neutral".

    The input is converted with ``str()`` before scoring, so non-string
    values (for example NaN) are scored as their string form. The label is
    chosen from the ``compound`` score returned by the module-level
    ``analyzer``: >= 0.05 is positive, <= -0.05 is negative, anything else
    is neutral.
    """
    compound = analyzer.polarity_scores(str(text))['compound']
    if compound >= 0.05:
        return "Positive"
    if compound <= -0.05:
        return "Negative"
    return "Neutral"

# Function to fetch data iteratively from Reddit
def fetch_data(subreddit_name, min_rows=600, batch_size=100, cycle=0, max_attempts=10):
    """Search one subreddit until enough unique text posts have been gathered.

    Every attempt runs each entry of the module-level ``queries`` list (in a
    freshly shuffled order) through ``subreddit.search`` with a single sort
    order taken from ``sort_options``. The sort order advances by one per
    attempt, starting at index ``cycle``. Submissions whose id was already
    collected, or whose selftext is empty or whitespace-only, are skipped.
    Each kept submission is labelled with ``classify_sentiment``.

    Args:
        subreddit_name: Name passed to ``reddit.subreddit``.
        min_rows: Row count at which collection stops. It is only checked
            after a query has been fully processed, so the result can hold
            more rows than this.
        batch_size: ``limit`` passed to every search call.
        cycle: Starting offset into ``sort_options``.
        max_attempts: Maximum number of passes over the query list.

    Returns:
        pandas.DataFrame with one row per collected submission,
        de-duplicated on ``id``. It can contain fewer than ``min_rows`` rows
        (a warning is printed) when the attempts run out.
    """
    collected = []
    known_ids = set()  # ids already stored, so repeated hits are ignored
    target = reddit.subreddit(subreddit_name)
    attempts_made = 0

    print(f"Starting fetch for at least {min_rows} rows of data...")

    while len(collected) < min_rows and attempts_made < max_attempts:
        # Equivalent to bumping `cycle` once per attempt.
        sort_order = sort_options[(cycle + attempts_made) % len(sort_options)]
        query_order = random.sample(queries, len(queries))
        print(f"Attempt {attempts_made + 1}: Fetching data...")

        for query in query_order:
            print(f"Searching for query: {query} (sort: {sort_order})")
            for submission in target.search(query, sort=sort_order, limit=batch_size):
                # Guard 1: already collected under an earlier query/attempt
                if submission.id in known_ids:
                    continue
                # Guard 2: no usable body text
                body = submission.selftext
                if not body or pd.isna(body) or body.strip() == "":
                    continue

                known_ids.add(submission.id)
                collected.append({
                    "id": submission.id,
                    "title": submission.title,
                    "selftext": body,
                    "created_utc": submission.created_utc,
                    "score": submission.score,
                    "comments": submission.num_comments,
                    "url": submission.url,
                    "query": query,  # which search phrase surfaced this post
                    "selftext_sentiment": classify_sentiment(body),
                })
                if len(collected) % 50 == 0:  # progress marker every 50 rows
                    print(f"Progress: {len(collected)} rows collected...")

            # The target is checked once per finished query, not per post
            if len(collected) >= min_rows:
                print(f"Collected {len(collected)} rows. Stopping further queries.")
                break

        attempts_made += 1

    if len(collected) < min_rows:
        print(f"Warning: Only {len(collected)} rows fetched after {attempts_made} attempts.")

    frame = pd.DataFrame(collected)
    return frame.drop_duplicates(subset="id")  # final safety net on post id

# Function to replace CSV data and update sentiment analysis
def replace_csv(output_file, new_data):
    """Overwrite ``output_file`` with ``new_data`` and print a label summary.

    Args:
        output_file: Destination passed to ``DataFrame.to_csv``; existing
            content is replaced.
        new_data: DataFrame to save. It must contain a
            ``selftext_sentiment`` column, which is tallied for the summary.
    """
    new_data.to_csv(output_file, index=False)  # the index is not written
    print(f"Replaced CSV file with {len(new_data)} rows.")

    # Report how many rows carry each label; absent labels count as 0
    distribution = new_data['selftext_sentiment'].value_counts()
    print("\nSentiment Analysis Distribution (Updated):")
    for label in ("Positive", "Neutral", "Negative"):
        print(f"{label}: {distribution.get(label, 0)}")

# Main function to continuously fetch and replace data
def collect_data_continuously(subreddit_name, output_file="data.csv", batch_size=100, fetch_interval=60, min_rows=600, max_cycles=None):
    """Repeatedly fetch fresh posts and overwrite the CSV with each batch.

    Each cycle calls ``fetch_data`` and, when the result is non-empty, hands
    it to ``replace_csv``; an empty result leaves the existing file alone.
    The cycle counter is passed to ``fetch_data`` so consecutive cycles start
    from a different sort order.

    Args:
        subreddit_name: Subreddit to search.
        output_file: CSV path that is replaced after every successful cycle.
        batch_size: Per-query search limit forwarded to ``fetch_data``.
        fetch_interval: Seconds to sleep between two cycles.
        min_rows: Row target forwarded to ``fetch_data``.
        max_cycles: Number of cycles to run before returning. ``None`` (the
            default) keeps the original behaviour of looping until the
            kernel/process is interrupted. A bounded value lets the cell
            finish on its own, e.g. under "Restart & Run All".

    Returns:
        None.
    """
    print(f"Starting continuous data collection and saving to {output_file}.")
    cycle = 0  # Counter to alternate sorting order and ensure variety

    while max_cycles is None or cycle < max_cycles:
        # Fetch fresh data for each cycle
        new_data = fetch_data(subreddit_name, min_rows, batch_size, cycle)

        # Replace the old data with new data
        if not new_data.empty:
            replace_csv(output_file, new_data)
        else:
            print("No valid data found.")

        # Increment the cycle for alternating sort
        cycle += 1

        # No point sleeping after the final bounded cycle
        if max_cycles is not None and cycle >= max_cycles:
            break

        # Wait before fetching new data
        print(f"Waiting {fetch_interval} seconds before fetching new data...\n")
        time.sleep(fetch_interval)

# Run configuration for the collection job
subreddit_name = "climatechange"  # subreddit to search
output_csv = "data.csv"           # file that is overwritten after each cycle
batch_size = 100                  # search limit requested per query
fetch_interval = 60               # seconds to sleep between cycles
min_rows = 600                    # row target each fetch aims for

# Start collecting; this call loops until the kernel is interrupted
collect_data_continuously(
    subreddit_name,
    output_file=output_csv,
    batch_size=batch_size,
    fetch_interval=fetch_interval,
    min_rows=min_rows,
)


Starting continuous data collection and saving to data.csv.
Starting fetch for at least 600 rows of data...
Attempt 1: Fetching data...
Searching for query: extreme weather (sort: top)
Progress: 50 rows collected...
Searching for query: manufacturing goods (sort: top)
Searching for query: carbon intensity (sort: top)
Searching for query: carbon emissions (sort: top)
Progress: 100 rows collected...
Progress: 150 rows collected...
Searching for query: cutting down forests (sort: top)
Searching for query: carbon footprint (sort: top)
Progress: 200 rows collected...
Searching for query: powering buildings (sort: top)
Progress: 250 rows collected...
Searching for query: global warming (sort: top)
Progress: 300 rows collected...
Searching for query: climate crisis (sort: top)
Searching for query: greenhouse gases (sort: top)
Progress: 350 rows collected...
Progress: 400 rows collected...
Searching for query: carbon market (sort: top)
Searching for query: biodiversity (sort: top)
Progress: 45

In [31]:
# Reload the saved CSV and sanity-check it
data_check = pd.read_csv("data.csv")

# Count rows that are exact copies of an earlier row (all columns equal)
duplicate_count = data_check.duplicated().sum()
print(f'Total duplicates: {duplicate_count}')

# Show how many posts fall under each sentiment label
sentiment_counts = data_check['selftext_sentiment'].value_counts()
print("\nSentiment Counts:", sentiment_counts, sep="\n")

Total duplicates: 0

Sentiment Counts:
selftext_sentiment
Positive    403
Negative    174
Neutral      28
Name: count, dtype: int64


In [36]:
# Keep only the identifier, the text fields and the sentiment label
selected_columns = data_check[['id', 'title', 'selftext', 'selftext_sentiment']]

# Draw a reproducible sample of up to 500 rows. The collector may save fewer
# rows than that (it only prints a warning when it falls short), and
# DataFrame.sample raises ValueError when n exceeds the number of rows, so n
# is capped at the frame length.
random_sample = selected_columns.sample(
    n=min(500, len(selected_columns)),
    random_state=42,
)
random_sample.to_csv('selected_columns.csv', index=False)
random_sample

Unnamed: 0,id,title,selftext,selftext_sentiment
412,joo4s3,Quick tips to reduce my carbon footprint?,Looking for small adjustments that you people ...,Neutral
289,sdi6ww,"""The Best Years Are Over""","05/29/1966, 1:00 p.m. - from [DER SPIEGEL 23/1...",Positive
76,18pmyfx,Good News in 2023,**The greatest year ever for clean energy**\n\...,Positive
78,1aesoan,China has a new climate & environment strategy...,Taken von Carbon Brief's China Briefing of 25 ...,Positive
182,1ekyy8p,What are some creative and promising ideas to ...,I learned about [marine cloud brightening](htt...,Positive
...,...,...,...,...
359,10hg68j,How can the earth store up feedback loops?,What I’ve heard claimed is that if we let the ...,Positive
171,165kchu,Why are we shutting down nuclear power plants ...,Why are we shutting down nuclear power plants ...,Negative
98,mdph5s,Tennessee Valley Authority head says more nucl...,[https://www.timesfreepress.com/news/business/...,Neutral
292,160vcke,Question about AMOC slowdown and lower tempera...,So this summer we here in northern Europe has ...,Negative
