# Sentiment Analysis of Reddit Comments

In [1]:
import openai
from openai.error import APIConnectionError, APIError, RateLimitError
from typing import List, Dict, Generator, Optional
import tiktoken
import pandas as pd
from dotenv import load_dotenv
import sys
import re
import random
import time
import praw
import os

import seaborn as sns
import matplotlib.pyplot as plt
from IPython.display import display

In [4]:
# Chat model used for scoring and for token counting.
model = "gpt-3.5-turbo"

# Credentials are read from a local .env file; a missing variable raises KeyError here
# rather than failing later on the first API call.
load_dotenv(".env")
openai.api_key = os.environ["OPENAI_API_KEY"]

reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    # Plain string: there is nothing to interpolate, so no f-prefix is needed.
    user_agent="script:test:0.0.1 (by u/yourusername)",
)

In [5]:
# https://github.com/openai/openai-cookbook/blob/main/examples/How_to_count_tokens_with_tiktoken.ipynb

def num_tokens_from_messages(messages, model):
    """Returns the number of tokens used by a list of messages.

    Args:
        messages: Iterable of chat-message dicts. Every value of every dict is
            passed to the tokenizer, whatever its key.
        model: "gpt-3.5-turbo-0301" or "gpt-4-0314", or one of the unversioned
            names "gpt-3.5-turbo" / "gpt-4", which are counted as those
            snapshots respectively.

    Returns:
        The summed token count of all message values, plus the per-message
        overhead, the per-name adjustment, and 3 tokens for reply priming.

    Raises:
        NotImplementedError: If ``model`` is not one of the names above.
    """
    # Unversioned names are counted with the rules of a fixed snapshot.
    snapshot_for_alias = {
        "gpt-3.5-turbo": "gpt-3.5-turbo-0301",
        "gpt-4": "gpt-4-0314",
    }
    # model -> (tokens_per_message, tokens_per_name)
    overhead_by_model = {
        # every message follows <|start|>{role/name}\n{content}<|end|>\n;
        # if there's a name, the role is omitted (hence -1)
        "gpt-3.5-turbo-0301": (4, -1),
        "gpt-4-0314": (3, 1),
    }

    model = snapshot_for_alias.get(model, model)
    # Reject unsupported models before touching the tokenizer, so no encoding
    # is loaded (and no fallback warning printed) for a call that cannot succeed.
    if model not in overhead_by_model:
        raise NotImplementedError(
            f"""num_tokens_from_messages() is not implemented for model {model}. See https://github.com/openai/openai-python/blob/main/chatml.md for information on how messages are converted to tokens."""
        )
    tokens_per_message, tokens_per_name = overhead_by_model[model]

    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        print("Warning: model not found. Using cl100k_base encoding.")
        encoding = tiktoken.get_encoding("cl100k_base")

    num_tokens = 0
    for message in messages:
        num_tokens += tokens_per_message
        for key, value in message.items():
            num_tokens += len(encoding.encode(value))
            if key == "name":
                num_tokens += tokens_per_name
    num_tokens += 3  # every reply is primed with <|start|>assistant<|message|>
    return num_tokens

## Getting Reddit Comments:

In [6]:
# Column layout of the comments table (the CSV cache and the DataFrame share it).
DF_COLUMNS = ["subreddit", "submission_id", "score", "comment_body"]

# Dataset selection: `filename` is the CSV to write, `subreddits` the communities to sample.
# Alternative datasets are kept below, commented out.
#
# filename = "cities.csv"
# subreddits = [
#     "NYC", "Seattle", "LosAngeles", "Chicago", "Austin",
#     "Portland", "SanFrancisco", "Boston", "Houston", "Atlanta",
#     "Philadelphia", "Denver", "SeattleWa", "Dallas", "WashingtonDC",
#     "SanDiego", "Pittsburgh", "Phoenix", "Minneapolis", "Orlando",
#     "Nashville", "StLouis", "SaltLakeCity", "Columbus", "Raleigh",
# ]
#
# filename, subreddits = "iphone_v_android.csv", ["iphone", "Android"]
# filename, subreddits = "startrek_v_starwars.csv", ["startrek", "StarWars"]
filename = "epl_top_8.csv"
subreddits = [
    "reddevils",
    "LiverpoolFC",
    "chelseafc",
    "Gunners",
    "coys",
    "MCFC",
    "Everton",
    "NUFC",
]

In [7]:
# Utility functions for fetching comments from submissions
def comment_generator(submission) -> Generator:
    """Yield the entries of ``submission.comments.list()`` that carry real text.

    Entries without a ``body`` attribute are skipped rather than expanded, and
    so are entries whose body is the "[deleted]" or "[removed]" placeholder.
    """
    for entry in submission.comments.list():
        # No body attribute: not a readable comment, so skip it instead of expanding it.
        if not hasattr(entry, "body"):
            continue
        if entry.body in ("[deleted]", "[removed]"):
            continue
        yield entry

In [9]:
def collect_comments(
    filename: str,
    target_comments_per_subreddit: int,
    max_comments_per_submission: int,
    max_comment_length: int,
    reddit: praw.Reddit,
) -> pd.DataFrame:
    """
    Collect comments from the month's top submissions in each subreddit.

    The subreddits visited come from the module-level ``subreddits`` list.
    Results are cached in the CSV at ``filename``: rows already stored there
    count toward both limits and are never added twice, and the file is
    rewritten after every submission that is processed.

    Args:
        filename: Path of the CSV cache, indexed by comment id.
        target_comments_per_subreddit: Stop collecting for a subreddit once it
            has this many rows.
        max_comments_per_submission: Upper bound on rows taken from one submission.
        max_comment_length: Comment bodies are cut to this many characters and
            then stripped of surrounding whitespace.
        reddit: PRAW client used to list each subreddit's top submissions.

    Returns:
        A dataframe with columns: subreddit, submission_id, score, comment_body.
        ``score`` is not populated here (presumably it is filled in by a later
        scoring step -- confirm at the use site).

    Raises:
        ValueError: If the cached CSV exists but its columns differ from ``DF_COLUMNS``.
    """
    try:
        df = pd.read_csv(filename, index_col="id")
    except FileNotFoundError:
        # No cache yet: start from an empty table with the expected columns.
        df = pd.DataFrame(columns=DF_COLUMNS)
    else:
        # Explicit check rather than `assert`, which is stripped under `python -O`.
        cached_columns = df.columns.tolist()
        if cached_columns != DF_COLUMNS:
            raise ValueError(f"{filename} has columns {cached_columns}, expected {DF_COLUMNS}")

    # dict like {comment_id -> {column -> value}}
    records = df.to_dict(orient="index")

    for subreddit_name in subreddits:
        print(f"Processing Subreddit: {subreddit_name}")

        # The number of comments that we already have for this subreddit
        processed_comments_for_subreddit = len(df[df["subreddit"] == subreddit_name])

        if processed_comments_for_subreddit >= target_comments_per_subreddit:
            print(f"Enough comments fetched for {subreddit_name}, continuing to next subreddit.")
            continue

        # `top` is a generator, grab submissions until we break (within this loop).
        for submission in reddit.subreddit(subreddit_name).top(time_filter="month"):
            if processed_comments_for_subreddit >= target_comments_per_subreddit:
                break

            # The number of comments that we already have for this submission
            processed_comments_for_submission = len(df[df["submission_id"] == submission.id])

            for comment in comment_generator(submission):
                if processed_comments_for_submission >= max_comments_per_submission or processed_comments_for_subreddit >= target_comments_per_subreddit:
                    break

                if comment.id in records:
                    print(f"Skipping comment {subreddit_name}-{submission.id}-{comment.id} because we already have it")
                    continue

                body = comment.body[:max_comment_length].strip()
                records[comment.id] = {"subreddit": subreddit_name, "submission_id": submission.id, "comment_body": body}

                processed_comments_for_subreddit += 1
                processed_comments_for_submission += 1

            # Once per post write to disk, so an interrupted run keeps what it has fetched.
            print(f"CSV rewritten with {len(records)} rows.\n")
            df = pd.DataFrame.from_dict(records, orient="index", columns=DF_COLUMNS)
            df.to_csv(filename, index_label="id")

    print("Completed.")
    return df

## Sentiment Analysis on Comments:

In [None]:
# NOTE(review): not used in the visible cells; presumably a retry budget for API calls -- confirm at the use site.
MAX_ATTEMPTS = 3

def generate_prompt_messages(s: str) -> List[Dict]:
    """Build the few-shot chat prompt asking for a sentiment score of ``s``.

    The result is three worked user/assistant example pairs followed by a
    final user message that carries ``s``. That last message is stripped after
    ``s`` is inserted, so trailing whitespace in ``s`` does not reach the prompt.
    """
    instruction = "The following is a comment from a user on Reddit. Score it from -1 to 1, where -1 is the most negative and 1 is the most positive:"
    # (example comment, score the assistant is shown giving)
    worked_examples = [
        ("The traffic is quite annoying.", "-0.75"),
        ("The library is downtown.", "0.0"),
        ("Even though it's humid, I really love the summertime. Everything is so green and the sun is out all the time.", "0.8"),
    ]

    messages: List[Dict] = []
    for example_comment, example_score in worked_examples:
        messages.append({"role": "user", "content": f"{instruction}\n\n{example_comment}"})
        messages.append({"role": "assistant", "content": example_score})
    messages.append({"role": "user", "content": f"{instruction}\n\n{s}".strip()})
    return messages

## Run Everything!

In [13]:
# Collection limits for this run.
MAX_COMMENT_LENGTH = 2000  # characters kept from each comment body
MAX_COMMENTS_PER_SUBMISSION = 10
TARGET_COMMENTS_PER_SUBREDDIT = 50

# The call is the cell's final expression, so its return value is what the notebook shows.
collect_comments(
    reddit=reddit,
    filename=filename,
    max_comment_length=MAX_COMMENT_LENGTH,
    max_comments_per_submission=MAX_COMMENTS_PER_SUBMISSION,
    target_comments_per_subreddit=TARGET_COMMENTS_PER_SUBREDDIT,
)

Processing Subreddit: reddevils
Enough comments fetched for reddevils, continuing to next subreddit.
Processing Subreddit: LiverpoolFC
Enough comments fetched for LiverpoolFC, continuing to next subreddit.
Processing Subreddit: chelseafc
Enough comments fetched for chelseafc, continuing to next subreddit.
Processing Subreddit: Gunners
Enough comments fetched for Gunners, continuing to next subreddit.
Processing Subreddit: coys
Enough comments fetched for coys, continuing to next subreddit.
Processing Subreddit: MCFC
Enough comments fetched for MCFC, continuing to next subreddit.
Processing Subreddit: Everton
Enough comments fetched for Everton, continuing to next subreddit.
Processing Subreddit: NUFC
Enough comments fetched for NUFC, continuing to next subreddit.
Completed.


id,subreddit,submission_id,score,comment_body
jkr0z7j,reddevils,13lqbis,,Source: https://www.instagram.com/p/Csa1f8NM7-...
jkr1710,reddevils,13lqbis,,I always remember when he said no one would bo...
jkr0xqv,reddevils,13lqbis,,Poor guy with all the injuries - wishing him a...
jkr1qh5,reddevils,13lqbis,,"""I've said before that I found it hard to even..."
jkr1v2g,reddevils,13lqbis,,"Even if injured, name him on the bench for fin..."
...,...,...,...,...
jl7ubhu,NUFC,13p50iu,,Someone make this our subreddit banner please\...
jl7qp4f,NUFC,13p50iu,,Nick Pope absolutely makes this photo epic! LOL!
jl7q02h,NUFC,13p50iu,,Hands down best pic yet
jl7puyp,NUFC,13p50iu,,Murphy is saying it all for me
