# **Sentiment Analysis on Reddit**

In [None]:
# Notebook configuration: edit these three values before running the cells below.
# Subreddit to perform sentiment analysis on (name only; the 'r/' prefix is
# added where the URL is built).
subreddit = 'pcgaming'
# Name of the CSV file that contains the topics to search the subreddit for.
# It is loaded with pandas and must provide a 'Topic' column.
topic_file = 'topic_list.csv'
# Number of posts from the subreddit's "hot" listing to search for topics in.
num_posts = 40

## **0. Imports**

In [None]:
import praw
import pandas as pd
import numpy as np
import nltk
from fuzzywuzzy import process, fuzz
from difflib import SequenceMatcher
import statistics
import re
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

import bot1

In [None]:
def get_topic_fuzzy(post_text, topic_df):
    """Return the first topic that approximately appears in the post text.

    Each topic is split into words and compared against every run of
    consecutive post words of the same length. A run matches when the mean
    ``fuzz.ratio`` of the word pairs is at least 90.

    Args:
        post_text: Text to search, e.g. a post title.
        topic_df: DataFrame with a ``'Topic'`` column of candidate topics,
            checked in row order.

    Returns:
        The matched topic with its words joined by single spaces, or ``''``
        when no topic matches.
    """
    # The post is the same for every topic, so split it only once.
    post_words = post_text.split()
    for raw_topic in topic_df['Topic']:
        # Blank CSV cells load as NaN; str(NaN) would turn them into the
        # bogus topic 'nan', so skip them.
        if pd.isna(raw_topic):
            continue
        topic_words = str(raw_topic).split()
        if not topic_words:
            # A whitespace-only topic has no words to compare.
            continue
        for gram in nltk.ngrams(post_words, len(topic_words)):
            ratios = [
                fuzz.ratio(word, candidate)
                for word, candidate in zip(topic_words, gram)
            ]
            if statistics.mean(ratios) >= 90:
                return ' '.join(topic_words)
    return ''

def get_topic_first(post_text, topic_df):
    """Debugging aid: print the first n-gram built from the post, then exit.

    For each topic in row order, the post is cut into n-grams as long as the
    topic's word count. As soon as any n-gram is produced it is printed and
    ``exit()`` is called, so this function only returns when no n-gram was
    produced for any topic.

    Args:
        post_text: Text to split into n-grams.
        topic_df: DataFrame with a ``'Topic'`` column.

    Returns:
        ``''`` when no n-gram was produced.
    """
    for entry in topic_df['Topic']:
        width = len(str(entry).split())
        words = post_text.split()
        for gram in nltk.ngrams(words, width):
            print(gram)
            exit()
    return ''

def get_topic_exact(post_text, topic_df):
    """Return the first topic whose words appear verbatim in the post text.

    Matching is case-sensitive and works on whitespace-separated words: a
    topic matches only when each of its words equals a consecutive word of
    ``post_text``, in the same order.

    Args:
        post_text: Text to search, e.g. a post title.
        topic_df: DataFrame with a ``'Topic'`` column of candidate topics,
            checked in row order.

    Returns:
        The matched topic with its words joined by single spaces, or ``''``
        when no topic matches.
    """
    # The post is the same for every topic, so split it only once.
    post_words = post_text.split()
    for raw_topic in topic_df['Topic']:
        # Blank CSV cells load as NaN; str(NaN) would turn them into the
        # bogus topic 'nan', so skip them.
        if pd.isna(raw_topic):
            continue
        topic_words = str(raw_topic).split()
        width = len(topic_words)
        if width == 0:
            # A whitespace-only topic has no words to match.
            continue
        # Slide a window of the topic's length over the post words.
        for start in range(len(post_words) - width + 1):
            if post_words[start:start + width] == topic_words:
                return ' '.join(topic_words)
    return ''

def get_url(postID):
    """Build the scheme-less reddit.com comments URL for a post.

    Uses the module-level ``subreddit`` setting for the subreddit part.

    Args:
        postID: The post's ID string.

    Returns:
        A string of the form ``reddit.com/r/<subreddit>/comments/<postID>``.
    """
    segments = ['reddit.com/r', subreddit, 'comments', postID]
    return '/'.join(segments)

# Connect to Reddit using the credentials stored in the local `bot1` module.
reddit = praw.Reddit(client_id=bot1.app_id, client_secret=bot1.app_secret, user_agent=bot1.app_ua)
sub = reddit.subreddit(subreddit)
# Candidate topics; the matching functions read the 'Topic' column.
topic_df = pd.read_csv(topic_file)
# One row per hot post whose title matched a topic.
posts = pd.DataFrame(columns=['PostID', 'Title', 'Topic'])

# Scan the first `num_posts` hot posts and keep those whose title names a topic.
for post in sub.hot(limit=num_posts):
    # NOTE(review): `url` is not used anywhere in this cell.
    url = get_url(post.id)
    title = post.title
    # Alternative matchers, kept for experimentation:
    #topic = get_topic_fuzzy(title, topic_df)
    #topic = get_topic_first(title, topic_df)
    topic = get_topic_exact(title, topic_df)
    # The matchers return '' when no topic was found.
    if topic != '':
        # Append as the next row; len(posts.index) is the next free integer label.
        posts.loc[len(posts.index)] = [post.id, title, topic]

# Bare expression so the notebook displays the table.
posts

In [None]:
def get_comments(ID):
    """Fetch the body text of every comment on a post.

    Uses the module-level ``reddit`` client.

    Args:
        ID: ID of the submission to load.

    Returns:
        A DataFrame with one ``'Comment'`` column holding one comment body
        per row.
    """
    submission = reddit.submission(id=ID)
    # limit=None resolves every "load more comments" placeholder, so the
    # flattened list below contains only real comments.
    submission.comments.replace_more(limit=None)
    bodies = [comment.body for comment in submission.comments.list()]
    return pd.DataFrame(bodies, columns=['Comment'])
    
def preprocess_comment(df):
    """Normalise comment text and add a tokenised, stemmed version of it.

    Steps, in order: lower-case the ``'Comment'`` column, cast every column
    to ``str`` and drop non-ASCII characters, cut each comment at its first
    ``https://`` link, tokenise into words, remove English stop words and
    apply Porter stemming.

    Args:
        df: DataFrame with a ``'Comment'`` column. It is not modified.

    Returns:
        A new DataFrame with the cleaned ``'Comment'`` column and an added
        ``'Tokenized_Comment'`` column holding a list of stemmed tokens per
        row.
    """
    # Copy first so the caller's DataFrame is left untouched.
    df = df.copy()
    df['Comment'] = df['Comment'].str.lower()
    # Encoding to ASCII with 'ignore' silently drops emoji and other
    # non-ASCII characters.
    df = df.astype(str).apply(
        lambda column: column.str.encode('ascii', 'ignore').str.decode('ascii')
    )
    # Keep only the text before the first https link; '.' does not match a
    # newline, but taking element [0] discards everything after the link.
    link_pattern = re.compile(r'https://.*')
    df['Comment'] = df['Comment'].apply(
        lambda text: link_pattern.split(str(text))[0]
    )
    df['Tokenized_Comment'] = df['Comment'].apply(nltk.tokenize.word_tokenize)
    # A set gives O(1) membership tests while filtering tokens.
    stopwords = set(nltk.corpus.stopwords.words('english'))
    df['Tokenized_Comment'] = df['Tokenized_Comment'].apply(
        lambda tokens: [token for token in tokens if token not in stopwords]
    )
    stemmer = nltk.stem.PorterStemmer()
    df['Tokenized_Comment'] = df['Tokenized_Comment'].apply(
        lambda tokens: [stemmer.stem(token) for token in tokens]
    )
    return df

def score_comments(ID):
    """Compute the average VADER sentiment over all comments of a post.

    The raw comment bodies returned by ``get_comments`` are scored;
    ``preprocess_comment`` is not applied.

    Args:
        ID: ID of the submission whose comments are scored.

    Returns:
        A list ``[mean_pos, mean_neu, mean_neg, mean_compound, n_comments]``.
        For a post without comments the four means are NaN and the count
        is 0.
    """
    comments_df = get_comments(ID)
    sia = SentimentIntensityAnalyzer()
    score_keys = ['pos', 'neu', 'neg', 'compound']
    # Build the score table in one go. Naming the columns explicitly keeps
    # them present when there are no comments, so the means below yield NaN
    # instead of raising KeyError.
    scores = pd.DataFrame(
        [sia.polarity_scores(text) for text in comments_df['Comment']],
        columns=score_keys,
        dtype=float,
    )
    out_values = [scores[key].mean() for key in score_keys]
    out_values.append(len(comments_df.index))
    return out_values

# Score every matched post and attach the averaged sentiment values to `posts`.
score_columns = ['Avg_Pos', 'Avg_Neu', 'Avg_Neg', 'Avg_Compound', 'Num_Comments']
# Create the columns up front so the sort below also works when no post
# matched a topic.
for column in score_columns:
    if column not in posts.columns:
        posts[column] = float('nan')
# Walk the post IDs already collected instead of querying the hot listing
# again: it saves an API request and still scores posts that have since
# dropped out of the listing.
for row_label, post_id in list(posts['PostID'].items()):
    # score_comments returns its values in the same order as score_columns.
    posts.loc[row_label, score_columns] = score_comments(post_id)
posts = posts.sort_values(by=['Avg_Compound'], ascending=False)
# Bare expression so the notebook displays the table.
posts