# Reddit API Script to collect posts and perform sentiment analysis

In [267]:
import os
import sys
import time
from datetime import datetime

import matplotlib.pyplot as plt
import nltk # Sentiment analysis module
import numpy as np
import pandas as pd
import praw # reddit API crawler
from deep_translator import GoogleTranslator
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize, RegexpTokenizer

This only needs to be run once, on the first run: it downloads the VADER lexicon, the punkt tokenizer and the stop-word library.

In [268]:

def downloadNLTK():
    """Download the NLTK resources used by this notebook (first run only)."""
    resources = (
        'vader_lexicon',  # get lexicons data
        'punkt',          # for tokenizer
        'stopwords',
    )

    for resource in resources:
        nltk.download(resource)

#downloadNLTK()

Credentials for Reddit API

In [269]:
# Reddit API credentials are read from the environment so that no secret is
# stored in the notebook. Export these variables before starting the kernel:
#   REDDIT_USERNAME, REDDIT_CLIENT_ID, REDDIT_CLIENT_SECRET
# Each name is None when its variable is not set.
# NOTE(review): any key that was ever committed in this cell should be treated
# as compromised and regenerated in the Reddit app settings.
# NOTE: `id` shadows the Python builtin; the name is kept because the scraping
# cell passes it to praw.Reddit as client_id.
username = os.environ.get('REDDIT_USERNAME') # Personal Reddit Account (sent as the PRAW user agent)
id = os.environ.get('REDDIT_CLIENT_ID')
secret = os.environ.get('REDDIT_CLIENT_SECRET')

Because I am collecting posts from non-English-speaking subreddits, sentiment scoring could be unreliable on the original titles. I am going to use Google Translate to convert all titles to English first.

In [270]:
def translateText(data,lang='en'):
    """Translate ``data`` into ``lang`` with Google Translate, on a best-effort basis.

    Parameters
    ----------
    data : text to translate; the source language is auto-detected.
    lang : target language code handed to GoogleTranslator (default 'en').

    Returns
    -------
    The translated text, or ``data`` unchanged when the input is empty, the
    translator raises an error, or the translator returns None.
    """
    # Nothing to translate: skip the network round trip.
    if data is None or (isinstance(data, str) and not data):
        return data

    try:
        translation = GoogleTranslator(source='auto', target=lang).translate(data)
    except Exception:
        # Best effort: a failed translation falls back to the original text.
        # Catching Exception (not a bare except) lets KeyboardInterrupt and
        # SystemExit stop a long-running scrape.
        return data

    return data if translation is None else translation

I would also like to tokenize the titles and remove stop words to help improve the accuracy of the sentiment scores.

In [271]:
def listToString(s):
    """Join the strings in ``s`` into one sentence separated by single spaces."""
    return " ".join(s)

In [272]:
# Stop-word sets keyed by language, filled on first use so the corpus is read
# once instead of once per title.
_STOP_WORD_CACHE = {}


def _stopWordSet(language='english'):
    """Return the NLTK stop words for ``language`` as a cached frozenset."""
    if language not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOP_WORD_CACHE[language]


def cleanText(text):
    """Normalise a title: lower-case it, tokenize it and drop English stop words.

    Parameters
    ----------
    text : the title to clean (a string).

    Returns
    -------
    The remaining words joined by single spaces.
    """
    stop_words = _stopWordSet('english') # stop words from nltk module

    text = text.replace("'", "").replace("-", "").lower() # removing quotes and dashes

    tk = nltk.tokenize.RegexpTokenizer(r'\w+') #split up text into words
    tokens = tk.tokenize(text)

    words = [w for w in tokens if w not in stop_words]

    return " ".join(words) # convert back to string sentence

In [273]:
def dataframeSentiment(data):
    """Append VADER polarity scores for every title in ``data``.

    Parameters
    ----------
    data : DataFrame with a 'Title' column of strings.

    Returns
    -------
    A new DataFrame: ``data`` plus the columns 'neg', 'neu', 'pos' and
    'compound'. The score columns are present even when ``data`` has no rows,
    so the labelling step can still look up 'compound'.
    """
    analyzer = SentimentIntensityAnalyzer()
    scoreColumns = ['neg', 'neu', 'pos', 'compound']

    # One frame built from all score dicts; reusing data.index keeps the
    # column-wise concat aligned with the input rows.
    scores = pd.DataFrame(
        [analyzer.polarity_scores(title) for title in data['Title']],
        index=data.index,
        columns=scoreColumns,
        dtype=float,
    )

    return pd.concat([data, scores], axis=1)

Threshold values need to be selected to classify the titles into the labels Positive, Negative and Neutral.

From research a value of 0.05 is typically used as optimal: https://www.researchgate.net/figure/Typical-threshold-values-used-in-VADER-20_fig5_350761338


In [274]:
def dataframeSentimentLabel(data,threshold=0.05):
    """Add a 'label' column classifying each row by its 'compound' score.

    Parameters
    ----------
    data : DataFrame with a numeric 'compound' column.
    threshold : scores at or below ``-threshold`` are "neg", scores at or
        above ``threshold`` are "pos", anything in between is "neu".

    Returns
    -------
    ``data`` itself, with the 'label' column added in place.
    """
    compound = data['compound']

    conditions = [
        compound <= -threshold,
        (compound > -threshold) & (compound < threshold),
        compound >= threshold,
        ]

    values = ["neg", "neu", "pos"]

    # The default must be a string: np.select's implicit integer default 0
    # cannot be combined with string choices on NumPy versions that reject
    # str/int promotion. "0" is what older NumPy produced for rows matching
    # no condition (i.e. a NaN score), so existing outputs are unchanged.
    data['label'] = np.select(conditions, values, default="0")

    return data

In [275]:
def sentimentAnalysis(data):
    """Run dataframeSentiment, then dataframeSentimentLabel (default threshold), on ``data``."""
    return dataframeSentimentLabel(dataframeSentiment(data))

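Initial test of scraping post titles using the Reddit API. Note: the snippet below uses `r`, a `praw.Reddit` client that is only created in the following cell, so it cannot be run as-is on a fresh kernel.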

subreddit = 'ireland' # subreddit we want to scrape
postLimit = 10 # the amount of posts we want, None = All of them

subreddit = r.subreddit(subreddit)

posts = [*subreddit.top(limit=postLimit)] # top posts all time

titles = [posts.title for posts in posts] # list of title of posts

posts = pd.DataFrame({"title": titles,})

posts.head()

In [276]:
# ---- Search configuration ----
keywords = ['housing market', 'housing', 'property', 'real estate', 'construction', 'building', 'infrastructure', 'labor', 'wages']
subredditList = ['ireland', 'europe', 'germany', 'france']
# 1000 is the max value allowed by Reddit API. The limit is applied to every
# (subreddit, keyword) search; int(1000/len(subredditList)) would instead
# divide that maximum by the number of subreddits searched.
postLimit = 100

startTime = time.time_ns()

cols = ["Subreddit",'Original Title',"Title"]
cols += keywords

# Keywords are also searched for in German and French, since titles from
# r/germany and r/france are matched in their original language.
germanKeywords = [translateText(word,'de') for word in keywords ]

frenchKeywords = [translateText(word,'fr') for word in keywords ]

keywordsMatch = keywords + germanKeywords + frenchKeywords

# English keyword -> its lower-cased spellings in all three languages.
# Titles are lower-cased before matching, so the spellings must be too;
# otherwise capitalised translations (e.g. German nouns) can never match.
# Empty spellings are dropped because "" is a substring of every title.
keywordVariants = {
    word: {variant.lower() for variant in (word, german, french) if variant}
    for word, german, french in zip(keywords, germanKeywords, frenchKeywords)
}

# One client serves every subreddit; it does not depend on the loop variable.
r = praw.Reddit(user_agent=username,
            client_id=id,
            client_secret=secret,
            check_for_async=False)

# Rows are collected in a list and turned into a DataFrame once, instead of
# re-copying the whole frame with pd.concat for every post.
rows = []

for subreddit in subredditList:

    s = r.subreddit(subreddit)

    for w in keywords:

        for submission in s.search(w, limit=postLimit):

            ogTitle = submission.title
            loweredTitle = ogTitle.lower()

            row = {'Subreddit': subreddit,
                   'Original Title': ogTitle,
                   # translate to English, then tokenize / remove stop words
                   'Title': cleanText(translateText(ogTitle))}

            # 1 when the original title contains the keyword in any language.
            for word, variants in keywordVariants.items():
                row[word] = int(any(variant in loweredTitle for variant in variants))

            # 1 when at least one keyword was found, otherwise 0 (always set,
            # so the column never contains missing values).
            row['Match'] = int(any(row[word] for word in keywords))

            rows.append(row)

data = pd.DataFrame(rows, columns=cols + ['Match'])

data = sentimentAnalysis(data) # function gets sentiment scores and labels them based on threshold value

endTime = time.time_ns()

runTimeSeconds = int(endTime - startTime)/1000000000

print(f'Running analysis on {len(data)} posts with a limit of {postLimit} took {runTimeSeconds} seconds')

Running analysis on 3325 posts with a limit of 100 took 883.0920258 seconds


Time analysis data from trial and error:

Running analysis on 6579 posts with a limit of 500 took 1851.6176345 seconds



The results are exported to a file whose name is timestamped with the current date and time. This keeps each file name unique, because consecutive runs are naturally more than a minute apart. If this were scaled up to run asynchronously, on a millisecond timescale, the code and the file-name structure would need to be optimized.

In [277]:
# Timestamp truncated to whole seconds; colons are avoided in the file name.
now = datetime.now().replace(microsecond=0)
fileName = f"redditSentiment-{now:%Y-%m-%d %H-%M-%S}.csv"

# Write the scored posts without the DataFrame index column.
data.to_csv(fileName, index=False)