# Reddit API Script to collect posts and perform sentiment analysis

In [301]:
import os
import sys
import time
from datetime import datetime
from functools import lru_cache

import matplotlib.pyplot as plt
import nltk # Sentiment analysis module
import numpy as np
import pandas as pd
import praw # reddit API crawler
from deep_translator import GoogleTranslator
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize, RegexpTokenizer

This only needs to be run once, on the first run: it downloads the VADER lexicon, the punkt tokenizer and the stop word library.

In [302]:

def downloadNLTK():
    """Download the NLTK resources used by this notebook.

    Fetches, in order, the VADER lexicon, the punkt tokenizer and the stop
    word corpus through ``nltk.download``.
    """
    # vader_lexicon -> lexicons data, punkt -> tokenizer, stopwords -> stop word lists
    for resource in ('vader_lexicon', 'punkt', 'stopwords'):
        nltk.download(resource)

#downloadNLTK()

In [303]:
# Credentials are read from the environment so that they never live in the
# notebook itself. Set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before
# starting the kernel; a missing variable raises KeyError in this cell.
# NOTE(review): the client id/secret that were previously hardcoded here
# should be treated as leaked and regenerated.
username = os.environ.get('REDDIT_USER_AGENT', 'balackdynamite') # Personal Reddit Account
id = os.environ['REDDIT_CLIENT_ID'] # NOTE: shadows the builtin id(); name kept for compatibility
secret = os.environ['REDDIT_CLIENT_SECRET']

# Reddit client used by every scraping cell below.
r = praw.Reddit(user_agent=username,
                client_id=id,
                client_secret=secret,
                check_for_async=False)

As I am getting posts from non-English-speaking subreddits, I can see potential issues in getting sentiment scores. I am going to use Google Translate to convert all of the titles to English.

In [304]:
def translateText(data):
    """Translate ``data`` to English, falling back to the input on failure.

    Parameters
    ----------
    data : str
        Text to translate; the source language is auto-detected.

    Returns
    -------
    str
        The English translation, or ``data`` unchanged when the translator
        raises an error or does not return a string.
    """
    try:
        translation = GoogleTranslator(source='auto', target='en').translate(data)

    # Only swallow ordinary errors: a bare ``except`` would also trap
    # KeyboardInterrupt/SystemExit and make a long scrape impossible to stop.
    except Exception:
        return data

    # Defensive: keep the original text if the translator hands back a
    # non-string (e.g. None) -- TODO confirm whether deep_translator can do this.
    if not isinstance(translation, str):
        return data

    return translation

I would also like to tokenize my titles and remove stop words to help improve the accuracy of the sentiment scores.

In [305]:
def listToString(s):
    """Join the strings in ``s`` into one sentence separated by single spaces."""
    return " ".join(s)

In [306]:
@lru_cache(maxsize=None)
def _englishStopWords():
    """Return NLTK's English stop words as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def cleanText(text):
    """Lowercase ``text``, tokenize it and drop English stop words.

    Parameters
    ----------
    text : str
        Raw (already translated) title.

    Returns
    -------
    str
        The remaining word tokens joined by single spaces. Quotes and dashes
        are removed before tokenizing, so "housing-market" becomes
        "housingmarket" and "don't" becomes "dont".
    """
    text = text.replace("'", "").replace("-", "").lower() # removing quotes and dashes

    tk = nltk.tokenize.RegexpTokenizer(r'\w+') # split up text into words

    # Cached frozenset: avoids re-reading the corpus and scanning a list for
    # every token of every title.
    stop_words = _englishStopWords()

    words = [w for w in tk.tokenize(text) if w not in stop_words]

    return " ".join(words) # convert back to string sentence

In [307]:
def dataframeSentiment(data):
    """Append VADER polarity scores for every title in ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'Title' column of strings.

    Returns
    -------
    pandas.DataFrame
        A new frame: ``data`` plus the columns 'neg', 'neu', 'pos' and
        'compound', aligned on the original index. The score columns are
        present even when ``data`` has no rows.
    """
    s = SentimentIntensityAnalyzer()

    # Build the score frame in one go instead of creating a Series per row;
    # explicit columns keep the schema stable for an empty input.
    res = pd.DataFrame(
        [s.polarity_scores(title) for title in data['Title']],
        index=data.index,
        columns=['neg', 'neu', 'pos', 'compound'],
    )

    return pd.concat([data, res], axis=1)

Threshold values need to be selected to classify the titles into labels: Positive, Negative and Neutral.

From research a value of 0.05 is typically used as optimal: https://www.researchgate.net/figure/Typical-threshold-values-used-in-VADER-20_fig5_350761338


In [308]:
def dataframeSentimentLabel(data,threshold=0.05):
    """Label each row as "neg", "neu" or "pos" from its 'compound' score.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a numeric 'compound' column. A 'label' column is added
        to this frame in place.
    threshold : float, default 0.05
        Scores <= -threshold are "neg", scores >= threshold are "pos" and
        scores strictly in between are "neu".

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, now with the 'label' column.
    """
    compound = data['compound']

    conditions = [
        compound <= -threshold,
        (compound > -threshold) & (compound < threshold),
        compound >= threshold,
        ]

    values = ["neg", "neu", "pos"]

    # np.select's implicit default is the integer 0, which recent NumPy
    # releases refuse to promote together with string choices. Passing the
    # string '0' keeps the value older NumPy produced for rows that match no
    # condition (only possible for a NaN score).
    data['label'] = np.select(conditions, values, default='0')

    return data

In [309]:
def sentimentAnalysis(data):
    """Score the titles in ``data`` and attach a sentiment label to each row.

    Runs ``dataframeSentiment`` followed by ``dataframeSentimentLabel`` (with
    its default threshold) and returns the resulting frame.
    """
    scored = dataframeSentiment(data)
    return dataframeSentimentLabel(scored)

Initial Test of scraping post titles using the Reddit API

# Smoke test: pull a handful of top posts from one subreddit and show their titles.
subreddit = 'ireland' # subreddit we want to scrape
postLimit = 10 # the amount of posts we want, None = All of them

subreddit = r.subreddit(subreddit)

# top posts all time, materialised into a list
posts = list(subreddit.top(limit=postLimit))

# list of title of posts
titles = [post.title for post in posts]

posts = pd.DataFrame({"title": titles})

posts.head()

In [None]:
keywords = ['housing market', 'housing', 'property', 'real estate', 'construction', 'building', 'infrastructure', 'labor', 'wages']
subredditList = ['ireland', 'europe', 'germany', 'france']
postLimit = 10000

startTime = time.time_ns()

cols = ["Subreddit",'Original Title',"Title"]
cols += keywords

# Collect plain dict rows and build the DataFrame once after the loop;
# concatenating onto a growing frame for every post copies it each time.
rows = []

for subreddit in subredditList:

    for submission in r.subreddit(subreddit).top(limit=postLimit):

        ogTitle = submission.title

        if ogTitle is None:
            print('Possible Error, no Title')
            continue

        # translate titles to english, then tokenize, remove stop words and make sentence
        title = cleanText(translateText(ogTitle))

        row = {'Subreddit': subreddit,
               'Original Title': ogTitle,
               'Title': title}

        # 1/0 flag per keyword: plain substring match against the cleaned title
        for keyword in keywords:
            row[keyword] = int(keyword in title)

        rows.append(row)

# columns=cols keeps the expected schema and column order even with no rows
data = pd.DataFrame(rows, columns=cols)

data = sentimentAnalysis(data) # function gets sentiment scores and labels them based on threshold value

endTime = time.time_ns()

runTimeSeconds = (endTime - startTime) / 1_000_000_000

# Report the number of posts actually collected, not postLimit * subreddits:
# a listing may return fewer posts than requested and untitled posts are skipped.
print(f'Running analysis on {len(data)} posts took {runTimeSeconds} seconds')

Exporting to file

In [None]:
# Timestamp (to the second) for the export; ':' is not used so the name is
# a valid file name everywhere.
now = datetime.now().replace(microsecond=0)

fileName = f'redditSentiment-{now:%Y-%m-%d %H-%M-%S}.csv'

data.to_csv(fileName, index=False)