# Reddit API Script to collect posts

In [8]:
import datetime as dt
import os
import sys
from itertools import chain
from pprint import pprint

import matplotlib.pyplot as plt
import nltk # Sentiment analysis module
import numpy as np
import pandas as pd
import praw # reddit API crawler
from deep_translator import GoogleTranslator
from nltk.corpus import stopwords
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize, RegexpTokenizer

This cell only needs to be run once: it downloads the VADER lexicon, the punkt tokenizer and the stop-word library.

In [9]:

def downloadNLTK():
    """Download the NLTK resources this notebook relies on.

    Fetches, in order, the VADER lexicon, the punkt tokenizer data and the
    stop-word corpus via ``nltk.download``. Returns nothing.
    """
    resources = (
        'vader_lexicon',  # get lexicons data
        'punkt',          # for tokenizer
        'stopwords',
    )
    for resource in resources:
        nltk.download(resource)

#downloadNLTK()

In [10]:
# Reddit API credentials are read from the environment so that secrets are
# never stored in (or committed with) the notebook. Export REDDIT_CLIENT_ID
# and REDDIT_CLIENT_SECRET before starting the kernel; a missing variable
# raises a KeyError that names it.
username = 'balackdynamite' # Personal Reddit Account, sent as the user agent
client_id = os.environ['REDDIT_CLIENT_ID']  # named client_id so the builtin id() is not shadowed
secret = os.environ['REDDIT_CLIENT_SECRET']

# Shared Reddit client used by the scraping cells below.
r = praw.Reddit(user_agent=username,
                client_id=client_id,
                client_secret=secret,
                check_for_async=False)

Initial Test of scraping post titles using the Reddit API

In [11]:
# Fetch the top posts of one subreddit and tabulate their titles.
postLimit = 10 # the amount of posts we want, None = All of them

subreddit = r.subreddit('ireland') # subreddit we want to scrape

top_posts = list(subreddit.top(limit=postLimit)) # top posts all time

# Collect the title of every fetched post, preserving the ranking order.
titles = []
for post in top_posts:
    titles.append(post.title)

posts = pd.DataFrame({"title": titles})

posts.head()

Unnamed: 0,title
0,"The President’s dog, Síoda, has passed away. R..."
1,Let's have a cup of tea and let this all blow ...
2,Italy great bunch of lads.
3,Ireland stands with Ukraine
4,Ban Americans traveling until they sort their ...


In [None]:
# Search several subreddits for housing-related posts and collect one row per
# matching post in `data` (columns: 'Subreddit', 'Title').
keywords = ['housing market', 'property market', 'real estate', 'construction']
subredditList = ['ireland', 'europe', 'germany', 'france']
postLimit = 5  # maximum number of results requested per subreddit

# Quote each keyword so multi-word phrases are searched as phrases; unquoted,
# the OR operators would bind to the individual neighbouring words instead.
query = ' OR '.join(f'"{keyword}"' for keyword in keywords)

# Accumulate rows in a plain list and build the DataFrame once. Assigning
# whole columns inside the loop would overwrite the previous subreddit's
# results (and fail when result counts differ between subreddits).
records = []
for subreddit_name in subredditList:
    search_results = r.subreddit(subreddit_name).search(query, limit=postLimit)
    records.extend(
        {'Subreddit': subreddit_name, 'Title': result.title}
        for result in search_results
    )

# Passing `columns` keeps the expected schema even when nothing was found.
data = pd.DataFrame(records, columns=['Subreddit', 'Title'])

data.head(20)

As I am getting posts from non-English-speaking subreddits, I can see potential issues in getting sentiment scores. I am going to use Google Translate to convert all of the titles to English.

In [None]:
# Translate every title to English, auto-detecting the source language.
# The translator is built once rather than once per title.
translator = GoogleTranslator(source='auto', target='en')

# .map keeps each translation aligned with its row whatever the index looks
# like; writing through .loc with an enumerate() counter is only correct for
# a 0..n-1 index and would otherwise append new rows.
data['Title'] = data['Title'].map(translator.translate)

data.head(20)

In [None]:
# Score every title with VADER and attach the scores as new columns of `data`.
s = SentimentIntensityAnalyzer()

# One dict of scores per title -> one row per title. Building the frame from
# a list of dicts avoids constructing a pd.Series for every row, and reusing
# data.index keeps the scores aligned with their titles.
res = pd.DataFrame(
    [s.polarity_scores(title) for title in data['Title']],
    index=data.index,
)

# Drop score columns left over from a previous run of this cell before
# concatenating; otherwise re-running would create duplicate column names and
# data['compound'] would stop being a single Series.
data = pd.concat([data.drop(columns=res.columns, errors='ignore'), res], axis=1)

data.head()

Threshold values need to be selected to classify the titles into labels: Positive, Negative and Neutral.

From research, a threshold value of 0.05 is typically used: https://www.researchgate.net/figure/Typical-threshold-values-used-in-VADER-20_fig5_350761338


In [None]:
# Classify each title as negative / neutral / positive from its compound score.
THRESHOLD = 0.05  # scores strictly inside (-THRESHOLD, THRESHOLD) count as neutral

compound = data['compound']

conditions = [
    compound <= -THRESHOLD,
    compound.abs() < THRESHOLD,
    compound >= THRESHOLD,
    ]

values = ["neg", "neu", "pos"]

# Pass an explicit string default: np.select's implicit default is the
# integer 0, which has no common dtype with the string labels (a TypeError on
# recent NumPy, a stray '0' label on older releases). The three conditions
# cover every real number, so the default only applies to a missing score.
data['label'] = np.select(conditions, values, default="neu")

data.head()