# Reddit API Script to collect posts

In [178]:
import pandas as pd
import numpy as np

import datetime as dt
from pprint import pprint
from itertools import chain

import praw # reddit API crawler

import nltk # Sentiment analysis module
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords

import matplotlib.pyplot as plt

import sys

from deep_translator import GoogleTranslator

Only need to run this for first run, it downloads the VADER lexicon, punkt tokenizer and stop word library

In [179]:

def downloadNLTK():

    nltk.download('vader_lexicon')  # get lexicons data
    nltk.download('punkt')  # for tokenizer
    nltk.download('stopwords')

#downloadNLTK()

In [180]:
username = 'balackdynamite' # Personal Reddit Account
id = 'RcSucsCZw-A0pEmsaaqaQA'
secret = 'oIF0qAUUXuh9QtbsgOJwwQM9j4vJLw'

r = praw.Reddit(user_agent=username,
                client_id=id,
                client_secret=secret,
                check_for_async=False)

Initial Test of scraping post titles using the Reddit API

In [181]:
subreddit = 'ireland' # subreddit we want to scrape
postLimit = 10 # the amount of posts we want, None = All of them

subreddit = r.subreddit(subreddit)

posts = [*subreddit.top(limit=postLimit)] # top posts all time

titles = [posts.title for posts in posts] # list of title of posts

posts = pd.DataFrame({"title": titles,})

posts.head()

Unnamed: 0,title
0,"The President’s dog, Síoda, has passed away. R..."
1,Let's have a cup of tea and let this all blow ...
2,Italy great bunch of lads.
3,Ireland stands with Ukraine
4,Ban Americans traveling until they sort their ...


In [182]:
keywords = ['housing market', 'property market', 'real estate', 'construction']
subredditList = ['ireland', 'europe', 'germany', 'france']
postLimit = 5

data = pd.DataFrame()

for subreddit in subredditList:
    
    s = r.subreddit(subreddit)

    search_results = s.search(' OR '.join(keywords), limit=postLimit)

    titles = [result.title for result in search_results]

    data['Subreddit']= subreddit
    data['Title']= titles

data.head(20)

Unnamed: 0,Subreddit,Title
0,france,Need help to find a french real estate agency
1,france,Une histoire une peu mignonne.
2,france,Mesmerizing sun-and-grape mosaic roundel from ...
3,france,"The Inquiring Photographer, il y a 100 ans, de..."
4,france,Questions about the real estate wealth tax on ...


As I am getting posts from non english speaking subreddits I can see potential issues in getting sentiment scores. I am going to use google translate to change the titles to be all in english.

In [183]:
for i, title in enumerate(data['Title']):

    translation = GoogleTranslator(source='auto', target='en').translate(title)

    data.loc[i, 'Title'] = translation

data.head(20)

Unnamed: 0,Subreddit,Title
0,france,Need help to find a french real estate agency
1,france,A bit of a cute story.
2,france,Mesmerizing sun-and-grape mosaic roundel from ...
3,france,The Inquiring Photographer 100 years ago asked...
4,france,Questions about the real estate wealth tax on ...


In [184]:
s = SentimentIntensityAnalyzer()

res = data['Title'].apply(lambda x: pd.Series(s.polarity_scores(x)))

data = pd.concat([data, res], axis=1)

data.head()

Unnamed: 0,Subreddit,Title,neg,neu,pos,compound
0,france,Need help to find a french real estate agency,0.0,0.722,0.278,0.4019
1,france,A bit of a cute story.,0.0,0.5,0.5,0.4588
2,france,Mesmerizing sun-and-grape mosaic roundel from ...,0.0,1.0,0.0,0.0
3,france,The Inquiring Photographer 100 years ago asked...,0.128,0.872,0.0,-0.4215
4,france,Questions about the real estate wealth tax on ...,0.0,0.758,0.242,0.4939
