In [None]:
# This notebook retrieves several data points from Reddit and saves them to CSV files

# Imports
import praw
import pandas as pd
import numpy as np
import datetime
import os

In [None]:
# Choose a subreddit to retrieve data from.
# This name is also used as the prefix of every CSV file written under ./results.
subreddit = 'christianity'

# Choose the number of hot posts to retrieve from the subreddit.
# It is handed to getHotPosts, which forwards it as the `limit` of the hot listing.
num_posts_to_retrieve = 30

In [None]:
# Login and create Reddit instance.
#
# Credentials are read from the environment instead of being written into the
# notebook, so that secrets are never saved or shared together with the code.
# Before starting the kernel, export:
#   REDDIT_CLIENT_ID      - the app's client id
#   REDDIT_CLIENT_SECRET  - the app's client secret
#   REDDIT_USER_AGENT     - the user agent string to send (previously the account name)
# A missing variable raises KeyError naming the variable that is not set.
# NOTE: any credentials that were previously committed in this cell should be
# considered leaked and revoked in the Reddit app settings.
myClientIDvar = os.environ['REDDIT_CLIENT_ID']
myClientSecret = os.environ['REDDIT_CLIENT_SECRET']
myRedditUserName = os.environ['REDDIT_USER_AGENT']
reddit = praw.Reddit(client_id=myClientIDvar, client_secret=myClientSecret, user_agent=myRedditUserName)

In [None]:
"""
Returns a generator that generates the n hottest posts from a subreddit

Ex: Get the hottest 30 posts from r/christianity:
  generator = getHotPosts(30, 'christianity')
"""

def getHotPosts(numPosts, subreddit):
  """
  Ask the module-level `reddit` client for the hot listing of a subreddit.

  Parameters:
    numPosts:  passed through as the `limit` argument of the hot listing
    subreddit: name of the subreddit, without the 'r/' prefix

  Returns:
    Whatever `reddit.subreddit(subreddit).hot(limit=numPosts)` returns; the rest
    of this notebook consumes it one item at a time with `next()`.
  """
  target_subreddit = reddit.subreddit(subreddit)
  hot_listing = target_subreddit.hot(limit=numPosts)
  return hot_listing

In [None]:
""""
Return a DataFrame of posts and information from a generator

Ex: Return a DataFrame with info about the 30 hottest posts on r/christianity
  generator = getHotPosts(30, 'christianity')
  dfPosts = getHotPostsDf(generator, 10)
  dfPosts = pd.concat([dfPosts, getHotPostsDf(generator, 10)])
  dfPosts = pd.concat([dfPosts, getHotPostsDf(generator, 10)])
"""

def getHotPostsDf(postings_generator, n_postings):
  """
  Pull up to `n_postings` submissions from an iterator and tabulate them.

  Parameters:
    postings_generator: an iterator of submission objects (it is advanced with
                        `next()`, so a plain list must be wrapped in `iter()`)
    n_postings:         maximum number of submissions to consume in this call

  Returns:
    A DataFrame with the columns ID, Sub, Title, URL, Time, Author, Body and
    IsSelfPost, one row per consumed submission. The columns are present even
    when the iterator yields nothing.

  Notes:
    - Only exhaustion of the iterator (StopIteration) ends the loop early; any
      other error raised while fetching a submission propagates to the caller
      instead of being reported as "no more posts".
    - 'Time' is produced by `datetime.fromtimestamp` without a tz argument, so
      it is a naive datetime expressed in the local timezone of the machine.
  """
  columns = ['ID', 'Sub', 'Title', 'URL', 'Time', 'Author', 'Body', 'IsSelfPost']
  post_info_list = []

  for _ in range(n_postings):
    try:
      submission = next(postings_generator)
    except StopIteration:
      # The iterator ran dry before n_postings items were produced.
      print("No more posts to generate")
      break

    post_info_list.append({
      'ID': submission.id,
      'Sub': submission.subreddit,
      'Title': submission.title,
      'URL': submission.permalink,
      'Time': datetime.datetime.fromtimestamp(submission.created_utc),
      # str() so that a missing author (None) is stored as text
      'Author': str(submission.author),
      'Body': submission.selftext,
      'IsSelfPost': bool(submission.is_self)
    })

  # Explicit columns keep the schema stable for an empty result.
  return pd.DataFrame(post_info_list, columns=columns)

In [None]:
"""
Return a Dataframe with information about the authors of posts from a Dataframe of posts
"""
def getAuthorInfoDF(dfPosts):
  """
  Look up the author of every post in `dfPosts` and tabulate account statistics.

  Parameters:
    dfPosts: DataFrame with an 'ID' column holding submission ids

  Returns:
    A DataFrame with the columns userName, numTrophies, accountAge, totalKarma,
    linkKarma, commentKarma, awarderKarma, awardeeKarma, isMod and numComments.
    There is one row per post whose author could be read, so an author with
    several posts appears several times. The columns are present even when no
    author qualifies.

  Notes:
    - Posts whose author is None, or whose author object has no `created_utc`
      attribute, are skipped.
    - 'accountAge' stores the raw `created_utc` value of the account.
    - 'numComments' counts the items yielded by
      `comments.top(time_filter="month")`.
      NOTE(review): the listing is requested without an explicit limit, so the
      count is presumably capped by praw's default listing limit - confirm.
  """
  columns = [
    'userName', 'numTrophies', 'accountAge', 'totalKarma', 'linkKarma',
    'commentKarma', 'awarderKarma', 'awardeeKarma', 'isMod', 'numComments',
  ]
  author_list = []

  for post_id in dfPosts['ID']:
    # Get Entry Information
    thisSubmission = reddit.submission(id=post_id)
    thisAuthor = thisSubmission.author

    # Verifies that author exists and has not been suspended
    # (as suspended accounts do not have most attributes)
    if thisAuthor is None or not hasattr(thisAuthor, 'created_utc'):
      continue

    # Get Author Information
    TempDictAuthor = {
      'userName': thisAuthor.name,
      'numTrophies': len(thisAuthor.trophies()),
      'accountAge': thisAuthor.created_utc,
      'totalKarma': thisAuthor.total_karma,
      'linkKarma': thisAuthor.link_karma,
      'commentKarma': thisAuthor.comment_karma,
      'awarderKarma': thisAuthor.awarder_karma,
      'awardeeKarma': thisAuthor.awardee_karma,
      'isMod': thisAuthor.is_mod,
    }
    TempDictAuthor['numComments'] = sum(1 for _ in thisAuthor.comments.top(time_filter="month"))

    author_list.append(TempDictAuthor)

    # Report the number of authors actually stored; skipped posts do not count.
    print('Retrieved data for ' + str(len(author_list)) + ' authors')

  # Explicit columns so the 'isMod' cast below also works on an empty result.
  dfAuthors = pd.DataFrame(author_list, columns=columns)
  dfAuthors['isMod'] = dfAuthors['isMod'].astype('bool')

  return dfAuthors

In [None]:
"""
Return a DataFrame with information about the posts that the given authors submitted (their top posts of the past month)
"""
def getPostsByAuthors(dfAuthors):
  """
  Gather each author's submissions from their `submissions.top(time_filter="month")` listing.

  Parameters:
    dfAuthors: DataFrame with a "userName" column; repeated names are queried once

  Returns:
    A DataFrame with one row per submission and the columns ID, Author, Sub,
    Title, Body, Upvotes, Upvote Ratio, Awards, URL, Time and IsSelfPost.
    When nothing is collected the result is an empty DataFrame without columns.
  """
  collected_rows = []
  unique_names = dfAuthors["userName"].drop_duplicates()

  for redditor_name in unique_names:
    redditor = reddit.redditor(name=redditor_name)
    monthly_top = redditor.submissions.top(time_filter="month")

    # One record per submission yielded by the listing.
    collected_rows.extend(
      {
        'ID': post.id,
        'Author': str(redditor),
        'Sub': post.subreddit,
        'Title': post.title,
        'Body': post.selftext,
        'Upvotes': post.ups,
        'Upvote Ratio': post.upvote_ratio,
        'Awards': post.total_awards_received,
        'URL': post.permalink,
        'Time': datetime.datetime.fromtimestamp(post.created_utc),
        'IsSelfPost': bool(post.is_self)
      }
      for post in monthly_top
    )

  posts_frame = pd.DataFrame(collected_rows)
  return posts_frame

In [None]:
"""
Return a Dataframe with information about the comments from posts
Retrieves a maximum of 20 comments per post
"""
def getCommentsByPosts(dfPosts, max_comments_per_post=20):
  """
  Collect comments for every post listed in `dfPosts`.

  Parameters:
    dfPosts:               DataFrame with an "ID" column holding submission ids
    max_comments_per_post: how many entries of each post's flattened comment
                           list are examined (default 20, the previous fixed
                           value). Pass None to examine all of them. Expected
                           to be non-negative.

  Returns:
    A DataFrame with the columns ID, Author, Post ID and Body, one row per
    collected comment. The columns are present even when nothing is collected.

  Notes:
    Entries without an `is_submitter` attribute are skipped, and they still
    count towards `max_comments_per_post` (same as before).
  """
  columns = ['ID', 'Author', 'Post ID', 'Body']
  comments_list = []

  for postID in dfPosts["ID"]:

    thisSubmission = reddit.submission(id=postID)
    # NOTE(review): per the praw docs, limit=0 removes the "load more comments"
    # placeholders without fetching them - confirm against the installed version.
    thisSubmission.comments.replace_more(limit=0)

    # Only the first max_comments_per_post entries of the flattened list are visited.
    for tempComment in thisSubmission.comments.list()[:max_comments_per_post]:
      if not hasattr(tempComment, 'is_submitter'):
        continue

      comments_list.append({
        'ID': tempComment.id,
        'Author': tempComment.author,
        'Post ID': tempComment.submission,
        'Body': str(tempComment.body),
      })

  # Explicit columns keep the schema stable for an empty result.
  return pd.DataFrame(comments_list, columns=columns)

In [None]:
'''
Converts a Dataframe to CSV File. If './results/target_csv_file' doesn't exist, creates the file.
Otherwise appends the dataframe to the existing file
'''

def DFtoCSV(df, target_csv_file='output.csv'):
  """
  Write `df` to './results/<target_csv_file>', creating './results' when needed.

  A file that does not exist yet is created with a header row. An existing
  file is appended to without repeating the header, so calling this twice
  with the same data stores the rows twice.
  """
  results_dir = './results'
  if not os.path.exists(results_dir):
    os.makedirs(results_dir)

  path = './results/' + target_csv_file
  already_written = os.path.exists(path)

  # Append without header to an existing file; otherwise write a fresh file.
  df.to_csv(
    path,
    mode='a' if already_written else 'w',
    index=False,
    header=not already_written,
  )


In [None]:
'''
Reads data from a CSV file found in the 'results' folder and returns a Dataframe
'''
def CSVtoDF(csv_file):
  """
  Load './results/<csv_file>' with `pd.read_csv` and return the resulting DataFrame.
  """
  csv_path = './results/' + csv_file
  loaded_frame = pd.read_csv(csv_path)
  return loaded_frame

In [None]:
# Hot listing of the configured subreddit, limited to the configured number of posts.
post_generator = getHotPosts(
  numPosts=num_posts_to_retrieve,
  subreddit=subreddit,
)

In [None]:
# Consume the configured number of posts from the generator and save them.
# The count comes from num_posts_to_retrieve (instead of a literal 30) so that
# changing the configuration cell is enough to change how many posts are stored.
# DFtoCSV appends to an existing file, so re-running this cell adds rows again.
dfPosts = getHotPostsDf(post_generator, num_posts_to_retrieve)
DFtoCSV(dfPosts, subreddit + '_hot_posts.csv')

In [None]:
# Reload the saved hot posts, look up their authors and save the author table.
dfPosts = CSVtoDF(csv_file=subreddit + '_hot_posts.csv')
DFtoCSV(
  df=getAuthorInfoDF(dfPosts=dfPosts),
  target_csv_file=subreddit + '_authors.csv',
)

In [None]:
# Reload the saved authors, collect their posts and save them.
dfAuthors = CSVtoDF(csv_file=subreddit + '_authors.csv')
DFtoCSV(
  df=getPostsByAuthors(dfAuthors=dfAuthors),
  target_csv_file=subreddit + '_posts_by_authors.csv',
)

In [None]:
# Reload the saved posts written by the authors (not the original hot posts),
# collect comments on those posts and save them.
dfPostsByAuthors = CSVtoDF(csv_file=subreddit + '_posts_by_authors.csv')
DFtoCSV(
  df=getCommentsByPosts(dfPosts=dfPostsByAuthors),
  target_csv_file=subreddit + '_comments.csv',
)