In [1]:
import requests
import pandas as pd
import numpy as np
from praw.models import MoreComments

In [2]:
import praw

In [3]:
# No credentials are passed here, so PRAW has to pick them up from its own
# configuration (a praw.ini site or praw_* environment variables); nothing is
# hardcoded in this notebook.
reddit = praw.Reddit()

In [4]:
# Read-only mode: this notebook only fetches listings and comments.
reddit.read_only = True

# API call

In [5]:
# Subreddit handles to scrape. The list order matters: later cells index into
# `subreddits` positionally (0 -> bitcoin, 1 -> ethereum).
bitcoin, ethereum = (reddit.subreddit(name) for name in ("bitcoin", "ethereum"))

subreddits = [bitcoin, ethereum]


In [11]:
#https://praw.readthedocs.io/en/stable/code_overview/models/submission.html

#https://www.reddit.com/r/redditdev/comments/rsz7za/getting_submissions_from_praw_extremely_slow/

'''The slowdown is mostly due to PRAW. In the loop in line 42 you are reading a bunch of attributes of each 
submission (id, title, ...). Most of them are read when you get the subreddit feeds, but not all of them. 
The missing ones seem to be **author.name** and **upvote_ratio**. If you are trying to call those attributes, a new 
network request has to be made, which takes around half a second for each post. Check this part of the documentation 
for a better explanation.'''


        
'''replace_more, and just generally, digging deep into a Reddit comment tree, is a 
           slow operation and will consume your rate limit (thus the slowdown you see).'''
        
'''When you access the .upvote_ratio attribute it makes a request to fetch "all" the comments for that post. 

If so that would mean you need to make another request for each post - which would be another 100 http requests - 
perhaps this is where the slow down is happening? This would mean 300 http requests for new, hot, and rising 
combined per subreddit just for this upvote_ratio'''


def call(i, limit=10, sort='top', verbose=True):
    """Scrape one subreddit listing into a posts table and a comments table.

    Parameters
    ----------
    i : praw.models.Subreddit
        Subreddit handle, e.g. one element of ``subreddits``.
    limit : int or None, default 10
        Maximum number of submissions to pull from the listing (passed
        straight through to PRAW as ``limit=``).
    sort : {'top', 'new', 'hot'}, default 'top'
        Which subreddit listing to walk.
    verbose : bool, default True
        Print the posts table before returning (the historical behaviour of
        this function). Pass ``False`` to keep the cell output clean.

    Returns
    -------
    (df_posts, df_comments) : tuple of pandas.DataFrame
        ``df_posts`` has one row per submission, ``df_comments`` one row per
        comment across all fetched submissions. ``comment_post_id`` holds the
        parent submission's id as a plain string so it can be joined against
        ``df_posts['post_id']``.

    Raises
    ------
    ValueError
        If ``sort`` is not one of the supported listings.

    Notes
    -----
    This is slow by nature. Per the notes above this cell, ``upvote_ratio`` is
    not part of the listing payload, so reading it costs an extra request per
    post, and ``replace_more(limit=None)`` expands the whole comment tree.
    """
    # Bound methods only; the listing is created once `sort` is validated.
    listings = {'top': i.top, 'new': i.new, 'hot': i.hot}
    if sort not in listings:
        raise ValueError(
            "sort must be one of %s, got %r" % (sorted(listings), sort)
        )

    # Explicit column lists keep the schema stable even when nothing is fetched.
    post_columns = [
        'title',
        'post_num_comments',
        'post_upvotes',
        'post_id',
        'post_upvote_ratios',
        'post_datetime',
        'post_score',
        'post_text',
        'post_upvote_ratio',
        'post_author',
    ]
    comment_columns = [
        'comment_author',
        'comment_body',
        'comment_submission',
        'comment_upvotes',
        'comment_post_id',
    ]

    post_rows = []
    comment_rows = []

    for submission in listings[sort](limit=limit):
        upvote_ratio = submission.upvote_ratio  # slowdown: extra request per post
        post_rows.append({
            'title': submission.title,
            'post_num_comments': submission.num_comments,
            'post_upvotes': submission.score,
            'post_id': submission.id,
            # Both spellings are kept: existing consumers may read either one.
            'post_upvote_ratios': upvote_ratio,
            'post_datetime': submission.created_utc,
            'post_score': submission.score,
            'post_text': submission.selftext,
            'post_upvote_ratio': upvote_ratio,
            'post_author': submission.author,
        })

        # https://praw.readthedocs.io/en/v7.3.0/tutorials/comments.html
        # slowdown: resolve every "load more comments" placeholder first so
        # that .list() yields only real comments.
        submission.comments.replace_more(limit=None)

        for comment in submission.comments.list():
            comment_rows.append({
                'comment_author': comment.author,
                'comment_body': comment.body,
                'comment_submission': comment.submission,
                'comment_upvotes': comment.score,
                # String key for merging with df_posts['post_id'].
                'comment_post_id': submission.id,
            })

    # Build each frame once from the collected records.
    df_posts = pd.DataFrame(post_rows, columns=post_columns)
    df_comments = pd.DataFrame(comment_rows, columns=comment_columns)

    if verbose:
        print(df_posts)

    return df_posts, df_comments

In [12]:
# want to make variable variables here. Seems unpopular however

In [None]:
# Scrape r/ethereum first, then r/bitcoin; each call hands back a
# (posts, comments) pair of DataFrames.
ethereum_frames = call(subreddits[1])
bitcoin_frames = call(subreddits[0])

df_posts_ethereum, df_comments_ethereum = ethereum_frames
df_posts_bitcoin, df_comments_bitcoin = bitcoin_frames
            



                                               title  post_num_comments  \
0  Vitalik Buterin: Cryptocurrency Should Focus L...                896   
1  Yesterday I received my very first payment for...               1318   
2                                              Nft 😑               2070   
3  Reddit announces partnership with the Ethereum...                893   
4  Bitcoin Miami Conference warns attendees it's ...                806   
5  The Ethereum blockchain now processes about as...                537   
6  I see everyone getting exited over burning ETH...                442   
7                                  Mark mic dropping                917   
8  Goldman Sachs calls Ethereum "The Amazon Of In...                504   
9  Why is this address sending thousands of 0 ETH...                460   

   post_upvotes post_id  post_upvote_ratios  post_datetime  post_score  \
0          7732  7mve4y                0.91   1.514566e+09        7733   
1          7497  nwcnzg   

In [None]:
# Sanity-check the r/ethereum posts table; call() pulls 10 posts by default,
# so head(10) shows every row.
df_posts_ethereum.head(10)

In [None]:
# Same sanity check for the r/bitcoin posts table (first five rows).
df_posts_bitcoin.head()

In [None]:
# get users
#https://stackoverflow.com/questions/32314937/how-do-i-use-praw-and-python-to-retrieve-reddit-post-data-from-a-certain-user
#https://www.reddit.com/r/redditdev/comments/gtmnaq/grabbing_all_postscomments_from_a_specific_user/

'''
Instead of using reddit.redditor("username") you can also use reddit.subreddit("r/u_username") 
so, if you want to stream your posts and comments then you can use r/u_superior__peach
If you include r/u_ before any username, you can use reddit.subreddit("r/u_superior__peach")
The main advantage of using the above mentioned method is that you can use all of the attributes 
and methods that are defined in reddit.subreddit() class.
'''

In [None]:
# Account whose posts to pull. The handle is the example from the note in the
# cell above; replace it with the redditor of interest.
USERNAME = "superior__peach"
USER_POST_LIMIT = 10

user = reddit.redditor(USERNAME)

# PRAW >= 4 exposes a redditor's posts as listings under `.submissions`
# (`get_submitted()` was the PRAW 3 spelling and no longer exists).
user_submissions = list(user.submissions.new(limit=USER_POST_LIMIT))

# Self-text body of each of the user's posts (empty string for link posts).
self_texts = [link.selftext for link in user_submissions]

print(self_texts)

# Cleaning

In [None]:

def format_date(df):
    """Split the epoch-seconds ``post_datetime`` column into date and time columns.

    The frame is modified in place (callers rely on this) and also returned:
    ``post_date`` and ``post_time`` are added and ``post_datetime`` is dropped.

    Calling it again on a frame that was already formatted is a no-op, so
    re-running the cell does not raise.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``post_datetime`` column holding Unix timestamps in seconds.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.

    Raises
    ------
    KeyError
        If the frame has neither ``post_datetime`` nor the already-derived
        ``post_date`` / ``post_time`` columns.
    """
    if 'post_datetime' not in df.columns:
        # Already formatted by an earlier run of this cell: nothing left to do.
        if {'post_date', 'post_time'}.issubset(df.columns):
            return df
        raise KeyError('post_datetime')

    stamps = pd.to_datetime(df['post_datetime'], unit='s')
    df['post_date'] = stamps.dt.date
    df['post_time'] = stamps.dt.time
    df.drop(columns=['post_datetime'], inplace=True)

    return df

In [None]:
# format_date edits each frame in place, so a plain loop is enough; there is
# no need to build (and display) a throwaway list of the results.
for posts_df in (df_posts_bitcoin, df_posts_ethereum):
    format_date(posts_df)

In [None]:
# Re-inspect after format_date: post_datetime should now be replaced by
# post_date and post_time.
df_posts_ethereum.head()

In [None]:
# First rows of the r/bitcoin comments table (one row per comment).
df_comments_bitcoin.head()

In [None]:
# First rows of the r/ethereum comments table (one row per comment).
df_comments_ethereum.head()

In [None]:
#df_joined = df_post.merge(df_comments, left_on='post_id', right_on='comment_submission', how='right')

In [None]:
#1. join posts with posts

In [None]:
# 2 join comments w/comments

In [None]:
#3. join comments w/posts

# EDA

In [None]:
# Questions
#
# 1. How many posts are text vs. content?
# 2. Authorship
# 3. Title keywords
# 4. Text keywords
# 5. Title / text relationship
# 6. Price / timeseries
# 7. (to be decided)

# Export

In [None]:
# `df_post` was never defined anywhere above. Stack the two per-subreddit
# posts tables (tagged with their origin) so there is one frame to export.
# NOTE(review): confirm this is the intended content of source.csv.
df_post = pd.concat(
    [
        df_posts_bitcoin.assign(subreddit='bitcoin'),
        df_posts_ethereum.assign(subreddit='ethereum'),
    ],
    ignore_index=True,
)

# The ./data directory must already exist; to_csv does not create it.
df_post.to_csv('./data/source.csv', index = False)