In [1]:
import datetime

import praw
import pandas as pd

from keys import client_id, client_secret

### Collecting the posts for our topic

Initializing a Reddit Instance

In [2]:
reddit = praw.Reddit( client_id=client_id,
                      client_secret=client_secret,
                      user_agent='android:my_app:v1 (by /u/steven861998)')

Calling the API and building a dataframe from it

In [3]:
tesla = reddit.subreddit('Nio')

#Gathering the top 500 posts, with their title, url, body, upvotes, timestamp, and an index that serves as a key between the
#posts and the comments we collect later
posts = []
for index, post in enumerate(tesla.top(limit=500)):
    posts.append([post.title, "https://www.reddit.com" + post.permalink, post.selftext, post.score, post.created_utc, index])

#Converting into DataFrame
posts = pd.DataFrame(posts, columns=['Title', 'URL', 'Body', 'Upvotes', 'Time', 'Key'])
#Changing from utc time to standard timestamp
posts.Time = posts.Time.apply(lambda x: pd.to_datetime(datetime.datetime.fromtimestamp(x)))

#The first post is a sticky, so we can drop it
posts = posts.iloc[1:]

In [4]:
posts.head(3)

Unnamed: 0,Title,URL,Body,Upvotes,Time,Key
1,Elon Musk confirms secret Tesla ‘Carless Drive...,https://www.reddit.com/r/teslamotors/comments/...,,45611,2018-01-30 10:20:51,1
2,Tesla vs Bugatti,https://www.reddit.com/r/teslamotors/comments/...,,44274,2017-11-20 08:53:56,2
3,Autopilot saves my model 3 from an accident!,https://www.reddit.com/r/teslamotors/comments/...,,39401,2019-02-06 01:20:41,3


In [5]:
posts.shape

(499, 6)

### Collecting the comments for each of our posts

We want to get all the comments for the posts we collected

In [6]:
def collect_replies(key, url):
    ''' 
    params pandas series row: each row of the dataframe we built above in the form of a panda series
    Returns a pandas DataFrame, where each row represents an individual comment
    '''
    submission = reddit.submission(url=url)
    submission.comments.replace_more(limit=None)
    comment_queue = submission.comments[:] 

    table = {'Reply':[], 'Upvote':[], 'Time':[], 'Key':[]}

    while comment_queue:
        comment = comment_queue.pop(0)
        table['Reply'].append(comment.body)
        table['Time'].append(comment.created_utc)
        table['Upvote'].append(comment.score)
        table['Key'].append(key)
        comment_queue.extend(comment.replies)
    
    return pd.DataFrame.from_dict(table)

Now the function has been defined, we create our dataframe of comments. Using list comprehensions will speed things up slightly

In [7]:
#Let us first generate a list of tupules that contains the key and url for each row - the first value of the tupule is key,
#and the second value is url
keys = posts.Key.tolist()
urls = posts.URL.tolist()
tupules = list(zip(keys, urls))

#Now we generate our comments dataframe using list comprehensions!
comments = pd.concat([collect_replies(x[0], x[1]) for x in tupules])

In [8]:
#Again, converting the timestamp from utc to a standard format
comments.Time = comments.Time.apply(lambda x: pd.to_datetime(datetime.datetime.fromtimestamp(x)))

In [9]:
comments.head(3)

Unnamed: 0,Reply,Time,Upvote,Key
0,All we need now is humanless bodies,2018-01-30 11:19:01,956,1
1,For a moment I read careless driver. Would hav...,2018-01-30 10:35:31,3985,1
2,Elon's Twitter feed has been hilarious over th...,2018-01-30 11:21:32,1762,1


In [10]:
comments.shape

(228506, 4)

In [11]:
#comments.to_csv('Comments_.csv', index=False)

In [12]:
#posts.to_csv('Posts.csv', index=False)