# Pull Posts Associated with Tickers

This details how to create:

- combined_posts.csv

## 0. Import Packages

In [1]:
import pandas as pd
from pmaw import PushshiftAPI
import datetime as dt
import numpy as np

pd.set_option('display.max_colwidth', 50)

api = PushshiftAPI()

## 1. Get Posts for ALL Tickers

- Get any post that mentions the ticker name in the title

In [4]:
def get_posts(post_titles, start_epoch, end_epoch, path = './', limit = 100000, num_comments = '>100'):
    
    comment_counter = {}

    for title in post_titles:
        gen = api.search_submissions(title=title, # this is the keyword (ticker symbol) for which we're searching
                                 after=start_epoch, 
                                 before=end_epoch, # these are the unix-based timestamps to search between
                                 subreddit=['wallstreetbets'], # one or more subreddits to include in the search
                                 filter=['author','id', 'num_comments', 'score', 'subreddit', 'title', 'url'], # list of fields to return, created is returned by default
                                 num_comments=">100",
                                 limit = limit # limit on the number of records returned
                                ) 
        posts_df = pd.DataFrame(gen)

        comment_counter[title] = posts_df['num_comments'].sum()
        
        posts_df['search_term'] = title

        posts_df.to_csv('wsb_posts_{}.csv'.format(title), header=True, index=False, columns=list(posts_df.axes[1]))
        
    
    return comment_counter

In [5]:
# Iterate over titles
post_titles = ['AMZN', 'TSLA', 'META', 'FB', 'NOK', 'PLTR', 'AMC', 'BB', 'WISH', 'CLOV'] # looks for this in the title of the post

# DEFINE PARAMS
start_epoch = int(pd.to_datetime('2021-01-01').timestamp())
end_epoch = int(pd.to_datetime('2021-12-31').timestamp())
limit=100000
num_comments = '>100'

# CALL FUNCTION
get_posts(post_titles = post_titles, start_epoch = start_epoch, end_epoch = end_epoch)

INFO:pmaw.PushshiftAPIBase:99968 result(s) not found in Pushshift
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 100.00% - Requests: 9 - Batches: 1 - Items Remaining: 0
INFO:pmaw.PushshiftAPIBase:99788 result(s) not found in Pushshift
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 100.00% - Requests: 10 - Batches: 1 - Items Remaining: 0
INFO:pmaw.PushshiftAPIBase:99983 result(s) not found in Pushshift
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 100.00% - Requests: 9 - Batches: 1 - Items Remaining: 0
INFO:pmaw.PushshiftAPIBase:99990 result(s) not found in Pushshift
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 100.00% - Requests: 8 - Batches: 1 - Items Remaining: 0
INFO:pmaw.PushshiftAPIBase:99810 result(s) not found in Pushshift
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 100.00% - Requests: 11 - Batches: 2 - Items Remaining: 0
INFO:pmaw.PushshiftAPIBase:99748 result(s) not found in Pushshift
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 100.00% - Requests: 10 - Bat

{'AMZN': 12683,
 'TSLA': 80834,
 'META': 4311,
 'FB': 2326,
 'NOK': 69968,
 'PLTR': 77881,
 'AMC': 678012,
 'BB': 413880,
 'WISH': 114542,
 'CLOV': 252182}

## 2. Unions All Scraped Posts

In [16]:
extension = 'csv'
include = ['AMZN', 'TSLA', 'META', 'FB', 'NOK', 'PLTR', 'AMC', 'BB', 'WISH', 'CLOV']
all_filenames = ['wsb_posts_{}.csv'.format(i) for i in include]
all_filenames

['wsb_posts_AMZN.csv',
 'wsb_posts_TSLA.csv',
 'wsb_posts_META.csv',
 'wsb_posts_FB.csv',
 'wsb_posts_NOK.csv',
 'wsb_posts_PLTR.csv',
 'wsb_posts_AMC.csv',
 'wsb_posts_BB.csv',
 'wsb_posts_WISH.csv',
 'wsb_posts_CLOV.csv']

In [17]:
combined_posts = pd.concat([pd.read_csv('./posts/{}'.format(f)) for f in all_filenames])
len(combined_posts)

3178

In [18]:
def clean_posts(df, col_with_emoji):
    
    # CLEAN DATES
    df['date'] = pd.to_datetime(df['created_utc'], unit='s')
    df['day'] = df['date'].dt.day
    df['month'] = df['date'].dt.month
    
    return df

In [19]:
# Call function
clean_posts(df = combined_posts, col_with_emoji = 'title')
combined_posts.head()

Unnamed: 0,author,created_utc,id,num_comments,score,subreddit,title,url,search_term,date,day,month
0,Substantial-Voice-48,1617295053,mhz8ff,376,1,wallstreetbets,DOWN over 81K. There’s no hope for me left. Ex...,https://i.redd.it/ivh8a1o0blq61.jpg,AMZN,2021-04-01 16:37:33,1,4
1,onemanstrong,1612195281,la6hca,115,0,wallstreetbets,STRATEGY: Supplement GME losses while you HOLD...,https://www.reddit.com/r/wallstreetbets/commen...,AMZN,2021-02-01 16:01:21,1,2
2,AuspiciousToad,1612122655,l9kbhp,324,670,wallstreetbets,Update: I said if I could turn $3k into $100k ...,https://i.redd.it/s2maxrrt2qe61.jpg,AMZN,2021-01-31 19:50:55,31,1
3,AuspiciousToad,1611169234,l1fugm,256,25,wallstreetbets,2021 options fuckaround account. Goal is $3k t...,https://www.reddit.com/gallery/l1fugm,AMZN,2021-01-20 19:00:34,20,1
4,louis_lafaille,1613061864,lhooz1,323,1,wallstreetbets,BB/AMZN Speculation: Blackberry IVY for the au...,https://www.reddit.com/r/wallstreetbets/commen...,AMZN,2021-02-11 16:44:24,11,2


## 3. Pull a Random Sample of Posts for Labelling

In [20]:
# Drop duplicates in the id column
chosenpost_unique = combined_posts.drop_duplicates(subset=['id'], keep='first')
print('Length without duplicates: ', len(chosenpost_unique))

Length without duplicates:  2771


In [21]:
# Each post has more than 100 comments, so randomly sample 100 posts (10000 comments at least)

sampled = combined_posts.sample(n=250,replace=False, random_state=555)
sampled.head()

Unnamed: 0,author,created_utc,id,num_comments,score,subreddit,title,url,search_term,date,day,month
517,Shinjito_DBL,1612278369,laxm2e,116,18,wallstreetbets,"To my European mates, any brokers apps where y...",https://www.reddit.com/r/wallstreetbets/commen...,AMC,2021-02-02 15:06:09,2,2
390,Focust92,1611796065,l6jwwt,656,322,wallstreetbets,Are AMC and BB the plays for the smaller guys,https://www.reddit.com/r/wallstreetbets/commen...,BB,2021-01-28 01:07:45,28,1
342,ObjectiveCauseCaw,1623253057,nvysl5,106,0,wallstreetbets,"sold my car, house for $WISH $CLOV $CLNE.... w...",https://i.redd.it/3vseqdt7f9471.png,WISH,2021-06-09 15:37:37,9,6
279,VisualMod,1623405619,nxcb5h,12191,2,wallstreetbets,"Daily Popular Tickers Thread for June 11, 2021...",https://www.reddit.com/r/wallstreetbets/commen...,AMC,2021-06-11 10:00:19,11,6
1,Apprehensive_Peace69,1623646537,nzf0mj,203,341,wallstreetbets,Will BB and FB make a deal?,https://www.reddit.com/r/wallstreetbets/commen...,FB,2021-06-14 04:55:37,14,6


In [22]:
sampled['num_comments'].sum()

150160

### 3.1 Use 'sampled' To Pull Comments Data

In [23]:
post_ids_to_pull = sampled['id'].to_list()
post_ids_to_pull[0:10]

['laxm2e',
 'l6jwwt',
 'nvysl5',
 'nxcb5h',
 'nzf0mj',
 'l68yas',
 'nv3ib3',
 'oxyrst',
 'q05y3e',
 'mg1q9o']

In [24]:
post_names = sampled['title'].to_list()
post_names[0:10]

['To my European mates, any brokers apps where you can still buy GME and AMC? (Revolut went down)',
 'Are AMC and BB the plays for the smaller guys',
 'sold my car, house for $WISH $CLOV $CLNE.... woops',
 'Daily Popular Tickers Thread for June 11, 2021 - AMC | BB | CLOV',
 'Will BB and FB make a deal?',
 'SELL AMC, SELL BB, SELL NOK, SELL EXPR. BUY BUY BUY GME.',
 'We Need Separate Daily Megathreads For GME / AMC / BB / CLOV',
 'PLTR YOLO my life savings',
 'Shots fired! Cathie calling out GM after TSLA deliveries!',
 'I’m selling all my $AMC stonk tomorrow to put into more $GME that would make 100% of my portfolio! After all the confirmation bias over the past month and a half I feel like I’m making the right decision but I could use some more to make me feel better FYI 😝<apes> <apes> <rocket> <rocket> ']

In [25]:
def get_comments(post_ids_to_pull, post_names, batch_start, batch_end, path = './comments/wsb_comments', emoji_dict = emoji_dict, limit = 100000, score = '>10'):
    
    # Get subset of posts to pull
    post_batch = post_ids_to_pull[batch_start:batch_end]
    post_names_batch = post_names[batch_start:batch_end]
    
    for i, post_id in enumerate(post_batch): # EDIT THIS pull in batches
        gen = api.search_comments(link_id = post_id, # this is the post id for which we want to pull comments from
                                 filter = ['author','body','id','permalink', 'score', 'subreddit', 'link_id'], # list of fields to return, created is returned by default
                                 score = score,
                                 limit = limit # limit on the number of records returned
                                )
        comments_df = pd.DataFrame(gen)

        if not 'created_utc' in comments_df.columns: #if no comments are found and the df is empty, move on to the next post
            continue

        # Clean up
        comments_df['search_term'] = post_id 
        comments_df['post_name'] = post_names_batch[i] # EDIT THIS included for easy tagging

        comments_df.to_csv('{}_{}.csv'.format(path, post_id), header=True, index=False, columns=list(comments_df.axes[1]))

In [26]:
# Define index locations of batch start and end
# Note that a batch of 50 posts takes about 1 hour to pull

batch_start = 100
batch_end = 101

# csvs will be created in your specified directory
get_comments(post_ids_to_pull = post_ids_to_pull, post_names = post_names, batch_start = 100, batch_end = 101)

INFO:pmaw.PushshiftAPIBase:99941 result(s) not found in Pushshift
INFO:pmaw.PushshiftAPIBase:Total:: Success Rate: 100.00% - Requests: 4 - Batches: 1 - Items Remaining: 0


### 3.2 Union All Comments

In [49]:
# df = pd.read_csv('./comments/wsb_comments_initialbatch.csv')
# df.drop(df.columns[0],axis = 1, inplace = True)
# df.to_csv('./comments/wsb_comments_initialbatch.csv', index = False)

In [56]:
from os import listdir
from os.path import isfile, join

# get filenames of everything successfully scraped
mypath = './comments/'
all_comments = [f for f in listdir(mypath) if isfile(join(mypath, f))]
all_comments.remove('.DS_Store') # remove this line if the file is not in your path. seems to be a mac thing
all_comments[0:10]

['wsb_comments_ny69ns.csv', 'wsb_comments_initialbatch.csv']

In [57]:
combined_comments = pd.concat([pd.read_csv('./comments/{}'.format(f)) for f in all_comments])
len(combined_comments)

7478

In [58]:
combined_comments.tail()

Unnamed: 0,author,body,created_utc,id,link_id,permalink,score,subreddit,search_term,post_name
7414,Abject_Resolution,Thank you for making me feel a bit better today.,1620675652,gxn8vxv,t3_n9dymd,/r/wallstreetbets/comments/n9dymd/i_love_pltr/...,104,wallstreetbets,n9dymd,I love PLTR
7415,ThereIsNoSp00nz,It does not love you back🤣,1620675497,gxn8in8,t3_n9dymd,/r/wallstreetbets/comments/n9dymd/i_love_pltr/...,75,wallstreetbets,n9dymd,I love PLTR
7416,Krrtis,will do,1620675497,gxn8imw,t3_n9dymd,/r/wallstreetbets/comments/n9dymd/i_love_pltr/...,11,wallstreetbets,n9dymd,I love PLTR
7417,CoronaEraXpertTrader,"if you keep averaging down, it will eventually...",1620675485,gxn8hlk,t3_n9dymd,/r/wallstreetbets/comments/n9dymd/i_love_pltr/...,69,wallstreetbets,n9dymd,I love PLTR
7418,CoronaEraXpertTrader,buy more,1620675455,gxn8f3k,t3_n9dymd,/r/wallstreetbets/comments/n9dymd/i_love_pltr/...,15,wallstreetbets,n9dymd,I love PLTR


In [89]:
#combined_comments.to_csv('comments_initial_batch_Rachel.csv', index = False)

In [59]:
combined_comments.dropna(inplace = True)
len(combined_comments)

7478

In [61]:
# Call function (do this all together)
clean_posts(df = combined_comments, col_with_emoji = 'body')
combined_comments.tail()

Unnamed: 0,author,body,created_utc,id,link_id,permalink,score,subreddit,search_term,post_name,date,day,month
7414,Abject_Resolution,Thank you for making me feel a bit better today.,1620675652,gxn8vxv,t3_n9dymd,/r/wallstreetbets/comments/n9dymd/i_love_pltr/...,104,wallstreetbets,n9dymd,I love PLTR,2021-05-10 19:40:52,10,5
7415,ThereIsNoSp00nz,It does not love you back🤣,1620675497,gxn8in8,t3_n9dymd,/r/wallstreetbets/comments/n9dymd/i_love_pltr/...,75,wallstreetbets,n9dymd,I love PLTR,2021-05-10 19:38:17,10,5
7416,Krrtis,will do,1620675497,gxn8imw,t3_n9dymd,/r/wallstreetbets/comments/n9dymd/i_love_pltr/...,11,wallstreetbets,n9dymd,I love PLTR,2021-05-10 19:38:17,10,5
7417,CoronaEraXpertTrader,"if you keep averaging down, it will eventually...",1620675485,gxn8hlk,t3_n9dymd,/r/wallstreetbets/comments/n9dymd/i_love_pltr/...,69,wallstreetbets,n9dymd,I love PLTR,2021-05-10 19:38:05,10,5
7418,CoronaEraXpertTrader,buy more,1620675455,gxn8f3k,t3_n9dymd,/r/wallstreetbets/comments/n9dymd/i_love_pltr/...,15,wallstreetbets,n9dymd,I love PLTR,2021-05-10 19:37:35,10,5


### 3.3 Split Comments For Tagging

Split randomly sampled comments to 5 parts for each group member to tag. 

In [62]:
# Add a rough total word count
combined_comments['word_count'] = combined_comments['body'].str.split().apply(len)

In [63]:
# Filter by word count > 5
filtered_comments = combined_comments[combined_comments['word_count'] > 5]
len(filtered_comments)

5623

In [73]:
# Split comments into 5 files
batch_size = 1100

for i in range(5):
    start = i*batch_size
    stop = (i+1)*batch_size
    filtered_comments[start:stop].to_csv('./tagging/comments_batch_{}.csv'.format(i))

In [30]:
grouped = combined_csv.groupby(['search_term','month','day'], as_index=False)['num_comments'].sum()
grouped

Unnamed: 0,search_term,month,day,num_comments
0,AMC,1,1,116
1,AMC,1,18,627
2,AMC,1,20,108
3,AMC,1,25,104
4,AMC,1,26,768
...,...,...,...,...
899,WISH,11,25,417
900,WISH,11,27,280
901,WISH,11,29,519
902,WISH,12,2,218


In [33]:
std_dev = grouped.groupby(['search_term', 'month'], as_index = False).agg({"num_comments": [np.mean, np.std]})
std_dev.to_csv('std_dev.csv')

In [35]:
post_count = combined_csv.groupby(['search_term'], as_index=False).count()
post_count

Unnamed: 0,search_term,author,created_utc,id,num_comments,score,subreddit,title,url,date,day,month
0,AMC,1097,1097,1097,1097,1097,1097,1097,1097,1097,1097,1097
1,AMZN,32,32,32,32,32,32,32,32,32,32,32
2,BB,642,642,642,642,642,642,642,642,642,642,642
3,BBBY,22,22,22,22,22,22,22,22,22,22,22
4,CLOV,357,357,357,357,357,357,357,357,357,357,357
5,FB,10,10,10,10,10,10,10,10,10,10,10
6,META,17,17,17,17,17,17,17,17,17,17,17
7,NOK,190,190,190,190,190,190,190,190,190,190,190
8,PLTR,252,252,252,252,252,252,252,252,252,252,252
9,TSLA,212,212,212,212,212,212,212,212,212,212,212


In [113]:
comment_sample={'author': 'benjjjix', 'body': 'To infinity and beyond boys🚀🚀🚀🙌🙌💎💎', 'created_utc': 1611866620, 'id': 'gl566x9', 'link_id': 't3_l785l4', 'permalink': '/r/wallstreetbets/comments/l785l4/this_is_war_gme_bb_nok/gl566x9/', 'score': 8, 'subreddit': 'wallstreetbets', 'created': 1611837820.0}

In [114]:
comment_sample

{'author': 'benjjjix',
 'body': 'To infinity and beyond boys🚀🚀🚀🙌🙌💎💎',
 'created_utc': 1611866620,
 'id': 'gl566x9',
 'link_id': 't3_l785l4',
 'permalink': '/r/wallstreetbets/comments/l785l4/this_is_war_gme_bb_nok/gl566x9/',
 'score': 8,
 'subreddit': 'wallstreetbets',
 'created': 1611837820.0}

In [115]:
post_sample = combined_csv.iloc[10].to_dict()
post_sample

{'author': 'FluxTradesStocks',
 'created_utc': 1626725207,
 'id': 'onm2cl',
 'num_comments': 232,
 'score': 1,
 'subreddit': 'wallstreetbets',
 'title': 'Why I bought $AMZN Calls for the Space Flight Tomorrow...',
 'url': 'https://www.reddit.com/r/wallstreetbets/comments/onm2cl/why_i_bought_amzn_calls_for_the_space_flight/',
 'date': '2021-07-19 20:06:47',
 'search_term': 'AMZN'}