In [1]:
import json
import os
from tqdm import tqdm
import pandas as pd
import re
import numpy as np
import datetime
import dateutil.parser as dp
from dateutil.relativedelta import relativedelta

In [2]:
from nltk.corpus import stopwords
from nltk import FreqDist
import collections
import string

def clean_post(post_text):
    translator = str.maketrans('', '', string.punctuation) # for removing punctuation
    post_text = post_text.translate(translator)
    token_text = [t for t in post_text.replace('\n',' ').split()]
    
    sr = stopwords.words('english')
    sr.extend(['gay', 'bear', 'girlfriend', 'fuckers', 'bull'])
    cleanTokens = [i for i in token_text if i not in sr]
    
    return cleanTokens

nasdaq = pd.read_csv('tickers/nasdaq.csv')
nyse = pd.read_csv('tickers/nyse.csv')

def check_tickers(tokens):
    ticker_counter = collections.Counter()
    for token in tokens:
        if any(nasdaq['Symbol'].isin([token])): 
            ticker_counter[token] += 1
            print(token)
        if any(nyse['Symbol'].isin([token])): 
            ticker_counter[token] += 1
            print(token)
    return ticker_counter

In [3]:
def ticker_finder(row):
    if row['regexed_combined'] == 0:
        return 0
    if type(row['selftext']) == float:
        return 0 
    if row['regexed_combined'][0][0] != 'NONE':
        return 1
    lookup_string = row['title'] + ' ' + row['selftext']
    tickers = check_tickers(clean_post(lookup_string))
    the_goods = tickers.most_common(3)
    return the_goods if the_goods else -1

In [4]:
def ticker_extender(pos_list):
    prev_ticker = "NONE"
    for group in pos_list:
        if not group[0]:
            group[0] = prev_ticker
        else:
            prev_ticker = group[0]
    return pos_list

In [5]:
def regex_pos(post_text):
    if pd.isna(post_text): return 0
    x = []
    p1 = re.compile('((?:\$?[A-Z]{1,4})(?:[a-z]{1,3})?)?\s+\$?(?:(\d+(?:\.\d\d?)?)\s*([CcPp])\w{0,4}\s+(\d\d?\/\d\d?(?:\/\d{2,4})?))')
    p2 = re.compile('((?:\$?[A-Z]{1,4})(?:[a-z]{1,3})?)?\s+(\d\d?\/\d\d?(?:\/\d{2,4})?)\s+\$?(\d+(?:\.\d\d?)?)\s*([CcPp])\w{0,4}')
    p3 = re.compile('(\d\d?\/\d\d?(?:\/\d{2,4})?)\s+((?:\$?[A-Z]{1,4})(?:[a-z]{1,3})?)\s+\$?(\d+(?:\.\d\d?)?)\s*([CcPp])\w{0,4}')
    p4 = re.compile('((?:\$?[A-Z]{1,4})(?:[a-z]{1,3})?)?\s+(\d{1,2}\s?[A-Z]{1,3})\s+\$?(\d+(?:\.\d\d?)?)\s*([CcPp])\w{0,4}')
    p5 = re.compile('(\d{1,2}\s?[A-Z]{1,3})\s+((?:\$?[A-Z]{1,4})(?:[a-z]{1,3})?)?\s+\$?(\d+(?:\.\d\d?)?)\s*([CcPp])\w{0,4}')
    p6 = re.compile('([A-Z]{1,3}\s?\d{1,2})\s+((?:\$?[A-Z]{1,4})(?:[a-z]{1,3})?)?\s+\$?(\d+(?:\.\d\d?)?)\s*([CcPp])\w{0,4}')
    p7 = re.compile('((?:\$?[A-Z]{1,4})(?:[a-z]{1,3})?)?\s+([A-Z]{1,3}\s?\d{1,2})\s+\$?(\d+(?:\.\d\d?)?)\s*([CcPp])\w{0,4}')
    
    # original ticker: (\$?[A-Z]{1,4})
    # alt ticker: ((?:\$?[A-Z]{1,4})(?:[a-z]{1,3})?)
    
    x1 = re.findall(p1, post_text)
    x2 = re.findall(p2, post_text)
    x3 = re.findall(p3, post_text)
    x4 = re.findall(p4, post_text) # diving into alpha-numeric dates like 20 NOV
    x5 = re.findall(p5, post_text) #alpha 20 nov, date first
    x6 = re.findall(p6, post_text) #alpha nov 20, date first
    x7 = re.findall(p7, post_text) #alpha nov 20, ticker first
    
    #dictating ticker-strike-c/p-date as standard. x1 stays, the rest must be edited 
    if x1:
        x1_e = []
        for match in x1:
            ordered = [match[0].upper(), match[1], match[2], match[3]]
            x1_e.append(ordered)
        x.extend(x1_e)
    if x2: #ticker-date-strike-c/p
        x2_e = []
        for match in x2:
            ordered = [match[0].upper(),match[2],match[3],match[1]]
            x2_e.append(ordered)
        x.extend(x2_e)
    if x3: #date-ticker-strike-c/p
        x3_e = []
        for match in x3:
            ordered = [match[1].upper(),match[2],match[3],match[0]]
            x3_e.append(ordered)
        x.extend(x3_e)
    if x4: # ticker-date(alphanumeric)-stike-c/p
        x4_e = []
        for match in x4:
            ordered = [match[0].upper(),match[2],match[3],match[1]]
            x4_e.append(ordered)
        x.extend(x4_e)
    if x5: #date(alpha)-ticker-strike-cp
        x5_e = []
        for match in x5:
            ordered = [match[1].upper(),match[2],match[3],match[0]]
            x5_e.append(ordered)
        x.extend(x5_e)
    if x6: #same as above, diff alpha
        x6_e = []
        for match in x6:
            ordered = [match[1].upper(),match[2],match[3],match[0]]
            x6_e.append(ordered)
        x.extend(x6_e)
    if x7:
        x7_e = []
        for match in x7:
            ordered = [match[0].upper(),match[2],match[3],match[1]]
            x7_e.append(ordered)
        x.extend(x7_e)

    if x:
        x = ticker_extender(x) # see above
        return x 
    else:
        return 0

In [6]:
# adds -1 to unproccessed dates, these are worth dropping
def date_proccessor(pos_list):
    if pos_list == 0: return pos_list
    new_list = pos_list
    for position in new_list:
        date = position[-1]
        try:
            p_date = dp.parse(date)
        except:
            try:
                date_e = date.split('/')
                if len(date_e) == 2:
                    date_e = '/'.join(i for i in date_e[::-1])
                elif len(date_e) == 3:
                    tmp = date_e[0]
                    date_e[0] = date_e[1]
                    date_e[1] = tmp
                    date_e = '/'.join(i for i in date_e[::-1])
                p_date = dp.parse(date_e)                
            except:
                print("date parse error, probably worth dropping")
                print(date)
                p_date = -1
        position[-1] = p_date
    return new_list
# hanndled the -1 entries for dates. if no positions left, drops the entire entry
def date_proccessor_corrector(pos_list):
    if pos_list == 0: return pos_list
    new_list = []
    for position in pos_list:
        if position[-1] != -1:
            new_list.append(position)
    if new_list:
        return new_list
    else:
        return 0

In [7]:
# get the created year, compare to expiry
# make adjustments as needed
# return
#initial comparison made to current because all posts with no year originally have current year added to them
#   as part of pre-processing
def expiry_year_corrector(row):
    post_date = int(row['created_utc'])
    post_date = datetime.datetime.fromtimestamp(post_date)
    combined = row['regexed_combined']
    if combined == 0: return combined
    current_year = datetime.datetime.now().year
    for position in combined:
        expiry = position[3]
        expiry_year = expiry.year
        expiry_month = expiry.month
        post_year = post_date.year
        post_month = post_date.month
        
        if expiry_year > current_year: 
            return combined # no changes needed
        
        if expiry_month >= post_month: # expiry month is greater than month, is POST year
            expiry = datetime.datetime(post_year,expiry.month,expiry.day)
        elif expiry_month < post_month: # expiry month is less than post month, is POST year + 1
            expiry = datetime.datetime(post_year+1,expiry.month,expiry.day)
        position[3] = expiry
    print(combined)
    return combined

In [8]:
def add_missing_tickers(row):
    # rows with no regex
    if row['regexed_combined'] == 0: return row['regexed_combined']
    # if there are no missing tickers
    if row['regexed_combined'][0][0] != "NONE": return row['regexed_combined']
    # if there are missing tickers
    for position in row['regexed_combined']:
        if position[0] == "NONE" and not (type(row['ticker_locator']) == int):
            position[0] = row['ticker_locator'][0][0] # adds in ticker of most common ticker mentioned - best guess
    return row['regexed_combined']

In [9]:
## maybe create a number of posts function for a new col
## need a expiry function to get expiry out of position
## stock at open and close (where close applicable)

In [10]:
good_post_csv = pd.read_csv('all_good_posts.csv')
#all_post_csv = pd.read_csv('all_posts.csv')

In [11]:
good_post_csv['all text'] = good_post_csv['title'] + good_post_csv['selftext']

In [12]:
good_post_csv['regexed_combined'] = good_post_csv['all text'].apply(regex_pos)
good_post_csv['regexed_body'] = good_post_csv['selftext'].apply(regex_pos)
good_post_csv['regexed_title'] = good_post_csv['title'].apply(regex_pos)

In [13]:
#good_post_csv.to_csv('output1.csv')

In [14]:
good_post_csv['regexed_combined'] = good_post_csv['regexed_combined'].apply(date_proccessor)

date parse error, probably worth dropping
SPY 25
date parse error, probably worth dropping
SPY 15
date parse error, probably worth dropping
Q3
date parse error, probably worth dropping
20 SPY
date parse error, probably worth dropping
35 SPY
date parse error, probably worth dropping
85P
date parse error, probably worth dropping
50P
date parse error, probably worth dropping
LEM20
date parse error, probably worth dropping
GFK20
date parse error, probably worth dropping
HEK20
date parse error, probably worth dropping
DLK20
date parse error, probably worth dropping
P 1
date parse error, probably worth dropping
1 OTM
date parse error, probably worth dropping
10 USD
date parse error, probably worth dropping
5
SLV
date parse error, probably worth dropping
10
UPS
date parse error, probably worth dropping
20 IAU
date parse error, probably worth dropping
28 SLV
date parse error, probably worth dropping
0 DTE
date parse error, probably worth dropping
0 DTE
date parse error, probably worth dropping

In [15]:
good_post_csv['regexed_combined'] = good_post_csv['regexed_combined'].apply(date_proccessor_corrector)

In [16]:
good_post_csv['regexed_combined'] = good_post_csv.apply(expiry_year_corrector, axis=1)

[['$T', '37.5', 'p', datetime.datetime(2019, 10, 4, 0, 0)]]
[['NONE', '200', 'P', datetime.datetime(2020, 5, 15, 0, 0)]]
[['HLT', '40', 'P', datetime.datetime(2020, 5, 1, 0, 0)]]
[['HDB', '35', 'p', datetime.datetime(2020, 4, 17, 0, 0)], ['EWP', '19', 'p', datetime.datetime(2020, 4, 17, 0, 0)], ['BLK', '295', 'p', datetime.datetime(2020, 4, 17, 0, 0)], ['EEM', '19.73', 'p', datetime.datetime(2020, 9, 18, 0, 0)]]
[['VTI', '70', 'p', datetime.datetime(2020, 4, 17, 0, 0)]]
[['SPY', '200', 'p', datetime.datetime(2020, 4, 13, 0, 0)], ['SPY', '205', 'p', datetime.datetime(2020, 4, 15, 0, 0)], ['SPY', '200', 'p', datetime.datetime(2020, 5, 1, 0, 0)]]
[['V', '167.5', 'c', datetime.datetime(2020, 3, 27, 0, 0)]]
[['NONE', '9', 'c', datetime.datetime(2020, 5, 15, 0, 0)], ['NONE', '8.5', 'c', datetime.datetime(2020, 4, 17, 0, 0)]]
[['$SPCE', '13.5', 'p', datetime.datetime(2020, 3, 27, 0, 0)]]
[['CCL', '13', 'p', datetime.datetime(2020, 3, 27, 0, 0)], ['SEAS', '9', 'p', datetime.datetime(2020, 3, 2

[['NONE', '1800', 'c', datetime.datetime(2020, 7, 31, 0, 0)], ['NONE', '2000', 'c', datetime.datetime(2020, 8, 21, 0, 0)], ['TSLA', '2000', 'c', datetime.datetime(2020, 7, 31, 0, 0)], ['TSLA', '1800', 'c', datetime.datetime(2020, 7, 31, 0, 0)], ['TSLA', '2000', 'c', datetime.datetime(2020, 7, 31, 0, 0)], ['TSLA', '2500', 'c', datetime.datetime(2020, 8, 21, 0, 0)]]
[['TSLA', '2000', 'c', datetime.datetime(2020, 8, 21, 0, 0)]]
[['CO', '155', 'c', datetime.datetime(2020, 8, 14, 0, 0)], ['$PEP', '155', 'c', datetime.datetime(2020, 8, 14, 0, 0)]]
[['$NFLX', '670', 'C', datetime.datetime(2021, 1, 20, 0, 0)]]
[['BAY', '60', 'c', datetime.datetime(2020, 7, 17, 0, 0)], ['AMZN', '3400', 'c', datetime.datetime(2020, 8, 31, 0, 0)], ['SPY', '321', 'c', datetime.datetime(2020, 7, 17, 0, 0)], ['AAPL', '390', 'c', datetime.datetime(2020, 7, 17, 0, 0)]]
[['AMD', '65', 'C', datetime.datetime(2020, 7, 31, 0, 0)]]
[['NONE', '67.5', 'p', datetime.datetime(2020, 7, 17, 0, 0)]]
[['TSLA', '2000', 'C', datetim

[['TSLA', '1880', 'c', datetime.datetime(2020, 2, 27, 0, 0)]]
[['HP', '23', 'c', datetime.datetime(2020, 3, 20, 0, 0)]]
[['NONE', '95', 'c', datetime.datetime(2020, 3, 20, 0, 0)]]
[['NONE', '17', 'P', datetime.datetime(2020, 4, 17, 0, 0)], ['NONE', '17', 'P', datetime.datetime(2020, 3, 20, 0, 0)]]
[['NONE', '23', 'c', datetime.datetime(2020, 6, 19, 0, 0)]]
[['TQQQ', '110', 'C', datetime.datetime(2020, 2, 28, 0, 0)]]
[['NONE', '180', 'c', datetime.datetime(2020, 2, 28, 0, 0)]]
[['NONE', '35', 'c', datetime.datetime(2020, 3, 6, 0, 0)]]
[['NONE', '135', 'C', datetime.datetime(2019, 11, 8, 0, 0)]]
[['NONE', '5', 'c', datetime.datetime(2020, 2, 28, 0, 0)]]
[['NONE', '37', 'C', datetime.datetime(2020, 3, 20, 0, 0)]]
[['NONE', '82.50', 'c', datetime.datetime(2020, 10, 16, 0, 0)]]
[['SPY', '326', 'c', datetime.datetime(2020, 2, 26, 0, 0)], ['SPY', '327', 'c', datetime.datetime(2020, 2, 28, 0, 0)]]
[['NONE', '60', 'c', datetime.datetime(2020, 3, 20, 0, 0)]]
[['NONE', '61', 'c', datetime.datetim

[['HABT', '13', 'p', datetime.datetime(2020, 3, 20, 0, 0)]]
[['EWW', '33', 'p', datetime.datetime(2020, 4, 17, 0, 0)]]
[['DIS', '75', 'p', datetime.datetime(2020, 4, 17, 0, 0)]]
[['NONE', '25', 'p', datetime.datetime(2020, 5, 15, 0, 0)]]
[['SPY', '230', 'P', datetime.datetime(2020, 3, 20, 0, 0)]]
[['NONE', '100', 'P', datetime.datetime(2019, 11, 22, 0, 0)]]
[['TSG', '15', 'p', datetime.datetime(2020, 5, 15, 0, 0)]]
[['$DIS', '80', 'p', datetime.datetime(2020, 4, 9, 0, 0)]]
[['NONE', '80', 'c', datetime.datetime(2020, 4, 17, 0, 0)]]
[['$EWJ', '44', 'p', datetime.datetime(2020, 4, 17, 0, 0)]]
[['NONE', '10', 'p', datetime.datetime(2020, 11, 18, 0, 0)]]
[['$DEAC', '10', 'P', datetime.datetime(2020, 3, 20, 0, 0)], ['$DEAC', '7.5', 'P', datetime.datetime(2020, 3, 20, 0, 0)], ['$DEAC', '10', 'P', datetime.datetime(2020, 4, 17, 0, 0)], ['$DEAC', '7.5', 'P', datetime.datetime(2020, 4, 17, 0, 0)]]
[['SPY', '284', 'c', datetime.datetime(2020, 3, 16, 0, 0)]]
[['$GEO', '9', 'p', datetime.datetime(

[['SPY', '200', 'p', datetime.datetime(2020, 5, 1, 0, 0)]]
[['MY', '200', 'P', datetime.datetime(2020, 4, 17, 0, 0)]]
[['NONE', '18', 'p', datetime.datetime(2020, 3, 14, 0, 0)]]
[['NONE', '220', 'p', datetime.datetime(2020, 5, 15, 0, 0)], ['NONE', '210', 'P', datetime.datetime(2020, 5, 15, 0, 0)]]
[['SPY', '170', 'p', datetime.datetime(2020, 5, 15, 0, 0)]]
[['INDA', '20', 'p', datetime.datetime(2020, 4, 17, 0, 0)], ['DIS', '90', 'p', datetime.datetime(2020, 4, 17, 0, 0)]]
[['INDA', '20', 'p', datetime.datetime(2020, 4, 17, 0, 0)], ['DIS', '90', 'p', datetime.datetime(2020, 4, 17, 0, 0)]]
[['MSFT', '152.50', 'C', datetime.datetime(2019, 11, 22, 0, 0)]]
[['NONE', '15', 'p', datetime.datetime(2019, 11, 22, 0, 0)]]
[['TGT', '100', 'P', datetime.datetime(2019, 12, 20, 0, 0)], ['WBA', '56', 'C', datetime.datetime(2019, 11, 15, 0, 0)], ['QQQ', '198', 'P', datetime.datetime(2019, 12, 20, 0, 0)], ['PCG', '6', 'P', datetime.datetime(2019, 12, 20, 0, 0)]]
[['DIS', '95', 'p', datetime.datetime(202

[['SPY', '270', 'c', datetime.datetime(2020, 3, 31, 0, 0)], ['SPY', '185', 'p', datetime.datetime(2020, 4, 13, 0, 0)], ['SPY', '200', 'p', datetime.datetime(2020, 4, 20, 0, 0)], ['SPY', '400', 'c', datetime.datetime(2020, 5, 15, 0, 0)], ['SPY', '1000', 'c', datetime.datetime(2020, 12, 31, 0, 0)], ['SPY', '420', 'c', datetime.datetime(2020, 6, 9, 0, 0)], ['AAL', '1', 'p', datetime.datetime(2020, 4, 3, 0, 0)], ['MSFT', '75', 'c', datetime.datetime(2020, 4, 24, 0, 0)], ['SPY', '274', 'c', datetime.datetime(2020, 4, 3, 0, 0)], ['SPY', '180', 'p', datetime.datetime(2020, 5, 1, 0, 0)], ['SPY', '220', 'p', datetime.datetime(2020, 5, 1, 0, 0)], ['NFLX', '420', 'c', datetime.datetime(2020, 4, 20, 0, 0)], ['AMZN', '1640', 'p', datetime.datetime(2020, 4, 3, 0, 0)], ['SPY', '280', 'c', datetime.datetime(2021, 1, 15, 0, 0)], ['SPY', '264', 'p', datetime.datetime(2020, 3, 15, 0, 0)], ['SPY', '200', 'p', datetime.datetime(2020, 4, 24, 0, 0)], ['SPY', '260', 'c', datetime.datetime(2020, 3, 31, 0, 0)],

[['SPY', '275', 'P', datetime.datetime(2020, 5, 1, 0, 0)], ['SPY', '260', 'P', datetime.datetime(2020, 5, 1, 0, 0)]]
[['V', '190', 'C', datetime.datetime(2020, 6, 19, 0, 0)]]
[['PG', '127', 'c', datetime.datetime(2020, 4, 24, 0, 0)], ['EPD', '13.5', 'c', datetime.datetime(2020, 4, 24, 0, 0)]]
[['OTM', '10', 'c', datetime.datetime(2020, 10, 16, 0, 0)]]
[['SPY', '220', 'p', datetime.datetime(2020, 6, 19, 0, 0)]]
[['SPY', '69', 'P', datetime.datetime(2020, 4, 24, 0, 0)]]
[['$CSOD', '32.50', 'c', datetime.datetime(2020, 5, 15, 0, 0)]]
[['NONE', '300', 'c', datetime.datetime(2020, 5, 15, 0, 0)]]
[['NONE', '13', 'C', datetime.datetime(2020, 5, 15, 0, 0)]]
[['DHT', '9', 'c', datetime.datetime(2020, 10, 16, 0, 0)], ['TNK', '22.5', 'c', datetime.datetime(2020, 6, 19, 0, 0)], ['TNK', '22.5', 'c', datetime.datetime(2020, 10, 16, 0, 0)]]
[['SPY', '200', 'p', datetime.datetime(2020, 5, 1, 0, 0)]]
[['NONE', '55', 'c', datetime.datetime(2020, 7, 17, 0, 0)]]
[['SPY', '270', 'p', datetime.datetime(2020

[['NONE', '20', 'C', datetime.datetime(2020, 5, 15, 0, 0)]]
[['NONE', '4', 'c', datetime.datetime(2020, 6, 19, 0, 0)]]
[['NONE', '77.5', 'C', datetime.datetime(2020, 5, 15, 0, 0)]]
[['TSLA', '610', 'p', datetime.datetime(2020, 5, 8, 0, 0)]]
[['AMD', '52', 'C', datetime.datetime(2020, 5, 15, 0, 0)], ['APPL', '260', 'P', datetime.datetime(2020, 6, 19, 0, 0)]]
[['AMD', '52', 'C', datetime.datetime(2020, 5, 15, 0, 0)], ['APPL', '260', 'P', datetime.datetime(2020, 6, 19, 0, 0)]]
[['AMD', '52', 'C', datetime.datetime(2020, 5, 15, 0, 0)], ['APPL', '260', 'P', datetime.datetime(2020, 6, 19, 0, 0)]]
[['AMD', '52', 'C', datetime.datetime(2020, 5, 15, 0, 0)], ['APPL', '260', 'P', datetime.datetime(2020, 6, 19, 0, 0)]]
[['AMD', '52', 'C', datetime.datetime(2020, 5, 15, 0, 0)], ['APPL', '260', 'P', datetime.datetime(2020, 6, 19, 0, 0)]]
[['AMD', '52', 'C', datetime.datetime(2020, 5, 15, 0, 0)], ['APPL', '260', 'P', datetime.datetime(2020, 6, 19, 0, 0)]]
[['AMD', '52', 'C', datetime.datetime(2020, 5

[['NONE', '33', 'c', datetime.datetime(2020, 6, 19, 0, 0)]]
[['SPY', '225', 'P', datetime.datetime(2020, 12, 18, 0, 0)]]
[['SWBI', '18', 'c', datetime.datetime(2020, 6, 19, 0, 0)]]
[['XRX', '25', 'C', datetime.datetime(2020, 10, 16, 0, 0)]]
[['XRX', '25', 'C', datetime.datetime(2020, 10, 16, 0, 0)]]
[['$PLAY', '12.5', 'P', datetime.datetime(2020, 7, 17, 0, 0)]]
[['CHK', '10', 'P', datetime.datetime(2020, 7, 17, 0, 0)]]
[['APHA', '6', 'C', datetime.datetime(2020, 7, 10, 0, 0)]]
[['MGM', '21', 'C', datetime.datetime(2020, 7, 2, 0, 0)]]
[['NONE', '3', 'c', datetime.datetime(2020, 7, 17, 0, 0)]]
[['ADBE', '410', 'c', datetime.datetime(2020, 6, 26, 0, 0)], ['ADBE', '410', 'c', datetime.datetime(2020, 6, 26, 0, 0)]]
[['NONE', '300', 'C', datetime.datetime(2020, 6, 12, 0, 0)]]
[['NONE', '250', 'c', datetime.datetime(2020, 6, 19, 0, 0)]]
[['PCG', '14', 'c', datetime.datetime(2020, 6, 26, 0, 0)], ['PCG', '16', 'c', datetime.datetime(2020, 9, 18, 0, 0)]]
[['$UNP', '200', 'c', datetime.datetime(2

[['XOM', '45', 'C', datetime.datetime(2020, 9, 18, 0, 0)], ['XOM', '50', 'C', datetime.datetime(2020, 9, 18, 0, 0)], ['CVX', '100', 'C', datetime.datetime(2020, 9, 19, 0, 0)]]
[['NONE', '225', 'c', datetime.datetime(2020, 7, 17, 0, 0)]]
[['SPY', '330', 'C', datetime.datetime(2020, 7, 17, 0, 0)]]
[['NONE', '40', 'p', datetime.datetime(2020, 8, 19, 0, 0)]]
[['SPY', '330', 'c', datetime.datetime(2020, 7, 10, 0, 0)]]
[['NONE', '27', 'p', datetime.datetime(2020, 10, 18, 0, 0)]]
[['NEM', '65', 'c', datetime.datetime(2020, 7, 17, 0, 0)]]
[['$PTON', '60', 'c', datetime.datetime(2020, 7, 17, 0, 0)]]
[['SPY', '320', 'c', datetime.datetime(2020, 7, 10, 0, 0)]]
[['NONE', '1400', 'C', datetime.datetime(2020, 7, 10, 0, 0)]]
[['NONE', '1650', 'c', datetime.datetime(2020, 7, 31, 0, 0)]]
[['CHGG', '90', 'c', datetime.datetime(2020, 8, 21, 0, 0)], ['SPY', '340', 'c', datetime.datetime(2020, 8, 7, 0, 0)]]
[['NONE', '2000', 'c', datetime.datetime(2020, 7, 24, 0, 0)]]
[['NONE', '70', 'C', datetime.datetime

[['NONE', '85', 'C', datetime.datetime(2020, 9, 18, 0, 0)]]
[['NONE', '140', 'C', datetime.datetime(2020, 8, 21, 0, 0)]]
[['NONE', '57', 'c', datetime.datetime(2020, 7, 31, 0, 0)]]
[['BOTZ', '27', 'c', datetime.datetime(2020, 8, 21, 0, 0)], ['BOTZ', '27', 'c', datetime.datetime(2020, 8, 21, 0, 0)]]
[['FB', '215', 'p', datetime.datetime(2020, 8, 7, 0, 0)], ['FB', '285', 'p', datetime.datetime(2020, 11, 20, 0, 0)], ['FB', '215', 'p', datetime.datetime(2020, 8, 7, 0, 0)], ['FB', '210', 'p', datetime.datetime(2020, 8, 17, 0, 0)], ['FB', '205', 'p', datetime.datetime(2020, 8, 17, 0, 0)]]
[['MY', '66', 'p', datetime.datetime(2020, 7, 31, 0, 0)]]
[['NONE', '70', 'C', datetime.datetime(2020, 7, 31, 0, 0)]]
[['NONE', '5.5', 'P', datetime.datetime(2020, 8, 28, 0, 0)]]
[['NONE', '76', 'c', datetime.datetime(2020, 7, 31, 0, 0)]]
[['AAPL', '400', 'C', datetime.datetime(2020, 8, 21, 0, 0)]]
[['FB', '210.00', 'P', datetime.datetime(2020, 7, 31, 0, 0)]]
[['NONE', '420', 'c', datetime.datetime(2020, 8,

[['LU', '400', 'c', datetime.datetime(2020, 9, 18, 0, 0)]]
[['NONE', '22', 'c', datetime.datetime(2020, 11, 20, 0, 0)], ['NONE', '22', 'c', datetime.datetime(2020, 9, 25, 0, 0)], ['NONE', '25', 'c', datetime.datetime(2020, 11, 20, 0, 0)]]
[['NONE', '100', 'C', datetime.datetime(2020, 9, 11, 0, 0)], ['NONE', '100', 'C', datetime.datetime(2020, 9, 11, 0, 0)], ['NONE', '80', 'C', datetime.datetime(2020, 9, 11, 0, 0)]]
[['MY', '135', 'p', datetime.datetime(2020, 9, 18, 0, 0)], ['MY', '105', 'p', datetime.datetime(2021, 1, 15, 0, 0)]]
[['PTON', '100', 'C', datetime.datetime(2020, 9, 11, 0, 0)]]
[['NONE', '3800', 'c', datetime.datetime(2020, 9, 18, 0, 0)], ['NONE', '3950', 'c', datetime.datetime(2020, 9, 18, 0, 0)], ['NONE', '3950', 'c', datetime.datetime(2020, 9, 11, 0, 0)], ['NONE', '3800', 'c', datetime.datetime(2020, 9, 18, 0, 0)]]
[['TSLA', '420', 'P', datetime.datetime(2020, 9, 4, 0, 0)]]
[['NONE', '3800', 'c', datetime.datetime(2020, 9, 18, 0, 0)], ['NONE', '3950', 'c', datetime.datet

[['TSLA', '450', 'c', datetime.datetime(2020, 11, 21, 0, 0)]]
[['$NIO', '27', 'p', datetime.datetime(2020, 10, 16, 0, 0)]]
[['COST', '377.5', 'c', datetime.datetime(2020, 10, 23, 0, 0)], ['COST', '390', 'c', datetime.datetime(2020, 10, 3, 0, 0)], ['COST', '420', 'c', datetime.datetime(2020, 11, 27, 0, 0)]]
[['COST', '377.5', 'c', datetime.datetime(2020, 10, 23, 0, 0)], ['COST', '390', 'c', datetime.datetime(2020, 10, 3, 0, 0)], ['COST', '420', 'c', datetime.datetime(2020, 11, 20, 0, 0)]]
[['NONE', '17', 'c', datetime.datetime(2021, 1, 22, 0, 0)]]
[['SLV', '23', 'c', datetime.datetime(2020, 11, 6, 0, 0)]]
[['FSLY', '135', 'P', datetime.datetime(2020, 10, 16, 0, 0)]]
[['$F', '8', 'c', datetime.datetime(2020, 10, 16, 0, 0)]]
[['NONE', '15', 'c', datetime.datetime(2020, 11, 20, 0, 0)], ['NONE', '12', 'c', datetime.datetime(2021, 1, 15, 0, 0)]]
[['FSLY', '115', 'P', datetime.datetime(2020, 10, 23, 0, 0)]]
[['NONE', '155', 'C', datetime.datetime(2020, 11, 20, 0, 0)]]
[['NONE', '500', 'c', da

[['KODK', '5', 'p', datetime.datetime(2020, 11, 20, 0, 0)]]
[['NKLA', '80', 'p', datetime.datetime(2020, 10, 23, 0, 0)]]
[['TSLA', '420', 'p', datetime.datetime(2020, 10, 30, 0, 0)], ['TSLA', '420', 'p', datetime.datetime(2020, 10, 30, 0, 0)]]
[['NONE', '30', 'c', datetime.datetime(2020, 11, 6, 0, 0)], ['NONE', '35', 'c', datetime.datetime(2020, 11, 6, 0, 0)], ['NONE', '30', 'c', datetime.datetime(2020, 10, 30, 0, 0)], ['NONE', '35', 'c', datetime.datetime(2020, 11, 6, 0, 0)], ['NONE', '40', 'c', datetime.datetime(2020, 11, 6, 0, 0)]]
[['UVXY', '20', 'c', datetime.datetime(2020, 11, 6, 0, 0)], ['VXX', '24', 'c', datetime.datetime(2020, 10, 30, 0, 0)], ['VXX', '26', 'c', datetime.datetime(2020, 11, 6, 0, 0)]]
[['$AMD', '85', 'c', datetime.datetime(2020, 10, 30, 0, 0)]]
[['NONE', '52.5', 'c', datetime.datetime(2020, 10, 23, 0, 0)]]
[['NONE', '99', 'c', datetime.datetime(2020, 11, 20, 0, 0)]]
[['NONE', '285', 'c', datetime.datetime(2020, 10, 30, 0, 0)]]
[['NONE', '420', 'p', datetime.date

In [17]:
## work on finding the missing tickers

In [18]:
good_post_csv['ticker_locator'] = good_post_csv.apply(ticker_finder, axis=1)

SPCE
TEVA
TEVA
TEVA
TEVA
SQ
AMC
SQ
SQ
SQ
SQ
WING
DD
BP
CEO
INSG
ZNGA
ZNGA
ZNGA
ZNGA
AMD
TSLA
TSLA
SPCE
ATH
NOW
SPCE
KR
KR
NVDA
PLUG
EPAM
ZG
ARE
ENPH
ENPH
NVDA
LB
L
CEO
A
L
L
NOK
ALLY
ALLY
SPWR
NEW
SPCE
AMD
AMD
SPCE
ALLY
FIT
NOMD
RH
RH
HPE
HPE
HPQ
HPE
A
BA
BA
BA
BA
VIRT
COST
TD
SPCE
SPCE
SPCE
TGT
ARMK
DD
ARMK
CTXS
BAC
DD
A
NYC
NEXT
BEST
PLAY
C
GRPN
AL
ULTA
ON
AL
SPWR
ICUI
ICUI
DD
TSN
NYT
UPS
C
DD
DD
A
DD
DD
DD
WING
CEO
HTHT
SNAP
DD
WING
HTHT
SNAP
DD
DD
PLAY
AT
SCI
SCI
IMGN
IMGN
IMGN
IMGN
IMGN
IMGN
EDIT
EDIT
NYT
NYT
C
FIVN
MSFT
AT
ARE
ARE
OUT
LIVE
III
CLR
GME
GME
GME
OR
BE
A
OR
A
BE
OUT
NEXT
AT
ARE
ARE
OUT
NAT
NAT
NAT
X
HTHT
HTHT
HTHT
HTHT
DLTR
TGT
WMT
ZEN
DD
ZEN
BAM
A
ZEN
ZEN
ZEN
ATH
ZEN
DIS
CLDR
DD
CLDR
CLDR
CLDR
CLDR
CLDR
WMT
WMT
WMT
ABT
ABT
BJ
UNFI
UNFI
UNFI
UNFI
APRN
UNFI
UNFI
UNFI
UNFI
UNFI
UNFI
UNFI
UNFI
UNFI
UNFI
UNFI
UNFI
C
UNFI
UNFI
SPTN
COST
DD
AMD
AMD
AMD
INTC
AMD
INTC
AMD
RTX
AMD
AMD
INTC
INTC
INTC
AMD
AMD
AMD
INTC
AMD
AMD
AMD
INTC
INTC
AMD
INTC
AMD
INTC
DIS
AMD
AMD
AMD
INTC

VEEV
A
CRM
CRM
TMO
DD
FLIR
FLIR
DD
FLIR
FLIR
SO
FLIR
DD
PM
DD
EOD
AMD
AMD
J
DOW
PANW
PANW
PANW
NFLX
NFLX
C
SQ
USA
USA
SNAP
ET
BP
CVNA
PGC
ET
DD
ET
ET
NAT
DHT
NAT
ET
UFO
ET
DD
DD
A
DD
ATVI
ATVI
ATVI
ATVI
EA
SQ
SQ
USA
SQ
TSN
TSN
W
DD
ATVI
SQ
TSN
W
DD
DD
PLAY
AT
SBUX
NOW
ARE
ON
SBUX
SBUX
SBUX
CMG
SBUX
SBUX
NOW
SBUX
ORLY
ORLY
A
PCG
CEO
PCG
DD
DD
PCG
HUGE
PCG
A
PCG
CEO
A
SNAP
PCG
A
PCG
PCG
PCG
DD
SNAP
SNAP
SNAP
SNAP
SNAP
JNJ
V
DD
DD
CEO
HD
GME
GME
AM
TTOO
TTOO
TTOO
TTOO
ICPT
PBC
ICPT
ICPT
CCJ
CCJ
CCJ
WTI
WTI
DKNG
DKNG
CZR
CZR
MMM
AAPL
A
DD
FLIR
DD
FLIR
FLIR
FLIR
FLIR
FLIR
PS
WORK
MSFT
WORK
TAK
DD
TAK
DD
DD
TAK
TWO
TAK
TAK
TAK
DD
GOOD
OLD
USA
TAK
TAK
TAK
HALO
GMAB
SNY
SGEN
QURE
MMP
MMP
ATH
A
MMP
MMP
ATVI
ATH
ATH
CCL
CCL
JAN
TSLA
ATVI
ATH
DD
ATVI
ATVI
ATVI
BUD
NAT
NAT
NAT
A
WTI
NAT
NAT
OR
KHC
ET
PBR
SSL
ET
PBR
TSLA
MRO
XOM
MRO
ET
SSL
PBR
PE
PE
MA
V
MA
MA
V
V
V
V
A
B
A
MOS
AMD
AMD
AMD
AMD
AMD
DD
C
LULU
LULU
MIK
CM
DD
DD
BC
BC
PII
PII
PII
DD
PII
ATH
BIG
FAST
BABA
NTAP
NTAP
SRNE
TDA
TDA
SNAP
DD


PLAY
PCG
PCG
PT
HD
SPOT
HD
SPOT
HD
BIG
HD
HD
HD
TV
SPOT
SPOT
SPOT
SPOT
SPOT
SPOT
SPOT
HD
SPOT
WRTC
SGMO
PFE
JD
BIG
DD
JD
JD
BABA
JD
JD
JD
JD
MGM
MGM
NVDA
SQ
A
DD
DD
DD
HD
SPOT
DD
NVDA
SQ
FB
NVDA
NVDA
NVDA
NVDA
NVDA
NVDA
INTC
MU
AMD
NVDA
NVDA
NVDA
SQ
SQ
SPOT
DD
SQ
SPOT
SQ
SQ
SPOT
SPOT
NVDA
SQ
NVDA
SQ
A
DD
DD
DD
HD
SPOT
DD
NVDA
SQ
FB
NVDA
NVDA
NVDA
NVDA
NVDA
NVDA
INTC
MU
AMD
NVDA
NVDA
NVDA
SQ
SQ
SPOT
DD
SQ
SPOT
SQ
SQ
SPOT
SPOT
NVDA
SQ
FB
FB
FB
FB
FB
FB
FB
FOLD
ATH
CEO
JD
CEO
CEO
FOLD
ATH
FOUR
SQ
FOUR
FOUR
FOUR
PCG
DD
PCG
BAC
BCS
C
CS
DB
GS
HSBC
JPM
MS
UBS
WFC
BAC
C
GS
HSBC
JPM
MS
V
WFC
CCI
DD
DD
DD
FOUR
SQ
FOUR
FOUR
FOUR
SQ
HLT
UPS
FOUR
EOD
FOUR
FOUR
FOUR
SQ
PYPL
SQ
SQ
ATH
ATH
DD
GE
GE
CEO
CEO
GE
DD
GE
GE
GE
GE
GE
GE
GE
GE
GE
AMD
AMD
SHOP
AMD
A
ROKU
ROKU
FB
FB
TSLA
AAPL
CRM
TSLA
ROKU
AAPL
ROKU
ULTA
DISH
CI
DIS
DIS
ALGN
ALGN
GME
GME
CI
CI
AMD
AMD
ULTA
ULTA
BYND
DIS
DIS
PINS
PINS
TSLA
AMZN
AMZN
QQQ
GOOS
ABBV
TSLA
ATVI
ATVI
SESN
DD
FIT
FIT
AMZN
AMZN
A
UNH
UNH
AAPL
AMZN
AMZN
AAPL
GME
GME
TSL

PRPL
PRPL
AMD
AMD
AMD
AMD
AMD
AMD
AMD
AMD
CVLT
PT
CVLT
CVLT
CVLT
DBX
HD
LOW
TGT
BBY
LOW
TGT
BBY
WMT
WMT
WMT
WMT
WMT
ARE
RRR
DD
MGM
CEO
III
RKT
PRPL
BOX
DD
DD
DBX
BOX
BOX
BOX
BOX
DBX
MRK
CEO
DD
TJX
NET
NET
FSLY
NIO
TSLA
FSLY
NET
FSLY
NET
JD
BABA
DD
STWD
A
SHIP
PE
PE
TSLA
PE
A
MLHR
DD
ORN
A
GLDD
PRPL
RCII
NVDA
NIO
NKLA
CHWY
AC
CARR
DD
NVDA
NVDA
BNTC
BNTC
TSLA
PCG
BBY
BBY
BBY
GRWG
GRWG
TSLA
TSLA
OI
TSLA
OI
OI
PCG
PCG
GRWG
MU
MU
MU
MU
TSLA
JD
JD
ATH
JD
BABA
ADI
ADI
BABA
CHGG
A
CHGG
AMKR
DD
AMKR
PE
CEO
OI
AMKR
NIO
RH
NIO
VRNT
VRNT
VRNT
PRPL
LOW
VRNT
NOW
ON
AT
T
GT
AA
AMD
AMD
PS
DD
PRPL
NVDA
NVDA
RLGY
RLGY
RLGY
AMD
AMD
AAPL
AAPL
DD
A
GE
AVLR
BBY
BBY
PE
DD
PE
PE
PE
ATVI
RKT
OTRK
AMD
PLUG
AM
ATVI
OTRK
AMD
PLUG
TSLA
OTRK
TSLA
CEO
OTRK
DD
TSLA
NIO
WLL
WLL
WLL
WLL
CNK
AMC
CNK
AMC
PT
CNK
CNK
CNK
CNK
CNK
AMC
R
RH
TSLA
TA
TSLA
EOD
NIO
NIO
RH
ATVI
RH
AAPL
ON
GRWG
CEO
CFO
KODK
AAPL
OSTK
DD
PRPL
OSTK
OSTK
TA
DKS
DKS
DKS
DKS
PE
PE
DKS
RKT
RKT
RKT
DD
BEST
PLAY
NEXT
NVDA
AAPL
TSLA
NVDA
NVDA
AAPL
X
AAPL
AA

SPCE
SPCE
SPCE
SPCE
SPCE
EV
TSLA
NIO
ENPH
SEDG
FSLR
CSIQ
JKS
SOL
RUN
ENPH
JKS
RUN
BLDP
BE
FCEL
PLUG
PFE
PFE
KCAC
KCAC
EV
RKT
RKT
SHIP
SHIP
CCL
CRSR
CRSR
PLTR
GME
DISCA
DISCA
DISCA
GME
DD
TLC
TLC
CEO
PLTR
NIO
PLTR


In [19]:
good_post_csv[(good_post_csv['ticker_locator'] != 1) & (good_post_csv['ticker_locator'] != 0) & (good_post_csv['ticker_locator'] != -1)]

Unnamed: 0.1,Unnamed: 0,author,created_utc,title,link_flair_text,selftext,id,full_link,all text,regexed_combined,regexed_body,regexed_title,ticker_locator
248,248,laneciar,1585039591,Teva Pharmaceutical($TEVA) Is The Next $SPCE,DD,Im gonna keep it nice and short for you fuckin...,fo5d2y,https://www.reddit.com/r/wallstreetbets/commen...,Teva Pharmaceutical($TEVA) Is The Next $SPCEIm...,"[[NONE, 9, c, 2020-05-15 00:00:00], [NONE, 8.5...","[[NONE, 9, c, 05/15], [NONE, 8.5, c, 04/17]]",0,"[(TEVA, 4), (SPCE, 1)]"
387,387,HITLERMAHJONG,1585049576,Square (SQ) 3/24/2020 AMC - Need to be Patient,DD,Stock: **Square (SQ) - 48.30$**\n\nDirection: ...,fo83ho,https://www.reddit.com/r/wallstreetbets/commen...,Square (SQ) 3/24/2020 AMC - Need to be Patient...,"[[NONE, 40, P, 2020-04-03 00:00:00]]","[[NONE, 40, P, 4/03]]",0,"[(SQ, 5), (AMC, 1)]"
390,390,princevillian,1585049943,WING is overvalued trash. Not even your wife’s...,DD,Wingstop is trash for mouth breathing hillbill...,fo87ae,https://www.reddit.com/r/wallstreetbets/commen...,WING is overvalued trash. Not even your wife’s...,"[[NONE, 55, p, 2020-05-15 00:00:00]]","[[NONE, 55, p, 5/15]]",0,"[(WING, 1)]"
441,441,StillIntroduction7,1585053695,Did a little DD on New York,DD,I live in brooklyn and people really don’t giv...,fo9cks,https://www.reddit.com/r/wallstreetbets/commen...,Did a little DD on New YorkI live in brooklyn ...,"[[NONE, 195, p, 2020-05-01 00:00:00]]","[[NONE, 195, p, 5/1]]",0,"[(DD, 1)]"
456,456,firsure,1585054434,Puts to Buy During This BULLshit Bounce?,Discussion,No way in hell the market has bottomed out.\n\...,fo9kou,https://www.reddit.com/r/wallstreetbets/commen...,Puts to Buy During This BULLshit Bounce?No way...,"[[NONE, 270, p, 2020-06-19 00:00:00]]","[[NONE, 270, p, 6/19]]",0,"[(BP, 1)]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...
117534,117534,sjtomcat,1604935772,PLTR Gang how we feeling,Discussion,Nice little day of consolidation. Nothing craz...,jr5mbh,https://www.reddit.com/r/wallstreetbets/commen...,PLTR Gang how we feelingNice little day of con...,"[[NONE, 16, c, 2020-11-13 00:00:00]]","[[NONE, 16, c, 11/13]]",0,"[(PLTR, 1)]"
117537,117537,DisputablyGreen,1604935956,"If you liked Burry on GME, you're gonna love h...",DD,"Alright, idiots, here’s DISCA.\n\nBurry owns ...",jr5ohd,https://www.reddit.com/r/wallstreetbets/commen...,"If you liked Burry on GME, you're gonna love h...","[[NONE, 22.5, c, 2020-12-18 00:00:00]]","[[NONE, 22.5, c, 12/18]]",0,"[(DISCA, 3), (GME, 2), (TLC, 2)]"
117542,117542,sjtomcat,1604936049,PLTR Gang how we feeling,Discussion,How we all feeling after today? Seems like som...,jr5png,https://www.reddit.com/r/wallstreetbets/commen...,PLTR Gang how we feelingHow we all feeling aft...,"[[NONE, 16, c, 2020-11-13 00:00:00]]","[[NONE, 16, c, 11/13]]",0,"[(PLTR, 1)]"
117570,117570,Babyyodaenergy,1604939196,Nio Calls DEC,Discussion,How are we feeling about December NIO calls \...,jr6slg,https://www.reddit.com/r/wallstreetbets/commen...,Nio Calls DECHow are we feeling about December...,"[[NONE, 42, C, 2020-12-11 00:00:00], [NONE, 50...","[[NONE, 42, C, 12/11], [NONE, 50, C, 1/15/21]]",0,"[(NIO, 1)]"


In [20]:
good_post_csv.iloc[117537]['all text']

"If you liked Burry on GME, you're gonna love him on DISCA Alright, idiots, here’s DISCA.\n\nBurry owns 500,000 shares and a mess of calls on DISCA. You followed him on GME with less DD then I about to give you from memory. Buy it or don’t, I’m gonna take my winnings and go homesteading with that guy with a mustache.\n\nDiscovery Communications is a telecom heavyweight. They run the most profitable network portfolio in the paid cable game (Discovery Channel, TLC, HGTV, Animal Planet, Food Network, HGTV, and Travel Channel). Their networks are the #1 (TLC) and #2 (HGTV) for female viewers in domestic markets. They also have the largest sports network in all of Europe and are prepared to deal with that during the pandemic.\n\nOn their earnings call last week they said they were going to announce a new AVOD service/partnership in early December. Their current one is already doing well, their next will be even better. The rumor is coming from the CEO, so I'm fucking buying it. When the new

In [21]:
good_post_csv.iloc[387]['all text']

'Square (SQ) 3/24/2020 AMC - Need to be PatientStock: **Square (SQ) - 48.30$**\n\nDirection: **Neutral Bullish**\n\n**Price Target:** **50$**\n\n**Type: Short Put**\n\n**Fundamental:** Nothing in Square’s formula has changed since the start of the Coronavirus outbreak. **There is a note that due to the closure of many SMBs, Square’s transaction count will be significantly affected.** Square’s price fell from about 85$ to about 49$ in the span of the first Coronavirus slide, bottoming out at 32$, which was quickly brought up to about the 40$ average. Since then, it has grown alongside the rest of the market. Square will face very little growth for the foreseeable future, but can be a great candidate for covered calls.\n\n**Technical**: Priced below 20, 50, 200 day moving average. MACD slight shows bullish convergence. RSI shows around 40, coming up from oversold. Options sentiments are bullish in volume and Bearish open interest. Implied movement is a whopping 12.1% or about 5.85$ Short

In [22]:
good_post_csv.iloc[117534]['regexed_combined']

[['NONE', '16', 'c', datetime.datetime(2020, 11, 13, 0, 0)]]

In [23]:
good_post_csv.apply(add_missing_tickers, axis = 1) # should be an in place apply because i directly edit the lists

0         0
1         0
2         0
3         0
4         0
         ..
117648    0
117649    0
117650    0
117651    0
117652    0
Length: 117653, dtype: object

In [24]:
len(good_post_csv)

117653

In [25]:
good_post_csv[good_post_csv['regexed_combined'] != 0]

Unnamed: 0.1,Unnamed: 0,author,created_utc,title,link_flair_text,selftext,id,full_link,all text,regexed_combined,regexed_body,regexed_title,ticker_locator
6,6,angryrantingdude,1569872663,Should I cut my loss or turn my puts into a st...,Options,So I bought $T $37.5p 10/4 expiration. But ob...,dblal0,https://www.reddit.com/r/wallstreetbets/commen...,Should I cut my loss or turn my puts into a st...,"[[$T, 37.5, p, 2019-10-04 00:00:00]]","[[$T, 37.5, p, 10/4]]",0,1
136,136,clmohn,1585011346,COVID-19 was just something printing me tendie...,Discussion,[removed],fnzzcm,https://www.reddit.com/r/wallstreetbets/commen...,COVID-19 was just something printing me tendie...,"[[NONE, 200, P, 2020-05-15 00:00:00]]",0,"[[NONE, 200, P, 5/15]]",-1
159,159,lost_civilizations,1585016402,Hotels and Airline bailouts,Fundamentals,Wake up people - The only reason these compani...,fo0vym,https://www.reddit.com/r/wallstreetbets/commen...,Hotels and Airline bailoutsWake up people - Th...,"[[HLT, 40, P, 2020-05-01 00:00:00]]","[[HLT, 40, P, 5/1]]",0,1
161,161,CodfishCannon,1585017606,AMZN Call Spread Bug,YOLO,Found a fun little Robinhood glitch: more valu...,fo13kb,https://www.reddit.com/r/wallstreetbets/commen...,AMZN Call Spread BugFound a fun little Robinho...,"[[HDB, 35, p, 2020-04-17 00:00:00], [EWP, 19, ...","[[HDB, 35, p, 4/17], [EWP, 19, p, 4/17], [BLK,...",0,1
210,210,jlk1994-,1585034103,Was I misdiagnosed?,Discussion,"I got diagnosed with Asperger, like an actual ...",fo44sm,https://www.reddit.com/r/wallstreetbets/commen...,Was I misdiagnosed?I got diagnosed with Asperg...,"[[VTI, 70, p, 2020-04-17 00:00:00]]","[[VTI, 70, p, 4/17]]",0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
117542,117542,sjtomcat,1604936049,PLTR Gang how we feeling,Discussion,How we all feeling after today? Seems like som...,jr5png,https://www.reddit.com/r/wallstreetbets/commen...,PLTR Gang how we feelingHow we all feeling aft...,"[[PLTR, 16, c, 2020-11-13 00:00:00]]","[[NONE, 16, c, 11/13]]",0,"[(PLTR, 1)]"
117549,117549,yeeterboi1234567,1604937591,Who else went all in on RKT today?,YOLO,Went all in on RKT 11/13 25c lol. Who else?,jr68pb,https://www.reddit.com/r/wallstreetbets/commen...,Who else went all in on RKT today?Went all in ...,"[[RKT, 25, c, 2020-11-13 00:00:00]]","[[RKT, 25, c, 11/13]]",0,1
117570,117570,Babyyodaenergy,1604939196,Nio Calls DEC,Discussion,How are we feeling about December NIO calls \...,jr6slg,https://www.reddit.com/r/wallstreetbets/commen...,Nio Calls DECHow are we feeling about December...,"[[NIO, 42, C, 2020-12-11 00:00:00], [NIO, 50, ...","[[NONE, 42, C, 12/11], [NONE, 50, C, 1/15/21]]",0,"[(NIO, 1)]"
117573,117573,gravitiz3,1604939411,PLTR gang,Discussion,Y’all holding long term? Where do you see it h...,jr6v8s,https://www.reddit.com/r/wallstreetbets/commen...,PLTR gangY’all holding long term? Where do you...,"[[PLTR, 10, c, 2021-05-21 00:00:00]]","[[NONE, 10, c, 5/21/20]]",0,"[(PLTR, 1)]"


In [26]:
good_post_csv.iloc[248]['regexed_combined']

[['TEVA', '9', 'c', datetime.datetime(2020, 5, 15, 0, 0)],
 ['TEVA', '8.5', 'c', datetime.datetime(2020, 4, 17, 0, 0)]]

In [27]:
good_post_csv['num posts'] = good_post_csv['regexed_combined'].apply(lambda x: len(x) if type(x) is list else x)

In [28]:
sum(good_post_csv['num posts'])

11253

In [29]:
good_post_csv.to_csv('output.csv')

In [30]:
good_post_csv.to_json('out_json.json')