In [1]:
import json
import os
from tqdm import tqdm
import pandas as pd
import re
import numpy as np
import datetime
import dateutil.parser as dp
from dateutil.relativedelta import relativedelta

In [2]:
def ticker_extender(pos_list):
    prev_ticker = "NONE"
    for group in pos_list:
        if not group[0]:
            group[0] = prev_ticker
        else:
            prev_ticker = group[0]
    return pos_list

In [3]:
def regex_pos(post_text):
    if pd.isna(post_text): return 0
    x = []
    p1 = re.compile(r'((?:\$?[A-Z]{1,4})(?:[a-z]{1,3})?)?\s+\$?(?:(\d+(?:\.\d\d?)?)\s*([CcPp])\w{0,4}\s+(\d\d?\/\d\d?(?:\/\d{2,4})?))')
    p2 = re.compile(r'((?:\$?[A-Z]{1,4})(?:[a-z]{1,3})?)?\s+(\d\d?\/\d\d?(?:\/\d{2,4})?)\s+\$?(\d+(?:\.\d\d?)?)\s*([CcPp])\w{0,4}')
    p3 = re.compile(r'(\d\d?\/\d\d?(?:\/\d{2,4})?)\s+((?:\$?[A-Z]{1,4})(?:[a-z]{1,3})?)\s+\$?(\d+(?:\.\d\d?)?)\s*([CcPp])\w{0,4}')
    p4 = re.compile(r'((?:\$?[A-Z]{1,4})(?:[a-z]{1,3})?)?\s+(\d{1,2}\s?[A-Z]{1,3})\s+\$?(\d+(?:\.\d\d?)?)\s*([CcPp])\w{0,4}')
    p5 = re.compile(r'(\d{1,2}\s?[A-Z]{1,3})\s+((?:\$?[A-Z]{1,4})(?:[a-z]{1,3})?)?\s+\$?(\d+(?:\.\d\d?)?)\s*([CcPp])\w{0,4}')
    p6 = re.compile(r'([A-Z]{1,3}\s?\d{1,2})\s+((?:\$?[A-Z]{1,4})(?:[a-z]{1,3})?)?\s+\$?(\d+(?:\.\d\d?)?)\s*([CcPp])\w{0,4}')
    p7 = re.compile(r'((?:\$?[A-Z]{1,4})(?:[a-z]{1,3})?)?\s+([A-Z]{1,3}\s?\d{1,2})\s+\$?(\d+(?:\.\d\d?)?)\s*([CcPp])\w{0,4}')
    
    # original ticker: (\$?[A-Z]{1,4})
    # alt ticker: ((?:\$?[A-Z]{1,4})(?:[a-z]{1,3})?)
    
    x1 = re.findall(p1, post_text)
    x2 = re.findall(p2, post_text)
    x3 = re.findall(p3, post_text)
    x4 = re.findall(p4, post_text) # diving into alpha-numeric dates like 20 NOV
    x5 = re.findall(p5, post_text) #alpha 20 nov, date first
    x6 = re.findall(p6, post_text) #alpha nov 20, date first
    x7 = re.findall(p7, post_text) #alpha nov 20, ticker first
    
    #dictating ticker-strike-c/p-date as standard. x1 stays, the rest must be edited 
    if x1:
        x1_e = []
        for match in x1:
            ordered = [match[0].upper(), match[1], match[2], match[3]]
            x1_e.append(ordered)
        x.extend(x1_e)
    if x2: #ticker-date-strike-c/p
        x2_e = []
        for match in x2:
            ordered = [match[0].upper(),match[2],match[3],match[1]]
            x2_e.append(ordered)
        x.extend(x2_e)
    if x3: #date-ticker-strike-c/p
        x3_e = []
        for match in x3:
            ordered = [match[1].upper(),match[2],match[3],match[0]]
            x3_e.append(ordered)
        x.extend(x3_e)
    if x4: # ticker-date(alphanumeric)-stike-c/p
        x4_e = []
        for match in x4:
            ordered = [match[0].upper(),match[2],match[3],match[1]]
            x4_e.append(ordered)
        x.extend(x4_e)
    if x5: #date(alpha)-ticker-strike-cp
        x5_e = []
        for match in x5:
            ordered = [match[1].upper(),match[2],match[3],match[0]]
            x5_e.append(ordered)
        x.extend(x5_e)
    if x6: #same as above, diff alpha
        x6_e = []
        for match in x6:
            ordered = [match[1].upper(),match[2],match[3],match[0]]
            x6_e.append(ordered)
        x.extend(x6_e)
    if x7:
        x7_e = []
        for match in x7:
            ordered = [match[0].upper(),match[2],match[3],match[1]]
            x7_e.append(ordered)
        x.extend(x7_e)

    if x:
        x = ticker_extender(x) # see above
        return x 
    else:
        return 0

In [4]:
# adds -1 to unproccessed dates, these are worth dropping
def date_proccessor(pos_list):
    if pos_list == 0: return pos_list
    new_list = pos_list
    for position in new_list:
        date = position[-1]
        try:
            p_date = dp.parse(date)
        except:
            try:
                date_e = date.split('/')
                if len(date_e) == 2:
                    date_e = '/'.join(i for i in date_e[::-1])
                elif len(date_e) == 3:
                    tmp = date_e[0]
                    date_e[0] = date_e[1]
                    date_e[1] = tmp
                    date_e = '/'.join(i for i in date_e[::-1])
                p_date = dp.parse(date_e)                
            except:
                print("date parse error, probably worth dropping")
                print(date)
                p_date = -1
        position[-1] = p_date
    return new_list
# hanndled the -1 entries for dates. if no positions left, drops the entire entry
def date_proccessor_corrector(pos_list):
    if pos_list == 0: return pos_list
    new_list = []
    for position in pos_list:
        if position[-1] != -1:
            new_list.append(position)
    if new_list:
        return new_list
    else:
        return 0

In [5]:
# get the created year, compare to expiry
# make adjustments as needed
# return
#initial comparison made to current because all posts with no year originally have current year added to them
#   as part of pre-processing
def expiry_year_corrector(row):
    post_date = int(row['created_utc'])
    post_date = datetime.datetime.fromtimestamp(post_date)
    combined = row['regexed_combined']
    if combined == 0: return combined
    current_year = datetime.datetime.now().year
    for position in combined:
        expiry = position[3]
        expiry_year = expiry.year
        expiry_month = expiry.month
        post_year = post_date.year
        post_month = post_date.month
        
        if expiry_year > current_year: 
            expiry = expiry.timestamp()
            continue # continue to other positions in post
            #return combined # no changes needed
        
        if expiry_month >= post_month: # expiry month is greater than month, is POST year
            expiry = datetime.datetime(post_year,expiry.month,expiry.day).timestamp()
        elif expiry_month < post_month: # expiry month is less than post month, is POST year + 1
            expiry = datetime.datetime(post_year+1,expiry.month,expiry.day).timestamp()
        position[3] = expiry
    print(combined)
    return combined

In [6]:
def ticker_finder(row):
    if row['regexed_combined'] == 0:
        return 0
    if type(row['selftext']) == float:
        return 0 
    if row['regexed_combined'][0][0] != 'NONE':
        return 1
    lookup_string = row['title'] + ' ' + row['selftext']
    tickers = check_tickers(clean_post(lookup_string))
    the_goods = tickers.most_common(3)
    return the_goods if the_goods else -1

In [7]:
from nltk.corpus import stopwords
from nltk import FreqDist
import collections
import string

def clean_post(post_text):
    translator = str.maketrans('', '', string.punctuation) # for removing punctuation
    post_text = post_text.translate(translator)
    token_text = [t for t in post_text.replace('\n',' ').split()]
    
    sr = stopwords.words('english')
    sr.extend(['gay', 'bear', 'girlfriend', 'bull'])
    cleanTokens = [i for i in token_text if i not in sr]
    
    return cleanTokens

nasdaq = pd.read_csv('tickers/nasdaq.csv')
nyse = pd.read_csv('tickers/nyse.csv')

def check_tickers(tokens):
    ticker_counter = collections.Counter()
    for token in tokens:
        if any(nasdaq['Symbol'].isin([token])): 
            ticker_counter[token] += 1
            print(token)
        if any(nyse['Symbol'].isin([token])): 
            ticker_counter[token] += 1
            print(token)
    return ticker_counter

In [8]:
def add_missing_tickers(row):
    # rows with no regex
    if row['regexed_combined'] == 0: return row['regexed_combined']
    # if there are no missing tickers
    if row['regexed_combined'][0][0] != "NONE": return row['regexed_combined']
    # if there are missing tickers
    for position in row['regexed_combined']:
        if position[0] == "NONE" and not (type(row['ticker_locator']) == int):
            position[0] = row['ticker_locator'][0][0] # adds in ticker of most common ticker mentioned - best guess
    return row['regexed_combined']

In [9]:
#good_post_csv = pd.read_csv('all_good_posts.csv')
good_post_csv = pd.read_json('master_json_good.json')

In [10]:
good_post_csv['all text'] = good_post_csv['title'] + good_post_csv['selftext']

In [11]:
good_post_csv['regexed_combined'] = good_post_csv['all text'].apply(regex_pos)
good_post_csv['regexed_body'] = good_post_csv['selftext'].apply(regex_pos)
good_post_csv['regexed_title'] = good_post_csv['title'].apply(regex_pos)

In [12]:
#good_post_csv.to_csv('output1.csv')

In [13]:
good_post_csv['regexed_combined'] = good_post_csv['regexed_combined'].apply(date_proccessor)

date parse error, probably worth dropping
15 IAU
date parse error, probably worth dropping
10 GLD
date parse error, probably worth dropping
20 SLV
date parse error, probably worth dropping
2 BA
date parse error, probably worth dropping
ELI 18
date parse error, probably worth dropping
ELI 18
date parse error, probably worth dropping
ELI 18
date parse error, probably worth dropping
ELI 18
date parse error, probably worth dropping
ELI 18
date parse error, probably worth dropping
21HD
date parse error, probably worth dropping
21 TGT
date parse error, probably worth dropping
SLV 16
date parse error, probably worth dropping
SPY2
date parse error, probably worth dropping
1 SPY
date parse error, probably worth dropping
60
AAL
date parse error, probably worth dropping
15
BA
date parse error, probably worth dropping
P 60
date parse error, probably worth dropping
75C
date parse error, probably worth dropping
PSA 21
date parse error, probably worth dropping
PSA 21
date parse error, probably worth 

In [14]:
good_post_csv['regexed_combined'] = good_post_csv['regexed_combined'].apply(date_proccessor_corrector)

In [15]:
good_post_csv['regexed_combined'] = good_post_csv.apply(expiry_year_corrector, axis=1)

[['NONE', '4', 'P', 1587096000.0], ['NONE', '4', 'C', 1587096000.0]]
[['NONE', '110', 'c', 1602820800.0]]
[['NONE', '25', 'c', 1600401600.0]]
[['NONE', '36', 'c', 1610686800.0]]
[['NONE', '100', 'p', 1597982400.0]]
[['NONE', '47.5', 'p', 1584676800.0]]
[['PFE', '36.5', 'C', 1598587200.0]]
[['NONE', '130', 'P', 1597982400.0]]
[['ADMA', '5', 'c', 1605848400.0], ['TAK', '22.5', 'c', 1610686800.0], ['BAX', '110', 'c', 1605848400.0], ['GRFS', '30', 'c', 1605848400.0], ['TAK', '22.5', 'c', 1605848400.0], ['BAX', '110', 'c', 1610686800.0], ['GRFS', '30', 'c', 1605848400.0]]
[['NIO', '7', 'p', 1596168000.0]]
[['TSLA', '2000', 'c', 1595563200.0]]
[['NONE', '80', 'c', 1605848400.0]]
[['NONE', '32', 'c', 1605848400.0]]
[['NONE', '13', 'C', 1607058000.0], ['NONE', '14', 'C', 1606453200.0], ['NONE', '13', 'C', 1607058000.0], ['NONE', '14', 'C', 1607058000.0], ['NONE', '15', 'C', 1606453200.0]]
[['DIS', '80', 'p', 1585281600.0]]
[['DIS', '80', 'p', 1585281600.0]]
[['SPXS', '30', 'c', 1584676800.0]]


In [16]:
## work on finding the missing tickers

In [17]:
good_post_csv['ticker_locator'] = good_post_csv.apply(ticker_finder, axis=1)

APRN
APRN
DD
APRN
APRN
A
HAE
RXT
RKT
RXT
RKT
DIS
NCLH
NCLH
NCLH
NCLH
DD
KDP
LOGI
KDP
GME
C
C
PPC
CEO
MGNI
MGNI
C
C
C
PLTR
AMZN
SOLO
PTON
PTON
PLTR
TSLA
NIO
EV
PLTR
BB
DD
PLTR
A
DLR
TSLA
ICLN
ICLN
ICLN
ICLN
GRMN
RXT
ENPH
GOOG
MSFT
SNOW
DDOG
NET
GOOG
MSFT
RXT
CEO
CFO
DDOG
NIO
NIO
AAL
DAL
UAL
LUV
SP
DOW
TSLA
NIO
WWE
PENN
PENN
WMT
WMT
WMT
WMT
WMT
PE
ON
DK
DK
KODK
EXP
JD
BABA
EV
PLUG
PDT
WYNN
DD
A
WYNN
WYNN
CLDR
CLDR
ON
SHIP
NOW
CLDR
CLR
GME
GME
GME
GME
EOD
AMD
DD
LRN
CF
PRPL
POST
PRPL
SQ
DD
DD
AAPL
UNFI
UNFI
UNFI
UNFI
PTON
CRWD
SWBI
SWBI
SWBI
ONE
BABA
BABA
BABA
BABA
NIO
EV
EV
EV
NIO
ICE
EV
BOOM
FB
FB
AA
ELY
ELY
A
A
AMD
MU
C
PLTR
PLTR
PLTR
PLTR
PLTR
GL
SPCE
DD
UPS
UPS
UPS
UPS
NVDA
CHWY
CHWY
DD
DD
TAK
TWO
TAK
TAK
HPQ
HP
HP
NIO
NIO
W
D
D
W
W
D
TSLA
NFLX
WORK
UI
TLT
DD
TOL
TDOC
TDOC
WMT
TGT
TDOC
TDOC
AAPL
HPQ
TOL
TLT
PLTR
GME
CCJ
NOW
JAN
C
ZUO
MSFT
SPCE
A
SPCE
TSLA
TDOC
AMD
DD
AMD
DD
AMD
AMD
AMD
AMD
AMD
ARE
X
AMD
AMD
BE
BIG
NAVI
ZEN
AMD
AMD
AMD
C
BCRX
DD
BCRX
HAE
BCRX
NKLA
PLTR
PLTR
MRK
CEO
DD

In [18]:
good_post_csv[(good_post_csv['ticker_locator'] != 1) & (good_post_csv['ticker_locator'] != 0) & (good_post_csv['ticker_locator'] != -1)]

Unnamed: 0,author,created_utc,title,link_flair_text,selftext,id,full_link,all text,regexed_combined,regexed_body,regexed_title,ticker_locator
10,alcakd,1585689509,WTF is up with APRN - opportunity or am I reta...,Discussion,"So, APRN was on the verge of bankruptcy and tr...",fsqolv,https://www.reddit.com/r/wallstreetbets/commen...,WTF is up with APRN - opportunity or am I reta...,"[[NONE, 4, P, 1587096000.0], [NONE, 4, C, 1587...","[[NONE, 4, P, 4/17], [NONE, 4, C, 4/17]]",0,"[(APRN, 4), (DD, 1), (A, 1)]"
50,superkhanbeats,1598396489,Haemonetics (HAE) machine spotted being used f...,DD,,igrwmm,https://www.reddit.com/r/wallstreetbets/commen...,Haemonetics (HAE) machine spotted being used f...,"[[NONE, 110, c, 1602820800.0]]",0,"[[NONE, 110, c, 10/16]]","[(HAE, 1)]"
64,h0ld4wg,1598399338,Is RXT the next RKT!? 🌚🌝 🚀,DD,Data center company that is basically an exten...,igsk0t,https://www.reddit.com/r/wallstreetbets/commen...,Is RXT the next RKT!? 🌚🌝 🚀Data center company ...,"[[NONE, 25, c, 1600401600.0]]","[[NONE, 25, c, 9/18]]",0,"[(RXT, 2), (RKT, 2)]"
157,suicide_walter,1595715653,Anyone Buying Disney Puts?,Discussion,"The way I see it, Disney is going to have to c...",hxzqjv,https://www.reddit.com/r/wallstreetbets/commen...,"Anyone Buying Disney Puts?The way I see it, Di...","[[NONE, 100, p, 1597982400.0]]","[[NONE, 100, p, 08/21]]",0,"[(DIS, 1)]"
221,MrHud2,1581867398,NCLH puts?,Options,"Greetings fellow financial retards,\nHow do so...",f4ws7i,https://www.reddit.com/r/wallstreetbets/commen...,"NCLH puts?Greetings fellow financial retards,\...","[[NONE, 47.5, p, 1584676800.0]]","[[NONE, 47.5, p, 3/20]]",0,"[(NCLH, 4)]"
...,...,...,...,...,...,...,...,...,...,...,...,...
145875,miolini,1609241062,SWI options blocked,Discussion,Hello! Does anybody knows what happening with ...,kmhgf2,https://www.reddit.com/r/wallstreetbets/commen...,SWI options blockedHello! Does anybody knows w...,"[[NONE, 12.5, P, 1610427600.0]]","[[NONE, 12.5, P, 01/12]]",0,"[(SWI, 2)]"
145935,runitupmikey,1608680160,Why it's not too late (still early) to invest ...,DD,MGNI holder since $10 here. Also holding 800 c...,kim7ej,https://www.reddit.com/r/wallstreetbets/commen...,Why it's not too late (still early) to invest ...,"[[NONE, 25, C, 2021-01-15 00:00:00]]","[[NONE, 25, C, 1/15/21]]",0,"[(MGNI, 6), (TTD, 4), (TV, 3)]"
146027,KimJong-UnsBodyguard,1604678381,$ICLN,DD,Biden has made his views on clean energy clear...,jpdc0t,https://www.reddit.com/r/wallstreetbets/commen...,$ICLNBiden has made his views on clean energy ...,"[[NONE, 20, c, 1605848400.0]]","[[NONE, 20, c, 11/20]]",0,"[(ICLN, 3)]"
146032,sjtomcat,1604678868,Argument for DIS puts,Discussion,I hate being a 🌈 🐻 BUT with Disney to report e...,jpdhga,https://www.reddit.com/r/wallstreetbets/commen...,Argument for DIS putsI hate being a 🌈 🐻 BUT wi...,"[[NONE, 120, p, 1605243600.0]]","[[NONE, 120, p, 11/13]]",0,"[(DIS, 1)]"


In [19]:
good_post_csv.iloc[117537]['all text']

'3 Russian doctors fall from hospital windows during pandemic'

In [20]:
good_post_csv.iloc[387]['all text']

'Hydrogen Fuel Cell SPAC[removed]'

In [21]:
good_post_csv.iloc[117534]['regexed_combined']

0

In [22]:
good_post_csv.apply(add_missing_tickers, axis = 1) # should be an in place apply because i directly edit the lists

0         0
1         0
2         0
3         0
4         0
         ..
146235    0
146236    0
146237    0
146238    0
146239    0
Length: 146240, dtype: object

In [23]:
len(good_post_csv)

146240

In [24]:
good_post_csv[good_post_csv['regexed_combined'] != 0]

Unnamed: 0,author,created_utc,title,link_flair_text,selftext,id,full_link,all text,regexed_combined,regexed_body,regexed_title,ticker_locator
10,alcakd,1585689509,WTF is up with APRN - opportunity or am I reta...,Discussion,"So, APRN was on the verge of bankruptcy and tr...",fsqolv,https://www.reddit.com/r/wallstreetbets/commen...,WTF is up with APRN - opportunity or am I reta...,"[[APRN, 4, P, 1587096000.0], [APRN, 4, C, 1587...","[[NONE, 4, P, 4/17], [NONE, 4, C, 4/17]]",0,"[(APRN, 4), (DD, 1), (A, 1)]"
50,superkhanbeats,1598396489,Haemonetics (HAE) machine spotted being used f...,DD,,igrwmm,https://www.reddit.com/r/wallstreetbets/commen...,Haemonetics (HAE) machine spotted being used f...,"[[HAE, 110, c, 1602820800.0]]",0,"[[NONE, 110, c, 10/16]]","[(HAE, 1)]"
64,h0ld4wg,1598399338,Is RXT the next RKT!? 🌚🌝 🚀,DD,Data center company that is basically an exten...,igsk0t,https://www.reddit.com/r/wallstreetbets/commen...,Is RXT the next RKT!? 🌚🌝 🚀Data center company ...,"[[RXT, 25, c, 1600401600.0]]","[[NONE, 25, c, 9/18]]",0,"[(RXT, 2), (RKT, 2)]"
72,No_Choice6240,1598401936,First energy calls,Options,What is up with first energy? Stock hasn’t mov...,igt3oh,https://www.reddit.com/r/wallstreetbets/commen...,First energy callsWhat is up with first energy...,"[[NONE, 36, c, 1610686800.0]]","[[NONE, 36, c, 1/15/20]]",0,-1
157,suicide_walter,1595715653,Anyone Buying Disney Puts?,Discussion,"The way I see it, Disney is going to have to c...",hxzqjv,https://www.reddit.com/r/wallstreetbets/commen...,"Anyone Buying Disney Puts?The way I see it, Di...","[[DIS, 100, p, 1597982400.0]]","[[NONE, 100, p, 08/21]]",0,"[(DIS, 1)]"
...,...,...,...,...,...,...,...,...,...,...,...,...
146069,johnyt3,1604688725,Volatility is Coming Back: How Election News i...,DD,**Section** **I. The Election**\n\na. A Repu...,jpgd94,https://www.reddit.com/r/wallstreetbets/commen...,Volatility is Coming Back: How Election News i...,"[[VXX, 24, C, 1605848400.0]]","[[VXX, 24, C, 11/20]]",0,1
146166,kill_wonka,1602418485,AMD will not buy Xilinx,DD,"After sneaky WSJ reported ""talks"" of AMD buyin...",j99a27,https://www.reddit.com/r/wallstreetbets/commen...,AMD will not buy XilinxAfter sneaky WSJ report...,"[[AMD, 80, c, 1602820800.0], [XLNX, 120, p, 16...","[[AMD, 80, c, 10/16], [XLNX, 120, p, 10/16], [...",0,1
146190,JuicePick,1602425501,FanDuel has the HIV today. Calls on DKNG.,Discussion,“Our stat provider is experiencing a major ser...,j9beyv,https://www.reddit.com/r/wallstreetbets/commen...,FanDuel has the HIV today. Calls on DKNG.“Our ...,"[[DKNG, 50, c, 1602820800.0]]","[[DKNG, 50, c, 10/16]]",0,1
146213,UnusualW_Mod,1602431695,Stimulus and Payment Processors - Catch the news,Discussion,I think there is a big way to make consistent ...,j9dbt0,https://www.reddit.com/r/wallstreetbets/commen...,Stimulus and Payment Processors - Catch the ne...,"[[PYPL, 202, c, 1604030400.0], [SQ, 220, c, 16...","[[PYPL, 202, c, 10/30], [SQ, 220, c, 10/23]]",0,1


In [25]:
good_post_csv.iloc[248]['regexed_combined']

0

In [26]:
good_post_csv['num posts'] = good_post_csv['regexed_combined'].apply(lambda x: len(x) if type(x) is list else x)

In [27]:
sum(good_post_csv['num posts']) #duplicates dealt with in phase 4

13937

In [28]:
good_post_csv.to_json('final_output.json')
good_post_csv.to_csv('final_output.csv')