In [2]:
import json
import os
from tqdm import tqdm
import pandas as pd
import re
def regex_pos(post_text):
    x = []
    x1 = re.findall('[A-Z]{1,4}\s\d{1,3}\s?\w{1,4}\s\d{1,2}/\d{1,4}', post_text)
    x2 = re.findall('\$?[A-Z]{1,4}\s\d{1,2}\/\d{1,4}\s\$?\d{1,3}\s\w{1,4}', post_text)
    
    if x1:
        x.extend(x1)
    if x2:
        x.extend(x2)
    return x 

In [3]:
## lets make the attempt with nltk and see what we get
from nltk.corpus import stopwords
from nltk import FreqDist
import collections
import string

def clean_post(post_text):
    translator = str.maketrans('', '', string.punctuation) # for removing punctuation
    post_text = post_text.translate(translator)
    token_text = [t for t in post_text.replace('\n',' ').split()]
    
    sr = stopwords.words('english')
    cleanTokens = [i for i in token_text if i not in sr]
    
    return cleanTokens

nasdaq = pd.read_csv('tickers/nasdaq.csv')
nyse = pd.read_csv('tickers/nyse.csv')
def check_tickers(freq_dist):
    for token in freq_dist:
        if type(token) == tuple: token = token[0]
        if any(nasdaq['Symbol'].isin([token])): return token
        if any(nyse['Symbol'].isin([token])): return token
    return None

In [6]:
directory = 'post_data'
numposts = 0
textposts = 0
positions = 0
regexed = []
for file in os.scandir(directory):
    with open(file.path, 'r') as f:
        posts = json.load(f)
        for post in posts:
            numposts += 1
            if 'selftext' in post:
                textposts += 1
                tmp = regex_pos(post['selftext'])
                if tmp:
                    positions += len(tmp)
                    regexed.append(tmp)
print('Total posts in JSON: {}'.format(numposts))
print('Posts with text: {}'.format(textposts))
print('Total positions found: {}'.format(positions))
print('Success rate (out of posts with with text): {}%'.format(round(positions/textposts * 100,2)))

Total posts in JSON: 109912
Posts with text: 107919
Total positions found: 0
Success rate (out of posts with with text): 0.0%


In [4]:
post_csv = pd.read_csv('all_good_posts.csv')

In [7]:
numposts = 0
textposts = 0
positions = 0
title_positions = 0
regexed = []
regexed_title = []
tickers_found_body = []
tickers_found_title = []
for index,row in tqdm(post_csv.iterrows()):
    numposts += 1
    nulltext = pd.isna(row['selftext'])
    if not nulltext:
        textposts += 1
        r_text = regex_pos(row['selftext'])
        if r_text:
            positions += len(r_text)
            regexed.append((r_text,index))
    r_title = regex_pos(row['title'])
    if r_title:
        title_positions += len(r_title)
        regexed_title.append((r_title,index))
    if not r_text and not r_title: # if nothing was found in either title or text
        if not nulltext:
            freq = FreqDist(clean_post(row['selftext']))
            attempt = check_tickers(freq.most_common(10)) #checks only top ten freqs
            if attempt: 
                tickers_found_body.append([attempt,row['selftext']])
                #continue
        attempt = check_tickers(clean_post(row['title']))
        if attempt:
            tickers_found_title.append([attempt,row['title']])
    
print('Total posts in CSV: {}'.format(numposts))
print('Posts with text: {}'.format(textposts))
print('Total positions found: {}'.format(positions))
print('Success rate (body positions / all posts): {}%'.format(round(positions/numposts * 100,2)))
print('Success rate (title positions / all posts): {}%'.format(round(title_positions/numposts * 100,2)))
print('Tickers found in post texts regexed failed: {}'.format(len(tickers_found_body)))
print('Tickers found in post titles regexed failed: {}'.format(len(tickers_found_title)))

115080it [19:27, 98.56it/s] 

Total posts in CSV: 115080
Posts with text: 0
Total positions found: 2603





ZeroDivisionError: division by zero

In [46]:
print(len(regexed))
regexed

1499


[(['BJ 50c 11/20'], 119),
 (['NKLA 30p 12/18', 'NKLA 20p 12/18'], 194),
 (['ALB 90c 10/16'], 207),
 (['ASML 390c 10/9'], 250),
 (['$SPY 3/20 $300 poot'], 639),
 (['SPY 4/6 $100 puts'], 790),
 (['RUSS 3/20 $10 call'], 841),
 (['SPY 230 3/27', 'AMD 47 3/20'], 962),
 (['BA 270 3/6'], 1039),
 (['SPY 200p 4/17', 'SPY 200p 4/17'], 1109),
 (['SPY 200p 4/17'], 1189),
 (['SPY 170p 4/22', 'SPY 200p 5/1'], 1226),
 (['SPY 235p 5/1', 'SPXU 35c 4/17'], 1329),
 (['NRZ 5c 4/17',
   'NRZ 6c 4/17',
   'LYFT 21p 5/1',
   'UBER 20p 5/1',
   'SPY 195p 5/1',
   'SHO 5p 6/19'],
  1379),
 (['SPY 220 4/17'], 1448),
 (['SPX 2500P 4/17'], 1491),
 (['ZM 160c 5/1'], 1492),
 (['SPY 200p 4/17'], 1497),
 (['BABA 202c 5/8'], 1634),
 (['SPY 250p 6/18'], 1659),
 (['SBUX 70P 5/08',
   'SPY 229 P 5/15',
   'DIS 90P 5/15',
   'SPY 145 P 5/29',
   'NFLX 240P 6/05',
   'BYND 65P 6/19',
   'DRI 60P 7/17',
   'IWM 95P 8/21',
   'BYND 50P 8/21',
   'IWM 95P 9/18'],
  1690),
 (['DHT 6c 6/19', 'FRO 12c 6/19', 'CCL 14p 5/8'], 1721

In [47]:
print(len(regexed_title))
regexed_title

320


[(['$CLVS 12/20 $20 call'], 500),
 (['TSLA 530c 9/4'], 1347),
 (['SPY 280 6/21'], 2109),
 (['SPY 300p 9/18'], 2406),
 (['TSLA 9/11 $600 Call'], 2886),
 (['KODK 9/4 $6 Puts'], 3039),
 (['NFLX 500c 9/18'], 3146),
 (['NFLX 500c 9/18'], 3153),
 (['NFLX 600c 9/4'], 3216),
 (['TSLA 490c 9/11'], 3311),
 (['AMD 90C 10/16'], 3543),
 (['AMD 90C 10/16'], 3545),
 (['MSFT 3/20 $200 call'], 3907),
 (['SPY 310c 3/13'], 4455),
 (['SPY 310 3/13'], 4659),
 (['SPY 310 3/13'], 4667),
 (['SPY 310 3/13'], 4681),
 (['SPY 310 3/13'], 4687),
 (['SPY 310 3/13'], 4693),
 (['MDT 115c 4/17'], 5148),
 (['APHA 7/10 6 C'], 5421),
 (['SPY 324 6/10'], 5761),
 (['AMZN 3550c 7/17'], 5965),
 (['AMZN 3500c 9/18', 'SNAP 30c 1/15'], 6140),
 (['TSLA 1500c 10/7'], 6258),
 (['AAPL 390c 7/17'], 6283),
 (['WMT 126c 7/17'], 6349),
 (['SPY 360c 9/18'], 6788),
 (['PLAY 20c 9/18'], 7033),
 (['SPCE 69 by 4/20'], 7473),
 (['TSLA 800 call 2/11'], 7568),
 (['SPY 300c 4/17'], 8578),
 (['ADBE 325 P 6/19', 'P 6/19 30 mins'], 9734),
 (['IWM 

In [55]:
pd.DataFrame(regexed)

Unnamed: 0,0,1
0,[BJ 50c 11/20],119
1,"[NKLA 30p 12/18, NKLA 20p 12/18]",194
2,[ALB 90c 10/16],207
3,[ASML 390c 10/9],250
4,[$SPY 3/20 $300 poot],639
...,...,...
1494,[TSLA 310p 9/11],109663
1495,[SHW 550p 10/16],109664
1496,[TSLA 9/11 320 call],109737
1497,[FIT 5p 1/22],109822


In [36]:
len(tickers_found)

9403

In [49]:
type(freq.most_common(1)[0]) == tuple

True