In [87]:
import json
import os
from tqdm import tqdm
import pandas as pd
import re
def regex_pos(post_text):
    x = []
    x1 = re.findall('\$?[A-Z]{1,4}\s\$?\d{1,3}\.?\d{1,2}?\s?\w{1,5}\s\d{1,2}\/\d{1,4}', post_text)
    x2 = re.findall('\$?[A-Z]{1,4}\s\d{1,2}\/\d{1,4}\s\$?\d{1,3}\.?\d{1,2}?\s?\w{1,5}', post_text)
    
    if x1:
        x.extend(x1)
    if x2:
        x.extend(x2)
    return x 

In [88]:
## lets make the attempt with nltk and see what we get
from nltk.corpus import stopwords
from nltk import FreqDist
import collections
import string

def clean_post(post_text):
    translator = str.maketrans('', '', string.punctuation) # for removing punctuation
    post_text = post_text.translate(translator)
    token_text = [t for t in post_text.replace('\n',' ').split()]
    
    sr = stopwords.words('english')
    cleanTokens = [i for i in token_text if i not in sr]
    
    return cleanTokens

nasdaq = pd.read_csv('tickers/nasdaq.csv')
nyse = pd.read_csv('tickers/nyse.csv')
def check_tickers(freq_dist):
    for token in freq_dist:
        if type(token) == tuple: token = token[0]
        if any(nasdaq['Symbol'].isin([token])): return token
        if any(nyse['Symbol'].isin([token])): return token
    return None

In [89]:
good_post_csv = pd.read_csv('all_good_posts.csv')
all_post_csv = pd.read_csv('all_posts.csv')

In [90]:
def post_processor(post_csv):    
    numposts = 0
    textposts = 0
    positions = 0
    regexed = []
    regexed_title = []
    tickers_found_body = []
    tickers_found_title = []
    for index,row in tqdm(post_csv.iterrows()):
        numposts += 1
        nulltext = pd.isna(row['selftext'])
        r_text = []
        if not nulltext:
            textposts += 1
            r_text = regex_pos(row['selftext'])
            if r_text:
                positions += len(r_text)
                #print(r_text)
                regexed.append((r_text,index))
        r_title = regex_pos(row['title'])
        if r_title:
            positions += len(r_title)
            regexed_title.append((r_title,index))
        if not (r_text or nulltext) and not r_title: # if nothing was found in either title or text
            if not nulltext:
                freq = FreqDist(clean_post(row['selftext']))
                attempt = check_tickers(freq.most_common(10)) #checks only top ten freqs
                if attempt: 
                    tickers_found_body.append([attempt,row['selftext']])
                    #continue
            attempt = check_tickers(clean_post(row['title']))
            if attempt:
                tickers_found_title.append([attempt,row['title']])

    print('Total posts in CSV: {}'.format(numposts))
    print('Posts with text: {}'.format(textposts))
    print('Total positions found: {}'.format(positions))
    print('Success rate (body positions / all posts): {}%'.format(round(positions/numposts * 100,2)))
    print('Success rate (title positions / all posts): {}%'.format(round(title_positions/numposts * 100,2)))
    print('Tickers found in post texts regexed failed: {}'.format(len(tickers_found_body)))
    print('Tickers found in post titles regexed failed: {}'.format(len(tickers_found_title)))
    print('----------------------------------------------------------')
    #print(regexed)

In [22]:
post_processor(all_post_csv)

254252it [16:03, 263.97it/s]

Total posts in CSV: 254252
Posts with text: 142015
Total positions found: 11001
Success rate (body positions / all posts): 4.33%
Success rate (title positions / all posts): 0.96%
Tickers found in post texts regexed failed: 13013
Tickers found in post titles regexed failed: 36467





In [91]:
post_processor(good_post_csv)

27it [00:00, 132.42it/s]

['$T $37.5p 10/4']


189it [00:01, 165.25it/s]

['HLT 40P 5/1']
['HDB $35p 4/17', 'EWP $19p 4/17', 'BLK $295p 4/17', 'EEM $19.73p 9/18']


241it [00:01, 160.01it/s]

['VTI $70 puts 4/17']
['SPY 200p 4/13']
['DAX 7100 4/17', 'NDX 4500 17/4']
['V 167.5c 3/27']


280it [00:01, 175.44it/s]

['$SPCE 3/27 13.5p']
['CCL 3/27 $13p']
['SPY 200 P 5/15', 'INDA 20p 4/17', 'CCL 10p 4/17']
['$FLIR 35c 4/17', '$FLIR 40c 7/17', '$FLIR 50c 7/17', '$FLIR 35c 4/17', '$FLIR 40c 7/17', '$FLIR 50c 7/17']


318it [00:01, 182.30it/s]

['VXX 3/27 35P', 'VXX 4/3 35P']
['WYNN 04/17 40p']
['SPXS $25c 4/17']


422it [00:02, 197.83it/s]

['SLV $15C 4/17']
['SPY 160p 4/17', 'SPY 180p 7/17', 'YUMC 45p 7/17', 'LVS 35p 9/18', 'BAC 18p 9/18', 'SPCE 25p 10/16']
['$SPY 200 6/19']


494it [00:02, 203.51it/s]

['HSY $105p 4/17']
['GLD 150C 5/15']
['PLAY 10p 4/17']
['EIDO 10p 7/17']
['SPY 200p 4/17', 'SPY 200p 3/25']
['SPYP 5/15 180p']
['$SPY 180p 5/1']
['$LOW 4/17 $85C']


544it [00:02, 223.63it/s]

['$SPCE 3/27 13.5p', '$SPCE 3/27 13.5p']
['SPY 240p 3/27']
['PLNT $45p 4/17']


591it [00:03, 211.94it/s]

['SPY $200p 4/17']


660it [00:03, 208.05it/s]

['SPY $195 4/17']
['$SLV 15c 4/3']
['AEX 450c 4/17']
['SPY $225 4/17', 'SPY $215 05/01', 'SPY $215 05/01']


744it [00:04, 180.02it/s]

['SPY 25p 3/27', 'AAPL 500c 4/17', 'TWTR 18p 3/27', 'SPY 3/27 500c']


804it [00:04, 176.82it/s]

['TEVA 11P 4/17']


1350it [00:07, 161.14it/s]

['SPY 324c 2/10', 'SPCE 20c 1/15', 'NIO 10c 1/15', 'CRSP 55c 1/15', 'BA 310p 1/31', 'SPCE 25c 1/15']


1405it [00:07, 176.94it/s]


KeyboardInterrupt: 

In [None]:
print(len(regexed))
regexed

In [None]:
print(len(regexed_title))
regexed_title

In [55]:
pd.DataFrame(regexed)

Unnamed: 0,0,1
0,[BJ 50c 11/20],119
1,"[NKLA 30p 12/18, NKLA 20p 12/18]",194
2,[ALB 90c 10/16],207
3,[ASML 390c 10/9],250
4,[$SPY 3/20 $300 poot],639
...,...,...
1494,[TSLA 310p 9/11],109663
1495,[SHW 550p 10/16],109664
1496,[TSLA 9/11 320 call],109737
1497,[FIT 5p 1/22],109822


In [36]:
len(tickers_found)

9403

In [49]:
type(freq.most_common(1)[0]) == tuple

True