In [1]:
import pandas as pd
from pathlib import Path
import os

# The data folder lives two levels above the notebook's working directory.
directory = Path(os.getcwd()).parent.parent / "data"

# NOTE(review): unpickling can execute arbitrary code; only load this file
# if it comes from a trusted source.
df = pd.read_pickle(directory / "stocktwits_processed_without_multiple_full.pkl")
df

Unnamed: 0,id,text,time,sentiment,symbols
0,411896118,I’m comfortable at $127.50. I’ll check back in...,1638291431,Bullish,$BABA
1,411895020,how low does this POS go tomorrow is the milli...,1638291222,,$BABA
2,411894156,@Nobrainer6868 started my first 5% position t...,1638291057,,$BABA
3,411894124,You guys want me to sell down here? Look at th...,1638291050,,$BABA
4,411893846,at this price point shorts are better off flip...,1638291000,,$BABA
...,...,...,...,...,...
386926,420618061,USBancorp BidaskScore is #Reiterated to Stron...,1640479505,,$USB
386927,420617596,Short sale volume (not short interest) for on...,1640478214,,$USB
386928,420610698,SweepCast observed: with Unusual Options Acti...,1640466364,,$USB
386929,420437859,How does this affect your portfolio? 's in Upt...,1640326224,,$USB


In [11]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_top_n_words(df, threshold, target):
    """Return the terms whose TF-IDF score for one symbol exceeds a threshold.

    The messages of each distinct symbol are stripped and joined with ". "
    into a single document (one document per symbol). A TF-IDF model with
    English stop words removed is fitted on those documents, and the scores
    of the document belonging to ``target`` are returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``symbols`` column and a ``text`` column of strings.
    threshold : float
        Only terms scoring strictly above this value are kept.
    target : object
        The value of ``symbols`` whose document should be scored.

    Returns
    -------
    pandas.Series
        TF-IDF scores indexed by term, named ``target``, sorted in
        descending order and restricted to scores ``> threshold``.

    Raises
    ------
    KeyError
        If ``target`` does not occur in ``df['symbols']``.
    """
    # One document per symbol, in order of first appearance so that the
    # document order is deterministic across runs.
    symbols = df['symbols'].unique().tolist()
    corpus = [
        ". ".join(text.strip() for text in df.loc[df['symbols'] == symbol, 'text'])
        for symbol in symbols
    ]

    # Words to vectors.
    vectorizer = TfidfVectorizer(stop_words='english')
    vecs = vectorizer.fit_transform(corpus)

    # get_feature_names() was removed in scikit-learn 1.2; keep a fallback
    # for installations that predate get_feature_names_out().
    if hasattr(vectorizer, "get_feature_names_out"):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    if target not in symbols:
        raise KeyError(target)

    # Densify only the row of the requested symbol instead of the whole
    # (symbols x vocabulary) matrix.
    target_row = vecs[symbols.index(target)].toarray().ravel()
    scores = pd.Series(target_row, index=feature_names, name=target)

    scores = scores.sort_values(ascending=False)
    return scores[scores > threshold]

In [12]:
# Collect, for every distinct symbol, the terms scoring above 0.1 in that
# symbol's TF-IDF document; the combined list may contain repeated terms.
words = []
for symbol in set(df.symbols):
    top_terms = list(tfidf_top_n_words(df, 0.1, symbol).index)
    print(top_terms)
    words.extend(top_terms)

['oracle', 'cerner', 'orcl', '100', 'options', 'cloud', 'new']
['coca', 'cola', 'alerted', 'sweepcast', 'options', 'unusual', 'expiring', 'activity', 'worth', 'bodyarmor', 'allen', 'ko', 'new', '2022', 'company']
['300', 'nvidia', 'nvda', 'today', 'market', 'calls', 'buy', 'just', 'day', 'stock', 'going', 'like', 'tomorrow', 'week', 'time', '310']
['filing', '424b2', 'prospectus', '424', 'filed', 'form', 'chase', 'sec', 'rule', 'fwp', 'stockorbit', 'simulated', 'jpmorgan', 'dollar', 'morgan', 'calls', 'open']
['citigroup', 'simulated', 'stockorbit', 'calls', 'fwp', 'citi', 'dollar', 'open', 'prospectuses', 'options', '62', '433', 'filed', 'form', 'liquidtheta', 'sec', '163', 'tp2', 'tp3', '2022', 'inbox', 'tp1', 'buy', 'filing', 'optional']
['bitcoin', 'bears', 'btc', 'buy', 'crypto', 'just', '50k', 'like', 'going', 'time', 'lol', 'dubai', 'don', '60k', 'dip']
['bancorp', 'redemption', 'announces']
['jd', 'baba', 'china', 'hk', 'com', 'alerted', 'sweepcast', 'today']
['amazon', 'aws', 

In [15]:
import collections

# Tally how many times each term appears in the collected keyword list.
occurrences = collections.Counter(words)

# Filled by a later cell with the terms that are kept.
final_words = []

occurrences

Counter({'oracle': 1,
         'cerner': 1,
         'orcl': 1,
         '100': 1,
         'options': 15,
         'cloud': 1,
         'new': 5,
         'coca': 1,
         'cola': 1,
         'alerted': 15,
         'sweepcast': 12,
         'unusual': 11,
         'expiring': 5,
         'activity': 4,
         'worth': 3,
         'bodyarmor': 1,
         'allen': 1,
         'ko': 1,
         '2022': 5,
         'company': 2,
         '300': 1,
         'nvidia': 1,
         'nvda': 1,
         'today': 11,
         'market': 11,
         'calls': 18,
         'buy': 13,
         'just': 11,
         'day': 6,
         'stock': 11,
         'going': 10,
         'like': 11,
         'tomorrow': 6,
         'week': 3,
         'time': 6,
         '310': 1,
         'filing': 3,
         '424b2': 1,
         'prospectus': 1,
         '424': 1,
         'filed': 6,
         'form': 6,
         'chase': 1,
         'sec': 5,
         'rule': 1,
         'fwp': 5,
         'stockorbi

In [16]:
# Drop terms that were selected for more than this fraction of all symbols.
remove_threshold = 0.125

# Loop-invariant cut-off: computed once instead of rebuilding the symbol set
# for every term.
max_symbol_count = len(set(df.symbols)) * remove_threshold

# Reset here so that re-running this cell does not append duplicates.
final_words = []
for word in occurrences:
    if occurrences[word] > max_symbol_count:
        print('REMOVED: ', word)
    else:
        final_words.append(word)


REMOVED:  options
REMOVED:  new
REMOVED:  alerted
REMOVED:  sweepcast
REMOVED:  unusual
REMOVED:  expiring
REMOVED:  2022
REMOVED:  today
REMOVED:  market
REMOVED:  calls
REMOVED:  buy
REMOVED:  just
REMOVED:  day
REMOVED:  stock
REMOVED:  going
REMOVED:  like
REMOVED:  tomorrow
REMOVED:  time
REMOVED:  filed
REMOVED:  form
REMOVED:  sec
REMOVED:  fwp
REMOVED:  stockorbit
REMOVED:  simulated
REMOVED:  dollar
REMOVED:  open
REMOVED:  bears


In [7]:
# Display the terms that were kept after filtering out the overly common ones.
# NOTE(review): this cell's execution count is lower than the filtering
# cell's, so the saved output may be stale; re-run top to bottom to confirm.
final_words

['oracle',
 'cerner',
 'orcl',
 'cloud',
 'coca',
 'cola',
 'bodyarmor',
 'allen',
 'ko',
 'coke',
 'herbert',
 'nvidia',
 'nvda',
 '310',
 'arm',
 '320',
 '290',
 '305',
 '330',
 '280',
 'nasdaq',
 '424b2',
 'prospectus',
 '424',
 'chase',
 'rule',
 'jpmorgan',
 '162',
 'jp',
 'citigroup',
 'citi',
 'prospectuses',
 '433',
 'liquidtheta',
 '163',
 'tp3',
 'tp2',
 'inbox',
 'tp1',
 'optional',
 'writing',
 'actionable',
 'bitcoin',
 'btc',
 '50k',
 'dubai',
 '60k',
 '100k',
 'salvador',
 'coins',
 '42k',
 'bancorp',
 'redemption',
 'depositary',
 'jd',
 'baba',
 'hk',
 'tencent',
 'chinese',
 'kong',
 'hsi',
 'amazon',
 'aws',
 'amzn',
 '3500',
 '3400',
 '3450',
 '3600',
 'split',
 'outage',
 '3550',
 '3300',
 'walton',
 'walmart',
 'waltons',
 'wmt',
 'robson',
 '140',
 'alice',
 'disney',
 'walt',
 'dis',
 '155',
 '157',
 'parks',
 'intel',
 'mobileye',
 'intc',
 'paypal',
 'pypl',
 '190',
 '195',
 '192',
 '187',
 '180',
 '188',
 'microsoft',
 'msft',
 '337',
 '340',
 '335',
 '327',


In [8]:
# Persist the kept terms, one per line.
output_path = Path('tfidf_words/tfidf_words.txt')
# Create the target folder if needed so the cell works on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)
# Explicit encoding keeps the file identical across platforms.
with open(output_path, 'w', encoding='utf-8') as filehandle:
    filehandle.writelines('%s\n' % listitem for listitem in final_words)