In [51]:
import requests
import time
import datetime
import pandas as pd
import sys
import string


def scraper(ticker, latest_xhr_id='0', max_volume=50000, start_date='2020-01-01', end_date='2020-12-31'):

    
    inv_table = {}
    
    # Push Errors if scrape volume less than 0:
    if max_volume <= 0:
        sys.exit("Error: max_volume must be more than 0")

    start = time.time()
    master_content = []  # List to store all data extracted
    scroll_list = [latest_xhr_id]  # List to store all XHR id to be part of the url parameters
    tracker_list = []  # List containing integers for tracking progress
    tracker = 0
    fail_count = 0

    for x in range(5001):
        if x > 0:
            addition = x * 100
            tracker_list.append(addition)

    # Running for loop for collecting data from stocktwits. Each loop collects 20 comments.
    for _ in range(max_volume):
        try:
            headers = {
                'authority': 'api.stocktwits.com',
                'accept': 'application/json',
                'authorization': 'OAuth 6439333424451d1c85e731fb126006f7780192d2',
                'user-agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                               'Chrome/87.0.4280.88 Safari/537.36'),
                'origin': 'https://stocktwits.com',
                'sec-fetch-site': 'same-site',
                'sec-fetch-mode': 'cors',
                'sec-fetch-dest': 'empty',
                'referer': 'https://stocktwits.com/',
                'accept-language': 'en-US,en;q=0.9',
            }

            params = (
                ('symbols',ticker),
                ('filter', 'all'),
                ('limit', '30'),
                ('max', scroll_list[-1]),
            )

            response = requests.get(f'https://api.stocktwits.com/api/2/streams/symbols.json',
                                    headers=headers, params=params)
            content = response.json()
            messages = content['messages']
            # Creating dictionary for items scraped
            for item in messages:
                content_dict = {}
                content_dict['Doc_id'] = item['id']
                content_dict['Message'] = item['body']
                content_dict['Date'] = item['created_at'].split('T')[0]
                content_dict['Time'] = item['created_at'].split('T')[1]
                content_dict['Symbols'] = []
                
                
                try:
                    content_dict['Sentiment'] = item['entities']['sentiment']['basic']
                except TypeError:
                    content_dict['Sentiment'] = "N/A"
                    
                for i in range(len(item['symbols'])):
                    label = item['symbols'][i]['symbol']
                    content_dict['Symbols'].append(label)
                    
                    if label in inv_table:
                        inv_table[label].append(content_dict['Doc_id'])
                    else:
                        inv_table[label] = [content_dict['Doc_id']]

                master_content.append(content_dict.copy())
                    

            next_20_id = str(messages[-1]['id'])
            scroll_list.append(next_20_id)

            # Progress Tracker
            tracker += 1

            # Variables for tracker
            last_date = datetime.datetime.strptime(end_date, '%Y-%m-%d')
            first_date = datetime.datetime.strptime(start_date, '%Y-%m-%d')

            diff = last_date - first_date

            three_quarter_done = first_date + diff/4
            half_done = first_date + diff/2
            one_quarter_done = first_date + diff*3/4

            # Trackers
            for number in tracker_list:
                if tracker == number:
                    print(f"Extracted {number}...")
                    print(f"run time = {time.time() - start}")  # Check run time

            if (master_content[-1]['Time'].split(":")[0] == "00" and
                    master_content[-1]['Date'] == f'{one_quarter_done}'):
                print("25% done")

            elif (master_content[-1]['Time'].split(":")[0] == "00" and
                    master_content[-1]['Date'] == f'{half_done}'):
                print("50% done")

            elif (master_content[-1]['Time'].split(":")[0] == "00" and
                    master_content[-1]['Date'] == f'{three_quarter_done}'):
                print("75% done")

            elif (master_content[-1]['Time'].split(":")[0] == "00" and
                    master_content[-1]['Date'] == f'{first_date}'):
                print("100% done")
                print(f'Number of tweets unable to scrape: {fail_count * 20}')
                break

        except:
            fail_count += 1

    print(f"Number of tweets scraped: {len(master_content)}")
    print(f"Last Tweet: {master_content[-1]}")

    df = pd.DataFrame(master_content)
    
    for i in inv_table:
        inv_table[i].sort()
        
    return df,inv_table


if __name__ == "__main__":
    tweets_df,inv_table = scraper ("AAPL,BABA",max_volume = 5)
    

Number of tweets scraped: 150
Last Tweet: {'Doc_id': 286986311, 'Message': '$AAPL \nOfflate...there is lot of fake news being propagated by media....\nSlowly the world is realizing that the products laumced by apple are crap...\nLook at the sales ofniphone mini....disaster product....apple tv...no one is buying it....', 'Date': '2021-02-10', 'Time': '03:17:53Z', 'Symbols': ['AAPL'], 'Sentiment': 'Bearish'}


In [52]:
inv_table

{'AAPL': [286986311,
  286986485,
  286986616,
  286988148,
  286988971,
  286989930,
  286990349,
  286991053,
  286991650,
  286991985,
  286992095,
  286992100,
  286992479,
  286992811,
  286993007,
  286993378,
  286994350,
  286994410,
  286994544,
  286994942,
  286995039,
  286995429,
  286995518,
  286995605,
  286995734,
  286997022,
  286997623,
  286998468,
  286998688,
  286999361,
  286999799,
  287000392,
  287000766,
  287001590,
  287001643,
  287002063,
  287002343,
  287002548,
  287002605,
  287002629,
  287004109,
  287004196,
  287004260,
  287004361,
  287004393,
  287004398,
  287004408,
  287004415,
  287004470,
  287004481,
  287004840,
  287004884,
  287005418,
  287006012,
  287006201,
  287006737,
  287006783,
  287006858,
  287007651,
  287007807,
  287007936,
  287008057,
  287008084,
  287008089,
  287008292,
  287008826,
  287009139,
  287009815,
  287010189,
  287011165,
  287011222,
  287011229,
  287011281,
  287011443,
  287011600,
  287012178,
  28

# Tokenize, Filter and Lemmatize

In [199]:
tweets = tweets_df.copy()

In [200]:
import nltk
# nltk.download()
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams

stop_words = set(stopwords.words("english"))
tickers = list(inv_table.keys())

lem = WordNetLemmatizer()

In [201]:
def token_filter_lemmatize(msg):
    words = word_tokenize(msg)
    
    
    processed = []
    
    
    
    for w in words:
        if (w not in stop_words) and (w not in tickers) and (w not in string.punctuation):
            processed.append(lem.lemmatize(w))
            
    return processed

tweets['Words'] = tweets['Message'].apply(token_filter_lemmatize)

#tweets.to_csv(r'Database.csv', index=False)
tweets

Unnamed: 0,Date,Doc_id,Message,Sentiment,Symbols,Time,Words
0,2021-02-10,287028871,"$AAPL ASTI, future of Solar!",,[AAPL],06:12:44Z,"[ASTI, future, Solar]"
1,2021-02-10,287028433,Amazing week from the gains of over 2490% and ...,Bullish,"[AAPL, BA, BAC, C, AAL]",06:10:07Z,"[Amazing, week, gain, 2490, 15, return, A, big..."
2,2021-02-10,287028295,$AAPL .... $UATG !!!!,Bullish,"[AAPL, UATG]",06:09:07Z,[...]
3,2021-02-10,287028151,$AAPL is the dividend payout tomorrow? How is ...,,[AAPL],06:08:05Z,"[dividend, payout, tomorrow, How, done, Curren..."
4,2021-02-10,287028098,$aapl Mount Sinai study finds Apple Watch can ...,,[AAPL],06:07:41Z,"[aapl, Mount, Sinai, study, find, Apple, Watch..."
5,2021-02-10,287027909,$AAPL 140 by Friday,Bullish,[AAPL],06:06:23Z,"[140, Friday]"
6,2021-02-10,287027250,$spy $amd $aapl $baba $tsla\n\n&#39;&#39;For a...,,"[AAPL, AMD, SPY, TSLA, BABA]",06:02:09Z,"[spy, amd, aapl, baba, tsla, 39, 39, For, limi..."
7,2021-02-10,287026945,$SNDL $4.20 tomorrow! $TSLA $AAPL $AMC $GME ht...,,"[AAPL, GME, TSLA, AMC, SNDL]",06:00:13Z,"[4.20, tomorrow, http, //www.reddit.com/r/wall..."
8,2021-02-10,287026848,$TSLA ur done . Everyone is getting in . I&#3...,,"[AAPL, SPY, TSLA]",05:59:36Z,"[ur, done, Everyone, getting, I, 39, going, Gr..."
9,2021-02-10,287026794,$AAPL I was watching Andrei Jikh on YouTube wh...,,[AAPL],05:59:18Z,"[I, watching, Andrei, Jikh, YouTube, mentioned..."


# Inverted_Index and Search

In [223]:
company_ticker = {"baba":"alibaba",
                  "amzn":"amazon",
                  "appl":"aaple",
                  "tsla":"tesla",
                  "msft":"microsoft",
                  "fb":"facebook",
                  "googl":"google",
                  "nio":"nio",
                  "twtr":"twitter",
                  "nflx":"netflix"
                 }

In [224]:
from collections import defaultdict

inverted_index = defaultdict(set)
# bigram_inverted_index = defaultdict(set)

docIndex = tweets.columns.get_loc("Doc_id")
wordsIndex = tweets.columns.get_loc("Words")

for index, doc in tweets.iterrows():
    docID = doc[docIndex]
    words = doc[wordsIndex]
    
    i = 0
#     first = True
    
    for w in words:
        
        if w in company_ticker:
            w = company_ticker[w]
        
        inverted_index[w].add((docID,i))
        
#         if not first:
#             bigram = prev+' '+cur
#             bigram_inverted_index[bigram].add((docID,i-1))
            
#         first = False
#         prev = cur

        i+=1

In [225]:
inverted_index

defaultdict(set,
            {'ASTI': {(287028871, 0)},
             'future': {(286989930, 2), (286998468, 0), (287028871, 1)},
             'Solar': {(287028871, 2)},
             'Amazing': {(286998688, 0),
              (287012346, 0),
              (287020275, 0),
              (287028433, 0)},
             'week': {(286997551, 6),
              (286998688, 1),
              (286999361, 9),
              (287000530, 8),
              (287001643, 3),
              (287011165, 29),
              (287012346, 1),
              (287020275, 1),
              (287028098, 10),
              (287028433, 1)},
             'gain': {(286995429, 5),
              (286998688, 2),
              (287007650, 14),
              (287012346, 2),
              (287019444, 6),
              (287020275, 2),
              (287027250, 18),
              (287028433, 2)},
             '2490': {(286998688, 3),
              (287012346, 3),
              (287020275, 3),
              (287028433, 3)},
        

In [233]:
available_words = pd.Series(list(inverted_index.keys()))

def search(query, df = tweets):
    matched_documents = set()
    
    words = word_tokenize(query)
    
    prev = None
    
    for word in words:
        word_lower = word.lower()
        if word_lower not in stop_words:
            cur = lem.lemmatize(word_lower)
            
            reco_cur = JDreco(cur)
            print(reco_cur)
            
            match1 = inverted_index.get(reco_cur)

            if match1:
                # The operator |= is a short hand for set union
                matched_documents |= match1
                
    match_index = [item[0] for item in matched_documents]
    return df[df['Doc_id'].isin(match_index)]


def jaccard(entry, gram_number):
    spellings = available_words[available_words.str.startswith(entry[0])]
    distances = ((jaccard_distance(set(ngrams(entry,gram_number)),
                                       set(ngrams(word,gram_number))), word)
                 for word in spellings)

    closest = min(distances)
    return closest[1]


def JDreco(entry):
    return jaccard(entry, 2)

In [234]:
def search(query, df = tweets):
    matched_documents = set()
    
    words = word_tokenize(query)
    
    prev = None
    
    for word in words:
        word_lower = word.lower()
        if word_lower not in stop_words:
            cur = lem.lemmatize(word_lower)
            
            if cur in company_ticker:
                cur = company_ticker[cur]
            
            reco_cur = JDreco(cur)
            print("Recommended: "+reco_cur)
            
            match1 = inverted_index.get(reco_cur)

            if match1:
                # The operator |= is a short hand for set union
                matched_documents |= match1
                
    match_index = [item[0] for item in matched_documents]
    return df[df['Doc_id'].isin(match_index)]

In [236]:
search("tsla")

Recommended: tesla
Recommended: morn


Unnamed: 0,Date,Doc_id,Message,Sentiment,Symbols,Time,Words
6,2021-02-10,287027250,$spy $amd $aapl $baba $tsla\n\n&#39;&#39;For a...,,"[AAPL, AMD, SPY, TSLA, BABA]",06:02:09Z,"[spy, amd, aapl, baba, tsla, 39, 39, For, limi..."
30,2021-02-10,287018563,How Options Traders Printed Millions in Profit...,,"[AAPL, SPY, TSLA]",05:16:02Z,"[How, Options, Traders, Printed, Millions, Pro..."
57,2021-02-10,287009815,$AAPL CrApple better GApple in the morn or im ...,,[AAPL],04:38:05Z,"[CrApple, better, GApple, morn, im, flushing, ..."
90,2021-02-10,287004109,How To Manage Your Capital As a Profitable Opt...,,"[AAPL, SPY, TSLA]",04:16:02Z,"[How, To, Manage, Your, Capital, As, Profitabl..."
106,2021-02-10,287000392,$AAPL \nSplitting of shares is thr biggest mis...,,[AAPL],04:02:42Z,"[Splitting, share, thr, biggest, mistake, ...,..."


# Separating Train Data for Model

In [208]:
train = tweets[['Sentiment', 'Words']]
train
train.to_csv(r'Train.csv', index=False)