In [1]:
import requests
import time
import datetime
import pandas as pd
import sys


def scraper(ticker, latest_xhr_id='0', max_volume=50000, start_date='2020-01-01', end_date='2020-12-31'):

    
    inv_table = {}
    
    # Push Errors if scrape volume less than 0:
    if max_volume <= 0:
        sys.exit("Error: max_volume must be more than 0")

    start = time.time()
    master_content = []  # List to store all data extracted
    scroll_list = [latest_xhr_id]  # List to store all XHR id to be part of the url parameters
    tracker_list = []  # List containing integers for tracking progress
    tracker = 0
    fail_count = 0

    for x in range(5001):
        if x > 0:
            addition = x * 100
            tracker_list.append(addition)

    # Running for loop for collecting data from stocktwits. Each loop collects 20 comments.
    for _ in range(max_volume):
        try:
            headers = {
                'authority': 'api.stocktwits.com',
                'accept': 'application/json',
                'authorization': 'OAuth 6439333424451d1c85e731fb126006f7780192d2',
                'user-agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                               'Chrome/87.0.4280.88 Safari/537.36'),
                'origin': 'https://stocktwits.com',
                'sec-fetch-site': 'same-site',
                'sec-fetch-mode': 'cors',
                'sec-fetch-dest': 'empty',
                'referer': 'https://stocktwits.com/',
                'accept-language': 'en-US,en;q=0.9',
            }

            params = (
                ('symbols',ticker),
                ('filter', 'all'),
                ('limit', '30'),
                ('max', scroll_list[-1]),
            )

            response = requests.get(f'https://api.stocktwits.com/api/2/streams/symbols.json',
                                    headers=headers, params=params)
            content = response.json()
            messages = content['messages']
            # Creating dictionary for items scraped
            for item in messages:
                content_dict = {}
                content_dict['Doc_id'] = item['id']
                content_dict['Message'] = item['body']
                content_dict['Date'] = item['created_at'].split('T')[0]
                content_dict['Time'] = item['created_at'].split('T')[1]
                content_dict['Symbols'] = []
                
                
                try:
                    content_dict['Sentiment'] = item['entities']['sentiment']['basic']
                except TypeError:
                    content_dict['Sentiment'] = "N/A"
                    
                for i in range(len(item['symbols'])):
                    label = item['symbols'][i]['symbol']
                    content_dict['Symbols'].append(label)
                    
                    if label in inv_table:
                        inv_table[label].append(content_dict['Doc_id'])
                    else:
                        inv_table[label] = [content_dict['Doc_id']]

                master_content.append(content_dict.copy())
                    

            next_20_id = str(messages[-1]['id'])
            scroll_list.append(next_20_id)

            # Progress Tracker
            tracker += 1

            # Variables for tracker
            last_date = datetime.datetime.strptime(end_date, '%Y-%m-%d')
            first_date = datetime.datetime.strptime(start_date, '%Y-%m-%d')

            diff = last_date - first_date

            three_quarter_done = first_date + diff/4
            half_done = first_date + diff/2
            one_quarter_done = first_date + diff*3/4

            # Trackers
            for number in tracker_list:
                if tracker == number:
                    print(f"Extracted {number}...")
                    print(f"run time = {time.time() - start}")  # Check run time

            if (master_content[-1]['Time'].split(":")[0] == "00" and
                    master_content[-1]['Date'] == f'{one_quarter_done}'):
                print("25% done")

            elif (master_content[-1]['Time'].split(":")[0] == "00" and
                    master_content[-1]['Date'] == f'{half_done}'):
                print("50% done")

            elif (master_content[-1]['Time'].split(":")[0] == "00" and
                    master_content[-1]['Date'] == f'{three_quarter_done}'):
                print("75% done")

            elif (master_content[-1]['Time'].split(":")[0] == "00" and
                    master_content[-1]['Date'] == f'{first_date}'):
                print("100% done")
                print(f'Number of tweets unable to scrape: {fail_count * 20}')
                break

        except:
            fail_count += 1

    print(f"Number of tweets scraped: {len(master_content)}")
    print(f"Last Tweet: {master_content[-1]}")

    df = pd.DataFrame(master_content)
    
    for i in inv_table:
        inv_table[i].sort()
        
    return df,inv_table


if __name__ == "__main__":
    tweets_df,inv_table = scraper ("AAPL,BABA",max_volume = 5)
    

Number of tweets scraped: 150
Last Tweet: {'Doc_id': 286953362, 'Message': '$BABA 267 right now based on HK price', 'Date': '2021-02-10', 'Time': '01:49:20Z', 'Symbols': ['BABA'], 'Sentiment': 'Bullish'}


In [2]:
inv_table

{'BABA': [286953362,
  286954194,
  286954501,
  286956277,
  286957534,
  286960462,
  286963674,
  286964080,
  286964814,
  286965030,
  286965830,
  286967763,
  286969732,
  286969836,
  286970127,
  286972285,
  286972450,
  286974856,
  286975446,
  286976296,
  286979643,
  286981619,
  286985084,
  286985779,
  286985896,
  286986384,
  286988148,
  286988521,
  286989199,
  286989619,
  286991115,
  286993328,
  286995441,
  286995583,
  286997551,
  286997747,
  286997831,
  286998322,
  287000530,
  287000693,
  287002002,
  287003153,
  287003298],
 'AAPL': [286953369,
  286954163,
  286955865,
  286956203,
  286957408,
  286957737,
  286958192,
  286958307,
  286958468,
  286958516,
  286960481,
  286960675,
  286960907,
  286962300,
  286962458,
  286962923,
  286963429,
  286963756,
  286963979,
  286965111,
  286965333,
  286967336,
  286967386,
  286967566,
  286967645,
  286968097,
  286968653,
  286969275,
  286969293,
  286969406,
  286969742,
  286969981,
  286972

In [3]:
tweets = tweets_df.copy()

In [7]:
import nltk
# nltk.download()
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer

stop_words = set(stopwords.words("english"))
tickers = list(inv_table.keys())
add_stop = ['@', '$']

lem = WordNetLemmatizer()

In [8]:
def token_filter_lemmatize(msg):
    words = word_tokenize(msg)
    
    
    processed = []
    
    
    
    for w in words:
        if (w not in stop_words) and (w not in tickers) and (w not in add_stop):
            processed.append(lem.lemmatize(w))
            
    return processed

tweets['Words'] = tweets['Message'].apply(token_filter_lemmatize)

#tweets.to_csv(r'Database.csv', index=False)
tweets

Unnamed: 0,Date,Doc_id,Message,Sentiment,Symbols,Time,Words
0,2021-02-10,287003298,$BABA very sexy pump in HK,Bullish,[BABA],04:12:57Z,"[sexy, pump, HK]"
1,2021-02-10,287003153,$BABA 262.60=US 271.31,,[BABA],04:12:24Z,"[262.60=US, 271.31]"
2,2021-02-10,287002629,$AAPL 2DAMOON!🚀🚀🚀🚀🚀,,[AAPL],04:10:29Z,"[2DAMOON, !, 🚀🚀🚀🚀🚀]"
3,2021-02-10,287002605,$AAPL my 🍏 call gang hang in there good news i...,Bullish,[AAPL],04:10:25Z,"[🍏, call, gang, hang, good, news, coming]"
4,2021-02-10,287002548,$GME $T $AAPL $VZ $TSNP,Bullish,"[AAPL, GME, T, VZ, TSNP]",04:10:12Z,[]
5,2021-02-10,287002343,$AAPL \nI see many penny stock pumpers are usi...,,[AAPL],04:09:32Z,"[I, see, many, penny, stock, pumpers, using, a..."
6,2021-02-10,287002063,$HCMC $MSFT $SSNLF $AAPL \nAnother penny stoc...,,"[AAPL, MSFT, SSNLF, HCMC]",04:08:29Z,"[Another, penny, stock, pumper, apple, board, ..."
7,2021-02-10,287002002,$BABA green today. Futures green rn. Hk up 2.5...,Bullish,[BABA],04:08:17Z,"[green, today, ., Futures, green, rn, ., Hk, 2..."
8,2021-02-10,287001643,$HCMC up higher higher every week . $MSFT $A...,Bullish,"[AAPL, MSFT, SSNLF, HCMC]",04:07:00Z,"[higher, higher, every, week, .]"
9,2021-02-10,287001590,$AAPL if this dog sheeeit doesnt gap up in the...,,[AAPL],04:06:48Z,"[dog, sheeeit, doesnt, gap, im, gon, na, start..."


In [9]:
from collections import defaultdict

inverted_index = defaultdict(set)

docIndex = tweets.columns.get_loc("Doc_id")
wordsIndex = tweets.columns.get_loc("Words")

for index, doc in tweets.iterrows():
    docID = doc[docIndex]
    words = doc[wordsIndex]
    
    for w in words:
        inverted_index[w].add(docID)
        

In [10]:
def search(query, df = tweets):
    matched_documents = set()
    
    for word in word_tokenize(query):
        word_lower = word.lower()
        if word_lower not in stop_words:
            word = lem.lemmatize(word_lower)

            matches = inverted_index.get(word)
            if matches:
                # The operator |= is a short hand for set union
                matched_documents |= matches
                
    return df[df['Doc_id'].isin(matched_documents)]

In [11]:
search("apple")

Unnamed: 0,Date,Doc_id,Message,Sentiment,Symbols,Time,Words
5,2021-02-10,287002343,$AAPL \nI see many penny stock pumpers are usi...,,[AAPL],04:09:32Z,"[I, see, many, penny, stock, pumpers, using, a..."
6,2021-02-10,287002063,$HCMC $MSFT $SSNLF $AAPL \nAnother penny stoc...,,"[AAPL, MSFT, SSNLF, HCMC]",04:08:29Z,"[Another, penny, stock, pumper, apple, board, ..."
9,2021-02-10,287001590,$AAPL if this dog sheeeit doesnt gap up in the...,,[AAPL],04:06:48Z,"[dog, sheeeit, doesnt, gap, im, gon, na, start..."
10,2021-02-10,287000766,$AAPL $USMJ \nAnother penny stock pumper in a...,,"[AAPL, USMJ]",04:03:57Z,"[Another, penny, stock, pumper, apple, board, ..."
17,2021-02-10,286998468,$AAPL futures deep green. maybe they&#39;ll le...,,[AAPL],03:55:51Z,"[future, deep, green, ., maybe, &, #, 39, ;, l..."
56,2021-02-10,286986311,$AAPL \nOfflate...there is lot of fake news be...,Bearish,[AAPL],03:17:53Z,"[Offlate, ..., lot, fake, news, propagated, me..."
72,2021-02-10,286982376,$AAPL the apple car thing is actually quite in...,,[AAPL],03:06:34Z,"[apple, car, thing, actually, quite, ingenious..."
86,2021-02-10,286976169,$AAPL \nWho buys apple headset for 600$?\nWho ...,,[AAPL],02:49:04Z,"[Who, buy, apple, headset, 600, ?, Who, buy, u..."
91,2021-02-10,286975084,$AAPL \nApple doesnt implement crypto wallet.....,Bearish,"[AAPL, AMD, SPY]",02:46:09Z,"[Apple, doesnt, implement, crypto, wallet, ......"


In [None]:
tweets