In [32]:
import requests
import time
import datetime
import pandas as pd
import sys
import string
import pyodbc

In [40]:
def scraper(ticker, latest_xhr_id='268864272', max_volume=50000, start_date='2020-01-01', end_date='2020-12-31'):

    
    inv_table = {}
    
    # Push Errors if scrape volume less than 0:
    if max_volume <= 0:
        sys.exit("Error: max_volume must be more than 0")

    start = time.time()
    master_content = []  # List to store all data extracted
    scroll_list = [latest_xhr_id]  # List to store all XHR id to be part of the url parameters
    tracker_list = []  # List containing integers for tracking progress
    tracker = 0
    fail_count = 0

    for x in range(5001):
        if x > 0:
            addition = x * 100
            tracker_list.append(addition)

    # Running for loop for collecting data from stocktwits. Each loop collects 20 comments.
    for _ in range(max_volume):
        try:
            headers = {
                'authority': 'api.stocktwits.com',
                'accept': 'application/json',
                'authorization': 'OAuth 6439333424451d1c85e731fb126006f7780192d2',
                'user-agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                               'Chrome/87.0.4280.88 Safari/537.36'),
                'origin': 'https://stocktwits.com',
                'sec-fetch-site': 'same-site',
                'sec-fetch-mode': 'cors',
                'sec-fetch-dest': 'empty',
                'referer': 'https://stocktwits.com/',
                'accept-language': 'en-US,en;q=0.9',
            }

            params = (
                ('symbols',ticker),
                ('filter', 'all'),
                ('limit', '30'),
                ('max', scroll_list[-1]),
                ('since', 224610272)
            )

            response = requests.get(f'https://api.stocktwits.com/api/2/streams/symbols.json',
                                    headers=headers, params=params)
            content = response.json()
            messages = content['messages']
            # Creating dictionary for items scraped
            for item in messages:
                content_dict = {}
                content_dict['Doc_id'] = item['id']
                content_dict['Message'] = item['body']
                content_dict['Date'] = item['created_at'].split('T')[0]
                content_dict['Time'] = item['created_at'].split('T')[1]
                content_dict['Symbols'] = []
                
                
                try:
                    content_dict['Sentiment'] = item['entities']['sentiment']['basic']
                except TypeError:
                    content_dict['Sentiment'] = "N/A"
                    
                for i in range(len(item['symbols'])):
                    label = item['symbols'][i]['symbol']
                    content_dict['Symbols'].append(label)
                    
                    if label in inv_table:
                        inv_table[label].append(content_dict['Doc_id'])
                    else:
                        inv_table[label] = [content_dict['Doc_id']]

                master_content.append(content_dict.copy())
                    

            next_20_id = str(messages[-1]['id'])
            scroll_list.append(next_20_id)

            # Progress Tracker
            tracker += 1

            # Variables for tracker
            last_date = datetime.datetime.strptime(end_date, '%Y-%m-%d')
            first_date = datetime.datetime.strptime(start_date, '%Y-%m-%d')

            diff = last_date - first_date

            three_quarter_done = first_date + diff/4
            half_done = first_date + diff/2
            one_quarter_done = first_date + diff*3/4

            # Trackers
            for number in tracker_list:
                if tracker == number:
                    print(f"Extracted {number}...")
                    print(f"run time = {time.time() - start}")  # Check run time

            if (master_content[-1]['Time'].split(":")[0] == "00" and
                    master_content[-1]['Date'] == f'{one_quarter_done}'):
                print("25% done")

            elif (master_content[-1]['Time'].split(":")[0] == "00" and
                    master_content[-1]['Date'] == f'{half_done}'):
                print("50% done")

            elif (master_content[-1]['Time'].split(":")[0] == "00" and
                    master_content[-1]['Date'] == f'{three_quarter_done}'):
                print("75% done")

            elif (master_content[-1]['Time'].split(":")[0] == "00" and
                    master_content[-1]['Date'] == f'{first_date}'):
                print("100% done")
                print(f'Number of tweets unable to scrape: {fail_count * 20}')
                break

        except:
            fail_count += 1

    print(f"Number of tweets scraped: {len(master_content)}")
    print(f"Last Tweet: {master_content[-1]}")

    df = pd.DataFrame(master_content)
    
    for i in inv_table:
        inv_table[i].sort()
        
    return df,inv_table


if __name__ == "__main__":
    tweets_df,inv_table = scraper ("AAPL,BABA",max_volume = )
    

Number of tweets scraped: 60
Last Tweet: {'Doc_id': 268849354, 'Message': '$BABA the worst stock I have ever invested my money in.', 'Date': '2020-12-31', 'Time': '22:40:51Z', 'Symbols': ['BABA'], 'Sentiment': 'N/A'}


In [42]:
#to update the database 
tweets_df.to_csv('tweets.csv')

# Tokenize, Filter and Lemmatize

In [43]:
tweets_df = pd.read_csv('tweets.csv',nrows = 500)
tweets = tweets_df.copy()
tweets

Unnamed: 0.1,Unnamed: 0,Doc_id,Message,Date,Time,Symbols,Sentiment
0,0,268864074,$BABA buying more!!!!! Baba to $350-400 next y...,2020-12-31,23:59:48Z,['BABA'],Bullish
1,1,268863929,$AAPL $SPY $QQQ Will $AAPL hit 3 trillion mark...,2020-12-31,23:58:49Z,"['AAPL', 'SPY', 'QQQ']",Bullish
2,2,268863920,$AAPL so you would think it was a perfect day ...,2020-12-31,23:58:40Z,['AAPL'],Bullish
3,3,268863653,$AAPL buy price Apple for me $64 can’t wait to...,2020-12-31,23:56:52Z,['AAPL'],Bullish
4,4,268863594,$AAPL can i get a hi-5,2020-12-31,23:56:31Z,['AAPL'],
5,5,268863425,$SPY WS cheered Ma &amp; treated him like a K...,2020-12-31,23:55:29Z,"['BIDU', 'SPY', 'WFC', 'JD', 'BABA']",
6,6,268862960,$AAPL \nHappy New Year!! \nBy Melbenross. \nAp...,2020-12-31,23:52:32Z,['AAPL'],Bullish
7,7,268862897,"$AAPL hard to see now,eventually the bleeding ...",2020-12-31,23:52:10Z,['AAPL'],
8,8,268862612,$AMZN I think this symmetric triangle and all ...,2020-12-31,23:50:08Z,"['AAPL', 'AMZN', 'SPY', 'TSLA', 'QQQ']",Bullish
9,9,268862114,$AAPL Anyone stressed about this stock needs t...,2020-12-31,23:46:55Z,['AAPL'],Bullish


In [44]:
import nltk
#nltk.download()
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams

stop_words = set(stopwords.words("english"))

lem = WordNetLemmatizer()

In [45]:
company_ticker = {"baba":"alibaba",
                  "amzn":"amazon",
                  "aapl":"apple",
                  "tsla":"tesla",
                  "msft":"microsoft",
                  "fb":"facebook",
                  "googl":"google",
                  "nio":"nio",
                  "twtr":"twitter",
                  "nflx":"netflix"
                 }

In [46]:
def token_filter_lemmatize(msg):
    words = word_tokenize(msg)
    
    
    processed = []
    
    
    
    for w in words:
        if (w not in stop_words) and (w not in string.punctuation):
            if (w in company_ticker):
                w = company_ticker[w]
            processed.append(lem.lemmatize(w.lower()))
            
    return processed

tweets['Words'] = tweets['Message'].apply(token_filter_lemmatize)

#tweets.to_csv(r'Database.csv', index=False)
tweets

Unnamed: 0.1,Unnamed: 0,Doc_id,Message,Date,Time,Symbols,Sentiment,Words
0,0,268864074,$BABA buying more!!!!! Baba to $350-400 next y...,2020-12-31,23:59:48Z,['BABA'],Bullish,"[baba, buying, baba, 350-400, next, year, mark..."
1,1,268863929,$AAPL $SPY $QQQ Will $AAPL hit 3 trillion mark...,2020-12-31,23:58:49Z,"['AAPL', 'SPY', 'QQQ']",Bullish,"[aapl, spy, qqq, will, aapl, hit, 3, trillion,..."
2,2,268863920,$AAPL so you would think it was a perfect day ...,2020-12-31,23:58:40Z,['AAPL'],Bullish,"[aapl, would, think, perfect, day, buy, call, ..."
3,3,268863653,$AAPL buy price Apple for me $64 can’t wait to...,2020-12-31,23:56:52Z,['AAPL'],Bullish,"[aapl, buy, price, apple, 64, ’, wait, see, ne..."
4,4,268863594,$AAPL can i get a hi-5,2020-12-31,23:56:31Z,['AAPL'],,"[aapl, get, hi-5]"
5,5,268863425,$SPY WS cheered Ma &amp; treated him like a K...,2020-12-31,23:55:29Z,"['BIDU', 'SPY', 'WFC', 'JD', 'BABA']",,"[spy, w, cheered, ma, amp, treated, like, king..."
6,6,268862960,$AAPL \nHappy New Year!! \nBy Melbenross. \nAp...,2020-12-31,23:52:32Z,['AAPL'],Bullish,"[aapl, happy, new, year, by, melbenross, apple]"
7,7,268862897,"$AAPL hard to see now,eventually the bleeding ...",2020-12-31,23:52:10Z,['AAPL'],,"[aapl, hard, see, eventually, bleeding, stop, ..."
8,8,268862612,$AMZN I think this symmetric triangle and all ...,2020-12-31,23:50:08Z,"['AAPL', 'AMZN', 'SPY', 'TSLA', 'QQQ']",Bullish,"[amzn, i, think, symmetric, triangle, bullshit..."
9,9,268862114,$AAPL Anyone stressed about this stock needs t...,2020-12-31,23:46:55Z,['AAPL'],Bullish,"[aapl, anyone, stressed, stock, need, stop, bu..."


# Inverted_Index and Search

In [73]:
from collections import defaultdict

inverted_index = defaultdict(set)
# bigram_inverted_index = defaultdict(set)

docIndex = tweets.columns.get_loc("Doc_id")
wordsIndex = tweets.columns.get_loc("Words")

for index, doc in tweets.iterrows():
    docID = doc[docIndex]
    words = doc[wordsIndex]
    
    i = 0
#     first = True
    
    for w in words:
        
        if w in company_ticker:
            w = company_ticker[w]
        
        inverted_index[w].add((docID,i))
        
#         if not first:
#             bigram = prev+' '+cur
#             bigram_inverted_index[bigram].add((docID,i-1))
            
#         first = False
#         prev = cur

        i+=1
    
inverted_index

defaultdict(set,
            {'alibaba': {(268849354, 0),
              (268849386, 3),
              (268850263, 0),
              (268850387, 0),
              (268851997, 1),
              (268851997, 5),
              (268852763, 0),
              (268853121, 0),
              (268853121, 11),
              (268854971, 2),
              (268855653, 0),
              (268855653, 4),
              (268856248, 1),
              (268857628, 32),
              (268857980, 0),
              (268858615, 0),
              (268860914, 0),
              (268861195, 2),
              (268861204, 1),
              (268861408, 0),
              (268862069, 0),
              (268862069, 18),
              (268863425, 13),
              (268863425, 69),
              (268864074, 0),
              (268864074, 2)},
             'buying': {(268862114, 6), (268864074, 1)},
             '350-400': {(268864074, 3)},
             'next': {(268856248, 13),
              (268858597, 16),
              (26

In [48]:
available_words = pd.Series(list(inverted_index.keys()))

def search(query, df = tweets):
    matched_documents = set()
    
    words = word_tokenize(query)
    
    prev = None
    
    for word in words:
        word_lower = word.lower()
        if word_lower not in stop_words:
            cur = lem.lemmatize(word_lower)
            
            reco_cur = JDreco(cur)
            print(reco_cur)
            
            match1 = inverted_index.get(reco_cur)

            if match1:
                # The operator |= is a short hand for set union
                matched_documents |= match1
                
    match_index = [item[0] for item in matched_documents]
    return df[df['Doc_id'].isin(match_index)]


def jaccard(entry, gram_number):
    spellings = available_words[available_words.str.startswith(entry[0])]
    distances = ((jaccard_distance(set(ngrams(entry,gram_number)),
                                       set(ngrams(word,gram_number))), word)
                 for word in spellings)

    closest = min(distances)
    return closest[1]


def JDreco(entry):
    return jaccard(entry, 2)

In [49]:
def search(query, df = tweets, exact = False):
    matched_documents = set()
    
    words = word_tokenize(query)
    
    prev = None
    
    for word in words:
        word_lower = word.lower()
        if word_lower not in stop_words:
            cur = lem.lemmatize(word_lower)
            
            if cur in company_ticker:
                cur = company_ticker[cur]
            
            reco_cur = JDreco(cur)
            
            matches = inverted_index.get(reco_cur)

            if matches:
                # The operator |= is a short hand for set union
                matched_documents |= matches
                
    
                
    match_index = [item[0] for item in matched_documents]
    return match_index #df[df['Doc_id'].isin(match_index)] 

In [57]:
match_index = search("aapl","baba")

# Separating Train Data for Model

In [12]:
result.to_csv(r'Result.csv', index=False)

NameError: name 'result' is not defined

In [None]:
train = tweets[['Sentiment', 'Words']]
train
train.to_csv(r'Train.csv', index=False)

In [None]:
a = (123,[1])

In [None]:
a[1].append(2)

In [None]:
tweets[tweets['Doc_id']==287005418]

In [None]:
'over' in stop_words

In [58]:
print(match_index)

[268856483, 268850065, 268863594, 268850384, 268862960, 268862960, 268855171, 268850047, 268853670, 268860519, 268854046, 268850047, 268852961, 268854585, 268853344, 268850658, 268863653, 268853928, 268851768, 268853133, 268863653, 268851310, 268858597, 268850761, 268857368, 268863929, 268863929, 268862114, 268860466, 268853096, 268850658, 268856542, 268863653, 268853133, 268858043, 268854050, 268856237, 268862897, 268863929, 268852339, 268850761, 268849416, 268855704, 268855376, 268862612, 268853344, 268851768, 268861567, 268858750, 268863920, 268852092]


# Querying the database

In [83]:
conn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};SERVER=DESKTOP-97U22LI;DATABASE=CZ4034;UID=sa;PWD=password1')
# cursor = conn.cursor()

# def getInvIndex():
#     return pd.read_sql_query('SELECT * FROM CZ4034.dbo.InvIndex',conn)

def queryDB(table, dbQuery):
    return pd.read_sql_query('SELECT * FROM CZ4034.dbo.'+table+' where Doc_id IN '+str(tuple(dbQuery)),conn)


In [84]:
testing = ['286952949','286952978']
a = queryDB('words',testing)
a

Unnamed: 0,Date,Doc_id,Message,Sentiment,Time,tok_words,filtered_tok_words
0,2021-02-10,286952949,"$AMC More ppl hold this than $AAPL , That shou...",Bullish,2021-02-10 09:48:17,"['$', 'AMC', 'More', 'ppl', 'hold', 'this', 't...","['$', 'AMC', 'More', 'ppl', 'hold', '$', 'AAPL..."
1,2021-02-10,286952978,$BB $TSLA $Aapl $Msft Blackberry QNX is not on...,,2021-02-10 09:48:19,"['$', 'BB', '$', 'TSLA', '$', 'Aapl', '$', 'Ms...","['$', 'BB', '$', 'TSLA', '$', 'Aapl', '$', 'Ms..."


In [None]:
import requests
from flask import Flask
from flask_restful import Api, Resource
from flask import request
#import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams
import pyodbc
import json
from collections import defaultdict


app = Flask(__name__)
api = Api(app)
stop_words = set(stopwords.words("english"))
lem = WordNetLemmatizer()
inverted_index = defaultdict(set)
available_words = pd.Series(list(inverted_index.keys()))

conn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};SERVER=DESKTOP-97U22LI;DATABASE=CZ4034;UID=sa;PWD=password1')
cursor = conn.cursor()

def getInvIndex():
    with open('InvIndex.json') as infile:
        return json.load(infile)

def queryDB(table, dbQuery):
    return pd.read_sql_query('SELECT * FROM CZ4034.dbo.'+table+' where Doc_id IN '+str(tuple(dbQuery)),conn)

class SearchResource(Resource):    
    def __init__(self):
        self.InvIndex = getInvIndex()
        self.words = pd.Series(list(inverted_index.keys()))
        
    def get(self):
        query = request.args.get('query')
        matched_index = self.search(query)
        
        return {"content": query}
    
    def post(self):
        return {"data": "posted!"}
    
    def search(query,self):
    
        matched_documents = set() 
        words = word_tokenize(query)
        prev = None
        
        for word in words:
            word_lower = word.lower()
            if word_lower not in stop_words:
                cur = lem.lemmatize(word_lower)
                
                reco_cur = self.JDreco(cur)
                
                match1 = self.InvIndex.get(reco_cur)
    
                if match1:
                    # The operator |= is a short hand for set union
                    matched_documents |= match1
                    
        match_index = [item[0] for item in matched_documents]
        return match_index #df[df['Doc_id'].isin(match_index)]
    
    
    def jaccard(self,entry, gram_number):
        spellings = self.words[self.words.str.startswith(entry[0])]
        distances = ((jaccard_distance(set(ngrams(entry,gram_number)),
                                           set(ngrams(word,gram_number))), word)
                     for word in spellings)
    
        closest = min(distances)
        return closest[1]
    
    
    def JDreco(entry,self):
        return self.jaccard(entry, 2)

api.add_resource(SearchResource, "/search" )




if __name__=="__main__":
    app.run(debug=True)


In [None]:
import requests

response = requests.get('http://127.0.0.1:5000/search?query=aapl')
print(response.json())