In [2]:
import requests
import time
import datetime
import pandas as pd
import sys
import string
import pyodbc

# 1. SCRAPING

In [4]:
def crawler(ticker, latest_xhr_id='268864272', max_volume=50000):   
    inv_table = {}
    
    # Push Errors if scrape volume less than 0:
    if max_volume <= 0:
        sys.exit("Error: max_volume must be more than 0")

    master_content = []  # List to store all data extracted
    scroll_list = [latest_xhr_id]  # List to store all XHR id to be part of the url parameters
    tracker = 0
    fail_count = 0

    # Running for loop for collecting data from stocktwits. Each loop collects 20 comments.
    for _ in range(max_volume):
        try:
            headers = {
                'authority': 'api.stocktwits.com',
                'accept': 'application/json',
                'authorization': 'OAuth 6439333424451d1c85e731fb126006f7780192d2',
                'user-agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                               'Chrome/87.0.4280.88 Safari/537.36'),
                'origin': 'https://stocktwits.com',
                'sec-fetch-site': 'same-site',
                'sec-fetch-mode': 'cors',
                'sec-fetch-dest': 'empty',
                'referer': 'https://stocktwits.com/',
                'accept-language': 'en-US,en;q=0.9',
            }`

            params = (
                ('symbols',ticker),
                ('filter', 'all'),
                ('limit', '30'),
                ('max', scroll_list[-1]),
                ('since', 224610272)
            )

            response = requests.get(f'https://api.stocktwits.com/api/2/streams/symbols.json',
                                    headers=headers, params=params)
            content = response.json()
            messages = content['messages']
            # Creating dictionary for items scraped
            for item in messages:
                content_dict = {}
                content_dict['Doc_id'] = item['id']
                content_dict['Message'] = item['body']
                content_dict['Date'] = item['created_at'].split('T')[0]
                content_dict['Time'] = item['created_at'].split('T')[1]
                content_dict['Symbols'] = []
                content_dict['Username'] = item['user']['username']
                content_dict['Name'] = item['user']['name']
                
                
                try:
                    content_dict['Sentiment'] = item['entities']['sentiment']['basic']
                except TypeError:
                    content_dict['Sentiment'] = "N/A"
                    
                for i in range(len(item['symbols'])):
                    label = item['symbols'][i]['symbol']
                    content_dict['Symbols'].append(label)
                    
                    if label in inv_table:
                        inv_table[label].append(content_dict['Doc_id'])
                    else:
                        inv_table[label] = [content_dict['Doc_id']]

                master_content.append(content_dict.copy())
                    

            next_20_id = str(messages[-1]['id'])
            scroll_list.append(next_20_id)

            # Progress Tracker
            tracker += 1
        except:
            fail_count += 1

    print(f"Number of tweets scraped: {len(master_content)}")
    print(f"Last Tweet: {master_content[-1]}")

    df = pd.DataFrame(master_content)
    
    for i in inv_table:
        inv_table[i].sort()
        
    return df,inv_table


if __name__ == "__main__":
    tweets_df,inv_table = scraper ("AAPL,BABA",max_volume = 2)
    

Number of tweets scraped: 60
Last Tweet: {'Doc_id': 268848975, 'Message': '$AAPL NEW ARTICLE : The Stock Market Was Amazing in 2020 -- Let Us Count the Ways https://www.stck.pro/news/AAPL/10019216', 'Date': '2020-12-31', 'Time': '22:39:05Z', 'Symbols': ['AAPL'], 'Username': 'STCKPRO', 'Name': 'STCK.PRO', 'Sentiment': 'N/A'}


In [None]:
#to update the database 
tweets_df.to_csv('tweets.csv')

# 2. Dataset Preparation (Train, Test)

In [None]:
tweets_1 = pd.read_csv('Database_1mil.csv')
tweets_2 = pd.read_csv('Database_2mil.csv')

In [None]:
tweets = tweets_1.append(tweets_2)

In [None]:
tweets = tweets.drop(columns = "Unnamed: 0")

In [None]:
tweets_test = tweets.head(100000)
tweets_train = tweets.tail(len(tweets)-100000)

# Store 100k tweets for DB
# tweets_test.to_csv("tweets_100k.csv")

### a. Trainset Preparation

In [None]:
sentiments = ['Bearish','Bullish']
tweets_train = tweets_train[tweets_train['Sentiment'].isin(sentiments)]
tweets_train = tweets_train.reset_index().drop(columns="index")

tweets_train

In [None]:
bullish = 0
bearish = 0
neutral = 0
for x in tweets_train['Sentiment']:
    if x == 'Bearish':
        bearish += 1
    elif x == 'Bullish':
        bullish += 1
    else:
        neutral += 1
    
print('Bullish count: ', bullish)
print('Bearish count: ', bearish)
print('Neutral count: ', neutral)

In [None]:
import random

randints = []
for i in range(bullish + bearish):
    randints.append(random.randint(1, bullish))
    
tweets_train['randint'] = randints

rows_to_drop = []

for i, r in tweets_train.iterrows():
  if r['Sentiment'] == 'Bullish':
    if r['randint'] < (bullish - bearish):
      rows_to_drop.append(i)
    
tweets_train = tweets_train.drop(rows_to_drop)

In [None]:
bullish = 0
bearish = 0
neutral = 0
for x in tweets_train['Sentiment']:
    if x == 'Bearish':
        bearish += 1
    elif x == 'Bullish':
        bullish += 1
    else:
        neutral += 1
    
print('Bullish count: ', bullish)
print('Bearish count: ', bearish)
print('Neutral count: ', neutral)

In [None]:
import datetime

def get_date_time(date, time):
    time = time[0:len(time)-1]
    string = date + ' ' + time
    return datetime.datetime.strptime(string, '%Y-%m-%d %H:%M:%S')

In [None]:
dateIndex = tweets_test.columns.get_loc("Date")
timeIndex = tweets_test.columns.get_loc("Time")

tweets_test['datetime'] = tweets_test.apply(lambda row: get_date_time(row[dateIndex],row[timeIndex]), axis=1)
tweets_test

In [None]:
import nltk
#nltk.download()
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams

stop_words = set(stopwords.words("english"))

lem = WordNetLemmatizer()

In [None]:
company_ticker = {"baba":"alibaba",
                  "amzn":"amazon",
                  "aapl":"apple",
                  "tsla":"tesla",
                  "msft":"microsoft",
                  "fb":"facebook",
                  "googl":"google",
                  "nio":"nio",
                  "twtr":"twitter",
                  "nflx":"netflix"
                 }

In [None]:
def token_filter_lemmatize(msg):
    words = word_tokenize(msg)
    
    processed = []
    
    for w in words:
        if (w not in stop_words) and (w not in string.punctuation):
            if (w in company_ticker):
                w = company_ticker[w]
            processed.append(lem.lemmatize(w.lower()))
            
    return processed

tweets_train['Words'] = tweets_train['Message'].apply(token_filter_lemmatize)

#tweets.to_csv(r'Database.csv', index=False)
tweets_train

In [None]:
tweets_train_final = tweets_train[["Words", "Sentiment", "datetime"]]
tweets_train_final.to_csv("tweets_train_700k.csv")

### b. Testset Preparation

In [None]:
dateIndex = tweets_train.columns.get_loc("Date")
timeIndex = tweets_train.columns.get_loc("Time")

tweets_train['datetime'] = tweets_train.apply(lambda row: get_date_time(row[dateIndex],row[timeIndex]), axis=1)

In [None]:
tweets_test['Words'] = tweets_test['Message'].apply(token_filter_lemmatize)
tweets_test

In [None]:
tweets_test_final = tweets_test[["Words", "Sentiment", "datetime"]]
tweets_test_final.to_csv("tweets_test_100k.csv")

# 3. Tokenize, Filter and Lemmatize

In [None]:
tweets_df = pd.read_csv('tweets_100k.csv')

tweets = tweets_df.copy()
tweets

In [None]:
import datetime

def get_date_time(date, time):
    time = time[0:len(time)-1]
    string = date + ' ' + time
    return datetime.datetime.strptime(string, '%Y-%m-%d %H:%M:%S')

In [None]:
dateIndex = tweets.columns.get_loc("Date")
timeIndex = tweets.columns.get_loc("Time")

tweets['datetime'] = tweets.apply(lambda row: get_date_time(row[dateIndex],row[timeIndex]), axis=1)

In [None]:
earliest = min(tweets.datetime)
total = max(tweets.datetime) - earliest

def datetime_score(datetime):
    diff = datetime - earliest
    return diff/total

In [None]:
tweets['datetime_score'] = tweets['datetime'].apply(datetime_score)

In [None]:
import re
def clean_string(text):
    pattern = re.compile('[\W_]+')
    return pattern.sub(' ', text)


tweets['Message'] = tweets['Message'].apply(clean_string)

In [None]:
tweets

In [None]:
import nltk
#nltk.download()
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams

stop_words = set(stopwords.words("english"))

lem = WordNetLemmatizer()

In [6]:
company_ticker = {"baba":"alibaba",
                  "amzn":"amazon",
                  "aapl":"apple",
                  "tsla":"tesla",
                  "msft":"microsoft",
                  "fb":"facebook",
                  "googl":"google",
                  "nio":"nio",
                  "twtr":"twitter",
                  "nflx":"netflix"
                 }

In [None]:
def token_filter_lemmatize(msg):
    words = word_tokenize(msg)
    
    processed = []
    
    for w in words:
        if (w not in stop_words) and (w not in string.punctuation):
            if (w in company_ticker):
                w = company_ticker[w]
            processed.append(lem.lemmatize(w.lower()))
            
    return processed

tweets['Words'] = tweets['Message'].apply(token_filter_lemmatize)

#tweets.to_csv(r'Database.csv', index=False)
tweets

In [None]:
tweets = tweets[["Doc_id", "datetime_score", "Words"]]
tweets.to_csv(r'tweets_processed.csv', index=False)

In [None]:
tweets

# 4. Inverted_Index and Search (START HERE - REUBEN)

In [3]:
tweets = pd.read_csv('tweets_processed.csv')

In [4]:
import ast

def read_list(string):
    return ast.literal_eval(string)

tweets['Words'] = tweets['Words'].apply(read_list)
tweets

Unnamed: 0,Doc_id,datetime_score,Words
0,268864270,1.000000,"[tsla, well, fargo, 39, top, prediction, 2021,..."
1,268864163,0.999956,"[nio, 60, jan, 9th]"
2,268864120,0.999937,"[tsla, opps, cut, head, wierd]"
3,268864106,0.999932,"[amzn, sentiment, highly, favorable, right, ne..."
4,268864074,0.999916,"[baba, buying, baba, 350, 400, next, year, mar..."
5,268864073,0.999915,"[tsla, broke, bear, ran, well, today, year, ye..."
6,268864071,0.999915,"[smartoptions, unusual, activity, alert, delay..."
7,268864047,0.999907,"[tsla, happy, new, year, stay, positive, stay,..."
8,268863937,0.999852,"[tsla, 750, soon]"
9,268863929,0.999850,"[aapl, spy, qqq, will, aapl, hit, 3, trillion,..."


In [7]:
# Create Inverted Index

# FORMAT (Preparing for JSON):

# Inv_Index -> Words -> Documents -> Freq + Tf.idf

inverted_index = {}

docIndex = tweets.columns.get_loc("Doc_id")
wordsIndex = tweets.columns.get_loc("Words")

for index, doc in tweets.iterrows():
    docID = doc[docIndex]
    words = doc[wordsIndex]
    
    for w in words:

        if w in company_ticker:
            w = company_ticker[w]
        
        if w not in inverted_index:
            inverted_index[w] = {}
        
        if docID not in inverted_index[w]:
            inverted_index[w][docID] = {'freq': 1, 'score':0}
        else:
            inverted_index[w][docID]['freq'] += 1
            

In [8]:
tweets_docid = tweets.set_index("Doc_id")[['Words','datetime_score']]
tweets_docid['len2'] = 0.0
tweets_docid

Unnamed: 0_level_0,Words,datetime_score,len2
Doc_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
268864270,"[tsla, well, fargo, 39, top, prediction, 2021,...",1.000000,0.0
268864163,"[nio, 60, jan, 9th]",0.999956,0.0
268864120,"[tsla, opps, cut, head, wierd]",0.999937,0.0
268864106,"[amzn, sentiment, highly, favorable, right, ne...",0.999932,0.0
268864074,"[baba, buying, baba, 350, 400, next, year, mar...",0.999916,0.0
268864073,"[tsla, broke, bear, ran, well, today, year, ye...",0.999915,0.0
268864071,"[smartoptions, unusual, activity, alert, delay...",0.999915,0.0
268864047,"[tsla, happy, new, year, stay, positive, stay,...",0.999907,0.0
268863937,"[tsla, 750, soon]",0.999852,0.0
268863929,"[aapl, spy, qqq, will, aapl, hit, 3, trillion,...",0.999850,0.0


In [11]:
len(inverted_index)

38650

In [None]:
# CALCULATE TF.IDF
## TF = (count of word in doc)/(total words in doc)
## IDF = 1

import math

def tf(word_dict, docID):
    count = word_dict[docID]['freq']
    return (1 + math.log(count,10))
    
def get_datetime_score(docID):
    return tweets_docid['datetime_score'][docID]

num = 0

for word in inverted_index:
    # Using lnc.ltc
    ## n means idf_value = 1
    
    idf_value = 1
    
    for docID in inverted_index[word]:
        tf_value = tf(inverted_index[word], docID)
        del inverted_index[word][docID]['freq']
        
        tweets_docid['len2'][docID] += (tf_value)**2
        
        tf_idf = tf_value * idf_value
        
        inverted_index[word][docID]['tf.idf'] = tf_idf
    
    num += 1
    
    if (num % 20 == 0):
        print("Completed: "+str(num))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [None]:
def calc_norm(length):
    return 1/((length)**(1/2))

tweets_docid['norm_coeff'] = tweets_docid['len2'].apply(calc_norm)

In [None]:
tweets_docid

In [None]:
tweets_scores = {}

dtIndex = tweets_docid.columns.get_loc("datetime_score")
normIndex = tweets_docid.columns.get_loc("norm_coeff")

for index, doc in tweets_docid.iterrows():
    dtScore = doc[dtIndex]
    norm_coeff = doc[normIndex]
    
    tweets_scores[index] = {}
    tweets_scores[index]['datetime_score'] = dtScore
    tweets_scores[index]['norm_coeff'] = norm_coeff
    
    
def store_tweets_scores():
    with open('tweets_scores.json','w') as outfile:
        json.dump(tweets_scores,outfile)
    
store_tweets_scores()

In [None]:
# Creating tiered index:

champion_inverted_index = {}

num = 0

for word in inverted_index:
    
    champion_len = 500
    
    for word in inverted_index:
        
        champion_inverted_index[word] = {}
        
        if (len(inverted_index[word]) > champion_len):
            sort_word = sorted(inverted_index[word].items(), key=lambda item: item[1]['tf.idf'], reverse = True)
            
            for i in sort_word[:champion_len]:
                champion_inverted_index[word][i[0]] = i[1]['tf.idf']
            
        else:
            for i in inverted_index[word].items():
                champion_inverted_index[word][i[0]] = i[1]['tf.idf']
                
        champion_inverted_index[word]['count'] = len(inverted_index[word])
    
    num += 1
    
    if (num % 20 == 0):
        print("Completed: "+str(num))

In [None]:
import json

def store_inv_index():
    with open('InvIndex.json', 'w') as outfile:
        json.dump(inverted_index,outfile)
        
def store_champion_list():
    with open('ChampionList.json','w') as outfile:
        json.dump(champion_inverted_index,outfile)

# store_inv_index()
store_champion_list()

In [None]:
def getInvIndex():
    with open('tiered_InvIndex.json') as infile:
        data = json.load(infile)
    return data

inv_index = getInvIndex()



# Querying the database

In [None]:
from flask import Flask
from flask_restful import Api, Resource
from flask import request
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.metrics.distance import jaccard_distance
from nltk.util import ngrams
import pyodbc
import json
from collections import defaultdict
import math


app = Flask(__name__)
api = Api(app)

# conn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server};SERVER=DESKTOP-97U22LI;DATABASE=CZ4034;UID=sa;PWD=password1')
# cursor = conn.cursor()

stop_words = set(stopwords.words("english"))
lem = WordNetLemmatizer()

class SearchResource(Resource):    
    def __init__(self):
        
        self.ChampionList = defaultdict(set, self.getChampionList())
#         self.InvIndex = defaultdict(set, self.getInvIndex())
        
        self.words = pd.Series(list(self.ChampionList.keys()))
        self.company_ticker = {"baba":"alibaba",
                              "amzn":"amazon",
                              "aapl":"apple",
                              "tsla":"tesla",
                              "msft":"microsoft",
                              "fb":"facebook",
                              "googl":"google",
                              "nio":"nio",
                              "twtr":"twitter",
                              "nflx":"netflix"
                             }
        
        self.tweets_scores = self.getTweetsScores()
        
        # CHANGE PARAM WHEN MORE SCRAPED!
        self.num_docs = len(self.tweets_scores)
        
    ################# API FUNCTIONS #######################
        
    def get(self):
        query = request.args.get('query')
        matched_index = self.search(query)
        
        data = self.queryDB('tweets',matched_index).to_json() 
        
        return {'content': data}
    
    def post(self):
        return {"data": "posted!"}
    
    ################# QUERY SEARCH #######################
    
    def clean_query(self, query, exact):
        query_clean = []
        
        query_split = word_tokenize(query)
        
        for query_word in query_split:
            word_lower = query_word.lower()
            if word_lower not in stop_words:
                cur = lem.lemmatize(word_lower)
                
                if cur in self.company_ticker:
                    cur = self.company_ticker[cur]
                    
                if not exact:
                    cur = self.JDreco(cur)
                
                if (cur not in query_clean):
                    query_clean.append(cur)
                    
        return query_clean
    
    def search(self, query, exact = False):
        
        # Returns 200 (or less if unavailable) most relevant documents
        
        matched_index_list = []
        matched_documents = None
        
        # (1) Clean Query:
        query_clean = self.clean_query(query, exact)
        print("Recommended Query List:")
        print(query_clean)
        
        
        # (2) Find All Docs (match any one word in query):
        for word in query_clean:
            if word in self.ChampionList:
                matches = self.ChampionList[word]
                
                match_index = list(matches.keys())
                match_index.remove('count')
            
            else:
                print("No Matches for Word: "+word)
                continue
    
#             matched_index_list.append(match_index)
            
            if matched_documents is None:
                matched_documents = match_index
                
            else:
                #matched_documents = list(set.intersection(set(matched_documents),set(match_index)))
                matched_documents = list(set.union(set(matched_documents),set(match_index)))
                
                
        # (3) Filter Most Relevant:
        
        filter_amt = 200

        weights = self.tf_idf(query_clean)
            
        doc_score = {}
            
        for docID in matched_documents:
            doc_score[docID] = self.calc_match_score(weights, docID)
                
        matched_documents = sorted(doc_score, key=doc_score.get, reverse = True)
            
        if (filter_amt <= len(matched_documents)):
            matched_documents = matched_documents[:filter_amt]
        
        # (4) Return Results:
        return matched_documents
    
    ################# RECOMMENDATION #######################

    def jaccard(self,entry, gram_number):
        spellings = self.words[self.words.str.startswith(entry[0])]
        distances = ((jaccard_distance(set(ngrams(entry,gram_number)),
                                           set(ngrams(word,gram_number))), word)
                     for word in spellings)
        closest = min(distances)
        return closest[1]
    
    
    def JDreco(self,entry):
        return self.jaccard(entry, 2)
    
    ################# TF.IDF #######################
    
    def tf_idf(self,query):
        weights = {}
        
        len2 = 0
        
        for word in query:
            
            # Find IDF value
            if (word in self.ChampionList):
                idf_value = self.idf(self.ChampionList[word])
            else:
                idf_value = 0
                
            # Find TF Value
            tf_value = 1 + math.log(1,10) # = 1
            
            # Store TF.IDF
            tf_idf = tf_value * idf_value
            
            weights[word] = tf_idf
            len2 += tf_idf**2
            
        norm = 1/(len2**(1/2))
        
        for word in weights:
            weights[word] = weights[word] * norm
        
        return weights
            
    
    def idf(self,word_dict):
        value = self.num_docs/word_dict['count']
        return math.log(value, 10)
    
    ################# FILTER MOST RELEVANT #######################
    
    def calc_match_score(self, weights_query, docID):
        
        result = 0
        
#         print(self.tweets_scores)
        
        norm = float(self.tweets_scores[str(docID)]['norm_coeff'])
        dtScore = float(self.tweets_scores[str(docID)]['datetime_score'])
        recency_weight = 0.33
        
        for word in weights_query:

            #QUERY
            query_score = weights_query[word]
            
            #DOC
            try:
                doc_score = float(self.ChampionList[word][docID]) * norm
            except:
                doc_score = 0
            
            #PRODUCT
            result += query_score * doc_score
            
        result += dtScore * recency_weight
        
        return result
    
    ################# INVERTED INDEX #######################
    
    def getInvIndex(self):
        with open('InvIndex.json') as infile:
            data = json.load(infile)
        return data
    
    def getChampionList(self):
        with open('ChampionList.json') as infile:
            data = json.load(infile)
        return data
    
    def getTweetsScores(self):
        with open('tweets_scores.json') as infile:
            data = json.load(infile)
        return data
    
    ################# DATABASE CONNECTION & QUERY #######################

    def queryDB(self,table, dbQuery):
        return pd.read_sql_query('SELECT * FROM CZ4034.dbo.'+table+' where Doc_id IN '+str(tuple(dbQuery)),conn)
    
    def queryOffline(self, query, exact = False):
        match_list = self.search(query, exact = exact)
        
        for docID in match_list:
            print(tweets[tweets['Doc_id']==int(docID)]['Message'])

api.add_resource(SearchResource, "/search" )

In [None]:
x = SearchResource()

In [None]:
x.search("stonks best apple")

In [None]:
from collections import defaultdict

inverted_index = defaultdict(set)
# bigram_inverted_index = defaultdict(set)

docIndex = tweets.columns.get_loc("Doc_id")
wordsIndex = tweets.columns.get_loc("Words")

for index, doc in tweets.iterrows():
    docID = doc[docIndex]
    words = doc[wordsIndex]
    
    i = 0
#     first = True
    
    for w in words:
        
        if w in company_ticker:
            w = company_ticker[w]
        
        inverted_index[w].add((docID,i))
        
#         if not first:
#             bigram = prev+' '+cur
#             bigram_inverted_index[bigram].add((docID,i-1))
            
#         first = False
#         prev = cur

        i+=1
    
inverted_index

In [None]:
available_words = pd.Series(list(inverted_index.keys()))

def search(query, df = tweets):
    matched_documents = set()
    
    words = word_tokenize(query)
    
    prev = None
    
    for word in words:
        word_lower = word.lower()
        if word_lower not in stop_words:
            cur = lem.lemmatize(word_lower)
            
            reco_cur = JDreco(cur)
            print(reco_cur)
            
            match1 = inverted_index.get(reco_cur)

            if match1:
                # The operator |= is a short hand for set union
                matched_documents |= match1
                
    match_index = [item[0] for item in matched_documents]
    return df[df['Doc_id'].isin(match_index)]


def jaccard(entry, gram_number):
    spellings = available_words[available_words.str.startswith(entry[0])]
    distances = ((jaccard_distance(set(ngrams(entry,gram_number)),
                                       set(ngrams(word,gram_number))), word)
                 for word in spellings)

    closest = min(distances)
    return closest[1]


def JDreco(entry):
    return jaccard(entry, 2)

In [None]:
def search(query, df = tweets, exact = False):
    matched_documents = set()
    
    words = word_tokenize(query)
    
    prev = None
    
    for word in words:
        word_lower = word.lower()
        if word_lower not in stop_words:
            cur = lem.lemmatize(word_lower)
            
            if cur in company_ticker:
                cur = company_ticker[cur]
            
            reco_cur = JDreco(cur)
            
            matches = inverted_index.get(reco_cur)

            if matches:
                # The operator |= is a short hand for set union
                matched_documents |= matches
                
    
                
    match_index = [item[0] for item in matched_documents]
    return match_index #df[df['Doc_id'].isin(match_index)] 