In [1]:
import pandas as pd
import pysolr
import os

ModuleNotFoundError: No module named 'pysolr'

In [None]:
# Establish the connection with the Solr core.
# The core URL may be overridden through the SOLR_CORE_URL environment variable so the
# notebook can target a non-local Solr instance; when unset, the local core is used.
core_path = os.environ.get("SOLR_CORE_URL", "http://localhost:8983/solr/stock_project_core")
# NOTE(review): always_commit=True is documented by pysolr as committing after each
# add/delete request — confirm this matches the installed pysolr version.
solr = pysolr.Solr(core_path, always_commit=True)

# Remove all indexed documents from the core

In [None]:
# Clear the core: issue a delete whose query is the match-all pattern.
match_all_query = '*:*'
solr.delete(q=match_all_query)

# Index data from csv

In [None]:
# Load the CSV data (labelling version).
csv_path = "IR_New_Testing_Data - Tesing_Data.csv"

# tweet_id is parsed as int64 and then converted to its string form within the same chain.
csv_data = (
    pd.read_csv(
        csv_path,
        dtype={"tweet_id": "int64"},
        float_precision="round_trip",
    )
    .assign(tweet_id=lambda frame: frame["tweet_id"].astype(str))
)
csv_data.head(5)

In [None]:
# Solr document field -> CSV column that supplies its value (order is the document key order).
solr_field_to_csv_column = {
    "id": "tweet_id",
    "company": "ticker_symbol",
    "post_date": "post_date",
    "author": "writer",
    "raw_text": "body",
    "like_num": "like_num",
    "comment_num": "comment_num",
    "retweet_num": "retweet_num",
    "tweet_activity": "tweet_activity",
    "clean_text": "clean_text_no_stem_user",
    "subjectivity": "subjectivity",
    "sentiment": "sentiment",
}

# Build one document (a plain dict) per CSV row, ready to be indexed.
extracted_data = [
    {field: row[column] for field, column in solr_field_to_csv_column.items()}
    for _, row in csv_data.iterrows()
]

In [None]:
# Define company names: ticker symbol -> name stored in each record's "company" field.
ticker_symbol_to_name_mapping = {"AAPL": "apple",
                                "AMZN": "amazon",
                                "GOOGL": "google",
                                "MSFT": "Microsoft",
                                "TSLA": "Tesla"}

# Replace each record's ticker symbol with its company name, in place.
# Records whose "company" already holds a mapped name are skipped, so re-running this cell
# after it has already executed does not fail with a KeyError on the replaced values.
# A ticker symbol missing from the mapping still raises KeyError.
mapped_company_names = set(ticker_symbol_to_name_mapping.values())
for record in extracted_data:
    if record["company"] not in mapped_company_names:
        record["company"] = ticker_symbol_to_name_mapping[record["company"]]

In [None]:
# Inspect data before indexing: show only the first few records rather than dumping
# the whole list into the cell output.
extracted_data[:5]

In [None]:
# Send the prepared documents to Solr, then show whatever the client returned.
indexing_response = solr.add(extracted_data)
print(indexing_response)

# Query guide

Note that like_num, comment_num, retweet_num, tweet_activity, subjectivity, sentiment are not indexed and hence can't be queried. However, their values can be returned when relevant queries are made using other query fields such as id.

## Single field query, query field = text body

When querying the text body, search on the `clean_text` field and display the `raw_text` field in the results.

In [None]:
# Fields printed for every search hit, in display order.
_DISPLAYED_FIELDS = (
    "id",
    "company",
    "post_date",
    "author",
    "raw_text",
    "like_num",
    "comment_num",
    "retweet_num",
    "tweet_activity",
    "subjectivity",
    "sentiment",
)


def _print_result(result):
    """Print one search hit: a "field:  value" line per displayed field, then a separator."""
    for field in _DISPLAYED_FIELDS:
        print(field + ": ", result[field])
    print("====================================")
    print()


def _spell_suggestions(search_results):
    """Return the spelling suggestions attached to the results, or [] when there are none.

    Reads the same location the notebook has always used
    (``spellcheck['suggestions'][1]``) but tolerates a missing or empty
    spellcheck section instead of raising IndexError/KeyError.
    """
    spellcheck = getattr(search_results, "spellcheck", None)
    if not isinstance(spellcheck, dict):
        return []
    suggestions = spellcheck.get("suggestions")
    if not isinstance(suggestions, (list, tuple)) or len(suggestions) < 2:
        return []
    entry = suggestions[1]
    if not isinstance(entry, dict) or entry.get("numFound", 0) <= 0:
        return []
    return entry.get("suggestion") or []


def display_results(search_results, query_field):
    """Print search hits, or a "not present" message when the search returned nothing.

    Args:
        search_results: Object returned by ``solr.search``. It must support
            ``len()`` and iteration over hits indexable by field name; its
            ``spellcheck`` attribute is consulted only for ``clean_text`` queries.
        query_field: Name of the field that was searched. Spelling suggestions
            are shown only when this equals ``"clean_text"``.

    Returns:
        None. All output goes to stdout.

    Raises:
        KeyError: If a hit lacks one of the displayed fields.
    """
    if len(search_results) > 0:
        for result in search_results:
            _print_result(result)
        return

    # No hits: offer spelling alternatives for text-body queries when Solr supplied any.
    suggestions = _spell_suggestions(search_results) if query_field == "clean_text" else []
    if suggestions:
        print("It seems the query you are searching if not present in the database. Here are some alternatives suggested queries: ")
        print(suggestions)
    else:
        # Also reached for clean_text queries without suggestions, which previously
        # printed nothing or raised while indexing into an empty suggestion list.
        print("It seems the query you are searching if not present in the database")

In [None]:
# Single-field search against the cleaned tweet text.

# field to search under
query_field = "clean_text"
# search term
query = "economically"
# fields included in every returned document
fields_to_return = "id, company, post_date, author, raw_text, like_num, comment_num, retweet_num, tweet_activity, subjectivity, sentiment"
# number of top results to return
top_n_results = 3

# The query has the form "<field>: <term>".
query_string = "{}: {}".format(query_field, query)

search_results = solr.search(query_string, fl=fields_to_return, rows=top_n_results)

display_results(search_results, query_field)

## Single field query, query field = post date

In [None]:
# query on 1 field
query_field = "post_date" # define the field to search under
# Raw string: "\:" is not a valid Python escape sequence, so a normal literal triggers an
# invalid-escape warning (an error in future Python versions). The raw literal yields the
# exact same characters, backslashes included.
query = r'[2023-01-01T23\:59\:59Z TO 2023-03-01T23\:59\:59Z]' # define the search term (date range)
fields_to_return = "id, company, post_date, author, raw_text, like_num, comment_num, retweet_num, tweet_activity, subjectivity, sentiment" # define the fields to return results
top_n_results = 3 # define the number of top results to return

# The query has the form "<field>: <term>".
query_string = "%s: %s" % (query_field, query)

search_results = solr.search(query_string, 
                             **{'fl': fields_to_return, 
                                'rows': top_n_results})

display_results(search_results, query_field)

## Single field query, query field = text body, query with wrong spelling

Note that spell correction is only applied to queries on `clean_text`.

In [None]:
# Single-field search with a deliberately misspelled term, to exercise spelling suggestions.

# field to search under
query_field = "clean_text"
# search term
query = 'increasy'
# fields included in every returned document
fields_to_return = "id, company, post_date, author, raw_text, like_num, comment_num, retweet_num, tweet_activity, subjectivity, sentiment"
# number of top results to return
top_n_results = 3

# The query has the form "<field>: <term>".
query_string = "{}: {}".format(query_field, query)

search_results = solr.search(query_string, fl=fields_to_return, rows=top_n_results)

display_results(search_results, query_field)

## Multi-field query

In [None]:
# query on multiple fields
# display_results() is told which field was searched. Set it in this cell instead of relying
# on the value left behind by an earlier cell; "clean_text" keeps spelling suggestions
# enabled, which fits because the query below contains a clean_text clause.
query_field = "clean_text"
fields_to_return = "id, company, post_date, author, raw_text, like_num, comment_num, retweet_num, tweet_activity, subjectivity, sentiment" # define the fields to return results
top_n_results = 3 # define the number of top results to return

# Clauses on different fields are combined with the boolean operators AND / OR.
query_string = "company: apple AND clean_text: increase" # AND OR

search_results = solr.search(query_string, 
                             **{'fl': fields_to_return, 
                                'rows': top_n_results})

display_results(search_results, query_field)