Prototyping notebook for predicting stock volatility, prices, etc., using extra data from web trends, news, and similar sources.

In [12]:
%pip install compress_pickle[lz4] pandas numpy matplotlib yahooquery

Note: you may need to restart the kernel to use updated packages.


In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
from compress_pickle import dump, load
import os
from yahooquery import Ticker
import timeit
import time
import datetime

1) Download and extract https://www.kaggle.com/datasets/footballjoe789/us-stock-dataset

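2) Copy Stocks/ into a data/ directory next to this notebook (the code below defaults to data/Stocks/ and reads the symbol list from data/Stock_List.csv, so that file must be present in data/ as well). You can delete the other files as they are not used (yet, at least).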

In [14]:
class stock_tickers():
    """Work in progress: turn stock ticker symbols into search terms for NLP web scraping.

    For every symbol the class collects Yahoo Finance records (through
    ``yahooquery.Ticker``) and derives a company name, one executive name and
    the industry. The results are stored in the ``data`` DataFrame with the
    columns ``Ticker``, ``Company``, ``Executive`` and ``Industry``.
    """

    # Keys whose presence marks an asset profile as carrying usable information.
    _PROFILE_KEYS = ('longBusinessSummary', 'companyOfficers', 'industry')

    # Words that end a company name and are kept as part of the name.
    _BREAK_WORDS_KEEP = [' Inc ', ' Corp ', ' Ltd ', ' LLC ', ' LTD ', ' Corporation ', ' Company ', ' Group ', ' Holdings ', ' Systems ', ' Technologies ', ' Technology ', ' Services ', ' Solutions ', " Bancorp ", " Limited ", " develops ", " researches ", "S.A", " Inc.", " N.V", " Co ", " Corp. "]

    # Words that end a company name and are dropped together with everything after them.
    _BREAK_WORDS_DROP = [" is ", " and sells ", " does ", " engages ", " operates ", " provides ", " de ", " focuses ", " supports ", " - ", " distributes ", " specializes ", " (", " originates ", " owns ", " acquires ", " invests ", " offers ", " through ", " together ", " a ", " an ", " supplies ", "explores ", " designs ", " formerly ", " manufactures ", " holds ", " included ", " ranks "]

    # Words after which the name starts; everything up to and including them is removed.
    _BREAK_WORDS_FRONT = ["2022", "2023", "2024", "2025", "2026", "2027", "2028", "2029", "2030", " VA "]

    def __init__(self, file_path = 'data/Stocks/', file_ext = '.csv'):
        """Load the symbols, fetch (or load cached) company records and build ``data``.

        Args:
            file_path: Directory of the per-stock files. Only stored; nothing in
                this class reads from it yet.
            file_ext: Extension of the per-stock files. Only stored as well.
        """
        self.file_path = file_path
        self.file_ext = file_ext
        self.load_symbols()
        self.ticker_list_to_dataframe()
        self.get_company_list()
        self.process_information()

    def load_symbols(self):
        """Load all the stock symbols from the ``Symbol`` column of ``data/Stock_List.csv``.

        Returns:
            The list of symbols, also stored as ``self.stocks_symbols``.
        """
        self.stocks_symbols = pd.read_csv("data/Stock_List.csv")['Symbol'].tolist()
        return self.stocks_symbols

    def ticker_list_to_dataframe(self):
        """Create ``self.data`` with a single ``Ticker`` column holding the loaded symbols."""
        self.data = pd.DataFrame(self.stocks_symbols, columns = ['Ticker'])

    def _entry(self, index):
        """Return the cached record at ``index`` as a dict, or ``{}`` when it is unusable.

        A non-integer ``index`` yields ``{}``. So does a record that is not a
        dict (``get_company_list`` may store a string for a symbol), which
        keeps the accessors from treating such a string like a mapping.
        """
        if isinstance(index, bool) or not isinstance(index, (int, np.integer)):
            return {}
        record = self.yh_tickers[index]
        return record if isinstance(record, dict) else {}

    def _name_from_summary(self, summary):
        """Heuristically cut the company name out of the start of a business summary."""
        name = summary.replace(". ", " ").replace(",", "")

        # Cut after the first occurrence of a suffix word, keeping the word itself.
        for word in self._BREAK_WORDS_KEEP:
            if word in name:
                name = (name.split(word)[0] + word).strip()

        # Cut before the first occurrence of a word that starts the description.
        for word in self._BREAK_WORDS_DROP:
            if word in name:
                name = name.split(word)[0]

        # Drop any leading text that ends in one of the front markers.
        for word in self._BREAK_WORDS_FRONT:
            if word in name:
                name = name.split(word)[-1].strip()

        # endswith() is safe on an empty string, unlike indexing name[-1].
        if name.endswith("."):
            name = name[:-1]
        return name

    def get_company(self, index):
        """Return the company name for the ticker at position ``index``.

        The first non-empty candidate wins: ``longName``, then a name parsed
        from ``longBusinessSummary``, then ``shortName``. A ``longName`` that
        is present but ``None`` is skipped instead of being returned.

        Returns:
            The company name, or ``""`` when none is available.
        """
        record = self._entry(index)

        long_name = record.get('longName')
        if long_name:
            return long_name

        summary = record.get('longBusinessSummary')
        if isinstance(summary, str):
            parsed = self._name_from_summary(summary)
            if parsed:
                return parsed

        short_name = record.get('shortName')
        if short_name:
            return short_name
        return ""

    def get_company_officers(self, index):
        """Return the cleaned name of one officer for the ticker at position ``index``.

        The officer with the largest ``unexercisedValue`` is chosen (the first
        one on ties; a missing or ``None`` value counts as 0). The titles
        "Mr.", "Ms.", "Dr." and "Mrs." are stripped, remaining dots are removed
        and single-character tokens such as initials are dropped.

        Returns:
            The officer name, or ``""`` when no officers are listed.
        """
        officers = self._entry(index).get('companyOfficers')
        if not officers:
            return ""

        top_officer = max(officers, key = lambda person: person.get('unexercisedValue') or 0)
        raw_name = top_officer.get('name') or ""
        for title in ("Mr. ", "Ms. ", "Dr. ", "Mrs. "):
            raw_name = raw_name.replace(title, "")
        tokens = raw_name.replace(".", "").split(" ")
        return " ".join(token for token in tokens if len(token) > 1)

    def get_industry(self, index):
        """Return the industry for the ticker at position ``index``, or ``""`` if unknown."""
        return self._entry(index).get('industry') or ""

    def get_company_list(self):
        """Populate ``self.yh_tickers`` with one record per symbol, cached in ``company_list.pkl``.

        When the cache file does not exist, the ``asset_profile`` and
        ``quote_type`` of every symbol are requested and merged, a progress
        line is printed after each symbol, and the result is written to the
        cache. Otherwise the cache is loaded. A failing request for one symbol
        is printed and skipped so the remaining symbols are still processed.

        ``self.missed`` finally lists every record that is a string instead of
        a dict.

        NOTE(review): the cache is a pickle; only load a ``company_list.pkl``
        that this notebook produced itself. Also confirm that ``compress=`` and
        ``decompress=`` are the keyword names the installed compress_pickle
        version expects.
        """
        if not os.path.exists('company_list.pkl'):
            self.yh_tickers_init = [Ticker(ticker) for ticker in self.stocks_symbols]
            self.yh_tickers = [{} for _ in self.stocks_symbols]
            length = len(self.yh_tickers_init)
            errors = 0
            recovered_errors = 0
            start = timeit.default_timer()
            for t, sym in enumerate(self.stocks_symbols):
                yh_ticker = self.yh_tickers_init[t]

                # Step 1: the asset profile; anything that is not a dict counts as an error.
                try:
                    profile = yh_ticker.asset_profile[sym]
                    if not isinstance(profile, dict):
                        profile = {}
                        errors += 1
                except Exception as e:
                    profile = {}
                    errors += 1
                    print(e)

                # Step 2: the quote type, merged with the profile. The lookup is
                # guarded in both branches so one failure cannot abort the run.
                try:
                    more_info = yh_ticker.quote_type[sym]
                    if any(key in profile for key in self._PROFILE_KEYS):
                        if not isinstance(more_info, dict):
                            more_info = {}
                        # Useful profile: quote_type values win on duplicate keys.
                        self.yh_tickers[t] = {**profile, **more_info}
                    else:
                        if len(profile) == 0:
                            # Stored as returned, even when it is a string;
                            # such entries end up in self.missed below.
                            self.yh_tickers[t] = more_info
                        else:
                            if not isinstance(more_info, dict):
                                more_info = {}
                            # Sparse profile: profile values win on duplicate keys.
                            self.yh_tickers[t] = {**more_info, **profile}
                        recovered_errors += 1
                except Exception as e:
                    # Keep whatever the profile request produced.
                    self.yh_tickers[t] = profile
                    print(e)

                # Remaining time = average time per finished symbol * symbols left.
                done = t + 1
                elapsed = timeit.default_timer() - start
                time_left = round(elapsed * (length - done) / done, 0)
                time_left = str(datetime.timedelta(seconds=time_left))

                print(f"{round(100*done/length,1)}% complete.", "Estimated time left (h:mm:ss):", time_left, "Symbol:", sym, type(self.yh_tickers[t]), "--- errors:", errors, f"of current: {done}, total: {length}.", "Recovered_errors:", recovered_errors, "          ", end = '\r')
            with open('company_list.pkl', 'wb') as f:
                dump(self.yh_tickers, f, compress='lz4')
        else:
            with open('company_list.pkl', 'rb') as f:
                self.yh_tickers = load(f, decompress='lz4')
        self.missed = [entry for entry in self.yh_tickers if isinstance(entry, str)]

    def process_information(self):
        """Fill the ``Company``, ``Executive`` and ``Industry`` columns of ``self.data``.

        One value is produced per entry of ``self.yh_tickers``, so that list
        must have as many entries as ``self.data`` has rows.
        """
        positions = range(len(self.yh_tickers))
        self.data['Company'] = [self.get_company(t) for t in positions]
        self.data["Executive"] = [self.get_company_officers(t) for t in positions]
        self.data['Industry'] = [self.get_industry(t) for t in positions]

# Build the ticker table: the constructor loads the symbols, fetches the
# company records (or loads them from the cache file) and derives the columns.
tickers = stock_tickers()
# Show the resulting DataFrame as the cell output.
tickers.data

Unnamed: 0,Ticker,Company,Executive,Industry
0,A,"Agilent Technologies, Inc.",Michael McMullen,Diagnostics & Research
1,AA,Alcoa Corporation,Roy Harvey,Aluminum
2,AAC,Ares Acquisition Corporation,David Kaplan,Shell Companies
3,AACG,ATA Creativity Global,Xiaofeng Ma,Education & Training Services
4,AACI,Armada Acquisition Corp. I,Stephen Herbert,Shell Companies
...,...,...,...,...
7677,ZVSAW,,,
7678,ZWS,Zurn Elkay Water Solutions Corporation,Todd Adams,Pollution & Treatment Controls
7679,ZYME,Zymeworks Inc.,Ali Tehrani PhD,Biotechnology
7680,ZYNE,"Zynerba Pharmaceuticals, Inc.",Armando Anido MBA,Drug Manufacturers—Specialty & Generic


In [15]:
# Sanity check: walk the table until the TSLA row is found, then print its
# company name, symbol and row number. `count` tallies the rows skipped on the way.
count = 0
for ticker, symbol in enumerate(tickers.data['Ticker']):
    if symbol == "TSLA":
        print(tickers.data['Company'][ticker])
        print(symbol)
        print(ticker)
        break
    count += 1

Tesla, Inc.
TSLA
6981


In [16]:
# Take the symbol stored at row position 6 and wrap it in a yahooquery Ticker
# so its records can be inspected by hand in the next cell.
a = tickers.data['Ticker'].iloc[6]
ttt = Ticker(a)

In [17]:
# Scratch experiment: look up the quote-type record on its own (the value is
# discarded because it is not the last expression of the cell).
ttt.quote_type[a]
# Merge the asset profile with the quote-type record; quote_type is unpacked
# last, so its values win on duplicate keys. This is the cell's output.
{**ttt.asset_profile[a], **ttt.quote_type[a]}
# The equivalent merge as written inside stock_tickers.get_company_list:
#self.yh_tickers[t] = {**self.yh_tickers[t], **more_info}

{'address1': '6862 Elm Street',
 'address2': 'Suite 320',
 'city': 'McLean',
 'state': 'VA',
 'zip': '22101-1720',
 'country': 'United States',
 'phone': '703 373 0200',
 'fax': '703 373 0680',
 'website': 'https://www.arlingtonasset.com',
 'industry': 'REIT—Mortgage',
 'sector': 'Real Estate',
 'longBusinessSummary': 'Arlington Asset Investment Corp. (NYSE: AAIC) currently invests primarily in mortgage related and residential real estate and has elected to be taxed as a REIT. The Company is headquartered in the Washington, D.C. metropolitan area. For more information, please visit www.arlingtonasset.com.',
 'fullTimeEmployees': 11,
 'companyOfficers': [{'maxAge': 1,
   'name': 'Mr. J. Rock Tonkel Jr., CPA',
   'age': 59,
   'title': 'Pres, CEO & Director',
   'yearBorn': 1963,
   'fiscalYear': 2021,
   'totalPay': 1618545,
   'exercisedValue': 0,
   'unexercisedValue': 0},
  {'maxAge': 1,
   'name': 'Mr. Richard E. Konzmann',
   'age': 54,
   'title': 'Exec. VP, Treasurer & CFO',
   '

In [18]:
# Scratch inspection of a cached record. The first assignment is overwritten
# right away, so only the record at position 7 is displayed.
officers = tickers.yh_tickers[6436]
officers = tickers.yh_tickers[7]
# NOTE: despite its name this holds the whole cached record for the symbol,
# not only its officers.
officers

{'exchange': 'NYQ',
 'quoteType': 'EQUITY',
 'symbol': 'AAIN',
 'underlyingSymbol': 'AAIN',
 'shortName': 'Arlington Asset Investment Corp',
 'longName': None,
 'firstTradeDateEpochUtc': '2021-07-19 08:30:00',
 'timeZoneFullName': 'America/New_York',
 'timeZoneShortName': 'EST',
 'uuid': '6a83b0b9-f174-3be1-940d-58834cd7fd8d',
 'gmtOffSetMilliseconds': -18000000,
 'maxAge': 1}

In [19]:
# For every stock ticker symbol in stock_symbols, lookup the top executive and the company name.


Datasets:
We need a historical dataset of stock prices.
Secondly, we need a dataset of web trends for the same time period. 

On the other hand, once the model is trained, it needs to be able to acquire live data that looks the same as the training data.

Possible sources:
   - twitter
   - google trends

Possible models:
    - Train a model to process text into an encoding
    - Train a model to do stock price prediction
    - Combine the two models to predict stock price/volatility (not sure which is best) from text with greater accuracy

In [20]:
# Collect text web data

In [21]:
# Collect stock data

In [22]:
# 