Prototyping notebook for predicting stock volatility, prices, etc., using extra data from web trends, news, and similar sources.

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from glob import glob
from pickle import dump, load
import os
from yahooquery import Ticker
import timeit

1) Download and extract https://www.kaggle.com/datasets/footballjoe789/us-stock-dataset

2) Copy Stocks/ to the same directory as this notebook. You can delete the other files as they are not used (yet, at least).

In [2]:
class stock_tickers():
    """A work-in-progress. Goal is to take stock ticker symbols and return a list of search terms for NLP web scraping. Officers, affiliated companies, company name, etc.

    Constructing an instance runs the whole pipeline: small data files are
    deleted, ticker symbols are read from the remaining file names, the raw
    Yahoo asset profiles are downloaded (or read from a pickle cache), and the
    results are summarized in ``self.data``.
    """

    # Instance attributes set by the pipeline:
    #   stocks_symbols - list of ticker symbols derived from the data file names
    #   yh_tickers     - one raw asset-profile entry per symbol, index-aligned with
    #                    stocks_symbols; an entry may be a plain error string
    #                    instead of a dict when no profile was returned
    #   data           - DataFrame with columns Ticker, Company, Executive, Industry

    # Words to break on but also keep in the name.
    _KEEP_BREAK_WORDS = (' Inc ', ' Corp ', ' Ltd ', ' LLC ', ' LTD ', ' Corporation ', ' Company ', ' Group ', ' Holdings ', ' Systems ', ' Technologies ', ' Technology ', ' Services ', ' Solutions ', " Bancorp ", " Limited ", " develops ", " researches ", "S.A", " Inc.", " N.V", " Co ", " Corp. ")

    # Exclusive words to break on (the word itself is dropped).
    _DROP_BREAK_WORDS = (" is ", " and sells ", " does ", " engages ", " operates ", " provides ", " de ", " focuses ", " supports ", " - ", " distributes ", " specializes ", " (", " originates ", " owns ", " acquires ", " invests ", " offers ", " through ", " together ", " a ", " an ", " supplies ", "explores ", " designs ", " formerly ", " manufactures ", " holds ", " included ", " ranks ")

    # Words to break on and remove from the front of the name.
    _FRONT_BREAK_WORDS = ("2022", "2023", "2024", "2025", "2026", "2027", "2028", "2029", "2030", " VA ")

    def __init__(self, file_path = 'Stocks/', file_ext = '.csv'):
        """Run the full load pipeline for the data files in ``file_path``."""
        self.delete_small_files(file_path, file_ext, min_size = 1000)
        self.load_symbols(file_path, file_ext)
        self.ticker_list_to_dataframe()
        self.get_company_list()
        self.process_information()

    @staticmethod
    def _symbol_files(file_path, file_ext):
        """Return the sorted paths of all ``*file_ext`` files in ``file_path``."""
        # glob returns matches in arbitrary order; sorting keeps the symbol list
        # (and therefore the index-aligned profile cache) stable between runs.
        return sorted(glob(os.path.join(file_path, '*' + file_ext)))

    @staticmethod
    def _symbol_from_path(path, file_ext):
        """Return the ticker symbol encoded in a data file path."""
        base = os.path.basename(path)  # handles both '/' and the platform separator
        if file_ext and base.endswith(file_ext):
            return base[:-len(file_ext)]
        return os.path.splitext(base)[0]

    def load_symbols(self, file_path = 'Stocks/', file_ext = '.csv'):
        """Find all the stock symbols in the directory and return a list of them.

        The list is also stored on ``self.stocks_symbols``.
        """
        self.stocks_symbols = [self._symbol_from_path(path, file_ext) for path in self._symbol_files(file_path, file_ext)]
        return self.stocks_symbols

    def delete_small_files(self, file_path = 'Stocks/', file_ext = '.csv', min_size = 1000):
        """Delete any files smaller than min_size bytes.

        WARNING: this permanently removes files from ``file_path``. An empty
        directory is a no-op. ``self.stocks_symbols`` is refreshed afterwards.
        """
        for path in self._symbol_files(file_path, file_ext):
            if os.path.getsize(path) < min_size:
                os.remove(path)
        self.load_symbols(file_path, file_ext)

    def ticker_list_to_dataframe(self):
        """Create ``self.data`` with a single 'Ticker' column from the loaded symbols."""
        self.data = pd.DataFrame(self.stocks_symbols, columns = ['Ticker'])

    def _profile(self, index):
        """Return the profile dict stored at ``index``, or None if unavailable.

        None is returned when ``index`` is not an integer (Python or numpy) or
        when the stored entry is not a dict, e.g. an error string.
        """
        if isinstance(index, bool) or not isinstance(index, (int, np.integer)):
            return None
        entry = self.yh_tickers[index]
        return entry if isinstance(entry, dict) else None

    def get_company(self, index):
        """Return the company name for a given ticker.

        The name is carved heuristically out of the profile's
        'longBusinessSummary' text using the class-level break-word lists.
        Returns "" when no summary is available or nothing is left after trimming.
        """
        profile = self._profile(index)
        if profile is None:
            return ""
        summary = profile.get('longBusinessSummary')
        if not isinstance(summary, str):
            return ""

        name = summary.replace(". ", " ").replace(",", "")

        # Cut after the first occurrence of each "keep" word, retaining the word.
        for word in self._KEEP_BREAK_WORDS:
            if word in name:
                name = (name.split(word)[0] + word).strip()

        # Cut before the first occurrence of each "drop" word.
        for word in self._DROP_BREAK_WORDS:
            if word in name:
                name = name.split(word)[0]

        # Keep only what follows the last occurrence of each "front" word.
        for word in self._FRONT_BREAK_WORDS:
            if word in name:
                name = name.split(word)[-1].strip()

        # endswith() is safe on an empty string, unlike name[-1].
        if name.endswith("."):
            name = name[:-1]

        return name

    def get_company_officers(self, index):
        """Return the cleaned name of the officer with the largest 'unexercisedValue'.

        Honorifics, periods and single-character tokens (initials) are removed.
        Returns "" when the profile lists no officers.
        """
        profile = self._profile(index)
        if profile is None:
            return ""
        officers = profile.get("companyOfficers")
        if not officers:
            return ""
        # Officers without an 'unexercisedValue' (missing or None) rank as 0.
        officers = sorted(officers, key = lambda x: x.get('unexercisedValue') or 0, reverse = True)
        top_name = officers[0].get('name') or ""
        parts = top_name.replace("Mr. ", "").replace("Ms. ", "").replace("Dr. ", "").replace("Mrs. ", "").replace(".", "").split(" ")
        return " ".join(part for part in parts if len(part) > 1)

    def get_industry(self, index):
        """Return the industry for a given ticker."""
        profile = self._profile(index)
        if profile is None:
            return ""
        return profile.get('industry', "")

    def get_company_list(self, cache_path = 'company_list.pkl'):
        """Return a list of company information for later processing. Caches results to file.

        The result is stored on ``self.yh_tickers``. If ``cache_path`` exists it
        is loaded instead of querying Yahoo; delete the file to force a refresh.
        """
        if os.path.exists(cache_path):
            # NOTE(security): pickle can execute arbitrary code on load; only
            # use a cache file that this notebook produced itself.
            with open(cache_path, 'rb') as f:
                self.yh_tickers = load(f)
            return

        total = len(self.stocks_symbols)
        profiles = []
        start = timeit.default_timer()
        for t, sym in enumerate(self.stocks_symbols):
            time_now = timeit.default_timer()
            print(f"{round(100*t/total,1)}% complete.", "Estimated time left:", round((time_now-start)*(total-t)/(t+1),0), "seconds. Symbol:", sym, end = '\r')
            profiles.append(Ticker(sym).asset_profile[sym])
        self.yh_tickers = profiles
        with open(cache_path, 'wb') as f:
            dump(self.yh_tickers, f)

    def process_information(self):
        """Add the 'Company', 'Executive' and 'Industry' columns to ``self.data``.

        Raises:
            ValueError: if the profile list and the ticker table differ in
                length (typically a stale cache file).
        """
        if len(self.yh_tickers) != len(self.data):
            raise ValueError(
                f"Profile list has {len(self.yh_tickers)} entries but there are "
                f"{len(self.data)} tickers; the cache file is probably stale - "
                "delete it and re-run."
            )
        positions = range(len(self.yh_tickers))
        self.data['Company'] = [self.get_company(t) for t in positions]
        self.data["Executive"] = [self.get_company_officers(t) for t in positions]
        self.data['Industry'] = [self.get_industry(t) for t in positions]

# Build the ticker table from the default data folder (arguments spelled out
# for readability; they equal the constructor defaults), then display it.
tickers = stock_tickers(file_path='Stocks/', file_ext='.csv')
tickers.data

Unnamed: 0,Ticker,Company,Executive,Industry
0,A,Agilent Technologies,Michael McMullen,Diagnostics & Research
1,AA,Alcoa Corporation,Roy Harvey,Aluminum
2,AAC,Ares Acquisition Corporation,David Kaplan,Shell Companies
3,AACG,ATA Creativity Global,Xiaofeng Ma,Education & Training Services
4,AACI,Armada Acquisition Corp,Stephen Herbert,Shell Companies
...,...,...,...,...
7100,ZVSA,ZyVersa Therapeutics Inc,Stephen Glover,Biotechnology
7101,ZWS,Zurn Elkay Water Solutions,Todd Adams,Pollution & Treatment Controls
7102,ZYME,Zymeworks Inc,Ali Tehrani PhD,Biotechnology
7103,ZYNE,Zynerba Pharmaceuticals Inc,Armando Anido MBA,Drug Manufacturers—Specialty & Generic


In [3]:
# Locate the first row whose ticker is TSLA and print its company name,
# symbol and row position. `count` tallies the rows skipped before the match.
count = 0
for ticker, symbol in enumerate(tickers.data['Ticker']):
    if symbol == "TSLA":
        print(tickers.data['Company'][ticker])
        print(symbol)
        print(ticker)
        break
    count += 1

Tesla Inc
TSLA
6436


In [4]:
# Print every extracted company name, one per line, to eyeball the quality of
# the name-extraction heuristics (blank lines are tickers with no name found).
company_column = tickers.data['Company']
for name in company_column:
    print(name)

Agilent Technologies
Alcoa Corporation
Ares Acquisition Corporation
ATA Creativity Global
Armada Acquisition Corp
Aadi Bioscience Inc
Arlington Asset Investment Corp

American Airlines Group

Altisource Asset Management Corporation
Atlantic American Corporation
The Aaron's Company
Applied Optoelectronics Inc
AAON Inc
Advance Auto Parts Inc
Apple Inc
American Assets Trust Inc
Autoscope Technologies
Almaden Minerals Ltd
Atlas Air Worldwide Holdings
AllianceBernstein Holding L.P
ABB Ltd
AbbVie Inc
AmerisourceBergen Corporation
Ameris Bancorp
AbCellera Biologics Inc
Abcam plc
Abeona Therapeutics Inc
Ambev S.A
Asbury Automotive Group
ABG Acquisition Corp
ARCA biopharma Inc
ABM Industries Incorporated
Abiomed Inc
Airbnb Inc
Acumen Pharmaceuticals Inc
Arbor Realty Trust Inc
Absci Corporation
Absolute Software Corporation
Abbott Laboratories
Arbutus Biopharma Corporation
ABVC BioPharma Inc
Associated Capital Group
Arcosa Inc
Atlantic Coastal Acquisition Corp
Acri Capital Acquisition Corporatio

In [11]:
# Inspect the raw cached entry at position 7. The previous lookup of
# position 6436 was a dead store (immediately overwritten), so it was removed.
# Despite its name, `officers` holds the whole entry for that ticker, which can
# be an error string rather than a profile dict.
officers = tickers.yh_tickers[7]
officers

'No fundamentals data found for any of the summaryTypes=assetProfile'

In [6]:
# For every stock ticker symbol in stock_symbols, lookup the top executive and the company name.


Datasets:
We need a historical dataset of stock prices.
Secondly, we need a dataset of web trends for the same time period. 

On the other hand, once the model is trained, it needs to be able to acquire live data that looks the same as the training data.

Possible sources:
   - Twitter
   - Google Trends

Possible models:
    - Train a model to process text into an encoding
    - Train a model to do stock price prediction
    - Combine the two models to predict stock price/volatility (not sure which is best) from text with greater accuracy

In [7]:
# Collect text web data

In [8]:
# Collect stock data

In [9]:
# 