        Prototyping notebook for predicting stock volatility, prices, etc. using extra data from web trends, news, and similar sources.

In [None]:
# Ensures that the following packages are installed (will not check for versioning as long as everything is installed, else it will install the last Tensorflow version supported by native windows 11/10)
from importlib.util import find_spec
packages = "compress_pickle[lz4] pandas numpy yahooquery cryptography seaborn kaggle pyarrow transformers fasttext tensorflow==2.10.1 keras-tuner scikit-learn tensorflow_datasets tensorflow-text lime"

not_installed = [package_name.split('[')[0].split('=')[0].replace('scikit-learn','sklearn').replace('-','_') for package_name in packages.split(" ")] # remove versioning and other stuff
not_installed = [package_name for package_name in not_installed if find_spec(package_name) is None] # check if package is installed

if len(not_installed) > 0:
    %pip install -U $packages

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
pd.set_option('io.parquet.engine', 'pyarrow')
import numpy as np
import seaborn
from glob import glob
from compress_pickle import dump, load
from matplotlib import pyplot as plt
import os
from yahooquery import Ticker
import timeit
import time
import gc, os
import datetime
from getpass import getpass
from shutil import rmtree
from utils import fernet_key_encryption, aquire_stock_search_terms as aquire_terms, get_macroeconomic_data as macro_data, download_datasets, save_file, load_file, interpolate_months_to_days, intersect_df, parse_emotion_dataframes, get_emotion_df, create_triplets
from models import siamese_model, triplet_loss
import transformers
from tqdm.notebook import tqdm
import gc
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, preprocessing, callbacks, optimizers, losses, metrics
import keras_tuner as kt

    

# Enable on-demand memory growth for the first visible GPU.
# NOTE: mixed precision is not configured anywhere in this cell.
import subprocess  # used by available_mem(); imported outside the try so an import problem is never masked

physical_devices = tf.config.list_physical_devices('GPU')
try:
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
    # despite the name, `vram` holds the memory-growth flag (a bool), not a memory size
    vram = tf.config.experimental.get_memory_growth(physical_devices[0])
    print('Memory growth:', vram)
except (IndexError, ValueError, RuntimeError) as err:
    # IndexError: no GPU was found (empty device list).
    # ValueError / RuntimeError: invalid device or cannot modify virtual devices once initialized.
    # Anything else (e.g. KeyboardInterrupt) is deliberately left to propagate.
    print('Invalid device or cannot modify virtual devices once initialized.')
    print(physical_devices)
    print('Reason:', repr(err))


In [None]:
def available_mem():
    """Return the free memory of the first GPU listed by ``nvidia-smi``, in GB.

    ``nvidia-smi --query-gpu=memory.free --format=csv`` prints a header line followed
    by one line per GPU. Each line is parsed on its own so that, on a machine with
    several GPUs, the per-GPU figures are not glued together into one meaningless
    number; the first line that contains digits is used.

    Returns:
        float: the value reported by ``nvidia-smi`` divided by 1000 (the tool reports
        MiB, so this is an approximate GB figure).

    Raises:
        ValueError: if the output contains no numeric value.
        FileNotFoundError / subprocess.CalledProcessError: if ``nvidia-smi`` is
        missing or exits with an error.
    """
    report = subprocess.check_output(
        ["nvidia-smi", "--query-gpu=memory.free", "--format=csv"]
    ).decode()
    for line in report.splitlines():
        digits = "".join(ch for ch in line if ch.isdigit())
        if digits:
            MB_memory = int(digits)
            return MB_memory / 1000
    raise ValueError("nvidia-smi did not report any free-memory value: %r" % report)

# Query the GPU once and report how much memory is currently free.
free_gpu_gb = available_mem()
print('Available GPU memory:', free_gpu_gb, 'GB')

In [None]:
# Once this cell has been run a first time, only the password is needed to load the API keys.
# To change a key or the password, delete the relevant .secret key file and re-run this cell.
# salt.secret is a non-sensitive file used both to generate the encryption key and to decrypt;
# if it is lost the encrypted files are lost too and the API keys must be entered again.

# Password used to save/load the API keys
password = getpass("Enter password to save/load API keys: ")

kaggle_api_key = fernet_key_encryption(password, 'Kaggle')
#td_ameritrade_api_key = fernet_key_encryption(password, 'TD_Ameritrade')

# data.nasdaq.com API key (via Quandl)
data_nasdaq_key = fernet_key_encryption(password, 'Nasdaq')

# The password is no longer needed once the keys are loaded: drop it right away.
del password
gc.collect();


In [None]:
# If this (or any other) section fails initially, you may need to run it again. Each of these steps
# caches results to disk to speed up future runs and save RAM usage (they will only run once).

# One (url, files_to_move, dest_name) entry per Kaggle dataset, downloaded in this order.
KAGGLE_DATASETS = [
    ('https://www.kaggle.com/datasets/sarthmirashi07/us-macroeconomic-data',
     {'US_macroeconomics.csv': 'macro/US_macroeconomics.csv'},
     'Macro'),
    ('https://www.kaggle.com/datasets/footballjoe789/us-stock-dataset',
     {'us-stock-dataset/Data/Stocks': 'Stocks', 'us-stock-dataset/Stock_List.csv': 'Stock_List.csv'},
     'Stocks'),
    ('https://www.kaggle.com/datasets/mathurinache/goemotions',
     {'goemotions.csv': 'Emotions/goemotions.csv'},
     'Emotions'),
    ('https://www.kaggle.com/datasets/parulpandey/emotion-dataset',
     {'training.csv': 'Emotions/training.csv', 'validation.csv': 'Emotions/validation.csv', 'test.csv': 'Emotions/test.csv'},
     'Emotions'),
    ('https://www.kaggle.com/datasets/kosweet/cleaned-emotion-extraction-dataset-from-twitter',
     {'dataset(clean).csv': 'Emotions/dataset(clean).csv'},
     'Emotions'),
    ('https://www.kaggle.com/datasets/miguelaenlle/massive-stock-news-analysis-db-for-nlpbacktests',
     {'raw_partner_headlines.csv': 'Text/raw_partner_headlines.csv', 'raw_analyst_ratings.csv': 'Text/raw_analyst_ratings.csv', 'analyst_ratings_processed.csv': 'Text/analyst_ratings_processed.csv'},
     'Text'),
]

# kaggle_api_key holds "<username> <key>" separated by a single space.
kaggle_username, kaggle_key = kaggle_api_key.split(' ')
os.environ['KAGGLE_USERNAME'] = kaggle_username
os.environ['KAGGLE_KEY'] = kaggle_key
os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()

try:
    # download the various kaggle datasets
    for dataset_url, dataset_files, dataset_dest in KAGGLE_DATASETS:
        download_datasets(
                dataset_url,
                kaggle_api_key,
                files_to_move=dataset_files,
                delete=True,
                dest_name=dataset_dest)
finally:
    # Clear the username and key from the environment variables even when a download
    # fails, and do not keep the plain-text credentials around in notebook globals.
    os.environ['KAGGLE_USERNAME'] = ""
    os.environ['KAGGLE_KEY'] = ""
    del kaggle_username, kaggle_key

In [None]:
# Collect company info for every ticker symbol and build a dataframe of relevant search terms per company.
# When the stocks dataset is updated on Kaggle and the symbols have changed, delete compank_list.pkl
# and run this cell again.
# TODO: It would be more efficient to manually pull the new stock data ourselves and keep the old ticker symbols.
stocks_dir = 'data/Stocks/'
search_terms = aquire_terms(stocks_dir)
search_terms.data

In [None]:
# Load the US macro-economic data and the AAPL stock data, then pass both through intersect_df.
# Both paths use forward slashes: the previous 'data\Stocks\AAPL.parquet' literal relied on the
# invalid escape sequences "\S" and "\A" (deprecated by Python) and on a Windows-only separator.
df1 = load_file('data/Macro/US_macroeconomics.parquet')
df2 = load_file('data/Stocks/AAPL.parquet')

# extend_trend_to_today should only be used when the macro data is recent.
df1, df2 = intersect_df(df1, df2, interpolate_to_days=True, extend_trend_to_today=False)
df1


In [None]:
# Build the input array (the 'text' column) and the float32 label matrix (every other column)
# from the emotion dataframe.
emotion_df = get_emotion_df()
label_columns = [column for column in emotion_df.columns if column != 'text']
x = np.array(list(emotion_df['text'].values))
y = emotion_df[label_columns].values.astype('float32')

# Shape of a single sample / a single label row; x_shape and label_shape are inspected again further down.
x_shape = x[0].shape
label_shape = y[0].shape

# The dataframe is no longer needed once the arrays exist.
del emotion_df
gc.collect()

# Shown as the cell output. This expression used to sit in the middle of the cell,
# where it was evaluated but never displayed.
x.shape, y.shape

In [None]:
# Split the emotion arrays 80/20 and build one triplet source per split.
split_seed = 42
bs = 32  # training batch size
# Possible alternative, a very conservative batch size derived from the available GPU memory:
#   bs = int(bs * available_mem() // 8)
test_bs = int(bs * 0.2)

x_train, x_test, y_train, y_test = train_test_split(
    x, y,
    test_size=0.2,
    random_state=split_seed,
)
triplet_train = create_triplets(x_train, y_train, batch_size=bs, shuffle=True, seed=split_seed)
triplet_test = create_triplets(x_test, y_test, batch_size=test_bs, shuffle=True, seed=split_seed)

# The full arrays are not needed any more now that the splits exist.
del x, y

    Given that the text emotion dataset is highly imbalanced, we will construct a siamese model to create an embedding that is separable between the ~30 classes of emotions. This will give the classes equal probability of being sampled as well as serve as N^3 dataset augmentation. We can also easily compare the accuracy of the siamese network by adding a different head to the model and training it with a simpler unbalanced method.

In [None]:

#model_siamese, model_encoder, model_inference = siamese_model(x_shape, label_shape)


In [None]:
# Display the shape of a single input sample and of a single label row.
x_shape, label_shape

In [None]:
# Hyperband search with siamese_model as the hypermodel, minimising the "triplet_loss" objective.
# Trial results are stored under tuning/siamese_emotion.
tuning_objective = kt.Objective("triplet_loss", direction="min")
tuner = kt.Hyperband(
    siamese_model,
    objective=tuning_objective,
    factor=3,
    hyperband_iterations=5,
    seed=42,
    directory='tuning',
    project_name='siamese_emotion',
)

In [None]:
# Run the hyper-parameter search; each trial is cut short by early stopping with a patience of 1.
search_callbacks = [tf.keras.callbacks.EarlyStopping(patience=1)]
tuner.search(
    triplet_train,
    validation_data=triplet_test,
    epochs=10,
    callbacks=search_callbacks,
)

In [None]:
# Keep the first (top-ranked) hyper-parameter set returned by the tuner.
best_candidates = tuner.get_best_hyperparameters()
best_p = best_candidates[0]

In [None]:
# Display the values stored in the selected hyper-parameter set.
best_p.values

In [None]:
# Build a fresh model from the tuner's hypermodel using the selected hyper-parameters.
hypermodel = tuner.hypermodel
model = hypermodel.build(best_p)

In [None]:
# Train the rebuilt model for up to 10 epochs, stopping early once val_loss
# has gone 3 epochs without improving.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)
history = model.fit(
    triplet_train,
    validation_data=triplet_test,
    epochs=10,
    callbacks=[early_stopping],
)

In [None]:
# history_siamese = model_siamese.fit(triplet_train.generator(), steps_per_epoch=triplet_train.num_batches, epochs=100, validation_data=triplet_test.generator(), validation_steps=triplet_test.num_batches, callbacks=[callbacks.EarlyStopping(patience=2, restore_best_weights=True)])

# # freeze the layers
# for layer in model_siamese.layers:
#     layer.trainable = False
# fig = plt.figure(figsize=(10, 5))
# plt.plot(history_siamese.history['loss'], label='train')
# plt.plot(history_siamese.history['val_loss'], label='test')
# plt.legend()
# plt.show()

In [None]:
# history = model_inference.fit(x_train, y_train, batch_size=32, epochs=100, validation_data=(x_test, y_test), callbacks=[callbacks.EarlyStopping(patience=2, restore_best_weights=True)])

In [None]:
# Plot the loss curves recorded by model.fit: 'train' is the training loss,
# 'test' is the loss on the validation data passed to fit (val_loss).
# The explicit Axes interface is used and the axes are labelled so the figure can be read on its own.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(history.history['loss'], label='train')
ax.plot(history.history['val_loss'], label='test')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.set_title('Training vs. validation loss')
ax.legend()
plt.show()