        Prototyping notebook for predicting stock volatility, prices, etc. using extra data from web trends, news, etc. 

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
pd.set_option('io.parquet.engine', 'pyarrow')
import numpy as np
import seaborn
from glob import glob
from compress_pickle import dump, load
from matplotlib import pyplot as plt
import os
from yahooquery import Ticker
import timeit
import time
import gc, os
from time import sleep
import datetime
from getpass import getpass
from shutil import rmtree
from utils import fernet_key_encryption, aquire_stock_search_terms as aquire_terms, get_macroeconomic_data as macro_data, download_dataset, save_file, load_file, interpolate_months_to_days, intersect_df, parse_emotion_dataframes, get_emotion_df, create_triplets,  get_datasets, convert_project_files_to_parquet
from models import siamese_model_dense, triplet_loss
import transformers
from tqdm.notebook import tqdm
import gc
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers, models, preprocessing, callbacks, optimizers, losses, metrics
import keras_tuner as kt
from shutil import rmtree

# enable memory growth for GPU and mixed precision
physical_devices = tf.config.list_physical_devices('GPU')
try:
    import subprocess
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
    vram = tf.config.experimental.get_memory_growth(physical_devices[0])
    print('Memory growth:', vram)
except:
    # Invalid device or cannot modify virtual devices once initialized.
    print('Invalid device or cannot modify virtual devices once initialized.')
    print(physical_devices)

seed=42

np.random.seed(seed)
tf.random.set_seed(seed)

# use this for reproducible random sampling
rng = np.random.default_rng(seed=seed) 

# if you don't need the API downloads, you can set this to False
use_api = True

In [None]:
def available_mem():
    """Return the available GPU memory in GB."""
    MB_memory = int("".join([x for x in subprocess.check_output(["nvidia-smi", "--query-gpu=memory.free", "--format=csv"]).decode() if x.isdigit()]))
    GB_memory = MB_memory / 1000
    return GB_memory

print('Available GPU memory:', available_mem(), 'GB')

In [None]:
# After this first run, you will only need to enter the password to load the api keys.
# If you need to change the keys or password, delete the relevent .secret keys file and run this section again.
# salt.secret is a non-sensitive file that is used to both generate the encryption key as well as decryption. If this key is lost, the encrypted files are lost and you will need to re-enter the api keys.

if use_api:
    # Ask for input of password to save API keys
    password = getpass("Enter password to save/load API keys: ");

    kaggle_api_key = fernet_key_encryption(password, 'Kaggle');
    #td_ameritrade_api_key = fernet_key_encryption(password, 'TD_Ameritrade')

    #data.nasdaq.com api key through quandle
    data_nasdaq_key = fernet_key_encryption(password, 'Nasdaq');

    del password;
    gc.collect();

    get_datasets(kaggle_api_key, data_nasdaq_key)

In [None]:
# Gather the company info for all the ticker symbols and return a dataframe with relevant search terms for each company.
# If the stocks dataset is updated on kaggle, compank_list.pkl needs to be deleted and this run again if the symbols have changed. 
# TODO: It would be more efficient to manually pull the new stock data ourselves and keep the old ticker symbols.
search_terms = aquire_terms('data/Stock/')
search_terms.data

In [None]:
df1 = load_file('data/Macro/US_macroeconomics.parquet')
df2 = load_file('data\Stock\AAPL.parquet')

df1, df2 = intersect_df(df1, df2, interpolate_to_days=True, extend_trend_to_today=False) # extend_trend_to_today should only be used when the macro data is recent.
df1


In [None]:
emotion_df = get_emotion_df()
x = np.array([x for x in emotion_df['text'].values])
y = emotion_df[[x for x in emotion_df.columns if x != 'text']].values.astype('float32')
x_shape = x[0].shape
label_shape = y[0].shape
x.shape, y.shape
del emotion_df
gc.collect();

In [None]:
bs = 32
#bs = int(bs * available_mem() // 8) # very conservative estimate of batch size based on available GPU memory

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=seed)
triplet_train = create_triplets(x_train, y_train, batch_size=bs, shuffle=True, seed=seed)
triplet_test = create_triplets(x_test, y_test, batch_size=int(bs * 0.2), shuffle=True, seed=seed)
del x, y

    Given that the text emotion dataset is highly imbalanced, we will construct a siamese model to create an embedding that is separable between the ~30 classes of emotions. This will give the classes equal probability of being accessed as well as serve as N^3 dataset augmentation. We can also easily compare the accuracy of the siamese network by adding a different head to the model and training it on a simpler unbalanced method.

In [None]:

#model_siamese, model_encoder, model_inference = siamese_model(x_shape, label_shape)


In [None]:
x_shape, label_shape

In [None]:
# Define the custom callback function
try:
    class CustomCallback(keras.callbacks.Callback):
        def on_epoch_end(self, epoch, logs=None):
            # Log the validation loss at the end of each epoch
            val_triplet_loss = logs['val_triplet_loss']
            val_loss = logs['val_triplet_loss']

    best_p = None
    tuner = kt.Hyperband(siamese_model_dense,
                            objective=kt.Objective("val_triplet_loss", direction="min"),
                            hyperband_iterations=5,
                            seed=seed,
                            max_epochs=5,
                            executions_per_trial=1,
                            factor=3,
                            directory='tuning',
                            max_consecutive_failed_trials=10,
                            project_name='siamese_emotion')


    tuner.search(triplet_train.generator(),
                epochs=5,
                validation_data=triplet_test.generator(),
                initial_epoch=1,
                steps_per_epoch=triplet_train.num_batches//2,
                validation_steps=triplet_test.num_batches,
                callbacks=[
                    tf.keras.callbacks.EarlyStopping(patience=0, monitor='val_triplet_loss'), 
                    CustomCallback()]
                )

    best_p = tuner.get_best_hyperparameters()[0]

# Make sure the tuner cleanly exits if it is canceled by keyboard interrupt. The printed values should remain intact.
# If fewer than 4 trials have been run, assume the tuner settings/model needs to be altered and delete the tuning directory.
except KeyboardInterrupt as e:
    if len(glob('tuning/*')) < 4:
        rmtree('tuning/', ignore_errors=True)
        rmtree('__pycache__/', ignore_errors=True)
    print('Tuning canceled by user.')

except Exception as e:
    if len(glob('tuning/*')) < 4:
        rmtree('tuning/', ignore_errors=True)
        rmtree('__pycache__/', ignore_errors=True)
    print('Tuning failed.')
    raise e

In [None]:
best_p = tuner.get_best_hyperparameters()[0]

In [None]:
best_p.values

In [None]:
model = tuner.hypermodel.build(best_p)

In [None]:
history = model.fit(triplet_train, epochs=10, validation_data=triplet_test, callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)])

In [None]:
# history_siamese = model_siamese.fit(triplet_train.generator(), steps_per_epoch=triplet_train.num_batches, epochs=100, validation_data=triplet_test.generator(), validation_steps=triplet_test.num_batches, callbacks=[callbacks.EarlyStopping(patience=2, restore_best_weights=True)])

# # freeze the layers
# for layer in model_siamese.layers:
#     layer.trainable = False
# fig = plt.figure(figsize=(10, 5))
# plt.plot(history_siamese.history['loss'], label='train')
# plt.plot(history_siamese.history['val_loss'], label='test')
# plt.legend()
# plt.show()

In [None]:
# history = model_inference.fit(x_train, y_train, batch_size=32, epochs=100, validation_data=(x_test, y_test), callbacks=[callbacks.EarlyStopping(patience=2, restore_best_weights=True)])

In [None]:
fig = plt.figure(figsize=(10, 5))
plt.plot(history.history['loss'], label='train')
plt.plot(history.history['val_loss'], label='test')
plt.legend()
plt.show()

In [None]:
import fasttext
import urllib.request
import zipfile
import shutil
import os

def download_file(url, filename, move_to=None):
    """Downloads a file from a url and saves it to the specified filename.
    
    Args:
        url (str): The url to download the file from.
        filename (str): The filename to save the file to.
    """
    if not os.path.exists(move_to + filename) and not os.path.exists(move_to + filename.replace(filename.split('.')[-1], 'parquet')):
        # create the directory if it doesn't exist
        
        with urllib.request.urlopen(url) as response, open(filename, 'wb') as out_file:
            data = response.read() # a `bytes` object
            out_file.write(data)

    if move_to is not None:
        if not os.path.exists(os.path.dirname(move_to)):
            os.makedirs(os.path.dirname(move_to), exist_ok=True)
        if not os.path.exists(move_to + filename) and not os.path.exists(move_to + filename.replace(filename.split('.')[-1], 'parquet')):
            shutil.move(filename, move_to)


In [None]:

def get_emotion_df():
    """Parses the emotion dataframes and returns a dataframe with the emotion data that has been tokenized using of of the fasttext larger english cbow models.
    
    This saves a little bit of the processing time at the expense of storage space.
    
    Returns:
        pd.DataFrame"""
    if os.path.exists('data/Emotions/emotion_df.parquet'):
        emotion_df = load_file('data/Emotions/emotion_df.parquet')
    else:
        emotion_df = parse_emotion_dataframes([0, 1, 2, 3, 4], ensure_only_one_label=True)
        #drop duplicates
        emotion_df = emotion_df.drop_duplicates(subset=['text'])
        if not os.path.exists('data/Emotions/crawl-300d-2M-subword.bin'):
            #fasttext.util.download_model('en', if_exists='ignore')
            
            # download file https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M-subword.zip
            download_file('https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M-subword.zip', 'crawl-300d-2M-subword.zip')
            # unzip the file

            with zipfile.ZipFile('crawl-300d-2M-subword.zip', 'r') as zip_ref:
                zip_ref.extract('crawl-300d-2M-subword.bin')
            # rename the file

            os.rename('crawl-300d-2M-subword.bin', 'data/Emotions/crawl-300d-2M-subword.bin')
            os.remove('crawl-300d-2M-subword.zip')
            fast_text_model = fasttext.load_model('data/Emotions/crawl-300d-2M-subword.bin')
        else:
            fast_text_model = fasttext.load_model('data/Emotions/crawl-300d-2M-subword.bin')
        
        list_of_emotions = emotion_df.columns[1:]
        
        
        # preprocess the text data using the fasttext model
        emotion_df['text'] = emotion_df['text'].apply(lambda x: fast_text_model.get_sentence_vector(x))
        # save the preprocessed data to a file as a parquet file
        #save_file(emotion_df, 'data/Emotions/emotion_df.parquet')
    emotion_df.dropna(inplace=True)
    return emotion_df

emotion_df = get_emotion_df()
emotion_df.text[0].shape

In [None]:
emotion_df = parse_emotion_dataframes([0, 1, 2, 3, 4], ensure_only_one_label=True)

In [None]:
# get counts of each hot encoded emotion
emotion_df_counts = emotion_df[emotion_df.columns[1:]].sum().sort_values(ascending=False)


In [None]:
emotion_df_counts

In [None]:
import fasttext
from tqdm.notebook import tqdm
from nltk.corpus import wordnet as wn

words = []
for word in emotion_df.text:
    words.extend(word.split())
words = list(set(words))

def get_synonyms(word):
    synonyms = set()
    for syn in wn.synsets(word):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return list(synonyms)
# find the form of a word using wordnet
def get_wordnet_pos(word):

    pos = wn.synsets(word)
    if pos.__len__() > 0:
       pos = pos[0].pos()
    else:
        return ''

    return pos


In [None]:

words_df = pd.DataFrame(words, columns=['word'])



import emoji
import string

# Check whether a word is an emoji. If it is, then return a one word description of the emoji
def convert_emojis(s):
    return emoji.demojize(s)

words_df['word'] = words_df['word'].apply(lambda x: convert_emojis(x))
def replace_punctuation(s):
    """Replace puntuation with spaces"""
    list_of_punctuation = string.punctuation

    for p in list_of_punctuation:
        s = s.replace(p, ' ')
    return s.lower()

words_df['word'] = words_df['word'].apply(lambda x: replace_punctuation(x))

#if there are rows with spaces between words, then split them into separate rows
words_df_split = words_df[words_df['word'].str.contains(' ')].copy()
words_df = words_df[~words_df['word'].str.contains(' ')]
words_df_split['word'] = words_df_split['word'].apply(lambda x: x.split(' '))
words_df_split = words_df_split.explode('word')
words_df = pd.concat([words_df, words_df_split], axis=0)

# remove duplicates
words_df = words_df.drop_duplicates(subset=['word'])
# remove words that are just numbers
words_df = words_df[~words_df['word'].str.isnumeric()]
words_df['pos'] = words_df['word'].apply(lambda x: get_wordnet_pos(x))
words_df = words_df[words_df['pos']!='']
words_df = words_df[words_df['word'].apply(lambda x: x.__len__()>4)]
len(words_df)

In [None]:
# print all values of d
words_df.pos.value_counts()

In [None]:
def get_synonyms(word, pos):
    synonyms = set()
    for syn in wn.synsets(word, pos=pos):
        for lemma in syn.lemmas():
            synonyms.add(lemma.name())
    return list(synonyms)

words_df['synonyms'] = words_df.apply(lambda x: get_synonyms(word=x['word'], pos=x['pos']), axis=1)
# if synonyms is empty, then remove the row
words_df = words_df[words_df['synonyms'].apply(lambda x: x.__len__()>0)]

# if len synonyms is 1 and the synonym is the same as the word, then remove the row
words_df = words_df[~((words_df['synonyms'].apply(lambda x: x.__len__()==1)) & (words_df['synonyms'].apply(lambda x: x[0])==words_df['word']))]

words_df = words_df.reset_index(drop=True)

In [None]:
#pyspellchecker 
from spellchecker import SpellChecker

def get_misspellings(file_path):
    
    misspellings = load_file(file_path)
    misspellings = np.squeeze(misspellings.values).tolist()
    if ":" in misspellings[0]:
        misspellings_dict = {key: value_split_list.replace(',', '').replace('\n', '').split(' ') for key, value_split_list in [x.split(':') for x in misspellings]}
        for key, value in misspellings_dict.items():
            misspellings_dict[key] = [x for x in value if x not in [key, '', None] and "*" not in x]
    elif "$" in "".join(misspellings[0:100]):
        misspellings_list = [[] for x in range(0, misspellings.__len__())]
        #misspellings = [x.replace('\n', '') for x in misspellings]
        spellings_list = []
        for spelling_id in range(len(misspellings)):
            word = misspellings[spelling_id]
            if word != None:
                
                if "$" in word: 
                    spellings_list.append(word.replace("$", ""))
                else:
                    misspellings_list[spelling_id].append(word)

        misspellings_dict = {}
        last_key = ""
        
        for i in range(len(spellings_list)):
            if "$" in misspellings[i]:
                last_key = misspellings[i].replace("$", "")
                misspellings_dict[last_key] = []
            else:
                if last_key != "":
                    misspellings_dict[last_key].append([misspellings[i]])
        for key, value in misspellings_dict.items():
            misspellings_dict[key] = [x[0] for x in value if x not in [key, '', None] and "*" not in x]
    return misspellings_dict


urls = ['https://www.dcs.bbk.ac.uk/~ROGER/missp.dat', 'https://www.dcs.bbk.ac.uk/~ROGER/aspell.dat', 'https://www.dcs.bbk.ac.uk/~ROGER/wikipedia.dat', 'https://norvig.com/ngrams/spell-errors.txt']
for file in urls:
        download_file(file, file.split('/')[-1], move_to='data/Misspellings/')
convert_project_files_to_parquet()
urls = [x.replace(x.split('.')[-1], 'parquet') for x in urls]

In [None]:
list_dicts = [get_misspellings("data/Misspellings/" + x.split('/')[-1]) for x in urls]

In [None]:

def load_spelling_dictionaries(list_of_files_to_load):
    spell = SpellChecker()
    for file in list_of_files_to_load:
        download_file(file, file.split('/')[-1], move_to='data/Misspellings/')
    convert_project_files_to_parquet()
    list_dicts = [get_misspellings("data/Misspellings/" + x.split('/')[-1]) for x in list_of_files_to_load]
    # set of all keys in n dictionaries
    union_keys = list(set(list_dicts[0]).union(*list_dicts[1:]))
    misspellings_dict = {}
    for key in union_keys:
        misspellings_dict[key] = []
        for misspellings_dict_n in list_dicts:
            if key in misspellings_dict_n:
                misspellings_dict[key].extend(misspellings_dict_n[key])

    for key in misspellings_dict:
        misspellings_dict[key] = list(set(misspellings_dict[key]))
    
    index_to_remove = []
    for key, value in misspellings_dict.items():
        for word in value:
            if len(spell.unknown([word])) == 0 or word == '':
                index_to_remove.append((key, word))
    
    for key, word in index_to_remove:
        misspellings_dict[key].remove(word)

    # remove empty lists
    for key in list(misspellings_dict):
        if not misspellings_dict[key]:
            del misspellings_dict[key]

    return misspellings_dict



misspellings_dict = load_spelling_dictionaries(urls)

def get_spell_variants(word, misspellings_dict):
    if word in misspellings_dict:
        return misspellings_dict[word]
    else:
        return []

# collect all misspellings for each word (str) and synonym (list) in words_df
words_df['misspellings'] = words_df['word'].apply(lambda x: get_spell_variants(x, misspellings_dict))
words_df['synonym_misspellings'] = words_df['synonyms'].apply(lambda x: [get_spell_variants(y, misspellings_dict) for y in x])
# concat list of lists into one list
words_df['synonym_misspellings'] = words_df['synonym_misspellings'].apply(lambda x: [item for sublist in x for item in sublist])
# combine misspellings and synonym_misspellings and remove duplicates
words_df['misspellings'] = words_df.apply(lambda x: list(set(x['misspellings'] + x['synonym_misspellings'])), axis=1)
words_df.drop(columns=['synonym_misspellings'], inplace=True)
words_df