        Prototyping notebook for predicting stock volatility, prices, etc. using extra data from web trends, news, etc.

Python packages needed: compress_pickle[lz4] pandas numpy yahooquery cryptography seaborn kaggle pyarrow

If for some reason you wish to download the data yourself, you can do so by following these steps:

1) Download and extract https://www.kaggle.com/datasets/footballjoe789/us-stock-dataset

2) From the stock dataset, copy "Stock_List.csv" and "Stocks/*" to "data/". You can delete the other files, as they are not used (at least not yet).

In [None]:
# Import every dependency of the notebook inside one guarded cell, so that a
# missing package prints an install hint alongside the interpreter's message.
try:
    import pandas as pd
    # Select pyarrow as the engine pandas uses for parquet I/O.
    pd.set_option('io.parquet.engine', 'pyarrow')
    import numpy as np
    import seaborn
    from glob import glob
    from compress_pickle import dump, load
    import os
    from yahooquery import Ticker
    import timeit
    import time
    # NOTE(review): `os` is already imported a few lines above; the repeat is redundant but harmless.
    import gc, os
    import datetime
    from getpass import getpass
    from shutil import rmtree
    # Helpers from the project's own `utils` module, several under shorter aliases.
    from utils import fernet_key_encryption, aquire_stock_search_terms as aquire_terms, get_macroeconomic_data as macro_data, download_datasets, load_file, interpolate_months_to_days, intersect_df, parse_emotion_dataframes
except ModuleNotFoundError as e:
    # NOTE(review): the error is printed but not re-raised, so execution continues and
    # every name whose import statement did not run stays undefined for the later cells.
    print(e)
    print('Please install the missing module(s)')
    # NOTE(review): this hint also lists cryptography and kaggle, which are not imported
    # directly in this cell -- presumably they are needed by `utils`; confirm.
    print("pip install compress_pickle[lz4] pandas numpy yahooquery cryptography seaborn kaggle pyarrow")
    


In [None]:
# API-key setup.
#
# On the first run the keys are entered and stored encrypted; afterwards only the
# password is needed to load them again.
# To change a key or the password, delete the relevant .secret keys file and run
# this cell again.
# salt.secret is a non-sensitive file used both to derive the encryption key and to
# decrypt. If it is lost, the encrypted files cannot be recovered and the API keys
# have to be entered again.

# Prompt without echoing the password to the notebook output.
password = getpass("Enter password to save/load API keys: ")

kaggle_api_key = fernet_key_encryption(password, 'Kaggle')
# Currently disabled:
#td_ameritrade_api_key = fernet_key_encryption(password, 'TD_Ameritrade')

# API key for data.nasdaq.com
data_nasdaq_key = fernet_key_encryption(password, 'Nasdaq')

# Drop the plaintext password as soon as the keys are loaded.
del password
# The trailing semicolon stops the notebook from echoing gc.collect()'s return value.
gc.collect();


In [None]:
# Download all Kaggle datasets used by the notebook.
#
# The decrypted Kaggle entry is split on a single space into username and key, and
# both are exported as KAGGLE_USERNAME / KAGGLE_KEY for the duration of the
# downloads. The whole download sequence runs inside try/finally so the credentials
# are blanked again even when one of the downloads raises.
username, password = kaggle_api_key.split(' ')  # `password` holds the Kaggle API key here

try:
    os.environ['KAGGLE_USERNAME'] = username
    os.environ['KAGGLE_KEY'] = password
    os.environ['KAGGLE_CONFIG_DIR'] = os.getcwd()

    # NOTE(review): every other dataset is moved into a capitalised folder matching
    # dest_name, but this one goes to 'macro/' while a later cell reads
    # 'data/Macro/...'. If destinations are relative to data/ this breaks on
    # case-sensitive filesystems -- confirm against utils.download_datasets.
    download_datasets(
        'https://www.kaggle.com/datasets/sarthmirashi07/us-macroeconomic-data',
        kaggle_api_key,
        files_to_move={'US_macroeconomics.csv': 'macro/US_macroeconomics.csv'},
        delete=True,
        dest_name='Macro')

    download_datasets(
        'https://www.kaggle.com/datasets/footballjoe789/us-stock-dataset',
        kaggle_api_key,
        files_to_move={'us-stock-dataset/Stock_List.csv': 'Stock_List.csv', 'us-stock-dataset/Data/Stocks': 'Stocks'},
        delete=True,
        dest_name='Stocks')

    download_datasets(
        'https://www.kaggle.com/datasets/mathurinache/goemotions',
        kaggle_api_key,
        files_to_move={'goemotions.csv': 'Emotions/goemotions.csv'},
        delete=True,
        dest_name='Emotions')

    download_datasets(
        'https://www.kaggle.com/datasets/parulpandey/emotion-dataset',
        kaggle_api_key,
        files_to_move={'training.csv': 'Emotions/training.csv', 'validation.csv': 'Emotions/validation.csv', 'test.csv': 'Emotions/test.csv'},
        delete=True,
        dest_name='Emotions')

    download_datasets(
        'https://www.kaggle.com/datasets/kosweet/cleaned-emotion-extraction-dataset-from-twitter',
        kaggle_api_key,
        files_to_move={'dataset(clean).csv': 'Emotions/dataset(clean).csv'},
        delete=True,
        dest_name='Emotions')

    download_datasets(
        'https://www.kaggle.com/datasets/miguelaenlle/massive-stock-news-analysis-db-for-nlpbacktests',
        kaggle_api_key,
        files_to_move={'raw_partner_headlines.csv': 'Text/raw_partner_headlines.csv', 'raw_analyst_ratings.csv': 'Text/raw_analyst_ratings.csv', 'analyst_ratings_processed.csv': 'Text/analyst_ratings_processed.csv'},
        delete=True,
        dest_name='Text')
finally:
    # Clear the username and key from the environment variables, whether or not the
    # downloads succeeded. They are blanked (not removed) exactly as before, and
    # KAGGLE_CONFIG_DIR is left set as before.
    os.environ['KAGGLE_USERNAME'] = ""
    os.environ['KAGGLE_KEY'] = ""
    # Do not leave the plaintext credentials bound in the notebook namespace.
    del username, password

In [None]:
# Gather the company info for all the ticker symbols under data/Stocks/ and build the
# relevant search terms for each company (per the original author's note; `aquire_terms`
# is the alias of utils.aquire_stock_search_terms, whose implementation is not visible here).
# If the stocks dataset is updated on Kaggle and its symbols have changed, the cached
# company list must be deleted and this cell run again. The original note names that
# file "compank_list.pkl" -- possibly a typo for "company_list.pkl"; check the actual
# file name before deleting. Pulling the new data directly would be more efficient.
search_terms = aquire_terms('data/Stocks/')
# Bare expression: displays the result's `.data` attribute as the cell output.
search_terms.data

In [None]:
# Gather historical macroeconomic data and align it with one stock's price history.

# Disabled: as written this would rebind `macro_data`, shadowing the helper imported
# from utils under that alias.
#macro_data = macro_data('data/Macro/')

df1 = load_file('data/Macro/US_macroeconomics.parquet')
# Forward slashes keep this path portable and consistent with the other paths in the
# notebook. The previous backslash form relied on the invalid escape sequences "\S"
# and "\A" in a non-raw string and only resolved on Windows.
df2 = load_file('data/Stocks/AAPL.parquet')

# Disabled experiment: divide every column except CPI, date, Unemp_rate and
# mortgage_rate by CPI. NOTE(review): it refers to `df`, which is not defined in this
# cell -- it would need to be pointed at the macro dataframe before being re-enabled.
#for col in [x for x in df.columns if x not in ["CPI", "date", "Unemp_rate", "mortgage_rate"]]:
    #df[col] = df[col].div(df['CPI'])

# extend_trend_to_today should only be used when the macro data is recent.
df1, df2 = intersect_df(df1, df2, interpolate_to_days=True, extend_trend_to_today=False)
# Bare expression: displays the aligned macro dataframe as the cell output.
df1



In [None]:
# Build the merged emotion dataframe.
# TODO: the merged result still needs to be checked for accuracy/cleanness.
# NOTE(review): the list [0, 1, 2, 3, 4] presumably selects which emotion datasets
# are parsed -- confirm against utils.parse_emotion_dataframes.
emotion_df = parse_emotion_dataframes([0, 1, 2, 3, 4])
# Bare expression: displays the dataframe as the cell output.
emotion_df

In [None]:
# Tally the rows of emotion_df by every column except the raw 'text' column
# (assuming emotion_df is a pandas DataFrame, this counts each distinct
# combination of the remaining columns).
emotion_df.value_counts(
    [column for column in emotion_df.columns if column != 'text']
)