        Prototyping notebook for predicting stock volatility, prices, etc. using extra data from web trends, news, etc.

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
pd.set_option('io.parquet.engine', 'pyarrow')
import numpy as np
import os
import gc, os
from getpass import getpass
from utils import *
from models import *
import gc
import shutil
import zipfile
import torch
import arrow
import matplotlib.pyplot as plt


# Devices are hardcoded (no automatic CPU fallback) so that a broken CUDA install is noticed early.
# The models will not be practical to train on CPU.
cuda = torch.device("cuda")
cpu = torch.device("cpu")
# torch.device("cuda") only builds a device handle and succeeds even when CUDA is unusable,
# so check availability explicitly. print() is used because warnings are suppressed at the top of this cell.
if not torch.cuda.is_available():
    print('WARNING: torch.cuda.is_available() is False - check the CUDA / PyTorch installation.')
# if you don't need the API downloads, you can set this to False
use_api = False

# Seed NumPy (legacy global state and a dedicated Generator) and PyTorch for reproducibility.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
rng = np.random.default_rng(seed=seed)
# Model identifier passed to emotion_classifier_load.
# Alternative noted by the author: cardiffnlp/twitter-roberta-base-emotion
MODEL = 'cardiffnlp/twitter-xlm-roberta-base-sentiment'
print('Available GPU memory:', available_mem(), 'GB')

In [None]:
# 1: Enter password to save/load API keys. This will be used to create/subsequently unlock the encrypted API keys.
# 2: if it is the first time running this, this will ask you to enter your username + a single space + your Kaggle API key. (don't use quotes)
# 3: any other api keys in future will follow the same format

# May take a while to load the first time, depending on your internet speed.
# After this first run, you will only need to enter the password to load the api keys 
# NOTE: Don't delete or move salt.secret as all tokens will become undecryptable. 
# If you made a mistake or need to retry, you may 1) delete salt.secret to reset everything. 2) delete the specific .secret key to re-enter only that info.
if use_api:
    # Ask for the password used to save/load the encrypted API keys.
    password = getpass("Enter password to save/load API keys: ")
    if len(password) > 0:
        kaggle_api_key = fernet_key_encryption(password, 'Kaggle')
        #td_ameritrade_api_key = fernet_key_encryption(password, 'TD_Ameritrade')
        #data_nasdaq_key = fernet_key_encryption(password, 'Nasdaq');
        # Drop the plaintext password as soon as the key has been obtained.
        del password
        gc.collect()
        get_datasets(kaggle_api_key)
    elif len(glob('data/*')) < 4:
        # Empty password and fewer than 4 entries under data/: ask before continuing.
        # NOTE(review): presumably "< 4" means the datasets have not been downloaded yet - confirm.
        cont = input("Password is empty. Press n to cancel or any other key to continue: ")
        if cont == 'n':
            # Raise explicitly instead of `assert False`: assert statements are removed
            # when Python runs with -O, which would let execution continue silently.
            raise AssertionError("Exiting program. Please enter a password to continue.")

In [None]:
# Gathers a dataframe of useful terms and info related to every stock in data/Stock_List.parquet.
# This may take up to 40 minutes to run the first time (the web scraping is slow because
# requests are throttled). Stores results in company_list.pkl.
# NOTE(review): the call passes 'data/Stock/' while this comment names Stock_List.parquet -
# confirm which path the helper actually reads.
search_terms = aquire_stock_search_terms('data/Stock/')

In [None]:
# Call update_stock_data with save=True for each ticker used in this notebook, in order.
for _ticker in ('KR', 'AAPL', 'TSLA'):
    update_stock_data(ticker=_ticker, save=True)

In [None]:
# Set to True to run the tweet scraper; it is disabled by default.
RUN_TWEET_SCRAPER = False

if RUN_TWEET_SCRAPER:
    # TSLA is intentionally not scraped here; add 'TSLA' to the tuple to include it.
    for _ticker in ('KR', 'AAPL'):
        # First four values of the ticker's row in search_terms.data are used as search terms.
        search = search_terms.data[search_terms.data['ticker'] == _ticker].values.tolist()[0][0:4]
        # Lower-case the terms and skip missing / empty entries.
        search = [x.lower() for x in search if x is not None and x != '']
        # The first term is searched as a hashtag.
        search[0] = '#' + search[0]
        df_if_error = scrape_tweets(since='2018-01-01', until='2023-04-19', max_tweets=20, update_twitter_data=True, co_list=search)


# how to recover the data if the crawler crashes (example for apple inc.)
#tweets_df = pd.DataFrame(df_if_error, columns=['date', 'text', 'username', 'searchterm'])
#tweets_df = tweets_df.drop_duplicates(inplace=False, subset=['date', 'text', 'username', 'searchterm']).reset_index(drop=True, inplace=False).dropna(inplace=False)
#save_file(tweets_df, r'data\Twitter\twitter_data___#aapl---apple inc.---timothy cook---consumer electronics.parquet')

In [None]:
emotion_classifier = emotion_classifier_load(MODEL)
# The same parquet path is used as both the load and the save location.
text_df = classify_twitter_text(
    save_path='data/Text/text_emotion_29.parquet',
    model=emotion_classifier,
    load_path='data/Text/text_emotion_29.parquet',
)
# Remove the 'stock' and 'text' columns, discard rows with missing values,
# then keep only rows whose date string is longer than 7 characters.
text_df = (
    text_df
    .drop(columns=['stock', 'text'])
    .dropna()
    .loc[lambda frame: frame['date'].str.len() > 7]
)
text_df


In [None]:
# Map the tag found between '#' and '---' in each file name to that emotion parquet path.
emotion_files = glob('data/Twitter/twitter_emotion___#*.parquet')
index = {path.split("#")[1].split("---")[0]: path for path in emotion_files}

# The raw-tweet file path is derived by swapping 'emotion' for 'data' in the emotion path.
# NOTE(review): str.replace swaps every occurrence of 'emotion' in the path - confirm no
# search term in a file name contains that word.
kr_emotion_path, aapl_emotion_path = index['kr'], index['aapl']
kr_data_path = kr_emotion_path.replace('emotion', 'data')
aapl_data_path = aapl_emotion_path.replace('emotion', 'data')

kr_text = load_file(kr_data_path)
aapl_text = load_file(aapl_data_path)

kr_df = classify_twitter_text(save_path=kr_emotion_path, model=emotion_classifier, load_path=kr_data_path)
aapl_df = classify_twitter_text(save_path=aapl_emotion_path, model=emotion_classifier, load_path=aapl_data_path)

In [None]:
def _normalize_tweet_dates(tweets):
    """Return a new frame with 'date' as 'YYYY-MM-DD' strings and undated rows removed.

    Parameters
    ----------
    tweets : pandas.DataFrame
        Frame with a 'date' column.

    Returns
    -------
    pandas.DataFrame
        Copy of ``tweets`` whose 'date' values are the first 10 characters of the
        original value re-formatted by arrow. Values whose string form is 7 characters
        or shorter are treated as invalid and their rows are dropped.
    """
    dates = tweets['date'].apply(lambda x: arrow.get(str(x)[:10]).format('YYYY-MM-DD') if len(str(x)) > 7 else None)
    # Drop on the frame, not on the Series: assigning a shortened Series back to the
    # column re-aligns on the index and silently restores the dropped rows as NaN.
    return tweets.assign(date=dates).dropna(subset=['date'])


# kr stock
kr_df = _normalize_tweet_dates(kr_df)
kr_stats = daily_stats(kr_df)

stock_kr = load_file('data/Stock/KR.parquet')
stock_kr, text_df_kr = intersect_df(stock_kr, text_df)
text_stats_kr = daily_stats(text_df_kr)
# NOTE(review): text_df_kr is overwritten with its own daily stats (same call as above) - confirm this is intended.
text_df_kr = daily_stats(text_df_kr)


# aapl stock
aapl_df = _normalize_tweet_dates(aapl_df)
aapl_stats = daily_stats(aapl_df)

stock_aapl = load_file('data/Stock/AAPL.parquet')
stock_aapl, text_df_appl = intersect_df(stock_aapl, text_df)
text_stats_appl = daily_stats(text_df_appl)
# NOTE(review): same overwrite as for KR; the 'appl' spelling is kept so later references still resolve.
text_df_appl = daily_stats(text_df_appl)

twitter to emotion dataframe

feature list: 

              emotions        [std, mean, count],                                 --> no need to normalize
              
              stock           [open, high, low, close, volume],                   --> normalize percent change

              portfolio       [current money, stocks owned, stocks owned value],  --> keep track of portfolio

              date            [day of week, holiday, day of month, month]         --> engineer

              economic data   [NA]                                                --> normalize 

output: 

  volatility:

  best action

  predicted price


In [None]:
def merge_data_timeline(company_emotion_stats, stock, generic_news_stats, fillzero=True, impute=True):
    """
    Merges the company emotion stats, the generic news stats and the stock data into a single dataframe

    Parameters
    ----------
    company_emotion_stats : pandas.DataFrame
        The dataframe containing the company emotion stats (needs a 'date' column)
    stock : pandas.DataFrame
        The stock dataframe; expected to provide 'date', 'open', 'high', 'low',
        'close' and 'volume' columns
    generic_news_stats : pandas.DataFrame
        The dataframe containing the generic news stats (needs a 'date' column)
    fillzero : bool, optional
        Whether to fill the NaN values with 0, by default True
    impute : bool, optional
        Whether to linearly interpolate the NaN values (missing days), by default True.
        This runs after the zero fill, so it only has an effect when ``fillzero`` is False.

    Returns
    -------
    pandas.DataFrame
        The merged dataframe sorted by date, with 'day_of_week', 'day_of_month', 'month'
        and 'mon_or_fri' columns added and open/high/low/close/volume replaced by their
        row-to-row percent change. With the defaults, missing values are zeros.
    """
    def _to_iso_date(value):
        # First 10 characters re-formatted as YYYY-MM-DD; strings of 7 chars or fewer become None.
        text = str(value)
        return arrow.get(text[:10]).format('YYYY-MM-DD') if len(text) > 7 else None

    # Build new frames instead of assigning into the arguments, so the caller's
    # DataFrames keep their original 'date' values.
    company_emotion_stats = company_emotion_stats.assign(date=company_emotion_stats['date'].apply(_to_iso_date))
    generic_news_stats = generic_news_stats.assign(date=generic_news_stats['date'].apply(_to_iso_date))

    df = pd.merge(company_emotion_stats, generic_news_stats, how='outer', on='date')
    df = df.sort_values(by='date').dropna(subset=['date'])

    df, stock = intersect_df(df, stock)
    df = pd.merge(df, stock, how='outer', on='date')
    df = df.sort_values(by='date').reset_index(drop=True)

    # Calendar features; each date is parsed once and reused.
    parsed = df['date'].apply(arrow.get)
    df['day_of_week'] = parsed.apply(lambda d: d.weekday())
    df['day_of_month'] = parsed.apply(lambda d: d.day)
    df['month'] = parsed.apply(lambda d: d.month)
    df['mon_or_fri'] = df['day_of_week'].apply(lambda x: 1 if x in (0, 4) else 0)

    # Replace the price/volume columns by their percent change from the previous row.
    for col in ('open', 'high', 'low', 'volume', 'close'):
        df[col] = df[col].pct_change()

    if fillzero:
        df = df.fillna(0)
    if impute:
        df = df.interpolate(method='linear')

    return df

# Build the merged KR table from the KR tweet stats, the KR stock data and the daily stats
# of the generic text, then display it.
merged_df = merge_data_timeline(kr_stats, stock_kr, text_stats_kr)
merged_df



In [None]:
y_cols = ['open', 'high', 'low', 'close', 'volume']
# Features keep every column, targets included: a row holds the current day's data and
# the goal is to predict the next day's values.
x_cols = merged_df.columns

# Align by one row: X loses its last row and Y its first, so each X row is paired with
# the target values of the following row (tomorrow's data).
X = merged_df[x_cols].drop(merged_df.index[-1]).values
Y = merged_df[y_cols].drop(merged_df.index[0]).values

# Chronological split by slicing: the first 90% of rows train, the remainder tests.
train_size = int(len(X) * 0.90)
test_size = len(X) - train_size

X_train, X_test = X[:train_size], X[train_size:]
Y_train, Y_test = Y[:train_size], Y[train_size:]

# Column 0 is split off as the date column; the rest is cast to float32.
# assumes 'date' is the first column of merged_df - TODO confirm
dates_train, dates_test = X_train[:, 0], X_test[:, 0]
X_train = X_train[:, 1:].astype('float32')
X_test = X_test[:, 1:].astype('float32')

# Only the training arrays are converted to tensors here; X_test / Y_test stay NumPy arrays.
X_train = torch.from_numpy(X_train).float()
Y_train = torch.from_numpy(Y_train).float()

Y_train.shape, X_train.shape, Y_test.shape, X_test.shape

In [None]:
from transformers import SwinConfig, SwinModel
# Swin configuration built with default arguments; SwinModel is imported but not used in this cell.
configuration = SwinConfig()


In [None]:
class portfolio:
    """Holds the available funds and the stocks currently owned.

    The loss-related methods are unimplemented placeholders that return None.
    """

    def __init__(self, funds):
        """Store ``funds`` and start with an empty ``stocks_owned`` dict."""
        # NOTE(review): key/value layout of stocks_owned is not defined yet - confirm when implemented.
        self.stocks_owned = {}
        self.funds = funds

    def de_norm_loss(self):
        """Placeholder: meant to undo the operations performed to normalize the data."""

    def loss_action(self, y_action, y_reality):
        """Placeholder: meant to calculate how well the portfolio is doing."""

    def loss_innacuracy(self, y_pred, y_reality):
        """Placeholder: not implemented; takes a prediction and the realised values."""