        Prototyping notebook for predicting stock volatility, prices, etc., using extra data from web trends, news, etc.

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
pd.set_option('io.parquet.engine', 'pyarrow')
import numpy as np
import os
import gc, os
from getpass import getpass
from utils import *
from models import *
import gc
import shutil
import zipfile
import torch
import arrow
import matplotlib.pyplot as plt


# The device is intentionally hard-coded to CUDA so that an incorrectly
# installed CUDA is noticed instead of silently falling back to the CPU.
# The models are not practical to train on CPU.
cuda = torch.device("cuda")
cpu = torch.device("cpu")

# Gates the API-key / dataset-download cell below; leave it False if you
# don't need the API downloads.
use_api = False

# Seed every random number generator used here so runs are reproducible.
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
rng = np.random.default_rng(seed=seed)

# Model identifier handed to emotion_classifier_load() and classify_text().
# Alternative: cardiffnlp/twitter-roberta-base-emotion
MODEL = 'cardiffnlp/twitter-xlm-roberta-base-sentiment'
print('Available GPU memory:', available_mem(), 'GB')

In [None]:
# How the API-key flow works:
# 1. Enter a password to save/load the API keys. It is used to create, and
#    subsequently unlock, the encrypted API keys.
# 2. The first time this runs, you will be asked to enter your username, a
#    single space, and your Kaggle API key (don't use quotes).
# 3. Any other API keys added in the future follow the same format.
#
# The first run may take a while, depending on your internet speed.
# After the first run you only need to enter the password to load the API keys.
# NOTE: Don't delete or move salt.secret, as all tokens would become undecryptable.
# If you made a mistake or need to retry, you may 1) delete salt.secret to reset
# everything, or 2) delete the specific .secret key to re-enter only that info.
if use_api:
    # Ask for the password used to encrypt/decrypt the stored API keys.
    password = getpass("Enter password to save/load API keys: ")
    if password:
        kaggle_api_key = fernet_key_encryption(password, 'Kaggle')
        #td_ameritrade_api_key = fernet_key_encryption(password, 'TD_Ameritrade')
        #data_nasdaq_key = fernet_key_encryption(password, 'Nasdaq')
        # Drop the plaintext password from the namespace once the key is available.
        del password
        gc.collect()
        get_datasets(kaggle_api_key)
    elif len(glob('data/*')) < 4:
        # No password and fewer than 4 entries under data/: let the user decide
        # whether to carry on without downloading anything.
        cont = input("Password is empty. Press n to cancel or any other key to continue: ")
        if cont.strip().lower() == 'n':
            # Raised explicitly (instead of `assert False`) so the abort still
            # happens when Python runs with assertions disabled (-O).
            raise AssertionError("Exiting program. Please enter a password to continue.")

In [None]:
# Gathers a dataframe of useful terms and info related to every stock in
# data\Stock_List.parquet. This may take up to 40 minutes to run the first
# time (the web scraping is slow due to requests being throttled).
# Stores results in company_list.pkl.
search_terms = aquire_stock_search_terms('data/Stock/')

In [None]:
emotion_classifier = emotion_classifier_load(MODEL)

# Classify the analyst-ratings and partner-headlines datasets. The result is
# cached as parquet, so the classification only runs when the cache is missing.
emotion_cache_path = 'data/Text/text_emotion_29.parquet'
if not os.path.exists(emotion_cache_path):
    analyst_ratings = (
        load_file('data/Text/analyst_ratings_processed.parquet')
        .drop(columns=['unnamed: 0'])
        .rename(columns={'title': 'text'})
    )
    partner_headlines = (
        load_file('data/Text/raw_partner_headlines.parquet')
        .drop(columns=['unnamed: 0', 'url', 'publisher'])
        .rename(columns={'headline': 'text'})
    )
    # Stack both sources, then reset the index so the classifier output
    # (positionally indexed) lines up row-for-row in the concat below.
    text_df = (
        pd.concat([analyst_ratings, partner_headlines], axis=0)
        .drop_duplicates()
        .reset_index(drop=True)
    )
    emotions = classify_text(emotion_classifier, text_df.text.tolist(), MODELNAME=MODEL, bs=100, device=cuda)
    text_df = pd.concat([text_df, pd.DataFrame({'emotion': emotions})], axis=1)
    save_file(text_df, emotion_cache_path)
else:
    text_df = load_file(emotion_cache_path)

# A bare expression inside the if/else branches is never rendered by Jupyter;
# only the last top-level expression of a cell is displayed.
text_df.head()


In [None]:
#update_stock_data(ticker=None, save = False)

In [None]:
# KR stock
def _to_iso_day(value):
    """Format the first 10 characters of str(value) as YYYY-MM-DD.

    Returns None when the string form is shorter than 10 characters.
    """
    as_text = str(value)
    if len(as_text) <= 9:
        return None
    return arrow.get(as_text[:10]).format('YYYY-MM-DD')


stock = load_file('data/Stock/KR.parquet')
text_df, stock = intersect_df(text_df, stock)

iso_days = text_df['date'].apply(_to_iso_day).dropna()
# Column assignment aligns on the index, so the rows removed by .dropna() above
# come back as missing values; the dropna(subset=['date']) at the end is what
# actually removes them from the frame.
text_df['date'] = iso_days

# NOTE(review): intersect_df is applied both before and after the date
# normalisation; the order is kept as-is because its semantics are not
# visible from this notebook.
text_df, stock = intersect_df(text_df, stock)
text_df = text_df.dropna(subset=['date'])

In [None]:
# group by date
def daily_stats(df):
    """Summarise per-row emotion scores into per-day statistics.

    The ``emotion`` column is expanded into one column per score, and every
    remaining column is aggregated per ``date`` into its standard deviation,
    mean and count. Dates with fewer than two non-null observations get 0
    for all three statistics.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``text``, ``stock``, ``date`` and ``emotion``
        columns. Each ``emotion`` entry must be expandable into columns by
        ``pd.DataFrame(df.emotion.tolist())`` (e.g. a list of scores).
        The input frame is not modified.

    Returns
    -------
    pd.DataFrame
        One row per date: a ``date`` column followed by ``<col>_std``,
        ``<col>_mean`` and ``<col>_count`` for every expanded score column.
    """
    # Named aggregators: pandas labels the result columns with each function's
    # __name__, so no post-hoc renaming of '<lambda_N>' labels is needed.
    def std(x):
        return x.std() if x.count() > 1 else 0

    def mean(x):
        return x.mean() if x.count() > 1 else 0

    def count(x):
        return x.count() if x.count() > 1 else 0

    # Expand the emotion column into one column per score, aligned on the
    # original index, and drop everything that should not be aggregated.
    scores = pd.DataFrame(df.emotion.tolist(), index=df.index)
    per_row = pd.concat([df.drop(['text', 'stock'], axis=1), scores], axis=1)
    per_row = per_row.drop(columns=['emotion'])

    daily = per_row.groupby('date').agg([std, mean, count])
    # Flatten the (column, statistic) MultiIndex into '<column>_<statistic>'.
    daily.columns = ['_'.join(str(part) for part in col) for col in daily.columns.values]
    return daily.reset_index()

# Per-date std/mean/count of the emotion scores in text_df.
text_stats = daily_stats(text_df)


In [None]:
# Display the KR stock frame as returned by the intersect_df step above.
stock

In [None]:
# Build the search terms for KR from the first four fields of its row in
# search_terms.data.
kr_rows = search_terms.data[search_terms.data['ticker'] == 'KR'].values.tolist()
if not kr_rows:
    raise ValueError("Ticker 'KR' was not found in search_terms.data")

# Keep only non-empty strings: missing fields (None, NaN) have no .lower().
search = [term.lower() for term in kr_rows[0][0:4] if isinstance(term, str) and term != '']
if not search:
    raise ValueError("No usable search terms found for ticker 'KR'")

# The first term is searched as a hashtag.
search[0] = '#' + search[0]
search

In [None]:
# Scrape tweets matching the `search` terms within the given date window.
# NOTE(review): judging by the variable name, the return value is presumably
# only a fallback dataframe for when something goes wrong — confirm against
# scrape_tweets in utils.
df_if_error = scrape_tweets(since='2018-11-01', until='2023-04-19', max_tweets=20, update_twitter_data=True, co_list=search)

In [None]:
# Load the saved Twitter data for the KR search terms. Forward slashes are
# used, like every other path in this notebook, so the path also resolves on
# non-Windows systems (backslashes are only path separators on Windows).
load_file('data/Twitter/twitter_data___#kr---the kroger co.---william rodney mcmullen---grocery stores.parquet')