        Prototyping notebook for predicting stock volatility, prices, etc., using extra data from web trends, news, etc. 

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
pd.set_option('io.parquet.engine', 'pyarrow')
import numpy as np
import os
import gc, os
from getpass import getpass
from utils import *
from models import *
import gc
import shutil
import zipfile
import torch
import arrow
import matplotlib.pyplot as plt

# The device is hardcoded to the GPU on purpose, so that an incorrectly installed CUDA is noticed.
# The models will not be practical to train on CPU.
# NOTE(review): constructing the device object alone may not touch the GPU; presumably the
# available_mem() call at the end of this cell is the first thing that does — confirm.
cuda = torch.device("cuda")
cpu = torch.device("cpu")

# If you don't need the API downloads, you can set this to False.
use_api = True

# Seed every random number generator used in this notebook so runs are repeatable.
# (A stray no-op literal `38` that used to sit here was removed.)
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
rng = np.random.default_rng(seed=seed)

print('Available GPU memory:', available_mem(), 'GB')

In [None]:
# How the API-key handling works:
#   1. Enter a password to save/load the API keys. It is used to create, and afterwards unlock,
#      the encrypted API keys.
#   2. The first time this runs, you are asked for your username + a single space + your
#      Kaggle API key (don't use quotes).
#   3. Any other API keys added in the future follow the same format.
#
# The first run may take a while to load, depending on your internet speed.
# After that first run, only the password is needed to load the API keys.
# NOTE: Don't delete or move salt.secret, as all tokens will become undecryptable.
# If you made a mistake or need to retry, you may either
#   1) delete salt.secret to reset everything, or
#   2) delete the specific .secret key to re-enter only that info.
if use_api:
    # Prompt (without echo) for the password protecting the stored API keys.
    password = getpass("Enter password to save/load API keys: ")
    kaggle_api_key = fernet_key_encryption(password, 'Kaggle')
    # td_ameritrade_api_key = fernet_key_encryption(password, 'TD_Ameritrade')
    # data_nasdaq_key = fernet_key_encryption(password, 'Nasdaq')

    # Drop the password reference as soon as the keys have been obtained.
    del password
    gc.collect()

    get_datasets(kaggle_api_key)

In [None]:
# Gathers a dataframe of useful terms and info related to every stock in data\Stock_List.parquet.
# This may take up to 40 minutes to run the first time (the web scraping is slow because
# requests are throttled). Stores results in company_list.pkl.
# NOTE(review): the call below is given the directory 'data/Stock/' rather than the parquet
# file named above — confirm which description is current.
search_terms = aquire_stock_search_terms('data/Stock/')

In [None]:
# Prepares the data for the sentiment analysis model.
# Timings noted by the original author: ~2m 42s first run, ~25s after
# (if augment=True: 24 hours first run, 2m 45s after).
#
# The later cells need:
#   * `classes_len`                               - always (the network is rebuilt before weights are loaded),
#   * `train_triplets`, `test_triplets`, `classes` - whenever models/siamese_model.pt is missing,
#   * `ds_train`, `ds_test`                        - whenever models/emotion_classifier.pt is missing.
# Previously everything was skipped as soon as the classifier checkpoint existed, which left
# `classes_len` (and possibly the triplets) undefined on a fresh kernel. The data is now prepared
# whenever either checkpoint is missing, and `classes_len` is cached in a small sidecar file so the
# expensive preparation can still be skipped once both models are trained.
classes_len_path = 'models/classes_len.txt'
checkpoints_ready = (
    os.path.exists('models/emotion_classifier.pt')
    and os.path.exists('models/siamese_model.pt')
)

if checkpoints_ready and os.path.exists(classes_len_path):
    # Both models are trained: only the number of classes is required downstream.
    with open(classes_len_path) as classes_len_file:
        classes_len = int(classes_len_file.read().strip())
else:
    classes, train_triplets, test_triplets, x_train, y_train, x_test, y_test = prep_triplet_data(
        MODEL="cardiffnlp/twitter-xlm-roberta-base-sentiment",
        augment=True,
        aug_n=400000,
    )
    ds_train, ds_test = prep_tensor_ds(x_train, y_train, x_test, y_test)
    classes_len = len(classes)

    # Remember the class count so future runs can rebuild the network without the data.
    os.makedirs('models', exist_ok=True)
    with open(classes_len_path, 'w') as classes_len_file:
        classes_len_file.write(str(classes_len))

In [None]:
"""
Ignore the error if a model weight of size [29, 768] is loaded without weights. This is intentional since we will train the model head as a final step not using contrastive loss.

At this point, this model should train to a validation loss of 0. Since we are using contrastive loss (with a margin), the model will stop changing once classes are considered different enough (a parameter that can be set via the `margin` argument of torch.nn.TripletMarginWithDistanceLoss).

This step is designed to force the model to encode every class in a manner that is equally easily separable in the final cross-entropy loss (with a set margin of similarity, which can be increased if the classifier struggles).
"""

# Build the network once; it is either restored from disk or pre-trained below.
siamese_network_model = siamese_network(classes_len).to(cuda)

if os.path.exists('models/siamese_model.pt'):
    # A pre-trained checkpoint already exists: restore it instead of training again.
    siamese_network_model.load_state_dict(torch.load('models/siamese_model.pt'))
else:
    siamese_model, history = pre_train_using_siamese(
        train_triplets,
        test_triplets,
        siamese_network_model,
        epochs=10,
        classes=classes,
    )
    # NOTE(review): the weights saved here come from the returned `siamese_model`, while the next
    # cell reads `siamese_network_model`. Presumably both names refer to the same trained object —
    # confirm against pre_train_using_siamese.
    os.makedirs('models', exist_ok=True)
    torch.save(siamese_model.state_dict(), 'models/siamese_model.pt')

    # Draw the train and test loss curves on a single figure.
    fig = plt.figure(figsize=(10, 5))
    plt.plot(history['train'], label='pretraining train loss')
    plt.plot(history['test'], label='pretraining test loss')
    plt.legend()
    plt.show()

    # Keep the loss figure and the raw history under results/.
    os.makedirs('results', exist_ok=True)
    fig.savefig('results/pretrain_emotion_history.jpg')
    save_file(history, 'results/pretrain_emotion_history.parquet')
    plt.close(fig)


In [None]:
"""In this step, we finally train the last two weight layers to convert the siamese network into a classifier. Later, we can consider unfreezing a few of the earlier layers to improve performance with a lower learning rate."""

# Wrap the siamese network as a single-input classifier and move it to the GPU; the wrapper
# is needed whether the classifier is trained now or restored from disk.
model = classify_single_input(siamese_network_model).to(cuda)

if os.path.exists('models/emotion_classifier.pt'):
    # A trained classifier checkpoint already exists: restore it instead of training again.
    model.load_state_dict(torch.load('models/emotion_classifier.pt'))
else:
    model, history = train_emotion_classifier(model, ds_train, ds_test, epochs=2)

    os.makedirs('models', exist_ok=True)
    torch.save(model.state_dict(), 'models/emotion_classifier.pt')

    # Draw the train and test loss curves on a single figure.
    fig = plt.figure(figsize=(10, 5))
    plt.plot(history['train'], label='train loss')
    plt.plot(history['test'], label='test loss')
    plt.legend()
    plt.show()

    # Keep the loss figure and the raw history under results/.
    os.makedirs('results', exist_ok=True)
    fig.savefig('results/emotion_history.jpg')
    save_file(history, 'results/emotion_history.parquet')
    plt.close(fig)

In [None]:

############################# Experiments with web scraping. Unfinished, so ignore if it does not work #############################
from newspaper import Article
import newspaper
import scrapy
import stweet as st
import requests
from stweet.search_runner import SearchRunContext
import tor_python_easy
import io
#arrow time
# import urllib.request as urllib2
import arrow
import json
import os
# import yaml
# INSTALL DOCKER COMPOSE

# if not os.path.exists('docker-compose.yml'):
#     url = 'https://raw.githubusercontent.com/markowanga/tor-python-easy/main/docker-compose.yml'
#     filename = 'docker-compose.yml'
#     urllib2.urlretrieve(url, filename);
#     with open('docker-compose.yml', 'r') as file:
#         config = yaml.safe_load(file);
#     config['services']['torproxy']['environment'][0]=f'PASSWORD={os.urandom(32).hex()}';
#     #with open('docker-compose.yml', 'w') as file:
#        # yaml.dump(config, file);

# with open('docker-compose.yml', 'r') as file:
#         config = yaml.safe_load(file);
#         tor_pass = config['services']['torproxy']['environment'][0].split('=')[1]; 

# from tor_python_easy.tor_control_port_client import TorControlPortClient
# from tor_python_easy.tor_socks_get_ip_client import TorSocksGetIpClient
# proxy_config = {'http': 'socks5://localhost:9050','https': 'socks5://localhost:9050',}
# ip_client = TorSocksGetIpClient(proxy_config)
# tor_control_port_client = TorControlPortClient(control_address='localhost', control_port=9051, control_password=tor_pass)

def url_fetch(url):
    """Download and parse a single article with newspaper's ``Article``.

    Args:
        url: Address of the article to fetch.

    Returns:
        A 6-tuple ``(text, title, publish_date, authors, summary, keywords)``
        taken from the parsed article.

    Note:
        ``summary`` and ``keywords`` are only filled in by ``Article.nlp()``;
        without that step they are returned empty. NOTE(review): ``nlp()``
        presumably needs the NLTK tokenizer data to be installed — confirm
        in the target environment.
    """
    article = Article(url)
    article.download()
    article.parse()
    # Run the NLP step so that summary/keywords are actually populated.
    article.nlp()
    return (
        article.text,
        article.title,
        article.publish_date,
        article.authors,
        article.summary,
        article.keywords,
    )

def crawl_url(url):
    """Build a newspaper source for ``url`` and return its ``articles``.

    ``memoize_articles=False`` is passed to ``newspaper.build`` on every call.
    """
    return newspaper.build(url, memoize_articles=False).articles


In [None]:
# Search window and query terms for the experimental tweet search.
since = arrow.get('2021-01-01').datetime
until = arrow.get('2021-02-01').datetime
exact_words = all_words = None
any_word = '#apple'

# Optional Tor-proxied web client (disabled; relies on `tor_pass` from the commented-out setup):
# web_client = st.DefaultTwitterWebClientProvider.get_web_client_preconfigured_for_tor_proxy(
#     socks_proxy_url='socks5://localhost:9050',
#     control_host='localhost',
#     control_password=tor_pass,
#     control_port=9051,
# )

# collect1 is handed to the runner as the tweet output, collect2 as the user output.
collect1 = st.CollectorRawOutput()
collect2 = st.CollectorRawOutput()
context = SearchRunContext()

tweets_task = st.SearchTweetsTask(
    since=since,
    until=until,
    any_word=any_word,
    exact_words=exact_words,
    all_words=all_words,
    tweets_limit=50,
    replies_filter=True,
)
runner = st.TweetSearchRunner(
    search_tweets_task=tweets_task,
    tweet_raw_data_outputs=[collect1],
    user_raw_data_outputs=[collect2],
    web_client=None,
    search_run_context=context,
)

runner.run()


In [None]:
# Display the runner's search run context after the run, for manual inspection.
runner.search_run_context

In [None]:
# Raw items gathered by the two collectors (collect1: tweet output, collect2: user output).
l, l2 = collect1.get_raw_list(), collect2.get_raw_list()

# Show how many raw items each collector received.
len(l), len(l2)

In [None]:

# Pull the three fields of interest out of the first collected tweet by key, in a fixed order:
# [created_at, id, full_text]. Selecting by key (instead of relying on the iteration order of the
# raw payload) guarantees that d[0] really is the timestamp parsed below.
first_tweet_raw = json.loads(l[0].to_json_line())['raw_value']
d = [first_tweet_raw[field] for field in ('created_at', 'id', 'full_text')]
# Convert the timestamp string (weekday, month, day, time, UTC offset, year) to a calendar date.
d[0] = arrow.get(d[0], 'ddd MMM DD HH:mm:ss Z YYYY').datetime.date()

In [None]:
# Display the fields extracted from the first tweet (d[0] has been converted to a date).
d

In [None]:
# Count the distinct values of the second retained field across all collected tweets.
# NOTE(review): position [1] depends on the key order of the raw payload — presumably this is
# the tweet id; confirm.
len({
    [
        value
        for key, value in json.loads(x.to_json_line())['raw_value'].items()
        if key in ['full_text','created_at','id']
    ][1]
    for x in l
})