        Prototyping notebook for predicting stock volatility, prices, etc. using extra data from web trends, news, etc.

In [None]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
pd.set_option('io.parquet.engine', 'pyarrow')
import numpy as np
import os
import gc, os
from getpass import getpass
from utils import *
from models import *
import gc
import torch
import arrow
import matplotlib.pyplot as plt

# Reproducibility: a single seed feeds NumPy's legacy global RNG, torch's RNG,
# and a dedicated NumPy Generator (`rng`).
seed = 42
np.random.seed(seed)
torch.manual_seed(seed)
rng = np.random.default_rng(seed=seed)

# Toggle for the API-backed downloads; set it to False if you don't need them.
use_api = True

# Device handles. The GPU device is hardcoded on purpose so that an incorrect CUDA
# install is noticed; the models are not practical to train on CPU.
cpu = torch.device("cpu")
cuda = torch.device("cuda")

print('Available GPU memory:', available_mem(), 'GB')

In [None]:
# API credentials
# 1. Enter a password. It is used to create (first run) or unlock (later runs) the encrypted API keys.
# 2. On the first run you are also asked for your username + a single space + your Kaggle API key
#    (don't use quotes).
# 3. Any other API keys added in future follow the same format.
#
# The first run may take a while to load, depending on your internet speed.
# After that first run you only need to enter the password to load the API keys.
# NOTE: Don't delete or move salt.secret, as all tokens will become undecryptable.
# If you made a mistake or need to retry, you may 1) delete salt.secret to reset everything,
# or 2) delete the specific .secret key to re-enter only that info.
if use_api:
    # Ask for the password used to save/load the API keys
    password = getpass("Enter password to save/load API keys: ")
    try:
        kaggle_api_key = fernet_key_encryption(password, 'Kaggle')
        #td_ameritrade_api_key = fernet_key_encryption(password, 'TD_Ameritrade')
        #data_nasdaq_key = fernet_key_encryption(password, 'Nasdaq')
    finally:
        # Drop the plaintext password even when unlocking fails, so it never
        # lingers in the kernel namespace after this cell.
        del password
        gc.collect()
    get_datasets(kaggle_api_key)

In [None]:
# Gathers a dataframe of useful terms and info related to every stock in data\Stock_List.parquet.
# This may take up to 40 minutes the first time (the web scraping is slow because requests are
# throttled). Results are stored in company_list.pkl.
# NOTE(review): the call below passes 'data/Stock/' while the note above mentions
# data\Stock_List.parquet — confirm which location the helper actually reads.
search_terms = aquire_stock_search_terms('data/Stock/')

In [None]:
# Sentiment-analysis training inputs (skip unless you are training that model).
# Timing: ~2m 42s on the first run, ~25s afterwards.
(
    classes,
    train_triplets,
    test_triplets,
    x_train,
    y_train,
    x_test,
    y_test,
) = prep_triplet_data(MODEL=f"cardiffnlp/twitter-xlm-roberta-base-sentiment")
ds_train, ds_test = prep_tensor_ds(x_train, y_train, x_test, y_test)

# The siamese network is constructed from the number of classes and moved to the GPU.
classes_len = len(classes)
siamese_network_model = siamese_network(classes_len).to(cuda)

In [None]:
# Pre-train on the triplet data. This trains slower than a traditional torch dataset
# would, since the triplet dataloader does not preprocess the data in parallel.
siamese_model, history = pre_train_using_siamese(
    train_triplets,
    test_triplets,
    siamese_network_model,
    epochs=2,
    classes=classes,
)

In [None]:
# Wrap the siamese network in the single-input classifier, move it to the GPU, then train it.
# Note: `history` from the pre-training cell is replaced by this cell's training history.
model = classify_single_input(siamese_network_model).to(cuda)
model, history = train_emotion_classifier(model, ds_train, ds_test, epochs=2)

In [None]:
# Save the trained classifier's weights (state_dict) to disk.
directory = 'models'
# exist_ok=True creates the folder when missing and is a no-op when it is already
# there, avoiding the separate exists-then-create check.
os.makedirs(directory, exist_ok=True)
torch.save(model.state_dict(), f'{directory}/emotion_classifier.pt')

In [None]:
# Shell equivalent: !tor --controlport 9051
# Generate a random 32-byte password (hex encoded) and hand it to tor through the shell;
# the trailing `&` is meant to run the command asynchronously instead of blocking the cell.
# Alternative flag that was tried: --controlpassword {tor_pass}
# NOTE(review): `--hash-password` presumably only prints the hash of the password — confirm
# that this command actually leaves a tor process running.
tor_pass = os.urandom(32).hex()
os.system(
    f'tor --controlport 9051 --hash-password {tor_pass} &'
)

In [None]:
# Replaces the randomly generated `tor_pass` from the previous cell with a fixed value.
# NOTE(review): the `16:` prefix looks like tor's hashed-control-password format rather than a
# plaintext password — confirm which form the control-port clients further down expect.
# NOTE(review): this is hardcoded credential material; consider loading it from the environment.
tor_pass = '16:8B50BA4D04CE492760868EC56A14B3E6D56D53F45A8E3435B8B59944E3';

In [None]:
# Create a file that stores the tor password, under the current user's home directory.
# An existing torrc is left untouched; the file is only written when it is missing.
import os
torrc = f"{os.path.expanduser('~')}/AppData/Roaming/tor/torrc"
if not os.path.exists(torrc):
    # The tor folder may not exist yet; without it open(..., 'w') raises FileNotFoundError.
    os.makedirs(os.path.dirname(torrc), exist_ok=True)
    # NOTE(review): only the bare password value is written; torrc entries are normally
    # "Option value" lines (e.g. HashedControlPassword ...) — confirm the intended content.
    with open(torrc, 'w') as f:
        f.write(tor_pass)

In [None]:
# Display the resolved torrc path as the cell output.
torrc

In [None]:
# Runs tor through the shell with control port 9051 and a literal password argument.
# NOTE(review): `--tor.password` does not look like a standard tor command-line option — confirm.
# NOTE(review): the password is hardcoded and the f-string has no placeholders, so `tor_pass`
# is not used by this command.
os.system(f'tor --controlport 9051 --tor.password "568456845678" &')#--controlpassword {tor_pass}

In [None]:

############################# Experiments with web scraping. Unfinished, so ignore if it does not work #############################
from newspaper import Article
import newspaper
import scrapy
import stweet as st
import requests
from stweet.search_runner import SearchRunContext
import tor_python_easy
import io
#arrow time
# import urllib.request as urllib2
import arrow
import json
import os
# import yaml
# INSTALL DOCKER COMPOSE

# if not os.path.exists('docker-compose.yml'):
#     url = 'https://raw.githubusercontent.com/markowanga/tor-python-easy/main/docker-compose.yml'
#     filename = 'docker-compose.yml'
#     urllib2.urlretrieve(url, filename);
#     with open('docker-compose.yml', 'r') as file:
#         config = yaml.safe_load(file);
#     config['services']['torproxy']['environment'][0]=f'PASSWORD={os.urandom(32).hex()}';
#     #with open('docker-compose.yml', 'w') as file:
#        # yaml.dump(config, file);

# with open('docker-compose.yml', 'r') as file:
#         config = yaml.safe_load(file);
#         tor_pass = config['services']['torproxy']['environment'][0].split('=')[1]; 

from tor_python_easy.tor_control_port_client import TorControlPortClient
from tor_python_easy.tor_socks_get_ip_client import TorSocksGetIpClient
# Both HTTP and HTTPS traffic go through the SOCKS5 proxy on localhost:9050.
proxy_config = {
    'http': 'socks5://localhost:9050',
    'https': 'socks5://localhost:9050',
}
ip_client = TorSocksGetIpClient(proxy_config)

# Control-port client on localhost:9051, authenticated with `tor_pass`.
tor_control_port_client = TorControlPortClient(
    control_address='localhost',
    control_port=9051,
    control_password=tor_pass,
)

def url_fetch(url):
    """Download a single article and return its main fields.

    Args:
        url: Address of the article to fetch.

    Returns:
        Tuple ``(text, title, publish_date, authors, summary, keywords)`` read from
        the processed ``Article``.
    """
    article = Article(url)
    article.download()
    article.parse()
    # summary and keywords are only populated by nlp(); after parse() alone they stay empty.
    # NOTE(review): nlp() relies on NLTK tokenizer data being installed — confirm it is available.
    article.nlp()
    return article.text, article.title, article.publish_date, article.authors, article.summary, article.keywords

def crawl_url(url):
    """Build a newspaper source for ``url`` and return its ``articles`` collection.

    Article memoization is switched off (``memoize_articles=False``).
    """
    return newspaper.build(url, memoize_articles=False).articles


In [None]:
# Search window (already converted to datetime objects) and query terms.
since = arrow.get('2021-01-01').datetime
until = arrow.get('2021-02-01').datetime
exact_words = None
all_words = None
any_word = '#apple'

# Web client preconfigured to go through the local tor proxy / control port.
web_client = st.DefaultTwitterWebClientProvider.get_web_client_preconfigured_for_tor_proxy(
    socks_proxy_url='socks5://localhost:9050',
    control_host='localhost',
    control_password=tor_pass,
    control_port=9051,
)

# collect1 receives the raw tweet data, collect2 the raw user data (see the runner below).
collect1 = st.CollectorRawOutput()
collect2 = st.CollectorRawOutput()
context = SearchRunContext()

tweets_task = st.SearchTweetsTask(
    since=since,
    until=until,
    any_word=any_word,
    exact_words=exact_words,
    all_words=all_words,
    tweets_limit=50,
    replies_filter=True,
)
runner = st.TweetSearchRunner(
    search_tweets_task=tweets_task,
    tweet_raw_data_outputs=[collect1],
    user_raw_data_outputs=[collect2],
    web_client=web_client,
    search_run_context=context,
)

runner.run()


In [None]:
# Display the runner's search context as the cell output.
runner.search_run_context

In [None]:
# Raw items gathered by the tweet collector (collect1 was passed as tweet_raw_data_outputs).
l = collect1.get_raw_list()

# Raw items gathered by the user collector (collect2 was passed as user_raw_data_outputs).
l2 = collect2.get_raw_list()

# Show how many items each collector holds.
len(l), len(l2)

In [None]:

# Extract the fields of interest from the first collected tweet.
# The fields are selected by name in a fixed order, so d[0] is always `created_at`
# no matter how the keys happen to be ordered in the raw JSON payload.
first_tweet_raw = json.loads(l[0].to_json_line())['raw_value']
d = [first_tweet_raw[key] for key in ('created_at', 'id', 'full_text') if key in first_tweet_raw]
# Turn the created_at timestamp string into a date object.
d[0] = arrow.get(d[0], 'ddd MMM DD HH:mm:ss Z YYYY').datetime.date()

In [None]:
# Display the fields extracted from the first tweet.
d

In [None]:
# Number of distinct values of the second selected field across all collected tweets.
# NOTE(review): which field sits at index 1 depends on the key order inside raw_value — confirm.
len({
    [
        value
        for key, value in json.loads(x.to_json_line())['raw_value'].items()
        if key in ['full_text', 'created_at', 'id']
    ][1]
    for x in l
})