In [1]:
import json
import csv
import numpy as np
import torch
import torch.nn as nn
from torch.utils import data
from tqdm import tqdm_notebook
import torch.optim as optim
import torchtext
from torchtext.data import Field, BucketIterator, Iterator, TabularDataset
import math
import pickle

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
torch.backends.cudnn.benchmark = True

## Load data

In [4]:
with open('../../data/processed/splits/train/stock_data.json', 'r') as inp:
    train_stock = json.load(inp)
with open('../../data/processed/splits/valid/stock_data.json', 'r') as inp:
    valid_stock = json.load(inp)
with open('../../data/processed/splits/test/stock_data.json', 'r') as inp:
    test_stock = json.load(inp)

In [5]:
ID = Field(
    sequential=False
)
TRANSCRIPT = Field(
    sequential=True,
    fix_length=11000,
    lower=True
)
LABEL = Field(
    sequential=False,
    dtype=torch.float64
)

In [6]:
class StockDataset(data.Dataset):
    def __init__(self, examples):
        examples = np.array(examples)
        self.labels = examples[:,-1]
        self.market_cap = examples[:,-2]
        self.examples = np.array(examples[:,:-2].tolist())

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.examples)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Load data and get label
        X = torch.tensor(self.examples[index])
        auxiliary = self.market_cap[index]
        y = self.labels[index]

        return X, auxiliary, y

In [7]:
def build_datasets():
    train, valid, test = TabularDataset.splits(
        path='../../data/processed/splits',
        format='csv',
        skip_header=True,
        train='train/transcripts.csv',
        validation='valid/transcripts.csv',
        test='test/transcripts.csv',
        fields=[('id', ID), ('transcript', TRANSCRIPT), ('post_high', LABEL)]
    )
    glove = torchtext.vocab.GloVe(name='6B', dim=50)
    TRANSCRIPT.build_vocab(train, valid, test, vectors=glove)
    return train, valid, test

In [8]:
class LookingGlassDataset(data.Dataset):
    def __init__(self, stock_dataset, transcript_dataset):
        self.stock_data = stock_dataset
        self.transcript_data = transcript_dataset

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.stock_data)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Load data and get label
        stocks, stocks_aux, label = self.stock_data[index]
        transcript_example = self.transcript_data[index]
        assert math.isclose(float(transcript_example.post_high), label)

        transcript = torch.tensor(transcript_example.transcript)
        return stocks, stocks_aux, transcript, label

In [9]:
with open('vocab.pkl', 'rb') as inp:
    VOCAB = pickle.load(inp)

In [10]:
def preprocess_transcripts(train, valid, test):
    for dataset in [train, valid, test]:
        for example in dataset:
            if len(example.transcript) > 11000:
                example.transcript = example.transcript[:11000]
            else:
                remainder = 11000 - len(example.transcript)
                example.transcript += ['<pad>']*remainder
            example.transcript = list(map(lambda x: VOCAB.stoi[x], example.transcript))
    return train, valid, test

train_transcript, valid_transcript, test_transcript = preprocess_transcripts(*build_datasets())

In [11]:
stock_datasets = {
    'train': StockDataset(train_stock),
    'valid': StockDataset(valid_stock),
    'test': StockDataset(test_stock)
}

train_dataset = LookingGlassDataset(stock_datasets['train'], train_transcript)
valid_dataset = LookingGlassDataset(stock_datasets['valid'], valid_transcript)
test_dataset = LookingGlassDataset(stock_datasets['test'], test_transcript)

## Define Models

In [66]:
class BaselineStockPredictor(nn.Module):
    """
    Model that will read in plain stock ticker values over time and decide whether to buy, sell, or hold at the current price.
    """
    def __init__(self, num_series_features=1, num_auxiliary_features=1, hidden_size=128, output_size=1):
        """
        Attributes:
            num_series_features: the size of the feature set for an individual
                                 stock price example (e.g. if we include high,
                                 low, average, num_series_features will equal 3
            num_auxiliary_features: the number of auxiliary (not dependent on time)
                                    features we are adding (e.g. if we include the 1yr
                                    high and the market capitalization, num_auxiliary_features
                                    would equal 2
            output_size: the size of the outputted vector. For evaluation, we would use a
                         size of 1 (stock price) or 3 (buy, sell, hold classification).
                         For use in the looking glass model, we want an encoding so we might
                         use a size of 128 to feed into the model.
        """
        super().__init__()
        self.recurrent = nn.LSTM(
            input_size=num_series_features,
            hidden_size=hidden_size,
            num_layers=1,
            bidirectional=False,
            batch_first=True,
            dropout=0.5
        )
        # concatenate LSTM output with auxiliary features
        # output predicted price
        self.linear1 = nn.Linear(hidden_size+num_auxiliary_features, output_size)
        self.init_weights()

    def init_weights(self):
        """
        Initializes the weights of the model
        """
        for layer in [self.linear1]:
            nn.init.xavier_uniform_(layer.weight)
            nn.init.constant_(layer.bias, 0.0)

    def forward(self, X_series, X_auxiliary):
        """
        Moves the model through each layer
        Parameters:
            X_series: an [N, num_series_examples, num_series_features] size vector
                      where N is the batch size, num_series_examples is how many stock prices
                      we are providing per example (e.g. weekly for the last 3 months), and
                      num_series_features is the same as described in __init__
            X_auxiliary: an [N, num_auxiliary_features] vector
        """
        recurrent_output,_ = self.recurrent(X_series)
        recurrent_output = torch.mean(recurrent_output, 1)
        # We might need this
        # recurrent_output = torch.squeeze(1) 
        aux_combined = torch.cat([recurrent_output, X_auxiliary], dim=1)
        output = self.linear1(aux_combined)

        return output

In [101]:
class LookingGlassPredictor(nn.Module):
    """
    Model that will use the Baseline predictor as well as earnings call information to decide whether to buy, sell, or hold at the current price
    """
    def __init__(self, num_series_features=1, hidden_size=64, num_auxiliary_features=1, max_call_len=11000, baseline_weights=None):
        """
        Initializes the model.
        Attributes:
            (see baseline.py for num_series_features and num_auxiliary_features)
            max_call_len: maximum number of tokens allowed in an earnings call transcript.
                          We will need to pad each earnings call to be this length (or truncate
                          if the call is too long)
            num_auxiliary_call_features: # non-transcript related features (e.g. if we
                                         include sentiment, ambiguity score, and
                                         confidence score, the num_auxiliary_call_features
                                         would equal 3
        """
        super().__init__()
        self.baseline = BaselineStockPredictor(
            num_series_features=num_series_features,
            num_auxiliary_features=num_auxiliary_features,
            hidden_size=hidden_size,
            output_size=hidden_size
        ).cuda()
        
        if baseline_weights is not None:
            for (name, value),param in zip(baseline_weights.items(), self.baseline.parameters()):
                if 'linear2' not in name:
                    if param.data.shape == value.shape:
                        param.data = value
#                         param.requires_grad = False
                    else:
                        print(name, param.data.shape, value.shape)
        
        self.embedding = nn.Embedding.from_pretrained(VOCAB.vectors, freeze=True)
        self.recurrent = nn.LSTM(
            input_size=50,
            hidden_size=hidden_size,
            num_layers=1,
            bidirectional=False,
            batch_first=True,
            dropout=0.5
        )
        self.rec_linear = nn.Linear(hidden_size, hidden_size)
        self.combined_linear = nn.Linear(2*hidden_size, hidden_size)
        self.final_linear = nn.Linear(hidden_size, 1)
    
        self.init_weights()

    def init_weights(self):
        """
        Initialize the model weights
        """
        self.baseline.init_weights()
        for layer in [self.rec_linear, self.combined_linear, self.final_linear]:
            nn.init.xavier_uniform_(layer.weight)
            nn.init.constant_(layer.bias, 0.0)

    def forward(self, X_series, X_auxiliary, X_transcript):
        """
        Moves the model through each layer
        Parameters:
            (see baseline.py for X_series and X_auxiliary)
            X_transcript: an [N, max_series_features, embedding_size] vector
            X_transcript_auxiliary: an [N, num_auxiliary_features] vector
        """
        baseline_output = self.baseline.forward(X_series, X_auxiliary)
        baseline_activated = nn.functional.relu(baseline_output)

        transcript_embeddings = self.embedding(X_transcript)
        recurrent_output,_ = self.recurrent(transcript_embeddings)
        recurrent_output = torch.mean(recurrent_output, 1)
        
#         aux_combined = torch.cat([recurrent_output, X_transcript_auxiliary], dim=1)
        output = self.rec_linear(recurrent_output)
        output_activated = nn.functional.relu(output)

        stock_transcript_joint_layer = torch.cat([baseline_activated, output_activated], dim=1)
        z1 = self.combined_linear(stock_transcript_joint_layer)
        a1 = nn.functional.relu(z1)
        
        final_output = self.final_linear(a1)
        
        return final_output

## Load pre-trained model

In [82]:
baseline_model = torch.load('../../data/models/stock_6k.pt')

## Train LR Model

In [112]:
def get_lg_iterator(dataset, batch_size, train=True, shuffle=True):
    iterator = data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=5)
    return iterator
    
def train_model(train, valid, baseline, num_epochs=200, learning_rate=0.003, existing_model=None):
    batch_size = 64
    train_iterator = get_lg_iterator(train, batch_size)
    valid_iterator = get_lg_iterator(valid, batch_size)
    
    model = None

    if existing_model is None:
        model = LookingGlassPredictor(num_series_features=2, hidden_size=64, baseline_weights=baseline.state_dict())
    else:
        model = existing_model
    model = model.float()
    model = model.cuda()
    
    criterion = nn.MSELoss()
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), learning_rate)
    
    losses = []
    valid_scores = []
    
    min_mse = float('inf')
    delay = 0
    MAX_INC = 80
    
    for epoch in tqdm_notebook(range(num_epochs)):
        model.train()
        iter_losses = []
        print('Starting epoch', epoch)
        for batch_stock_series, batch_stock_aux, batch_transcripts, batch_labels in train_iterator:
            batch_stock_aux = torch.reshape(batch_stock_aux, (-1,1))
            optimizer.zero_grad()
            outputs = model(batch_stock_series.float().cuda(), batch_stock_aux.float().cuda(), batch_transcripts.long().cuda())
            batch_labels = torch.reshape(batch_labels, (-1,1))
            loss = criterion(outputs.cuda(), batch_labels.float().cuda())
            loss.backward()
            optimizer.step()
            iter_losses.append(loss.item())
        iter_losses = np.array(iter_losses)
        losses.append(np.mean(iter_losses))
        
        valid_mse = []
        model.eval()

        for valid_batch_stock_series, valid_batch_stock_aux, valid_batch_transcripts, valid_batch_labels in valid_iterator:
            valid_batch_stock_aux = torch.reshape(valid_batch_stock_aux, (-1,1))
            outputs = model(valid_batch_stock_series.float().cuda(), valid_batch_stock_aux.float().cuda(), valid_batch_transcripts.long().cuda())
            valid_batch_labels = torch.reshape(valid_batch_labels, (-1,1))
            loss = criterion(outputs.cuda(), valid_batch_labels.float().cuda())
            valid_mse.append(loss.item())
        valid_mse = np.mean(valid_mse)
        print(f'Completed epoch {epoch}. Valid MSE: {valid_mse}')


        if valid_mse < min_mse:
            min_mse = valid_mse
            delay = 0
            torch.save(model, 'lg_model.ckpt')
        else:
            delay += 1
        if delay > MAX_INC:
            print('Stopping early')
            break
    
    return model, losses

In [113]:
lg_model2, losses1 = train_model(train_dataset, valid_dataset, baseline_model, num_epochs=200, existing_model = lg_model1)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=200.0), HTML(value='')))

Starting epoch 0
Completed epoch 0. Valid MSE: 3868.6046142578125
Starting epoch 1
Completed epoch 1. Valid MSE: 2517.458251953125
Starting epoch 2
Completed epoch 2. Valid MSE: 369.23743438720703
Starting epoch 3
Completed epoch 3. Valid MSE: 340.4572525024414
Starting epoch 4
Completed epoch 4. Valid MSE: 264.6954460144043
Starting epoch 5
Completed epoch 5. Valid MSE: 418.76341247558594
Starting epoch 6
Completed epoch 6. Valid MSE: 295.20481872558594
Starting epoch 7
Completed epoch 7. Valid MSE: 299.5159606933594
Starting epoch 8
Completed epoch 8. Valid MSE: 378.79632568359375
Starting epoch 9
Completed epoch 9. Valid MSE: 308.86622619628906
Starting epoch 10
Completed epoch 10. Valid MSE: 182.84267234802246
Starting epoch 11
Completed epoch 11. Valid MSE: 302.1366271972656
Starting epoch 12
Completed epoch 12. Valid MSE: 337.833984375
Starting epoch 13
Completed epoch 13. Valid MSE: 256.4351501464844
Starting epoch 14
Completed epoch 14. Valid MSE: 287.48252868652344
Starting ep

Completed epoch 122. Valid MSE: 854.3472595214844
Starting epoch 123
Completed epoch 123. Valid MSE: 1357.2125854492188
Starting epoch 124
Completed epoch 124. Valid MSE: 577.3701019287109
Starting epoch 125
Completed epoch 125. Valid MSE: 297.90354919433594
Starting epoch 126
Completed epoch 126. Valid MSE: 356.0046081542969
Starting epoch 127
Completed epoch 127. Valid MSE: 431.24488830566406
Starting epoch 128
Completed epoch 128. Valid MSE: 545.0994110107422
Starting epoch 129
Completed epoch 129. Valid MSE: 250.21267700195312
Starting epoch 130
Completed epoch 130. Valid MSE: 209.8308334350586
Starting epoch 131
Completed epoch 131. Valid MSE: 205.9602813720703
Starting epoch 132
Completed epoch 132. Valid MSE: 179.7796220779419
Starting epoch 133
Completed epoch 133. Valid MSE: 236.35184478759766
Starting epoch 134
Completed epoch 134. Valid MSE: 285.6539993286133
Starting epoch 135
Completed epoch 135. Valid MSE: 252.4657745361328
Stopping early


In [71]:
baseline_model.state_dict().keys()

odict_keys(['recurrent.weight_ih_l0', 'recurrent.weight_hh_l0', 'recurrent.bias_ih_l0', 'recurrent.bias_hh_l0', 'linear1.weight', 'linear1.bias', 'linear2.weight', 'linear2.bias'])

In [125]:
def eval_model(model, test):
    batch_size = 128
    test_iterator = get_lg_iterator(test, batch_size, shuffle=False)
    model.eval()
        
    criterion = nn.MSELoss()
    
    losses = []
    for batch_stock_series, batch_stock_aux, batch_transcripts, batch_labels in test_iterator:
        batch_stock_aux = torch.reshape(batch_stock_aux, (-1,1))
        outputs = model(batch_stock_series.float().cuda(), batch_stock_aux.float().cuda(), batch_transcripts.long().cuda())
        batch_labels = torch.reshape(batch_labels, (-1,1))
        loss = criterion(outputs.cuda(), batch_labels.float().cuda())
        losses.append(loss.item())
    return np.mean(losses)

def eval_model_baseline(model, test):
    batch_size = 128
    test_iterator = get_lg_iterator(test, batch_size, shuffle=False)
        
    criterion = nn.MSELoss()
    
    model.eval()
    losses = []
    for batch_stock_series, batch_stock_aux, batch_transcripts, batch_labels in test_iterator:
        batch_stock_aux = torch.reshape(batch_stock_aux, (-1,1))
        outputs = model(batch_stock_series.float().cuda(), batch_stock_aux.float().cuda())
        batch_labels = torch.reshape(batch_labels, (-1,1))
        loss = criterion(outputs, batch_labels.float().cuda())
        losses.append(loss.item())
    return np.mean(losses)

In [126]:
eval_model(lg_model2, test_dataset)

621.9088134765625

In [119]:
len(losses), len(losses1)

(88, 136)

In [120]:
losses + losses1

[57791.01806640625,
 41765.090576171875,
 49242.14697265625,
 39555.488037109375,
 38901.3603515625,
 38614.108154296875,
 46313.680013020836,
 39562.42635091146,
 38635.38728841146,
 45981.48543294271,
 38271.709635416664,
 38408.105631510414,
 38231.034423828125,
 37923.4111328125,
 38867.24694824219,
 38267.087076822914,
 37828.23779296875,
 37453.446044921875,
 46381.73478190104,
 45597.123779296875,
 37574.11551920573,
 37298.36083984375,
 37042.51021321615,
 37398.85404459635,
 37239.03377278646,
 37695.6650390625,
 38167.640625,
 36792.414123535156,
 44308.742431640625,
 36920.99650065104,
 45672.737467447914,
 37451.074055989586,
 37104.071614583336,
 43765.85359700521,
 36727.4472249349,
 47291.23392740885,
 36761.545654296875,
 44285.685384114586,
 36750.63415527344,
 39795.615559895836,
 37035.04890950521,
 37462.063802083336,
 38940.532958984375,
 36781.2431640625,
 36941.57413736979,
 36478.21048990885,
 36880.685221354164,
 36534.810709635414,
 36852.89786783854,
 39132.1