In [1]:
import json
import csv
import numpy as np
import torch
import torch.nn as nn
from torch.utils import data
from tqdm import tqdm_notebook
import torch.optim as optim
import torchtext
from torchtext.data import Field, BucketIterator, Iterator, TabularDataset
import math
import pickle

In [2]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
# dtype = torch.FloatTensor
dtype = torch.cuda.FloatTensor # Uncomment this to run on GPU

In [4]:
torch.backends.cudnn.benchmark = True

## Load Data

In [5]:
with open('../../data/processed/splits/train/stock_data.json', 'r') as inp:
    train_stock = json.load(inp)
with open('../../data/processed/splits/valid/stock_data.json', 'r') as inp:
    valid_stock = json.load(inp)
with open('../../data/processed/splits/test/stock_data.json', 'r') as inp:
    test_stock = json.load(inp)

In [6]:
ID = Field(
    sequential=False
)
TRANSCRIPT = Field(
    sequential=True,
    fix_length=11000,
    lower=True
)
LABEL = Field(
    sequential=False,
    dtype=torch.float64
)

In [7]:
class StockDataset(data.Dataset):
    def __init__(self, examples):
        examples = np.array(examples)
        self.labels = examples[:,-1]
        self.market_cap = examples[:,-2]
        self.examples = np.array(examples[:,:-2].tolist())

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.examples)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Load data and get label
        X = torch.tensor(self.examples[index])
        auxiliary = self.market_cap[index]
        y = self.labels[index]

        return X, auxiliary, y

In [8]:
def build_datasets():
    train, valid, test = TabularDataset.splits(
        path='../../data/processed/splits',
        format='csv',
        skip_header=True,
        train='train/transcripts.csv',
        validation='valid/transcripts.csv',
        test='test/transcripts.csv',
        fields=[('id', ID), ('transcript', TRANSCRIPT), ('post_high', LABEL)]
    )
    glove = torchtext.vocab.GloVe(name='6B', dim=50)
    TRANSCRIPT.build_vocab(train, valid, test, vectors=glove)
    return train, valid, test

In [16]:
class LookingGlassDataset(data.Dataset):
    def __init__(self, stock_dataset, transcript_dataset):
        self.stock_data = stock_dataset
        self.transcript_data = transcript_dataset

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.stock_data)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Load data and get label
        stocks, stocks_aux, label = self.stock_data[index]
        transcript_example = self.transcript_data[index]
        assert math.isclose(float(transcript_example.post_high), label)

        transcript = torch.tensor(transcript_example.transcript)
        return stocks, stocks_aux, transcript, label

In [10]:
with open('vocab.pkl', 'rb') as inp:
    VOCAB = pickle.load(inp)

In [11]:
def preprocess_transcripts(train, valid, test):
    for dataset in [train, valid, test]:
        for example in dataset:
            if len(example.transcript) > 11000:
                example.transcript = example.transcript[:11000]
            else:
                remainder = 11000 - len(example.transcript)
                example.transcript += ['<pad>']*remainder
            example.transcript = list(map(lambda x: VOCAB.stoi[x], example.transcript))
    return train, valid, test

train_transcript, valid_transcript, test_transcript = preprocess_transcripts(*build_datasets())

In [17]:
stock_datasets = {
    'train': StockDataset(train_stock),
    'valid': StockDataset(valid_stock),
    'test': StockDataset(test_stock)
}

train_dataset = LookingGlassDataset(stock_datasets['train'], train_transcript)
valid_dataset = LookingGlassDataset(stock_datasets['valid'], valid_transcript)
test_dataset = LookingGlassDataset(stock_datasets['test'], test_transcript)

In [15]:
# with open('vocab.pkl', 'wb') as out:
#     pickle.dump(TRANSCRIPT.vocab, out)

## Train Model

In [13]:
class BaselineStockPredictor(nn.Module):
    """
    Model that will read in plain stock ticker values over time and decide whether to buy, sell, or hold at the current price.
    """
    def __init__(self, num_series_features=1, num_auxiliary_features=1, hidden_size=128, output_size=1):
        """
        Attributes:
            num_series_features: the size of the feature set for an individual
                                 stock price example (e.g. if we include high,
                                 low, average, num_series_features will equal 3
            num_auxiliary_features: the number of auxiliary (not dependent on time)
                                    features we are adding (e.g. if we include the 1yr
                                    high and the market capitalization, num_auxiliary_features
                                    would equal 2
            output_size: the size of the outputted vector. For evaluation, we would use a
                         size of 1 (stock price) or 3 (buy, sell, hold classification).
                         For use in the looking glass model, we want an encoding so we might
                         use a size of 128 to feed into the model.
        """
        super().__init__()
        self.recurrent = nn.LSTM(
            input_size=num_series_features,
            hidden_size=hidden_size,
            num_layers=1,
            bidirectional=False,
            batch_first=True,
            dropout=0.5
        )
        # concatenate LSTM output with auxiliary features
        # output predicted price
        self.linear = nn.Linear(hidden_size+num_auxiliary_features, output_size)
        self.init_weights()

    def init_weights(self):
        """
        Initializes the weights of the model
        """
        for layer in [self.linear]:
            nn.init.xavier_uniform_(layer.weight)
            nn.init.constant_(layer.bias, 0.0)

    def forward(self, X_series, X_auxiliary):
        """
        Moves the model through each layer
        Parameters:
            X_series: an [N, num_series_examples, num_series_features] size vector
                      where N is the batch size, num_series_examples is how many stock prices
                      we are providing per example (e.g. weekly for the last 3 months), and
                      num_series_features is the same as described in __init__
            X_auxiliary: an [N, num_auxiliary_features] vector
        """
        recurrent_output,_ = self.recurrent(X_series)
        recurrent_output = torch.mean(recurrent_output, 1)
        # We might need this
        # recurrent_output = torch.squeeze(1) 
        aux_combined = torch.cat([recurrent_output, X_auxiliary], dim=1)
        output = self.linear(aux_combined)

        return output

In [18]:
class LookingGlassPredictor(nn.Module):
    """
    Model that will use the Baseline predictor as well as earnings call information to decide whether to buy, sell, or hold at the current price
    """
    def __init__(self, num_series_features=1, hidden_size=64, num_auxiliary_features=1, max_call_len=11000):
        """
        Initializes the model.
        Attributes:
            (see baseline.py for num_series_features and num_auxiliary_features)
            max_call_len: maximum number of tokens allowed in an earnings call transcript.
                          We will need to pad each earnings call to be this length (or truncate
                          if the call is too long)
            num_auxiliary_call_features: # non-transcript related features (e.g. if we
                                         include sentiment, ambiguity score, and
                                         confidence score, the num_auxiliary_call_features
                                         would equal 3
        """
        super().__init__()
        self.baseline = BaselineStockPredictor(
            num_series_features=num_series_features,
            num_auxiliary_features=num_auxiliary_features,
            output_size=hidden_size
        ).cuda()
        self.embedding = nn.Embedding.from_pretrained(VOCAB.vectors, freeze=True)
        self.recurrent = nn.LSTM(
            input_size=50,
            hidden_size=hidden_size,
            num_layers=1,
            bidirectional=False,
            batch_first=True,
            dropout=0.5
        )
        self.rec_linear = nn.Linear(hidden_size, hidden_size)
        self.combined_linear = nn.Linear(2*hidden_size, hidden_size)
        self.final_linear = nn.Linear(hidden_size, 1)
    
        self.init_weights()

    def init_weights(self):
        """
        Initialize the model weights
        """
        self.baseline.init_weights()
        for layer in [self.rec_linear, self.combined_linear, self.final_linear]:
            nn.init.xavier_uniform_(layer.weight)
            nn.init.constant_(layer.bias, 0.0)

    def forward(self, X_series, X_auxiliary, X_transcript):
        """
        Moves the model through each layer
        Parameters:
            (see baseline.py for X_series and X_auxiliary)
            X_transcript: an [N, max_series_features, embedding_size] vector
            X_transcript_auxiliary: an [N, num_auxiliary_features] vector
        """
        baseline_output = self.baseline.forward(X_series, X_auxiliary)
        baseline_activated = nn.functional.relu(baseline_output)

        transcript_embeddings = self.embedding(X_transcript)
        recurrent_output,_ = self.recurrent(transcript_embeddings)
        recurrent_output = torch.mean(recurrent_output, 1)
        
#         aux_combined = torch.cat([recurrent_output, X_transcript_auxiliary], dim=1)
        output = self.rec_linear(recurrent_output)
        output_activated = nn.functional.relu(output)

        stock_transcript_joint_layer = torch.cat([baseline_activated, output_activated], dim=1)
        z1 = self.combined_linear(stock_transcript_joint_layer)
        a1 = nn.functional.relu(z1)
        
        final_output = self.final_linear(a1)
        
        return final_output

In [19]:
def get_lg_iterator(dataset, batch_size, train=True, shuffle=True):
    iterator = data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=5)
    return iterator
    
def train_model(train, valid, num_epochs=200, learning_rate=0.003):
    batch_size = 64
    train_iterator = get_lg_iterator(train, batch_size)
    valid_iterator = get_lg_iterator(valid, batch_size)
    
    model = LookingGlassPredictor(num_series_features=2, hidden_size=64)
    model = model.float()
    model = model.cuda()
    
    criterion = nn.MSELoss()
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), learning_rate)
    
    losses = []
    valid_scores = []
    
    min_mse = float('inf')
    delay = 0
    MAX_INC = 100
    
    for epoch in tqdm_notebook(range(num_epochs)):
        model.train()
        iter_losses = []
        print('Starting epoch', epoch)
        for batch_stock_series, batch_stock_aux, batch_transcripts, batch_labels in train_iterator:
            batch_stock_aux = torch.reshape(batch_stock_aux, (-1,1))
            optimizer.zero_grad()
            outputs = model(batch_stock_series.float().cuda(), batch_stock_aux.float().cuda(), batch_transcripts.long().cuda())
            batch_labels = torch.reshape(batch_labels, (-1,1))
            loss = criterion(outputs.cuda(), batch_labels.float().cuda())
            loss.backward()
            optimizer.step()
            iter_losses.append(loss.item())
        iter_losses = np.array(iter_losses)
        losses.append(np.mean(iter_losses))
        
        valid_mse = []
        model.eval()

        for valid_batch_stock_series, valid_batch_stock_aux, valid_batch_transcripts, valid_batch_labels in valid_iterator:
            valid_batch_stock_aux = torch.reshape(valid_batch_stock_aux, (-1,1))
            outputs = model(valid_batch_stock_series.float().cuda(), valid_batch_stock_aux.float().cuda(), valid_batch_transcripts.long().cuda())
            valid_batch_labels = torch.reshape(valid_batch_labels, (-1,1))
            loss = criterion(outputs.cuda(), valid_batch_labels.float().cuda())
            valid_mse.append(loss.item())
        valid_mse = np.mean(valid_mse)
        print(f'Completed epoch {epoch}. Valid MSE: {valid_mse}')


        if valid_mse < min_mse:
            min_mse = valid_mse
            delay = 0
            torch.save(model, 'lg_model.ckpt')
        else:
            delay += 1
        if delay > MAX_INC:
            print('Stopping early')
            break
    
    return model, losses

In [20]:
model = train_model(train_dataset, valid_dataset, num_epochs=1000)

  "num_layers={}".format(dropout, num_layers))
Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Starting epoch 0
Completed epoch 0. Valid MSE: 23785.08984375
Starting epoch 1


  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "
  "type " + obj.__name__ + ". It won't be checked "


Completed epoch 1. Valid MSE: 20688.765625
Starting epoch 2
Completed epoch 2. Valid MSE: 16154.099365234375
Starting epoch 3
Completed epoch 3. Valid MSE: 16766.5986328125
Starting epoch 4
Completed epoch 4. Valid MSE: 11740.748046875
Starting epoch 5
Completed epoch 5. Valid MSE: 7772.623291015625
Starting epoch 6
Completed epoch 6. Valid MSE: 4198.9544677734375
Starting epoch 7
Completed epoch 7. Valid MSE: 1839.4732971191406
Starting epoch 8
Completed epoch 8. Valid MSE: 2023.5560302734375
Starting epoch 9
Completed epoch 9. Valid MSE: 1521.7385864257812
Starting epoch 10
Completed epoch 10. Valid MSE: 4094.26904296875
Starting epoch 11
Completed epoch 11. Valid MSE: 1079.9304809570312
Starting epoch 12
Completed epoch 12. Valid MSE: 517.68017578125
Starting epoch 13
Completed epoch 13. Valid MSE: 330.5074157714844
Starting epoch 14
Completed epoch 14. Valid MSE: 660.12451171875
Starting epoch 15
Completed epoch 15. Valid MSE: 417.8621520996094
Starting epoch 16
Completed epoch 16.

Completed epoch 123. Valid MSE: 822.9580841064453
Starting epoch 124
Completed epoch 124. Valid MSE: 987.074951171875
Starting epoch 125
Completed epoch 125. Valid MSE: 863.0892333984375
Starting epoch 126
Completed epoch 126. Valid MSE: 355.7365531921387
Starting epoch 127
Completed epoch 127. Valid MSE: 539.4450836181641
Starting epoch 128
Completed epoch 128. Valid MSE: 525.7629547119141
Starting epoch 129
Completed epoch 129. Valid MSE: 638.437744140625
Starting epoch 130
Completed epoch 130. Valid MSE: 761.9725646972656
Starting epoch 131
Completed epoch 131. Valid MSE: 557.3621520996094
Starting epoch 132
Completed epoch 132. Valid MSE: 525.2748565673828
Starting epoch 133
Completed epoch 133. Valid MSE: 850.2799682617188
Starting epoch 134
Completed epoch 134. Valid MSE: 1857.0657958984375
Starting epoch 135
Completed epoch 135. Valid MSE: 791.0437316894531
Starting epoch 136
Completed epoch 136. Valid MSE: 517.6451416015625
Starting epoch 137
Completed epoch 137. Valid MSE: 146

In [22]:
def eval_model(model, test):
    batch_size = 64
    test_iterator = get_lg_iterator(test, batch_size)
    model.eval()
        
    criterion = nn.MSELoss()
    
    losses = []
    for batch_stock_series, batch_stock_aux, batch_transcripts, batch_labels in test_iterator:
        batch_stock_aux = torch.reshape(batch_stock_aux, (-1,1))
        outputs = model(batch_stock_series.float().cuda(), batch_stock_aux.float().cuda(), batch_transcripts.long().cuda())
        batch_labels = torch.reshape(batch_labels, (-1,1))
        loss = criterion(outputs.cuda(), batch_labels.float().cuda())
        losses.append(loss.item())
    return np.mean(losses)

In [23]:
eval_model(model[0], test_dataset)

362.0254936218262

In [25]:
with open('lg_losses_180_epochs.json', 'w') as out:
    json.dump(model[1], out)