In [2]:
import json
import csv
import numpy as np
import torch
import torch.nn as nn
from torch.utils import data
from tqdm import tqdm_notebook
import torch.optim as optim

In [3]:
with open('../../data/processed/masked_full_transcripts.json', 'r') as inp:
    transcripts = json.load(inp)

In [9]:
transcripts[0]

{'id': 'bb7a9130-ae57-4228-a57f-abd5b6ab6b3a',
 'company': 'CTAS',
 'quarter': 'Q4 2019',
 'after_market': True,
 'market_cap': 27.1,
 'transcript': ['_',
  'S',
  'P',
  '_',
  'S',
  'T',
  'A',
  'R',
  'T',
  '_',
  'o',
  'p',
  'e',
  'r',
  'a',
  't',
  'o',
  'r',
  '_',
  'S',
  'P',
  '_',
  'E',
  'N',
  'D',
  '_',
  'l',
  'a',
  'd',
  'i',
  'e',
  's',
  'a',
  'n',
  'd',
  'g',
  'e',
  'n',
  't',
  'l',
  'e',
  '_MASKED_',
  'e',
  'n',
  't',
  'h',
  'a',
  'n',
  'k',
  'y',
  'o',
  'u',
  'f',
  'o',
  'r',
  's',
  't',
  'a',
  'n',
  'd',
  'i',
  'n',
  'g',
  'b',
  'y',
  'a',
  'n',
  'd',
  'w',
  'e',
  'l',
  'c',
  'o',
  '_MASKED_',
  'e',
  't',
  'o',
  't',
  'h',
  'e',
  'c',
  'u',
  'b',
  'i',
  'c',
  'c',
  'o',
  'r',
  'p',
  'o',
  'r',
  'a',
  't',
  'i',
  'o',
  'n',
  'f',
  'o',
  'u',
  'r',
  't',
  'h',
  'q',
  'u',
  'a',
  'r',
  't',
  'e',
  'r',
  'a',
  'n',
  'd',
  'f',
  'u',
  'l',
  'l',
  'y',
  'e',
  'a',
  'r'

## Split Data

In [4]:
np.random.shuffle(transcripts)

In [5]:
train_split = len(transcripts)*8//10
valid_split = len(transcripts)*1//10
train = transcripts[:train_split]
valid = transcripts[train_split:train_split+valid_split]
test = transcripts[train_split+valid_split:]

train_csv = [('id', 'transcript', 'post_high')]
train_stock = []
for example in train:
    train_csv.append((example['id'], example['transcript'], example['post_high_low'][0]))
    historical = map(lambda x: x[1], sorted(example['historical_info'], key=lambda x: x[0]))
    train_stock.append(list(map(lambda x: list(map(float, x)), historical)))
    train_stock[-1] += [example['market_cap'], example['post_high_low'][0]]

valid_csv = [('id', 'transcript', 'post_high')]
valid_stock = []
for example in valid:
    valid_csv.append((example['id'], example['transcript'], example['post_high_low'][0]))
    historical = map(lambda x: x[1], sorted(example['historical_info'], key=lambda x: x[0]))
    valid_stock.append(list(map(lambda x: list(map(float, x)), historical)))
    valid_stock[-1] += [example['market_cap'], example['post_high_low'][0]]

test_csv = [('id', 'transcript', 'post_high')]
test_stock = []
for example in test:
    test_csv.append((example['id'], example['transcript'], example['post_high_low'][0]))
    historical = map(lambda x: x[1], sorted(example['historical_info'], key=lambda x: x[0]))
    test_stock.append(list(map(lambda x: list(map(float, x)), historical)))
    test_stock[-1] += [example['market_cap'], example['post_high_low'][0]]

In [8]:
for row in train_csv[1:]:
    print(row)
    break

('bb7a9130-ae57-4228-a57f-abd5b6ab6b3a', ['_', 'S', 'P', '_', 'S', 'T', 'A', 'R', 'T', '_', 'o', 'p', 'e', 'r', 'a', 't', 'o', 'r', '_', 'S', 'P', '_', 'E', 'N', 'D', '_', 'l', 'a', 'd', 'i', 'e', 's', 'a', 'n', 'd', 'g', 'e', 'n', 't', 'l', 'e', '_MASKED_', 'e', 'n', 't', 'h', 'a', 'n', 'k', 'y', 'o', 'u', 'f', 'o', 'r', 's', 't', 'a', 'n', 'd', 'i', 'n', 'g', 'b', 'y', 'a', 'n', 'd', 'w', 'e', 'l', 'c', 'o', '_MASKED_', 'e', 't', 'o', 't', 'h', 'e', 'c', 'u', 'b', 'i', 'c', 'c', 'o', 'r', 'p', 'o', 'r', 'a', 't', 'i', 'o', 'n', 'f', 'o', 'u', 'r', 't', 'h', 'q', 'u', 'a', 'r', 't', 'e', 'r', 'a', 'n', 'd', 'f', 'u', 'l', 'l', 'y', 'e', 'a', 'r', 'f', 'i', 's', 'c', 'a', 'l', '_', 'M', 'A', 'S', 'K', 'E', 'D', '_', 'e', 'a', 'r', 'n', 'i', 'n', 'g', 's', 'c', 'o', 'n', 'f', 'e', 'r', 'e', 'n', 'c', 'e', 'c', 'a', 'l', 'l', 'o', 'p', 'e', 'r', 'a', 't', 'o', 'r', 'i', 'n', 's', 't', 'r', 'u', 'c', 't', 'i', 'o', 'n', 's', 'i', 'w', 'o', 'u', 'l', 'd', 'n', 'o', 'w', 'l', 'i', 'k', 'e',

In [6]:
with open('../../data/processed/splits/train/transcripts.csv', 'w') as out:
    writer = csv.writer(out)
    header = None
    for row in train_csv:
        if header is None:
            writer.writerow(row)
            header = row
        else:
            writer.writerow(' '.join(row))
with open('../../data/processed/splits/valid/transcripts.csv', 'w') as out:
    writer = csv.writer(out)
    for row in valid_csv:
        writer.writerow(' '.join(row))
with open('../../data/processed/splits/test/transcripts.csv', 'w') as out:
    writer = csv.writer(out)
    for row in test_csv:
        writer.writerow(' '.join(row))

TypeError: sequence item 1: expected str instance, list found

In [58]:
with open('../../data/processed/splits/train/transcripts.csv', 'w') as out:
    writer = csv.writer(out)
    for row in train_csv:
        writer.writerow(' '.join(row))
with open('../../data/processed/splits/valid/transcripts.csv', 'w') as out:
    writer = csv.writer(out)
    for row in valid_csv:
        writer.writerow(' '.join(row))
with open('../../data/processed/splits/test/transcripts.csv', 'w') as out:
    writer = csv.writer(out)
    for row in test_csv:
        writer.writerow(' '.join(row))
        
with open('../../data/processed/splits/train/stock_data.json', 'w') as out:
    json.dump(train_stock, out)
with open('../../data/processed/splits/valid/stock_data.json', 'w') as out:
    json.dump(valid_stock, out)
with open('../../data/processed/splits/test/stock_data.json', 'w') as out:
    json.dump(test_stock, out)

In [57]:
with open('../../data/processed/splits/valid/stock_data.csv', 'w') as out:
    writer = csv.writer(out)
    for row in valid_stock:
        writer.writerow(row)

## Create Dataset

In [117]:
class StockDataset(data.Dataset):
    def __init__(self, examples):
        examples = np.array(examples)
        self.labels = examples[:,-1]
        self.market_cap = examples[:,-2]
        self.examples = np.array(examples[:,:-2].tolist())

    def __len__(self):
        'Denotes the total number of samples'
        return len(self.examples)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Load data and get label
        X = torch.tensor(self.examples[index], type=torch.
        auxiliary = self.market_cap[index]
        y = self.labels[index]

        return X, auxiliary, y

In [118]:
valid_dataset = StockDataset(valid_stock)

In [119]:
params = {'batch_size': 64,
          'shuffle': True,
          'num_workers': 6}
valid_generator = data.DataLoader(valid_dataset, **params)

In [120]:
for batch, aux, labels in valid_generator:
    print(aux)
    break

tensor([ 72.6400,  42.8700,  17.1200,  10.6800,  81.3200,  79.6700,  17.3100,
        233.1600, 152.1600,  14.1000,  29.2000,  48.6100,  70.9000,  66.2800,
         23.1900, 103.9800, 142.9200,  16.0400,  47.7300, 137.0800,  21.9100,
        873.0700,  14.3000,  66.2800, 131.7400,  38.0600, 192.2900,  47.7900,
          7.8900,  18.1800,  30.5300,  55.1600,  15.7300,  14.1700,  21.9400,
         14.8300,  39.3700,  22.0300,  11.8500,  80.4300, 166.7700,  19.7100,
         18.6700, 152.6200,  39.4800,  39.5100,  12.5600, 116.6800,  19.3500,
        192.1700,  15.0800,  13.0000,  28.1100,  13.7000,  28.4500,  51.1800,
         22.8900,  44.8900,  12.0000,  55.3100,  79.3800,  26.3300, 222.5600,
        227.5000], dtype=torch.float64)


## Train Model

In [203]:
class BaselineStockPredictor(nn.Module):
    """
    Model that will read in plain stock ticker values over time and decide whether to buy, sell, or hold at the current price.
    """
    def __init__(self, num_series_features=1, num_auxiliary_features=1, hidden_size=128, output_size=1):
        """
        Attributes:
            num_series_features: the size of the feature set for an individual
                                 stock price example (e.g. if we include high,
                                 low, average, num_series_features will equal 3
            num_auxiliary_features: the number of auxiliary (not dependent on time)
                                    features we are adding (e.g. if we include the 1yr
                                    high and the market capitalization, num_auxiliary_features
                                    would equal 2
            output_size: the size of the outputted vector. For evaluation, we would use a
                         size of 1 (stock price) or 3 (buy, sell, hold classification).
                         For use in the looking glass model, we want an encoding so we might
                         use a size of 128 to feed into the model.
        """
        super().__init__()
        self.recurrent = nn.LSTM(
            input_size=num_series_features,
            hidden_size=hidden_size,
            num_layers=1,
            bidirectional=False,
            batch_first=True,
            dropout=0.5
        )
        # concatenate LSTM output with auxiliary features
        # output predicted price
        self.linear = nn.Linear(hidden_size+num_auxiliary_features, output_size)
        self.init_weights()

    def init_weights(self):
        """
        Initializes the weights of the model
        """
        for layer in [self.linear]:
            nn.init.xavier_uniform_(layer.weight)
            nn.init.constant_(layer.bias, 0.0)

    def forward(self, X_series, X_auxiliary):
        """
        Moves the model through each layer
        Parameters:
            X_series: an [N, num_series_examples, num_series_features] size vector
                      where N is the batch size, num_series_examples is how many stock prices
                      we are providing per example (e.g. weekly for the last 3 months), and
                      num_series_features is the same as described in __init__
            X_auxiliary: an [N, num_auxiliary_features] vector
        """
        recurrent_output,_ = self.recurrent(X_series)
        recurrent_output = torch.mean(recurrent_output, 1)
        # We might need this
        # recurrent_output = torch.squeeze(1) 
        aux_combined = torch.cat([recurrent_output, X_auxiliary], dim=1)
        output = self.linear(aux_combined)

        return output

In [204]:
def get_stock_iterator(input_data, batch_size, train=True, shuffle=True):
    dataset = StockDataset(input_data)
    iterator = data.DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, num_workers=5)
    return iterator
    
def train_model(train, valid, num_epochs=200, learning_rate=0.003):
    batch_size = 100
    train_iterator = get_stock_iterator(train, batch_size)
    valid_iterator = get_stock_iterator(valid, batch_size)
    
    model = BaselineStockPredictor(num_series_features=2, hidden_size=64)
    model = model.float()
    
    criterion = nn.MSELoss()
    optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), learning_rate)
    
    losses = []
    valid_scores = []
    
    min_mse = float('inf')
    delay = 0
    MAX_INC = 20
    
    for epoch in tqdm_notebook(range(num_epochs)):
        model.train()
        iter_losses = []
        for batch_series, batch_aux, batch_labels in train_iterator:
            batch_aux = torch.reshape(batch_aux, (-1,1))
            optimizer.zero_grad()
            outputs = model(batch_series.float(), batch_aux.float())
            batch_labels = torch.reshape(batch_labels, (-1,1))
            loss = criterion(outputs, batch_labels.float())
            loss.backward()
            optimizer.step()
            iter_losses.append(loss.item())
        iter_losses = np.array(iter_losses)
        losses.append(np.mean(iter_losses))
        
        valid_mse = []
        model.eval()

        for valid_batch_series, valid_batch_aux, valid_batch_labels in valid_iterator:
            valid_batch_aux = torch.reshape(valid_batch_aux, (-1,1))
            outputs = model(valid_batch_series.float(), valid_batch_aux.float())
            valid_batch_labels = torch.reshape(valid_batch_labels, (-1,1))
            loss = criterion(outputs, valid_batch_labels.float())
            valid_mse.append(loss.item())
        valid_mse = np.mean(valid_mse)
        print(f'Completed epoch {epoch}. Valid MSE: {valid_mse}')


        if valid_mse < min_mse:
            min_mse = valid_mse
            delay = 0
            torch.save(model, 'model.ckpt')
        else:
            delay += 1
        if delay > MAX_INC:
            print('Stopping early')
            break
    
    return model, losses

In [215]:
model = train_model(train_stock, valid_stock, num_epochs=2000)

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`


HBox(children=(IntProgress(value=0, max=2000), HTML(value='')))

Completed epoch 0. Valid MSE: 93251.59375
Completed epoch 1. Valid MSE: 91044.4296875
Completed epoch 2. Valid MSE: 88757.875
Completed epoch 3. Valid MSE: 86543.4296875
Completed epoch 4. Valid MSE: 84642.0234375
Completed epoch 5. Valid MSE: 82720.0
Completed epoch 6. Valid MSE: 80888.84375
Completed epoch 7. Valid MSE: 79195.265625
Completed epoch 8. Valid MSE: 77501.7578125
Completed epoch 9. Valid MSE: 75878.9921875
Completed epoch 10. Valid MSE: 74251.828125
Completed epoch 11. Valid MSE: 72730.40625
Completed epoch 12. Valid MSE: 71197.859375
Completed epoch 13. Valid MSE: 69713.9609375
Completed epoch 14. Valid MSE: 68330.296875
Completed epoch 15. Valid MSE: 66855.8125
Completed epoch 16. Valid MSE: 65516.47265625
Completed epoch 17. Valid MSE: 64178.8125
Completed epoch 18. Valid MSE: 62962.44921875
Completed epoch 19. Valid MSE: 61659.47265625
Completed epoch 20. Valid MSE: 60389.5546875
Completed epoch 21. Valid MSE: 59284.79296875
Completed epoch 22. Valid MSE: 58170.09375

KeyboardInterrupt: 

In [207]:
model = torch.load('model.ckpt')

In [209]:
model.eval()

BaselineStockPredictor(
  (recurrent): LSTM(2, 64, batch_first=True, dropout=0.5)
  (linear): Linear(in_features=65, out_features=1, bias=True)
)

In [213]:
def eval_model(model, test):
    batch_size = 100
    test_iterator = get_stock_iterator(test, batch_size, shuffle=False)
        
    criterion = nn.MSELoss()
        
    losses = []
    for batch_series, batch_aux, batch_labels in test_iterator:
        batch_aux = torch.reshape(batch_aux, (-1,1))
        outputs = model(batch_series.float(), batch_aux.float())
        batch_labels = torch.reshape(batch_labels, (-1,1))
        loss = criterion(outputs, batch_labels.float())
        losses.append(loss.item())
    return np.mean(losses)

In [214]:
eval_model(model, test_stock)

11477.3125