In [1]:
import numpy as np
import math
import pandas as pd
import torch
import matplotlib.pyplot as plt

import pickle

from sklearn.utils import shuffle
from sklearn.manifold import TSNE
# import seaborn as sns
import gc
import os

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


In [3]:
from transformers import BertTokenizer, BertModel

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased").to(device)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# Loading Trained Model

In [4]:
class ClassifierModel(torch.nn.Module):
    def __init__(self, h1, h2, output_dim):
        super(ClassifierModel, self).__init__()

        self.linear1 = torch.nn.Linear(768, h1)
        self.activation1 = torch.nn.ReLU()

        self.linear2 = torch.nn.Linear(h1, h2)
        self.dropout2 = torch.nn.Dropout(0.5)
        self.activation2 = torch.nn.ReLU()

        self.linear3 = torch.nn.Linear(h2, output_dim)
        self.sigmoid = torch.nn.Sigmoid()

        torch.nn.init.xavier_uniform_(self.linear1.weight)
        torch.nn.init.zeros_(self.linear1.bias)
        torch.nn.init.xavier_uniform_(self.linear2.weight)
        torch.nn.init.zeros_(self.linear2.bias)
        torch.nn.init.xavier_uniform_(self.linear3.weight)
        torch.nn.init.zeros_(self.linear3.bias)
  

    def forward(self, embedding_batch):
        #embedding_batch: [batch_size, embedding_length]
        l1_out = self.linear1(embedding_batch)
        l1_act = self.activation1(l1_out)

        l2_out = self.linear2(l1_act)
        l2_drop = self.dropout2(l2_out)
        l2_act = self.activation2(l2_drop)

        out = self.sigmoid(self.linear3(l2_act))
        return out
  

    def reset(self):
        torch.nn.init.xavier_uniform_(self.linear1.weight)
        torch.nn.init.zeros_(self.linear1.bias)
        torch.nn.init.xavier_uniform_(self.linear2.weight)
        torch.nn.init.zeros_(self.linear2.bias)
        torch.nn.init.xavier_uniform_(self.linear3.weight)
        torch.nn.init.zeros_(self.linear3.bias)

In [5]:
net = torch.load('model_checkpoints/model.pt')
net.eval()

ClassifierModel(
  (linear1): Linear(in_features=768, out_features=250, bias=True)
  (activation1): ReLU()
  (linear2): Linear(in_features=250, out_features=250, bias=True)
  (dropout2): Dropout(p=0.5, inplace=False)
  (activation2): ReLU()
  (linear3): Linear(in_features=250, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

# Reading New Stock Data and Computing Sentiments

In [2]:
df = pd.read_csv('Combined_percentage_same.csv')
df.head()

Unnamed: 0,symbol,message,datetime,user,message_id,Date,Time,label
0,AAPL,peak profit last 6 expired option alerts aapl ...,2020-07-19 09:49:35,1442893,229008387,2020-07-19,09:49:35,1
1,AAPL,aapl jul 17 382 50 calls option volume 144 44 ...,2020-07-19 09:47:26,1442893,229008357,2020-07-19,09:47:26,1
2,AAPL,tsla market true bubble territory profitable c...,2020-07-19 09:01:25,1115913,229007569,2020-07-19,09:01:25,1
3,AAPL,aapl analyzed 26 analysts buy consensus 86 ana...,2020-07-19 08:13:00,47688,229006733,2020-07-19,08:13:00,1
4,AAPL,aapl new article dogs dow august 4 adopt ignore,2020-07-19 07:54:05,1555408,229006403,2020-07-19,07:54:05,1


In [3]:
df.shape

(6450989, 8)

In [7]:
df.dropna(inplace=True)

In [8]:
def get_sentence_embeddings(sentences: list):
    # print(sentences)
    # Tokenize sentences
    # print("tokenizing")
    try:
        encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt').to('cuda')
    except Exception:
        print(sentences)
        print(type(sentences))
        return None
    # print("done")

    # print("running through bert")
    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input)
    # print("done")
    return model_output[0][:,0]

In [9]:
def Get_Sentiments_and_Save(df, batch_size=4, filename='All_Embs.csv'):
    data_len = df.shape[0]

    sentences_df = np.array(df['message'])
    embeddings = []
    print(f'Getting BERT embeddings for {data_len} samples')

    print('Progress: ', end='')
    thresh = 1
    iters = math.ceil(data_len/batch_size)
    for i in range(iters):
        start = i*batch_size
        end = min(data_len, start + batch_size)
        sentences = list(sentences_df[start:end])
        embeddings.append(get_sentence_embeddings(sentences))
        if i*100.0/iters > thresh:
            print('|', end='')
            thresh += 1
    print('\nDone')

    embeddings = torch.cat(embeddings)
    with torch.no_grad():
        sentiments = net(embeddings)
    # embeddings_np = embeddings.cpu().detach().numpy()
    
    del embeddings
    # print(embeddings_np.shape)
    sentiments_np = sentiments.cpu().detach().numpy()

    ret_df = pd.DataFrame()
    ret_df['Date'] = np.array(df['Date']).reshape(-1,)
    ret_df['Symbol'] = np.array(df['symbol']).reshape(-1,)
    ret_df['Sentiment'] = sentiments_np.reshape(-1,)
    ret_df.to_csv(filename)

    return ret_df

In [10]:
data_len = df.shape[0]
base_file_name = 'NewStock/NewStock_Sentiment_Batch_'
batch_size = math.ceil(data_len / 100)
for i in range(50, 100):
    batch_start = i*batch_size
    batch_end = min(data_len, batch_start + batch_size)
    ret_df = Get_Sentiments_and_Save(df.loc[batch_start:batch_end,:],
                                    batch_size=2,
                                    filename=base_file_name+f'{i:03d}.csv')
    print(ret_df.shape)
    torch.cuda.empty_cache()

Getting BERT embeddings for 64490 samples
Progress: |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Done
(64490, 3)
Getting BERT embeddings for 64493 samples
Progress: |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Done
(64493, 3)
Getting BERT embeddings for 64493 samples
Progress: |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Done
(64493, 3)
Getting BERT embeddings for 64493 samples
Progress: |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Done
(64493, 3)
Getting BERT embeddings for 64493 samples
Progress: |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Done
(64493, 3)
Getting BERT embeddings for 64493 samples
Progress: |||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
Done
(64

In [32]:
ret_df.shape

(10750, 3)