In [None]:
# Install the Hugging Face `transformers` package into the current runtime.
!pip install transformers
# Mount Google Drive at /content/drive; the CSV files read later in this notebook
# are addressed through that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import torch
from transformers import BertTokenizer, BertModel
import logging
import matplotlib.pyplot as plt
% matplotlib inline
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

### Load Dataset

In [None]:
import pandas as pd
import numpy as np
import torch
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score


# Raw (un-imputed) Pfizer frame and its label column, kept under their original names.
annotations_aggregated = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/pfizer_processed.csv")
a_agg = annotations_aggregated['Volatility']

# Reuse the frame parsed above rather than reading the same CSV a second time;
# fillna returns a new frame, so `annotations_aggregated` keeps its missing values.
pfizer = annotations_aggregated.fillna(0)
tesla = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/tesla_processed.csv").fillna(0)

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it both
# sources keep their own row labels, so every label would occur twice and any
# label-based lookup on `dataset` would silently return two rows.
dataset = pd.concat((pfizer, tesla), axis=0, ignore_index=True)

# Model inputs (text) and targets (label array), row-aligned.
X_total = dataset['Text']
y_total = np.array(dataset['Volatility'].tolist())
from collections import defaultdict

In [None]:
# Total of the label array (equals the number of positive rows if labels are 0/1 — TODO confirm).
y_total.sum()

### Generate Sentence Embeddings

In [None]:
# Cross-validation bookkeeping; not read anywhere in this cell.
kf = KFold(random_state=24, shuffle=True, n_splits=5)
f1_array = []
precision_array = []
recall_array = []
acc_array = []
lr = None
fold = 1

# output_hidden_states=True makes every forward pass also return the hidden states
# of all layers, which the pooling step below relies on.
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True,)

# Inference only: eval() switches dropout off so the embeddings are deterministic.
model.eval()

# Longest token sequence the checkpoint's position embeddings support.
max_bert_tokens = model.config.max_position_embeddings


def _tokenize_for_bert(text):
    """Tokenize `text` wrapped in [CLS]/[SEP], truncated to BERT's input limit."""
    tokens = tokenizer.tokenize("[CLS] " + text + " [SEP]")
    if len(tokens) > max_bert_tokens:
        # Keep the closing [SEP] so a truncated input is still well-formed.
        tokens = tokens[:max_bert_tokens - 1] + ["[SEP]"]
    return tokens


total_tokens = [_tokenize_for_bert(text) for text in X_total]
print(len(total_tokens))

total_tokens_ids = [tokenizer.convert_tokens_to_ids(tokens) for tokens in total_tokens]
# Each input is a single segment, so every token-type id is 0. These ids must never
# be derived from y_total: feeding the label into the encoder leaks the target into
# the features and invalidates every downstream score.
total_segment_ids = [[0] * len(tokens) for tokens in total_tokens]

# One (1, seq_len) tensor per sentence; sentences are encoded one at a time.
total_token_tensor_arr = [torch.tensor([ids]) for ids in total_tokens_ids]
total_segment_tensor_arr = [torch.tensor([ids]) for ids in total_segment_ids]

print(f"train_tokens = {len(total_tokens)}")
print(f"train_tokens_ids = {len(total_tokens_ids)}")
print(f"train_segment_ids = {len(total_segment_ids)}")
print(f"train_token_tensor_arr = {len(total_token_tensor_arr)}")
print(f"train_segment_tensor_arr = {len(total_segment_tensor_arr)}")
with torch.no_grad():
    # The second positional parameter of BertModel.forward is attention_mask, so the
    # segment ids are passed by keyword; the attention mask keeps its default.
    total_outputs_arr = [
        model(token_tensor, token_type_ids=segment_tensor)
        for token_tensor, segment_tensor in zip(total_token_tensor_arr, total_segment_tensor_arr)
    ]
    # Index 2 of the model output holds the tuple of per-layer hidden states.
    total_hidden_states_arr = [outputs[2] for outputs in total_outputs_arr]

# Second-to-last layer, first (only) batch item -> (seq_len, hidden_size) per sentence.
total_token_vecs_arr = [hidden_states[-2][0] for hidden_states in total_hidden_states_arr]
# Mean over the token axis gives one fixed-size vector per sentence.
total_sentence_embeddings_arr = [torch.mean(token_vecs, dim=0) for token_vecs in total_token_vecs_arr]
X_features = np.array([embedding.numpy() for embedding in total_sentence_embeddings_arr])

### Logistic Regression + BERT

In [None]:
def runLR(X_features, y_totl):
    """Evaluate logistic regression on precomputed features with 5-fold cross-validation.

    Prints F1, precision, recall and accuracy for every fold and their averages.

    Args:
        X_features: feature array with one row per sample; must support indexing
            with integer index arrays.
        y_totl: binary labels aligned with the rows of ``X_features``.

    Returns:
        dict with keys ``'f1'``, ``'precision'``, ``'recall'`` and ``'accuracy'``
        holding the mean of each metric over the folds.
    """
    labels = np.asarray(y_totl)
    # Score the larger label value as the positive class and say so explicitly.
    # Shifting the labels by +1 while leaving pos_label at its default of 1 made the
    # metrics describe the *other* class when the labels are 0/1.
    # NOTE(review): assumes two label values with the larger one meaning "positive".
    pos_label = labels.max()

    kf = KFold(random_state=24, shuffle=True, n_splits=5)
    scores = {"f1": [], "precision": [], "recall": [], "accuracy": []}

    for fold, (train_index, test_index) in enumerate(kf.split(X_features), start=1):
        print(f"\n_____fold = {fold}_____")
        X_train, X_test = X_features[train_index], X_features[test_index]
        y_train, y_test = labels[train_index], labels[test_index]
        print(f"train = x:{X_train.shape}, y:{y_train.shape}")

        # A fresh model per fold so no fitted state carries over between folds.
        lr = LogisticRegression(max_iter=1000)
        lr.fit(X_train, y_train)
        y_pred = lr.predict(X_test)

        f1 = f1_score(y_test, y_pred, average='binary', pos_label=pos_label)
        prec = precision_score(y_test, y_pred, pos_label=pos_label)
        rec = recall_score(y_test, y_pred, pos_label=pos_label)
        acc = accuracy_score(y_test, y_pred)

        scores["f1"].append(f1)
        scores["precision"].append(prec)
        scores["recall"].append(rec)
        scores["accuracy"].append(acc)

        print(f"F1 score\t = {f1}")
        print(f"precision score = {prec}")
        print(f"recall score = {rec}")
        print(f"accuracy score = {acc}")

    averages = {name: np.mean(values) for name, values in scores.items()}
    print("\n__________________\n\naverage f1 score over all folds = " + str(averages["f1"]))
    print("average precision over all folds\t = "+ str(averages["precision"]))
    print("average recall over all folds\t\t = " + str(averages["recall"]))
    print("average accuracy over all folds\t\t = " + str(averages["accuracy"]))
    return averages

# Keep the averaged scores so later cells can compare them with other models.
lr_cv_scores = runLR(X_features, y_total)

### Neural Network + BERT

In [None]:
from __future__ import print_function
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import numpy as np
import torch.utils.data as utils
import pandas as pd
from sklearn.model_selection import  train_test_split as train_test_split
from IPython.display import display, HTML
import tensorflow as tf


class SeqModel(nn.Module):
    """Small fully connected network: num_features -> 32 -> 16 -> 2 -> num_targets.

    Dropout and one shared PReLU activation sit between the linear layers. The
    instance also stores the gradient-accumulation step count and gradient-norm
    limit as plain attributes.
    """

    def __init__(self, num_features, num_targets, batch_size):
        """Build the layers.

        Args:
            num_features: width of each input row.
            num_targets: number of output units.
            batch_size: stored on the instance; not used by the layers.
        """
        super().__init__()
        # Hidden-layer widths.
        self.layer1_size, self.layer2_size, self.layer3_size = 32, 16, 2
        # Optimisation settings stored on the model.
        self.gradient_accumulation_steps = 1
        self.max_grad_norm = 1000

        self.num_features = num_features
        self.num_targets = num_targets
        self.batch_size = batch_size

        # Registered (so it appears in the state dict) but not applied in forward().
        self.inputnorm = nn.BatchNorm1d(num_features)

        # Block 1: input -> layer1_size.
        self.layer1 = nn.Linear(num_features, self.layer1_size)
        self.drop1 = nn.Dropout(0.5)
        # One PReLU instance, hence one learnable slope, shared by all three activations.
        self.prelu = nn.PReLU()

        # Block 2: layer1_size -> layer2_size.
        self.hiddenlinear = nn.Linear(self.layer1_size, self.layer2_size)
        self.dropout2 = nn.Dropout(0.2)

        # Block 3: layer2_size -> layer3_size.
        self.layer3 = nn.Linear(self.layer2_size, self.layer3_size)
        self.dropout3 = nn.Dropout(0.2)

        # Output projection; produces raw scores with no final activation.
        self.final = nn.Linear(self.layer3_size, num_targets)

    def forward(self, x):
        """Map a batch of feature rows to ``num_targets`` raw outputs per row."""
        # Block 1: linear -> dropout -> activation.
        block1 = self.prelu(self.drop1(self.layer1(x)))
        # Block 2: linear -> activation -> dropout.
        block2 = self.dropout2(self.prelu(self.hiddenlinear(block1)))
        # Block 3: linear -> dropout -> activation.
        block3 = self.prelu(self.dropout3(self.layer3(block2)))
        return self.final(block3)

def train_mdl(model, device, train_loader, optimizer, scheduler, epoch):
    """Train ``model`` for one epoch, then evaluate it on the same loader.

    Args:
        model: network exposing ``gradient_accumulation_steps`` and
            ``max_grad_norm``, either directly or on ``model.module`` when it is
            wrapped (e.g. in ``nn.DataParallel``).
        device: device each batch is moved to.
        train_loader: iterable of ``(data, target)`` batches.
        optimizer: optimizer updating the model parameters.
        scheduler: learning-rate scheduler stepped once per optimizer step.
        epoch: epoch number, used only in the progress message.

    Returns:
        The value returned by ``test_mdl(model, device, train_loader)``.
    """
    # nn.DataParallel does not forward custom attributes, so read the settings from
    # the wrapped module when there is one.
    core = getattr(model, "module", model)
    accumulation_steps = core.gradient_accumulation_steps
    max_grad_norm = core.max_grad_norm
    criterion = nn.BCEWithLogitsLoss()

    model.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        output = model(data)
        loss = criterion(output, target)
        if accumulation_steps > 1:
            # Scale so the accumulated gradient matches one large batch.
            loss = loss / accumulation_steps
        loss.backward()

        if (batch_idx + 1) % accumulation_steps == 0:
            # Clip the fully accumulated gradient once, right before it is applied.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            # The optimizer must step before the scheduler; the reverse order skips
            # the first value of the learning-rate schedule.
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()

        if batch_idx % 100 == 0: #Print loss every 100 batch
            print('Train Epoch: {}\tLoss: {:.6f}'.format(
                epoch, loss.item()))

    # Report the epoch's training loss in eval mode (dropout off).
    return test_mdl(model, device, train_loader)

def test_mdl(model, device, test_loader):
    """Evaluate ``model`` on ``test_loader`` and print accuracy, precision and recall.

    Predictions and targets are collected over all batches and each metric is
    computed once on the full set. Averaging per-batch precision and recall does not
    equal the dataset-level value and is undefined for batches without positives.

    Args:
        model: network returning one logit per row.
        device: device each batch is moved to.
        test_loader: iterable of ``(data, target)`` batches.

    Returns:
        Mean of the per-batch ``BCEWithLogitsLoss`` values, or NaN if the loader
        yields no batches.
    """
    model.eval()
    criterion = nn.BCEWithLogitsLoss()
    batch_losses = []
    all_targets = []
    all_preds = []

    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            batch_losses.append(criterion(output, target).item())
            probas = output.sigmoid().cpu().numpy().reshape(-1)
            # Same decision rule as before: positive only when probability > 0.5.
            all_preds.extend(1 if proba > 0.5 else 0 for proba in probas)
            all_targets.extend(target.cpu().numpy().reshape(-1).tolist())

    if all_targets:
        acc = accuracy_score(all_targets, all_preds)
        prec = precision_score(all_targets, all_preds)
        rec = recall_score(all_targets, all_preds)
        mean_loss = np.mean(batch_losses)
    else:
        # An empty loader (e.g. drop_last=True with fewer rows than one batch)
        # reports NaN instead of raising.
        acc = prec = rec = mean_loss = float("nan")

    print(f"acc = {acc}")
    print(f"prec = {prec}")
    print(f"recall = {rec}")
    return mean_loss






def main2(X, Y):
    """Train a ``SeqModel`` on ``(X, Y)`` and plot train/validation loss per epoch.

    The data is split 67/33 into a training and a validation part. After every
    epoch the loss on both parts is recorded, and the two curves are plotted once
    training has finished.

    Args:
        X: 2-D feature array, one row per sample.
        Y: targets aligned with ``X``; expected as an ``(n, 1)`` array so the shape
            matches the model output.

    Returns:
        The trained model, in the state the last evaluation call left it in.
    """
    train_data , valid_data, train_target, valid_target = train_test_split(X, Y, test_size=0.33, random_state=42)

    # Fall back to the CPU when no GPU is attached instead of crashing.
    use_cuda = torch.cuda.is_available()
    learning_rate = 0.0001
    NumEpochs = 15
    batch_size = 16
    device = torch.device("cuda" if use_cuda else "cpu")

    tensor_x = torch.tensor(train_data,dtype=torch.float, device=device)
    tensor_y = torch.tensor(train_target,dtype=torch.float,  device=device)

    test_tensor_x = torch.tensor(valid_data,dtype=torch.float, device=device)
    # Same device as the other tensors so each batch is consistent.
    test_tensor_y = torch.tensor(valid_target,dtype=torch.float, device=device)

    train_dataset = utils.TensorDataset(tensor_x, tensor_y)  # create your datset
    train_loader = utils.DataLoader(train_dataset, batch_size=batch_size, drop_last=True)  # create your dataloader

    test_dataset = utils.TensorDataset(test_tensor_x, test_tensor_y)  # create your datset
    # NOTE(review): drop_last=True also discards the final partial batch of the
    # validation data, so up to batch_size - 1 rows are never evaluated.
    test_loader = utils.DataLoader(test_dataset, batch_size=batch_size, drop_last=True)

    # Take the input width from the data (768 for bert-base embeddings) rather than
    # hard-coding it.
    model = SeqModel(tensor_x.shape[1], 1, batch_size)
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = nn.DataParallel(model)

    model.to(device)
    optimizer = optim.Adamax(model.parameters(), lr=learning_rate)
    # OneCycleLR takes over the optimizer's learning rate (its schedule is defined by
    # max_lr and div_factor), so `learning_rate` above only seeds the optimizer.
    # The schedule length assumes one scheduler step per batch.
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3, max_lr=1e-2, epochs=NumEpochs, steps_per_epoch=len(train_loader))

    train_Loss_list = []
    test_Loss_list = []
    epoch_list = []
    for epoch in range(NumEpochs):
        epoch_list.append(epoch)
        train_loss = train_mdl(model, device, train_loader, optimizer, scheduler, epoch)
        train_Loss_list.append(train_loss)
        print(f'\nTrain set Loss: {train_loss}')
        test_loss = test_mdl(model, device, test_loader)
        print(f'Test set Loss: {test_loss}\n')
        test_Loss_list.append(test_loss)

    # Plot train and test loss vs epoch
    plt.figure("Train and Test Loss vs Epoch")
    plt.plot(epoch_list, train_Loss_list, c='r', label="Train Loss")
    plt.plot(epoch_list, test_Loss_list, c='g', label="Test Loss")
    plt.ylabel("Loss")
    plt.xlabel("Number of Epochs")
    plt.legend(loc=0)
    plt.show()
    return model

# Column-vector targets so their shape matches the single output unit of the network.
y_target = y_total.reshape(-1, 1)

# Hold out half of the data as a final test set that is not passed to main2.
X_train_f, X_valid, y_train_f, y_valid = train_test_split(X_features, y_target,test_size=0.5, random_state=42)

mdl = main2(X_train_f, y_train_f)

# Evaluate on whichever device the trained model lives on instead of assuming CUDA.
eval_device = next(mdl.parameters()).device
testvals = torch.tensor(X_valid,dtype=torch.float,  device=eval_device)

# Put the model in eval mode explicitly (dropout off) rather than relying on the
# last call inside main2, and skip gradient tracking during inference.
mdl.eval()
with torch.no_grad():
    p = mdl(testvals).sigmoid().cpu().numpy()
probas = np.array(p).reshape(-1)
preds = [1 if i>=0.5 else 0 for i in probas]
print(f"test accuracy = {accuracy_score(y_valid, preds)}")
print(f"test f1 = {f1_score(y_valid, preds, average='binary')}")
print(f"test precision = {precision_score(y_valid, preds)}")
print(f"test recall = {recall_score(y_valid, preds)}")