# Classifier

## Imports

In [1]:
import numpy as np
import pandas as pd
import math

import torch
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn.functional as F

import matplotlib.pyplot as plt
import pickle
import os
import gc
import random

In [2]:
# Seed Python's `random` module so the batch shuffle that defines the
# train/dev split is reproducible. This cell does not seed NumPy or torch.
random.seed(42)

In [3]:
# Use the first CUDA GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)

cuda:0


## Dataset

In [4]:
class StockDataset(torch.utils.data.IterableDataset):
    """
    Streams (embedding, label) pairs for the binary up/down task.

    Neutral samples (label 0) are dropped at construction time and the
    remaining labels are recoded from {-1, 1} to {0, 1}.
    """

    def __init__(self, X, y):
        """
        X: [data_len, 768]: Sentence Embeddings
        y: [data_len,]: labels: -1, 0, 1
        """
        keep = (y != 0)

        self.X = X[keep, :]
        # Boolean-mask indexing yields a fresh array, so recoding the labels
        # below leaves the caller's `y` untouched.
        labels = y[keep]
        labels[labels == -1] = 0
        self.y = labels

        self.data_len = self.X.shape[0]

    def __iter__(self):
        info = torch.utils.data.get_worker_info()

        if info is None:
            # Single-process loading: walk the whole dataset.
            start, stop = 0, self.data_len
        else:
            # Multi-worker loading: each worker gets one contiguous shard so
            # that no sample is emitted twice.
            shard = int(math.ceil(self.data_len / float(info.num_workers)))
            start = info.id * shard
            stop = min(start + shard, self.data_len)

        for idx in range(start, stop):
            yield self.X[idx, :], self.y[idx]

In [5]:
def load_data_from_disk(path, file_name, batches):
    """
    Load and stack the embedding batches stored as CSV files.

    path: directory holding the batch files (trailing separator optional)
    file_name: common filename prefix; batch `b` is read from
               `<path>/<file_name><b as 3 digits>`
    batches: iterable of integer batch indices

    Each file is read with its first column as the index; the columns labelled
    '0' through '767' are taken as features and the 'Label' column as target.

    Returns (X, y): features and labels of all batches concatenated along the
    first axis, in the order given by `batches`.

    Raises ValueError if `batches` is empty.
    """
    X = []
    y = []
    for batch in batches:
        # os.path.join keeps this correct whether or not `path` ends with a
        # separator; plain concatenation would silently build a wrong name.
        file_path = os.path.join(path, f'{file_name}{batch:03d}')
        df = pd.read_csv(file_path, index_col=0)
        X.append(np.array(df.loc[:, '0':'767']))
        y.append(np.array(df['Label']))

    if not X:
        # np.concatenate would raise a ValueError here as well, but with a
        # message that does not point at the actual cause.
        raise ValueError('batches must contain at least one batch index')

    return np.concatenate(X), np.concatenate(y)

In [6]:
# Location and filename prefix of the embedding batch files; a batch file is
# named <file_name> followed by the zero-padded three-digit batch index.
path = '/home/rmeshkin/ML-Project/'
file_name = 'Stock_Embs_Batch_'
# Quick check of the loader on three batches before loading the full set.
X, y = load_data_from_disk(path, file_name, [5,6,7])
print(X.shape, y.shape)

(35580, 768) (35580,)


## Classifier Model

In [7]:
class ClassifierModel(torch.nn.Module):
    """
    Feed-forward classifier over 768-dimensional sentence embeddings.

    Layout: Linear(768, h1) -> ReLU -> Linear(h1, h2) -> Dropout(0.5) -> ReLU
            -> Linear(h2, output_dim) -> Sigmoid
    """

    def __init__(self, h1, h2, output_dim):
        """
        h1: width of the first hidden layer
        h2: width of the second hidden layer
        output_dim: number of sigmoid outputs
        """
        super().__init__()

        self.linear1 = torch.nn.Linear(768, h1)
        self.activation1 = torch.nn.ReLU()

        self.linear2 = torch.nn.Linear(h1, h2)
        self.dropout2 = torch.nn.Dropout(0.5)
        self.activation2 = torch.nn.ReLU()

        self.linear3 = torch.nn.Linear(h2, output_dim)
        self.sigmoid = torch.nn.Sigmoid()

        # Xavier-uniform weights and zero biases for every linear layer.
        self.reset()

    def forward(self, embedding_batch):
        #embedding_batch: [batch_size, embedding_length]
        hidden = self.activation1(self.linear1(embedding_batch))
        # Dropout is applied to the pre-activation of the second layer.
        hidden = self.activation2(self.dropout2(self.linear2(hidden)))
        return self.sigmoid(self.linear3(hidden))

    def reset(self):
        """Re-initialise all linear layers: Xavier-uniform weights, zero biases."""
        for layer in (self.linear1, self.linear2, self.linear3):
            torch.nn.init.xavier_uniform_(layer.weight)
            torch.nn.init.zeros_(layer.bias)

In [8]:
# Two hidden layers of 250 units and a single sigmoid output, placed on `device`.
net = ClassifierModel(250, 250, 1).to(device)
net

ClassifierModel(
  (linear1): Linear(in_features=768, out_features=250, bias=True)
  (activation1): ReLU()
  (linear2): Linear(in_features=250, out_features=250, bias=True)
  (dropout2): Dropout(p=0.5, inplace=False)
  (activation2): ReLU()
  (linear3): Linear(in_features=250, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [9]:
def get_accuracy(loader, net):
    """
    Percentage of samples in `loader` that `net` classifies correctly.

    loader: iterable of (X, y) batches; y holds the 0/1 labels
    net: model returning one probability per sample; an output > 0.5 is read
         as class 1, anything else as class 0

    Inputs are moved to the device that holds the model's parameters (for a
    model without parameters the notebook-level `device` is used). The model
    is evaluated in eval mode and afterwards put back into whichever mode it
    was in before the call.

    Returns the accuracy in [0, 100], or NaN if the loader yields no samples.
    """
    try:
        target_device = next(net.parameters()).device
    except StopIteration:
        # Parameter-free model: nothing to infer from, use the global device.
        target_device = device

    # Remember the mode so that evaluating a model that is already in eval
    # mode does not silently switch it back to training (re-enabling dropout).
    was_training = net.training

    total = 0
    correct = 0
    net.eval()
    try:
        with torch.no_grad():
            for data in loader:
                X = data[0].to(target_device).float()
                y = data[1].to(target_device).int()
                y_pred = (net(X) > 0.5).int().reshape(-1,)

                correct += (y_pred == y).sum().item()
                total += y.size(0)
    finally:
        net.train(was_training)

    if total == 0:
        return float('nan')
    return correct*100.0/total

## Preparing to Train

In [10]:
# Randomly split the 100 batch indices into 90 training and 10 dev batches.
all_batches = [i for i in range(100)]
random.shuffle(all_batches)

train_batches = all_batches[:90]
dev_batches = all_batches[90:]

In [11]:
# Read the train and dev batch files into memory as stacked arrays.
X_train, y_train = load_data_from_disk(path, file_name, train_batches)
X_dev, y_dev = load_data_from_disk(path, file_name, dev_batches)

In [12]:
# StockDataset drops neutral samples and recodes the labels to 0/1.
train_ds = StockDataset(X_train, y_train)
dev_ds = StockDataset(X_dev, y_dev)

# Batches of 128 produced by 4 workers; StockDataset gives each worker its own
# contiguous slice of the data.
train_loader = DataLoader(train_ds, 128, num_workers=4)
dev_loader = DataLoader(dev_ds, 128, num_workers=4)

In [13]:
# Adam over all parameters of `net` with a learning rate of 1e-4.
optimizer = optim.Adam(net.parameters(), lr=1e-4)

In [14]:
epochs = 100
# Binary cross-entropy on probabilities, matching the model's sigmoid output.
criterion = torch.nn.BCELoss()

In [16]:
# Train for `epochs` passes over train_loader; after every pass print the mean
# batch loss together with the accuracy on the train and dev sets.
for epoch in range(epochs):
    print('#'*100)
    print(f'epoch: {epoch:03d}')
    
    avg_loss = 0  # running sum of batch losses; divided by epoch_size when printed
    epoch_size = 0  # number of batches processed in this epoch
    for i, data in enumerate(train_loader):
        X_batch = data[0].to(device).float()
        y_batch = data[1].to(device).float()  # float targets for BCELoss
        
        optimizer.zero_grad()
        
        # `net` was built with a single output unit; flatten its output so it
        # lines up with the 1-D label batch.
        out = net(X_batch).reshape(-1,)
        
        loss = criterion(out, y_batch)
        loss.backward()
        
        optimizer.step()
        
        avg_loss += loss.item()
        epoch_size += 1
    
    # Both calls make a full extra pass over their loader with the model in eval mode.
    train_acc = get_accuracy(train_loader, net)
    dev_acc = get_accuracy(dev_loader, net)
    print(f'Loss: {avg_loss/epoch_size:4.2f}   Train Accuracy: {train_acc:4.2f}   Dev Accuracy: {dev_acc:4.2f}')

####################################################################################################
epoch: 000
Loss: 0.08   Train Accuracy: 95.35   Dev Accuracy: 86.17
####################################################################################################
epoch: 001
Loss: 0.08   Train Accuracy: 95.44   Dev Accuracy: 86.43
####################################################################################################
epoch: 002
Loss: 0.08   Train Accuracy: 95.79   Dev Accuracy: 86.70
####################################################################################################
epoch: 003
Loss: 0.06   Train Accuracy: 95.86   Dev Accuracy: 86.59
####################################################################################################
epoch: 004
Loss: 0.06   Train Accuracy: 96.03   Dev Accuracy: 86.49
####################################################################################################
epoch: 005
Loss: 0.05   Train Accuracy: 95.88   Dev Ac

In [17]:
!mkdir model_checkpoints

In [19]:
# Saves the model object itself (not only its state_dict).
# NOTE(review): loading it back presumably requires the ClassifierModel class
# to be defined/importable in the loading process — confirm.
torch.save(net, 'model_checkpoints/model.pt')

In [20]:
# Reload the saved model object from the checkpoint written above.
# NOTE(review): torch.load unpickles the file, so only load trusted
# checkpoints. Newer PyTorch releases reportedly default to weights_only=True,
# which would reject a fully pickled module — confirm against the installed version.
net2 = torch.load('model_checkpoints/model.pt')

In [21]:
# Put the reloaded model into eval mode (its Dropout layer becomes a no-op).
net2.eval()

ClassifierModel(
  (linear1): Linear(in_features=768, out_features=250, bias=True)
  (activation1): ReLU()
  (linear2): Linear(in_features=250, out_features=250, bias=True)
  (dropout2): Dropout(p=0.5, inplace=False)
  (activation2): ReLU()
  (linear3): Linear(in_features=250, out_features=1, bias=True)
  (sigmoid): Sigmoid()
)

In [24]:
def get_class_accuracy(loader, net):
    """
    Hit rate of `net` per predicted class over all samples in `loader`.

    loader: iterable of (X, y) batches; y holds the 0/1 labels
    net: model returning one probability per sample; an output > 0.5 is read
         as class 1, anything else (including exactly 0.5) as class 0, the
         same rule get_accuracy applies

    Inputs are moved to the device that holds the model's parameters (for a
    model without parameters the notebook-level `device` is used). The model
    is evaluated in eval mode and afterwards put back into whichever mode it
    was in before the call.

    Returns (pos_rate, neg_rate) as fractions in [0, 1]:
        pos_rate: share of samples predicted 1 whose label is 1
        neg_rate: share of samples predicted 0 whose label is 0
    A rate is NaN when the model never predicted that class.
    """
    try:
        target_device = next(net.parameters()).device
    except StopIteration:
        # Parameter-free model: nothing to infer from, use the global device.
        target_device = device

    # Remember the mode so that evaluating a model that is already in eval
    # mode does not silently switch it back to training (re-enabling dropout).
    was_training = net.training

    total_pos = 0
    correct_pos = 0

    total_neg = 0
    correct_neg = 0
    net.eval()
    try:
        with torch.no_grad():
            for data in loader:
                X = data[0].to(target_device).float()
                y = data[1].to(target_device).int()
                pos = (net(X) > 0.5).reshape(-1,)
                # Complement of `pos`, so every sample lands in exactly one bucket.
                neg = ~pos

                correct_pos += (y[pos] == 1).sum().item()
                total_pos += y[pos].size(0)

                correct_neg += (y[neg] == 0).sum().item()
                total_neg += y[neg].size(0)
    finally:
        net.train(was_training)

    pos_rate = correct_pos/total_pos if total_pos else float('nan')
    neg_rate = correct_neg/total_neg if total_neg else float('nan')
    return pos_rate, neg_rate

In [25]:
# Per-predicted-class hit rates of the reloaded model on the training data.
get_class_accuracy(train_loader, net2)

(0.9791464550894753, 0.9789230446047196)

In [26]:
# Per-predicted-class hit rates of the reloaded model on the dev data.
get_class_accuracy(dev_loader, net2)

(0.8749867345855884, 0.9060564349621473)