In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.legacy import data
from torchtext.legacy import datasets
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

import spacy

# Preprocess the text data
def preprocess_text(tokens):
    
    # Convert to lowercase and remove non-alphabetic characters
    words = [word.lower() for word in tokens if word.isalpha()]
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    clean_words = [word for word in words if word not in stop_words]
    
    return clean_words

# Create the dataset
TEXT = data.Field(
    tokenize='spacy',
    preprocessing=preprocess_text,
    lower=True,
    batch_first=True)

LABEL = data.LabelField(dtype=torch.float, batch_first=True)

dataset = data.TabularDataset(
    path="data.csv",
    format="csv",
    fields={
        "text": ("text", TEXT),
        "vix_label": ("label", LABEL)
    },
)



In [12]:
# Split the dataset into training and testing sets
train_data, test_data = dataset.split(split_ratio=0.8)

# Build the vocabulary
TEXT.build_vocab(train_data, max_size=10000, vectors="glove.6B.100d", unk_init=torch.Tensor.normal_)
LABEL.build_vocab(train_data)

# Create data iterators
BATCH_SIZE = 8
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
train_iterator, test_iterator = data.BucketIterator.splits(
    (train_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
    sort_key = lambda x: len(x.text),
    sort_within_batch=True
)

# Create the LSTM model
class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, dropout=dropout, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        embedded = self.dropout(self.embedding(text))
        lstm_output, (hidden, cell) = self.lstm(embedded)
        output = self.fc(self.dropout(lstm_output[:,-1,:]))
        return output

In [13]:
# Instantiate the model
VOCAB_SIZE = len(TEXT.vocab)
EMBEDDING_DIM = 100
HIDDEN_DIM = 64
OUTPUT_DIM = 1
N_LAYERS = 2
DROPOUT = 0.5

model = LSTMClassifier(VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM,
                       N_LAYERS, DROPOUT).to(device)

# Train the model
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()


def train(model, iterator, optimizer, criterion):
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:
        optimizer.zero_grad()
        predictions = model(batch.text).squeeze(1)
        loss = criterion(predictions, batch.label)
        acc = binary_accuracy(predictions, batch.label)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc


def evaluate(model, iterator, criterion):
    epoch_loss = 0
    epoch_acc = 0
    model.eval()

    with torch.no_grad():
        for batch in iterator:
            predictions = model(batch.text).squeeze(1)
            loss = criterion(predictions, batch.label)
            acc = binary_accuracy(predictions, batch.label)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)


N_EPOCHS = 10

for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, test_iterator, criterion)
    print(
        f'Epoch: {epoch+1:02}, Train Loss: {train_loss:.3f}, Train Acc: {train_acc*100:.2f}%, Val. Loss: {valid_loss:.3f}, Val. Acc: {valid_acc*100:.2f}%'
    )

Epoch: 01, Train Loss: 0.688, Train Acc: 60.65%, Val. Loss: 0.719, Val. Acc: 33.33%
Epoch: 02, Train Loss: 0.677, Train Acc: 60.65%, Val. Loss: 0.728, Val. Acc: 33.33%
Epoch: 03, Train Loss: 0.685, Train Acc: 60.65%, Val. Loss: 0.733, Val. Acc: 33.33%
Epoch: 04, Train Loss: 0.682, Train Acc: 60.65%, Val. Loss: 0.741, Val. Acc: 33.33%
Epoch: 05, Train Loss: 0.674, Train Acc: 60.65%, Val. Loss: 0.748, Val. Acc: 33.33%
Epoch: 06, Train Loss: 0.671, Train Acc: 60.65%, Val. Loss: 0.760, Val. Acc: 33.33%
Epoch: 07, Train Loss: 0.676, Train Acc: 60.65%, Val. Loss: 0.779, Val. Acc: 33.33%
Epoch: 08, Train Loss: 0.680, Train Acc: 60.65%, Val. Loss: 0.779, Val. Acc: 33.33%
Epoch: 09, Train Loss: 0.664, Train Acc: 60.65%, Val. Loss: 0.780, Val. Acc: 33.33%
Epoch: 10, Train Loss: 0.661, Train Acc: 60.65%, Val. Loss: 0.800, Val. Acc: 33.33%
