In [1]:
import json
import pandas as pd
import numpy as np
import json

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix, roc_auc_score, f1_score

import time
import warnings

In [2]:
from Util.Util import get_k_val, get_k_val_2_domain

In [3]:
VOCAB_SIZE = 5000
VOCAB = [i for i in range(VOCAB_SIZE)]

In [4]:
max_seq_length = 256
BATCH_SIZE = 256

In [5]:
d1_file_path = "./data/domain1_train.json/domain1_train.json"
d2_file_path = "./data/domain2_train.json/domain2_train.json"

In [6]:
d1 = pd.read_json(path_or_buf="./data/domain1_train.json/domain1_train.json", lines=True)
d2 = pd.read_json(path_or_buf="./data/domain2_train.json/domain2_train.json", lines=True)

In [7]:
data_1 = get_k_val(d1['text'], d1['label'], 5)

In [8]:
train_val_set_1 = data_1[0]

In [9]:
X_train, X_val, y_train, y_val  = train_val_set_1

In [10]:
y_val

0      0
1      0
2      0
3      0
4      0
      ..
495    1
496    1
497    1
498    1
499    1
Name: label, Length: 500, dtype: int64

In [11]:
print(X_train.shape)
print(X_val.shape)
print(y_train.shape)
print(y_val.shape)

(19000,)
(500,)
(19000,)
(500,)


In [12]:
class CustomTextClassificationDataset(Dataset):
    def __init__(self, text, label, vocab, max_sequence_length):
        
        self.vocab = vocab
        self.max_sequence_length = max_sequence_length
        
        self.text = text
        self.label = label
                
    def __len__(self):
        return len(self.text)
    
    def __getitem__(self, idx):
        text = self.text[idx]
        label = self.label[idx]
        text_length = len(text)
        
        # Apply padding or truncation to ensure a fixed sequence length
        if text_length < self.max_sequence_length:
            text += [-1] * (self.max_sequence_length - len(text))
        else:
            text = text[:self.max_sequence_length]
            text_length = self.max_sequence_length
        
        # Convert sequence and label to PyTorch tensors
        sequence_tensor = torch.tensor(text, dtype=torch.long)
        label_tensor = torch.tensor(label, dtype=torch.float32)
        
        return sequence_tensor, label_tensor, text_length


In [13]:
train_set_1 = CustomTextClassificationDataset(X_train, y_train, VOCAB, max_seq_length)
dev_set_1 = CustomTextClassificationDataset(X_val, y_val, VOCAB, max_seq_length)

In [14]:
train_loader1 = DataLoader(train_set_1, batch_size=BATCH_SIZE, shuffle=True)
dev_loader1 = DataLoader(dev_set_1, batch_size=BATCH_SIZE, shuffle=True)

In [15]:
class TextClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers, dropout, pad_idx):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.LSTM_layers = n_layers

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=n_layers, dropout=dropout)
        self.fc1 = nn.Linear(hidden_dim * 2, hidden_dim)
        
        self.fc2 = nn.Linear(hidden_dim, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text, text_length):
        h = torch.zeros((self.LSTM_layers, text.size(0), self.hidden_dim))
        c = torch.zeros((self.LSTM_layers, text.size(0), self.hidden_dim))

        torch.nn.init.xavier_normal_(h)
        torch.nn.init.xavier_normal_(c)

        out = self.embedding(text)
        out, (hidden, cell) = self.lstm(out, (h,c))
        out = self.dropout(out)
        out = torch.relu_(self.fc1(out[:,-1,:]))
        out = self.dropout(out)
        out = torch.sigmoid(self.fc2(out))

        return out

In [16]:
torch.cuda.is_available()

True

In [17]:
!export CUDA_LAUNCH_BLOCKING=1
warnings.filterwarnings("ignore")

INPUT_DIM = VOCAB_SIZE
EMBEDDING_DIM = 200
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
DROPOUT = 0.5
PAD = -1

model = TextClassifier(INPUT_DIM, EMBEDDING_DIM, HIDDEN_DIM, OUTPUT_DIM, N_LAYERS, DROPOUT, PAD)
optimizer = optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss()

def binary_accuracy(preds, y):
    rounded_preds = torch.round(torch.sigmoid(preds))
    correct = (rounded_preds == y).float()
    acc = correct.sum() / len(correct)
    return acc

gpu = 0 #gpu ID
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(DEVICE)
criterion = criterion.to(DEVICE)

def train(model, dataloader, optimizer, criterion, DEVICE):
    model.train()
    epoch_loss = 0
    epoch_acc = 0
    
    for batch in dataloader:  
        texts, labels, text_length = batch
        texts = texts.to(DEVICE)
        labels = labels.to(DEVICE)
        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(texts, text_length)
        loss = criterion(outputs, labels)
        acc = binary_accuracy(predictions, batch.label.float())
        
        
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()

        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, dataloader, criterion, DEVICE):
    
    model.eval()
    epoch_loss = 0
    epoch_acc = 0
    with torch.no_grad():
        for batch in dataloader:  
            texts, labels,text_length = batch
            texts = texts.to(DEVICE)
            labels = labels.to(DEVICE)
            outputs = model(texts, text_length)
            loss = criterion(outputs, labels)
            acc = binary_accuracy(predictions, batch.label.float())

            epoch_loss += loss.item()
            epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

N_EPOCHS = 10
for epoch in range(N_EPOCHS):
    train_loss, train_acc = train(model, train_loader1, optimizer, criterion,gpu)
    valid_loss, valid_acc = evaluate(model, dev_loader1, criterion)
    print(f'Epoch: {epoch + 1:02}')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc * 100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc * 100:.2f}%')


../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [94,0,0], thread: [32,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [94,0,0], thread: [33,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [94,0,0], thread: [34,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [94,0,0], thread: [35,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [94,0,0], thread: [36,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [94,0,0], thread: [37,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [94,0,0], t

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
