In [None]:
import torch
import torch.nn as nn
import pandas as pd
import string
import pickle
import torch.nn.utils.rnn as padfunc
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [None]:
# Load the training split; `corpus` is the text column iterated by the tokenisation cell.
train_csv_path = "/kaggle/input/mydataset/train.csv"
df = pd.read_csv(train_csv_path)
corpus = df["Description"]

class MyDataset(Dataset):
    """Index-aligned pairing of two sequences: item i is (embeddings[i], labels[i])."""

    def __init__(self, embeddings, labels):
        self.embeddings, self.labels = embeddings, labels

    def __len__(self):
        # The size is taken from the inputs only; labels are not length-checked.
        return len(self.embeddings)

    def __getitem__(self, index):
        sample = self.embeddings[index]
        target = self.labels[index]
        return sample, target

In [None]:
# Tokenise every description and count word frequencies.
# Per sentence: replace '-' and '\' with spaces, strip punctuation, split on whitespace,
# keep purely alphabetic tokens (lower-cased), then wrap the result in <sos>/<eos>.
MIN_WORD_FREQ = 10  # words counted fewer times than this are left out of the vocabulary

processed_sent = []
unq_words = {}
translation_table = str.maketrans('', '', string.punctuation)
for sent in corpus:
    sent = sent.replace('-', ' ').replace('\\', ' ')
    sent = sent.translate(translation_table)
    sent = [word.lower() for word in sent.split() if word.isalpha()]
    sent = ['<sos>'] + sent + ['<eos>']
    processed_sent.append(sent)
    for word in sent:
        unq_words[word] = unq_words.get(word, 0) + 1

# Build the vocabulary in first-seen order. Going through set() made the order depend on
# string hashing, which is randomised per interpreter session, so word2idx could differ
# between runs and stop matching checkpoints saved by an earlier run.
# dict.fromkeys removes duplicates while keeping insertion order.
vocab = [word for word, count in unq_words.items() if count >= MIN_WORD_FREQ]
vocab.append('<unk>')
vocab = list(dict.fromkeys(vocab))
# Ids start at 1, leaving 0 free for the zero padding added by pad_sequence.
word2idx = {word: idx + 1 for idx, word in enumerate(vocab)}

In [None]:
class ForwardLM(nn.Module):
    """Embedding -> LSTM -> LSTM -> linear projection back to the vocabulary.

    Args:
        vocab_size: Number of rows of the embedding table and width of the output logits.
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of both LSTM layers.
        dropout_prob: Dropout probability applied to the output of each LSTM layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout_prob):
        super().__init__()
        # Hyper-parameters are kept as plain attributes for inspection.
        self.vocab_size, self.embed_dim, self.hidden_dim = vocab_size, embedding_dim, hidden_dim
        # A single dropout module is reused after both LSTM layers.
        self.dropout = nn.Dropout(dropout_prob)

        self.embed_layer = nn.Embedding(vocab_size, embedding_dim)
        self.layer1 = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.layer2 = nn.LSTM(hidden_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return ``(logits, layer1_out, layer2_out, embeddings)`` for token ids ``x``.

        Both LSTM outputs are returned after dropout has been applied to them.
        """
        embed = self.embed_layer(x)
        first_out = self.dropout(self.layer1(embed)[0])
        second_out = self.dropout(self.layer2(first_out)[0])
        logits = self.fc(second_out)
        return logits, first_out, second_out, embed


class BackwardLM(nn.Module):
    """Embedding -> LSTM -> LSTM -> linear projection back to the vocabulary.

    The module contains no reversal logic of its own; presumably the caller feeds it
    right-to-left token sequences.

    Args:
        vocab_size: Number of rows of the embedding table and width of the output logits.
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of both LSTM layers.
        dropout_prob: Dropout probability applied to the output of each LSTM layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout_prob):
        super().__init__()
        self.vocab_size, self.embed_dim, self.hidden_dim = vocab_size, embedding_dim, hidden_dim
        # Shared by both LSTM outputs.
        self.dropout = nn.Dropout(dropout_prob)

        self.embed_layer = nn.Embedding(vocab_size, embedding_dim)
        self.layer1 = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.layer2 = nn.LSTM(hidden_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return ``(logits, layer1_out, layer2_out, embeddings)`` for token ids ``x``."""
        embed = self.embed_layer(x)
        states1, _ = self.layer1(embed)
        states1 = self.dropout(states1)
        states2, _ = self.layer2(states1)
        states2 = self.dropout(states2)
        return self.fc(states2), states1, states2, embed

In [None]:
# Turn every tokenised sentence into a next-token pair:
# the input drops the final token, the target drops the first one.
inputs, outputs = [], []

for sent in processed_sent:
    token_ids = [word2idx.get(word, word2idx['<unk>']) for word in sent]
    input_ids, target_ids = token_ids[:-1], token_ids[1:]
    inputs.append(torch.tensor(input_ids))
    outputs.append(torch.tensor(target_ids))

## Forward LM

In [None]:
# Pad every sequence to the longest one (batch dimension first); pad_sequence fills with 0 by default.
pad_inputs = padfunc.pad_sequence(inputs, batch_first=True)
pad_outputs = padfunc.pad_sequence(outputs, batch_first=True)

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


In [None]:
# Training configuration for the forward language model.
num_epochs = 10
learning_rate = 0.001
batch_size = 256

# +1 because word ids start at 1; id 0 is the padding value.
model = ForwardLM(len(vocab) + 1, 300, 300, 0.5)
# The targets are zero-padded, so class 0 must be excluded from the loss. Otherwise the
# objective is dominated by the trivial task of predicting padding after the sentence ends.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)
model = model.to(device)

In [None]:
# Shuffled mini-batches of (padded input ids, padded next-token ids).
forward_lm_dataset = MyDataset(pad_inputs, pad_outputs)
train_loader = DataLoader(dataset=forward_lm_dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Train the language model; the epoch loss is the batch-size-weighted mean of the batch losses.
model.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    progress = tqdm(train_loader, total = len(train_loader), desc="Training")
    for batch_input, batch_output in progress:
        batch_input = batch_input.to(device)
        batch_output = batch_output.to(device)
        optimizer.zero_grad()
        logits = model(batch_input)[0]  # first element of the model's output tuple
        # Collapse batch and time so each position is scored as one classification.
        vocab_dim = logits.size(-1)
        loss = criterion(logits.view(-1, vocab_dim), batch_output.view(-1))
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * batch_input.size(0)
    epoch_train_loss = running_loss / len(train_loader.dataset)
    print(f"Epoch {epoch + 1}/{num_epochs}: Train Loss: {epoch_train_loss:.4f}")


# Persist only the weights (state_dict), not the module object.
model_path = '/kaggle/working/forward1.pt'
torch.save(model.state_dict(), model_path)


Training: 100%|██████████| 469/469 [02:53<00:00,  2.71it/s]


Epoch 1/10: Train Loss: 1.3881


Training: 100%|██████████| 469/469 [02:52<00:00,  2.72it/s]


Epoch 2/10: Train Loss: 1.1385


Training: 100%|██████████| 469/469 [02:53<00:00,  2.71it/s]


Epoch 3/10: Train Loss: 1.0637


Training: 100%|██████████| 469/469 [02:53<00:00,  2.71it/s]


Epoch 4/10: Train Loss: 1.0167


Training: 100%|██████████| 469/469 [02:52<00:00,  2.71it/s]


Epoch 5/10: Train Loss: 0.9851


Training: 100%|██████████| 469/469 [02:53<00:00,  2.71it/s]


Epoch 6/10: Train Loss: 0.9615


Training: 100%|██████████| 469/469 [02:53<00:00,  2.71it/s]


Epoch 7/10: Train Loss: 0.9425


Training: 100%|██████████| 469/469 [02:53<00:00,  2.70it/s]


Epoch 8/10: Train Loss: 0.9267


Training: 100%|██████████| 469/469 [02:52<00:00,  2.71it/s]


Epoch 9/10: Train Loss: 0.9135


Training: 100%|██████████| 469/469 [02:52<00:00,  2.71it/s]

Epoch 10/10: Train Loss: 0.9022





## Backward LM

In [None]:
# Backward LM pairs: reading right-to-left, the reversed forward targets become the
# inputs and the reversed forward inputs become the targets.
rev_outputs = [inp.flip(0) for inp in inputs]
rev_inputs = [oup.flip(0) for oup in outputs]

In [None]:
# Pad the reversed sequences on the right (batch dimension first); pad_sequence fills with 0 by default.
b_pad_inputs = padfunc.pad_sequence(rev_inputs, batch_first=True)
b_pad_outputs = padfunc.pad_sequence(rev_outputs, batch_first=True)

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
cuda_ready = torch.cuda.is_available()
device = "cuda" if cuda_ready else "cpu"
print(f"Using {device} device")

Using cuda device


In [None]:
# Training configuration for the backward language model.
num_epochs = 10
learning_rate = 0.001
batch_size = 256

# +1 because word ids start at 1; id 0 is the padding value.
model = BackwardLM(len(vocab) + 1, 300, 300, 0.5)
# The targets are zero-padded, so class 0 must be excluded from the loss. Otherwise the
# objective is dominated by the trivial task of predicting padding after the sentence ends.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr = learning_rate)
model = model.to(device)

In [None]:
# Shuffled mini-batches of (padded reversed input ids, padded reversed target ids).
backward_lm_dataset = MyDataset(b_pad_inputs, b_pad_outputs)
train_loader = DataLoader(dataset=backward_lm_dataset, batch_size=batch_size, shuffle=True)

In [None]:
# Train the language model; the epoch loss is the batch-size-weighted mean of the batch losses.
model.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    for batch_input, batch_output in tqdm(train_loader, total = len(train_loader), desc="Training"):
        batch_input = batch_input.to(device)
        batch_output = batch_output.to(device)
        optimizer.zero_grad()
        # Only the logits (first element of the returned tuple) are needed for the loss.
        logits, *_ = model(batch_input)
        flat_logits = logits.view(-1, logits.size(-1))
        flat_targets = batch_output.view(-1)
        loss = criterion(flat_logits, flat_targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * batch_input.size(0)
    epoch_train_loss = running_loss / len(train_loader.dataset)
    print(f"Epoch {epoch + 1}/{num_epochs}: Train Loss: {epoch_train_loss:.4f}")


# Persist only the weights (state_dict), not the module object.
model_path = '/kaggle/working/backward1.pt'
torch.save(model.state_dict(), model_path)

Training: 100%|██████████| 469/469 [02:53<00:00,  2.71it/s]


Epoch 1/10: Train Loss: 1.3884


Training: 100%|██████████| 469/469 [02:52<00:00,  2.71it/s]


Epoch 2/10: Train Loss: 1.1473


Training: 100%|██████████| 469/469 [02:52<00:00,  2.72it/s]


Epoch 3/10: Train Loss: 1.0659


Training: 100%|██████████| 469/469 [02:52<00:00,  2.72it/s]


Epoch 4/10: Train Loss: 1.0165


Training: 100%|██████████| 469/469 [02:52<00:00,  2.72it/s]


Epoch 5/10: Train Loss: 0.9847


Training: 100%|██████████| 469/469 [02:52<00:00,  2.72it/s]


Epoch 6/10: Train Loss: 0.9613


Training: 100%|██████████| 469/469 [02:52<00:00,  2.72it/s]


Epoch 7/10: Train Loss: 0.9425


Training: 100%|██████████| 469/469 [02:52<00:00,  2.71it/s]


Epoch 8/10: Train Loss: 0.9270


Training: 100%|██████████| 469/469 [02:52<00:00,  2.72it/s]


Epoch 9/10: Train Loss: 0.9141


Training: 100%|██████████| 469/469 [02:52<00:00,  2.72it/s]

Epoch 10/10: Train Loss: 0.9029





In [None]:
# Save the word -> id mapping so text can be encoded with the same ids later.
dict_path = "/kaggle/working/word2idx.pkl"
with open(dict_path, mode='wb') as file:
    pickle.dump(word2idx, file)

## Downstream Classification

In [None]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import string
import torch.nn.utils.rnn as padfunc
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

In [None]:
class DownstreamModel(nn.Module):
    """Sequence classifier on top of two frozen language models.

    For each token position the three representations returned by each language model
    (layer-1 output, layer-2 output, embeddings) are concatenated across the two
    directions and combined with learnable scalars (``lambdas``). The combination is fed
    to an LSTM whose outputs are averaged over time and projected to class logits.

    Args:
        input_dim: Feature size of the combined representation fed to the classifier LSTM.
        hidden_dim: Hidden size of the classifier LSTM.
        num_classes: Number of output classes.
        num_layers: Number of layers of the classifier LSTM.
        forwardLM: Module returning ``(logits, layer1, layer2, embed)``; its parameters are frozen.
        backwardLM: Module with the same return contract, applied to ``xb``; also frozen.
        num_lam: Number of mixing scalars to allocate; ``forward`` reads the first three.
        pad_idx: Token id used to right-pad ``xb`` (default 0, the default fill value of
            ``pad_sequence``). Used to find each sequence's real length.
    """

    def __init__(self, input_dim, hidden_dim, num_classes, num_layers, forwardLM, backwardLM, num_lam, pad_idx=0):
        super(DownstreamModel, self).__init__()
        self.num_lam = num_lam
        self.pad_idx = pad_idx
        self.lambdas = nn.Parameter(torch.rand(self.num_lam))
        self.forwardLM = forwardLM
        self.backwardLM = backwardLM
        self.layer = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.output = nn.Linear(hidden_dim, num_classes)
        # Not used by forward(); kept so the module layout stays unchanged.
        self.softmax = nn.Softmax(dim=1)
        # The language models act as fixed feature extractors.
        for param in self.forwardLM.parameters():
            param.requires_grad = False
        for param in self.backwardLM.parameters():
            param.requires_grad = False

    @staticmethod
    def _reverse_valid(feats, lengths):
        """Reverse the first ``lengths[i]`` time steps of row i; leave the remaining steps in place."""
        steps = torch.arange(feats.size(1), device=feats.device).unsqueeze(0)
        lengths = lengths.unsqueeze(1)
        # Inside the valid prefix read from the mirrored position, elsewhere read in place.
        source = torch.where(steps < lengths, lengths - 1 - steps, steps)
        return feats.gather(1, source.unsqueeze(-1).expand(-1, -1, feats.size(2)))

    def forward(self, xf, xb):
        """Return class logits of shape (batch, num_classes).

        Args:
            xf: Token ids for the forward LM, right-padded with ``pad_idx``.
            xb: The same sentences in reverse token order for the backward LM,
                right-padded with ``pad_idx``.
        """
        _, f_lstm1, f_lstm2, f_embed = self.forwardLM(xf)
        _, b_lstm1, b_lstm2, b_embed = self.backwardLM(xb)

        # Number of real tokens per row once the first time step is dropped.
        lengths = (xb[:, 1:] != self.pad_idx).sum(dim=1)

        def paired(f_feats, b_feats):
            # Drop the first step of each direction, then put the backward features back
            # into left-to-right order. Flipping the whole padded axis would move the
            # padding to the front and misalign every sequence shorter than the batch width.
            f_feats = f_feats[:, 1:, :]
            b_feats = self._reverse_valid(b_feats[:, 1:, :], lengths)
            return torch.cat((f_feats, b_feats), dim=2)

        lstm1 = paired(f_lstm1, b_lstm1)
        lstm2 = paired(f_lstm2, b_lstm2)
        embed = paired(f_embed, b_embed)

        final_embed = (self.lambdas[0] * lstm1) + (self.lambdas[1] * lstm2) + (self.lambdas[2] * embed)
        out, _ = self.layer(final_embed)
        out = out.mean(dim = 1)
        out = self.output(out)
        return out

In [None]:
class ForwardLM(nn.Module):
    """Embedding -> LSTM -> LSTM -> linear projection back to the vocabulary.

    Args:
        vocab_size: Number of rows of the embedding table and width of the output logits.
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of both LSTM layers.
        dropout_prob: Dropout probability applied to the output of each LSTM layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout_prob):
        super().__init__()
        # Hyper-parameters are kept as plain attributes for inspection.
        self.vocab_size, self.embed_dim, self.hidden_dim = vocab_size, embedding_dim, hidden_dim
        # A single dropout module is reused after both LSTM layers.
        self.dropout = nn.Dropout(dropout_prob)

        self.embed_layer = nn.Embedding(vocab_size, embedding_dim)
        self.layer1 = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.layer2 = nn.LSTM(hidden_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return ``(logits, layer1_out, layer2_out, embeddings)`` for token ids ``x``.

        Both LSTM outputs are returned after dropout has been applied to them.
        """
        embed = self.embed_layer(x)
        first_out = self.dropout(self.layer1(embed)[0])
        second_out = self.dropout(self.layer2(first_out)[0])
        logits = self.fc(second_out)
        return logits, first_out, second_out, embed


class BackwardLM(nn.Module):
    """Embedding -> LSTM -> LSTM -> linear projection back to the vocabulary.

    The module contains no reversal logic of its own; presumably the caller feeds it
    right-to-left token sequences.

    Args:
        vocab_size: Number of rows of the embedding table and width of the output logits.
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of both LSTM layers.
        dropout_prob: Dropout probability applied to the output of each LSTM layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, dropout_prob):
        super().__init__()
        self.vocab_size, self.embed_dim, self.hidden_dim = vocab_size, embedding_dim, hidden_dim
        # Shared by both LSTM outputs.
        self.dropout = nn.Dropout(dropout_prob)

        self.embed_layer = nn.Embedding(vocab_size, embedding_dim)
        self.layer1 = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.layer2 = nn.LSTM(hidden_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        """Return ``(logits, layer1_out, layer2_out, embeddings)`` for token ids ``x``."""
        embed = self.embed_layer(x)
        states1, _ = self.layer1(embed)
        states1 = self.dropout(states1)
        states2, _ = self.layer2(states1)
        states2 = self.dropout(states2)
        return self.fc(states2), states1, states2, embed

In [None]:
class MyDataset(Dataset):
    """Index-aligned pairing of two sequences: item i is (embeddings[i], labels[i])."""

    def __init__(self, embeddings, labels):
        self.embeddings, self.labels = embeddings, labels

    def __len__(self):
        # The size is taken from the inputs only; labels are not length-checked.
        return len(self.embeddings)

    def __getitem__(self, index):
        return (self.embeddings[index], self.labels[index])

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


In [None]:
# Reload the word -> id mapping from disk.
# NOTE: pickle can execute arbitrary code on load; only open files this pipeline wrote itself.
dict_path = '/kaggle/working/word2idx.pkl'

with open(dict_path, mode='rb') as file:
    word2idx = pickle.load(file)

In [None]:
# Read both CSV splits and convert each frame to a NumPy array (one row per example).
train_frame = pd.read_csv('/kaggle/input/mydataset/train.csv')
test_frame = pd.read_csv('/kaggle/input/mydataset/test.csv')
dataset = np.asarray(train_frame)
test_dataset = np.asarray(test_frame)

In [None]:
translation_table = str.maketrans('', '', string.punctuation)


def _encode_rows(rows):
    """Convert (label, text) rows into token-id lists and zero-based label tensors.

    Each text goes through: replace '-' and '\\' with spaces, strip punctuation, split on
    whitespace, keep alphabetic tokens (lower-cased), wrap in <sos>/<eos>, map to ids with
    <unk> as the fallback. Each row must unpack into exactly two values.
    """
    encoded, labels = [], []
    for label, text in rows:
        text = text.replace('-', ' ').replace('\\', ' ').translate(translation_table)
        tokens = [tok.lower() for tok in text.split() if tok.isalpha()]
        tokens = ['<sos>'] + tokens + ['<eos>']
        encoded.append([word2idx.get(tok, word2idx['<unk>']) for tok in tokens])
        # Labels in the file are shifted down by one to become zero-based class ids.
        labels.append(torch.tensor(label - 1, dtype=torch.long))
    return encoded, labels


inputs, outputs = _encode_rows(dataset)
test_inputs, test_outputs = _encode_rows(test_dataset)

In [None]:
# Forward input: the sentence without its last token.
# Backward input: the sentence without its first token, in reverse order.
finputs = [torch.tensor(ids[:-1]) for ids in inputs]
binputs = [torch.tensor(ids[1:][::-1]) for ids in inputs]

In [None]:
# Pad both views on the right (batch dimension first); pad_sequence fills with 0 by default.
pad_finputs = padfunc.pad_sequence(finputs, batch_first=True)
pad_binputs = padfunc.pad_sequence(binputs, batch_first=True)

In [None]:
# One (forward ids, backward ids) tuple per example.
com_inputs = [(fwd_ids, bwd_ids) for fwd_ids, bwd_ids in zip(pad_finputs, pad_binputs)]

In [None]:
# Rebuild the forward LM and restore its trained weights.
# The vocabulary size comes from the loaded word2idx (+1 for padding id 0), so this cell
# does not depend on a `vocab` list that only exists while the pre-training cells are in memory.
fpath = '/kaggle/input/forwardlm/pytorch/forward1/1/forward1.pt'
fmodel = ForwardLM(len(word2idx) + 1, 300, 300, 0.5)
# map_location lets a checkpoint saved on GPU load on a CPU-only machine as well.
fmodel.load_state_dict(torch.load(fpath, map_location=device))

<All keys matched successfully>

In [None]:
# Rebuild the backward LM and restore its trained weights.
# The vocabulary size comes from the loaded word2idx (+1 for padding id 0), so this cell
# does not depend on a `vocab` list that only exists while the pre-training cells are in memory.
bpath = '/kaggle/input/backwardlm/pytorch/backward1/1/backward1.pt'
bmodel = BackwardLM(len(word2idx) + 1, 300, 300, 0.5)
bmodel.load_state_dict(torch.load(bpath, map_location=device))

<All keys matched successfully>

In [None]:
# Training configuration for the downstream classifier.
num_epochs = 5
batch_size = 256
learning_rate = 0.001

model = DownstreamModel(
    input_dim=600,
    hidden_dim=300,
    num_classes=4,
    num_layers=1,
    forwardLM=fmodel,
    backwardLM=bmodel,
    num_lam=3,
)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
model = model.to(device)

In [None]:
# Shuffled mini-batches of ((forward ids, backward ids), class label).
classifier_train_set = MyDataset(com_inputs, outputs)
train_loader = DataLoader(dataset=classifier_train_set, batch_size=batch_size, shuffle=True)

In [None]:
# Train the classifier; the epoch loss is the batch-size-weighted mean of the batch losses.
model.train()
for epoch in range(num_epochs):
    running_loss = 0.0
    progress = tqdm(train_loader, total = len(train_loader), desc="Training")
    for batch_input, batch_output in progress:
        batch_output = batch_output.to(device)
        # Each batch input carries the forward view first and the backward view second.
        batch_input_0, batch_input_1 = batch_input[0].to(device), batch_input[1].to(device)
        optimizer.zero_grad()
        loss = criterion(model(batch_input_0, batch_input_1), batch_output)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * batch_input_0.size(0)
    epoch_train_loss = running_loss / len(train_loader.dataset)
    print(f"Epoch {epoch + 1}/{num_epochs}: Train Loss: {epoch_train_loss:.4f}")

# Persist only the weights (state_dict), not the module object.
model_path = '/kaggle/working/classifier2.pt'
torch.save(model.state_dict(), model_path)

Training: 100%|██████████| 469/469 [02:38<00:00,  2.95it/s]


Epoch 1/5: Train Loss: 1.1066


Training: 100%|██████████| 469/469 [02:38<00:00,  2.96it/s]


Epoch 2/5: Train Loss: 0.6576


Training: 100%|██████████| 469/469 [02:38<00:00,  2.96it/s]


Epoch 3/5: Train Loss: 0.5364


Training: 100%|██████████| 469/469 [02:38<00:00,  2.96it/s]


Epoch 4/5: Train Loss: 0.4735


Training: 100%|██████████| 469/469 [02:38<00:00,  2.96it/s]

Epoch 5/5: Train Loss: 0.4338





In [None]:
# Forward input: the sentence without its last token.
# Backward input: the sentence without its first token, in reverse order.
test_finputs = [torch.tensor(ids[:-1]) for ids in test_inputs]
test_binputs = [torch.tensor(ids[1:][::-1]) for ids in test_inputs]

In [None]:
# Pad both test views on the right (batch dimension first); pad_sequence fills with 0 by default.
test_pad_finputs = padfunc.pad_sequence(test_finputs, batch_first=True)
test_pad_binputs = padfunc.pad_sequence(test_binputs, batch_first=True)

In [None]:
# One (forward ids, backward ids) tuple per test example.
test_com_inputs = [(fwd_ids, bwd_ids) for fwd_ids, bwd_ids in zip(test_pad_finputs, test_pad_binputs)]

In [None]:
# Evaluation batches of ((forward ids, backward ids), class label).
# No shuffling: sample order does not affect the metrics, and a fixed order keeps
# predicted_labels aligned with the rows of the test file across runs.
test_loader = DataLoader(dataset=MyDataset(test_com_inputs, test_outputs), batch_size=batch_size, shuffle=False)

In [None]:
# Collect predictions and ground-truth labels over the test loader, without gradient tracking.
model.eval()
predicted_labels, true_labels = [], []
with torch.no_grad():
    progress = tqdm(test_loader, total=len(test_loader), desc="Testing")
    for batch_input, batch_output in progress:
        batch_output = batch_output.to(device)
        batch_input_0, batch_input_1 = batch_input[0].to(device), batch_input[1].to(device)
        model_output = model(batch_input_0, batch_input_1)
        # The predicted class is the index of the largest logit.
        pred = model_output.argmax(dim=1)
        predicted_labels.extend(pred.cpu().numpy())
        true_labels.extend(batch_output.cpu().numpy())

Testing: 100%|██████████| 30/30 [00:07<00:00,  3.84it/s]


In [None]:
# Macro-averaged metrics weight every class equally.
macro = {"average": 'macro'}
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, **macro)
recall = recall_score(true_labels, predicted_labels, **macro)
f1 = f1_score(true_labels, predicted_labels, **macro)
conf_matrix = confusion_matrix(true_labels, predicted_labels)

report = (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
    ("Confusion Matrix:\n", conf_matrix),
)
for caption, value in report:
    print(caption, value)

Accuracy: 0.8763157894736842
Precision: 0.8785693988774963
Recall: 0.8763157894736842
F1 Score: 0.8763736711184595
Confusion Matrix:
 [[1625   82  123   70]
 [  36 1811   36   17]
 [  53   29 1675  143]
 [  54   36  261 1549]]


In [None]:
# Freeze the mixing scalars by looking the parameter up under its registered name.
# NOTE(review): this only flips the flag; the training and evaluation cells have to be
# re-run for it to influence any reported metric.
lambdas_param = dict(model.named_parameters()).get('lambdas')
if lambdas_param is not None:
    lambdas_param.requires_grad = False

In [None]:
# Recompute the metrics from whatever the most recent evaluation cell left in
# true_labels / predicted_labels.
averaging = 'macro'
accuracy = accuracy_score(true_labels, predicted_labels)
precision = precision_score(true_labels, predicted_labels, average=averaging)
recall = recall_score(true_labels, predicted_labels, average=averaging)
f1 = f1_score(true_labels, predicted_labels, average=averaging)
conf_matrix = confusion_matrix(true_labels, predicted_labels)

for caption, value in [
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
    ("Confusion Matrix:\n", conf_matrix),
]:
    print(caption, value)

Accuracy: 0.8555263157894737
Precision: 0.8553084311452964
Recall: 0.8555263157894737
F1 Score: 0.8548244476494145
Confusion Matrix:
 [[1687   73   69   71]
 [  60 1779   26   35]
 [ 158   32 1469  241]
 [ 106   52  175 1567]]


In [None]:
# Bare expression: the notebook displays the module's repr (its sub-module tree).
model

DownstreamModel(
  (forwardLM): ForwardLM(
    (dropout): Dropout(p=0.5, inplace=False)
    (embed_layer): Embedding(16616, 300)
    (layer1): LSTM(300, 300, batch_first=True)
    (layer2): LSTM(300, 300, batch_first=True)
    (fc): Linear(in_features=300, out_features=16616, bias=True)
  )
  (backwardLM): BackwardLM(
    (dropout): Dropout(p=0.5, inplace=False)
    (embed_layer): Embedding(16616, 300)
    (layer1): LSTM(300, 300, batch_first=True)
    (layer2): LSTM(300, 300, batch_first=True)
    (fc): Linear(in_features=300, out_features=16616, bias=True)
  )
  (layer): LSTM(600, 300, batch_first=True)
  (output): Linear(in_features=300, out_features=4, bias=True)
  (softmax): Softmax(dim=1)
)