In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install torch
!pip install pytorch-crf
!pip install torchtext
!pip install nltk

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m58.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m70.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m88.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import numpy as np
from collections import defaultdict
import nltk


from torchcrf import CRF
from torchtext.vocab import GloVe

In [None]:
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# Define device for training based on CUDA availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
print(torch.cuda.is_available())

Using device: cuda
True


In [None]:
def extract_data(seq_in_path, seq_out_path, label_path):
    """Extracts texts, slot tags, labels, and POS tags from files."""
    with open(seq_in_path, 'r', encoding='utf-8') as seq_in_file, \
         open(seq_out_path, 'r', encoding='utf-8') as seq_out_file, \
         open(label_path, 'r', encoding='utf-8') as label_file:
        texts = [line.strip().split() for line in seq_in_file.readlines()]
        slot_tags = [line.strip().split() for line in seq_out_file.readlines()]
        labels = [line.strip() for line in label_file.readlines()]

    # Generate POS tags for each sentence in texts
    pos_tags = [nltk.pos_tag(sentence) for sentence in texts]

    # Extract just the tags, discarding the words
    pos_tags_only = [[tag for word, tag in sentence] for sentence in pos_tags]

    return texts, slot_tags, labels, pos_tags_only

In [None]:
def prepare_data(texts, slot_tags, pos_tags_only):
    """Prepares data by creating word, tag, and POS tag indices."""
    word_to_ix = {"<PAD>": 0}
    tag_to_ix = {"<PAD>": 0}
    pos_to_ix = {"<PAD>": 0}  # Initialize POS tags index dictionary

    for sentence in texts:
        for word in sentence:
            if word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)

    for tags in slot_tags:
        for tag in tags:
            if tag not in tag_to_ix:
                tag_to_ix[tag] = len(tag_to_ix)

    for pos_sentence in pos_tags_only:
        for pos_tag in pos_sentence:
            if pos_tag not in pos_to_ix:
                pos_to_ix[pos_tag] = len(pos_to_ix)

    return word_to_ix, tag_to_ix, pos_to_ix

In [None]:
class DatasetObj(Dataset):
    """Custom Dataset class to handle data loading, including POS tags."""
    def __init__(self, texts, slot_tags, pos_tags, labels=None, word_to_ix=None, tag_to_ix=None, pos_to_ix=None):
        self.texts = texts
        self.slot_tags = slot_tags
        self.pos_tags = pos_tags  # New: Store POS tags
        self.labels = labels
        self.word_to_ix = word_to_ix
        self.tag_to_ix = tag_to_ix
        self.pos_to_ix = pos_to_ix  # New: Store POS to index mapping
        self.sentences = [[self.word_to_ix[word] for word in sentence] for sentence in texts]
        self.tags = [[self.tag_to_ix[tag] for tag in slot_tag] for slot_tag in slot_tags]
        self.pos = [[self.pos_to_ix[pos] for pos in pos_sentence] for pos_sentence in pos_tags]  # New: Convert POS tags to indices

        if labels is not None:
            self.label_to_ix = {label: i for i, label in enumerate(set(labels))}
            self.labels_ix = [self.label_to_ix[label] for label in labels]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        sentence_tensor = torch.tensor(self.sentences[idx], dtype=torch.long)
        tags_tensor = torch.tensor(self.tags[idx], dtype=torch.long)
        pos_tensor = torch.tensor(self.pos[idx], dtype=torch.long)  # New: Create a tensor for POS tags

        if self.labels is not None:
            label_tensor = torch.tensor(self.labels_ix[idx], dtype=torch.long)
            return sentence_tensor, tags_tensor, pos_tensor, label_tensor  # New: Return POS tags tensor
        else:
            return sentence_tensor, tags_tensor, pos_tensor  # New: Include POS tensor in the return

    @staticmethod
    def collate_fn(batch):
        sentences, tags, pos_tags, *optional_labels = zip(*batch)  # New: Extract POS tags from batch
        sentences_padded = torch.nn.utils.rnn.pad_sequence(sentences, batch_first=True, padding_value=0)
        tags_padded = torch.nn.utils.rnn.pad_sequence(tags, batch_first=True, padding_value=0)
        pos_tags_padded = torch.nn.utils.rnn.pad_sequence(pos_tags, batch_first=True, padding_value=0)  # New: Pad POS tags

        if optional_labels:
            labels = torch.tensor(optional_labels[0], dtype=torch.long)
            return sentences_padded, tags_padded, pos_tags_padded, labels  # Return padded POS tags
        else:
            return sentences_padded, tags_padded, pos_tags_padded  # Include POS padding in return


In [None]:
# Choose dataset (atis, snips):
ds = "atis"

In [None]:
# Paths for the training dataset
train_seq_in_path = f'/content/drive/MyDrive/data/{ds}/train/seq.in'
train_seq_out_path = f'/content/drive/MyDrive/data/{ds}/train/seq.out'
train_label_path = f'/content/drive/MyDrive/data/{ds}/train/label'

# Paths for the development dataset
dev_seq_in_path = f'/content/drive/MyDrive/data/{ds}/dev/seq.in'
dev_seq_out_path = f'/content/drive/MyDrive/data/{ds}/dev/seq.out'
dev_label_path = f'/content/drive/MyDrive/data/{ds}/dev/label'

# Paths for the test dataset
test_seq_in_path = f'/content/drive/MyDrive/data/{ds}/test/seq.in'
test_seq_out_path = f'/content/drive/MyDrive/data/{ds}/test/seq.out'
test_label_path = f'/content/drive/MyDrive/data/{ds}/test/label'

In [None]:
# Extract data for each split
train_texts, train_slot_tags, train_labels, train_pos_tags = extract_data(train_seq_in_path, train_seq_out_path, train_label_path)
dev_texts, dev_slot_tags, dev_labels, dev_pos_tags = extract_data(dev_seq_in_path, dev_seq_out_path, dev_label_path)
test_texts, test_slot_tags, test_labels, test_pos_tags = extract_data(test_seq_in_path, test_seq_out_path, test_label_path)

all_texts = train_texts + dev_texts + test_texts
all_slot_tags = train_slot_tags + dev_slot_tags + test_slot_tags
all_pos_tags = train_pos_tags + dev_pos_tags + test_pos_tags  # Combine all POS tags

# Adjust the call to prepare_data to include POS tags
word_to_ix, tag_to_ix, pos_to_ix = prepare_data(all_texts, all_slot_tags, all_pos_tags)


32
{'<PAD>': 0, 'NN': 1, 'VBP': 2, 'TO': 3, 'VB': 4, 'IN': 5, 'NNS': 6, 'JJR': 7, 'CD': 8, 'JJ': 9, 'PRP': 10, 'DT': 11, 'VBG': 12, 'WDT': 13, 'CC': 14, 'VBZ': 15, 'RB': 16, 'WRB': 17, 'MD': 18, 'RP': 19, 'PDT': 20, 'WP': 21, 'JJS': 22, 'VBN': 23, 'PRP$': 24, 'EX': 25, 'VBD': 26, 'FW': 27, 'RBS': 28, 'UH': 29, 'NNP': 30, 'RBR': 31}


In [None]:
glove = GloVe(name='6B', dim=300)

.vector_cache/glove.6B.zip: 862MB [02:39, 5.39MB/s]                           
100%|█████████▉| 399999/400000 [00:54<00:00, 7344.18it/s]


In [None]:
def create_embedding_matrix(word_to_ix, glove):
    """Creates an embedding matrix for the vocabulary."""
    embedding_dim = glove.dim
    embeddings = torch.randn(len(word_to_ix), embedding_dim)
    embeddings[word_to_ix["<PAD>"]] = torch.zeros(embedding_dim)
    for word, ix in word_to_ix.items():
        if word in glove.stoi:
            embeddings[ix] = glove[word]
    return embeddings


def create_pos_embedding_matrix(pos_to_ix, pos_embedding_dim):
    """Creates an embedding matrix for POS tags."""
    embeddings = torch.randn(len(pos_to_ix), pos_embedding_dim)
    embeddings[pos_to_ix["<PAD>"]] = torch.zeros(pos_embedding_dim)  # Zero vector for padding
    return embeddings


In [None]:
class SentenceLevelPrediction(nn.Module):
    def __init__(self, hidden_dim, num_labels):
        super(SentenceLevelPrediction, self).__init__()
        self.pooling = nn.AdaptiveMaxPool1d(1)
        self.fc = nn.Linear(hidden_dim, num_labels)

    def forward(self, lstm_out):
        pooled = self.pooling(lstm_out.transpose(1, 2)).squeeze(-1)
        logits = self.fc(pooled)
        return torch.sigmoid(logits)


In [None]:
class Discriminator(nn.Module):
    def __init__(self, input_dim):
        super(Discriminator, self).__init__()
        self.fc1 = nn.Linear(input_dim * 2, 128)
        self.fc2 = nn.Linear(128, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        return x

In [None]:
class Model(nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, pretrained_word_embeddings=None, pretrained_pos_embeddings=None, pos_vocab_size=None, pos_embedding_dim=None, num_labels=None):
        super(Model, self).__init__()

        # For word embeddings
        self.word_embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

        # For POS embeddings
        self.pos_embedding = nn.Embedding(pos_vocab_size, pos_embedding_dim)

        # Adjust LSTM input size to sum of word and POS embedding dimensions
        lstm_input_dim = embedding_dim + pos_embedding_dim
        self.lstm = nn.LSTM(lstm_input_dim, hidden_dim // 2, num_layers=1, bidirectional=True)


        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.crf = CRF(tagset_size, batch_first=True)

        # Adversarial MI Estimation Components
        self.discriminator = Discriminator(hidden_dim)

        # Initialize the sentence-level prediction module if num_labels is provided
        if num_labels is not None:
            self.sentence_level_predictor = nn.Linear(hidden_dim, num_labels)

        # Auxiliary components
        self.word_context_classifier = nn.Linear(hidden_dim, tagset_size)

        self.context2label = nn.Linear(hidden_dim, tagset_size)

        self.sentence_label_classifier = nn.Linear(hidden_dim, tagset_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, sentence,pos_tags):
        word_embeds = self.word_embedding(sentence)
        pos_embeds = self.pos_embedding(pos_tags)
        embeds = torch.cat((word_embeds, pos_embeds), dim=-1)
        lstm_out, _ = self.lstm(embeds)
        tag_space = self.hidden2tag(lstm_out)

        sentence_representation = torch.mean(lstm_out, dim=1)
        sentence_level_logits = None
        if hasattr(self, 'sentence_level_predictor'):
            sentence_level_logits = torch.sigmoid(self.sentence_level_predictor(sentence_representation))

        word_labels_from_context = self.context2label(torch.mean(lstm_out, dim=1))

        return tag_space, lstm_out, sentence_level_logits, word_labels_from_context

    def compute_discriminator_loss(self, lstm_out, device):
        batch_size, seq_len, hidden_dim = lstm_out.size()

        # context vector generation
        context_vectors = lstm_out.mean(dim=1, keepdim=True).expand(-1, seq_len, -1)

        joint_samples = torch.cat((lstm_out, context_vectors), dim=-1)

        # Randomly shuffle context vectors
        idx = torch.randperm(batch_size)
        marginal_context_vectors = context_vectors[idx]

        marginal_samples = torch.cat((lstm_out, marginal_context_vectors), dim=-1)

        # Compute discriminator loss
        true_preds = self.discriminator(joint_samples.view(-1, hidden_dim * 2))
        false_preds = self.discriminator(marginal_samples.view(-1, hidden_dim * 2))

        true_labels = torch.ones(true_preds.size(), device=device)
        false_labels = torch.zeros(false_preds.size(), device=device)

        disc_loss = F.binary_cross_entropy(torch.cat((true_preds, false_preds), dim=0),
                                           torch.cat((true_labels, false_labels), dim=0))

        return disc_loss

    def compute_auxiliary_losses(self, lstm_out, tags, sentence_lengths, labels=None):
        # Initialization of loss components
        auxiliary_loss = 0

        lstm_out_flat = lstm_out.contiguous().view(-1, lstm_out.shape[-1])
        tags_flat = tags.contiguous().view(-1)
        word_context_logits = self.word_context_classifier(lstm_out_flat)
        word_context_loss = nn.CrossEntropyLoss()(word_context_logits, tags_flat)
        auxiliary_loss += word_context_loss

        if labels is not None and hasattr(self, 'sentence_level_predictor'):
            sentence_representation = torch.mean(lstm_out, dim=1)
            sentence_label_logits = self.sentence_level_predictor(sentence_representation)
            sentence_label_loss = nn.BCEWithLogitsLoss()(sentence_label_logits, labels.float())
            auxiliary_loss += sentence_label_loss

        return auxiliary_loss


    def loss(self, tag_space, lstm_out, sentence_level_logits, word_labels_from_context, tags, alpha, beta, gamma, labels=None):
        crf_loss = -self.crf(tag_space, tags, mask=(tags != 0), reduction='mean')
        disc_loss = self.compute_discriminator_loss(lstm_out, device)

        auxiliary_loss = 0
        if labels is not None:
            if sentence_level_logits is not None:
                sentence_level_loss = F.binary_cross_entropy_with_logits(sentence_level_logits, labels.float())
                auxiliary_loss += sentence_level_loss

        total_loss = alpha * crf_loss + beta * disc_loss + gamma * auxiliary_loss
        return total_loss


    def predict(self, sentences,pos_tags):
        tag_space, _, _, _ = self.forward(sentences,pos_tags)
        tags = self.crf.decode(tag_space)
        return tags


In [None]:
def train_and_validate_model(model, train_loader, dev_loader, optimizer, num_epochs, device, alpha, beta, gamma):
    best_validation_loss = float('inf')

    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0
        for sentences, pos_tags, tags, labels in train_loader:  # Adjusted to unpack pos_tags
            sentences, pos_tags, tags = sentences.to(device), pos_tags.to(device), tags.to(device)
            labels = labels.to(device) if labels is not None else None

            optimizer.zero_grad()

            # Call the model's forward method with both sentences and pos_tags
            outputs = model(sentences, pos_tags)
            tag_space, lstm_out, sentence_level_logits, word_labels_from_context = outputs


            # Use the outputs in the model's loss function
            loss = model.loss(tag_space=tag_space, lstm_out=lstm_out, sentence_level_logits=sentence_level_logits,
                  word_labels_from_context=word_labels_from_context, tags=tags, alpha=alpha, beta=beta, gamma=gamma, labels=labels)

            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

        # Calculate average train loss
        avg_train_loss = total_train_loss / len(train_loader)

        # Validation
        model.eval()
        total_validation_loss = 0
        with torch.no_grad():
            for sentences, pos_tags, tags, labels in dev_loader:  # Adjusted to unpack pos_tags
                sentences, pos_tags, tags = sentences.to(device), pos_tags.to(device), tags.to(device)
                labels = labels.to(device) if labels is not None else None

                # Compute the loss for validation, similarly updated
                tag_space, lstm_out, sentence_level_logits, word_labels_from_context = model(sentences, pos_tags)
                val_loss = model.loss(tag_space, lstm_out, sentence_level_logits,
                                      word_labels_from_context,tags, alpha, beta, gamma, labels=labels)  # Adjusted for validation
                total_validation_loss += val_loss.item()

        # Calculate average validation loss
        avg_validation_loss = total_validation_loss / len(dev_loader)

        print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_validation_loss:.4f}")

        # Save the model if validation loss has improved
        if avg_validation_loss < best_validation_loss:
            best_validation_loss = avg_validation_loss
            torch.save(model.state_dict(), f'model_best_validation.pth')
            print("Model saved with improved validation loss.")

    return best_validation_loss


In [None]:
def evaluate_model(model, test_loader, device=torch.device("cuda" if torch.cuda.is_available() else "cpu")):
    model.to(device)
    model.eval()

    true_tags, pred_tags = [], []

    with torch.no_grad():
        for sentences, pos_tags, tags, additional_data in test_loader:
            sentences, pos_tags, tags = sentences.to(device), pos_tags.to(device), tags.to(device)
            predicted_tags_batch = model.predict(sentences, pos_tags)
            predicted_tags_batch = torch.tensor(predicted_tags_batch, dtype=torch.long, device=device)
            predicted_tags_batch = predicted_tags_batch.view(-1).cpu().numpy()

            true_tags.extend(tags.view(-1).cpu().numpy())
            pred_tags.extend(predicted_tags_batch)

    # Calculate evaluation metrics
    accuracy = accuracy_score(true_tags, pred_tags)
    precision, recall, f1, _ = precision_recall_fscore_support(true_tags, pred_tags, average='macro', zero_division=1)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")


In [None]:
def calculate_max_pos_vocab_size(data_loaders):
    max_index = -1
    for loader in data_loaders:
        for batch in loader:
            # Use index-based access for pos_tags
            pos_tags = batch[1]
            current_max = pos_tags.max().item()
            max_index = max(max_index, current_max)
    # Since indices are zero-based, add 1 to get the correct vocabulary size.
    pos_vocab_size = max_index + 1
    return pos_vocab_size


In [None]:
# Dimension for POS tag embeddings
pos_embedding_dim = 50
pretrained_embeddings = create_embedding_matrix(word_to_ix, glove)
pos_embeddings = create_pos_embedding_matrix(pos_to_ix, pos_embedding_dim)  # POS tag embeddings

max_index = max(pos_to_ix.values())

vocab_size = len(word_to_ix)
tagset_size = len(tag_to_ix)

embedding_dim = glove.dim
hidden_dim = 200

learning_rate = 0.001
num_epochs = 15
batch_size = 1

tradeoff_params = {'alpha': .1, 'beta': .1, 'gamma': .1}

best_val_loss = float('inf')

In [None]:
train_dataset = DatasetObj(train_texts, train_slot_tags, train_pos_tags, train_labels, word_to_ix, tag_to_ix, pos_to_ix)
dev_dataset = DatasetObj(dev_texts, dev_slot_tags, dev_pos_tags, dev_labels, word_to_ix, tag_to_ix, pos_to_ix)
test_dataset = DatasetObj(test_texts, test_slot_tags, test_pos_tags, test_labels, word_to_ix, tag_to_ix, pos_to_ix)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=DatasetObj.collate_fn)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False, collate_fn=DatasetObj.collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=DatasetObj.collate_fn)


In [None]:
pos_vocab_size = calculate_max_pos_vocab_size([train_loader, dev_loader, test_loader])
model = Model(vocab_size=vocab_size, tagset_size=tagset_size, embedding_dim=embedding_dim, hidden_dim=hidden_dim,
              pretrained_word_embeddings=pretrained_embeddings, pretrained_pos_embeddings=pos_embeddings,
              pos_vocab_size=pos_vocab_size, pos_embedding_dim=pos_embedding_dim).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

val_loss = train_and_validate_model(model, train_loader, dev_loader, optimizer, num_epochs, device, **tradeoff_params)

if val_loss < best_val_loss:
   best_val_loss = val_loss

print(f"Best Validation Loss: {best_val_loss}")


Epoch 1, Train Loss: 0.4251, Validation Loss: 0.2800
Model saved with improved validation loss.
Epoch 2, Train Loss: 0.2355, Validation Loss: 0.2440
Model saved with improved validation loss.
Epoch 3, Train Loss: 0.2072, Validation Loss: 0.2317
Model saved with improved validation loss.
Epoch 4, Train Loss: 0.1952, Validation Loss: 0.2243
Model saved with improved validation loss.
Epoch 5, Train Loss: 0.1884, Validation Loss: 0.2219
Model saved with improved validation loss.
Epoch 6, Train Loss: 0.1834, Validation Loss: 0.2196
Model saved with improved validation loss.
Epoch 7, Train Loss: 0.1810, Validation Loss: 0.2230
Epoch 8, Train Loss: 0.1781, Validation Loss: 0.2284
Epoch 9, Train Loss: 0.1769, Validation Loss: 0.2214
Epoch 10, Train Loss: 0.1753, Validation Loss: 0.2287
Epoch 11, Train Loss: 0.1746, Validation Loss: 0.2208
Epoch 12, Train Loss: 0.1732, Validation Loss: 0.2281
Epoch 13, Train Loss: 0.1730, Validation Loss: 0.2305
Epoch 14, Train Loss: 0.1721, Validation Loss: 0.

In [None]:
evaluate_model(model, test_loader)

  score = torch.where(mask[i].unsqueeze(1), next_score, score)


Accuracy: 0.9470
Precision: 0.9012, Recall: 0.7854, F1: 0.7994
