In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install torch
!pip install pytorch-crf
!pip install torchtext
!pip install nltk

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m33.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m67.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m88.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Downloading nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader

from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

import numpy as np
from collections import defaultdict
import nltk


from torchcrf import CRF
from torchtext.vocab import GloVe

In [4]:
nltk.download('punkt')
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [5]:
# Define device for training based on CUDA availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
print(torch.cuda.is_available())

Using device: cuda
True


In [6]:
def extract_data(seq_in_path, seq_out_path, label_path):
    with open(seq_in_path, 'r', encoding='utf-8') as seq_in_file, \
         open(seq_out_path, 'r', encoding='utf-8') as seq_out_file, \
         open(label_path, 'r', encoding='utf-8') as label_file:

        texts = [line.strip().split() for line in seq_in_file.readlines()]
        slot_tags = [line.strip().split() for line in seq_out_file.readlines()]
        labels = [line.strip() for line in label_file.readlines()]

    return texts, slot_tags, labels

In [7]:
def prepare_data(texts, slot_tags):
    word_to_ix = {"<PAD>": 0}
    tag_to_ix = {"<PAD>": 0}

    for sentence in texts:
        for word in sentence:
            if word not in word_to_ix:
                word_to_ix[word] = len(word_to_ix)

    for tags in slot_tags:
        for tag in tags:
            if tag not in tag_to_ix:
                tag_to_ix[tag] = len(tag_to_ix)

    return word_to_ix, tag_to_ix

In [8]:
from torch.utils.data import Dataset, DataLoader
import torch

class DatasetObj(Dataset):
    def __init__(self, texts, slot_tags, labels=None, word_to_ix=None, tag_to_ix=None):
        self.texts = texts
        self.slot_tags = slot_tags
        self.labels = labels
        self.word_to_ix = word_to_ix
        self.tag_to_ix = tag_to_ix

        self.sentences = [[self.word_to_ix[word] for word in sentence] for sentence in texts]
        self.tags = [[self.tag_to_ix[tag] for tag in slot_tag] for slot_tag in slot_tags]
        if labels is not None:
            self.label_to_ix = {label: i for i, label in enumerate(set(labels))}
            self.labels_ix = [self.label_to_ix[label] for label in labels]

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        sentence_tensor = torch.tensor(self.sentences[idx], dtype=torch.long)
        tags_tensor = torch.tensor(self.tags[idx], dtype=torch.long)

        if self.labels is not None:
            label_tensor = torch.tensor(self.labels_ix[idx], dtype=torch.long)
            return sentence_tensor, tags_tensor, label_tensor
        else:
            return sentence_tensor, tags_tensor

    def collate_fn(batch):
        sentences, tags, *optional_labels = zip(*batch)
        sentences_padded = torch.nn.utils.rnn.pad_sequence(sentences, batch_first=True, padding_value=0)
        tags_padded = torch.nn.utils.rnn.pad_sequence(tags, batch_first=True, padding_value=0)

        if optional_labels:
            labels = torch.tensor(optional_labels[0], dtype=torch.long)
            return sentences_padded, tags_padded, labels
        else:
            return sentences_padded, tags_padded


In [9]:
def print_examples(texts, slot_tags, labels=None, num_examples=5):
    for i in range(min(num_examples, len(texts))):
        print("Sentence:", " ".join(texts[i]))
        print("Tags:", " ".join(slot_tags[i]))
        if labels is not None:
            print("Label:", labels[i])
        print()

In [10]:
# Choose dataset (atis, snips):
ds = "atis"

In [11]:
# Paths for the training dataset
train_seq_in_path = f'/content/drive/MyDrive/data/{ds}/train/seq.in'
train_seq_out_path = f'/content/drive/MyDrive/data/{ds}/train/seq.out'
train_label_path = f'/content/drive/MyDrive/data/{ds}/train/label'

# Paths for the development dataset
dev_seq_in_path = f'/content/drive/MyDrive/data/{ds}/dev/seq.in'
dev_seq_out_path = f'/content/drive/MyDrive/data/{ds}/dev/seq.out'
dev_label_path = f'/content/drive/MyDrive/data/{ds}/dev/label'

# Paths for the test dataset
test_seq_in_path = f'/content/drive/MyDrive/data/{ds}/test/seq.in'
test_seq_out_path = f'/content/drive/MyDrive/data/{ds}/test/seq.out'
test_label_path = f'/content/drive/MyDrive/data/{ds}/test/label'

# Extract data for each split
train_texts, train_slot_tags, train_labels = extract_data(train_seq_in_path, train_seq_out_path, train_label_path)
dev_texts, dev_slot_tags, dev_labels = extract_data(dev_seq_in_path, dev_seq_out_path, dev_label_path)
test_texts, test_slot_tags, test_labels = extract_data(test_seq_in_path, test_seq_out_path, test_label_path)

In [12]:
all_texts = train_texts + dev_texts + test_texts
all_slot_tags = train_slot_tags + dev_slot_tags + test_slot_tags

word_to_ix, tag_to_ix = prepare_data(all_texts, all_slot_tags)

In [13]:
glove = GloVe(name='6B', dim=300)

.vector_cache/glove.6B.zip: 862MB [02:52, 5.01MB/s]                           
100%|█████████▉| 399999/400000 [00:54<00:00, 7275.75it/s]


In [14]:
def create_embedding_matrix(word_to_ix, glove):
    embedding_dim = glove.dim
    embeddings = torch.randn(len(word_to_ix), embedding_dim)
    embeddings[word_to_ix["<PAD>"]] = torch.zeros(embedding_dim)  # Zero embedding for padding

    for word, ix in word_to_ix.items():
        if word in glove.stoi:
            embeddings[ix] = glove[word]

    return embeddings

pretrained_embeddings = create_embedding_matrix(word_to_ix, glove)

In [15]:
class SentenceLevelPrediction(nn.Module):
    def __init__(self, hidden_dim, num_labels):
        super(SentenceLevelPrediction, self).__init__()
        self.pooling = nn.AdaptiveMaxPool1d(1)
        self.fc = nn.Linear(hidden_dim, num_labels)

    def forward(self, lstm_out):
        pooled = self.pooling(lstm_out.transpose(1, 2)).squeeze(-1)
        logits = self.fc(pooled)
        return torch.sigmoid(logits)


In [16]:
class Discriminator(nn.Module):
    def __init__(self, input_dim):
        super(Discriminator, self).__init__()
        self.fc1 = nn.Linear(input_dim * 2, 128)
        self.fc2 = nn.Linear(128, 1)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        return x

In [17]:
class Model(nn.Module):
    def __init__(self, vocab_size, tagset_size, embedding_dim, hidden_dim, pretrained_embeddings=None, num_labels=None):
        super(Model, self).__init__()
        if pretrained_embeddings is not None:
            self.embedding = nn.Embedding.from_pretrained(pretrained_embeddings, freeze=False)
        else:
            self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2, num_layers=1, bidirectional=True)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.crf = CRF(tagset_size, batch_first=True)

        # Adversarial MI Estimation Components
        self.discriminator = Discriminator(hidden_dim)

        # Initialize the sentence-level prediction module if num_labels is provided
        if num_labels is not None:
            self.sentence_level_predictor = nn.Linear(hidden_dim, num_labels)

        # Auxiliary components
        self.word_context_classifier = nn.Linear(hidden_dim, tagset_size)

        self.context2label = nn.Linear(hidden_dim, tagset_size)  # <-- New code

        self.sentence_label_classifier = nn.Linear(hidden_dim, tagset_size)
        self.sigmoid = nn.Sigmoid()

    def forward(self, sentence):
        embeds = self.embedding(sentence)
        lstm_out, _ = self.lstm(embeds)
        tag_space = self.hidden2tag(lstm_out)

        sentence_representation = torch.mean(lstm_out, dim=1)  # Mean pooling as an example
        sentence_level_logits = None
        if hasattr(self, 'sentence_level_predictor'):
            sentence_level_logits = torch.sigmoid(self.sentence_level_predictor(sentence_representation))

        word_labels_from_context = self.context2label(torch.mean(lstm_out, dim=1))  # <-- New code

        return tag_space, lstm_out, sentence_level_logits, word_labels_from_context  # <-- Modified return statement

    def compute_discriminator_loss(self, lstm_out, device):
        batch_size, seq_len, hidden_dim = lstm_out.size()

        # Simple context vector generation
        context_vectors = lstm_out.mean(dim=1, keepdim=True).expand(-1, seq_len, -1)

        joint_samples = torch.cat((lstm_out, context_vectors), dim=-1)

        # Randomly shuffle context vectors for marginal samples
        idx = torch.randperm(batch_size)
        marginal_context_vectors = context_vectors[idx]

        marginal_samples = torch.cat((lstm_out, marginal_context_vectors), dim=-1)

        # Compute discriminator loss
        true_preds = self.discriminator(joint_samples.view(-1, hidden_dim * 2))
        false_preds = self.discriminator(marginal_samples.view(-1, hidden_dim * 2))

        true_labels = torch.ones(true_preds.size(), device=device)
        false_labels = torch.zeros(false_preds.size(), device=device)

        disc_loss = F.binary_cross_entropy(torch.cat((true_preds, false_preds), dim=0),
                                           torch.cat((true_labels, false_labels), dim=0))

        return disc_loss

    def compute_auxiliary_losses(self, lstm_out, tags, sentence_lengths, labels=None):
        # Initialization of loss components
        auxiliary_loss = 0

        lstm_out_flat = lstm_out.contiguous().view(-1, lstm_out.shape[-1])
        tags_flat = tags.contiguous().view(-1)
        word_context_logits = self.word_context_classifier(lstm_out_flat)
        word_context_loss = nn.CrossEntropyLoss()(word_context_logits, tags_flat)
        auxiliary_loss += word_context_loss

        if labels is not None and hasattr(self, 'sentence_level_predictor'):
            sentence_representation = torch.mean(lstm_out, dim=1)
            sentence_label_logits = self.sentence_level_predictor(sentence_representation)
            sentence_label_loss = nn.BCEWithLogitsLoss()(sentence_label_logits, labels.float())
            auxiliary_loss += sentence_label_loss

        return auxiliary_loss


    def loss(self, tag_space, lstm_out, sentence_level_logits, word_labels_from_context, tags, alpha, beta, gamma, labels=None):
        crf_loss = -self.crf(tag_space, tags, mask=(tags != 0), reduction='mean')
        disc_loss = self.compute_discriminator_loss(lstm_out, device)

        auxiliary_loss = 0
        if labels is not None:
            if sentence_level_logits is not None:
                sentence_level_loss = F.binary_cross_entropy_with_logits(sentence_level_logits, labels.float())
                auxiliary_loss += sentence_level_loss

        total_loss = alpha * crf_loss + beta * disc_loss + gamma * auxiliary_loss
        return total_loss


    def predict(self, sentences):
        tag_space, _, _, _ = self.forward(sentences)
        tags = self.crf.decode(tag_space)
        return tags


In [18]:
def train_and_validate_model(model, train_loader, dev_loader, optimizer, num_epochs, device, alpha, beta, gamma):
    best_validation_loss = float('inf')

    for epoch in range(num_epochs):
        model.train()
        total_train_loss = 0
        for sentences, tags, labels in train_loader:
            sentences, tags = sentences.to(device), tags.to(device)
            labels = labels.to(device) if labels is not None else None

            optimizer.zero_grad()

            # Corrected: Ensure variables are defined by capturing the model's forward method outputs
            outputs = model(sentences)  # Call the model's forward method
            tag_space, lstm_out, sentence_level_logits, word_labels_from_context = outputs

            # Now use the outputs in the model's loss function
            loss = model.loss(tag_space=tag_space, lstm_out=lstm_out, sentence_level_logits=sentence_level_logits,
                  word_labels_from_context=word_labels_from_context, tags=tags,
                  alpha=alpha, beta=beta, gamma=gamma, labels=labels)
            loss.backward()
            optimizer.step()

            total_train_loss += loss.item()

        # Calculate average train loss
        avg_train_loss = total_train_loss / len(train_loader)

        # Validation
        model.eval()
        total_validation_loss = 0
        with torch.no_grad():
            for sentences, tags, labels in dev_loader:
                sentences, tags = sentences.to(device), tags.to(device)
                labels = labels.to(device) if labels is not None else None

                # Compute the loss for validation, similarly updated
                tag_space, lstm_out, sentence_level_logits, word_labels_from_context = model(sentences)  # <-- Updated call for validation
                val_loss = model.loss(tag_space, lstm_out, sentence_level_logits, word_labels_from_context,  # <-- New validation parameters
                                      tags, alpha, beta, gamma, labels=labels)  # Adjusted for validation
                total_validation_loss += val_loss.item()


        # Calculate average validation loss
        avg_validation_loss = total_validation_loss / len(dev_loader)

        print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}, Validation Loss: {avg_validation_loss:.4f}")

        # Save the model if validation loss has improved
        if avg_validation_loss < best_validation_loss:
            best_validation_loss = avg_validation_loss
            torch.save(model.state_dict(), f'model_best_validation.pth')
            print("Model saved with improved validation loss.")

    return best_validation_loss

In [19]:
def evaluate_model(model, test_loader, device=torch.device("cuda" if torch.cuda.is_available() else "cpu")):
    model.to(device)
    model.eval()

    true_tags, pred_tags = [], []

    with torch.no_grad():
        for batch in test_loader:
            try:
                sentences, tags, _ = batch  # Attempt to unpack as if labels are present
            except ValueError:
                sentences, tags = batch  # Fallback if only sentences and tags are returned

            sentences, tags = sentences.to(device), tags.to(device)

            predicted_tags_batch = model.predict(sentences)
            # Convert predicted tags to the same format as true_tags for evaluation
            predicted_tags_batch = torch.tensor(predicted_tags_batch, dtype=torch.long, device=device)
            predicted_tags_batch = predicted_tags_batch.view(-1).cpu().numpy()

            true_tags.extend(tags.view(-1).cpu().numpy())
            pred_tags.extend(predicted_tags_batch)

    # Calculate evaluation metrics here, such as accuracy, and print them out
    accuracy = accuracy_score(true_tags, pred_tags)
    precision, recall, f1, _ = precision_recall_fscore_support(true_tags, pred_tags, average='macro', zero_division=1)
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}, Recall: {recall:.4f}, F1: {f1:.4f}")


In [28]:
vocab_size = len(word_to_ix)
tagset_size = len(tag_to_ix)

embedding_dim = 200
hidden_dim = 200

learning_rate = 0.001
num_epochs = 15
batch_size = 1

tradeoff_params = {'alpha': 1.0, 'beta': 1.0, 'gamma': 1.0}

best_val_loss = float('inf')

In [29]:
train_dataset = DatasetObj(train_texts, train_slot_tags, train_labels, word_to_ix, tag_to_ix)
dev_dataset = DatasetObj(dev_texts, dev_slot_tags, dev_labels, word_to_ix, tag_to_ix)
test_dataset = DatasetObj(test_texts, test_slot_tags, test_labels, word_to_ix, tag_to_ix)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=DatasetObj.collate_fn)
dev_loader = DataLoader(dev_dataset, batch_size=batch_size, shuffle=False, collate_fn=DatasetObj.collate_fn)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, collate_fn=DatasetObj.collate_fn)

In [26]:
model = Model(vocab_size, tagset_size, embedding_dim, hidden_dim, pretrained_embeddings=pretrained_embeddings).to(device)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
val_loss = train_and_validate_model(model, train_loader, dev_loader, optimizer, num_epochs, device, **tradeoff_params)

if val_loss < best_val_loss:
   best_val_loss = val_loss

print(f"Best Validation Loss: {best_val_loss}")


Epoch 1, Train Loss: 5.5936, Validation Loss: 3.8964
Model saved with improved validation loss.
Epoch 2, Train Loss: 3.0887, Validation Loss: 3.3431
Model saved with improved validation loss.
Epoch 3, Train Loss: 2.7422, Validation Loss: 3.1388
Model saved with improved validation loss.
Epoch 4, Train Loss: 2.5867, Validation Loss: 3.0070
Model saved with improved validation loss.
Epoch 5, Train Loss: 2.4915, Validation Loss: 3.0386
Epoch 6, Train Loss: 2.4312, Validation Loss: 2.9683
Model saved with improved validation loss.
Epoch 7, Train Loss: 2.3867, Validation Loss: 3.0379
Epoch 8, Train Loss: 2.3512, Validation Loss: 2.9707
Epoch 9, Train Loss: 2.3286, Validation Loss: 3.0192
Epoch 10, Train Loss: 2.3098, Validation Loss: 2.9732
Epoch 11, Train Loss: 2.2842, Validation Loss: 3.0276
Epoch 12, Train Loss: 2.2749, Validation Loss: 3.0432
Epoch 13, Train Loss: 2.2578, Validation Loss: 3.0664
Epoch 14, Train Loss: 2.2463, Validation Loss: 3.1224
Epoch 15, Train Loss: 2.2376, Validati

In [27]:
evaluate_model(model, test_loader)

Accuracy: 0.9045
Precision: 0.7032, Recall: 0.5471, F1: 0.4793
