# HW4: Deep Learning on NER

### Setup environment

In [None]:
# Use the %pip magic so the packages are installed into the environment of the
# running kernel (a `!pip` shell escape may resolve to a different interpreter).
%pip install -q datasets accelerate

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/493.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.4/493.7 kB[0m [31m4.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Fetch the GloVe 6B vectors over HTTPS; `-c` resumes a partial download and
# does nothing when the archive is already complete, so re-running is cheap.
!wget -c https://nlp.stanford.edu/data/glove.6B.zip

--2023-11-11 04:16:17--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2023-11-11 04:16:17--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2023-11-11 04:16:17--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
# `-n` never overwrites existing files, so re-running the cell does not stop at
# unzip's interactive "replace?" prompt.
!unzip -n glove.6B.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [None]:
# `-nc` skips the download when conlleval.py is already present instead of
# saving a duplicate copy (conlleval.py.1) on every re-run.
# NOTE(review): this tracks the `master` branch; consider pinning a commit.
!wget -nc https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py

--2023-11-11 04:19:32--  https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7502 (7.3K) [text/plain]
Saving to: ‘conlleval.py’


2023-11-11 04:19:32 (33.3 MB/s) - ‘conlleval.py’ saved [7502/7502]



In [None]:
# List the working directory to confirm the GloVe files and conlleval.py exist.
!ls # Sanity check

conlleval.py	   glove.6B.200d.txt  glove.6B.50d.txt	sample_data
glove.6B.100d.txt  glove.6B.300d.txt  glove.6B.zip


## Task 0: Prepare Data

### Load dataset

In [None]:
import datasets

# Load the CoNLL-2003 dataset through the Hugging Face `datasets` library and
# display the resulting splits.
dataset = datasets.load_dataset("conll2003")
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

### Use GloVe embeddings

In [None]:
import numpy as np

# Parse the 100-dimensional GloVe file: every line is "<token> <v1> ... <v100>".
# The file is streamed line by line (instead of being read into one huge string)
# and decoded explicitly as UTF-8 so the result does not depend on the platform
# default encoding.
vocab, embeddings = [], []
with open('glove.6B.100d.txt', 'rt', encoding='utf-8') as glove_file:
  for line in glove_file:
    line = line.rstrip('\n')
    if not line:
      continue  # ignore blank lines (e.g. a trailing newline at end of file)
    word, *values = line.split(' ')
    vocab.append(word)
    embeddings.append([float(val) for val in values])

vocab_npa = np.array(vocab)
embeddings_npa = np.array(embeddings)

# Reserve row 0 for padding and row 1 for unknown words.
vocab_npa = np.insert(vocab_npa, 0, ['[PAD]', '[UNK]'])

# [PAD] is the zero vector; [UNK] is the mean of all pretrained vectors.
pad_embeddings_npa = np.zeros((1, embeddings_npa.shape[1]))
unk_embeddings_npa = np.mean(embeddings_npa, axis=0, keepdims=True)

embeddings_npa = np.vstack((pad_embeddings_npa, unk_embeddings_npa, embeddings_npa))
embeddings_npa

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.05209832, -0.09711439, -0.13807563, ...,  0.12381253,
        -0.23434524, -0.00925516],
       [-0.038194  , -0.24487   ,  0.72812   , ..., -0.1459    ,
         0.8278    ,  0.27062   ],
       ...,
       [ 0.36088   , -0.16919   , -0.32704   , ...,  0.27139   ,
        -0.29188   ,  0.16109   ],
       [-0.10461   , -0.5047    , -0.49331   , ...,  0.42527   ,
        -0.5125    , -0.17054   ],
       [ 0.28365   , -0.6263    , -0.44351   , ...,  0.43678   ,
        -0.82607   , -0.15701   ]])

In [None]:
# Map every GloVe token to its embedding row. Indices start at 2 because rows
# 0 and 1 are reserved for the [PAD] and [UNK] special tokens.
word2idx = {
    word.lower(): idx for idx, word in enumerate(vocab, start=2)
}

word2idx['[PAD]'] = 0
word2idx['[UNK]'] = 1

# Looked up once here rather than once per token inside the mapping function.
unk_id = word2idx['[UNK]']

# Lower-case each token before the lookup; out-of-vocabulary words become [UNK].
dataset = (
    dataset.map(lambda x: {
        'input_ids': [
            word2idx.get(word.lower(), unk_id) for word in x['tokens']
        ]
    })
)

# Slice the rows first so only three examples are decoded for the preview.
dataset['train'][:3]['input_ids']

[[646, 7580, 516, 582, 6, 5262, 299, 10240, 4], [1296, 9005], [3881, 1]]

In [None]:
# Drop the tag columns that are not used for NER and expose the NER tags under
# the name `labels`.
dataset = (
    dataset
    .remove_columns(['pos_tags', 'chunk_tags'])
    .rename_column('ner_tags', 'labels')
)
dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'labels', 'input_ids'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'labels', 'input_ids'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'labels', 'input_ids'],
        num_rows: 3453
    })
})

In [None]:
import torch

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
if torch.cuda.is_available():
  device = torch.device('cuda')
else:
  device = torch.device('cpu')

## Task 3: Transformer Model

### Define class

In [None]:
import torch.nn as nn

# Model hyper-parameters.
embedding_dim = 128        # width of the token / position embeddings and of the encoder
num_attention_heads = 8    # attention heads per encoder layer
seq_max_len = 128          # number of rows in the learned positional embedding table
feedforward_dims = 128     # hidden width of each encoder layer's feed-forward block
num_encoder_layers = 6     # number of stacked encoder layers

class Transformer(nn.Module):
  """Token classifier: token + learned position embeddings, a Transformer encoder, a linear head.

  Args:
    vocab_size: number of rows in the token embedding table.
    num_classes: number of output tags per token.
  """

  def __init__(self, vocab_size, num_classes):
    super().__init__()
    self.token_embedding = nn.Embedding(vocab_size, embedding_dim)
    self.positional_embedding = nn.Embedding(seq_max_len, embedding_dim)
    self.transformer_encoder = nn.TransformerEncoder(
        nn.TransformerEncoderLayer(embedding_dim, num_attention_heads, feedforward_dims),
        num_encoder_layers
    )
    self.linear = nn.Linear(embedding_dim, num_classes)

  def forward(self, src, src_key_padding_mask=None):
    """Compute per-token class scores.

    Args:
      src: LongTensor of token ids, shape (batch, seq).
      src_key_padding_mask: optional mask telling the encoder which positions
        are padding. Accepted forms:
          * (batch, seq) mask, True / non-zero at padded positions;
          * (batch, seq, seq) square validity mask with 1 marking real tokens
            (converted to the form above via its diagonal);
          * None, in which case positions whose id is 0 are treated as padding.

    Returns:
      FloatTensor of shape (batch, seq, num_classes).

    Raises:
      ValueError: if the sequence is longer than ``seq_max_len``.
    """
    seq_len = src.size(1)
    if seq_len > seq_max_len:
      raise ValueError(
          f'Sequence length {seq_len} exceeds the positional embedding size {seq_max_len}'
      )

    # Positions run along the sequence axis; the (seq, dim) position embeddings
    # broadcast over the batch axis of the (batch, seq, dim) token embeddings.
    positions = torch.arange(seq_len, device=src.device)
    embedded = self.token_embedding(src) + self.positional_embedding(positions)

    if src_key_padding_mask is None:
      src_key_padding_mask = src == 0
    elif src_key_padding_mask.dim() == 3:
      # Square validity mask: position j is a real token iff mask[j, j] is set.
      src_key_padding_mask = torch.diagonal(src_key_padding_mask, dim1=1, dim2=2) == 0
    else:
      src_key_padding_mask = src_key_padding_mask.to(torch.bool)

    # The encoder layers were built with the default batch_first=False, so they
    # expect (seq, batch, dim). The mask must be passed by keyword: the second
    # positional argument of TransformerEncoder.forward is the attention mask.
    memory = self.transformer_encoder(
        embedded.permute(1, 0, 2),
        src_key_padding_mask=src_key_padding_mask
    )

    # Back to (batch, seq, dim) so the output lines up with the label tensor.
    return self.linear(memory.permute(1, 0, 2))


In [None]:
import os

# One embedding row per vocabulary entry (including [PAD] / [UNK]) and one
# output class per NER tag.
vocab_size, num_classes = len(word2idx), 9
model = Transformer(vocab_size, num_classes)
model.to(device)

# Reuse previously saved weights when a checkpoint exists; training is skipped
# later on when this flag is set.
model_path = './task3.pt'
using_loaded_weights = os.path.exists(model_path)
if using_loaded_weights:
  # map_location lets a checkpoint saved on a GPU be restored on a CPU-only
  # runtime (and vice versa).
  model.load_state_dict(torch.load(model_path, map_location=device))
  print(f'Model loaded from {model_path}')

model



Transformer(
  (token_embedding): Embedding(400002, 128)
  (positional_embedding): Embedding(128, 128)
  (transformer_encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-5): 6 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=128, out_features=128, bias=True)
        )
        (linear1): Linear(in_features=128, out_features=128, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=128, out_features=128, bias=True)
        (norm1): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (linear): Linear(in_features=128, out_features=9, bias=True)
)

### Build data loaders

In [None]:
from torch.nn.utils.rnn import pad_sequence

def collate_fn(batch):
    """Pad a list of examples into batch tensors.

    Args:
        batch: list of dicts, each with an 'input_ids' and a 'labels' list of
            equal length.

    Returns:
        dict with
          'input_ids':    LongTensor (batch, max_len), padded with 0;
          'labels':       LongTensor (batch, max_len), padded with -100;
          'padding_mask': BoolTensor (batch, max_len), True at padded positions
                          (the layout PyTorch uses for `src_key_padding_mask`).
    """
    input_ids = [torch.tensor(item['input_ids'], dtype=torch.long) for item in batch]
    labels = [torch.tensor(item['labels'], dtype=torch.long) for item in batch]

    input_ids_padded = pad_sequence(input_ids, batch_first=True, padding_value=0)
    # -100 is the default ignore_index of nn.CrossEntropyLoss.
    labels_padded = pad_sequence(labels, batch_first=True, padding_value=-100)

    # Build the mask from the true sequence lengths rather than from `id == 0`,
    # so it stays correct even if a real token were ever mapped to id 0.
    lengths = torch.tensor([ids.size(0) for ids in input_ids])
    positions = torch.arange(input_ids_padded.size(1))
    padding_mask = positions.unsqueeze(0) >= lengths.unsqueeze(1)

    return {
        'input_ids': input_ids_padded,
        'labels': labels_padded,
        'padding_mask': padding_mask
    }


In [None]:
from torch.utils.data import DataLoader

batch_size = 32
shuffle = True  # applied to the training split only

# Only the training data is shuffled; validation and test keep the dataset
# order so that evaluation runs are deterministic and comparable.
train_loader = DataLoader(dataset['train'], batch_size=batch_size, shuffle=shuffle, collate_fn=collate_fn)
dev_loader = DataLoader(dataset['validation'], batch_size=batch_size, shuffle=False, collate_fn=collate_fn)
test_loader = DataLoader(dataset['test'], batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

In [None]:
# Console helper: emit a message in green
def print_green(text):
  """Print ``text`` between the ANSI bright-green code and the ANSI reset code."""
  green_on, colour_off = '\033[92m', '\033[0m'
  print('{}{}{}'.format(green_on, text, colour_off))

### Define training loop

In [None]:
import torch.optim as optim
from conlleval import evaluate

def train_model(model):
  """Train ``model`` and report loss and chunk-level metrics on the dev set after every epoch.

  Batches come from the module-level ``train_loader`` / ``dev_loader`` and are
  moved to the module-level ``device``. Training runs for at most 20 epochs and
  stops early once the epoch index has reached 10 and the dev F1 is at least 77.

  Args:
    model: module called as ``model(input_ids, padding_mask)``; its output is
      treated as (batch, seq, num_classes) scores.
  """
  print('Begin training Transformer')

  lr = 1e-3
  num_epochs = 20
  early_stopping_epoch, min_f1 = 10, 77

  # Label sequences are padded with -100 by collate_fn, so that is the value
  # the loss must ignore and the evaluation must filter out.
  label_pad_id = -100
  loss_fn = nn.CrossEntropyLoss(ignore_index=label_pad_id)
  optimizer = optim.Adam(model.parameters(), lr=lr)

  tag_to_index = {
      'O': 0,
      'B-PER': 1,
      'I-PER': 2,
      'B-ORG': 3,
      'I-ORG': 4,
      'B-LOC': 5,
      'I-LOC': 6,
      'B-MISC': 7,
      'I-MISC': 8
  }
  index_to_tag = {index: tag for tag, index in tag_to_index.items()}

  for epoch in range(num_epochs):
    # Training phase
    model.train()
    train_loss_total = 0
    for batch in train_loader:
      inputs = batch['input_ids'].to(device)
      labels = batch['labels'].to(device)
      padding_mask = batch['padding_mask'].to(device)

      optimizer.zero_grad()
      outputs = model(inputs, padding_mask)
      # CrossEntropyLoss expects the class axis second: (batch, classes, seq).
      loss = loss_fn(outputs.permute(0, 2, 1), labels.long())
      loss.backward()
      optimizer.step()

      train_loss_total += loss.item()

    train_loss_ave = train_loss_total / len(train_loader)
    print(f'Epoch {epoch+1}/{num_epochs}, train loss: {train_loss_ave:.4f}')

    # Evaluation phase
    model.eval()
    dev_loss_total = 0
    pred_tags_flattened = []
    true_tags_flattened = []
    with torch.no_grad():
      for batch in dev_loader:
        inputs = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        padding_mask = batch['padding_mask'].to(device)

        outputs = model(inputs, padding_mask)
        loss = loss_fn(outputs.permute(0, 2, 1), labels.long())
        dev_loss_total += loss.item()

        pred_batch = torch.argmax(outputs, dim=2).cpu().numpy()
        true_batch = labels.cpu().numpy()
        for pred_seq, true_seq in zip(pred_batch, true_batch):
          # Keep only real tokens; padded label positions carry label_pad_id.
          indices_valid = true_seq != label_pad_id
          pred_tags_flattened.extend(index_to_tag[idx] for idx in pred_seq[indices_valid])
          true_tags_flattened.extend(index_to_tag[idx] for idx in true_seq[indices_valid])

    dev_loss_ave = dev_loss_total / len(dev_loader)
    print(f'Epoch {epoch+1}/{num_epochs}, dev loss: {dev_loss_ave:.4f}')

    # Calculate metrics over the flat tag sequences of the whole dev set
    precision, recall, f1 = evaluate(true_tags_flattened, pred_tags_flattened)
    print(f'Epoch {epoch+1}/{num_epochs}, Precision: {precision}, Recall: {recall}, F1: {f1}')

    if epoch >= early_stopping_epoch and f1 >= min_f1:
      print_green('Expected F1 reached! 🚀🚀'
            f'Epoch: {epoch+1}, F1: {f1}')
      break

### Train model and save weights

In [None]:
# Train from scratch and persist the weights unless a checkpoint was restored.
if using_loaded_weights:
  print('Using loaded model weights')
else:
  print('Training model...')
  train_model(model)
  torch.save(model.state_dict(), model_path)

Training model...
Begin training BiLSTM with GloVe embeddings


RuntimeError: ignored

### Evaluate model

In [None]:
def test_model(model, loader, desc):
  """Evaluate ``model`` on ``loader`` and print precision / recall / F1 in green.

  Args:
    model: module called as ``model(input_ids, padding_mask)``; its output is
      treated as (batch, seq, num_classes) scores.
    loader: iterable of batches with 'input_ids', 'labels' and 'padding_mask'.
    desc: name of the split, used as the heading of the printed report.
  """
  tag_to_index = {
      'O': 0,
      'B-PER': 1,
      'I-PER': 2,
      'B-ORG': 3,
      'I-ORG': 4,
      'B-LOC': 5,
      'I-LOC': 6,
      'B-MISC': 7,
      'I-MISC': 8
  }
  index_to_tag = {index: tag for tag, index in tag_to_index.items()}

  # Label sequences are padded with -100 by collate_fn; those positions are not
  # real tokens and must be excluded from the metrics.
  label_pad_id = -100

  # Testing phase
  model.eval()
  pred_tags_flattened = []
  true_tags_flattened = []
  with torch.no_grad():
    for batch in loader:
      inputs = batch['input_ids'].to(device)
      labels = batch['labels'].to(device)
      padding_mask = batch['padding_mask'].to(device)

      # Pass the padding mask, exactly as during training.
      outputs = model(inputs, padding_mask)
      pred_batch = torch.argmax(outputs, dim=2).cpu().numpy()
      true_batch = labels.cpu().numpy()
      for pred_seq, true_seq in zip(pred_batch, true_batch):
        indices_valid = true_seq != label_pad_id
        pred_tags_flattened.extend(index_to_tag[idx] for idx in pred_seq[indices_valid])
        true_tags_flattened.extend(index_to_tag[idx] for idx in true_seq[indices_valid])

  # Calculate metrics over the flat tag sequences of the whole split
  precision, recall, f1 = evaluate(true_tags_flattened, pred_tags_flattened)
  print_green(f'{desc} Data:\n'
        f'Precision: {precision}, Recall: {recall}, F1: {f1}')

# Report metrics for each split in turn.
for split_loader, split_name in (
    (train_loader, 'Train'),
    (dev_loader, 'Validation'),
    (test_loader, 'Test'),
):
  test_model(model, split_loader, split_name)