**set up environment:**

In [None]:
!rm -rf half-distant-labeled-datas/ distant-labeled-datas/ test-datas/ train-datas/ val-datas/

!unzip final_slides_test.zip -d test-datas
!unzip final_slides_train.zip -d train-datas
!unzip final_slides_val.zip -d val-datas
!unzip final_slides_json_distant.zip -d distant-labeled-datas
!unzip final_slides_json_distant_half.zip -d half-distant-labeled-datas

!rm -rf self-datas-1/ self-datas-2/

!unzip final_slides_self_train_1.zip -d self-datas-1
!unzip final_slides_self_train_2.zip -d self-datas-2


Archive:  final_slides_test.zip
   creating: test-datas/final_slides_test/
   creating: test-datas/final_slides_test/CS-0441 Lecture Slides/
  inflating: test-datas/final_slides_test/CS-0441 Lecture Slides/lec17.json  
  inflating: test-datas/final_slides_test/CS-0441 Lecture Slides/lec18.json  
  inflating: test-datas/final_slides_test/CS-0441 Lecture Slides/lec19.json  
  inflating: test-datas/final_slides_test/CS-0441 Lecture Slides/lec20.json  
  inflating: test-datas/final_slides_test/CS-0441 Lecture Slides/lec21.json  
  inflating: test-datas/final_slides_test/CS-0441 Lecture Slides/lec22.json  
  inflating: test-datas/final_slides_test/CS-0441 Lecture Slides/lec23.json  
Archive:  final_slides_train.zip
   creating: train-datas/final_slides_train/
   creating: train-datas/final_slides_train/CS-0449 Lecture Slides/
  inflating: train-datas/final_slides_train/CS-0449 Lecture Slides/01_CS449_Introduction.json  
  inflating: train-datas/final_slides_train/CS-0449 Lecture Slides/02_C

In [None]:
!pip install transformers seqeval[gpu]
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.dummy import DummyClassifier
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import XLNetTokenizer, XLNetConfig, XLNetForTokenClassification
from transformers import BertTokenizer, BertConfig, BertForTokenClassification
from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
import json
import os
import random
from seqeval.metrics import classification_report

def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch's
    CPU generator; when CUDA is available, all CUDA devices are seeded too.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

set_seed(0)

In [None]:
from pathlib import Path
import itertools

# Directories the zip archives are extracted into.
train_path = 'train-datas'
val_path = 'val-datas'
test_path = 'test-datas'
distant_label_dir_path = 'distant-labeled-datas'
half_distant_label_dir_path = 'half-distant-labeled-datas'
self_label_1_path = 'self-datas-1'
self_label_2_path = 'self-datas-2'
# Flags read by get_data(): is_flat selects the "one dict per file" layout,
# window enables concatenating WINDOW_SIZE consecutive items into one sample.
is_flat = False
window = False
# When set, the (half-)distantly labeled data is appended to the training set.
is_distant_label = False
is_half_distant_label = False
# When False, the validation split is merged into the training set and the
# per-epoch validation / early stopping steps are skipped.
is_validation = True
WINDOW_SIZE=1
# Selects tokenizer, special tokens and model class ('XLNet' or anything else for BERT).
#model_type = 'BERT'
model_type = 'XLNet'

class EarlyStopper:
    """Signal when the validation loss has stopped improving.

    ``patience`` is the number of "bad" evaluations tolerated before stopping;
    an evaluation is bad when its loss exceeds the best loss seen so far by
    more than ``min_delta``. Any strictly lower loss resets the counter.
    """

    def __init__(self, patience=1, min_delta=0):
        self.patience, self.min_delta = patience, min_delta
        self.counter = 0
        self.min_validation_loss = float('inf')

    def early_stop(self, validation_loss):
        """Record ``validation_loss`` and return True when training should stop."""
        best_so_far = self.min_validation_loss
        if validation_loss < best_so_far:
            # New best: remember it and forgive earlier bad evaluations.
            self.min_validation_loss = validation_loss
            self.counter = 0
            return False
        if validation_loss > best_so_far + self.min_delta:
            self.counter += 1
            return self.counter >= self.patience
        # Within the tolerance band: neither an improvement nor a strike.
        return False

def get_data(data_path, flat=None, use_window=None, window_size=None):
    """Load word/label sequences from every JSON file below ``data_path``.

    The directory layout expected is ``data_path/<subdir>/**/<file>.json``;
    JSON files placed directly in ``data_path`` are not picked up.

    Args:
        data_path: Root directory to scan.
        flat: If true, each file is a single dict with ``'words'`` and
            ``'labels'`` keys; otherwise each file is a list of such dicts.
            Defaults to the module-level ``is_flat`` flag.
        use_window: If true (and not ``flat``), every sample is the
            concatenation of ``window_size`` consecutive items starting at
            each position (windows near the end of a file are shorter).
            Defaults to the module-level ``window`` flag.
        window_size: Number of items per window. Defaults to ``WINDOW_SIZE``.

    Returns:
        A ``(sentences, labels)`` tuple of parallel lists.
    """
    # Fall back to the notebook-level configuration when not overridden.
    flat = is_flat if flat is None else flat
    use_window = window if use_window is None else use_window
    window_size = WINDOW_SIZE if window_size is None else window_size

    sentences = []
    labels = []
    for subdir in Path(data_path).glob('*'):
        for path in subdir.glob('**/*.json'):
            if not path.is_file():
                continue
            print(path)
            # Context manager closes the handle; JSON is read as UTF-8
            # instead of the platform default encoding.
            with open(path, encoding='utf-8') as f:
                file_data = json.load(f)
            if flat:
                sentences.append(file_data['words'])
                labels.append(file_data['labels'])
            elif not use_window:
                for item in file_data:
                    sentences.append(item['words'])
                    labels.append(item['labels'])
            else:
                for start in range(len(file_data)):
                    window_items = file_data[start:start + window_size]
                    sentences.append([w for item in window_items for w in item['words']])
                    labels.append([l for item in window_items for l in item['labels']])
    return (sentences, labels)

def get_eval_data(data_path):
    """Load word/label sequences grouped per JSON file (one group per deck).

    Scans ``data_path/<subdir>/**/<file>.json``. Each file must contain a list
    of dicts with ``'words'`` and ``'labels'`` keys.

    Returns:
        ``(sentences, labels)``: two dicts keyed by the file name without its
        extension. Files sharing the same stem overwrite each other.
    """
    sentences = dict()
    labels = dict()
    for subdir in Path(data_path).glob('*'):
        for path in subdir.glob('**/*.json'):
            if not path.is_file():
                continue
            print(path)
            # Context manager closes the handle; JSON is read as UTF-8
            # instead of the platform default encoding.
            with open(path, encoding='utf-8') as f:
                file_data = json.load(f)
            deck_name = path.stem
            sentences[deck_name] = [item['words'] for item in file_data]
            labels[deck_name] = [item['labels'] for item in file_data]
    return sentences, labels


# Build one DataFrame per split; each row holds a word list ('sentence') and
# its parallel label list ('word_labels').
sentences, labels = get_data(test_path)
test_dataset = pd.DataFrame({'sentence': sentences, 'word_labels': labels})
# Same test data again, but kept separate per source file (used for the
# per-deck concept evaluation).
sentences, labels = get_eval_data(test_path)
test_block_datasets = [pd.DataFrame(({'sentence': sentences[file], 'word_labels': labels[file]})) for file in sentences]
print(test_block_datasets)
sentences, labels = get_data(train_path)
train_dataset = pd.DataFrame({'sentence': sentences, 'word_labels': labels})
sentences, labels = get_data(val_path)
validation_dataset = pd.DataFrame({'sentence': sentences, 'word_labels': labels})
# The two halves of the data used for self-training.
sentences, labels = get_data(self_label_1_path)
self_training_dataset_1 = pd.DataFrame({'sentence': sentences, 'word_labels': labels})
sentences, labels = get_data(self_label_2_path)
self_training_dataset_2 = pd.DataFrame({'sentence': sentences, 'word_labels': labels})
# NOTE: if both distant-label flags are set, the half-distant data below
# overwrites d_data, so only the half-distant set is appended to training.
if is_distant_label:
  d_sentences, d_labels = get_data(distant_label_dir_path)
  d_data = pd.DataFrame({'sentence': d_sentences, 'word_labels': d_labels})
  print(d_data)

if is_half_distant_label:
  d_sentences, d_labels = get_data(half_distant_label_dir_path)
  d_data = pd.DataFrame({'sentence': d_sentences, 'word_labels': d_labels})
  print(d_data)

print(train_dataset)
print(validation_dataset)
print(test_dataset)

#train_size = 0.8
#train_dataset = big_train_dataset.sample(frac=train_size,random_state=200)
#validation_dataset = big_train_dataset.drop(train_dataset.index).reset_index(drop=True)
#train_dataset, validation_dataset = np.split(big_train_dataset, [int(train_size*len(big_train_dataset))])
#validation_dataset = validation_dataset.reset_index(drop=True)
# Optionally extend the training set with distantly labeled data.
if is_distant_label or is_half_distant_label:
  train_dataset = pd.concat([train_dataset, d_data])
  pass
# Without a validation phase, the validation split is used as extra training data.
if not is_validation:
  train_dataset = pd.concat([train_dataset, validation_dataset])
# Re-number rows after the concatenations so each row has a unique index.
train_dataset = train_dataset.reset_index(drop=True)
self_training_dataset = pd.concat([self_training_dataset_1, self_training_dataset_2])
self_training_dataset = self_training_dataset.reset_index(drop=True)

print("TRAIN Dataset: {}".format(train_dataset.shape))
print("VAL Dataset: {}".format(validation_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))


**preparing dataset and dataloader:**

In [None]:
import math
import copy

# Fixed sequence length every sample is truncated/padded to.
MAX_LEN = 512
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 50
LEARNING_RATE = 1e-05
# Maximum gradient norm passed to clip_grad_norm_ in train().
MAX_GRAD_NORM = 10
# Tokenizer and special-token strings are chosen to match model_type.
tokenizer = XLNetTokenizer.from_pretrained('xlnet-base-cased') if model_type == 'XLNet' else BertTokenizer.from_pretrained('bert-base-uncased')
CLS_TOKEN = "<cls>" if model_type == 'XLNet' else "[CLS]"
SEP_TOKEN = "<sep>" if model_type == 'XLNet' else "[SEP]"
PAD_TOKEN = "<pad>" if model_type == 'XLNet' else "[PAD]"

# B/I/O tag <-> class id mappings (the two dicts are inverses of each other).
label2id = {
    'B': 0,
    'I': 1,
    'O': 2
 }
id2label = {
    0: 'B',
    1: 'I',
    2: 'O'
}

def tokenize_and_preserve_labels(sentence, text_labels, tokenizer):
    """Tokenize a sentence word by word while keeping labels aligned.

    Sub-word tokenization splits one word into several pieces, which breaks
    the one-label-per-word alignment. Each word is therefore tokenized on its
    own and its label is repeated once per produced piece.

    Returns:
        ``(pieces, piece_labels)``: two parallel lists.
    """
    pieces_out, labels_out = [], []
    for word, label in zip(sentence, text_labels):
        pieces = tokenizer.tokenize(word)
        pieces_out += pieces
        # One copy of the word's label for every sub-word piece.
        labels_out += [label] * len(pieces)
    return pieces_out, labels_out

class dataset(Dataset):
    """Token-classification dataset over a DataFrame of word/label lists.

    Each item is tokenized on the fly, wrapped in the model-specific special
    tokens, truncated/padded to ``max_len`` and returned as tensors. When a
    model is attached (see ``set_model``), the item also carries that model's
    predicted label ids, which are used as pseudo-labels for self-training.

    Args:
        dataframe: Frame with ``sentence`` and ``word_labels`` columns.
        tokenizer: Tokenizer providing ``tokenize`` and ``convert_tokens_to_ids``.
        max_len: Fixed output sequence length.
        model: Optional model used to produce the ``'predictions'`` field.
        model_name: Free-form name of the attached model (used in log output).
    """

    def __init__(self, dataframe, tokenizer, max_len, model=None, model_name=""):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.model = model
        self.model_name = model_name

    def set_model(self, model, model_name=""):
        """Attach a frozen deep copy of ``model`` as the pseudo-label source."""
        # Drop the previous copy first so its memory can be released.
        del self.model
        torch.cuda.empty_cache()
        self.model = copy.deepcopy(model)
        self.model_name = model_name

    def __getitem__(self, index):
        """Return the encoded sample at position ``index``.

        Returns a dict of 1-D long tensors of length ``max_len``:
        ``'ids'``, ``'mask'``, ``'targets'`` (gold label ids) and
        ``'predictions'`` (attached model's argmax label ids, or all ``'O'``
        when no model is attached). Predictions are produced for every
        position, padding included.
        """
        # step 1: tokenize (and adapt corresponding labels)
        # Positional access keeps this correct even if the frame's index is
        # not a clean 0..n-1 range (e.g. after pd.concat).
        sentence = self.data["sentence"].iloc[index]
        word_labels = self.data["word_labels"].iloc[index]
        tokens, labels = tokenize_and_preserve_labels(sentence, word_labels, self.tokenizer)

        # step 2: truncate FIRST, leaving room for the two special tokens, so
        # that long inputs do not lose their <sep>/<cls> markers.
        maxlen = self.max_len
        room = max(maxlen - 2, 0)
        tokens = tokens[:room]
        labels = labels[:room]

        # step 3: add special tokens (labeled as outside)
        if model_type == 'XLNet':
            tokens = tokens + [SEP_TOKEN, CLS_TOKEN]
            labels = labels + ["O", "O"]
        else:
            tokens = [CLS_TOKEN] + tokens + [SEP_TOKEN]
            labels = ["O"] + labels + ["O"]
        # Only relevant for max_len < 2: never exceed the requested length.
        tokens = tokens[:maxlen]
        labels = labels[:maxlen]

        # step 4: pad and build the attention mask from the real length
        n_real = len(tokens)
        n_pad = maxlen - n_real
        tokens = tokens + [PAD_TOKEN] * n_pad
        labels = labels + ["O"] * n_pad
        attn_mask = [1] * n_real + [0] * n_pad

        # step 5: convert tokens and labels to ids
        ids = self.tokenizer.convert_tokens_to_ids(tokens)
        label_ids = [label2id[label] for label in labels]

        # step 6: pseudo-labels from the attached model (if any)
        if self.model is not None:
            self.model.eval()
            model_device = next(self.model.parameters()).device
            ids_t = torch.tensor(ids, dtype=torch.long, device=model_device).unsqueeze(0)
            mask_t = torch.tensor(attn_mask, dtype=torch.long, device=model_device).unsqueeze(0)
            # Inference only: no autograd graph, no loss needed.
            with torch.no_grad():
                logits = self.model(input_ids=ids_t, attention_mask=mask_t).logits
            prediction_ids = torch.argmax(logits, dim=-1).view(-1).cpu()
        else:
            # Same type/shape as the model branch so batching behaves the same.
            prediction_ids = torch.full((len(labels),), label2id['O'], dtype=torch.long)

        return {
              'ids': torch.tensor(ids, dtype=torch.long),
              'mask': torch.tensor(attn_mask, dtype=torch.long),
              'targets': torch.tensor(label_ids, dtype=torch.long),
              'predictions': prediction_ids
        }

    def __len__(self):
        return self.len

    def __iter__(self):
        """Iterate over all samples, sharding by worker when used with workers."""
        worker_info = torch.utils.data.get_worker_info()
        if worker_info is None:
            return map(self.__getitem__, range(self.__len__()))

        per_worker = int(math.ceil((self.__len__()) / float(worker_info.num_workers)))
        worker_id = worker_info.id
        iter_start = worker_id * per_worker
        iter_end = min(iter_start + per_worker, self.__len__())
        return map(self.__getitem__, range(iter_start, iter_end))

# Wrap every DataFrame in the torch dataset defined above (no model attached yet).
training_set = dataset(train_dataset, tokenizer, MAX_LEN)
self_training_set_1 = dataset(self_training_dataset_1, tokenizer, MAX_LEN)
self_training_set_2 = dataset(self_training_dataset_2, tokenizer, MAX_LEN)
self_training_set = dataset(self_training_dataset, tokenizer, MAX_LEN)
validation_set = dataset(validation_dataset, tokenizer, MAX_LEN)
testing_set = dataset(test_dataset, tokenizer, MAX_LEN)
# One dataset per test file, for the per-deck concept evaluation.
test_block_set = [dataset(data_set, tokenizer, MAX_LEN) for data_set in test_block_datasets]

# print the first 30 tokens and corresponding labels
for token, label in zip(tokenizer.convert_ids_to_tokens(training_set[0]["ids"][:30]), training_set[0]["targets"][:30]):
  print('{0:10}  {1}'.format(token, id2label[label.item()]))


train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# NOTE(review): validation batches are shuffled too — confirm this is intended.
valid_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': 1,
                'shuffle': False,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
self_training_loader_1 = DataLoader(self_training_set_1, **train_params)
self_training_loader_2 = DataLoader(self_training_set_2, **train_params)
self_training_loader = DataLoader(self_training_set, **train_params)
validation_loader = DataLoader(validation_set, **valid_params)
testing_loader = DataLoader(testing_set, **test_params)
test_block_loader = [DataLoader(data_set, **test_params) for data_set in test_block_set]

**defining the model:**

In [None]:
# Instantiate the pretrained token-classification model matching model_type,
# with a 3-way (B/I/O) classification head, and move it to the compute device.
if model_type == 'XLNet':
  model = XLNetForTokenClassification.from_pretrained('xlnet-base-cased',
                                                   num_labels=len(id2label),
                                                   id2label=id2label,
                                                   label2id=label2id)
else:
  model = BertForTokenClassification.from_pretrained('bert-base-uncased',
                                                   num_labels=len(id2label),
                                                   id2label=id2label,
                                                   label2id=label2id)
model.to(device)

**training the model:**

In [None]:
#ids = training_set[0]["ids"].unsqueeze(0)
#mask = training_set[0]["mask"].unsqueeze(0)
#targets = training_set[0]["targets"].unsqueeze(0)
#ids = ids.to(device)
#mask = mask.to(device)
#targets = targets.to(device)
#outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
#initial_loss = outputs[0]
#tr_logits = outputs[1]
# When model_reload is set, replace the freshly initialised model with a
# previously fine-tuned checkpoint; supervised training is then skipped
# (see the `if not model_reload` guard after train_epochs).
# NOTE(review): the reload always uses the XLNet class, regardless of
# model_type — confirm before switching model_type to BERT.
model_reload=True
if model_reload:
  model = XLNetForTokenClassification.from_pretrained("AwesomeREK/concept-extraction-xlnet-early-stopping",
                                                   num_labels=len(id2label),
                                                   id2label=id2label,
                                                   label2id=label2id)
model.to(device)
optimizer = torch.optim.Adam(params=model.parameters(), lr=LEARNING_RATE)

def get_chunks(seq):
    """Return the ``(start, end)`` spans of all chunks in a B/I/O tag sequence.

    Adapted from the BOND paper. A chunk opens at every ``"B"`` and closes
    (end exclusive) at the next ``"O"``, the next ``"B"``, or the end of the
    sequence. Tags other than ``"B"``/``"O"`` never open or close a chunk.
    """
    spans = []
    open_at = None
    for pos, tag in enumerate(seq):
        if tag not in ("O", "B"):
            continue
        # Both "O" and "B" terminate whatever chunk is currently open.
        if open_at is not None:
            spans.append((open_at, pos))
            open_at = None
        if tag == "B":
            open_at = pos
    if open_at is not None:
        spans.append((open_at, len(seq)))
    return spans

# Defining the training function on the 80% of the dataset for tuning the xlnet model
def train(model, optimizer, loader, epoch, gold=True, max_grad_norm=None):
    """Run one training epoch of ``model`` over ``loader``.

    Args:
        model: Token-classification model whose forward call accepts
            ``input_ids``, ``attention_mask`` and ``labels`` and returns an
            object with ``loss`` and ``logits``.
        optimizer: Optimizer over ``model``'s parameters.
        loader: Iterable of batches (dicts with ``'ids'``, ``'mask'`` and the
            label field selected by ``gold``).
        epoch: Index of the current epoch (not used by the computation).
        gold: Train on ``'targets'`` when true, otherwise on ``'predictions'``
            (pseudo-labels).
        max_grad_norm: Gradient-norm clipping threshold; defaults to the
            module-level ``MAX_GRAD_NORM``.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    if max_grad_norm is None:
        max_grad_norm = MAX_GRAD_NORM
    field = 'targets' if gold else 'predictions'
    # Send batches to wherever the model's weights live.
    model_device = next(model.parameters()).device

    tr_loss, tr_accuracy = 0.0, 0.0
    nb_tr_steps = 0
    # put model in training mode
    model.train()

    for idx, batch in enumerate(loader):
        ids = batch['ids'].to(model_device, dtype=torch.long)
        mask = batch['mask'].to(model_device, dtype=torch.long)
        targets = batch[field].to(model_device, dtype=torch.long)

        outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
        loss, tr_logits = outputs.loss, outputs.logits
        tr_loss += loss.item()
        nb_tr_steps += 1

        if idx % 100==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss per 100 training steps: {loss_step}")

        # Token accuracy over non-padding positions (special tokens included).
        with torch.no_grad():
            active = mask.view(-1) == 1
            active_targets = torch.masked_select(targets.view(-1), active)
            active_predictions = torch.masked_select(
                torch.argmax(tr_logits, dim=-1).view(-1), active
            )
            tr_accuracy += (active_predictions == active_targets).float().mean().item()

        # Backward pass. Clipping must come AFTER backward() (so there are
        # fresh gradients to clip) and BEFORE step() (so the clipped values
        # are the ones applied).
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=max_grad_norm
        )
        optimizer.step()

    if nb_tr_steps == 0:
        raise ValueError("train() received a loader that yields no batches")

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

from time import sleep
def valid(model, loader):
    """Evaluate ``model`` on ``loader`` against the gold ``'targets'``.

    Prints the mean loss, the mean per-batch token accuracy and chunk-level
    precision/recall/F1 (exact span match over the concatenation of all
    non-padding positions, special tokens included).

    Returns:
        ``(eval_loss, labels, predictions)``: the mean batch loss and the
        flat lists of gold and predicted tag strings.

    Raises:
        ValueError: If ``loader`` yields no batches.
    """
    # put model in evaluation mode
    model.eval()

    eval_loss, eval_accuracy = 0, 0
    nb_eval_steps = 0
    eval_preds, eval_labels = [], []

    with torch.no_grad():
        for idx, batch in enumerate(loader):

            ids = batch['ids'].to(device, dtype = torch.long)
            mask = batch['mask'].to(device, dtype = torch.long)
            targets = batch['targets'].to(device, dtype = torch.long)

            outputs = model(input_ids=ids, attention_mask=mask, labels=targets)
            loss, eval_logits = outputs.loss, outputs.logits

            eval_loss += loss.item()
            nb_eval_steps += 1

            if idx % 100==0:
                loss_step = eval_loss/nb_eval_steps
                print(f"Validation loss per 100 evaluation steps: {loss_step}")

            # compute evaluation accuracy
            flattened_targets = targets.view(-1) # shape (batch_size * seq_len,)
            active_logits = eval_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
            flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)
            # use the mask to keep only non-padding positions (includes [CLS] and [SEP] token predictions)
            active_accuracy = mask.view(-1) == 1 # also of shape (batch_size * seq_len,)
            targets = torch.masked_select(flattened_targets, active_accuracy)
            predictions = torch.masked_select(flattened_predictions, active_accuracy)

            # One bulk transfer per batch instead of one 0-d tensor (and a
            # later .item() call) per token.
            eval_labels.extend(targets.tolist())
            eval_preds.extend(predictions.tolist())

            tmp_eval_accuracy = accuracy_score(targets.cpu().numpy(), predictions.cpu().numpy())
            eval_accuracy += tmp_eval_accuracy

    if nb_eval_steps == 0:
        raise ValueError("valid() received a loader that yields no batches")

    labels = [id2label[label_id] for label_id in eval_labels]
    predictions = [id2label[label_id] for label_id in eval_preds]

    eval_loss = eval_loss / nb_eval_steps
    eval_accuracy = eval_accuracy / nb_eval_steps

    # Chunk-level scores: a predicted span counts only on an exact match.
    label_chunks, pred_chunks = set(get_chunks(labels)), set(get_chunks(predictions))
    print(label_chunks)
    print(pred_chunks)
    correct_preds = len(label_chunks & pred_chunks)
    total_preds = len(pred_chunks)
    total_correct = len(label_chunks)

    p   = correct_preds / total_preds if correct_preds > 0 else 0
    r   = correct_preds / total_correct if correct_preds > 0 else 0
    new_F  = 2 * p * r / (p + r) if correct_preds > 0 else 0

    print(f"Validation Loss: {eval_loss}")
    print(f"Validation Accuracy: {eval_accuracy}")
    print(f"Chunk Precision: {p}")
    print(f"Chunk Recall: {r}")
    print(f"Chunk F1: {new_F}")

    return eval_loss, labels, predictions

def train_epochs(model, optimizer, train_loader, dev_loader, num_epochs):
    """Train for up to ``num_epochs`` epochs with validation-based early stopping.

    After every epoch the model is scored on ``dev_loader`` (only when the
    module-level ``is_validation`` flag is set; a stop signal from the early
    stopper ends training immediately) and then on the global
    ``testing_loader``; a seqeval report is printed for each.
    """
    stopper = EarlyStopper(patience=5, min_delta=0.01)
    for epoch_idx in range(num_epochs):
        print(f"Training epoch: {epoch_idx + 1}")
        train(model, optimizer, train_loader, epoch_idx)
        if is_validation:
            dev_loss, dev_gold, dev_pred = valid(model, dev_loader)
            print(classification_report([dev_gold], [dev_pred]))
            if stopper.early_stop(dev_loss):
                break
        _, test_gold, test_pred = valid(model, testing_loader)
        print(classification_report([test_gold], [test_pred]))

# Supervised fine-tuning is skipped when a fine-tuned checkpoint was reloaded.
if not model_reload:
  train_epochs(model, optimizer, training_loader, validation_loader, EPOCHS)

Training epoch: 1
Training loss per 100 training steps: 3.6240651607513428
Training loss per 100 training steps: 0.22154512774604973
Training loss epoch: 0.15658160727074805
Training accuracy epoch: 0.8790059430434554
Validation loss per 100 evaluation steps: 0.024863142520189285
Validation loss per 100 evaluation steps: 0.05877726149369529
{(4697, 4700), (25514, 25515), (9988, 9990), (3640, 3642), (16368, 16370), (3910, 3911), (5590, 5591), (26939, 26941), (8101, 8102), (8932, 8933), (21408, 21409), (5591, 5592), (5623, 5624), (12521, 12524), (21441, 21442), (5624, 5625), (25613, 25614), (18384, 18385), (28426, 28427), (8167, 8168), (13749, 13750), (5981, 5983), (6553, 6554), (13222, 13223), (26831, 26832), (7705, 7706), (14872, 14874), (25680, 25681), (735, 737), (12103, 12104), (17084, 17089), (23472, 23473), (24594, 24596), (768, 770), (17157, 17158), (24084, 24085), (8299, 8300), (26048, 26049), (6911, 6914), (21330, 21331), (3840, 3842), (3053, 3054), (26898, 26899), (8300, 8301)

**evaluating the model before self-training:**

In [None]:
import copy
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

#model.to(device)
# Score the current model on the test set: token-level report plus a
# B/I/O confusion matrix.
_, labels, predictions = valid(model, testing_loader)
print(labels[:30], predictions[:30])
print(classification_report([labels], [predictions]))
conf=confusion_matrix(labels, predictions, labels=["B", "I", "O"])
disp = ConfusionMatrixDisplay(confusion_matrix=conf, display_labels=["B", "I", "O"])
disp.plot()
plt.show()

def get_predictions(model, loader, model_name=""):
    """Return a copy of ``loader``'s dataset with ``model`` attached.

    The loader's own dataset is left untouched; the returned copy has
    ``set_model(model, model_name)`` applied to it.
    """
    labelled_copy = copy.deepcopy(loader.dataset)
    labelled_copy.set_model(model, model_name)
    return labelled_copy


def eval_dummy():
    """Print the scores of a most-frequent-label baseline on the test set.

    Words and labels of the global ``train_dataset`` / ``test_dataset`` frames
    are flattened to token level; a ``DummyClassifier`` fitted on the training
    tokens then predicts a label for every test token.
    """
    dummy = DummyClassifier(strategy='most_frequent')
    train_words = [w for sentence in train_dataset["sentence"] for w in sentence]
    train_labels = [l for labels in train_dataset["word_labels"] for l in labels]
    test_words = [w for sentence in test_dataset["sentence"] for w in sentence]
    test_labels = [l for labels in test_dataset["word_labels"] for l in labels]
    dummy.fit(train_words, train_labels)

    predictions = dummy.predict(test_words).tolist()
    # Compare gold labels with the predictions directly; score(X, y) expects
    # input features as X, not predictions.
    print("Dummy Accuracy:", accuracy_score(test_labels, predictions))
    print(classification_report([test_labels], [predictions]))

def extract_concepts(dataset, gold=False):
    """Collect the set of concept strings tagged in ``dataset``.

    A concept starts at a ``"B"`` tag and extends over the following
    non-``"O"`` tags; its token ids are decoded with the global ``tokenizer``.

    Args:
        dataset: Iterable of samples (dicts with ``'ids'``, ``'mask'``,
            ``'targets'`` and ``'predictions'``).
        gold: Read tags from ``'targets'`` when true, else from ``'predictions'``.

    Returns:
        Set of decoded concept strings.
    """
    field = 'targets' if gold else 'predictions'
    concepts = set()
    for processed in dataset:
        token_ids = torch.as_tensor(processed['ids']).tolist()
        # as_tensor accepts both tensors and plain lists of label ids.
        tags = [id2label[t] for t in torch.as_tensor(processed[field]).tolist()]
        attended = torch.as_tensor(processed['mask']).tolist()

        current = []  # token ids of the concept being assembled
        for token_id, tag, keep in zip(token_ids, tags, attended):
            if not keep:
                # Padding carries no text; whatever was predicted there is noise.
                continue
            if tag == "O":
                if current:
                    concepts.add(tokenizer.decode(current))
                    current = []
            elif tag == "B":
                # A "B" on a "##" continuation piece extends the open concept
                # instead of starting a new one.
                # NOTE(review): "##" is the WordPiece (BERT) marker; other
                # tokenizers may mark continuations differently — confirm for XLNet.
                if current and not tokenizer.convert_ids_to_tokens(token_id).startswith("##"):
                    concepts.add(tokenizer.decode(current))
                    current = []
                current.append(token_id)
            elif current:
                # Inside tag: only meaningful while a concept is open.
                current.append(token_id)
        # A concept never spans two samples: flush what is still open.
        if current:
            concepts.add(tokenizer.decode(current))
    return concepts

def eval_deck(concepts, gold_concepts):
    """Count true positives, false positives and false negatives for one deck.

    Both arguments are sets of concept strings. The counts are printed and
    returned as a ``(tp, fp, fn)`` tuple.
    """
    counts = (
        len(gold_concepts & concepts),   # found and correct
        len(concepts - gold_concepts),   # found but not in gold
        len(gold_concepts - concepts),   # in gold but missed
    )
    print("TP: {}, FP: {}, FN: {}".format(*counts))
    return counts

def eval_concepts(model, deck_loader_list):
    """Print micro-averaged concept precision/recall/F1 over all decks.

    For every loader, ``model``'s predictions are attached to a copy of the
    deck's dataset, predicted and gold concept sets are extracted from it, and
    the per-deck TP/FP/FN counts are summed before the scores are computed.
    """
    totals = [0, 0, 0]  # tp, fp, fn
    for deck_loader in deck_loader_list:
        predicted_deck = get_predictions(model, deck_loader)
        found = extract_concepts(predicted_deck)
        reference = extract_concepts(predicted_deck, gold=True)
        for slot, count in enumerate(eval_deck(found, reference)):
            totals[slot] += count
    tp, fp, fn = totals

    p = 0
    if tp + fp > 0:
        p = tp / (tp + fp)
    r = 0
    if tp + fn > 0:
        r = tp / (tp + fn)
    f1 = 0
    if p + r > 0:
        f1 = 2 * (p * r) / (p + r)
    print(f"precision: {p}, recall: {r}, F1: {f1}")

# Per-deck concept-level evaluation on the test decks, then upload the
# current model to the Hugging Face Hub.
eval_concepts(model, test_block_loader)
model.push_to_hub("AwesomeREK/concept-extraction-xlnet")

#model.save_pretrained("./xlnet-concept-extractor-model-distant-half-labels/model/")
#tokenizer.save_pretrained("./xlnet-concept-extractor-model-distant-half-labels/tokenizer/")
#!zip -r xlnet-concept-extractor-model-distant-half-labels.zip xlnet-concept-extractor-model-distant-half-labels

Validation loss per 100 evaluation steps: 0.014405729249119759
Validation loss per 100 evaluation steps: 0.11982853178824428


# New Kind of Self-Training:

1. Divide unlabeled data into two separate sets of courses.

2. One model outputs predictions on one set of courses, the other the other set.

3. The first model learns based off of the second model's prediction for N epochs, and vice versa.

4. Then they swap sets, now being "experts", and output their predictions on those sets.

5. They train off of these predictions for N epochs

6. Repeat for K times, or until they do not do better on the validation set

7. Finally, pick the one with the lowest validation loss on the validation set

**self-training:**

In [None]:
#del model_1
#del model_2
#del model_best
torch.cuda.empty_cache()
# Three independent copies of the current model: two peers for self-training
# and one slot for the best model; each peer gets its own Adam optimizer.
model_1 = copy.deepcopy(model)
model_2 = copy.deepcopy(model)
model_best = copy.deepcopy(model)
model_1.to(device)
model_2.to(device)
model_best.to(device)
optimizer_1 = torch.optim.Adam(params=model_1.parameters(), lr=LEARNING_RATE)
optimizer_2 = torch.optim.Adam(params=model_2.parameters(), lr=LEARNING_RATE)



# Self-training schedule: number of rounds and epochs per round.
best_model_init = False
ROUNDS = 4
SELF_TRAIN_EPOCH = 5
# NOTE(review): presumably selects teacher-student ("ts") vs. peer-to-peer
# self-training — the code that reads it is not visible here; confirm.
self_training_type = "ts"

def _report_eval(title, model, loader):
    """Print ``title``, evaluate ``model`` on ``loader``, print the seqeval report and return the loss."""
    print(title)
    loss, labels, predictions = valid(model, loader)
    print(classification_report([labels], [predictions]))
    return loss


def p2p_self_train(model_1, model_2, model_best, optimizer_1, optimizer_2, self_training_loader_1, self_training_loader_2, validation_loader, rounds, self_epochs, best_model_init=False):
    """Peer-to-peer self-training of two models on each other's pseudo-labels.

    In every round each model trains for ``self_epochs`` epochs on the
    pseudo-labels the *other* model produced for its half of the unlabeled
    data. The two halves are then swapped and re-labeled for the next round.

    When the module-level ``is_validation`` flag is set, the model with the
    lower validation loss of the current round becomes ``model_best`` and an
    early stopper over rounds may end training. With ``best_model_init`` both
    peers are re-initialised from ``model_best`` after every second round and
    after the last round.

    Args:
        model_1, model_2: The two peer models (trained in place unless
            replaced through ``best_model_init``).
        model_best: Model returned when no validation is performed.
        optimizer_1, optimizer_2: Optimizers of the respective peers.
        self_training_loader_1, self_training_loader_2: Loaders over the two
            halves of the unlabeled data.
        validation_loader: Loader used for model selection / early stopping.
        rounds: Maximum number of label-swap rounds.
        self_epochs: Training epochs per round.
        best_model_init: See above.

    Returns:
        The best model of the last completed validation (or the ``model_best``
        argument if validation is disabled).
    """
    predicted_training_loader_1 = DataLoader(get_predictions(model_1, self_training_loader_1, model_name="model_1"), **train_params)
    predicted_training_loader_2 = DataLoader(get_predictions(model_2, self_training_loader_2, model_name="model_2"), **train_params)
    early_stopper = EarlyStopper(patience=3, min_delta=0.01)
    for round_idx in range(rounds):
        print(f"Training round: {round_idx + 1}")
        for epoch in range(self_epochs):
            print(f"Training epoch: {epoch + 1}")
            print(f"...Training Model 1 on {predicted_training_loader_2.dataset.model_name}'s predictions")
            train(model_1, optimizer_1, predicted_training_loader_2, epoch, gold=False)
            print(f"...Training Model 2 on {predicted_training_loader_1.dataset.model_name}'s predictions")
            train(model_2, optimizer_2, predicted_training_loader_1, epoch, gold=False)
            if is_validation:
                _report_eval("MODEL 1 VALIDATION", model_1, validation_loader)
                _report_eval("MODEL 2 VALIDATION", model_2, validation_loader)
            _report_eval("MODEL 1 TEST", model_1, testing_loader)
            _report_eval("MODEL 2 TEST", model_2, testing_loader)

        # Swap the halves so each model labels the data it has just learned about.
        self_training_loader_1, self_training_loader_2 = self_training_loader_2, self_training_loader_1
        if is_validation:
            loss_1 = _report_eval("MODEL 1 VALIDATION", model_1, validation_loader)
            loss_2 = _report_eval("MODEL 2 VALIDATION", model_2, validation_loader)
            model_best = copy.deepcopy(model_1 if loss_1 < loss_2 else model_2)
            model_best.to(device)
            if early_stopper.early_stop(min(loss_1, loss_2)):
                break
        _report_eval("MODEL 1 TEST", model_1, testing_loader)
        _report_eval("MODEL 2 TEST", model_2, testing_loader)
        if is_validation:
            _report_eval("BEST MODEL TEST", model_best, testing_loader)
            # Uses the `rounds` argument (not the global ROUNDS) so the
            # "last round" check matches the loop actually being run.
            if best_model_init and (round_idx % 2 == 1 or round_idx == rounds - 1):
                model_1 = copy.deepcopy(model_best)
                model_2 = copy.deepcopy(model_best)
                model_1.to(device)
                model_2.to(device)
                optimizer_1 = torch.optim.Adam(params=model_1.parameters(), lr=LEARNING_RATE)
                optimizer_2 = torch.optim.Adam(params=model_2.parameters(), lr=LEARNING_RATE)

        # Re-label the (swapped) halves with the updated models.
        predicted_training_loader_1 = DataLoader(get_predictions(model_1, self_training_loader_1, model_name="model_1"), **train_params)
        predicted_training_loader_2 = DataLoader(get_predictions(model_2, self_training_loader_2, model_name="model_2"), **train_params)
    return model_best
def ts_self_train(teacher_model, student_model, teacher_optimizer, student_optimizer, self_training_loader, validation_loader, rounds, self_epochs):
  """Run teacher-student self-training and return the student model.

  The teacher labels ``self_training_loader`` through ``get_predictions`` and
  the student is trained on those predictions for up to ``self_epochs`` epochs.
  At the end of each round (except the last) a deep copy of the student becomes
  the next teacher and the predictions are regenerated.

  Reads the notebook-level names ``is_validation``, ``testing_loader`` and
  ``train_params``.

  Args:
    teacher_model: model whose predictions seed the first round.
    student_model: model passed to ``train`` every epoch; this same object is
      returned.
    teacher_optimizer: unused; kept so existing callers keep working.
    student_optimizer: optimizer handed to ``train`` for the student.
    self_training_loader: loader over the data the teacher labels.
    validation_loader: loader used for the early-stopping checks when
      ``is_validation`` is truthy.
    rounds: maximum number of teacher/student rounds.
    self_epochs: maximum number of student epochs per round.

  Returns:
    ``student_model`` as it stands after the last round that ran.
  """
  def evaluate_student(header, loader):
    # Print `header`, score the student on `loader`, print the report and
    # hand back the loss so callers can feed it to an early stopper.
    print(header)
    loss, labels, predictions = valid(student_model, loader)
    print(classification_report([labels], [predictions]))
    return loss

  def label_with(model):
    # Wrap the model's predictions on the self-training data in a DataLoader.
    return DataLoader(get_predictions(model, self_training_loader, model_name="teacher_model"), **train_params)

  predicted_training_loader = label_with(teacher_model)
  # One stopper across rounds; a fresh per-epoch stopper is made inside each round.
  early_stopper_round = EarlyStopper(patience=3, min_delta=0.01)
  # `round_idx` rather than `round`, so the builtin is not shadowed.
  for round_idx in range(rounds):
    print(f"Training round: {round_idx + 1}")
    early_stopper = EarlyStopper(patience=3, min_delta=0.01)
    for epoch in range(self_epochs):
      print(f"Training epoch: {epoch + 1}")
      print(f"...Training student_model on {predicted_training_loader.dataset.model_name}'s predictions")
      # gold=False: presumably tells `train` the labels are model predictions
      # rather than annotations -- confirm against `train`.
      train(student_model, student_optimizer, predicted_training_loader, epoch, gold=False)
      if is_validation:
        epoch_loss = evaluate_student("STUDENT VALIDATION", validation_loader)
        if early_stopper.early_stop(epoch_loss):
          break
      evaluate_student("STUDENT MODEL TEST", testing_loader)
    # 1-based, matching the "Training round" banner above.
    print(f"ROUND {round_idx + 1} COMPLETE!")
    if is_validation:
      round_loss = evaluate_student("STUDENT VALIDATION", validation_loader)
      if early_stopper_round.early_stop(round_loss):
        break
    evaluate_student("STUDENT MODEL TEST", testing_loader)
    # Only promote the student and relabel when another round will consume the
    # result; after the final round that work would be thrown away.
    if round_idx + 1 < rounds:
      teacher_model = copy.deepcopy(student_model)
      predicted_training_loader = label_with(teacher_model)
  return student_model
# Run the selected self-training strategy and keep the model it returns as
# `model_best`.
if self_training_type == "p2p":
  model_best = p2p_self_train(
    model_1,
    model_2,
    model_best,
    optimizer_1,
    optimizer_2,
    self_training_loader_1,
    self_training_loader_2,
    validation_loader,
    rounds=ROUNDS,
    self_epochs=SELF_TRAIN_EPOCH,
    best_model_init=best_model_init,
  )
elif self_training_type == "ts":
  model_best = ts_self_train(
    teacher_model=model_1,
    student_model=model_2,
    teacher_optimizer=optimizer_1,
    student_optimizer=optimizer_2,
    self_training_loader=self_training_loader,
    validation_loader=validation_loader,
    rounds=ROUNDS,
    self_epochs=SELF_TRAIN_EPOCH,
  )

Training round: 1
Training epoch: 1
...Training student_model on teacher_model's predictions
Training loss per 100 training steps: 0.027964100241661072
Training loss per 100 training steps: 0.013893163809091738
Training loss per 100 training steps: 0.014322738472455696
Training loss per 100 training steps: 0.013915067093069371
Training loss per 100 training steps: 0.013977241519480906
Training loss epoch: 0.013977241519480906
Training accuracy epoch: 0.9795702556349547
STUDENT VALIDATION
Validation loss per 100 evaluation steps: 0.03717029094696045
{(6975, 6976), (8035, 8036), (3046, 3047), (9426, 9427), (3257, 3258), (5859, 5860), (2518, 2519), (10468, 10469), (2610, 2611), (3969, 3970), (10427, 10428), (628, 629), (9427, 9428), (11258, 11259), (132, 133), (2189, 2190), (4608, 4609), (10988, 10989), (1551, 1552), (4741, 4742), (41, 42), (812, 813), (8252, 8253), (7739, 7740), (12062, 12063), (12760, 12761), (3204, 3205), (4264, 4265), (12365, 12366), (10006, 10007), (166, 167), (2585,

In [None]:
# Log in to the Hugging Face Hub from inside the notebook (renders a login
# widget); the evaluation cell below calls `push_to_hub`.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

**evaluating the model:**

In [None]:
# Score both trained models on the test loader: print a 30-item sample of
# labels vs. predictions, the classification report, then run eval_concepts.
for scored_model in (model_1, model_2):
  _, labels, predictions = valid(scored_model, testing_loader)
  print(labels[:30], predictions[:30])
  print(classification_report([labels], [predictions]))
  eval_concepts(scored_model, test_block_loader)

# With validation enabled, repeat the report for `model_best`, plot its
# confusion matrix over the B/I/O tags, and push that model to the Hub.
if is_validation:
  _, labels, predictions = valid(model_best, testing_loader)
  print(labels[:30], predictions[:30])
  print(classification_report([labels], [predictions]))
  conf = confusion_matrix(
    labels,
    predictions,
    labels=["B", "I", "O"],
  )
  disp = ConfusionMatrixDisplay(
    confusion_matrix=conf,
    display_labels=["B", "I", "O"],
  )
  disp.plot()
  plt.show()
  eval_concepts(model_best, test_block_loader)
  model_best.push_to_hub("AwesomeREK/concept-extraction-xlnet-early-stopping-teacher-student-self-trained")

# Release cached GPU memory once evaluation is finished.
torch.cuda.empty_cache()

In [None]:
# Load a published XLNet token-classification checkpoint from the Hub, using
# this notebook's label mappings for the classification head.
model = XLNetForTokenClassification.from_pretrained(
  "AwesomeREK/concept-extraction-reduced-distant-enhanced-early-stopping-self-trained-xlnet-model",
  num_labels=len(id2label),
  id2label=id2label,
  label2id=label2id,
)
# Kept as the cell's final expression so the notebook still echoes the model.
model.to(device)

**inference:**

In [None]:
sentence = "The client node in a wide area network, which connects to a router node."

# Tokenize into a single padded/truncated sequence of length MAX_LEN.
inputs = tokenizer(sentence, padding='max_length', truncation=True, max_length=MAX_LEN, return_tensors="pt")

# move to gpu
ids = inputs["input_ids"].to(device)
mask = inputs["attention_mask"].to(device)

# forward pass -- inference only, so skip building the autograd graph; keyword
# arguments avoid depending on the positional order of the model's forward().
with torch.no_grad():
  outputs = model(input_ids=ids, attention_mask=mask)
logits = outputs[0]

active_logits = logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size*seq_len,) - predictions at the token level

tokens = tokenizer.convert_ids_to_tokens(ids.squeeze().tolist())
token_predictions = [id2label[i] for i in flattened_predictions.cpu().numpy()]
wp_preds = list(zip(tokens, token_predictions)) # list of tuples. Each tuple = (wordpiece, prediction)

special_tokens = (CLS_TOKEN, SEP_TOKEN, PAD_TOKEN)

# Keep one prediction per word: drop special tokens and WordPiece continuation
# pieces. Individual tokens carry a bare "##" prefix (e.g. "##r"); the " ##"
# form with a leading space only exists in the space-joined string below, so
# testing for it here never matched and sub-word predictions leaked through.
# NOTE(review): SentencePiece tokenizers (e.g. XLNet's) do not emit "##"
# pieces, so with them this filter only strips special tokens -- confirm
# whether continuation pieces need handling there too.
word_level_predictions = [
  label
  for piece, label in wp_preds
  if not piece.startswith("##") and piece not in special_tokens
]

# we join tokens, if they are not special ones
str_rep = " ".join([piece for piece, _ in wp_preds if piece not in special_tokens]) # .replace(" ##", "") if you want the original sentence (but wont align with labels...
print(str_rep)
print(word_level_predictions)

[CLS] the client node in a wide area network , which connects to a route ##r node . [SEP] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD] [PAD