# Import libraries & load data

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import torch
from sklearn.metrics import classification_report

# Read the EPRL CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer='EPRL_data.csv')


In [None]:
# Preview the first five rows to confirm the CSV was read as expected
data.head(n=5)

Unnamed: 0,Procedure Type,PROCEDURE NAME,Procedure End Goal,Procedure File Number,Step Number,EPRL,TEXT
0,Manual Manipulation of Items,Reconfigure HAL for EVA,Configure the habitable airlock for EVA by rem...,HAL_1_0.pdf,1,Action (What),Stow monitors against the wall
1,Manual Manipulation of Items,Reconfigure HAL for EVA,Configure the habitable airlock for EVA by rem...,HAL_1_0.pdf,2,Action (What),Stow the keyboards against the wall
2,Manual Manipulation of Items,Reconfigure HAL for EVA,Configure the habitable airlock for EVA by rem...,HAL_1_0.pdf,3,Action (What),Remove the seat cushion
3,Manual Manipulation of Items,Reconfigure HAL for EVA,Configure the habitable airlock for EVA by rem...,HAL_1_0.pdf,4,Action (What),Fold the chair backs forward
4,Manual Manipulation of Items,Reconfigure HAL for EVA,Configure the habitable airlock for EVA by rem...,HAL_1_0.pdf,5,Action (What),Detach crew hygiene kit


In [None]:
# Ensure the transformers library is installed.
# %pip installs into the environment of the running kernel, and the version is
# pinned to the release this notebook was last run with so results stay reproducible.
%pip install transformers==4.28.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m50.9 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m93.3 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.11.0
  Downloading huggingface_hub-0.13.4-py3-none-any.whl (200 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m200.1/200.1 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.13.4 tokenizers-0.13.3 transformers-4.28.1


# BERT-base-uncased

Here, we are using BertForSequenceClassification and BertTokenizer classes from the transformers library. We are using the 'bert-base-uncased' pre-trained model, which does not distinguish between upper-case and lower-case letters.

For smaller datasets, bert-base-uncased is generally more appropriate than bert-large-uncased since it has fewer parameters, requires less memory, and is overall less computationally expensive. Bert-large models may lead to overfitting on small datasets.

In [None]:
# Import the BERT model and tokenizer
from transformers import BertForSequenceClassification, BertTokenizer

# Define the mapping from EPRL tags to integers. If there are additional tags to be included, add them now.
eprl_map = {
    'Action (How)': 0,
    'Action (What)': 1,
    'Action (Where)': 2,
    'Decision (What)': 3,
    'Trigger (How)': 4,
    'Trigger (What)': 5,
    'Trigger (Where)': 6,
    'Verification (How)': 7,
    'Verification (What)': 8,
    'Verification (Where)': 9,
    'Waiting (How)': 10,
    'Waiting (What)': 11,
    'Waiting (Where)': 12
}

# Map the EPRL tags to integers.
# `data` is shared by every model section of this notebook, so the column is only
# mapped while it does not hold integers yet: mapping an already-encoded column
# again would silently turn every label into NaN.
if not pd.api.types.is_integer_dtype(data['EPRL']):
    eprl_ids = data['EPRL'].map(eprl_map)
    unknown_tags = data.loc[eprl_ids.isna(), 'EPRL'].unique()
    if len(unknown_tags) > 0:
        # Fail here with a clear message instead of later inside the loss computation
        raise ValueError(f'EPRL values missing from eprl_map: {list(unknown_tags)}')
    data['EPRL'] = eprl_ids.astype('int64')

# Load the BERT model (with one output per EPRL tag) and its tokenizer
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(eprl_map))
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Define a function to preprocess the text data
def preprocess(text):
    """Tokenize one text with the module-level ``tokenizer``.

    Args:
        text: The text to encode.

    Returns:
        The encoding returned by ``tokenizer.encode_plus`` with PyTorch tensors;
        it contains ``input_ids`` and ``attention_mask``, each padded or
        truncated to 64 tokens.
    """
    encoded_text = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=64,
        padding='max_length',
        # Without explicit truncation, texts longer than max_length keep their
        # full length and the per-row tensors can no longer be concatenated.
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    return encoded_text

from torch.utils.data import TensorDataset, DataLoader

# Tokenize every TEXT entry, then join the per-row encodings along the first dimension
inputs = data['TEXT'].apply(preprocess)
input_ids = torch.cat([encoding['input_ids'] for encoding in inputs], dim=0)
attention_masks = torch.cat([encoding['attention_mask'] for encoding in inputs], dim=0)
labels = torch.tensor(data['EPRL'].values)

# Bundle ids, masks and labels and serve them in batches of 32 (default order, no shuffling)
dataset = TensorDataset(input_ids, attention_masks, labels)
dataloader = DataLoader(dataset=dataset, batch_size=32)

# Define the training and evaluation loops
def train(model, dataloader, optimizer):
    """Run one training epoch over ``dataloader``.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss``.
        dataloader: Iterable of batches indexable as
            ``(input_ids, attention_mask, labels)``.
        optimizer: Optimizer that updates the model's parameters.

    Returns:
        None. The model's parameters are updated in place.
    """
    model.train()
    # Batches are sent to the device the model lives on, so the loop also works
    # after the model has been moved to a GPU; on CPU this is a no-op.
    first_param = next(model.parameters(), None)
    for batch in dataloader:
        if first_param is not None:
            batch = [tensor.to(first_param.device) for tensor in batch]
        optimizer.zero_grad()
        outputs = model(input_ids=batch[0], attention_mask=batch[1], labels=batch[2])
        outputs.loss.backward()
        optimizer.step()

def evaluate(model, dataloader):
    """Predict every batch of ``dataloader`` and build a classification report.

    Args:
        model: Model called with ``input_ids`` and ``attention_mask``; its
            output must expose ``.logits``.
        dataloader: Iterable of batches indexable as
            ``(input_ids, attention_mask, labels)``.

    Returns:
        The text report produced by ``classification_report``, with one row
        per tag in the module-level ``eprl_map``.
    """
    model.eval()
    # Inputs are sent to the device the model lives on (no-op on CPU)
    first_param = next(model.parameters(), None)
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask = batch[0], batch[1]
            if first_param is not None:
                input_ids = input_ids.to(first_param.device)
                attention_mask = attention_mask.to(first_param.device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions.extend(outputs.logits.argmax(-1).tolist())
            true_labels.extend(batch[2].tolist())
    # Passing `labels` explicitly keeps the report aligned with `target_names`
    # even when a class is absent from both the true and predicted labels;
    # without it classification_report raises on the length mismatch.
    report = classification_report(
        true_labels,
        predictions,
        labels=list(eprl_map.values()),
        target_names=list(eprl_map.keys()),
    )
    return report

# Training configuration: 5 epochs with the AdamW optimizer
num_epochs = 5
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5)

# Fine-tune the model and print a classification report after every epoch.
# The report is computed on `dataloader`, i.e. on the same data used for training.
for epoch in range(num_epochs):
    train(model=model, dataloader=dataloader, optimizer=optimizer)
    report = evaluate(model=model, dataloader=dataloader)
    print(f'Epoch {epoch + 1}:\n{report}')

# Predict a label for every row of `data`.
# NOTE: `dataloader` wraps the same data the model was trained on; this notebook
# does not create a separate test set, so these are in-sample predictions.
model.eval()
predictions = []
with torch.no_grad():
    for batch in dataloader:
        outputs = model(input_ids=batch[0], attention_mask=batch[1])
        predictions.extend(outputs.logits.argmax(-1).tolist())

# Map the predicted integer labels back to EPRL tags.
# The Series is built on data's own index so the row-wise assignment below stays
# aligned even if `data` does not have a default 0..n-1 index. This relies on
# `dataloader` yielding rows in their original order (it is not shuffled).
predicted_eprl = pd.Series(predictions, index=data.index).map({v: k for k, v in eprl_map.items()})

# Add the predicted EPRL tags to the original data
data['EPRL_predicted'] = predicted_eprl

# Save the updated data to a CSV file
data.to_csv('bert-base-uncased.csv', index=False)



Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Epoch 1:
                      precision    recall  f1-score   support

        Action (How)       1.00      1.00      1.00       429
       Action (What)       0.99      0.90      0.95       436
      Action (Where)       0.93      0.75      0.83       432
     Decision (What)       1.00      1.00      1.00       347
       Trigger (How)       0.84      0.98      0.90       270
      Trigger (What)       0.99      1.00      1.00       356
     Trigger (Where)       0.95      0.99      0.97       270
  Verification (How)       1.00      0.88      0.94       426
 Verification (What)       0.96      0.83      0.89       426
Verification (Where)       0.92      0.80      0.85       426
       Waiting (How)       0.98      1.00      0.99       423
      Waiting (What)       0.86      1.00      0.92       423
     Waiting (Where)       0.75      1.00      0.85       423

            accuracy                           0.93      5087
           macro avg       0.94      0.93      0.93      50

# DistilBERT

Here, we are using DistilBertForSequenceClassification and DistilBertTokenizer classes from the transformers library. We are using the 'distilbert-base-uncased' pre-trained model, which is a smaller and faster version of BERT with fewer layers and fewer parameters.





In [None]:
# Load the distilBERT model and tokenizer
from transformers import DistilBertForSequenceClassification, DistilBertTokenizer

# Define the mapping from EPRL tags to integers. If there are additional tags to be included, add them now.
eprl_map = {
    'Action (How)': 0,
    'Action (What)': 1,
    'Action (Where)': 2,
    'Decision (What)': 3,
    'Trigger (How)': 4,
    'Trigger (What)': 5,
    'Trigger (Where)': 6,
    'Verification (How)': 7,
    'Verification (What)': 8,
    'Verification (Where)': 9,
    'Waiting (How)': 10,
    'Waiting (What)': 11,
    'Waiting (Where)': 12
}

# Map the EPRL tags to integers.
# `data` is shared with the earlier model sections, which may already have encoded
# this column. It is therefore only mapped while it does not hold integers yet:
# mapping an already-encoded column again would turn every label into NaN.
if not pd.api.types.is_integer_dtype(data['EPRL']):
    eprl_ids = data['EPRL'].map(eprl_map)
    unknown_tags = data.loc[eprl_ids.isna(), 'EPRL'].unique()
    if len(unknown_tags) > 0:
        # Fail here with a clear message instead of later inside the loss computation
        raise ValueError(f'EPRL values missing from eprl_map: {list(unknown_tags)}')
    data['EPRL'] = eprl_ids.astype('int64')

# Load the distilBERT model (with one output per EPRL tag) and its tokenizer
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=len(eprl_map))
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Define a function to preprocess the text data
def preprocess(text):
    """Tokenize one text with the module-level ``tokenizer``.

    Args:
        text: The text to encode.

    Returns:
        The encoding returned by ``tokenizer.encode_plus`` with PyTorch tensors;
        it contains ``input_ids`` and ``attention_mask``, each padded or
        truncated to 64 tokens.
    """
    encoded_text = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=64,
        padding='max_length',
        # Without explicit truncation, texts longer than max_length keep their
        # full length and the per-row tensors can no longer be concatenated.
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    return encoded_text

from torch.utils.data import TensorDataset, DataLoader

# Tokenize every TEXT entry, then join the per-row encodings along the first dimension
inputs = data['TEXT'].apply(preprocess)
input_ids = torch.cat([encoding['input_ids'] for encoding in inputs], dim=0)
attention_masks = torch.cat([encoding['attention_mask'] for encoding in inputs], dim=0)
labels = torch.tensor(data['EPRL'].values)

# Bundle ids, masks and labels and serve them in batches of 32 (default order, no shuffling)
dataset = TensorDataset(input_ids, attention_masks, labels)
dataloader = DataLoader(dataset=dataset, batch_size=32)

# Define the training and evaluation loops
def train(model, dataloader, optimizer):
    """Run one training epoch over ``dataloader``.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss``.
        dataloader: Iterable of batches indexable as
            ``(input_ids, attention_mask, labels)``.
        optimizer: Optimizer that updates the model's parameters.

    Returns:
        None. The model's parameters are updated in place.
    """
    model.train()
    # Batches are sent to the device the model lives on, so the loop also works
    # after the model has been moved to a GPU; on CPU this is a no-op.
    first_param = next(model.parameters(), None)
    for batch in dataloader:
        if first_param is not None:
            batch = [tensor.to(first_param.device) for tensor in batch]
        optimizer.zero_grad()
        outputs = model(input_ids=batch[0], attention_mask=batch[1], labels=batch[2])
        outputs.loss.backward()
        optimizer.step()

def evaluate(model, dataloader):
    """Predict every batch of ``dataloader`` and build a classification report.

    Args:
        model: Model called with ``input_ids`` and ``attention_mask``; its
            output must expose ``.logits``.
        dataloader: Iterable of batches indexable as
            ``(input_ids, attention_mask, labels)``.

    Returns:
        The text report produced by ``classification_report``, with one row
        per tag in the module-level ``eprl_map``.
    """
    model.eval()
    # Inputs are sent to the device the model lives on (no-op on CPU)
    first_param = next(model.parameters(), None)
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask = batch[0], batch[1]
            if first_param is not None:
                input_ids = input_ids.to(first_param.device)
                attention_mask = attention_mask.to(first_param.device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions.extend(outputs.logits.argmax(-1).tolist())
            true_labels.extend(batch[2].tolist())
    # Passing `labels` explicitly keeps the report aligned with `target_names`
    # even when a class is absent from both the true and predicted labels;
    # without it classification_report raises on the length mismatch.
    report = classification_report(
        true_labels,
        predictions,
        labels=list(eprl_map.values()),
        target_names=list(eprl_map.keys()),
    )
    return report

# Training configuration: 20 epochs with the AdamW optimizer
num_epochs = 20
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5)

# Fine-tune the model and print a classification report after every epoch.
# The report is computed on `dataloader`, i.e. on the same data used for training.
for epoch in range(num_epochs):
    train(model=model, dataloader=dataloader, optimizer=optimizer)
    report = evaluate(model=model, dataloader=dataloader)
    print(f'Epoch {epoch + 1}:\n{report}')

# Predict a label for every row of `data`.
# NOTE: `dataloader` wraps the same data the model was trained on; this notebook
# does not create a separate test set, so these are in-sample predictions.
model.eval()
predictions = []
with torch.no_grad():
    for batch in dataloader:
        outputs = model(input_ids=batch[0], attention_mask=batch[1])
        predictions.extend(outputs.logits.argmax(-1).tolist())

# Map the predicted integer labels back to EPRL tags.
# The Series is built on data's own index so the row-wise assignment below stays
# aligned even if `data` does not have a default 0..n-1 index. This relies on
# `dataloader` yielding rows in their original order (it is not shuffled).
predicted_eprl = pd.Series(predictions, index=data.index).map({v: k for k, v in eprl_map.items()})

# Add this model's predicted EPRL tags to the data; without this step the CSV
# written below would not contain the distilBERT predictions at all.
data['EPRL_predicted'] = predicted_eprl

# Save the updated data to a CSV file
data.to_csv('distilbert.csv', index=False)

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Epoch 1:
                      precision    recall  f1-score   support

        Action (How)       0.99      0.98      0.99       429
       Action (What)       1.00      0.92      0.96       436
      Action (Where)       0.64      0.84      0.73       432
     Decision (What)       1.00      1.00      1.00       347
       Trigger (How)       0.83      0.98      0.90       270
      Trigger (What)       0.99      0.87      0.92       356
     Trigger (Where)       1.00      0.99      0.99       270
  Verification (How)       1.00      0.88      0.94       426
 Verification (What)       0.77      0.74      0.76       426
Verification (Where)       0.90      0.60      0.72       426
       Waiting (How)       1.00      1.00      1.00       423
      Waiting (What)       0.87      0.78      0.83       423
     Waiting (Where)       0.74      1.00      0.85       423

            accuracy                           0.89      5087
           macro avg       0.90      0.89      0.89      50

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# XLNet

In [None]:
# Import the XLNet model and tokenizer
from transformers import AutoTokenizer, XLNetForSequenceClassification

# Define the mapping from EPRL tags to integers
eprl_map = {
    'Action (How)': 0,
    'Action (What)': 1,
    'Action (Where)': 2,
    'Decision (What)': 3,
    'Trigger (How)': 4,
    'Trigger (What)': 5,
    'Trigger (Where)': 6,
    'Verification (How)': 7,
    'Verification (What)': 8,
    'Verification (Where)': 9,
    'Waiting (How)': 10,
    'Waiting (What)': 11,
    'Waiting (Where)': 12
}

# Map the EPRL tags to integers.
# `data` is shared with the earlier model sections, which may already have encoded
# this column. It is therefore only mapped while it does not hold integers yet:
# mapping an already-encoded column again would turn every label into NaN.
if not pd.api.types.is_integer_dtype(data['EPRL']):
    eprl_ids = data['EPRL'].map(eprl_map)
    unknown_tags = data.loc[eprl_ids.isna(), 'EPRL'].unique()
    if len(unknown_tags) > 0:
        # Fail here with a clear message instead of later inside the loss computation
        raise ValueError(f'EPRL values missing from eprl_map: {list(unknown_tags)}')
    data['EPRL'] = eprl_ids.astype('int64')

# Load the XLNet tokenizer and the model (with one output per EPRL tag)
tokenizer = AutoTokenizer.from_pretrained("xlnet-base-cased")
model = XLNetForSequenceClassification.from_pretrained("xlnet-base-cased", num_labels=len(eprl_map))

# Define a function to preprocess the text data
def preprocess(text):
    """Tokenize one text with the module-level ``tokenizer``.

    Args:
        text: The text to encode.

    Returns:
        The encoding returned by ``tokenizer.encode_plus`` with PyTorch tensors;
        it contains ``input_ids`` and ``attention_mask``, each padded or
        truncated to 64 tokens.
    """
    encoded_text = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=64,
        padding='max_length',
        # Without explicit truncation, texts longer than max_length keep their
        # full length and the per-row tensors can no longer be concatenated.
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )
    return encoded_text

from torch.utils.data import TensorDataset, DataLoader

# Tokenize every TEXT entry, then join the per-row encodings along the first dimension
inputs = data['TEXT'].apply(preprocess)
input_ids = torch.cat([encoding['input_ids'] for encoding in inputs], dim=0)
attention_masks = torch.cat([encoding['attention_mask'] for encoding in inputs], dim=0)
labels = torch.tensor(data['EPRL'].values)

# Bundle ids, masks and labels and serve them in batches of 32 (default order, no shuffling)
dataset = TensorDataset(input_ids, attention_masks, labels)
dataloader = DataLoader(dataset=dataset, batch_size=32)

# Define the training and evaluation loops
def train(model, dataloader, optimizer):
    """Run one training epoch over ``dataloader``.

    Args:
        model: Model called with ``input_ids``, ``attention_mask`` and
            ``labels``; its output must expose ``.loss``.
        dataloader: Iterable of batches indexable as
            ``(input_ids, attention_mask, labels)``.
        optimizer: Optimizer that updates the model's parameters.

    Returns:
        None. The model's parameters are updated in place.
    """
    model.train()
    # Batches are sent to the device the model lives on, so the loop also works
    # after the model has been moved to a GPU; on CPU this is a no-op.
    first_param = next(model.parameters(), None)
    for batch in dataloader:
        if first_param is not None:
            batch = [tensor.to(first_param.device) for tensor in batch]
        optimizer.zero_grad()
        outputs = model(input_ids=batch[0], attention_mask=batch[1], labels=batch[2])
        outputs.loss.backward()
        optimizer.step()

def evaluate(model, dataloader):
    """Predict every batch of ``dataloader`` and build a classification report.

    Args:
        model: Model called with ``input_ids`` and ``attention_mask``; its
            output must expose ``.logits``.
        dataloader: Iterable of batches indexable as
            ``(input_ids, attention_mask, labels)``.

    Returns:
        The text report produced by ``classification_report``, with one row
        per tag in the module-level ``eprl_map``.
    """
    model.eval()
    # Inputs are sent to the device the model lives on (no-op on CPU)
    first_param = next(model.parameters(), None)
    predictions = []
    true_labels = []
    with torch.no_grad():
        for batch in dataloader:
            input_ids, attention_mask = batch[0], batch[1]
            if first_param is not None:
                input_ids = input_ids.to(first_param.device)
                attention_mask = attention_mask.to(first_param.device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            predictions.extend(outputs.logits.argmax(-1).tolist())
            true_labels.extend(batch[2].tolist())
    # Passing `labels` explicitly keeps the report aligned with `target_names`
    # even when a class is absent from both the true and predicted labels;
    # without it classification_report raises on the length mismatch.
    report = classification_report(
        true_labels,
        predictions,
        labels=list(eprl_map.values()),
        target_names=list(eprl_map.keys()),
    )
    return report

# Training configuration: 5 epochs with the AdamW optimizer
num_epochs = 5
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5)

# Fine-tune the model and print a classification report after every epoch.
# The report is computed on `dataloader`, i.e. on the same data used for training.
for epoch in range(num_epochs):
    train(model=model, dataloader=dataloader, optimizer=optimizer)
    report = evaluate(model=model, dataloader=dataloader)
    print(f'Epoch {epoch + 1}:\n{report}')

# Predict a label for every row of `data`.
# NOTE: `dataloader` wraps the same data the model was trained on; this notebook
# does not create a separate test set, so these are in-sample predictions.
model.eval()
predictions = []
with torch.no_grad():
    for batch in dataloader:
        outputs = model(input_ids=batch[0], attention_mask=batch[1])
        predictions.extend(outputs.logits.argmax(-1).tolist())

# Map the predicted integer labels back to EPRL tags.
# The Series is built on data's own index so the row-wise assignment below stays
# aligned even if `data` does not have a default 0..n-1 index. This relies on
# `dataloader` yielding rows in their original order (it is not shuffled).
predicted_eprl = pd.Series(predictions, index=data.index).map({v: k for k, v in eprl_map.items()})

# Add this model's predicted EPRL tags to the data; without this step the CSV
# written below would not contain the XLNet predictions at all.
data['EPRL_predicted'] = predicted_eprl

# Save the updated data to a CSV file
data.to_csv('xlnet.csv', index=False)

Downloading (…)lve/main/config.json:   0%|          | 0.00/760 [00:00<?, ?B/s]

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/467M [00:00<?, ?B/s]

Some weights of the model checkpoint at xlnet-base-cased were not used when initializing XLNetForSequenceClassification: ['lm_loss.bias', 'lm_loss.weight']
- This IS expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLNetForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of XLNetForSequenceClassification were not initialized from the model checkpoint at xlnet-base-cased and are newly initialized: ['logits_proj.weight', 'logits_proj.bias', 'sequence_summary.summary.bias', 'sequence_summary.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions a

Epoch 1:
                      precision    recall  f1-score   support

        Action (How)       1.00      0.98      0.99       429
       Action (What)       0.91      0.94      0.92       436
      Action (Where)       0.83      0.73      0.78       432
     Decision (What)       1.00      1.00      1.00       347
       Trigger (How)       0.82      0.98      0.89       270
      Trigger (What)       1.00      1.00      1.00       356
     Trigger (Where)       1.00      0.99      0.99       270
  Verification (How)       0.78      0.88      0.83       426
 Verification (What)       1.00      0.66      0.79       426
Verification (Where)       0.90      0.54      0.68       426
       Waiting (How)       1.00      1.00      1.00       423
      Waiting (What)       1.00      1.00      1.00       423
     Waiting (Where)       0.63      1.00      0.77       423

            accuracy                           0.89      5087
           macro avg       0.91      0.90      0.90      50

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>