# Task: `Natural Language Processing`

Given an audio transcription of a turret command instruction, return a JSON object containing the specified target’s description, its heading, and the tool to be deployed against it.

**For Advanced teams**, the transcript will be a turret instruction in natural language.

**For Novice teams**, the transcript will follow a relatively structured format with the turret operator explicitly reading out the target, heading, and tool, though the order may vary.

_Insert Code Here_

In [4]:
import os
from transformers import BertForQuestionAnswering, AutoTokenizer, DefaultDataCollator, TrainingArguments, Trainer, BertTokenizer
import torch

In [5]:
# Climb three directory levels from the notebook's working directory, then
# descend into the `novice` data folder and its `audio` sub-folder.
cur_dir = os.getcwd()
nlp_dir = os.path.split(cur_dir)[0]
til_dir = os.path.split(nlp_dir)[0]
home_dir = os.path.split(til_dir)[0]

data_dir = os.path.join(home_dir, 'novice')
audio_dir = os.path.join(home_dir, 'novice', 'audio')

# Display the resolved audio directory as the cell output.
audio_dir

'/home/jupyter/novice/audio'

In [6]:
# Base BERT checkpoint shared by the QA model and its tokenizer.
# NOTE: this checkpoint ships without a question-answering head, so
# `qa_outputs` is newly initialised on load (see the warning printed below);
# predictions are not meaningful until the model is fine-tuned.
model_name = "bert-base-uncased"
model = BertForQuestionAnswering.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Sanity check: ask the QA model one question about a short passage.
question = "When was the University of California established?"
context = "The University of California was founded in 1868, located in Berkeley."

# Encode the (question, context) pair and run a gradient-free forward pass.
inputs = tokenizer(question, context, return_tensors='pt')
with torch.no_grad():
    outputs = model(**inputs)

# The predicted span starts at the best-scoring start token and ends at the
# best-scoring end token; +1 makes the end index usable as a slice bound.
answer_start = outputs.start_logits.argmax()
answer_end = outputs.end_logits.argmax() + 1

# Map the selected token ids back to tokens, then join them into text.
answer = tokenizer.convert_tokens_to_string(
    tokenizer.convert_ids_to_tokens(
        inputs.input_ids[0, answer_start:answer_end]
    )
)
print("Answer:", answer)

Answer: when was the university of california established? [SEP] the university of california


## Fine Tuning

In [None]:
import datasets

def read_data(file_path):
    """Read a token-per-line annotation file into per-sentence lists.

    Each non-blank line must hold exactly two whitespace-separated fields,
    ``token label``. Blank lines separate sentences.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).

    Returns:
        A ``(tokens, labels)`` tuple of equal-length lists. ``tokens[i]`` is
        the list of tokens of the i-th sentence and ``labels[i]`` holds the
        corresponding labels in the same order. Empty sentences are never
        emitted, even if the file starts with or contains repeated blank lines.

    Raises:
        ValueError: If a non-blank line does not contain exactly two fields.
    """
    tokens = []
    labels = []
    sentence_tokens = []
    sentence_labels = []

    with open(file_path, "r", encoding="utf-8") as f:
        for line_number, line in enumerate(f, start=1):
            fields = line.split()

            if not fields:
                # Sentence boundary. Flush only when something was collected,
                # so leading or consecutive blank lines do not produce empty
                # examples.
                if sentence_tokens:
                    tokens.append(sentence_tokens)
                    labels.append(sentence_labels)
                    sentence_tokens = []
                    sentence_labels = []
                continue

            if len(fields) != 2:
                raise ValueError(
                    f"{file_path}:{line_number}: expected 'token label', "
                    f"got {line.rstrip()!r}"
                )

            token, label = fields
            sentence_tokens.append(token)
            sentence_labels.append(label)

    # The last sentence is not necessarily followed by a blank line.
    if sentence_tokens:
        tokens.append(sentence_tokens)
        labels.append(sentence_labels)

    return tokens, labels

# Parse both splits; each call yields per-sentence token lists together with
# the matching per-sentence label lists.
train_tokens, train_labels = read_data("train.txt")
val_tokens, val_labels = read_data("val.txt")

# Wrap each split in a Dataset with a "tokens" and a "ner_tags" column.
train_dataset = datasets.Dataset.from_dict(
    dict(tokens=train_tokens, ner_tags=train_labels)
)
val_dataset = datasets.Dataset.from_dict(
    dict(tokens=val_tokens, ner_tags=val_labels)
)


In [None]:
# BIO tag set for the three entity types; a tag's position in the list is its
# integer class id.
label_list = [
    "O",
    "B-TARGET", "I-TARGET",
    "B-HEADING", "I-HEADING",
    "B-TOOL", "I-TOOL",
]

# Lookup tables in both directions between class ids and tag names.
id2label = dict(enumerate(label_list))
label2id = {name: idx for idx, name in id2label.items()}


In [None]:
from transformers import AutoTokenizer

# Rebind `tokenizer` to the bert-base-uncased tokenizer for the fine-tuning
# section. The alignment function below calls `.word_ids()` on its output,
# which presumably requires a "fast" tokenizer — confirm AutoTokenizer returns
# one in the installed transformers version.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

def tokenize_and_align_labels(examples, label_all_tokens=True):
    """Tokenize pre-split words and align word-level tags to sub-tokens.

    Relies on the module-level ``tokenizer`` and ``label2id`` objects.

    Args:
        examples: Batch mapping with a ``"tokens"`` entry (list of word
            lists) and a ``"ner_tags"`` entry (list of tag-name lists, one
            tag per word).
        label_all_tokens: When True (the default), every sub-token of a word
            receives that word's label id. When False, only the first
            sub-token of each word is labelled and the remaining sub-tokens
            get -100.

    Returns:
        The tokenizer output for the batch with an added ``"labels"`` entry:
        one list of label ids per example, the same length as that example's
        token sequence. Positions with no source word (special tokens) are
        set to -100, the value the loss is expected to ignore.
    """
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, word_labels in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special token like [CLS], [SEP], etc.
                label_ids.append(-100)
            elif label_all_tokens or word_idx != previous_word_idx:
                label_ids.append(label2id[word_labels[word_idx]])
            else:
                # Continuation piece of a word that was already labelled.
                label_ids.append(-100)
            previous_word_idx = word_idx
        # Collect this example's aligned labels; without this the "labels"
        # column stays empty.
        labels.append(label_ids)

    # Attach the labels once, after every example has been processed.
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Run tokenization + label alignment over both splits, one batch at a time.
tokenized_train_dataset = train_dataset.map(
    function=tokenize_and_align_labels,
    batched=True,
)
tokenized_val_dataset = val_dataset.map(
    function=tokenize_and_align_labels,
    batched=True,
)


In [None]:
from transformers import (
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    Trainer,
    TrainingArguments,
)

# Token-classification head sized to the tag set. Passing the id<->label maps
# stores the tag names in the model config, so predictions can be reported as
# "B-TARGET" etc. rather than generic LABEL_n names.
model = AutoModelForTokenClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

# Examples in a batch have different lengths, so the per-token `labels` must
# be padded together with the inputs. This collator pads both; the Trainer's
# default padding collator only pads the tokenizer's own outputs.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

trainer.train()
