### Text Data Preprocessing

In [1]:
# TODO sentence preprocessing, turn big paragraphs into single sentences

import pandas as pd
import nltk

from nltk.stem import PorterStemmer, WordNetLemmatizer

Reading the Excel File into a pandas DataFrame

In [2]:
file_path = "interrater_data.xlsx"
# Read the Excel file into a pandas DataFrame
excel_file = pd.read_excel(file_path)

Extracting the text and associated label for each excel row and storing them in _sentences_ and _labels_ respectively

In [3]:
actual_labels = []
for col in excel_file.columns[3:]:
    actual_labels.append(col)

sentences = []
labels = []

for row in excel_file.iterrows():
    for index, speech_act in enumerate(row[1].iloc[3:10]):
        if speech_act == "x":
            # TODO break down the rows further
            row_content = row[1].iloc[2].lower().strip()
            
            sentences.append(row_content)
            labels.append(actual_labels[index])
            break

# print("Sentences: ", sentences[:5])
# print("I have sentences: ", len(sentences))
# print("Correct Labels: ", labels[:5])
# print("I have labels: ", len(labels))

In [4]:
excel_file

Unnamed: 0,Time,Speaker,Utterance,Not Classified,Statement of Intent,Statement of Prediction,Statement of Situation,Statement of Action,Request for Intent,Request for Prediction,Request for Situation,Request for Action
0,00:00:00,Bravo,"Alpha, Charlie. Bravo check.",,,,,,,,x,
1,00:00:05,Charlie,Alpha you're loud_and_clear.,,,,x,,,,,
2,00:00:06,Alpha,Charlie. Good to me,,,,x,,,,,
3,00:00:10,Bravo,"Charlie, Charlie one, Bravo radio check.",,,,,,,,x,
4,00:00:13,Alpha,Yeah. Charlie good to me. Over,,,,x,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...
111,00:15:52,Charlie,Yep,X,,,,,,,,
112,00:15:54,Alpha,Don't forget we still got a fire in the kind o...,,,,X,,,,,
113,00:16:00,Bravo,Yeah. Now.,,,,,X,,,,
114,00:16:05,Charlie,OK.,,,,,,,,,


In [5]:
excel_file.shape
# The first number is the number of rows/entries there are, the second is the number of columns of the .shape function

(116, 12)

In [6]:
excel_file.nunique()

Time                       115
Speaker                      4
Utterance                  114
Not Classified               2
Statement of Intent          2
Statement of Prediction      2
Statement of Situation       2
Statement of Action          2
Request for Intent           0
Request for Prediction       0
Request for Situation        2
Request for Action           2
dtype: int64

In [7]:
print(sentences)   



In [8]:
for l in labels: print(l)

Statement of Situation
Statement of Situation
Statement of Situation
Statement of Situation
Not Classified
Statement of Situation
Statement of Situation
Statement of Situation
Statement of Action
Statement of Intent
Statement of Situation
Statement of Situation
Not Classified
Statement of Situation
Statement of Situation
Not Classified
Not Classified
Statement of Intent
Not Classified
Statement of Intent
Not Classified
Statement of Intent
Statement of Intent
Statement of Prediction
Not Classified
Not Classified
Statement of Intent
Statement of Intent
Statement of Intent
Statement of Intent
Statement of Prediction
Statement of Situation
Statement of Situation
Not Classified
Statement of Situation
Statement of Intent
Statement of Intent
Statement of Situation
Statement of Situation
Statement of Action
Statement of Intent
Not Classified
Statement of Intent
Statement of Intent
Statement of Action
Statement of Situation
Statement of Situation
Statement of Intent
Statement of Action
Statemen

# Bert Embeddings

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, TensorDataset
import torch
from tqdm.notebook import tqdm

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(sentences, labels, test_size=0.2, random_state=42)

# Load pre-trained BERT model and tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(actual_labels))

# Tokenize and convert the text data to input features
def tokenize_text(text1):
    return tokenizer.encode_plus(
        text1,
        add_special_tokens=True,
        max_length=512,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

X_train_tokens = [tokenize_text(text) for text in X_train]
X_test_tokens = [tokenize_text(text) for text in X_test]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from sklearn.preprocessing import LabelEncoder

# Encode labels using LabelEncoder
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Create DataLoader for training and testing data
train_dataset = TensorDataset(torch.cat([item['input_ids'] for item in X_train_tokens]),
                              torch.cat([item['attention_mask'] for item in X_train_tokens]),
                              torch.tensor(y_train_encoded))  # Use the encoded labels
test_dataset = TensorDataset(torch.cat([item['input_ids'] for item in X_test_tokens]),
                             torch.cat([item['attention_mask'] for item in X_test_tokens]),
                             torch.tensor(y_test_encoded))  # Use the encoded labels
batch_size = 8
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# Set up GPU if available
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")
model.to(device)

# Set up optimizer and training parameters
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
num_epochs = 3

In [11]:
# Training loop
for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0
    for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{num_epochs}'):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        train_loss += loss.item()

        loss.backward()
        optimizer.step()

    avg_train_loss = train_loss / len(train_loader)

Epoch 1/3:   0%|          | 0/7 [00:00<?, ?it/s]

Epoch 2/3:   0%|          | 0/7 [00:00<?, ?it/s]

Epoch 3/3:   0%|          | 0/7 [00:00<?, ?it/s]

In [12]:
# Validation loop
model.eval()
val_loss = 0.0
predictions, true_labels = [], []

with torch.no_grad():
    for batch in tqdm(test_loader, desc=f'Validation'):
        input_ids, attention_mask, labels = batch
        input_ids, attention_mask, labels = input_ids.to(device), attention_mask.to(device), labels.to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        val_loss += loss.item()

        logits = outputs.logits
        predictions.extend(torch.argmax(logits, dim=1).tolist())
        true_labels.extend(labels.tolist())

avg_val_loss = val_loss / len(test_loader)

accuracy = accuracy_score(true_labels, predictions)
report = classification_report(true_labels, predictions)

print(f"Epoch {epoch + 1}/{num_epochs} - Avg. Train Loss: {avg_train_loss:.4f} - Avg. Validation Loss: {avg_val_loss:.4f}")
print(f"Accuracy: {accuracy}")
print("Classification Report:\n", report)

Validation:   0%|          | 0/2 [00:00<?, ?it/s]

Epoch 3/3 - Avg. Train Loss: 1.5778 - Avg. Validation Loss: 1.6484
Accuracy: 0.5384615384615384
Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.33      0.50         3
           1       0.00      0.00      0.00         1
           2       0.67      0.80      0.73         5
           3       0.00      0.00      0.00         1
           4       0.33      0.67      0.44         3

    accuracy                           0.54        13
   macro avg       0.40      0.36      0.33        13
weighted avg       0.56      0.54      0.50        13


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
