In [154]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter

# Load the aggregated labelling export; column '0' carries the text label of each row.
df = pd.read_json('./labelling/labelled_data_aggregated_2024-10-07 15:59:46.787108.json')

label_counts = Counter(df['0'])
label_count = df['0'].nunique()

print(f"Found {label_count} unique labels, for a total of {len(df)} rows.\n")

# Most frequent label first; labels with equal counts keep their first-seen order.
sorted_label_counts = dict(label_counts.most_common())
for label, count in sorted_label_counts.items():
    print(f"Label {label}: {count} examples")

# Encode the text labels as integer ids, stored next to the original column.
le = LabelEncoder()
df['label_num'] = le.fit_transform(df['0'])

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
train_questions, val_questions, train_labels, val_labels = train_test_split(
    df['Text'],
    df['label_num'],
    test_size=0.2,
    random_state=34197,
)

df.head()

Found 19 unique labels, for a total of 290 rows.

Label SPEC_TRANS: 55 examples
Label TRANS_DETAIL: 47 examples
Label GEN_INFO: 42 examples
Label STATE_COUNT: 23 examples
Label GRAMMAR: 22 examples
Label FINAL_STATE: 16 examples
Label PATTERN_RECOG: 15 examples
Label START: 12 examples
Label SHAPE_AUT: 12 examples
Label STATE_ID: 9 examples
Label OTHER: 8 examples
Label INPUT_QUERY: 6 examples
Label LOOPS: 5 examples
Label OUTPUT_QUERY: 5 examples
Label OPT_REP: 4 examples
Label REPETITIVE_PAT: 3 examples
Label TRANS_BETWEEN: 3 examples
Label IO_EXAMPLES: 2 examples
Label EFFICIENCY: 1 examples


Unnamed: 0,Text,0,label_num
0,hi,START,14
1,describe the automaton,GEN_INFO,2
2,There is a transition between q2 and q0,SPEC_TRANS,13
3,There is a transition between q5 and q7,SPEC_TRANS,13
4,Describe it please,START,14


In [155]:
from transformers import BertTokenizer, DistilBertTokenizer

tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
# Alternative tokenizer: BertTokenizer.from_pretrained('bert-base-uncased')

# Both splits are tokenized with the same settings, so they are declared once.
encode_options = dict(truncation=True, padding=True, max_length=128)
train_encodings = tokenizer(train_questions.tolist(), **encode_options)
val_encodings = tokenizer(val_questions.tolist(), **encode_options)

In [156]:
import torch
from torch.utils.data import Dataset


class SentimentDataset(Dataset):
    """Dataset that pairs per-example encodings with integer labels.

    Args:
        encodings: mapping (anything exposing ``.items()``) from field name to a
            sequence that can be indexed by example position.
        labels: object exposing ``.tolist()``; it is converted to a plain list once
            at construction time.

    Indexing returns a dict with one tensor per encoding field plus a ``'labels'``
    entry holding the example's label as a ``torch.long`` tensor.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        # Converted up front so item access is simple list indexing.
        self.labels = labels.tolist()

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample


# Wrap each split's encodings and labels so they can be fed to a DataLoader.
train_dataset = SentimentDataset(encodings=train_encodings, labels=train_labels)
val_dataset = SentimentDataset(encodings=val_encodings, labels=val_labels)

In [157]:
from transformers import BertForSequenceClassification, DistilBertForSequenceClassification, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader
from torch.optim import AdamW

EPOCHS = 15

# Seed torch's RNG before the model is built so the newly initialised classifier
# head, the DataLoader shuffling and dropout are repeatable across
# Restart & Run All. The value matches the random_state used for the
# train/validation split. (Bit-exact determinism on GPU is not guaranteed.)
torch.manual_seed(34197)

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=label_count)
# Alternative model: BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=label_count)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

# Only the training batches are shuffled; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

optimizer = AdamW(model.parameters(), lr=5e-5)

# The scheduler is stepped once per batch, so its horizon is batches-per-epoch * epochs.
total_steps = len(train_loader) * EPOCHS

# Linear decay of the learning rate over the whole run, with no warm-up phase.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [158]:
from tqdm import tqdm
from sklearn.metrics import accuracy_score
import time

# One dict of metrics per epoch; turned into a DataFrame after training.
training_stats = []

# The context manager closes the progress bar even if training is interrupted.
# A bar that is never closed lingers in tqdm's registry and makes later runs
# render as nested bars in the notebook output.
with tqdm(total=EPOCHS, desc="Training model...", unit="epochs") as progress_bar:
    for epoch in range(EPOCHS):
        start_time = time.time()

        # ---- Training pass ----
        model.train()
        total_train_loss = 0
        for batch in train_loader:
            # Move every tensor of the batch to the model's device.
            batch = {k: v.to(device) for k, v in batch.items()}

            optimizer.zero_grad()

            # The batch includes 'labels', so the model output carries the loss.
            outputs = model(**batch)
            loss = outputs.loss
            total_train_loss += loss.item()

            loss.backward()

            # The learning-rate schedule advances once per optimizer step.
            optimizer.step()
            scheduler.step()

        avg_train_loss = total_train_loss / len(train_loader)

        # ---- Validation pass ----
        model.eval()
        total_eval_loss = 0
        predictions, true_labels = [], []

        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)

                total_eval_loss += outputs.loss.item()

                preds = torch.argmax(outputs.logits, dim=-1)

                # .tolist() yields plain Python ints, so the Counters below are
                # keyed by int rather than by numpy scalar objects.
                predictions.extend(preds.cpu().tolist())
                true_labels.extend(batch['labels'].cpu().tolist())

        avg_val_loss = total_eval_loss / len(val_loader)
        val_accuracy = accuracy_score(true_labels, predictions)

        # Duration covers both the training and the validation pass of this epoch.
        epoch_duration = time.time() - start_time

        training_stats.append(
            {
                'epoch': epoch + 1,
                'training_loss': avg_train_loss,
                'validation_loss': avg_val_loss,
                'validation_accuracy': val_accuracy,
                'epoch_duration': epoch_duration,
                'true_labels_distribution': Counter(true_labels),
                'predicted_labels_distribution': Counter(predictions)
            }
        )

        progress_bar.update()


Training model...: 100%|██████████| 100/100 [03:01<00:00,  1.82s/epochs]

Training model...:   7%|▋         | 1/15 [00:00<00:09,  1.45epochs/s][A
Training model...:  13%|█▎        | 2/15 [00:01<00:07,  1.72epochs/s][A
Training model...:  20%|██        | 3/15 [00:01<00:06,  1.90epochs/s][A
Training model...:  27%|██▋       | 4/15 [00:02<00:05,  2.00epochs/s][A
Training model...:  33%|███▎      | 5/15 [00:02<00:04,  2.05epochs/s][A
Training model...:  40%|████      | 6/15 [00:03<00:04,  2.09epochs/s][A
Training model...:  47%|████▋     | 7/15 [00:03<00:03,  2.12epochs/s][A
Training model...:  53%|█████▎    | 8/15 [00:03<00:03,  2.13epochs/s][A
Training model...:  60%|██████    | 9/15 [00:04<00:02,  2.14epochs/s][A
Training model...:  67%|██████▋   | 10/15 [00:04<00:02,  2.15epochs/s][A
Training model...:  73%|███████▎  | 11/15 [00:05<00:01,  2.15epochs/s][A
Training model...:  80%|████████  | 12/15 [00:05<00:01,  2.16epochs/s][A
Training model...:  87%|████████▋ | 13/15 [00:

In [159]:
# One row per epoch, indexed by the 1-based epoch number recorded during training.
df_stats = pd.DataFrame(training_stats).set_index('epoch')

# Left as the cell's last expression so the notebook renders the table.
df_stats

Unnamed: 0_level_0,training_loss,validation_loss,validation_accuracy,epoch_duration,true_labels_distribution,predicted_labels_distribution
epoch,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,2.602827,2.347158,0.396552,0.689554,"{4: 2, 2: 9, 16: 3, 18: 9, 11: 2, 13: 9, 8: 3,...","{18: 14, 2: 14, 13: 30}"
2,2.032785,1.85701,0.603448,0.503935,"{4: 2, 2: 9, 16: 3, 18: 9, 11: 2, 13: 9, 8: 3,...","{18: 24, 2: 10, 13: 14, 3: 2, 15: 6, 10: 1, 1: 1}"
3,1.564519,1.472145,0.741379,0.460194,"{4: 2, 2: 9, 16: 3, 18: 9, 11: 2, 13: 9, 8: 3,...","{18: 11, 2: 14, 13: 13, 3: 3, 15: 9, 10: 5, 14..."
4,1.170305,1.220039,0.775862,0.460159,"{4: 2, 2: 9, 16: 3, 18: 9, 11: 2, 13: 9, 8: 3,...","{18: 10, 2: 12, 13: 12, 10: 6, 3: 4, 15: 9, 14..."
5,0.912789,1.103985,0.741379,0.461535,"{4: 2, 2: 9, 16: 3, 18: 9, 11: 2, 13: 9, 8: 3,...","{18: 14, 2: 11, 10: 6, 13: 10, 3: 3, 15: 8, 14..."
6,0.721874,1.032454,0.775862,0.45852,"{4: 2, 2: 9, 16: 3, 18: 9, 11: 2, 13: 9, 8: 3,...","{18: 10, 2: 12, 13: 12, 10: 6, 3: 3, 15: 8, 9:..."
7,0.589566,0.96928,0.758621,0.460081,"{4: 2, 2: 9, 16: 3, 18: 9, 11: 2, 13: 9, 8: 3,...","{15: 9, 2: 12, 18: 9, 10: 6, 13: 10, 3: 4, 9: ..."
8,0.477211,0.913297,0.775862,0.460844,"{4: 2, 2: 9, 16: 3, 18: 9, 11: 2, 13: 9, 8: 3,...","{14: 3, 2: 12, 18: 10, 10: 6, 13: 11, 3: 3, 15..."
9,0.434633,0.898039,0.775862,0.46096,"{4: 2, 2: 9, 16: 3, 18: 9, 11: 2, 13: 9, 8: 3,...","{4: 2, 2: 12, 18: 10, 10: 6, 13: 10, 3: 3, 15:..."
10,0.369197,0.872873,0.827586,0.460452,"{4: 2, 2: 9, 16: 3, 18: 9, 11: 2, 13: 9, 8: 3,...","{4: 1, 2: 11, 15: 10, 18: 8, 10: 6, 13: 11, 3:..."


In [160]:
import matplotlib.pyplot as plt

%matplotlib notebook

# Training vs. validation loss; the x-axis is the index of df_stats.
fig, ax = plt.subplots(figsize=(12, 6))

for column, legend_label in (
    ('training_loss', 'Training Loss'),
    ('validation_loss', 'Validation Loss'),
):
    ax.plot(df_stats[column], label=legend_label)

ax.set_title('Training & Validation Loss')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
ax.legend()
ax.grid(True)
plt.show()


<IPython.core.display.Javascript object>

In [161]:
# Validation accuracy; the x-axis is the index of df_stats.
fig, ax = plt.subplots(figsize=(12, 6))

ax.plot(df_stats['validation_accuracy'], label='Validation Accuracy', color='green')

ax.set_title('Validation Accuracy')
ax.set_xlabel('Epoch')
ax.set_ylabel('Accuracy')
ax.legend()
ax.grid(True)
plt.show()


<IPython.core.display.Javascript object>

In [162]:
# Recorded duration of each epoch; the y-axis label 's' is the unit (seconds from time.time()).
fig, ax = plt.subplots(figsize=(12, 6))

ax.plot(df_stats['epoch_duration'], label='Epochs Duration', color='green')

ax.set_title('Epochs Duration')
ax.set_xlabel('Epoch')
ax.set_ylabel('s')
ax.legend()
ax.grid(True)
plt.show()


<IPython.core.display.Javascript object>

In [164]:
def predict(text):
    """Return the predicted label id for a single piece of text.

    Args:
        text: the text to classify. The result is read with ``.item()``, so the
            input must produce exactly one prediction (one string, not a batch).

    Returns:
        int: index of the highest-scoring logit. It is a numeric label id, not
        the text label.
    """
    inputs = tokenizer(
        text, return_tensors='pt', truncation=True, padding=True, max_length=128
    )
    # The inputs must live on the same device as the model.
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Switch to eval mode explicitly instead of relying on whatever mode the last
    # executed cell left the model in; otherwise dropout could perturb the output.
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits

    return torch.argmax(logits, dim=-1).item()


sample_text = "Which is the end state?"
prediction = predict(sample_text)

# One row per distinct (label id, text label) pair found in the data.
unique_labels = df[['label_num', '0']].drop_duplicates()

# Lookup table that turns a predicted label id back into its text label.
label_map = {
    label_id: label_text
    for label_id, label_text in zip(unique_labels['label_num'], unique_labels['0'])
}

print(f'Prediction: {label_map[prediction]}')


Prediction: START


In [171]:
import torch.nn.functional as F

def predict_top(text, top_k=5):
    """Return the highest-scoring labels for ``text`` with their softmax scores.

    Args:
        text: the text to classify; only the first row of the model output is
            reported.
        top_k: maximum number of labels to return. Values larger than the number
            of model outputs are clamped instead of making ``torch.topk`` raise.

    Returns:
        list[tuple[str, float]]: ``(text label, softmax score)`` pairs ordered
        from most to least likely. Labels are looked up in the module-level
        ``label_map``.
    """
    inputs = tokenizer(
        text, return_tensors='pt', truncation=True, padding=True, max_length=128
    )
    # The inputs must live on the same device as the model.
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Switch to eval mode explicitly instead of relying on whatever mode the last
    # executed cell left the model in; otherwise dropout could perturb the scores.
    model.eval()
    with torch.no_grad():
        logits = model(**inputs).logits

        # Never ask topk for more entries than there are labels.
        k = min(top_k, logits.size(-1))
        _, top_k_indices = torch.topk(logits, k, dim=-1)

        # Softmax over all labels, then pick out the scores of the selected ones.
        softmax_scores = F.softmax(logits, dim=-1)
        top_k_scores = torch.gather(softmax_scores, 1, top_k_indices)

    # Pair each selected label id (translated to its text label) with its score.
    return [
        (label_map[idx.item()], score.item())
        for idx, score in zip(top_k_indices[0], top_k_scores[0])
    ]

# Demo: list the five highest-scoring labels for one sample question.
sample_text = "What is the final state?"
top_predictions = predict_top(text=sample_text, top_k=5)
print(f'Top Predictions: {top_predictions}')

Top Predictions: [('GEN_INFO', 0.9563645124435425), ('SHAPE_AUT', 0.01008952409029007), ('OTHER', 0.0038228940684348345), ('TRANS_DETAIL', 0.003795928554609418), ('START', 0.0033612248953431845)]
