In [2]:
from datasets import load_dataset
import pandas as pd

ds = load_dataset("dair-ai/emotion", "split")

In [3]:
### Task 1.1 ###
# Extract class labels into list of integers

train_df = ds["train"].to_pandas()
test_df = ds["test"].to_pandas() 
validation_df = ds["validation"].to_pandas()

print(train_df.size)
print(test_df.size)
print(validation_df.size)

train_dist = train_df['label'].value_counts(normalize=True)
test_dist = test_df['label'].value_counts(normalize=True)
validation_dist = validation_df['label'].value_counts(normalize=True)

print(type(train_dist))
print(test_dist)
print(validation_dist)

# The class distribution is not balanced, but the balance is the
# same across all three splitsacc=


32000
4000
4000
<class 'pandas.core.series.Series'>
label
1    0.3475
0    0.2905
3    0.1375
4    0.1120
2    0.0795
5    0.0330
Name: proportion, dtype: float64
label
1    0.3520
0    0.2750
3    0.1375
4    0.1060
2    0.0890
5    0.0405
Name: proportion, dtype: float64


In [8]:
# What is the chance accuracy level?

chance_level_train = (train_dist** 2).sum()
chance_level_test = (test_dist ** 2).sum()
chance_level_val = (validation_dist ** 2).sum()

print("Chance Levels")
print("Train: ", chance_level_train)
print("Test: ", chance_level_test)
print("Validation: ", chance_level_val)

Chance Levels
Train:  0.2381384765625
Test:  0.24400599999999997
Validation:  0.23923250000000004


In [9]:
# What would be the accuracy of a classifier
# that only predicts the most common class seen in training?

print("Accuracy of classifier only predicting most common class: ", 13521/40000)


Accuracy of classifier only predicting most common class:  0.338025


In [None]:
### Task 1.2 ###
# Analyze the distribution of text lengths by providing its range, mean and standard deviation.

splits = [
    {"label": "Train", "df": train_df},
    {"label": "Test", "df": test_df},
    {"label": "Validation", "df": validation_df}
]
for split in splits:    
    text_lengths = split["df"]["text"].map(lambda x: len(x))
    text_lengths_range = text_lengths.max() - text_lengths.min()
    print(f"[{split['label']}] Text Length - Range              :", text_lengths_range)
    text_lengths_mean = text_lengths.mean()
    print(f"[{split['label']}] Text Length - Mean               :", text_lengths_mean)
    text_lengths_std = text_lengths.std()
    print(f"[{split['label']}] Text Length - Std                :", text_lengths_std)


# Extract the texts for all splits and split each text into tokens.
def whitespace_tokenizer(text):
    split = text.split()
    return [token.strip().lower() for token in text.split()]

train_df["tokens"] = train_df["text"].apply(lambda x: whitespace_tokenizer(x))
test_df["tokens"] = test_df["text"].apply(lambda x: whitespace_tokenizer(x))
validation_df["tokens"] = validation_df["text"].apply(lambda x: whitespace_tokenizer(x))


[Train] Text Length - Range              : 293
[Train] Text Length - Mean               : 96.8458125
[Train] Text Length - Std                : 55.904952812332766
[Test] Text Length - Range              : 282
[Test] Text Length - Mean               : 96.5865
[Test] Text Length - Std                : 55.71599100417033
[Validation] Text Length - Range              : 284
[Validation] Text Length - Mean               : 95.3475
[Validation] Text Length - Std                : 54.82375913810559


In [11]:
### Task 1.3 ###
# Build a vocabulary (map string to integer) based on train split
from collections import Counter
import torch

counter = Counter()
for sample in train_df["tokens"]:
    counter.update(sample)
vocabulary = {
    '<UNK>': 0,
    '<PAD>': 1,
    **{word: idx + 2 for idx, (word, count) in enumerate(counter.most_common(1000))}   
}

In [12]:
### Task 1.4 ###
# Encode all texts with the defined vocabulary
# value 0 resembles <UNK> (unknown token)
# value 1 resemples <PAD> (padding token)

# Sequences shorter than max_length, will be filled
# up with <PAD> until they match max_length
def pad_sequence(sequence, max_length=100, pad_value=1):
    if len(sequence) > max_length:
        return sequence[:max_length]
    else:
        return sequence + [pad_value] * (max_length - len(sequence))

# Encode and pad all texts with the defined vocabulary
train_sequences = [pad_sequence([vocabulary.get(token, 0) for token in sample]) for sample in train_df["tokens"]]
test_sequences = [pad_sequence([vocabulary.get(token, 0) for token in sample]) for sample in test_df["tokens"]]
validation_sequences = [pad_sequence([vocabulary.get(token, 0) for token in sample]) for sample in validation_df["tokens"]]


In [13]:
### Task 1.5a ###
# Convert lists into tensors
def vectorize_sequences(sequences, samples, vocabulary):
    one_hot_results = torch.zeros(len(samples), len(vocabulary) + 1)
    for idx, sequence in enumerate(sequences):
        one_hot_results[idx, sequence] = 1
    return one_hot_results

train_data = vectorize_sequences(train_sequences, train_df["text"],vocabulary)
test_data = vectorize_sequences(test_sequences, test_df["text"],vocabulary)
validation_data = vectorize_sequences(validation_sequences, validation_df["text"],vocabulary)


In [14]:
### Task 1.5b ###
# Load the data
import torch
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return torch.tensor(self.sequences[idx], dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)

# Assuming you have labels in your DataFrame
train_labels = train_df["label"].tolist()
test_labels = test_df["label"].tolist()
validation_labels = validation_df["label"].tolist()

# Create dataset instances
train_dataset = TextDataset(train_data, train_labels)
test_dataset = TextDataset(test_data, test_labels)
validation_dataset = TextDataset(validation_data, validation_labels)

# Create DataLoader instances
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
validation_loader = DataLoader(validation_dataset, batch_size=32, shuffle=False)

In [4]:
### Task 2 ###
# Design a model that is suitable for the task. Network 1 --> RNN
import torch.nn as nn

class GRUModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=100, hidden_size=128, num_layers=1, num_classes=6):
        super(GRUModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=1)  # PAD token index is 1
        self.rnn = nn.GRU(input_size=embedding_dim, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 100)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(100, num_classes)
    
    def forward(self, x):
        x = x.to(next(self.embedding.parameters()).device)  # Ensure x is on the same device as the embedding layer
        x = self.embedding(x)  # Shape: (batch_size, max_length, embedding_dim)
        # Initialize hidden state (ensure batch_size matches x.size(0))
        h0 = torch.zeros(self.rnn.num_layers, x.size(0), self.rnn.hidden_size).to(x.device)
        # Pass through RNN
        out, hn = self.rnn(x, h0)
        # Extract the last hidden state (last time step of the last layer)
        hidden_state_outputs = out[:, -1, :]
        # Pass through fully connected layers
        hidden_state_outputs = self.fc(hidden_state_outputs)
        result = self.fc2(hidden_state_outputs)
        return result

   
vocab_size = len(vocabulary) 

NameError: name 'vocabulary' is not defined

In [5]:
# Train the model
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = GRUModel(vocab_size=vocab_size).to(device)

epochs = 10
learning_rate = 0.0001
train_losses = []
validation_losses = []
train_accuracies = []
validation_accuracies = []

def train_loop(dataloader, model, loss_fn, optimizer):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0
    
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        pred = model(X)
        loss = loss_fn(pred, y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        running_loss += loss.item()
        _, predicted = torch.max(pred.data, 1)
        total += y.size(0)
        correct += (predicted == y).sum().item()
        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
            
    train_losses.append(running_loss / len(dataloader))
    train_accuracies.append(correct / total) 


def validation_loop(dataloader, model, loss_fn):
    model.eval()
    
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    validation_loss, correct = 0, 0
    total = 0

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            validation_loss += loss_fn(pred, y).item()
            _, predicted = torch.max(pred.data, 1)
            correct += (predicted == y).sum().item()
            total += y.size(0)
            
    validation_losses.append(validation_loss / num_batches)  # Average test loss
    validation_accuracies.append(correct / total)

    validation_loss /= num_batches
    correct /= size
    print(f"Validation Error: \n Accuracy: {(100 * correct):>0.1f}%, Avg loss: {validation_loss:>8f} \n")


def test(dataloader, model, loss_fn):
    model.eval()
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    validation_loss, correct = 0, 0

    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X)
            validation_loss += loss_fn(pred, y).item()
            correct += (pred.argmax(1) == y).type(torch.float).sum().item()
    
    validation_loss /= num_batches
    correct /= size
    print(f"Test Error: \n Accuracy: {(100 * correct):>0.1f}%, Avg loss: {validation_loss:>8f} \n")


loss_fn = nn.CrossEntropyLoss(weight=class_weights)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

for t in range(epochs):
    print(f"Epoch {t + 1}\n-------------------------------")
    train_loop(train_loader, model, loss_fn, optimizer)
    validation_loop(validation_loader, model, loss_fn)
test(test_loader, model, loss_fn)
print("Done!")

NameError: name 'torch' is not defined