In [2]:
from datasets import load_dataset
import pandas as pd

ds = load_dataset("dair-ai/emotion", "split")

In [19]:
### Task 1.1 ###
# Extract class labels into list of integers

train_df = ds["train"].to_pandas()
test_df = ds["test"].to_pandas() 
validation_df = ds["validation"].to_pandas()

train_dist = train_df['label'].value_counts(normalize=True)
test_dist = test_df['label'].value_counts(normalize=True)
validation_dist = validation_df['label'].value_counts(normalize=True)

print(train_dist)
print(test_dist)
print(validation_dist)

# The class distribution is not balanced, but the balance is the
# same across all three splitsacc=


label
1    0.335125
0    0.291625
3    0.134937
4    0.121063
2    0.081500
5    0.035750
Name: proportion, dtype: float64
label
1    0.3475
0    0.2905
3    0.1375
4    0.1120
2    0.0795
5    0.0330
Name: proportion, dtype: float64
label
1    0.3520
0    0.2750
3    0.1375
4    0.1060
2    0.0890
5    0.0405
Name: proportion, dtype: float64


In [20]:
# What is the chance accuracy level?

chance_level_train = (train_dist** 2).sum()
chance_level_test = (test_dist ** 2).sum()
chance_level_val = (validation_dist ** 2).sum()

print("Chance Levels")
print("Train: ", chance_level_train)
print("Test: ", chance_level_test)
print("Validation: ", chance_level_val)

Chance Levels
Train:  0.2381384765625
Test:  0.24400599999999997
Validation:  0.23923250000000004


In [21]:
# What would be the accuracy of a classifier
# that only predicts the most common class seen in training?

print("Accuracy of classifier only predicting most common class: ", 5362/60000)


Accuracy of classifier only predicting most common class:  0.08936666666666666


In [None]:
### Task 1.2 ###
# Analyze the distribution of text lengths by providing its range, mean and standard deviation.

splits = [
    {"label": "Train", "df": train_df},
    {"label": "Test", "df": test_df},
    {"label": "Validation", "df": validation_df}
]
for split in splits:    
    text_lengths = split["df"]["text"].map(lambda x: len(x))
    text_lengths_range = text_lengths.max() - text_lengths.min()
    print(f"[{split['label']}] Text Length - Range              :", text_lengths_range)
    text_lengths_mean = text_lengths.mean()
    print(f"[{split['label']}] Text Length - Mean               :", text_lengths_mean)
    text_lengths_std = text_lengths.std()
    print(f"[{split['label']}] Text Length - Std                :", text_lengths_std)


# Extract the texts for all splits and split each text into tokens.
import torchtext
from torchtext.data import get_tokenizer
tokenizer = get_tokenizer("basic_english")

train_df["tokens"] = train_df["text"].apply(lambda x: tokenizer(x))
test_df["tokens"] = test_df["text"].apply(lambda x: tokenizer(x))
validation_df["tokens"] = validation_df["text"].apply(lambda x: tokenizer(x))


[Train] Text Length - Range              : 293
[Train] Text Length - Mean               : 96.8458125
[Train] Text Length - Std                : 55.904952812332766
[Test] Text Length - Range              : 282
[Test] Text Length - Mean               : 96.5865
[Test] Text Length - Std                : 55.71599100417033
[Validation] Text Length - Range              : 284
[Validation] Text Length - Mean               : 95.3475
[Validation] Text Length - Std                : 54.82375913810559
                                                text  label  \
0                            i didnt feel humiliated      0   
1  i can go from feeling so hopeless to so damned...      0   
2   im grabbing a minute to post i feel greedy wrong      3   
3  i am ever feeling nostalgic about the fireplac...      2   
4                               i am feeling grouchy      3   

                                              tokens  
0                       [i, didnt, feel, humiliated]  
1  [i, can, go, fr

In [42]:
### Task 1.3 ###
# Build a vocabulary (map string to integer) based on train split
from collections import Counter
import torch

counter = Counter()
for sample in train_df["tokens"]:
    counter.update(sample)
vocabulary = {
    '<UNK>': 0,
    '<PAD>': 1,
    **{word: idx + 2 for idx, (word, count) in enumerate(counter.most_common(1000))}   
}

In [None]:
### Task 1.4 ###
# Encode all texts with the defined vocabulary
# value 0 resembles <UNK> (unknown token)
# value 1 resemples <PAD> (padding token)

# Sequences shorter than max_length, will be filled
# up with <PAD> until they match max_length
def pad_sequence(sequence, max_length=100, pad_value=1):
    if len(sequence) > max_length:
        return sequence[:max_length]
    else:
        return sequence + [pad_value] * (max_length - len(sequence))

# Encode and pad all texts with the defined vocabulary
train_sequences = [pad_sequence([vocabulary.get(token, 0) for token in sample]) for sample in train_df["tokens"]]
test_sequences = [pad_sequence([vocabulary.get(token, 0) for token in sample]) for sample in test_df["tokens"]]
validation_sequences = [pad_sequence([vocabulary.get(token, 0) for token in sample]) for sample in validation_df["tokens"]]


[2, 40, 101, 60, 8, 15, 494, 5, 15, 0, 553, 32, 60, 61, 128, 148, 76, 0, 4, 22, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [None]:
### Task 1.5a ###
# Convert lists into tensors
def vectorize_sequences(sequences, samples, vocabulary):
    one_hot_results = torch.zeros(len(samples), len(vocabulary) + 1)
    for idx, sequence in enumerate(sequences):
        one_hot_results[idx, sequence] = 1
    return one_hot_results

train_data = vectorize_sequences(train_sequences, train_df["text"],vocabulary)
test_data = vectorize_sequences(test_sequences, test_df["text"],vocabulary)
validation_data = vectorize_sequences(validation_sequences, validation_df["text"],vocabulary)


In [67]:
### Task 1.5b ###
# Load the data
import torch
from torch.utils.data import Dataset, DataLoader

class TextDataset(Dataset):
    def __init__(self, sequences, labels):
        self.sequences = sequences
        self.labels = labels

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return torch.tensor(self.sequences[idx], dtype=torch.long), torch.tensor(self.labels[idx], dtype=torch.long)

# Assuming you have labels in your DataFrame
train_labels = train_df["label"].tolist()
test_labels = test_df["label"].tolist()
validation_labels = validation_df["label"].tolist()

# Create dataset instances
train_dataset = TextDataset(train_sequences, train_labels)
test_dataset = TextDataset(test_sequences, test_labels)
validation_dataset = TextDataset(validation_sequences, validation_labels)

# Create DataLoader instances
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
validation_loader = DataLoader(validation_dataset, batch_size=32, shuffle=False)