In [1]:
import numpy as np

# Hugging Face Datasets

In [2]:
from datasets import load_dataset

# Load the dataset published under the name "emotion"; the result is inspected in the next cell.
emotions=load_dataset("emotion")

Using the latest cached version of the module from C:\Users\amrul\.cache\huggingface\modules\datasets_modules\datasets\emotion\348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705 (last modified on Fri Mar 18 22:16:45 2022) since it couldn't be found locally at emotion., or remotely on the Hugging Face Hub.
Using custom data configuration default
Reusing dataset emotion (C:\Users\amrul\.cache\huggingface\datasets\emotion\default\0.0.0\348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
# Explore the emotions dataset by displaying it as the cell result.
# It is a DatasetDict with the keys train, validation and test.
# Each value is a Dataset whose features (columns) are a text and a label.
emotions

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [5]:
# Goal: show the human-readable name of a label in the emotions dataset.
# train_set.features["label"] is a ClassLabel (its type is printed below);
# its int2str method maps an integer label id to the class name.

train_set = emotions["train"]
label_feature = train_set.features["label"]
print(f"the type of train_set['features']['label'] : {type(label_feature)}")
print(f"label_feature.int2str(0) : {label_feature.int2str(0)}")
print(f"number of classes in label_feature : {label_feature.num_classes}")
print(f"name of classes in label_feature : {label_feature.names}")

the type of train_set['features']['label'] : <class 'datasets.features.features.ClassLabel'>
label_feature.int2str(0) : sadness
number of classes in label_feature : 6
name of classes in label_feature : ['sadness', 'joy', 'love', 'anger', 'fear', 'surprise']


In [6]:
# Print the feature descriptor of the "text" column to see how it is typed.
text_feature = train_set.features["text"]
print(f"text_feature : {text_feature}")

text_feature : Value(dtype='string', id=None)


In [7]:
# Pull the whole "text" column into memory to inspect its type and size.
train_texts = train_set["text"]
print(f"type of train_text : {type(train_texts)}")
print(f"there are {len(train_texts)} elements in train_texts")

# Read the label from row 0 directly instead of materialising the entire
# "label" column just to take its first element.
first_label = label_feature.int2str(train_set[0]["label"])
print(f"first text in train_texts list : {train_texts[0]} and its label : {first_label}")

type of train_text : <class 'list'>
there are 16000 elements in train_texts
first text in train_texts list : i didnt feel humiliated and its label : sadness


In [10]:
import random

# Pick one training example uniformly at random and show its text and label name.
train_set = emotions["train"]
# random.randrange excludes the upper bound, so the index is always valid.
# random.randint(0, len(train_set)) includes len(train_set) itself and can
# therefore produce an out-of-range index (IndexError).
_random_idx = random.randrange(len(train_set))
_random_emotion_data = train_set[_random_idx]
print("text : ",_random_emotion_data["text"],"\n","label : ",label_feature.int2str(_random_emotion_data["label"]))

text :  i didn t feel very reassured by her tone but i understand this is a big shock and adjustment for everyone 
 label :  joy


   # Import Tokenizer and DistilBert model

It is important to use the tokenizer that belongs to the pretrained model. Otherwise the token ids will not match the model's vocabulary and the pretrained token representations become meaningless.

In [12]:
from transformers import AutoTokenizer

In [13]:
# We will use DistilBERT, which is a smaller version of BERT, to classify emotion text.
# The same checkpoint name is reused further down when the model itself is loaded.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

ValueError: Connection error, and we cannot find the requested files in the cached path. Please try again or make sure your Internet connection is on.

In a similar manner we can load transformer models of interest by passing model name to AutoModel.from_pretrained(<model_name>)

In [34]:
from transformers import AutoModel
import torch

In [35]:
# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained encoder and place its weights on the selected device.
model = AutoModel.from_pretrained(model_name)
model = model.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Above we are checking if GPU is available. If not we are loading the model to CPU device

To warm up let's extract the last hidden states for a simple string

In [36]:
# In the example below we use tokenizer.encode to turn a long text into a sequence of token ids.
# Because return_tensors="pt" is passed, the ids come back as a PyTorch tensor.
# The tensor is then moved to the selected device with .to(device).
text = "NLP will completely transform our understanding of machines speaking a language. We will come into realization that machines can master language better than humans."
tokens=tokenizer.encode(text, return_tensors="pt").to(device)

In [40]:
# Show the raw token ids produced by tokenizer.encode.
print(f"tokens : {tokens}")

tokens : tensor([[  101, 17953,  2361,  2097,  3294, 10938,  2256,  4824,  1997,  6681,
          4092,  1037,  2653,  1012,  2057,  2097,  2272,  2046, 12393,  2008,
          6681,  2064,  3040,  2653,  2488,  2084,  4286,  1012,   102]])


In [37]:
# Shape of the encoded tensor, followed by the input names the tokenizer reports for the model.
print(tokens.shape)
print(tokenizer.model_input_names)

torch.Size([1, 29])
['input_ids', 'attention_mask']


```return_tensors="pt"``` makes the tokenizer return the token ids as PyTorch tensors; the ```.to(device)``` call then moves them to the same device as the model.

In [38]:
# If we call the tokenizer itself and pass the text to it, it returns a dictionary-like
# object with the keys input_ids and attention_mask (see the output below).
inputs=tokenizer(text, return_tensors="pt")
inputs

{'input_ids': tensor([[  101, 17953,  2361,  2097,  3294, 10938,  2256,  4824,  1997,  6681,
          4092,  1037,  2653,  1012,  2057,  2097,  2272,  2046, 12393,  2008,
          6681,  2064,  3040,  2653,  2488,  2084,  4286,  1012,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1]])}

In [39]:
def view_tokens(tokenizer,tokens):
    """Print every token id next to the text it decodes to, one pair per line.

    Args:
        tokenizer: object exposing ``decode(token)``.
        tokens: iterable of token ids; each id is decoded on its own.
    """
    for token_id in tokens:
        decoded_piece = tokenizer.decode(token_id)
        print(token_id, decoded_piece)

# tokens has a leading batch dimension of size 1, so tokens[0] is the single encoded sequence.
view_tokens(tokenizer,tokens[0])

tensor(101) [CLS]
tensor(17953) nl
tensor(2361) ##p
tensor(2097) will
tensor(3294) completely
tensor(10938) transform
tensor(2256) our
tensor(4824) understanding
tensor(1997) of
tensor(6681) machines
tensor(4092) speaking
tensor(1037) a
tensor(2653) language
tensor(1012) .
tensor(2057) we
tensor(2097) will
tensor(2272) come
tensor(2046) into
tensor(12393) realization
tensor(2008) that
tensor(6681) machines
tensor(2064) can
tensor(3040) master
tensor(2653) language
tensor(2488) better
tensor(2084) than
tensor(4286) humans
tensor(1012) .
tensor(102) [SEP]


In [50]:
# Inference only: run the forward pass without tracking gradients
# (the batched forward pass later in this notebook does the same),
# then display the shape of the last hidden states.
with torch.no_grad():
    output=model(tokens)
output.last_hidden_state.shape

torch.Size([1, 29, 768])

Looking at the hidden states we can see that the tensor has the shape ```[batch_size,n_tokens,hid_dim]```. BERT generates a hidden state for each input token. Then it uses these hidden states to predict masked tokens. For classification tasks it is common to use the hidden state of the [CLS] token, the first token of every sequence, as the representation of the whole text.

# Tokenizing the whole dataset

```padding``` will pad each sequence with zeroes to the longest sequence in the batch. ```truncation``` will truncate at model's maximum context size.

In [51]:
def tokenize(tokenizer,batch):
    """Tokenize the ``"text"`` entry of ``batch`` with padding and truncation enabled.

    Args:
        tokenizer: callable accepting the texts plus ``padding``/``truncation`` keywords.
        batch: mapping that holds the texts under the key ``"text"``.

    Returns:
        Whatever ``tokenizer`` returns for those texts.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, padding=True)
    return encoded

In [52]:
# Try the helper on the first three training examples.
tokenize(tokenizer,emotions["train"][:3])

{'input_ids': [[101, 1045, 2134, 2102, 2514, 26608, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [101, 1045, 2064, 2175, 2013, 3110, 2061, 20625, 2000, 2061, 9636, 17772, 2074, 2013, 2108, 2105, 2619, 2040, 14977, 1998, 2003, 8300, 102], [101, 10047, 9775, 1037, 3371, 2000, 2695, 1045, 2514, 20505, 3308, 102, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]}

Above you will notice that the tokenizer, when called on a batch, returns an ```attention_mask``` in addition to the ```input_ids```. This is necessary so that the model doesn't get confused by the padding tokens and can ignore them when processing each text.

In [53]:
# Tokenize every split; the lambda binds the notebook's tokenizer as the first argument of tokenize.
# NOTE(review): the recorded run loaded cached results, presumably keyed on this callable —
# confirm before restructuring the call.
emotions_encoded=emotions.map(lambda batch : tokenize(tokenizer,batch),batched=True,batch_size=None)

Loading cached processed dataset at C:\Users\amrul\.cache\huggingface\datasets\emotion\default\0.0.0\348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705\cache-a2030fb1427d9b81.arrow
Loading cached processed dataset at C:\Users\amrul\.cache\huggingface\datasets\emotion\default\0.0.0\348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705\cache-59c2c3f8ffffd72b.arrow
Loading cached processed dataset at C:\Users\amrul\.cache\huggingface\datasets\emotion\default\0.0.0\348f63ca8e27b3713b6c04d723efe6d824a56fb3d1449794716c0f0296072705\cache-67e9f09415f9a25a.arrow


By default ```DatasetDict.map``` operates individually on every example in the corpus, so setting ```batched=True``` will encode the tweets in batches, while ```batch_size=None``` applies ```tokenize``` to each split as one single batch and ensures that the input tensors and attention masks have the same shape within a split. We can confirm that this operation added two new features to the dataset: ```input_ids``` and ```attention_mask```.

In [56]:
# Display the features of the encoded train split to confirm which columns exist after tokenization.
emotions_encoded["train"].features

{'text': Value(dtype='string', id=None),
 'label': ClassLabel(num_classes=6, names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

We can pass ```input_ids``` and ```attention_mask``` to the model in the manner shown below, here for the first five examples. Notice that we have to convert them into PyTorch tensors before passing them into the model.

In [57]:
# Run the encoder on the first five encoded training examples.
train_set=emotions_encoded["train"]
input_ids=train_set['input_ids']
attention_mask=train_set["attention_mask"]
with torch.no_grad():
    # The model was moved to `device`, so its inputs must be placed there too;
    # otherwise the forward pass fails whenever `device` is a GPU.
    # The mask is passed by keyword so it cannot end up in a different positional slot.
    output=model(
        input_ids=torch.tensor(input_ids[:5]).to(device),
        attention_mask=torch.tensor(attention_mask[:5]).to(device),
    )
last_hidden_state=output.last_hidden_state
print(last_hidden_state.shape)
# Bring the result back to host memory before converting it to NumPy.
lhs_np=last_hidden_state.cpu().numpy()
print(type(lhs_np))

torch.Size([5, 87, 768])
<class 'numpy.ndarray'>


What we really want are hidden states across the whole dataset. For this, we can use the ```DatasetDict.map``` function again!

In [105]:
def extract_hidden_states(batch):
    """Return the hidden state at sequence position 0 for every example in ``batch``.

    Relies on the notebook-level ``tokenizer``, ``model`` and ``device``.

    Args:
        batch: mapping of column name to values supporting ``.to(device)``; only the
            columns named in ``tokenizer.model_input_names`` are forwarded to the model.

    Returns:
        dict with the single key ``"hidden_state"`` holding ``last_hidden_state[:, 0]``
        as a NumPy array.
    """
    # keep only the columns the model accepts and move them to the model's device
    model_inputs = {}
    for column, values in batch.items():
        if column in tokenizer.model_input_names:
            model_inputs[column] = values.to(device)

    # forward pass without gradient tracking
    with torch.no_grad():
        hidden = model(**model_inputs).last_hidden_state

    # position 0 of each sequence holds the [CLS] vector
    cls_vectors = hidden[:, 0]
    return {"hidden_state": cls_vectors.cpu().numpy()}

In [106]:
# Request torch output for the model-input columns and the label;
# extract_hidden_states calls .to(device) on the batch values it forwards to the model.
# NOTE(review): the result is not assigned, so this presumably changes emotions_encoded in place — confirm.
emotions_encoded.set_format("torch", columns=["input_ids","attention_mask","label"])

In [107]:
# Apply extract_hidden_states batch-wise to every split; the returned "hidden_state" values
# are read back as a column in the following cells.
emotions_hidden = emotions_encoded.map(extract_hidden_states, batched=True)

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

**Create feature matrix**

In [109]:
# Features are the extracted hidden states, targets are the integer labels.
train_split = emotions_hidden["train"]
valid_split = emotions_hidden["validation"]

X_train, y_train = np.array(train_split["hidden_state"]), np.array(train_split["label"])
X_valid, y_valid = np.array(valid_split["hidden_state"]), np.array(valid_split["label"])

print(f"X_train shape : {X_train.shape}, X_valid shape : {X_valid.shape}")

X_train shape : (16000, 768), X_valid shape : (2000, 768)


In [110]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the hidden states (iteration cap raised to 3000).
lr_clf = LogisticRegression(max_iter=3000).fit(X_train, y_train)

# Score on the validation split; shown as the cell result.
lr_clf.score(X_valid, y_valid)

0.633