* Training with DistilBERT
* Examining Tokenization

# Import libraries

In [1]:
import torch
import numpy as np
import pandas as pd
import lightning as L

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from pathlib import Path
import os

# Import data

In [3]:
Path.cwd()

PosixPath('/media/skesava/D/Training/MachineLearning/PyTorch_Lightning/NLP_using_LLMs/SentimentAnalysis')

In [4]:
# The processed IMDB CSVs live under a dataset folder three levels above the working directory.
data_directory = Path.cwd().parent.parent.parent / 'ML datasets/IMDB_SentimentAnalysis_TextData/aclImdb/processed_data/'
data_directory.exists()

True

### Pandas dataframe

In [5]:
# df_train = pd.read_csv(data_directory.joinpath("train.csv"))
# df_val = pd.read_csv(data_directory.joinpath("val.csv"))
# df_test = pd.read_csv(data_directory.joinpath("test.csv"))

In [6]:
# df_train.info()

### Using HuggingFace Dataset library

In [7]:
from datasets import load_dataset, Features, Value, ClassLabel

In [8]:
# Schema used when reading the CSVs: a free-text column plus a two-class label column.
columns_to_select = Features(
    {
        "text": Value(dtype='string', id=None),
        "label": ClassLabel(num_classes=2),
    }
)

In [9]:
# Read the three CSV splits from data_directory, typed with the columns_to_select schema.
imdb_dataset = load_dataset(
    path=str(data_directory),
    data_files=dict(train="train.csv", test="test.csv", val="val.csv"),
    features=columns_to_select,
)

In [10]:
imdb_dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 35000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 10000
    })
    val: Dataset({
        features: ['text', 'label'],
        num_rows: 5000
    })
})

In [11]:
type(imdb_dataset["train"])

datasets.arrow_dataset.Dataset

# Tokenization

In [12]:
from transformers import AutoTokenizer, DistilBertTokenizer

In [13]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [14]:
tokenizer.max_len_single_sentence

510

In [15]:
tokenizer.vocab_size

30522

In [16]:
tokenizer.verbose

False

In [17]:
tokenizer.model_max_length

512

In [18]:
tokenizer.model_input_names

['input_ids', 'attention_mask']

In [19]:
def tokenize_text(batch):
    """Tokenize the ``"text"`` entries of ``batch`` with the notebook-level ``tokenizer``.

    Truncation and padding are both switched on; the tokenizer's result is
    returned unchanged.
    """
    reviews = batch["text"]
    encoded = tokenizer(reviews, truncation=True, padding=True)
    return encoded

In [20]:
imdb_tokens = imdb_dataset.map(tokenize_text, batched=True, batch_size=None)
# batch_size=None hands each whole split to tokenize_text as a single batch,
# so padding=True pads every example of a split to one common length.

Map: 100%|███████████████████████| 10000/10000 [00:03<00:00, 2985.70 examples/s]


### Deleting the dataset after tokenization

In [21]:
del imdb_dataset

In [22]:
imdb_tokens.cache_files

{'train': [{'filename': '/home/skesava/.cache/huggingface/datasets/processed_data/default-afc46ccbcdb7d983/0.0.0/316f5994c628c49b/cache-dbda72b584173fd4.arrow'}],
 'test': [{'filename': '/home/skesava/.cache/huggingface/datasets/processed_data/default-afc46ccbcdb7d983/0.0.0/316f5994c628c49b/cache-3dad57f397a21204.arrow'}],
 'val': [{'filename': '/home/skesava/.cache/huggingface/datasets/processed_data/default-afc46ccbcdb7d983/0.0.0/316f5994c628c49b/cache-842e0b75e7af8836.arrow'}]}

In [23]:
imdb_tokens.column_names

{'train': ['text', 'label', 'input_ids', 'attention_mask'],
 'test': ['text', 'label', 'input_ids', 'attention_mask'],
 'val': ['text', 'label', 'input_ids', 'attention_mask']}

### Setting framework format for training

In [24]:
imdb_tokens.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

In [25]:
imdb_tokens.cache_files

{'train': [{'filename': '/home/skesava/.cache/huggingface/datasets/processed_data/default-afc46ccbcdb7d983/0.0.0/316f5994c628c49b/cache-dbda72b584173fd4.arrow'}],
 'test': [{'filename': '/home/skesava/.cache/huggingface/datasets/processed_data/default-afc46ccbcdb7d983/0.0.0/316f5994c628c49b/cache-3dad57f397a21204.arrow'}],
 'val': [{'filename': '/home/skesava/.cache/huggingface/datasets/processed_data/default-afc46ccbcdb7d983/0.0.0/316f5994c628c49b/cache-842e0b75e7af8836.arrow'}]}

* Nothing changed: the cache files listed after `set_format` are the same ones as before

### *Exploring tokenized data*

In [26]:
imdb_tokens.data

{'train': MemoryMappedTable
 text: string
 label: int64
 input_ids: list<item: int32>
   child 0, item: int32
 attention_mask: list<item: int8>
   child 0, item: int8
 ----
 text: [["Robert Duvall is a direct descendent of Confederate General Robert E. Lee, according the IMDb.com movie database. After seeing this film, you may think Duvall's appearance is reincarnation at it's best. One of my most favorite films. I wish the composer, Peter Rodgers Melnick had a CD or there was a soundtrack available. Wonderful scenery and music and "all too-true-to-life," especially for those of us that live in, or have moved to, the South. This is a "real moment in time." Life moves on, slowly, but "strangers we do not remain."","The Railway Children, at least this 1970 movie version written and directed by that long-time British character actor, Lionel Jeffries, is an unmitigated...classic. It tells a childhood story with great simplicity and charm; the sentimentality is muted; the evocation of child

In [27]:
test_data = imdb_tokens['test'].data

In [28]:
test_data.column_names

['text', 'label', 'input_ids', 'attention_mask']

In [29]:
test_data.nbytes/(1024*1024)

37.308979988098145

In [30]:
test_data.num_rows

10000

In [31]:
test_data.path

'/home/skesava/.cache/huggingface/datasets/processed_data/default-afc46ccbcdb7d983/0.0.0/316f5994c628c49b/cache-3dad57f397a21204.arrow'

In [32]:
test_data.fast_gather([0])

pyarrow.Table
text: string
label: int64
input_ids: list<item: int32>
  child 0, item: int32
attention_mask: list<item: int8>
  child 0, item: int8
----
text: [["I have to say, when "Pushing Daisies" came out I was immediately won out by the fairy-tale like setting of such grimness. The narrator made a cake out of the whole ordeal by making death seem as routine as, well, Ned (Lee Pace) baking pies. And that bringing them back to life was just as routine.<br /><br />The trio of Ned, Emerson Cod (Chi McBride) and Charlotte "Chuck" Charles (Anna Friel), plus sometimes-sidekick Olive Snook (Kristin Chenoweth, who made the musical Wicked such a delight) made for some fantastic dialogue and silliness. It definitely deserves the title of a (romantic) comedy/drama.<br /><br />Ned and Chuck made for a shy and not-quite-ready-for-love couple who are still exploring their feelings even though they cannot touch - an obstacle that seems to be truly no obstacle with aids such as gloves, cellophane, 

In [33]:
sample = test_data.fast_gather([9999])
sample

pyarrow.Table
text: string
label: int64
input_ids: list<item: int32>
  child 0, item: int32
attention_mask: list<item: int8>
  child 0, item: int8
----
text: [["The story for Hare Rama Hare Krishna actually came to Dev Anand's mind when he saw hippies and their fallen values in Kathmandu where he was on a visit after the protests against his previous Prem Pujari in Calcutta. He was low in spirits because his film had been opposed and some had burnt Prem Pujari's posters. But the life of hippies re ignited a story in Dev's mind to be made into a film.<br /><br />This was Dev Anand's perhaps best directorial effort. The film was a blockbuster super hit at the box office and Zeenat Aman as Dev's sister made a tremendous impact.<br /><br />This film was Dev Anand's call to the nation to keep up their moral values.<br /><br />It is about a Montereal based Indian family and the brother's role is a very affectionate one for his sister. But the parents quarrel and separate leaving Prashant(Dev

In [34]:
sample.get_total_buffer_size()/(1024*1024)

3.7312192916870117

In [35]:
sample.column('input_ids')

<pyarrow.lib.ChunkedArray object at 0x79a036be90c0>
[
  [
    [
      101,
      1996,
      2466,
      2005,
      14263,
      ...
      8754,
      2923,
      2143,
      2453,
      102
    ]
  ]
]

In [36]:
sample_column = sample.column('attention_mask')
sample_column

<pyarrow.lib.ChunkedArray object at 0x79a036be9480>
[
  [
    [
      1,
      1,
      1,
      1,
      1,
      ...
      1,
      1,
      1,
      1,
      1
    ]
  ]
]

In [37]:
sample_column.to_numpy()

array([array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
              1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
              1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
              1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
              1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
              1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
              1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
              1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
              1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
              1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
              1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
              1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
              1, 1, 1, 1, 1, 1, 1, 1, 1,

In [38]:
sample_column.to_numpy()[0].shape

(512,)

In [39]:
sample2 = test_data.fast_gather([100])
sample2

pyarrow.Table
text: string
label: int64
input_ids: list<item: int32>
  child 0, item: int32
attention_mask: list<item: int8>
  child 0, item: int8
----
text: [["I was hoping to like this movie, to settle in for an evening of goofy fun. I like Judy Davis and Juliette Lewis, and the premise seemed off the wall enough to be entertaining.<br /><br />Unfortunately, I found myself dozing over and over again. Judy Davis gave a fine performance, but had very little to work with. Juliette Lewis was fabulous as expected, but had very little to do. The plot was full of "twists" that were just plain silly, and as so often happens in movies of this type, nobody acted the way a real human being would act. And, personally, I thought Marcia Gay Harden was totally miscast.<br /><br />The movie also seemed to shift about midway from a black comedy with touches of farce to a total farce with touches of black comedy. One reviewer here notes that other reviews seem to want this movie to be something differ

In [40]:
sample2_column1 = sample2.column('input_ids').to_numpy()[0]
sample2_column2 = sample2.column('attention_mask').to_numpy()[0]

In [41]:
sample2_column1

array([  101,  1045,  2001,  5327,  2000,  2066,  2023,  3185,  1010,
        2000,  7392,  1999,  2005,  2019,  3944,  1997, 27243,  4569,
        1012,  1045,  2066, 12120,  4482,  1998, 24696,  4572,  1010,
        1998,  1996, 18458,  2790,  2125,  1996,  2813,  2438,  2000,
        2022, 14036,  1012,  1026,  7987,  1013,  1028,  1026,  7987,
        1013,  1028,  6854,  1010,  1045,  2179,  2870,  2079,  6774,
        2058,  1998,  2058,  2153,  1012, 12120,  4482,  2435,  1037,
        2986,  2836,  1010,  2021,  2018,  2200,  2210,  2000,  2147,
        2007,  1012, 24696,  4572,  2001, 18783,  2004,  3517,  1010,
        2021,  2018,  2200,  2210,  2000,  2079,  1012,  1996,  5436,
        2001,  2440,  1997,  1000, 21438,  1000,  2008,  2020,  2074,
        5810, 10021,  1010,  1998,  2004,  2061,  2411,  6433,  1999,
        5691,  1997,  2023,  2828,  1010,  6343,  6051,  1996,  2126,
        1037,  2613,  2529,  2108,  2052,  2552,  1012,  1998,  1010,
        7714,  1010,

In [42]:
sample2_column2

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [43]:
sample2_column1[sample2_column1 != 0].size

252

In [44]:
sample2_column2[sample2_column2 != 0].size

252

In [45]:
sample2_column1[sample2_column2 != 0]

array([  101,  1045,  2001,  5327,  2000,  2066,  2023,  3185,  1010,
        2000,  7392,  1999,  2005,  2019,  3944,  1997, 27243,  4569,
        1012,  1045,  2066, 12120,  4482,  1998, 24696,  4572,  1010,
        1998,  1996, 18458,  2790,  2125,  1996,  2813,  2438,  2000,
        2022, 14036,  1012,  1026,  7987,  1013,  1028,  1026,  7987,
        1013,  1028,  6854,  1010,  1045,  2179,  2870,  2079,  6774,
        2058,  1998,  2058,  2153,  1012, 12120,  4482,  2435,  1037,
        2986,  2836,  1010,  2021,  2018,  2200,  2210,  2000,  2147,
        2007,  1012, 24696,  4572,  2001, 18783,  2004,  3517,  1010,
        2021,  2018,  2200,  2210,  2000,  2079,  1012,  1996,  5436,
        2001,  2440,  1997,  1000, 21438,  1000,  2008,  2020,  2074,
        5810, 10021,  1010,  1998,  2004,  2061,  2411,  6433,  1999,
        5691,  1997,  2023,  2828,  1010,  6343,  6051,  1996,  2126,
        1037,  2613,  2529,  2108,  2052,  2552,  1012,  1998,  1010,
        7714,  1010,

#### NOTE

* Each token is given an attention value of 1 in the tokenized data
* The padded values are given 0
* The tokenized data is stored in pyarrow format

In [46]:
type(imdb_tokens)

datasets.dataset_dict.DatasetDict

In [47]:
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Turns off the Hugging Face tokenizers library's internal parallelism; presumably set
# here to avoid its fork warning once DataLoader worker processes start — TODO confirm

# DataLoader for training

In [48]:
from torch.utils.data import DataLoader, Dataset

## First: Dataset creation

In [49]:
from datasets.dataset_dict import DatasetDict

In [50]:
class IMDBDataset(Dataset):
    """Map-style torch ``Dataset`` exposing a single split of a ``DatasetDict``."""

    def __init__(self, huggingface_dataset_dict: DatasetDict, partition: str):
        """Keep a reference to the split stored under the key ``partition``."""
        super().__init__()
        selected_split = huggingface_dataset_dict[partition]
        self.partition = selected_split

    def __len__(self):
        """Number of rows, as reported by the split's ``num_rows`` attribute."""
        row_count = self.partition.num_rows
        return row_count

    def __getitem__(self, index):
        """Return whatever the underlying split yields for ``index``."""
        item = self.partition[index]
        return item

In [51]:
imdb_tokens.keys()

dict_keys(['train', 'test', 'val'])

In [52]:
# One torch Dataset wrapper per split of the tokenized DatasetDict.
train_dataset, val_dataset, test_dataset = (
    IMDBDataset(imdb_tokens, split_name) for split_name in ("train", "val", "test")
)

In [53]:
import sys

In [54]:
sys.getsizeof(train_dataset)

56

In [55]:
sys.getsizeof(imdb_tokens)

208

#### NOTE
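* Clearly, the torch Datasets and HF Datasets are backed by the Arrow cache files on disk and are not held fully in memory (note that `sys.getsizeof` only reports the shallow size of the wrapper object)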

## Second: Dataloaders

In [56]:
batch_size = 16

#### Question
* Why 16?

In [57]:
# Training loader: shuffling enabled, four worker processes.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    num_workers=4,
    shuffle=True,
)

In [58]:
# Validation loader: no shuffle argument given, four worker processes.
val_dataloader = DataLoader(
    dataset=val_dataset,
    num_workers=4,
    batch_size=batch_size,
)

In [59]:
# Test loader: no shuffle argument given, four worker processes.
test_dataloader = DataLoader(
    dataset=test_dataset,
    num_workers=4,
    batch_size=batch_size,
)

# Initialising DistilBERT

In [60]:
from transformers import AutoModelForSequenceClassification

In [61]:
data_directory

PosixPath('/media/skesava/D/Training/MachineLearning/ML datasets/IMDB_SentimentAnalysis_TextData/aclImdb/processed_data')

In [62]:
# Model cache folder: "LLM_models", four directory levels above data_directory.
model_directory = data_directory.parent.parent.parent.parent.joinpath("LLM_models")
# exist_ok=True makes creation idempotent: no separate exists() check is needed and
# re-running the cell does not fail when the folder is already there.
model_directory.mkdir(exist_ok=True)

In [63]:
# Load the distilbert-base-uncased checkpoint with a two-label classification head;
# files fetched for it are cached under model_directory.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
    cache_dir=model_directory,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## *Exploring the initialised model*

In [64]:
model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [79]:
type(model)

transformers.models.distilbert.modeling_distilbert.DistilBertForSequenceClassification

In [65]:
model.num_parameters()

66955010

In [66]:
model.base_model

DistilBertModel(
  (embeddings): Embeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (transformer): Transformer(
    (layer): ModuleList(
      (0-5): 6 x TransformerBlock(
        (attention): DistilBertSdpaAttention(
          (dropout): Dropout(p=0.1, inplace=False)
          (q_lin): Linear(in_features=768, out_features=768, bias=True)
          (k_lin): Linear(in_features=768, out_features=768, bias=True)
          (v_lin): Linear(in_features=768, out_features=768, bias=True)
          (out_lin): Linear(in_features=768, out_features=768, bias=True)
        )
        (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (ffn): FFN(
          (dropout): Dropout(p=0.1, inplace=False)
          (lin1): Linear(in_features=768, out_features=3072, bias=True)
          (lin2): L

In [67]:
model.base_model_prefix

'distilbert'

In [68]:
model.classifier

Linear(in_features=768, out_features=2, bias=True)

In [69]:
model.name_or_path

'distilbert-base-uncased'

In [70]:
model.pre_classifier

Linear(in_features=768, out_features=768, bias=True)

In [71]:
model.num_labels

2

In [72]:
model.is_parallelizable

False

In [73]:
model.model_tags

In [74]:
model.framework

'pt'

### Accessing Transformer layer

In [75]:
model.distilbert.transformer.layer

ModuleList(
  (0-5): 6 x TransformerBlock(
    (attention): DistilBertSdpaAttention(
      (dropout): Dropout(p=0.1, inplace=False)
      (q_lin): Linear(in_features=768, out_features=768, bias=True)
      (k_lin): Linear(in_features=768, out_features=768, bias=True)
      (v_lin): Linear(in_features=768, out_features=768, bias=True)
      (out_lin): Linear(in_features=768, out_features=768, bias=True)
    )
    (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (ffn): FFN(
      (dropout): Dropout(p=0.1, inplace=False)
      (lin1): Linear(in_features=768, out_features=3072, bias=True)
      (lin2): Linear(in_features=3072, out_features=768, bias=True)
      (activation): GELUActivation()
    )
    (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  )
)

In [76]:
model.distilbert.transformer.layer[0]

TransformerBlock(
  (attention): DistilBertSdpaAttention(
    (dropout): Dropout(p=0.1, inplace=False)
    (q_lin): Linear(in_features=768, out_features=768, bias=True)
    (k_lin): Linear(in_features=768, out_features=768, bias=True)
    (v_lin): Linear(in_features=768, out_features=768, bias=True)
    (out_lin): Linear(in_features=768, out_features=768, bias=True)
  )
  (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (ffn): FFN(
    (dropout): Dropout(p=0.1, inplace=False)
    (lin1): Linear(in_features=768, out_features=3072, bias=True)
    (lin2): Linear(in_features=3072, out_features=768, bias=True)
    (activation): GELUActivation()
  )
  (output_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
)

## Customise

In [77]:
# Freeze the whole network: Module.requires_grad_ sets the flag on every parameter.
# The trailing semicolon stops the notebook from echoing the returned model.
model.requires_grad_(False);

In [78]:
# Re-enable gradients for the two head layers (pre_classifier and classifier) so that
# only they are updated during fine-tuning.
for head_layer in (model.pre_classifier, model.classifier):
    head_layer.requires_grad_(True)