In [None]:
!pip install transformers==4.17

Collecting transformers==4.17
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
Collecting sacremoses (from transformers==4.17)
  Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sacremoses, transformers
  Attempting uninstall: transformers
    Found existing installation: transformers 4.35.2
    Uninstalling transformers-4.35.2:
      Successfully uninstalled transformers-4.35.2
Successfully installed sacremoses-0.1.1 transformers-4.17.0


In [None]:
import transformers,torch
from transformers import AutoModelForMaskedLM, AutoTokenizer,DataCollatorForLanguageModeling

In [None]:
transformers.__version__

'4.17.0'

In [None]:
!pip install -U accelerate



In [None]:

!pip install datasets evaluate transformers[sentencepiece]



In [None]:
from datasets import load_dataset

In [None]:
!pip install transformers[torch]

In [None]:
from transformers import TrainingArguments,Trainer

In [None]:
model_check='distilbert-base-uncased'

In [None]:
model=AutoModelForMaskedLM.from_pretrained(model_check)

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

In [None]:
tokenizer=AutoTokenizer.from_pretrained(model_check)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [None]:
# Smoke test of the pretrained checkpoint: rank the 5 best fillers for one masked sentence.
text = "wow what a [MASK]."
inputs = tokenizer(text, return_tensors='pt')
# This is inference only, so disable autograd: no graph is recorded for the logits.
with torch.no_grad():
  token_logits = model(**inputs).logits
# torch.where(condition) returns one index tensor per dimension; [1] keeps the
# second-dimension indices, i.e. the position(s) of the [MASK] token in the sequence.
mask_token_index = torch.where(inputs['input_ids'] == tokenizer.mask_token_id)[1]
mask_token_logits = token_logits[0, mask_token_index, :]
# Ids of the 5 highest-scoring entries for the first [MASK] position.
# (The name `too_tokens` is kept as-is because the following cell iterates over it.)
too_tokens = torch.topk(mask_token_logits, 5, dim=1).indices[0].tolist()

In [None]:
# Show the sentence once per candidate, with the mask token replaced by the decoded candidate.
for token_id in too_tokens:
  # Print the string itself: wrapping the expression in bare {...} built a one-element
  # set, so the cell displayed the set repr instead of the sentence.
  print(f">>> {text.replace(tokenizer.mask_token, tokenizer.decode([token_id]))}")

{'wow what a shame.'}
{'wow what a surprise.'}
{'wow what a mess.'}
{'wow what a coincidence.'}
{'wow what a bitch.'}


In [None]:
dataset=load_dataset('imdb')

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['text', 'label'],
        num_rows: 50000
    })
})

In [None]:
def tokenize_function(examples):
  """Tokenize the 'text' column of a batch and, for fast tokenizers, attach word ids.

  Args:
    examples: batch mapping that provides a 'text' entry to pass to the tokenizer.

  Returns:
    The tokenizer output; when `tokenizer.is_fast` is true it additionally carries
    a 'word_ids' entry with one `word_ids(i)` result per row of 'input_ids'.
  """
  encoded = tokenizer(examples['text'])
  if not tokenizer.is_fast:
    # word_ids is only requested when the tokenizer reports itself as fast.
    return encoded
  n_rows = len(encoded['input_ids'])
  encoded['word_ids'] = [encoded.word_ids(row) for row in range(n_rows)]
  return encoded

In [None]:
tokenized_data=dataset.map(tokenize_function,batched=True,remove_columns=['text','label'])

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (720 > 512). Running this sequence through the model will result in indexing errors


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 25000
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids'],
        num_rows: 50000
    })
})

In [None]:
# Length (in tokens) of every chunk produced by group_texts.
chunk_size=128
def group_texts(examples):
  """Concatenate every column of a batch and re-split it into chunks of `chunk_size`.

  Args:
    examples: mapping from column name to a list of per-example lists
      (must contain 'input_ids').

  Returns:
    dict with the same columns, each a list of `chunk_size`-long lists, plus a
    'labels' column that is a copy of the chunked 'input_ids'. Items past the last
    full chunk are dropped.
  """
  # Flatten each column in a single pass. sum(list_of_lists, []) re-copies the
  # accumulated list on every addition, which is quadratic in the batch's token count.
  concatenate = {k: [item for seq in examples[k] for item in seq]
                 for k in examples.keys()}
  total_len = len(concatenate[list(examples.keys())[0]])
  # Round down to a whole number of chunks; the remainder is discarded.
  total_len = (total_len // chunk_size) * chunk_size
  result = {k: [t[i:i + chunk_size] for i in range(0, total_len, chunk_size)]
            for k, t in concatenate.items()}
  # Labels start out identical to the inputs.
  result["labels"] = result["input_ids"].copy()
  return result


In [None]:
lm_dataset=tokenized_data.map(group_texts,batched=True)


Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [None]:
lm_dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 61291
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 59904
    })
    unsupervised: Dataset({
        features: ['input_ids', 'attention_mask', 'word_ids', 'labels'],
        num_rows: 122957
    })
})

In [None]:
tokenizer.decode(lm_dataset['train'][1]['input_ids'])

"as the vietnam war and race issues in the united states. in between asking politicians and ordinary denizens of stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men. < br / > < br / > what kills me about i am curious - yellow is that 40 years ago, this was considered pornographic. really, the sex and nudity scenes are few and far between, even then it's not shot like some cheaply made porno. while my countrymen mind find it shocking, in reality sex and nudity are a major staple in swedish cinema. even ingmar bergman,"

In [None]:
tokenizer.decode(lm_dataset['train'][1]['labels'])

"as the vietnam war and race issues in the united states. in between asking politicians and ordinary denizens of stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men. < br / > < br / > what kills me about i am curious - yellow is that 40 years ago, this was considered pornographic. really, the sex and nudity scenes are few and far between, even then it's not shot like some cheaply made porno. while my countrymen mind find it shocking, in reality sex and nudity are a major staple in swedish cinema. even ingmar bergman,"

In [None]:
data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer,mlm_probability=0.15)

In [None]:
samples=[lm_dataset['train'][i] for i in range(2)]

In [None]:
# Remove the 'word_ids' entry from each sample before handing them to data_collator
# (presumably because the stock collator cannot batch that column; TODO confirm).
for sample in samples:
  # The default makes this cell safe to re-run: a plain pop('word_ids') raises
  # KeyError the second time because the key is already gone.
  sample.pop('word_ids', None)

In [None]:
# Run the samples through the collator once, then decode every row of the
# resulting 'input_ids' so the inserted mask tokens are visible.
collated = data_collator(samples)
for chunk in collated['input_ids']:
  decoded = tokenizer.decode(chunk)
  print(decoded)


[CLS] i [MASK] i am directing - yellow from my video store because of [MASK] the controversy that surrounded it when it was [MASK] released in wei. i also heard that at first it was seized by u. s. [MASK] if it ever tried [MASK] enter [MASK] country, therefore being a [MASK] of films considered " [MASK] [MASK] i really had to see this [MASK] myself. < br / > < [MASK] / [MASK]zak plot is centered around a young swedish drama student named lena who wants to learn everything she can about life [MASK] in particular she wants to focus her attentions to making some sort of documentary on what the average [MASK]ede [MASK] about certain political issues such
as the vietnam war and race issues in the united states. in between asking politicians and ordinary denizens [MASK] stockholm about their opinions on politics, she has sex with her drama teacher, classmates, and married men. < br / > < br / > what kills me about i am curious - yellow is [MASK] [MASK] [MASK] ago, this was considered pornogr

In [None]:
import collections
import numpy as np
from transformers import default_data_collator

In [None]:
# Probability with which each whole word is selected for masking.
probs=0.2
def whole_word_masking(features):
  """Collate `features` after masking randomly chosen whole words.

  For every feature, token positions are grouped by word (using 'word_ids'), each
  word is selected with probability `probs`, and all tokens of a selected word are
  replaced by the tokenizer's mask token. Labels keep the original token id at
  masked positions and are -100 everywhere else.

  Args:
    features: list of dicts, each with 'word_ids', 'input_ids' and 'labels'.
      Each dict is modified in place: 'word_ids' is removed, 'input_ids' is
      overwritten at masked positions and 'labels' is replaced.

  Returns:
    The result of `default_data_collator(features)`.

  Raises:
    KeyError: if a feature lacks 'word_ids', 'input_ids' or 'labels'.
  """
  for feature in features:
    word_ids = feature.pop('word_ids')

    # Map consecutive word index -> positions of the tokens that make up that word.
    mapping = collections.defaultdict(list)
    current_word_idx = -1
    current_word = None
    # Walk the full sequence (an earlier debug version stopped after 5 tokens
    # and printed every index).
    for idx, word_id in enumerate(word_ids):
      # Positions whose word id is None belong to no word and are never masked.
      if word_id is not None:
        if word_id != current_word:
          current_word = word_id
          current_word_idx += 1
        mapping[current_word_idx].append(idx)

    # One Bernoulli draw per word decides whether the whole word is masked.
    mask = np.random.binomial(1, probs, (len(mapping),))
    input_ids = feature['input_ids']  # was misspelled 'inputs_ids' -> KeyError
    labels = feature['labels']
    # -100 everywhere except masked positions (the value the loss is expected to ignore).
    new_labels = [-100] * len(labels)
    for word_id in np.where(mask)[0]:
      word_id = word_id.item()
      for idx in mapping[word_id]:
        new_labels[idx] = labels[idx]
        input_ids[idx] = tokenizer.mask_token_id
    feature['labels'] = new_labels

  return default_data_collator(features)

In [None]:
# Down-sample the chunked training split: 10,000 chunks for training and a
# held-out set one tenth that size, drawn with a fixed seed.
train_size = 10000
test_size = int(0.1 * train_size)
down_dataset = lm_dataset['train'].train_test_split(
    seed=42,
    test_size=test_size,
    train_size=train_size,
)

In [None]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
batch_size = 64
# Last path component of the checkpoint id, used to name the output directory.
model_name = model_check.split("/")[-1]
# Number of batches in one pass over the training split, so the training loss
# is logged once per epoch.
logging_steps = len(down_dataset["train"]) // batch_size

training_args = TrainingArguments(
    # Output location and publishing
    output_dir=f"{model_name}-finetuned-imdb",
    overwrite_output_dir=True,
    push_to_hub=True,
    # Optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    fp16=True,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # Evaluation and logging cadence
    evaluation_strategy="epoch",
    logging_steps=logging_steps,
)

In [None]:
# Wire the model, the down-sampled splits and the masking collator into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=down_dataset["train"],
    eval_dataset=down_dataset["test"],
)

For more details, please read https://huggingface.co/docs/huggingface_hub/concepts/git_vs_http.
Cloning https://huggingface.co/Sabbasi-11/distilbert-base-uncased-finetuned-imdb into local empty directory.
Using amp half precision backend


In [None]:
trainer.train()

The following columns in the training set  don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 10000
  Num Epochs = 3
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 471


Epoch,Training Loss,Validation Loss
1,2.6954,2.524316
2,2.563,2.473834
3,2.5258,2.436905


The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64
The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64


Training completed. Do not forge

TrainOutput(global_step=471, training_loss=2.5955933248920804, metrics={'train_runtime': 173.3817, 'train_samples_per_second': 173.029, 'train_steps_per_second': 2.717, 'total_flos': 994208670720000.0, 'train_loss': 2.5955933248920804, 'epoch': 3.0})

In [None]:
import math
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

The following columns in the evaluation set  don't have a corresponding argument in `DistilBertForMaskedLM.forward` and have been ignored: word_ids. If word_ids are not expected by `DistilBertForMaskedLM.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 1000
  Batch size = 64


In [None]:
# Print the unrounded perplexity (exp of the evaluation loss). The value is passed
# directly: wrapping it in bare {...} built a one-element set and printed its repr.
print(math.exp(eval_results['eval_loss']))

In [None]:
v=eval_results['eval_loss']

In [None]:
eval_results['eval_loss']

2.4159090518951416

In [None]:
#perplexity score

math.exp(v)

11.199947064378936

In [None]:
trainer.push_to_hub()

Saving model checkpoint to distilbert-base-uncased-finetuned-imdb
Configuration saved in distilbert-base-uncased-finetuned-imdb/config.json
Model weights saved in distilbert-base-uncased-finetuned-imdb/pytorch_model.bin
tokenizer config file saved in distilbert-base-uncased-finetuned-imdb/tokenizer_config.json
Special tokens file saved in distilbert-base-uncased-finetuned-imdb/special_tokens_map.json


Upload file pytorch_model.bin:   0%|          | 1.00/256M [00:00<?, ?B/s]

Upload file runs/Dec14_06-45-18_62923d7857bd/events.out.tfevents.1702536800.62923d7857bd.4016.2:   0%|        …

Upload file training_args.bin:   0%|          | 1.00/3.43k [00:00<?, ?B/s]

Upload file runs/Dec14_06-45-18_62923d7857bd/1702536390.4655473/events.out.tfevents.1702536390.62923d7857bd.40…

Upload file runs/Dec14_06-45-18_62923d7857bd/events.out.tfevents.1702536390.62923d7857bd.4016.0:   0%|        …

To https://huggingface.co/Sabbasi-11/distilbert-base-uncased-finetuned-imdb
   1902bb1..0d11fb9  main -> main

   1902bb1..0d11fb9  main -> main

Dropping the following result as it does not have all the necessary fields:
{'task': {'name': 'Masked Language Modeling', 'type': 'fill-mask'}, 'dataset': {'name': 'imdb', 'type': 'imdb', 'args': 'plain_text'}}
To https://huggingface.co/Sabbasi-11/distilbert-base-uncased-finetuned-imdb
   0d11fb9..0dea39c  main -> main

   0d11fb9..0dea39c  main -> main



'https://huggingface.co/Sabbasi-11/distilbert-base-uncased-finetuned-imdb/commit/0d11fb97179ff4b3772c32e8656965e4956693ca'

In [None]:
from transformers import pipeline

In [None]:
# NOTE(review): this id points at the "huggingface-course" checkpoint, not the
# repository this notebook pushed to earlier; confirm that is intended.
fill_mask_checkpoint = "huggingface-course/distilbert-base-uncased-finetuned-imdb"
mask_filler = pipeline("fill-mask", model=fill_mask_checkpoint)

https://huggingface.co/huggingface-course/distilbert-base-uncased-finetuned-imdb/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpmu1t4z7j


Downloading:   0%|          | 0.00/557 [00:00<?, ?B/s]

storing https://huggingface.co/huggingface-course/distilbert-base-uncased-finetuned-imdb/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/bc4545eb3fd5fa91ded7b52c820d769ab2791e220c5fc4a098863c3ffba814d3.d93697f2459b6c5b21e398760e556bd3849c8d50e99fb4992d5ede6b6dfbfdc1
creating metadata file for /root/.cache/huggingface/transformers/bc4545eb3fd5fa91ded7b52c820d769ab2791e220c5fc4a098863c3ffba814d3.d93697f2459b6c5b21e398760e556bd3849c8d50e99fb4992d5ede6b6dfbfdc1
loading configuration file https://huggingface.co/huggingface-course/distilbert-base-uncased-finetuned-imdb/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/bc4545eb3fd5fa91ded7b52c820d769ab2791e220c5fc4a098863c3ffba814d3.d93697f2459b6c5b21e398760e556bd3849c8d50e99fb4992d5ede6b6dfbfdc1
Model config DistilBertConfig {
  "_name_or_path": "huggingface-course/distilbert-base-uncased-finetuned-imdb",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "att

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

storing https://huggingface.co/huggingface-course/distilbert-base-uncased-finetuned-imdb/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/68293fbe100166528ae658fb82fba8342643c8286ef9c0bd1b68cd36f06e32c4.0f49b33894172bcdf49f481f47af60206c0432545e87f4e5599b4dafc0234557
creating metadata file for /root/.cache/huggingface/transformers/68293fbe100166528ae658fb82fba8342643c8286ef9c0bd1b68cd36f06e32c4.0f49b33894172bcdf49f481f47af60206c0432545e87f4e5599b4dafc0234557
loading weights file https://huggingface.co/huggingface-course/distilbert-base-uncased-finetuned-imdb/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/68293fbe100166528ae658fb82fba8342643c8286ef9c0bd1b68cd36f06e32c4.0f49b33894172bcdf49f481f47af60206c0432545e87f4e5599b4dafc0234557
All model checkpoint weights were used when initializing DistilBertForMaskedLM.

All the weights of DistilBertForMaskedLM were initialized from the model checkpoint at huggingface-course/dis

Downloading:   0%|          | 0.00/333 [00:00<?, ?B/s]

storing https://huggingface.co/huggingface-course/distilbert-base-uncased-finetuned-imdb/resolve/main/tokenizer_config.json in cache at /root/.cache/huggingface/transformers/0e75ce48bd829d9de3854ca4dc6094c0873a91ccf82203e36f2416b14b47bc3e.42154c5fd30bfa7e34941d0d8ad26f8a3936990926fbe06b2da76dd749b1c6d4
creating metadata file for /root/.cache/huggingface/transformers/0e75ce48bd829d9de3854ca4dc6094c0873a91ccf82203e36f2416b14b47bc3e.42154c5fd30bfa7e34941d0d8ad26f8a3936990926fbe06b2da76dd749b1c6d4
https://huggingface.co/huggingface-course/distilbert-base-uncased-finetuned-imdb/resolve/main/vocab.txt not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpxqppf5mu


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

storing https://huggingface.co/huggingface-course/distilbert-base-uncased-finetuned-imdb/resolve/main/vocab.txt in cache at /root/.cache/huggingface/transformers/6ad739928d7876d3bc26435b31d4d319b0fc77cea3d8e74e8da3eeedd1967f84.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
creating metadata file for /root/.cache/huggingface/transformers/6ad739928d7876d3bc26435b31d4d319b0fc77cea3d8e74e8da3eeedd1967f84.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
https://huggingface.co/huggingface-course/distilbert-base-uncased-finetuned-imdb/resolve/main/tokenizer.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp47io3vgy


Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

storing https://huggingface.co/huggingface-course/distilbert-base-uncased-finetuned-imdb/resolve/main/tokenizer.json in cache at /root/.cache/huggingface/transformers/1fdace8e42689b11ea1af0cdc221cd292866d751b934e7d88df01d61fe16332f.f471bd2d72c48b932f7be40446896b7e97c3be406ee93abfb500399bc606c829
creating metadata file for /root/.cache/huggingface/transformers/1fdace8e42689b11ea1af0cdc221cd292866d751b934e7d88df01d61fe16332f.f471bd2d72c48b932f7be40446896b7e97c3be406ee93abfb500399bc606c829
https://huggingface.co/huggingface-course/distilbert-base-uncased-finetuned-imdb/resolve/main/special_tokens_map.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpf78z1q8a


Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

storing https://huggingface.co/huggingface-course/distilbert-base-uncased-finetuned-imdb/resolve/main/special_tokens_map.json in cache at /root/.cache/huggingface/transformers/998d766bcbe227da8ff9181cda6c27ee7aa93d73c1e44a437104333e3caf7a92.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d
creating metadata file for /root/.cache/huggingface/transformers/998d766bcbe227da8ff9181cda6c27ee7aa93d73c1e44a437104333e3caf7a92.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d
loading file https://huggingface.co/huggingface-course/distilbert-base-uncased-finetuned-imdb/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/6ad739928d7876d3bc26435b31d4d319b0fc77cea3d8e74e8da3eeedd1967f84.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/huggingface-course/distilbert-base-uncased-finetuned-imdb/resolve/main/tokenizer.json from cache at /root/.cache/huggingface/transformers/1fdace8e42689b11ea1af0cdc2

In [None]:
# Ask the fill-mask pipeline for completions of one masked sentence and print
# the 'sequence' field of every prediction it returns.
text = "Can you [MASK] me the bottle."
preds = mask_filler(text)

for pred in preds:
    sequence = pred['sequence']
    print(f">>> {sequence}")

TypeError: ignored