In [1]:
pip install transformers transformers[sentencepiece] accelerate datasets sacrebleu  rouge_score  py7zr pandas nltk tqdm PyYAML matplotlib torch notebook gradio boto3 mypy-boto3-s3 python-box==6.0.2 ensure==1.0.2 gdown Jinja2==3.1.2

Collecting sacrebleu
  Downloading sacrebleu-2.4.2-py3-none-any.whl.metadata (58 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m58.0/58.0 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hCollecting py7zr
  Downloading py7zr-0.21.0-py3-none-any.whl.metadata (17 kB)
Collecting gradio
  Downloading gradio-4.27.0-py3-none-any.whl.metadata (15 kB)
Collecting mypy-boto3-s3
  Downloading mypy_boto3_s3-1.34.65-py3-none-any.whl.metadata (17 kB)
Collecting python-box==6.0.2
  Downloading python_box-6.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting ensure==1.0.2
  Downloading ensure-1.0.2-py2.py3-none-any.whl.metadata (10 kB)
Collecting gdown
  Downloading gdown-5.1.0-py3-none-any.whl.metadata (5.7 kB)
Collecting portalocker (from sacrebleu)
  Downloading portalocker-2.8.2-py3-none-any.whl.metadata (8.5 kB)
C

# **DATA INGESTION**

In [2]:
import gdown
import zipfile
import os
# Google Drive share link of the zipped dataset; the file id is the
# second-to-last path segment of the URL.
file_id = "https://drive.google.com/file/d/1crHgL6Q1xO_N0AZxi5xbnCoFx-tZHisB/view?usp=sharing".split("/")[-2]
# Direct-download endpoint. The query string has to start right after "?";
# the earlier "uc?/export=download" form yielded a malformed "/export" parameter.
prefix = 'https://drive.google.com/uc?export=download&id='

unzip_path = "/kaggle/working/"
# Make sure the working directory exists *before* anything is written into it.
os.makedirs(unzip_path, exist_ok=True)

downloaded_path = gdown.download(prefix+file_id,"/kaggle/working/data.zip")
# NOTE(review): depending on the gdown version a failed download may be
# signalled by a None return value instead of an exception; fail loudly here
# rather than with a confusing error when the archive is opened.
if downloaded_path is None:
    raise RuntimeError("Download of /kaggle/working/data.zip from Google Drive failed")

# Unpack the archive next to the downloaded zip file.
with zipfile.ZipFile("/kaggle/working/data.zip", 'r') as zip_ref:
    zip_ref.extractall(unzip_path)

Downloading...
From: https://drive.google.com/uc?/export=download&id=1crHgL6Q1xO_N0AZxi5xbnCoFx-tZHisB
To: /kaggle/working/data.zip
100%|██████████| 7.90M/7.90M [00:00<00:00, 19.6MB/s]


# **DATA TRANSFORMATION**

In [3]:
import os
from transformers import AutoTokenizer
from datasets import load_dataset, load_from_disk

# Load the tokenizer of the "google/pegasus-cnn_dailymail" checkpoint.
# convert_examples_to_features reads this module-level `tokenizer`.
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-cnn_dailymail")

def convert_examples_to_features(example_batch):
    """Tokenize a batch of dialogue/summary pairs into seq2seq features.

    Relies on the module-level ``tokenizer``.

    Args:
        example_batch: Mapping with a ``'dialogue'`` and a ``'summary'`` entry.
            Presumably each holds a list of strings, since the function is
            applied with ``Dataset.map(..., batched=True)``.

    Returns:
        dict with three keys:
            ``'input_ids'`` / ``'attention_mask'``: encodings of the dialogues,
                truncated to at most 1024 tokens.
            ``'labels'``: token ids of the summaries, truncated to at most
                128 tokens.
    """
    input_encodings = tokenizer(example_batch['dialogue'], max_length=1024, truncation=True)

    # `text_target=` tokenizes the summaries as targets. It replaces the
    # deprecated `tokenizer.as_target_tokenizer()` context manager.
    target_encodings = tokenizer(text_target=example_batch['summary'], max_length=128, truncation=True)

    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids'],
    }

# Read the dataset from disk, tokenize it in batches with
# convert_examples_to_features, and write the transformed copy back to disk.
dataset_samsum = load_from_disk("/kaggle/working/samsum_dataset")
dataset_samsum_pt = dataset_samsum.map(
    convert_examples_to_features,
    batched=True,
)
dataset_samsum_pt.save_to_disk(
    os.path.join("/kaggle/working/", "samsum_dataset_transformed")
)

tokenizer_config.json:   0%|          | 0.00/88.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/1.91M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/65.0 [00:00<?, ?B/s]

Map:   0%|          | 0/14732 [00:00<?, ? examples/s]



Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Map:   0%|          | 0/818 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/14732 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/819 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/818 [00:00<?, ? examples/s]

# **TRAINING**

In [4]:
# Set the WANDB_DISABLED environment variable before the Trainer is created.
# NOTE(review): the training log reports this variable as deprecated (to be
# removed in transformers v5) and recommends the `report_to` argument instead.
os.environ["WANDB_DISABLED"] = "True"

In [5]:
from transformers import TrainingArguments, Trainer
from transformers import DataCollatorForSeq2Seq
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from datasets import load_dataset, load_from_disk
import torch
import os

# Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Tokenizer and pretrained weights both come from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google/pegasus-cnn_dailymail")
model_pegasus = AutoModelForSeq2SeqLM.from_pretrained("google/pegasus-cnn_dailymail")
model_pegasus = model_pegasus.to(device)

# Data collator built from the tokenizer and the model it will feed.
seq2seq_data_collator = DataCollatorForSeq2Seq(tokenizer, model=model_pegasus)

# Tokenized dataset written by the data-transformation step.
dataset_samsum_pt = load_from_disk("/kaggle/working/samsum_dataset_transformed")

# Hyper-parameters for the fine-tuning run.
#  - batch size 1 per device with 16 gradient-accumulation steps
#  - loss is logged every 10 steps, evaluation runs every 500 steps
#  - save_steps is an integer far above the ~4.6k steps shown in the training
#    log, presumably so that no intermediate checkpoint is written
#  - report_to must be the *string* "none" to switch off logging integrations;
#    passing Python `None` leaves the library default in place.
trainer_args = TrainingArguments(
    output_dir="/kaggle/working/", num_train_epochs=5, warmup_steps=500,
    per_device_train_batch_size=1, per_device_eval_batch_size=1,
    weight_decay=0.01, logging_steps=10,
    evaluation_strategy='steps', eval_steps=500, save_steps=1_000_000,
    gradient_accumulation_steps=16,
    report_to="none"
)

# Wire model, arguments, tokenizer, collator and the train/validation splits
# into a Trainer.
trainer = Trainer(
    model=model_pegasus,
    args=trainer_args,
    tokenizer=tokenizer,
    data_collator=seq2seq_data_collator,
    train_dataset=dataset_samsum_pt["train"],
    eval_dataset=dataset_samsum_pt["validation"],
)

# Run the fine-tuning loop.
trainer.train()

# Persist the fine-tuned model weights and configuration.
model_pegasus.save_pretrained(
    os.path.join("/kaggle/working/", "pegasus-samsum-model")
)
# Persist the tokenizer files; the returned tuple of written paths is the
# cell's displayed output.
tokenizer.save_pretrained(
    os.path.join("/kaggle/working/", "tokenizer")
)

2024-04-20 05:26:32.539842: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-20 05:26:32.539980: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-20 05:26:32.667748: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


pytorch_model.bin:   0%|          | 0.00/2.28G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of PegasusForConditionalGeneration were not initialized from the model checkpoint at google/pegasus-cnn_dailymail and are newly initialized: ['model.decoder.embed_positions.weight', 'model.encoder.embed_positions.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


generation_config.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss,Validation Loss
500,1.6592,1.483876
1000,1.4706,1.404548
1500,1.3931,1.37303
2000,1.3314,1.361406
2500,1.4205,1.350548
3000,1.3055,1.348588
3500,1.247,1.34151
4000,1.1219,1.343566
4500,1.1765,1.339435


Non-default generation parameters: {'max_length': 128, 'min_length': 32, 'num_beams': 8, 'length_penalty': 0.8, 'forced_eos_token_id': 1}


('/kaggle/working/tokenizer/tokenizer_config.json',
 '/kaggle/working/tokenizer/special_tokens_map.json',
 '/kaggle/working/tokenizer/spiece.model',
 '/kaggle/working/tokenizer/added_tokens.json',
 '/kaggle/working/tokenizer/tokenizer.json')

In [7]:
# Recursively archive the saved model directory into /kaggle/working/model.zip.
# NOTE(review): when model.zip already exists, zip updates it in place (the
# recorded output shows "updating:" entries); entries are stored under the
# "kaggle/working/..." path prefix.
!zip -r /kaggle/working/model.zip /kaggle/working/pegasus-samsum-model

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


updating: kaggle/working/pegasus-samsum-model/ (stored 0%)
updating: kaggle/working/pegasus-samsum-model/model.safetensors (deflated 7%)
updating: kaggle/working/pegasus-samsum-model/generation_config.json (deflated 44%)
updating: kaggle/working/pegasus-samsum-model/config.json (deflated 60%)


In [8]:
# Recursively archive the saved tokenizer directory into
# /kaggle/working/tokenizer.zip; entries are stored under the
# "kaggle/working/..." path prefix (see the recorded output).
!zip -r /kaggle/working/tokenizer.zip /kaggle/working/tokenizer

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: kaggle/working/tokenizer/ (stored 0%)
  adding: kaggle/working/tokenizer/special_tokens_map.json (deflated 82%)
  adding: kaggle/working/tokenizer/tokenizer.json (deflated 78%)
  adding: kaggle/working/tokenizer/spiece.model (deflated 50%)
  adding: kaggle/working/tokenizer/tokenizer_config.json (deflated 94%)
