In [1]:
!pip install transformers



In [2]:
!pip install -U PyPDF2
!pip install python-docx

Collecting PyPDF2
  Downloading pypdf2-3.0.1-py3-none-any.whl.metadata (6.8 kB)
Downloading pypdf2-3.0.1-py3-none-any.whl (232 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyPDF2
Successfully installed PyPDF2-3.0.1
Collecting python-docx
  Downloading python_docx-1.1.2-py3-none-any.whl.metadata (2.0 kB)
Downloading python_docx-1.1.2-py3-none-any.whl (244 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m244.3/244.3 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx
Successfully installed python-docx-1.1.2


In [3]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2

In [4]:
!pip install py7zr

Collecting py7zr
  Downloading py7zr-0.22.0-py3-none-any.whl.metadata (16 kB)
Collecting texttable (from py7zr)
  Downloading texttable-1.7.0-py2.py3-none-any.whl.metadata (9.8 kB)
Collecting pycryptodomex>=3.16.0 (from py7zr)
  Downloading pycryptodomex-3.20.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting pyzstd>=0.15.9 (from py7zr)
  Downloading pyzstd-0.16.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.4 kB)
Collecting pyppmd<1.2.0,>=1.1.0 (from py7zr)
  Downloading pyppmd-1.1.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.7 kB)
Collecting pybcj<1.1.0,>=1.0.0 (from py7zr)
  Downloading pybcj-1.0.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.0 kB)
Collecting multivolumefile>=0.2.3 (from py7zr)
  Downloading multivolumefile-0.2.3-py3-none-any.whl.metadata (6.3 kB)
Collecting inflate64<1.1.0,>=1.0.0 (from py7zr)
  Downloading inflate64-1.0.0-cp310-cp310-manylinux_2_17_

In [5]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

# Load the SAMSum dataset
dataset = load_dataset('samsum')

# Limit the number of samples
num_train_samples = 500
num_val_samples = 200

# Slice training and validation datasets
train_dataset = dataset['train'].select(range(num_train_samples))
val_dataset = dataset['validation'].select(range(num_val_samples))

# Load tokenizer and add padding token
tokenizer = GPT2Tokenizer.from_pretrained('distilgpt2')
tokenizer.pad_token = tokenizer.eos_token  # Use the EOS token as padding token

# Preprocess function for SAMSum
def preprocess_function(examples):
    inputs = examples['dialogue']
    targets = examples['summary']
    model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')
    labels = tokenizer(targets, max_length=150, truncation=True, padding='max_length')
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

# Tokenize and preprocess the dataset
tokenized_train_dataset = train_dataset.map(preprocess_function, batched=True)
tokenized_val_dataset = val_dataset.map(preprocess_function, batched=True)

# Print dataset lengths
train_dataset_length = len(tokenized_train_dataset)
validation_dataset_length = len(tokenized_val_dataset)
print(f"Train dataset length: {train_dataset_length}")
print(f"Validation dataset length: {validation_dataset_length}")

# Define the data collator
def load_data_collator(tokenizer):
    return DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False
    )

# Training function
def train(train_dataset, eval_dataset, model_name, output_dir, overwrite_output_dir, per_device_train_batch_size, num_train_epochs):
    model = GPT2LMHeadModel.from_pretrained(model_name)
    data_collator = load_data_collator(tokenizer)

    training_args = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=overwrite_output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        save_steps=100,
        logging_dir='./logs',
        logging_steps=10,  # Log every 10 steps
        evaluation_strategy="steps",
        eval_steps=10
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset
    )

    trainer.train()
    trainer.save_model(output_dir)
    tokenizer.save_pretrained(output_dir)

# Define paths and parameters
model_name = 'distilgpt2'
output_dir = '/content/custom_samsum'
overwrite_output_dir = True
per_device_train_batch_size = 4
num_train_epochs = 1

# Train the model
train(
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    model_name=model_name,
    output_dir=output_dir,
    overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs
)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/3.36k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.04k [00:00<?, ?B/s]

The repository for samsum contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/samsum.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


Downloading data:   0%|          | 0.00/2.94M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14732 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/818 [00:00<?, ? examples/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Train dataset length: 500
Validation dataset length: 200


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



Step,Training Loss,Validation Loss
10,4.1923,3.865301
20,3.9371,3.591946
30,3.6291,3.464634
40,3.5115,3.399692
50,3.4496,3.340317
60,3.591,3.292998
70,3.4198,3.257626
80,3.4926,3.225054
90,3.2907,3.205209
100,3.2403,3.194742


In [8]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

def load_model_and_tokenizer(model_path):
    # Load the fine-tuned model and tokenizer from the same path
    model = GPT2LMHeadModel.from_pretrained(model_path)
    tokenizer = GPT2Tokenizer.from_pretrained(model_path)
    return model, tokenizer

def generate_text(model, tokenizer, prompt, max_new_tokens=150):
    # Tokenize the prompt
    inputs = tokenizer(prompt, return_tensors='pt', truncation=True, max_length=512, padding='max_length')

    # Generate text
    outputs = model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],  # Provide attention mask
        max_new_tokens=max_new_tokens,  # Generate a specific number of new tokens
        num_return_sequences=1,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id
    )

    # Decode the generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Define paths and prompt
model_path = '/content/custom_samsum'  # Path where the model and tokenizer are saved
prompt = "Summarize the following dialogue: [John decided to spend his weekend in nature. He packed a small bag with essentials: a water bottle, some snacks, and a warm jacket. Early Saturday morning, he drove to a nearby park known for its beautiful hiking trails. After a short drive, he arrived and started his hike. The weather was perfect—sunny with a gentle breeze. As he walked along the trail, he enjoyed the sounds of birds chirping and the rustling of leaves. By the end of the day, John felt refreshed and happy. It was a simple yet satisfying escape from his usual routine.]"

# Load the model and tokenizer
model, tokenizer = load_model_and_tokenizer(model_path)

# Generate and print text
generated_text = generate_text(model, tokenizer, prompt)
print(generated_text)


Summarize the following dialogue: [John decided to spend his weekend in nature. He packed a small bag with essentials: a water bottle, some snacks, and a warm jacket. Early Saturday morning, he drove to a nearby park known for its beautiful hiking trails. After a short drive, he arrived and started his hike. The weather was perfect—sunny with a gentle breeze. As he walked along the trail, he enjoyed the sounds of birds chirping and the rustling of leaves. By the end of the day, John felt refreshed and happy. It was a simple yet satisfying escape from his usual routine.]Photo: John
John then took the bus to a small town called Waukegan, where he was staying at the time. He got there in good standing. He made his way across to the town in the morning at about 8:30, and then left after 3 o'clock on his way home. He saw the trees and trees. He went home. He didn't see any signs of birds or birds and didn't find anything. He walked right to the local grocery store and discovered a little bi

In [9]:
# Total parameters
total_params = sum(p.numel() for p in model.parameters())
# Trainable parameters
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Total parameters: {total_params}")
print(f"Trainable parameters: {trainable_params}")


Total parameters: 81912576
Trainable parameters: 81912576
