In [6]:
import os
import shutil

In [7]:
def remove_path_starts_with(starts_with=["lora", "distilgpt", "sample_data"]):
    for entry in os.listdir():
        if os.path.isdir(entry) and any(entry.startswith(prefix) for prefix in starts_with):
            shutil.rmtree(entry, ignore_errors=True)
            print(f"Deleted folder: {entry}")

In [8]:
remove_path_starts_with()

Deleted folder: sample_data


In [9]:
! pip install datasets



In [10]:
import transformers
transformers.__version__

'4.51.3'

## Preparing the dataset




In [11]:
from datasets import load_dataset

In [12]:
data_files = {"train": "train.csv", "test": "test.csv"}
dataset = load_dataset('jonaskoenig/trump_administration_statement', data_files=data_files)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

train.csv:   0%|          | 0.00/4.75M [00:00<?, ?B/s]

test.csv:   0%|          | 0.00/1.35M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [13]:
dataset["train"], dataset["test"]

(Dataset({
     features: ['text', 'ft_tense'],
     num_rows: 51247
 }),
 Dataset({
     features: ['text', 'ft_tense'],
     num_rows: 14642
 }))

In [14]:
dataset["train"][0]

{'text': 'And I understand, frankly, every President, for the last many Presidents, have said, Were going to open our embassy in Jerusalem. And then they never did it.',
 'ft_tense': 1}

## Causal Language modeling

In [15]:
model_checkpoint = "distilgpt2"

In [16]:
from transformers import AutoTokenizer

In [17]:
tokenizer = AutoTokenizer.from_pretrained(
    model_checkpoint,
    trust_remote_code=True,
    use_fast=True
)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/762 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [18]:
def tokenize_function(examples):
    return tokenizer(examples["text"])

In [19]:
tokenized_dataset = dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text", "ft_tense"]
)

Map (num_proc=4):   0%|          | 0/51247 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/14642 [00:00<?, ? examples/s]

In [20]:
dataset["train"][0]

{'text': 'And I understand, frankly, every President, for the last many Presidents, have said, Were going to open our embassy in Jerusalem. And then they never did it.',
 'ft_tense': 1}

In [21]:
tokenized_dataset["train"][0]

{'input_ids': [1870,
  314,
  1833,
  11,
  17813,
  11,
  790,
  1992,
  11,
  329,
  262,
  938,
  867,
  35506,
  11,
  423,
  531,
  11,
  15176,
  1016,
  284,
  1280,
  674,
  18613,
  287,
  10843,
  13,
  843,
  788,
  484,
  1239,
  750,
  340,
  13],
 'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1]}

In [22]:
tokenizer.model_max_length

1024

In [23]:
block_size = 256

In [24]:
def group_texts(examples):
    # Concatenate all texts.
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # We drop the small remainder, we could add padding if the model supported it instead of this drop, you can
    # customize this part to your needs.
    total_length = (total_length // block_size) * block_size
    # Split by chunks of max_len.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [25]:
lm_dataset = tokenized_dataset.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

Map (num_proc=4):   0%|          | 0/51247 [00:00<?, ? examples/s]

Map (num_proc=4):   0%|          | 0/14642 [00:00<?, ? examples/s]

In [26]:
tokenizer.decode(lm_dataset["train"][0]["input_ids"])

'And I understand, frankly, every President, for the last many Presidents, have said, Were going to open our embassy in Jerusalem. And then they never did it.And youll be glad to know the United States will continue to be a driver of global growth.Were going to have a great red wave.And I knew I was going to be here with you to mark this momentous occasion, so I wore my Houston Astros tie because theyre still the champion of the World Series.And whenever youre dealing with the medical field, its its a serious thing.And the test result comes back in five minutes, and we have great testing.Opportunity Scholarship Program right here in the district.And many of you, I hope, will be following me.So, were so happy for your family, and I know your sons are going to make you very proud.And at this time, our team is working with the governor and working with the senator to ensure that we flow testing resources.All over the world.There will be some areas hit harder than we think.And well be anno

In [27]:
tokenizer.decode(lm_dataset["test"][0]["input_ids"])

'And because of that, a repeat colonoscopy was not indicated at this time and will be deferred until his next periodic physical exam.And we are going to realize a free and open Indo-Pacific economic growth.And we are going to be signing a full and complete pardon.I dont think youre going to stop it at all.Were going to start drilling in ANWR one of the largest oil reserves in the world that for 40 years this country was unable to touch.MR. SPICER: I will ask, Alexis.And I think that he has been very remarkably adept at fielding our inquiries, and I know hes going to go back and help to continue this effort.And I guess were going to be a little bit late.A lot of people are going to be evicted, but Im going to stop it because Ill do it myself if I have to.He quickly encountered an insurgent who was about to fire a rocket-propelled grenade at his squad.hiring rose more than forecast in May, wages picked up and the unemployment rate matched the lowest in almost five decades, indicating the

In [28]:
from transformers import AutoModelForCausalLM

In [29]:
model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/353M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [30]:
def print_trainable_parameters(model):
    trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
    total = sum(p.numel() for p in model.parameters())
    print(f"Trainable parameters: {trainable} / {total} ({100 * trainable / total:.2f}%)")

In [31]:
print_trainable_parameters(model)

Trainable parameters: 81912576 / 81912576 (100.00%)


In [32]:
from transformers import Trainer, TrainingArguments

In [33]:
training_args = TrainingArguments(
    model_checkpoint,
    eval_strategy = "epoch",
    label_names=["labels"],
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
    report_to="none", # disable wandb
)

In [34]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
)

In [35]:
train_output = trainer.train()
train_output

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,No log,3.377837
2,3.534400,3.337214
3,3.375400,3.327589


TrainOutput(global_step=1428, training_loss=3.4177546995360646, metrics={'train_runtime': 564.4346, 'train_samples_per_second': 20.208, 'train_steps_per_second': 2.53, 'total_flos': 745087684313088.0, 'train_loss': 3.4177546995360646, 'epoch': 3.0})

In [36]:
train_output.metrics

{'train_runtime': 564.4346,
 'train_samples_per_second': 20.208,
 'train_steps_per_second': 2.53,
 'total_flos': 745087684313088.0,
 'train_loss': 3.4177546995360646,
 'epoch': 3.0}

In [37]:
import math

In [38]:
eval_results = trainer.evaluate()
eval_results

{'eval_loss': 3.327589273452759,
 'eval_runtime': 14.3086,
 'eval_samples_per_second': 75.479,
 'eval_steps_per_second': 9.435,
 'epoch': 3.0}

In [39]:
cross_entropy_loss = eval_results['eval_loss']
print(f"Cross Entropy Loss:{cross_entropy_loss:.4f} Perplexity:{math.exp(cross_entropy_loss):.2f}")

Cross Entropy Loss:3.3276 Perplexity:27.87


In [40]:
from transformers import pipeline

In [41]:
generator = pipeline("text-generation", model=model, tokenizer=tokenizer)

Device set to use cuda:0


In [42]:
prompt = "Donlad Trump says China"

In [43]:
for _ in range(10):
  out = generator(prompt, pad_token_id=generator.tokenizer.eos_token_id)
  print(out[0]['generated_text'])

Donlad Trump says China isnt a friend.We have a great job that people dont like, but we have tremendous businesses going through this area.But they didnt like it, and theyre going to get it a little bit lower than whats happening
Donlad Trump says China could be a bigger problem.But I will say: I will try to go ahead and say, How will he even do it if?We had these guys all living in an area and you all know that youllnt
Donlad Trump says China has its problems, but China will fix them because thats not right.And were going to have all of these issues solved, and I will not compromise on them.On Tuesday, President Xi will meet with other countries for a
Donlad Trump says China will help build the Americas.And he will not just say those are the problems in China.MR. SPICER: Yeah.They wont, I will.Theres no longer the problem as to who will become the
Donlad Trump says China will never have to deal with ISIS, and we have not even known or saw them before.And he thinks so, and he knows wha

# Fine-Tuning with LoRA

In [44]:
remove_path_starts_with()

Deleted folder: distilgpt2


In [45]:
from peft import get_peft_model, LoraConfig, TaskType

In [46]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    target_modules=["c_attn"],  # Specific to GPT2's attention
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

In [47]:
base_model = AutoModelForCausalLM.from_pretrained(model_checkpoint)

In [48]:
lora_model = get_peft_model(base_model, lora_config)



In [49]:
lora_model.print_trainable_parameters()

trainable params: 147,456 || all params: 82,060,032 || trainable%: 0.1797


In [50]:
training_args = TrainingArguments(
    model_checkpoint,
    eval_strategy = "epoch",
    label_names=["labels"],
    learning_rate=2e-5,
    weight_decay=0.01,
    push_to_hub=False,
    report_to="none", # disable wandb
)

In [51]:
lora_trainer = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=lm_dataset["train"],
    eval_dataset=lm_dataset["test"],
)

In [52]:
lora_train_output = lora_trainer.train()
lora_train_output

Epoch,Training Loss,Validation Loss
1,No log,3.805296
2,3.955800,3.732754
3,3.828800,3.714524


TrainOutput(global_step=1428, training_loss=3.861566367269564, metrics={'train_runtime': 374.7405, 'train_samples_per_second': 30.437, 'train_steps_per_second': 3.811, 'total_flos': 747671056809984.0, 'train_loss': 3.861566367269564, 'epoch': 3.0})

In [53]:
lora_eval_results = lora_trainer.evaluate()
lora_eval_results

{'eval_loss': 3.714524030685425,
 'eval_runtime': 14.5183,
 'eval_samples_per_second': 74.389,
 'eval_steps_per_second': 9.299,
 'epoch': 3.0}

In [54]:
lora_cross_entropy_loss = lora_eval_results['eval_loss']
print(f"Cross Entropy Loss:{lora_cross_entropy_loss:.4f} Perplexity:{math.exp(lora_cross_entropy_loss):.2f}")

Cross Entropy Loss:3.7145 Perplexity:41.04


In [55]:
lora_model_checkpoint = f"lora_{model_checkpoint}"
lora_model_checkpoint

'lora_distilgpt2'

In [56]:
lora_model.save_pretrained(lora_model_checkpoint)
tokenizer.save_pretrained(lora_model_checkpoint)

('lora_distilgpt2/tokenizer_config.json',
 'lora_distilgpt2/special_tokens_map.json',
 'lora_distilgpt2/vocab.json',
 'lora_distilgpt2/merges.txt',
 'lora_distilgpt2/added_tokens.json',
 'lora_distilgpt2/tokenizer.json')

In [57]:
lora_generator = pipeline(
    "text-generation",
    model=base_model,
    tokenizer=tokenizer
)

Device set to use cuda:0


In [58]:
from peft import PeftModel

In [59]:
lora_generator.model = PeftModel.from_pretrained(
    model=lora_model,
    model_id=lora_model_checkpoint
)



In [60]:
for _ in range(10):
  out = lora_generator(prompt, pad_token_id=generator.tokenizer.eos_token_id)
  print(out[0]['generated_text'])

Donlad Trump says China wants an apology, but Chinese media reports suggest the president’s response is likely too low.

























Donlad Trump says China will cut off trade with America and "start a new global manufacturing factory on our doorstep" in 2015.
























Donlad Trump says China could develop new nuclear weapons in exchange for nuclear weapons Read more

































Donlad Trump says China will not pay for arms for its air force if it doesn't get military support





























Donlad Trump says China wouldn't be able to "buy" the United States because of a trade deal between China and the US, the Times of Taiwan reported on Sunday.















Donlad Trump says China is “obsolete ‘’’“


In a column by Dr Alex Jones, the top Democrat on CNN's Morning Joe, Jones claimed that Chinese economic growth is “obsolete �
Donlad Trump says China will cut jobs and cut taxes for the middle class







Trump told an audi

In [61]:
!pip show datasets transformers peft

Name: datasets
Version: 3.5.0
Summary: HuggingFace community-driven open-source library of datasets
Home-page: https://github.com/huggingface/datasets
Author: HuggingFace Inc.
Author-email: thomas@huggingface.co
License: Apache 2.0
Location: /usr/local/lib/python3.11/dist-packages
Requires: aiohttp, dill, filelock, fsspec, huggingface-hub, multiprocess, numpy, packaging, pandas, pyarrow, pyyaml, requests, tqdm, xxhash
Required-by: 
---
Name: transformers
Version: 4.51.3
Summary: State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow
Home-page: https://github.com/huggingface/transformers
Author: The Hugging Face team (past and future) with the help of all our contributors (https://github.com/huggingface/transformers/graphs/contributors)
Author-email: transformers@huggingface.co
License: Apache 2.0 License
Location: /usr/local/lib/python3.11/dist-packages
Requires: filelock, huggingface-hub, numpy, packaging, pyyaml, regex, requests, safetensors, tokenizers, tqdm
Required-by: 