In [1]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, TaskType

In [3]:
df = pd.read_csv("shortjokes.csv")
df.head()

Unnamed: 0,ID,Joke
0,1,"[me narrating a documentary about narrators] ""..."
1,2,Telling my daughter garlic is good for you. Go...
2,3,I've been going through a really rough period ...
3,4,"If I could have dinner with anyone, dead or al..."
4,5,Two guys walk into a bar. The third guy ducks.


In [None]:
# Preprocessing data

df = df.dropna(subset = ["Joke"])
df = df[df["Joke"].apply(lambda x: isinstance(x,str) and len(x.strip()) > 10)]
df.head()

Unnamed: 0,ID,Joke
0,1,"[me narrating a documentary about narrators] ""..."
1,2,Telling my daughter garlic is good for you. Go...
2,3,I've been going through a really rough period ...
3,4,"If I could have dinner with anyone, dead or al..."
4,5,Two guys walk into a bar. The third guy ducks.


In [None]:
# Get only 1000 samples
df = df.sample(1000, random_state=42).reset_index(drop=True)

In [6]:
dataset = Dataset.from_pandas(df[["Joke"]].rename(columns={"Joke": "text"}))
dataset

Dataset({
    features: ['text'],
    num_rows: 1000
})

In [None]:
# Load the model and tokenizer
model_name ="gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Tokenizing definition
def tokenize(example):
  encoding = tokenizer(example["text"], truncation = True, return_tensors = "pt",max_length = 128, padding = "max_length")
  item = {}
  for k,v in encoding.items():
    item[k] = v.squeeze(0)
  item["labels"] = item["input_ids"].clone()        # For training inputs = labels
  return item

In [None]:
# Map the tokenizer to the dataset
tokenized_dataset = dataset.map(tokenize)

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [None]:
base_model = AutoModelForCausalLM.from_pretrained(model_name)
base_model.resize_token_embeddings(len(tokenizer))

In [None]:
# Configure LoRA parameters

lora_config = LoraConfig(
    r=8,                                    # LoRA Rank
    lora_alpha=16,                          # LoRA Alpha
    target_modules=["c_attn", "c_proj"],    # LoRA Target Modules
    lora_dropout=0.05,                      # LoRA Dropout
    bias="none",                            # LoRA Bias
    task_type=TaskType.CAUSAL_LM
)

In [12]:
model = get_peft_model(base_model, lora_config)
model.print_trainable_parameters()

trainable params: 811,008 || all params: 125,250,816 || trainable%: 0.6475




In [None]:
# Training Arguments to be attached to Trainer
training_args = TrainingArguments(
    output_dir="./gpt2-jokes-lora",
    per_device_train_batch_size=4,
    num_train_epochs=3,
    logging_steps=10,
    save_steps=500,
    save_total_limit=2,
    fp16=True,
    report_to="none",
    push_to_hub=False,
)

In [None]:
# Configuring Trainer Parameters
trainer = Trainer(
    model=model,
    train_dataset=tokenized_dataset,
    args=training_args,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Train the model
trainer.train()
model.save_pretrained("gpt2-jokes-lora")
tokenizer.save_pretrained("gpt2-jokes-lora")

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,4.5222
20,4.4067
30,4.2537
40,4.3655
50,4.3539
60,4.2702
70,4.2096
80,4.1375
90,4.1391
100,3.8619


('gpt2-jokes-lora/tokenizer_config.json',
 'gpt2-jokes-lora/special_tokens_map.json',
 'gpt2-jokes-lora/vocab.json',
 'gpt2-jokes-lora/merges.txt',
 'gpt2-jokes-lora/added_tokens.json',
 'gpt2-jokes-lora/tokenizer.json')

In [None]:
# Inference

from transformers import pipeline

pipe = pipeline("text-generation", model="gpt2-jokes-lora", tokenizer = tokenizer)

prompt = "Why did the chicken cross the road?"
print(pipe(prompt, max_length=10, top_k=25, top_p=0.95)[0]["generated_text"])


Device set to use cuda:0
Both `max_new_tokens` (=256) and `max_length`(=10) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Why did the chicken cross the road? It was the same chicken the guy had the other day. The chicken was in the wrong place at the wrong time. He just ate it and didn't know where it was. A friend suggested he go back to the mall and buy some more chicken. The chicken didn't even go back. It was the same chicken the guy had the other day. The chicken was in the wrong place at the wrong time. He just ate it and didn't know where it was. A friend suggested he go back to the mall and buy some more chicken. The chicken didn't even go back. It was the same chicken the guy had the other day. The chicken was in the wrong place at the wrong time. He just ate it and didn't know where it was. A friend suggested he go back to the mall and buy some more chicken. The chicken didn't even go back. It was the same chicken the guy had the other day. The chicken was in the wrong place at the wrong time. He just ate it and didn't know where it was. A friend suggested he go back to the mall and buy some mor

### Quantising my Model with GPTQ

In [22]:
! pip install auto_gptq

Collecting auto_gptq
  Downloading auto_gptq-0.7.1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (18 kB)
Collecting rouge (from auto_gptq)
  Downloading rouge-1.0.1-py3-none-any.whl.metadata (4.1 kB)
Collecting gekko (from auto_gptq)
  Downloading gekko-1.3.0-py3-none-any.whl.metadata (3.0 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.13.0->auto_gptq)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.13.0->auto_gptq)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.13.0->auto_gptq)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.13.0->auto_gptq)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1

In [None]:
from auto_gptq import AutoGPTQForCausalLM, BaseQuantizeConfig
from transformers import AutoTokenizer

model_path = "gpt2-jokes-lora"

quantize_config = BaseQuantizeConfig(           # Qunatisation with GPTQ
    bits=4,                                     # Final values to be represented in 4 bits
    group_size=128,                             
    desc_act=False,
)

tokenizer = AutoTokenizer.from_pretrained(model_path)

quant_model = AutoGPTQForCausalLM.from_pretrained(
    model_path,
    quantize_config=quantize_config,
    trust_remote_code=True,
)

inputs = tokenizer("Hello, this is a test input for GPTQ.", return_tensors="pt")
quant_model.quantize([inputs])                   # Quantisation happens here

# Save Model and Tokenizer
quant_model.save_quantized("gpt2-jokes-gptq")
tokenizer.save_pretrained("gpt2-jokes-gptq")

  @custom_fwd
  @custom_bwd
  @custom_fwd(cast_inputs=torch.float16)
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
INFO - Start quantizing layer 1/12
INFO:auto_gptq.modeling._base:Start quantizing layer 1/12
INFO - Quantizing mlp.c_fc in layer 1/12...
INFO:auto_gptq.modeling._base:Quantizing mlp.c_fc in layer 1/12...
INFO - Start quantizing layer 2/12
INFO:auto_gptq.modeling._base:Start quantizing layer 2/12
INFO - Quantizing mlp.c_fc in layer 2/12...
INFO:auto_gptq.modeling._base:Quantizing mlp.c_fc in layer 2/12...
INFO - Start quantizing layer 3/12
INFO:auto_gptq.modeling._base:Start quantizing layer 3/12
INFO - Quantizing 

('gpt2-jokes-gptq/tokenizer_config.json',
 'gpt2-jokes-gptq/special_tokens_map.json',
 'gpt2-jokes-gptq/vocab.json',
 'gpt2-jokes-gptq/merges.txt',
 'gpt2-jokes-gptq/added_tokens.json',
 'gpt2-jokes-gptq/tokenizer.json')

In [None]:
# Inference with Quantised Model

import torch
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

model_path = "gpt2-jokes-gptq/gpt2-jokes-gptq"

tokenizer = AutoTokenizer.from_pretrained(model_path)

model = AutoGPTQForCausalLM.from_quantized(
    model_path,
    device="cuda:0",
    use_triton=False,
    trust_remote_code=True,
)

prompt = "Why did the chicken cross the road?"

input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to("cuda:0")

generated_ids = input_ids
for _ in range(30):
    with torch.no_grad():
        outputs = model(generated_ids)
        logits = outputs.logits[:, -1, :]
        probabilities = torch.softmax(logits, dim=-1)
        next_token_id = torch.argmax(probabilities, dim=-1).unsqueeze(0)

        generated_ids = torch.cat([generated_ids, next_token_id], dim=-1)

generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

print("Prompt:", prompt)
print("Generated:", generated_text)