In [1]:
!pip install transformers datasets peft

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting peft
  Downloading peft-0.12.0-py3-none-any.whl.metadata (13 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.13.0->peft)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cu

## Model

In [2]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
from datasets import load_dataset
from peft import LoraConfig, get_peft_model

#tokenizer and pre-trained model
model_name = "microsoft/DialoGPT-small"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# LoRA configuration
lora_config = LoraConfig(
    r=4,  # LoRA rank
    lora_alpha=32,  # LoRA alpha scaling
    target_modules=["c_attn", "c_proj"],
    lora_dropout=0.1,
    bias="none"
)

# Apply LoRA
model = get_peft_model(model, lora_config)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/641 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/351M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]



## Data Loading And Preprocessing

In [3]:
import pandas as pd
from datasets import Dataset
tokenizer.pad_token = tokenizer.eos_token
# dataset
data = pd.read_csv('/content/dialogs.txt', delimiter='\t', header=None, names=['Context', 'Response'])
dataset = Dataset.from_pandas(data)


In [4]:
data

Unnamed: 0,Context,Response
0,"hi, how are you doing?",i'm fine. how about yourself?
1,i'm fine. how about yourself?,i'm pretty good. thanks for asking.
2,i'm pretty good. thanks for asking.,no problem. so how have you been?
3,no problem. so how have you been?,i've been great. what about you?
4,i've been great. what about you?,i've been good. i'm in school right now.
...,...,...
3720,that's a good question. maybe it's not old age.,are you right-handed?
3721,are you right-handed?,yes. all my life.
3722,yes. all my life.,you're wearing out your right hand. stop using...
3723,you're wearing out your right hand. stop using...,but i do all my writing with my right hand.


In [5]:

def preprocess_function(examples):
    conversations = [f"{context} {response}" for context, response in zip(examples['Context'], examples['Response'])]
    tokenized = tokenizer(conversations, truncation=True, padding='max_length', max_length=128)

    return {
        'input_ids': tokenized['input_ids'],
        'attention_mask': tokenized['attention_mask']
    }


tokenized_dataset = dataset.map(preprocess_function, batched=True, remove_columns=["Context", "Response"])


Map:   0%|          | 0/3725 [00:00<?, ? examples/s]

In [6]:

train_size = 800
eval_size = 200
subset_tokenized_dataset = tokenized_dataset.select(range(1000))
train_dataset = subset_tokenized_dataset.select(range(train_size))
eval_dataset = subset_tokenized_dataset.select(range(train_size, 1000))

## Arguments

In [10]:
#training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    auto_find_batch_size=True,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir='./logs',
    save_steps=90,
    save_total_limit=2,
)



## Trainer

In [11]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
     train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    tokenizer=tokenizer,
)


## Training

In [12]:
# Fine tune the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,No log,No log


TrainOutput(global_step=100, training_loss=6.346240234375, metrics={'train_runtime': 1019.335, 'train_samples_per_second': 0.785, 'train_steps_per_second': 0.098, 'total_flos': 52507548057600.0, 'train_loss': 6.346240234375, 'epoch': 1.0})

## Generating Response

In [16]:
import torch

def generate_response(input_text, model, tokenizer, max_length=100):

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    input_ids = tokenizer.encode(input_text + tokenizer.eos_token, return_tensors='pt').to(device)

    attention_mask = torch.ones(input_ids.shape, device=device)

    with torch.no_grad():
        output_ids = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_length=max_length,
            pad_token_id=tokenizer.eos_token_id,
            do_sample=False
        )

    response_text = tokenizer.decode(output_ids[:, input_ids.shape[-1]:][0], skip_special_tokens=True)
    return response_text

input_text = "where are you going"
response = generate_response(input_text, model, tokenizer)
print("Bot:", response)


Bot: I'm going to the game tonight.
