<a href="https://colab.research.google.com/github/s-grzhang/gwc-chatbot/blob/main/gwc_transfer_learning_k_fold.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install transformers
!pip install datasets
!pip install torch

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-

In [None]:
import pandas as pd
from sklearn.model_selection import KFold
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import Dataset
import torch
import numpy as np

# Load GPT-2 small model and tokenizer
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Add a padding token
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.resize_token_embeddings(len(tokenizer))

# Ensure that the pad_token is set
tokenizer.pad_token = tokenizer.eos_token

print(f"Padding token added: {tokenizer.pad_token}")

# Load the dataset
file_path = '/content/drive/My Drive/gwc_chatbot_data_revised.csv'
df = pd.read_csv(file_path)

# Convert to Hugging Face Dataset
def df_to_dataset(df):
    return Dataset.from_pandas(df)

dataset = df_to_dataset(df)

# Tokenize the dataset
def tokenize_function(examples):
    return tokenizer(examples['User Input'], truncation=True, padding='max_length')

tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Define a Custom Dataset class
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, dataset):
        self.dataset = dataset

    def __getitem__(self, idx):
        item = {}
        for key in self.dataset.features:
            value = self.dataset[key][idx]
            if isinstance(value, str):
                # Tokenize the string value and convert to tensor
                tokenized_value = tokenizer(value, truncation=True, padding='max_length', return_tensors='pt')
                item['input_ids'] = tokenized_value['input_ids'].squeeze(0)
                item['attention_mask'] = tokenized_value['attention_mask'].squeeze(0)
            elif isinstance(value, list):
                # Convert list of numeric values to tensor
                item[key] = torch.tensor(value)
            elif isinstance(value, (int, float)):
                # Convert single numeric value to tensor
                item[key] = torch.tensor([value])
            else:
                print(f"Unexpected data type for key '{key}' at index {idx}: {type(value)}")

        # Use input_ids as labels
        item['labels'] = item['input_ids'].clone()

        return item

    def __len__(self):
        return len(self.dataset)

# Define the data collator function
def data_collator(data):
    input_ids = torch.stack([f['input_ids'] for f in data])
    attention_mask = torch.stack([f['attention_mask'] for f in data])
    labels = input_ids.clone()  # Use input_ids as labels

    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}

# Set the number of folds for cross-validation
k = 5  # Define k for K-Fold cross-validation

# k-Fold Cross-Validation
kf = KFold(n_splits=k, shuffle=True, random_state=42)
results = []
for fold, (train_index, val_index) in enumerate(kf.split(tokenized_dataset)):
    print(f"Training Fold {fold + 1}/{k}")

    # Split data into training and validation folds
    train_dataset = tokenized_dataset.select(train_index)
    val_dataset = tokenized_dataset.select(val_index)

    train_dataset = CustomDataset(train_dataset)
    val_dataset = CustomDataset(val_dataset)

    # Define the training arguments
    training_args = TrainingArguments(
        output_dir=f"./results_fold_{fold}",
        evaluation_strategy="epoch",
        learning_rate=5e-5,
        per_device_train_batch_size=2,
        per_device_eval_batch_size=2,
        num_train_epochs=7,
        weight_decay=0.01,
        logging_dir=f'./logs_fold_{fold}',
        logging_steps=10,
        save_strategy="epoch",
        load_best_model_at_end=True,
        metric_for_best_model="loss",
    )

    # Define the Trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        data_collator=data_collator,
    )

    # Train the model
    trainer.train()

    # Evaluate the model
    eval_results = trainer.evaluate()
    results.append(eval_results)
    print(f"Fold {fold + 1} Evaluation Results: {eval_results}")

# Average results across all folds
average_results = {key: np.mean([result[key] for result in results]) for key in results[0]}
print(f"Average Results Across All Folds: {average_results}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Padding token added: <|endoftext|>


Map:   0%|          | 0/98 [00:00<?, ? examples/s]

Training Fold 1/5




Epoch,Training Loss,Validation Loss
1,0.2312,0.054481
2,0.0392,0.033145
3,0.0265,0.027754
4,0.0223,0.026049
5,0.0176,0.025164
6,0.0151,0.024565


In [None]:
# Save the model and tokenizer
model.save_pretrained('/content/drive/My Drive/gwc_transfer_learning_chatbot_model_k-fold')
tokenizer.save_pretrained('/content/drive/My Drive/gwc_transfer_learning_chatbot_tokenizer_k-fold')

In [None]:
from transformers import AutoTokenizer, GPT2LMHeadModel

# Load the tokenizer and model
tokenizer_name = '/content/drive/My Drive/gwc_transfer_learning_chatbot_tokenizer_k-fold'
model_name = '/content/drive/My Drive/gwc_transfer_learning_chatbot_model_k-fold'

tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Test the chatbot
def get_response(user_input):
    inputs = tokenizer.encode(user_input, return_tensors="pt")
    outputs = model.generate(inputs, max_length=63, num_return_sequences=1)
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

user_input = "When does the first meeting take place?"
response = get_response(user_input)
print(response)
