<a href="https://colab.research.google.com/github/s-grzhang/gwc-chatbot/blob/web-interface/gwc_bot_working.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

IMPORTANT: Push to web-interface branch, not main branch.

In [1]:
!pip install transformers
!pip install datasets
!pip install torch
!pip install rouge-score
!pip install trl
!pip install evaluate

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K 

In [2]:
import pandas as pd
from datasets import Dataset
from trl import PPOTrainer, PPOConfig
from evaluate import load

# Load the DataFrame from the CSV file
file_path = '/content/drive/My Drive/gwc_chatbot_data_revised.csv'
df = pd.read_csv(file_path)

# Format the dataset for Q&A, combining User Input (question) and Bot Response (answer)
df['input'] = df['User Input']
df['output'] = df['Bot Response']

# Create a Dataset from the DataFrame
dataset = Dataset.from_pandas(df[['input', 'output']])

Dataset({
    features: ['input', 'output'],
    num_rows: 120
})


In [3]:
from transformers import AutoTokenizer, GPT2LMHeadModel

# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Add padding token if not already added
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset with both input (question) and output (answer)
def tokenize_function(examples):
    inputs = tokenizer(examples['input'], padding="max_length", truncation=True, max_length=128, return_attention_mask=True) #Try to make length longer
    outputs = tokenizer(examples['output'], padding="max_length", truncation=True, max_length=128, return_attention_mask=True)
    # Shift outputs so they match the input for language modeling
    inputs["labels"] = outputs["input_ids"]
    return inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

In [4]:
# Split the dataset into train and test sets
tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.2)


In [5]:
import evaluate
rouge_metric = evaluate.load("rouge")

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [11]:
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments
import numpy as np
from transformers import pipeline

import numpy as np

def compute_metrics(eval_pred):
    predictions, labels = eval_pred

    # Predictions might be logits, convert them to token IDs using argmax along the last axis
    preds_ids = np.argmax(predictions, axis=-1)

    # Decode the predicted IDs and labels into text
    preds_text = []
    for pred in preds_ids:
        if isinstance(pred, list) or isinstance(pred, np.ndarray):
            pred = [p for p in pred if p < tokenizer.vocab_size]  # Filter invalid token IDs
        preds_text.append(tokenizer.decode(pred, skip_special_tokens=True))

    # Decode labels (ignoring -100, which is padding)
    labels_text = []
    for label in labels:
        label = [l for l in label if l != -100]  # Remove padding (-100)
        labels_text.append(tokenizer.decode(label, skip_special_tokens=True))

    # Calculate ROUGE score
    results = rouge_metric.compute(predictions=preds_text, references=labels_text)

    # Directly return the ROUGE scores as floating-point values
    return {
        "rouge1": results["rouge1"],
        "rouge2": results["rouge2"],
        "rougeL": results["rougeL"],
    }


In [12]:
# Define the data collator for language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4, #or 8
    per_device_eval_batch_size=4,
    num_train_epochs=10,  # You can adjust the number of epochs
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    save_total_limit=2
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()




Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel
1,2.0724,3.096421,0.311755,0.0997,0.298074
2,2.2882,2.887761,0.350517,0.10895,0.333864
3,1.6904,2.81272,0.36016,0.105896,0.339784
4,1.6313,2.77825,0.356379,0.109814,0.335147
5,1.5583,2.783862,0.366315,0.112378,0.339792
6,1.2957,2.812516,0.363813,0.112689,0.342581
7,1.3159,2.809642,0.367113,0.112378,0.341429
8,1.3403,2.819612,0.378928,0.126995,0.352707
9,1.1998,2.826625,0.379407,0.126995,0.353601
10,1.2408,2.832173,0.375913,0.127348,0.354223


TrainOutput(global_step=240, training_loss=1.5375581900278728, metrics={'train_runtime': 57.2785, 'train_samples_per_second': 16.76, 'train_steps_per_second': 4.19, 'total_flos': 62710087680000.0, 'train_loss': 1.5375581900278728, 'epoch': 10.0})

In [None]:
import torch

# Ensure the model is on the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def get_response(user_input):
    # Tokenize the input with attention_mask
    inputs = tokenizer(user_input, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)

    # Generate a response with attention_mask
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,  # Pass attention_mask
        max_length=128,  # Set the maximum length of the generated response
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,  # Use EOS token as PAD token
        eos_token_id=tokenizer.eos_token_id   # Ensure EOS token is set
    )

    # Decode the response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Test the chatbot after fine-tuning
user_input = "What is the purpose of Girls Who Code?"
response = get_response(user_input)
print(response)


What is the purpose of Girls Who Code? Girls Who Code is to help girls learn computer science and computer science by participating in computer science clubs and other activities. Girls Who Code is a non-profit organization that does not provide any financial support to any other organizations. Girls Who Code does not provide any financial support to any other organizations. Girls Who Code does not provide any financial support to any other organizations. Girls Who Code does not provide any financial support to any other organizations. Girls Who Code does not provide any financial support to any other organizations. Girls Who Code does not provide any financial support to any other organizations. Girls Who Code does not


In [None]:
model.save_pretrained('/content/drive/My Drive/gwc_chatbot_finetuned_not_overfit')
tokenizer.save_pretrained('/content/drive/My Drive/gwc_chatbot_finetuned_not_overfit_tokenizer')


('/content/drive/My Drive/gwc_chatbot_finetuned_not_overfit_tokenizer/tokenizer_config.json',
 '/content/drive/My Drive/gwc_chatbot_finetuned_not_overfit_tokenizer/special_tokens_map.json',
 '/content/drive/My Drive/gwc_chatbot_finetuned_not_overfit_tokenizer/vocab.json',
 '/content/drive/My Drive/gwc_chatbot_finetuned_not_overfit_tokenizer/merges.txt',
 '/content/drive/My Drive/gwc_chatbot_finetuned_not_overfit_tokenizer/added_tokens.json',
 '/content/drive/My Drive/gwc_chatbot_finetuned_not_overfit_tokenizer/tokenizer.json')

In [None]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence_transformers-3.1.1-py3-none-any.whl.metadata (10 kB)
Downloading sentence_transformers-3.1.1-py3-none-any.whl (245 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m245.3/245.3 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence-transformers
Successfully installed sentence-transformers-3.1.1


In [None]:
def get_response(user_input, max_sentences=2):
    # Tokenize the user input and generate a response from GPT-2
    inputs = gpt2_tokenizer(user_input, return_tensors="pt", padding=True, truncation=True, max_length=150)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move inputs to device
    outputs = gpt2_model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=128,
        num_return_sequences=1,
        top_p=0.9,
        no_repeat_ngram_size=3,
        pad_token_id=gpt2_tokenizer.eos_token_id,
        eos_token_id=gpt2_tokenizer.eos_token_id,
        do_sample=True
    )
    # Decode the GPT-2 response
    response = gpt2_tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

    # Stop generation after max_sentences (e.g., 2 sentences)
    sentence_endings = ['.', '!', '?']  # Define sentence-ending punctuation
    sentence_count = 0
    final_response = ""

    # Iterate through characters and stop after two sentences
    for char in response:
        final_response += char
        if char in sentence_endings:
            sentence_count += 1
        if sentence_count == max_sentences:
            break

    return final_response

# Example usage
user_input = "What is the purpose of Girls Who Code?"
response = get_response(user_input, max_sentences=3)

print("Response with two sentences:")
print(response)


NameError: name 'gpt2_tokenizer' is not defined

In [None]:
# Define paths to save the model and tokenizer
model_save_path = '/content/gwc_gpt2_model'
tokenizer_save_path = '/content/gwc_gpt2_tokenizer'

# Save the model and tokenizer
gpt2_model.save_pretrained(model_save_path)
gpt2_tokenizer.save_pretrained(tokenizer_save_path)


('/content/gwc_gpt2_tokenizer/tokenizer_config.json',
 '/content/gwc_gpt2_tokenizer/special_tokens_map.json',
 '/content/gwc_gpt2_tokenizer/vocab.json',
 '/content/gwc_gpt2_tokenizer/merges.txt',
 '/content/gwc_gpt2_tokenizer/added_tokens.json',
 '/content/gwc_gpt2_tokenizer/tokenizer.json')

In [None]:
!zip -r /content/gwc_gpt2_model.zip /content/gwc_gpt2_model /content/gwc_gpt2_tokenizer


  adding: content/gwc_gpt2_model/ (stored 0%)
  adding: content/gwc_gpt2_model/generation_config.json (deflated 24%)
  adding: content/gwc_gpt2_model/model.safetensors (deflated 7%)
  adding: content/gwc_gpt2_model/config.json (deflated 50%)
  adding: content/gwc_gpt2_tokenizer/ (stored 0%)
  adding: content/gwc_gpt2_tokenizer/merges.txt (deflated 53%)
  adding: content/gwc_gpt2_tokenizer/vocab.json (deflated 59%)
  adding: content/gwc_gpt2_tokenizer/special_tokens_map.json (deflated 74%)
  adding: content/gwc_gpt2_tokenizer/tokenizer_config.json (deflated 56%)
  adding: content/gwc_gpt2_tokenizer/tokenizer.json (deflated 72%)


In [None]:
from google.colab import files

# Download the zip file to your local machine
files.download('/content/gwc_gpt2_model.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>