<a href="https://colab.research.google.com/github/s-grzhang/gwc-chatbot/blob/web-interface/gwc_bot_working.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers
!pip install datasets
!pip install torch



In [None]:
import pandas as pd
from datasets import Dataset

# Load the DataFrame from the CSV file
file_path = '/content/drive/My Drive/gwc_chatbot_data_revised.csv'
df = pd.read_csv(file_path)

# Format the dataset for Q&A, combining User Input (question) and Bot Response (answer)
df['input'] = df['User Input']
df['output'] = df['Bot Response']

# Create a Dataset from the DataFrame
dataset = Dataset.from_pandas(df[['input', 'output']])

# Display the dataset
print(dataset)


Dataset({
    features: ['input', 'output'],
    num_rows: 105
})


In [None]:
from transformers import AutoTokenizer, GPT2LMHeadModel

# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Add padding token if not already added
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset with both input (question) and output (answer)
def tokenize_function(examples):
    inputs = tokenizer(examples['input'], padding="max_length", truncation=True, max_length=128, return_attention_mask=True)
    outputs = tokenizer(examples['output'], padding="max_length", truncation=True, max_length=128, return_attention_mask=True)
    # Shift outputs so they match the input for language modeling
    inputs["labels"] = outputs["input_ids"]
    return inputs

tokenized_datasets = dataset.map(tokenize_function, batched=True)




Map:   0%|          | 0/105 [00:00<?, ? examples/s]

In [None]:
# Split the dataset into train and test sets
tokenized_datasets = tokenized_datasets.train_test_split(test_size=0.2)


In [None]:
from transformers import DataCollatorForLanguageModeling, Trainer, TrainingArguments

# Define the data collator for language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define the training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    num_train_epochs=10,  # You can adjust the number of epochs
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    save_steps=500,
    save_total_limit=2
)

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    data_collator=data_collator
)

# Train the model
trainer.train()




Epoch,Training Loss,Validation Loss
1,2.9301,2.870145
2,1.7733,2.516184
3,1.5166,2.45692
4,1.2378,2.480402
5,0.9912,2.604203
6,0.7996,2.720523
7,0.7147,2.720606
8,0.7067,2.827834
9,0.6006,2.856132
10,0.5036,2.860963


TrainOutput(global_step=420, training_loss=1.2156076976231167, metrics={'train_runtime': 65.8673, 'train_samples_per_second': 12.753, 'train_steps_per_second': 6.376, 'total_flos': 54871326720000.0, 'train_loss': 1.2156076976231167, 'epoch': 10.0})

In [None]:
import torch

# Ensure the model is on the GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def get_response(user_input):
    # Tokenize the input with attention_mask
    inputs = tokenizer(user_input, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)

    # Generate a response with attention_mask
    outputs = model.generate(
        inputs.input_ids,
        attention_mask=inputs.attention_mask,  # Pass attention_mask
        max_length=128,  # Set the maximum length of the generated response
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,  # Use EOS token as PAD token
        eos_token_id=tokenizer.eos_token_id   # Ensure EOS token is set
    )

    # Decode the response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Test the chatbot after fine-tuning
user_input = "What is the purpose of Girls Who Code?"
response = get_response(user_input)
print(response)


What is the purpose of Girls Who Code? Girls Who Code is to provide leadership opportunities for girls in the club. Girls Who Code does not provide any certifications or awards. Girls Who Code does provide scholarships for its members. Girls Who Code does not provide any other scholarships or awards. Girls Who Code does not provide any other awards or scholarships. Girls Who Code does not provide any other awards or scholarships. Girls Who Code does not provide any other awards or scholarships. Girls Who Code does not provide any other awards or scholarships. Girls Who Code does not provide any other awards or scholarships. Girls Who Code does not provide any other awards or scholarships


In [None]:
model.save_pretrained('/content/drive/My Drive/gwc_chatbot_finetuned')
tokenizer.save_pretrained('/content/drive/My Drive/gwc_chatbot_finetuned_tokenizer')


('/content/drive/My Drive/gwc_chatbot_finetuned_tokenizer/tokenizer_config.json',
 '/content/drive/My Drive/gwc_chatbot_finetuned_tokenizer/special_tokens_map.json',
 '/content/drive/My Drive/gwc_chatbot_finetuned_tokenizer/vocab.json',
 '/content/drive/My Drive/gwc_chatbot_finetuned_tokenizer/merges.txt',
 '/content/drive/My Drive/gwc_chatbot_finetuned_tokenizer/added_tokens.json',
 '/content/drive/My Drive/gwc_chatbot_finetuned_tokenizer/tokenizer.json')

In [None]:
!pip install sentence-transformers



In [None]:
def get_response(user_input, max_sentences=2):
    # Tokenize the user input and generate a response from GPT-2
    inputs = gpt2_tokenizer(user_input, return_tensors="pt", padding=True, truncation=True, max_length=150)
    inputs = {key: value.to(device) for key, value in inputs.items()}  # Move inputs to device
    outputs = gpt2_model.generate(
        inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_length=128,
        num_return_sequences=1,
        top_p=0.9,
        no_repeat_ngram_size=3,
        pad_token_id=gpt2_tokenizer.eos_token_id,
        eos_token_id=gpt2_tokenizer.eos_token_id,
        do_sample=True
    )
    # Decode the GPT-2 response
    response = gpt2_tokenizer.decode(outputs[0], skip_special_tokens=True, clean_up_tokenization_spaces=True)

    # Stop generation after max_sentences (e.g., 2 sentences)
    sentence_endings = ['.', '!', '?']  # Define sentence-ending punctuation
    sentence_count = 0
    final_response = ""

    # Iterate through characters and stop after two sentences
    for char in response:
        final_response += char
        if char in sentence_endings:
            sentence_count += 1
        if sentence_count == max_sentences:
            break

    return final_response

# Example usage
user_input = "What is the purpose of Girls Who Code?"
response = get_response(user_input, max_sentences=3)

print("Response with two sentences:")
print(response)


Response with two sentences:
What is the purpose of Girls Who Code? Girls Who Codes (GWC) is a global nonprofit organization dedicated to reducing the gender gap in technology by empowering young women through coding education and community support. It offers after-school clubs, summer programs, and college-level initiatives that teach programming skills, web development, and other tech competencies.


In [None]:
# Define paths to save the model and tokenizer
model_save_path = '/content/gwc_gpt2_model'
tokenizer_save_path = '/content/gwc_gpt2_tokenizer'

# Save the model and tokenizer
gpt2_model.save_pretrained(model_save_path)
gpt2_tokenizer.save_pretrained(tokenizer_save_path)


('/content/gwc_gpt2_tokenizer/tokenizer_config.json',
 '/content/gwc_gpt2_tokenizer/special_tokens_map.json',
 '/content/gwc_gpt2_tokenizer/vocab.json',
 '/content/gwc_gpt2_tokenizer/merges.txt',
 '/content/gwc_gpt2_tokenizer/added_tokens.json',
 '/content/gwc_gpt2_tokenizer/tokenizer.json')

In [None]:
!zip -r /content/gwc_gpt2_model.zip /content/gwc_gpt2_model /content/gwc_gpt2_tokenizer


  adding: content/gwc_gpt2_model/ (stored 0%)
  adding: content/gwc_gpt2_model/generation_config.json (deflated 24%)
  adding: content/gwc_gpt2_model/model.safetensors (deflated 7%)
  adding: content/gwc_gpt2_model/config.json (deflated 50%)
  adding: content/gwc_gpt2_tokenizer/ (stored 0%)
  adding: content/gwc_gpt2_tokenizer/merges.txt (deflated 53%)
  adding: content/gwc_gpt2_tokenizer/vocab.json (deflated 59%)
  adding: content/gwc_gpt2_tokenizer/special_tokens_map.json (deflated 74%)
  adding: content/gwc_gpt2_tokenizer/tokenizer_config.json (deflated 56%)
  adding: content/gwc_gpt2_tokenizer/tokenizer.json (deflated 72%)


In [None]:
from google.colab import files

# Download the zip file to your local machine
files.download('/content/gwc_gpt2_model.zip')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>