## 1. Prepare Your Custom Dataset

In [1]:
!pip install datasets

import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
import pandas as pd
from datasets import Dataset

#loading file through upload button
from google.colab import files

uploaded = files.upload()
for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

# Load the dataset
def load_dataset(file_path):
    df = pd.read_csv(file_path)
    return df

# Prepare the dataset for GPT-2
def prepare_dataset(df, tokenizer):
    # Concatenate question and response as a single text
    df['text'] = df['question'] + tokenizer.eos_token + df['response']
    # Tokenize the text
    tokenized_data = df['text'].apply(lambda x: tokenizer(x, truncation=True, padding='max_length', max_length=128))
    # Convert to Dataset object from the datasets library
    return Dataset.from_pandas(pd.DataFrame(tokenized_data.tolist()))

# Load the dataset
file_path = "chatbot-dialogs1.csv"
df = load_dataset(file_path)



Saving chatbot-dialogs1.csv to chatbot-dialogs1.csv
User uploaded file "chatbot-dialogs1.csv" with length 250722 bytes


## 2. Fine-Tune the Model

In [2]:
import transformers
import torch
import accelerate

print("Transformers version:", transformers.__version__)
print("Torch version:", torch.__version__)
print("Accelerate version:", accelerate.__version__)


Transformers version: 4.42.4
Torch version: 2.3.1+cu121
Accelerate version: 0.32.1


In [8]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, TextDataset, DataCollatorForLanguageModeling
import torch

# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Padding token
tokenizer.pad_token = tokenizer.eos_token

# Sample a fraction of the dataset for quick testing
# sample_df = df.sample(frac=0.1, random_state=42)  # 10% of the data

# Prepare the dataset
train_dataset = prepare_dataset(df, tokenizer)

# Define the data collator for language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=10, # number of epochs
    per_device_train_batch_size=2,  # Adjust batch size if needed1
    save_steps=10_000,  # Save model more frequently
    save_total_limit=2,
    prediction_loss_only=True,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")


Step,Training Loss
500,3.2273
1000,2.9578
1500,2.7403
2000,2.4881
2500,2.0495
3000,2.0125
3500,1.9662
4000,1.6993
4500,1.5258
5000,1.5177


('./gpt2-finetuned/tokenizer_config.json',
 './gpt2-finetuned/special_tokens_map.json',
 './gpt2-finetuned/vocab.json',
 './gpt2-finetuned/merges.txt',
 './gpt2-finetuned/added_tokens.json')

## 3. Predict with the Fine-Tuned Model

In [4]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model and tokenizer
fine_tuned_model = GPT2LMHeadModel.from_pretrained("./gpt2-finetuned")
fine_tuned_tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-finetuned")

# Function to predict the response without repeating the question
def predict_response(question, model, tokenizer, max_length=20, temperature=0.7, top_k=50, top_p=0.9, repetition_penalty=1.2):
    input_ids = tokenizer.encode(question, return_tensors='pt')

    attention_mask = (input_ids != tokenizer.pad_token_id).long()

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=max_length + len(input_ids[0]),  # Adjust max_length to account for input length
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        temperature=temperature,
        top_k=top_k,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
    )

    # Decode the response and remove the input question from it
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Remove the input question from the response
    response = response[len(question):].strip()

    return response

# Chat with the bot
print("Start chatting with the bot (type 'quit' to stop)!")
while True:
    user_input = input("You: ")
    if user_input.lower() == 'quit':
        break
    response = predict_response(user_input, fine_tuned_model, fine_tuned_tokenizer)
    print(f"Bot: {response}")



Start chatting with the bot (type 'quit' to stop)!
You: hi




Bot: , what's the matter with your nose? i don't know. maybe it was from a cigarette
You: how are you doing
Bot: that? i'm having a party this friday. what's the weather going to be like?
You: do you like rain
Bot: ? of course. it dries the air. i like to see it fall from the sky.
You: quit


## Exact Response Length from dataset:(Solving)


In [9]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model and tokenizer
fine_tuned_model = GPT2LMHeadModel.from_pretrained("./gpt2-finetuned")
fine_tuned_tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-finetuned")

def get_response_length(question, dataset):
    # Get the expected response length for a similar question in the dataset
    similar_responses = dataset[dataset['question'].str.contains(question, case=False, na=False)]
    if not similar_responses.empty:
        # Estimate the length of the response
        response_lengths = similar_responses['response'].apply(lambda x: len(x.split()))
        return int(response_lengths.mean()) + 5  # Add some buffer to the average length
    else:
        return 20  # Fallback to a default length if no similar question found

# Function to predict the response
def predict_response(question, model, tokenizer, dataset, temperature=0.3, top_k=40, top_p=0.5, repetition_penalty=1.2):

    # Get the expected response length from the dataset
    max_length = get_response_length(question, dataset)

    input_ids = tokenizer.encode(question, return_tensors='pt')

    # Setting attention mask
    attention_mask = (input_ids != tokenizer.pad_token_id).long()

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        temperature=temperature,       # Controls randomness: lower is more deterministic, higher is more random
        top_k=top_k,                   # Limits sampling to the top k tokens
        top_p=top_p,                   # Nucleus sampling: selects tokens with cumulative probability up to p
        repetition_penalty=repetition_penalty,  # Applies a penalty to repeated tokens
    )

    # Decoding and post-processing the response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

   # Remove the input question from the response
    response = response[len(question):].strip()

    return response

# Chat with the bot
print("Start chatting with the bot (type 'quit' to stop)!")
while True:
    user_input = input("You: ")
    if user_input.lower() == 'quit':
        break
    response = predict_response(user_input, fine_tuned_model, fine_tuned_tokenizer, df)
    print(f"Bot: {response}")


Start chatting with the bot (type 'quit' to stop)!
You: hi




Bot: , how are you. is alice there?
You: how are you
Bot: doing today? i'm doing great.
You: which school do you go to?
Bot: i go to pcc. is there a school that you go
You: do you like rain
Bot: ? i loved it. how was the weather? we went out to eat!
You: how are you doing today?
Bot: i'm doing great.
You: sure, what did you want to do?
Bot: maybe we can go see
You: bye
Bot: , i'm sorry. i already took some medicine. what did you take? iously l
You: quit


## (Optional) Generate Text with the Fine-Tuned Model

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model and tokenizer
fine_tuned_model = GPT2LMHeadModel.from_pretrained("./gpt2-finetuned")
fine_tuned_tokenizer = GPT2Tokenizer.from_pretrained("./gpt2-finetuned")

# Function to generate text with attention mask and improved diversity
def generate_text(prompt, model, tokenizer, max_length=50, temperature=0.7, top_k=50, top_p=0.9):
    inputs = tokenizer.encode(prompt, return_tensors='pt')
    attention_mask = torch.ones(inputs.shape, dtype=torch.long)

    outputs = model.generate(
        inputs,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=1,
        temperature=temperature,  # Controls randomness: lower is more deterministic, higher is more random
        top_k=top_k,              # Limits sampling to the top k tokens
        top_p=top_p,              # Nucleus sampling: selects tokens with cumulative probability up to p
        pad_token_id=tokenizer.eos_token_id
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Example usage
prompt = "Your custom prompt here"
generated_text = generate_text(prompt, fine_tuned_model, fine_tuned_tokenizer)
print(f"Generated text: {generated_text}")




Generated text: Your custom prompt here. i'll get the latest news. i'll be happy to give you the invite later on today. i'll be glad to give you the invite if you're interested. i'll be glad to give you the invitation if you
