
## Chatbot Intents Data Visualization

This notebook provides a visualization of the chatbot intents data in a tabular format.


In [14]:
# pip install torch


Collecting torch
  Downloading torch-2.4.1-cp311-cp311-win_amd64.whl.metadata (27 kB)
Collecting typing-extensions>=4.8.0 (from torch)
  Downloading typing_extensions-4.12.2-py3-none-any.whl.metadata (3.0 kB)
Downloading torch-2.4.1-cp311-cp311-win_amd64.whl (199.4 MB)
   ---------------------------------------- 0.0/199.4 MB ? eta -:--:--
   - -------------------------------------- 7.6/199.4 MB 42.7 MB/s eta 0:00:05
   --- ------------------------------------ 17.6/199.4 MB 46.1 MB/s eta 0:00:04
   ---- ----------------------------------- 24.4/199.4 MB 41.7 MB/s eta 0:00:05
   ------ --------------------------------- 31.2/199.4 MB 39.6 MB/s eta 0:00:05
   ------- -------------------------------- 35.9/199.4 MB 35.7 MB/s eta 0:00:05
   -------- ------------------------------- 40.4/199.4 MB 32.9 MB/s eta 0:00:05
   -------- ------------------------------- 43.5/199.4 MB 30.4 MB/s eta 0:00:06
   --------- ------------------------------ 46.4/199.4 MB 28.1 MB/s eta 0:00:06
   --------- -------

In [3]:
# pip install pandas

In [24]:

import pandas as pd

# Load the intents data
data = {
    "intents": [
        {"tag": "greeting",
         "patterns": ["Hi", "Hey", "Is anyone there?", "Hello", "Hay"],
         "responses": ["Hello", "Hi", "Hi there"]
        },
        {"tag": "goodbye",
         "patterns": ["Bye", "See you later", "Goodbye"],
         "responses": ["See you later", "Have a nice day", "Bye! Come back again"]
        },
        {"tag": "thanks",
         "patterns": ["Thanks", "Thank you", "That's helpful", "Thanks for the help"],
         "responses": ["Happy to help!", "Any time!", "My pleasure", "You're most welcome!"]
        },
        {"tag": "about",
         "patterns": ["Who are you?", "What are you?", "Who you are?" ],
         "responses": ["I'm Hyunh, your bot assistant", "I'm Hyunh, an Artificial Intelligent bot"]
        },
        {"tag": "name",
        "patterns": ["what is your name", "what should I call you", "whats your name?"],
        "responses": ["You can call me Hyunh.", "I'm Hyunh!", "Just call me as Hyunh"]
        },
        {"tag": "help",
        "patterns": ["Could you help me?", "give me a hand please", "Can you help?", "What can you do for me?", "I need a support", "I need a help", "support me please"],
        "responses": ["Tell me how can assist you", "Tell me your problem to assist you", "Yes Sure, How can I support you"]
        },
        {"tag": "createaccount",
        "patterns": ["I need to create a new account", "how to open a new account", "I want to create an account", "can you create an account for me", "how to open a new account"],
        "responses": ["You can just easily create a new account from our web site", "Just go to our web site and follow the guidelines to create a new account"]
        },
        {"tag": "complaint",
        "patterns": ["have a complaint", "I want to raise a complaint", "there is a complaint about a service"],
        "responses": ["Please provide us your complaint in order to assist you", "Please mention your complaint, we will reach you and sorry for any inconvenience caused"]
        }
    ]
}

# Convert the intents data to a DataFrame
patterns_responses = []
for intent in data['intents']:
    for pattern in intent['patterns']:
        for response in intent['responses']:
            patterns_responses.append({'Tag': intent['tag'], 'Pattern': pattern, 'Response': response})

data = pd.DataFrame(patterns_responses)

# Display the DataFrame
# display(data)

from datasets import Dataset
dataset = Dataset.from_pandas(data)

# Display the first few rows of the dataset
display(dataset)

Dataset({
    features: ['Tag', 'Pattern', 'Response'],
    num_rows: 92
})

In [25]:
from transformers import GPT2Tokenizer
from datasets import Dataset

# Initialize the tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

loading file https://huggingface.co/gpt2/resolve/main/vocab.json from cache at C:\Users\matha/.cache\huggingface\transformers\684fe667923972fb57f6b4dcb61a3c92763ad89882f3da5da9866baf14f2d60f.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f
loading file https://huggingface.co/gpt2/resolve/main/merges.txt from cache at C:\Users\matha/.cache\huggingface\transformers\c0c761a63004025aeadd530c4c27b860ec4ecbe8a00531233de21d865a402598.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/gpt2/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/gpt2/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/gpt2/resolve/main/tokenizer_config.json from cache at C:\Users\matha/.cache\huggingface\transformers\b105cf342574b32b2f8d5ea86c4845f46d8162160345fd0c85bd9ca3bc5cc48e.67d01b18f2079bd75eac0b2f2e7235768c7f26bd728e7a855a1c5acae01a91a8
loading configuration file https:

In [20]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments

# Load tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Update tokenizer settings
tokenizer.pad_token = tokenizer.eos_token  # Using EOS token for padding

# Tokenization function adjusted to ensure consistent sequence length
def tokenize_function(examples):
    # Concatenate text, ensuring every input ends with an EOS token
    text_to_tokenize = [p + tokenizer.eos_token + r + tokenizer.eos_token for p, r in zip(examples['Pattern'], examples['Response'])]
    # Ensure each sequence is exactly 20 tokens
    return tokenizer(text_to_tokenize, padding="max_length", truncation=True, max_length=20)

# Example data simulation
# data = {'Pattern': ["Hello, how are you?", "What is your name?"], 'Response': ["I am fine, thanks!", "My name is AI Model."]}
dataset = {"text": [f"{p}{tokenizer.eos_token}{r}{tokenizer.eos_token}" for p, r in zip(data['Pattern'], data['Response'])]}

# Tokenize dataset
tokenized_inputs = tokenizer(dataset["text"], padding="max_length", truncation=True, max_length=20, return_tensors="pt")

# Checking if all sequences are of length 20
print(tokenized_inputs['input_ids'].shape)  # Should print something like (batch_size, 20)


loading file https://huggingface.co/gpt2/resolve/main/vocab.json from cache at C:\Users\matha/.cache\huggingface\transformers\684fe667923972fb57f6b4dcb61a3c92763ad89882f3da5da9866baf14f2d60f.c7ed1f96aac49e745788faa77ba0a26a392643a50bb388b9c04ff469e555241f
loading file https://huggingface.co/gpt2/resolve/main/merges.txt from cache at C:\Users\matha/.cache\huggingface\transformers\c0c761a63004025aeadd530c4c27b860ec4ecbe8a00531233de21d865a402598.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/gpt2/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/gpt2/resolve/main/special_tokens_map.json from cache at None
loading file https://huggingface.co/gpt2/resolve/main/tokenizer_config.json from cache at C:\Users\matha/.cache\huggingface\transformers\b105cf342574b32b2f8d5ea86c4845f46d8162160345fd0c85bd9ca3bc5cc48e.67d01b18f2079bd75eac0b2f2e7235768c7f26bd728e7a855a1c5acae01a91a8
loading configuration file https:

torch.Size([92, 20])


In [5]:
# print(str(tokenizer.eos_token))

In [6]:
# dataset['Pattern']+str(tokenizer.eos_token)

In [34]:
# Add special tokens if necessary
tokenizer.pad_token = tokenizer.eos_token  # Set pad token to eos token
def tokenize_function(examples, **kwargs):
    text_to_tokenize = [p + tokenizer.eos_token + r + tokenizer.eos_token for p, r in zip(examples['Pattern'], examples['Response'])]

    return tokenizer(text_to_tokenize, truncation=True, padding="max_length", max_length=1500)

tokenized_dataset = dataset.map(tokenize_function, batched=True)
display(tokenized_dataset)

Map: 100%|██████████| 92/92 [00:00<00:00, 2037.56 examples/s]


Dataset({
    features: ['Tag', 'Pattern', 'Response', 'input_ids', 'attention_mask'],
    num_rows: 92
})

In [35]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments

# Load the pre-trained GPT-2 model and tokenizer
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    evaluation_strategy="no",
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# Start training
trainer.train()


loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\Users\matha/.cache\huggingface\transformers\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generati

KeyError: 'loss'

In [22]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments

# Load the pre-trained GPT-2 model
model = GPT2LMHeadModel.from_pretrained('gpt2')
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    
)
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    evaluation_strategy="no",
)
# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)
# Start training
trainer.train()

loading configuration file https://huggingface.co/gpt2/resolve/main/config.json from cache at C:\Users\matha/.cache\huggingface\transformers\fc674cd6907b4c9e933cb42d67662436b89fa9540a1f40d7c919d0109289ad01.7d2e0efa5ca20cef4fb199382111e9d3ad96fd77b849e1d4bed13a66e1336f51
Model config GPT2Config {
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generati

ValueError: expected sequence of length 20 at dim 1 (got 14)

In [14]:
# Start training
trainer.train()

The following columns in the training set  don't have a corresponding argument in `GPT2LMHeadModel.forward` and have been ignored: Response, Pattern, Tag. If Response, Pattern, Tag are not expected by `GPT2LMHeadModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 92
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 69


ValueError: expected sequence of length 20 at dim 1 (got 14)

In [62]:
pip install accelerate>=0.26.0


Note: you may need to restart the kernel to use updated packages.


In [63]:
pip install --upgrade pip


Note: you may need to restart the kernel to use updated packages.


In [64]:
pip install transformers[torch]





In [None]:
# Save the fine-tuned model
model.save_pretrained("fine-tuned-gpt2")
tokenizer.save_pretrained("fine-tuned-gpt2")

In [None]:
# Test the model with a custom prompt
prompt = "Explain machine learning in simple terms."
inputs = tokenizer.encode(prompt, return_tensors='pt')
output = model.generate(inputs, max_length=100, num_beams=5, no_repeat_ngram_size=2, early_stopping=True)

# Decode and print the response
response = tokenizer.decode(output[0], skip_special_tokens=True)
print("Chatbot Response:", response)