In [3]:
import json
import os
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup
import torch
from torch.utils.data import DataLoader, Dataset

# Specify the directory containing the JSON files
directory = r'D:\chatbot-json\chatbot'

# Combine all JSON files into a single list
combined_data = []
for filename in sorted(os.listdir(directory)):
    if filename.startswith('data') and filename.endswith('.json'):
        file_path = os.path.join(directory, filename)
        with open(file_path, 'r') as f:
            try:
                data = json.load(f)
                combined_data.extend(data)
            except json.JSONDecodeError as e:
                print(f"Skipping file due to JSON error: {e}\nFile: {filename}")

# Convert the combined JSON data to a list of dictionaries with 'text' as a key
formatted_data = []
for entry in combined_data:
    try:
        system_prompt = entry['messages'][0]['content']
        assistant_text = entry['messages'][-1]['content']  # Get the last message as the assistant's response
        for user_msg in entry['messages'][1:-1]:  # Iterate over all user messages except the last one
            user_text = user_msg['content']
            combined_text = f"System: {system_prompt}\nUser: {user_text}\nAssistant: {assistant_text}"
            formatted_data.append({"text": combined_text})
    except (IndexError, KeyError, TypeError) as e:
        print(f"Skipping entry due to error: {e}\nEntry: {entry}")

# Create a custom Dataset class
class CustomDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=128):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        item = self.data[idx]
        inputs = self.tokenizer(item['text'], truncation=True, padding='max_length', max_length=self.max_length)
        inputs['labels'] = inputs['input_ids'].copy()
        return {key: torch.tensor(val) for key, val in inputs.items()}

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Add padding token
tokenizer.pad_token = tokenizer.eos_token
model.resize_token_embeddings(len(tokenizer))

# Create the dataset and dataloader
train_dataset = CustomDataset(formatted_data, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=1, shuffle=True)

# Move model to GPU if available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Set up the optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=5e-5)
num_training_steps = len(train_dataloader) * 20  # Set num_epochs to 20
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

# Training loop
model.train()
for epoch in range(20):  # number of epochs
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()
        print(f"Epoch {epoch+1}, Loss: {loss.item()}")

# Save the fine-tuned model
output_directory = r"D:\chatbot-json\fine-tuned-gpt2"
model.save_pretrained(output_directory)
tokenizer.save_pretrained(output_directory)

print(f"Model saved in {output_directory}")

# Function to generate a response
def generate_response(prompt):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128)
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=100,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        temperature=0.7,
        top_k=50,
        top_p=0.9,
        do_sample=True
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Interactive loop to ask questions (optional, you can comment it out if not needed)
# while True:
#     user_input = input("You: ")
#     if user_input.lower() in ["exit", "quit"]:
#         break
#     system_prompt = "You are a chatbot for Telenor Pakistan providing support for various inquiries. Your responses should be clear, concise, and helpful.\n"
#     full_prompt = f"{system_prompt}User: {user_input}\nAssistant:"
#     response = generate_response(full_prompt)
#     print(f"Assistant: {response.strip()}")


Skipping entry due to error: string indices must be integers, not 'str'
Entry: messages
Skipping entry due to error: string indices must be integers, not 'str'
Entry: messages
Skipping entry due to error: string indices must be integers, not 'str'
Entry: messages
Skipping entry due to error: string indices must be integers, not 'str'
Entry: messages
Skipping entry due to error: string indices must be integers, not 'str'
Entry: messages
Skipping entry due to error: string indices must be integers, not 'str'
Entry: messages
Epoch 1, Loss: 6.673068523406982
Epoch 1, Loss: 4.70991325378418
Epoch 1, Loss: 4.303225517272949
Epoch 1, Loss: 2.1549689769744873
Epoch 1, Loss: 2.0466599464416504
Epoch 1, Loss: 1.934098243713379
Epoch 1, Loss: 1.5834742784500122
Epoch 1, Loss: 2.41691255569458
Epoch 1, Loss: 1.8442178964614868
Epoch 1, Loss: 1.7686702013015747
Epoch 1, Loss: 3.050541877746582
Epoch 1, Loss: 2.398160219192505
Epoch 1, Loss: 2.350947618484497
Epoch 1, Loss: 1.8265684843063354
Epoch 

In [4]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the fine-tuned model and tokenizer from the saved directory
output_directory = r"D:\chatbot-json\fine-tuned-gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(output_directory)
model = GPT2LMHeadModel.from_pretrained(output_directory)

# Move model to GPU if available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

# Function to generate a response
def generate_response(prompt):
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, max_length=128)
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)
    outputs = model.generate(
        input_ids,
        attention_mask=attention_mask,
        max_length=100,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        temperature=0.7,
        top_k=50,
        top_p=0.9,
        do_sample=True
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return response

# Interactive loop to ask questions
while True:
    user_input = input("You: ")
    if user_input.lower() in ["exit", "quit"]:
        break
    system_prompt = "You are a chatbot for Telenor Pakistan providing support for various inquiries. Your responses should be clear, concise, and helpful.\n"
    full_prompt = f"{system_prompt}User: {user_input}\nAssistant:"
    response = generate_response(full_prompt)
    print(f"Assistant: {response.strip()}")


You:  What's the way to verify how many SIMs are registered on my CNIC?


Assistant: You are a chatbot for Telenor Pakistan providing support for various inquiries. Your responses should be clear, concise, and helpful.
User: What's the way to verify how many SIMs are registered on my CNIC?
Assistant: Send an SMS with your CNIC number (without space and dash) to 668; or call our helpline on 345 or UAN 042111345100 for SIM count inquiries.


You:  exit
