In [3]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
# Install Transformers library
!pip install transformers


Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m59.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m25.6 MB/s[0m eta [36m0:00:0

In [2]:
# Import necessary libraries
import os
import json
import string
import re
import torch
import random
import numpy as np
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [4]:
# Load the GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [5]:
# Set up the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [6]:
# Load your conversational dataset
def load_dataset(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data

data = load_dataset('/content/drive/MyDrive/data.json')


In [7]:
# Data Preprocessing
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

preprocessed_data = []

for conversation in data:
    preprocessed_conversation = []

    for message in conversation:
        preprocessed_message = preprocess_text(message)
        preprocessed_conversation.append(preprocessed_message)

    preprocessed_data.append(preprocessed_conversation)


In [8]:
# Split data into training, validation, and test sets
random.seed(42)  # Set a random seed for reproducibility
train_ratio = 0.7  # Adjust the ratio as needed
val_ratio = 0.15
test_ratio = 0.15

total_conversations = len(preprocessed_data)
train_size = int(train_ratio * total_conversations)
val_size = int(val_ratio * total_conversations)
test_size = total_conversations - train_size - val_size

train_data, val_data, test_data = random_split(preprocessed_data, [train_size, val_size, test_size])


In [9]:
# Training Data Preparation
train_text = ""
val_text = ""
test_text = ""

for conversation in train_data:
    train_text += " ".join(conversation) + " "

for conversation in val_data:
    val_text += " ".join(conversation) + " "

for conversation in test_data:
    test_text += " ".join(conversation) + " "

In [10]:
# Tokenize and encode the training, validation, and test text
train_input_ids = tokenizer.encode(train_text, return_tensors="pt", max_length=512, truncation=True).to(device)
val_input_ids = tokenizer.encode(val_text, return_tensors="pt", max_length=512, truncation=True).to(device)
test_input_ids = tokenizer.encode(test_text, return_tensors="pt", max_length=512, truncation=True).to(device)


In [15]:
# Define the training parameters
learning_rate = 1e-4
epochs = 200
batch_size = 2  # Adjust batch size according to your hardware


In [16]:
# Create data loaders
train_data = torch.utils.data.TensorDataset(train_input_ids)
val_data = torch.utils.data.TensorDataset(val_input_ids)
test_data = torch.utils.data.TensorDataset(test_input_ids)

train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_data, batch_size=batch_size)
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)


In [17]:
# Set up optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * epochs)


In [48]:


# Add these lines at the beginning to import the necessary modules
from torch.utils.tensorboard import SummaryWriter

# Create a TensorBoard writer
writer = SummaryWriter()


In [49]:
# Training Loop
model.train()
for epoch in range(epochs):
    total_loss = 0.0
    for batch in train_dataloader:
        inputs = batch[0].to(device)
        labels = inputs.clone()

        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{epochs} - Average Loss: {average_loss:.4f}")

    # Log loss to TensorBoard
    writer.add_scalar('Loss/Train', average_loss, epoch)


Epoch 1/200 - Average Loss: 0.0559
Epoch 2/200 - Average Loss: 0.0495
Epoch 3/200 - Average Loss: 0.0834
Epoch 4/200 - Average Loss: 0.0403
Epoch 5/200 - Average Loss: 0.0661
Epoch 6/200 - Average Loss: 0.0626
Epoch 7/200 - Average Loss: 0.0532
Epoch 8/200 - Average Loss: 0.0449
Epoch 9/200 - Average Loss: 0.0546
Epoch 10/200 - Average Loss: 0.0437
Epoch 11/200 - Average Loss: 0.0783
Epoch 12/200 - Average Loss: 0.0511
Epoch 13/200 - Average Loss: 0.0419
Epoch 14/200 - Average Loss: 0.0600
Epoch 15/200 - Average Loss: 0.0943
Epoch 16/200 - Average Loss: 0.0412
Epoch 17/200 - Average Loss: 0.0633
Epoch 18/200 - Average Loss: 0.0630
Epoch 19/200 - Average Loss: 0.0433
Epoch 20/200 - Average Loss: 0.0547
Epoch 21/200 - Average Loss: 0.0499
Epoch 22/200 - Average Loss: 0.0401
Epoch 23/200 - Average Loss: 0.0646
Epoch 24/200 - Average Loss: 0.0643
Epoch 25/200 - Average Loss: 0.0542
Epoch 26/200 - Average Loss: 0.0585
Epoch 27/200 - Average Loss: 0.0468
Epoch 28/200 - Average Loss: 0.0514
E

In [51]:
def generate_response(input_text, max_length=100):
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)  # Ensure input_ids is on the same device as the model

    # Generate a response
    response_ids = model.generate(input_ids, max_length=max_length, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)

    # Decode and return the response
    response = tokenizer.decode(response_ids[0], skip_special_tokens=True)
    return response


In [52]:
# Example conversation
user_input = "Hello, how are you?"
conversation_history = []

while True:
    user_input = input("User: ")

    # Append the user's input to the conversation history
    conversation_history.append(user_input)

    # Generate a response based on the entire conversation history
    bot_response = generate_response(" ".join(conversation_history))
    print("Bot:", bot_response)

    # Optionally, you can add logic to end the conversation (e.g., "exit" command)
    if user_input.lower() == "exit":
        break

User: how are you
Bot: how are you a fad?

I'm a fad f c'mon f fad.

I f a c e 8 f f 2 a  4 d 1  9 9 8  5 5 5 8 b f f 2 a 0 2 d t  4 d 1 f c f 4 a  b c 9 6  4 1 8 e  9 9 d 8  2 a e e e 4 b e 7 5 7 a 4 t  3 6 7 7
User: how was a day
Bot: how are you how was a day 1 d t-v t- 1 f c f 3 3 d 9  4 a f 8  9 8 1  5 5 5 3 8 b f f 2 a 0 2 d t  4 d 1 f c f 4 a  b c 9 6  4 1 8 e  9 9 d 8  2 a e e e 4 b e 7 5 7 a 4 t  3 6 7 7 b 4 2 4  b d 9 1 
User: fuking
Bot: how are you how was a day fuking f**king d 9 f c f 3 a f 8 d t  4 d 1 f c f 4 a  4 a f 8  9 9 8 1  5 3 8 b f f 2 a 0 2 d t  3 6 7 7 b 4 2 4  b d 9 1  4 9 0 5  8 0 2 a  d 0 f 1 e f 9 d 6 7 3 9 t  5 6 3 4 c
User: exit
Bot: how are you how was a day fuking exit?


I don't know. I don't know. I don't know.


I don't know. I don't know.

I don't know.

I don't know.

I don't know.

I don't know.

I don't know.

I don't know.

I don't know.


I don't know.


In [53]:
import torch

# Define your PyTorch model
model = YourModelClass()  # Replace with your model instance

# Define a file path for saving the H5 model
h5_model_path = "your_model.h5"

# Save the model to H5 format
torch.save(model.state_dict(), h5_model_path)

print(f"Model saved to {h5_model_path} in H5 format.")


NameError: ignored