In [3]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)


Mounted at /content/drive


In [4]:
# Install Transformers library
!pip install transformers


Collecting transformers
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m66.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m29.3 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m82.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m61.1 MB/s[0m eta [36m0:00:00[0m
Col

In [5]:
# Import necessary libraries
import os
import json
import string
import re
import torch
import random
import numpy as np
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW, get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [6]:
# Load the GPT-2 tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [7]:
# Set up the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [10]:
# Load your conversational dataset
def load_dataset(file_path):
    with open(file_path, 'r') as f:
        data = json.load(f)
    return data

data = load_dataset('/content/sample_data/anscombe.json')


In [11]:
# Data Preprocessing
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

preprocessed_data = []

for conversation in data:
    preprocessed_conversation = []

    for message in conversation:
        preprocessed_message = preprocess_text(message)
        preprocessed_conversation.append(preprocessed_message)

    preprocessed_data.append(preprocessed_conversation)


In [12]:
# Split data into training, validation, and test sets
random.seed(42)  # Set a random seed for reproducibility
train_ratio = 0.7  # Adjust the ratio as needed
val_ratio = 0.15
test_ratio = 0.15

total_conversations = len(preprocessed_data)
train_size = int(train_ratio * total_conversations)
val_size = int(val_ratio * total_conversations)
test_size = total_conversations - train_size - val_size

train_data, val_data, test_data = random_split(preprocessed_data, [train_size, val_size, test_size])


In [13]:
# Training Data Preparation
train_text = ""
val_text = ""
test_text = ""

for conversation in train_data:
    train_text += " ".join(conversation) + " "

for conversation in val_data:
    val_text += " ".join(conversation) + " "

for conversation in test_data:
    test_text += " ".join(conversation) + " "

In [14]:
# Tokenize and encode the training, validation, and test text
train_input_ids = tokenizer.encode(train_text, return_tensors="pt", max_length=512, truncation=True).to(device)
val_input_ids = tokenizer.encode(val_text, return_tensors="pt", max_length=512, truncation=True).to(device)
test_input_ids = tokenizer.encode(test_text, return_tensors="pt", max_length=512, truncation=True).to(device)


In [15]:
# Define the training parameters
learning_rate = 1e-4
epochs = 200
batch_size = 2  # Adjust batch size according to your hardware


In [16]:
# Create data loaders
train_data = torch.utils.data.TensorDataset(train_input_ids)
val_data = torch.utils.data.TensorDataset(val_input_ids)
test_data = torch.utils.data.TensorDataset(test_input_ids)

train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
val_dataloader = torch.utils.data.DataLoader(val_data, batch_size=batch_size)
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=batch_size)


In [17]:
# Set up optimizer and scheduler
optimizer = AdamW(model.parameters(), lr=learning_rate)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_dataloader) * epochs)




In [18]:


# Add these lines at the beginning to import the necessary modules
from torch.utils.tensorboard import SummaryWriter

# Create a TensorBoard writer
writer = SummaryWriter()


In [19]:
# Training Loop
model.train()
for epoch in range(epochs):
    total_loss = 0.0
    for batch in train_dataloader:
        inputs = batch[0].to(device)
        labels = inputs.clone()

        optimizer.zero_grad()
        outputs = model(inputs, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{epochs} - Average Loss: {average_loss:.4f}")

    # Log loss to TensorBoard
    writer.add_scalar('Loss/Train', average_loss, epoch)


Epoch 1/200 - Average Loss: 0.8586
Epoch 2/200 - Average Loss: 3.1682
Epoch 3/200 - Average Loss: 0.5095
Epoch 4/200 - Average Loss: 0.4272
Epoch 5/200 - Average Loss: 0.3496
Epoch 6/200 - Average Loss: 0.2780
Epoch 7/200 - Average Loss: 0.2161
Epoch 8/200 - Average Loss: 0.2052
Epoch 9/200 - Average Loss: 0.2316
Epoch 10/200 - Average Loss: 0.2717
Epoch 11/200 - Average Loss: 0.1000
Epoch 12/200 - Average Loss: 0.1317
Epoch 13/200 - Average Loss: 0.0782
Epoch 14/200 - Average Loss: 0.1466
Epoch 15/200 - Average Loss: 0.0700
Epoch 16/200 - Average Loss: 0.1206
Epoch 17/200 - Average Loss: 0.0347
Epoch 18/200 - Average Loss: 0.1460
Epoch 19/200 - Average Loss: 0.0405
Epoch 20/200 - Average Loss: 0.1241
Epoch 21/200 - Average Loss: 0.1141
Epoch 22/200 - Average Loss: 0.0352
Epoch 23/200 - Average Loss: 0.1268
Epoch 24/200 - Average Loss: 0.0644
Epoch 25/200 - Average Loss: 0.0400
Epoch 26/200 - Average Loss: 0.0341
Epoch 27/200 - Average Loss: 0.0544
Epoch 28/200 - Average Loss: 0.0415
E

In [20]:
def generate_response(input_text, max_length=100):
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)  # Ensure input_ids is on the same device as the model

    # Generate a response
    response_ids = model.generate(input_ids, max_length=max_length, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)

    # Decode and return the response
    response = tokenizer.decode(response_ids[0], skip_special_tokens=True)
    return response


In [22]:
# Example conversation
user_input = "Hello, how are you?"
conversation_history = []

while True:
    user_input = input("User: ")

    # Append the user's input to the conversation history
    conversation_history.append(user_input)

    # Generate a response based on the entire conversation history
    bot_response = generate_response(" ".join(conversation_history))
    print("Bot:", bot_response)

    # Optionally, you can add logic to end the conversation (e.g., "exit" command)
    if user_input.lower() == "exit":
        break

User: Hello, how are you?
Bot: Hello, how are you?

I'm a guy who is a guy who is a guy who is a guy who is a guy who is a guy who is a guy who is a guy who is a guy who is a guy who is a guy who is a guy who is a guy who is a guy who is a guy who is a guy who is a guy who is a guy who is a guy who is a guy who is a guy who is a guy who is a guy
User: exit
Bot: Hello, how are you? exit code x y series x y series x y series x y series x y series x y series x y series x y series x y series x y series x y series x y series x y series x y series x y series x y series x y series x y series x y series x y series x y series x y series x y series x y series x y series x y series x y series x y series x y series x y   y
