In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/chatbot-dataset-for-transformers/label_texts.txt
/kaggle/input/chatbot-dataset-for-transformers/dialogs.txt
/kaggle/input/chatbot-dataset-for-transformers/input_texts.txt


In [2]:
# Caminho dos arquivos
dialogs_path = "/kaggle/input/chatbot-dataset-for-transformers/dialogs.txt"
input_texts_path = "/kaggle/input/chatbot-dataset-for-transformers/input_texts.txt"
label_texts_path = "/kaggle/input/chatbot-dataset-for-transformers/label_texts.txt"

In [3]:
# Carregar os dados completos dos arquivos
with open(input_texts_path, 'r', encoding='utf-8') as f:
    input_texts = f.readlines()

with open(label_texts_path, 'r', encoding='utf-8') as f:
    label_texts = f.readlines()

# Garantir que ambos os arquivos tenham o mesmo comprimento
assert len(input_texts) == len(label_texts), "Os arquivos input_texts e label_texts devem ter o mesmo número de entradas."


In [4]:
pip install transformers datasets

Note: you may need to restart the kernel to use updated packages.


In [5]:
import torch
from torch.utils.data import DataLoader
from datasets import Dataset
from transformers import GPT2LMHeadModel, GPT2Tokenizer, AdamW

In [6]:
# Carregar o tokenizador GPT-2 e definir o token de padding
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Preparar os dados com todas as entradas
data = {
    'input_text': [text.strip() for text in input_texts],
    'label_text': [text.strip() for text in label_texts]
}

# Criar o dataset do Hugging Face
train_dataset = Dataset.from_dict(data)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [7]:
max_length = 1000 
# Tokenizar o dataset
def tokenize_function(examples):
    input_encodings = tokenizer(examples['input_text'], padding='max_length', truncation=True, max_length=50)
    label_encodings = tokenizer(examples['label_text'], padding='max_length', truncation=True, max_length=50)
    return {
        'input_ids': input_encodings['input_ids'],
        'labels': label_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask']
    }

tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True)

# Verificar os dados tokenizados
print(tokenized_train_dataset[0])

Map:   0%|          | 0/3982 [00:00<?, ? examples/s]

{'input_text': '[sos] hi, how are you doing? [eos]', 'label_text': "[sos] i'm fine. how about yourself? [eos]", 'input_ids': [58, 82, 418, 60, 23105, 11, 703, 389, 345, 1804, 30, 685, 68, 418, 60, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'labels': [58, 82, 418, 60, 1312, 1101, 3734, 13, 703, 546, 3511, 30, 685, 68, 418, 60, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [8]:
# Definir o dispositivo (GPU ou CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [9]:
# Carregar o modelo GPT-2
model = GPT2LMHeadModel.from_pretrained("gpt2").to(device)

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [10]:
# Definir o otimizador
optimizer = AdamW(model.parameters(), lr=5e-5)



In [11]:
# Definir o DataLoader
train_dataloader = DataLoader(tokenized_train_dataset, batch_size=8, shuffle=True)

In [12]:
# Loop de treinamento
num_epochs = 3  # Ajuste o número de épocas
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Correção: Converter listas de tensores em tensores únicos e para o tipo correto (long)
        input_ids = torch.stack(batch['input_ids']).to(device)
        labels = torch.stack(batch['labels']).long().to(device)  # Converte para long (int64)
        attention_mask = torch.stack(batch['attention_mask']).to(device)

        # Forward pass
        outputs = model(input_ids=input_ids, labels=labels, attention_mask=attention_mask)
        loss = outputs.loss

        # Backward pass e otimização
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    print(f"Epoch {epoch+1} - Loss: {loss.item()}")

# Salvar o modelo treinado
"""model.save_pretrained("caminho/para/salvar/modelo")"""

Epoch 1 - Loss: 1.2926667928695679
Epoch 2 - Loss: 1.4195623397827148
Epoch 3 - Loss: 1.6184022426605225


'model.save_pretrained("caminho/para/salvar/modelo")'