In [10]:
!pip install transformers datasets torch




In [19]:
import re
import torch
from torch.utils.data import Dataset, random_split
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TrainingArguments, Trainer

# Step 1: Load and Preprocess the Data
file_path = "reviews.txt"  # Replace with the path to your file
with open(file_path, "r", encoding="utf-8") as file:
    data = file.readlines()

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove special characters
    text = re.sub(r"\s+", " ", text).strip()  # Remove extra spaces
    return text

preprocessed_data = [preprocess_text(line) for line in data]

# Step 2: Tokenize Data with Padding
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Add a padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Use the end-of-sequence token as padding

def tokenize_data(data, tokenizer, max_length=50):
    tokenized = tokenizer(
        data,
        truncation=True,  # Truncate sequences longer than max_length
        padding="max_length",  # Pad sequences shorter than max_length
        max_length=max_length,  # Maximum sequence length
        return_tensors="pt"  # Return as PyTorch tensors
    )
    return tokenized

tokenized_data = tokenize_data(preprocessed_data, tokenizer)

# Step 3: Create Dataset Class
class TextDataset(Dataset):
    def __init__(self, input_ids, attention_masks):
        self.input_ids = input_ids
        self.attention_masks = attention_masks

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        input_ids = self.input_ids[idx]
        attention_mask = self.attention_masks[idx]
        labels = input_ids.clone()  # Labels are the same as input_ids

        # Mask the padding tokens in the labels (optional but recommended)
        labels[input_ids == tokenizer.pad_token_id] = -100

        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }


# Prepare input_ids and attention_masks
input_ids = tokenized_data["input_ids"]
attention_masks = tokenized_data["attention_mask"]

# Split data into training and evaluation sets
train_size = int(0.8 * len(input_ids))
eval_size = len(input_ids) - train_size

train_input_ids = input_ids[:train_size]
train_attention_masks = attention_masks[:train_size]
eval_input_ids = input_ids[train_size:]
eval_attention_masks = attention_masks[train_size:]

train_dataset = TextDataset(train_input_ids, train_attention_masks)
eval_dataset = TextDataset(eval_input_ids, eval_attention_masks)

# Step 4: Load Pretrained Model and Set Training Arguments
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Disable WandB
import os
os.environ["WANDB_DISABLED"] = "true"

training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=500,
    evaluation_strategy="steps",
    eval_steps=500,
    logging_dir="./logs",
    logging_steps=100,
    learning_rate=5e-5,
    save_total_limit=2,
    report_to="none",  # Disable external reporting
)

# Step 5: Train the Model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=lambda data: {
        "input_ids": torch.stack([f["input_ids"] for f in data]),
        "attention_mask": torch.stack([f["attention_mask"] for f in data]),
        "labels": torch.stack([f["labels"] for f in data]),  # Include labels
    },
)


# Train the model
trainer.train()

# Step 6: Save the Model and Tokenizer
model.save_pretrained("./fine_tuned_gpt2")
tokenizer.save_pretrained("./fine_tuned_gpt2")

# Step 7: Generate Text
def generate_text(prompt, model, tokenizer, max_length=50):
    input_ids = tokenizer.encode(prompt, return_tensors="pt")
    output = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        do_sample=True,
        top_k=50,
        top_p=0.95
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example Usage
prompt = "The Da Vinci Code is"
generated_text = generate_text(prompt, model, tokenizer)
print("Generated Text:", generated_text)




Step,Training Loss,Validation Loss
500,1.205,2.840474
1000,0.8942,2.675874
1500,0.8833,2.690382
2000,0.7324,2.687572
2500,0.6618,2.714529
3000,0.6134,2.792249
3500,0.5426,2.749416
4000,0.5764,2.799986


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cuda:0 and cpu! (when checking argument for argument index in method wrapper_CUDA__index_select)

In [22]:
import torch

# Check if GPU is available and move the model to the appropriate device (cuda or cpu)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Step 7: Generate Text (Updated to use the correct device)
def generate_text(prompt, model, tokenizer, max_length=20):
    input_ids = tokenizer.encode(prompt, return_tensors="pt").to(device)  # Move input to the same device as the model

    # Generate text
    output = model.generate(
        input_ids,
        max_length=max_length,
        num_return_sequences=1,
        do_sample=True,
        top_k=50,
        top_p=0.95
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example usage
prompt = "The shailendra is a good"
generated_text = generate_text(prompt, model, tokenizer)
print("Generated Text:", generated_text)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Generated Text: The shailendra is a good film all things being a show about how much the da vin
