In [None]:
pip install transformers datasets torch

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K

In [None]:
import os
import json
from transformers import (
    GPT2Tokenizer,
    GPT2LMHeadModel,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset, Dataset, concatenate_datasets
import torch
import os


# Base checkpoint shared by the tokenizer and the model
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Reuse the EOS token for padding when the tokenizer does not define a pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Every relative path from here on (the JSON files, ./results, ./fine_tuned_gpt2)
# resolves against this directory.
# NOTE(review): presumably a mounted Google Drive folder; the mount step is not shown here.
os.chdir("/content/drive/MyDrive/QA_dataset/Json")

# Question/answer JSON files to train on, in concatenation order
files = ["Generate_workout.json", "g2.json", "g4.json", "nutrition.json"]
datasets = list(map(Dataset.from_json, files))

# Single dataset holding the rows of all files, one file after another
dataset = concatenate_datasets(datasets)

# Function to concatenate question and answer
def format_data(examples):
    """Merge each question/answer pair of a batch into one training string.

    Args:
        examples: Batch mapping with parallel 'question' and 'answer' sequences.

    Returns:
        A dict with a single 'text' key: one string per pair, made of
        "Question: <question>", a newline, then "Answer: <answer>".
        Pairs are formed with zip, so extra items in the longer sequence are ignored.
    """
    texts = []
    for question, answer in zip(examples['question'], examples['answer']):
        texts.append(f"Question: {question}\nAnswer: {answer}")
    return {'text': texts}

# Build the 'text' column batch by batch and drop the raw question/answer columns
formatted_dataset = dataset.map(
    format_data,
    remove_columns=['question', 'answer'],
    batched=True,
)

# Tokenize the dataset
def tokenize_function(examples, max_length=512):
    """Tokenize the 'text' column of a batch, truncating long examples.

    No padding is applied here. Padding every example to ``max_length`` makes each
    batch pay for full-length sequences regardless of how short the texts are; the
    DataCollatorForLanguageModeling handed to the Trainer pads each batch to its
    own longest example instead.

    Args:
        examples: Batch mapping that contains a 'text' sequence of strings.
        max_length: Maximum number of tokens kept per example (default 512).

    Returns:
        Whatever the module-level ``tokenizer`` returns for the batch.
    """
    return tokenizer(examples['text'], truncation=True, max_length=max_length)

# Tokenize every formatted example, batch by batch
tokenized_dataset = formatted_dataset.map(tokenize_function, batched=True)

# Return only the model inputs, as PyTorch tensors
tokenized_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

# Shuffle before splitting. The dataset is an ordered concatenation of the source
# files, so a purely positional 90/10 cut would build the evaluation set from the
# tail of the last file only. A fixed seed keeps the split reproducible.
shuffled_dataset = tokenized_dataset.shuffle(seed=42)
split_index = int(len(shuffled_dataset) * 0.9)

# First 90% of the shuffled rows for training, remaining 10% for evaluation
train_dataset = shuffled_dataset.select(range(split_index))
eval_dataset = shuffled_dataset.select(range(split_index, len(shuffled_dataset)))

# Collator that builds language-modeling batches; mlm=False selects causal language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Hyperparameters and checkpoint settings for the run.
# NOTE(review): the recorded run of this cell ended with
# "RuntimeError: [enforce fail at inline_container.cc:603]" after epoch 2. That looks
# like a failed checkpoint write; output_dir is relative, so checkpoints land under
# the directory selected by os.chdir. Confirm that location has enough free space.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",  # evaluate once per epoch
    save_total_limit=2,
)

# Wire model, data and collator together
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

# Run the fine-tuning loop
trainer.train()

# Write the model, then the tokenizer, into the same output folder
for artifact in (model, tokenizer):
    artifact.save_pretrained("./fine_tuned_gpt2")



Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss
1,0.5672,0.391242
2,0.514,0.37978


RuntimeError: [enforce fail at inline_container.cc:603] . unexpected pos 634855808 vs 634855700

In [None]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextGenerationPipeline

import torch

# Load the fine-tuned model and tokenizer saved by the training cell
tokenizer = GPT2Tokenizer.from_pretrained("./fine_tuned_gpt2")
model = GPT2LMHeadModel.from_pretrained("./fine_tuned_gpt2")

# Create a text generation pipeline.
# Use the first CUDA GPU when one is available and fall back to CPU (-1) otherwise,
# instead of hard-coding device=0, which fails on a CPU-only runtime.
pipeline = TextGenerationPipeline(
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1,
)

def generate_response(question, max_length=150):
    """Generate an answer to ``question`` with the fine-tuned model.

    The prompt follows the template the model was fine-tuned on
    ("Question: ..." on one line, "Answer: ..." on the next). Prompting with a
    different template (e.g. "User:/Assistant:") asks the model to continue text
    it never saw during training.

    Args:
        question: The question text to put into the prompt.
        max_length: Passed through as the pipeline's ``max_length`` (default 150).

    Returns:
        The generated answer as a stripped string, cut at the first
        "<|endoftext|>" marker or at the start of a follow-up "Question:" turn,
        whichever comes first.
    """
    prompt = f"Question: {question}\nAnswer:"
    response = pipeline(
        prompt,
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        clean_up_tokenization_spaces=True,
        pad_token_id=tokenizer.eos_token_id,
        # Ask for the continuation only, so the answer does not have to be
        # located by searching the output for a marker string.
        return_full_text=False,
    )
    completion = response[0]['generated_text']
    # Drop anything after an end-of-text marker, should one appear in the text
    completion = completion.split("<|endoftext|>")[0]
    # The model may keep going with another question/answer pair; keep the first answer only
    completion = completion.split("\nQuestion:")[0]
    return completion.strip()

# Demo: ask one question and print the generated reply
question = "What is the capital of Germany?"
answer = generate_response(question=question)
print(f"Assistant: {answer}")
