In [None]:
!pip install transformers datasets torch accelerate


In [None]:
import pandas as pd

# Quick look at the raw Q/A file: parse it with `"` as the quote character
# and a backslash as the escape character, then show the first rows.
df = pd.read_csv(
    "qa_dataset.csv",
    escapechar="\\",
    quotechar='"',
)
df.head()


In [None]:
import pandas as pd
from datasets import Dataset

# Load the CSV you uploaded. The quote/escape options match the earlier
# preview read of the same file so both reads parse it identically.
df = pd.read_csv("qa_dataset.csv", quotechar='"', escapechar='\\')

# A missing question or answer would make the concatenation below yield NaN
# instead of a string, which cannot be tokenized later; drop such rows.
df = df.dropna(subset=["question", "answer"]).reset_index(drop=True)

# Combine question and answer into a single conversational text.
# astype(str) guards against columns that pandas parsed as numbers.
df["text"] = (
    "User: " + df["question"].astype(str) + "\nBot: " + df["answer"].astype(str)
)

# Convert to Hugging Face dataset (the pandas index is not needed as a column)
dataset = Dataset.from_pandas(df[["text"]], preserve_index=False)

# Hold out 10% for evaluation; the fixed seed makes the split reproducible.
dataset = dataset.train_test_split(test_size=0.1, seed=42)

print(dataset)



In [None]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token; the fixed-length
# padding requested in tokenize() needs a pad token to be set.
tokenizer.pad_token = tokenizer.eos_token


def tokenize(batch):
    """Tokenize the "text" field of a batch to exactly 128 tokens per example.

    Longer texts are truncated and shorter ones are padded up to 128.
    Returns the tokenizer's output for the batch.
    """
    encoded = tokenizer(
        batch["text"],
        max_length=128,
        padding="max_length",
        truncation=True,
    )
    return encoded


# Apply the tokenizer to every split, processing examples in batches.
tokenized_datasets = dataset.map(tokenize, batched=True)


In [None]:
# Sanity check: show the first tokenized training example.
train_split = tokenized_datasets["train"]
print(train_split[0])


In [None]:
# Add labels for language modeling (GPT-2 needs them).
# Labels are a copy of input_ids, except that padding positions
# (attention_mask == 0) are set to -100 so the loss ignores them. The pad token
# is the same id as EOS here, so padding cannot be told apart by token id and
# the attention mask is used instead.
tokenized_datasets = tokenized_datasets.map(
    lambda samples: {
        "labels": [
            [token if keep else -100 for token, keep in zip(ids, mask)]
            for ids, mask in zip(samples["input_ids"], samples["attention_mask"])
        ]
    },
    batched=True,
)


In [None]:
from transformers import GPT2LMHeadModel

# Fetch the pre-trained GPT-2 base checkpoint.
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Size the model's token embeddings to the tokenizer's vocabulary length.
tokenizer_vocab_size = len(tokenizer)
model.resize_token_embeddings(tokenizer_vocab_size)


In [None]:
!pip install -U transformers


In [None]:
import inspect

from transformers import GPT2LMHeadModel, Trainer, TrainingArguments

# Load a fresh pre-trained GPT-2 in this cell, so training starts from the
# base checkpoint each time the cell is run.
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))


def _labels_ignoring_padding(samples):
    """Build causal-LM labels for a batch: input_ids with padding set to -100.

    Positions where attention_mask is 0 are padding. The pad token shares its id
    with EOS, so padding is identified via the mask rather than the token id.
    -100 is the label value the model's loss skips.
    """
    return {
        "labels": [
            [token if keep else -100 for token, keep in zip(ids, mask)]
            for ids, mask in zip(samples["input_ids"], samples["attention_mask"])
        ]
    }


# Add labels (computed here so this cell does not depend on an earlier labels step)
tokenized_datasets = tokenized_datasets.map(_labels_ignoring_padding, batched=True)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",     # called `evaluation_strategy` in older transformers releases
    learning_rate=5e-5,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    report_to="none",          # disable external experiment tracking (e.g. W&B)
)

# Newer transformers releases accept the tokenizer as `processing_class`;
# older ones only know `tokenizer`. Choose whichever the installed Trainer
# declares so the cell works on either side of that rename.
_trainer_params = inspect.signature(Trainer.__init__).parameters
_tokenizer_kwarg = (
    "processing_class" if "processing_class" in _trainer_params else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    **{_tokenizer_kwarg: tokenizer},
)

trainer.train()


In [None]:
def chat(prompt):
    """Generate a reply for ``prompt`` with the fine-tuned model.

    Args:
        prompt: Text to continue, e.g. ``"User: <question>\\nBot:"``.

    Returns:
        The decoded sequence (prompt plus generated continuation) with special
        tokens removed. Output is sampled, so it varies between calls.
    """
    # Make sure dropout is off while generating.
    model.eval()

    # Encode with an attention mask and put the tensors on the model's device;
    # after training the model may live on an accelerator rather than the CPU.
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)

    outputs = model.generate(
        **encoded,
        max_length=120,  # total length, prompt included
        do_sample=True,  # required for top_k / top_p / temperature to take effect
        top_k=50,
        top_p=0.9,
        temperature=0.8,
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

# Try the chatbot on a sample question, using the User/Bot prompt format.
demo_prompt = "User: What is Artificial Intelligence?\nBot:"
print(chat(demo_prompt))
