In [1]:
!pip install transformers datasets torch accelerate




In [9]:
import pandas as pd
df = pd.read_csv("qa_dataset.csv", quotechar='"', escapechar='\\')
df.head()


Unnamed: 0,question,answer
0,What is Artificial Intelligence?,Artificial Intelligence is the simulation of h...
1,Who developed Python?,Python was developed by Guido van Rossum and f...
2,What is Machine Learning?,Machine Learning is a subset of AI that enable...
3,What is Deep Learning?,Deep Learning is a subset of Machine Learning ...
4,What is Natural Language Processing?,Natural Language Processing is a field of AI t...


In [11]:
import pandas as pd
from datasets import Dataset

# Load the CSV you uploaded
df = pd.read_csv("qa_dataset.csv")

# Combine question and answer into a single conversational text
df["text"] = "User: " + df["question"] + "\nBot: " + df["answer"]

# Convert to Hugging Face dataset
dataset = Dataset.from_pandas(df[["text"]])
dataset = dataset.train_test_split(test_size=0.1)

print(dataset)



DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 18
    })
    test: Dataset({
        features: ['text'],
        num_rows: 2
    })
})


In [12]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def tokenize(batch):
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=128)

tokenized_datasets = dataset.map(tokenize, batched=True)


Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [13]:
print(tokenized_datasets["train"][0])


{'text': 'User: What is a loss function?\nBot: A loss function measures the difference between a model’s predictions and the true values during training.', 'input_ids': [12982, 25, 1867, 318, 257, 2994, 2163, 30, 198, 20630, 25, 317, 2994, 2163, 5260, 262, 3580, 1022, 257, 2746, 447, 247, 82, 16277, 290, 262, 2081, 3815, 1141, 3047, 13, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 502

In [23]:
# Add labels for language modeling (GPT-2 needs them)
tokenized_datasets = tokenized_datasets.map(
    lambda samples: {"labels": samples["input_ids"]}, batched=True
)


Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

In [14]:
from transformers import GPT2LMHeadModel

# Load pre-trained GPT-2 base model
model = GPT2LMHeadModel.from_pretrained("gpt2")

# Make sure tokenizer and model have same vocab size
model.resize_token_embeddings(len(tokenizer))


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Embedding(50257, 768)

In [18]:
!pip install -U transformers




In [24]:
from transformers import GPT2LMHeadModel, Trainer, TrainingArguments

model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))

# ✅ Add labels
tokenized_datasets = tokenized_datasets.map(
    lambda samples: {"labels": samples["input_ids"]}, batched=True
)

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",     # or evaluation_strategy if updated version
    learning_rate=5e-5,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    warmup_steps=50,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    report_to="none",          # ✅ Disable W&B if you don't want tracking
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["test"],
    tokenizer=tokenizer,
)

trainer.train()


Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.
`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss
1,No log,6.195826
2,7.188000,0.891544
3,2.767700,0.580513




TrainOutput(global_step=27, training_loss=3.919660974431921, metrics={'train_runtime': 156.9909, 'train_samples_per_second': 0.344, 'train_steps_per_second': 0.172, 'total_flos': 3527442432000.0, 'train_loss': 3.919660974431921, 'epoch': 3.0})

In [25]:
def chat(prompt):
    inputs = tokenizer.encode(prompt, return_tensors="pt")
    outputs = model.generate(
        inputs,
        max_length=120,
        top_k=50,
        top_p=0.9,
        temperature=0.8,
        pad_token_id=tokenizer.eos_token_id,
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)

print(chat("User: What is Artificial Intelligence?\nBot:"))


The following generation flags are not valid and may be ignored: ['temperature', 'top_p']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


User: What is Artificial Intelligence?
Bot: Artificial intelligence is a type of artificial intelligence that is designed to be able to understand and understand human behavior.
