In [41]:
pip install transformers datasets



In [42]:
import pandas as pd
from datasets import Dataset

In [43]:
df = pd.read_csv("/content/drive/MyDrive/NLP project /Sources/course_qa_500_unique.csv")

In [44]:
print(df.columns.tolist())


['question', 'answer']


In [45]:
df.head()

Unnamed: 0,question,answer
0,What does it mean if EEX4351 lists 'CR' or 'CA...,"'CR' means you must register for the course, a..."
1,What should I complete before attempting an ad...,Advanced courses like EEX7340 often require pa...
2,What does the 3rd character in the course code...,The 3rd character in the course code represent...
3,What makes me eligible to take EEX5335?,"To be eligible for EEX5335, you must fulfill t..."
4,What subjects do I need before I can take EEW6...,You need to complete certain prerequisite subj...


In [46]:
df.shape

(152, 2)

In [47]:
# Normalize
df["question"] = df["question"].str.lower()
df["question"] = df["question"].str.strip()
df["answer"] = df["answer"].str.strip()


In [48]:
df.shape

(152, 2)

In [49]:
# Optional: remove excessive spaces
df["question"] = df["question"].str.replace(r"\s+", " ", regex=True)

In [50]:
df.shape

(152, 2)

In [51]:
# Drop duplicate rows
df = df.drop_duplicates(subset=["question", "answer"])

In [52]:
df.shape

(152, 2)

In [53]:
# Save clean file
df.to_csv("chatbot_university_qa_clean.csv", index=False)

In [54]:
# Load CSV into a Hugging Face Dataset
dataset = Dataset.from_pandas(df)

In [55]:
print(dataset)

Dataset({
    features: ['question', 'answer'],
    num_rows: 152
})


In [56]:
from transformers import T5Tokenizer

In [57]:
#Load Tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")

In [58]:
# #Tokenization Function
# def preprocess(example):
#     input_text = "question: " + example["question"]
#     target_text = example["answer"]

#     model_inputs = tokenizer(
#         input_text,
#         max_length=128,
#         truncation=True,
#         padding="max_length"     # optional if you want fixed length
#     )

#     labels = tokenizer(
#         target_text,
#         max_length=128,
#         truncation=True,
#         padding="max_length"
#     )

#     model_inputs["labels"] = labels["input_ids"]
#     return model_inputs


In [59]:
def preprocess(example):
    input_text = "question: " + example["question"]
    target_text = example["answer"]

    model_inputs = tokenizer(
        input_text,
        max_length=128,
        truncation=True,
        padding="max_length"
    )

    labels = tokenizer(
        target_text,
        max_length=128,
        truncation=True,
        padding="max_length"
    )

    # Replace padding token IDs with -100
    labels["input_ids"] = [
        (l if l != tokenizer.pad_token_id else -100) for l in labels["input_ids"]
    ]

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs


In [60]:
#Tokenize Entire Dataset
tokenized_dataset = dataset.map(preprocess)

Map:   0%|          | 0/152 [00:00<?, ? examples/s]

In [61]:
result = preprocess({
    "question": "Can I take EEX5270 if I failed EEX3336?",
    "answer": "No, you must pass EEX3336 before taking EEX5270."
})


In [62]:
print(result)

{'input_ids': [822, 10, 1072, 27, 240, 3, 5080, 4, 5373, 2518, 3, 99, 27, 4567, 3, 5080, 4, 4201, 3420, 58, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'labels': [465, 6, 25, 398, 1903, 3, 5080, 4, 4201, 3420, 274, 838, 3, 5080, 4, 5373, 2518, 5, 1, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,

In [63]:
#Load Model
from transformers import T5ForConditionalGeneration
model = T5ForConditionalGeneration.from_pretrained("t5-small")

In [64]:
#Create Data Collator
from transformers import DataCollatorForSeq2Seq
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [65]:
#Define Training Arguments
from transformers import TrainingArguments
training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/NLP project /Sources/University_chatbot",
    per_device_train_batch_size=8,
    num_train_epochs=10,
    logging_steps=10,
    save_total_limit=2
)


# training_args = TrainingArguments(
#     output_dir="./university_chatbot",
#     per_device_train_batch_size=8,
#     num_train_epochs=3,
#     logging_steps=10,
#     save_total_limit=2,
#     evaluation_strategy="no",    # change to "steps" if you have a validation set
#     fp16=False                   # set True if running on supported GPU
# )


In [66]:
#Create Trainer
from transformers import Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator
)

In [67]:
#Train
trainer.train()

Step,Training Loss
10,4.2693
20,3.5685
30,3.1121
40,2.8178
50,2.6086
60,2.2695
70,2.2061
80,2.033
90,1.9046
100,1.784


TrainOutput(global_step=190, training_loss=2.1019189533434415, metrics={'train_runtime': 1814.7869, 'train_samples_per_second': 0.838, 'train_steps_per_second': 0.105, 'total_flos': 51429884559360.0, 'train_loss': 2.1019189533434415, 'epoch': 10.0})

In [68]:
#Save Model
trainer.save_model("/content/drive/MyDrive/NLP project /Sources/University_chatbot")

In [69]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

model_path = "/content/drive/MyDrive/NLP project /Sources/University_chatbot"

model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)

In [70]:
question = "Can I take EEX5270 if I failed EEX3336?"
input_text = "question: " + question

In [71]:
print (input_text)

question: Can I take EEX5270 if I failed EEX3336?


In [72]:
#Tokenize Question
input_ids = tokenizer.encode(
    input_text,
    return_tensors="pt"
)

In [73]:
print(input_ids)

tensor([[ 822,   10, 1072,   27,  240,    3, 5080,    4, 5373, 2518,    3,   99,
           27, 4567,    3, 5080,    4, 4201, 3420,   58,    1]])


In [74]:
outputs = model.generate(
    input_ids,
    max_length=128,
    num_beams=4,
    early_stopping=True
)


In [75]:
# #Generate The Answer
# outputs = model.generate(
#     input_ids,
#     max_length=128,
#     num_beams=4,            # optional - improves quality
#     #early_stopping=True
# )


In [76]:
print (outputs)

tensor([[   0,  156,   27, 4567,    3, 5080,    4, 4201, 3420,    6,   25,   54,
          240,    3, 5080,    4, 5373, 2518,    5,    1]])


In [77]:
#Decode The Answer
answer = tokenizer.decode(
    outputs[0],
    skip_special_tokens=True
)

print("Chatbot Answer:", answer)


Chatbot Answer: If I failed EEX3336, you can take EEX5270.


In [78]:
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Load your fine-tuned model
model_path = "/content/drive/MyDrive/NLP project /Sources/University_chatbot"

model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = T5Tokenizer.from_pretrained(model_path)

# List of your questions
questions = [
    "Can I take EEX5570 if I failed EEX3336?",
    "How many credits do I need for EEX5270?"
]

for q in questions:
    input_text = "question: " + q
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    outputs = model.generate(
        input_ids,
        max_length=128,
        num_beams=4,
        early_stopping=True
    )

    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print(f"Q: {q}")
    print(f"A: {answer}\n")


Q: Can I take EEX5570 if I failed EEX3336?
A: If I failed EEX3336, you can take EEX5570.

Q: How many credits do I need for EEX5270?
A: You need to complete a required number of credits for EEX5270.



In [79]:
# from transformers import T5ForConditionalGeneration, TrainingArguments, Trainer

# model = T5ForConditionalGeneration.from_pretrained("t5-small")

# training_args = TrainingArguments(
#     output_dir="./university_chatbot",
#     per_device_train_batch_size=8,
#     num_train_epochs=3,
#     logging_steps=10,
#     save_total_limit=2
# )

# trainer = Trainer(
#     model=model,
#     args=training_args,
#     train_dataset=tokenized_dataset,
# )

# trainer.train()
