In [1]:
!pip install transformers datasets


Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl (

In [2]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
from datasets import load_dataset, Dataset
import json

In [35]:
# Step 1: Load the dataset
def load_data(file_path):
    with open(file_path, "r") as f:
        data = json.load(f)
    # Combine prompt and response for causal language modeling
    return [{"text": f"<|startoftext|>{item['prompt']} {item['response']}<|endoftext|>"} for item in data]

data = load_data("chat_data.json")
dataset = Dataset.from_list(data)

In [36]:
print(dataset)

Dataset({
    features: ['text'],
    num_rows: 50
})


In [37]:
# Step 2: Tokenize the dataset
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

def preprocess(data):
    inputs = tokenizer(
        data["text"],
        truncation=True,
        padding="max_length",
        max_length=128,
    )
    # Set the labels to be the same as input_ids for causal LM
    inputs["labels"] = inputs["input_ids"].copy()
    return inputs

tokenized_data = dataset.map(preprocess, batched=True)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

In [49]:
# Step 3: Load the model
model = AutoModelForCausalLM.from_pretrained("gpt2")

# Step 4: Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=50,
    evaluation_strategy="no",
    report_to="none",
)



In [50]:
# Step 5: Fine-tune the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data,
    tokenizer=tokenizer,
)

trainer.train()

  trainer = Trainer(


Step,Training Loss
50,0.9454


TrainOutput(global_step=75, training_loss=0.7338128089904785, metrics={'train_runtime': 22.7893, 'train_samples_per_second': 6.582, 'train_steps_per_second': 3.291, 'total_flos': 9798451200000.0, 'train_loss': 0.7338128089904785, 'epoch': 3.0})

In [51]:
# Save the fine-tuned model
trainer.save_model("./fine_tuned_chatbot")
tokenizer.save_pretrained("./fine_tuned_chatbot")

('./fine_tuned_chatbot/tokenizer_config.json',
 './fine_tuned_chatbot/special_tokens_map.json',
 './fine_tuned_chatbot/vocab.json',
 './fine_tuned_chatbot/merges.txt',
 './fine_tuned_chatbot/added_tokens.json',
 './fine_tuned_chatbot/tokenizer.json')

In [52]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the fine-tuned model
model = AutoModelForCausalLM.from_pretrained("./fine_tuned_chatbot")
tokenizer = AutoTokenizer.from_pretrained("./fine_tuned_chatbot")

# Ensure the pad token is set properly
tokenizer.pad_token = tokenizer.eos_token

def chat_with_model(prompt):
    # Prepare input text
    input_text = f"<|startoftext|>{prompt}"
    inputs = tokenizer(input_text, return_tensors="pt", padding=True, truncation=True, max_length=128)

    # Generate response with attention mask
    outputs = model.generate(
        inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_length=150,
        pad_token_id=tokenizer.eos_token_id,
        num_return_sequences=1,
        # temperature=0.7,  # Adjust for randomness in output
    )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Return the generated response
    return response.split(prompt)[-1].strip()



In [54]:
# Example usage
print(chat_with_model("How can I access the e-library?"))

You can access the e-library through the student portal under the 'Access to the e-library' section.
