## 1. Prepare Your (URL) Custom Dataset

In [1]:
!pip install datasets



In [2]:
import requests
from bs4 import BeautifulSoup
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
import pandas as pd
from datasets import Dataset

# Scrape and parse the content from the URL
def scrape_website(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Example heuristic: Assume questions are headers (e.g., h2, h3) and responses are paragraphs
    questions = [h2.get_text(strip=True) for h2 in soup.find_all(['h2', 'h3', 'h4'])]
    responses = [p.get_text(strip=True) for p in soup.find_all('p')]

  # Print extracted questions and responses for debugging
    print("Extracted Questions:")
    for q in questions:
        print(f"- {q}")

    print("\nExtracted Responses:")
    for r in responses:
        print(f"- {r}")

    # Pair questions and responses
    # (This simple heuristic assumes they follow each other; modify as needed)
    paired_data = []
    for i, question in enumerate(questions):
        if i < len(responses):
            paired_data.append((question, responses[i]))

  # Print paired data for debugging
    print("\nPaired Data:")
    for q, r in paired_data:
        print(f"Q: {q}")
        print(f"R: {r}")

    return paired_data

# Prepare the dataset for GPT-2
def prepare_dataset(paired_data, tokenizer):
    # Concatenate question and response as a single text
    texts = [q + tokenizer.eos_token + r for q, r in paired_data]

    # Tokenize the text
    tokenized_data = [tokenizer(text, truncation=True, padding='max_length', max_length=128) for text in texts]

    # Convert to Dataset object from the datasets library
    return Dataset.from_pandas(pd.DataFrame(tokenized_data))

# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Padding token
tokenizer.pad_token = tokenizer.eos_token

# Scrape the website and prepare the dataset
url = "https://thearyanschool.edu.np/"  # Replace with the actual URL
paired_data = scrape_website(url)
train_dataset = prepare_dataset(paired_data, tokenizer)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Extracted Questions:
- Digital Notice Board
- Message From The Principal
- Message From the Vice-Principal (Academic)
- Message from the Vice Principal (Admin & Account)
- Message from the Academic Director
- Our ProgramsView More
- Mailing Address
- Emergency Contact
- Education
- School Address
- Scholarship Programs
- Areas of Facilities
- Life @ Aryan School
- History Of The School
- OurInspiringAlumni
- Our Video
- Our Gallery
- Association & Clubs
- Subscribe for Newsletter
- About Us
- Academics
- Other Links
- Follow Us

Extracted Responses:
- Any girls students from BIT and BCA interested for the workshop in Robotics and AI can contact the department by tomorrow 10:00am.contact person:Er. Mukund Raj Joshi+977-9868429529Head of DepartmentDepartment of Science and Technology (BIT/BCA)
- Free Eye and Dental Screening Program by Kathmandu Mero Lions Club in association with Venus hospital at Aryan College.#medical #checkup #LionsClubsInternational#kathmandu
- Know our Alumni(Civil

## 2. Fine-Tune the Model

In [7]:
# Define the data collator for language modeling
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Set up training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-finetuned-url",
    overwrite_output_dir=True,
    num_train_epochs=3,  # Number of epochs
    per_device_train_batch_size=2,  # Adjust batch size if needed
    save_steps=10_000,  # Save model more frequently
    save_total_limit=2,
    prediction_loss_only=True,
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("./gpt2-finetuned-url")
tokenizer.save_pretrained("./gpt2-finetuned-url")

Step,Training Loss


('./gpt2-finetuned-url/tokenizer_config.json',
 './gpt2-finetuned-url/special_tokens_map.json',
 './gpt2-finetuned-url/vocab.json',
 './gpt2-finetuned-url/merges.txt',
 './gpt2-finetuned-url/added_tokens.json')

## 3. Predict with the Fine-Tuned Model

## Exact Response Length from dataset:(Solving)


In [8]:
def get_response_length(question, paired_data, default_length=50):
    # Implement logic to determine the response length based on the dataset
    # For now, just return a default length
    return default_length

def predict_response(question, model, tokenizer, paired_data, temperature=0.3, top_k=40, top_p=0.5, repetition_penalty=1.2):
    # Get the expected response length from the dataset
    max_length = get_response_length(question, paired_data)

    # Encode the question and move the input to the same device as the model
    input_ids = tokenizer.encode(question, return_tensors='pt').to(model.device)

    # Setting attention mask and moving it to the same device
    attention_mask = (input_ids != tokenizer.pad_token_id).long().to(model.device)

    # Generate the response
    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_mask,
        max_length=max_length,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id,
        temperature=temperature,       # Controls randomness: lower is more deterministic, higher is more random
        top_k=top_k,                   # Limits sampling to the top k tokens
        top_p=top_p,                   # Nucleus sampling: selects tokens with cumulative probability up to p
        repetition_penalty=repetition_penalty,  # Applies a penalty to repeated tokens
    )

    # Decoding and post-processing the response
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Remove the input question from the response
    response = response[len(question):].strip()

    return response

# Ensure the model runs on GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Start chatting with the bot
print("Start chatting with the bot (type 'quit' to stop)!")
while True:
    user_input = input("You: ")
    if user_input.lower() == 'quit':
        break
    response = predict_response(user_input, model, tokenizer, paired_data)
    print(f"Bot: {response}")



Start chatting with the bot (type 'quit' to stop)!
You:  School Address




Bot: : department-wide, and university  (contact details) Please contact me for more information regarding the programme. I would like to thank everyone who has been a part of this program so far! Aryan College (IN 5th year
You: Scholarship Programs
Bot: for Students and Scholarships are offered by the University of California at Berkeley. The program is designed to provide a safe harbor between students, faculty members, staff and visitors in order that they will be able access their respective homes with dignity
You:  Life @ Aryan School
Bot: of Engineering & Management (AHA) #7798. pic/twitter: 🐞💜‌️ students and alumni for their future endeavors! 👏👭#!@�ℰ
You: History Of The School
Bot: Principal.
Thesis of the Hon'ble Director, Department and Board (Academic) or Higher Education(Legal/Management).
You: quit


# ***Limited to paired_data***!!!!

In [12]:
import requests
from bs4 import BeautifulSoup
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from difflib import get_close_matches, SequenceMatcher

# Scrape and parse the content from the URL
def scrape_website(url):
    response = requests.get(url)
    soup = BeautifulSoup(response.content, 'html.parser')

    # Example heuristic: Assume questions are headers (e.g., h2, h3) and responses are paragraphs
    questions = [h2.get_text(strip=True) for h2 in soup.find_all(['h2', 'h3', 'h4'])]
    responses = [p.get_text(strip=True) for p in soup.find_all('p')]

    # Pair questions and responses
    paired_data = []
    for i, question in enumerate(questions):
        if i < len(responses):
            paired_data.append((question, responses[i]))

    return paired_data

# Load pre-trained GPT-2 model and tokenizer (if still needed)
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Scrape the website and prepare the paired data
url = "https://thearyanschool.edu.np/"  # Replace with the actual URL
paired_data = scrape_website(url)

# Function to find the closest match in the paired data with more flexibility
def find_closest_match(question, paired_data, threshold=0.5):
    # Normalize question text
    question = question.lower()

    # List of questions from paired data
    questions = [pair[0] for pair in paired_data]

    # Function to compute similarity score
    def similar(a, b):
        return SequenceMatcher(None, a, b).ratio()

    # Find the best match based on similarity score
    best_match = None
    highest_score = 0

    for q in questions:
        score = similar(question, q)
        if score > highest_score and score > threshold:
            highest_score = score
            best_match = q

    if best_match:
        for pair in paired_data:
            if pair[0] == best_match:
                return pair[1]

    return None

# Function to respond based on the closest match in the paired data
def respond_from_paired_data(question, paired_data):
    response = find_closest_match(question, paired_data)
    if response:
        return response
    else:
        # Generic fallback response if no close match is found
        return "I'm not sure about that. Could you please clarify?"


# Start chatting with the bot
print("Start chatting with the bot (type 'quit' to stop)!")
while True:
    user_input = input("You: ")
    if user_input.lower() == 'quit':
        break
    response = respond_from_paired_data(user_input, paired_data)
    print(f"Bot: {response}")


Start chatting with the bot (type 'quit' to stop)!
You: history
Bot: I'm not sure about that. Could you please clarify?
You: school
Bot: I'm not sure about that. Could you please clarify?
You: history of
Bot: for passing BE Civil VII Semester University Examination !
You: of school
Bot: for passing BE Civil VII Semester University Examination !
You: areas
Bot: I'm not sure about that. Could you please clarify?
You: Areas of
Bot: Field visit notice!
You: Address
Bot: Congratulations Mr. Ameer Sampang Rai (BIT 8th Semester).
You: address
Bot: Congratulations Mr. Ameer Sampang Rai (BIT 8th Semester).
You: Academics
Bot: Welcome !
You: academic
Bot: Welcome !
You: histroy
Bot: I'm not sure about that. Could you please clarify?
You: History
Bot: I'm not sure about that. Could you please clarify?
You: Aryan School
Bot: Congratulations 🎉 👏
You: quit
