<a href="https://colab.research.google.com/github/rishi0728/ordered/blob/main/DholuGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# New section

In [77]:
# ✅ Step 1: Install dependencies
# pip install transformers datasets --quiet

# ✅ Step 2: Hinglish-style chat dataset
chat_lines = [
    "bhai kal college chalna hai?",
    "haan bhai, 10 baje nikalte hain",
    "usne reply hi nahi kiya 😢",
    "exam kab hai bhai?",
    "20 tareek ko hai, padle ab!",
    "bhai kal raat 2 baje tak deploy kar raha tha 😭",
    "manager ne firse bola weekend me bhi login karna hoga",
    "ghar pe internet nahi chal raha 😓",
    "chai peene chalein?",
    "client bola changes urgent hai, shaadi cancel kar do ab",
    "standup mein bas bolte rehna hai, kaam kuch nahi hota",
    "mummy gussa hai aaj 😅",
    "salary to fixed hai, kaam daily increment ho raha",
    "bhai system slow hai, ya zindagi?",
    "college ke time mein chatGPT hota toh CGPA bhi AI hoti",
    "project manager ne bola timeline flexible hai, but delivery kal hi chahiye",
]

# ✅ Step 3: Load tokenizer and model
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config
from datasets import Dataset
from transformers import Trainer, TrainingArguments

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

config = GPT2Config.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2", config=config)
model.resize_token_embeddings(len(tokenizer))

# ✅ Step 4: Prepare dataset
split_data = [{"text": line} for line in chat_lines]
dataset = Dataset.from_list(split_data)

def tokenize_function(example):
    tokenized = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=64,
    )
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

tokenized_dataset = dataset.map(tokenize_function)

# ✅ Step 5: Training
training_args = TrainingArguments(
    output_dir="./dholugpt",
    per_device_train_batch_size=2,
    num_train_epochs=8,
    logging_steps=5,
    save_steps=20,
    save_total_limit=1,
    logging_dir="./logs",
    remove_unused_columns=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

trainer.train()

# ✅ Step 6: Save model
model.save_pretrained("./dholugpt")
tokenizer.save_pretrained("./dholugpt")

# ✅ Step 7: Chatting Function
def chat_with_dholu(input_text):
    inputs = tokenizer.encode(input_text + tokenizer.eos_token, return_tensors="pt")
    outputs = model.generate(
        inputs,
        max_length=100,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,  # Allow sampling for some randomness
        top_k=50,  # Limit the sample pool
        top_p=0.95,  # Consider 95% of the token probability distribution
        temperature=0.1,  # Slight randomness for diversity
    )
    reply = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("🧠 DholuGPT:", reply.replace(input_text, "").strip())

# ✅ Step 8: Try it
chat_with_dholu("bhai kal raat 2 baje tak deploy kar raha tha 😭")
chat_with_dholu("exam kab hai bhai?")
chat_with_dholu("usne mujhe message kiya kal finally 😭")


Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Step,Training Loss
5,4.1252
10,1.1492
15,1.0852
20,0.8304
25,1.0237
30,0.8907
35,0.6538
40,0.7805
45,0.5964
50,0.5918


🧠 DholuGPT: 
🧠 DholuGPT: 
🧠 DholuGPT: 


In [93]:
# Add more Hinglish-style chat dataset with English and Hindi slang, no emojis
chat_lines = [
    "bhai kal college chalna hai?",
    "haan bhai, 10 baje nikalte hain",
    "usne reply hi nahi kiya",
    "exam kab hai bhai?",
    "20 tareek ko hai, padle ab!",
    "bhai kal raat 2 baje tak deploy kar raha tha",
    "manager ne firse bola weekend me bhi login karna hoga",
    "ghar pe internet nahi chal raha",
    "chai peene chalein?",
    "client bola changes urgent hai, shaadi cancel kar do ab",
    "standup mein bas bolte rehna hai, kaam kuch nahi hota",
    "mummy gussa hai aaj",
    "salary to fixed hai, kaam daily increment ho raha",
    "bhai system slow hai, ya zindagi?",
    "college ke time mein chatGPT hota toh CGPA bhi AI hoti",
    "project manager ne bola timeline flexible hai, but delivery kal hi chahiye",

    # Add more Hinglish lines with slang
    "bhai yeh jo client hai na, bilkul pagal hai",
    "exam ka time hai aur sab apne apne chill kar rahe hain",
    "bhai system crash ho gaya, kya karoon?",
    "ghar ka kaam hi khatam nahi hota, sabko laga abhi party chalegi",
    "office mein sab apne apne kaam mein busy hain, koi kaam ka nahi",
    "yaar, maine bola tha na ki weekend pe kaam karte hain, par ye manager samajhta hi nahi",
    "mujhe pata tha ki client waisa hi kuch kahega, har bar wahi hota hai",
    "yaar ye jo app hai, itna slow kyu chal raha hai?",
    "boss ne bola kal project ko finalize karna hai, mujhe laga sab set hai, par abhi tak kaam start hi nahi hua!",
    "tumne yeh task complete kiya ya fir se postpone ho gaya?",
    "boss ka phone aaya, bola project update chahiye, main soch raha tha yeh weekend tha kahan!",
    "accha hua tumne mujhe yaad dilaya, woh task pending tha",
    "bhai kal exam hai, aur aaj bhi revision ki koi planning nahi ki",
    "client ko samjha nahi paa rahe hain, ab woh humse irritate ho gaye hain",
    "manager bola ki weekend pe bhi kaam karna hai, boss ki mood swings dekhke pata chal raha hai ki kis din kaunsa kaam milega",
    "yaar, yeh work from home waale din bhi ab bore kar rahe hain",
    "thoda chill karo yaar, sab kaam ho hi jayega, bas tension mat lo",
    "pata nahi yaar, abhi tak mujhe time mila nahi, kal se kuch kaam karenge",
    "oye, kabhi kaam ke baare mein sochne ki zaroorat padti hai, apne apne kaam se fursat nahi milti!",
    "yeh jo log hain na jo 'work from home' ki baat karte hain, unko pata nahi ki ghar mein kaam bhi hota hai!",
    "kaam karte karte kabhi lagta hai ki zindagi ka matlab hi yeh hai ki kaam karte raho",
    "manager ne bola kuch extra features chahiye, ab hum kaha se laaye?",
    "boss ke saath call pe baat karke lag raha hai ki koi epic meeting thi",
    "office mein sab log kitne chill hote hain, main hi kaam kar raha hoon",
    "project ki deadline pass aa gayi hai, abhi tak kaam ka kuch pata nahi",
    "dost ka message aaya, bola weekend pe chill karte hain, par kaam ka pressure kuch zyada ho gaya hai",
    "tumko pata hai na ki abhi office ke andar ka kaam waise hi chalu hai, sab ka tension baitha hai",
    "client ko apni situation samjha nahi paaye, ab woh humse irritate ho gaye hain",
    "team mein sab apne apne kaam mein busy hain, koi help nahi karta",
    "apne office ka to yeh haal hai, sab log kaam ko avoid kar rahe hain",
    "boss ka mood aaj kaafi chill hai, lagta hai kuch badhiya kaam ho gaya hai",
    "naya project start ho gaya hai, aur sab log apne apne ideas rakh rahe hain!",
    "bhai weekend pe apna plan hai, kaam nahi karna!"
]

# Convert chat_lines to dataset
split_data = [{"text": line} for line in chat_lines]
dataset = Dataset.from_list(split_data)

# Tokenizing function
def tokenize_function(example):
    tokenized = tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=64,
    )
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized

tokenized_dataset = dataset.map(tokenize_function)

# Continue with the training process
training_args = TrainingArguments(
    output_dir="./dholugpt",
    per_device_train_batch_size=2,
    num_train_epochs=8,
    logging_steps=5,
    save_steps=20,
    save_total_limit=1,
    logging_dir="./logs",
    remove_unused_columns=False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

trainer.train()

# Save the updated model
model.save_pretrained("./dholugpt")
tokenizer.save_pretrained("./dholugpt")


Map:   0%|          | 0/49 [00:00<?, ? examples/s]

Step,Training Loss
5,1.9281
10,1.4214
15,1.6732
20,0.9993
25,1.3937
30,1.2222
35,1.0739
40,1.3151
45,1.1035
50,1.0562


('./dholugpt/tokenizer_config.json',
 './dholugpt/special_tokens_map.json',
 './dholugpt/vocab.json',
 './dholugpt/merges.txt',
 './dholugpt/added_tokens.json')

In [104]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# ✅ Load model and tokenizer
model = GPT2LMHeadModel.from_pretrained("./dholugpt")
tokenizer = GPT2Tokenizer.from_pretrained("./dholugpt")

# Set pad token to eos token (important for generation)
tokenizer.pad_token = tokenizer.eos_token
model.eval()

# Chat function to interact with DholuGPT
def chat_with_dholu(input_text):
    conversation_prompt = f"User: {input_text}\nDholuGPT: "

    # Encoding the input with tokenizer
    inputs = tokenizer.encode(conversation_prompt, return_tensors="pt")

    # Generate response with diverse settings
    outputs = model.generate(
        inputs,
        max_length=150,  # Increase max length
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        top_k=50,
        top_p=0.95,
        temperature=0.8,  # Slightly more randomness
        num_return_sequences=1,
    )

    # Print raw output to inspect
    raw_output = tokenizer.decode(outputs[0], skip_special_tokens=False)

    # Clean up the output by removing the "<|endoftext|>" token and the prompt
    reply = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Remove the initial conversation prompt from the response
    response = reply.replace(conversation_prompt, "").strip()

    if not response:  # If response is empty, try again
        print("🧠 DholuGPT: Response empty. Trying again.")
        chat_with_dholu(input_text)
    else:
        print("🧠 DholuGP:", response)



In [119]:
from google.colab import files

# Upload file dialog will appear
uploaded = files.upload()

# Check the uploaded file(s)
print(uploaded)  # This will print the file name(s)

# Access the uploaded file
# The file will be stored in the '/content/' directory
pdf_file_path = list(uploaded.keys())[0]  # Get the name of the uploaded file
print(f"Uploaded file path: /content/{pdf_file_path}")

# You can also use the file directly by the file name:
with open(f'/content/{pdf_file_path}', 'rb') as file:
    # Example: print the first 100 bytes of the file (just for checking)
    print(file.read(100))


Saving book.pdf to book.pdf
{'book.pdf': b'%PDF-1.7\r\n%\xb5\xb5\xb5\xb5\r\n1 0 obj\r\n<</Type/Catalog/Pages 2 0 R/Lang(en-US) /StructTreeRoot 298 0 R/MarkInfo<</Marked true>>/Metadata 1598 0 R/ViewerPreferences 1599 0 R>>\r\nendobj\r\n2 0 obj\r\n<</Type/Pages/Count 53/Kids[ 3 0 R 17 0 R 18 0 R 19 0 R 20 0 R 24 0 R 25 0 R 33 0 R 36 0 R 39 0 R 42 0 R 46 0 R 49 0 R 52 0 R 57 0 R 62 0 R 66 0 R 70 0 R 76 0 R 79 0 R 85 0 R 87 0 R 92 0 R 96 0 R 99 0 R 104 0 R 108 0 R 113 0 R 117 0 R 122 0 R 128 0 R 132 0 R 135 0 R 144 0 R 146 0 R 151 0 R 156 0 R 160 0 R 164 0 R 169 0 R 173 0 R 179 0 R 184 0 R 188 0 R 195 0 R 201 0 R 205 0 R 210 0 R 215 0 R 218 0 R 219 0 R 222 0 R 290 0 R] >>\r\nendobj\r\n3 0 obj\r\n<</Type/Page/Parent 2 0 R/Resources<</Font<</F1 5 0 R/F2 9 0 R/F3 11 0 R/F4 13 0 R/F5 21 0 R/F6 26 0 R>>/ExtGState<</GS7 7 0 R/GS8 8 0 R>>/ProcSet[/PDF/Text/ImageB/ImageC/ImageI] >>/Annots[ 15 0 R 16 0 R 23 0 R 31 0 R 32 0 R 34 0 R 35 0 R 37 0 R 38 0 R 40 0 R 41 0 R 43 0 R 44 0 R 45 0 R 47 0 R 48 

In [121]:
import warnings

# Suppress specific warnings
warnings.filterwarnings("ignore", category=UserWarning, module='pdfminer')

# Now run the PDF extraction
import pdfplumber

# Path to the uploaded PDF
pdf_file_path = '/content/book.pdf'

# Function to extract text from the PDF
def extract_text_from_pdf(pdf_file):
    with pdfplumber.open(pdf_file) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text()
    return text

# Extract the text from the uploaded PDF
extracted_text = extract_text_from_pdf(pdf_file_path)

# Check the first 500 characters of the extracted text
print(extracted_text[:500])




BHAGAVAD-GITA in ENGLISH (Source)
For commentaries:
https://www.gita-society.com/Read-bhagavad-gita.html
CONTENTS
INTRODUCTION ...................................................... 1
1. Arjuna’s Dilemma .................................................. 3
2. Spiritual knowledge................................................ 3
The spirit is eternal, body is transitory ........................ 4
Death and Reincarnation of the soul .......................... 4
Duty of a warrior ..................


In [124]:
import pdfplumber
import warnings

# Suppress specific warnings
warnings.filterwarnings("ignore", category=UserWarning, module='pdfminer')

# Path to the uploaded PDF
pdf_file_path = '/content/book.pdf'

# Function to extract text from the PDF
def extract_text_from_pdf(pdf_file):
    with pdfplumber.open(pdf_file) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text()
    return text

# Extract the text from the uploaded PDF
extracted_text = extract_text_from_pdf(pdf_file_path)

# Clean the extracted text (e.g., remove newlines and unnecessary whitespace)
cleaned_text = extracted_text.replace('\n', ' ').replace('\r', '')

# Check the first 500 characters of the cleaned text
print(cleaned_text[:500])




BHAGAVAD-GITA in ENGLISH (Source) For commentaries: https://www.gita-society.com/Read-bhagavad-gita.html CONTENTS INTRODUCTION ...................................................... 1 1. Arjuna’s Dilemma .................................................. 3 2. Spiritual knowledge................................................ 3 The spirit is eternal, body is transitory ........................ 4 Death and Reincarnation of the soul .......................... 4 Duty of a warrior ..................


In [125]:
# Split the cleaned text into smaller chunks (sentences or paragraphs)
text_chunks = cleaned_text.split(".")  # You can use a different delimiter if needed

# Prepare data for training (as a list of dictionaries)
train_data = [{"text": chunk.strip()} for chunk in text_chunks if chunk.strip()]

# Check the first few chunks
print(train_data[:5])


[{'text': 'BHAGAVAD-GITA in ENGLISH (Source) For commentaries: https://www'}, {'text': 'gita-society'}, {'text': 'com/Read-bhagavad-gita'}, {'text': 'html CONTENTS INTRODUCTION'}, {'text': '1 1'}]


In [126]:
from datasets import Dataset

# Convert train data to Dataset format
dataset = Dataset.from_list(train_data)

# Check the first few entries of the dataset
print(dataset[:5])


{'text': ['BHAGAVAD-GITA in ENGLISH (Source) For commentaries: https://www', 'gita-society', 'com/Read-bhagavad-gita', 'html CONTENTS INTRODUCTION', '1 1']}


In [127]:
from transformers import GPT2Tokenizer

# Load the GPT-2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set the pad token to the EOS token

# Tokenize the dataset
def tokenize_function(example):
    return tokenizer(example['text'], truncation=True, padding="max_length", max_length=128)

# Apply tokenization to the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Check the tokenized data
print(tokenized_dataset[:5])


Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

{'text': ['BHAGAVAD-GITA in ENGLISH (Source) For commentaries: https://www', 'gita-society', 'com/Read-bhagavad-gita', 'html CONTENTS INTRODUCTION', '1 1'], 'input_ids': [[33, 39, 4760, 10116, 2885, 12, 38, 2043, 32, 287, 12964, 8763, 18422, 357, 7416, 8, 1114, 2912, 3166, 25, 3740, 1378, 2503, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 50256, 5025

In [None]:
import warnings
import pdfplumber
from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
from datasets import Dataset

# Suppress specific warnings
warnings.filterwarnings("ignore", category=UserWarning, module='pdfminer')

# Path to the uploaded PDF
pdf_file_path = '/content/book.pdf'

# Function to extract text from the PDF
def extract_text_from_pdf(pdf_file):
    with pdfplumber.open(pdf_file) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text()
    return text

# Extract the text from the uploaded PDF
extracted_text = extract_text_from_pdf(pdf_file_path)

# Clean up the extracted text (for simplicity, we will remove any unwanted whitespace)
cleaned_text = extracted_text.replace("\n", " ").replace("\r", " ").strip()

# Split the cleaned text into smaller chunks (sentences or paragraphs)
text_chunks = cleaned_text.split(".")  # You can use a different delimiter if needed

# Prepare data for training (as a list of dictionaries)
train_data = [{"text": chunk.strip()} for chunk in text_chunks if chunk.strip()]

# Convert to HuggingFace dataset format
dataset = Dataset.from_dict({"text": [data["text"] for data in train_data]})

# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Add padding token if necessary
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

# Resize model embeddings to account for new token
model = GPT2LMHeadModel.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

# Check the tokenizer's vocabulary size
print(f"Tokenizer vocabulary size: {len(tokenizer)}")

# Tokenize the dataset
def tokenize_function(example):
    encoding = tokenizer(example['text'], truncation=True, padding='max_length', max_length=512, return_tensors="pt")
    labels = encoding['input_ids'].clone()  # Copy input_ids to labels

    # Mask padding tokens for language modeling (set padding tokens to -100)
    labels[labels == tokenizer.pad_token_id] = -100

    return {
        'input_ids': encoding['input_ids'],
        'attention_mask': encoding['attention_mask'],
        'labels': labels
    }

# Apply tokenization to the dataset
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Check the first 5 tokenized examples
print(tokenized_dataset[:5])

# Prepare training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    overwrite_output_dir=True,       # overwrite output_dir
    num_train_epochs=1,              # number of training epochs
    per_device_train_batch_size=4,   # batch size for training
    save_steps=10_000,               # save checkpoint every 10,000 steps
    save_total_limit=2,              # only last 2 models are saved
)

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

# Train the model
trainer.train()




Tokenizer vocabulary size: 50258


Map:   0%|          | 0/1821 [00:00<?, ? examples/s]

{'text': ['BHAGAVAD-GITA in ENGLISH (Source) For commentaries: https://www', 'gita-society', 'com/Read-bhagavad-gita', 'html CONTENTS INTRODUCTION', '1 1'], 'input_ids': [[33, 39, 4760, 10116, 2885, 12, 38, 2043, 32, 287, 12964, 8763, 18422, 357, 7416, 8, 1114, 2912, 3166, 25, 3740, 1378, 2503, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 50257, 5025

Step,Training Loss
