In [None]:
import re
# Path to input and output files
input_file = "hindi_wikipedia_corpus.txt"
output_file = "hindi_wikipedia_corpus_cleaned.txt"

# Devanagari Unicode range: U+0900 to U+097F
# Optional: Add some punctuation if needed (like । or whitespace)
devanagari_pattern = re.compile(r'^[\u0900-\u097F]+$')
def is_hindi_word(word):
    return devanagari_pattern.match(word) is not None

cleaned_lines = []

with open(input_file, "r", encoding="utf-8") as infile:
    for line in infile:
        words = line.strip().split()
        hindi_words = [word for word in words if is_hindi_word(word)]
        if hindi_words:
            cleaned_lines.append(" ".join(hindi_words))

# Save cleaned lines to a new file
with open(output_file, "w", encoding="utf-8") as outfile:
    for line in cleaned_lines:
        outfile.write(line + "\n")

print(f"Cleaned Hindi text saved to {output_file}")

Cleaned Hindi text saved to hindi_wikipedia_corpus_cleaned.txt


In [1]:
# morpheme_tokenizer.py

# Expanded Prefixes (unique additions)
prefixes = [
    'अ', 'अन', 'अध', 'अति', 'पर', 'स', 'व', 'प्र', 'उ', 'सभी', 'उप', 'वि', 'संप', 'प्रौ',
    'अधिकार', 'आ', 'न', 'सम', 'अंतर', 'बिना', 'सदृश', 'पार', 'उदाहरण', 'रूप', 'विक', 'संपर्क',
    'विवेक', 'क', 'मुख्य', 'प्रकृति', 'तंत्र', 'धर्म', 'सामाजिक', 'विज्ञान', 'विशेष', 'अधिकार',
    'प्रति', 'अधिवेशन', 'संचालन', 'संविधान', 'समाज', 'प्रतिनिधि', 'संचालित', 'अंतरिक्ष',
    'विज्ञापन', 'आधिकारिक', 'आध्यात्मिक', 'कृपया', 'संस्कृत', 'दृश्य', 'पारंपरिक', 'स्वदेशी',
    'अंतरराष्ट्रीय', 'न्यायिक', 'प्राकृतिक', 'समान', 'समानांतर', 'संकुचित', 'विकसीत', 'साक्षात्कार',
    'उधार', 'उत्साहित', 'सजग', 'आत्म', 'विश्वास', 'प्राकृतिक', 'विकसित'
]

# Expanded Suffixes (unique additions)
suffixes = [
    'ता', 'वाला', 'वाली', 'ने', 'रूप', 'पन', 'इ', 'नुमा', 'ण', 'वृत्ति', 'वाद', 'शक्ति', 'रण',
    'कता', 'धन', 'सिद्ध', 'त्व', 'ता', 'करण', 'ज्ञान', 'काल', 'शास्त्र', 'ई', 'साधना', 'सार',
    'देश', 'मूल्य', 'वृद्धि', 'वाला', 'प्रकार', 'भावना', 'आत्म', 'शांति', 'समान', 'वर्धन',
    'हीन', 'मूल', 'निवासी', 'शुल्क', 'विधि', 'कारण', 'वीर', 'देश', 'अधिकार', 'भविष्य', 'प्रभाव',
    'स्वीकृति', 'त्याग', 'योजना', 'पुरस्कार', 'निर्णय', 'जन', 'संतुलन', 'मान्यता', 'विश्व', 'संपत्ति',
    'सिद्धांत', 'समाजवादी', 'आधुनिक', 'सुरक्षा', 'संगठित', 'निष्ठा', 'परिस्थिति', 'विकास', 'शक्ति',
    'साक्षात्कार', 'प्रशासन', 'रचनात्मक', 'समर्थ', 'विवेचना', 'शिकायत', 'मुलायम', 'विवाह',
    'महत्व', 'समझौता', 'समर्पण', 'स्वास्थ्य', 'पदवी', 'लाभ', 'लक्षण', 'नौकरी', 'योजना'
]

# Expanded Roots (unique additions)
roots = [
    'भारत', 'शक्ति', 'समाज', 'धर्म', 'विज्ञान', 'ज्ञान', 'प्रकृति', 'जीवन', 'कला', 'शांति', 'प्यार',
    'सपना', 'विकास', 'आत्मा', 'मूल्य', 'रक्षा', 'शौर्य', 'विज्ञानी', 'वृत्ति', 'संतोष',
    'राज', 'नेता', 'संत', 'योग', 'विपक्ष', 'विजेता', 'लक्ष्य', 'मूल', 'न्याय', 'सत्यमेव',
    'मित्र', 'कर्म', 'शिक्षा', 'श्रम', 'विधि', 'रचनात्मक', 'जिंदगी', 'संगीत', 'दर्शन',
    'संविधान', 'लोक', 'राजनीति', 'प्रशासन', 'स्वतंत्रता', 'संस्कार', 'मुलायम', 'संघ',
    'विरासत', 'अनुराग', 'भावना', 'रचनात्मकता', 'समस्या', 'समाधान', 'आत्मनिर्भर', 'अभियान',
    'उत्साह', 'सपने', 'भविष्य', 'शिक्षक', 'शिक्षिका', 'विद्यालय', 'नौकरी', 'स्वास्थ्य', 'समाजसेवा',
    'विकिरण', 'विश्वविद्यालय', 'संस्था', 'अधिकार', 'संगठन', 'संचार', 'मूल', 'समाजशास्त्र', 'अर्थशास्त्र',
    'शिक्षण', 'अध्ययन', 'समाजवादी', 'विपरीत', 'सामाजिक', 'कर्मचारी', 'प्रारंभ', 'शुद्धता', 'विवाह',
    'तंत्रज्ञान', 'विपणन', 'नैतिकता', 'संस्कार', 'निर्माण', 'स्वतंत्रता', 'ध्वनि', 'आदर्श'
]

# Tokenizer function
def tokenize_morpheme(word):
    # Try to match prefix
    for prefix in prefixes:
        if word.startswith(prefix):
            word = word[len(prefix):]  # Remove prefix
            return [prefix] + tokenize_morpheme(word)

    # Try to match suffix
    for suffix in suffixes:
        if word.endswith(suffix):
            word = word[:-len(suffix)]  # Remove suffix
            return tokenize_morpheme(word) + [suffix]

    # If the word is in roots, return it as a root
    if word in roots:
        return [word]

    # If no match, return the word as it is
    return [word]

# Function to process the file and tokenize each word
def process_file(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        # Read file content
        content = file.read()

        # Tokenize the content
        words = content.split()
        tokenized_words = {}

        for word in words:
            # Apply morpheme tokenizer
            morphemes = tokenize_morpheme(word)
            tokenized_words[word] = ' + '.join(morphemes)

        return tokenized_words

# Function to save tokenized words to a file
def save_tokenized_output(file_path, output_file):
    # Process the input file
    tokenized_words = process_file(file_path)

    # Open output file for writing
    with open(output_file, 'w', encoding='utf-8') as output:
        for word, morphemes in tokenized_words.items():
            output.write(f"{word} → {morphemes}\n")

# Main function
if __name__ == "__main__":
    # Replace 'your_file.txt' with the path to your input text file
    input_file = 'hindi_wikipedia_corpus_cleaned.txt'  # Change this path
    output_file = 'tokenized_output.txt'  # Output file where tokenized words will be saved

    # Save tokenized words to the output file
    save_tokenized_output(input_file, output_file)

    print(f"Tokenized output has been saved to {output_file}")


Tokenized output has been saved to tokenized_output.txt


TRAINING llM ON BOTH DATA

In [2]:
import os
os.environ["WANDB_DISABLED"] = "true"

from tokenizers import Tokenizer, models, trainers, pre_tokenizers


def train_tokenizer(file_path, vocab_size=30000, output_path="tokenizer.json"):
    from tokenizers import Tokenizer
    from tokenizers.models import WordLevel
    from tokenizers.trainers import WordLevelTrainer
    from tokenizers.pre_tokenizers import Whitespace

    tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
    trainer = WordLevelTrainer(vocab_size=vocab_size, special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"])
    tokenizer.pre_tokenizer = Whitespace()

    tokenizer.train([file_path], trainer)
    tokenizer.save(output_path)

# Train on both tokenized versions
#train_tokenizer("bpe_tokenized.txt", output_path="bpe_tokenizer.json")
train_tokenizer("tokenized_output.txt", output_path="morph_tokenizer.json")


In [3]:
from transformers import GPT2TokenizerFast
from torch.utils.data import Dataset, DataLoader
import torch

# Load tokenizer
tokenizer = GPT2TokenizerFast(tokenizer_file="morph_tokenizer.json")

# Load your text
with open("hindi_wikipedia_corpus_cleaned.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Encode the full text
tokens = tokenizer.encode(text)
print(f"Total tokens: {len(tokens)}")

# Create a simple dataset that chunks the tokenized text
class TextDataset(Dataset):
    def __init__(self, tokens, block_size):
        self.tokens = tokens
        self.block_size = block_size

    def __len__(self):
        return len(self.tokens) // self.block_size

    def __getitem__(self, idx):
        start = idx * self.block_size
        end = start + self.block_size
        x = torch.tensor(self.tokens[start:end], dtype=torch.long)
        y = torch.tensor(self.tokens[start + 1:end + 1], dtype=torch.long)
        return x, y


Total tokens: 271285


In [4]:
from transformers import GPT2LMHeadModel, GPT2Config

# Build a GPT2 model from scratch
config = GPT2Config(
    vocab_size=tokenizer.vocab_size,
    n_positions=128,
    n_ctx=128,
    n_embd=256,
    n_layer=4,
    n_head=4
)
model = GPT2LMHeadModel(config)


In [5]:
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm

# Prepare dataset and dataloader
dataset = TextDataset(tokens, block_size=128)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-4)

# Move to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
model.train()

# Train manually
epochs = 20
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    for x, y in tqdm(dataloader):
        x, y = x.to(device), y.to(device)

        outputs = model(x, labels=y)
        loss = outputs.loss

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    print(f"Loss: {loss.item():.4f}")


Epoch 1/20


  0%|          | 0/530 [00:00<?, ?it/s]`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
100%|██████████| 530/530 [00:10<00:00, 49.43it/s]


Loss: 7.3840
Epoch 2/20


100%|██████████| 530/530 [00:10<00:00, 50.28it/s]


Loss: 7.0616
Epoch 3/20


100%|██████████| 530/530 [00:09<00:00, 53.11it/s]


Loss: 6.6113
Epoch 4/20


100%|██████████| 530/530 [00:10<00:00, 52.65it/s]


Loss: 6.5115
Epoch 5/20


100%|██████████| 530/530 [00:10<00:00, 51.99it/s]


Loss: 6.1297
Epoch 6/20


100%|██████████| 530/530 [00:10<00:00, 51.73it/s]


Loss: 6.2287
Epoch 7/20


100%|██████████| 530/530 [00:10<00:00, 51.21it/s]


Loss: 6.1076
Epoch 8/20


100%|██████████| 530/530 [00:10<00:00, 51.46it/s]


Loss: 6.2100
Epoch 9/20


100%|██████████| 530/530 [00:10<00:00, 51.63it/s]


Loss: 5.6979
Epoch 10/20


100%|██████████| 530/530 [00:10<00:00, 52.01it/s]


Loss: 5.8840
Epoch 11/20


100%|██████████| 530/530 [00:10<00:00, 51.97it/s]


Loss: 5.5388
Epoch 12/20


100%|██████████| 530/530 [00:10<00:00, 52.18it/s]


Loss: 5.0882
Epoch 13/20


100%|██████████| 530/530 [00:10<00:00, 51.45it/s]


Loss: 5.2726
Epoch 14/20


100%|██████████| 530/530 [00:10<00:00, 51.96it/s]


Loss: 4.9209
Epoch 15/20


100%|██████████| 530/530 [00:13<00:00, 39.25it/s]


Loss: 4.7938
Epoch 16/20


100%|██████████| 530/530 [00:10<00:00, 50.56it/s]


Loss: 4.0931
Epoch 17/20


100%|██████████| 530/530 [00:10<00:00, 51.94it/s]


Loss: 4.5307
Epoch 18/20


100%|██████████| 530/530 [00:10<00:00, 51.45it/s]


Loss: 4.2639
Epoch 19/20


100%|██████████| 530/530 [00:10<00:00, 50.48it/s]


Loss: 4.2792
Epoch 20/20


100%|██████████| 530/530 [00:10<00:00, 51.57it/s]

Loss: 3.6678





In [6]:
model.save_pretrained("gpt2_morph_manual")
tokenizer.save_pretrained("gpt2_morph_manual")


('gpt2_morph_manual/tokenizer_config.json',
 'gpt2_morph_manual/special_tokens_map.json',
 'gpt2_morph_manual/vocab.json',
 'gpt2_morph_manual/added_tokens.json',
 'gpt2_morph_manual/tokenizer.json')

In [None]:
input_text =  "अब्राहम लिंकन द्वारा अमरीकी गृह युद्ध के बीच में गुलामों:"
inputs = tokenizer(input_text, return_tensors="pt")
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

# Set pad_token_id to eos_token_id (or define your own pad_token_id if you have one)
output = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=50,
    do_sample=True,
    top_p=0.9,
    temperature=0.9,
    pad_token_id=tokenizer.eos_token_id,
    no_repeat_ngram_size=1
)

print(tokenizer.decode(output[0]))


अब्राहम [UNK] द्वारा अमरीकी गृह युद्ध के बीच में [UNK] [UNK] → + अगरबत्ती अ गरबत्ती ण गयी पत्रिकायें उ ठाना प्र शांत व िधानमंडल न ्याय नुमा स त्यदीपक । पहले पुरातात्त्विक ज्ञान शैलाश्रयों ता है युक्त आ म ने सार्वभौमिक जाल भी रहा वह क ट वाद समीक्षाएँ


In [None]:
model = GPT2LMHeadModel.from_pretrained("gpt2_morph_manual")
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2_morph_manual")

In [None]:
inputs = tokenizer("कुछ उदाहरण दें।", return_tensors="pt")
input_ids = inputs["input_ids"]
attention_mask = inputs["attention_mask"]

# Set pad_token_id to eos_token_id (or define your own pad_token_id if you have one)
output = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=50,
    do_sample=True,
    top_p=0.9,
    temperature=0.9,
    pad_token_id=tokenizer.eos_token_id,
    no_repeat_ngram_size=1
)

print(tokenizer.decode(output[0]))


कुछ उदाहरण दें । ला अली बे की मु फ़ ज टु ने के में को दा री जो और से तक अ हुई उनकी पर एक तो े ली आय ट्ट या ट ए क़ ठी जीते केवल प्राप्तक यह निपुण शैली नहीं है कानो अपनी थी उन्होंने ज्यादा
