In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import json
import random
from tqdm import tqdm

In [3]:
with open('/content/drive/MyDrive/tamil_data.txt', "r", encoding="utf-8") as file:
    lines = [line.strip() for line in file if len(line.strip()) > 10]

In [4]:
def generate_question(sentence):
    words = sentence.split()
    main_word = words[0] if words else "இது"
    question_templates = [
        "{} பற்றி என்ன தெரியும்?",
        "{} யார்?",
        "{} எங்கு உள்ளது?",
        "{} எப்போது நடைபெற்றது?",
        "{} முக்கியத்துவம் என்ன?",
        "{} யாரால் நிகழ்ந்தது?",
        "{} என்பதன் விளக்கம் என்ன?",
        "{} எதற்காக பிரபலமானது?",
        "{} பற்றிய தகவல்கள் என்ன?",
        "{} எப்போது ஆரம்பமானது?"
    ]
    template = random.choice(question_templates)
    question = template.format(main_word)
    return question

In [5]:
dataset = []

In [6]:
for _ in tqdm(range(1000)):
    sentence = random.choice(lines)
    question = generate_question(sentence)

    entry = {
        "instruction": "Generate a question in Tamil based on the given text.",
        "input": sentence,
        "output": question
    }
    dataset.append(entry)

100%|██████████| 1000/1000 [00:00<00:00, 34656.51it/s]


In [7]:
output_path = "tamil_question_generation_dataset.json"
with open(output_path, "w", encoding="utf-8") as json_file:
    json.dump(dataset, json_file, ensure_ascii=False, indent=4)

In [8]:
print(f"Generated {len(dataset)} samples — Saved at: {output_path}")

Generated 1000 samples — Saved at: tamil_question_generation_dataset.json


In [9]:
import sentencepiece as spm

In [10]:
input_file = '/content/drive/MyDrive/tamil_data.txt'
model_prefix = "tamil_spm"
vocab_size = 300

In [11]:
spm.SentencePieceTrainer.train(
    input=input_file,
    model_prefix=model_prefix,
    vocab_size=vocab_size,
    character_coverage=0.9995,
    model_type='bpe'
)

In [12]:
sp = spm.SentencePieceProcessor(model_file=f"{model_prefix}.model")

In [13]:
!pip install indic-nlp-library

Collecting indic-nlp-library
  Downloading indic_nlp_library-0.92-py3-none-any.whl.metadata (5.7 kB)
Collecting sphinx-argparse (from indic-nlp-library)
  Downloading sphinx_argparse-0.5.2-py3-none-any.whl.metadata (3.7 kB)
Collecting sphinx-rtd-theme (from indic-nlp-library)
  Downloading sphinx_rtd_theme-3.0.2-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting morfessor (from indic-nlp-library)
  Downloading Morfessor-2.0.6-py3-none-any.whl.metadata (628 bytes)
Collecting sphinxcontrib-jquery<5,>=4 (from sphinx-rtd-theme->indic-nlp-library)
  Downloading sphinxcontrib_jquery-4.1-py2.py3-none-any.whl.metadata (2.6 kB)
Downloading indic_nlp_library-0.92-py3-none-any.whl (40 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.3/40.3 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading Morfessor-2.0.6-py3-none-any.whl (35 kB)
Downloading sphinx_argparse-0.5.2-py3-none-any.whl (12 kB)
Downloading sphinx_rtd_theme-3.0.2-py2.py3-none-any.whl (7.7 MB)
[2K   [90m━

In [14]:
!python -m indicnlp.resources.manager download_resources

/usr/bin/python3: Error while finding module specification for 'indicnlp.resources.manager' (ModuleNotFoundError: No module named 'indicnlp.resources')


In [15]:
from indicnlp.tokenize import indic_tokenize
from indicnlp import common
from indicnlp.morph import unsupervised_morph

In [16]:
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git

Cloning into 'indic_nlp_resources'...
remote: Enumerating objects: 139, done.[K
remote: Counting objects: 100% (13/13), done.[K
remote: Compressing objects: 100% (13/13), done.[K
remote: Total 139 (delta 2), reused 2 (delta 0), pack-reused 126 (from 1)[K
Receiving objects: 100% (139/139), 149.77 MiB | 12.88 MiB/s, done.
Resolving deltas: 100% (53/53), done.
Updating files: 100% (28/28), done.


In [17]:
INDIC_NLP_RESOURCES = '/content/indic_nlp_resources'
common.set_resources_path(INDIC_NLP_RESOURCES)

In [18]:
morph_analyzer = unsupervised_morph.UnsupervisedMorphAnalyzer('ta')

In [19]:
def extract_main_word(sentence):
    tokens = indic_tokenize.trivial_tokenize(sentence)
    morphs = morph_analyzer.morph_analyze_document(tokens)
    for word, morph in zip(tokens, morphs):
        if 'Noun' in morph:
            return word
    return tokens[0] if tokens else "இது"

In [20]:
import json, random
from tqdm import tqdm

In [21]:
lines = [line.strip() for line in open('/content/drive/MyDrive/tamil_data.txt', encoding="utf-8") if len(line.strip()) > 10]

In [22]:
question_templates = ["{} பற்றி விளக்கவும்.", "{} எப்போது?", "{} என்பது யார்?", "{} எங்கு?", "{} என்றால் என்ன?"]

In [23]:
dataset = []

In [24]:
for _ in tqdm(range(1000)):
    sentence = random.choice(lines)
    main_word = extract_main_word(sentence)
    question = random.choice(question_templates).format(main_word)
    dataset.append({
        "instruction": "Generate a question in Tamil based on the given text.",
        "input": sentence,
        "output": question
    })

100%|██████████| 1000/1000 [00:01<00:00, 944.21it/s]


In [25]:
with open("tamil_qa_dataset.json", "w", encoding="utf-8") as f:
    json.dump(dataset, f, ensure_ascii=False, indent=4)

In [26]:
from datasets import Dataset

In [27]:
hf_dataset = Dataset.from_dict({
    "instruction": [d["instruction"] for d in dataset],
    "input": [d["input"] for d in dataset],
    "output": [d["output"] for d in dataset]
})

In [28]:
hf_dataset.save_to_disk("tamil_qa_dataset_hf")

Saving the dataset (0/1 shards):   0%|          | 0/1000 [00:00<?, ? examples/s]

In [29]:
!pip install huggingface_hub -q

In [30]:
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [32]:
hf_dataset.push_to_hub('saivimenthan/tamil-qa-dataset')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ? shards/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Processing Files (0 / 0)                : |          |  0.00B /  0.00B            

New Data Upload                         : |          |  0.00B /  0.00B            

                                        : 100%|##########|  175kB /  175kB            

README.md:   0%|          | 0.00/515 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/saivimenthan/tamil-qa-dataset/commit/bebe22dbe9bd7c4448d1bc4156f585cfa9f140fd', commit_message='Upload dataset', commit_description='', oid='bebe22dbe9bd7c4448d1bc4156f585cfa9f140fd', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/saivimenthan/tamil-qa-dataset', endpoint='https://huggingface.co', repo_type='dataset', repo_id='saivimenthan/tamil-qa-dataset'), pr_revision=None, pr_num=None)

In [33]:
from google.colab import files
files.download("tamil_qa_dataset.json")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [34]:
# Decoder-Only Model Setup
from transformers import PreTrainedTokenizerFast, GPT2Config, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset

In [35]:
import json

config = {
    "model_type": "gpt2",
}

with open("config.json", "w") as f:
    json.dump(config, f)

In [36]:
import json
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
from transformers import PreTrainedTokenizerFast

# Create a new BPE tokenizer
bpe_tokenizer = Tokenizer(BPE(unk_token="<unk>"))
bpe_tokenizer.pre_tokenizer = Whitespace()

# Train the tokenizer from the existing SentencePiece model
trainer = BpeTrainer(special_tokens=["<s>", "</s>", "<pad>", "<unk>"], vocab_size=300)
bpe_tokenizer.train_from_iterator(iter(lines), trainer=trainer)

# Save the tokenizer to tokenizer.json
bpe_tokenizer.save("tokenizer.json")

# Load the tokenizer using PreTrainedTokenizerFast
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="tokenizer.json",
    bos_token="<s>",
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>"
)

In [37]:
# Create model config
config = GPT2Config(
    vocab_size=len(tokenizer),
    n_positions=512,
    n_ctx=512,
    n_embd=256,
    n_layer=4,
    n_head=4,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id
)

In [38]:
# Instantiate model
model = GPT2LMHeadModel(config)

In [39]:
# Load your Tamil dataset
dataset = load_dataset("json", data_files="tamil_qa_dataset.json")

Generating train split: 0 examples [00:00, ? examples/s]

In [40]:
# Tokenization function
def tokenize(batch):
    # Concatenate instruction, input, and output
    text = [f"{i} {j} {k}" for i, j, k in zip(batch['instruction'], batch['input'], batch['output'])]
    return tokenizer(text, truncation=True, padding="max_length", max_length=512)

tokenized_ds = dataset.map(tokenize, batched=True, remove_columns=["instruction", "input", "output"])

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

In [41]:
# Data collator for causal language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [48]:
# Training configuration
training_args = TrainingArguments(
    output_dir="./decoder_model",
    overwrite_output_dir=True,
    per_device_train_batch_size=2,
    num_train_epochs=100, # Increased epochs for better training
    save_steps=500,
    save_total_limit=2,
    logging_steps=100,
    report_to="none",
    logging_dir="./logs" # Added logging directory
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_ds["train"],
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Start training
trainer.train()

  trainer = Trainer(


Step,Training Loss
100,1.3076
200,1.3863
300,1.3029
400,1.2511
500,1.2481
600,1.2566
700,1.338
800,1.2655
900,1.254
1000,1.2882


TrainOutput(global_step=50000, training_loss=0.7730346180725097, metrics={'train_runtime': 1316.5665, 'train_samples_per_second': 75.955, 'train_steps_per_second': 37.978, 'total_flos': 970614374400000.0, 'train_loss': 0.7730346180725097, 'epoch': 100.0})

In [47]:
from transformers import pipeline
from transformers import GPT2LMHeadModel, PreTrainedTokenizerFast
from transformers import TextGenerationPipeline
model_path = "./decoder_model/checkpoint-25000"
tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer.json")
model = GPT2LMHeadModel.from_pretrained(model_path)
summarizer = TextGenerationPipeline(model=model, tokenizer=tokenizer)
input = '/content/drive/MyDrive/tamil_data.txt'
prompt = f"சுருக்கமாக சொல்: {input} சுருக்கம்:"

# Generate summary
output = summarizer(prompt, max_length=100, num_return_sequences=1, do_sample=False)
print(output[0]['generated_text'])

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Both `max_new_tokens` (=256) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


சுருக்கமாக சொல்: /content/drive/MyDrive/tamil_data.txt சுருக்கம்: ம ு ற ை ய ு ம ு ம ு ம ு ற ை ய ு ர ு ம ் ல ை ய ு ற ை ய ா ன ் ப ு ற ை ய ு ம ் அ ர ு ம ் ல ை ய ா க ு ம ் ல ை ய ு ம ் ப ு ம ு ம ் ல ை ய ா க ் ப ு ற ை ய ு ம ் ப ு ற ை ய ு ம ் ல ் ப ு ற ை ய ு ம ் க ா க ு ம ் ல ் ப ு ற ை ய ு ற ை ய ு ற ி ய ு ம ் ப ு ற ் ப ு ர ் அ ர ு ம ் ப ் க ு ற ் க ு ற ி ய ு ம ் க ு ம ் க ு ற ் ப ு ம ் க ு ர ு ம ் க ள ் க ள ் ப ் ப ு ற ் க ள ் ப ு ற ் ப ு ம ் ப ு ற ் ப ு ற ் ப ு ற ் ப ு ற ் ப ு ம ் ப ு ற ் ப ட ் ப ு ற ் ப ் ப ு ம ் ப ு ற ் ப ு ற ் ப ு ம ் ப ு ற ் ப ு ம ் ப ு ற ் ப ு ற ் ப ு ற ்
