In [None]:
# Colab Setup Cell
!pip install -q transformers[torch] datasets scikit-learn pandas nltk

import nltk
# Make sure NLTK's 'punkt' tokenizer data is available: nltk.data.find raises
# LookupError when the resource is not installed, and only then is it downloaded.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Remind the user which input files must be uploaded (a CSV and a CoNLL file).
print("Setup complete. Please upload your 'merged.csv' and 'nagamese_manual_enriched.conll' files.")

[0mSetup complete. Please upload your 'merged.csv' and 'nagamese_manual_enriched.conll' files.


In [None]:
# file: transformer_tagger.py

import numpy as np
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer
)
from sklearn.metrics import classification_report, accuracy_score
import torch
import re
import os

def read_conll(path: str) -> Dataset:
    """Parse a CoNLL-style file into a Hugging Face Dataset.

    Every non-blank line contributes one token: the first whitespace-separated
    field is taken as the word and the last field as its tag. Lines with fewer
    than two fields are ignored. A blank line closes the current sentence.

    Args:
        path: Location of the CoNLL file (read as UTF-8).

    Returns:
        A Dataset with two parallel columns, 'tokens' and 'pos_tags', holding
        one list per sentence.

    Raises:
        FileNotFoundError: If nothing exists at ``path``.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"The CoNLL file was not found at: {path}")

    all_tokens, all_tags = [], []
    cur_tokens, cur_tags = [], []

    def flush():
        # Move the sentence being built (if any) into the finished lists.
        nonlocal cur_tokens, cur_tags
        if cur_tokens:
            all_tokens.append(cur_tokens)
            all_tags.append(cur_tags)
            cur_tokens, cur_tags = [], []

    with open(path, encoding='utf-8') as handle:
        for raw in handle:
            fields = raw.split()
            if not fields:
                # Blank (or whitespace-only) line: sentence boundary.
                flush()
            elif len(fields) >= 2:
                cur_tokens.append(fields[0])
                cur_tags.append(fields[-1])

    # The file may end without a trailing blank line.
    flush()

    return Dataset.from_dict({'tokens': all_tokens, 'pos_tags': all_tags})

def train_transformer_tagger(conll_path: str, model_save_path: str):
    """Fine-tune a multilingual BERT token classifier as a POS tagger and save it.

    The CoNLL file is loaded with ``read_conll``, the tag inventory is derived
    from the data, and 10% of the sentences (seed 42) are held out for
    per-epoch evaluation. Only the first sub-token of each word carries a
    label; special tokens and continuation sub-tokens are labelled -100.

    Args:
        conll_path: Path to the CoNLL training file.
        model_save_path: Directory that receives the final model; checkpoints
            are written to its 'results' sub-directory.

    Raises:
        FileNotFoundError: Propagated from ``read_conll`` when the file is missing.
    """
    dataset = read_conll(conll_path)
    unique_tags = sorted({tag for tag_list in dataset['pos_tags'] for tag in tag_list})
    label2id = {label: i for i, label in enumerate(unique_tags)}
    id2label = {i: label for i, label in enumerate(unique_tags)}

    checkpoint = 'bert-base-multilingual-cased'
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    def tokenize_and_align_labels(examples):
        # No padding here: DataCollatorForTokenClassification pads inputs and
        # labels per batch, so padding every sentence to the tokenizer's
        # maximum length up front would only waste memory and compute.
        tokenized_inputs = tokenizer(
            examples["tokens"], truncation=True, is_split_into_words=True
        )
        labels = []
        for i, label in enumerate(examples["pos_tags"]):
            word_ids = tokenized_inputs.word_ids(batch_index=i)
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                # -100 marks positions to be skipped (special tokens and
                # non-initial sub-tokens); compute_metrics filters on it too.
                if word_idx is None or word_idx == previous_word_idx:
                    label_ids.append(-100)
                else:
                    label_ids.append(label2id[label[word_idx]])
                previous_word_idx = word_idx
            labels.append(label_ids)
        tokenized_inputs["labels"] = labels
        return tokenized_inputs

    tokenized_ds = dataset.map(tokenize_and_align_labels, batched=True)
    split = tokenized_ds.train_test_split(test_size=0.1, seed=42)

    data_collator = DataCollatorForTokenClassification(tokenizer)
    model = AutoModelForTokenClassification.from_pretrained(
        checkpoint, num_labels=len(unique_tags), id2label=id2label, label2id=label2id
    )

    def compute_metrics(p):
        predictions, labels = p
        predictions = np.argmax(predictions, axis=2)
        # Build the flat gold/predicted tag lists once, skipping -100 positions.
        flat_labels, flat_predictions = [], []
        for prediction, label in zip(predictions, labels):
            for pred_id, label_id in zip(prediction, label):
                if label_id != -100:
                    flat_labels.append(id2label[label_id])
                    flat_predictions.append(id2label[pred_id])
        results = classification_report(
            flat_labels, flat_predictions, output_dict=True, zero_division=0
        )
        return {
            "f1": results["weighted avg"]["f1-score"],
            "accuracy": accuracy_score(flat_labels, flat_predictions),
        }

    training_args = TrainingArguments(
        output_dir=os.path.join(model_save_path, 'results'),
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=3,
        weight_decay=0.01,
        save_total_limit=1,
        load_best_model_at_end=True,
        report_to="none",  # This disables wandb integration
    )

    # NOTE(review): recent transformers releases appear to prefer
    # `processing_class=` over `tokenizer=` here -- confirm against the
    # installed version before switching.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=split["train"],
        eval_dataset=split["test"],
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    print("--- Starting Model Training ---")
    trainer.train()
    print("--- Training Complete ---")

    trainer.save_model(model_save_path)
    print(f"Model and tokenizer saved to '{model_save_path}'")


class NagamesePosTagger:
    """A POS tagger for Nagamese using a fine-tuned Transformer model."""

    def __init__(self, model_path: str):
        """Load the tokenizer and token-classification model from ``model_path``.

        Raises:
            OSError: If ``model_path`` is not an existing directory.
        """
        if not os.path.isdir(model_path):
            raise OSError(f"Trained model directory not found at: {model_path}")
        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.model = AutoModelForTokenClassification.from_pretrained(model_path)
        self.model.eval()

    def _simple_word_tokenize(self, text: str):
        """Split text into runs of word characters and single punctuation marks."""
        return re.findall(r"\w+|[^\w\s]", text, re.UNICODE)

    def predict(self, text: str):
        """Predict POS tags for ``text``.

        Args:
            text: Raw sentence; it is split with ``_simple_word_tokenize``.

        Returns:
            A list of (word, tag) tuples, one per word that survived
            tokenizer truncation. Empty or whitespace-only input yields ``[]``.
        """
        words = self._simple_word_tokenize(text)
        if not words:
            # Nothing to tag; skip the tokenizer/model round trip entirely.
            return []

        inputs = self.tokenizer(
            words, is_split_into_words=True, return_tensors='pt', truncation=True
        )
        with torch.no_grad():
            logits = self.model(**inputs).logits

        predictions = torch.argmax(logits, dim=2)
        word_ids = inputs.word_ids()

        results = []
        previous_word_idx = None
        for word_idx, pred_idx in zip(word_ids, predictions[0].tolist()):
            # Keep only the first sub-token of each word; special tokens have
            # word_idx None and continuation pieces repeat the previous index.
            if word_idx is not None and word_idx != previous_word_idx:
                tag = self.model.config.id2label[pred_idx]
                results.append((words[word_idx], tag))
                previous_word_idx = word_idx
        return results

if __name__ == '__main__':
    # Input corpus and the directory the fine-tuned model is written to.
    conll_file = 'nagamese_manual_enriched.conll'
    model_dir = 'nagamese_pos_model'

    # Fine-tune and save the transformer tagger.
    train_transformer_tagger(conll_file, model_dir)

    # Reload the saved model from disk and tag one sample sentence as a smoke test.
    print("\n--- Loading Trained Model for Inference ---")
    try:
        tagger = NagamesePosTagger(model_dir)
        test_sentence = "moi ghor te jai ase aru apuni"
        pos_tags = tagger.predict(test_sentence)

        print(f"\nTest Sentence: '{test_sentence}'")
        print("Predicted POS Tags:")
        print(pos_tags)

    except OSError as e:
        # NagamesePosTagger raises OSError when the model directory is missing.
        print(f"\nError loading the model: {e}")

Map:   0%|          | 0/6743 [00:00<?, ? examples/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


--- Starting Model Training ---


  return forward_call(*args, **kwargs)


Epoch,Training Loss,Validation Loss


Epoch,Training Loss,Validation Loss,F1,Accuracy
1,No log,0.018466,0.995544,0.99556
2,0.158900,0.011189,0.997125,0.997127
3,0.011300,0.010059,0.997491,0.997493


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


--- Training Complete ---
Model and tokenizer saved to 'nagamese_pos_model'

--- Loading Trained Model for Inference ---


  return forward_call(*args, **kwargs)



Test Sentence: 'moi ghor te jai ase aru apuni'
Predicted POS Tags:
[('moi', 'PRON'), ('ghor', 'NOUN'), ('te', 'ADP'), ('jai', 'VERB'), ('ase', 'VERB'), ('aru', 'CCONJ'), ('apuni', 'PRON')]


In [None]:
# file: nltk_tagger.py

import nltk
import random
import pickle
from nltk.tag import DefaultTagger, UnigramTagger, BigramTagger, TrigramTagger
import os

def read_conll_for_nltk(path: str):
    """Load a CoNLL-style file as a list of tagged sentences.

    Each sentence is a list of (token, tag) tuples, where the token is the
    first whitespace-separated field of a line and the tag is the last one.
    Lines with a single field are skipped; blank lines separate sentences.

    Raises:
        FileNotFoundError: If nothing exists at ``path``.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(f"The CoNLL file was not found at: {path}")

    corpus = []
    current = []
    with open(path, encoding='utf-8') as handle:
        for raw in handle:
            fields = raw.split()
            if fields:
                if len(fields) >= 2:
                    current.append((fields[0], fields[-1]))
                continue
            # Blank line: close the sentence in progress, if there is one.
            if current:
                corpus.append(current)
                current = []

    # Flush a final sentence that is not followed by a blank line.
    if current:
        corpus.append(current)
    return corpus

def train_and_save_nltk_tagger(conll_path: str, model_path: str):
    """Train a default -> unigram -> bigram -> trigram backoff tagger and pickle it.

    The tagged sentences are shuffled with seed 42, split 90/10 into train and
    test portions, and the accuracy on the test portion is printed.

    Args:
        conll_path: CoNLL file read with ``read_conll_for_nltk``.
        model_path: File the trained tagger is pickled to.

    Returns:
        The trained trigram tagger (the head of the backoff chain).
    """
    corpus = read_conll_for_nltk(conll_path)
    random.seed(42)
    random.shuffle(corpus)

    # First 90% of the shuffled sentences train the tagger, the rest evaluate it.
    cut = int(len(corpus) * 0.9)
    train_sents, test_sents = corpus[:cut], corpus[cut:]

    # Each n-gram tagger falls back to the previous, simpler one; unknown
    # words end up at the DefaultTagger and are tagged NOUN.
    tagger = DefaultTagger('NOUN')
    for tagger_cls in (UnigramTagger, BigramTagger, TrigramTagger):
        tagger = tagger_cls(train_sents, backoff=tagger)

    print("--- Evaluating NLTK Tagger ---")
    accuracy = tagger.accuracy(test_sents)
    print(f"Trigram Backoff Tagger Accuracy: {accuracy:.2%}")

    with open(model_path, 'wb') as out:
        pickle.dump(tagger, out)
    print(f"NLTK model saved to '{model_path}'")
    return tagger


class NltkPosTagger:
    """Inference wrapper around a tagger object restored from a pickle file."""

    def __init__(self, model_path: str):
        """Unpickle the tagger stored at ``model_path``.

        Raises:
            FileNotFoundError: If ``model_path`` does not exist.
        """
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"Tagger model file not found at: {model_path}")
        # NOTE: unpickling executes code embedded in the file; only load
        # model files from a trusted source.
        with open(model_path, 'rb') as handle:
            restored = pickle.load(handle)
        self.tagger = restored

    def predict(self, tokens: list[str]):
        """
        Tag a sequence of already-tokenized words.

        Args:
            tokens (list[str]): Words to tag, in sentence order.

        Returns:
            Whatever the wrapped tagger's ``tag`` method returns for ``tokens``.
        """
        tagged = self.tagger.tag(tokens)
        return tagged


if __name__ == '__main__':
    # Input corpus and the pickle file the trained tagger is written to.
    conll_file = 'nagamese_manual_enriched.conll'
    nltk_model_file = 'nagamese_nltk_tagger.pkl'

    # --- Step 1: Train and save the NLTK tagger ---
    train_and_save_nltk_tagger(conll_file, nltk_model_file)

    # --- Step 2: Load the tagger and perform inference ---
    print("\n--- Loading NLTK Tagger for Inference ---")
    try:
        nltk_tagger = NltkPosTagger(nltk_model_file)
        # The NLTK wrapper expects pre-tokenized input, unlike the transformer tagger.
        test_tokens = ['moi', 'ghor', 'te', 'jai', 'ase']
        tagged_sentence = nltk_tagger.predict(test_tokens)

        print(f"\nTest Tokens: {test_tokens}")
        print(f"Predicted POS Tags: {tagged_sentence}")

    except FileNotFoundError as e:
        # NltkPosTagger raises FileNotFoundError when the pickle file is missing.
        print(f"\nError loading the NLTK model: {e}")

--- Evaluating NLTK Tagger ---
Trigram Backoff Tagger Accuracy: 99.64%
NLTK model saved to 'nagamese_nltk_tagger.pkl'

--- Loading NLTK Tagger for Inference ---

Test Tokens: ['moi', 'ghor', 'te', 'jai', 'ase']
Predicted POS Tags: [('moi', 'PRON'), ('ghor', 'NOUN'), ('te', 'ADP'), ('jai', 'VERB'), ('ase', 'VERB')]


In [None]:
# file: nmt_translator.py

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
import random
import os
import pickle
import re
import pandas as pd # <-- MISSING IMPORT ADDED HERE

# --- Data Loading ---
def load_and_prep_data(filepath: str):
    """
    Read the parallel corpus CSV and add cleaned and tokenized columns.

    The CSV must provide 'english' and 'nagamese' columns. For each of them a
    '<lang>_cleaned' column (markup-like ``<...>`` spans removed, whitespace
    collapsed, lower-cased; non-string cells become "") and a '<lang>_tokens'
    column (the cleaned text split on whitespace) are added.

    Returns:
        The augmented DataFrame, or None (after printing an error) when the
        file does not exist.
    """
    if not os.path.exists(filepath):
        print(f"Error: The file at {filepath} was not found.")
        return None

    def clean_text(text):
        # Missing cells arrive as NaN (a float), not as a string.
        if not isinstance(text, str):
            return ""
        without_tags = re.sub(r'<[^>]+>', '', text)
        collapsed = re.sub(r'\s+', ' ', without_tags)
        return collapsed.strip().lower()

    df = pd.read_csv(filepath)
    languages = ('english', 'nagamese')
    for lang in languages:
        df[f'{lang}_cleaned'] = df[lang].apply(clean_text)
    # Whitespace tokenization of the cleaned text.
    for lang in languages:
        df[f'{lang}_tokens'] = df[f'{lang}_cleaned'].apply(str.split)
    return df

# --- Vocabulary and Dataset ---
class Vocab:
    """Token <-> index mapping with four reserved special tokens.

    Indices 0-3 are '<pad>', '<sos>', '<eos>' and '<unk>'; the remaining
    entries are the corpus tokens seen at least ``min_freq`` times, in sorted
    order.
    """

    def __init__(self, tokens, min_freq=2):
        """Build the vocabulary from an iterable of token sequences."""
        self.pad_token = '<pad>'
        self.sos_token = '<sos>'
        self.eos_token = '<eos>'
        self.unk_token = '<unk>'
        self.pad_idx = 0
        self.sos_idx = 1
        self.eos_idx = 2
        self.unk_idx = 3

        # Count every token occurrence across all sequences.
        counts = Counter()
        for seq in tokens:
            counts.update(seq)

        frequent = [tok for tok in counts if counts[tok] >= min_freq]
        frequent.sort()

        reserved = [self.pad_token, self.sos_token, self.eos_token, self.unk_token]
        self.idx_to_token = reserved + frequent
        self.token_to_idx = dict(zip(self.idx_to_token, range(len(self.idx_to_token))))

    def __len__(self):
        """Number of entries, special tokens included."""
        return len(self.idx_to_token)

class TranslationDataset(Dataset):
    """Nagamese -> English sentence pairs encoded as index tensors.

    Reads the 'nagamese_tokens' (source) and 'english_tokens' (target) columns
    of ``df``; tokens missing from a vocabulary map to its ``unk_idx``.
    """

    def __init__(self, df, src_vocab, tgt_vocab):
        self.src_sents = df['nagamese_tokens'].tolist()
        self.tgt_sents = df['english_tokens'].tolist()
        self.src_vocab = src_vocab
        self.tgt_vocab = tgt_vocab

    def __len__(self):
        return len(self.src_sents)

    def __getitem__(self, idx):
        """Return (source_indices, target_indices) as 1-D long tensors."""
        src_tokens = [self.src_vocab.token_to_idx.get(tok, self.src_vocab.unk_idx) for tok in self.src_sents[idx]]
        tgt_tokens = [self.tgt_vocab.token_to_idx.get(tok, self.tgt_vocab.unk_idx) for tok in self.tgt_sents[idx]]
        # Force an integer dtype: torch.tensor([]) would otherwise be float32
        # for an empty sentence and break concatenation/embedding lookups.
        return (
            torch.tensor(src_tokens, dtype=torch.long),
            torch.tensor(tgt_tokens, dtype=torch.long),
        )

# --- Model Components ---
class Encoder(nn.Module):
    """Single-layer bidirectional GRU encoder over embedded source tokens."""

    def __init__(self, input_dim, emb_dim, hid_dim, dropout):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.GRU(input_size=emb_dim, hidden_size=hid_dim, bidirectional=True)
        # Projects the concatenated forward/backward final states to hid_dim.
        self.fc = nn.Linear(in_features=hid_dim * 2, out_features=hid_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Encode ``src`` of shape [src_len, batch_size].

        Returns:
            outputs: [src_len, batch_size, hid_dim * 2] per-step GRU states.
            hidden: [batch_size, hid_dim], tanh of a linear projection of the
                last two entries of the GRU's final hidden state.
        """
        token_vectors = self.dropout(self.embedding(src))  # [src_len, batch, emb_dim]
        outputs, final_states = self.rnn(token_vectors)
        # final_states: [2, batch, hid_dim]; the last two rows are the two directions.
        second_last, last = final_states[-2], final_states[-1]
        merged = torch.cat((second_last, last), dim=1)  # [batch, hid_dim * 2]
        hidden = torch.tanh(self.fc(merged))
        return outputs, hidden

class Attention(nn.Module):
    """Additive attention scoring encoder states against a decoder state."""

    def __init__(self, hid_dim):
        super().__init__()
        # Input is the decoder state (hid_dim) joined with a bidirectional
        # encoder state (hid_dim * 2).
        self.attn = nn.Linear(hid_dim * 3, hid_dim)
        self.v = nn.Parameter(torch.rand(hid_dim))

    def forward(self, hidden, encoder_outputs):
        """Return attention weights of shape [batch_size, src_len].

        Args:
            hidden: [batch_size, hid_dim] decoder state.
            encoder_outputs: [src_len, batch_size, hid_dim * 2].
        """
        src_len, batch_size = encoder_outputs.shape[0], encoder_outputs.shape[1]

        # Copy the decoder state along the source axis so it can be paired
        # with every encoder position.
        tiled_hidden = hidden.unsqueeze(1).repeat(1, src_len, 1)   # [batch, src_len, hid_dim]
        batch_first = encoder_outputs.transpose(0, 1)              # [batch, src_len, hid_dim * 2]

        joint = torch.cat([tiled_hidden, batch_first], dim=2)
        energy = torch.tanh(self.attn(joint)).transpose(1, 2)      # [batch, hid_dim, src_len]

        # Dot every energy column with v to get one score per source position.
        v_rows = self.v.repeat(batch_size, 1).unsqueeze(1)         # [batch, 1, hid_dim]
        scores = torch.bmm(v_rows, energy).squeeze(1)              # [batch, src_len]
        return torch.softmax(scores, dim=1)

class Decoder(nn.Module):
    """One-step GRU decoder that attends over the encoder outputs."""

    def __init__(self, output_dim, emb_dim, hid_dim, dropout, attention):
        super().__init__()
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        # The GRU consumes the token embedding joined with the attention context.
        self.rnn = nn.GRU(input_size=hid_dim * 2 + emb_dim, hidden_size=hid_dim)
        # The output layer sees GRU output, context and embedding together.
        self.fc_out = nn.Linear(in_features=hid_dim * 3 + emb_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, encoder_outputs):
        """Run a single decoding step.

        Args:
            input: [batch_size] indices of the previous target tokens.
            hidden: [batch_size, hid_dim] previous decoder state.
            encoder_outputs: [src_len, batch_size, hid_dim * 2].

        Returns:
            (prediction, hidden): vocabulary scores [batch_size, output_dim]
            and the updated state [batch_size, hid_dim].
        """
        embedded = self.dropout(self.embedding(input.unsqueeze(0)))       # [1, batch, emb_dim]

        weights = self.attention(hidden, encoder_outputs).unsqueeze(1)    # [batch, 1, src_len]
        batch_first = encoder_outputs.permute(1, 0, 2)                    # [batch, src_len, hid_dim * 2]
        context = torch.bmm(weights, batch_first).permute(1, 0, 2)        # [1, batch, hid_dim * 2]

        step_in = torch.cat((embedded, context), dim=2)
        output, new_hidden = self.rnn(step_in, hidden.unsqueeze(0))

        features = torch.cat(
            (output.squeeze(0), context.squeeze(0), embedded.squeeze(0)), dim=1
        )
        return self.fc_out(features), new_hidden.squeeze(0)

class Seq2Seq(nn.Module):
    """Encoder/decoder wrapper that decodes step by step with teacher forcing."""

    def __init__(self, encoder, decoder, device):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Produce decoder scores for every target position.

        Args:
            src: [src_len, batch_size] source indices.
            trg: [trg_len, batch_size] target indices; row 0 seeds the decoder.
            teacher_forcing_ratio: Probability, drawn once per step, of
                feeding the gold token instead of the model's own argmax.

        Returns:
            Tensor [trg_len, batch_size, vocab]; row 0 is left as zeros.
        """
        batch_size = src.shape[1]
        trg_len = trg.shape[0]
        scores = torch.zeros(trg_len, batch_size, self.decoder.output_dim).to(self.device)

        memory, state = self.encoder(src)
        step_input = trg[0, :]
        for t in range(1, trg_len):
            step_scores, state = self.decoder(step_input, state, memory)
            scores[t] = step_scores
            if random.random() < teacher_forcing_ratio:
                step_input = trg[t]
            else:
                step_input = step_scores.argmax(1)
        return scores

def collate_fn(batch, src_vocab, tgt_vocab, device):
    """Turn (source, target) index tensors into two padded batch tensors.

    Each sequence is wrapped in its vocabulary's SOS/EOS indices, padded with
    the vocabulary's pad index to the longest sequence in the batch, and moved
    to ``device``.

    Returns:
        (src_padded, tgt_padded), each shaped [max_len, batch_size].
    """
    def wrap(seq, vocab):
        # <sos> + sequence + <eos>
        return torch.cat(
            [torch.tensor([vocab.sos_idx]), seq, torch.tensor([vocab.eos_idx])], dim=0
        )

    wrapped_src = [wrap(src_sample, src_vocab) for src_sample, _ in batch]
    wrapped_tgt = [wrap(tgt_sample, tgt_vocab) for _, tgt_sample in batch]

    src_padded = pad_sequence(wrapped_src, padding_value=src_vocab.pad_idx)
    tgt_padded = pad_sequence(wrapped_tgt, padding_value=tgt_vocab.pad_idx)
    return src_padded.to(device), tgt_padded.to(device)

def train_model(model, loader, optimizer, criterion):
    """Run one training epoch and return the mean loss per batch.

    For every (src, trg) batch the model output and the target are both
    stripped of their first time step, flattened, and scored with
    ``criterion``; gradients are clipped to norm 1 before the optimizer step.
    """
    model.train()
    running_loss = 0
    for src, trg in loader:
        optimizer.zero_grad()

        scores = model(src, trg)
        vocab_size = scores.shape[-1]
        # Position 0 holds the start token and is never predicted, so drop it
        # from both sides before flattening to [N, vocab] / [N].
        flat_scores = scores[1:].view(-1, vocab_size)
        flat_targets = trg[1:].view(-1)

        loss = criterion(flat_scores, flat_targets)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
        optimizer.step()

        running_loss += loss.item()
    return running_loss / len(loader)

class Translator:
    """Greedy-decoding inference wrapper around a saved Seq2Seq model."""

    def __init__(self, model_path: str, vocabs_path: str, device, *,
                 enc_emb_dim: int = 256, dec_emb_dim: int = 256,
                 hid_dim: int = 512, dropout: float = 0.5):
        """Load the vocabularies and model weights.

        Args:
            model_path: File holding the model ``state_dict``.
            vocabs_path: Pickle file holding a (src_vocab, tgt_vocab) pair.
            device: Torch device the model and inputs are placed on.
            enc_emb_dim, dec_emb_dim, hid_dim, dropout: Architecture settings;
                they must match the values the weights were trained with.

        Raises:
            FileNotFoundError: If either file is missing.
        """
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"NMT model not found: {model_path}")
        if not os.path.exists(vocabs_path):
            raise FileNotFoundError(f"Vocab file not found: {vocabs_path}")

        # NOTE: unpickling executes code embedded in the file; only load
        # vocabulary files from a trusted source.
        with open(vocabs_path, 'rb') as f:
            self.src_vocab, self.tgt_vocab = pickle.load(f)

        self.device = device

        # Rebuild the architecture so the saved weights can be loaded into it.
        enc = Encoder(len(self.src_vocab), enc_emb_dim, hid_dim, dropout)
        attn = Attention(hid_dim)
        dec = Decoder(len(self.tgt_vocab), dec_emb_dim, hid_dim, dropout, attn)
        self.model = Seq2Seq(enc, dec, device).to(device)
        self.model.load_state_dict(torch.load(model_path, map_location=device))
        self.model.eval()

    def translate(self, sentence: str, max_len=50):
        """Translate one Nagamese sentence to English by greedy decoding.

        Args:
            sentence: Source text; it is split on whitespace and lower-cased.
            max_len: Maximum number of target tokens to generate.

        Returns:
            The generated tokens joined by spaces, without the SOS/EOS
            markers. Decoding stops at the first EOS or after ``max_len``
            tokens, whichever comes first.
        """
        tokens = [tok.lower() for tok in sentence.split()]
        src_indexes = (
            [self.src_vocab.sos_idx]
            + [self.src_vocab.token_to_idx.get(t, self.src_vocab.unk_idx) for t in tokens]
            + [self.src_vocab.eos_idx]
        )
        src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(self.device)

        generated = []
        with torch.no_grad():
            encoder_outputs, hidden = self.model.encoder(src_tensor)
            previous = self.tgt_vocab.sos_idx
            for _ in range(max_len):
                trg_tensor = torch.LongTensor([previous]).to(self.device)
                output, hidden = self.model.decoder(trg_tensor, hidden, encoder_outputs)
                previous = output.argmax(1).item()
                if previous == self.tgt_vocab.eos_idx:
                    break
                # Only non-EOS tokens are collected, so nothing has to be
                # sliced off afterwards and the final token is never lost
                # when max_len is reached without an EOS.
                generated.append(previous)

        return " ".join(self.tgt_vocab.idx_to_token[i] for i in generated)

if __name__ == '__main__':
    # --- Configuration ---
    N_EPOCHS = 10
    MODEL_PATH = 'nmt-nagamese-english.pt'
    VOCABS_PATH = 'nmt-vocabs.pkl'
    DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    print(f"Using device: {DEVICE}")

    # --- 1. Load and Prepare Data ---
    df = load_and_prep_data('merged.csv')
    if df is not None:
        src_vocab = Vocab(df['nagamese_tokens'].tolist())
        tgt_vocab = Vocab(df['english_tokens'].tolist())

        # Persist both vocabularies so Translator can rebuild the same mapping.
        with open(VOCABS_PATH, 'wb') as f:
            pickle.dump((src_vocab, tgt_vocab), f)
        print(f"Source vocab size: {len(src_vocab)}")
        print(f"Target vocab size: {len(tgt_vocab)}")

        dataset = TranslationDataset(df, src_vocab, tgt_vocab)
        train_size = int(0.9 * len(dataset))
        test_size = len(dataset) - train_size
        # Only the training split is used below; the held-out split is kept
        # available for later evaluation.
        train_dataset, test_dataset = random_split(dataset, [train_size, test_size])

        # DataLoader calls collate_fn with the batch only, so bind the
        # vocabularies and device here.
        collate_with_args = lambda batch: collate_fn(batch, src_vocab, tgt_vocab, DEVICE)
        train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_with_args)

        # --- 2. Initialize Model ---
        # These values must match the ones Translator uses to rebuild the model.
        ENC_EMB_DIM = 256
        DEC_EMB_DIM = 256
        HID_DIM = 512
        DROPOUT = 0.5

        enc = Encoder(len(src_vocab), ENC_EMB_DIM, HID_DIM, DROPOUT)
        attn = Attention(HID_DIM)
        dec = Decoder(len(tgt_vocab), DEC_EMB_DIM, HID_DIM, DROPOUT, attn)
        model = Seq2Seq(enc, dec, DEVICE).to(DEVICE)

        optimizer = optim.Adam(model.parameters())
        # The loss is computed over target tokens, so the padding index to
        # ignore is the target vocabulary's.
        criterion = nn.CrossEntropyLoss(ignore_index=tgt_vocab.pad_idx)

        # --- 3. Train the Model ---
        print("\n--- Starting NMT Model Training ---")
        for epoch in range(N_EPOCHS):
            train_loss = train_model(model, train_loader, optimizer, criterion)
            print(f'Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f}')

        torch.save(model.state_dict(), MODEL_PATH)
        print(f"Model saved to {MODEL_PATH}")

        # --- 4. Load and Test the Translator ---
        print("\n--- Loading Trained Model for Inference ---")
        translator = Translator(MODEL_PATH, VOCABS_PATH, DEVICE)
        test_sentence = "moi ghor te jai ase"
        translation = translator.translate(test_sentence)
        print(f"Nagamese Input: '{test_sentence}'")
        print(f"Predicted English Translation: '{translation}'")

Using device: cuda
Source vocab size: 3049
Target vocab size: 3797

--- Starting NMT Model Training ---
Epoch: 01 | Train Loss: 5.616
Epoch: 02 | Train Loss: 4.918
Epoch: 03 | Train Loss: 4.429
Epoch: 04 | Train Loss: 4.029
Epoch: 05 | Train Loss: 3.715
Epoch: 06 | Train Loss: 3.447
Epoch: 07 | Train Loss: 3.263
Epoch: 08 | Train Loss: 3.102
Epoch: 09 | Train Loss: 2.953
Epoch: 10 | Train Loss: 2.849
Model saved to nmt-nagamese-english.pt

--- Loading Trained Model for Inference ---
Nagamese Input: 'moi ghor te jai ase'
Predicted English Translation: 'i am going to my house'


In [None]:
# file: subword_tokenizer.py

import pandas as pd
import sentencepiece as spm
import os
import re

def load_data_for_spm(filepath: str):
    """Read the parallel corpus CSV and add cleaned text columns for SentencePiece.

    Adds 'english_cleaned' and 'nagamese_cleaned', in which ``<...>`` spans are
    removed and whitespace runs are collapsed; case is preserved and
    non-string cells become "".

    Returns:
        The augmented DataFrame, or None (after printing an error) when the
        file does not exist.
    """
    if not os.path.exists(filepath):
        print(f"Error: The file at {filepath} was not found.")
        return None

    def clean_text(text):
        # Missing cells arrive as NaN (a float), not as a string.
        if not isinstance(text, str):
            return ""
        without_tags = re.sub(r'<[^>]+>', '', text)
        return re.sub(r'\s+', ' ', without_tags).strip()

    df = pd.read_csv(filepath)
    for lang in ('english', 'nagamese'):
        df[f'{lang}_cleaned'] = df[lang].apply(clean_text)
    return df

def train_sentencepiece_model(df, model_prefix='naga_eng_bpe', vocab_size=8000):
    """
    Train one shared SentencePiece BPE model on both sides of the corpus.

    All Nagamese sentences followed by all English sentences are written, one
    per line, to 'joint_corpus.txt' in the working directory, and the trainer
    is run on that file.

    Args:
        df (pd.DataFrame): Must contain 'nagamese_cleaned' and 'english_cleaned'.
        model_prefix (str): Prefix of the output files (.model, .vocab).
        vocab_size (int): Target vocabulary size passed to the trainer.
    """
    # 1. Write the joint corpus, Nagamese first, then English.
    joint_corpus_path = 'joint_corpus.txt'
    with open(joint_corpus_path, 'w', encoding='utf-8') as corpus:
        for column in ('nagamese_cleaned', 'english_cleaned'):
            corpus.writelines(f"{text}\n" for text in df[column].tolist())

    print(f"Joint corpus file created at '{joint_corpus_path}'")

    # 2. Assemble the command-line style argument string and train.
    trainer_args = ' '.join((
        f'--input={joint_corpus_path}',
        f'--model_prefix={model_prefix}',
        f'--vocab_size={vocab_size}',
        '--model_type=bpe',
        '--character_coverage=1.0',
    ))
    spm.SentencePieceTrainer.Train(trainer_args)
    print(f"SentencePiece model trained. Files '{model_prefix}.model' and '{model_prefix}.vocab' are saved.")

class SubwordTokenizer:
    """Thin convenience layer over a trained SentencePiece model file."""

    def __init__(self, model_path: str):
        """Load the SentencePiece model stored at ``model_path``.

        Raises:
            FileNotFoundError: If ``model_path`` does not exist.
        """
        if not os.path.exists(model_path):
            raise FileNotFoundError(f"SentencePiece model file not found at: {model_path}")
        processor = spm.SentencePieceProcessor()
        processor.load(model_path)
        self.sp = processor

    def tokenize(self, text: str):
        """Split ``text`` into subword pieces."""
        pieces = self.sp.encode_as_pieces(text)
        return pieces

    def detokenize(self, pieces: list[str]):
        """Join subword pieces back into a plain string."""
        text = self.sp.decode_pieces(pieces)
        return text

if __name__ == '__main__':
    # --- 1. Load Data ---
    # load_data_for_spm returns None (after printing an error) if the CSV is missing.
    dataframe = load_data_for_spm('merged.csv')

    if dataframe is not None:
        # --- 2. Train the Model ---
        model_prefix = 'nagamese_english_spm'
        train_sentencepiece_model(dataframe, model_prefix=model_prefix, vocab_size=8000)

        # --- 3. Load and Test the Tokenizer ---
        print("\n--- Loading and Testing the Subword Tokenizer ---")
        try:
            tokenizer = SubwordTokenizer(f'{model_prefix}.model')

            # Round-trip one sentence: tokenize into pieces, then rebuild the text.
            test_sentence = "abraham laga chokra david laga chokra"
            tokens = tokenizer.tokenize(test_sentence)
            reconstructed = tokenizer.detokenize(tokens)

            print(f"\nOriginal: {test_sentence}")
            print(f"Tokens: {tokens}")
            print(f"Reconstructed: {reconstructed}")

        except FileNotFoundError as e:
            # SubwordTokenizer raises FileNotFoundError when the .model file is missing.
            print(f"Error: {e}")

Joint corpus file created at 'joint_corpus.txt'
SentencePiece model trained. Files 'nagamese_english_spm.model' and 'nagamese_english_spm.vocab' are saved.

--- Loading and Testing the Subword Tokenizer ---

Original: abraham laga chokra david laga chokra
Tokens: ['▁ab', 'raham', '▁laga', '▁chokra', '▁da', 'vid', '▁laga', '▁chokra']
Reconstructed: abraham laga chokra david laga chokra


In [None]:
# Colab Setup Cell
!pip install -q transformers[torch] datasets scikit-learn pandas nltk sentencepiece
# Clone and install awesome-align
!git clone https://github.com/neulab/awesome-align.git
%cd awesome-align
!pip install -e .
%cd ..

import nltk
# Make sure NLTK's 'punkt' tokenizer data is available: nltk.data.find raises
# LookupError when the resource is not installed, and only then is it downloaded.
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

# Remind the user that the parallel-corpus CSV must be uploaded.
print("\nSetup complete. Ensure 'merged.csv' is uploaded.")

[0mCloning into 'awesome-align'...
remote: Enumerating objects: 343, done.[K
remote: Counting objects: 100% (115/115), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 343 (delta 100), reused 86 (delta 86), pack-reused 228 (from 1)[K
Receiving objects: 100% (343/343), 596.38 KiB | 17.04 MiB/s, done.
Resolving deltas: 100% (207/207), done.
/content/awesome-align
[0mObtaining file:///content/awesome-align
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting boto3 (from awesome_align==0.1.7)
  Downloading boto3-1.39.17-py3-none-any.whl.metadata (6.7 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.2.0->awesome_align==0.1.7)
  Using cached nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting botocore<1.40.0,>=1.39.17 (from boto3->awesome_align==0.1.7)
  Downloading botocore-1.39.17-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3->awesome_align==0.1.7)
  Downloading jmespa

In [None]:
# file: word_aligner.py

import pandas as pd
import os
import re
import subprocess

def load_data_for_aligner(filepath: str):
    """Read the parallel corpus CSV and prepare it for word alignment.

    Rows missing either 'english' or 'nagamese' are dropped, then
    'english_cleaned' and 'nagamese_cleaned' columns are added in which
    whitespace runs are collapsed to single spaces and the ends are trimmed.

    Returns:
        The prepared DataFrame, or None (after printing an error) when the
        file does not exist.
    """
    if not os.path.exists(filepath):
        print(f"Error: The file at {filepath} was not found.")
        return None

    def clean_text(text):
        if not isinstance(text, str):
            return ""
        collapsed = re.sub(r'\s+', ' ', text)
        return collapsed.strip()

    raw = pd.read_csv(filepath)
    # Keep only sentence pairs where both sides are present.
    df = raw.dropna(subset=['english', 'nagamese'])
    for lang in ('english', 'nagamese'):
        df[f'{lang}_cleaned'] = df[lang].apply(clean_text)
    return df

def align_corpus(df, output_file='alignments.txt'):
    """
    Runs awesome-align on the parallel corpus to generate word alignments.

    Every sentence pair is written to 'aligner_input.txt' as
    ``<english> ||| <nagamese>`` (one pair per line) and the awesome-align
    script is then executed as a subprocess on that file.

    Args:
        df (pd.DataFrame): DataFrame with 'nagamese_cleaned' and 'english_cleaned' columns.
        output_file (str): The file to save the alignments to.

    Returns:
        None. Progress and failures are reported with print(); a missing
        alignment script or a non-zero exit status does not raise.
    """
    # 1. Prepare the input file for awesome-align.
    # The aligner expects "source ||| target" on each line. A tab-separated
    # file is rejected line by line ("not in the correct format. Skipping..."),
    # which leaves the alignment output empty.
    input_file = 'aligner_input.txt'
    with open(input_file, 'w', encoding='utf-8') as f:
        for english, nagamese in zip(df['english_cleaned'], df['nagamese_cleaned']):
            f.write(f"{english} ||| {nagamese}\n")

    print(f"Input file for aligner created at '{input_file}'")

    # 2. Run the awesome-align command.
    # The script lives at the root of the cloned repository, relative to the
    # current working directory.
    align_script_path = 'awesome-align/run_align.py'
    if not os.path.exists(align_script_path):
        print(f"Error: Alignment script not found at '{align_script_path}'.")
        print("Please ensure you have run the setup cell to clone and install awesome-align correctly.")
        return

    model_name = 'bert-base-multilingual-cased'
    command = [
        'python3', align_script_path,
        '--model_name_or_path', model_name,
        '--data_file', input_file,
        '--output_file', output_file,
        '--extraction', 'softmax',
        '--batch_size', '32'
    ]

    print("\n--- Starting Word Alignment (this may take several minutes) ---")
    try:
        # check=True turns a non-zero exit status into CalledProcessError;
        # the captured output is only printed once the process has finished.
        process = subprocess.run(
            command, check=True, capture_output=True, text=True
        )
        print(process.stdout)  # Print the output from the script
        print(f"--- Alignment Complete. Results saved to '{output_file}' ---")
    except subprocess.CalledProcessError as e:
        print("--- An error occurred during alignment. ---")
        print(f"Return Code: {e.returncode}")
        print("----- STDOUT -----")
        print(e.stdout)
        print("----- STDERR -----")
        print(e.stderr)

if __name__ == '__main__':
    # Step 1: read the parallel corpus; None means the CSV file was not found.
    dataframe = load_data_for_aligner('merged.csv')

    if dataframe is not None:
        # Step 2: align only the first 100 pairs as a quick demonstration.
        # For the full dataset call: align_corpus(dataframe)
        align_corpus(dataframe.head(100), output_file='alignments_sample.txt')

        # Step 3: print at most the first five lines of the alignment file.
        print("\n--- Sample of Generated Alignments ---")
        try:
            sample_file = open('alignments_sample.txt', 'r', encoding='utf-8')
        except FileNotFoundError:
            print("Alignment file not found. The alignment process may have failed.")
        else:
            with sample_file:
                for _ in range(5):
                    alignment = sample_file.readline()
                    if not alignment:
                        # readline() returns '' only at end of file.
                        break
                    print(alignment.strip())

Input file for aligner created at 'aligner_input.txt'

--- Starting Word Alignment (this may take several minutes) ---
Loading the dataset...
Line "The book of the genealogy of Jesus Christ son of David son of Abraham	Abraham laga chokra, David laga chokra, Jisu Khrista laga purbo khandan laga likhikena rakha kitab." (offset in bytes: 170) is not in the correct format. Skipping...
Line "Abraham fathered Isaac and Isaac fathered Jacob and Jacob fathered Judah and his brothers	Abraham, Isaac laga baba hoise, aru Isaac, Jacob laga baba hoise, aru Jacob, Judah aru tai laga bhai-kokai khan laga baba hoise." (offset in bytes: 389) is not in the correct format. Skipping...
Line "and Judah fathered Perez and Zerah by Tamar and Perez fathered Hezrom and Hezrom fathered Aram	Judah pora Perez laga baba, aru Zerah pora Tamar laga baba, Perez pora Hezron laga baba hoise, aru Hezron, Ram laga baba hoise." (offset in bytes: 612) is not in the correct format. Skipping...
Line "and Aram fathered Ammina

In [None]:
# file: main.py

import argparse
from transformer_tagger import train_transformer_tagger, NagamesePosTagger
from nltk_tagger import train_and_save_nltk_tagger, NltkPosTagger
from nmt_translator import Translator as NMT_Translator
# You can add imports for the subword tokenizer and word aligner here if you want to control them from the CLI

def _build_parser():
    """Builds the argument parser with one sub-command per toolkit action."""
    parser = argparse.ArgumentParser(description="Nagamese NLP Toolkit Command-Line Interface")
    subparsers = parser.add_subparsers(dest='command', required=True, help='Available commands')

    # --- Command to train the Transformer POS Tagger ---
    parser_train_tagger = subparsers.add_parser('train-tagger', help='Train the Transformer POS Tagger.')
    parser_train_tagger.add_argument('--conll_file', type=str, default='nagamese_manual_enriched.conll', help='Path to the CoNLL training file.')
    parser_train_tagger.add_argument('--model_dir', type=str, default='nagamese_pos_model', help='Directory to save the trained model.')

    # --- Command to tag text using the Transformer model ---
    parser_tag = subparsers.add_parser('tag', help='Tag a sentence using the trained Transformer POS model.')
    parser_tag.add_argument('text', type=str, help='The Nagamese sentence to tag.')
    parser_tag.add_argument('--model_dir', type=str, default='nagamese_pos_model', help='Directory of the trained model.')

    # --- Command to train the NLTK Tagger ---
    parser_train_nltk = subparsers.add_parser('train-nltk-tagger', help='Train the NLTK POS Tagger.')
    parser_train_nltk.add_argument('--conll_file', type=str, default='nagamese_manual_enriched.conll', help='Path to the CoNLL training file.')
    parser_train_nltk.add_argument('--model_path', type=str, default='nagamese_nltk_tagger.pkl', help='Path to save the pickled NLTK tagger.')

    # --- Command to translate text using the NMT model ---
    parser_translate = subparsers.add_parser('translate', help='Translate a Nagamese sentence to English.')
    parser_translate.add_argument('text', type=str, help='The Nagamese sentence to translate.')
    parser_translate.add_argument('--model_path', type=str, default='nmt-nagamese-english.pt', help='Path to the trained NMT model state dictionary.')
    parser_translate.add_argument('--vocabs_path', type=str, default='nmt-vocabs.pkl', help='Path to the pickled vocab file.')

    return parser


def main(argv=None):
    """Parses the command line and runs the selected toolkit command.

    Args:
        argv: Optional list of argument strings (without the program name).
            When None, argparse reads the process command line, which is the
            behaviour of a plain ``main()`` call. Passing a list allows the
            CLI to be driven from other code or from a notebook cell, where
            the process arguments belong to the kernel rather than this tool.

    Raises:
        SystemExit: Raised by argparse for '--help' or for invalid or missing
            arguments.
    """
    args = _build_parser().parse_args(argv)

    # --- Execute the chosen command ---
    if args.command == 'train-tagger':
        print("Starting Transformer POS tagger training...")
        train_transformer_tagger(args.conll_file, args.model_dir)
        print("Training complete.")

    elif args.command == 'tag':
        try:
            tagger = NagamesePosTagger(args.model_dir)
            tags = tagger.predict(args.text)
            print(tags)
        except OSError as e:
            print(f"Error: {e}. Have you trained the model first with the 'train-tagger' command?")

    elif args.command == 'train-nltk-tagger':
        print("Starting NLTK POS tagger training...")
        train_and_save_nltk_tagger(args.conll_file, args.model_path)
        print("Training complete.")

    elif args.command == 'translate':
        # Imported here so the other commands do not need torch loaded by this function.
        import torch
        try:
            device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
            translator = NMT_Translator(args.model_path, args.vocabs_path, device)
            translation = translator.translate(args.text)
            print(f"Translation: {translation}")
        except OSError as e:
            # FileNotFoundError is a subclass of OSError, so one clause covers both.
            print(f"Error: {e}. Ensure the NMT model and vocabs exist by running the `nmt_translator.py` script first.")


# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == '__main__':
    main()