In [47]:
import sentencepiece as spm
import pandas as pd

In [48]:
# Load the English/Tawra sentence table from the Excel workbook.
data = pd.read_excel(io='tawra_5k.xlsx')

In [49]:
# Preview the first five rows of the loaded table.
data.head(n=5)

Unnamed: 0,ENGLISH,TARAON/TAWRA
0,Run!,chow na
1,Fire!,tamung
2,Help!,di brungna
3,Stop!,na naa
4,Wait!,kalyuna


In [50]:
# Pull the two language columns out of the frame as Series.
# NOTE(review): the English key is 'ENGLISH ' with a trailing space, while the
# head() preview prints the header as "ENGLISH" - confirm against data.columns.
english_file, tawra_file = data['ENGLISH '], data['TARAON/TAWRA']

In [51]:
# Save each language column to its own text file via to_csv, without the index.
for column_series, output_path in ((english_file, 'english.txt'), (tawra_file, 'tawra.txt')):
    column_series.to_csv(output_path, index=False)

In [64]:
# Build the corpus for the combined SentencePiece model: one sentence per line.
# data.to_csv would also write the header row, the "Unnamed: 0" index column and
# CSV commas/quotes, all of which the tokenizer would then learn as if they were text.
combined_corpus = (
    pd.concat([data['ENGLISH '], data['TARAON/TAWRA']], ignore_index=True)
    .dropna()                              # skip cells with no sentence
    .astype(str)
    .str.replace(r'\s+', ' ', regex=True)  # keep every sentence on a single line
    .str.strip()
)
# Drop entries that were only whitespace so the corpus has no blank lines.
combined_corpus = combined_corpus[combined_corpus != '']

with open('combined.txt', 'w', encoding='utf-8') as corpus_file:
    corpus_file.writelines(f'{sentence}\n' for sentence in combined_corpus)

In [65]:
def load_parallel_data(combined_file):
    """
    Read the combined corpus file into a list of lines.
    Args:
        combined_file (str): The file path to the combined sentences (read as UTF-8).
    Returns:
        list: One string per line of the file, in file order, with leading and
            trailing whitespace removed. Blank lines are kept as empty strings.
    """
    stripped_lines = []
    with open(combined_file, mode='r', encoding='utf-8') as handle:
        # Iterating the handle yields the same lines as readlines().
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

# Read the combined corpus back in as a list of stripped lines.
combined_sentences = load_parallel_data(combined_file='combined.txt')

In [74]:
# Set the vocabulary size for the SentencePiece model
vocab_size = 1000

def train_sentencepiece_model(data, model_prefix, vocab_size=vocab_size, **trainer_options):
    """
    Train a SentencePiece model on the given data.
    Args:
        data (str): The file path to the input data (passed to the trainer as ``input``).
        model_prefix (str): The prefix for the output model files.
        vocab_size (int): The size of the vocabulary for the model. Defaults to the
            value the module-level ``vocab_size`` had when this function was
            defined (1000); rebinding that variable later does not change the default.
        **trainer_options: Extra keyword arguments forwarded unchanged to
            ``spm.SentencePieceTrainer.train`` (for example ``model_type`` or
            ``character_coverage``). None are required.
    """
    spm.SentencePieceTrainer.train(
        input=data,
        model_prefix=model_prefix,
        vocab_size=vocab_size,
        **trainer_options,
    )

# Train the SentencePiece model on the combined texts
train_sentencepiece_model(data='combined.txt', model_prefix='eng_taw', vocab_size=vocab_size)

In [75]:
def load_sentencepiece_model(model_prefix):
    """
    Create a SentencePiece processor and load a trained model into it.
    Args:
        model_prefix (str): The prefix of the model files; the file
            "<model_prefix>.model" is the one that gets loaded.
    Returns:
        spm.SentencePieceProcessor: The processor with the model loaded.
    """
    model_path = f"{model_prefix}.model"
    processor = spm.SentencePieceProcessor()
    processor.load(model_path)
    return processor

# Load the combined English/Tawra SentencePiece model
combined_sp = load_sentencepiece_model(model_prefix='eng_taw')


In [76]:
def build_vocab(data, sp):
    """
    Count how often each SentencePiece token occurs across a list of sentences.
    Args:
        data (list): The sentences to tokenize.
        sp (spm.SentencePieceProcessor): The model used to tokenize each sentence;
            it is called as ``sp.encode(sentence, out_type=str)``.
    Returns:
        dict: Maps each token string to the number of times it was produced,
            with keys in order of first appearance.
    """
    counts = {}
    for text in data:
        for piece in sp.encode(text, out_type=str):
            # A token not seen before starts from 0.
            counts[piece] = counts.get(piece, 0) + 1
    return counts

In [77]:
# Count token frequencies over the combined corpus with the combined model
combined_vocab = build_vocab(data=combined_sentences, sp=combined_sp)