<a href="https://colab.research.google.com/github/sebDagnachew/final-project-object-detection/blob/main/LLM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import sentencepiece as spm  #imports the SentencePiece library,
import time  #import time

# Trains a SentencePiece tokenizer on `corpus_file`, reloads the trained model,
# tokenizes the same corpus line by line and prints intrinsic tokenization
# metrics: tokens per word, average per-line subword fertility, and the
# proportion of words that are split into more than one piece (PCW).

# ---------- 1. Train SentencePiece tokenizer ----------
corpus_file = '/content/1. What is a Patent.txt'   # your Amharic corpus file
model_prefix = 'amharic_sp'
vocab_size = 400                  # can also try 500

print("Training SentencePiece tokenizer...")

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# short intervals; wall-clock time.time() can jump if the system clock changes.
start_time = time.perf_counter()

spm.SentencePieceTrainer.train(
    input=corpus_file,
    model_prefix=model_prefix,
    vocab_size=vocab_size,
    model_type='unigram',       # recommended for Amharic
    character_coverage=1.0,     # include all characters found in the corpus
    pad_id=0,                   # padding
    unk_id=1,                   # unknown pieces
    bos_id=2,                   # beginning of sentence
    eos_id=3,                   # end of sentence
)

training_time = time.perf_counter() - start_time
print(f"Training completed in {training_time:.2f} seconds.\n")

# ---------- 2. Load the trained model ----------
# The model file name is derived from the prefix passed to the trainer above.
sp = spm.SentencePieceProcessor(model_file=f'{model_prefix}.model')

# ---------- 3. Read corpus and tokenize ----------
with open(corpus_file, 'r', encoding='utf-8') as f:
    lines = f.readlines()

total_words = 0                 # whitespace-separated words over all non-blank lines
total_tokens = 0                # SentencePiece pieces over all non-blank lines
subword_fertility_list = []     # one tokens/words ratio per non-blank line
continued_words_count = 0       # words split into more than one piece (for PCW)

tokenization_start = time.perf_counter()

for line in lines:
    line = line.strip()  # drop surrounding whitespace and the trailing newline
    if not line:
        continue

    # A non-empty stripped line always yields at least one word, so the
    # per-line ratio below cannot divide by zero.
    words = line.split()
    total_words += len(words)

    tokens = sp.encode(line, out_type=str)
    total_tokens += len(tokens)

    # Subword fertility per line
    subword_fertility_list.append(len(tokens) / len(words))

    # A word counts as "continued" when, encoded on its own, it is split into
    # more than one piece.
    for word in words:
        word_tokens = sp.encode(word, out_type=str)  # use out_type=int for token ids
        if len(word_tokens) > 1:
            continued_words_count += 1

tokenization_time = time.perf_counter() - tokenization_start

# ---------- 4. Calculate metrics ----------
# Defensive guards: report 0.0 instead of raising ZeroDivisionError when the
# corpus contributed no words (e.g. it contains only blank lines).
if total_words:
    normalized_sequence_length = total_tokens / total_words
    proportion_continued_words = continued_words_count / total_words
else:
    normalized_sequence_length = 0.0
    proportion_continued_words = 0.0

if subword_fertility_list:
    average_subword_fertility = sum(subword_fertility_list) / len(subword_fertility_list)
else:
    average_subword_fertility = 0.0

print("----- Tokenization Metrics -----")
print(f"Total words in corpus: {total_words}")
print(f"Total tokens produced: {total_tokens}")
print(f"Normalized Sequence Length (tokens/word): {normalized_sequence_length:.3f}")
print(f"Average Subword Fertility: {average_subword_fertility:.3f}")
print(f"Proportion of Continued Words (PCW): {proportion_continued_words:.3f}")
print(f"Tokenization execution time: {tokenization_time:.2f} seconds")


Training SentencePiece tokenizer...
Training completed in 0.03 seconds.

----- Tokenization Metrics -----
Total words in corpus: 607
Total tokens produced: 1645
Normalized Sequence Length (tokens/word): 2.710
Average Subword Fertility: 2.802
Proportion of Continued Words (PCW): 0.764
Tokenization execution time: 0.01 seconds
