<a href="https://colab.research.google.com/github/sebinbusra/Translation/blob/main/Tokenizer_Models_and_Translation_Evaluations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**1. Import necessary libraries and modules**

In [8]:
!pip install tokenizers
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers, decoders
import unicodedata

Collecting tokenizers
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m17.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.13.3


In [9]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

**2. TOKENIZER TYPES**

2.1. BPE TOKENIZER

In [5]:
# Step 1: Applying normalization to clear invalid characters from the text
def apply_normalization(text):
    # Normalize the text to NFC (Normalization Form C)
    normalized_text = unicodedata.normalize("NFC", text)
    return normalized_text

# Step 2: Training the tokenizer according to the predetermined vocabulary size
def train_tokenizer(corpus, vocab_size):
    tokenizer = Tokenizer(models.BPE())
    trainer = trainers.BpeTrainer(special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"], vocab_size=vocab_size)
    tokenizer.train_from_iterator(corpus, trainer=trainer)
    return tokenizer

# Step 3: Processing the corpus with the trained tokenizer to obtain a tokenized pretraining data
def preprocess_with_tokenizer(tokenizer, text):
    # Lowercase conversion
    text = text.lower()

    # NFC normalization
    normalized_text = unicodedata.normalize("NFC", text)

    # Tokenize the text
    encoded_text = tokenizer.encode(normalized_text)
    return encoded_text.ids

# Sample usage
corpus = [
    "Merhaba! Nasılsın?",
    "Bugün hava çok güzel.",
    "Yarın buluşmak için plan yapıyoruz.",
    "Türk mutfağı dünya çapında ünlüdür.",
    "Köpeğimle parkta gezinti yapıyorum.",
    "Bir fincan kahve içer misin?",
    "Okula gitmek için erken kalkmalıyız.",
    "Sonbahar renkleri harika.",
    "Dizi izlemekten hoşlanıyorum.",
    "Eski dostlarla buluşmak her zaman keyiflidir."
]

# Step 1: Applying normalization to clear invalid characters from the text
corpus_normalized = [apply_normalization(text) for text in corpus]
print("Step 1: Applying normalization to clear invalid characters:")
print(corpus_normalized)

# Step 2: Training the tokenizer according to the predetermined vocabulary size
vocab_size = 1000  # You can adjust the vocabulary size based on your requirements
tokenizer = train_tokenizer(corpus_normalized, vocab_size)
print("\nStep 2: Trained Tokenizer:")
print(tokenizer.get_vocab())

# Step 3: Processing the corpus with the trained tokenizer to obtain a tokenized pretraining data
preprocessed_data = [preprocess_with_tokenizer(tokenizer, text) for text in corpus_normalized]
print("\nStep 3: Tokenized Pretraining Data:")
print(preprocessed_data)


Step 1: Applying normalization to clear invalid characters:
['Merhaba! Nasılsın?', 'Bugün hava çok güzel.', 'Yarın buluşmak için plan yapıyoruz.', 'Türk mutfağı dünya çapında ünlüdür.', 'Köpeğimle parkta gezinti yapıyorum.', 'Bir fincan kahve içer misin?', 'Okula gitmek için erken kalkmalıyız.', 'Sonbahar renkleri harika.', 'Dizi izlemekten hoşlanıyorum.', 'Eski dostlarla buluşmak her zaman keyiflidir.']

Step 2: Trained Tokenizer:
{'Okula gitmek için erken kalkmalıyız.': 263, 'ula g': 183, 'kta g': 194, 'parkta gezinti yapıyorum.': 253, 'st': 158, 'kahv': 181, 'el': 113, 'mek için ': 196, 'ı dü': 167, 'ıyoruz.': 184, 'iç': 57, 'ezinti yapıyorum.': 251, 'l': 29, 'Bir': 96, 'buluşmak için plan yapıyoruz.': 240, 'y': 39, 'şlan': 169, 'k her ': 174, 'uş': 79, 'imle ': 218, 'can ': 109, 'i iz': 175, 'erken ': 173, 'n': 31, 'lemekten ho': 225, 'ız.': 166, 'Sonbahar renkl': 248, 'p': 33, 'ı dünya çapında ünlüdür.': 254, 'iz': 73, 'er ': 84, 'mutfağ': 226, 'yapıyorum.': 197, '! N': 94, 'Yar':

2.2. WORDPIECE TOKENIZER

In [25]:
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers, decoders
import unicodedata

# Step 1: Applying normalization to clear invalid characters from the text
def apply_normalization(text):
    # Normalize the text to NFC (Normalization Form C)
    normalized_text = unicodedata.normalize("NFC", text)
    return normalized_text

# Step 2: Training the tokenizer according to the predetermined vocabulary size
def train_tokenizer(corpus, vocab_size):
    tokenizer = Tokenizer(models.WordPiece())
    trainer = trainers.WordPieceTrainer(special_tokens=["[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]"], vocab_size=vocab_size)
    tokenizer.train_from_iterator(corpus, trainer=trainer)
    return tokenizer

# Step 3: Processing the corpus with the trained tokenizer to obtain a tokenized pretraining data
def preprocess_with_tokenizer(tokenizer, text):
    # Lowercase conversion
    text = text.lower()

    # NFC normalization
    normalized_text = unicodedata.normalize("NFC", text)

    # Tokenize the text
    encoded_text = tokenizer.encode(normalized_text)
    return encoded_text.ids

# Sample usage
corpus = [
    "Merhaba! Nasılsın?",
    "Bugün hava çok güzel.",
    "Yarın buluşmak için plan yapıyoruz.",
    "Türk mutfağı dünya çapında ünlüdür.",
    "Köpeğimle parkta gezinti yapıyorum.",
    "Bir fincan kahve içer misin?",
    "Okula gitmek için erken kalkmalıyız.",
    "Sonbahar renkleri harika.",
    "Dizi izlemekten hoşlanıyorum.",
    "Eski dostlarla buluşmak her zaman keyiflidir."
]

# Step 1: Applying normalization to clear invalid characters from the text
corpus_normalized = [apply_normalization(text) for text in corpus]
print("Step 1: Applying normalization to clear invalid characters:")
print(corpus_normalized)

# Step 2: Training the tokenizer according to the predetermined vocabulary size
vocab_size = 1000  # You can adjust the vocabulary size based on your requirements
tokenizer = train_tokenizer(corpus_normalized, vocab_size)
print("\nStep 2: Trained Tokenizer:")
print(tokenizer.get_vocab())

# Step 3: Processing the corpus with the trained tokenizer to obtain a tokenized pretraining data
preprocessed_data = [preprocess_with_tokenizer(tokenizer, text) for text in corpus_normalized]
print("\nStep 3: Tokenized Pretraining Data:")
print(preprocessed_data)


Step 1: Applying normalization to clear invalid characters:
['Merhaba! Nasılsın?', 'Bugün hava çok güzel.', 'Yarın buluşmak için plan yapıyoruz.', 'Türk mutfağı dünya çapında ünlüdür.', 'Köpeğimle parkta gezinti yapıyorum.', 'Bir fincan kahve içer misin?', 'Okula gitmek için erken kalkmalıyız.', 'Sonbahar renkleri harika.', 'Dizi izlemekten hoşlanıyorum.', 'Eski dostlarla buluşmak her zaman keyiflidir.']

Step 2: Trained Tokenizer:
{'##leme': 215, '##da ünlü': 259, 'Yarın ': 241, '## d': 147, '##ız.': 184, 'Kö': 128, 'r': 34, '##ti ': 170, 'p': 33, '##! N': 190, '##güze': 224, 'Dizi izlemekten ho': 274, 'K': 12, '##haba': 220, 'So': 131, '##lü': 152, '##kalk': 209, 'b': 20, 'Tür': 132, '## r': 143, '## ': 49, 'Es': 127, '##ho': 174, '##lk': 149, 'Mer': 129, '##?': 74, 'c': 21, '##v': 78, '##u': 63, '##iz': 96, '##k ': 83, 'Sonba': 239, 'Bir f': 233, 'Ok': 130, '##a g': 117, '##N': 72, '##l': 50, '##lar': 155, '##ler': 154, '##eğ': 160, '##i': 47, '##s': 73, 'f': 24, '##ha': 106, '##erk

2.3. MORPHOLOGICAL TOKENIZER

**Zemberek for TurkishMorphology**

In [10]:
!pip install zemberek-python

Collecting zemberek-python
  Downloading zemberek_python-0.2.3-py3-none-any.whl (95.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m95.1/95.1 MB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting antlr4-python3-runtime==4.8 (from zemberek-python)
  Downloading antlr4-python3-runtime-4.8.tar.gz (112 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m112.4/112.4 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: antlr4-python3-runtime
  Building wheel for antlr4-python3-runtime (setup.py) ... [?25l[?25hdone
  Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.8-py3-none-any.whl size=141210 sha256=5b385faa33ac93118f40e9f502d50d773836bdaef63309c1cc8dd03d652aaacb
  Stored in directory: /root/.cache/pip/wheels/a7/20/bd/e1477d664f22d99989fd28ee1a43d6633dddb5cb9e801350d5
Successfully built antlr4-python3-runtime
Installing colle

In [8]:
from tokenizers import Tokenizer, models, normalizers, pre_tokenizers, trainers, decoders
import unicodedata
import re
from zemberek import TurkishMorphology
from bs4 import BeautifulSoup
from nltk.corpus import stopwords

# ... (Rest of the code from the previous example)

# Step 3: Processing the corpus with the trained tokenizer to obtain a tokenized pretraining data
def preprocess_with_tokenizer(tokenizer, morphology, text):
    # Lowercase conversion
    text = text.lower()

    # NFC normalization
    normalized_text = unicodedata.normalize("NFC", text)

    # Remove HTML tags using BeautifulSoup
    soup = BeautifulSoup(normalized_text, "html.parser")
    cleaned_text = soup.get_text()

    # Remove non-character symbols using regex
    cleaned_text = re.sub(r'[0123456789,/\.!?:‘’()"]', '', cleaned_text)

    # Remove stopwords in Turkish
    stop_words = set(stopwords.words("turkish"))
    words = cleaned_text.split()
    filtered_text = [word for word in words if word not in stop_words]

    # Tokenize the filtered text
    encoded_text = tokenizer.encode(" ".join(filtered_text))

    # Morphological-level tokenization using Zemberek
    tokenized_text = []
    for token in encoded_text.tokens:
        analysis_results = morphology.analyze(token)
        for analysis in analysis_results:
            morphemes = analysis.get_stem()
            tokenized_text.append(morphemes)

    return tokenized_text

# Sample usage
corpus = [
    "Merhaba! Nasılsın?",
    "Bugün hava çok güzel.",
    "Yarın buluşmak için plan yapıyoruz.",
    "Türk mutfağı dünya çapında ünlüdür.",
    "Köpeğimle parkta gezinti yapıyorum.",
    "Bir fincan kahve içer misin?",
    "Okula gitmek için erken kalkmalıyız.",
    "Sonbahar renkleri harika.",
    "Dizi izlemekten hoşlanıyorum.",
    "Eski dostlarla buluşmak her zaman keyiflidir."
]

# Step 1: Applying normalization to clear invalid characters from the text
corpus_normalized = [apply_normalization(text) for text in corpus]
print("Step 1: Applying normalization to clear invalid characters:")
print(corpus_normalized)

# Step 2: Training the tokenizer according to the predetermined vocabulary size
vocab_size = 1000  # You can adjust the vocabulary size based on your requirements
tokenizer = train_tokenizer(corpus_normalized, vocab_size)
print("\nStep 2: Trained Tokenizer:")
print(tokenizer.get_vocab())

# Step 3: Initialize Zemberek TurkishMorphology
morphology = TurkishMorphology.create_with_defaults()

# Step 3: Processing the corpus with the trained tokenizer to obtain a tokenized pretraining data
preprocessed_data = [preprocess_with_tokenizer(tokenizer, morphology, text) for text in corpus_normalized]
print("\nStep 3: Tokenized Pretraining Data:")
print(preprocessed_data)


Step 1: Applying normalization to clear invalid characters:
['Merhaba! Nasılsın?', 'Bugün hava çok güzel.', 'Yarın buluşmak için plan yapıyoruz.', 'Türk mutfağı dünya çapında ünlüdür.', 'Köpeğimle parkta gezinti yapıyorum.', 'Bir fincan kahve içer misin?', 'Okula gitmek için erken kalkmalıyız.', 'Sonbahar renkleri harika.', 'Dizi izlemekten hoşlanıyorum.', 'Eski dostlarla buluşmak her zaman keyiflidir.']

Step 2: Trained Tokenizer:
{'ul': 60, 'e ': 67, 'bul': 65, 'ı dünya çapında ': 234, 'a': 19, 'buluşma': 89, 'a g': 82, 'apında ': 176, 'k için plan ': 200, 'dür.': 186, 'e içer ': 187, 'şlanıyorum.': 235, 'ey': 115, 'k her ': 174, 'mu': 145, 'k için ': 88, 'eri ': 172, 'do': 110, 'T': 17, 'iz': 73, '[PAD]': 0, 'a çapında ': 199, 'enkl': 216, 'B': 9, 'i': 27, 'in': 56, 'N': 14, 'gün ha': 190, 'Dizi izlemekten ho': 243, 'şlan': 169, 'ç': 41, 'st': 158, 'ti ': 159, 'ı dü': 167, 'uş': 79, 'Y': 18, 'K': 12, 'ika': 127, 'r.': 156, 'ez': 116, 'tfağ': 160, 'ar': 49, 'har renkl': 239, 'nlü': 1

INFO:zemberek.morphology.turkish_morphology:TurkishMorphology instance initialized in 19.655850172042847


2023-07-22 13:35:19,072 - zemberek.morphology.turkish_morphology - INFO
Msg: TurkishMorphology instance initialized in 19.655850172042847


Step 3: Tokenized Pretraining Data:
[['m', 'er', 'er', 'er', 'er', 'er', 'ha', 'ha', 'ha', 'b', 'a', 'a', 'n', 'asıl', 'asıl', 'n'], ['b', 'u', 'v', 'a', 'a', 'güzel', 'güzel', 'güzel'], ['y', 'ar', 'ar', 'ın', 'buluş', 'buluş', 'k', 'plan', 'yap', 'z'], ['t', 'rk', 'ün', 'dü', 'r'], ['k', 'ö', 'imle', 'im', 'ezinti', 'yap', 'm'], ['b', 'ir', 'fincan', 'mis', 'in', 'in'], ['o', 'o', 'o', 'o', 'o', 'k', 'it', 'it', 'me', 'k', 'ı', 'z'], ['s', 'o', 'o', 'o', 'o', 'o', 'nba'], ['d', 'iz', 'm'], ['e', 'e', 's', 'ki', 'dost', 'dost', 'la', 'la', 'buluş', 'buluş', 'k']]


**3. Evaluation of Translation**

3.1. BLUE Score

In [None]:
{
  "references": [
    "Reference sentence 1",
    "Reference sentence 2",
    ...
  ],
  "model_outputs": [
    "Model output sentence 1",
    "Model output sentence 2",
    ...
  ]
}


In [None]:
import json

def read_json_document(json_path):
    with open(json_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    return data

# Example usage
json_document_path = "path_to_your_json_document.json"
json_data = read_json_document(json_document_path)

In [None]:
import nltk

def tokenize_sentence(sentence):
    return nltk.word_tokenize(sentence)

# Tokenize the reference and model output sentences
reference_sentences = [tokenize_sentence(sentence) for sentence in json_data["references"]]
model_output_sentences = [tokenize_sentence(sentence) for sentence in json_data["model_outputs"]]

In [None]:
from nltk.translate.bleu_score import SmoothingFunction

def calculate_bleu(candidate, references):
    # candidate: the machine-translated text as a list of tokens (words)
    # references: a list of reference translations, each as a list of tokens (words)

    # Calculate BLEU score
    smoothie = SmoothingFunction().method1  # Smoothing function for short sentences
   # smoothie = SmoothingFunction().method4 # Smoothing function for long sentences
    bleu_score = nltk.translate.bleu_score.sentence_bleu(references, candidate, smoothing_function=smoothie)

    return bleu_score

# Evaluate BLEU score for each pair of reference and model output sentences
for reference, model_output in zip(reference_sentences, model_output_sentences):
    bleu_score = calculate_bleu(model_output, [reference])
    print("BLEU score:", bleu_score)

Notes for method for smoothing function:

method0: Suitable for all sentence lengths, no added n-grams for zero counts.

method1: Effective for short sentences (4 tokens or fewer).

method2: Effective for sentences with 4 tokens or more.

method4: Effective for longer sentences.

In practice, it's a good idea to experiment with different smoothing methods and see which one works best for your specific dataset and translation model.

In [24]:
import nltk.translate.bleu_score as bleu

def calculate_bleu(candidate, references):
    # candidate: the machine-translated text as a string
    # references: a list of reference translations, each as a string

    # Tokenize the candidate and references
    candidate_tokens = candidate.split()
    reference_tokens = [ref.split() for ref in references]

    # Calculate BLEU score
    bleu_score = bleu.sentence_bleu(reference_tokens, candidate_tokens)

    return bleu_score

# Example usage with a JSON data object
json_data = {
    "candidates": [
        "hello how are you",
        "what is your name"
    ],
    "references": [
        ["hi how are you", "hello how are you", "how do you do"],
        ["what's your name", "what is your name", "tell me your name"]
    ]
}

# Calculate BLEU score for each candidate-reference pair
bleu_scores = []
for candidate, references in zip(json_data["candidates"], json_data["references"]):
    # Calculate BLEU score for the candidate-reference pair
    bleu_score = calculate_bleu(candidate, references)
    bleu_scores.append(bleu_score)

print("BLEU score:", bleu_scores)


BLEU score: [1.0, 1.0]


3.2. METEOR Score

METEOR provides a more comprehensive evaluation than BLEU, taking into account word order, synonyms, and paraphrases.

In [17]:
import nltk
from nltk.translate.meteor_score import meteor_score
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [15]:
def preprocess_sentence(sentence):
    # Tokenize the sentence
    return nltk.word_tokenize(sentence.lower())

def calculate_meteor(candidate, references):
    # Calculate METEOR score
    meteor_score_value = meteor_score(references, candidate)

    return meteor_score_value

# Example usage
def process_json_data(json_data):
    candidates = json_data["candidates"]
    reference_sentences = json_data["references"]

    # Tokenize candidate translations
    tokenized_candidates = [preprocess_sentence(candidate) for candidate in candidates]

    # Tokenize reference translations
    tokenized_references = [list(map(preprocess_sentence, references)) for references in reference_sentences]

    # Calculate METEOR score for each candidate
    meteor_scores = []
    for candidate, references in zip(tokenized_candidates, tokenized_references):
        meteor_score_value = calculate_meteor(candidate, references)
        meteor_scores.append(meteor_score_value)

    return meteor_scores

# Example usage with a JSON data object
json_data = {
    "candidates": [
        "hello how are you",
        "what is your name"
    ],
    "references": [
        ["hi how are you", "hello how are you", "how do you do"],
        ["what's your name", "what is your name", "tell me your name"]
    ]
}

In [18]:
meteor_scores = process_json_data(json_data)
print("METEOR scores:", meteor_scores)

METEOR scores: [0.9921875, 0.9921875]


3.3. ROUGE Score

In [2]:
!pip install rouge-score

Collecting rouge-score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge-score
  Building wheel for rouge-score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=d032b4c1b837b0d1c0db08c550c7529baee5375e434cd46fe400d3e15da9cbd1
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge-score
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2


In [23]:
from rouge_score import rouge_scorer

def calculate_rouge(candidate, references, rouge_type='rougeL'):
    # candidate: the machine-translated text as a string
    # references: a list of reference translations, each as a string
    # rouge_type: the type of ROUGE score to compute (default is ROUGE-L)

    # Create a ROUGE scorer
    rouge = rouge_scorer.RougeScorer([rouge_type])

    # Calculate ROUGE score
    scores = rouge.score(candidate, references)

    return scores[rouge_type].fmeasure

# Example usage with a JSON data object
json_data = {
    "candidates": [
        "hello how are you",
        "what is your name"
    ],
    "references": [
        ["hi how are you", "hello how are you", "how do you do"],
        ["what's your name", "what is your name", "tell me your name"]
    ]
}

# Calculate ROUGE score for each candidate-reference pair
rouge_scores = []
for candidate, references in zip(json_data["candidates"], json_data["references"]):
    # Join the list of references into a single string
    reference_text = " ".join(references)

    # Calculate ROUGE score for the candidate-reference pair
    rouge_score = calculate_rouge(candidate, reference_text)
    rouge_scores.append(rouge_score)

# Compute the average ROUGE score across all candidate-reference pairs
print("METEOR scores:", rouge_scores)

METEOR scores: [0.5, 0.5]
