<a href="https://colab.research.google.com/github/sadnec/NER-Yamb/blob/main/Building_a_NER_Dataset_for_the_Yambetta_Language.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Let's Import the libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy as sp

# Loading small version of a model to understand the english language
nlp = sp.load("en_core_web_sm")

Data Exploration and Cleaning

In [None]:
# df = pd.read_csv('data.csv')
df = pd.read_excel('Bible_EN_YAT.xlsx')

df.head()


Unnamed: 0,Verse ID,Chapter,Verse,Book (EN),Book (YAT),Bible text (EN),Bible text (YAT)
0,MAT.1.1,1,1,MATTHEW,MAʼTƐ́AS,This is the genealogy of Jesus the Messiah the...,"Táá wɔnɔ́ ná yoog ɛ pɔɔd yɛ́ Yə́sus Kilíʼtus, ..."
1,MAT.1.2,1,2,MATTHEW,MAʼTƐ́AS,"Abraham was the father of Isaac, Isaac the fat...","Ábɛlaam yiíbíən Ɛ́sag, Ɛ́sag əə́bíən Yáʼkɔb. Y..."
2,MAT.1.3,1,3,MATTHEW,MAʼTƐ́AS,"Judah the father of Perez and Zerah, whose mot...","Yúda əə́bíən na oʼkán Tamáal lɛ́ na Fálɛs, na ..."
3,MAT.1.4,1,4,MATTHEW,MAʼTƐ́AS,"Ram the father of Amminadab, Amminadab the fat...","Álam əə́bíən Amɛnadáab, Amɛnadáab əə́bíən Násɔ..."
4,MAT.1.5,1,5,MATTHEW,MAʼTƐ́AS,"Salmon the father of Boaz, whose mother was Ra...",Sálmɔn əə́bíən Póos. (Ŋŋí o Póos ayɛ́ɛ niiŋ lɛ...


In [None]:

# Load a generic SpaCy model for tokenization
nlp_en = sp.load("en_core_web_sm")  # Use SpaCy's English model for tokenization
nlp_yt = sp.blank("xx")  # "xx" is a blank model for basic tokenization in Yambetta / Default tokenizer (xx) taken from the Spacy Library

# Tokenize the English and Yambetta Bible texts
df['tokens_en'] = df['Bible text (EN)'].apply(lambda text: [token.text for token in nlp_en(text)])
df['tokens_yat'] = df['Bible text (YAT)'].apply(lambda text: [token.text for token in nlp_yt(text)])


# Display the DataFrame with tokenized texts
# print(df[['Bible text (EN)', 'tokens_en', 'Bible text (YAT)', 'tokens_yat']].head())



In [None]:
# Optionally, save the DataFrame with tokens to a new Excel file
df.to_excel('tokenized_dataset.xlsx', index=False)


In [None]:
def align_tokens(tokens_en, tokens_yat):
    """
      This function will take tokens_en, tokens_yat and try to align them
      side by side, to try to guess the corresponding Yambetta word for each English word
    """
    aligned = []
    min_len = min(len(tokens_en), len(tokens_yat))

    for i in range(min_len):
        aligned.append((tokens_en[i], tokens_yat[i]))

    return aligned

# Apply alignment to each row
df['aligned_tokens'] = df.apply(lambda row: align_tokens(row['tokens_en'], row['tokens_yat']), axis=1)

# Display the aligned tokens
print(df[['tokens_en', 'tokens_yat', 'aligned_tokens']].head())

# df.head()


                                           tokens_en  \
0  [This, is, the, genealogy, of, Jesus, the, Mes...   
1  [Abraham, was, the, father, of, Isaac, ,, Isaa...   
2  [Judah, the, father, of, Perez, and, Zerah, ,,...   
3  [Ram, the, father, of, Amminadab, ,, Amminadab...   
4  [Salmon, the, father, of, Boaz, ,, whose, moth...   

                                          tokens_yat  \
0  [Táá, wɔnɔ́, ná, yoog, ɛ, pɔɔd, yɛ́, Yə́sus, K...   
1  [Ábɛlaam, yiíbíən, Ɛ́sag, ,, Ɛ́sag, əə́bíən, Y...   
2  [Yúda, əə́bíən, na, oʼkán, Tamáal, lɛ́, na, Fá...   
3  [Álam, əə́bíən, Amɛnadáab, ,, Amɛnadáab, əə́bí...   
4  [Sálmɔn, əə́bíən, Póos, ., (, Ŋŋí, o, Póos, ay...   

                                      aligned_tokens  
0  [(This, Táá), (is, wɔnɔ́), (the, ná), (genealo...  
1  [(Abraham, Ábɛlaam), (was, yiíbíən), (the, Ɛ́s...  
2  [(Judah, Yúda), (the, əə́bíən), (father, na), ...  
3  [(Ram, Álam), (the, əə́bíən), (father, Amɛnadá...  
4  [(Salmon, Sálmɔn), (the, əə́bíən), (father, Pó..

**Step 1: Define the Tagging Scheme**  
   

Before we start labeling, we need to decide on a tagging scheme. Here's what we will be using:

0: Non-entity  
1: Person (PER)  
2: Organization (ORG)   
3: Location (LOC)   
4: Date/Time (DATE)   
5: Miscellaneous (MISC)  

In [None]:
# Function to get NER tags using SpaCy's English model
def get_ner_tags(tokens_en):
    """
      This function will take the english tokens, jon them and build a document with
      the nlp_en() model for treating and for each text, it will print near it the corresponding class
    """
    doc = nlp_en(" ".join(tokens_en))
    ner_tags = [ent.label_ if ent.label_ else 'O' for ent in doc.ents]
    return ner_tags

# Apply the NER tagging to English tokens
df['ner_tags_en'] = df['tokens_en'].apply(lambda tokens: get_ner_tags(tokens))

# Display the DataFrame with NER tags for English
# print(df[['tokens_en', 'ner_tags_en']].head())
df.head()

Unnamed: 0,Verse ID,Chapter,Verse,Book (EN),Book (YAT),Bible text (EN),Bible text (YAT),tokens_en,tokens_yat,ner_tags_en
0,MAT.1.1,1,1,MATTHEW,MAʼTƐ́AS,This is the genealogy of Jesus the Messiah the...,"Táá wɔnɔ́ ná yoog ɛ pɔɔd yɛ́ Yə́sus Kilíʼtus, ...","[This, is, the, genealogy, of, Jesus, the, Mes...","[Táá, wɔnɔ́, ná, yoog, ɛ, pɔɔd, yɛ́, Yə́sus, K...","[PERSON, PERSON, PERSON]"
1,MAT.1.2,1,2,MATTHEW,MAʼTƐ́AS,"Abraham was the father of Isaac, Isaac the fat...","Ábɛlaam yiíbíən Ɛ́sag, Ɛ́sag əə́bíən Yáʼkɔb. Y...","[Abraham, was, the, father, of, Isaac, ,, Isaa...","[Ábɛlaam, yiíbíən, Ɛ́sag, ,, Ɛ́sag, əə́bíən, Y...","[PERSON, PERSON, PERSON, PERSON, PERSON, GPE]"
2,MAT.1.3,1,3,MATTHEW,MAʼTƐ́AS,"Judah the father of Perez and Zerah, whose mot...","Yúda əə́bíən na oʼkán Tamáal lɛ́ na Fálɛs, na ...","[Judah, the, father, of, Perez, and, Zerah, ,,...","[Yúda, əə́bíən, na, oʼkán, Tamáal, lɛ́, na, Fá...","[PERSON, PERSON, PERSON, ORG, ORG, GPE]"
3,MAT.1.4,1,4,MATTHEW,MAʼTƐ́AS,"Ram the father of Amminadab, Amminadab the fat...","Álam əə́bíən Amɛnadáab, Amɛnadáab əə́bíən Násɔ...","[Ram, the, father, of, Amminadab, ,, Amminadab...","[Álam, əə́bíən, Amɛnadáab, ,, Amɛnadáab, əə́bí...","[ORG, ORG, PERSON, PERSON, ORG]"
4,MAT.1.5,1,5,MATTHEW,MAʼTƐ́AS,"Salmon the father of Boaz, whose mother was Ra...",Sálmɔn əə́bíən Póos. (Ŋŋí o Póos ayɛ́ɛ niiŋ lɛ...,"[Salmon, the, father, of, Boaz, ,, whose, moth...","[Sálmɔn, əə́bíən, Póos, ., (, Ŋŋí, o, Póos, ay...","[PERSON, PERSON, PERSON, PERSON, PERSON]"


In [None]:
# Transfer the NER tags to Yambetta based on alignment
def transfer_ner_tags(aligned_tokens, ner_tags_en):
    """
    This function will transfer the ner tags from the english words to the Yambetta ones,
     using the aligned_tokens list and will produce a list of Yambetta NER tags.
    """
    ner_tags_yat = []
    j = 0  # Index to track the position in ner_tags_en
    for en_token, yat_token in aligned_tokens:
        if j < len(ner_tags_en):
            ner_tag = ner_tags_en[j]
            j += 1
        else:
            ner_tag = 0  # 0 for non-entity
        ner_tags_yat.append(ner_tag)
    return ner_tags_yat

df['ner_tags_yat'] = df.apply(lambda row: transfer_ner_tags(row['aligned_tokens'], row['ner_tags_en']), axis=1)

# Display the DataFrame with NER tags for Yambetta
# print(df[['tokens_yat', 'ner_tags_yat']].head())
df.head()


Unnamed: 0,Verse ID,Chapter,Verse,Book (EN),Book (YAT),Bible text (EN),Bible text (YAT),tokens_en,tokens_yat,aligned_tokens,ner_tags_en,ner_tags_yat
0,MAT.1.1,1,1,MATTHEW,MAʼTƐ́AS,This is the genealogy of Jesus the Messiah the...,"Táá wɔnɔ́ ná yoog ɛ pɔɔd yɛ́ Yə́sus Kilíʼtus, ...","[This, is, the, genealogy, of, Jesus, the, Mes...","[Táá, wɔnɔ́, ná, yoog, ɛ, pɔɔd, yɛ́, Yə́sus, K...","[(This, Táá), (is, wɔnɔ́), (the, ná), (genealo...","[PERSON, PERSON, PERSON]","[PERSON, PERSON, PERSON, 0, 0, 0, 0, 0, 0, 0, ..."
1,MAT.1.2,1,2,MATTHEW,MAʼTƐ́AS,"Abraham was the father of Isaac, Isaac the fat...","Ábɛlaam yiíbíən Ɛ́sag, Ɛ́sag əə́bíən Yáʼkɔb. Y...","[Abraham, was, the, father, of, Isaac, ,, Isaa...","[Ábɛlaam, yiíbíən, Ɛ́sag, ,, Ɛ́sag, əə́bíən, Y...","[(Abraham, Ábɛlaam), (was, yiíbíən), (the, Ɛ́s...","[PERSON, PERSON, PERSON, PERSON, PERSON, GPE]","[PERSON, PERSON, PERSON, PERSON, PERSON, GPE, ..."
2,MAT.1.3,1,3,MATTHEW,MAʼTƐ́AS,"Judah the father of Perez and Zerah, whose mot...","Yúda əə́bíən na oʼkán Tamáal lɛ́ na Fálɛs, na ...","[Judah, the, father, of, Perez, and, Zerah, ,,...","[Yúda, əə́bíən, na, oʼkán, Tamáal, lɛ́, na, Fá...","[(Judah, Yúda), (the, əə́bíən), (father, na), ...","[PERSON, PERSON, PERSON, ORG, ORG, GPE]","[PERSON, PERSON, PERSON, ORG, ORG, GPE, 0, 0, ..."
3,MAT.1.4,1,4,MATTHEW,MAʼTƐ́AS,"Ram the father of Amminadab, Amminadab the fat...","Álam əə́bíən Amɛnadáab, Amɛnadáab əə́bíən Násɔ...","[Ram, the, father, of, Amminadab, ,, Amminadab...","[Álam, əə́bíən, Amɛnadáab, ,, Amɛnadáab, əə́bí...","[(Ram, Álam), (the, əə́bíən), (father, Amɛnadá...","[ORG, ORG, PERSON, PERSON, ORG]","[ORG, ORG, PERSON, PERSON, ORG, 0, 0, 0, 0, 0,..."
4,MAT.1.5,1,5,MATTHEW,MAʼTƐ́AS,"Salmon the father of Boaz, whose mother was Ra...",Sálmɔn əə́bíən Póos. (Ŋŋí o Póos ayɛ́ɛ niiŋ lɛ...,"[Salmon, the, father, of, Boaz, ,, whose, moth...","[Sálmɔn, əə́bíən, Póos, ., (, Ŋŋí, o, Póos, ay...","[(Salmon, Sálmɔn), (the, əə́bíən), (father, Pó...","[PERSON, PERSON, PERSON, PERSON, PERSON]","[PERSON, PERSON, PERSON, PERSON, PERSON, 0, 0,..."


In [None]:
print(df.head())

  Verse ID  Chapter  Verse Book (EN) Book (YAT)  \
0  MAT.1.1        1      1   MATTHEW   MAʼTƐ́AS   
1  MAT.1.2        1      2   MATTHEW   MAʼTƐ́AS   
2  MAT.1.3        1      3   MATTHEW   MAʼTƐ́AS   
3  MAT.1.4        1      4   MATTHEW   MAʼTƐ́AS   
4  MAT.1.5        1      5   MATTHEW   MAʼTƐ́AS   

                                     Bible text (EN)  \
0  This is the genealogy of Jesus the Messiah the...   
1  Abraham was the father of Isaac, Isaac the fat...   
2  Judah the father of Perez and Zerah, whose mot...   
3  Ram the father of Amminadab, Amminadab the fat...   
4  Salmon the father of Boaz, whose mother was Ra...   

                                    Bible text (YAT)  \
0  Táá wɔnɔ́ ná yoog ɛ pɔɔd yɛ́ Yə́sus Kilíʼtus, ...   
1  Ábɛlaam yiíbíən Ɛ́sag, Ɛ́sag əə́bíən Yáʼkɔb. Y...   
2  Yúda əə́bíən na oʼkán Tamáal lɛ́ na Fálɛs, na ...   
3  Álam əə́bíən Amɛnadáab, Amɛnadáab əə́bíən Násɔ...   
4  Sálmɔn əə́bíən Póos. (Ŋŋí o Póos ayɛ́ɛ niiŋ lɛ...   

                   

In [None]:
# Save the labeled DataFrame to an Excel file
# df[['ner_tags_yat','tokens_yat']].to_excel('ner_tags_tokens_yambetta.xlsx', index=False)
df.to_excel('full_data_set.xlsx', index=False)


In [None]:
df = pd.read_excel('full_data_set.xlsx')

tag2id = {
    "O": 0,        # Non-entity
    "PERSON": 1,   # Person names
    "ORG": 2,      # Organizations
    "GPE": 3,      # Geo-political entities
    "MISC": 4,      # Miscellaneous entities
}

df.head()

Unnamed: 0,Verse ID,Chapter,Verse,Book (EN),Book (YAT),Bible text (EN),Bible text (YAT),tokens_en,tokens_yat,aligned_tokens,ner_tags_en,ner_tags_yat
0,MAT.1.1,1,1,MATTHEW,MAʼTƐ́AS,This is the genealogy of Jesus the Messiah the...,"Táá wɔnɔ́ ná yoog ɛ pɔɔd yɛ́ Yə́sus Kilíʼtus, ...","['This', 'is', 'the', 'genealogy', 'of', 'Jesu...","['Táá', 'wɔnɔ́', 'ná', 'yoog', 'ɛ', 'pɔɔd', 'y...","[('This', 'Táá'), ('is', 'wɔnɔ́'), ('the', 'ná...","['PERSON', 'PERSON', 'PERSON']","['PERSON', 'PERSON', 'PERSON', 0, 0, 0, 0, 0, ..."
1,MAT.1.2,1,2,MATTHEW,MAʼTƐ́AS,"Abraham was the father of Isaac, Isaac the fat...","Ábɛlaam yiíbíən Ɛ́sag, Ɛ́sag əə́bíən Yáʼkɔb. Y...","['Abraham', 'was', 'the', 'father', 'of', 'Isa...","['Ábɛlaam', 'yiíbíən', 'Ɛ́sag', ',', 'Ɛ́sag', ...","[('Abraham', 'Ábɛlaam'), ('was', 'yiíbíən'), (...","['PERSON', 'PERSON', 'PERSON', 'PERSON', 'PERS...","['PERSON', 'PERSON', 'PERSON', 'PERSON', 'PERS..."
2,MAT.1.3,1,3,MATTHEW,MAʼTƐ́AS,"Judah the father of Perez and Zerah, whose mot...","Yúda əə́bíən na oʼkán Tamáal lɛ́ na Fálɛs, na ...","['Judah', 'the', 'father', 'of', 'Perez', 'and...","['Yúda', 'əə́bíən', 'na', 'oʼkán', 'Tamáal', '...","[('Judah', 'Yúda'), ('the', 'əə́bíən'), ('fath...","['PERSON', 'PERSON', 'PERSON', 'ORG', 'ORG', '...","['PERSON', 'PERSON', 'PERSON', 'ORG', 'ORG', '..."
3,MAT.1.4,1,4,MATTHEW,MAʼTƐ́AS,"Ram the father of Amminadab, Amminadab the fat...","Álam əə́bíən Amɛnadáab, Amɛnadáab əə́bíən Násɔ...","['Ram', 'the', 'father', 'of', 'Amminadab', ',...","['Álam', 'əə́bíən', 'Amɛnadáab', ',', 'Amɛnadá...","[('Ram', 'Álam'), ('the', 'əə́bíən'), ('father...","['ORG', 'ORG', 'PERSON', 'PERSON', 'ORG']","['ORG', 'ORG', 'PERSON', 'PERSON', 'ORG', 0, 0..."
4,MAT.1.5,1,5,MATTHEW,MAʼTƐ́AS,"Salmon the father of Boaz, whose mother was Ra...",Sálmɔn əə́bíən Póos. (Ŋŋí o Póos ayɛ́ɛ niiŋ lɛ...,"['Salmon', 'the', 'father', 'of', 'Boaz', ',',...","['Sálmɔn', 'əə́bíən', 'Póos', '.', '(', 'Ŋŋí',...","[('Salmon', 'Sálmɔn'), ('the', 'əə́bíən'), ('f...","['PERSON', 'PERSON', 'PERSON', 'PERSON', 'PERS...","['PERSON', 'PERSON', 'PERSON', 'PERSON', 'PERS..."


In [None]:
# Function to convert NER tags from string labels to numerical labels
def convert_tags_to_ids(ner_tags, tag2id):
    return [tag2id.get(tag, 0) for tag in ner_tags]  # Default to 0 (O) if tag is not found

# Apply the conversion to the 'ner_tags_yat' column
df['ner_tags_yat_ids'] = df['ner_tags_yat'].apply(lambda tags: convert_tags_to_ids(eval(tags), tag2id))  # Convert strings to lists first

# Display the updated DataFrame
# print(df[['tokens_yat', 'ner_tags_yat', 'ner_tags_yat_ids']].head())
df.head()


Unnamed: 0,Verse ID,Chapter,Verse,Book (EN),Book (YAT),Bible text (EN),Bible text (YAT),tokens_en,tokens_yat,aligned_tokens,ner_tags_en,ner_tags_yat,ner_tags_yat_ids
0,MAT.1.1,1,1,MATTHEW,MAʼTƐ́AS,This is the genealogy of Jesus the Messiah the...,"Táá wɔnɔ́ ná yoog ɛ pɔɔd yɛ́ Yə́sus Kilíʼtus, ...","['This', 'is', 'the', 'genealogy', 'of', 'Jesu...","['Táá', 'wɔnɔ́', 'ná', 'yoog', 'ɛ', 'pɔɔd', 'y...","[('This', 'Táá'), ('is', 'wɔnɔ́'), ('the', 'ná...","['PERSON', 'PERSON', 'PERSON']","['PERSON', 'PERSON', 'PERSON', 0, 0, 0, 0, 0, ...","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,MAT.1.2,1,2,MATTHEW,MAʼTƐ́AS,"Abraham was the father of Isaac, Isaac the fat...","Ábɛlaam yiíbíən Ɛ́sag, Ɛ́sag əə́bíən Yáʼkɔb. Y...","['Abraham', 'was', 'the', 'father', 'of', 'Isa...","['Ábɛlaam', 'yiíbíən', 'Ɛ́sag', ',', 'Ɛ́sag', ...","[('Abraham', 'Ábɛlaam'), ('was', 'yiíbíən'), (...","['PERSON', 'PERSON', 'PERSON', 'PERSON', 'PERS...","['PERSON', 'PERSON', 'PERSON', 'PERSON', 'PERS...","[1, 1, 1, 1, 1, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,MAT.1.3,1,3,MATTHEW,MAʼTƐ́AS,"Judah the father of Perez and Zerah, whose mot...","Yúda əə́bíən na oʼkán Tamáal lɛ́ na Fálɛs, na ...","['Judah', 'the', 'father', 'of', 'Perez', 'and...","['Yúda', 'əə́bíən', 'na', 'oʼkán', 'Tamáal', '...","[('Judah', 'Yúda'), ('the', 'əə́bíən'), ('fath...","['PERSON', 'PERSON', 'PERSON', 'ORG', 'ORG', '...","['PERSON', 'PERSON', 'PERSON', 'ORG', 'ORG', '...","[1, 1, 1, 2, 2, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,MAT.1.4,1,4,MATTHEW,MAʼTƐ́AS,"Ram the father of Amminadab, Amminadab the fat...","Álam əə́bíən Amɛnadáab, Amɛnadáab əə́bíən Násɔ...","['Ram', 'the', 'father', 'of', 'Amminadab', ',...","['Álam', 'əə́bíən', 'Amɛnadáab', ',', 'Amɛnadá...","[('Ram', 'Álam'), ('the', 'əə́bíən'), ('father...","['ORG', 'ORG', 'PERSON', 'PERSON', 'ORG']","['ORG', 'ORG', 'PERSON', 'PERSON', 'ORG', 0, 0...","[2, 2, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0]"
4,MAT.1.5,1,5,MATTHEW,MAʼTƐ́AS,"Salmon the father of Boaz, whose mother was Ra...",Sálmɔn əə́bíən Póos. (Ŋŋí o Póos ayɛ́ɛ niiŋ lɛ...,"['Salmon', 'the', 'father', 'of', 'Boaz', ',',...","['Sálmɔn', 'əə́bíən', 'Póos', '.', '(', 'Ŋŋí',...","[('Salmon', 'Sálmɔn'), ('the', 'əə́bíən'), ('f...","['PERSON', 'PERSON', 'PERSON', 'PERSON', 'PERS...","['PERSON', 'PERSON', 'PERSON', 'PERSON', 'PERS...","[1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [None]:
data = []
for idx, row in df.iterrows():
    tokens = eval(row['tokens_yat'])  # Convert string to list
    ner_tags = row['ner_tags_yat_ids']

    entry = {
        "id": str(idx),  # Example of using row index as ID
        "tokens": tokens,
        "ner_tags": ner_tags
    }
    data.append(entry)

# Example of how an entry might look
print(data[0])



{'id': '0', 'tokens': ['Táá', 'wɔnɔ́', 'ná', 'yoog', 'ɛ', 'pɔɔd', 'yɛ́', 'Yə́sus', 'Kilíʼtus', ',', 'kɛnannán', 'kɛ́', 'Tə́fid', 'nyɔ́', 'ayɛ́ɛ', 'nyɔ́lɛ́nyɔ́amɔɛ́d', 'tɛn', 'kɛnannán', 'kɛ́', 'Ábɛlaam', 'əyə́biə́níí', 'a', 'yɛ́lɛ́', 'aa', 'yɛ́ɛnɛ', 'pálɛɛ́', 'ɔsɔ́g', 'pɔ́nɔ́', ':'], 'ner_tags': [1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]}


In [None]:

# Save to JSON for use in training
import json
with open('ner_dataset_yambetta.json', 'w', encoding='utf-8') as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

In [None]:
# df.to_excel('final_dataset.xlsx', index=False)
df.head()

Unnamed: 0,Verse ID,Chapter,Verse,Book (EN),Book (YAT),Bible text (EN),Bible text (YAT),tokens_en,tokens_yat,aligned_tokens,ner_tags_en,ner_tags_yat
0,MAT.1.1,1,1,MATTHEW,MAʼTƐ́AS,This is the genealogy of Jesus the Messiah the...,"Táá wɔnɔ́ ná yoog ɛ pɔɔd yɛ́ Yə́sus Kilíʼtus, ...","['This', 'is', 'the', 'genealogy', 'of', 'Jesu...","[Táá, wɔnɔ́, ná, yoog, ɛ, pɔɔd, yɛ́, Yə́sus, K...","[('This', 'Táá'), ('is', 'wɔnɔ́'), ('the', 'ná...","['PERSON', 'PERSON', 'PERSON']","['PERSON', 'PERSON', 'PERSON', 0, 0, 0, 0, 0, ..."
1,MAT.1.2,1,2,MATTHEW,MAʼTƐ́AS,"Abraham was the father of Isaac, Isaac the fat...","Ábɛlaam yiíbíən Ɛ́sag, Ɛ́sag əə́bíən Yáʼkɔb. Y...","['Abraham', 'was', 'the', 'father', 'of', 'Isa...","[Ábɛlaam, yiíbíən, Ɛ́sag, ,, Ɛ́sag, əə́bíən, Y...","[('Abraham', 'Ábɛlaam'), ('was', 'yiíbíən'), (...","['PERSON', 'PERSON', 'PERSON', 'PERSON', 'PERS...","['PERSON', 'PERSON', 'PERSON', 'PERSON', 'PERS..."
2,MAT.1.3,1,3,MATTHEW,MAʼTƐ́AS,"Judah the father of Perez and Zerah, whose mot...","Yúda əə́bíən na oʼkán Tamáal lɛ́ na Fálɛs, na ...","['Judah', 'the', 'father', 'of', 'Perez', 'and...","[Yúda, əə́bíən, na, oʼkán, Tamáal, lɛ́, na, Fá...","[('Judah', 'Yúda'), ('the', 'əə́bíən'), ('fath...","['PERSON', 'PERSON', 'PERSON', 'ORG', 'ORG', '...","['PERSON', 'PERSON', 'PERSON', 'ORG', 'ORG', '..."
3,MAT.1.4,1,4,MATTHEW,MAʼTƐ́AS,"Ram the father of Amminadab, Amminadab the fat...","Álam əə́bíən Amɛnadáab, Amɛnadáab əə́bíən Násɔ...","['Ram', 'the', 'father', 'of', 'Amminadab', ',...","[Álam, əə́bíən, Amɛnadáab, ,, Amɛnadáab, əə́bí...","[('Ram', 'Álam'), ('the', 'əə́bíən'), ('father...","['ORG', 'ORG', 'PERSON', 'PERSON', 'ORG']","['ORG', 'ORG', 'PERSON', 'PERSON', 'ORG', 0, 0..."
4,MAT.1.5,1,5,MATTHEW,MAʼTƐ́AS,"Salmon the father of Boaz, whose mother was Ra...",Sálmɔn əə́bíən Póos. (Ŋŋí o Póos ayɛ́ɛ niiŋ lɛ...,"['Salmon', 'the', 'father', 'of', 'Boaz', ',',...","[Sálmɔn, əə́bíən, Póos, ., (, Ŋŋí, o, Póos, ay...","[('Salmon', 'Sálmɔn'), ('the', 'əə́bíən'), ('f...","['PERSON', 'PERSON', 'PERSON', 'PERSON', 'PERS...","['PERSON', 'PERSON', 'PERSON', 'PERSON', 'PERS..."


In [None]:
import pandas as pd
import json



tag2id = {
    "0": 0,        # Non-entity
    "PERSON": 1,   # Person names
    "ORG": 2,      # Organizations
    "GPE": 3,      # Geo-political entities
    "MISC": 4,      # Miscellaneous entities
}
# Reverse mapping for readability
id2tag = {v: k for k, v in tag2id.items()}

# Load the dataset (choose either JSON or Excel based on your current format)
df = pd.read_excel("final_dataset.xlsx")
# Ensure the tokens and ner_tags are properly converted from string
df['tokens_yat'] = df['tokens_yat'].apply(eval)
df['ner_tags_yat_ids'] = df['ner_tags_yat_ids'].apply(eval)
dataset =  df.to_dict(orient='records')

print(dataset)

# Validate the dataset
def validate_ner_dataset(dataset, tag2id):
    issues = []

    for entry in dataset:
        tokens = entry['tokens_yat']
        ner_tags = entry['ner_tags_yat']

        # Check if the length of tokens and ner_tags match
        if len(tokens) != len(ner_tags):
            issues.append({
                "id": entry['Verse ID'],
                "issue": f"Token count {len(tokens)} doesn't match NER tag count {len(ner_tags)}"
            })

        # Check for invalid NER tag values
        for tag in ner_tags:
            if tag not in tag2id.values():
                issues.append({
                    "id": entry['Verse ID'],
                    "issue": f"Invalid NER tag {tag}"
                })

    return issues

# Load your dataset

# Validate the dataset
validation_issues = validate_ner_dataset(dataset, tag2id)

# Report validation results
if validation_issues:
    print(f"Found {len(validation_issues)} issues:")
    for issue in validation_issues:
        print(f"Entry {issue['id']}: {issue['issue']}")
else:
    print("No issues found, the dataset looks good!")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Entry REV.20.13: Invalid NER tag  
Entry REV.20.13: Invalid NER tag 0
Entry REV.20.13: Invalid NER tag ,
Entry REV.20.13: Invalid NER tag  
Entry REV.20.13: Invalid NER tag 0
Entry REV.20.13: Invalid NER tag ,
Entry REV.20.13: Invalid NER tag  
Entry REV.20.13: Invalid NER tag 0
Entry REV.20.13: Invalid NER tag ,
Entry REV.20.13: Invalid NER tag  
Entry REV.20.13: Invalid NER tag 0
Entry REV.20.13: Invalid NER tag ,
Entry REV.20.13: Invalid NER tag  
Entry REV.20.13: Invalid NER tag 0
Entry REV.20.13: Invalid NER tag ,
Entry REV.20.13: Invalid NER tag  
Entry REV.20.13: Invalid NER tag 0
Entry REV.20.13: Invalid NER tag ,
Entry REV.20.13: Invalid NER tag  
Entry REV.20.13: Invalid NER tag 0
Entry REV.20.13: Invalid NER tag ,
Entry REV.20.13: Invalid NER tag  
Entry REV.20.13: Invalid NER tag 0
Entry REV.20.13: Invalid NER tag ,
Entry REV.20.13: Invalid NER tag  
Entry REV.20.13: Invalid NER tag 0
Entry REV.20.13: Invalid 