# Convert to standard Farsi

In [1]:
from tqdm.notebook import tqdm
import pandas as pd
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed

# API base URL
API_BASE_URL = "https://bible.peplamb.com/api/parse"
#API_BASE_URL = "http://localhost:3080/api/parse"
#API_BASE_URL = "http://localhost:4200/api/parse"


import unicodedata

def normalize_text(text):
    """Return *text* in Unicode NFC (canonical composition) form.

    Composing first turns a base letter followed by a combining mark (for
    example Alef + Maddah Above) into its single precomposed code point, so
    the one-character substitutions in convert_to_standard_farsi can match it.
    """
    return unicodedata.normalize("NFC", text)


# One-character substitutions applied by convert_to_standard_farsi, built once
# instead of on every call. Invisible and combining characters are written as
# escapes so the table stays readable in any editor.
# The Arabic Yeh (U+064A) -> Farsi Yeh mapping is currently disabled.
_FARSI_CHAR_TABLE = str.maketrans({
    # Letter variants
    "\u0649": "\u06cc",  # Alef Maksura -> Farsi Yeh
    "\u0624": "\u0648",  # Waw with Hamza Above -> Waw
    "\u0640": "",        # Tatweel (kashida) removed
    "\u0622": "\u0627",  # Alef with Madda Above -> Alef

    # Diacritics removed
    "\u0650": "",  # Kasra
    "\u064f": "",  # Damma
    "\u064e": "",  # Fatha
    "\u0651": "",  # Shadda
    "\u064c": "",  # Tanvin Damm (Dammatan)
    "\u064b": "",  # Tanvin Fath (Fathatan)
    "\u064d": "",  # Tanvin Kasr (Kasratan)

    # Zero-width and directional characters
    "\u200c": " ",  # ZWNJ -> space
    "\u200b": "",   # Zero-width space removed
    "\u200d": "",   # Zero-width joiner removed
    "\u200e": "",   # LTR mark removed
    "\u200f": "",   # RTL mark removed

    # Additional standardizations
    "\u0643": "\u06a9",  # Arabic Kaf -> Farsi Keh
    "\u0638": "\u0636",  # Zah -> Dad ("similar sounds" in the original table)
})


def convert_to_standard_farsi(text):
    """
    Converts text to standardized Farsi characters used in the working parser.

    The text is NFC-normalized, every character listed in the substitution
    table is replaced or dropped, and finally all whitespace runs are
    collapsed to single spaces with leading/trailing whitespace removed.

    Args:
        text (str): Input text containing Farsi/Arabic characters

    Returns:
        str: Converted text with standardized characters
    """
    # A single translate() pass replaces the former character-by-character
    # loop that grew the result string with repeated concatenation.
    converted = normalize_text(text).translate(_FARSI_CHAR_TABLE)

    # split() with no argument splits on any whitespace run, so joining with a
    # single space both collapses repeats (including those introduced by the
    # ZWNJ -> space rule) and trims the ends.
    return ' '.join(converted.split())

# def convert_to_standard_farsi(text):
#     """
#     Converts text to standardized Farsi characters used in the working parser.

#     Args:
#         text (str): Input text containing Farsi/Arabic characters

#     Returns:
#         str: Converted text with standardized characters
#     """
#     # Character mapping dictionary
#     char_map = {
#         # Yeh variants
#         # 'ي': 'ی',  # Arabic Yeh to Farsi Yeh
#         'ى': 'ی',  # Alef Maksura to Farsi Yeh

#         # Remove diacritics and special markers
#         'ِ': '',   # Kasra
#         'ُ': '',   # Damma
#         'َ': '',   # Fatha
#         'ّ': '',   # Shadda
#         'ٌ': '',   # Tanvin Damm
#         'ً': '',   # Tanvin Fath
#         'ٍ': '',   # Tanvin Kasr

#         # # Standardize hamza variants
#         # 'أ': 'ا',
#         # 'إ': 'ا',
#         # 'ؤ': 'و',
#         # 'ئ': 'ی',

#         # Zero-width & spacing characters
#         '\u200c': ' ',  # Convert ZWNJ to space
#         '\u200b': '',   # Remove zero-width space
#         '\u200d': '',   # Remove zero-width joiner
#         '\u200e': '',   # Remove LTR mark
#         '\u200f': '',   # Remove RTL mark
#         '‌': ' ',       # Convert ZWNJ to space

#         # # Special case conversions
#         # 'ط': 'ت',      # Convert Ta variants
#         # 'ة': 'ه',      # Convert Teh Marbuta to Heh

#         # Additional standardizations
#         'ك': 'ک',      # Arabic Kaf to Farsi Keh
#         'ظ': 'ض',      # Standardize similar sounds


#         # Arabic diacritical marks (common to all)
#         '\u0610': '',  # Arabic sign Sallallahou Alayhe Wasallam
#         '\u0611': '',  # Arabic sign Alayhe Assallam
#         '\u0612': '',  # Arabic sign Rahmatullah Alayhe
#         '\u0613': '',  # Arabic sign Radi Allahou Anhu
#         '\u0614': '',  # Arabic sign Takhallus
#         '\u0615': '',  # Arabic small high Tah
#         '\u0616': '',  # Arabic small high Ligature Alef with Lam with Yeh
#         '\u0617': '',  # Arabic small high Zain
#         '\u0618': '',  # Arabic small Fatha
#         '\u0619': '',  # Arabic small Damma
#         '\u061A': '',  # Arabic small Kasra
#         '\u064B': '',  # Arabic Fathatan
#         '\u064C': '',  # Arabic Dammatan
#         '\u064D': '',  # Arabic Kasratan
#         '\u064E': '',  # Arabic Fatha
#         '\u064F': '',  # Arabic Damma
#         '\u0650': '',  # Arabic Kasra
#         '\u0651': '',  # Arabic Shadda
#         '\u0652': '',  # Arabic Sukun
#         '\u0653': '',  # Arabic Maddah Above
#         '\u0654': '',  # Arabic Hamza Above
#         '\u0655': '',  # Arabic Hamza Below
#         '\u0656': '',  # Arabic Subscript Alef
#         '\u0657': '',  # Arabic Inverted Damma
#         '\u0658': '',  # Arabic Mark Noon Ghunnah
#         '\u0659': '',  # Arabic Zavith
#         '\u065A': '',  # Arabic Vowel Sign Small V Above
#         '\u065B': '',  # Arabic Vowel Sign Inverted Small V Above
#         '\u065C': '',  # Arabic Vowel Sign Dot Below
#         '\u065D': '',  # Arabic Reversed Damma
#         '\u065E': '',  # Arabic Fatha with Two Dots
#         '\u065F': '',  # Arabic Wavy Hamza Below

#         # Quranic annotations
#         '\u06D6': '',  # Arabic Small High Ligature Sad with Lam with Alef Maksura
#         '\u06D7': '',  # Arabic Small High Ligature Qaf with Lam with Alef Maksura
#         '\u06D8': '',  # Arabic Small High Meem Initial Form
#         '\u06D9': '',  # Arabic Small High Lam Alef
#         '\u06DA': '',  # Arabic Small High Jeem
#         '\u06DB': '',  # Arabic Small High Three Dots
#         '\u06DC': '',  # Arabic Small High Seen
#         '\u06DF': '',  # Arabic Small High Rounded Zero
#         '\u06E0': '',  # Arabic Small High U
#         '\u06E1': '',  # Arabic Small High Maddah
#         '\u06E2': '',  # Arabic Small High Noon
#         '\u06E3': '',  # Arabic Small Low Seen
#         '\u06E4': '',  # Arabic Small High Yeh
#         '\u06E5': '',  # Arabic Small Waw
#         '\u06E6': '',  # Arabic Small Yeh
#         '\u06E7': '',  # Arabic Small High Yeh Barree
#         '\u06E8': '',  # Arabic Small High Noon with Kasra
#         '\u06EA': '',  # Arabic Empty Centre Low Stop
#         '\u06EB': '',  # Arabic Empty Centre High Stop
#         '\u06EC': '',  # Arabic Rounded High Stop with Filled Centre
#         '\u06ED': '',  # Arabic Small Low Meem
#     }

#     # Convert text character by character
#     result = ''
#     i = 0
#     while i < len(text):
#         # # Check for double letters (like یی) and convert to single
#         # if i + 1 < len(text) and text[i] == text[i + 1]:
#         #     if text[i] in 'یيىکكظطضصشسژزرذدحچجثتپباء':
#         #     #if text[i] in 'یيىکكظطضصشسژزرذدحچجثتپباءلم':
#         #         result += text[i]
#         #         i += 2
#         #         continue

#         # Regular character conversion
#         char = text[i]
#         result += char_map.get(char, char)
#         i += 1

#     # Clean up multiple spaces
#     result = ' '.join(result.split())

#     return result

# def process_farsi_books(farsi_books):
#     books_with_good_parse = []
#     books_with_bad_parse = []
#     bad_count = 0

#     # Use tqdm to track progress
#     for farsi, english in tqdm(farsi_books, desc="Processing Farsi Books"):
#         farsi = convert_to_standard_farsi(farsi)
#         params = {
#             "references": farsi,
#             "splitRange": "true",
#         }
#         try:
#             response = requests.get(API_BASE_URL, params=params, timeout=10)
#             if response.status_code == 200:
#                 data = response.json()
#                 if len(data) == 0:
#                     bad_count += 1
#                     #print(f"Book with no parsed references: {bad_count} - {farsi} ({english})")
#                     books_with_bad_parse.append((farsi, english))
#                 else:
#                     books_with_good_parse.append((farsi, english))
#             else:
#                 print(f"Error {response.status_code} for {farsi} ({english}): {response.text}")
#         except requests.exceptions.RequestException as e:
#             print(f"Request failed for {farsi} ({english}): {e}")

#     # Output results as a table
#     #print("Books with no parse results:")
#     df_farsi_bad = pd.DataFrame(books_with_bad_parse, columns=["Farsi Name", "English Name"])
#     #print("Books with good parse results:")
#     df_farsi_good = pd.DataFrame(books_with_good_parse, columns=["Farsi Name", "English Name"])

#     return df_farsi_bad

def process_farsi_books(farsi_books):
    """
    Send every Farsi book name to the parse API and report the ones that fail.

    Each name is sent as the `references` query parameter of a GET request to
    API_BASE_URL (with `splitRange` set to "false"). Requests run concurrently
    on a thread pool while a tqdm bar tracks completion.

    A name counts as failed when the request raises, the status is not 200,
    the body is not valid JSON, or the decoded payload is empty.

    NOTE(review): a name counts as parsed as soon as the API returns a
    non-empty payload. The English name is carried along for reporting only
    and is never compared with what the API returned, so a name that parses
    to the wrong book is not detected here.

    :param farsi_books: Iterable of (Farsi name, English name) pairs.
    :return: DataFrame with columns "Farsi Name" and "English Name" holding
        the failed pairs, in the same order as the input.
    """
    # Materialize once: accepts any iterable and validates the pair shape
    # before any request is sent.
    books = [(farsi, english) for farsi, english in farsi_books]

    def process_book(book):
        """Return True when the API produced a non-empty parse for *book*."""
        farsi, english = book
        # Pre-normalization is currently disabled; names are sent as given.
        #farsi = convert_to_standard_farsi(farsi)
        params = {
            "references": farsi,
            "splitRange": "false",
        }
        try:
            response = requests.get(API_BASE_URL, params=params, timeout=10)
        except requests.exceptions.RequestException as e:
            print(f"Request failed for {farsi} ({english}): {e}")
            return False

        if response.status_code != 200:
            print(f"Error {response.status_code} for {farsi} ({english}): {response.text}")
            return False

        try:
            data = response.json()
        except ValueError as e:
            # A 200 response with an undecodable body is recorded as a failure
            # for this book instead of aborting the whole run.
            print(f"Invalid JSON for {farsi} ({english}): {e}")
            return False

        return bool(data)

    # Results are stored by input position so the report order does not
    # depend on which request happens to finish first.
    parsed_ok = [False] * len(books)
    with ThreadPoolExecutor() as executor:
        future_to_index = {
            executor.submit(process_book, book): index
            for index, book in enumerate(books)
        }

        # Use tqdm to display progress as requests complete
        for future in tqdm(as_completed(future_to_index), desc="Processing Farsi Books", total=len(books)):
            parsed_ok[future_to_index[future]] = future.result()

    books_with_bad_parse = [book for book, ok in zip(books, parsed_ok) if not ok]
    return pd.DataFrame(books_with_bad_parse, columns=["Farsi Name", "English Name"])

In [2]:
def transform_data_to_farsi_books(data, farsi_type):
    """
    Transforms the `data` dictionary into a list of tuples (Farsi, English) for testing.

    :param data: Dictionary holding an "English" list of book names plus one
        list of Farsi names per naming variant, all in the same book order.
    :param farsi_type: Key in `data` selecting which list of Farsi names to
        pair with the English names.
    :return: List of tuples [(Farsi name, English name), ...].
    :raises KeyError: If "English" or `farsi_type` is not a key of `data`.
    :raises ValueError: If the selected Farsi list and the English list differ
        in length.
    """
    english_books = data["English"]
    farsi_names = data[farsi_type]

    # zip() would silently stop at the shorter list, dropping books from the
    # test (and hiding a missing entry), so mismatched lengths are rejected.
    if len(farsi_names) != len(english_books):
        raise ValueError(
            f"'{farsi_type}' has {len(farsi_names)} names but 'English' has "
            f"{len(english_books)}"
        )

    # Pair each Farsi name with the English name at the same position
    return list(zip(farsi_names, english_books))
data = {
    "English": [
        "Genesis", "Exodus", "Leviticus", "Numbers", "Deuteronomy", "Joshua", "Judges", "Ruth",  
        "1 Samuel", "2 Samuel", "1 Kings", "2 Kings", "1 Chronicles", "2 Chronicles", "Ezra",  
        "Nehemiah", "Esther", "Job", "Psalms", "Proverbs", "Ecclesiastes", "Song of Solomon",  
        "Isaiah", "Jeremiah", "Lamentations", "Ezekiel", "Daniel", "Hosea", "Joel", "Amos",  
        "Obadiah", "Jonah", "Micah", "Nahum", "Habakkuk", "Zephaniah", "Haggai", "Zechariah",  
        "Malachi", "Matthew", "Mark", "Luke", "John", "Acts", "Romans", "1 Corinthians",  
        "2 Corinthians", "Galatians", "Ephesians", "Philippians", "Colossians",  
        "1 Thessalonians", "2 Thessalonians", "1 Timothy", "2 Timothy", "Titus", "Philemon",  
        "Hebrews", "James", "1 Peter", "2 Peter", "1 John", "2 John", "3 John", "Jude",  
        "Revelation",
    ],
    "Long": [
        "پیدایش", "خروج", "لاویان", "اعداد", "تثنیه", "یوشع", "داوران", "روت", "۱ سموئیل", "۲ سموئیل", "۱ پادشاهان", "۲ پادشاهان", "۱ تواریخ", "۲ تواریخ", "عِزرا", "نِحِمیا", "اِستر", "ایوب", "مزمور", "امثال", "جامعه", "غزل غزلها", "اِشعیا", "اِرمیا", "مراثی اِرمیا", "حِزقیال", "دانیال", "هوشع", "یوئیل", "عاموس", "عوبَدیا", "یونس", "میکاه", "ناحوم", "حَبَقوق", "صَفَنیا", "حَجَّی", "زکریا", "مَلاکی", "مَتّی", "مَرقُس", "لوقا", "یوحنا", "اعمال", "رومیان", "۱ قرنتیان", "۲ قرنتیان", "غلاطیان", "اَفِسسیان", "فیلیپیان", "کولُسیان", "۱ تسالونیکیان", "۲ تسالونیکیان", "۱ تیموتائوس", "۲ تیموتائوس", "تیتوس", "فلیمون", "عبرانیان", "یعقوب", "۱ پطرس", "۲ پطرس", "۱ یوحنا", "۲ یوحنا", "۳ یوحنا", "یهودا", "مکاشفه"
    ],
    "Short": [
        "پیدا", "خروج", "لاوی", "اعدا", "تثنی", "یوشع", "داور", "روتا", "۱سموی", "۲سموی", "۱پادش", "۲پادش", "۱توار", "۲توار", "عزرا", "نحمی", "استر", "ایوب", "مزمو", "امثا", "جامع", "غزله", "اشعی", "ارمی", "مراث", "حزقی", "دانیا", "هوشا", "یوئی", "عامو", "عوبد", "یونس", "میکا", "ناحو", "حبقا", "صفنی", "حجـی", "زکری", "ملاک", "متی", "مرقس", "لوقا", "یوحن", "اعما", "رومیا", "۱قرن", "۲قرن", "غلاط", "افسـ", "فیلی", "کولس", "۱تسال", "۲تسال", "۱تیمو", "۲تیمو", "تیتو", "فلیمو", "عبران", "یعقو", "۱پطرس", "۲پطرس", "۱یوحن", "۲یوحن", "۳یوحن", "یهود", "مکاش"
    ],
    "Shorter": [
        "پید", "خرو", "لاو", "اعد", "تثن", "یوش", "داو", "روت", "۱سمو", "۲سمو", "۱پاد", "۲پاد", "۱توا", "۲توا", "عزر", "نحم", "است", "ایو", "مزم", "امت", "جام", "غزل", "اشع", "ارم", "مرا", "حزق", "دان", "هوش", "یوئ", "عام", "عوب", "یون", "میک", "ناح", "حبق", "صفن", "حجـ", "زکر", "ملا", "مت", "مرق", "لوق", "یوح", "اعم", "رومی", "۱قر", "۲قر", "غلا", "افس", "فیل", "کول", "۱تسا", "۲تسا", "۱تیم", "۲تیم", "تیت", "فلیم", "عبرا", "یعق", "۱پطر", "۲پطر", "۱یوح", "۲یوح", "۳یوح", "یهو", "مکا"
    ],
    "Single": [
        "پی", "خر", "لا", "اع", "تث", "یش", "داو", "رو", "۱سم", "۲سم", "۱پا", "۲پا", "۱تو", "۲تو", "عز", "نح", "اس", "ای", "مز", "ام", "جا", "غز", "اش", "ار", "مر", "حز", "دا", "هو", "یو", "عا", "عو", "ین", "می", "نا", "حب", "صف", "حج", "زک", "مل", "مت", "مر", "لو", "یو", "اع", "روم", "۱قر", "۲قر", "غل", "اف", "فی", "کو", "۱تس", "۲تس", "۱تی", "۲تی", "تی", "فل", "عب", "یع", "۱پط", "۲پط", "۱یو", "۲یو", "۳یو", "یه", "مک"
    ]
}

In [3]:
# Pair every "Long" Farsi book name with its English name
farsi_books = transform_data_to_farsi_books(data, "Long")

# Query the parse API for each pair; the result holds the names that failed
df_farsi_bad = process_farsi_books(farsi_books)

# Display the DataFrame of books with bad parse (empty means all names parsed)
df_farsi_bad

Processing Farsi Books:   0%|          | 0/66 [00:00<?, ?it/s]

Unnamed: 0,Farsi Name,English Name


In [4]:
# Pair every "Short" Farsi book name with its English name
farsi_books = transform_data_to_farsi_books(data, "Short")

# Query the parse API for each pair; the result holds the names that failed
df_farsi_bad = process_farsi_books(farsi_books)

# Display the DataFrame of books with bad parse (empty means all names parsed)
df_farsi_bad

Processing Farsi Books:   0%|          | 0/66 [00:00<?, ?it/s]

Unnamed: 0,Farsi Name,English Name


In [5]:
# Pair every "Shorter" Farsi book name with its English name
farsi_books = transform_data_to_farsi_books(data, "Shorter")

# Query the parse API for each pair; the result holds the names that failed
df_farsi_bad = process_farsi_books(farsi_books)

# Display the DataFrame of books with bad parse (empty means all names parsed)
df_farsi_bad

Processing Farsi Books:   0%|          | 0/66 [00:00<?, ?it/s]

Unnamed: 0,Farsi Name,English Name


In [6]:
# Pair every "Single" Farsi book name with its English name
farsi_books = transform_data_to_farsi_books(data, "Single")

# Query the parse API for each pair; the result holds the names that failed
df_farsi_bad = process_farsi_books(farsi_books)

# Display the DataFrame of books with bad parse (empty means all names parsed)
df_farsi_bad

Processing Farsi Books:   0%|          | 0/66 [00:00<?, ?it/s]

Unnamed: 0,Farsi Name,English Name
0,ای,Job
1,یه,Jude


# Test with PDF data

In [7]:
# List of Farsi book names and their English counterparts from the PDF
farsi_books = [
    ("پيدايش", "Genesis"),
    ("سفر خروج", "Exodus"),
    ("لويان", "Leviticus"),
    ("اعداد", "Numbers"),
    ("تثنييه", "Deuteronomy"),
    ("يشوع", "Joshua"),
    ("داوران", "Judges"),
    ("روت", "Ruth"),
    ("اول سموئيل", "1 Samuel"),
    ("دوم سموئيل", "2 Samuel"),
    ("اول پادشاهان", "1 Kings"),
    ("دوم پادشاهان", "2 Kings"),
    ("اول تواريخ", "1 Chronicles"),
    ("دوم تواريخ", "2 Chronicles"),
    ("عزرا", "Ezra"),
    ("نحيميا", "Nehemiah"),
    ("استر", "Esther"),
    ("ايوب", "Job"),
    ("مزامير", "Psalms"),
    ("امثالا", "Proverbs"),
    ("جامعه", "Ecclesiastes"),
    ("غزلی از غزلها", "Song of Solomon"),
    ("اشعيا", "Isaiah"),
    ("ارمييا", "Jeremiah"),
    ("نوحه", "Lamentations"),
    ("حزقيال", "Ezekiel"),
    ("دانيال", "Daniel"),
    ("هوزيا", "Hosea"),
    ("يوئيل", "Joel"),
    ("عاموس", "Amos"),
    ("عوبيديا", "Obadiah"),
    ("يونس", "Jonah"),
    ("ميکاه", "Micah"),
    ("ناحوم", "Nahum"),
    ("حبقوق", "Habakkuk"),
    ("ضفينيا", "Zephaniah"),
    ("حجی", "Haggai"),
    ("زکريا", "Zechariah"),
    ("ملکی", "Malachi"),
    ("متی", "Matthew"),
    ("مرقس", "Mark"),
    ("لوقا", "Luke"),
    ("يوحنا", "John"),
    ("اعمال رسولن", "Acts"),
    ("روميان", "Romans"),
    ("اول قرنتيان", "1 Corinthians"),
    ("دوم قرنتيان", "2 Corinthians"),
    ("غلطيان", "Galatians"),
    ("افسسيان", "Ephesians"),
    ("فيليپيان", "Philippians"),
    ("کولوسيان", "Colossians"),
    ("اول تسالونيکيان", "1 Thessalonians"),
    ("دوم تسالونيکيان", "2 Thessalonians"),
    ("اول تيموتيوس", "1 Timothy"),
    ("دوم تيموتيوس", "2 Timothy"),
    ("تايتوس", "Titus"),
    ("فيلمان", "Philemon"),
    ("عبرانيان", "Hebrews"),
    ("يعقوب", "James"),
    ("اول پترس", "1 Peter"),
    ("دوم پترس", "2 Peter"),
    ("اول يوحنا", "1 John"),
    ("دوم يوحنا", "2 John"),
    ("سوم يوحنا", "3 John"),
    ("يهودا", "Jude"),
    ("مکاشفه", "Revelation"),
]

process_farsi_books(farsi_books)

Processing Farsi Books:   0%|          | 0/66 [00:00<?, ?it/s]

Unnamed: 0,Farsi Name,English Name


In [8]:
# List of Dari book names and their English counterparts
dari_books = [
    ("پیدایش", "Genesis"),
    ("خروج", "Exodus"),
    ("لویان", "Leviticus"),
    ("اعداد", "Numbers"),
    ("تثنیه", "Deuteronomy"),
    ("یوشع", "Joshua"),
    ("داوران", "Judges"),
    ("روت", "Ruth"),
    ("اول سموئیل", "1 Samuel"),
    ("دوم سموئیل", "2 Samuel"),
    ("اول پادشاهان", "1 Kings"),
    ("دوم پادشاهان", "2 Kings"),
    ("اول تواریخ", "1 Chronicles"),
    ("دوم تواریخ", "2 Chronicles"),
    ("عزرا", "Ezra"),
    ("نحمیا", "Nehemiah"),
    ("استر", "Esther"),
    ("ایوب", "Job"),
    ("مزامیر", "Psalms"),
    ("امثال", "Proverbs"),
    ("جامعه", "Ecclesiastes"),
    ("غزل غزل‌ها", "Song of Solomon"),
    ("اشعیا", "Isaiah"),
    ("ارمیا", "Jeremiah"),
    ("سوگنامه", "Lamentations"),
    ("حزقیال", "Ezekiel"),
    ("دانیال", "Daniel"),
    ("هوشع", "Hosea"),
    ("یوئیل", "Joel"),
    ("عاموس", "Amos"),
    ("عوبدیا", "Obadiah"),
    ("یونس", "Jonah"),
    ("میکاه", "Micah"),
    ("ناحوم", "Nahum"),
    ("حبقوق", "Habakkuk"),
    ("صفنیا", "Zephaniah"),
    ("حجی", "Haggai"),
    ("زکریا", "Zechariah"),
    ("ملاکی", "Malachi"),
    ("متی", "Matthew"),
    ("مرقس", "Mark"),
    ("لوقا", "Luke"),
    ("یوحنا", "John"),
    ("اعمال", "Acts"),
    ("رومیان", "Romans"),
    ("اول قرنتیان", "1 Corinthians"),
    ("دوم قرنتیان", "2 Corinthians"),
    ("غلاطیان", "Galatians"),
    ("افسسیان", "Ephesians"),
    ("فیلیپیان", "Philippians"),
    ("کولسیان", "Colossians"),
    ("اول تسالونیکیان", "1 Thessalonians"),
    ("دوم تسالونیکیان", "2 Thessalonians"),
    ("اول تیموتائوس", "1 Timothy"),
    ("دوم تیموتائوس", "2 Timothy"),
    ("تیطوس", "Titus"),
    ("فیلیمون", "Philemon"),
    ("عبرانیان", "Hebrews"),
    ("یعقوب", "James"),
    ("اول پطرس", "1 Peter"),
    ("دوم پطرس", "2 Peter"),
    ("اول یوحنا", "1 John"),
    ("دوم یوحنا", "2 John"),
    ("سوم یوحنا", "3 John"),
    ("یهودا", "Jude"),
    ("مکاشفه", "Revelation"),
]

process_farsi_books(dari_books)

Processing Farsi Books:   0%|          | 0/66 [00:00<?, ?it/s]

Unnamed: 0,Farsi Name,English Name


# Test with the current data from the fa folder in Bible-Passage-Reference-Parser

In [9]:
# Input data
current_data = [
  ('پیدایش', 'Genesis'),
  ('خروج', 'Exodus'),
  ('لاویان', 'Leviticus'),
  ('اعداد', 'Numbers'),
  ('تثنیه', 'Deuteronomy'),
  ('یوشع', 'Joshua'),
  ('داوران', 'Judges'),
  ('روت', 'Ruth'),
  ('۱ سموئیل', '1 Samuel'),
  ('۲ سموئیل', '2 Samuel'),
  ('۱ پادشاهان', '1 Kings'),
  ('۲ پادشاهان', '2 Kings'),
  ('۱ تواریخ', '1 Chronicles'),
  ('۲ تواریخ', '2 Chronicles'),
  ('عِزرا', 'Ezra'),
  ('نِحِمیا', 'Nehemiah'),
  ('اِستر', 'Esther'),
  ('ایوب', 'Job'),
  ('مزمور', 'Psalms'),
  ('امثال', 'Proverbs'),
  ('جامعه', 'Ecclesiastes'),
  ('غزل غزل\u200cها', 'Song of Songs'),
  ('اِشعیا', 'Isaiah'),
  ('اِرمیا', 'Jeremiah'),
  ('مراثی اِرمیا', 'Lamentations'),
  ('حِزقیال', 'Ezekiel'),
  ('دانیال', 'Daniel'),
  ('هوشع', 'Hosea'),
  ('یوئیل', 'Joel'),
  ('عاموس', 'Amos'),
  ('عوبَدیا', 'Obadiah'),
  ('یونس', 'Jonah'),
  ('میکاه', 'Micah'),
  ('ناحوم', 'Nahum'),
  ('حَبَقوق', 'Habakkuk'),
  ('صَفَنیا', 'Zephaniah'),
  ('حَجَّی', 'Haggai'),
  ('زکریا', 'Zechariah'),
  ('مَلاکی', 'Malachi'),
  ('مَتّی', 'Matthew'),
  ('مَرقُس', 'Mark'),
  ('لوقا', 'Luke'),
  ('یوحنا', 'John'),
  ('اعمال', 'Acts'),
  ('رومیان', 'Romans'),
  ('۱ قرنتیان', '1 Corinthians'),
  ('۲ قرنتیان', '2 Corinthians'),
  ('غلاطیان', 'Galatians'),
  ('اَفِسسیان', 'Ephesians'),
  ('فیلیپیان', 'Philippians'),
  ('کولُسیان', 'Colossians'),
  ('۱ تسالونیکیان', '1 Thessalonians'),
  ('۲ تسالونیکیان', '2 Thessalonians'),
  ('۱ تیموتائوس', '1 Timothy'),
  ('۲ تیموتائوس', '2 Timothy'),
  ('تیتوس', 'Titus'),
  ('فلیمون', 'Philemon'),
  ('عبرانیان', 'Hebrews'),
  ('یعقوب', 'James'),
  ('۱ پطرس', '1 Peter'),
  ('۲ پطرس', '2 Peter'),
  ('۱ یوحنا', '1 John'),
  ('۲ یوحنا', '2 John'),
  ('۳ یوحنا', '3 John'),
  ('یهودا', 'Jude'),
  ('مکاشفه', 'Revelation')
]

process_farsi_books(current_data)

Processing Farsi Books:   0%|          | 0/66 [00:00<?, ?it/s]

Unnamed: 0,Farsi Name,English Name


# Test with Bible.com data

In [10]:
farsi_books = [ # POV-FAS
    ("پیدایش", "Genesis"), ("خروج", "Exodus"), ("لاویان", "Leviticus"),
    ("اعداد", "Numbers"), ("تثنیه", "Deuteronomy"), ("یوشع", "Joshua"),
    ("داوران", "Judges"), ("روت", "Ruth"), ("۱سموئیل", "1 Samuel"),
    ("۲سموئیل", "2 Samuel"), ("۱پادشاهان", "1 Kings"),
    ("۲پادشاهان", "2 Kings"), ("۱تواریخ", "1 Chronicles"),
    ("۲تواریخ", "2 Chronicles"), ("عزرا", "Ezra"),
    ("نحمیا", "Nehemiah"), ("استِر", "Esther"), ("ایوب", "Job"),
    ("مزامیر", "Psalms"), ("امثال", "Proverbs"), ("جامعه", "Ecclesiastes"),
    ("غزل‌غزل‌ها", "Song of Solomon"), ("اشعیا", "Isaiah"),
    ("ارمیا", "Jeremiah"), ("مراثی", "Lamentations"),
    ("حزقیال", "Ezekiel"), ("دانیال", "Daniel"), ("هوشع", "Hosea"),
    ("یوئیل", "Joel"), ("عاموس", "Amos"), ("عوبدیا", "Obadiah"),
    ("یونس", "Jonah"), ("میکا", "Micah"), ("ناحوم", "Nahum"),
    ("حبقوق", "Habakkuk"), ("صفنیا", "Zephaniah"), ("حجی", "Haggai"),
    ("زکریا", "Zechariah"), ("ملاکی", "Malachi"), ("متی", "Matthew"),
    ("مرقس", "Mark"), ("لوقا", "Luke"), ("یوحنا", "John"),
    ("اعمال", "Acts"), ("رومیان", "Romans"), ("۱قرنتیان", "1 Corinthians"),
    ("۲قرنتیان", "2 Corinthians"), ("غلاطیان", "Galatians"),
    ("افسسیان", "Ephesians"), ("فلیپیان", "Philippians"),
    ("کولسیان", "Colossians"), ("۱تسالونیکان", "1 Thessalonians"),
    ("۲تسالونیکیان", "2 Thessalonians"), ("۱تیموتائوس", "1 Timothy"),
    ("۲تیموتائوس", "2 Timothy"), ("تیطس", "Titus"), ("فیلیمون", "Philemon"),
    ("عبرانیان", "Hebrews"), ("یعقوب", "James"), ("۱پطرس", "1 Peter"),
    ("۲پطرس", "2 Peter"), ("۱یوحنا", "1 John"), ("۲یوحنا", "2 John"),
    ("۳یوحنا", "3 John"), ("یهودا", "Jude"), ("مکاشفه", "Revelation")
]

process_farsi_books(farsi_books)

Processing Farsi Books:   0%|          | 0/66 [00:00<?, ?it/s]

Unnamed: 0,Farsi Name,English Name


In [11]:
farsi_books = [
    ("پیدایش", "Genesis"),
    ("خروج", "Exodus"),
    ("لاویان", "Leviticus"),
    ("اعداد", "Numbers"),
    ("تثنیه", "Deuteronomy"),
    ("یوشع", "Joshua"),
    ("داوران", "Judges"),
    ("روت", "Ruth"),
    ("اول سموئیل", "1 Samuel"),
    ("دوم سموئیل", "2 Samuel"),
    ("اول پادشاهان", "1 Kings"),
    ("دوم پادشاهان", "2 Kings"),
    ("اول تواریخ", "1 Chronicles"),
    ("دوم تواریخ", "2 Chronicles"),
    ("عزرا", "Ezra"),
    ("نحمیا", "Nehemiah"),
    ("استر", "Esther"),
    ("ایّوب", "Job"),
    ("مزامیر", "Psalms"),
    ("امثال", "Proverbs"),
    ("جامعه", "Ecclesiastes"),
    ("غزل غزلها", "Song of Solomon"),
    ("اشعیا", "Isaiah"),
    ("ارمیا", "Jeremiah"),
    ("سوگنامه", "Lamentations"),
    ("حزقیال", "Ezekiel"),
    ("دانیال", "Daniel"),
    ("هوشع", "Hosea"),
    ("یوئیل", "Joel"),
    ("عاموس", "Amos"),
    ("عوبدیا", "Obadiah"),
    ("یونس", "Jonah"),
    ("میکا", "Micah"),
    ("ناحوم", "Nahum"),
    ("حبقوق", "Habakkuk"),
    ("صَفَنیا", "Zephaniah"),
    ("حَجّای", "Haggai"),
    ("زکریا", "Zechariah"),
    ("ملاکی", "Malachi"),
    ("متّی", "Matthew"),
    ("مرقس", "Mark"),
    ("لوقا", "Luke"),
    ("یوحنا", "John"),
    ("کارهای رسولان", "Acts"),
    ("رومیان", "Romans"),
    ("اول قرنتیان", "1 Corinthians"),
    ("دوم قرنتیان", "2 Corinthians"),
    ("غلاطیان", "Galatians"),
    ("افسسیان", "Ephesians"),
    ("فیلیپیان", "Philippians"),
    ("کولسیان", "Colossians"),
    ("اول تسالونیکیان", "1 Thessalonians"),
    ("دوم تسالونیکیان", "2 Thessalonians"),
    ("اول تیموتاؤس", "1 Timothy"),
    ("دوم تیموتاؤس", "2 Timothy"),
    ("تیطُس", "Titus"),
    ("فِلیمُون", "Philemon"),
    ("عبرانیان", "Hebrews"),
    ("یعقوب", "James"),
    ("اول پطرس", "1 Peter"),
    ("دوم پطرس", "2 Peter"),
    ("اول یوحنا", "1 John"),
    ("دوم یوحنا", "2 John"),
    ("سوم یوحنا", "3 John"),
    ("یهودا", "Jude"),
    ("مکاشفهٔ یوحنا", "Revelation")
]

process_farsi_books(farsi_books)

Processing Farsi Books:   0%|          | 0/66 [00:00<?, ?it/s]

Unnamed: 0,Farsi Name,English Name


In [12]:
farsi_books = [ # tpv
    ("پیدایش", "Genesis"),
    ("خروج", "Exodus"),
    ("لاویان", "Leviticus"),
    ("اعداد", "Numbers"),
    ("تثنیه", "Deuteronomy"),
    ("یوشع", "Joshua"),
    ("داوران", "Judges"),
    ("روت", "Ruth"),
    ("اول سموئیل", "1 Samuel"),
    ("دوم سموئیل", "2 Samuel"),
    ("اول پادشاهان", "1 Kings"),
    ("دوم پادشاهان", "2 Kings"),
    ("اول تواریخ", "1 Chronicles"),
    ("دوم تواریخ", "2 Chronicles"),
    ("عزرا", "Ezra"),
    ("نحمیا", "Nehemiah"),
    ("استر", "Esther"),
    ("ایّوب", "Job"),
    ("مزامیر", "Psalms"),
    ("امثال", "Proverbs"),
    ("جامعه", "Ecclesiastes"),
    ("غزل غزلها", "Song of Solomon"),
    ("اشعیا", "Isaiah"),
    ("ارمیا", "Jeremiah"),
    ("سوگنامه", "Lamentations"),
    ("حزقیال", "Ezekiel"),
    ("دانیال", "Daniel"),
    ("هوشع", "Hosea"),
    ("یوئیل", "Joel"),
    ("عاموس", "Amos"),
    ("عوبدیا", "Obadiah"),
    ("یونس", "Jonah"),
    ("میکا", "Micah"),
    ("ناحوم", "Nahum"),
    ("حبقوق", "Habakkuk"),
    ("صَفَنیا", "Zephaniah"),
    ("حَجّای", "Haggai"),
    ("زکریا", "Zechariah"),
    ("ملاکی", "Malachi"),
    ("متّی", "Matthew"),
    ("مرقس", "Mark"),
    ("لوقا", "Luke"),
    ("یوحنا", "John"),
    ("کارهای رسولان", "Acts"),
    ("رومیان", "Romans"),
    ("اول قرنتیان", "1 Corinthians"),
    ("دوم قرنتیان", "2 Corinthians"),
    ("غلاطیان", "Galatians"),
    ("افسسیان", "Ephesians"),
    ("فیلیپیان", "Philippians"),
    ("کولسیان", "Colossians"),
    ("اول تسالونیکیان", "1 Thessalonians"),
    ("دوم تسالونیکیان", "2 Thessalonians"),
    ("اول تیموتاؤس", "1 Timothy"),
    ("دوم تیموتاؤس", "2 Timothy"),
    ("تیطُس", "Titus"),
    ("فِلیمُون", "Philemon"),
    ("عبرانیان", "Hebrews"),
    ("یعقوب", "James"),
    ("اول پطرس", "1 Peter"),
    ("دوم پطرس", "2 Peter"),
    ("اول یوحنا", "1 John"),
    ("دوم یوحنا", "2 John"),
    ("سوم یوحنا", "3 John"),
    ("یهودا", "Jude"),
    ("مکاشفهٔ یوحنا", "Revelation")
]

process_farsi_books(farsi_books)

Processing Farsi Books:   0%|          | 0/66 [00:00<?, ?it/s]

Unnamed: 0,Farsi Name,English Name


In [13]:
farsi_books = [# pcb
    ("پیدایش", "Genesis"),
    ("خروج", "Exodus"),
    ("لاویان", "Leviticus"),
    ("اعداد", "Numbers"),
    ("تثنیه", "Deuteronomy"),
    ("یوشع", "Joshua"),
    ("داوران", "Judges"),
    ("روت", "Ruth"),
    ("اول سموئیل", "1 Samuel"),
    ("دوم سموئیل", "2 Samuel"),
    ("اول پادشاهان", "1 Kings"),
    ("دوم پادشاهان", "2 Kings"),
    ("اول تواریخ", "1 Chronicles"),
    ("دوم تواریخ", "2 Chronicles"),
    ("عِزرا", "Ezra"),
    ("نحمیا", "Nehemiah"),
    ("استر", "Esther"),
    ("ایوب", "Job"),
    ("مزامیر", "Psalms"),
    ("امثال", "Proverbs"),
    ("جامعه", "Ecclesiastes"),
    ("غزل غزلها", "Song of Solomon"),
    ("اشعیا", "Isaiah"),
    ("ارمیا", "Jeremiah"),
    ("مراثی", "Lamentations"),
    ("حِزِقیال", "Ezekiel"),
    ("دانیال", "Daniel"),
    ("هوشع", "Hosea"),
    ("یوئیل", "Joel"),
    ("عاموس", "Amos"),
    ("عوبدیا", "Obadiah"),
    ("یونس", "Jonah"),
    ("میکاه", "Micah"),
    ("ناحوم", "Nahum"),
    ("حبقوق", "Habakkuk"),
    ("صفنیا", "Zephaniah"),
    ("حَجَّی", "Haggai"),
    ("زکریا", "Zechariah"),
    ("ملاکی", "Malachi"),
    ("متی", "Matthew"),
    ("مَرقُس", "Mark"),
    ("لوقا", "Luke"),
    ("یوحنا", "John"),
    ("اعمال رسولان", "Acts"),
    ("رومیان", "Romans"),
    ("اول قرنتیان", "1 Corinthians"),
    ("دوم قرنتیان", "2 Corinthians"),
    ("غلاطیان", "Galatians"),
    ("افسسیان", "Ephesians"),
    ("فیلیپیان", "Philippians"),
    ("کولسیان", "Colossians"),
    ("اول تسالونیکیان", "1 Thessalonians"),
    ("دوم تسالونیکیان", "2 Thessalonians"),
    ("اول تیموتائوس", "1 Timothy"),
    ("دوم تیموتائوس", "2 Timothy"),
    ("تیتوس", "Titus"),
    ("فلیمون", "Philemon"),
    ("عبرانیان", "Hebrews"),
    ("یعقوب", "James"),
    ("اول پطرس", "1 Peter"),
    ("دوم پطرس", "2 Peter"),
    ("اول یوحنا", "1 John"),
    ("دوم یوحنا", "2 John"),
    ("سوم یوحنا", "3 John"),
    ("یهودا", "Jude"),
    ("مکاشفه", "Revelation")
]

process_farsi_books(farsi_books)

Processing Farsi Books:   0%|          | 0/66 [00:00<?, ?it/s]

Unnamed: 0,Farsi Name,English Name


In [14]:
farsi_books = [ # nmv
    ("پیدایش", "Genesis"),
    ("خروج", "Exodus"),
    ("لاویان", "Leviticus"),
    ("اعداد", "Numbers"),
    ("تثنیه", "Deuteronomy"),
    ("یوشع", "Joshua"),
    ("داوران", "Judges"),
    ("روت", "Ruth"),
    ("۱سموئیل", "1 Samuel"),
    ("۲سموئیل", "2 Samuel"),
    ("۱پادشاهان", "1 Kings"),
    ("۲پادشاهان", "2 Kings"),
    ("۱تواریخ", "1 Chronicles"),
    ("۲تواریخ", "2 Chronicles"),
    ("عِزرا", "Ezra"),
    ("نِحِمیا", "Nehemiah"),
    ("اِستر", "Esther"),
    ("ایوب", "Job"),
    ("مزمور", "Psalms"),
    ("امثال", "Proverbs"),
    ("جامعه", "Ecclesiastes"),
    ("غزل غزل‌ها", "Song of Solomon"),
    ("اِشعیا", "Isaiah"),
    ("اِرمیا", "Jeremiah"),
    ("مراثی اِرمیا", "Lamentations"),
    ("حِزقیال", "Ezekiel"),
    ("دانیال", "Daniel"),
    ("هوشع", "Hosea"),
    ("یوئیل", "Joel"),
    ("عاموس", "Amos"),
    ("عوبَدیا", "Obadiah"),
    ("یونس", "Jonah"),
    ("میکاه", "Micah"),
    ("ناحوم", "Nahum"),
    ("حَبَقوق", "Habakkuk"),
    ("صَفَنیا", "Zephaniah"),
    ("حَجَّی", "Haggai"),
    ("زکریا", "Zechariah"),
    ("مَلاکی", "Malachi"),
    ("مَتّی", "Matthew"),
    ("مَرقُس", "Mark"),
    ("لوقا", "Luke"),
    ("یوحنا", "John"),
    ("اعمال", "Acts"),
    ("رومیان", "Romans"),
    ("۱قرنتیان", "1 Corinthians"),
    ("۲قرنتیان", "2 Corinthians"),
    ("غلاطیان", "Galatians"),
    ("اَفِسسیان", "Ephesians"),
    ("فیلیپیان", "Philippians"),
    ("کولُسیان", "Colossians"),
    ("۱تسالونیکیان", "1 Thessalonians"),
    ("۲تسالونیکیان", "2 Thessalonians"),
    ("۱تیموتائوس", "1 Timothy"),
    ("۲تیموتائوس", "2 Timothy"),
    ("تیتوس", "Titus"),
    ("فیلیمون", "Philemon"),
    ("عبرانیان", "Hebrews"),
    ("یعقوب", "James"),
    ("۱پطرس", "1 Peter"),
    ("۲پطرس", "2 Peter"),
    ("۱یوحنا", "1 John"),
    ("۲یوحنا", "2 John"),
    ("۳یوحنا", "3 John"),
    ("یهودا", "Jude"),
    ("مکاشفه", "Revelation")
]

process_farsi_books(farsi_books)

Processing Farsi Books:   0%|          | 0/66 [00:00<?, ?it/s]

Unnamed: 0,Farsi Name,English Name
0,حَجَّی,Haggai


In [15]:
farsi_books = [
    ("پیدایش", "Genesis"),
    ("خروج", "Exodus"),
    ("لاویان", "Leviticus"),
    ("اعداد", "Numbers"),
    ("تثنیه", "Deuteronomy"),
    ("یوشع", "Joshua"),
    ("داوران", "Judges"),
    ("روت", "Ruth"),
    ("اوّل سموئیل", "1 Samuel"),
    ("دوّم سموئیل", "2 Samuel"),
    ("اوّل پادشاهان", "1 Kings"),
    ("دوّم پادشاهان", "2 Kings"),
    ("اوّل تواریخ", "1 Chronicles"),
    ("دوّم تواریخ", "2 Chronicles"),
    ("عِزرا", "Ezra"),
    ("نِحِمیا", "Nehemiah"),
    ("استر", "Esther"),
    ("ایّوب", "Job"),
    ("مزمور", "Psalms"),
    ("امثال", "Proverbs"),
    ("جامعه", "Ecclesiastes"),
    ("غزل غزل‌ها", "Song of Solomon"),
    ("اِشعیا", "Isaiah"),
    ("اِرمیا", "Jeremiah"),
    ("سوگ‌نامه", "Lamentations"),
    ("حزقیال", "Ezekiel"),
    ("دانیال", "Daniel"),
    ("هوشع", "Hosea"),
    ("یوئیل", "Joel"),
    ("عاموس نبی", "Amos"),
    ("عوبَدیا", "Obadiah"),
    ("یونس", "Jonah"),
    ("میکاه", "Micah"),
    ("ناحوم", "Nahum"),
    ("حبقوق", "Habakkuk"),
    ("صَفَنیا", "Zephaniah"),
    ("حَجّای", "Haggai"),
    ("زَکریا", "Zechariah"),
    ("ملاکی", "Malachi"),
    ("متّی", "Matthew"),
    ("مرقس", "Mark"),
    ("لوقا", "Luke"),
    ("یوحنا", "John"),
    ("کارهای رسولان", "Acts"),
    ("رومیان", "Romans"),
    ("اوّل قرنتیان", "1 Corinthians"),
    ("دوّم قرنتیان", "2 Corinthians"),
    ("غلاطیان", "Galatians"),
    ("اَفِسُسیان", "Ephesians"),
    ("فیلیپیان", "Philippians"),
    ("کولسیان", "Colossians"),
    ("اوّل تسالونیکیان", "1 Thessalonians"),
    ("دوّم تسالونیکیان", "2 Thessalonians"),
    ("اوّل تیموتاؤس", "1 Timothy"),
    ("دوّم تیموتاؤس", "2 Timothy"),
    ("تیطُس", "Titus"),
    ("فِلیمُون", "Philemon"),
    ("عبرانیان", "Hebrews"),
    ("یعقوب", "James"),
    ("اوّل پطرس", "1 Peter"),
    ("دوّم پطرس", "2 Peter"),
    ("اوّل یوحنا", "1 John"),
    ("دوّم یوحنا", "2 John"),
    ("سوّم یوحنا", "3 John"),
    ("یهودا", "Jude"),
    ("مکاشفۀ یوحنا", "Revelation")
]

process_farsi_books(farsi_books)

Processing Farsi Books:   0%|          | 0/66 [00:00<?, ?it/s]

Unnamed: 0,Farsi Name,English Name


In [16]:
dari_books = [
    ("پیدایش", "Genesis"),
    ("خروج", "Exodus"),
    ("لاویان", "Leviticus"),
    ("اعداد", "Numbers"),
    ("تثنیه", "Deuteronomy"),
    ("یوشع", "Joshua"),
    ("داوران", "Judges"),
    ("روت", "Ruth"),
    ("اول سموئیل", "1 Samuel"),
    ("دوم سموئیل", "2 Samuel"),
    ("اول پادشاهان", "1 Kings"),
    ("دوم پادشاهان", "2 Kings"),
    ("اول تواریخ", "1 Chronicles"),
    ("دوم تواریخ", "2 Chronicles"),
    ("عِزرا", "Ezra"),
    ("نِحِمیا", "Nehemiah"),
    ("اِستَر", "Esther"),
    ("ایوب", "Job"),
    ("مزامیر", "Psalms"),
    ("امثال سلیمان", "Proverbs"),
    ("جامعه", "Ecclesiastes"),
    ("غزلِ غزلها", "Song of Solomon"),
    ("اشعیا", "Isaiah"),
    ("ارمیا", "Jeremiah"),
    ("سوگنامه", "Lamentations"),
    ("حِزقیال", "Ezekiel"),
    ("دانیال", "Daniel"),
    ("هوشع", "Hosea"),
    ("یوئیل", "Joel"),
    ("عاموس", "Amos"),
    ("عوبَدیا", "Obadiah"),
    ("یونس", "Jonah"),
    ("میکاه", "Micah"),
    ("ناحوم", "Nahum"),
    ("حبَقوق", "Habakkuk"),
    ("سِفَنیا", "Zephaniah"),
    ("حجی", "Haggai"),
    ("زِکَریا", "Zechariah"),
    ("ملاکی", "Malachi"),
    ("متی", "Matthew"),
    ("مرقُس", "Mark"),
    ("لوقا", "Luke"),
    ("یوحنا", "John"),
    ("اعمال رسولان", "Acts"),
    ("رومیان", "Romans"),
    ("اول قرنتیان", "1 Corinthians"),
    ("دوم قرنتیان", "2 Corinthians"),
    ("غلاطیان", "Galatians"),
    ("اِفِسُسیان", "Ephesians"),
    ("فیلیپیان", "Philippians"),
    ("کولسیان", "Colossians"),
    ("اول تسالونیکیان", "1 Thessalonians"),
    ("دوم تسالونیکیان", "2 Thessalonians"),
    ("اول تیموتاووس", "1 Timothy"),
    ("دوم تیموتاووس", "2 Timothy"),
    ("تیطوس", "Titus"),
    ("فِلیمون", "Philemon"),
    ("عبرانیان", "Hebrews"),
    ("یعقوب", "James"),
    ("اول پِطرُس", "1 Peter"),
    ("دوم پِطرُس", "2 Peter"),
    ("اول یوحنا", "1 John"),
    ("دوم یوحنا", "2 John"),
    ("سوم یوحنا", "3 John"),
    ("یهودا", "Jude"),
    ("مکاشفه", "Revelation")
]
process_farsi_books(dari_books)

Processing Farsi Books:   0%|          | 0/66 [00:00<?, ?it/s]

Unnamed: 0,Farsi Name,English Name
