In [3]:
import mysql.connector
import json
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from openai import OpenAI  # Assuming OpenAI API is set up

# Database connection details
db_config = {
    'host': '172.16.34.1',
    'port': 3307,
    'user': 'umls',
    'password': 'umls',
    'database': 'umls2024'
}

# Define the NLLB-200 model
model_name = "facebook/nllb-200-3.3B"
cache_directory = "/data/data_user_alpha/public_models"

# List of target languages
languages = {
    "French": "fra_Latn",
    "German": "deu_Latn",
    "Portuguese": "por_Latn",
    "Spanish": 'spa_Latn'
}

# Load translation model
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, cache_dir=cache_directory, torch_dtype=torch.float16)
model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_directory)
translator = pipeline("translation", model=model, tokenizer=tokenizer)

# OpenAI API Client
client = OpenAI(api_key="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")

def extract_keywords(sentence):
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Extract medical and non-medical keywords from the given sentence. return it as json format without extra things."},
                  {"role": "user", "content": f"{sentence}"}],
        temperature=0.5
    )
    # print(response.choices[0].message.content)
    keywords = json.loads(response.choices[0].message.content)
    return keywords  # Expected format: {"medical": ["keyword1", "keyword2"], "non_medical": ["keyword3", "keyword4"]}

def search_umls(keyword):
    try:
        connection = mysql.connector.connect(**db_config)
        cursor = connection.cursor(dictionary=True)
        cursor.execute("SELECT CUI FROM MRCONSO WHERE STR LIKE %s LIMIT 1", (f"%{keyword}%",))
        result = cursor.fetchone()
        if not result:
            return None
        cui = result["CUI"]

        cursor.execute("SELECT LAT, STR FROM MRCONSO WHERE CUI = %s AND LAT IN (%s, %s, %s)",
                       (cui, 'FRE', 'POR', 'GER'))
        rows = cursor.fetchall()
        
        translations = {row['LAT']: row['STR'] for row in rows}
        return translations
    except mysql.connector.Error as err:
        print(f"Database error: {err}")
    finally:
        if connection.is_connected():
            connection.close()

def translate_non_medical(keyword):
    translations = {}
    for language, lang_code in languages.items():
        output = translator(keyword, src_lang="eng_Latn", tgt_lang=lang_code, max_length=400)
        translations[language] = output[0]['translation_text']
    return translations

# def convert_to_chained_format(dictionary, src_lang, target_lang):
#     data={}
#     for x, y in dictionary.items():
#         for word,_ in dictionary[x].items():
#             data[word]=[]
#             data[word].append(word)
#             for _, tran2 in dictionary[x][word].items():
#                 data[word].append(tran2)
#     chain=[]
#     for word, translations in data.items():
#         ch=""
#         for i,x in enumerate(translations):
#             if i!=len(translations)-1:
#                 ch+=f"'{x}' means "
#             else:
#                 ch+=f"'{x}'"
#         chain.append(ch)
#     chain=". ".join(chain)
#     chain+=". "
#     # chain=f"{chain}\nTranslate the following text from {src_lang} into {target_lang}:"
#     return chain

def convert_to_chained_format(dictionary, src_lang, target_lang):
    chain = []
    
    for category, words in dictionary.items():
        for word, translations in words.items():
            formatted_translations = []
            
            # Ensure the source language is first, target language second, then others
            ordered_languages = ["English", "Spanish", "French", "German", "Portuguese"]
            
            for lang in ordered_languages:
                if lang in translations:
                    formatted_translations.append(f"{word} in {lang} is '{translations[lang]}'")

            chain.append(". ".join(formatted_translations))
    
    chained_text = ". ".join(chain) + "."
    return chained_text


def process_sentence(sentence):
    keywords = extract_keywords(sentence)
    medical_translations = {}
    non_medical_translations = {}

    # Process medical keywords
    for keyword in keywords["medical_keywords"]:
        translation = search_umls(keyword)
        if not translation:  # If keyword not found in UMLS, translate using NLLB
            translation = {lang: translator(keyword, src_lang="eng_Latn", tgt_lang=code, max_length=400)[0]['translation_text'] 
                           for lang, code in languages.items()}
        elif "SPA" not in translation:  # If Spanish missing in UMLS, use NLLB for Spanish
            translation["SPA"] = translator(keyword, src_lang="eng_Latn", tgt_lang="spa_Latn", max_length=400)[0]['translation_text']
        
        medical_translations[keyword] = translation

    # Process non-medical keywords (always translated via NLLB)
    for keyword in keywords["non_medical_keywords"]:
        non_medical_translations[keyword] = translate_non_medical(keyword)

    result_json_temp = {
        "medical": medical_translations,
        "non_medical": non_medical_translations
    }

    src_language = "English"
    target_language = "Spanish"
    chained_output = convert_to_chained_format(result_json_temp, src_language, target_language)

    return chained_output, result_json_temp

# Example usage
sentence = "Bariatric surgery is done when diet and exercise haven't worked or when you have serious health problems because of your weight."
COD_prompt,result_json_temp = process_sentence(sentence)
print(COD_prompt)


Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.47it/s]
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Bariatric surgery in Spanish is 'Cirugía bariátrica'. Bariatric surgery in French is 'Chirurgie bariatrique'. Bariatric surgery in German is 'Bariatrische Chirurgie'. Bariatric surgery in Portuguese is 'Cirurgia bariátrica'. . . . .


In [None]:
def translate_cod_prompt(COD_prompt,sentence):
    response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": f"{COD_prompt}"},
                    {"role": "user", "content": f"Translate the following text from English into Spanish: {sentence}"}],
            temperature=0.5
        )
    ans=(response.choices[0].message.content)
    return ans

## Direct translate in Spanish
def direct_translate(sentence):
    response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                    {"role": "user", "content": f"Translate the following text from English into Spanish: {sentence}"}],
            temperature=0.5
        )
    return (response.choices[0].message.content)


## Back translate in English
def back_translate(spa_tran):
    response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                # {"role": "system", "content": f""},
                    {"role": "user", "content": f"Translate the following text from Spanish into English: {spa_tran}"}],
            temperature=0.5
        )
    return (response.choices[0].message.content)

## Evaluation
import sacrebleu
import evaluate

metric = evaluate.load("sacrebleu")
def compute_bleu_chrf(reference, hypothesis):
    """
    Computes the BLEU and chrF++ scores for a given reference and hypothesis.
    
    :param reference: List of reference translations (list of strings)
    :param hypothesis: The hypothesis translation (a single string)
    :return: A dictionary containing BLEU and chrF++ scores
    """
    # Ensure reference is wrapped in a list as sacrebleu expects a list of references
    # bleu_score = sacrebleu.corpus_bleu(hypothesis, [reference]).score
    # bleu_score = sacrebleu.corpus_bleu(hypothesis, [reference], tokenize="13a", lowercase=True).score
    bleu_score=metric.compute(predictions=[hypothesis], references=[reference])
    chrf_score = sacrebleu.corpus_chrf(hypothesis, [reference]).score

    return {"bleu_score": bleu_score['score'],"chrF++": chrf_score}


## Evaluation
output_data=[]
import tqdm
path="/home/mshahidul/project1/data2/extracted_files/eng_spa_pairs/medlineplus_pairs.txt"
f=open(path, "r").read().split("\n")
for x in tqdm.tqdm(f[:100]):
    xx=x.split("\t")
    sentence_eng=xx[0]
    sentence_spa=xx[1]
    COD_prompt,result_json_temp = process_sentence(sentence_eng)
    spa_tran=translate_cod_prompt(COD_prompt,sentence_eng)
    spa_tran_direct=direct_translate(sentence_eng)
    back_tran=back_translate(spa_tran)
    back_tran_direct=back_translate(spa_tran_direct)
    reference_text = [sentence_eng]
    hypothesis_text = back_tran
    scores_cod_prompt = compute_bleu_chrf(reference_text, hypothesis_text)
    hypothesis_text = back_tran_direct
    scores_direct = compute_bleu_chrf(reference_text, hypothesis_text)
    output_data.append({
        "Original_English_sentence": sentence_eng,
        "Original_Spanish_sentence": sentence_spa,
        "COD_prompt": COD_prompt,
        "spanish_translation": spa_tran,
        "spanish_translation_direct": spa_tran_direct,
        "back_translation": back_tran,
        "back_translation_direct": back_tran_direct,
        "scores_cod_prompt(bleu and chrf)": scores_cod_prompt,
        "scores_direct(bleu and chrf)": scores_direct
    })
    # print(scores_cod_prompt,scores_direct)
import pandas as pd
df = pd.DataFrame(output_data)
df.to_csv("/home/mshahidul/project1/data2/extracted_files/eng_spa_pairs/medlineplus_pairs_gpt4_mini.csv")

df.to_excel("/home/mshahidul/project1/data2/extracted_files/eng_spa_pairs/medlineplus_pairs_gpt4_mini.xlsx")
## Performance check dataset
path="/home/mshahidul/project1/data2/extracted_files/eng_spa_pairs/medlineplus_pairs.txt"
f=open(path, "r").read().split("\n")
for x in f[:100]:
    xx=x.split("\t")
    sentence_eng=xx[0]
    sentence_spa=xx[1]
    COD_prompt,result_json_temp = process_sentence(sentence_eng)
    spa_tran=translate_cod_prompt(COD_prompt, sentence_eng)
    reference_text = [sentence_spa]
    hypothesis_text = spa_tran
    scores_cod_prompt = compute_bleu_chrf(reference_text, hypothesis_text)


## Using NLLB model only

In [None]:
import json
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from openai import OpenAI  # Assuming OpenAI API is set up

# Define the NLLB-200 model
model_name = "facebook/nllb-200-3.3B"
cache_directory = "/data/data_user_alpha/public_models"

# List of target languages (ensuring Spanish comes second)
languages = {
    "Spanish": 'spa_Latn',  # Target language
    "French": "fra_Latn",
    "German": "deu_Latn",
    "Portuguese": "por_Latn"
}

# Load translation model
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, cache_dir=cache_directory, torch_dtype=torch.float16)
model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_directory)
translator = pipeline("translation", model=model, tokenizer=tokenizer)

# OpenAI API Client
client = OpenAI(api_key="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")

def extract_keywords(sentence):
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Extract medical and non-medical keywords from the given sentence. return it as json format without extra things."},
            {"role": "user", "content": f"{sentence}"}
        ],
        temperature=0.5
    )
    keywords = json.loads(response.choices[0].message.content)
    return keywords  

def translate_keywords(keyword):
    translations = {"English": keyword}  # Ensure English is first
    for language, lang_code in languages.items():
        output = translator(keyword, src_lang="eng_Latn", tgt_lang=lang_code, max_length=400)
        translations[language] = output[0]['translation_text']
    return translations

def convert_to_chained_format(dictionary):
    chain = []
    
    for category, keywords in dictionary.items():
        for word, translations in keywords.items():
            ordered_translations = [translations["English"], translations["Spanish"]]  # Keep English first, Spanish second
            other_langs = {k: v for k, v in translations.items() if k not in ["English", "Spanish"]}
            ordered_translations.extend(other_langs.values())  # Append remaining languages
            
            phrase = ""
            for i, term in enumerate(ordered_translations):
                if i != len(ordered_translations) - 1:
                    phrase += f"'{term}' means "
                else:
                    phrase += f"'{term}'"
            chain.append(phrase)
    
    chained_text = ". ".join(chain) + "."
    return chained_text

def process_sentence(sentence):
    keywords = extract_keywords(sentence)
    medical_translations = {}
    non_medical_translations = {}

    # Translate medical keywords
    for keyword in keywords["medical_keywords"]:
        medical_translations[keyword] = translate_keywords(keyword)

    # Translate non-medical keywords
    for keyword in keywords["non_medical_keywords"]:
        non_medical_translations[keyword] = translate_keywords(keyword)

    result_json_temp = {
        "medical": medical_translations,
        "non_medical": non_medical_translations
    }

    chained_output = convert_to_chained_format(result_json_temp)

    return chained_output, result_json_temp


sentence = "Bariatric surgery is done when diet and exercise haven't worked or when you have serious health problems because of your weight."
COD_prompt, result_json_temp = process_sentence(sentence)
print(COD_prompt)


Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.35it/s]
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


'Bariatric surgery' means 'Cirugía bariátrica' means 'Chirurgie bariatrique' means 'Bariatrische Chirurgie' means 'Cirurgia bariátrica'. 'diet' means 'dieta y' means 'régime alimentaire' means 'Ernährung' means 'dieta'. 'exercise' means 'ejercicio' means 'exercice physique' means 'Übung' means 'exercício'. 'serious health problems' means 'problemas de salud graves' means 'problèmes de santé graves' means 'schwere gesundheitliche Probleme' means 'problemas de saúde graves'. 'weight' means 'peso' means 'poids' means 'Gewicht' means 'peso'.


## Translate all the data using NLLB except cod prompt translation

In [3]:
# List of target languages
languages = {
    "Spanish": 'spa_Latn',  # Target language for translation
    "English": "eng_Latn"   # Source language for back-translation
}

# Load translation model
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, cache_dir=cache_directory, torch_dtype=torch.float16)
model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_directory)
translator = pipeline("translation", model=model, tokenizer=tokenizer)

def translate_cod_prompt(COD_prompt,sentence):
    response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": f"{COD_prompt}"},
                    {"role": "user", "content": f"Translate the following text from English into Spanish: {sentence}"}],
            temperature=0.5
        )
    ans=(response.choices[0].message.content)
    return ans

def direct_translate(sentence):
    """
    Directly translates an English sentence to Spanish using NLLB.
    """
    output = translator(sentence, src_lang="eng_Latn", tgt_lang="spa_Latn", max_length=400)
    return output[0]['translation_text']

def back_translate(spa_tran):
    """
    Translates a Spanish sentence back into English using NLLB.
    """
    output = translator(spa_tran, src_lang="spa_Latn", tgt_lang="eng_Latn", max_length=400)
    return output[0]['translation_text']

## Evaluation
import sacrebleu
import evaluate

metric = evaluate.load("sacrebleu")
def compute_bleu_chrf(reference, hypothesis):
    """
    Computes the BLEU and chrF++ scores for a given reference and hypothesis.
    
    :param reference: List of reference translations (list of strings)
    :param hypothesis: The hypothesis translation (a single string)
    :return: A dictionary containing BLEU and chrF++ scores
    """
    # Ensure reference is wrapped in a list as sacrebleu expects a list of references
    # bleu_score = sacrebleu.corpus_bleu(hypothesis, [reference]).score
    # bleu_score = sacrebleu.corpus_bleu(hypothesis, [reference], tokenize="13a", lowercase=True).score
    bleu_score=metric.compute(predictions=[hypothesis], references=[reference])
    chrf_score = sacrebleu.corpus_chrf(hypothesis, [reference]).score

    return {"bleu_score": bleu_score['score'],"chrF++": chrf_score}


## Evaluation
output_data=[]
import tqdm
path="/home/mshahidul/project1/data2/extracted_files/eng_spa_pairs/medlineplus_pairs.txt"
f=open(path, "r").read().split("\n")
for x in tqdm.tqdm(f[:100]):
    xx=x.split("\t")
    sentence_eng=xx[0]
    sentence_spa=xx[1]
    COD_prompt,result_json_temp = process_sentence(sentence_eng)
    spa_tran=translate_cod_prompt(COD_prompt,sentence_eng)
    spa_tran_direct=direct_translate(sentence_eng)
    back_tran=back_translate(spa_tran)
    back_tran_direct=back_translate(spa_tran_direct)
    reference_text = [sentence_eng]
    hypothesis_text = back_tran
    scores_cod_prompt = compute_bleu_chrf(reference_text, hypothesis_text)
    hypothesis_text = back_tran_direct
    scores_direct = compute_bleu_chrf(reference_text, hypothesis_text)
    output_data.append({
        "Original_English_sentence": sentence_eng,
        "Original_Spanish_sentence": sentence_spa,
        "COD_prompt": COD_prompt,
        "spanish_translation": spa_tran,
        "spanish_translation_direct": spa_tran_direct,
        "back_translation": back_tran,
        "back_translation_direct": back_tran_direct,
        "scores_cod_prompt(bleu and chrf)": scores_cod_prompt,
        "scores_direct(bleu and chrf)": scores_direct
    })
    # print(scores_cod_prompt,scores_direct)
import pandas as pd
df = pd.DataFrame(output_data)

df.to_excel("/home/mshahidul/project1/data2/extracted_files/eng_spa_pairs/medlineplus_pairs_NLLB.xlsx")




Loading checkpoint shards: 100%|██████████| 3/3 [00:01<00:00,  1.53it/s]
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
100%|██████████| 100/100 [07:48<00:00,  4.69s/it]


## additional dictionary added

In [1]:
import mysql.connector
import json
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from openai import OpenAI  # Assuming OpenAI API is set up

# Database connection details
db_config = {
    'host': '172.16.34.1',
    'port': 3307,
    'user': 'umls',
    'password': 'umls',
    'database': 'umls2024'
}

# Define the NLLB-200 model
model_name = "facebook/nllb-200-3.3B"
cache_directory = "/data/data_user_alpha/public_models"

# List of target languages
languages = {
    "French": "fra_Latn",
    "German": "deu_Latn",
    "Portuguese": "por_Latn",
    "Spanish": 'spa_Latn'
}

# Load translation model
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, cache_dir=cache_directory, torch_dtype=torch.float16)
model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_directory)
translator = pipeline("translation", model=model, tokenizer=tokenizer)

# OpenAI API Client
client = OpenAI(api_key="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")


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 3/3 [00:03<00:00,  1.07s/it]
Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


In [2]:
def extract_keywords(sentence):
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Extract medical and non-medical keywords from the given sentence. return it as json format without extra things."},
                  {"role": "user", "content": f"{sentence}"}],
        temperature=0.5
    )
    # print(response.choices[0].message.content)
    keywords = json.loads(response.choices[0].message.content)
    return keywords  # Expected format: {"medical": ["keyword1", "keyword2"], "non_medical": ["keyword3", "keyword4"]}


In [4]:
def search_umls(keyword):
    try:
        connection = mysql.connector.connect(**db_config)
        cursor = connection.cursor(dictionary=True)
        cursor.execute("SELECT CUI FROM MRCONSO WHERE STR LIKE %s LIMIT 1", (f"%{keyword}%",))
        result = cursor.fetchone()
        if not result:
            return None
        cui = result["CUI"]

        cursor.execute("SELECT LAT, STR FROM MRCONSO WHERE CUI = %s AND LAT IN (%s, %s, %s)",
                       (cui, 'FRE', 'POR', 'GER'))
        rows = cursor.fetchall()
        
        translations = {row['LAT']: row['STR'] for row in rows}
        return translations
    except mysql.connector.Error as err:
        print(f"Database error: {err}")
    finally:
        if connection.is_connected():
            connection.close()

In [5]:

def translate_non_medical(keyword):
    translations = {}
    for language, lang_code in languages.items():
        output = translator(keyword, src_lang="eng_Latn", tgt_lang=lang_code, max_length=400)
        translations[language] = output[0]['translation_text']
    return translations

In [23]:
def convert_to_chained_format(dictionary, src_lang, target_lang):
    chain_umls = []
    chain_nllb = []

    # Process UMLS translations
    for word, translations in dictionary["medical"]["UMLS"].items():
        formatted_translations = []
        for lang, translation in translations.items():
            formatted_translations.append(f"{word} in {lang} is '{translation}'")
        chain_umls.append(". ".join(formatted_translations))
    
    # Process NLLB translations
    for word, translations in dictionary["medical"]["NLLB"].items():
        formatted_translations = []
        for lang, translation in translations.items():
            formatted_translations.append(f"{word} in {lang} is '{translation}'")
        chain_nllb.append(". ".join(formatted_translations))
    
    chained_text = "UMLS Translations: " + ". ".join(chain_umls) + ". " + "NLLB Translations: " + ". ".join(chain_nllb) + "."
    return chained_text

def process_sentence(sentence):
    keywords = extract_keywords(sentence)
    medical_translations_umls = {}
    medical_translations_nllb = {}
    non_medical_translations = {}

    # Process medical keywords
    for keyword in keywords["medical_keywords"]:
        translation = search_umls(keyword)
        if translation:  # If found in UMLS
            medical_translations_umls[keyword] = translation
        
        # Translate using NLLB regardless
        medical_translations_nllb[keyword] = {
            lang: translator(keyword, src_lang="eng_Latn", tgt_lang=code, max_length=400)[0]['translation_text'] 
            for lang, code in languages.items()
        }
    
    # Process non-medical keywords (always translated via NLLB)
    for keyword in keywords["non_medical_keywords"]:
        non_medical_translations[keyword] = translate_non_medical(keyword)

    result_json_temp = {
        "medical": {
            "UMLS": medical_translations_umls,
            "NLLB": medical_translations_nllb
        },
        "non_medical": non_medical_translations
    }

    src_language = "English"
    target_language = "Spanish"
    chained_output = convert_to_chained_format(result_json_temp, src_language, target_language)

    return {
        "chained_output": chained_output,
        "translations": result_json_temp
    }


In [24]:
ans=process_sentence("Bariatric surgery is done when diet and exercise haven't worked or when you have serious health problems because of your weight.")

In [26]:
ans['chained_output']

"UMLS Translations: diet in POR is 'Paraoxônio'. diet in FRE is 'Paraoxon'. diet in GER is 'Paraoxon'. exercise in FRE is 'Entraînement respiratoire'. exercise in GER is 'Atemübungen'. exercise in POR is 'Exercícios Respiratórios'. health problems in POR is 'Problemas Internacionais de Saúde'. health problems in GER is 'Internationale Gesundheitsprobleme'. health problems in FRE is 'Problème international de santé'. weight in FRE is 'Nourrisson à petit poids de naissance'. weight in POR is 'Recém-Nascido de Baixo Peso'. weight in GER is 'Neugeborenes, geringes Geburtsgewicht'. NLLB Translations: Bariatric surgery in French is 'Chirurgie bariatrique'. Bariatric surgery in German is 'Bariatrische Chirurgie'. Bariatric surgery in Portuguese is 'Cirurgia bariátrica'. Bariatric surgery in Spanish is 'Cirugía bariátrica'. diet in French is 'régime alimentaire'. diet in German is 'Ernährung'. diet in Portuguese is 'dieta'. diet in Spanish is 'dieta y'. exercise in French is 'exercice physique

In [None]:
def translate_cod_prompt(COD_prompt,sentence):
    response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": f"{COD_prompt}"},
                    {"role": "user", "content": f"Translate the following text from English into Spanish: {sentence}"}],
            temperature=0.5
        )
    ans=(response.choices[0].message.content)
    return ans

## Direct translate in Spanish
def direct_translate(sentence):
    response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                    {"role": "user", "content": f"Translate the following text from English into Spanish: {sentence}"}],
            temperature=0.5
        )
    return (response.choices[0].message.content)


## Back translate in English
def back_translate(spa_tran):
    response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                # {"role": "system", "content": f""},
                    {"role": "user", "content": f"Translate the following text from Spanish into English: {spa_tran}"}],
            temperature=0.5
        )
    return (response.choices[0].message.content)

## Evaluation
import sacrebleu
import evaluate

metric = evaluate.load("sacrebleu")
def compute_bleu_chrf(reference, hypothesis):
    """
    Computes the BLEU and chrF++ scores for a given reference and hypothesis.
    
    :param reference: List of reference translations (list of strings)
    :param hypothesis: The hypothesis translation (a single string)
    :return: A dictionary containing BLEU and chrF++ scores
    """
    # Ensure reference is wrapped in a list as sacrebleu expects a list of references
    # bleu_score = sacrebleu.corpus_bleu(hypothesis, [reference]).score
    # bleu_score = sacrebleu.corpus_bleu(hypothesis, [reference], tokenize="13a", lowercase=True).score
    bleu_score=metric.compute(predictions=[hypothesis], references=[reference])
    chrf_score = sacrebleu.corpus_chrf(hypothesis, [reference]).score

    return {"bleu_score": bleu_score['score'],"chrF++": chrf_score}


## Evaluation
output_data=[]
import tqdm
path="/home/mshahidul/project1/data2/extracted_files/eng_spa_pairs/medlineplus_pairs.txt"
f=open(path, "r").read().split("\n")
for x in tqdm.tqdm(f[:100]):
    xx=x.split("\t")
    sentence_eng=xx[0]
    sentence_spa=xx[1]
    COD_prompt = process_sentence(sentence_eng)
    spa_tran=translate_cod_prompt(COD_prompt['chained_output'],sentence_eng)
    spa_tran_direct=direct_translate(sentence_eng)
    back_tran=back_translate(spa_tran)
    back_tran_direct=back_translate(spa_tran_direct)
    reference_text = [sentence_eng]
    hypothesis_text = back_tran
    scores_cod_prompt = compute_bleu_chrf(reference_text, hypothesis_text)
    hypothesis_text = back_tran_direct
    scores_direct = compute_bleu_chrf(reference_text, hypothesis_text)
    output_data.append({
        "Original_English_sentence": sentence_eng,
        "Original_Spanish_sentence": sentence_spa,
        "COD_prompt": COD_prompt,
        "spanish_translation": spa_tran,
        "spanish_translation_direct": spa_tran_direct,
        "back_translation": back_tran,
        "back_translation_direct": back_tran_direct,
        "scores_cod_prompt(bleu and chrf)": scores_cod_prompt,
        "scores_direct(bleu and chrf)": scores_direct
    })
    # print(scores_cod_prompt,scores_direct)
import pandas as pd
df = pd.DataFrame(output_data)

# df.to_excel("/home/mshahidul/project1/data2/extracted_files/eng_spa_pairs/medlineplus_pairs_gpt4_mini.xlsx")



100%|██████████| 100/100 [17:43<00:00, 10.64s/it]


In [30]:
df.to_excel("/home/mshahidul/project1/data2/extracted_files/eng_spa_pairs/medlineplus_pairs_gpt4_mini_NLLB.xlsx")

In [29]:
# Compute average scores for COD-prompt-based and direct translations
total_bleu_cod = 0
total_chrf_cod = 0
total_bleu_direct = 0
total_chrf_direct = 0
count = len(output_data)

for entry in output_data:
    total_bleu_cod += entry["scores_cod_prompt(bleu and chrf)"]["bleu_score"]
    total_chrf_cod += entry["scores_cod_prompt(bleu and chrf)"]["chrF++"]
    total_bleu_direct += entry["scores_direct(bleu and chrf)"]["bleu_score"]
    total_chrf_direct += entry["scores_direct(bleu and chrf)"]["chrF++"]

# Calculate averages
avg_bleu_cod = total_bleu_cod / count
avg_chrf_cod = total_chrf_cod / count
avg_bleu_direct = total_bleu_direct / count
avg_chrf_direct = total_chrf_direct / count

# Print the results
print(f"Average BLEU Score (COD-Prompt-Based Translation): {avg_bleu_cod:.2f}")
print(f"Average chrF++ Score (COD-Prompt-Based Translation): {avg_chrf_cod:.2f}")
print(f"Average BLEU Score (Direct Translation): {avg_bleu_direct:.2f}")
print(f"Average chrF++ Score (Direct Translation): {avg_chrf_direct:.2f}")

# Save the scores to a file
evaluation_results = {
    "Average BLEU (COD-Prompt-Based Translation)": avg_bleu_cod,
    "Average chrF++ (COD-Prompt-Based Translation)": avg_chrf_cod,
    "Average BLEU (Direct Translation)": avg_bleu_direct,
    "Average chrF++ (Direct Translation)": avg_chrf_direct
}

df_scores = pd.DataFrame([evaluation_results])
df_scores.to_excel("/home/mshahidul/project1/data2/extracted_files/eng_spa_pairs/medlineplus_translation_scores.xlsx", index=False)


Average BLEU Score (COD-Prompt-Based Translation): 52.46
Average chrF++ Score (COD-Prompt-Based Translation): 2.03
Average BLEU Score (Direct Translation): 51.93
Average chrF++ Score (Direct Translation): 1.84
