In [None]:
# prompt3='''
# The chained multilingual dictionaries:
# <word X in source-language> means <word X in target-language> means 
# <word X in auxiliary-language 1> means <word X in auxiliary-language 2>.
# Translate the following text from <source-language> into <target-language>: <source-sentence>
# '''

In [None]:
import mysql.connector
import json
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from openai import OpenAI  # Assuming OpenAI API is set up

# Database connection details
db_config = {
    'host': '172.16.34.1',
    'port': 3307,
    'user': 'umls',
    'password': 'umls',
    'database': 'umls2024'
}

# Define the NLLB-200 model
model_name = "facebook/nllb-200-3.3B"
cache_directory = "/data/data_user_alpha/public_models"

# List of target languages
languages = {
    "French": "fra_Latn",
    "German": "deu_Latn",
    "Portuguese": "por_Latn",
    "Spanish": 'spa_Latn'
}

# Load translation model
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, cache_dir=cache_directory, torch_dtype=torch.float16)
model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_directory)
translator = pipeline("translation", model=model, tokenizer=tokenizer)

# OpenAI API Client
client = OpenAI(api_key="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")

def extract_keywords(sentence):
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Extract medical and non-medical keywords from the given sentence. return it as json format without extra things."},
                  {"role": "user", "content": f"{sentence}"}],
        temperature=0.5
    )
    # print(response.choices[0].message.content)
    keywords = json.loads(response.choices[0].message.content)
    return keywords  # Expected format: {"medical": ["keyword1", "keyword2"], "non_medical": ["keyword3", "keyword4"]}

def search_umls(keyword):
    try:
        connection = mysql.connector.connect(**db_config)
        cursor = connection.cursor(dictionary=True)
        cursor.execute("SELECT CUI FROM MRCONSO WHERE STR LIKE %s LIMIT 1", (f"%{keyword}%",))
        result = cursor.fetchone()
        if not result:
            return None
        cui = result["CUI"]

        cursor.execute("SELECT LAT, STR FROM MRCONSO WHERE CUI = %s AND LAT IN (%s, %s, %s)",
                       (cui, 'FRE', 'POR', 'GER'))
        rows = cursor.fetchall()
        
        translations = {row['LAT']: row['STR'] for row in rows}
        return translations
    except mysql.connector.Error as err:
        print(f"Database error: {err}")
    finally:
        if connection.is_connected():
            connection.close()

def translate_non_medical(keyword):
    translations = {}
    for language, lang_code in languages.items():
        output = translator(keyword, src_lang="eng_Latn", tgt_lang=lang_code, max_length=400)
        translations[language] = output[0]['translation_text']
    return translations

def convert_to_chained_format(dictionary, src_lang, target_lang):
    data={}
    for x, y in dictionary.items():
        for word,_ in dictionary[x].items():
            data[word]=[]
            data[word].append(word)
            for _, tran2 in dictionary[x][word].items():
                data[word].append(tran2)
    chain=[]
    for word, translations in data.items():
        ch=""
        for i,x in enumerate(translations):
            if i!=len(translations)-1:
                ch+=f"'{x}' means "
            else:
                ch+=f"'{x}'"
        chain.append(ch)
    chain=". ".join(chain)
    chain+=". "
    # chain=f"{chain}\nTranslate the following text from {src_lang} into {target_lang}:"
    return chain

def process_sentence(sentence):
    keywords = extract_keywords(sentence)
    medical_translations = {}
    non_medical_translations = {}

    # Process medical keywords
    for keyword in keywords["medical_keywords"]:
        translation = search_umls(keyword)
        if not translation:  # If keyword not found in UMLS, translate using NLLB
            translation = {lang: translator(keyword, src_lang="eng_Latn", tgt_lang=code, max_length=400)[0]['translation_text'] 
                           for lang, code in languages.items()}
        elif "SPA" not in translation:  # If Spanish missing in UMLS, use NLLB for Spanish
            translation["SPA"] = translator(keyword, src_lang="eng_Latn", tgt_lang="spa_Latn", max_length=400)[0]['translation_text']
        
        medical_translations[keyword] = translation

    # Process non-medical keywords (always translated via NLLB)
    for keyword in keywords["non_medical_keywords"]:
        non_medical_translations[keyword] = translate_non_medical(keyword)

    result_json_temp = {
        "medical": medical_translations,
        "non_medical": non_medical_translations
    }

    src_language = "English"
    target_language = "Spanish"
    chained_output = convert_to_chained_format(result_json_temp, src_language, target_language)

    return chained_output, result_json_temp

# Example usage
sentence = "Bariatric surgery is done when diet and exercise haven't worked or when you have serious health problems because of your weight."
COD_prompt,result_json_temp = process_sentence(sentence)
# print(COD_prompt)


## Translate using COD prompt in Spanish

In [None]:
def translate_cod_prompt(COD_prompt,sentence):
    response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": f"{COD_prompt}"},
                    {"role": "user", "content": f"Translate the following text from English into Spanish: {sentence}"}],
            temperature=0.5
        )
    ans=(response.choices[0].message.content)
    return ans
spa_tran=translate_cod_prompt(COD_prompt,sentence)
spa_tran

## Direct translate in Spanish

In [None]:
def direct_translate(sentence):
    response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                    {"role": "user", "content": f"Translate the following text from English into Spanish: {sentence}"}],
            temperature=0.5
        )
    return (response.choices[0].message.content)
spa_tran_direct=direct_translate(sentence)
spa_tran_direct

## Back translate in English

In [None]:
def back_translate(spa_tran):
    response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                # {"role": "system", "content": f""},
                    {"role": "user", "content": f"Translate the following text from Spanish into English: {spa_tran}"}],
            temperature=0.5
        )
    return (response.choices[0].message.content)
back_tran=back_translate(spa_tran)
back_tran

## Back translation in English without COD prompt

In [None]:
back_tran_direct=back_translate(spa_tran_direct)
back_tran_direct

## Evaluation

In [None]:
import sacrebleu
import evaluate

metric = evaluate.load("sacrebleu")
def compute_bleu_chrf(reference, hypothesis):
    """
    Computes the BLEU and chrF++ scores for a given reference and hypothesis.
    
    :param reference: List of reference translations (list of strings)
    :param hypothesis: The hypothesis translation (a single string)
    :return: A dictionary containing BLEU and chrF++ scores
    """
    # Ensure reference is wrapped in a list as sacrebleu expects a list of references
    # bleu_score = sacrebleu.corpus_bleu(hypothesis, [reference]).score
    # bleu_score = sacrebleu.corpus_bleu(hypothesis, [reference], tokenize="13a", lowercase=True).score
    bleu_score=metric.compute(predictions=[hypothesis], references=[reference])
    chrf_score = sacrebleu.corpus_chrf(hypothesis, [reference]).score

    return {"bleu_score": bleu_score['score'],"chrF++": chrf_score}

# Example usage
# reference_text = [sentence]
# hypothesis_text = back_tran
# scores = compute_bleu_chrf(reference_text, hypothesis_text)
# print("COD prompt: ",scores)
# reference_text = [sentence]
# hypothesis_text = back_tran_direct
# scores = compute_bleu_chrf(reference_text, hypothesis_text)
# print("Direct translation: ",scores)


In [None]:
## Evaluation
output_data=[]
import tqdm
path="/home/mshahidul/project1/data2/extracted_files/eng_spa_pairs/medlineplus_pairs.txt"
f=open(path, "r").read().split("\n")
for x in tqdm.tqdm(f[:100]):
    xx=x.split("\t")
    sentence_eng=xx[0]
    sentence_spa=xx[1]
    COD_prompt,result_json_temp = process_sentence(sentence_eng)
    spa_tran=translate_cod_prompt(COD_prompt,sentence_eng)
    spa_tran_direct=direct_translate(sentence_eng)
    back_tran=back_translate(spa_tran)
    back_tran_direct=back_translate(spa_tran_direct)
    reference_text = [sentence_eng]
    hypothesis_text = back_tran
    scores_cod_prompt = compute_bleu_chrf(reference_text, hypothesis_text)
    hypothesis_text = back_tran_direct
    scores_direct = compute_bleu_chrf(reference_text, hypothesis_text)
    output_data.append({
        "Original_English_sentence": sentence_eng,
        "Original_Spanish_sentence": sentence_spa,
        "COD_prompt": COD_prompt,
        "spanish_translation": spa_tran,
        "spanish_translation_direct": spa_tran_direct,
        "back_translation": back_tran,
        "back_translation_direct": back_tran_direct,
        "scores_cod_prompt(bleu and chrf)": scores_cod_prompt,
        "scores_direct(bleu and chrf)": scores_direct
    })
    # print(scores_cod_prompt,scores_direct)
import pandas as pd
df = pd.DataFrame(output_data)
df.to_csv("/home/mshahidul/project1/data2/extracted_files/eng_spa_pairs/medlineplus_pairs_gpt4_mini.csv")


In [None]:
df.to_excel("/home/mshahidul/project1/data2/extracted_files/eng_spa_pairs/medlineplus_pairs_gpt4_mini.xlsx")

## Performance check dataset

In [None]:
path="/home/mshahidul/project1/data2/extracted_files/eng_spa_pairs/medlineplus_pairs.txt"
f=open(path, "r").read().split("\n")
for x in f[:100]:
    xx=x.split("\t")
    sentence_eng=xx[0]
    sentence_spa=xx[1]
    COD_prompt,result_json_temp = process_sentence(sentence_eng)
    spa_tran=translate_cod_prompt(COD_prompt, sentence_eng)
    reference_text = [sentence_spa]
    hypothesis_text = spa_tran
    scores_cod_prompt = compute_bleu_chrf(reference_text, hypothesis_text)

# COD + syn (test1)

In [None]:
import mysql.connector
import json
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from openai import OpenAI  # Assuming OpenAI API is set up
from utils import get_synonyms, back_translate, compute_bleu_chrf
from openai import OpenAI 
import os
client = OpenAI(api_key="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")
import json
def translate_using_prompt(prompt,sentence):
    response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": f"{prompt}"},
                    {"role": "user", "content": f"Translate the following text from English into Spanish: {sentence}"}],
            temperature=0.5
        )
    ans=(response.choices[0].message.content)
    return ans

db_config = {
    'host': '172.16.34.1',
    'port': 3307,
    'user': 'umls',
    'password': 'umls',
    'database': 'umls2024'
}

# Define the NLLB-200 model
model_name = "facebook/nllb-200-3.3B"
cache_directory = "/data/data_user_alpha/public_models"

# List of target languages
languages = {
    "French": "fra_Latn",
    "German": "deu_Latn",
    "Portuguese": "por_Latn",
    "Spanish": 'spa_Latn'
}

# Load translation model
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, cache_dir=cache_directory, torch_dtype=torch.float16)
model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_directory)
translator = pipeline("translation", model=model, tokenizer=tokenizer)


def extract_keywords(sentence):
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Extract medical and non-medical keywords from the given sentence. return it as json format without extra things."},
                  {"role": "user", "content": f"{sentence}"}],
        temperature=0.5
    )
    # print(response.choices[0].message.content)
    keywords = json.loads(response.choices[0].message.content)
    return keywords  # Expected format: {"medical": ["keyword1", "keyword2"], "non_medical": ["keyword3", "keyword4"]}

def search_umls(keyword):
    try:
        connection = mysql.connector.connect(**db_config)
        cursor = connection.cursor(dictionary=True)
        cursor.execute("SELECT CUI FROM MRCONSO WHERE STR LIKE %s LIMIT 1", (f"%{keyword}%",))
        result = cursor.fetchone()
        if not result:
            return None
        cui = result["CUI"]

        cursor.execute("SELECT LAT, STR FROM MRCONSO WHERE CUI = %s AND LAT IN (%s, %s, %s)",
                       (cui, 'FRE', 'POR', 'GER'))
        rows = cursor.fetchall()
        
        translations = {row['LAT']: row['STR'] for row in rows}
        return translations
    except mysql.connector.Error as err:
        print(f"Database error: {err}")
    finally:
        if connection.is_connected():
            connection.close()

# def translate_non_medical(keyword):
#     translations = {}
#     for language, lang_code in languages.items():
#         output = translator(keyword, src_lang="eng_Latn", tgt_lang=lang_code, max_length=400)
#         translations[language] = output[0]['translation_text']
#     return translations

def convert_to_chained_format(dictionary, src_lang, target_lang):
    chain = []
    
    for category, words in dictionary.items():
        for word, translations in words.items():
            formatted_translations = []
            
            # Ensure the source language is first, target language second, then others
            ordered_languages = ["ENG", "SPA", "FRE", "GER", "POR"]
            
            for lang in ordered_languages:
                if lang in translations:
                    formatted_translations.append(f"'{translations[lang]}'")

            chain.append(" means ".join(formatted_translations))
    
    chained_text = ". ".join(chain) + "."
    return chained_text

def process_sentence(sentence):
    keywords = extract_keywords(sentence)
    medical_translations = {}
    output=[]
    # Process medical keywords
    for keyword in keywords["medical_keywords"]:
        translation={}
        translation = search_umls(keyword)
        synonyms=get_synonyms1(keyword)
        # print(keyword)
        if synonyms:
            output.append(f"'{keyword}' synonyms are [{', '.join(synonyms)}].")
        if translation and "SPA" not in translation:  # If Spanish missing in UMLS, use NLLB for Spanish
            translation["SPA"] = translator(keyword, src_lang="eng_Latn", tgt_lang="spa_Latn", max_length=400)[0]['translation_text']
        
        if translation:
            translation['ENG'] = keyword
            medical_translations[keyword] = translation

    result_json_temp = {
        "medical": medical_translations
    }

    src_language = "English"
    target_language = "Spanish"
    output2=" ".join(output)
    if medical_translations:
        chained_output = convert_to_chained_format(result_json_temp, src_language, target_language)
        full_prompt="Chain of dictionary: "+chained_output+"\n\nSynonyms: "+output2
    else:
        full_prompt="Synonyms: "+output2

    return full_prompt, medical_translations, keywords,output

# Example usage
sentence = "A stress fracture is a hairline crack in the bone that develops because of repeated or prolonged forces against the bone."
full_prompt, medical_translations, keywords,output = process_sentence(sentence)
print(full_prompt)


Loading checkpoint shards: 100%|██████████| 3/3 [00:02<00:00,  1.49it/s]
Device set to use cuda:0


Chain of dictionary: 'stress fracture' means 'fractura por esfuerzo' means 'Fractures de fatigue' means 'Frakturen, Streß-' means 'Fraturas de Estresse'. 'bone' means 'hueso' means 'Maladies osseuses' means 'Knochenkrankheiten' means 'Doenças Ósseas'.

Synonyms: 'stress fracture' synonyms are [stress, fracture]. 'hairline crack' synonyms are [hairline, crack]. 'bone' synonyms are [bone]. 'repeated forces' synonyms are [repeated, forces]. 'prolonged forces' synonyms are [prolonged, forces].


In [5]:
from utils import get_synonyms1
synonyms=get_synonyms1("head injury")
synonyms

['HEAD INJ CLOSED',
 'Nonpenetrating Head Injury',
 'close head injuries',
 'INJ CLOSED HEAD',
 'HEAD INJURY CLOSED',
 'Closed injury of head (disorder)',
 'Injury;closed head',
 'Nonpenetrating Head Injuries',
 'Injuries, Closed Head',
 'CLOSED HEAD INJ',
 'Closed Head Traumas',
 'Head Injury, Closed',
 'Head Injuries, Nonpenetrating',
 'Closed Head Trauma',
 'Head Injuries, Closed',
 'Head Traumas, Closed',
 'Head Injury, Nonpenetrating',
 'Closed Head Injury',
 'Closed head injuries',
 'Head Trauma, Closed',
 'Closed injury of head',
 'HEAD INJ NONPENETRATING',
 'close head injury']

## use orginal spanish text as reference text

In [None]:
# import tqdm
# total_score=[]
# file_path = "/home/mshahidul/project1/all_tran_data/Sampled_100_MedlinePlus_eng_spanish_pair.json"
# from utils import compute_bleu_chrf
# with open(file_path, 'r', encoding='utf-8') as json_file:
#     original_file = json.load(json_file)

# for line in tqdm.tqdm(original_file):
#     full_prompt, medical_translations, keywords,output = process_sentence(line['english'])
#     hypothesis_text=translate_using_prompt(full_prompt,sentence_eng)
#     reference_text = line['spanish']
#     score=compute_bleu_chrf(reference_text, hypothesis_text)  
#     total_score.append({
#         "original_english": line['english'],
#         "original_spanish": line['spanish'],
#         "translated_spanish": hypothesis_text,
#         "bleu_score": score
#     })

In [None]:
## Evaluation
from utils import translate_using_prompt
output_data = []
file_path = "/home/mshahidul/project1/all_tran_data/dataset/Sampled_100_MedlinePlus_eng_spanish_pair.json"
import tqdm
import json

with open(file_path, 'r', encoding='utf-8') as json_file:
    sampled_medlineplus_data = json.load(json_file)

for x in tqdm.tqdm(sampled_medlineplus_data):
    sentence_eng = x['english']
    sentence_spa = x['spanish']
    
    try:
        full_prompt, medical_translations, keywords, output = process_sentence(sentence_eng)

        output_data.append({
            "Original_English_sentence": sentence_eng,
            "Original_Spanish_sentence": sentence_spa,
            "COD_prompt": full_prompt,
            "medical_translations": medical_translations,
            "keywords": keywords,
            "synonyms": output
        })
    except Exception as e:
        print(f"Error: {e}!!!!")
        continue

json_path = "/home/mshahidul/project1/all_tran_data/dataset/medlineplus_info.json"
with open(json_path, 'w', encoding='utf-8') as json_file:
    json.dump(output_data, json_file, ensure_ascii=False, indent=4)

print(f"Filtered data saved to {json_path}")


  6%|▌         | 6/100 [00:54<14:08,  9.03s/it]

Error: Unread result found!!!!


  7%|▋         | 7/100 [00:59<12:03,  7.78s/it]

## Use back translation to evaluation the model

In [2]:
## Evaluation
from utils import translate_using_prompt
output_data=[]
file_path = "/home/mshahidul/project1/all_tran_data/dataset/Sampled_100_MedlinePlus_eng_spanish_pair.json"
import tqdm
import json

with open(file_path, 'r', encoding='utf-8') as json_file:
    sampled_medlineplus_data = json.load(json_file)

for x in tqdm.tqdm(sampled_medlineplus_data):
    sentence_eng = x['english']
    sentence_spa = x['spanish']
    
    try:
        full_prompt, medical_translations, keywords,output = process_sentence(sentence_eng)
        spa_tran_prompt = translate_using_prompt(full_prompt, sentence_eng)
        back_tran_prompt = back_translate(spa_tran_prompt)
        
        reference_text = [sentence_eng]
        hypothesis_text = back_tran_prompt
        scores_cod_prompt = compute_bleu_chrf(reference_text, hypothesis_text)

        output_data.append({
            "Original_English_sentence": sentence_eng,
            "Original_Spanish_sentence": sentence_spa,
            "keywords":keywords,
            "COD_prompt": full_prompt,
            "spanish_translation_prompt": spa_tran_prompt,
            "back_translation_prompt": back_tran_prompt,
            "scores_cod_prompt(bleu and chrf)": scores_cod_prompt
        })
    except Exception as e:
        print(f"Error: {e}!!!!")
        continue

json_path = "/home/mshahidul/project1/results_new/medlineplus_gpt4_mini_COD_(syn)_back_translation.json"
with open(json_path, 'w', encoding='utf-8') as json_file:
    json.dump(output_data, json_file, ensure_ascii=False, indent=4)

print(f"Data saved to {json_path}")


  1%|          | 1/100 [00:27<45:23, 27.51s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 100/100 [23:42<00:00, 14.22s/it]

Data saved to /home/mshahidul/project1/results_new/medlineplus_gpt4_mini_COD_(syn)_back_translation.json





In [4]:
avg_bleu_score = sum([x['scores_cod_prompt(bleu and chrf)']['bleu_score'] for x in output_data]) / len(output_data)

print(f"Average BLEU Score: {avg_bleu_score:.4f}")

Average BLEU Score: 60.5105


## Ehr data

In [5]:
## Evaluation
from utils import translate_using_prompt
output_data = []
import tqdm
import pandas as pd
import json

ehr_data = pd.read_excel('/home/mshahidul/project1/all_tran_data/dataset/EHR_data.xlsx')

for eng, spa in tqdm.tqdm(zip(ehr_data["english"], ehr_data["spain"])):
    sentence_eng = eng
    sentence_spa = spa
    try:
        full_prompt, medical_translations, keywords, output = process_sentence(sentence_eng)
        spa_tran_prompt = translate_using_prompt(full_prompt, sentence_eng)
        back_tran_prompt = back_translate(spa_tran_prompt)

        reference_text = [sentence_eng]
        hypothesis_text = back_tran_prompt
        scores_cod_prompt = compute_bleu_chrf(reference_text, hypothesis_text)

        output_data.append({
            "Original_English_sentence": sentence_eng,
            "Original_Spanish_sentence": sentence_spa,
            "COD_prompt": full_prompt,
            "spanish_translation_prompt": spa_tran_prompt,
            "back_translation_prompt": back_tran_prompt,
            "scores_cod_prompt(bleu and chrf)": scores_cod_prompt
        })
    except Exception as e:
        print(f"Error: {e}!!!!")
        continue

json_path = "/home/mshahidul/project1/results_new/ehr_gpt4_mini_COD_(syn)_back_translation.json"
with open(json_path, 'w', encoding='utf-8') as json_file:
    json.dump(output_data, json_file, ensure_ascii=False, indent=4)

print(f"Data saved to {json_path}")


108it [35:59, 19.99s/it]

Data saved to /home/mshahidul/project1/results_new/ehr_gpt4_mini_COD_(syn)_back_translation.json





# COD + syn (test2)

In [None]:
import mysql.connector
import json
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from openai import Open
AI  # Assuming OpenAI API is set up
from utils import get_synonyms, back_translate, compute_bleu_chrf
from openai import OpenAI 
client = OpenAI(api_key="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")
import json
def translate_using_prompt(prompt,sentence):
    response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": f"{prompt}"},
                    {"role": "user", "content": f"Translate the following text from English into Spanish based on above context: {sentence}"}],
            temperature=0.5
        )
    ans=(response.choices[0].message.content)
    return ans
def back_translate(spa_tran):
    response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                # {"role": "system", "content": f""},
                    {"role": "user", "content": f"Translate the following text from Spanish into English: {spa_tran}"}],
            temperature=0.5
        )
    return (response.choices[0].message.content)

# Database connection details
db_config = {
    'host': '172.16.34.1',
    'port': 3307,
    'user': 'umls',
    'password': 'umls',
    'database': 'umls2024'
}

# Define the NLLB-200 model
model_name = "facebook/nllb-200-3.3B"
cache_directory = "/data/data_user_alpha/public_models"

# List of target languages
languages = {
    "FRE": "fra_Latn",
    "GER": "deu_Latn",
    "POR": "por_Latn",
    "SPA": 'spa_Latn'
}

# Load translation model
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, cache_dir=cache_directory, torch_dtype=torch.float16)
model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_directory)
translator = pipeline("translation", model=model, tokenizer=tokenizer)

In [None]:


import ast
def extract_keywords(sentence):
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Extract medical keywords from the given sentence. return it as python list format without extra things."},
                  {"role": "user", "content": f"{sentence}"}],
        temperature=0.5
    )
    keywords = (response.choices[0].message.content)
    words_list = ast.literal_eval(keywords)
    return words_list  # Expected format: {"medical": ["keyword1", "keyword2"], "non_medical": ["keyword3", "keyword4"]}

def search_umls(keyword):
    try:
        connection = mysql.connector.connect(**db_config)
        cursor = connection.cursor(dictionary=True)
        
        def find_cui(term):
            cursor.execute("SELECT CUI FROM MRCONSO WHERE STR LIKE %s LIMIT 1", (f"%{term}%",))
            return cursor.fetchone()
        
        def find_translations(cui):
            cursor.execute("SELECT LAT, STR FROM MRCONSO WHERE CUI = %s AND LAT IN (%s, %s, %s)",
                           (cui, 'FRE', 'POR', 'GER'))
            return {row['LAT']: row['STR'] for row in cursor.fetchall()}
        
        results = {}
        words = keyword.split()
        for word in words:
            result = find_cui(word)
            if result:
                results[word] = find_translations(result["CUI"])
        
        return results if results else None
    except mysql.connector.Error as err:
        print(f"Database error: {err}")
    finally:
        if connection.is_connected():
            connection.close()
    

# def translate_non_medical(keyword):
#     translations = {}
#     for language, lang_code in languages.items():
#         output = translator(keyword, src_lang="eng_Latn", tgt_lang=lang_code, max_length=400)
#         translations[language] = output[0]['translation_text']
#     return translations

def convert_to_chained_format(dictionary, src_lang, target_lang):
    chain = []
    
    for category, words in dictionary.items():
        for word, translations in words.items():
            formatted_translations = []
            
            # Ensure the source language is first, target language second, then others
            ordered_languages = ["ENG", "SPA", "FRE", "GER", "POR"]
            
            for lang in ordered_languages:
                if lang in translations:
                    formatted_translations.append(f"'{word}' in {lang} is '{translations[lang]}'")

            chain.append(". ".join(formatted_translations))
    
    chained_text = ". ".join(chain) + "."
    return chained_text
def translate_keywords_NLLB(keyword):
    translations = {"ENG": keyword}  # Ensure English is first
    for language, lang_code in languages.items():
        output = translator(keyword, src_lang="eng_Latn", tgt_lang=lang_code, max_length=400)
        translations[language] = output[0]['translation_text']
    return translations

def get_keywords(sentence):
    df = pd.read_excel("/home/mshahidul/project1/testing_dataset_modified.xlsx")
    result = df.loc[df['sentence'] == sentence, 'keywords']
    return result.iloc[0] if not result.empty else None

def process_sentence(sentence):
    # keywords = extract_keywords(sentence)
    keywords=get_keywords(sentence)
    import ast
    keywords=ast.literal_eval(keywords)
    # print(keywords)
    medical_translations = {}
    synonyms_list=[]
    # Process medical keywords
    for keyword in keywords:
        translation = search_umls(keyword)
        translation={} if translation is None else translation
        synonyms=get_synonyms(keyword)
        # print(keyword)
        if synonyms:
            synonyms_list.append(f"'{keyword}' synonyms are [{', '.join(synonyms)}].")
        if translation:  # If Spanish missing in UMLS, use NLLB for Spanish
            translation=translate_keywords_NLLB(keyword)
        else:
            translation["SPA"] = translator(keyword, src_lang="eng_Latn", tgt_lang="spa_Latn", max_length=400)[0]['translation_text']
        
        if translation:
            translation['ENG'] = keyword
            medical_translations[keyword] = translation

    result_json_temp = {
        "medical": medical_translations
    }

    src_language = "English"
    target_language = "Spanish"
    output2=" ".join(synonyms_list)
    if medical_translations:
        chained_output = convert_to_chained_format(result_json_temp, src_language, target_language)
        full_prompt="Chain of dictionary: "+chained_output+"\n\nSynonyms: "+output2
    else:
        full_prompt="Synonyms: "+output2

    return full_prompt, medical_translations, output2

# Example usage
# sentence = "If the broken bone punctures the skin, it is called an open fracture (compound fracture)."
# full_prompt,result_json_temp,_1 = process_sentence(sentence)
# print(full_prompt)


In [None]:
output_data=[]
import tqdm
path="/home/mshahidul/project1/data2/extracted_files/eng_spa_pairs/medlineplus_pairs.txt"
f=open(path, "r").read().split("\n")
for x in tqdm.tqdm(f[:100]):
    xx=x.split("\t")
    sentence_eng=xx[0]
    sentence_spa=xx[1]
    full_prompt,result_json_temp,_ = process_sentence(sentence_eng)
    spa_tran=translate_using_prompt(full_prompt,sentence_eng)
    # spa_tran_direct=direct_translate(sentence_eng)
    back_tran=back_translate(spa_tran)
    # back_tran_direct=back_translate(spa_tran_direct)
    reference_text = [sentence_eng]
    hypothesis_text = back_tran
    scores_cod_prompt = compute_bleu_chrf(reference_text, hypothesis_text)
    # hypothesis_text = back_tran_direct
    # scores_direct = compute_bleu_chrf(reference_text, hypothesis_text)
    output_data.append({
        "Original_English_sentence": sentence_eng,
        "Original_Spanish_sentence": sentence_spa,
        "COD_prompt": full_prompt,
        "spanish_translation": spa_tran,
        # "spanish_translation_direct": spa_tran_direct,
        "back_translation": back_tran,
        # "back_translation_direct": back_tran_direct,
        "scores_cod_prompt(bleu and chrf)": scores_cod_prompt,
        # "scores_direct(bleu and chrf)": scores_direct
    })
    # print(scores_cod_prompt,scores_direct)

In [None]:
# Compute average scores for COD-prompt-based and direct translations
total_bleu_cod = 0
total_chrf_cod = 0
total_bleu_direct = 0
total_chrf_direct = 0
count = len(output_data)

for entry in output_data:
    total_bleu_cod += entry["scores_cod_prompt(bleu and chrf)"]["bleu_score"]
    total_chrf_cod += entry["scores_cod_prompt(bleu and chrf)"]["chrF++"]


# Calculate averages
avg_bleu_cod = total_bleu_cod / count
avg_chrf_cod = total_chrf_cod / count


# Print the results
print(f"Average BLEU Score (COD-Prompt-Based Translation): {avg_bleu_cod:.2f}")
print(f"Average chrF++ Score (COD-Prompt-Based Translation): {avg_chrf_cod:.2f}")

## Extra code

In [None]:
import mysql.connector
import json
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from openai import OpenAI  # Assuming OpenAI API is set up
from utils import *
# Database connection details
db_config = {
    'host': '172.16.34.1',
    'port': 3307,
    'user': 'umls',
    'password': 'umls',
    'database': 'umls2024'
}

# Define the NLLB-200 model
model_name = "facebook/nllb-200-3.3B"
cache_directory = "/data/data_user_alpha/public_models"

# List of target languages
languages = {
    "French": "fra_Latn",
    "German": "deu_Latn",
    "Portuguese": "por_Latn",
    "Spanish": 'spa_Latn'
}

# Load translation model
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name, cache_dir=cache_directory, torch_dtype=torch.float16)
model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_directory)
translator = pipeline("translation", model=model, tokenizer=tokenizer)

# OpenAI API Client
client = OpenAI(api_key="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")



def search_umls(keyword):
    try:
        connection = mysql.connector.connect(**db_config)
        cursor = connection.cursor(dictionary=True)
        cursor.execute("SELECT CUI FROM MRCONSO WHERE STR LIKE %s LIMIT 1", (f"%{keyword}%",))
        result = cursor.fetchone()
        if not result:
            return None
        cui = result["CUI"]

        cursor.execute("SELECT LAT, STR FROM MRCONSO WHERE CUI = %s AND LAT IN (%s, %s, %s)",
                       (cui, 'FRE', 'POR', 'GER'))
        rows = cursor.fetchall()
        
        translations = {row['LAT']: row['STR'] for row in rows}
        return translations
    except mysql.connector.Error as err:
        print(f"Database error: {err}")
    finally:
        if connection.is_connected():
            connection.close()

def translate_non_medical(keyword):
    translations = {}
    for language, lang_code in languages.items():
        output = translator(keyword, src_lang="eng_Latn", tgt_lang=lang_code, max_length=400)
        translations[language] = output[0]['translation_text']
    return translations


def convert_to_chained_format(dictionary, src_lang, target_lang):
    chain = []
    
    for category, words in dictionary.items():
        for word, translations in words.items():
            formatted_translations = []
            
            # Ensure the source language is first, target language second, then others
            ordered_languages = ["English", "Spanish", "French", "German", "Portuguese"]
            
            for lang in ordered_languages:
                if lang in translations:
                    formatted_translations.append(f"{word} in {lang} is '{translations[lang]}'")

            chain.append(". ".join(formatted_translations))
    
    chained_text = ". ".join(chain) + "."
    return chained_text


def process_sentence(sentence):
    keywords = extract_keywords(sentence)
    medical_translations = {}
    non_medical_translations = {}

    # Process medical keywords
    for keyword in keywords["medical_keywords"]:
        translation = search_umls(keyword)
        if "SPA" not in translation and translation:  # If Spanish missing in UMLS, use NLLB for Spanish
            translation["SPA"] = translator(keyword, src_lang="eng_Latn", tgt_lang="spa_Latn", max_length=400)[0]['translation_text']
        if translation:
            medical_translations[keyword] = translation

    # Process non-medical keywords (always translated via NLLB)
    for keyword in keywords["non_medical_keywords"]:
        non_medical_translations[keyword] = translate_non_medical(keyword)

    result_json_temp = {
        "medical": medical_translations,
        "non_medical": non_medical_translations
    }

    src_language = "English"
    target_language = "Spanish"
    chained_output = convert_to_chained_format(result_json_temp, src_language, target_language)

    return chained_output, result_json_temp

# Example usage
sentence = "Bariatric surgery is done when diet and exercise haven't worked or when you have serious health problems because of your weight."
COD_prompt,result_json_temp = process_sentence(sentence)
print(COD_prompt)
