In [None]:
# Print the `pip install` command for the Unsloth extra that matches the local
# torch version, the CUDA version torch was built with, and the GPU generation.
try:
    import torch
except ImportError as exc:
    # Only a missing package should produce the install hint; a bare `except:`
    # would also swallow KeyboardInterrupt and hide unrelated import failures.
    raise ImportError('Install torch via `pip install torch`') from exc
from packaging.version import Version as V
v = V(torch.__version__)
cuda = str(torch.version.cuda)
# Compute-capability major version 8 or higher selects the "-ampere" extra.
# NOTE: this call needs a visible CUDA device.
is_ampere = torch.cuda.get_device_capability()[0] >= 8
if cuda != "12.1" and cuda != "11.8" and cuda != "12.4":
    raise RuntimeError(f"CUDA = {cuda} not supported!")
# Map the torch version onto the name of the matching Unsloth extra.
if   v <= V('2.1.0'):
    raise RuntimeError(f"Torch = {v} too old!")
elif v <= V('2.1.1'):
    x = 'cu{}{}-torch211'
elif v <= V('2.1.2'):
    x = 'cu{}{}-torch212'
elif v  < V('2.3.0'):
    x = 'cu{}{}-torch220'
elif v  < V('2.4.0'):
    x = 'cu{}{}-torch230'
elif v  < V('2.5.0'):
    x = 'cu{}{}-torch240'
elif v  < V('2.6.0'):
    x = 'cu{}{}-torch250'
else:
    raise RuntimeError(f"Torch = {v} too new!")
# First placeholder: CUDA version without the dot; second: optional "-ampere".
x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "")
print(f'pip install --upgrade pip && pip install "unsloth[{x}] @ git+https://github.com/unslothai/unsloth.git"')

In [None]:
import mysql.connector
import json

# Database connection details.
# Every value can be overridden through an environment variable so credentials
# do not have to be edited into the notebook; the defaults reproduce the
# settings that were previously hard-coded here.
import os

db_config = {
    'host': os.environ.get('UMLS_DB_HOST', '172.16.34.1'),
    'port': int(os.environ.get('UMLS_DB_PORT', '3307')),
    'user': os.environ.get('UMLS_DB_USER', 'umls'),
    'password': os.environ.get('UMLS_DB_PASSWORD', 'umls'),
    'database': os.environ.get('UMLS_DB_NAME', 'umls2024'),
}

def search_umls_by_language(keyword):
    """Look up a keyword in MRCONSO and return its French/Portuguese/German terms.

    The first MRCONSO row whose STR contains ``keyword`` supplies the CUI; the
    FRE, POR and GER strings stored for that CUI are then collected.

    Args:
        keyword: Text matched as a substring against MRCONSO.STR.

    Returns:
        A JSON string mapping LAT to STR, the string "not found" when nothing
        matches (or the keyword is blank), or None when the connection is not
        established or a MySQL error occurs (the error is printed).
    """
    # A blank keyword would turn into LIKE '%%', which matches every row and
    # would return an arbitrary concept; treat it as "no match" up front.
    if not keyword or not keyword.strip():
        print(f"No results found for keyword: {keyword}")
        return "not found"

    connection = None  # stays None when connect() itself raises
    try:
        # Connect to the database
        connection = mysql.connector.connect(**db_config)
        if not connection.is_connected():
            return None
        print("Connected to the UMLS database")

        cursor = connection.cursor(dictionary=True)

        # Step 1: Find the CUI for the keyword
        cursor.execute("SELECT CUI FROM MRCONSO WHERE STR LIKE %s LIMIT 1", (f"%{keyword}%",))
        result = cursor.fetchone()
        if not result:
            print(f"No results found for keyword: {keyword}")
            return "not found"
        cui = result["CUI"]
        print(f"Found CUI: {cui}")

        # Step 2: Query for STR values for specific languages
        languages = ['FRE', 'POR', 'GER']
        cursor.execute(
            "SELECT LAT, STR FROM MRCONSO WHERE CUI = %s AND LAT IN (%s, %s, %s)",
            (cui, *languages)
        )
        rows = cursor.fetchall()

        # Step 3: Format the output in JSON.
        # One entry per language: when several rows share a LAT, the last
        # row returned overwrites the earlier ones.
        language_data = {row['LAT']: row['STR'] for row in rows}
        output_json = json.dumps(language_data, indent=4, ensure_ascii=False)
        print("\nResults in JSON format:")
        print(output_json)

        return output_json

    except mysql.connector.Error as err:
        print(f"Error: {err}")
        return None
    finally:
        if connection is not None and connection.is_connected():
            connection.close()
            print("Database connection closed")

# Use the function to search for a keyword.
# input() waits for a typed value, so this cell pauses an unattended "Run All".
keyword = input("Enter a keyword to search: ")
search_umls_by_language(keyword)


In [None]:
import mysql.connector

# Database connection details.
# Every value can be overridden through an environment variable so credentials
# do not have to be edited into the notebook; the defaults reproduce the
# settings that were previously hard-coded here.
import os

db_config = {
    'host': os.environ.get('UMLS_DB_HOST', '172.16.34.1'),
    'port': int(os.environ.get('UMLS_DB_PORT', '3307')),
    'user': os.environ.get('UMLS_DB_USER', 'umls'),
    'password': os.environ.get('UMLS_DB_PASSWORD', 'umls'),
    'database': os.environ.get('UMLS_DB_NAME', 'umls2024'),
}

# Text report written by the table-dump code below (relative to the working directory).
output_file = "umls_tables_info.txt"

# Write the list of tables and the column layout of every table to `output_file`.
# Reset the name first: at module level a `connection` left over from an
# earlier run of this cell would otherwise be inspected by the cleanup code.
connection = None
try:
    # Connect to the MySQL database
    connection = mysql.connector.connect(**db_config)

    if connection.is_connected():
        print("Connected to the UMLS database")

        # Create a cursor object to execute queries
        cursor = connection.cursor()

        # Explicit UTF-8 so the report does not depend on the platform default encoding.
        with open(output_file, "w", encoding="utf-8") as file:
            # List all tables in the database
            cursor.execute("SHOW TABLES;")
            tables = cursor.fetchall()
            file.write("Tables in the database:\n")
            for table in tables:
                file.write(f"{table[0]}\n")
            file.write("\nTable structures:\n")

            # Get the structure of each table
            for table in tables:
                table_name = table[0]
                file.write(f"\nStructure of table '{table_name}':\n")
                # Identifiers cannot be sent as bound parameters, so quote the
                # name with backticks (doubling any embedded backtick) to keep
                # unusual table names from breaking or altering the statement.
                quoted_name = "`" + str(table_name).replace("`", "``") + "`"
                cursor.execute(f"DESCRIBE {quoted_name};")
                structure = cursor.fetchall()
                for column in structure:
                    file.write(f"  Column: {column[0]}, Type: {column[1]}, Null: {column[2]}, Key: {column[3]}, Default: {column[4]}\n")

        print(f"Table information saved to '{output_file}'")

except mysql.connector.Error as err:
    print(f"Error: {err}")

finally:
    # Close the connection
    if connection is not None and connection.is_connected():
        connection.close()
        print("Database connection closed")


In [None]:
import mysql.connector
import json
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
import torch

# Database connection details.
# Every value can be overridden through an environment variable so credentials
# do not have to be edited into the notebook; the defaults reproduce the
# settings that were previously hard-coded here.
import os

db_config = {
    'host': os.environ.get('UMLS_DB_HOST', '172.16.34.1'),
    'port': int(os.environ.get('UMLS_DB_PORT', '3307')),
    'user': os.environ.get('UMLS_DB_USER', 'umls'),
    'password': os.environ.get('UMLS_DB_PASSWORD', 'umls'),
    'database': os.environ.get('UMLS_DB_NAME', 'umls2024'),
}

# Define the NLLB-200 model and its configuration
model_name = "facebook/nllb-200-3.3B"
cache_directory = "/data/data_user_alpha/public_models"

# Target languages: display name -> language code passed to the translator as `tgt_lang`
languages = {
    "French": "fra_Latn",
    "German": "deu_Latn",
    "Portuguese": "por_Latn",
    "Bangla": 'ben_Beng'
}

# Load model and tokenizer for translation.
# "cuda:2" addresses the third GPU, so at least three CUDA devices must be visible.
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    cache_dir=cache_directory,
    # Half precision only on GPU; the CPU fallback loads full-precision weights.
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
)
model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_directory)
# Name the device explicitly instead of relying on the pipeline to infer it
# from the model, so inputs are placed on the same device as the weights.
translator = pipeline("translation", model=model, tokenizer=tokenizer, device=device)


def search_umls_by_language(keyword):
    """Return UMLS terms for a keyword, falling back to machine translation.

    The first MRCONSO row whose STR contains ``keyword`` supplies the CUI; the
    FRE, POR and GER strings stored for that CUI are returned. When no row
    matches, ``translate_keyword(keyword)`` is returned instead.

    Args:
        keyword: Text matched as a substring against MRCONSO.STR.

    Returns:
        A JSON string (LAT -> STR, or the value of ``translate_keyword``), or
        None when the keyword is blank, the connection is not established, or
        a MySQL error occurs (the error is printed).
    """
    # A blank keyword would turn into LIKE '%%', which matches every row and
    # would return an arbitrary concept; there is nothing sensible to look up.
    if not keyword or not keyword.strip():
        print("Empty keyword: nothing to search")
        return None

    connection = None  # stays None when connect() itself raises
    try:
        # Connect to the database
        connection = mysql.connector.connect(**db_config)
        if not connection.is_connected():
            return None
        print("Connected to the UMLS database")

        cursor = connection.cursor(dictionary=True)

        # Step 1: Find the CUI for the keyword
        cursor.execute("SELECT CUI FROM MRCONSO WHERE STR LIKE %s LIMIT 1", (f"%{keyword}%",))
        result = cursor.fetchone()
        if not result:
            print(f"No results found for keyword: {keyword}")
            return translate_keyword(keyword)  # Call translation fallback
        cui = result["CUI"]
        print(f"Found CUI: {cui}")

        # Step 2: Query for STR values for specific languages
        languages_query = ['FRE', 'POR', 'GER']
        cursor.execute(
            "SELECT LAT, STR FROM MRCONSO WHERE CUI = %s AND LAT IN (%s, %s, %s)",
            (cui, *languages_query)
        )
        rows = cursor.fetchall()

        # Step 3: Format the output in JSON.
        # One entry per language: when several rows share a LAT, the last
        # row returned overwrites the earlier ones.
        language_data = {row['LAT']: row['STR'] for row in rows}
        output_json = json.dumps(language_data, indent=4, ensure_ascii=False)
        print("\nResults in JSON format:")
        print(output_json)

        return output_json

    except mysql.connector.Error as err:
        print(f"Error: {err}")
        return None
    finally:
        if connection is not None and connection.is_connected():
            connection.close()
            print("Database connection closed")


def translate_keyword(keyword):
    """Translate an English keyword into every language listed in ``languages``.

    Args:
        keyword: English text handed to the module-level ``translator``.

    Returns:
        A JSON string of the form ``{keyword: {language name: translation}}``;
        the same text is also printed.
    """
    print(f"Translating keyword: {keyword}")

    # One translator call per target language, keyed by the display name.
    per_language = {
        name: translator(keyword, src_lang="eng_Latn", tgt_lang=code, max_length=400)[0]['translation_text']
        for name, code in languages.items()
    }

    output_json = json.dumps({keyword: per_language}, indent=4, ensure_ascii=False)
    print("\nTranslated results in JSON format:")
    print(output_json)
    return output_json


# Use the function to search for a keyword.
# input() waits for a typed value, so this cell pauses an unattended "Run All".
keyword = input("Enter a keyword to search: ")
search_umls_by_language(keyword)



In [None]:
import mysql.connector
import json
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from openai import OpenAI  # Assuming OpenAI API is set up

# Database connection details.
# Every value can be overridden through an environment variable so credentials
# do not have to be edited into the notebook; the defaults reproduce the
# settings that were previously hard-coded here.
import os

db_config = {
    'host': os.environ.get('UMLS_DB_HOST', '172.16.34.1'),
    'port': int(os.environ.get('UMLS_DB_PORT', '3307')),
    'user': os.environ.get('UMLS_DB_USER', 'umls'),
    'password': os.environ.get('UMLS_DB_PASSWORD', 'umls'),
    'database': os.environ.get('UMLS_DB_NAME', 'umls2024'),
}

# Define the NLLB-200 model
model_name = "facebook/nllb-200-3.3B"
cache_directory = "/data/data_user_alpha/public_models"

# Target languages: display name -> language code passed to the translator as `tgt_lang`
languages = {
    "French": "fra_Latn",
    "German": "deu_Latn",
    "Portuguese": "por_Latn",
    "Bangla": 'ben_Beng'
}

# Load translation model.
# "cuda:2" addresses the third GPU, so at least three CUDA devices must be visible.
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    cache_dir=cache_directory,
    # Half precision only on GPU; the CPU fallback loads full-precision weights.
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
)
model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_directory)
# Name the device explicitly instead of relying on the pipeline to infer it
# from the model, so inputs are placed on the same device as the weights.
translator = pipeline("translation", model=model, tokenizer=tokenizer, device=device)

# OpenAI API Client.
# No key is written in the notebook: with no `api_key` argument the client
# reads the OPENAI_API_KEY environment variable.
client = OpenAI()

def extract_keywords(sentence):
    """Ask the chat model to split a sentence's keywords into medical / non-medical.

    Args:
        sentence: The text to analyse; sent as the user message.

    Returns:
        The parsed JSON reply. The prompt requests the shape
        ``{"medical": [...], "non_medical": [...]}``.

    Raises:
        json.JSONDecodeError: If the reply contains no parseable JSON object.
    """
    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {
                "role": "system",
                # The reply is parsed with json.loads and read by key, so the
                # prompt has to pin both the format and the key names.
                "content": (
                    "Extract medical and non-medical keywords from the given sentence. "
                    "Respond with only a JSON object of the form "
                    '{"medical": ["keyword1", "keyword2"], "non_medical": ["keyword3", "keyword4"]} '
                    "and no other text."
                ),
            },
            {"role": "user", "content": sentence},
        ],
        temperature=0.5
    )
    raw = (response.choices[0].message.content or "").strip()
    # Replies are often wrapped in a Markdown code fence or a sentence of
    # preamble; keep only the outermost {...} span before parsing.
    start, end = raw.find("{"), raw.rfind("}")
    if start != -1 and end > start:
        raw = raw[start:end + 1]
    keywords = json.loads(raw)
    return keywords  # Expected format: {"medical": ["keyword1", "keyword2"], "non_medical": ["keyword3", "keyword4"]}

def search_umls(keyword):
    """Return the French/Portuguese/German UMLS strings for a keyword.

    The first MRCONSO row whose STR contains ``keyword`` supplies the CUI; the
    FRE, POR and GER strings stored for that CUI are then collected.

    Args:
        keyword: Text matched as a substring against MRCONSO.STR.

    Returns:
        A dict mapping LAT to STR (one entry per language; with several rows
        for one language the last row returned wins), or None when the keyword
        is blank, nothing matches, or a MySQL error occurs (the error is printed).
    """
    # A blank keyword would turn into LIKE '%%' and match an arbitrary row.
    if keyword is None or not str(keyword).strip():
        return None

    # Bind the name before the try block: if connect() raises, the cleanup
    # below must not fail on an unassigned variable and hide the real error.
    connection = None
    try:
        connection = mysql.connector.connect(**db_config)
        cursor = connection.cursor(dictionary=True)
        cursor.execute("SELECT CUI FROM MRCONSO WHERE STR LIKE %s LIMIT 1", (f"%{keyword}%",))
        result = cursor.fetchone()
        if not result:
            return None
        cui = result["CUI"]

        cursor.execute("SELECT LAT, STR FROM MRCONSO WHERE CUI = %s AND LAT IN (%s, %s, %s)",
                       (cui, 'FRE', 'POR', 'GER'))
        rows = cursor.fetchall()
        return {row['LAT']: row['STR'] for row in rows}
    except mysql.connector.Error as err:
        print(f"Database error: {err}")
        return None
    finally:
        if connection is not None and connection.is_connected():
            connection.close()

def translate_non_medical(keyword):
    """Translate an English keyword into every language listed in ``languages``.

    Args:
        keyword: English text handed to the module-level ``translator``.

    Returns:
        A dict mapping each language display name to the translated text.
    """
    return {
        name: translator(keyword, src_lang="eng_Latn", tgt_lang=code, max_length=400)[0]['translation_text']
        for name, code in languages.items()
    }

def process_sentence(sentence):
    """Extract keywords from a sentence and translate each one.

    Medical keywords are looked up with ``search_umls`` (keywords without a
    result are left out); non-medical keywords go through
    ``translate_non_medical``.

    Args:
        sentence: Text passed to ``extract_keywords``.

    Returns:
        A JSON string with the two top-level keys "medical" and "non_medical".

    Raises:
        TypeError: If ``extract_keywords`` does not return a dict.
    """
    keywords = extract_keywords(sentence)
    if not isinstance(keywords, dict):
        raise TypeError(f"Expected a JSON object of keyword lists, got {type(keywords).__name__}")

    def _keyword_list(*names):
        # The model does not always use the same key names and may omit a
        # category entirely; take the first non-empty candidate, else nothing.
        for name in names:
            value = keywords.get(name)
            if value:
                # A bare string would otherwise be iterated character by character.
                return [value] if isinstance(value, str) else list(value)
        return []

    medical_translations = {}
    for keyword in _keyword_list("medical", "medical_keywords"):
        translation = search_umls(keyword)
        if translation:
            medical_translations[keyword] = translation

    non_medical_translations = {
        keyword: translate_non_medical(keyword)
        for keyword in _keyword_list("non_medical", "non_medical_keywords")
    }

    return json.dumps({
        "medical": medical_translations,
        "non_medical": non_medical_translations
    }, indent=4, ensure_ascii=False)

# Example usage.
# input() waits for a typed value, so this cell pauses an unattended "Run All".
sentence = input("Enter a sentence: ")
result_json = process_sentence(sentence)
print(result_json)


In [None]:
import tarfile
import os

# Define the path to the tar.gz file and the directory it is unpacked into
tar_gz_path = "/data/data_user/annotations/machine_translation/eng_spa_pairs.tar.gz"
extracted_files_path = "/home/mshahidul/project1/data2/extracted_files"

# Extract the contents of the tar.gz file
with tarfile.open(tar_gz_path, "r:gz") as tar:
    if hasattr(tarfile, "data_filter"):
        # The "data" filter refuses members that would land outside the
        # destination (absolute paths, ".." components, unsafe links).
        tar.extractall(path=extracted_files_path, filter="data")
    else:
        # Interpreter without extraction filters: members are written as
        # stored, so only use this on archives from a trusted source.
        tar.extractall(path=extracted_files_path)

# List the extracted files
extracted_files = os.listdir(extracted_files_path)
print(extracted_files)

In [None]:
import mysql.connector
import json
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForSeq2SeqLM
from openai import OpenAI  # Assuming OpenAI API is set up

# Database connection details.
# Every value can be overridden through an environment variable so credentials
# do not have to be edited into the notebook; the defaults reproduce the
# settings that were previously hard-coded here.
import os

db_config = {
    'host': os.environ.get('UMLS_DB_HOST', '172.16.34.1'),
    'port': int(os.environ.get('UMLS_DB_PORT', '3307')),
    'user': os.environ.get('UMLS_DB_USER', 'umls'),
    'password': os.environ.get('UMLS_DB_PASSWORD', 'umls'),
    'database': os.environ.get('UMLS_DB_NAME', 'umls2024'),
}

# Define the NLLB-200 model
model_name = "facebook/nllb-200-3.3B"
cache_directory = "/data/data_user_alpha/public_models"

# Target languages: display name -> language code passed to the translator as `tgt_lang`
languages = {
    "French": "fra_Latn",
    "German": "deu_Latn",
    "Portuguese": "por_Latn",
    "Bangla": 'ben_Beng'
}

# Load translation model.
# "cuda:2" addresses the third GPU, so at least three CUDA devices must be visible.
device = torch.device("cuda:2" if torch.cuda.is_available() else "cpu")
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    cache_dir=cache_directory,
    # Half precision only on GPU; the CPU fallback loads full-precision weights.
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
)
model.to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_directory)
# Name the device explicitly instead of relying on the pipeline to infer it
# from the model, so inputs are placed on the same device as the weights.
translator = pipeline("translation", model=model, tokenizer=tokenizer, device=device)

# OpenAI API Client.
# SECURITY: a secret key used to be written literally on this line. It has been
# removed; treat that key as compromised and revoke it. With no `api_key`
# argument the client reads the OPENAI_API_KEY environment variable.
client = OpenAI()

def extract_keywords(sentence):
    """Ask the chat model to split a sentence's keywords into medical / non-medical.

    Args:
        sentence: The text to analyse; sent as the user message.

    Returns:
        The parsed JSON reply. The prompt requests the shape
        ``{"medical_keywords": [...], "non_medical_keywords": [...]}``.

    Raises:
        json.JSONDecodeError: If the reply contains no parseable JSON object.
    """
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "system",
                # The reply is read by key downstream, so the prompt pins the
                # exact key names instead of leaving them to the model.
                "content": (
                    "Extract medical and non-medical keywords from the given sentence. "
                    "Return only a JSON object of the form "
                    '{"medical_keywords": ["keyword1", "keyword2"], '
                    '"non_medical_keywords": ["keyword3", "keyword4"]} without extra things.'
                ),
            },
            {"role": "user", "content": sentence},
        ],
        # JSON mode: the model is constrained to emit a single JSON object.
        response_format={"type": "json_object"},
        temperature=0.5
    )
    raw = (response.choices[0].message.content or "").strip()
    print(raw)
    # Defensive: if the reply is still wrapped in a code fence or preamble,
    # keep only the outermost {...} span before parsing.
    start, end = raw.find("{"), raw.rfind("}")
    if start != -1 and end > start:
        raw = raw[start:end + 1]
    keywords = json.loads(raw)
    return keywords  # Expected format: {"medical_keywords": ["keyword1", "keyword2"], "non_medical_keywords": ["keyword3", "keyword4"]}

def search_umls(keyword):
    """Return the French/Portuguese/German UMLS strings for a keyword.

    The first MRCONSO row whose STR contains ``keyword`` supplies the CUI; the
    FRE, POR and GER strings stored for that CUI are then collected.

    Args:
        keyword: Text matched as a substring against MRCONSO.STR.

    Returns:
        A dict mapping LAT to STR (one entry per language; with several rows
        for one language the last row returned wins), or None when the keyword
        is blank, nothing matches, or a MySQL error occurs (the error is printed).
    """
    # A blank keyword would turn into LIKE '%%' and match an arbitrary row.
    if keyword is None or not str(keyword).strip():
        return None

    # Bind the name before the try block: if connect() raises, the cleanup
    # below must not fail on an unassigned variable and hide the real error.
    connection = None
    try:
        connection = mysql.connector.connect(**db_config)
        cursor = connection.cursor(dictionary=True)
        cursor.execute("SELECT CUI FROM MRCONSO WHERE STR LIKE %s LIMIT 1", (f"%{keyword}%",))
        result = cursor.fetchone()
        if not result:
            return None
        cui = result["CUI"]

        cursor.execute("SELECT LAT, STR FROM MRCONSO WHERE CUI = %s AND LAT IN (%s, %s, %s)",
                       (cui, 'FRE', 'POR', 'GER'))
        rows = cursor.fetchall()
        return {row['LAT']: row['STR'] for row in rows}
    except mysql.connector.Error as err:
        print(f"Database error: {err}")
        return None
    finally:
        if connection is not None and connection.is_connected():
            connection.close()

def translate_non_medical(keyword):
    """Translate an English keyword into every language listed in ``languages``.

    Args:
        keyword: English text handed to the module-level ``translator``.

    Returns:
        A dict mapping each language display name to the translated text.
    """
    return {
        name: translator(keyword, src_lang="eng_Latn", tgt_lang=code, max_length=400)[0]['translation_text']
        for name, code in languages.items()
    }

def process_sentence(sentence):
    """Extract keywords from a sentence and translate each one.

    Medical keywords are looked up with ``search_umls`` (keywords without a
    result are left out); non-medical keywords go through
    ``translate_non_medical``.

    Args:
        sentence: Text passed to ``extract_keywords``.

    Returns:
        A JSON string with the two top-level keys "medical" and "non_medical".

    Raises:
        TypeError: If ``extract_keywords`` does not return a dict.
    """
    keywords = extract_keywords(sentence)
    if not isinstance(keywords, dict):
        raise TypeError(f"Expected a JSON object of keyword lists, got {type(keywords).__name__}")

    def _keyword_list(*names):
        # The model does not always use the same key names and may omit a
        # category entirely; take the first non-empty candidate, else nothing.
        for name in names:
            value = keywords.get(name)
            if value:
                # A bare string would otherwise be iterated character by character.
                return [value] if isinstance(value, str) else list(value)
        return []

    medical_translations = {}
    for keyword in _keyword_list("medical_keywords", "medical"):
        translation = search_umls(keyword)
        if translation:
            medical_translations[keyword] = translation

    non_medical_translations = {
        keyword: translate_non_medical(keyword)
        for keyword in _keyword_list("non_medical_keywords", "non_medical")
    }

    return json.dumps({
        "medical": medical_translations,
        "non_medical": non_medical_translations
    }, indent=4, ensure_ascii=False)

# Example usage: run the full pipeline on a fixed sentence and print the JSON result.
sentence = "Bariatric surgery is done when diet and exercise haven't worked or when you have serious health problems because of your weight."
result_json = process_sentence(sentence)
print(result_json)


In [5]:
from collections import Counter
from nltk.util import ngrams

def ngram_hits(reference: str, hypothesis: str):
    """Report which hypothesis unigrams and bigrams also occur in the reference.

    Both sentences are split on whitespace. Each distinct hypothesis n-gram is
    listed once, in order of first appearance, as either a hit (present in the
    reference) or a miss.

    Args:
        reference: The reference sentence.
        hypothesis: The sentence being compared against the reference.

    Returns:
        A dict with the keys "unigram_hits", "unigram_misses", "bigram_hits"
        and "bigram_misses"; bigrams are 2-tuples of tokens.
    """
    reference_words = reference.split()
    hypothesis_words = hypothesis.split()

    def partition(candidates, known):
        # Split the distinct candidates into (present in known, absent from known).
        present, absent = [], []
        for item in candidates:
            (present if item in known else absent).append(item)
        return present, absent

    # Unigrams
    word_hits, word_misses = partition(Counter(hypothesis_words), Counter(reference_words))

    # Bigrams
    pair_hits, pair_misses = partition(
        Counter(ngrams(hypothesis_words, 2)),
        Counter(ngrams(reference_words, 2)),
    )

    return {
        "unigram_hits": word_hits,
        "unigram_misses": word_misses,
        "bigram_hits": pair_hits,
        "bigram_misses": pair_misses
    }

# Example usage: compare a hypothesis that drops one word ("sitting") from the reference.
reference_sentence = "the cat is sitting on the mat"
hypothesis_sentence = "the cat is on the mat"
print(ngram_hits(reference_sentence, hypothesis_sentence))


{'unigram_hits': ['the', 'cat', 'is', 'on', 'mat'], 'unigram_misses': [], 'bigram_hits': [('the', 'cat'), ('cat', 'is'), ('on', 'the'), ('the', 'mat')], 'bigram_misses': [('is', 'on')]}


In [14]:
from collections import Counter
from nltk.util import ngrams
import gradio as gr
# Module-level sentences read by ngram_hits() below (it takes no arguments).
reference="the cat is on the mat" 
hypothesis="the cat is sitting on the mat"

def ngram_hits():
    """Render the module-level ``hypothesis`` as HTML colored by unigram match.

    Each whitespace-separated word of ``hypothesis`` is wrapped in a ``<span>``:
    green when the word also occurs in the module-level ``reference``, red
    otherwise. Words are inserted into the markup as-is (no HTML escaping).

    Returns:
        The spans joined by single spaces; an empty string when ``hypothesis``
        has no words.
    """
    # Only unigram membership drives the coloring. The bigram hit/miss lists
    # that used to be built here were never read, so they are not computed.
    reference_vocab = set(reference.split())

    spans = []
    for word in hypothesis.split():
        color = "green" if word in reference_vocab else "red"
        spans.append(f'<span style="color: {color};">{word}</span>')

    return " ".join(spans)

def processing():
    """Gradio callback: build the highlighted HTML, echo it to stdout, and return it."""
    highlighted_html = ngram_hits()
    print(highlighted_html)
    return highlighted_html
# Gradio UI: no input components; the HTML returned by `processing` is shown
# in a single HTML output component.
demo = gr.Interface(
    fn=processing,
    inputs=[],
    outputs=gr.HTML(label="Highlighted Output")
)

# share=True also requests a public share URL in addition to the local one,
# so the demo is reachable from outside this machine while it runs.
demo.launch(share=True)


* Running on local URL:  http://127.0.0.1:7864
* Running on public URL: https://43f3e64d1f9c07f9ba.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


