In [1]:
print("Hello world!")

Hello world!


In [3]:
import re

def parse_obo_file(file_path):
    data = {}
    with open(file_path, 'r') as f:
        current_entry = None
        for line in f:
            line = line.strip()
            if line.startswith("name:"):
                if current_entry:
                    data[current_entry['name']] = current_entry['synonyms']
                current_entry = {'synonyms': []}
                current_entry['name'] = line.split("name:")[1].strip()
            elif line.startswith("synonym:"):
                synonym_match = re.match(r'synonym: "(.+)"', line)
                if synonym_match:
                    current_entry['synonyms'].append(synonym_match.group(1))
        if current_entry:
            data[current_entry['name']] = current_entry['synonyms']
    return data

def write_data_to_file(data, output_file):
    with open(output_file, 'w') as f:
        for entry_name, synonyms in data.items():
            f.write(entry_name + ": " + ", ".join(synonyms) + "\n")

# For diseases
diseases_file_path = "doid.obo"  # Replace with the path to your diseases .obo file
diseases_output_file = "disease_synonyms.txt"  # Replace with the path for the output file

disease_data = parse_obo_file(diseases_file_path)
write_data_to_file(disease_data, diseases_output_file)

print("Disease data extracted and saved to:", diseases_output_file)

# For symptoms
symptoms_file_path = "symp.obo"  # Replace with the path to your symptoms .obo file
symptoms_output_file = "symptom_synonyms.txt"  # Replace with the path for the output file

symptom_data = parse_obo_file(symptoms_file_path)
write_data_to_file(symptom_data, symptoms_output_file)

print("Symptoms data extracted and saved to:", symptoms_output_file)


Disease data extracted and saved to: disease_synonyms.txt


UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 1411: character maps to <undefined>

Correct code for tackling the symtomps file encoding

In [None]:
import re

def parse_obo_file(file_path):
    data = {}
    current_entry = None
    with open(file_path, 'r', encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line.startswith("[Term]"):
                if current_entry:
                    if 'name' in current_entry:
                        data[current_entry['name']] = current_entry.get('synonyms', [])
                current_entry = {}
            elif line.startswith("id:"):
                current_entry['id'] = line.split("id:")[1].strip()
            elif line.startswith("name:"):
                current_entry['name'] = line.split("name:")[1].strip()
            elif line.startswith("synonym:"):
                synonym_match = re.match(r'synonym: "(.+)"', line)
                if synonym_match:
                    current_entry.setdefault('synonyms', []).append(synonym_match.group(1))
    # Add the last entry after reaching the end of the file
    if current_entry and 'name' in current_entry:
        data[current_entry['name']] = current_entry.get('synonyms', [])
    return data

def write_data_to_file(data, output_file):
    with open(output_file, 'w', encoding="utf-8") as f:
        for entry_name, synonyms in data.items():
            f.write(entry_name + ": " + ", ".join(synonyms) + "\n")

file_path = "symp.obo"  # Replace with the path to your symptoms .obo file
output_file = "symptom_synonyms.txt"  # Replace with the path for the output file

symptom_data = parse_obo_file(file_path)
write_data_to_file(symptom_data, output_file)

print("Symptoms data extracted and saved to:", output_file)


Symptoms data extracted and saved to: symptom_synonyms.txt


Disease symptoms idetifying from a sentence through the extracted file

In [None]:
import re

# Function to read synonyms from file and return a dictionary
def read_synonyms(file_path):
    synonym_dict = {}
    with open(file_path, 'r') as file:
        for line in file:
            if ':' in line:
                disease_or_symptom, synonyms = line.strip().split(':', 1)  # Only split on the first colon
                disease_or_symptom = disease_or_symptom.strip()
                synonym_dict[disease_or_symptom.lower()] = disease_or_symptom
                if synonyms.strip():
                    synonyms_list = [syn.strip() for syn in synonyms.split(',')]
                    for synonym in synonyms_list:
                        synonym_dict[synonym.lower()] = disease_or_symptom
            else:
                disease_or_symptom = line.strip()
                synonym_dict[disease_or_symptom.lower()] = disease_or_symptom
    return synonym_dict

# Function to find synonyms in text and return the matched diseases/symptoms
def find_in_text(text, synonym_dict):
    matches = set()
    text_lower = text.lower()
    for synonym in synonym_dict:
        if re.search(r'\b' + re.escape(synonym) + r'\b', text_lower):
            matches.add(synonym_dict[synonym])
    return matches

# Paths to your synonym files
disease_synonym_file = 'disease_synonyms.txt'
symptom_synonym_file = 'symptom_synonyms.txt'

# Read the synonym files
disease_synonyms = read_synonyms(disease_synonym_file)
symptom_synonyms = read_synonyms(symptom_synonym_file)

# Sample paragraph text
paragraph_text = "The patient reported experiencing severe headaches and nausea, which might indicate a possible metabolic disease or hemangiosarcoma."

# Find diseases and symptoms in the text
diseases_found = find_in_text(paragraph_text, disease_synonyms)
symptoms_found = find_in_text(paragraph_text, symptom_synonyms)

print("Diseases found:", diseases_found)
print("Symptoms found:", symptoms_found)


Diseases found: {'angiosarcoma', 'disease of metabolism', 'disease', 'hypotrichosis 7'}
Symptoms found: {'nausea'}


This one is not able to tackle numbers and punctuations

In [16]:
import re

# Function to read synonyms from file and return a dictionary
def read_synonyms(file_path):
    synonym_dict = {}
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if ':' in line:
                main_term, synonyms = line.split(':', 1)
                main_term = main_term.strip().lower()
                synonym_dict[main_term] = main_term
                if synonyms.strip():
                    synonyms_list = [syn.strip().lower() for syn in synonyms.split(',')]
                    for synonym in synonyms_list:
                        synonym_dict[synonym] = main_term
            else:
                main_term = line.strip().lower()
                synonym_dict[main_term] = main_term
    return synonym_dict

# Function to find synonyms in text and return the matched diseases/symptoms
def find_in_text(text, synonym_dict):
    matches = set()
    text_lower = text.lower()
    for synonym in synonym_dict:
        if re.search(r'\b' + re.escape(synonym) + r'\b', text_lower):
            matches.add(synonym_dict[synonym])
    return matches

# Paths to your synonym files
disease_synonym_file = 'disease_synonyms.txt'
symptom_synonym_file = 'symptom_synonyms.txt'

# Read the synonym files
disease_synonyms = read_synonyms(disease_synonym_file)
symptom_synonyms = read_synonyms(symptom_synonym_file)

# Sample paragraph text
paragraph_text = "During a 16-year period (1972-1988), 40 out of 477 thyroid cancer patients underwent thyroidectomy for undifferentiated thyroid carcinoma. To analyse the significance of "radical" versus "palliative" surgical procedures with regard to early postoperative course, operative complications and survival, all patients records were reviewed and actually followed up. A significant better survival was correlated with radical (n = 17) versus palliative tumor resection (n = 23) (p less than 0.001), and total thyroidectomy (n = 22) versus subtotal thyroidectomy (n = 18) (p less than 0.006). Radical surgery with early postoperative external irradiation revealed no postoperative mortality and only one symptomatic cervical tumor recurrence. In contrast, palliative surgery, particularly in the case of synchronous tracheotomy, was attended with a relatively high mortality (30%) and symptomatic local recurrences. The results of this study suggest that in undifferentiated thyroid carcinoma without infiltration of the esophageal or tracheal mucosa an attempt of radical tumor resection should be undertaken, since palliative surgical procedures revealed a significantly lower survival due to complications of persistent or recurrent cervical tumor infiltration and frequently were accompanied by local complications during the postoperative course."

# Find diseases and symptoms in the text
diseases_found = find_in_text(paragraph_text, disease_synonyms)
symptoms_found = find_in_text(paragraph_text, symptom_synonyms)

print("Diseases found:", diseases_found)
print("Symptoms found:", symptoms_found)


SyntaxError: invalid syntax (2548437286.py, line 40)

In [18]:
import re

# Function to read synonyms from file and return a dictionary
def read_synonyms(file_path):
    synonym_dict = {}
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if ':' in line:
                main_term, synonyms = line.split(':', 1)
                main_term = main_term.strip().lower()
                synonym_dict[main_term] = main_term
                if synonyms.strip():
                    synonyms_list = [syn.strip().lower() for syn in synonyms.split(',')]
                    for synonym in synonyms_list:
                        synonym_dict[synonym] = main_term
            else:
                main_term = line.strip().lower()
                synonym_dict[main_term] = main_term
    return synonym_dict

# Function to find synonyms in text and return the matched diseases/symptoms
def find_in_text(text, synonym_dict):
    matches = set()
    text_lower = text.lower()
    for synonym in synonym_dict:
        # Ensure the synonym is matched as a whole word
        pattern = r'\b' + re.escape(synonym) + r'\b'
        if re.search(pattern, text_lower):
            matches.add(synonym_dict[synonym])
    return matches

# Paths to your synonym files
disease_synonym_file = 'disease_synonyms.txt'
symptom_synonym_file = 'symptom_synonyms.txt'

# Read the synonym files
disease_synonyms = read_synonyms(disease_synonym_file)
symptom_synonyms = read_synonyms(symptom_synonym_file)

# Sample paragraph text
paragraph_text = "The patient reported experiencing severe headaches and nausea, which might indicate a possible metabolic disease or hemangiosarcoma."

# Find diseases and symptoms in the text
diseases_found = find_in_text(paragraph_text, disease_synonyms)
symptoms_found = find_in_text(paragraph_text, symptom_synonyms)

print("Diseases found:", diseases_found)
print("Symptoms found:", symptoms_found)


Diseases found: {'angiosarcoma', 'disease of metabolism', 'disease', 'hypotrichosis 7'}
Symptoms found: {'nausea'}


Identifying more disease that are not even in the text para

In [19]:
import re

# Function to read synonyms from file and return a dictionary
def read_synonyms(file_path):
    synonym_dict = {}
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if ':' in line:
                main_term, synonyms = line.split(':', 1)
                main_term = main_term.strip().lower()
                synonym_dict[main_term] = main_term
                if synonyms.strip():
                    synonyms_list = [syn.strip().lower() for syn in synonyms.split(',')]
                    for synonym in synonyms_list:
                        synonym_dict[synonym] = main_term
            else:
                main_term = line.strip().lower()
                synonym_dict[main_term] = main_term
    return synonym_dict

# Function to find synonyms in text and return the matched diseases/symptoms
def find_in_text(text, synonym_dict):
    matches = set()
    text_lower = text.lower()
    for synonym in synonym_dict:
        # Ensure the synonym is matched as a whole word
        pattern = r'\b' + re.escape(synonym) + r'\b'
        if re.search(pattern, text_lower):
            matches.add(synonym_dict[synonym])
    return matches

# Paths to your synonym files
disease_synonym_file = 'disease_synonyms.txt'
symptom_synonym_file = 'symptom_synonyms.txt'

# Read the synonym files
disease_synonyms = read_synonyms(disease_synonym_file)
symptom_synonyms = read_synonyms(symptom_synonym_file)

# Sample paragraph text
paragraph_text = """
During a 16-year period (1972-1988), 40 out of 477 thyroid cancer patients underwent thyroidectomy for undifferentiated thyroid carcinoma.
To analyse the significance of "radical" versus "palliative" surgical procedures with regard to early postoperative course, operative complications and survival,
all patients records were reviewed and actually followed up. A significant better survival was correlated with radical (n = 17) versus palliative tumor resection (n = 23) (p less than 0.001),
and total thyroidectomy (n = 22) versus subtotal thyroidectomy (n = 18) (p less than 0.006).
Radical surgery with early postoperative external irradiation revealed no postoperative mortality and only one symptomatic cervical tumor recurrence.
In contrast, palliative surgery, particularly in the case of synchronous tracheotomy, was attended with a relatively high mortality (30%) and symptomatic local recurrences.
The results of this study suggest that in undifferentiated thyroid carcinoma without infiltration of the esophageal or tracheal mucosa an attempt of radical tumor resection should be undertaken,
since palliative surgical procedures revealed a significantly lower survival due to complications of persistent or recurrent cervical tumor infiltration
and frequently were accompanied by local complications during the postoperative course.
"""

# Find diseases and symptoms in the text
diseases_found = find_in_text(paragraph_text, disease_synonyms)
symptoms_found = find_in_text(paragraph_text, symptom_synonyms)

print("Diseases found:", diseases_found)
print("Symptoms found:", symptoms_found)


Diseases found: {'thyroid gland carcinoma', 'familial medullary thyroid carcinoma', 'congenital disorder of glycosylation type iim', 'fallopian tube cancer', 'acute lymphoblastic leukemia', 'cancer', 'obsolete anaplastic carcinoma', 'total circumpapillary dystrophy of choroid', 'paraplegia', 'hypotrichosis 7', 'primary syphilis', 'thyroid cancer', 'lip cancer', 'obsolete recurrent adenocarcinoma of lung'}
Symptoms found: set()


In [10]:
import re

# Function to read synonyms from file and return a dictionary
def read_synonyms(file_path):
  synonym_dict = {}
  with open(file_path, 'r') as file:
    for line in file:
      line = line.strip()
      if ':' in line:
        main_term, synonyms = line.split(':', 1)
        main_term = main_term.strip().lower()
        synonym_dict[main_term] = main_term
        if synonyms.strip():
          synonyms_list = [syn.strip().lower() for syn in synonyms.split(',')]
          for synonym in synonyms_list:
            synonym_dict[synonym] = main_term
      else:
        main_term = line.strip().lower()
        synonym_dict[main_term] = main_term
  return synonym_dict

# Function to find the first exact match for a disease/symptom in text
def find_first_in_text(text, synonym_dict):
  text_lower = text.lower()
  # Escape special characters including inverted commas
  escaped_synonyms = [re.escape(syn.strip()) for synonym in synonym_dict]
  # Create a pattern for whole words only (using word boundaries)
  pattern = r'\b(' + '|'.join(escaped_synonyms) + r')\b'
  match = re.search(pattern, text_lower)
  if match:
    found_term = match.group()
    print(f"First match found: '{found_term}' as '{synonym_dict[found_term]}'")
    return synonym_dict[found_term]
  return None

# ... (rest of your code remains the same)
# Paths to your synonym files
disease_synonym_file = 'disease_synonyms.txt'
symptom_synonym_file = 'symptom_synonyms.txt'

# Read the synonym files
disease_synonyms = read_synonyms(disease_synonym_file)
symptom_synonyms = read_synonyms(symptom_synonym_file)

# Sample paragraph text
paragraph_text = """
During a 16-year period (1972-1988), 40 out of 477 thyroid cancer patients underwent thyroidectomy for undifferentiated thyroid carcinoma.
To analyse the significance of "radical" versus "palliative" surgical procedures with regard to early postoperative course, operative complications and survival,
all patients records were reviewed and actually followed up. A significant better survival was correlated with radical (n = 17) versus palliative tumor resection (n = 23) (p less than 0.001),
and total thyroidectomy (n = 22) versus subtotal thyroidectomy (n = 18) (p less than 0.006).
Radical surgery with early postoperative external irradiation revealed no postoperative mortality and only one symptomatic cervical tumor recurrence.
In contrast, palliative surgery, particularly in the case of synchronous tracheotomy, was attended with a relatively high mortality (30%) and symptomatic local recurrences.
The results of this study suggest that in undifferentiated thyroid carcinoma without infiltration of the esophageal or tracheal mucosa an attempt of radical tumor resection should be undertaken,
since palliative surgical procedures revealed a significantly lower survival due to complications of persistent or recurrent cervical tumor infiltration
and frequently were accompanied by local complications during the postoperative course.
"""

# Find diseases and symptoms in the text
diseases_found = find_in_text(paragraph_text, disease_synonyms)
symptoms_found = find_in_text(paragraph_text, symptom_synonyms)

print("Diseases found:", diseases_found)
print("Symptoms found:", symptoms_found)

Match found: 'thyroid carcinoma' as 'familial medullary thyroid carcinoma'
Match found: '22' as 'congenital disorder of glycosylation type iim'
Match found: 'recurrent' as 'obsolete recurrent adenocarcinoma of lung'
Match found: '' as 'hypotrichosis 7'
Match found: 'total' as 'total circumpapillary dystrophy of choroid'
Match found: 'cancer' as 'cancer'
Match found: 'thyroid cancer' as 'thyroid cancer'
Match found: 'tumor' as 'fallopian tube cancer'
Match found: 'carcinoma' as 'obsolete anaplastic carcinoma'
Match found: 'thyroid' as 'thyroid gland carcinoma'
Match found: 'symptomatic' as 'primary syphilis'
Match found: 'lower' as 'paraplegia'
Match found: 'external' as 'lip cancer'
Match found: 'mucosa' as 'lip cancer'
Match found: 'all' as 'acute lymphoblastic leukemia'
Diseases found: {'cancer', 'fallopian tube cancer', 'paraplegia', 'acute lymphoblastic leukemia', 'thyroid cancer', 'primary syphilis', 'obsolete recurrent adenocarcinoma of lung', 'familial medullary thyroid carcinom