Creation of drug-drug-toxicity interacion matrix

In [1]:
import xml.etree.ElementTree as ET
from tqdm import tqdm

# Replace with your actual file path
xml_path = r"...\drugbank_all_full_database.xml\full database.xml"

tree = ET.parse(xml_path)
root = tree.getroot()

# DrugBank XML uses namespaces; must match the xmlns
ns = {'db': 'http://www.drugbank.ca'}

ddi_pairs = set()

for drug in tqdm(root.findall('db:drug', ns)):
    drug1_name = drug.findtext('db:name', default='', namespaces=ns).lower().strip()

    # Extract interactions
    for interaction in drug.findall('db:drug-interactions/db:drug-interaction', ns):
        drug2_name = interaction.findtext('db:name', default='', namespaces=ns).lower().strip()
        if drug1_name and drug2_name:
            # Add both orderings
            ddi_pairs.add((drug1_name, drug2_name))
            ddi_pairs.add((drug2_name, drug1_name))

print(f"✅ Total unique DDI pairs: {len(ddi_pairs)}")


100%|███████████████████████████████████████████████████████████████████████████| 17430/17430 [01:29<00:00, 195.62it/s]

✅ Total unique DDI pairs: 2856386





In [2]:
import pandas as pd

# Load your mapping CSV
drug_map = pd.read_csv(r'...\drugbank_mimic_rxcui_map.csv')

# Normalize names: lowercase and strip whitespace
drug_map['Name_norm'] = drug_map['Name'].str.lower().str.strip()
drug_map['Generic_Name_norm'] = drug_map['Generic_Name'].str.lower().str.strip()

# Build name → generic name lookup
name_to_generic = dict(zip(drug_map['Name_norm'], drug_map['Generic_Name_norm']))


In [3]:
# Create a new set with mapped generic names
mapped_ddi_pairs = set()

unmatched = set()

for d1, d2 in ddi_pairs:
    d1_gen = name_to_generic.get(d1)
    d2_gen = name_to_generic.get(d2)

    if d1_gen and d2_gen:
        # Sort to avoid duplicate A-B and B-A
        mapped_ddi_pairs.add(tuple(sorted((d1_gen, d2_gen))))
    else:
        mapped_ddi_pairs.add((d1, d2))  # For review

print(f"✅ Mapped DDI pairs (by generic name): {len(mapped_ddi_pairs)}")


✅ Mapped DDI pairs (by generic name): 2806304


In [4]:
ddi_description_dict = {}

for drug in tqdm(root.findall('db:drug', ns)):
    drug1_name = drug.findtext('db:name', default='', namespaces=ns).lower().strip()
    
    for interaction in drug.findall('db:drug-interactions/db:drug-interaction', ns):
        drug2_name = interaction.findtext('db:name', default='', namespaces=ns).lower().strip()
        description = interaction.findtext('db:description', default='', namespaces=ns).strip()
        
        if drug1_name and drug2_name:
            # Store description for ordered pair (drug1, drug2)
            ddi_description_dict[(drug1_name, drug2_name)] = description
            ddi_description_dict[(drug2_name, drug1_name)] = description  # also reverse


100%|███████████████████████████████████████████████████████████████████████████| 17430/17430 [00:26<00:00, 670.05it/s]


In [5]:
import re

def label_toxicity(description):
    desc = description.lower()

    # Major toxicity keywords
    major_keywords = [
        'severe', 'life-threatening', 'major', 'hospitalization', 'death',
        'fatal', 'contraindicated', 'black box warning', 'emergency',
        'intensive care', 'discontinue', 'significant risk', 'warning',
        'avoid', 'do not use', 'high risk'
    ]

    # Moderate toxicity keywords
    moderate_keywords = [
        'moderate', 'dose adjustment', 'monitor', 'avoid use',
        'careful', 'caution', 'increased risk', 'may increase',
        'reduce dose', 'adjust dose', 'watch for', 'observe',
        'possible interaction', 'clinical monitoring', 'potentially serious',
        'adverse effects'
    ]

    # Minor toxicity keywords
    minor_keywords = [
        'minor', 'mild', 'no significant', 'no interaction',
        'unlikely', 'low risk', 'insignificant', 'tolerated',
        'no dose adjustment', 'no effect', 'no clinically relevant',
        'generally safe', 'no known'
    ]

    # Check for major toxicity first
    if any(re.search(r'\b' + re.escape(kw) + r'\b', desc) for kw in major_keywords):
        return 'major'

    # Then moderate
    if any(re.search(r'\b' + re.escape(kw) + r'\b', desc) for kw in moderate_keywords):
        return 'moderate'

    # Then minor
    if any(re.search(r'\b' + re.escape(kw) + r'\b', desc) for kw in minor_keywords):
        return 'minor'

    # Additional heuristics: e.g., if description contains interaction but no strong keywords, mark as moderate
    if 'interaction' in desc or 'interact' in desc:
        return 'moderate'

    # If none matched, fallback to minor (safer to underestimate toxicity)
    return 'minor'

In [6]:
import pandas as pd

mapped_ddi_pairs_df = pd.DataFrame(list(mapped_ddi_pairs), columns=['drug1_name', 'drug2_name'])

def get_description(row):
    key = (row['drug1_name'].lower().strip(), row['drug2_name'].lower().strip())
    return ddi_description_dict.get(key, '')

mapped_ddi_pairs_df['description'] = mapped_ddi_pairs_df.apply(get_description, axis=1)

In [7]:
mapped_ddi_pairs_df.head

<bound method NDFrame.head of                       drug1_name                      drug2_name  \
0                   clomipramine                    itraconazole   
1        betamethasone phosphate                    sparfloxacin   
2                       naloxone                     trimebutine   
3                risedronic acid          hydrolyzed cephalothin   
4                       lactitol                     hydrocodone   
...                          ...                             ...   
2806299             dalantercept                      crovalimab   
2806300               vecuronium                   levamlodipine   
2806301              saxagliptin                       pidotimod   
2806302              vinorelbine  adenovirus type 7 vaccine live   
2806303       trichlormethiazide                        sulindac   

                                               description  
0        The metabolism of Clomipramine can be decrease...  
1        The risk or severity o

In [8]:
mapped_ddi_pairs_df['toxicity'] = mapped_ddi_pairs_df['description'].apply(label_toxicity)

In [9]:
mapped_ddi_pairs_df.head

<bound method NDFrame.head of                       drug1_name                      drug2_name  \
0                   clomipramine                    itraconazole   
1        betamethasone phosphate                    sparfloxacin   
2                       naloxone                     trimebutine   
3                risedronic acid          hydrolyzed cephalothin   
4                       lactitol                     hydrocodone   
...                          ...                             ...   
2806299             dalantercept                      crovalimab   
2806300               vecuronium                   levamlodipine   
2806301              saxagliptin                       pidotimod   
2806302              vinorelbine  adenovirus type 7 vaccine live   
2806303       trichlormethiazide                        sulindac   

                                               description  severity  
0        The metabolism of Clomipramine can be decrease...     minor  
1        Th

In [10]:
# Drop the description column
mapped_ddi_pairs_df = mapped_ddi_pairs_df.drop(columns=['description'], errors='ignore')

# Convert back to a list of (drug1_name, drug2_name) tuples
mapped_ddi_pairs = list(mapped_ddi_pairs_df.itertuples(index=False, name=None))
print(mapped_ddi_pairs[0])

('clomipramine', 'itraconazole', 'minor')


In [13]:
import pickle

with open(r'...\mapped_ddi_pairs.pkl', 'wb') as f:
    pickle.dump(mapped_ddi_pairs, f)
