## Chargement des librairies à utiliser

In [1]:
import pandas as pd
import numpy as np
import itertools
import lxml.etree as ET
from pathlib import Path  
import glob

## Analyse exploratoire de la base de données

In [3]:
df = pd.read_csv(r'C:/Users/Utilisateur/Desktop/Projet_ordonance/PRESCRIPTIONS.csv')
df

Unnamed: 0,row_id,subject_id,hadm_id,icustay_id,startdate,enddate,drug_type,drug,drug_name_poe,drug_name_generic,formulary_drug_cd,gsn,ndc,prod_strength,dose_val_rx,dose_unit_rx,form_val_disp,form_unit_disp,route
0,32600,42458,159647,,2146-07-21 00:00:00,2146-07-22 00:00:00,MAIN,Pneumococcal Vac Polyvalent,Pneumococcal Vac Polyvalent,PNEUMOcoccal Vac Polyvalent,PNEU25I,48548.0,6494300.0,25mcg/0.5mL Vial,0.5,mL,1,VIAL,IM
1,32601,42458,159647,,2146-07-21 00:00:00,2146-07-22 00:00:00,MAIN,Bisacodyl,Bisacodyl,Bisacodyl,BISA5,2947.0,536338101.0,5 mg Tab,10,mg,2,TAB,PO
2,32602,42458,159647,,2146-07-21 00:00:00,2146-07-22 00:00:00,MAIN,Bisacodyl,Bisacodyl,Bisacodyl (Rectal),BISA10R,2944.0,574705050.0,10mg Suppository,10,mg,1,SUPP,PR
3,32603,42458,159647,,2146-07-21 00:00:00,2146-07-22 00:00:00,MAIN,Senna,Senna,Senna,SENN187,19964.0,904516561.0,1 Tablet,1,TAB,1,TAB,PO
4,32604,42458,159647,,2146-07-21 00:00:00,2146-07-21 00:00:00,MAIN,Docusate Sodium (Liquid),Docusate Sodium (Liquid),Docusate Sodium (Liquid),DOCU100L,3017.0,121054410.0,100mg UD Cup,100,mg,1,UDCUP,PO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10393,3609913,42430,100969,210474.0,2142-11-29 00:00:00,2142-11-30 00:00:00,MAIN,Sodium Chloride 0.9% Flush,Sodium Chloride 0.9% Flush,Sodium Chloride 0.9% Flush,NACLFLUSH,,0.0,Syringe,3,mL,0.6,SYR,IV
10394,3609914,42430,100969,210474.0,2142-11-30 00:00:00,2142-11-30 00:00:00,MAIN,Acetaminophen,Acetaminophen,Acetaminophen (Rectal),ACET650R,4478.0,713016550.0,650mg Supp,650,mg,1,SUPP,PR
10395,3609915,42430,100969,,2142-11-26 00:00:00,2142-11-27 00:00:00,BASE,0.9% Sodium Chloride,,,NS1000,1210.0,338004904.0,1000mL Bag,1000,mL,1,BAG,IV
10396,3609916,42430,100969,,2142-11-26 00:00:00,2142-11-27 00:00:00,BASE,D5W,,,HEPBASE,,0.0,HEPARIN BASE,250,mL,250,mL,IV


In [4]:
df.dtypes

row_id                 int64
subject_id             int64
hadm_id                int64
icustay_id           float64
startdate             object
enddate               object
drug_type             object
drug                  object
drug_name_poe         object
drug_name_generic     object
formulary_drug_cd     object
gsn                  float64
ndc                  float64
prod_strength         object
dose_val_rx           object
dose_unit_rx          object
form_val_disp         object
form_unit_disp        object
route                 object
dtype: object

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10398 entries, 0 to 10397
Data columns (total 19 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   row_id             10398 non-null  int64  
 1   subject_id         10398 non-null  int64  
 2   hadm_id            10398 non-null  int64  
 3   icustay_id         7046 non-null   float64
 4   startdate          10398 non-null  object 
 5   enddate            10397 non-null  object 
 6   drug_type          10398 non-null  object 
 7   drug               10398 non-null  object 
 8   drug_name_poe      5766 non-null   object 
 9   drug_name_generic  5772 non-null   object 
 10  formulary_drug_cd  10397 non-null  object 
 11  gsn                9122 non-null   float64
 12  ndc                10397 non-null  float64
 13  prod_strength      10398 non-null  object 
 14  dose_val_rx        10398 non-null  object 
 15  dose_unit_rx       10398 non-null  object 
 16  form_val_disp      103

In [6]:
print(f' il y a {df['subject_id'].nunique()} patients dans cette base de données')

 il y a 94 patients dans cette base de données


In [7]:
df.columns

Index(['row_id', 'subject_id', 'hadm_id', 'icustay_id', 'startdate', 'enddate',
       'drug_type', 'drug', 'drug_name_poe', 'drug_name_generic',
       'formulary_drug_cd', 'gsn', 'ndc', 'prod_strength', 'dose_val_rx',
       'dose_unit_rx', 'form_val_disp', 'form_unit_disp', 'route'],
      dtype='object')

## Construction des ordonances

In [9]:
def construire_ordonnances(df):
    """
    Reconstitue les ordonnances et génère les paires de médicaments
    pour chaque hadm_id et chaque jour de prescription (startdate).
    
    Paramètres :
        df (DataFrame): Table PRESCRIPTIONS.csv chargée en DataFrame
        
    Retourne :
        DataFrame avec les colonnes : ordonnance_id, hadm_id, ordonnance_date, drug1_name, drug2_name
    """

    # Supprimer les lignes incomplètes
    df = df.dropna(subset=["hadm_id", "startdate", "drug_name_poe"])

    # Assurer que startdate est en datetime
    df["startdate"] = pd.to_datetime(df["startdate"])

    # Créer une colonne de date tronquée (sans l'heure)
    df["ordonnance_date"] = df["startdate"].dt.date

    # Groupement par séjour et jour
    ordonnances = df.groupby(["hadm_id", "ordonnance_date"])["drug_name_poe"].unique().reset_index()

    # Générer un identifiant unique d’ordonnance
    ordonnances["ordonnance_id"] = ["ORD" + str(i).zfill(5) for i in range(len(ordonnances))]

    # Générer les paires de médicaments
    rows = []
    for _, row in ordonnances.iterrows():
        med_list = sorted(row["drug_name_poe"])
        pairs = list(itertools.combinations(med_list, 2))  # paires de 2

        for med1, med2 in pairs:
            rows.append({
                "ordonnance_id": row["ordonnance_id"],
                "hadm_id": row["hadm_id"],
                "ordonnance_date": row["ordonnance_date"],
                "drug1_name": med1,
                "drug2_name": med2
            })

    return pd.DataFrame(rows)

In [10]:

dataset_interactions = construire_ordonnances(df)

# Aperçu
dataset_interactions.info()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["startdate"] = pd.to_datetime(df["startdate"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["ordonnance_date"] = df["startdate"].dt.date


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17967 entries, 0 to 17966
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   ordonnance_id    17967 non-null  object
 1   hadm_id          17967 non-null  int64 
 2   ordonnance_date  17967 non-null  object
 3   drug1_name       17967 non-null  object
 4   drug2_name       17967 non-null  object
dtypes: int64(1), object(4)
memory usage: 702.0+ KB


In [11]:
dataset_interactions


Unnamed: 0,ordonnance_id,hadm_id,ordonnance_date,drug1_name,drug2_name
0,ORD00000,100375,2129-05-02,Acetaminophen,Atorvastatin
1,ORD00000,100375,2129-05-02,Acetaminophen,Dextrose 50%
2,ORD00000,100375,2129-05-02,Acetaminophen,Docusate Sodium
3,ORD00000,100375,2129-05-02,Acetaminophen,Insulin
4,ORD00000,100375,2129-05-02,Acetaminophen,Pantoprazole
...,...,...,...,...,...
17962,ORD01032,199395,2190-07-20,Ranitidine,Warfarin
17963,ORD01032,199395,2190-07-20,Sodium Chloride 0.9% Flush,Warfarin
17964,ORD01033,199395,2190-07-21,Amiodarone HCl,Metoprolol
17965,ORD01033,199395,2190-07-21,Amiodarone HCl,Warfarin


## Ajouter les type d'interaction 


In [13]:
ddi = pd.read_csv(r'C:\Users\Utilisateur\Desktop\Projet_ordonance\DDI_data.csv')

In [14]:
ddi

Unnamed: 0,drug1_id,drug2_id,drug1_name,drug2_name,interaction_type
0,DB00006,DB00346,Bivalirudin,Alfuzosin,serum concentration
1,DB00006,DB13783,Bivalirudin,Acemetacin,risk or severity of bleeding
2,DB00006,DB06605,Bivalirudin,Apixaban,anticoagulant activities
3,DB00006,DB06695,Bivalirudin,Dabigatran etexilate,anticoagulant activities
4,DB00006,DB09075,Bivalirudin,Edoxaban,anticoagulant activities
...,...,...,...,...,...
222691,DB13955,DB14033,Estradiol dienanthate,Acetyl sulfisoxazole,metabolism
222692,DB13956,DB14033,Estradiol valerate,Acetyl sulfisoxazole,metabolism
222693,DB14011,DB14033,Nabiximols,Acetyl sulfisoxazole,metabolism
222694,DB14019,DB14033,Fosnetupitant,Acetyl sulfisoxazole,metabolism


In [15]:
ddi[ddi['drug1_name'] == 'Atorvastatin']

Unnamed: 0,drug1_id,drug2_id,drug1_name,drug2_name,interaction_type
172669,DB01076,DB08860,Atorvastatin,Pitavastatin,serum concentration
172670,DB01076,DB09296,Atorvastatin,Ombitasvir,serum concentration
172671,DB01076,DB09297,Atorvastatin,Paritaprevir,serum concentration
172672,DB01076,DB09183,Atorvastatin,Dasabuvir,serum concentration
172673,DB01076,DB13345,Atorvastatin,Dihydroergocristine,risk or severity of adverse effects
...,...,...,...,...,...
172857,DB01076,DB13650,Atorvastatin,Aloglutamol,serum concentration
172858,DB01076,DB01390,Atorvastatin,Sodium bicarbonate,serum concentration
172859,DB01076,DB03754,Atorvastatin,Tromethamine,serum concentration
172860,DB01076,DB09481,Atorvastatin,Magnesium carbonate,serum concentration


In [16]:
for col in ["drug1_name", "drug2_name"]:
    dataset_interactions[col] = dataset_interactions[col].str.lower().str.strip()
    ddi[col] = ddi[col].str.lower().str.strip()


In [17]:
# 1er sens
merged_1 = dataset_interactions.merge(ddi, on=["drug1_name", "drug2_name"], how="left")

# Inverser les colonnes pour tester l'autre sens
ordonnances_swapped = dataset_interactions.rename(columns={
    "drug1_name": "drug2_name", "drug2_name": "drug1_name"
})
merged_2 = ordonnances_swapped.merge(ddi, on=["drug1_name", "drug2_name"], how="left")

# Reprendre les colonnes d'origine dans le bon ordre
merged_2 = merged_2.rename(columns={
    "drug1_name": "drug2_name", "drug2_name": "drug1_name"
})


In [18]:
result = pd.concat([merged_1, merged_2], ignore_index=True)

# Garde uniquement les lignes où une interaction a été trouvée
result = result.dropna(subset=["interaction_type"])  # ou "description", selon ton fichier


In [19]:
print(result[["ordonnance_id", "drug1_name", "drug2_name", "interaction_type"]])


      ordonnance_id     drug1_name    drug2_name  \
0          ORD00000  acetaminophen  atorvastatin   
68         ORD00006      lorazepam     midazolam   
163        ORD00018   pantoprazole    tacrolimus   
179        ORD00019   clotrimazole      diazepam   
186        ORD00019   clotrimazole   ondansetron   
...             ...            ...           ...   
35889      ORD01032     furosemide    metoprolol   
35897      ORD01032      glipizide    metoprolol   
35901      ORD01032      glipizide    ranitidine   
35903      ORD01032      glipizide      warfarin   
35930      ORD01032     ranitidine      warfarin   

                          interaction_type  
0      risk or severity of adverse effects  
68     risk or severity of adverse effects  
163                    serum concentration  
179                             metabolism  
186                             metabolism  
...                                    ...  
35889  risk or severity of adverse effects  
35897          

In [20]:
result.info()

<class 'pandas.core.frame.DataFrame'>
Index: 978 entries, 0 to 35930
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   ordonnance_id     978 non-null    object
 1   hadm_id           978 non-null    int64 
 2   ordonnance_date   978 non-null    object
 3   drug1_name        978 non-null    object
 4   drug2_name        978 non-null    object
 5   drug1_id          978 non-null    object
 6   drug2_id          978 non-null    object
 7   interaction_type  978 non-null    object
dtypes: int64(1), object(7)
memory usage: 68.8+ KB


In [21]:
result

Unnamed: 0,ordonnance_id,hadm_id,ordonnance_date,drug1_name,drug2_name,drug1_id,drug2_id,interaction_type
0,ORD00000,100375,2129-05-02,acetaminophen,atorvastatin,DB00316,DB01076,risk or severity of adverse effects
68,ORD00006,100969,2142-11-28,lorazepam,midazolam,DB00186,DB00683,risk or severity of adverse effects
163,ORD00018,101361,2145-12-15,pantoprazole,tacrolimus,DB00213,DB00864,serum concentration
179,ORD00019,102203,2127-07-23,clotrimazole,diazepam,DB00257,DB00829,metabolism
186,ORD00019,102203,2127-07-23,clotrimazole,ondansetron,DB00257,DB00904,metabolism
...,...,...,...,...,...,...,...,...
35889,ORD01032,199395,2190-07-20,furosemide,metoprolol,DB00264,DB00695,risk or severity of adverse effects
35897,ORD01032,199395,2190-07-20,glipizide,metoprolol,DB00264,DB01067,hypoglycemic activities
35901,ORD01032,199395,2190-07-20,glipizide,ranitidine,DB00863,DB01067,serum concentration
35903,ORD01032,199395,2190-07-20,glipizide,warfarin,DB00682,DB01067,anticoagulant activities


# On remarque q'on a 978 interactions détecté 

## Ajouter une description pour chaque interaction

In [24]:
file_path = Path(r"C:\Users\Utilisateur\Desktop\Projet_ordonance\drugbank.xml\drugbank.xml")
tree = ET.parse(file_path)             # peut lever ET.XMLSyntaxError si le fichier est corrompu
root = tree.getroot()

In [25]:
NS = {"db": "http://www.drugbank.ca"}

In [26]:
records = []

for drug in root.findall(".//db:drug", NS):
    this_id   = drug.findtext(".//db:drugbank-id[@primary='true']", namespaces=NS)
    this_name = drug.findtext(".//db:name", namespaces=NS)

    # 3a.  Parcours des <drug-interaction> de ce médicament
    for di in drug.findall(".//db:drug-interactions/db:drug-interaction", NS):
        partner_id   = di.findtext("db:drugbank-id", namespaces=NS)
        partner_name = di.findtext("db:name", namespaces=NS)
        description  = di.findtext("db:description", namespaces=NS)

        records.append(
            {
                "drug_id":        this_id,
                "drug_name":      this_name,
                "partner_id":     partner_id,
                "partner_name":   partner_name,
                "description":    description,
            }
        )

# ------------------------------
# 4.  Option : transformer en DataFrame pandas
# ------------------------------
df = pd.DataFrame(records)
print(df.head())

   drug_id  drug_name partner_id     partner_name  \
0  DB00001  Lepirudin    DB01323  St. John's Wort   
1  DB00001  Lepirudin    DB00346        Alfuzosin   
2  DB00001  Lepirudin    DB13783       Acemetacin   
3  DB00001  Lepirudin    DB06605         Apixaban   
4  DB00001  Lepirudin    DB09075         Edoxaban   

                                         description  
0  The metabolism of Lepirudin can be increased w...  
1  The serum concentration of Alfuzosin can be in...  
2  The risk or severity of bleeding can be increa...  
3  Apixaban may increase the anticoagulant activi...  
4  Edoxaban may increase the anticoagulant activi...  


In [27]:
df.shape

(658079, 5)

In [28]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 658079 entries, 0 to 658078
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   drug_id       658079 non-null  object
 1   drug_name     658079 non-null  object
 2   partner_id    658079 non-null  object
 3   partner_name  658079 non-null  object
 4   description   658079 non-null  object
dtypes: object(5)
memory usage: 25.1+ MB


In [29]:
df

Unnamed: 0,drug_id,drug_name,partner_id,partner_name,description
0,DB00001,Lepirudin,DB01323,St. John's Wort,The metabolism of Lepirudin can be increased w...
1,DB00001,Lepirudin,DB00346,Alfuzosin,The serum concentration of Alfuzosin can be in...
2,DB00001,Lepirudin,DB13783,Acemetacin,The risk or severity of bleeding can be increa...
3,DB00001,Lepirudin,DB06605,Apixaban,Apixaban may increase the anticoagulant activi...
4,DB00001,Lepirudin,DB09075,Edoxaban,Edoxaban may increase the anticoagulant activi...
...,...,...,...,...,...
658074,DB13929,Relcovaptan,DB12749,Butylphthalide,Butylphthalide may increase the antiplatelet a...
658075,DB13929,Relcovaptan,DB12771,Hydroxytyrosol,Hydroxytyrosol may increase the antiplatelet a...
658076,DB13929,Relcovaptan,DB13036,Ramatroban,Ramatroban may increase the antiplatelet activ...
658077,DB13929,Relcovaptan,DB13400,Linsidomine,Linsidomine may increase the antiplatelet acti...


### On nettoie les données des deux tbale pour les joindre

In [31]:
description = df[['drug_name' , 'partner_name','description']]
description

Unnamed: 0,drug_name,partner_name,description
0,Lepirudin,St. John's Wort,The metabolism of Lepirudin can be increased w...
1,Lepirudin,Alfuzosin,The serum concentration of Alfuzosin can be in...
2,Lepirudin,Acemetacin,The risk or severity of bleeding can be increa...
3,Lepirudin,Apixaban,Apixaban may increase the anticoagulant activi...
4,Lepirudin,Edoxaban,Edoxaban may increase the anticoagulant activi...
...,...,...,...
658074,Relcovaptan,Butylphthalide,Butylphthalide may increase the antiplatelet a...
658075,Relcovaptan,Hydroxytyrosol,Hydroxytyrosol may increase the antiplatelet a...
658076,Relcovaptan,Ramatroban,Ramatroban may increase the antiplatelet activ...
658077,Relcovaptan,Linsidomine,Linsidomine may increase the antiplatelet acti...


In [32]:
description = description.rename(columns={'drug_name' : 'drug1_name','partner_name': 'drug2_name'})
print(description.columns.tolist())


['drug1_name', 'drug2_name', 'description']


In [33]:
cols = ["drug1_name", "drug2_name"]

# 4) Nettoyage strip + lower sur ces colonnes
description[cols] = description[cols].apply(
    lambda s: s.str.strip().str.lower()
)
description

Unnamed: 0,drug1_name,drug2_name,description
0,lepirudin,st. john's wort,The metabolism of Lepirudin can be increased w...
1,lepirudin,alfuzosin,The serum concentration of Alfuzosin can be in...
2,lepirudin,acemetacin,The risk or severity of bleeding can be increa...
3,lepirudin,apixaban,Apixaban may increase the anticoagulant activi...
4,lepirudin,edoxaban,Edoxaban may increase the anticoagulant activi...
...,...,...,...
658074,relcovaptan,butylphthalide,Butylphthalide may increase the antiplatelet a...
658075,relcovaptan,hydroxytyrosol,Hydroxytyrosol may increase the antiplatelet a...
658076,relcovaptan,ramatroban,Ramatroban may increase the antiplatelet activ...
658077,relcovaptan,linsidomine,Linsidomine may increase the antiplatelet acti...


In [34]:
for d in (result, description):
    d["key"] = d[["drug1_name", "drug2_name"]].apply(
        lambda r: "|".join(sorted(map(str.lower, r))),        # ex. "atorvastatin|paracetamol"
        axis=1
    )

# 2) Jointure et récupération de la description ---------------
result = result.merge(
    description[["key", "description"]].drop_duplicates("key"),
    on="key",
    how="left"            # garde toutes les observations
).drop(columns="key")     #

In [35]:
result.drop(columns=['drug1_id','drug2_id'],axis=1,inplace=True)

In [36]:
result

Unnamed: 0,ordonnance_id,hadm_id,ordonnance_date,drug1_name,drug2_name,interaction_type,description
0,ORD00000,100375,2129-05-02,acetaminophen,atorvastatin,risk or severity of adverse effects,The risk or severity of adverse effects can be...
1,ORD00006,100969,2142-11-28,lorazepam,midazolam,risk or severity of adverse effects,The risk or severity of adverse effects can be...
2,ORD00018,101361,2145-12-15,pantoprazole,tacrolimus,serum concentration,The serum concentration of Tacrolimus can be i...
3,ORD00019,102203,2127-07-23,clotrimazole,diazepam,metabolism,The metabolism of Diazepam can be decreased wh...
4,ORD00019,102203,2127-07-23,clotrimazole,ondansetron,metabolism,The metabolism of Ondansetron can be decreased...
...,...,...,...,...,...,...,...
973,ORD01032,199395,2190-07-20,furosemide,metoprolol,risk or severity of adverse effects,The risk or severity of adverse effects can be...
974,ORD01032,199395,2190-07-20,glipizide,metoprolol,hypoglycemic activities,Metoprolol may increase the hypoglycemic activ...
975,ORD01032,199395,2190-07-20,glipizide,ranitidine,serum concentration,The serum concentration of Glipizide can be in...
976,ORD01032,199395,2190-07-20,glipizide,warfarin,anticoagulant activities,Glipizide may increase the anticoagulant activ...


## Ajout du niveau de gravité pour chaque interaction médicamenteuse


In [38]:
all_filles = glob.glob(r'C:\Users\Utilisateur\Desktop\Projet_ordonance\Level\*.csv')

In [39]:
ddi_all = pd.concat([pd.read_csv(f) for f in all_filles], ignore_index=True)

In [40]:
ddi_all

Unnamed: 0,DDInterID_A,Drug_A,DDInterID_B,Drug_B,Level
0,DDInter1263,Naltrexone,DDInter1,Abacavir,Moderate
1,DDInter1,Abacavir,DDInter1348,Orlistat,Moderate
2,DDInter58,Aluminum hydroxide,DDInter582,Dolutegravir,Major
3,DDInter112,Aprepitant,DDInter582,Dolutegravir,Minor
4,DDInter138,Attapulgite,DDInter582,Dolutegravir,Major
...,...,...,...,...,...
222378,DDInter513,Dexamethasone,DDInter801,Galactose,Unknown
222379,DDInter1683,Sodium bicarbonate,DDInter801,Galactose,Unknown
222380,DDInter317,Cefepime,DDInter801,Galactose,Unknown
222381,DDInter1906,Ursodeoxycholic acid,DDInter801,Galactose,Unknown


In [41]:
Level = ddi_all[['Drug_A','Drug_B','Level']]
Level

Unnamed: 0,Drug_A,Drug_B,Level
0,Naltrexone,Abacavir,Moderate
1,Abacavir,Orlistat,Moderate
2,Aluminum hydroxide,Dolutegravir,Major
3,Aprepitant,Dolutegravir,Minor
4,Attapulgite,Dolutegravir,Major
...,...,...,...
222378,Dexamethasone,Galactose,Unknown
222379,Sodium bicarbonate,Galactose,Unknown
222380,Cefepime,Galactose,Unknown
222381,Ursodeoxycholic acid,Galactose,Unknown


In [42]:
Level.rename(columns={'Drug_A':'drug1_name','Drug_B':'drug2_name'},inplace=True)
Level

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Level.rename(columns={'Drug_A':'drug1_name','Drug_B':'drug2_name'},inplace=True)


Unnamed: 0,drug1_name,drug2_name,Level
0,Naltrexone,Abacavir,Moderate
1,Abacavir,Orlistat,Moderate
2,Aluminum hydroxide,Dolutegravir,Major
3,Aprepitant,Dolutegravir,Minor
4,Attapulgite,Dolutegravir,Major
...,...,...,...
222378,Dexamethasone,Galactose,Unknown
222379,Sodium bicarbonate,Galactose,Unknown
222380,Cefepime,Galactose,Unknown
222381,Ursodeoxycholic acid,Galactose,Unknown


In [43]:
for d in (result, Level):
    d["key"] = d[["drug1_name", "drug2_name"]].apply(
        lambda r: "|".join(sorted(map(str.lower, r))),        # ex. "atorvastatin|paracetamol"
        axis=1
    )

# 2) Jointure et récupération de la description ---------------
result = result.merge(
    Level[["key", "Level"]].drop_duplicates("key"),
    on="key",
    how="left"            # garde toutes les observations
).drop(columns="key")     #

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  d["key"] = d[["drug1_name", "drug2_name"]].apply(


In [44]:
Level

Unnamed: 0,drug1_name,drug2_name,Level,key
0,Naltrexone,Abacavir,Moderate,abacavir|naltrexone
1,Abacavir,Orlistat,Moderate,abacavir|orlistat
2,Aluminum hydroxide,Dolutegravir,Major,aluminum hydroxide|dolutegravir
3,Aprepitant,Dolutegravir,Minor,aprepitant|dolutegravir
4,Attapulgite,Dolutegravir,Major,attapulgite|dolutegravir
...,...,...,...,...
222378,Dexamethasone,Galactose,Unknown,dexamethasone|galactose
222379,Sodium bicarbonate,Galactose,Unknown,galactose|sodium bicarbonate
222380,Cefepime,Galactose,Unknown,cefepime|galactose
222381,Ursodeoxycholic acid,Galactose,Unknown,galactose|ursodeoxycholic acid


In [45]:
result

Unnamed: 0,ordonnance_id,hadm_id,ordonnance_date,drug1_name,drug2_name,interaction_type,description,Level
0,ORD00000,100375,2129-05-02,acetaminophen,atorvastatin,risk or severity of adverse effects,The risk or severity of adverse effects can be...,
1,ORD00006,100969,2142-11-28,lorazepam,midazolam,risk or severity of adverse effects,The risk or severity of adverse effects can be...,
2,ORD00018,101361,2145-12-15,pantoprazole,tacrolimus,serum concentration,The serum concentration of Tacrolimus can be i...,Moderate
3,ORD00019,102203,2127-07-23,clotrimazole,diazepam,metabolism,The metabolism of Diazepam can be decreased wh...,Moderate
4,ORD00019,102203,2127-07-23,clotrimazole,ondansetron,metabolism,The metabolism of Ondansetron can be decreased...,Unknown
...,...,...,...,...,...,...,...,...
973,ORD01032,199395,2190-07-20,furosemide,metoprolol,risk or severity of adverse effects,The risk or severity of adverse effects can be...,
974,ORD01032,199395,2190-07-20,glipizide,metoprolol,hypoglycemic activities,Metoprolol may increase the hypoglycemic activ...,Moderate
975,ORD01032,199395,2190-07-20,glipizide,ranitidine,serum concentration,The serum concentration of Glipizide can be in...,Moderate
976,ORD01032,199395,2190-07-20,glipizide,warfarin,anticoagulant activities,Glipizide may increase the anticoagulant activ...,Moderate


In [94]:
result['Level'].isnull().sum()

617

In [47]:
Level['Level'].value_counts()

Level
Moderate    130367
Unknown      47182
Major        33896
Minor        10938
Name: count, dtype: int64

In [48]:
Level['Level'].isnull().sum()

0

In [49]:
nb_total = len(result)
nb_connus = result["Level"].notna().sum()
nb_inconnus = result["Level"].isna().sum()

print(f"Total paires analysées : {nb_total}")
print(f"Interactions trouvées  : {nb_connus}")
print(f"Interactions manquantes : {nb_inconnus}")


Total paires analysées : 978
Interactions trouvées  : 361
Interactions manquantes : 617
