Load the embeddings

In [3]:
import numpy as np
import pandas as pd

notes = pd.read_csv(
    r'...\NOTEEVENTS.csv.gz',
    dtype={4: str, 5: str}  # or int, float, etc. depending on data
)

notes = notes[notes["CATEGORY"].isin(["Discharge summary"])]
notes = notes.dropna(subset=["TEXT", "HADM_ID"])

patient_texts = notes.groupby("HADM_ID")["TEXT"].apply(lambda x: "\n".join(x)).reset_index()

In [5]:
# Ensure consistent ID type
patient_texts['HADM_ID'] = patient_texts['HADM_ID'].astype(str)

# Load clinical note embeddings
loaded = np.load(r'...\useremb.npz')
clinical_embs = loaded['array1']  # shape: (n_users, emb_dim)

projected_note_embs = []

for i, subj_id in enumerate(patient_texts['HADM_ID']):
    clinical_emb = clinical_embs[i]
    
    projected_note_embs.append(clinical_emb)

projected_note_embs = np.array(projected_note_embs)
print(f"Final note embedding shape: {projected_note_embs.shape}")

Final note embedding shape: (52726, 768)


In [6]:
import torch
import torch.nn as nn
import numpy as np

class EmbeddingProjector(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.proj = nn.Sequential(
            nn.Linear(input_dim, output_dim),
            nn.ReLU(),  # Optional: or GELU, Tanh
            nn.Dropout(0.1)
        )

    def forward(self, x):
        return self.proj(x)


In [7]:
# Convert to tensor
combined_tensor = torch.tensor(projected_note_embs, dtype=torch.float32)

# Project to original dimension (or any size)
input_dim = combined_tensor.shape[1]
output_dim = 128 
projector = EmbeddingProjector(input_dim, output_dim)

# Forward pass (no training yet)
with torch.no_grad():
    projected_tensor = projector(combined_tensor)

# Convert back to NumPy if needed for Cornac
projected_note_embs = projected_tensor.numpy()


In [8]:
projected_note_embs.shape

(52726, 128)

In [8]:
drug_embeddings = np.load(r'...\drug_embeddings.npy', allow_pickle=True)

In [9]:
import torch
# Convert to tensor
combined_tensor = torch.tensor(drug_embeddings, dtype=torch.float32)

# Project to original dimension (or any size)
input_dim = combined_tensor.shape[1]
output_dim = 128
projector = EmbeddingProjector(input_dim, output_dim)

# Forward pass (no training yet)
with torch.no_grad():
    projected_tensor = projector(combined_tensor)

# Convert back to NumPy if needed for Cornac
projected_drug_embeddings = projected_tensor.numpy()

In [10]:
projected_drug_embeddings.shape

(499, 128)

Safety Metrics

In [11]:
from cornac.metrics import RankingMetric

class DDIRate(RankingMetric):
    def __init__(self, ddi_matrix, k=10, name="DDI@10"):
        """
        Parameters:
        - ddi_pairs: set of (drug_id_1, drug_id_2) tuples indicating known DDIs.
        - k: number of top predicted items to consider per user.
        """
        super().__init__(name=name, k=k)
        self.ddi_matrix = ddi_matrix

    def compute(self, gt_pos, gt_neg, pd_rank, pd_scores, item_indices=None):
        top_k_items = pd_rank[:self.k]
        ddi_count = 0
        total_pairs = 0

        for i in range(len(top_k_items)):
            for j in range(i + 1, len(top_k_items)):
                d1, d2 = top_k_items[i], top_k_items[j]    
                if frozenset({d1, d2}) in self.ddi_matrix or frozenset({d2, d1}) in self.ddi_matrix:
                    ddi_count += 1
                total_pairs += 1

        ddi_rate = ddi_count / total_pairs if total_pairs > 0 else 0.0
        return ddi_rate


In [12]:
import numpy as np
from itertools import combinations

class ToxicityDDIRate(RankingMetric):
    def __init__(self, toxicity_matrix, k=10, name="ToxicityDDI@10"):
        """
        Parameters:
        - toxicity_matrix: 2D NumPy array or sparse matrix where toxicity_matrix[i, j] 
                           gives the toxicity score of the DDI between drugs i and j.
                           (0 if no interaction, >0 if interaction exists)
        - k: number of top predicted items to consider per user.
        """
        super().__init__(name=name, k=k)
        self.toxicity_matrix = toxicity_matrix

    def compute(self, gt_pos, gt_neg, pd_rank, pd_scores, item_indices=None):
        top_k_items = pd_rank[:self.k]
        if len(top_k_items) < 2:
            return 0.0

        # All unordered pairs among top-k
        pairs = np.array(list(combinations(top_k_items, 2)))

        # Sum toxicity of interactions among top-k items
        toxicity_sum = self.toxicity_matrix[pairs[:, 0], pairs[:, 1]].sum()
        total_pairs = len(pairs)

        return toxicity_sum / total_pairs if total_pairs > 0 else 0.0


In [15]:
import pickle
with open(r'...\mapped_ddi_pairs.pkl', 'rb') as f:
    mapped_ddi_pairs = pickle.load(f)

In [16]:
print(mapped_ddi_pairs[0])

('clomipramine', 'itraconazole', 'minor')


TBBPR 

In [17]:
import numpy as np
import copy
from tqdm.auto import trange
from cornac.models import Recommender
from cornac.utils.init_utils import uniform, zeros


def normalize_rows(mat):
    """Row-wise L2 normalization."""
    norms = np.linalg.norm(mat, axis=1, keepdims=True)
    norms[norms == 0] = 1
    return mat / norms


class TBBPR(Recommender):
    """
    MultiTask BPR with:
    - Pretrained patient/drug embeddings
    - Optional DDI toxicity multitask loss (vectorized)
    """

    def __init__(
        self,
        # pretrained
        patient_embeddings=None,
        drug_embeddings=None,
        fold_uid_map=None,
        fold_iid_map=None,
        residual_scale=0.01,
        # multitask BPR
        k=50,
        max_iter=100,
        learning_rate=0.01,
        lambda_reg=0.001,
        alpha=0.8,
        ddi_pairs=None,
        # misc
        verbose=False,
        seed=None,
    ):
        super().__init__(name="TBBPR", trainable=True, verbose=verbose)

        # embedding dim (override if pretrained provided)
        self.k = (
            patient_embeddings.shape[1]
            if patient_embeddings is not None
            else (drug_embeddings.shape[1] if drug_embeddings is not None else k)
        )

        # store pretrained + maps
        self.patient_embeddings = (
            normalize_rows(patient_embeddings) if patient_embeddings is not None else None
        )
        self.drug_embeddings = (
            normalize_rows(drug_embeddings) if drug_embeddings is not None else None
        )
        self.fold_uid_map = fold_uid_map or {}
        self.fold_iid_map = fold_iid_map or {}

        self.residual_scale = residual_scale

        self.max_iter = max_iter
        self.learning_rate = learning_rate
        self.lambda_reg = lambda_reg
        self.alpha = alpha
        self.ddi_pairs = ddi_pairs if ddi_pairs is not None else []
        self.seed = seed
        self.rng = np.random.RandomState(seed)


        # save params for clone
        self._init_params = copy.deepcopy(locals())
        self._init_params.pop("self")

    def _init_factors(self, train_set):
        # ---- users ----
        self.u_factors = (uniform((train_set.num_users, self.k),
                                  random_state=self.rng,
                                  dtype=np.float32) - 0.5) / self.k
        covered_users = 0
        if self.patient_embeddings is not None and self.fold_uid_map:
            for uid in range(train_set.num_users):
                raw_uid = train_set.user_ids[uid]
                if raw_uid in self.fold_uid_map:
                    emb_idx = self.fold_uid_map[raw_uid]
                    base = self.patient_embeddings[emb_idx]
                    if not np.allclose(base, 0):
                        covered_users += 1
                    self.u_factors[uid] = base + self.residual_scale * (
                        uniform((1, self.k), random_state=self.rng, dtype=np.float32) - 0.5
                    ) / self.k
        if self.verbose:
            print(f"Users with pretrained emb: {covered_users}/{train_set.num_users}")

        # ---- items ----
        self.i_factors = (uniform((train_set.num_items, self.k),
                                  random_state=self.rng,
                                  dtype=np.float32) - 0.5) / self.k
        covered_items = 0
        if self.drug_embeddings is not None and self.fold_iid_map:
            for iid in range(train_set.num_items):
                raw_iid = train_set.item_ids[iid].lower().strip()
                if raw_iid in self.fold_iid_map:
                    emb_idx = self.fold_iid_map[raw_iid]
                    base = self.drug_embeddings[emb_idx]
                    if not np.allclose(base, 0):
                        covered_items += 1
                    self.i_factors[iid] = base + self.residual_scale * (
                        uniform((1, self.k), random_state=self.rng, dtype=np.float32) - 0.5
                    ) / self.k
        if self.verbose:
            print(f"Items with pretrained emb: {covered_items}/{train_set.num_items}")

        # biases
        self.i_biases = zeros(train_set.num_items, dtype=np.float32)

    def _prepare_data(self, train_set):
        X = train_set.matrix
        user_counts = np.ediff1d(X.indptr)
        user_ids = np.repeat(np.arange(train_set.num_users), user_counts)
        return X, user_counts, user_ids

    def fit(self, train_set, val_set=None):
        super().fit(train_set, val_set)
        self._init_factors(train_set)
        X, _, user_ids = self._prepare_data(train_set)
        neg_item_ids = np.arange(train_set.num_items, dtype=np.int32)

        with trange(self.max_iter, disable=not self.verbose) as progress:
            for epoch in progress:
                correct, skipped = self._fit_sgd(user_ids, X.indices, X.indptr, neg_item_ids)
                if self.verbose:
                    progress.set_postfix({
                        "correct": "%.2f%%" % (100.0 * correct / (len(user_ids) - skipped)),
                        "skipped": "%.2f%%" % (100.0 * skipped / len(user_ids))
                    })
        return self



    def _fit_sgd(self, user_ids, item_ids, indptr, neg_item_ids):
        item_id_set = set(item_ids)

        # filter DDI pairs
        if self.ddi_pairs:
            filtered = [(d1, d2, sev) for d1, d2, sev in self.ddi_pairs
                        if d1 in item_id_set and d2 in item_id_set]
            if filtered:
                d1_idx, d2_idx, sev_vals = zip(*filtered)
                d1_idx, d2_idx, sev_vals = map(np.array, (d1_idx, d2_idx, sev_vals))
            else:
                d1_idx = d2_idx = sev_vals = np.array([])
        else:
            d1_idx = d2_idx = sev_vals = np.array([])

        # neg items per user
        user_neg_sets = {}
        for u in np.unique(user_ids):
            start, end = indptr[u], indptr[u + 1]
            user_neg_sets[u] = np.array(list(set(neg_item_ids) - set(item_ids[start:end])))

        # pos/neg triplets
        u_array, i_array, j_array = [], [], []
        for u in np.unique(user_ids):
            pos_items = item_ids[indptr[u]:indptr[u + 1]]
            neg_items = user_neg_sets[u]
            n_samples = len(pos_items)
            if n_samples == 0:
                continue
            u_array.extend([u] * n_samples)
            i_array.extend(pos_items)
            j_array.extend(self.rng.choice(neg_items, size=n_samples, replace=True))
        u_array, i_array, j_array = map(np.array, (u_array, i_array, j_array))


        # --- BPR forward ---
        x_ui = self.i_biases[i_array] + np.sum(self.u_factors[u_array] * self.i_factors[i_array], axis=1)
        x_uj = self.i_biases[j_array] + np.sum(self.u_factors[u_array] * self.i_factors[j_array], axis=1)
        x_uij = x_ui - x_uj
        z = 1.0 / (1.0 + np.exp(x_uij))
        correct = np.sum(z < 0.5)
        skipped = 0        
        
        # --- BPR updates ---
        grad_u = z[:, None] * (self.i_factors[i_array] - self.i_factors[j_array]) - self.lambda_reg * self.u_factors[u_array]
        grad_i = z[:, None] * self.u_factors[u_array] - self.lambda_reg * self.i_factors[i_array]
        grad_j = -z[:, None] * self.u_factors[u_array] - self.lambda_reg * self.i_factors[j_array]

        np.add.at(self.u_factors, u_array, self.learning_rate * self.alpha * grad_u)
        np.add.at(self.i_factors, i_array, self.learning_rate * self.alpha * grad_i)
        np.add.at(self.i_factors, j_array, self.learning_rate * self.alpha * grad_j)
        np.add.at(self.i_biases, i_array, self.learning_rate * self.alpha * (z - self.lambda_reg * self.i_biases[i_array]))
        np.add.at(self.i_biases, j_array, self.learning_rate * self.alpha * (-z - self.lambda_reg * self.i_biases[j_array]))

        # --- DDI toxicity updates ---
        if d1_idx.size > 0:
            prod = np.sum(self.i_factors[d1_idx] * self.i_factors[d2_idx], axis=1) - sev_vals
            grad_d1 = prod[:, None] * self.i_factors[d2_idx]
            grad_d2 = prod[:, None] * self.i_factors[d1_idx]
            self.i_factors[d1_idx] -= self.learning_rate * (1 - self.alpha) * grad_d1
            self.i_factors[d2_idx] -= self.learning_rate * (1 - self.alpha) * grad_d2

        return correct, skipped

    def score(self, user_idx, item_idx=None):
        if item_idx is None:
            scores = np.copy(self.i_biases)
            scores += self.u_factors[user_idx] @ self.i_factors.T
            return scores
        else:
            return self.i_biases[item_idx] + self.u_factors[user_idx] @ self.i_factors[item_idx]

    def _get_init_params(self):
        return copy.deepcopy(self._init_params)

    def clone(self, new_params=None):
        import inspect
        params = copy.deepcopy(self._init_params)
        if new_params:
            params.update(new_params)
        init_params = inspect.signature(self.__init__).parameters
        filtered_params = {k: v for k, v in params.items() if k in init_params}
        return self.__class__(**filtered_params)


In [25]:
# ------------------- Step 0: Imports -------------------
import numpy as np
import pandas as pd
import re
from tqdm import tqdm
from rapidfuzz import process
import cornac
from cornac.data import Dataset
from cornac.eval_methods import CrossValidation
from cornac.metrics import Recall, NDCG, MRR

# ------------------- Step 1: Load Data -------------------
ratings_df = pd.read_csv(r"...\user_drug_rating_visit_anemia.csv")
matched_df = pd.read_csv(r"...\drugbank_mimic_rxcui_map.csv")

# ensure IDs are strings
patient_texts['HADM_ID'] = patient_texts['HADM_ID'].astype(float).astype(int).astype(str)

# ------------------- Step 2: Clean Drug Names -------------------
def clean_drug_name(name):
    if pd.isnull(name):
        return ""
    name = name.lower().strip()
    name = re.sub(
        r"\b\d+(\.\d+)?\s*(mg|ml|mcg|units|tablet|tab|capsule|cap|drop|syrup|patch|ointment|cream|injection|solution|suspension|oral|inj|dose|suppository)\b",
        "",
        name,
    )
    name = re.sub(r"[^\w\s]", "", name)
    name = re.sub(r"\s+", " ", name)
    return name.strip()

ratings_df.dropna(subset=["user", "item", "rating"], inplace=True)
ratings_df["user"] = ratings_df["user"].astype(str)
ratings_df["item"] = ratings_df["item"].astype(str)
ratings_df["rating"] = ratings_df["rating"].astype(float)
ratings_df["clean_item"] = ratings_df["item"].apply(clean_drug_name)

matched_df["Generic_Name"] = matched_df["Generic_Name"].astype(str)
matched_df["clean_generic"] = matched_df["Generic_Name"].apply(clean_drug_name)

# ------------------- Step 3: Fuzzy Match -------------------
ratings_items = ratings_df["clean_item"].unique()
mimic_generics = matched_df["clean_generic"].unique().tolist()

lookup = {}
for item in ratings_items:
    match = process.extractOne(item, mimic_generics, score_cutoff=80)
    if match:
        lookup[item] = match[0]

ratings_df["matched_generic"] = ratings_df["clean_item"].map(lookup)
ratings_df["matched_generic"] = ratings_df["matched_generic"].fillna(ratings_df["clean_item"])

# ------------------- Step 4: Filter Users With Embeddings -------------------
subj_id_to_emb_idx = {sid: idx for idx, sid in enumerate(patient_texts["HADM_ID"])}


# ------------------- Step 5: Prepare UIR -------------------
uir_data = list(zip(ratings_df["user"], ratings_df["matched_generic"], ratings_df["rating"]))
cornac_data = Dataset.from_uir(uir_data, seed=123)
uid_map = cornac_data.uid_map
iid_map = cornac_data.iid_map

# ------------------- Step 6: User/Item Embeddings -------------------
def align_user_embeddings(cornac_uid_map, raw_to_emb_idx, embeddings):
    mat = np.zeros((len(cornac_uid_map), embeddings.shape[1]))
    for raw_uid, internal_uid in cornac_uid_map.items():
        if raw_uid in raw_to_emb_idx:
            mat[internal_uid] = embeddings[raw_to_emb_idx[raw_uid]]
    return mat

def align_item_embeddings(cornac_iid_map, raw_to_emb_idx, embeddings):
    mat = np.zeros((len(cornac_iid_map), embeddings.shape[1]))
    for raw_iid, internal_iid in cornac_iid_map.items():
        key = raw_iid.lower().strip()
        if key in raw_to_emb_idx:
            mat[internal_iid] = embeddings[raw_to_emb_idx[key]]
    return mat

# align embeddings
user_emb_matrix = align_user_embeddings(uid_map, subj_id_to_emb_idx, projected_note_embs)
drug_id_to_index = {row["clean_generic"]: idx for idx, row in matched_df.iterrows()}
item_emb_matrix = align_item_embeddings(iid_map, drug_id_to_index, projected_drug_embeddings)

# ------------------- Step 7: Filter DDI Pairs -------------------
current_drugs = {drug.lower().strip() for drug in ratings_df["matched_generic"]}
filtered_ddi_pairs = [
    (d1.lower().strip(), d2.lower().strip(), sev)
    for (d1, d2, sev) in mapped_ddi_pairs
    if d1.lower().strip() in current_drugs and d2.lower().strip() in current_drugs
]

# Map raw drug names to internal item indices
toxicity_map = {"minor": 1.0, "moderate": 2.0, "major": 3.0}
ddi_index_pairs = []
for d1, d2, sev in filtered_ddi_pairs:
    if d1 in iid_map and d2 in iid_map:
        ddi_index_pairs.append((iid_map[d1], iid_map[d2], toxicity_map[sev]))

print("Example DDI pair (internal ids):", ddi_index_pairs[0])

# ------------------- Step 8: Initialize Model -------------------
model = TBBPR(
    k=128,
    alpha=0.005,
    max_iter=1000,
    learning_rate=0.001,
    ddi_pairs=ddi_index_pairs,
    lambda_reg=0.001,
    verbose=True,
    seed=42,
    patient_embeddings=user_emb_matrix,
    drug_embeddings=item_emb_matrix,
    fold_uid_map=uid_map,
    fold_iid_map=iid_map
)

# ------------------- Step 9: Setup Evaluation -------------------
ratio_split = CrossValidation(
    data=uir_data,
    n_folds=10,
    exclude_unknowns=True,
    rating_threshold=1.0,
    verbose=True,
    seed=123,
)

# Create toxicity matrix
num_items = len(iid_map)
toxicity_matrix = np.zeros((num_items, num_items), dtype=float)
for d1, d2, sev in filtered_ddi_pairs:
    if d1 in iid_map and d2 in iid_map:
        i, j = iid_map[d1], iid_map[d2]
        toxicity_matrix[i, j] = toxicity_map[sev]
        toxicity_matrix[j, i] = toxicity_map[sev]

toxicity_ddi_metric = ToxicityDDIRate(toxicity_matrix=toxicity_matrix, k=10)

ddi_index_pairs_set = set()
for d1, d2, _ in filtered_ddi_pairs:
    if d1 in iid_map and d2 in iid_map:
        ddi_index_pairs_set.add(frozenset((iid_map[d1], iid_map[d2])))

ddi_metric = DDIRate(ddi_matrix=ddi_index_pairs_set, k=10)

eval_metrics = [Recall(k=10), NDCG(k=[10]), toxicity_ddi_metric, ddi_metric]
cornac.Experiment(
    eval_method=ratio_split,
    models=[model],
    metrics=eval_metrics,
    user_based=True,
).run()




Example DDI pair (internal ids): (814, 438, 1.0)
rating_threshold = 1.0
exclude_unknowns = True
Fold: 1




---
Training data:
Number of users = 15162
Number of items = 1122
Number of ratings = 367245
Max rating = 1.0
Min rating = 1.0
Global mean = 1.0
---
Test data:
Number of users = 15162
Number of items = 1122
Number of ratings = 44305
Number of unknown users = 0
Number of unknown items = 0




---
Validation data:
Number of users = 15162
Number of items = 1122
Number of ratings = 44305
---
Total users = 15162
Total items = 1122

[MultiTaskEnhancedBPR] Training started!
Users with pretrained emb: 14930/15162
Items with pretrained emb: 454/1122


  0%|          | 0/1000 [00:00<?, ?it/s]


[MultiTaskEnhancedBPR] Evaluation started!


Ranking:   0%|          | 0/13756 [00:00<?, ?it/s]

Fold: 2




---
Training data:
Number of users = 15162
Number of items = 1116
Number of ratings = 367247
Max rating = 1.0
Min rating = 1.0
Global mean = 1.0
---
Test data:
Number of users = 15162
Number of items = 1116
Number of ratings = 44305
Number of unknown users = 0
Number of unknown items = 0




---
Validation data:
Number of users = 15162
Number of items = 1116
Number of ratings = 44305
---
Total users = 15162
Total items = 1116

[MultiTaskEnhancedBPR] Training started!
Users with pretrained emb: 14930/15162
Items with pretrained emb: 450/1116


  0%|          | 0/1000 [00:00<?, ?it/s]


[MultiTaskEnhancedBPR] Evaluation started!


Ranking:   0%|          | 0/13807 [00:00<?, ?it/s]

Fold: 3




---
Training data:
Number of users = 15161
Number of items = 1129
Number of ratings = 367184
Max rating = 1.0
Min rating = 1.0
Global mean = 1.0
---
Test data:
Number of users = 15161
Number of items = 1129
Number of ratings = 44296
Number of unknown users = 0
Number of unknown items = 0




---
Validation data:
Number of users = 15161
Number of items = 1129
Number of ratings = 44296
---
Total users = 15161
Total items = 1129

[MultiTaskEnhancedBPR] Training started!
Users with pretrained emb: 14929/15161
Items with pretrained emb: 451/1129


  0%|          | 0/1000 [00:00<?, ?it/s]


[MultiTaskEnhancedBPR] Evaluation started!


Ranking:   0%|          | 0/13753 [00:00<?, ?it/s]

Fold: 4




---
Training data:
Number of users = 15160
Number of items = 1104
Number of ratings = 367068
Max rating = 1.0
Min rating = 1.0
Global mean = 1.0
---
Test data:
Number of users = 15160
Number of items = 1104
Number of ratings = 44287
Number of unknown users = 0
Number of unknown items = 0




---
Validation data:
Number of users = 15160
Number of items = 1104
Number of ratings = 44287
---
Total users = 15160
Total items = 1104

[MultiTaskEnhancedBPR] Training started!
Users with pretrained emb: 14929/15160
Items with pretrained emb: 456/1104


  0%|          | 0/1000 [00:00<?, ?it/s]


[MultiTaskEnhancedBPR] Evaluation started!


Ranking:   0%|          | 0/13777 [00:00<?, ?it/s]

Fold: 5




---
Training data:
Number of users = 15161
Number of items = 1127
Number of ratings = 367155
Max rating = 1.0
Min rating = 1.0
Global mean = 1.0
---
Test data:
Number of users = 15161
Number of items = 1127
Number of ratings = 44303
Number of unknown users = 0
Number of unknown items = 0




---
Validation data:
Number of users = 15161
Number of items = 1127
Number of ratings = 44303
---
Total users = 15161
Total items = 1127

[MultiTaskEnhancedBPR] Training started!
Users with pretrained emb: 14929/15161
Items with pretrained emb: 455/1127


  0%|          | 0/1000 [00:00<?, ?it/s]


[MultiTaskEnhancedBPR] Evaluation started!


Ranking:   0%|          | 0/13708 [00:00<?, ?it/s]

Fold: 6




---
Training data:
Number of users = 15160
Number of items = 1118
Number of ratings = 367163
Max rating = 1.0
Min rating = 1.0
Global mean = 1.0
---
Test data:
Number of users = 15160
Number of items = 1118
Number of ratings = 44311
Number of unknown users = 0
Number of unknown items = 0




---
Validation data:
Number of users = 15160
Number of items = 1118
Number of ratings = 44311
---
Total users = 15160
Total items = 1118

[MultiTaskEnhancedBPR] Training started!
Users with pretrained emb: 14928/15160
Items with pretrained emb: 454/1118


  0%|          | 0/1000 [00:00<?, ?it/s]


[MultiTaskEnhancedBPR] Evaluation started!


Ranking:   0%|          | 0/13730 [00:00<?, ?it/s]

Fold: 7




---
Training data:
Number of users = 15161
Number of items = 1117
Number of ratings = 367178
Max rating = 1.0
Min rating = 1.0
Global mean = 1.0
---
Test data:
Number of users = 15161
Number of items = 1117
Number of ratings = 44295
Number of unknown users = 0
Number of unknown items = 0




---
Validation data:
Number of users = 15161
Number of items = 1117
Number of ratings = 44295
---
Total users = 15161
Total items = 1117

[MultiTaskEnhancedBPR] Training started!
Users with pretrained emb: 14930/15161
Items with pretrained emb: 456/1117


  0%|          | 0/1000 [00:00<?, ?it/s]


[MultiTaskEnhancedBPR] Evaluation started!


Ranking:   0%|          | 0/13804 [00:00<?, ?it/s]

Fold: 8




---
Training data:
Number of users = 15162
Number of items = 1125
Number of ratings = 367193
Max rating = 1.0
Min rating = 1.0
Global mean = 1.0
---
Test data:
Number of users = 15162
Number of items = 1125
Number of ratings = 44289
Number of unknown users = 0
Number of unknown items = 0
---
Validation data:
Number of users = 15162
Number of items = 1125
Number of ratings = 44289
---
Total users = 15162
Total items = 1125





[MultiTaskEnhancedBPR] Training started!
Users with pretrained emb: 14930/15162
Items with pretrained emb: 455/1125


  0%|          | 0/1000 [00:00<?, ?it/s]


[MultiTaskEnhancedBPR] Evaluation started!


Ranking:   0%|          | 0/13745 [00:00<?, ?it/s]

Fold: 9




---
Training data:
Number of users = 15161
Number of items = 1111
Number of ratings = 367235
Max rating = 1.0
Min rating = 1.0
Global mean = 1.0
---
Test data:
Number of users = 15161
Number of items = 1111
Number of ratings = 44252
Number of unknown users = 0
Number of unknown items = 0




---
Validation data:
Number of users = 15161
Number of items = 1111
Number of ratings = 44252
---
Total users = 15161
Total items = 1111

[MultiTaskEnhancedBPR] Training started!
Users with pretrained emb: 14930/15161
Items with pretrained emb: 454/1111


  0%|          | 0/1000 [00:00<?, ?it/s]


[MultiTaskEnhancedBPR] Evaluation started!


Ranking:   0%|          | 0/13736 [00:00<?, ?it/s]

Fold: 10




---
Training data:
Number of users = 15161
Number of items = 1114
Number of ratings = 367026
Max rating = 1.0
Min rating = 1.0
Global mean = 1.0
---
Test data:
Number of users = 15161
Number of items = 1114
Number of ratings = 44314
Number of unknown users = 0
Number of unknown items = 0




---
Validation data:
Number of users = 15161
Number of items = 1114
Number of ratings = 44314
---
Total users = 15161
Total items = 1114

[MultiTaskEnhancedBPR] Training started!
Users with pretrained emb: 14929/15161
Items with pretrained emb: 450/1114


  0%|          | 0/1000 [00:00<?, ?it/s]


[MultiTaskEnhancedBPR] Evaluation started!


Ranking:   0%|          | 0/13817 [00:00<?, ?it/s]


TEST:
...
[MultiTaskEnhancedBPR]
       | DDI@10 | NDCG@10 | Recall@10 | SeverityDDI@10 | Train (s) | Test (s)
------ + ------ + ------- + --------- + -------------- + --------- + --------
Fold 0 | 0.2772 |  0.3506 |    0.4171 |         0.3472 | 5648.4009 |  12.1873
Fold 1 | 0.3370 |  0.3511 |    0.4269 |         0.3544 | 5734.8517 |  11.6404
Fold 2 | 0.2448 |  0.3455 |    0.4274 |         0.2678 | 5563.9981 |  11.6404
Fold 3 | 0.4002 |  0.3493 |    0.4213 |         0.4443 | 5564.0584 |  11.8591
Fold 4 | 0.2726 |  0.3535 |    0.4237 |         0.3042 | 5577.8404 |  11.6248
Fold 5 | 0.3076 |  0.3514 |    0.4183 |         0.3860 | 5589.0584 |  11.5935
Fold 6 | 0.2121 |  0.3572 |    0.4304 |         0.2202 | 5572.1840 |  11.7654
Fold 7 | 0.2975 |  0.3505 |    0.4286 |         0.3081 | 5603.2322 |  11.6716
Fold 8 | 0.1902 |  0.3564 |    0.4256 |         0.2000 | 5572.8243 |  11.6873
Fold 9 | 0.2867 |  0.3574 |    0.4284 |         0.2958 | 5572.7619 |  12.8122
------ + ------ + ------- + --

Case Study

In [28]:
# ------------------- Step 0: Pick 5 Patients -------------------
case_patients = ratings_df["user"].unique()[:5]  # first 5 unique patients
print("\n===== CASE STUDY: Top-10 Recommendations =====\n")

# ------------------- Step 1: Prepare Train Set -------------------
# Using all data as "train" for this example (no CV)
train_set = cornac_data

# Fit the model
model.fit(train_set)

# ------------------- Step 2: Generate Top-10 Recommendations -------------------
for pid in case_patients:
    if pid not in uid_map:
        continue  # skip if patient not mapped

    internal_uid = uid_map[pid]

    # Ground-truth drugs for this patient
    gt_drugs = ratings_df.loc[ratings_df["user"] == pid, "matched_generic"].unique().tolist()

    # Candidate items = all items
    all_items = list(iid_map.keys())
    item_indices = [iid_map[drug] for drug in all_items]

    # Get scores for all items (fixed matrix multiplication)
    item_factor_subset = model.i_factors[item_indices]  # shape: (num_items, k)
    scores = model.u_factors[internal_uid] @ item_factor_subset.T + model.i_biases[item_indices]

    # Rank by score and take Top-10
    top10_idx = np.argsort(scores)[::-1][:10]
    top10_drugs = [(all_items[i], scores[i]) for i in top10_idx]

    # Print results
    print(f"Patient HADM_ID: {pid}")
    print(f"  Ground Truth Drugs: {gt_drugs}")
    print("  Top-10 Recommendations:")
    for rank, (drug, score) in enumerate(top10_drugs, 1):
        marker = "✓" if drug in gt_drugs else ""
        print(f"    {rank:2d}. {drug} ({score:.4f}) {marker}")
    print("-" * 60)



===== CASE STUDY: Top-10 Recommendations =====

Users with pretrained emb: 14930/15162
Items with pretrained emb: 457/1149


  0%|          | 0/1000 [00:00<?, ?it/s]

Patient HADM_ID: 100003
  Ground Truth Drugs: ['acetaminophen', 'chloraseptic throat spray', 'folic acid', 'furosemide', 'lactulose', 'lidocaine', 'magnesium sulfate', 'nadolol', 'sarna lotion', 'sodium chloride', 'spironolactone', 'terbinafine 1 cream', 'thiamine']
  Top-10 Recommendations:
     1. sodium chloride (4.6272) ✓
     2. acetaminophen (4.4913) ✓
     3. heparin (4.4583) 
     4. docusate sodium (4.1799) 
     5. insulin sliding scale (4.1250) 
     6. metoprolol (4.0610) 
     7. potassium chloride (3.9449) 
     8. pantoprazole (3.9134) 
     9. morphine (3.8939) 
    10. senna (3.8594) 
------------------------------------------------------------
Patient HADM_ID: 100009
  Ground Truth Drugs: ['fenofibrate', 'acetaminophen', 'aspirin', 'atenolol', 'bisacodyl', 'calcium carbonate', 'cephalexin', 'chlorhexidine gluconate', 'dextrose 50', 'docusate sodium', 'sodium citrate', 'ezetimibe', 'furosemide', 'glucagon', 'glycopyrrolate', 'insulin sliding scale', 'insulin glargine',