# Multi CARNIVAL on PANACEA

This notebook shows how the results can be generated. To generate the results for the manuscript, we ran `script.py` on an HPC cluster for convenience. Multi-condition methods require a high-performance MILP solver; we used GUROBI for all experiments.


In [None]:
import corneto as cn
import pandas as pd
import numpy as np

# Drug and cell lines used as conditions throughout the notebook.
selected_drug = "PONATINIB"
selected_cells = ["H1793", "LNCAP", "KRJ1", "HCC1143", "EFO21", "PANC1", "HF2597"]

# Load the PANACEA table (tab-separated; the ".xz" suffix suggests an xz-compressed file).
df_panacea = pd.read_csv("GSE186341-PANACEA.tsv.xz", sep='\t')
# Drug label: text captured between an underscore of `obs_id` and the following "_v"
# (leftmost, non-greedy match).
df_panacea['drug'] = df_panacea['obs_id'].str.extract(r'_(.*?)_v')
# Cell label: leading part of `obs_id`, up to (not including) the first underscore.
df_panacea['cell'] = df_panacea['obs_id'].str.extract( r'^([^_]*)')
# Sign of the `act` column as computed by np.sign.
df_panacea['sign'] = np.sign(df_panacea['act'])

def filter_df(
    cells,
    drugs,
    df=None,
    resource="dorothea",
    pipeline="NA+deseq2",
    statparam="stat",
    status="unfiltered",
    padj=0.05,
):
    """Return the rows of a PANACEA table that match the requested conditions.

    Cell and drug names are compared case-insensitively; all other columns
    are compared for exact equality.

    Args:
        cells: Cell-line name, or iterable of names, to keep.
        drugs: Drug name, or iterable of names, to keep.
        df: Table to filter. It must provide the columns ``cell``, ``drug``,
            ``resource``, ``pipeline``, ``statparam``, ``status`` and
            ``padj``. When omitted, the module-level ``df_panacea`` is used.
        resource: Required value of the ``resource`` column.
        pipeline: Required value of the ``pipeline`` column.
        statparam: Required value of the ``statparam`` column.
        status: Required value of the ``status`` column.
        padj: Upper bound (inclusive) on the ``padj`` column.

    Returns:
        pandas.DataFrame: The subset of ``df`` satisfying every condition.
    """
    if df is None:
        # Look the table up at call time: a default bound in the signature
        # would keep pointing at the object that existed when this cell was
        # first executed, even after `df_panacea` is reloaded.
        df = df_panacea

    # A bare string would otherwise be iterated character by character.
    if isinstance(cells, str):
        cells = [cells]
    if isinstance(drugs, str):
        drugs = [drugs]

    wanted_cells = [name.upper() for name in cells]
    wanted_drugs = [name.upper() for name in drugs]

    mask = (
        df["cell"].str.upper().isin(wanted_cells)
        & df["drug"].str.upper().isin(wanted_drugs)
        & (df["resource"] == resource)
        & (df["pipeline"] == pipeline)
        & (df["statparam"] == statparam)
        & (df["status"] == status)
        & (df["padj"] <= padj)
    )
    return df[mask]

# Rows of the PANACEA table for the selected drug in any of the selected cell lines,
# using the default filters of filter_df.
df_conditions = filter_df(selected_cells, [selected_drug])

def get_measurements(cell, drug, df=None, resource="dorothea", pipeline="NA+deseq2", statparam="stat", padj=0.05, as_dict=True, status=None):
    """Select the measurements of one (cell line, drug) condition.

    Cell and drug names are compared case-insensitively; the remaining
    columns are compared for exact equality.

    Args:
        cell: Cell-line name.
        drug: Drug name.
        df: Table to read from. It must provide the columns ``cell``,
            ``drug``, ``resource``, ``pipeline``, ``statparam`` and ``padj``
            (plus ``items`` and ``sign`` when ``as_dict`` is true, and
            ``status`` when ``status`` is given). When omitted, the
            module-level ``df_panacea`` is used.
        resource: Required value of the ``resource`` column.
        pipeline: Required value of the ``pipeline`` column.
        statparam: Required value of the ``statparam`` column.
        padj: Upper bound (inclusive) on the ``padj`` column.
        as_dict: If true, return a ``{item: sign}`` dictionary; otherwise
            return the selected rows.
        status: Optional required value of the ``status`` column. The default
            ``None`` applies no status filter, which is the historic
            behaviour of this function; pass e.g. ``"unfiltered"`` to select
            the same rows as ``filter_df`` does by default.

    Returns:
        dict | pandas.DataFrame: ``{item: sign}`` when ``as_dict`` is true,
        otherwise the selected rows. A dictionary holds a single sign per
        item, so rows sharing an ``items`` value are not all represented.
    """
    if df is None:
        # Resolved at call time so that reloading `df_panacea` is picked up.
        df = df_panacea

    mask = (
        (df["drug"].str.upper() == drug.upper())
        & (df["cell"].str.upper() == cell.upper())
        & (df["resource"] == resource)
        & (df["pipeline"] == pipeline)
        & (df["statparam"] == statparam)
        & (df["padj"] <= padj)
    )
    if status is not None:
        mask &= df["status"] == status

    df_r = df[mask]
    if as_dict:
        return df_r[["items", "sign"]].set_index("items").to_dict()["sign"]
    return df_r

# Network graph loaded from a SIF file that has a header row.
# NOTE(review): column_order=[0, 2, 1] presumably tells corneto how the file's columns
# map to source / interaction / target — confirm against the corneto documentation.
G_pkn = cn.Graph.from_sif("network_collectri.sif", has_header=True, column_order=[0, 2, 1])
# Hard-coded list of candidate ponatinib targets.
ponatinib_targets = ["BCR", "ABL", "VEGFR", "PDGFRA_PDGFRB", "FGFR1", "FGFR2", "FGFR3", "FGFR4", "EPH", "SRC", "KIT", "RET", "TIE2", "FLT3"]
# Keep only the listed targets that are vertices of the graph, each mapped to -1.
# NOTE(review): -1 presumably encodes inhibition by the drug — confirm.
targets = {v: -1 for v in G_pkn.V if v in ponatinib_targets}

# Per-cell-line data passed to multi_carnival: every cell line shares the same
# `targets` dict as its "input" and gets its own measurements as "output".
input_data = {}
for cell in selected_cells:
    input_data[cell] = {
        "input": targets,
        "output": get_measurements(cell, selected_drug),
    }


from corneto.methods.future.carnival import Carnival
from corneto.methods.future.method import Dataset
from corneto.methods.signalling.carnival import multi_carnival
import pandas as pd

# Settings for the reference ("ground truth") runs.
n_reps = 10  # number of repeated solves; repetition i uses solver Seed=i
timelimit=120  # forwarded to the solver as TimeLimit
norel=120  # forwarded to the solver as NoRelHeurTime
gt_lambda = 0.1  # `lambd` value given to multi_carnival for the reference runs

# One entry per repetition: the edge values of the solved problem.
sols = []
for i in range(n_reps):
    # The problem is rebuilt for every repetition; only the solver seed differs.
    P, Gexp, stats = multi_carnival(G_pkn, input_data, lambd=gt_lambda)
    P.solve(solver="GUROBI", TimeLimit=timelimit, NoRelHeurTime=norel, Seed=i, verbosity=0);
    sols.append(P.expr.edge_value.value)
    print(i, "completed")


# Ground truth data. A script should be used to generate and save a dataset (ground truth)
# passing the arguments n_reps, timelimit, norel and gt_lambda
# The ground truth is the element-wise mean of the collected solutions over the
# repetitions, indexed by `Gexp.E` of the last graph returned by multi_carnival
# (presumably its edges — confirm).
df_gt = pd.DataFrame(np.mean(sols, axis=0), index=Gexp.E)


### Measuring performance

import random

def split_inputs_outputs_folds(data, folds, seed=None):
    """Split the outputs of every condition into K-fold (train, test) pairs.

    For each key in `data`, its 'output' entries are shuffled and partitioned
    into `folds` groups whose sizes differ by at most one. In fold i:
      - The test set (test_dict) contains the outputs from group i.
      - The train set (train_dict) contains all the remaining outputs.
    Over all folds, every output is used exactly once as test data. When a
    key has fewer outputs than `folds`, some of its test sets are empty.

    Args:
        data (dict): Maps each key to a dictionary with 'input' and 'output'
            entries; 'output' must itself be a dictionary.
        folds (int): Number of folds (train/test splits) to generate (>= 1).
        seed (int, optional): Random seed for reproducibility. When given, a
            private random generator is used, so the state of the global
            `random` module is left untouched. When None, the global
            `random` module is used.

    Returns:
        list: A list of tuples (train_dict, test_dict), one tuple per fold.
              Each train_dict contains for every key:
                  {'input': <original input>, 'output': <training outputs>}
              Each test_dict contains for every key:
                  {'output': <test outputs>}
              The 'input' object is shared, not copied, between folds.

    Raises:
        ValueError: If `folds` is smaller than 1.
    """
    if folds < 1:
        raise ValueError(f"folds must be a positive integer, got {folds!r}")

    # A dedicated generator produces the same shuffles as seeding the global
    # module would, without reseeding it as a hidden side effect.
    rng = random.Random(seed) if seed is not None else random

    # One (train_dict, test_dict) pair per fold, filled key by key below.
    folds_list = [({}, {}) for _ in range(folds)]

    for key, value in data.items():
        input_features = value['input']
        outputs = value['output']

        output_keys = list(outputs)
        rng.shuffle(output_keys)

        # The first `extra` folds receive one additional output each.
        base, extra = divmod(len(output_keys), folds)

        start = 0
        for i, (train_dict, test_dict) in enumerate(folds_list):
            size = base + 1 if i < extra else base
            test_keys = output_keys[start:start + size]
            start += size

            # Set membership keeps the train/test split linear in the number
            # of outputs; iterating `output_keys` preserves the shuffled order.
            held_out = set(test_keys)
            train_dict[key] = {
                'input': input_features,
                'output': {k: outputs[k] for k in output_keys if k not in held_out},
            }
            # The test dictionary only carries the held-out outputs.
            test_dict[key] = {
                'output': {k: outputs[k] for k in test_keys},
            }

    return folds_list

num_folds = 5
seed = 42  # For reproducibility

# One (train, test) pair of dictionaries per fold.
folds_data = split_inputs_outputs_folds(input_data, num_folds, seed)
# Notebook cell output: the training dictionary of the first fold.
folds_data[0][0]

# Values of `lambd` to evaluate.
lambdas = [0, 0.01, 0.05, 0.10, 0.15, 0.20, 0.25, 0.30]

# errors[lambda] -> list with one error value per fold.
errors = dict()
for l in lambdas:
    fold_errors = []
    for fold in folds_data:
        # Only the training part of the fold is used to build the problem;
        # the held-out part (fold[1]) is not read here.
        data = fold[0]
        sols_rec = []
        for i in range(n_reps):
            # Same solver settings as the ground-truth runs; repetition i uses Seed=i.
            P, Gexp, stats = multi_carnival(G_pkn, data, lambd=l)
            P.solve(solver="GUROBI", TimeLimit=timelimit, NoRelHeurTime=norel, Seed=i, verbosity=0);
            sols_rec.append(P.expr.edge_value.value)
            print(i, "completed")
        # Fold error: ground-truth table minus the mean solution over repetitions
        # (aligned on the index); NaN differences are replaced by 1; the result is
        # squared, averaged over rows for each column, and summed over columns.
        err = (df_gt - pd.DataFrame(np.mean(sols_rec, axis=0), index=Gexp.E)).fillna(1).pow(2).mean(axis=0).sum()
        fold_errors.append(err)
    errors[l] = fold_errors
    
