## Prepare Input Data for Curve Curator

This script creates the input data and the corresponding .toml parameter files needed to run CurveCurator. For each drug it writes one pair of files: a data file containing the cell viability data of every cell line tested with this drug, and a .toml file with the matching parameters.<br>
All created files will be stored in a directory (out_dir).<br>
Curve curator can then be called (for a single drug) like this:
```sh
(curve_curator)$ python ./CurveCurator <path_to_toml_file>
```

In [1]:
import numpy as np
import pandas as pd
import os

The input file (dataset_file) should contain a whole drug response data set (all drugs and cell lines) in the following format:<br>
| cell_line | drug         | dose | response |
|-----------|--------------|------|----------|
| PC-14     | Camptothecin | 0.1  | 0.519    |
| PC-14     | Camptothecin | 0.03 | 0.739    |
| 2004      | Lapatinib    | 0.0  | 1.0      |
| ...       | ...          | ...  | ...      |

The dose can be in any unit, e.g. uM, but this has to be specified in the toml parameters (dose_unit and dose_scale).<br>
For each drug-cell line pair, there has to be a row with dose 0 which is going to be used by curve curator to compute ratios.<br>
The response should be the relative viability (as a fraction of the untreated control, not in percent), so the response for dose 0 should always be 1.0.

In [9]:
# name of the data set; used as prefix of the id and description in every toml file
dataset_name = "ctrp2_mapped_nCC"
# tab-separated input file with (at least) the columns cell_line, drug, dose, response
dataset_file = "../processed_data/ctrp2_normalized_mapped.tsv"

# path to directory where data and .toml files are saved
# (must end with "/" because file and sub-directory names are appended to it directly)
out_dir = "../ctrp2_mapped_nCC/"

# if True, the last data point is removed
# (for every cell line, the rows measured at its highest dose are dropped)
wo_last = False

# parameters for toml file
# (these values are interpolated into the generated toml text as they are)
treatment_time = "72 h"
dose_unit = "uM"
dose_scale = "1e-06"
available_cores = 5
optimized_dofs = "true"
imputation = "false"
normalization = "false"
# NOTE(review): fit_weights_delta is not referenced by any of the visible cells - confirm whether it is still needed
fit_weights_delta = "1.0"
alpha = "0.05"
fc_lim = "0.45"

In [10]:
# Make sure the output sub-directories for curves, mad and dashboards exist.
# exist_ok=True turns the call into a no-op for directories that are already
# there, which replaces the separate (race-prone) os.path.isdir() check;
# os.makedirs also creates out_dir itself when it is missing.
for sub_dir in ("curves", "mad", "dashboards"):
    os.makedirs(f"{out_dir}{sub_dir}/", exist_ok=True)

In [11]:
# load the complete drug response data set (tab-separated) and show the first rows
df = pd.read_csv(dataset_file, sep="\t")
df.head()

Unnamed: 0,cell_line,drug,dose,response
0,CVCL_U759,16betabromoandrosterone,0.0011,0.926588
1,CVCL_U759,16betabromoandrosterone,0.0011,0.933033
2,CVCL_U759,16betabromoandrosterone,0.0023,0.659754
3,CVCL_U759,16betabromoandrosterone,0.0023,0.926588
4,CVCL_U759,16betabromoandrosterone,0.0045,0.939523


In [12]:
# collect the distinct drug names and derive a variant of each that is safe to use in a file name
drugs = np.unique(df["drug"])
drugs_file = np.array(drugs, dtype="str")
# every character that is problematic in a file name is replaced by an underscore
for unsafe_char in ("/", ",", " ", ":"):
    drugs_file = np.char.replace(drugs_file, unsafe_char, "_")

In [13]:
def create_toml_file(drug, drug_file, concentrations):
    """Build the text of the .toml parameter file for one drug.

    Besides its arguments, the function reads the module-level settings
    dataset_name, treatment_time, dose_scale, dose_unit, available_cores,
    imputation, normalization, optimized_dofs, alpha and fc_lim.

    Parameters
    ----------
    drug : str
        Drug name; written to the id, description and condition entries.
    drug_file : str
        File-name-safe variant of the drug name; used to build all paths.
    concentrations : numpy.ndarray
        Doses of the drug; one experiment (numbered from 1) is declared per dose.

    Returns
    -------
    str
        The complete content of the .toml file.
    """
    n_doses = len(concentrations)
    experiment_ids = list(range(1, n_doses + 1))
    dose_list = ", ".join(concentrations.astype(str))

    # one entry per output line; empty strings are the blank lines between sections
    lines = [
        "['Meta']",
        f"id = '{dataset_name}_{drug}'",
        f"description = '{dataset_name}, {drug}'",
        f"condition = '{drug}'",
        f"treatment_time = '{treatment_time}'",
        "",
        "['Experiment']",
        f"experiments = {experiment_ids}",
        f"doses = [{dose_list}]",
        f"dose_scale = '{dose_scale}'",
        f"dose_unit = '{dose_unit}'",
        "control_experiment = 1",
        "measurement_type = 'OTHER'",
        "data_type = 'OTHER'",
        "search_engine = 'OTHER'",
        "search_engine_version = '0'",
        "",
        "['Paths']",
        f"input_file = './{drug_file}.txt'",
        f"curves_file = './curves/{drug_file}_curves.txt'",
        "normalization_file = './norm.txt'",
        f"mad_file = './mad/{drug_file}_mad.txt'",
        f"dashboard = './dashboards/{drug_file}_dashboard.html'",
        "",
        "['Processing']",
        f"available_cores = {available_cores}",
        # all but two of the doses may be missing
        f"max_missing = {n_doses - 2}",
        f"imputation = {imputation}",
        f"normalization = {normalization}",
        "",
        "['F Statistic']",
        f"optimized_dofs = {optimized_dofs}",
        f"alpha = {alpha}",
        f"fc_lim = {fc_lim}",
    ]
    return "\n".join(lines) + "\n"

In [14]:
def create_data(drug, outfile=None):
    """Collect the doses of one drug and optionally write its raw data table.

    Reads the module-level data frame df (columns cell_line, drug, dose,
    response) and the module-level flag wo_last.

    Parameters
    ----------
    drug : str
        Drug name, matched exactly against the "drug" column of df.
    outfile : str or file-like, optional
        Target of the tab-separated data table (one row per cell line, one
        "Raw i" column per dose). If None, nothing is written.

    Returns
    -------
    numpy.ndarray
        The unique doses of the drug in ascending order (after the optional
        removal of the last data point).
    """
    data = df[df["drug"] == drug]
    if wo_last:
        # remove last measurement point for each cell line, i.e. every row
        # measured at that cell line's highest dose; the string "max" selects
        # the pandas reduction (passing the Python builtin is deprecated)
        highest_dose = data.groupby("cell_line")["dose"].transform("max")
        data = data[data["dose"] != highest_dose]
    # np.unique already returns the values sorted in ascending order
    doses = np.unique(data["dose"])
    if outfile is not None:
        # aggregate duplicates with mean; only the response column is averaged
        # so that additional (possibly non-numeric) columns cannot break the mean
        mean_response = data.groupby(["cell_line", "dose"], as_index=False)["response"].mean()
        # one row per cell line, one column per dose in ascending order
        table = mean_response.pivot(index="cell_line", columns="dose", values="response")
        table = table.sort_index(axis=1).reset_index()
        # dose columns are numbered from 1, in the same order as the returned doses
        table.columns = ["Name"] + [f"Raw {i}" for i in range(1, len(doses) + 1)]
        table.to_csv(outfile, index=False, sep="\t")
    return doses

In [15]:
# create data + toml file for each drug
# drugs and drugs_file are parallel arrays: the original drug name and its file-name-safe variant
for drug, drug_file in zip(drugs, drugs_file):
    drug_data_file = f"{out_dir}{drug_file}.txt"
    toml_file = f"{out_dir}{drug_file}.toml"
    # writes the data table and returns the doses that the toml file has to list
    concentrations = create_data(drug, drug_data_file)
    toml_content = create_toml_file(drug, drug_file, concentrations)
    # TOML documents must be UTF-8 encoded, so do not rely on the platform default encoding
    with open(toml_file, "w", encoding="utf-8") as toml_handle:
        toml_handle.write(toml_content)