## Forming a large dataset

Transform the datasets so that each row holds only one concentration and one response, i.e. the number of rows increases tenfold.


In [1]:
import pandas as pd
import numpy as np
import os

from sklearn.svm import SVR
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
import warnings
# Suppress every warning raised from this point on.
warnings.filterwarnings("ignore")
# Path prefix prepended to the input and output file names used below.
_FOLDER ="results/"

In [2]:
def PrepareDataSets(dict_data_type, training_data_type):
    """Load the train/test tables of one data variant, restricted to the listed drugs.

    Parameters
    ----------
    dict_data_type : dict
        Maps a data-variant name to a dict with the keys ``"drug_ids_list"``
        (path of a text file with one integer drug id per line),
        ``"train_df"`` and ``"test_df"`` (paths of CSV files).
    training_data_type : str
        Key of ``dict_data_type`` that selects the variant to load.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test)``; both are indexed by ``DRUG_ID`` and contain only
        the rows of the listed drugs, in the order of the drug-id file.

    Raises
    ------
    KeyError
        If ``training_data_type`` or one of the required keys is missing, or
        if a listed drug id does not occur in one of the CSV files.
    ValueError
        If a non-blank line of the drug-id file is not an integer.
    """
    paths = dict_data_type[training_data_type]

    # Blank lines (e.g. a trailing newline at the end of the file) are skipped
    # instead of being passed to np.int32, which would raise on an empty string.
    with open(paths["drug_ids_list"], 'r') as f:
        drug_ids_50 = [np.int32(line.strip()) for line in f if line.strip()]

    def _load(csv_path):
        # The "Unnamed: *" columns are presumably index columns written by an
        # earlier to_csv call; errors="ignore" lets files without them load too.
        table = pd.read_csv(csv_path).drop(
            columns=["Unnamed: 0", "Unnamed: 0.1"], errors="ignore")
        # .copy() detaches the selection from the full table that was read.
        return table.set_index("DRUG_ID").loc[drug_ids_50, :].copy()

    train = _load(paths["train_df"])
    test = _load(paths["test_df"])

    return train, test

In [3]:
def _load_feature_names(file_name):
    """Return the lines of ``_FOLDER + file_name`` without their trailing newline."""
    with open(_FOLDER + file_name, 'r') as handle:
        return [row.rstrip('\n') for row in handle]


# One list of column names per feature group; each file stores one name per line.
X_cancer_cell_lines = _load_feature_names("X_features_cancer_cell_lines.txt")
X_PubChem_properties = _load_feature_names("X_PubChem_properties.txt")
X_targets = _load_feature_names("X_features_Targets.txt")
X_target_pathway = _load_feature_names("X_features_Target_Pathway.txt")

# Every feature column of all groups, followed by the MAX_CONC column.
all_columns = (
    X_cancer_cell_lines
    + X_PubChem_properties
    + X_targets
    + X_target_pathway
    + ["MAX_CONC"]
)

datasets = ["Dataset_1", "Dataset_2", "Dataset_3", "Dataset_4"]

# Feature columns of each dataset variant. Dataset_1 uses the cell-line
# features only; the other variants put MAX_CONC and drug-feature groups
# in front of the cell-line features.
X_feat_dict = {}
X_feat_dict["Dataset_1"] = X_cancer_cell_lines
X_feat_dict["Dataset_2"] = ["MAX_CONC"] + X_targets + X_target_pathway + X_cancer_cell_lines
X_feat_dict["Dataset_3"] = ["MAX_CONC"] + X_PubChem_properties + X_cancer_cell_lines
X_feat_dict["Dataset_4"] = (
    ["MAX_CONC"] + X_PubChem_properties + X_targets + X_target_pathway + X_cancer_cell_lines
)

# Input files of each data variant: the drug-id list plus the train/test CSVs.
dict_data_type = {
    "original_data": {
        "drug_ids_list": _FOLDER + "drug_ids_50.txt",
        "train_df": _FOLDER + "train08_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv",
        "test_df": _FOLDER + "test02_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv",
    },
    "reduced_by_R2_data": {
        "drug_ids_list": _FOLDER + "drug_ids_50_restr.txt",
        "train_df": _FOLDER + "train08_min10_restr.csv",
        "test_df": _FOLDER + "test02_min10_restr.csv",
    },
}

In [4]:
# Pick the data variant once and load its train/test split.
training_data_type = "original_data"
train_df_50, test_df_50 = PrepareDataSets(
    dict_data_type,
    training_data_type=training_data_type,
)

# Last expression of the cell: shown as the cell output.
(train_df_50.shape, test_df_50.shape)

((781, 1379), (202, 1379))

## Making a long dataset

### Original data

In [5]:
# Reload the "original_data" split so that this cell can be run on its own.
train_df_50, test_df_50 = PrepareDataSets(dict_data_type, training_data_type="original_data")

# Columns repeated on every long-format row: two identifier columns plus all
# Dataset_4 features.
columns_to_use = ["Drug_Name", "COSMIC_ID"] + X_feat_dict["Dataset_4"]


def _to_long_format(wide_df, id_columns, n_points=10):
    """Stack the ``fd_num_i`` / ``norm_cells_i`` column pairs of ``wide_df`` into rows.

    For every ``i`` in ``range(n_points)`` the columns ``id_columns``,
    ``fd_num_i`` and ``norm_cells_i`` are selected, and the last two are
    renamed to ``scaled_x`` and ``norm_y``. The resulting slices are
    concatenated row-wise, so the result has ``len(wide_df) * n_points`` rows
    and keeps the index labels of ``wide_df`` (each label occurs once per slice).

    Parameters
    ----------
    wide_df : pandas.DataFrame
        Table containing ``id_columns`` and the numbered column pairs.
    id_columns : list of str
        Columns copied unchanged into every slice.
    n_points : int, default 10
        Number of ``fd_num_i`` / ``norm_cells_i`` pairs to stack.

    Returns
    -------
    pandas.DataFrame
        The long-format table.

    Raises
    ------
    KeyError
        If one of the requested columns is missing from ``wide_df``.
    """
    slices = []
    for i in range(n_points):
        x_col = "fd_num_" + str(i)
        y_col = "norm_cells_" + str(i)
        slices.append(
            wide_df[id_columns + [x_col, y_col]].rename(
                columns={x_col: "scaled_x", y_col: "norm_y"}))
    # Concatenate once: growing a DataFrame with pd.concat inside the loop
    # re-copies all previously accumulated rows on every iteration.
    return pd.concat(slices, axis=0, ignore_index=False)


df_train = _to_long_format(train_df_50, columns_to_use)
df_test = _to_long_format(test_df_50, columns_to_use)

# Last expression of the cell: shown as the cell output.
df_train.shape, df_test.shape

((7810, 1356), (2020, 1356))

In [6]:
# Write both long-format tables into the results folder; the index is written too.
for frame, file_name in (
    (df_train, "large_train_sigmoid4.csv"),
    (df_test, "large_test_sigmoid4.csv"),
):
    frame.to_csv(_FOLDER + file_name)

### Reduced by R2 data

In [7]:
# Load the "reduced_by_R2_data" split; it replaces the tables loaded earlier.
train_df_50, test_df_50 = PrepareDataSets(dict_data_type, training_data_type="reduced_by_R2_data")

# Columns repeated on every long-format row: two identifier columns plus all
# Dataset_4 features.
columns_to_use = ["Drug_Name", "COSMIC_ID"] + X_feat_dict["Dataset_4"]


# Defined inside this cell so that the cell runs without relying on helpers
# from other cells.
def _to_long_format(wide_df, id_columns, n_points=10):
    """Stack the ``fd_num_i`` / ``norm_cells_i`` column pairs of ``wide_df`` into rows.

    For every ``i`` in ``range(n_points)`` the columns ``id_columns``,
    ``fd_num_i`` and ``norm_cells_i`` are selected, and the last two are
    renamed to ``scaled_x`` and ``norm_y``. The resulting slices are
    concatenated row-wise, so the result has ``len(wide_df) * n_points`` rows
    and keeps the index labels of ``wide_df`` (each label occurs once per slice).

    Parameters
    ----------
    wide_df : pandas.DataFrame
        Table containing ``id_columns`` and the numbered column pairs.
    id_columns : list of str
        Columns copied unchanged into every slice.
    n_points : int, default 10
        Number of ``fd_num_i`` / ``norm_cells_i`` pairs to stack.

    Returns
    -------
    pandas.DataFrame
        The long-format table.

    Raises
    ------
    KeyError
        If one of the requested columns is missing from ``wide_df``.
    """
    slices = []
    for i in range(n_points):
        x_col = "fd_num_" + str(i)
        y_col = "norm_cells_" + str(i)
        slices.append(
            wide_df[id_columns + [x_col, y_col]].rename(
                columns={x_col: "scaled_x", y_col: "norm_y"}))
    # Concatenate once: growing a DataFrame with pd.concat inside the loop
    # re-copies all previously accumulated rows on every iteration.
    return pd.concat(slices, axis=0, ignore_index=False)


df_train = _to_long_format(train_df_50, columns_to_use)
df_test = _to_long_format(test_df_50, columns_to_use)

# Last expression of the cell: shown as the cell output.
df_train.shape, df_test.shape

((7230, 1356), (1850, 1356))

In [8]:
# Write both reduced long-format tables into the results folder; the index is written too.
for frame, file_name in (
    (df_train, "large_train_sigmoid4_restr.csv"),
    (df_test, "large_test_sigmoid4_restr.csv"),
):
    frame.to_csv(_FOLDER + file_name)