## Prepare train and test sets for further modelling

1. Take the drugs that have more than 10 drug profiles.
2. Split them into train and test sets so that each drug contributes the same proportion of its profiles to each set.
3. Apply the same splitting principle to the restricted data sets.

In [1]:
import pandas as pd
import numpy as np 
import os

import warnings
# Silence every warning for the whole kernel session.
# NOTE(review): this also hides deprecation warnings raised by later cells.
warnings.filterwarnings("ignore")

# Folder the input CSV is read from and every output of this notebook is written to.
_FOLDER_2 = "results/"

In [2]:
# Show what is currently in the results folder.
os.listdir(_FOLDER_2)

['.DS_Store',
 'merged_drug_profiles_cells_sigmoid4_123.csv',
 'drugs_with_pubchem_id.txt',
 'drugs_with_pubchem_id_NEW.txt',
 'drug_features_with_pubchem_properties.csv',
 'drugs_with_no_pubchem_id.txt',
 'target_target_pathway_df.csv',
 'fit_filtered_drug_profiles_123.csv',
 'X_features_Targets.txt',
 '.ipynb_checkpoints',
 'filtered_drug_profiles_123.csv',
 'X_PubChem_properties.txt',
 'X_features_Target_Pathway.txt',
 'merged_fitted_sigmoid4_123_with_drugs_properties.csv',
 'X_features_cancer_cell_lines.txt']

### Drugs with properties

In [3]:
# Load the merged CSV and discard four columns that are not carried into the
# train/test files produced below.
merged_path = _FOLDER_2 + "merged_fitted_sigmoid4_123_with_drugs_properties.csv"
df = pd.read_csv(merged_path)
df = df.drop(columns=['H', 'Target', 'Target_Pathway', 'elements'])
df.shape

(2585, 1381)

### Split into train and test data with more than 10 records per drug

In [4]:
# Per-drug 80/20 split over every drug that has more than 10 profiles.
gr = df.groupby("DRUG_ID").size()
drugs = gr[gr>10].index
print("Number of drugs with more than 10 profiles:", len(drugs))

train_ratio = 0.8
np.random.seed(123)
# This permutation is never used, but it advances the global RNG stream. It is kept
# so that the per-drug permutations below (and therefore the saved split) do not change.
indexes = np.random.permutation(df.index)

# Collect the per-drug pieces and concatenate once at the end: growing a DataFrame with
# pd.concat inside the loop re-copies all previous rows on every iteration and starts
# from an empty frame, whose handling in concat is deprecated in recent pandas.
train_parts = []
test_parts = []
for drug_id in drugs:
    df_i = df[df["DRUG_ID"]==drug_id]
    # Shuffle this drug's rows, then cut at the train ratio (rounded down).
    indexes = np.random.permutation(df_i.index)
    train_size = int(df_i.shape[0]*train_ratio)
    indexes_train = indexes[:train_size]
    indexes_test = indexes[train_size:]
    train_parts.append(df_i.loc[indexes_train, :])
    test_parts.append(df_i.loc[indexes_test, :])

# Fall back to an empty frame when no drug qualifies (pd.concat rejects an empty list).
train = pd.concat(train_parts) if train_parts else pd.DataFrame()
test = pd.concat(test_parts) if test_parts else pd.DataFrame()

# Drugs with more than 50 profiles; count() skips rows with a missing COSMIC_ID,
# unlike size() used for the >10 threshold above.
gr = df.groupby("DRUG_ID")["COSMIC_ID"].count()
drug_ids = list(gr[gr>50].index)

# One drug id per line.
with open(_FOLDER_2 +"drug_ids_50.txt", 'w') as f:
    for s in drug_ids:
        f.write(str(s) + '\n')
print("Number of drugs with more than 50 profiles:", len(drug_ids))

train.shape, test.shape

Number of drugs with more than 10 profiles: 69
Number of drugs with more than 50 profiles: 11


((1843, 1381), (496, 1381))

In [5]:
# Save the split built from drugs with more than 10 profiles.
# to_csv is called with its defaults, so the DataFrame index is written as the first column.
for frame, file_name in (
    (train, "train08_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv"),
    (test, "test02_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv"),
):
    frame.to_csv(_FOLDER_2 + file_name)

## R2 restriction

In [6]:
from sklearn.metrics import r2_score

def sigmoid_4_param(x, x0, L, k, d):
    """Evaluate the 4-parameter sigmoid ``1 / (L + exp(-k * (x - x0))) + d``.

    Parameter notes, relative to Dennis Wang's sigmoid:

    x0 : position of the curve (p); correlates with IC50 or EC50.
        Bounds [0, 1].
    L : equals 1 in Dennis Wang's sigmoid; protects from division by zero when x
        is too small. L < 1 gives an inverted sigmoid; per the original author's
        note, L = 100 lowers the upper and lower bounds of the sigmoid on the
        y axis (y = [0.1, 0.11]).
        Bounds [0.8, 10].
    k : -1/s, where s is the shape parameter; default -10. k = 0 gives a straight
        line, k < 0 a sigmoid (around k = -10).
        Bounds [-100, 1].
    d : vertical position of the sigmoid (shift on the y axis); gives a better fit
        than Dennis Wang's sigmoid.
        Bounds [0, 0.9].

    Parameter bounds for (x0, L, k, d): ((0, 0.8, -100, 0), (1, 10, 1, 0.9)).

    Returns the sigmoid value(s); array inputs are evaluated element-wise by numpy.
    """
    exponent = -k*(x-x0)
    denominator = L + np.exp(exponent)
    return 1/denominator + d

def r2_score_sigmoid_4_param(df, x_columns, y_columns, param_columns = []):
    """Compute, row by row, the R2 score of the stored sigmoid fit.

    For every row of ``df`` the sigmoid is evaluated at the values in ``x_columns``
    using the parameters in ``param_columns`` (passed positionally to
    ``sigmoid_4_param``), and the result is compared with the values in
    ``y_columns`` via ``r2_score``. All three slices are cast to float32 first.

    Returns a numpy array with one score per row, in row order.
    """
    scores = np.zeros(len(df.index))
    for pos, row_label in enumerate(df.index):
        observed_x = df.loc[row_label, x_columns].values.astype(np.float32)
        observed_y = df.loc[row_label, y_columns].values.astype(np.float32)
        params = df.loc[row_label, param_columns].values.astype(np.float32)
        fitted_y = sigmoid_4_param(observed_x, *params)
        scores[pos] = r2_score(observed_y, fitted_y)
    return scores



In [7]:
# R2 of the stored sigmoid fit for every profile.
# reset_index() gives the helper a 0..n-1 index; the returned array is assigned by position.
# NOTE: this adds the "r2_scores" column to the shared `df`, so every frame derived from
# `df` after this cell carries that column as well.
df["r2_scores"] = r2_score_sigmoid_4_param(
    df.reset_index(),
    x_columns = ["fd_num_"+str(i) for i in range(10)],
    y_columns = ["norm_cells_" + str(i) for i in range(10)],
    param_columns = ["param_" + str(i) for i in range(1,5)],
)

# Keep only the profiles whose fit has R2 above 0.9.
df2= df[df["r2_scores"]>0.9].copy()
print("Data after R2 restriction:", df2.shape)

# One groupby serves both thresholds.
gr = df2.groupby("DRUG_ID").size()
drugs = gr[gr>10].index
print("R2 restr: Number of drugs with more than 10 profiles:", len(drugs))

drug_ids = gr[gr>50].index

# One drug id per line.
with open(_FOLDER_2+"drug_ids_50_restr.txt", 'w') as f:
    for s in drug_ids:
        f.write(str(s) + '\n')
print("R2 restr: Number of drugs with more than 50 profiles:", len(drug_ids))

train_2_ratio = 0.8
np.random.seed(123)
# This permutation is never used, but it advances the global RNG stream. It is kept
# so that the per-drug permutations below (and therefore the saved split) do not change.
indexes = np.random.permutation(df2.index)

# NOTE(review): the split iterates `drug_ids` (>50 profiles), not `drugs` (>10) as the
# unrestricted split does, although the files saved from it are named "min10".
# Confirm which threshold is intended.
# Pieces are collected and concatenated once: growing a DataFrame with pd.concat inside
# the loop re-copies all previous rows on every iteration and starts from an empty
# frame, whose handling in concat is deprecated in recent pandas.
train_2_parts = []
test_2_parts = []
for drug_id in drug_ids:
    df2_i = df2[df2["DRUG_ID"]==drug_id]
    # Shuffle this drug's rows, then cut at the train ratio (rounded down).
    indexes = np.random.permutation(df2_i.index)
    train_2_size = int(df2_i.shape[0]*train_2_ratio)
    indexes_train_2 = indexes[:train_2_size]
    indexes_test_2 = indexes[train_2_size:]
    train_2_parts.append(df2_i.loc[indexes_train_2, :])
    test_2_parts.append(df2_i.loc[indexes_test_2, :])

# Fall back to an empty frame when no drug qualifies (pd.concat rejects an empty list).
train_2 = pd.concat(train_2_parts) if train_2_parts else pd.DataFrame()
test_2 = pd.concat(test_2_parts) if test_2_parts else pd.DataFrame()

train_2.shape, test_2.shape

Data after R2 restriction: (2522, 1382)
R2 restr: Number of drugs with more than 10 profiles: 69
R2 restr: Number of drugs with more than 50 profiles: 10


((723, 1382), (185, 1382))

In [8]:
# Save the R2-restricted split.
# to_csv is called with its defaults, so the DataFrame index is written as the first column.
for frame, file_name in (
    (train_2, "train08_min10_restr.csv"),
    (test_2, "test02_min10_restr.csv"),
):
    frame.to_csv(_FOLDER_2 + file_name)

## Restrictions for coefficients

In [9]:
# Keep only the profiles whose four fitted coefficients satisfy all of these limits.
df3 =df[(df["param_1"] <1) & (df["param_2"] >-5) & (df["param_3"] >-120) & (df["param_4"] >0)].copy()

# One groupby serves both thresholds.
gr = df3.groupby("DRUG_ID").size()
drugs = gr[gr>10].index
print("Number of drugs with more than 10 profiles:", len(drugs))

drug_ids = gr[gr>50].index

# One drug id per line.
with open(_FOLDER_2+"drug_ids_50_restr_coef.txt", 'w') as f:
    for s in drug_ids:
        f.write(str(s) + '\n')

train_3_ratio = 0.8
np.random.seed(123)
# This permutation is never used, but it advances the global RNG stream. It is kept
# so that the per-drug permutations below (and therefore the saved split) do not change.
indexes = np.random.permutation(df3.index)

# NOTE(review): the split iterates `drug_ids` (>50 profiles), not `drugs` (>10) as the
# unrestricted split does, although the files saved from it are named "min10".
# Confirm which threshold is intended.
# Pieces are collected and concatenated once: growing a DataFrame with pd.concat inside
# the loop re-copies all previous rows on every iteration and starts from an empty
# frame, whose handling in concat is deprecated in recent pandas.
train_3_parts = []
test_3_parts = []
for drug_id in drug_ids:
    df3_i = df3[df3["DRUG_ID"]==drug_id]
    # Shuffle this drug's rows, then cut at the train ratio (rounded down).
    indexes = np.random.permutation(df3_i.index)
    train_3_size = int(df3_i.shape[0]*train_3_ratio)
    indexes_train_3 = indexes[:train_3_size]
    indexes_test_3 = indexes[train_3_size:]
    train_3_parts.append(df3_i.loc[indexes_train_3, :])
    test_3_parts.append(df3_i.loc[indexes_test_3, :])

# Fall back to an empty frame when no drug qualifies (pd.concat rejects an empty list).
train_3 = pd.concat(train_3_parts) if train_3_parts else pd.DataFrame()
test_3 = pd.concat(test_3_parts) if test_3_parts else pd.DataFrame()

train_3.shape, test_3.shape

Number of drugs with more than 10 profiles: 57


((588, 1382), (152, 1382))

In [10]:
# Save the coefficient-restricted split.
# to_csv is called with its defaults, so the DataFrame index is written as the first column.
for frame, file_name in (
    (train_3, "train08_min10_restr_coef.csv"),
    (test_3, "test02_min10_restr_coef.csv"),
):
    frame.to_csv(_FOLDER_2 + file_name)

## Comparison of datasets

In [11]:
# Shapes of the three train/test pairs.
# `train`/`test` were split over drugs with more than 10 profiles, while the two
# restricted splits were built over drugs with more than 50 profiles, so the first
# label must say ">10".
print("Original >10:", train.shape, test.shape)
print("R2 restr >50:", train_2.shape, test_2.shape)
print("Coef restr >50:", train_3.shape, test_3.shape)

Original >50: (1843, 1381) (496, 1381)
R2 restr >50: (723, 1382) (185, 1382)
Coef restr >50: (588, 1382) (152, 1382)


## Sanity check

In [12]:
# Read the drug ids back from drug_ids_50.txt: one id per line, parsed as int32.
drug_ids_50 = []
with open(_FOLDER_2 + "drug_ids_50.txt", 'r') as f:
    for line in f:
        drug_ids_50.append(np.int32(line.rstrip('\n')))

Unnamed: 0_level_0,Unnamed: 1_level_0,MAX_CONC
DRUG_ID,Cl,Unnamed: 2_level_1
170,0.0,81
173,1.0,40
180,0.0,57
200,0.0,69
219,1.0,68
272,0.0,85
273,0.0,84
274,0.0,80
276,0.0,80
328,0.0,92
