In [1]:
import pandas as pd
import numpy as np 
import os

In [2]:
# List the entries of the shared results directory (relative to the notebook's working directory).
os.listdir("../results")

['drug_ids_50_restr.txt',
 'drug_cells_kernels_best_parameters.csv',
 'X_features_1122_easy_read.txt',
 'drug_cells_PubChem_scaled_kernels_best_parameters.csv',
 '.DS_Store',
 '2_train08_min10.csv',
 'drugs_with_pubchem_id.txt',
 'drugs67_more_10.txt',
 'test02_min10_restr.csv',
 'drug_features_with_pubchem_properties.csv',
 'drug_ids_10.txt',
 'large_train_sigmoid4.csv',
 'drugs10_more_50.txt',
 'drugs_with_no_pubchem_id.txt',
 'statistics_of_sigmoid_coefficients.csv',
 'test02_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv',
 'large_test_sigmoid4.csv',
 'target_target_pathway_df.csv',
 'drug_ids_10_restr.txt',
 'X_features_Targets.txt',
 'X_PubChem_features.txt',
 '.ipynb_checkpoints',
 '2_test02_min10.csv',
 'drug_ids_50.txt',
 'ridge_coef4.csv',
 'train08_min10_restr.csv',
 'train08_merged_fitted_sigmoid4_123_with_drugs_properties_min10.csv',
 'X_columns_1094.txt',
 'drug_cells_description_kernels_best_parameters.csv',
 'filtered_drug_profiles_123.csv',
 'drug_ids_10_2.

### Drugs with properties

In [3]:
# Load the merged table used throughout this notebook and show its (rows, columns).
# NOTE(review): the frame contains an 'Unnamed: 0' column (see the df.columns output),
# which usually means the CSV was written together with its index — TODO confirm.
df = pd.read_csv("../results/merged_fitted_sigmoid4_123_with_drugs_properties.csv")
df.shape

(2585, 1381)

In [4]:
# Keep only the rows whose fitted param_1 is strictly below 1, then show the new shape.
df = df.loc[df["param_1"].lt(1)]
df.shape

(2523, 1381)

In [5]:
# Inspect the column labels of the filtered frame.
df.columns

Index(['Unnamed: 0', 'DRUG_ID', 'COSMIC_ID', 'fd_num_0', 'fd_num_1',
       'fd_num_2', 'fd_num_3', 'fd_num_4', 'fd_num_5', 'fd_num_6',
       ...
       'JNK and p38 signaling', 'Metabolism', 'Mitosis', 'Other',
       'Other, kinases', 'PI3K/MTOR signaling',
       'Protein stability and degradation', 'RTK signaling', 'WNT signaling',
       'p53 pathway'],
      dtype='object', length=1381)

### Split into train and test data with more than 10 records per drug

In [6]:
# Drugs that have more than 10 rows in the filtered frame.
gr = df.groupby("DRUG_ID").size()
drugs = gr[gr>10].index
print("Number of drugs:", len(drugs))        

# Disabled: earlier export of drug-id lists (the referenced variables are not defined in this cell).
# with open("../results/drugs10_more_50.txt", 'w') as f:
#     for s in drugs10_more_50:
#         f.write(str(s) + '\n')
        
# with open("../results/drugs67_more_10.txt", 'w') as f:
#     for s in drugs67_more_10:
#         f.write(str(s) + '\n')

train_ratio = 0.8
np.random.seed(123)
# The result of this draw is overwritten inside the loop, but the call advances the
# global NumPy RNG. It is kept so the per-drug permutations below stay identical to
# the ones produced by earlier runs of this cell.
indexes = np.random.permutation(df.index)

# Collect the per-drug pieces in lists and concatenate once at the end; this avoids
# re-copying the accumulated frame on every iteration and avoids concatenating onto
# an empty placeholder DataFrame.
train_parts = []
test_parts = []
for drug_id in drugs:
    df_i = df[df["DRUG_ID"]==drug_id]
    # Shuffle this drug's row labels; the first train_ratio share goes to train.
    indexes = np.random.permutation(df_i.index)
    train_size = int(df_i.shape[0]*train_ratio)
    indexes_train = indexes[:train_size]
    indexes_test = indexes[train_size:]
    train_parts.append(df_i.loc[indexes_train, :])
    test_parts.append(df_i.loc[indexes_test, :])

# Fall back to an empty frame when no drug qualifies (pd.concat rejects an empty list).
train = pd.concat(train_parts) if train_parts else pd.DataFrame()
test = pd.concat(test_parts) if test_parts else pd.DataFrame()

# Shapes of the resulting train and test splits.
train.shape, test.shape

Number of drugs: 67


((1792, 1381), (479, 1381))

## R2 restriction

In [7]:
from sklearn.metrics import r2_score

def sigmoid_4_param(x, x0, L, k, d):
    """Evaluate the four-parameter sigmoid ``1 / (L + exp(-k * (x - x0))) + d``.

    Parameter notes carried over from the original author (comparison with
    Dennis Wang's sigmoid; the bounds are the author's fitting bounds and are
    not enforced here):

    x0 : position parameter (``p`` in Dennis Wang's sigmoid), correlated with
        IC50 or EC50. Bounds [0, 1].
    L  : equals 1 in Dennis Wang's sigmoid; protects against division by zero
        when x is too small. L < 1 gives an inverted sigmoid; a large L
        (e.g. 100) squeezes the curve into a narrow band on the y axis
        (y in [0.1, 0.11]). Bounds [0.8, 10].
    k  : ``-1/s`` where ``s`` is the shape parameter. k = 0 is a straight
        line; k < 0 gives a sigmoid, typically around k = -10.
        Bounds [-100, 1].
    d  : vertical shift of the curve on the y axis, which gives a better fit
        than Dennis Wang's sigmoid. Bounds [0, 0.9].

    Combined bounds: ((0, 0.8, -100, 0), (1, 10, 1, 0.9)).
    """
    exponent = -k * (x - x0)
    denominator = L + np.exp(exponent)
    return 1 / denominator + d

def r2_score_sigmoid_4_param(df, x_columns, y_columns, param_columns=None):
    """Return, for every row of ``df``, the R2 of its fitted sigmoid against its observed values.

    For each row, ``sigmoid_4_param`` is evaluated on the row's ``x_columns``
    values with the row's ``param_columns`` values as curve parameters, and
    the result is scored against the row's ``y_columns`` values with
    ``r2_score``.

    Rows are addressed by position, so the function does not depend on the
    frame having a unique or default index.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per fitted curve.
    x_columns : list
        Columns holding the x values passed to ``sigmoid_4_param``.
    y_columns : list
        Columns holding the observed values that the fit is scored against.
    param_columns : list, optional
        Columns holding the curve parameters, in the positional order that
        ``sigmoid_4_param`` expects after ``x``. ``None`` (the default) is
        treated as an empty list, in which case ``sigmoid_4_param`` is called
        without parameters and raises.

    Returns
    -------
    numpy.ndarray
        One R2 value per row, in row order.
    """
    if param_columns is None:
        param_columns = []

    # Slice each column group once instead of doing a label lookup per row.
    x_all = df.loc[:, x_columns].values.astype(np.float32)
    y_all = df.loc[:, y_columns].values.astype(np.float32)
    params_all = df.loc[:, param_columns].values.astype(np.float32)

    n_rows = len(df.index)
    r2_scores = np.zeros(n_rows)
    for i in range(n_rows):
        y_fit = sigmoid_4_param(x_all[i], *params_all[i])
        r2_scores[i] = r2_score(y_all[i], y_fit)
    return r2_scores

# Score each row's fitted curve against its observed points and store the R2 on the frame.
# The scorer gets a reset-index copy; its array result is assigned back to df by position.
df["r2_scores"] = r2_score_sigmoid_4_param(
    df.reset_index(),
    x_columns=["fd_num_{}".format(i) for i in range(10)],
    y_columns=["norm_cells_{}".format(i) for i in range(10)],
    param_columns=["param_{}".format(i) for i in (1, 2, 3, 4)],
)

In [8]:
# Keep only the rows whose fitted curve reaches R2 > 0.9.
df2= df[df["r2_scores"]>0.9].copy()
print(df2.shape)

# Rows per drug after the R2 restriction (computed once, reused for both thresholds).
gr = df2.groupby("DRUG_ID").size()

# Drug ids with more than 10 remaining rows.
drugs = gr[gr>10].index
with open("../results/drug_ids_10_restr.txt", 'w') as f:
    for s in drugs:
        f.write(str(s) + '\n')

# Drug ids with more than 50 remaining rows.
drugs = gr[gr>50].index
with open("../results/drug_ids_50_restr.txt", 'w') as f:
    for s in drugs:
        f.write(str(s) + '\n')        
        
print(len(drugs))

# NOTE(review): the split below runs over the ">50" drug set, while the commented-out
# output file names at the end of the cell say "min10" — confirm which set is intended.
train_ratio = 0.8
np.random.seed(123)
# The result of this draw is overwritten inside the loop, but the call advances the
# global NumPy RNG. It is kept so the per-drug permutations below stay identical to
# the ones produced by earlier runs of this cell.
indexes = np.random.permutation(df2.index)

# Collect the per-drug pieces in lists and concatenate once at the end; this avoids
# re-copying the accumulated frame on every iteration and avoids concatenating onto
# an empty placeholder DataFrame.
train_parts = []
test_parts = []
for drug_id in drugs:
    df2_i = df2[df2["DRUG_ID"]==drug_id]
    # Shuffle this drug's row labels; the first train_ratio share goes to train.
    indexes = np.random.permutation(df2_i.index)
    train_size = int(df2_i.shape[0]*train_ratio)
    indexes_train = indexes[:train_size]
    indexes_test = indexes[train_size:]
    train_parts.append(df2_i.loc[indexes_train, :])
    test_parts.append(df2_i.loc[indexes_test, :])

# Fall back to an empty frame when no drug qualifies (pd.concat rejects an empty list).
train = pd.concat(train_parts) if train_parts else pd.DataFrame()
test = pd.concat(test_parts) if test_parts else pd.DataFrame()
    
# train.to_csv("../results/train08_min10_restr.csv")
# test.to_csv("../results/test02_min10_restr.csv")    
train.shape, test.shape

  app.launch_new_instance()
  app.launch_new_instance()
  app.launch_new_instance()


(2496, 1382)
10


((723, 1382), (185, 1382))

## Restrictions for coefficients

In [15]:
# Keep only the rows whose four fitted parameters pass these thresholds.
df3 =df[(df["param_1"] <1) & (df["param_2"] >-5) & (df["param_3"] >-120) & (df["param_4"] >0)].copy()

# Rows per drug after the coefficient restriction (computed once, reused for both thresholds).
gr = df3.groupby("DRUG_ID").size()

# NOTE(review): these are the same two file paths that the R2-restriction cell writes,
# so running this cell overwrites those lists — confirm that this is intended.
# Drug ids with more than 10 remaining rows.
drugs = gr[gr>10].index
with open("../results/drug_ids_10_restr.txt", 'w') as f:
    for s in drugs:
        f.write(str(s) + '\n')

# Drug ids with more than 50 remaining rows.
drugs = gr[gr>50].index
with open("../results/drug_ids_50_restr.txt", 'w') as f:
    for s in drugs:
        f.write(str(s) + '\n')        
        
# A bare `len(drugs)` in the middle of a cell displays nothing; print it instead.
print(len(drugs))

train_ratio = 0.8
np.random.seed(123)
# The result of this draw is overwritten inside the loop, but the call advances the
# global NumPy RNG. It is kept so the per-drug permutations below stay identical to
# the ones produced by earlier runs of this cell.
indexes = np.random.permutation(df3.index)

# Collect the per-drug pieces in lists and concatenate once at the end; this avoids
# re-copying the accumulated frame on every iteration and avoids concatenating onto
# an empty placeholder DataFrame.
train_parts = []
test_parts = []
for drug_id in drugs:
    df3_i = df3[df3["DRUG_ID"]==drug_id]
    # Shuffle this drug's row labels; the first train_ratio share goes to train.
    indexes = np.random.permutation(df3_i.index)
    train_size = int(df3_i.shape[0]*train_ratio)
    indexes_train = indexes[:train_size]
    indexes_test = indexes[train_size:]
    train_parts.append(df3_i.loc[indexes_train, :])
    test_parts.append(df3_i.loc[indexes_test, :])

# Fall back to an empty frame when no drug qualifies (pd.concat rejects an empty list).
train = pd.concat(train_parts) if train_parts else pd.DataFrame()
test = pd.concat(test_parts) if test_parts else pd.DataFrame()
    
# train.to_csv("../results/train08_min10_restr.csv")
# test.to_csv("../results/test02_min10_restr.csv")    
train.shape, test.shape

((588, 1382), (152, 1382))

In [None]:
# Row counts: train, test, their sum, and the rows of `df` that belong to the selected drugs.
# NOTE(review): the last count is taken from `df`; if train/test were built from a
# filtered frame (df2 or df3), the sum and this count are not expected to match — confirm.
train.shape[0], test.shape[0], train.shape[0]+test.shape[0], df.set_index("DRUG_ID").loc[drugs,:].shape[0]

In [None]:
# Check whether the training split has a "MAX_CONC" column.
"MAX_CONC" in train.columns

In [None]:
# Count the non-null COSMIC_ID entries per drug in `df` and keep the drug ids with more than 50.
gr = df.groupby("DRUG_ID")["COSMIC_ID"].count()
drugs10_more_50 = gr[gr>50].index
# Show how many drugs qualify, followed by their ids.
len(drugs10_more_50), drugs10_more_50

In [None]:
# (rows, columns) of the current test split.
test.shape