# Prepare train and test sets for further modelling 

1. take the drugs which have more than 10 drug profiles
2. split the profiles of each drug between the train and test sets in the same proportion (80% train / 20% test)
3. repeat the same splitting procedure for the data filtered by the restrictions (R2 restriction and coefficient restrictions)

In [8]:
import pandas as pd
import numpy as np 
import os

from sklearn.metrics import r2_score

import warnings
# Silence every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Input folder: every ".csv" file found here is read and split.
_FOLDER = "fitted_datasets_drug_properties/"
# Output folder: the train/test CSV files are written here.
_FOLDER_2 = "test_train/"

## Split into train and test data with more than 10 records per drug

In [12]:
# Set a random seed for reproducibility
np.random.seed(123)

# Define the ratio of the train set to the total data
train_ratio = 0.8

# Columns removed from every dataset before splitting (skipped when absent)
columns_to_drop = ['H', 'Target', 'Target_Pathway', 'elements']

# Create the output folder up front so the to_csv calls below cannot fail
# merely because the folder is missing.
os.makedirs(_FOLDER_2, exist_ok=True)

# os.listdir() returns names in arbitrary order. Sorting fixes the order in
# which the files consume the seeded random stream, so the same seed yields
# the same splits on every run and every machine.
for file in sorted(os.listdir(_FOLDER)):
    if not file.endswith(".csv"):
        continue

    df = pd.read_csv(os.path.join(_FOLDER, file))
    df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])

    print(f"Processing {file}, shape: {df.shape}")

    # Count profiles per 'DRUG_ID' and keep only drugs with more than 10
    gr = df.groupby("DRUG_ID").size()
    drugs = gr[gr > 10].index
    print("Number of drugs with more than 10 profiles:", len(drugs))

    # Collect the per-drug pieces and concatenate once at the end: growing a
    # DataFrame with pd.concat inside the loop re-copies all accumulated rows
    # on every iteration and starts from an empty frame, whose handling in
    # concat is deprecated.
    train_parts = []
    test_parts = []

    for drug_id in drugs:
        df_i = df[df["DRUG_ID"] == drug_id]
        # Shuffle this drug's row labels; the first train_ratio share goes to
        # train, the remainder to test.
        indexes = np.random.permutation(df_i.index)
        train_size = int(df_i.shape[0] * train_ratio)
        indexes_train = indexes[:train_size]
        indexes_test = indexes[train_size:]
        train_parts.append(df_i.loc[indexes_train, :])
        test_parts.append(df_i.loc[indexes_test, :])

    # With no eligible drug, fall back to an empty frame (pd.concat([]) raises).
    train = pd.concat(train_parts) if train_parts else pd.DataFrame()
    test = pd.concat(test_parts) if test_parts else pd.DataFrame()

    # Save the train and test sets.
    # The fifth "_"-separated token of the file name is used as the suffix; for
    # names like "merged_fitted_drug_properties_1.1.csv.csv" it already
    # contains the extension, so the outputs are e.g. "train_1.1.csv.csv".
    scenario_num = file.split("_")[4]  # Adjust the split index if needed
    train.to_csv(os.path.join(_FOLDER_2, f"train_{scenario_num}"), index=False)
    test.to_csv(os.path.join(_FOLDER_2, f"test_{scenario_num}"), index=False)
        

Processing merged_fitted_drug_properties_1.1.csv.csv, shape: (3464, 1382)
Number of drugs with more than 10 profiles: 78
Processing merged_fitted_drug_properties_1.2.csv.csv, shape: (4292, 1382)
Number of drugs with more than 10 profiles: 86
Processing merged_fitted_drug_properties_1.3.csv.csv, shape: (5141, 1382)
Number of drugs with more than 10 profiles: 91
Processing merged_fitted_drug_properties_1.4.csv.csv, shape: (5979, 1382)
Number of drugs with more than 10 profiles: 98
Processing merged_fitted_drug_properties_1.5.csv.csv, shape: (6766, 1382)
Number of drugs with more than 10 profiles: 104
Processing merged_fitted_drug_properties_2.1.csv.csv, shape: (2781, 1382)
Number of drugs with more than 10 profiles: 72
Processing merged_fitted_drug_properties_2.2.csv.csv, shape: (2944, 1382)
Number of drugs with more than 10 profiles: 74
Processing merged_fitted_drug_properties_2.3.csv.csv, shape: (3082, 1382)
Number of drugs with more than 10 profiles: 79
Processing merged_fitted_drug_p

## R2 Restriction

In [9]:
def sigmoid_4_param(x, x0, L, k, d):
    """Evaluate the four-parameter sigmoid 1 / (L + exp(-k * (x - x0))) + d.

    Author's notes on the parameters, compared with Dennis Wang's sigmoid:

    x0 - p, the position parameter; correlates with IC50 or EC50.
        bounds [0, 1]
    L  - equals 1 in Dennis Wang's sigmoid; protects from division by zero
        if x is too small. L < 1 gives an inverted sigmoid; L = 100 lowers
        the upper and lower bounds of the sigmoid on the y axis
        (y = [0.1, 0.11]).
        bounds [0.8, 10]
    k  - equals -1/s (s is the shape parameter), default -10; k = 0 gives a
        straight line, k < 0 gives a sigmoid (around k = -10).
        bounds [-100, 1]
    d  - vertical position of the sigmoid (shift on the y axis); gives a
        better fit than Dennis Wang's sigmoid.
        bounds [0, 0.9]

    parameters_bound ((0, 0.8, -100, 0), (1, 10, 1, 0.9))
    """
    exponent = -k * (x - x0)
    denominator = L + np.exp(exponent)
    return 1 / denominator + d

def r2_score_sigmoid_4_param(df, x_columns, y_columns, param_columns=None):
    """Return the R^2 of the fitted sigmoid for every row of *df*.

    For each row, the values in *x_columns* are passed through
    ``sigmoid_4_param`` with that row's *param_columns* values as
    (x0, L, k, d), and the result is compared with the values in
    *y_columns* using ``r2_score``.

    Parameters
    ----------
    df : pandas.DataFrame
        One fitted curve per row.
    x_columns : list of str
        Columns holding the x values of the curve.
    y_columns : list of str
        Columns holding the observed y values, same length as *x_columns*.
    param_columns : list of str, optional
        Columns holding the four sigmoid parameters, in the order expected
        by ``sigmoid_4_param``. Omitting it leaves the sigmoid without
        parameters, so the call fails with a TypeError for a non-empty *df*.

    Returns
    -------
    numpy.ndarray
        1-D float array with one R^2 value per row, in row order.
    """
    # Avoid a mutable default argument shared between calls.
    if param_columns is None:
        param_columns = []

    # Extract each column group once as a 2-D float32 array and walk the rows
    # positionally. Label-based lookups per row are slow and return several
    # rows at once when the index contains duplicate labels.
    x_all = df[x_columns].to_numpy().astype(np.float32)
    y_all = df[y_columns].to_numpy().astype(np.float32)
    params_all = df[param_columns].to_numpy().astype(np.float32)

    r2_scores = np.zeros(len(df.index))
    for i, (x, y, fit_param) in enumerate(zip(x_all, y_all, params_all)):
        y_fit = sigmoid_4_param(x, *fit_param)
        r2_scores[i] = r2_score(y, y_fit)
    return r2_scores

In [13]:
# NOTE: this cell relies on `train_ratio` and on the NumPy random seed that
# are set in the unrestricted split cell earlier in the notebook.

# Create the output folder up front so the to_csv calls below cannot fail
# merely because the folder is missing.
os.makedirs(_FOLDER_2, exist_ok=True)

# os.listdir() returns names in arbitrary order. Sorting fixes the order in
# which the files consume the random stream, so a given seed always yields
# the same splits.
for file in sorted(os.listdir(_FOLDER)):
    if not file.endswith(".csv"):
        continue

    # NOTE(review): unlike the unrestricted split, the columns 'H', 'Target',
    # 'Target_Pathway' and 'elements' are not dropped here - confirm this is
    # intended.
    df = pd.read_csv(os.path.join(_FOLDER, file))

    # Calculate R^2 scores for each row in the dataset
    # NOTE(review): the "r2_scores" column stays in the saved train/test files.
    df["r2_scores"] = r2_score_sigmoid_4_param(
        df,
        x_columns=["fd_num_" + str(i) for i in range(10)],
        y_columns=["norm_cells_" + str(i) for i in range(10)],
        param_columns=["param_" + str(i) for i in range(1, 5)],
    )

    # Apply R^2 restriction: keep only the rows whose fit has R^2 above 0.9
    df2 = df[df["r2_scores"] > 0.9].copy()

    # Count the remaining profiles per 'DRUG_ID' and keep only drugs with
    # more than 10 of them
    gr = df2.groupby("DRUG_ID").size()
    drugs = gr[gr > 10].index

    # Collect the per-drug pieces and concatenate once at the end: growing a
    # DataFrame with pd.concat inside the loop re-copies all accumulated rows
    # on every iteration and starts from an empty frame, whose handling in
    # concat is deprecated.
    train_parts = []
    test_parts = []

    # Split the data into training and testing sets, drug by drug
    for drug_id in drugs:
        df_i = df2[df2["DRUG_ID"] == drug_id]
        indexes = np.random.permutation(df_i.index)
        train_size = int(len(indexes) * train_ratio)
        indexes_train = indexes[:train_size]
        indexes_test = indexes[train_size:]
        train_parts.append(df_i.loc[indexes_train, :])
        test_parts.append(df_i.loc[indexes_test, :])

    # With no eligible drug, fall back to an empty frame (pd.concat([]) raises).
    train = pd.concat(train_parts) if train_parts else pd.DataFrame()
    test = pd.concat(test_parts) if test_parts else pd.DataFrame()

    # Save the train and test sets.
    # The fifth "_"-separated token of the file name is used as the suffix.
    scenario_num = file.split("_")[4]  # Adjust the split index if needed
    train.to_csv(os.path.join(_FOLDER_2, f"train_restr_{scenario_num}"), index=False)
    test.to_csv(os.path.join(_FOLDER_2, f"test_restr_{scenario_num}"), index=False)

## Restrictions for coefficients

In [14]:
# NOTE: this cell relies on `train_ratio` and on the NumPy random seed that
# are set in the unrestricted split cell earlier in the notebook.

# Create the output folder up front so the to_csv calls below cannot fail
# merely because the folder is missing.
os.makedirs(_FOLDER_2, exist_ok=True)

# os.listdir() returns names in arbitrary order. Sorting fixes the order in
# which the files consume the random stream, so a given seed always yields
# the same splits.
for file in sorted(os.listdir(_FOLDER)):
    if not file.endswith(".csv"):
        continue

    # NOTE(review): unlike the unrestricted split, the columns 'H', 'Target',
    # 'Target_Pathway' and 'elements' are not dropped here - confirm this is
    # intended.
    df = pd.read_csv(os.path.join(_FOLDER, file))

    # Apply restrictions for coefficients: a row is kept only when all four
    # fitted parameters satisfy their limit
    df3 = df[
        (df["param_1"] < 1)
        & (df["param_2"] > -5)
        & (df["param_3"] > -120)
        & (df["param_4"] > 0)
    ].copy()

    # Count the remaining profiles per 'DRUG_ID' and keep only drugs with
    # more than 10 of them
    gr = df3.groupby("DRUG_ID").size()
    drugs = gr[gr > 10].index

    # Collect the per-drug pieces and concatenate once at the end: growing a
    # DataFrame with pd.concat inside the loop re-copies all accumulated rows
    # on every iteration and starts from an empty frame, whose handling in
    # concat is deprecated.
    train_parts = []
    test_parts = []

    # Split the data into training and testing sets, drug by drug
    for drug_id in drugs:
        df_i = df3[df3["DRUG_ID"] == drug_id]
        indexes = np.random.permutation(df_i.index)
        train_size = int(len(indexes) * train_ratio)
        indexes_train = indexes[:train_size]
        indexes_test = indexes[train_size:]
        train_parts.append(df_i.loc[indexes_train, :])
        test_parts.append(df_i.loc[indexes_test, :])

    # With no eligible drug, fall back to an empty frame (pd.concat([]) raises).
    train = pd.concat(train_parts) if train_parts else pd.DataFrame()
    test = pd.concat(test_parts) if test_parts else pd.DataFrame()

    # Save the train and test sets.
    # The fifth "_"-separated token of the file name is used as the suffix.
    scenario_num = file.split("_")[4]  # Adjust the split index if needed
    train.to_csv(os.path.join(_FOLDER_2, f"train_restr_coef_{scenario_num}"), index=False)
    test.to_csv(os.path.join(_FOLDER_2, f"test_restr_coef_{scenario_num}"), index=False)

    