## Dataset creation

**Default values for `FilteringCurves`**

`FilteringCurves(df, response_columns, filtering_scenario=[1, 2, 3], first_columns_to_compare=[1, 2], last_columns_to_compare=[-1, -2], tolerance=0.05, first_points_lower_limit=0.8, last_points_upper_limit=0.4)`

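Every dataset below is built with `filtering_scenario = [1, 2, 3]` (all three filtering steps applied). Three dataset scenarios vary the remaining parameters:

1. Increase the tolerance levels

2. Modify the locations of the plateaus

3. A combination of scenarios 1 and 2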

In [2]:
import pandas as pd
import numpy as np
import os
# Directory holding the input CSV that this notebook reads.
_FOLDER = "data/"
# Not referenced in the visible cells of this notebook — presumably a results directory; TODO confirm.
_FOLDER_2 = "results/"
# Directory the filtered datasets are written to.
_FOLDER_3 = "datasets/"

In [3]:
def FilteringCurves(df, response_columns, filtering_scenario=[1, 2, 3],
                    first_columns_to_compare=[1, 2], last_columns_to_compare=[-1, -2],
                    tolerance=0.05, first_points_lower_limit=0.8, last_points_upper_limit=0.4):
    """
    Filter rows of a dataset by applying the requested filtering steps in order.

    The steps selected through ``filtering_scenario`` are:

    1. Drop every row whose maximum over ``response_columns`` is greater than 1.
    2. Keep rows whose two "first" responses and two "last" responses each
       differ by at most ``tolerance`` (plateau check).
    3. Keep rows whose first response is above ``first_points_lower_limit``
       and whose last response is below ``last_points_upper_limit``.

    Any other value prints a message and is otherwise ignored.

    Parameters:
    df (DataFrame): The dataset to filter. It is copied, not modified.
    response_columns (list): Ordered names of the response columns.
    filtering_scenario (list): Steps to apply, in the given order.
    first_columns_to_compare (list): Two 1-based positions into
        ``response_columns`` (1 is subtracted before indexing).
    last_columns_to_compare (list): Two indices into ``response_columns``
        used as given (the defaults count from the end).
    tolerance (float): Largest absolute difference accepted in step 2.
    first_points_lower_limit (float): Exclusive lower bound for the first
        response in step 3.
    last_points_upper_limit (float): Exclusive upper bound for the last
        response in step 3.

    Returns:
    DataFrame: The filtered rows. When step 2 runs, the result also carries
    the helper columns ``dif_first`` and ``dif_last``.
    """
    # The list defaults are only read, never mutated, so sharing them across calls is safe.
    df = df.copy()
    print("Original dataset:", df.shape)

    for i in filtering_scenario:
        if i == 1:
            # Use a boolean mask instead of dropping by index label: with a
            # non-unique index, label-based drop would also remove rows that
            # never exceeded 1 but share a label with one that did.
            exceeds_one = df[response_columns].max(axis=1) > 1
            df = df[~exceeds_one]

        elif i == 2:
            first_a = response_columns[first_columns_to_compare[0] - 1]
            first_b = response_columns[first_columns_to_compare[1] - 1]
            last_a = response_columns[last_columns_to_compare[0]]
            last_b = response_columns[last_columns_to_compare[1]]
            # assign() builds a new frame, so this is safe even when df is the
            # result of an earlier boolean filter (no SettingWithCopyWarning).
            df = df.assign(
                dif_first=(df[first_a] - df[first_b]).abs(),
                dif_last=(df[last_a] - df[last_b]).abs(),
            )
            df = df[(df["dif_first"] <= tolerance) & (df["dif_last"] <= tolerance)]

        elif i == 3:
            # Location of the plateaus: high at the first point, low at the last.
            starts_high = df[response_columns[0]] > first_points_lower_limit
            ends_low = df[response_columns[-1]] < last_points_upper_limit
            df = df[starts_high & ends_low]

        else:
            print("Unknown filtration scenario")

    return df

### Reading the data

In [4]:
# Load the dose-response table and build the names of the ten
# concentration columns and the ten normalised-response columns.
drug_curves = pd.read_csv(f"{_FOLDER}normalised_dose_response_data.csv")
conc_columns = [f"fd_num_{i}" for i in range(10)]
response_norm = [f"norm_cells_{i}" for i in range(10)]

drug_curves.shape

  drug_curves = pd.read_csv(_FOLDER+"normalised_dose_response_data.csv")


(225384, 44)

### Filtration Scenario 1

In [7]:
# Scenario 1: vary only the plateau tolerance; the plateau-location limits
# stay at 0.8 / 0.4. One CSV is written per tolerance, numbered in list order.
tolerance_values = [0.06, 0.07, 0.08, 0.09, 0.10]
for i, tolerance in enumerate(tolerance_values, start=1):
    filtered_df = FilteringCurves(
        drug_curves,
        response_norm,
        filtering_scenario=[1, 2, 3],
        first_columns_to_compare=[1, 2],
        last_columns_to_compare=[-1, -2],
        tolerance=tolerance,
        first_points_lower_limit=0.8,
        last_points_upper_limit=0.4,
    )

    # Write the filtered rows to datasets/filtering_scenario_1.<i>.csv
    filename = f"{_FOLDER_3}filtering_scenario_1.{i}.csv"
    filtered_df.to_csv(filename, index=False)

    # Report how many rows survived at this tolerance
    print(f"Tolerance used: {tolerance}, Dataset size after filtering: {filtered_df.shape[0]} rows")

Original dataset: (225384, 44)
Tolerance used: 0.06, Dataset size after filtering: 3667 rows
Original dataset: (225384, 44)
Tolerance used: 0.07, Dataset size after filtering: 4543 rows
Original dataset: (225384, 44)
Tolerance used: 0.08, Dataset size after filtering: 5436 rows
Original dataset: (225384, 44)
Tolerance used: 0.09, Dataset size after filtering: 6308 rows
Original dataset: (225384, 44)
Tolerance used: 0.1, Dataset size after filtering: 7145 rows


In [10]:
# Scenario 2: keep the tolerance at 0.05 and vary the plateau-location limits.
# The two lists are paired element by element, not crossed.
first_points_lower_limits = [0.75, 0.7, 0.65]
last_points_upper_limits = [0.45, 0.5, 0.55]

limit_pairs = zip(first_points_lower_limits, last_points_upper_limits)
for i, (first_limit, last_limit) in enumerate(limit_pairs, start=1):
    filtered_df = FilteringCurves(
        drug_curves,
        response_norm,
        filtering_scenario=[1, 2, 3],
        first_columns_to_compare=[1, 2],
        last_columns_to_compare=[-1, -2],
        tolerance=0.05,
        first_points_lower_limit=first_limit,
        last_points_upper_limit=last_limit,
    )

    # Write the filtered rows to datasets/filtering_scenario_2.<i>.csv
    filename = f"{_FOLDER_3}filtering_scenario_2.{i}.csv"
    filtered_df.to_csv(filename, index=False)

    # Report how many rows survived for this pair of limits
    print(f"First point lower limit: {first_limit}, Last point upper limit: {last_limit}, Dataset size: {filtered_df.shape[0]} rows")

Original dataset: (225384, 44)
First point lower limit: 0.75, Last point upper limit: 0.45, Dataset size: 2956 rows
Original dataset: (225384, 44)
First point lower limit: 0.7, Last point upper limit: 0.5, Dataset size: 3125 rows
Original dataset: (225384, 44)
First point lower limit: 0.65, Last point upper limit: 0.55, Dataset size: 3276 rows


In [11]:
# Scenario 3: combine the tolerance sweep of scenario 1 with the paired
# plateau-location limits of scenario 2.
tolerance_values = [0.06, 0.07, 0.08, 0.09, 0.10]
first_points_lower_limits = [0.75, 0.7, 0.65]
last_points_upper_limits = [0.45, 0.5, 0.55]

# Every tolerance is combined with every (lower, upper) limit pair;
# tolerance is the slow-moving axis.
parameter_grid = [
    (tol, lower, upper)
    for tol in tolerance_values
    for lower, upper in zip(first_points_lower_limits, last_points_upper_limits)
]

scenario_counter = 1
for tolerance, first_limit, last_limit in parameter_grid:
    filtered_df = FilteringCurves(
        drug_curves,
        response_norm,
        filtering_scenario=[1, 2, 3],
        first_columns_to_compare=[1, 2],
        last_columns_to_compare=[-1, -2],
        tolerance=tolerance,
        first_points_lower_limit=first_limit,
        last_points_upper_limit=last_limit,
    )

    # Write the filtered rows to datasets/filtering_scenario_3.<counter>.csv
    filename = f"{_FOLDER_3}filtering_scenario_3.{scenario_counter}.csv"
    filtered_df.to_csv(filename, index=False)

    # Report the criteria used and how many rows survived
    print(f"Scenario {scenario_counter}: Tolerance: {tolerance}, First point lower limit: {first_limit}, Last point upper limit: {last_limit}, Dataset size: {filtered_df.shape[0]} rows")

    scenario_counter += 1

Original dataset: (225384, 44)
Scenario 1: Tolerance: 0.06, First point lower limit: 0.75, Last point upper limit: 0.45, Dataset size: 3902 rows
Original dataset: (225384, 44)
Scenario 2: Tolerance: 0.06, First point lower limit: 0.7, Last point upper limit: 0.5, Dataset size: 4144 rows
Original dataset: (225384, 44)
Scenario 3: Tolerance: 0.06, First point lower limit: 0.65, Last point upper limit: 0.55, Dataset size: 4356 rows
Original dataset: (225384, 44)
Scenario 4: Tolerance: 0.07, First point lower limit: 0.75, Last point upper limit: 0.45, Dataset size: 4848 rows
Original dataset: (225384, 44)
Scenario 5: Tolerance: 0.07, First point lower limit: 0.7, Last point upper limit: 0.5, Dataset size: 5153 rows
Original dataset: (225384, 44)
Scenario 6: Tolerance: 0.07, First point lower limit: 0.65, Last point upper limit: 0.55, Dataset size: 5436 rows
Original dataset: (225384, 44)
Scenario 7: Tolerance: 0.08, First point lower limit: 0.75, Last point upper limit: 0.45, Dataset size:

: 