## Import libraries

In [1]:
import pandas as pd
from fastprogress.fastprogress import progress_bar
import os


## Data processing to replicates

In [2]:
# load gdsc file or import own file here

def load_data(filepath):
    """Read the raw GDSC drug-screen table from a CSV file.

    Parameters
    ----------
    filepath : str or path-like
        Location of the CSV file to read (the GDSC export or your own file
        with the same columns).

    Returns
    -------
    pandas.DataFrame
        The file contents exactly as parsed by ``pandas.read_csv``.
    """
    # the path has to be forwarded: pd.read_csv() with no argument raises a TypeError
    gdsc = pd.read_csv(filepath)

    return gdsc

In [3]:
# load the raw GDSC table once through load_data; the second, mis-indented
# pd.read_csv call re-read the same file and was a syntax error in a script
gdsc = load_data('data/gdsc.csv')


In [4]:
# figure out which conditions have 3 or more replicates

def find_replicates(gdsc):
    """Keep only the measurements of conditions that have 3 or more replicates.

    A condition is one combination of plate (DRUGSET_ID), cell line, tag,
    drug, assay, seeding density and duration. A condition is kept when at
    least one of its concentrations was measured 3 or more times. Conditions
    tagged 'DMSO', 'B', 'UN-USED' or 'NC-1' are discarded.

    Parameters
    ----------
    gdsc : pandas.DataFrame
        Raw table containing at least the columns DRUGSET_ID, CELL_ID, TAG,
        DRUG_ID, ASSAY, SEEDING_DENSITY, DURATION and CONC.

    Returns
    -------
    pandas.DataFrame
        The rows of ``gdsc`` belonging to the kept conditions, each row
        exactly once and in their original file order, with a fresh
        0..n-1 index. The seven condition columns come first, followed by
        the remaining columns in their original order.

    Side effects
    ------------
    Writes the result to 'data/replicates.csv' (creating 'data' if needed).
    """
    # columns that define one condition (counting order) ...
    condition_cols = ['DRUGSET_ID', 'CELL_ID', 'TAG', 'DRUG_ID', 'ASSAY', 'SEEDING_DENSITY', 'DURATION']
    # ... and the order in which they lead the returned table
    leading_cols = ['DRUGSET_ID', 'CELL_ID', 'DRUG_ID', 'TAG', 'SEEDING_DENSITY', 'ASSAY', 'DURATION']

    # number of replicates per condition and concentration; naming the count
    # column explicitly works on every pandas version (it is 0 before
    # pandas 2.0 and 'count' from 2.0 on, so renaming 0 is not reliable)
    counts = (
        gdsc[condition_cols + ['CONC']]
        .value_counts(dropna=False)
        .reset_index(name='replicates')
    )

    # only keep 3 or more replicates
    kept = counts.loc[counts['replicates'] > 2, condition_cols]

    # remove samples we don't need
    kept = kept[~kept['TAG'].isin(['DMSO', 'B', 'UN-USED', 'NC-1'])]

    # one row per condition: concentrations with different counts must not
    # list the same condition twice, otherwise its rows would be duplicated
    kept = kept.drop_duplicates()

    # select the matching rows by position; merge matches missing keys
    # (e.g. controls without DRUG_ID) with each other, and going through row
    # positions keeps the result independent of the merge's output order
    positions = gdsc[condition_cols].assign(_row=range(len(gdsc)))
    matched = positions.merge(kept, on=condition_cols, how='inner')
    rep_file = gdsc.iloc[sorted(matched['_row'])]

    # condition columns first, everything else in the original order
    other_cols = [col for col in gdsc.columns if col not in leading_cols]
    rep_file = rep_file[leading_cols + other_cols].reset_index(drop=True)

    # save locally
    os.makedirs('data', exist_ok=True)
    rep_file.to_csv('data/replicates.csv', index=False)

    return rep_file


In [5]:
# reduce the raw table to the replicated conditions (find_replicates also writes data/replicates.csv)
replicates = find_replicates(gdsc)

## Normalization

In [13]:
# normalize the intensity values for every plate and cell line using the NC-0 control

def data_normalization(replicates):
    """Normalize intensities by the mean NC-0 control of each plate and cell line.

    For every (DRUGSET_ID, CELL_ID) group, each INTENSITY is divided by the
    mean INTENSITY of that group's rows tagged 'NC-0'. The NC-0 rows are
    removed from the result afterwards.

    Parameters
    ----------
    replicates : pandas.DataFrame
        Table with at least the columns DRUGSET_ID, CELL_ID, TAG and INTENSITY.

    Returns
    -------
    pandas.DataFrame
        The non-control rows with an added 'norm_intensity' column, grouped
        by plate and then cell line in order of first appearance. A group
        without any NC-0 row gets NaN in 'norm_intensity' (the mean of an
        empty selection is NaN). Rows whose plate or cell line is missing
        match no group and are left out.

    Side effects
    ------------
    Writes the result to 'data/replicates_normalized.csv' (creating 'data'
    if needed).
    """
    # collect the groups and concatenate once at the end; growing a
    # DataFrame with pd.concat inside the loop copies all earlier rows
    # on every iteration
    normalized_groups = []

    # divide intensity values for each unique plate and cell line by the average of the control NC-0 intensity
    for plate in progress_bar(replicates['DRUGSET_ID'].unique()):
        plate_rows = replicates[replicates['DRUGSET_ID'] == plate]
        for cell in plate_rows['CELL_ID'].unique():
            cell_rows = plate_rows[plate_rows['CELL_ID'] == cell]
            control_mean = cell_rows.loc[cell_rows['TAG'] == 'NC-0', 'INTENSITY'].mean()
            normalized_groups.append(
                cell_rows.assign(norm_intensity=cell_rows['INTENSITY'] / control_mean)
            )

    if normalized_groups:
        data_norm = pd.concat(normalized_groups)
    else:
        # nothing to normalize: keep the expected columns so the filter and
        # the CSV export below still work on empty input
        data_norm = replicates.iloc[0:0].assign(norm_intensity=pd.Series(dtype='float64'))

    # remove the controls since they are no longer needed
    data_norm = data_norm[data_norm['TAG'] != 'NC-0']

    # save locally
    os.makedirs('data', exist_ok=True)
    data_norm.to_csv('data/replicates_normalized.csv', index=False)

    return data_norm

In [14]:
# normalize against the NC-0 controls (data_normalization also writes data/replicates_normalized.csv)
data_norm = data_normalization(replicates)

## Data processing to triplicates

In [15]:
# limit data to only those conditions with triplicates
# for the replicates greater than 3, batch into triplicates

def _split_by(frame, columns):
    """Yield the sub-frames of ``frame`` that share one value in each of ``columns``.

    The columns are applied in order, like nested loops over ``unique()``
    values, so sub-frames come out in order of first appearance at every
    level. A missing value matches no row, so it ends up as an empty
    sub-frame (last column) or as nothing at all (earlier columns).
    """
    if not columns:
        yield frame
        return
    first, rest = columns[0], columns[1:]
    for value in frame[first].unique():
        yield from _split_by(frame[frame[first] == value], rest)


def _batch_condition(condition_rows):
    """Return the triplicate pieces (list of DataFrames) for one condition.

    ``condition_rows`` holds all rows of a single plate / cell line / assay /
    duration / seeding density / drug combination.
    """
    # number of replicates per concentration (rows without CONC are not counted)
    conc_counts = condition_rows['CONC'].value_counts()

    # make sure there are at least 5 concentrations
    if len(conc_counts) < 5:
        return []

    # if all of the concentrations have three replicates, keep the condition as is
    if (conc_counts == 3).all():
        # default batch number is 0
        return [condition_rows.assign(batch=0)]

    # otherwise batch into threes: the concentration with the fewest
    # replicates decides how many complete batches every concentration gets
    n_batches = int(conc_counts.min()) // 3

    pieces = []
    for conc in condition_rows['CONC'].unique():
        conc_rows = condition_rows[condition_rows['CONC'] == conc]
        if conc_rows.empty:
            # a missing CONC matches no rows; there is nothing to batch
            continue
        for stop in range(3, (n_batches * 3) + 1, 3):
            # rows stop-3 .. stop-1 form one batch, labelled with its end offset
            pieces.append(conc_rows.iloc[stop - 3:stop].assign(batch=stop))
    return pieces


def make_triplicates(data_norm):
    """Limit the normalized data to triplicates, batching larger replicate sets.

    The data is split into conditions (plate, cell line, assay, duration,
    seeding density, drug). A condition is used only if it has at least 5
    distinct concentrations. If every concentration has exactly 3 replicates
    the condition is kept unchanged with ``batch`` = 0. Otherwise each
    concentration is cut into consecutive groups of 3 rows, labelled
    ``batch`` = 3, 6, ...; the number of groups is set by the concentration
    with the fewest replicates, and surplus rows are dropped.

    Parameters
    ----------
    data_norm : pandas.DataFrame
        Table with at least the columns DRUGSET_ID, CELL_ID, ASSAY, DURATION,
        SEEDING_DENSITY, DRUG_ID and CONC.

    Returns
    -------
    pandas.DataFrame
        The selected rows with an added 'batch' column. When no condition
        qualifies the result is empty but still has the input columns plus
        'batch'.

    Side effects
    ------------
    Writes the result to 'data/triplicates_normalized.csv' (creating 'data'
    if needed).
    """
    # collect the pieces and concatenate once; calling pd.concat in the
    # innermost loop copies every previously added row again and again
    pieces = []

    # narrow dataset down to each plate, cell line, assay, duration, seeding density, and drug
    for plate in progress_bar(data_norm['DRUGSET_ID'].unique()):
        plate_rows = data_norm[data_norm['DRUGSET_ID'] == plate]
        for condition_rows in _split_by(
            plate_rows, ['CELL_ID', 'ASSAY', 'DURATION', 'SEEDING_DENSITY', 'DRUG_ID']
        ):
            pieces.extend(_batch_condition(condition_rows))

    if pieces:
        triplicates_df = pd.concat(pieces)
    else:
        triplicates_df = data_norm.iloc[0:0].assign(batch=pd.Series(dtype='int64'))

    # save locally
    os.makedirs('data', exist_ok=True)
    triplicates_df.to_csv('data/triplicates_normalized.csv', index=False)

    return triplicates_df

In [16]:
# batch the normalized data into triplicates (make_triplicates also writes data/triplicates_normalized.csv)
triplicates = make_triplicates(data_norm)