## Import libraries

In [1]:
import pandas as pd
import numpy as np
from fastprogress.fastprogress import progress_bar
import requests
from zipfile import ZipFile
from io import BytesIO
import os


## Data collection

In [2]:
# read in gdsc1 and gdsc2 files directly from the website
# save locally

def import_files():
    """Download the GDSC1 and GDSC2 raw-data archives and cache them as CSV.

    Each zip archive is fetched from the Sanger release 8.4 server, the CSV
    it contains is read into a DataFrame, and both tables are written to
    'data/gdsc1.csv' and 'data/gdsc2.csv' (the 'data' directory is created
    when missing).

    Returns
    -------
    tuple of pandas.DataFrame
        ``(gdsc1, gdsc2)``.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    base_url = 'https://cog.sanger.ac.uk/cancerrxgene/GDSC_release8.4/'
    csv_names = (
        'GDSC1_public_raw_data_24Jul22.csv',
        'GDSC2_public_raw_data_24Jul22.csv',
    )

    tables = []
    for csv_name in csv_names:
        # (connect, read) timeouts: the read timeout bounds the wait between
        # received chunks, not the whole download, so large files still work
        # while a stalled connection no longer hangs forever.
        r = requests.get(base_url + csv_name + '.zip', timeout=(10, 300))
        # fail here with the HTTP status instead of later with a BadZipFile
        r.raise_for_status()
        with ZipFile(BytesIO(r.content)) as files, files.open(csv_name) as member:
            tables.append(pd.read_csv(member))
    gdsc1, gdsc2 = tables

    os.makedirs('data', exist_ok=True)
    gdsc1.to_csv('data/gdsc1.csv', index=False)
    gdsc2.to_csv('data/gdsc2.csv', index=False)

    return gdsc1, gdsc2

In [3]:
# download both raw tables (import_files also writes them under data/)
gdsc1, gdsc2 = import_files()

  gdsc1 = pd.read_csv(files.open('GDSC1_public_raw_data_24Jul22.csv'))


## Data processing

In [4]:
# combine the gdsc1 and gdsc2 tables into one and drop duplicate rows

def load_data(gdsc1, gdsc2):
    """Stack the two tables and return the result without duplicate rows.

    The rows of ``gdsc1`` come first, followed by those of ``gdsc2``; the
    original index labels are kept, and for each set of identical rows only
    the first occurrence survives.
    """
    combined = pd.concat([gdsc1, gdsc2])
    return combined.drop_duplicates()

In [5]:
# one combined, de-duplicated table for the rest of the pipeline
gdsc = load_data(gdsc1, gdsc2)

In [6]:
# figure out which conditions have 3 or more replicates

def find_replicates(gdsc):
    """Select the measurements of every condition with 3 or more replicates.

    A condition is one combination of plate (DRUGSET_ID), cell line, drug,
    tag, seeding density, assay and duration. A condition is kept when at
    least one of its concentrations occurs more than twice in ``gdsc`` and
    its TAG is not 'DMSO', 'B', 'UN-USED' or 'NC-1'.

    The selected rows are also written to 'data/replicates.csv' (the 'data'
    directory is created when missing).

    Parameters
    ----------
    gdsc : pandas.DataFrame
        Measurements containing the seven condition columns plus 'CONC'.

    Returns
    -------
    pandas.DataFrame
        The rows of ``gdsc`` that belong to a kept condition, each taken
        once, with the condition columns moved to the front.
    """
    condition_cols = ['DRUGSET_ID', 'CELL_ID', 'DRUG_ID', 'TAG',
                      'SEEDING_DENSITY', 'ASSAY', 'DURATION']
    unwanted_tags = ['DMSO', 'B', 'UN-USED', 'NC-1']

    # count the rows per condition and concentration; naming the count column
    # through reset_index works whether pandas calls it 0 (1.x) or 'count' (2.x)
    counts = (
        gdsc[['DRUGSET_ID', 'CELL_ID', 'TAG', 'DRUG_ID', 'ASSAY',
              'SEEDING_DENSITY', 'DURATION', 'CONC']]
        .value_counts(dropna=False)
        .reset_index(name='replicates')
    )

    # only keep 3 or more replicates, and remove samples we don't need
    wanted = (counts['replicates'] > 2) & ~counts['TAG'].isin(unwanted_tags)

    # one key per condition: concentrations with different counts would
    # otherwise list the same condition several times and make the lookup
    # below return its rows several times
    conditions = counts.loc[wanted, condition_cols].drop_duplicates()
    rep_ind = list(conditions.itertuples(index=False, name=None))

    # make a new file that just has the data in replicate form
    rep_file = gdsc.set_index(condition_cols)
    rep_file = rep_file.loc[rep_ind] if rep_ind else rep_file.iloc[:0]
    rep_file = rep_file.reset_index()

    # save locally
    os.makedirs('data', exist_ok=True)
    rep_file.to_csv('data/replicates.csv', index=False)

    return rep_file


In [7]:
# rows of the conditions measured 3+ times (also written to data/replicates.csv)
replicates = find_replicates(gdsc)

In [8]:
# limit data to only those conditions with triplicates
# for the replicates greater than 3, batch into triplicates

def _split_by(frame, columns):
    """Yield the sub-frames of ``frame`` for each value combination of ``columns``.

    The split is nested in the order of ``columns`` and, at every level,
    follows the order in which the values first appear. Values are matched
    with ``==``, so rows holding NaN in one of the columns never end up in a
    yielded frame.
    """
    if not columns:
        yield frame
        return
    column, remaining = columns[0], columns[1:]
    for value in frame[column].unique():
        yield from _split_by(frame[frame[column] == value], remaining)


def _batch_condition(condition):
    """Return the list of triplicate frames (possibly empty) for one condition."""
    # number of replicates per (non-NaN) concentration
    conc_counts = condition['CONC'].value_counts()

    # make sure there are at least 5 concentrations
    if len(conc_counts) < 5:
        return []

    # every concentration already has exactly three replicates:
    # keep the condition as is, with the default batch number 0
    if (conc_counts == 3).all():
        exact = condition.copy()
        exact['batch'] = 0
        return [exact]

    # otherwise cut every concentration into consecutive groups of three,
    # as many as the least replicated concentration allows; the groups are
    # labelled 3, 6, 9, ... and surplus rows are left out
    n_batches = int(conc_counts.min()) // 3
    pieces = []
    for conc in condition['CONC'].dropna().unique():
        conc_rows = condition[condition['CONC'] == conc]
        for batch in range(3, n_batches * 3 + 1, 3):
            triple = conc_rows[batch - 3:batch].copy()
            triple['batch'] = batch
            pieces.append(triple)
    return pieces


def make_triplicates(replicates):
    """Reduce the replicate table to triplicate measurements.

    Rows are grouped by plate (DRUGSET_ID), cell line, assay, duration,
    seeding density and drug; TAG is not part of the grouping. A group is
    kept only when it has at least 5 distinct concentrations. If every
    concentration has exactly 3 rows the group is kept unchanged with
    ``batch`` 0; otherwise each concentration is cut into consecutive groups
    of three rows with ``batch`` 3, 6, 9, ....

    NOTE(review): rows whose DRUG_ID (or any other grouping column) is NaN
    never match a group and are dropped here; confirm that this is intended
    for control wells, since the normalization step looks for TAG 'NC-0'.

    The result is also written to 'data/triplicates.csv' (the 'data'
    directory is created when missing).

    Parameters
    ----------
    replicates : pandas.DataFrame
        Table with the grouping columns above plus 'CONC'.

    Returns
    -------
    pandas.DataFrame
        The kept rows with an added 'batch' column; an empty DataFrame when
        no group qualifies.
    """
    inner_cols = ['CELL_ID', 'ASSAY', 'DURATION', 'SEEDING_DENSITY', 'DRUG_ID']

    # collect the pieces and concatenate once: growing a DataFrame with
    # pd.concat inside the loop copies all previous rows on every step
    pieces = []
    for plate in progress_bar(replicates['DRUGSET_ID'].unique()):
        plate_rows = replicates[replicates['DRUGSET_ID'] == plate]
        for condition in _split_by(plate_rows, inner_cols):
            pieces.extend(_batch_condition(condition))

    triplicates_df = pd.concat(pieces) if pieces else pd.DataFrame()

    # save locally
    os.makedirs('data', exist_ok=True)
    triplicates_df.to_csv('data/triplicates.csv', index=False)

    return triplicates_df

In [9]:
# batch the replicates into groups of three (also written to data/triplicates.csv)
triplicates = make_triplicates(replicates)

## Normalization

In [10]:
# normalize the intensity values for every plate and cell line using the NC-0 control

def data_normalization(triplicates):
    """Normalize INTENSITY by the mean NC-0 intensity of each plate and cell line.

    For every (DRUGSET_ID, CELL_ID) pair, 'norm_intensity' is INTENSITY
    divided by the mean INTENSITY of that pair's rows tagged 'NC-0'. The
    'NC-0' rows themselves are removed from the result. A pair without any
    'NC-0' row gets NaN in 'norm_intensity', because the mean of no values
    is NaN.

    The result is also written to 'data/triplicates_normalized.csv' (the
    'data' directory is created when missing).

    Parameters
    ----------
    triplicates : pandas.DataFrame
        Table with 'DRUGSET_ID', 'CELL_ID', 'TAG' and 'INTENSITY' columns.

    Returns
    -------
    pandas.DataFrame
        The non-control rows with the added 'norm_intensity' column, grouped
        by plate and then by cell line in order of first appearance.
    """
    # collect the pieces and concatenate once instead of growing a DataFrame
    # inside the loop, which copies all previous rows on every step
    normalized = []

    for plate in progress_bar(triplicates['DRUGSET_ID'].unique()):
        plate_rows = triplicates[triplicates['DRUGSET_ID'] == plate]
        for cell in plate_rows['CELL_ID'].unique():
            wells = plate_rows[plate_rows['CELL_ID'] == cell].copy()
            is_control = wells['TAG'] == 'NC-0'
            control_mean = wells.loc[is_control, 'INTENSITY'].mean()
            wells['norm_intensity'] = wells['INTENSITY'] / control_mean
            normalized.append(wells[~is_control])

    if normalized:
        data_norm = pd.concat(normalized)
    else:
        # no rows to normalize: return an empty table with the expected
        # columns rather than failing on a missing 'TAG' column
        data_norm = triplicates.iloc[:0].assign(norm_intensity=pd.Series(dtype=float))

    os.makedirs('data', exist_ok=True)
    data_norm.to_csv('data/triplicates_normalized.csv', index=False)

    return data_norm

In [11]:
# intensities relative to the NC-0 wells (also written to data/triplicates_normalized.csv)
data_norm = data_normalization(triplicates)