In [366]:
import pandas as pd
import numpy as np
import glob
import json

### First, preprocess the screen data and create the relevant files

Screen IDs: A088 = CRISPRi, A090 = CRISPRa.

In [4]:
# Load the tab-separated Set A count table of the A088 (CRISPRi) screen.
crispri = pd.read_csv(
    '/dfs/project/perturb-gnn/AI_RA/Steinhart/A088_2022-04-19_SetA.count.txt',
    sep='\t',
)

In [149]:
def mean_repeats(data_df):
    """Average all rows that belong to the same gene.

    Every column except ``'sgRNA'`` is kept, the rows are grouped by ``'Gene'``
    and each group is reduced to its per-column mean.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Table that exposes ``'Gene'`` as a grouping key (the caller in this
        notebook sets it as the index before calling).

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``'Gene'`` value holding the group means.
    """
    kept_columns = []
    for column in data_df.columns:
        if column != 'sgRNA':
            kept_columns.append(column)

    per_gene = data_df.loc[:, kept_columns].groupby('Gene')
    return per_gene.mean()

def total_norm(X):
    """Scale ``X`` so that its values sum to one.

    For a ``pandas.DataFrame`` every column is divided by its own column
    total. Any other input (NumPy array, Series, ...) is divided by
    ``np.sum(X)`` exactly as before.

    Parameters
    ----------
    X : pandas.DataFrame or array-like
        Values to normalise.

    Returns
    -------
    Same kind of object as ``X`` divided by the relevant total(s).
    """
    if isinstance(X, pd.DataFrame):
        # np.sum(DataFrame) forwards axis=None to DataFrame.sum, whose
        # "axis=None means axis=0" behaviour is deprecated in pandas. Naming
        # the axis keeps the per-column normalisation stable across versions.
        return X / X.sum(axis=0)
    return X / np.sum(X)

def z_norm(X):
    """Standardise ``X`` by subtracting ``np.mean(X)`` and dividing by ``np.std(X)``.

    NOTE(review): for DataFrame inputs the axis that ``np.mean`` / ``np.std``
    reduce over is decided by pandas and may differ between versions --
    confirm before applying this to a whole frame.
    """
    centre = np.mean(X)
    spread = np.std(X)
    return (X - centre) / spread

def control_norm(X):
    """Express every entry as its relative change from the ``'NO-TARGET'`` row.

    Computes ``(X - control) / control`` where ``control`` is the row of ``X``
    labelled ``'NO-TARGET'``.

    Parameters
    ----------
    X : pandas.DataFrame
        Table whose index contains the label ``'NO-TARGET'``.

    Returns
    -------
    pandas.DataFrame
        Same shape as ``X``.
    """
    control_row = X.loc['NO-TARGET', :]
    shifted = X - control_row
    return shifted / control_row

In [150]:
# Map each screen type to the screen id used in the count-file names.
screen_map = {'crispri':'A088_2022-04-19', 'crispra':'A090'}

# Nested container: screen type -> CAR type -> day -> donor -> set ('A'/'B').
# Every leaf starts out as its own empty list.
processed_data = {
    screen_type: {
        car_type: {
            day_num: {
                donor_id: {set_: [] for set_ in ['A', 'B']}
                for donor_id in ['Donor21', 'Donor22', 'Donor23', 'Donor24']
            }
            for day_num in ['D5', 'D13', 'D18', 'D22']
        }
        for car_type in ['CD19', 'GD2']
    }
    for screen_type in ['crispri', 'crispra']
}

In [186]:
# For every screen and set: read the raw count table, average the rows of each
# gene, normalise, then write one ground-truth CSV per sample column and keep
# the same column in `processed_data`.
for screen_type in ['crispri', 'crispra']:
    screen_id = screen_map[screen_type]

    for set_ in ['A', 'B']:
        data = f'/dfs/project/perturb-gnn/AI_RA/Steinhart/{screen_id}_Set{set_}.count.txt'

        read_in = pd.read_csv(data, delimiter='\t')
        read_in = read_in.set_index('Gene')

        # mean over rows of the same gene -> total normalisation -> relative
        # change against the 'NO-TARGET' row
        mean_df = mean_repeats(read_in)
        mean_df = total_norm(mean_df)
        mean_df = control_norm(mean_df)

        for col in mean_df.columns:
            # Columns whose name has no '_' are skipped.
            if '_' not in col:
                continue

            # Fields 1-3 of the '_'-separated column name are read as
            # donor id, CAR type and day.
            fields = col.split('_')
            donor_id, car_type, day_num = fields[1], fields[2], fields[3]

            # One-column frame (gene index + this sample), written without a header row.
            col_data = mean_df.loc[:, [col]]
            col_data.to_csv(f'/dfs/user/yhr/AI_RA/research_assistant/datasets/Steinhart_temp/ground_truth_Steinhart_{screen_id}_{donor_id}_{car_type}_{day_num}_{set_}.csv',
                            header=False)

            # Fix: this assignment used to reference `col_data` without ever
            # defining it, which raised NameError on the first sample column.
            processed_data[screen_type][car_type][day_num][donor_id][set_] = col_data

In [331]:
# Every per-donor, per-set ground-truth file in the temporary folder.
all_files = glob.glob('/dfs/user/yhr/AI_RA/research_assistant/datasets/Steinhart_temp/ground_truth_Steinhart_*')

# Drop the '.csv' extension and then the last two characters of each path
# (the '_A' / '_B' set suffix in the file names written by the preprocessing loop).
case_names = [
    csv_path.partition('.csv')[0][:-2]
    for csv_path in all_files
]

In [332]:
# Average the per-donor / per-set files of every (screen, CAR type, day)
# combination and write one combined ground-truth CSV per combination.

# Each case name still contains its donor id, so several cases collapse to the
# same wildcard pattern. Collect the distinct patterns once instead of
# repeating the identical averaging (and file write) for every donor.
donor_patterns = set()
for c in np.unique(case_names):
    split_c = c.split('_')
    split_c[-3] = 'Donor??'  # third field from the end is the donor id
    donor_patterns.add('_'.join(split_c) + '_?.csv')

for relevant_filenames in sorted(donor_patterns):
    relevant_files = glob.glob(relevant_filenames)
    if not relevant_files:
        # Nothing matched: skip instead of reusing a stale (or undefined) sum_df.
        continue

    # Files were written without a header, so the value column is labelled 1.
    sum_df = pd.read_csv(relevant_files[0], index_col=0, header=None)
    for f in relevant_files[1:]:
        temp_df = pd.read_csv(f, index_col=0, header=None)
        sum_df += temp_df

    # Rows whose summed value is NaN are dropped before averaging.
    # NOTE(review): the sum is index-aligned, so a gene missing from any of the
    # matched files presumably ends up NaN and is dropped here -- confirm this
    # is intended when Set A and Set B files are combined.
    sum_df = sum_df[~sum_df[1].isna()]
    sum_df = sum_df/len(relevant_files)

    # Output path: leave the temp folder, drop the wildcards and swap the
    # screen ids for the screen-type names.
    new_name = relevant_filenames.replace('Steinhart_temp/', '')
    new_name = new_name.replace('Donor??_', '')
    new_name = new_name.replace('_?', '')
    new_name = new_name.replace('A090', 'crispra')
    new_name = new_name.replace('A088_2022-04-19', 'crispri')

    sum_df.to_csv(new_name)

### Compute topmovers

In [337]:
# Averaged ground-truth files that sit directly in the datasets folder.
processed_files = glob.glob(
    '/dfs/user/yhr/AI_RA/research_assistant/datasets/ground_truth_Steinhart_*'
)

In [338]:
def get_hits(df, col='1', n_std=2):
    """Return the index labels of rows whose score is an outlier.

    A row is a hit when its value in ``col`` lies more than ``n_std``
    population standard deviations (``np.std``) above or below the mean of
    that column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose index holds the labels to report.
    col : str, optional
        Column containing the scores. Defaults to ``'1'``, the column name the
        original implementation hard-coded.
    n_std : float, optional
        Width of the cut-off in standard deviations. Defaults to ``2``, the
        original hard-coded threshold.

    Returns
    -------
    list
        Labels of the rows above the upper threshold (in row order) followed
        by the labels of the rows below the lower threshold (in row order).
    """
    scores = df[col]
    values = scores.values

    centre = np.mean(values)
    spread = np.std(values)
    upper = centre + spread * n_std
    lower = centre - spread * n_std

    # Positional indices: high outliers first, then low outliers.
    above = np.where(scores > upper)[0]
    below = np.where(scores < lower)[0]
    positions = np.concatenate([above, below])

    return df.index.values[positions].tolist()

In [341]:
# Scratch preview of the top-movers file name. `name` is only assigned inside
# the loop of the cell below, so this expression needs that loop to have run.
'topmovers_'+name+'.csv'

'topmovers_Steinhart_crispri_GD2_D13.csv'

In [342]:
# For each averaged ground-truth file: load it, pick the hit genes with
# get_hits and store the list with np.save under a name built from the file name.
for f in processed_files:
    df = pd.read_csv(f, index_col=0)
    processed_hits = get_hits(df)

    # Text between the last '/ground_truth_' and the first following '.csv'.
    name = f.rsplit('/ground_truth_', 1)[-1].partition('.csv')[0]

    # NOTE(review): np.save appends '.npy' to file names that do not already
    # end with it, so the file on disk is 'topmovers_<name>.csv.npy' in NumPy
    # binary format rather than a CSV -- confirm downstream readers expect that.
    target = './datasets/topmovers_'+name+'.csv'
    np.save(target, processed_hits)

### Write task prompts

In [370]:
# Template that is filled in and written out once per ground-truth file.
task_prompt = {"Task": '', "Measurement": ''}
# Wording used in the prompt for each screen type / CAR type found in the file name.
exp_type_map = {'crispri':'inactivation', 'crispra': 'activation'}
car_type_map = {'GD2':'HA GD2', 'CD19': 'CD19'}

for f in processed_files:
    # File stem after '/ground_truth_' (used for the JSON file name).
    save_name = f.rsplit('/ground_truth_', 1)[-1].partition('.csv')[0]

    # Remainder after the 'Steinhart_' prefix: '<screen type>_<CAR type>_<day>.csv'.
    exp_name = f.rsplit('/ground_truth_Steinhart_', 1)[-1]
    exp_key, car_key, day_field = exp_name.split('_')
    day = day_field.partition('.')[0][1:]  # drop the extension and the leading letter
    exp_type = exp_type_map[exp_key]
    car_type = car_type_map[car_key]

    # NOTE(review): the measurement text says "log fold change", while
    # control_norm in this notebook computes (X - control) / control without
    # a log -- confirm the wording.
    task_prompt.update({
        "Task": f"identify genes that upon {exp_type} allow cells to resist T-cell exhaustion, under the {car_type} CAR (chimeric-antigenic receptor) condition",
        "Measurement": f"the log fold change in normalized sgRNA read counts compared to the non-targeting control, {day} days after perturbation",
    })

    file_path = f'./datasets/task_prompts/{save_name}'+'.json'
    with open(file_path, 'w') as file:
        json.dump(task_prompt, file)

In [369]:
# Show the prompt dict as left by the last iteration of the loop above.
task_prompt

{'Task': 'identify genes that upon inactivation allow cells to resist T-cell exhaustion, under the HA GD2 CAR (chimeric-antigenic receptor) condition',
 'Measurement': 'the log fold change in normalized sgRNA read counts compared to the non-targeting control, 13 days after perturbation'}

In [357]:
# Repeated inspection of the same `task_prompt` dict (earlier execution count).
task_prompt

{'Task': 'identify genes that upon inactivation allow cells to resist T-cell exhaustion, under the HA GD2 CAR (chimeric-antigenic receptor) condition',
 'Measurement': 'the log fold change in normalized sgRNA read counts compared to the non-targeting control, 13 days after perturbation'}