- For our positive controls, ideally we’d like to establish a reference ORF paired with two mutants, one showing strong shifts and one showing subtle shifts in the protein channel, as well as detectable changes in morphology. In this case, profiling would be especially helpful. For the NegCons, we must slim down our selection to only 4 ORFs – I’m not sure if you guys have a preference for the selection there.


- Regarding the PosCons, we’d like to select either IMPDH1 or ALK as our reference allele,
  plus two of its variants (one that shows strong morphological shifts/localization patterns,
  and one whose shifts are subtle). For the NegCons, we can only include 4 in our screen –
  I’ll leave it up to you guys which 4 best suit your needs.


- You can disregard all wells that are not labelled either PosCon or NegCon for this screen.
  Please also keep in mind that each quadrant received a different dose of viral supernatant.
  The amount I settled on for our final pipeline is 6 uL, so you may want to focus on the wells that received vTitre = 6.


In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib notebook
import pandas as pd
import numpy as np
import seaborn as sns
sns.set_style("whitegrid")
import sklearn.preprocessing as sp
import pickle
# from imblearn.over_sampling import SMOTE  
from sklearn.metrics import accuracy_score,confusion_matrix
import matplotlib.pyplot as plt
import os

from scipy.stats import pearsonr
import scipy
from matplotlib import rcParams
from sklearn.cluster import KMeans
import sys
sys.path.insert(0, '/home/ubuntu/workspace_SingleCell/SingleCell_Morphological_Analysis/') 
from singlecell.preprocess import handle_nans, extract_cpfeature_names

import sys
sys.path.insert(0, '/home/ubuntu/workspace_rare/2017_09_27_RareDiseases_Taipale/') 
from utils import preprocessing,meanProfileAnalysis,visualization,impactscore
from datetime import date
# Run date; formatted as YYYYMMDD further down to stamp the impact-score CSV filename.
today = date.today()

#### Read cleaned metadata from "metadata/reprocessed" folder

In [None]:
# Workspace root and batch identifier used by the cells below.
rootDir = '/home/ubuntu/bucket/projects/2017_10_19_Profiling_rare_ORFs/workspace/'
batch = '2022_08_22_Batch_1'

# Annotation table for this batch, read from metadata/reprocessed.
annot_df = pd.read_csv(f'{rootDir}/metadata/reprocessed/{batch}.csv')
annot_df.head()

annot_df.head()

In [None]:
# Show the distinct values of the 'control_type' annotation column.
annot_df['control_type'].unique()

#### Save raw intensity features for threshold calculation and transfection detection

In [None]:
# Workspace root (no trailing slash) passed to saveRawIntensityFeatures.
rootPath = '/home/ubuntu/bucket/projects/2017_10_19_Profiling_rare_ORFs/workspace'
##########################
channels_used = ['GFP']

# Call saveRawIntensityFeatures once per unique Metadata_batch_Plate value.
listOfBatchPlates = annot_df['Metadata_batch_Plate'].unique().tolist()
for bp in listOfBatchPlates:
    _ = preprocessing.saveRawIntensityFeatures(bp, annot_df, rootPath, channels_used)

In [None]:
# Load the per-plate raw intensity feature tables. df_inten keeps the raw
# values; df_inten_scaled_perPlate holds a copy whose GFP columns were clipped
# to [0, 99.9th percentile] and min-max scaled per plate. Both are then merged
# with the well annotations on (Metadata_Plate, Metadata_Well).
listOfPlates = annot_df.Metadata_Plate.unique().tolist()

# NOTE(review): `preprocessing` is bound by the imports above to the project
# `utils.preprocessing` module, not to sklearn (imported as `sp`) -- confirm
# that module exposes MinMaxScaler.
scaler0 = preprocessing.MinMaxScaler(feature_range=(0, 1))

# DataFrame.append was removed in pandas 2.0, so the per-plate frames are
# collected in lists and concatenated once after the loop.
raw_frames = []
scaled_frames = []
# listOfPlates=['20X_CP_CP127_1']
for p in listOfPlates:  # [0:1]:
    fileNameToSave = rootDir + '/backend/plate_raw_intensity_features/' + batch + '/df_intensityFeatures_' + p
    intFeaturesDf = pd.read_pickle(fileNameToSave, compression='infer')
    raw_frames.append(intFeaturesDf)

    intFeatures = intFeaturesDf.columns[intFeaturesDf.columns.str.contains("GFP")].tolist()

    # Clip on a separate copy: the frame stored in raw_frames must stay
    # unclipped (the former append() call copied it before clipping).
    clipped = intFeaturesDf.loc[:, intFeatures].copy()
    for ifi in intFeatures:
        qpi = clipped[ifi].quantile(0.999)
        clipped[ifi] = clipped[ifi].clip(0, qpi)

    # The scaler is re-fit for every plate, so scaling is per plate.
    df_inten_scaled0 = intFeaturesDf.copy()
    df_inten_scaled0[intFeatures] = scaler0.fit_transform(clipped)
    scaled_frames.append(df_inten_scaled0)

# pd.concat raises on an empty list; fall back to an empty frame as before.
df_inten = pd.concat(raw_frames, ignore_index=True) if raw_frames else pd.DataFrame()
df_inten_scaled_perPlate = (
    pd.concat(scaled_frames, ignore_index=True) if scaled_frames else pd.DataFrame()
)
print(df_inten.shape)

df_inten = pd.merge(df_inten, annot_df, how='inner', on=['Metadata_Plate', 'Metadata_Well'])
df_inten_scaled_perPlate = pd.merge(
    df_inten_scaled_perPlate, annot_df, how='inner', on=['Metadata_Plate', 'Metadata_Well']
)

# Density histograms of two GFP intensity features: top row for all rows of
# df_inten, bottom row split by the 'control' column. A dotted red line marks
# the 99th percentile in each panel.
intFeatures = [
    'Cells_Intensity_UpperQuartileIntensity_GFP',
    'Cells_Intensity_MeanIntensity_GFP',
]
# Cells_Intensity_UpperQuartileIntensity_DsRed
# Cells_Intensity_UpperQuartileIntensity_Protein
log_scale_enabled = True
binss = 1000

rcParams['patch.force_edgecolor'] = False
# df_inten=df_inten_scaled_perPlate.copy()
# Despite the "perc95" names, both lists collect 99th-percentile values.
perc95all_m1a_x = []
perc95all_m1b_x = []
fig, axes = plt.subplots(2, len(intFeatures), figsize=(9, 6), sharex=True)

# Keyword arguments shared by the histograms in both rows.
hist_kwargs = dict(
    bins=binss,
    stat="density",
    element="step",
    common_norm=False,
    legend=True,
    log_scale=log_scale_enabled,
)

for i, feat in enumerate(intFeatures):
    ax_all = axes[0, i]
    ax_ctrl = axes[1, i]

    #### Method 1 -a: every row of df_inten
    allDataTandU = df_inten[feat].values
    sns.histplot(data=df_inten, x=feat, ax=ax_all, **hist_kwargs)

    p99_all = np.percentile(allDataTandU, 99)
    ax_all.axvline(x=p99_all, linestyle=':', color="r")
    perc95all_m1a_x.append(p99_all)
    ax_all.set_title('_'.join(feat.split('_')[2:]))
    axes[0, 0].set_ylabel('All single cells')

    #### Method 1 -b: rows selected through the 'control' column
    data2plotUtrans = df_inten[df_inten['control']][feat].values
    sns.histplot(data=df_inten, x=feat, hue="control", ax=ax_ctrl, **hist_kwargs)

    print(np.percentile(data2plotUtrans, 40))
    p99_ctrl = np.percentile(data2plotUtrans, 99)
    ax_ctrl.axvline(x=p99_ctrl, linestyle=':', color="r")

    perc95all_m1b_x.append(p99_ctrl)
plt.tight_layout()
# axes[1,0].legend();

#### 4. Generate replicate level profiles based on fixed cell mean intensity threshold
- For this batch, all cells are assumed to be transfected, so `transfection_params_dict={}`.

In [None]:
# All cells are assumed transfected for this batch, so no transfection
# parameters are supplied.
transfection_params_dict = {}

feature_scaling_params_dict = {'feature_scaler': 'Robust'}

# Parameter bundle handed to generate_population_profiles for every plate.
all_params = {
    'enrichement_profiles_params': {},
    'transfection_params_dict': transfection_params_dict,
    'feature_scaling_params_dict': feature_scaling_params_dict,
    'save_single_cells': True,
}

listOfBatchPlates = annot_df['Metadata_batch_Plate'].unique().tolist()

for bp in listOfBatchPlates:
    preprocessing.generate_population_profiles(bp, annot_df, rootDir, all_params)

#### 5. Load and preprocess replicate level profiles

In [None]:
# Options noted for the two scaling settings below:
# sc_per_plate_scaling # 'sc_scaled_per_plate','raw'
# zscored_profiles # 'untransfected','untransfected_stringent'
feature_scaling_params_dict = {
    'sc_per_plate_scaling': 'sc_scaled_per_plate',
    'zscored_profiles': [False, 'untransfected'],
    'post_scale_all_profiles': [False, 'Standard'],
}

dirs_params_dict = {
    'rootDir': rootDir,
    'profiles_folder_in_workspace': 'population_profiles',
}

# Reader configuration: directories, scaling options, protein channel suffix.
read_pop_params = {
    'dirs_params_dict': dirs_params_dict,
    'feature_scaling_params_dict': feature_scaling_params_dict,
    'protein_channel_suffix': 'GFP',
}

df_scaled_annot, cpFeats_A, cpFeats_P, cpFeats_NP = (
    meanProfileAnalysis.read_merge_preprocess_meanProfiles(annot_df, read_pop_params)
)

# dfTransSummary = df_scaled_annot[['Metadata_batch_Plate','Metadata_Sample_Unique','n_transf','n_untransf','transf_Ratio']]
# Annotation columns plus the transfection count/ratio columns.
summary_cols = annot_df.columns.tolist() + ['n_transf', 'n_untransf', 'transf_Ratio']
dfTransSummary = df_scaled_annot[summary_cols]


#### 6. Calculate replicate correlation of profiles
Save curve plots and values to results/replicate_corr_curves

In [None]:
from singlecell.process.replicate_correlation import replicate_null_corr_coefs
from singlecell.process import normalize_funcs

# Keep only profiles whose transfection_status equals 1.
transfected_rows = df_scaled_annot['transfection_status'] == 1
df_rep_level = df_scaled_annot[transfected_rows].reset_index(drop=True)

# Standardize protein + non-protein features within each Metadata_batch_Plate.
df_rep_level_scaled = normalize_funcs.standardize_per_catX(
    df_rep_level, 'Metadata_batch_Plate', cpFeats_P + cpFeats_NP
).copy()
# df_rep_level_scaled = normalize_funcs.standardize_df_columns(df_rep_level,cpFeats_P+cpFeats_NP,'Standard')

# Samples that appear in more than one row (i.e. have replicates).
nOfReps = df_rep_level_scaled.groupby(['Metadata_Sample_Unique']).size().reset_index()
has_replicates = nOfReps[0] != 1
pairWithReplicates = nOfReps.loc[has_replicates, :].reset_index()['Metadata_Sample_Unique']  # .groupby([0]).size()

scal_status = df_rep_level_scaled['normalization'].unique()[0]
# zscor_status=df_rep_level_scaled['zscored'].unique().astype(str)[0]
zscor_status = 'nan'
# if not np.isnan(df_rep_level_scaled['zscored'].unique()[0])

saveDir = f'{rootDir}/results/replicate_corr_curves/{batch}'
#
os.makedirs(saveDir, exist_ok=True)

pertColName = 'Metadata_Sample_Unique'
repCor4impactList = []

# (feature columns, short channel tag, plot title prefix) per channel group.
channel_specs = [
    (cpFeats_P, 'p', 'Protein_Channel'),
    (cpFeats_NP, 'np', 'NonProtein_Channels'),
]
for f, ch, t in channel_specs:
    print(ch, t)
    t2 = ', '.join([t, scal_status, 'zscored: ' + zscor_status])
    fh_2save, repCorrDf = replicate_null_corr_coefs(
        df_rep_level_scaled, pertColName, f, 1, title=t2, hist_bins=10
    )
    fh_2save.savefig(saveDir + '/' + '_'.join([ch, scal_status, zscor_status]) + '.png')
    # Tag the result columns with the channel so both groups can sit side by side.
    repCorrDf = repCorrDf.add_suffix('_' + ch)
    repCor4impactList.append(repCorrDf)

repCorr_df_avg = (
    pd.concat(repCor4impactList, axis=1)
    .reset_index()
    .rename(columns={'index': pertColName})
)
repCorr_df_avg.to_csv(
    saveDir + '/' + '_'.join([scal_status, zscor_status]) + '.csv', index=False
)
df_rep_level_scaled = pd.merge(df_rep_level_scaled, repCorr_df_avg, how='left', on=pertColName)

#### 7. Calculate WT-MT impact scores and save
- Approach 1: average replicate level profiles and score treatment level profiles
- Approach 2: calculate impact scores per plate

In [None]:
# Approach 1

# Grouping key: gene name joined to the vTitre value with a dash.
dose_label = df_rep_level_scaled['vTitre'].astype(str)
df_rep_level_scaled['Gene-dose'] = df_rep_level_scaled['Gene'] + '-' + dose_label
wt_mt_cols = ['Gene-dose', 'Metadata_Sample_Unique']

feature_groups = [cpFeats_P, cpFeats_NP]
impact_scores_trt_profs = impactscore.impact_score_wt_mt(
    df_rep_level_scaled, repCorr_df_avg, feature_groups, wt_mt_cols
)

# Write the scores to a date-stamped CSV under the batch results folder.
saveDir = f'{rootDir}/results/Impact-Scores/Method-MeanProfiles/{batch}'
os.makedirs(saveDir, exist_ok=True)
date_stamp = today.strftime("%Y%m%d")
impact_scores_trt_profs.to_csv(f'{saveDir}/impact_scores_trt_{date_stamp}.csv', index=False)

In [None]:
# print(impact_scores_trt_profs.to_markdown())

In [None]:
# Sample identifiers of the negative and positive controls at vTitre == 6.
at_vtitre_6 = annot_df['vTitre'] == 6

control_type = 'negcon'
neg_con_unq = annot_df.loc[
    (annot_df['control_type'] == control_type) & at_vtitre_6,
    'Metadata_Sample_Unique',
].tolist()

control_type = 'poscon'
pos_con_unq = annot_df.loc[
    (annot_df['control_type'] == control_type) & at_vtitre_6,
    'Metadata_Sample_Unique',
].tolist()

In [None]:
# Print the full impact-score table in Markdown format.
print(impact_scores_trt_profs.to_markdown())

In [None]:
# Number of annotation rows per negative-control sample at vTitre == 6.
negcon_at_6 = (annot_df['control_type'] == 'negcon') & (annot_df['vTitre'] == 6)
annot_df[negcon_at_6].groupby('Metadata_Sample_Unique').size()

In [None]:
# Impact-score rows whose sample is one of the positive controls (Markdown).
poscon_rows = impact_scores_trt_profs['Metadata_Sample_Unique'].isin(pos_con_unq)
print(impact_scores_trt_profs[poscon_rows].to_markdown(index=False))

In [None]:
# Replicate-correlation rows whose sample is one of the negative controls (Markdown).
negcon_rows = repCorr_df_avg['Metadata_Sample_Unique'].isin(neg_con_unq)
print(repCorr_df_avg[negcon_rows].to_markdown(index=False))

In [None]:
# impact_scores_trt_profs
# repCorr_df_avg

In [None]:
# For each channel group ('p' = protein features, 'np' = non-protein features):
# average the profiles per (Gene-dose, sample), correlate the averaged
# profiles with each other, and for every group key that has more than one
# sample collect the correlation of each sample with the sample whose
# Metadata_Sample_Unique equals the group key.
featColNames_ls = [cpFeats_P, cpFeats_NP]
impact_scores_df_chs_ls = []
for f, ch in zip(featColNames_ls, ['p', 'np']):
    # Select the feature columns before averaging: mean() over the whole frame
    # also hits string metadata columns, which raises TypeError in
    # pandas >= 2.0. The result is unchanged.
    impact_corr_mat = df_rep_level_scaled.groupby([wt_mt_cols[0], wt_mt_cols[1]])[f].mean().T.corr()

    # Number of distinct samples under each first-level key.
    genes_variant_size = impact_corr_mat.groupby(wt_mt_cols[0]).size().reset_index()
    # Keep keys with >1 sample that also occur as a sample identifier; the
    # triple [g] lookup below depends on such a sample existing.
    genes_with_variant = list(set(genes_variant_size.loc[genes_variant_size[0] > 1, wt_mt_cols[0]].tolist()) &
                              set(df_rep_level_scaled[wt_mt_cols[1]].unique().tolist()))
    impact_scores_df_ls = []
    for g in genes_with_variant:
        # Rows of group g, columns of group g, then the column of sample g:
        # one correlation value per sample in the group.
        per_gene_df = impact_corr_mat.loc[g][g][g].reset_index()
    #     per_gene_df['Gene']=per_gene_df.columns[1]
        per_gene_df[wt_mt_cols[0]] = g
        per_gene_df['wt_RepCor_' + ch] = repCorr_df_avg.loc[repCorr_df_avg[wt_mt_cols[1]] == g, 'RepCor_' + ch].values[0]

        impact_scores_df_ls.append(per_gene_df.rename(columns={g: 'cc_' + ch}))

    impact_scores_df = pd.concat(impact_scores_df_ls, ignore_index=True)
    impact_scores_df_chs_ls.append(impact_scores_df)

# Place both channel tables side by side and drop columns whose values
# duplicate an earlier column (transpose -> drop_duplicates -> transpose).
impact_scores_df_chs = pd.concat(impact_scores_df_chs_ls, axis=1).T.drop_duplicates().T

In [None]:
# impact_corr_mat.loc[g][g]
# Show the grouping key next to the sample identifier and the viral dose.
df_rep_level_scaled[['Gene-dose','Metadata_Sample_Unique','vTitre']]

In [None]:
# pd.concat(repCor4impactList,axis=1).reset_index()

In [None]:
# NOTE(review): `results_df` is not defined in any of the cells shown here --
# this raises NameError unless it comes from elsewhere; confirm or remove.
results_df.Variant.unique()

In [None]:
# 20 colours from seaborn's "Paired" palette.
palette = sns.color_palette("Paired",20)

In [None]:
# Show the second colour of the palette.
palette[1]