In [87]:
import pandas as pd
import numpy as np
from copy import deepcopy
from rs3.seq import predict_seq

In [2]:
# read in processed in-house datasets, subset to targeting guides (i.e. all guides except intergenic and non-targeting controls)
file_path = '../../Data/primaryLibrary/'
annot_guide_lfcs_A549_Kox1 = pd.read_csv(file_path + 'A549_Kox1_zscored.csv')
annot_guide_lfcs_A549_Kox1 = annot_guide_lfcs_A549_Kox1[annot_guide_lfcs_A549_Kox1.Category.isin(['Essential', 'Non-essential'])]

annot_guide_lfcs_A549_Zim3 = pd.read_csv(file_path +  'A549_Zim3_zscored.csv')
annot_guide_lfcs_A549_Zim3 = annot_guide_lfcs_A549_Zim3[annot_guide_lfcs_A549_Zim3.Category.isin(['Essential', 'Non-essential'])]

annot_guide_lfcs_HCT116_Kox1 = pd.read_csv(file_path +  'HCT116_Kox1_zscored.csv')
annot_guide_lfcs_HCT116_Kox1 = annot_guide_lfcs_HCT116_Kox1[annot_guide_lfcs_HCT116_Kox1.Category.isin(['Essential', 'Non-essential'])]

annot_guide_lfcs_HCT116_Zim3 = pd.read_csv(file_path +  'HCT116_Zim3_zscored.csv')
annot_guide_lfcs_HCT116_Zim3 = annot_guide_lfcs_HCT116_Zim3[annot_guide_lfcs_HCT116_Zim3.Category.isin(['Essential', 'Non-essential'])]

#add column to specify domain before concatenating
annot_guide_lfcs_A549_Kox1['Domain'] = 'Kox1'
annot_guide_lfcs_A549_Zim3['Domain'] = 'Zim3'
annot_guide_lfcs_HCT116_Kox1['Domain'] = 'Kox1'
annot_guide_lfcs_HCT116_Zim3['Domain'] = 'Zim3'

InHouseCleanData = pd.concat([annot_guide_lfcs_A549_Kox1, annot_guide_lfcs_A549_Zim3, annot_guide_lfcs_HCT116_Kox1, annot_guide_lfcs_HCT116_Zim3])

# read in design file to append additional features
ess_noness = pd.read_csv(file_path + 'essentials_nonessentials_CRISPRi_tiling_designs_v2.csv')
ess_noness['chromosome'] = ess_noness['Reference Sequence'].apply(lambda x: int(x.split("NC_")[1].split(".")[0]) if pd.notnull(x) else x)
ess_noness = ess_noness[['Target Gene Symbol','Target Gene ID','Category','chromosome','sgRNA Sequence', 
           'sgRNA Context Sequence', 'TSS Position', 
       'sgRNA \'Cut\' Site TSS Offset', 'sgRNA \'Cut\' Position', 
            'Strand of Target','Strand of sgRNA', 'Orientation',
            'On-Target Ruleset','On-Target Efficacy Score']]
InHouseCleanData = pd.merge(InHouseCleanData, ess_noness)

InHouseCleanData.head()

Unnamed: 0,sgRNA Sequence,condition,avg_lfc,n_obs,Target Gene Symbol,Target Gene ID,Category,TSS Position,sgRNA 'Cut' Site TSS Offset,sgRNA Context Sequence,On-Target Ruleset,On-Target Efficacy Score,z_scored_avg_lfc,Domain,chromosome,sgRNA 'Cut' Position,Strand of Target,Strand of sgRNA,Orientation
0,AAAAAAAAAAAGGGCAGAAG,A549,-0.376159,2,APOBEC1,339.0,Non-essential,7665908.0,-999.0,AAAAAAAAAAAAAAAGGGCAGAAGTGGACT,RS3seq-Chen2013,-0.7248,-1.389004,Kox1,12.0,7666908.0,-,-,sense
1,AAAAAAAAAAATTAGACCTC,A549,-0.021644,2,MRGPRD,116512.0,Non-essential,68980986.0,-735.0,AAAAAAAAAAAAAAATTAGACCTCAGGGCA,RS3seq-Chen2013,-0.6309,-0.52814,Kox1,11.0,68981722.0,-,+,antisense
2,AAAAAAAAAAATTAGACCTC,HCT116,-0.257163,2,MRGPRD,116512.0,Non-essential,68980986.0,-735.0,AAAAAAAAAAAAAAATTAGACCTCAGGGCA,RS3seq-Chen2013,-0.6309,-1.381448,Kox1,11.0,68981722.0,-,+,antisense
3,AAAAAAAAAAATTAGACCTC,HCT116,-0.277625,2,MRGPRD,116512.0,Non-essential,68980986.0,-735.0,AAAAAAAAAAAAAAATTAGACCTCAGGGCA,RS3seq-Chen2013,-0.6309,-1.315569,Zim3,11.0,68981722.0,-,+,antisense
4,AAAAAAAAAACAGGACACAG,A549,0.03181,2,LARS2,23395.0,Essential,45388577.0,667.0,CAGAAAAAAAAAAACAGGACACAGGGGAGA,RS3seq-Chen2013,0.014,-0.398339,Kox1,3.0,45389244.0,+,-,antisense


In [41]:
# read in cleaned Gilbert ricin data
file_path = '../../Data/externalData/'
GilbertRicinCleanData = pd.read_csv(file_path + 'Gilbert2014Cleaned.csv')
GilbertRicinCleanData['condition'] = 'K562'
GilbertRicinCleanData['Domain'] = 'Kox1'
GilbertRicinCleanData.head()

Unnamed: 0,Target Gene Symbol,chromosome,strand targeted,PAM genomic coordinate [hg38],context seq,guide id,oligo,sgRNA sequence,zscoreRelativeNC,Avg_LFC,sgRNA sequence in Context,Target Gene Function,zscoreRelativeNC_signed,Avg_LFC_signed,condition,Domain
0,ARCN1,chr11,+,118567638.0,GCACTTTCCTTGTTTACCTCTGGTAGGTTT,ARCN1_w_118438354.27,cccttggagaaCCAcctTGTTGGCACTTTCCTTGTTTACCTCTGGT...,GCACTTTCCTTGTTTACCTCTGGT,0.250358,-0.134928,True,Resistance,-0.250358,0.134928,K562,Kox1
1,ARCN1,chr11,+,118568026.0,GCAACCCTGTAAGCATGCTTCTTGAGGAGT,ARCN1_w_118438742.27,cccttggagaaCCAcctTGTTGGCAACCCTGTAAGCATGCTTCTTG...,GCAACCCTGTAAGCATGCTTCTTG,0.157641,-0.166789,True,Resistance,-0.157641,0.166789,K562,Kox1
2,ARCN1,chr11,+,118568080.0,GATGCTGCTTCTTCGGAAATATAACGGTAG,ARCN1_w_118438796.24,cccttggagaaCCAcctTGTTGGCTGCTTCTTCGGAAATATAAGTT...,GCTGCTTCTTCGGAAATATAA,0.21634,-0.146618,True,Resistance,-0.21634,0.146618,K562,Kox1
3,ARCN1,chr11,+,118568134.0,TAGCACCTTCCAGGGCTTCCAACTTGGATC,ARCN1_w_118438850.25,cccttggagaaCCAcctTGTTGGCACCTTCCAGGGCTTCCAACTGT...,GCACCTTCCAGGGCTTCCAACT,0.320467,-0.110835,True,Resistance,-0.320467,0.110835,K562,Kox1
4,ARCN1,chr11,+,118568146.0,ATTGCCTTATTGTAGCACCTTCCAGGGCTT,ARCN1_w_118438862.24,cccttggagaaCCAcctTGTTGGCCTTATTGTAGCACCTTCCAGTT...,GCCTTATTGTAGCACCTTCCA,-0.708019,-0.464269,True,Resistance,0.708019,0.464269,K562,Kox1


In [42]:
# read in cleaned Nunez data
NunezCleanData = pd.read_csv(file_path + 'Nunez2021Cleaned.csv')
NunezCleanData['condition'] = 'K562'
NunezCleanData['Domain'] = 'Kox1'
NunezCleanData.head()

Unnamed: 0,ID,Target Gene Symbol,chromosome,strand,sequence,context_seq,hg38_coord,start_coord_hg38,end_coord_hg38,Phenotype scores-ave_Rep1_Rep2,Avg_LFC-Tfinal,z-score relative to NC,condition,Domain
0,ACTR6_+_100592059,ACTR6,chr12,+,GCGAGCACATAGAGGGATCT,TGCTTCGAGCACATAGAGGGATCTGGGCCC,100198281,100198284,100198303,-0.018881,0.035104,-0.889828,K562,Kox1
1,ACTR6_+_100592060,ACTR6,chr12,+,GTCGAGCACATAGAGGGATC,CTGCTTCGAGCACATAGAGGGATCTGGGCC,100198282,100198285,100198304,0.009292,0.295731,0.580559,K562,Kox1
2,ACTR6_+_100592066,ACTR6,chr12,+,GCCTGCTTCGAGCACATAGA,GTCCACCTGCTTCGAGCACATAGAGGGATC,100198288,100198291,100198310,0.000734,0.214023,0.119582,K562,Kox1
3,ACTR6_+_100592067,ACTR6,chr12,+,GACCTGCTTCGAGCACATAG,GGTCCACCTGCTTCGAGCACATAGAGGGAT,100198289,100198292,100198311,-0.008256,0.131427,-0.346402,K562,Kox1
4,ACTR6_+_100592092,ACTR6,chr12,+,GAGACTCTGTCACCTCCGAG,GGGCTAGACTCTGTCACCTCCGAGGGGTCC,100198314,100198317,100198336,0.001314,0.165469,-0.154343,K562,Kox1


In [43]:
# re-assigns sgRNA "cut" site in Gilbert and Nunez Datasets to be consistent with CRISPick
def CRISPickCutSite(df, strand_col, point_col, context_seq_col):
    CutSiteList = []
    for index, row in df.iterrows():
        if row[strand_col] == '+':
            CutSiteList.append(row[point_col] + 7)
        elif row[strand_col] == '-':
            CutSiteList.append(row[point_col] - 4)
        else:
            print(row[context_seq_col] + f'strand is neither + nor -')
    return CutSiteList

In [44]:
geneListGilbertAndNunez = GilbertRicinCleanData['Target Gene Symbol'].unique().tolist() + NunezCleanData['Target Gene Symbol'].unique().tolist()
pd.DataFrame(geneListGilbertAndNunez).to_csv(file_path + 'geneListGilbertAndNunez.csv', index = False)

Run CRISPick providing this list of genes as input to obtain MANE Select TSS positions. Link to output directory: https://portals.broadinstitute.org/gppx/crispick/platform/results/1da3859a-902d-471a-b06b-7aa43da31abd.

In [45]:
CRISPick_geneListGilbertAndNunez = pd.read_table(file_path + 'geneListGilbertAndNunez-sgrna-designs.txt')
CRISPick_geneListGilbertAndNunez = CRISPick_geneListGilbertAndNunez[[ 
       'Target Gene Symbol',  'Reference Sequence', 'Strand of Target',
       'TSS Position']]

In [46]:
#reassign cut site for Gilbert dataset guides
GilbertRicinCleanDataCutSite = CRISPickCutSite(GilbertRicinCleanData,'strand targeted' ,'PAM genomic coordinate [hg38]', 'context seq')
GilbertRicinCleanData['CRISPick CutSite'] = GilbertRicinCleanDataCutSite
GilbertRicinCleanData = pd.merge(GilbertRicinCleanData,CRISPick_geneListGilbertAndNunez, 
                                                 on = 'Target Gene Symbol')
GilbertRicinCleanData.head()

Unnamed: 0,Target Gene Symbol,chromosome,strand targeted,PAM genomic coordinate [hg38],context seq,guide id,oligo,sgRNA sequence,zscoreRelativeNC,Avg_LFC,sgRNA sequence in Context,Target Gene Function,zscoreRelativeNC_signed,Avg_LFC_signed,condition,Domain,CRISPick CutSite,Reference Sequence,Strand of Target,TSS Position
0,ARCN1,chr11,+,118567638.0,GCACTTTCCTTGTTTACCTCTGGTAGGTTT,ARCN1_w_118438354.27,cccttggagaaCCAcctTGTTGGCACTTTCCTTGTTTACCTCTGGT...,GCACTTTCCTTGTTTACCTCTGGT,0.250358,-0.134928,True,Resistance,-0.250358,0.134928,K562,Kox1,118567645.0,NC_000011.10,+,118572410
1,ARCN1,chr11,+,118568026.0,GCAACCCTGTAAGCATGCTTCTTGAGGAGT,ARCN1_w_118438742.27,cccttggagaaCCAcctTGTTGGCAACCCTGTAAGCATGCTTCTTG...,GCAACCCTGTAAGCATGCTTCTTG,0.157641,-0.166789,True,Resistance,-0.157641,0.166789,K562,Kox1,118568033.0,NC_000011.10,+,118572410
2,ARCN1,chr11,+,118568080.0,GATGCTGCTTCTTCGGAAATATAACGGTAG,ARCN1_w_118438796.24,cccttggagaaCCAcctTGTTGGCTGCTTCTTCGGAAATATAAGTT...,GCTGCTTCTTCGGAAATATAA,0.21634,-0.146618,True,Resistance,-0.21634,0.146618,K562,Kox1,118568087.0,NC_000011.10,+,118572410
3,ARCN1,chr11,+,118568134.0,TAGCACCTTCCAGGGCTTCCAACTTGGATC,ARCN1_w_118438850.25,cccttggagaaCCAcctTGTTGGCACCTTCCAGGGCTTCCAACTGT...,GCACCTTCCAGGGCTTCCAACT,0.320467,-0.110835,True,Resistance,-0.320467,0.110835,K562,Kox1,118568141.0,NC_000011.10,+,118572410
4,ARCN1,chr11,+,118568146.0,ATTGCCTTATTGTAGCACCTTCCAGGGCTT,ARCN1_w_118438862.24,cccttggagaaCCAcctTGTTGGCCTTATTGTAGCACCTTCCAGTT...,GCCTTATTGTAGCACCTTCCA,-0.708019,-0.464269,True,Resistance,0.708019,0.464269,K562,Kox1,118568153.0,NC_000011.10,+,118572410


In [48]:
#calculate guide distance from MANE Select annotated TSS 
GilbertRicinCleanData["Distance to TSS"] = GilbertRicinCleanData.apply(
    lambda r:(r["CRISPick CutSite"] - r["TSS Position"]) if r["Strand of Target"] == "+" \
              else (r["TSS Position"] - r["CRISPick CutSite"]), axis=1)
GilbertRicinCleanData['Strand of sgRNA'] = GilbertRicinCleanData['strand targeted'].replace({'-':'+','+':'-'})
GilbertRicinCleanData = GilbertRicinCleanData[abs(GilbertRicinCleanData['Distance to TSS']) <= 5000]
GilbertRicinCleanData.head()

Unnamed: 0,Target Gene Symbol,chromosome,strand targeted,PAM genomic coordinate [hg38],context seq,guide id,oligo,sgRNA sequence,zscoreRelativeNC,Avg_LFC,...,zscoreRelativeNC_signed,Avg_LFC_signed,condition,Domain,CRISPick CutSite,Reference Sequence,Strand of Target,TSS Position,Distance to TSS,Strand of sgRNA
0,ARCN1,chr11,+,118567638.0,GCACTTTCCTTGTTTACCTCTGGTAGGTTT,ARCN1_w_118438354.27,cccttggagaaCCAcctTGTTGGCACTTTCCTTGTTTACCTCTGGT...,GCACTTTCCTTGTTTACCTCTGGT,0.250358,-0.134928,...,-0.250358,0.134928,K562,Kox1,118567645.0,NC_000011.10,+,118572410,-4765.0,-
1,ARCN1,chr11,+,118568026.0,GCAACCCTGTAAGCATGCTTCTTGAGGAGT,ARCN1_w_118438742.27,cccttggagaaCCAcctTGTTGGCAACCCTGTAAGCATGCTTCTTG...,GCAACCCTGTAAGCATGCTTCTTG,0.157641,-0.166789,...,-0.157641,0.166789,K562,Kox1,118568033.0,NC_000011.10,+,118572410,-4377.0,-
2,ARCN1,chr11,+,118568080.0,GATGCTGCTTCTTCGGAAATATAACGGTAG,ARCN1_w_118438796.24,cccttggagaaCCAcctTGTTGGCTGCTTCTTCGGAAATATAAGTT...,GCTGCTTCTTCGGAAATATAA,0.21634,-0.146618,...,-0.21634,0.146618,K562,Kox1,118568087.0,NC_000011.10,+,118572410,-4323.0,-
3,ARCN1,chr11,+,118568134.0,TAGCACCTTCCAGGGCTTCCAACTTGGATC,ARCN1_w_118438850.25,cccttggagaaCCAcctTGTTGGCACCTTCCAGGGCTTCCAACTGT...,GCACCTTCCAGGGCTTCCAACT,0.320467,-0.110835,...,-0.320467,0.110835,K562,Kox1,118568141.0,NC_000011.10,+,118572410,-4269.0,-
4,ARCN1,chr11,+,118568146.0,ATTGCCTTATTGTAGCACCTTCCAGGGCTT,ARCN1_w_118438862.24,cccttggagaaCCAcctTGTTGGCCTTATTGTAGCACCTTCCAGTT...,GCCTTATTGTAGCACCTTCCA,-0.708019,-0.464269,...,0.708019,0.464269,K562,Kox1,118568153.0,NC_000011.10,+,118572410,-4257.0,-


In [50]:
#reassign cut site for Nunez dataset guides
NunezCleanDataCutSite = CRISPickCutSite(NunezCleanData,'strand' ,'hg38_coord', 'context_seq')
NunezCleanData['CRISPick CutSite'] = NunezCleanDataCutSite
NunezCleanData = pd.merge(NunezCleanData, CRISPick_geneListGilbertAndNunez,
                          on= ['Target Gene Symbol'],how = 'inner')
NunezCleanData.head()

Unnamed: 0,ID,Target Gene Symbol,chromosome,strand,sequence,context_seq,hg38_coord,start_coord_hg38,end_coord_hg38,Phenotype scores-ave_Rep1_Rep2,Avg_LFC-Tfinal,z-score relative to NC,condition,Domain,CRISPick CutSite,Reference Sequence,Strand of Target,TSS Position
0,ACTR6_+_100592059,ACTR6,chr12,+,GCGAGCACATAGAGGGATCT,TGCTTCGAGCACATAGAGGGATCTGGGCCC,100198281,100198284,100198303,-0.018881,0.035104,-0.889828,K562,Kox1,100198288,NC_000012.12,+,100200816
1,ACTR6_+_100592060,ACTR6,chr12,+,GTCGAGCACATAGAGGGATC,CTGCTTCGAGCACATAGAGGGATCTGGGCC,100198282,100198285,100198304,0.009292,0.295731,0.580559,K562,Kox1,100198289,NC_000012.12,+,100200816
2,ACTR6_+_100592066,ACTR6,chr12,+,GCCTGCTTCGAGCACATAGA,GTCCACCTGCTTCGAGCACATAGAGGGATC,100198288,100198291,100198310,0.000734,0.214023,0.119582,K562,Kox1,100198295,NC_000012.12,+,100200816
3,ACTR6_+_100592067,ACTR6,chr12,+,GACCTGCTTCGAGCACATAG,GGTCCACCTGCTTCGAGCACATAGAGGGAT,100198289,100198292,100198311,-0.008256,0.131427,-0.346402,K562,Kox1,100198296,NC_000012.12,+,100200816
4,ACTR6_+_100592092,ACTR6,chr12,+,GAGACTCTGTCACCTCCGAG,GGGCTAGACTCTGTCACCTCCGAGGGGTCC,100198314,100198317,100198336,0.001314,0.165469,-0.154343,K562,Kox1,100198321,NC_000012.12,+,100200816


In [51]:
#calculate guide distance from MANE Select annotated TSS 
NunezCleanData["Distance to TSS"] = NunezCleanData.apply(
    lambda r:(r["CRISPick CutSite"] - r["TSS Position"]) if r["Strand of Target"] == "+" \
              else (r["TSS Position"] - r["CRISPick CutSite"]), axis=1)
NunezCleanData['Strand of sgRNA'] = NunezCleanData['strand'].replace({'-':'+','+':'-'})
NunezCleanData = NunezCleanData[abs(NunezCleanData['Distance to TSS'])<=5000]
NunezCleanData.head()

Unnamed: 0,ID,Target Gene Symbol,chromosome,strand,sequence,context_seq,hg38_coord,start_coord_hg38,end_coord_hg38,Phenotype scores-ave_Rep1_Rep2,Avg_LFC-Tfinal,z-score relative to NC,condition,Domain,CRISPick CutSite,Reference Sequence,Strand of Target,TSS Position,Distance to TSS,Strand of sgRNA
0,ACTR6_+_100592059,ACTR6,chr12,+,GCGAGCACATAGAGGGATCT,TGCTTCGAGCACATAGAGGGATCTGGGCCC,100198281,100198284,100198303,-0.018881,0.035104,-0.889828,K562,Kox1,100198288,NC_000012.12,+,100200816,-2528,-
1,ACTR6_+_100592060,ACTR6,chr12,+,GTCGAGCACATAGAGGGATC,CTGCTTCGAGCACATAGAGGGATCTGGGCC,100198282,100198285,100198304,0.009292,0.295731,0.580559,K562,Kox1,100198289,NC_000012.12,+,100200816,-2527,-
2,ACTR6_+_100592066,ACTR6,chr12,+,GCCTGCTTCGAGCACATAGA,GTCCACCTGCTTCGAGCACATAGAGGGATC,100198288,100198291,100198310,0.000734,0.214023,0.119582,K562,Kox1,100198295,NC_000012.12,+,100200816,-2521,-
3,ACTR6_+_100592067,ACTR6,chr12,+,GACCTGCTTCGAGCACATAG,GGTCCACCTGCTTCGAGCACATAGAGGGAT,100198289,100198292,100198311,-0.008256,0.131427,-0.346402,K562,Kox1,100198296,NC_000012.12,+,100200816,-2520,-
4,ACTR6_+_100592092,ACTR6,chr12,+,GAGACTCTGTCACCTCCGAG,GGGCTAGACTCTGTCACCTCCGAGGGGTCC,100198314,100198317,100198336,0.001314,0.165469,-0.154343,K562,Kox1,100198321,NC_000012.12,+,100200816,-2495,-


### Combine datasets together

In [52]:
GilbertRicinCleanData = deepcopy(GilbertRicinCleanData[['Target Gene Symbol','chromosome','sgRNA sequence', 'context seq', 
                                'CRISPick CutSite', 'Strand of Target', 'Strand of sgRNA', 'TSS Position', 'Distance to TSS',
                                'zscoreRelativeNC_signed', 'Avg_LFC_signed', 
                                'condition', 'Domain']])


GilbertRicinCleanData.rename(columns={
                                      'sgRNA sequence':'sgRNA Sequence', 
                                      'context seq':'sgRNA Context Sequence',
                                      'CRISPick CutSite': 'sgRNA \'Cut\' Position', 
                                      'Distance to TSS':'sgRNA \'Cut\' Site TSS Offset'
                                     }, inplace = True)

In [53]:
NunezCleanData = deepcopy(NunezCleanData[['Target Gene Symbol','chromosome','sequence', 
                                               'context_seq', 'CRISPick CutSite',
                                                       'Strand of Target', 'Strand of sgRNA', 'TSS Position', 'Distance to TSS',
                                                       'Phenotype scores-ave_Rep1_Rep2','Avg_LFC-Tfinal', 
                         'z-score relative to NC','condition','Domain']])

NunezCleanData.rename(columns={'sequence':'sgRNA Sequence', 
                                    'context_seq':'sgRNA Context Sequence',
                                   'CRISPick CutSite': 'sgRNA \'Cut\' Position', 
                                      'Distance to TSS':'sgRNA \'Cut\' Site TSS Offset'}, inplace = True)

In [54]:
InHouseCleanData = deepcopy(InHouseCleanData\
                                 [['Target Gene Symbol','chromosome','sgRNA Sequence', 
                                   'sgRNA Context Sequence','sgRNA \'Cut\' Position',
                                   'Strand of Target', 'Strand of sgRNA', 'TSS Position', 'sgRNA \'Cut\' Site TSS Offset',
                                                       'z_scored_avg_lfc',
                                                              'condition','Domain']])



In [55]:
NunezCleanData['DataSet'] = 'Nunez'
GilbertRicinCleanData['DataSet'] = 'Gilbert'
InHouseCleanData['DataSet'] = 'InHouse'

In [56]:
DatasetCombine = pd.concat([NunezCleanData, GilbertRicinCleanData, InHouseCleanData]).reset_index(drop = True)
DatasetCombine.head()

Unnamed: 0,Target Gene Symbol,chromosome,sgRNA Sequence,sgRNA Context Sequence,sgRNA 'Cut' Position,Strand of Target,Strand of sgRNA,TSS Position,sgRNA 'Cut' Site TSS Offset,Phenotype scores-ave_Rep1_Rep2,Avg_LFC-Tfinal,z-score relative to NC,condition,Domain,DataSet,zscoreRelativeNC_signed,Avg_LFC_signed,z_scored_avg_lfc
0,ACTR6,chr12,GCGAGCACATAGAGGGATCT,TGCTTCGAGCACATAGAGGGATCTGGGCCC,100198288.0,+,-,100200816.0,-2528.0,-0.018881,0.035104,-0.889828,K562,Kox1,Nunez,,,
1,ACTR6,chr12,GTCGAGCACATAGAGGGATC,CTGCTTCGAGCACATAGAGGGATCTGGGCC,100198289.0,+,-,100200816.0,-2527.0,0.009292,0.295731,0.580559,K562,Kox1,Nunez,,,
2,ACTR6,chr12,GCCTGCTTCGAGCACATAGA,GTCCACCTGCTTCGAGCACATAGAGGGATC,100198295.0,+,-,100200816.0,-2521.0,0.000734,0.214023,0.119582,K562,Kox1,Nunez,,,
3,ACTR6,chr12,GACCTGCTTCGAGCACATAG,GGTCCACCTGCTTCGAGCACATAGAGGGAT,100198296.0,+,-,100200816.0,-2520.0,-0.008256,0.131427,-0.346402,K562,Kox1,Nunez,,,
4,ACTR6,chr12,GAGACTCTGTCACCTCCGAG,GGGCTAGACTCTGTCACCTCCGAGGGGTCC,100198321.0,+,-,100200816.0,-2495.0,0.001314,0.165469,-0.154343,K562,Kox1,Nunez,,,


In [57]:
#how many context sequences appear across multiple datasets?
DatasetCombine.groupby('sgRNA Context Sequence')['DataSet'].nunique().value_counts()

1    204895
2      9728
3       364
Name: DataSet, dtype: int64

In [58]:
# looking specifically context sequences present in both in-house and other datasets to verify that cut position and cut site offset
    #were calculated correctly
datasets_per_seq = DatasetCombine.groupby('sgRNA Context Sequence')['DataSet'].nunique()
multi_dataset_seqs = datasets_per_seq.index[datasets_per_seq > 1]
multi_dataset_seq_df = DatasetCombine[DatasetCombine['sgRNA Context Sequence'].isin(multi_dataset_seqs)]
multi_dataset_seq_df = multi_dataset_seq_df[['Target Gene Symbol','sgRNA Context Sequence', 'Strand of Target','sgRNA \'Cut\' Site TSS Offset', 'DataSet']].drop_duplicates()
multi_dataset_seq_df.head()

Unnamed: 0,Target Gene Symbol,sgRNA Context Sequence,Strand of Target,sgRNA 'Cut' Site TSS Offset,DataSet
3047,AURKB,AATTTGGAATTAATGACTAAATTGAGGTAA,-,826.0,Nunez
3048,AURKB,CCAGGGCAAGTCTAAATTACAATTTGGAAT,-,806.0,Nunez
3049,AURKB,AAGGATATAAAGTTACAGTTTAAAAGGCCC,-,778.0,Nunez
3050,AURKB,ATGATATCTGGGACTGAGCAGGCAGGGCCT,-,674.0,Nunez
3051,AURKB,AATGATATCTGGGACTGAGCAGGCAGGGCC,-,673.0,Nunez


In [59]:
pivoted_df = multi_dataset_seq_df.pivot(index = ['sgRNA Context Sequence','Target Gene Symbol','Strand of Target'],
                                        columns = 'DataSet',
                                        values = 'sgRNA \'Cut\' Site TSS Offset').reset_index()
pivoted_df

DataSet,sgRNA Context Sequence,Target Gene Symbol,Strand of Target,Gilbert,InHouse,Nunez
0,AAAAAAAATTCTGCAGGAGGCAATAGGTTT,DDX18,+,,935.0,935.0
1,AAAAAAACGTTAAAGAAATTGTAAAGGAGA,DNTTIP2,-,,-561.0,-562.0
2,AAAAAATATATGCCATTTACCCCTCGGGAA,HEATR1,-,,-554.0,-555.0
3,AAAAAATTCCTGGAAAACCCAGGTAGGACT,METTL17,+,,617.0,617.0
4,AAAAAATTGAGGTCCTGAGTTAAGTGGCTG,MRPS25,-,,-423.0,-424.0
...,...,...,...,...,...,...
11159,TTTTTCATTGTTAAACCTGAAGACGGGGCA,TTK,+,,493.0,493.0
11160,TTTTTGGGCAGCACGGCTCCCATTCGGAAC,DNAJA3,+,,-887.0,-887.0
11161,TTTTTGTCGTAATCCTATTGAGTTCGGGTC,SNRPF,+,,-752.0,-752.0
11162,TTTTTTGCTCGCAGAAACAATACGTGGTCA,POP5,-,,-535.0,-536.0


It appears at first glance that the calculated TSS offset for the Nunez dataset is one off from the correct TSS offset for the in-house dataset (that was derived from CRISPick) ONLY when the guide is on the negative strand. Check whether this hypothesis is correct.

In [60]:
pivoted_df['InHouse - Nunez'] = pivoted_df['InHouse'] - pivoted_df['Nunez']
pivoted_df['InHouse - Gilbert'] = pivoted_df['InHouse'] - pivoted_df['Gilbert']
pivoted_df[['Strand of Target','InHouse - Nunez','InHouse - Gilbert']].value_counts()

Strand of Target  InHouse - Nunez  InHouse - Gilbert
+                 0.0              0.0                  277
-                 1.0              1.0                   87
dtype: int64

The calculated cut site is indeed one off for negative strand targeting guides.

In [61]:
#correct this offset
DatasetCombine.loc[(DatasetCombine['DataSet'].isin(['Nunez', 'Gilbert'])) & (DatasetCombine['Strand of Target'] == '-'), 
                      'sgRNA \'Cut\' Site TSS Offset'] += 1

#look at an example to verify whether it worked
DatasetCombine[DatasetCombine['sgRNA Context Sequence'] == 'AAAAAATTGAGGTCCTGAGTTAAGTGGCTG']

Unnamed: 0,Target Gene Symbol,chromosome,sgRNA Sequence,sgRNA Context Sequence,sgRNA 'Cut' Position,Strand of Target,Strand of sgRNA,TSS Position,sgRNA 'Cut' Site TSS Offset,Phenotype scores-ave_Rep1_Rep2,Avg_LFC-Tfinal,z-score relative to NC,condition,Domain,DataSet,zscoreRelativeNC_signed,Avg_LFC_signed,z_scored_avg_lfc
47174,MRPS25,chr3,GATTGAGGTCCTGAGTTAAG,AAAAAATTGAGGTCCTGAGTTAAGTGGCTG,15065739.0,-,-,15065315.0,-423.0,-0.019317,0.026164,-0.940264,K562,Kox1,Nunez,,,
142713,MRPS25,3.0,AATTGAGGTCCTGAGTTAAG,AAAAAATTGAGGTCCTGAGTTAAGTGGCTG,15065739.0,-,-,15065315.0,-423.0,,,,A549,Kox1,InHouse,,,-0.402571
142714,MRPS25,3.0,AATTGAGGTCCTGAGTTAAG,AAAAAATTGAGGTCCTGAGTTAAGTGGCTG,15065739.0,-,-,15065315.0,-423.0,,,,A549,Zim3,InHouse,,,-0.761167
142715,MRPS25,3.0,AATTGAGGTCCTGAGTTAAG,AAAAAATTGAGGTCCTGAGTTAAGTGGCTG,15065739.0,-,-,15065315.0,-423.0,,,,HCT116,Kox1,InHouse,,,0.075731
142716,MRPS25,3.0,AATTGAGGTCCTGAGTTAAG,AAAAAATTGAGGTCCTGAGTTAAGTGGCTG,15065739.0,-,-,15065315.0,-423.0,,,,HCT116,Zim3,InHouse,,,-0.764584


### Remove overlapping genes
Guide RNAs which may target protein coding genes other than their intended target have the potential to confound analyses. Since the majority of guides within the combined tiling datasets are located within 1kb of the MANE Select TSS of their target gene, we remove all genes with any GENCODE annotated TSS within 1kb of any other GENCODE annotated TSS corresponding to a different gene.

In [62]:
# remove overlapping genes
overlapping_gene = pd.read_csv('../../Data/geneOverlap/GW_ensembl_protein_coding_df_1kb.csv')
overlapping_gene_2_3 =overlapping_gene[overlapping_gene['overlap type'].isin([2,3])]
gene_2_3_unique = set(list(overlapping_gene_2_3['Selected Gene name']) +\
list(overlapping_gene_2_3['Overlapping Gene name']))
NewDatasetCombine = deepcopy(DatasetCombine[~DatasetCombine['Target Gene Symbol'].isin(gene_2_3_unique)])

In [63]:
# did removing overlapping genes remove all multi-target guides?
conseq_with_varied_gene = NewDatasetCombine.groupby('sgRNA Context Sequence').filter(lambda NewDatasetCombine: NewDatasetCombine['Target Gene Symbol'].nunique() > 1)
#if yes, should be 0
len(conseq_with_varied_gene)

0

In [64]:
NewDatasetCombine['sgRNA Context Sequence'].nunique()

166624

In [65]:
len(NewDatasetCombine)

431095

In [66]:
#standardizes syntax with which chromosomes are labeled across datasets
def standardize_chromosome(chromosome):
    chromosome = str(chromosome).replace('.0', '')
    if not chromosome.startswith('chr'):
        chromosome = 'chr' + chromosome
    return chromosome

In [67]:
NewDatasetCombine['chromosome'] = NewDatasetCombine['chromosome'].apply(standardize_chromosome)

In [68]:
#define list of essential genes in in-house datasets
EssGene = ess_noness[ess_noness.Category == 'Essential']['Target Gene Symbol'].unique()

In [69]:
#remove non-essential genes
InhouseEss = NewDatasetCombine[(NewDatasetCombine.DataSet == 'InHouse') & (NewDatasetCombine['Target Gene Symbol'].isin(EssGene))]
other = NewDatasetCombine[~(NewDatasetCombine.DataSet == 'InHouse')]
newData = pd.concat([InhouseEss,other])
newData.value_counts(['DataSet', 'condition', 'Domain'])

DataSet  condition  Domain
Nunez    K562       Kox1      68575
InHouse  A549       Zim3      42609
         HCT116     Kox1      42609
         A549       Kox1      42570
         HCT116     Zim3      42544
Gilbert  K562       Kox1      18509
dtype: int64

In [70]:
#check for any Us in context sequences
any('u' in seq for seq in newData['sgRNA Context Sequence'])

False

In [71]:
any('U' in seq for seq in newData['sgRNA Context Sequence'])

False

In [72]:
#convert entire sequence to uppercase
def to_upper_case(seq_list):
    return ''.join([seq.upper() for seq in seq_list])

#convert all context sequences to all uppercase letters
newData['sgRNA Context Sequence'] = newData['sgRNA Context Sequence'].apply(to_upper_case)

In [76]:
#calculate Rule Set 3 Sequence Scores for every context sequence with Chen tracrRNA
newData['rs3ChenSeqScore'] =  predict_seq(newData['sgRNA Context Sequence'], sequence_tracr='Chen2013')

Calculating sequence-based features


100%|██████████████████████████████████| 257416/257416 [07:07<00:00, 602.50it/s]


In [77]:
#create column which uniquely identifies the 6 datasets
newData['cellDomainDataSet'] = newData[['condition', 'Domain', 'DataSet']].apply(
    lambda x: "_".join(x.astype(str)), axis=1)

In [78]:
#create one column for z-score regardless of which dataset it comes from / how z-score was calcualted
newData['z_score'] = newData['z_scored_avg_lfc'].combine_first(newData['zscoreRelativeNC_signed']).combine_first(newData['z-score relative to NC'])

In [80]:
#sign adjust all z-scores such that more positive indicates higher activity
newData['signed_zscore'] = np.where(newData['DataSet'] == 'Gilbert', newData['z_score'], -newData['z_score'])
newData

Unnamed: 0,Target Gene Symbol,chromosome,sgRNA Sequence,sgRNA Context Sequence,sgRNA 'Cut' Position,Strand of Target,Strand of sgRNA,TSS Position,sgRNA 'Cut' Site TSS Offset,Phenotype scores-ave_Rep1_Rep2,...,condition,Domain,DataSet,zscoreRelativeNC_signed,Avg_LFC_signed,z_scored_avg_lfc,rs3ChenSeqScore,cellDomainDataSet,z_score,signed_zscore
118978,LARS2,chr3,AAAAAAAAAACAGGACACAG,CAGAAAAAAAAAAACAGGACACAGGGGAGA,45389244.0,+,-,45388577.0,667.0,,...,A549,Kox1,InHouse,,,-0.398339,0.013999,A549_Kox1_InHouse,-0.398339,0.398339
118979,LARS2,chr3,AAAAAAAAAACAGGACACAG,CAGAAAAAAAAAAACAGGACACAGGGGAGA,45389244.0,+,-,45388577.0,667.0,,...,A549,Zim3,InHouse,,,-0.841793,0.013999,A549_Zim3_InHouse,-0.841793,0.841793
118980,LARS2,chr3,AAAAAAAAAACAGGACACAG,CAGAAAAAAAAAAACAGGACACAGGGGAGA,45389244.0,+,-,45388577.0,667.0,,...,HCT116,Zim3,InHouse,,,-0.992358,0.013999,HCT116_Zim3_InHouse,-0.992358,0.992358
118985,COQ2,chr4,AAAAAAAAAAGGGCACCAAG,ATAAAAAAAAAAAAGGGCACCAAGTGGCCA,83285687.0,-,+,83284798.0,-888.0,,...,A549,Kox1,InHouse,,,0.044728,-0.401549,A549_Kox1_InHouse,0.044728,-0.044728
118986,COQ2,chr4,AAAAAAAAAAGGGCACCAAG,ATAAAAAAAAAAAAGGGCACCAAGTGGCCA,83285687.0,-,+,83284798.0,-888.0,,...,A549,Zim3,InHouse,,,-0.190308,-0.401549,A549_Zim3_InHouse,-0.190308,0.190308
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
118618,WDR11,chr10,gctaggacctctactacaatgaa,GGCTAGGACCTCTACTACAATGAATGGGAG,120855890.0,+,+,120851363.0,4527.0,,...,K562,Kox1,Gilbert,0.387317,0.354061,,0.382810,K562_Kox1_Gilbert,0.387317,0.387317
118619,WDR11,chr10,gctaggacctctactacaatgaat,GCTAGGACCTCTACTACAATGAATGGGAGA,120855891.0,+,+,120851363.0,4528.0,,...,K562,Kox1,Gilbert,-0.782858,-0.048063,,0.654478,K562_Kox1_Gilbert,-0.782858,-0.782858
118620,WDR11,chr10,gtctttcatcaagtctgatgta,CAGTCTTTCATCAAGTCTGATGTAAGGTAT,120855971.0,+,+,120851363.0,4608.0,,...,K562,Kox1,Gilbert,1.429277,0.712125,,0.076554,K562_Kox1_Gilbert,1.429277,1.429277
118621,WDR11,chr10,gtaggtccaagtttcagtc,TTTATGTAGGTCCAAGTTTCAGTCTGGTAT,120856094.0,+,+,120851363.0,4731.0,,...,K562,Kox1,Gilbert,0.553328,0.411110,,0.717680,K562_Kox1_Gilbert,0.553328,0.553328


In [82]:
out_path = '../../Data/cleanedData/'
newData.to_csv(out_path + 'CombinedCleanedDatasets.csv', index = False)