In [19]:
import pandas as pd
from tqdm import tqdm
import re

In [20]:
crispr_t4_df= pd.read_csv('../data/Supplementary_table4.csv')
crispr_t4_df.head(5)

Unnamed: 0,LncRNA_family_ID,Transcript_ID,evolutionary_conserved?
0,human_lncrna_fused_1,URS00008B37EC_9606_1%URS00008B37EC_9606_2%URS0...,no
1,human_lncrna_fused_2,ENSG00000239945_ENST00000495576%URS000003B6E2_...,yes
2,human_lncrna_fused_3,URS00008B5770_9606_1%URS00008B5770_9606_2,no
3,human_lncrna_fused_4,URS000000192A_9606_1%URS000000192A_9606_2%URS0...,no
4,human_lncrna_fused_5,URS0000162D42_9606_1%URS0000162D42_9606_2%URS0...,no


In [21]:
lnc_tard_df = pd.read_csv('../data/lnc_tard_selected_clean.csv')
lnc_tard_df.head(5)

Unnamed: 0,Regulator,Target,RegulatorEnsembleID,TargetEnsembleID,diseaseCategory,DiseaseName2,RegulationDiretion,ExpressionPattern,RegulatorType,TargetType,Target_cleaned,Regulator_cleaned,Regulator_cleaned2,Target_cleaned2
0,LINC00313,miR-4429,ENSG00000185186,ENSG00000264010,Cancer,Thyroid cancer,negatively-F,upregulation,lncRNA,miRNA,miR-4429,LINC00313,LINC00313,miR-4429
1,FAM83H-AS1,CDKN1A,ENSG00000203499,ENSG00000124762,Cancer,Brain glioma,negatively-E,upregulation,lncRNA,PCG,CDKN1A,FAM83H-AS1,FAM83H-AS1,CDKN1A
2,NEAT1,TGFB1,ENSG00000245532,ENSG00000105329,Cancer,Liver cancer,positively-E,upregulation,lncRNA,PCG,TGFB1,NEAT1,NEAT1,TGFB1
3,NEAT1,ZEB1,ENSG00000245532,ENSG00000148516,Cancer,Breast cancer,positively-E,upregulation,lncRNA,TF,ZEB1,NEAT1,NEAT1,ZEB1
4,ZFPM2-AS1,MIF,ENSG00000251003,ENSG00000240972,Cancer,Gastric cancer,positively-F,upregulation,lncRNA,PCG,MIF,ZFPM2-AS1,ZFPM2-AS1,MIF


In [22]:
len(lnc_tard_df['RegulatorEnsembleID'].unique())

1211

In [23]:
urs_to_ensg_map_df = pd.read_csv('../data/ensembl.tsv', sep='\t', header=None, names=['URS ID', 'ENS','ENST','Species','type','ENSG ID'])
print(f"number of rows in map from URS IDs to ENSG IDs: {len(urs_to_ensg_map_df)}")
urs_to_ensg_map_df.head(5)

number of rows in map from URS IDs to ENSG IDs: 1953212


Unnamed: 0,URS ID,ENS,ENST,Species,type,ENSG ID
0,URS0000000055,ENSEMBL,ENST00000585414,9606,lncRNA,ENSG00000226803.10
1,URS00000000FD,ENSEMBL,ENST00000448543,9606,lncRNA,ENSG00000234279.3
2,URS0000000344,ENSEMBL,ENST00000633884,9606,lncRNA,ENSG00000282594.1
3,URS0000000351,ENSEMBL,ENST00000452009,9606,lncRNA,ENSG00000235427.1
4,URS00000005D1,ENSEMBL,ENST00000563639,9606,lncRNA,ENSG00000260457.2


In [24]:
urs_to_ensg_map_df=urs_to_ensg_map_df[(urs_to_ensg_map_df['Species']==9606)] # select only human
print(f"number of rows in map from URS IDs to ENSG IDs for human genes: {len(urs_to_ensg_map_df)}")

number of rows in map from URS IDs to ENSG IDs for human genes: 193386


In [25]:
ensg_regex = r'ENSG\d{11}' # ENSG IDs regex: ENSG followed by 11 digits
urs_regex = r'URS[0-9A-F]{10}' # URS IDs regex: URS followed by 10 digits

In [26]:
# clean up ENSG IDs by removing version numbers
urs_to_ensg_map_df['ENSG ID'] = urs_to_ensg_map_df['ENSG ID'].str.extract(f'({ensg_regex})', expand=False)
urs_to_ensg_map_df.head(5)

Unnamed: 0,URS ID,ENS,ENST,Species,type,ENSG ID
0,URS0000000055,ENSEMBL,ENST00000585414,9606,lncRNA,ENSG00000226803
1,URS00000000FD,ENSEMBL,ENST00000448543,9606,lncRNA,ENSG00000234279
2,URS0000000344,ENSEMBL,ENST00000633884,9606,lncRNA,ENSG00000282594
3,URS0000000351,ENSEMBL,ENST00000452009,9606,lncRNA,ENSG00000235427
4,URS00000005D1,ENSEMBL,ENST00000563639,9606,lncRNA,ENSG00000260457


In [27]:
print(f"unique ENSG IDs in Ensemble table: {len(urs_to_ensg_map_df['ENSG ID'].unique())}")

unique ENSG IDs in Ensemble table: 41780


In [28]:
# extract URS and ENSG IDs from Transcript_ID column and put in new columns
crispr_t4_df['Transcript_ID_URS'] = crispr_t4_df['Transcript_ID'].apply(lambda x: set(re.findall(urs_regex, x)))
crispr_t4_df['Transcript_ID_ENSG'] = crispr_t4_df['Transcript_ID'].apply(lambda x: set(re.findall(ensg_regex, x)))

# view result
crispr_t4_df[['LncRNA_family_ID', 'Transcript_ID', 'Transcript_ID_URS', 'Transcript_ID_ENSG']]


Unnamed: 0,LncRNA_family_ID,Transcript_ID,Transcript_ID_URS,Transcript_ID_ENSG
0,human_lncrna_fused_1,URS00008B37EC_9606_1%URS00008B37EC_9606_2%URS0...,"{URS00008C1914, URS00008C0037, URS00008B37EC, ...",{}
1,human_lncrna_fused_2,ENSG00000239945_ENST00000495576%URS000003B6E2_...,"{URS0000AACE4A, URS00009C129F, URS0001BE41FB, ...","{ENSG00000239945, ENSG00000228327}"
2,human_lncrna_fused_3,URS00008B5770_9606_1%URS00008B5770_9606_2,{URS00008B5770},{}
3,human_lncrna_fused_4,URS000000192A_9606_1%URS000000192A_9606_2%URS0...,"{URS0000268115, URS0001BE2E04, URS0001BE0A3D, ...",{}
4,human_lncrna_fused_5,URS0000162D42_9606_1%URS0000162D42_9606_2%URS0...,"{URS0000311D9A, URS0000162D42, URS00002D7B9C}",{}
...,...,...,...,...
97812,human_lncrna_fused_97813,URS00008B4781_9606_1%URS00008B4781_9606_2,{URS00008B4781},{}
97813,human_lncrna_fused_97814,URS00008BA232_9606_1%URS00008BA232_9606_2,{URS00008BA232},{}
97814,human_lncrna_fused_97815,URS0001BDF20A_9606_2,{URS0001BDF20A},{}
97815,human_lncrna_fused_97816,URS00008B8C46_9606_3,{URS00008B8C46},{}


In [29]:
# get all unique URS IDs from column Transcript_ID_URS in crispr_t4_df
crispr_t4_unique_urs_df = set(
    urs
    for urs_set in crispr_t4_df['Transcript_ID_URS']
    for urs in urs_set
)

# get URS IDs from Ensemble mapping
reference_urs_set = set(urs_to_ensg_map_df['URS ID'].dropna().unique())

# get matches and non-matches
matched_urs = crispr_t4_unique_urs_df & reference_urs_set
unmatched_urs = crispr_t4_unique_urs_df - reference_urs_set

print(f"Total unique URS IDs in CRISPR dataset: {len(crispr_t4_unique_urs_df)}")
print(f"Matched URS IDs: {len(matched_urs)}")
print(f"Unmatched URS IDs: {len(unmatched_urs)}")

Total unique URS IDs in CRISPR dataset: 308376
Matched URS IDs: 33586
Unmatched URS IDs: 274790


In [30]:
urs_to_ensg = dict(
    zip(
        urs_to_ensg_map_df['URS ID'].dropna(),
        urs_to_ensg_map_df['ENSG ID'].dropna()
    )
)

crispr_t4_df['mapped ENSG'] = crispr_t4_df['Transcript_ID_URS'].apply(
    lambda urs_set: [urs_to_ensg[urs] for urs in urs_set if urs in urs_to_ensg]
)

crispr_t4_df

Unnamed: 0,LncRNA_family_ID,Transcript_ID,evolutionary_conserved?,Transcript_ID_URS,Transcript_ID_ENSG,mapped ENSG
0,human_lncrna_fused_1,URS00008B37EC_9606_1%URS00008B37EC_9606_2%URS0...,no,"{URS00008C1914, URS00008C0037, URS00008B37EC, ...",{},[]
1,human_lncrna_fused_2,ENSG00000239945_ENST00000495576%URS000003B6E2_...,yes,"{URS0000AACE4A, URS00009C129F, URS0001BE41FB, ...","{ENSG00000239945, ENSG00000228327}","[ENSG00000241860, ENSG00000225880, ENSG0000022..."
2,human_lncrna_fused_3,URS00008B5770_9606_1%URS00008B5770_9606_2,no,{URS00008B5770},{},[]
3,human_lncrna_fused_4,URS000000192A_9606_1%URS000000192A_9606_2%URS0...,no,"{URS0000268115, URS0001BE2E04, URS0001BE0A3D, ...",{},"[ENSG00000243485, ENSG00000243485]"
4,human_lncrna_fused_5,URS0000162D42_9606_1%URS0000162D42_9606_2%URS0...,no,"{URS0000311D9A, URS0000162D42, URS00002D7B9C}",{},[ENSG00000237613]
...,...,...,...,...,...,...
97812,human_lncrna_fused_97813,URS00008B4781_9606_1%URS00008B4781_9606_2,no,{URS00008B4781},{},[]
97813,human_lncrna_fused_97814,URS00008BA232_9606_1%URS00008BA232_9606_2,no,{URS00008BA232},{},[]
97814,human_lncrna_fused_97815,URS0001BDF20A_9606_2,no,{URS0001BDF20A},{},[]
97815,human_lncrna_fused_97816,URS00008B8C46_9606_3,no,{URS00008B8C46},{},[]


In [31]:
# number of matches from URS IDs to ENSG IDs
num_matched_none = crispr_t4_df['mapped ENSG'].apply(lambda x: len(x) == 0).sum()
num_matched_once = crispr_t4_df['mapped ENSG'].apply(lambda x: len(x) == 1).sum()
num_matched_multiple = crispr_t4_df['mapped ENSG'].apply(lambda x: len(x) > 1).sum()
print(f"Number of rows with empty matched ENSG: {num_matched_none}")
print(f"Number of rows with one matched ENSG: {num_matched_once}")
print(f"Number of rows with multiple matched ENSG: {num_matched_multiple}")

Number of rows with empty matched ENSG: 85716
Number of rows with one matched ENSG: 8143
Number of rows with multiple matched ENSG: 3958


In [43]:
# create set of unique ENSG IDs from lnc_tard_df
lnc_targets_unique_ensg_ids = set(lnc_tard_df['TargetEnsembleID'].dropna().unique())
lnc_regulators_unique_ensg_ids = set(lnc_tard_df['RegulatorEnsembleID'].dropna().unique())
lnc_tard_all_unique_ensg_ids = lnc_targets_unique_ensg_ids.union(lnc_regulators_unique_ensg_ids)

In [44]:
# create set of unique ENSG IDs from crispr_t4_df with one matched ENSG ID
crispr_t4_single_matched_ensg_ids_df = crispr_t4_df[crispr_t4_df['mapped ENSG'].apply(lambda x: len(x) == 1)]
crispr_t4_single_unique_ensgs_df = set(
    ensg_set[0] for ensg_set in crispr_t4_single_matched_ensg_ids_df['mapped ENSG']
)

matched_single_unique_ensgs = crispr_t4_single_unique_ensgs_df & lnc_tard_all_unique_ensg_ids
unmatched_single_unique_ensgs = crispr_t4_single_unique_ensgs_df - lnc_tard_all_unique_ensg_ids

print(f"Total unique ENSG IDs from mapped list: {len(crispr_t4_single_unique_ensgs_df)}")
print(f"Single match, ENSG IDs in lncTARD dataset: {len(matched_single_unique_ensgs)}")
print(f"Unmatched ENSG IDs: {len(unmatched_single_unique_ensgs)}")


Total unique ENSG IDs from mapped list: 7786
Single match, ENSG IDs in lncTARD dataset: 207
Unmatched ENSG IDs: 7579


In [45]:
# create set of unique ENSG IDs from crispr_t4_df with more than one ENSG IDs
crispr_t4_multiple_matched_ensg_ids_df = crispr_t4_df[crispr_t4_df['mapped ENSG'].apply(lambda x: isinstance(x, list) and len(x) > 1)]
crispr_t4_multiple_unique_ensgs_df = set(
    ensg_set[0] for ensg_set in crispr_t4_multiple_matched_ensg_ids_df['mapped ENSG']
)

matched_multi_unique_ensgs = crispr_t4_multiple_unique_ensgs_df & lnc_tard_all_unique_ensg_ids
unmatched_multi_unique_ensgs = crispr_t4_multiple_unique_ensgs_df - lnc_tard_all_unique_ensg_ids

print(f"Total unique ENSG IDs in multi-mapped rows: {len(crispr_t4_multiple_unique_ensgs_df)}")
print(f"Matched ENSG IDs in df_lnc: {len(matched_multi_unique_ensgs)}")
print(f"Unmatched ENSG IDs: {len(unmatched_multi_unique_ensgs)}")


Total unique ENSG IDs in multi-mapped rows: 3782
Matched ENSG IDs in df_lnc: 383
Unmatched ENSG IDs: 3399


In [48]:
matched_ensg_ids = matched_single_unique_ensgs.union(matched_multi_unique_ensgs)
unmatched_ensg_ids = unmatched_single_unique_ensgs.union(unmatched_multi_unique_ensgs)

print(f"Matched ENSG IDs in lncTARD: {len(matched_ensg_ids)}")
print(f"Unmatched ENSG IDs: {len(unmatched_ensg_ids)}")

Matched ENSG IDs in lncTARD: 536
Unmatched ENSG IDs: 10636
