In [None]:
# i pip intsalld pip install camelot-py[cv]
# for this but it installed a version of numpy that I can't use with pinder so in fugure will have to choose between these
# before i had 1.26.4

"ChiPPI differs from current methods in that most methods utilize a simple ‘unification’ of the PPI networks of the parental proteins, but are not able to accurately evaluate ‘missing’ and ‘preserved’ interactions that result from the domains of a fusion (27,32). Nevertheless, ChiPPI finds ‘missing’ and ‘preserved’ interactors of a fusion based on the preserved protein domains of the parental proteins. We applied ChiPPI to the analysis of 11 528 fusions from ChiTaRS-3.1 (37), accurately mapping alterations in network properties that delineate the fusion protein network from parental networks."

In [19]:
import numpy as np
import camelot
import pandas as pd
import os

In [11]:
def harmonize_nulls_to_nan(df: pd.DataFrame, *, also_blank_strings=True, keep_datetime=False) -> pd.DataFrame:
    out = df.copy()

    # 1) Convert common sentinels to real missing
    if also_blank_strings:
        out = out.replace({"": pd.NA, "None": pd.NA, "nan": pd.NA})

    # 2) Normalize to pandas NA first (unifies None/NaN/<NA>)
    out = out.convert_dtypes()

    # 3) Cast extension dtypes -> object so np.nan can live there.
    for c in out.columns:
        dt = out[c].dtype
        is_ext = isinstance(dt, pd.api.extensions.ExtensionDtype)
        if keep_datetime and pd.api.types.is_datetime64_any_dtype(dt):
            # keep datetimes as datetime64 with NaT
            continue
        if is_ext:
            out[c] = out[c].astype(object)

    # 4) Finally: make ALL missings = np.nan
    out = out.where(~out.isna(), np.nan)

    return out

In [12]:
pdf_path = "gkx423_supp_s9only.pdf"

# 1) Read all tables on all pages using lattice mode
tables = camelot.read_pdf(
    pdf_path,
    pages="all",
    flavor="lattice",   # uses ruling lines (good for this PDF)
    strip_text="\n"     # helps with multi-line cells
)

print(f"Found {len(tables)} tables")

# 2) Concatenate all pages into one big DataFrame
dfs = [t.df for t in tables]

# The first row of the first table will likely be the header
#header = dfs[0].iloc[0].tolist()
header = ["Fusion ID","Gene1","Gene2","Gene1 Nodes","Gene1 Edges","Gene2 Nodes","Gene2 Edges","Fusion Nodes","Fusion Edges","Number","Missing Interactors"]
dfs_clean = []

for i, df in enumerate(dfs):
    # Drop the header row inside each page
    df.columns = header
    dfs_clean.append(df)

full_df = pd.concat(dfs_clean, ignore_index=True)

import pandas as pd

# ... your code up through full_df definition ...
full_df = full_df.replace("", pd.NA)
full_df = harmonize_nulls_to_nan(full_df)

def merge_missing_interactor_overflow(df, missing_col="Missing Interactors"):
    """
    Merge rows where all non-missing_interactors columns are NA
    into the closest previous row that has non-NA values in every
    other column.

    Assumes the overflow rows only carry extra text in `missing_col`.
    """
    non_missing_cols = [c for c in df.columns if c != missing_col]

    anchor_idx = None          # index of last "full" row
    rows_to_drop = []

    for idx, row in df.iterrows():
        non_missing = row[non_missing_cols]
        missing_val = row[missing_col]

        # A "full" row: all non-missing-interactor columns have info
        if non_missing.notna().all():
            anchor_idx = idx

        else:
            # Overflow row: all non-missing-interactor columns NA,
            # but missing_col has text
            if non_missing.isna().all() and pd.notna(missing_val) and anchor_idx is not None:
                existing = df.at[anchor_idx, missing_col]

                if pd.isna(existing):
                    df.at[anchor_idx, missing_col] = str(missing_val)
                else:
                    # add a space or newline between chunks as you prefer
                    df.at[anchor_idx, missing_col] = (
                        str(existing).rstrip() + " " + str(missing_val).lstrip()
                    )

                rows_to_drop.append(idx)
            else:
                # row with partial info (shouldn't really happen here, but
                # if it does, don't treat it as overflow)
                pass

    df = df.drop(index=rows_to_drop).reset_index(drop=True)
    return df

full_df = merge_missing_interactor_overflow(full_df)

# Now you can safely drop fully empty rows (if any remain)
full_df = full_df.dropna(how="all")

display(full_df.head())
print(full_df.columns)
print(len(full_df))


Found 25 tables


Unnamed: 0,Fusion ID,Gene1,Gene2,Gene1 Nodes,Gene1 Edges,Gene2 Nodes,Gene2 Edges,Fusion Nodes,Fusion Edges,Number,Missing Interactors
0,EU216071,BCR,ABL1,51,234,127,843,108,597,48,"PCNA, HSPD1, PAG1, GPX1, CDKN1B, RAD9A, IL3, P..."
1,AJ298917,FGFR1,BCR,27,75,52,258,44,188,34,"HSPD1, SHB, IL3, GAB2, HOXA9, ERCC3, FGF1, FGF..."
2,Z35761,ETV6,ABL1,27,85,128,867,83,452,68,"PCNA, HSPD1, SIN3A, CDKN1B, PSMA4, MTOR, GAB2,..."
3,EF428110,PRKAR1A,RARA,65,206,106,570,64,204,104,"MMS19, PRDX6, PLEKHF2, SET, PRPF31, TOP2B, WTI..."
4,U41743,NPM1,RARA,355,4318,105,554,172,986,274,"MMS19, RPL10, ELMO2, NSUN2, PPP1CC, SMARCD3, E..."


Index(['Fusion ID', 'Gene1', 'Gene2', 'Gene1 Nodes', 'Gene1 Edges',
       'Gene2 Nodes', 'Gene2 Edges', 'Fusion Nodes', 'Fusion Edges', 'Number',
       'Missing Interactors'],
      dtype='object')
61


In [29]:
col = "Missing Interactors"  # adjust to whatever header is

# Turn multi-line comma-separated text into *lists* of strings
def parse_interactors(cell):
    if pd.isna(cell):
        return []
    # Normalize whitespace/newlines
    text = " ".join(str(cell).split())
    # Split on commas
    items = [x.strip() for x in text.split(",") if x.strip()]
    return items

full_df["missing_interactors_list"] = full_df[col].apply(parse_interactors)

# If you also want a pretty string without weird spacing:
full_df["missing_interactors_str"] = full_df["missing_interactors_list"].apply(
    lambda xs: ",".join(xs)
)

full_df = full_df.drop(columns=["Missing Interactors","missing_interactors_list"])
full_df = full_df.rename(columns={"missing_interactors_str":"Missing Interactors"})
display(full_df)


Unnamed: 0,Fusion ID,Gene1,Gene2,Gene1 Nodes,Gene1 Edges,Gene2 Nodes,Gene2 Edges,Fusion Nodes,Fusion Edges,Number,Missing Interactors
0,EU216071,BCR,ABL1,51,234,127,843,108,597,48,"PCNA,HSPD1,PAG1,GPX1,CDKN1B,RAD9A,IL3,PSMD4,GA..."
1,AJ298917,FGFR1,BCR,27,75,52,258,44,188,34,"HSPD1,SHB,IL3,GAB2,HOXA9,ERCC3,FGF1,FGF23,CSNK..."
2,Z35761,ETV6,ABL1,27,85,128,867,83,452,68,"PCNA,HSPD1,SIN3A,CDKN1B,PSMA4,MTOR,GAB2,DOK2,F..."
3,EF428110,PRKAR1A,RARA,65,206,106,570,64,204,104,"MMS19,PRDX6,PLEKHF2,SET,PRPF31,TOP2B,WTIP,HDAC..."
4,U41743,NPM1,RARA,355,4318,105,554,172,986,274,"MMS19,RPL10,ELMO2,NSUN2,PPP1CC,SMARCD3,ENO1,UB..."
...,...,...,...,...,...,...,...,...,...,...,...
56,X77754,CCND1,TACSTD2,76,455,4,3,14,26,66,"PPP1CB,NPDC1,PCNA,NEUROD1,ELAVL1,PPP1CC,CDK4,C..."
57,Y08643,COL1A1,PDGFB,27,50,17,23,7,7,35,"MMP2,ELAVL1,BRCA1,SPARC,TXN,IGFBP3,MDFI,ITGA2,..."
58,EU314929,TMPRSS2,ETV5,1,0,17,30,3,2,16,"AMH,SUMO2,SEZ6L2,CCR5,UBC,CISH,HDAC11,ELAVL1,E..."
59,EU432099,TMPRSS2,ERG,1,0,37,124,6,6,33,"NME1,DDX5,ACTB,RPLP0,ERG,PCBP1,EIF2S1,NEDD4,AT..."


In [30]:
# the matching process here with FusOn-DB is going to be pretty rough because it's based on gene names (these should all be human proteins) but it's okay. 
chippi_head_tail_genes = full_df["Gene1"].tolist() + full_df["Gene2"].tolist()
chippi_head_tail_genes = list(set(chippi_head_tail_genes))
print(len(chippi_head_tail_genes), chippi_head_tail_genes[0:5])

chippi_missing_genes = full_df["Missing Interactors"].tolist()
chippi_missing_genes = ",".join(chippi_missing_genes)
chippi_missing_genes = chippi_missing_genes.split(",")
chippi_missing_genes = list(set(chippi_missing_genes))
print(len(chippi_missing_genes),chippi_missing_genes[0:5])

chippi_s9_dir = "data_files/processed/chippi_s9"
os.makedirs(chippi_s9_dir, exist_ok=True)
os.makedirs(f"{chippi_s9_dir}/uniprot_idmap_inputs", exist_ok=True)
os.makedirs(f"{chippi_s9_dir}/uniprot_idmap_outputs", exist_ok=True)

with open(f"{chippi_s9_dir}/uniprot_idmap_inputs/head_tail_missing.txt", "w") as f:
    all_genes = list(set(chippi_head_tail_genes+chippi_missing_genes))
    f.write("\n".join(all_genes))

75 ['NIN', 'CREB3L2', 'PDGFB', 'PML', 'SH3D19']
1681 ['SEZ6L2', 'SNCA', 'IRS4', 'PSMA7', 'RPA2']


Did a UniProt IDMapping from Gene Name to UniProtKB, restricting to Homo sapiens [9606]

In [23]:
full_df.head()

Unnamed: 0,Fusion ID,Gene1,Gene2,Gene1 Nodes,Gene1 Edges,Gene2 Nodes,Gene2 Edges,Fusion Nodes,Fusion Edges,Number,Missing Interactors,missing_interactors_list,missing_interactors_str
0,EU216071,BCR,ABL1,51,234,127,843,108,597,48,"PCNA, HSPD1, PAG1, GPX1, CDKN1B, RAD9A, IL3, P...","[PCNA, HSPD1, PAG1, GPX1, CDKN1B, RAD9A, IL3, ...","PCNA,HSPD1,PAG1,GPX1,CDKN1B,RAD9A,IL3,PSMD4,GA..."
1,AJ298917,FGFR1,BCR,27,75,52,258,44,188,34,"HSPD1, SHB, IL3, GAB2, HOXA9, ERCC3, FGF1, FGF...","[HSPD1, SHB, IL3, GAB2, HOXA9, ERCC3, FGF1, FG...","HSPD1,SHB,IL3,GAB2,HOXA9,ERCC3,FGF1,FGF23,CSNK..."
2,Z35761,ETV6,ABL1,27,85,128,867,83,452,68,"PCNA, HSPD1, SIN3A, CDKN1B, PSMA4, MTOR, GAB2,...","[PCNA, HSPD1, SIN3A, CDKN1B, PSMA4, MTOR, GAB2...","PCNA,HSPD1,SIN3A,CDKN1B,PSMA4,MTOR,GAB2,DOK2,F..."
3,EF428110,PRKAR1A,RARA,65,206,106,570,64,204,104,"MMS19, PRDX6, PLEKHF2, SET, PRPF31, TOP2B, WTI...","[MMS19, PRDX6, PLEKHF2, SET, PRPF31, TOP2B, WT...","MMS19,PRDX6,PLEKHF2,SET,PRPF31,TOP2B,WTIP,HDAC..."
4,U41743,NPM1,RARA,355,4318,105,554,172,986,274,"MMS19, RPL10, ELMO2, NSUN2, PPP1CC, SMARCD3, E...","[MMS19, RPL10, ELMO2, NSUN2, PPP1CC, SMARCD3, ...","MMS19,RPL10,ELMO2,NSUN2,PPP1CC,SMARCD3,ENO1,UB..."


In [31]:
# save the ChiPPI database
savedir = "data_files/processed/chippi_s9/clean"
os.makedirs(savedir, exist_ok=True)
full_df.to_csv(f"{savedir}/tables9_clean.csv",index=False)

In [32]:
from Bio import SeqIO

In [41]:
# merge with the UniProts and stuff 
idmap_folder = "data_files/processed/chippi_s9/uniprot_idmap_outputs"
idmap_fasta_path = os.path.join(idmap_folder,"idmapping_reviewed_true_canonical_and_isoform_2025_12_12.fasta")
idmap_tsv_path = os.path.join(idmap_folder,"idmapping_reviewed_true_2025_12_12.tsv")

# read the fasta
fasta_rows = [[record.id,"".join(record.seq),record.description] for record in SeqIO.parse(idmap_fasta_path, "fasta")]
idmap_tsv_df = pd.read_csv(idmap_tsv_path,sep="\t")

# drop columns that are all NaN
idmap_tsv_df = idmap_tsv_df.dropna(axis=1, how='all')
print(f"Length of idmap_tsv_df={len(idmap_tsv_df)}")
idmap_tsv_df.head()

Length of idmap_tsv_df=1715


Unnamed: 0,From,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length
0,E2F8,A0AVK6,reviewed,E2F8_HUMAN,Transcription factor E2F8 (E2F-8),E2F8,Homo sapiens (Human),867
1,KIAA1598,A0MZ66,reviewed,SHOT1_HUMAN,Shootin-1 (Shootin1),SHTN1 KIAA1598,Homo sapiens (Human),631
2,WTIP,A6NIX2,reviewed,WTIP_HUMAN,Wilms tumor protein 1-interacting protein (WT1...,WTIP,Homo sapiens (Human),430
3,ARHGAP32,A7KAX9,reviewed,RHG32_HUMAN,Rho GTPase-activating protein 32 (Brain-specif...,ARHGAP32 GRIT KIAA0712 RICS,Homo sapiens (Human),2087
4,MIF4GD,A9UHW6,reviewed,MI4GD_HUMAN,MIF4G domain-containing protein (SLBP-interact...,MIF4GD SLIP1,Homo sapiens (Human),222


In [42]:
# did anything have multiple mappings? 
gb = idmap_tsv_df.groupby("From").agg(
    entries=("Entry",lambda x: ",".join(set(x))),
    n_entries=("Entry",lambda x: len((set(x))))
)
test1 = len(gb.loc[gb["n_entries"]==1])
print(f"Total gene names mapped to 1 UniProt: {test1}/{len(gb)} ({100*test1/len(gb):.2f}%)")
test1 = len(gb.loc[gb["n_entries"]>1])
print(f"Total gene names mapped to >1 UniProt: {test1} ({100*test1/len(gb):.2f}%)")


Total gene names mapped to 1 UniProt: 1628/1671 (97.43%)
Total gene names mapped to >1 UniProt: 43 (2.57%)


In [43]:
idmap_fasta_df = pd.DataFrame(fasta_rows, columns=["uniprot_id_full","sequence","description"])
idmap_fasta_df[["database","uniprotkb","uniprot_gene_name"]] = idmap_fasta_df["uniprot_id_full"].str.split("|",expand=True)
idmap_fasta_df["uniprotkb_iso"] = idmap_fasta_df["uniprotkb"].apply(lambda x: x if "-" in x else f"{x}-0")
idmap_fasta_df["isoform_from_desc"] = "Isoform " +  idmap_fasta_df["description"].str.extract(r'(?i)\bisoform\s+([^\s,;:)\]]+)')[0]
idmap_fasta_df["isoform_from_desc"] = idmap_fasta_df["isoform_from_desc"].apply(lambda x: x if (type(x)==str and x!="Isoform ") else None)
idmap_fasta_df["isoform_from_uniprotkb"] = "Isoform " + idmap_fasta_df["uniprotkb_iso"].apply(lambda x: x.split("-")[1] if (x is not None and type(x)==str and "-" in x) else None)
idmap_fasta_df["isoform_from_uniprotkb"] = idmap_fasta_df["isoform_from_uniprotkb"].apply(lambda x: x if (type(x)==str and x!="Isoform ") else None)
idmap_fasta_df["canonical_uniprotkb"] = idmap_fasta_df["uniprotkb"].apply(lambda x: x.split("-")[0] if (x is not None and type(x)==str) else x)
idmap_fasta_df["uniprotkb"] = "uniprotkb:" + idmap_fasta_df["uniprotkb"]
idmap_fasta_df = idmap_fasta_df.drop(columns=["uniprot_id_full","description"])
display(idmap_fasta_df.head())

# want to group somehow and determine which isoforms have the same sequences as each other, if any 
test1 = idmap_fasta_df.groupby(["canonical_uniprotkb","sequence"]).agg(
    unique_isoforms_from_uniprotkb=("isoform_from_uniprotkb", lambda x: set(x)),
    unique_isoforms_from_desc=("isoform_from_desc", lambda x: set(x)),
).reset_index()
test1["total_isoforms_from_uniprotkb"] = test1["unique_isoforms_from_uniprotkb"].apply(lambda x: len(x) if x is not None else 0)
test1["total_isoforms_from_dec"] = test1["unique_isoforms_from_desc"].apply(lambda x: len(x) if x is not None else 0)
test2 = len(test1.loc[test1["total_isoforms_from_uniprotkb"]>1])
print(f"Total instances where two different isoforms of the same protein in UniProt have the exact same sequence: {test2}")

# Check if there are any cases where there's a blank AND an isoform 1 in the same sequence
test1 = idmap_fasta_df.groupby(["canonical_uniprotkb"]).agg(
    unique_isoforms_from_uniprotkb=("isoform_from_uniprotkb", lambda x: list(set(x))),
    unique_isoforms_from_desc=("isoform_from_desc", lambda x: list(set(x))),
)
test1["unique_isoforms"] = test1.apply(lambda row: list(set(row["unique_isoforms_from_uniprotkb"] + row["unique_isoforms_from_desc"])),axis=1)
test1 = len(test1.loc[(test1["unique_isoforms_from_uniprotkb"].apply(lambda x: "Isoform 0" in x)) & (test1["unique_isoforms_from_uniprotkb"].apply(lambda x: "Isoform 1" in x))])
print(f"Total instances where there are both an Isoform 0 and Isoform 1 for the same protein: {test1}")
#idmap_fasta_df["uniprotkb"] = "uniprotkb:" + idmap_fasta_df["uniprotkb"]

# make sure that there always is a unique isoform 
test1 = idmap_fasta_df.groupby("canonical_uniprotkb").agg(
    unique_isoforms_from_uniprotkb=("isoform_from_uniprotkb", lambda x: list(set(x))),
)
test1["iso_0_count"] = test1["unique_isoforms_from_uniprotkb"].apply(lambda x: x.count("Isoform 0") if x is not None else 0)
test1 = (test1["iso_0_count"]==1).all()
print(f"Every uniprotkb has exactly one canonical isoform, which we have named Isoform 0: {test1}")

Unnamed: 0,sequence,database,uniprotkb,uniprot_gene_name,uniprotkb_iso,isoform_from_desc,isoform_from_uniprotkb,canonical_uniprotkb
0,MENEKENLFCEPHKRGLMKTPLKESTTANIVLAEIQPDFGPLTTPT...,sp,uniprotkb:A0AVK6,E2F8_HUMAN,A0AVK6-0,,Isoform 0,A0AVK6
1,MNSSDEEKQLQLITSLKEQAIGEYEDLRAENQKTKEKCDKIRQERD...,sp,uniprotkb:A0MZ66,SHOT1_HUMAN,A0MZ66-0,,Isoform 0,A0MZ66
2,MNSSDEEKQLQLITSLKEQAIGEYEDLRAENQKTKEKCDKIRQERD...,sp,uniprotkb:A0MZ66-2,SHOT1_HUMAN,A0MZ66-2,Isoform 2,Isoform 2,A0MZ66
3,MNSSDEEKQLQLITSLKEQAIGEYEDLRAENQKTKEKCDKIRQERD...,sp,uniprotkb:A0MZ66-3,SHOT1_HUMAN,A0MZ66-3,Isoform 3,Isoform 3,A0MZ66
4,MNSSDEEKQLQLITSLKEQAIGEYEDLRAENQKTKEKCDKIRQERD...,sp,uniprotkb:A0MZ66-4,SHOT1_HUMAN,A0MZ66-4,Isoform 4,Isoform 4,A0MZ66


Total instances where two different isoforms of the same protein in UniProt have the exact same sequence: 0
Total instances where there are both an Isoform 0 and Isoform 1 for the same protein: 32
Every uniprotkb has exactly one canonical isoform, which we have named Isoform 0: True


In [44]:
# make the additional sequences from tsv
# make a dictionary from the FASTA df with the canonical isoform
canonical_seq_dict = idmap_fasta_df.loc[idmap_fasta_df["isoform_from_uniprotkb"]=="Isoform 0"]
canonical_seq_dict = idmap_fasta_df.loc[idmap_fasta_df["isoform_from_uniprotkb"]=="Isoform 0"].reset_index(drop=True)
canonical_seq_dict = dict(zip(canonical_seq_dict["canonical_uniprotkb"],canonical_seq_dict["sequence"]))
print(f"\tMade a mapping of UniProt IDs to their canonical sequences: {len(canonical_seq_dict)} entries")

	Made a mapping of UniProt IDs to their canonical sequences: 1715 entries


In [45]:
idmap_tsv_df["uniprotkb"] = idmap_tsv_df["Entry"]
idmap_tsv_df["Sequence"] = idmap_tsv_df["Entry"].apply(lambda x: canonical_seq_dict[x] if x in canonical_seq_dict else None)

idmap_tsv_df_combined_for_merge = idmap_tsv_df.copy()
idmap_tsv_df_combined_for_merge = idmap_tsv_df_combined_for_merge.drop(columns=[
    "Protein names","Gene Names","Organism","Length"
])
idmap_tsv_df_combined_for_merge = idmap_tsv_df_combined_for_merge.rename(
    columns = {
        "Entry": "uniprotkb",
        "uniprotkb": "uniprotkb_iso",
        "Reviewed": "database",
        "Entry Name": "uniprot_gene_name",
        "Sequence": "sequence"
    }
)
idmap_tsv_df_combined_for_merge["uniprotkb_iso"] = idmap_tsv_df_combined_for_merge["uniprotkb_iso"].apply(lambda x: f"{x}-0" if "-" not in x else x)
test1 = len(idmap_tsv_df_combined_for_merge["database"].value_counts().reset_index())==2
print(f"\tEverything is either reviewed or unreviewed (no other categories) in idmap tsv: {test1}")
idmap_tsv_df_combined_for_merge["database"] = idmap_tsv_df_combined_for_merge["database"].apply(
    lambda x: "sp" if x=="reviewed" else "tr")
idmap_tsv_df_combined_for_merge["uniprotkb"] = "uniprotkb:" + idmap_tsv_df_combined_for_merge["uniprotkb"]
idmap_tsv_df_combined_for_merge

	Everything is either reviewed or unreviewed (no other categories) in idmap tsv: False


Unnamed: 0,From,uniprotkb,database,uniprot_gene_name,uniprotkb_iso,sequence
0,E2F8,uniprotkb:A0AVK6,sp,E2F8_HUMAN,A0AVK6-0,MENEKENLFCEPHKRGLMKTPLKESTTANIVLAEIQPDFGPLTTPT...
1,KIAA1598,uniprotkb:A0MZ66,sp,SHOT1_HUMAN,A0MZ66-0,MNSSDEEKQLQLITSLKEQAIGEYEDLRAENQKTKEKCDKIRQERD...
2,WTIP,uniprotkb:A6NIX2,sp,WTIP_HUMAN,A6NIX2-0,MQRSRAGADEAALLLAGLALRELEPGCGSPGRGRRGPRPGPGDEAA...
3,ARHGAP32,uniprotkb:A7KAX9,sp,RHG32_HUMAN,A7KAX9-0,METESESSTLGDDSVFWLESEVIIQVTDCEEEEREEKFRKMKSSVH...
4,MIF4GD,uniprotkb:A9UHW6,sp,MI4GD_HUMAN,A9UHW6-0,MGEPSREEYKIQSFDAETQQLLKTALKDPGAVDLEKVANVIVDHSL...
...,...,...,...,...,...,...
1710,NCOA3,uniprotkb:Q9Y6Q9,sp,NCOA3_HUMAN,Q9Y6Q9-0,MSGLGENLDPLASDSRKRKLPCDTPGQGLTCSGEKRRREQESKYIE...
1711,MAP3K4,uniprotkb:Q9Y6R4,sp,M3K4_HUMAN,Q9Y6R4-0,MREAAAALVPPPAFAVTPAAAMEEPPPPPPPPPPPPEPETESEPEC...
1712,WASF2,uniprotkb:Q9Y6W5,sp,WASF2_HUMAN,Q9Y6W5-0,MPLVTRNIEPRHLCRQTLPSVRSELECVTNITLANVIRQLGSLSKY...
1713,PIAS3,uniprotkb:Q9Y6X2,sp,PIAS3_HUMAN,Q9Y6X2-0,MAELGELKHMVMSFRVSELQVLLGFAGRNKSGRKHELLAKALHLLK...


In [46]:
# make a species map
idmap_species_dict = idmap_tsv_df[["Entry Name","Organism"]]
idmap_species_dict["Entry Name"] = idmap_species_dict["Entry Name"].apply(lambda x: x.split("_")[1] if "_" in x else None)
idmap_species_dict = idmap_species_dict.dropna().drop_duplicates().reset_index(drop=True)
idmap_species_dict = dict(zip(idmap_species_dict["Entry Name"],idmap_species_dict["Organism"]))
print(f"Total unique species: {len(idmap_species_dict)}")

Total unique species: 1


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  idmap_species_dict["Entry Name"] = idmap_species_dict["Entry Name"].apply(lambda x: x.split("_")[1] if "_" in x else None)


In [49]:
idmap_merge = pd.merge(
    idmap_fasta_df,
    idmap_tsv_df_combined_for_merge,
    on=["uniprotkb","database","uniprot_gene_name","uniprotkb_iso","sequence"],
    how="outer"
)
idmap_merge = idmap_merge.drop(columns=["uniprotkb"])
idmap_merge = idmap_merge.rename(columns={"uniprotkb_iso":"uniprotkb_full"})
idmap_merge["isoform_from_desc"] = idmap_merge["isoform_from_desc"].apply(lambda x: x if (type(x)==str and x!="Isoform ") else None)
idmap_merge["isoform_or_chain_from_uniprotkb"] = idmap_merge["uniprotkb_full"].apply(lambda x: x.split("-")[1] if "-" in x else None)
idmap_merge["canonical_uniprotkb"] = idmap_merge["uniprotkb_full"].apply(lambda x: x.split("-")[0] if (x is not None and type(x)==str) else x)
idmap_merge = idmap_merge[["From","canonical_uniprotkb","uniprotkb_full","uniprot_gene_name","database","sequence","isoform_or_chain_from_uniprotkb","isoform_from_desc"]]
idmap_merge["species"] = idmap_merge["uniprot_gene_name"].apply(lambda x: idmap_species_dict[x.split("_")[1]] if (x is not None and type(x)==str and "_" in x and x.split("_")[1] in idmap_species_dict) else None)
idmap_merge = idmap_merge.drop_duplicates().reset_index(drop=True)
test1 = len(idmap_merge)
print(f"\tTotal")
idmap_merge = idmap_merge.loc[idmap_merge["sequence"].notna()].reset_index(drop=True)
idmap_merge

	Total


Unnamed: 0,From,canonical_uniprotkb,uniprotkb_full,uniprot_gene_name,database,sequence,isoform_or_chain_from_uniprotkb,isoform_from_desc,species
0,E2F8,A0AVK6,A0AVK6-0,E2F8_HUMAN,sp,MENEKENLFCEPHKRGLMKTPLKESTTANIVLAEIQPDFGPLTTPT...,0,,Homo sapiens (Human)
1,KIAA1598,A0MZ66,A0MZ66-0,SHOT1_HUMAN,sp,MNSSDEEKQLQLITSLKEQAIGEYEDLRAENQKTKEKCDKIRQERD...,0,,Homo sapiens (Human)
2,,A0MZ66,A0MZ66-2,SHOT1_HUMAN,sp,MNSSDEEKQLQLITSLKEQAIGEYEDLRAENQKTKEKCDKIRQERD...,2,Isoform 2,Homo sapiens (Human)
3,,A0MZ66,A0MZ66-3,SHOT1_HUMAN,sp,MNSSDEEKQLQLITSLKEQAIGEYEDLRAENQKTKEKCDKIRQERD...,3,Isoform 3,Homo sapiens (Human)
4,,A0MZ66,A0MZ66-4,SHOT1_HUMAN,sp,MNSSDEEKQLQLITSLKEQAIGEYEDLRAENQKTKEKCDKIRQERD...,4,Isoform 4,Homo sapiens (Human)
...,...,...,...,...,...,...,...,...,...
4082,WASF2,Q9Y6W5,Q9Y6W5-0,WASF2_HUMAN,sp,MPLVTRNIEPRHLCRQTLPSVRSELECVTNITLANVIRQLGSLSKY...,0,,Homo sapiens (Human)
4083,,Q9Y6W5,Q9Y6W5-2,WASF2_HUMAN,sp,MPLVTRNIEPRHLCRQTLPSVRSELECVTNITLANVIRQLGSLSKY...,2,Isoform 2,Homo sapiens (Human)
4084,PIAS3,Q9Y6X2,Q9Y6X2-0,PIAS3_HUMAN,sp,MAELGELKHMVMSFRVSELQVLLGFAGRNKSGRKHELLAKALHLLK...,0,,Homo sapiens (Human)
4085,SEC23IP,Q9Y6Y8,Q9Y6Y8-0,S23IP_HUMAN,sp,MAERKPNGGSGGASTSSSGTNLLFSSSATEFSFNVPFIPVTQASAS...,0,,Homo sapiens (Human)


In [52]:
print(f"Size of database before adding any isoform sequences: {len(merged)}")
temp = idmap_merge.copy()
temp = temp.rename(columns={"From":"Gene1"})
tempcols = [x for x in temp.columns if x!="Gene1"]
tempcols = {x: f"Gene1_{x}" for x in tempcols}
temp = temp.rename(columns=tempcols)

merged = pd.merge(
    full_df,
    temp,
    on=["Gene1"],
    how="left"
)

temp = idmap_merge.copy()
temp = temp.rename(columns={"From":"Gene2"})
tempcols = [x for x in temp.columns if x!="Gene2"]
tempcols = {x: f"Gene2_{x}" for x in tempcols}
temp = temp.rename(columns=tempcols)

merged = pd.merge(
    merged,
    temp,
    on=["Gene2"],
    how="left"
).reset_index(drop=True)

print(f"Size of database after adding isoform sequences: {len(merged)}")
display(merged.head())

Size of database before adding any isoform sequences: 64
Size of database after adding isoform sequences: 64


Unnamed: 0,Fusion ID,Gene1,Gene2,Gene1 Nodes,Gene1 Edges,Gene2 Nodes,Gene2 Edges,Fusion Nodes,Fusion Edges,Number,...,Gene1_isoform_from_desc,Gene1_species,Gene2_canonical_uniprotkb,Gene2_uniprotkb_full,Gene2_uniprot_gene_name,Gene2_database,Gene2_sequence,Gene2_isoform_or_chain_from_uniprotkb,Gene2_isoform_from_desc,Gene2_species
0,EU216071,BCR,ABL1,51,234,127,843,108,597,48,...,,Homo sapiens (Human),P00519,P00519-0,ABL1_HUMAN,sp,MLEICLKLVGCKSKKGLSSSSSCYLEEALQRPVASDFEPQGLSEAA...,0,,Homo sapiens (Human)
1,AJ298917,FGFR1,BCR,27,75,52,258,44,188,34,...,,Homo sapiens (Human),P11274,P11274-0,BCR_HUMAN,sp,MVDPVGFAEAWKAQFPDSEPPRMELRSVGDIEQELERCKASIRRLE...,0,,Homo sapiens (Human)
2,Z35761,ETV6,ABL1,27,85,128,867,83,452,68,...,,Homo sapiens (Human),P00519,P00519-0,ABL1_HUMAN,sp,MLEICLKLVGCKSKKGLSSSSSCYLEEALQRPVASDFEPQGLSEAA...,0,,Homo sapiens (Human)
3,EF428110,PRKAR1A,RARA,65,206,106,570,64,204,104,...,,Homo sapiens (Human),P10276,P10276-0,RARA_HUMAN,sp,MASNSSSCPTPGGGHLNGYPVPPYAFFFPPMLGGLSPPGALTTLQH...,0,,Homo sapiens (Human)
4,U41743,NPM1,RARA,355,4318,105,554,172,986,274,...,,Homo sapiens (Human),P10276,P10276-0,RARA_HUMAN,sp,MASNSSSCPTPGGGHLNGYPVPPYAFFFPPMLGGLSPPGALTTLQH...,0,,Homo sapiens (Human)


In [55]:
print(f"Size of database before exploding on missing interactors: {len(merged)}")
merged_expl = merged.copy(deep=True)
merged_expl["Missing Interactors"] = merged_expl["Missing Interactors"].apply(lambda x: x.split(","))
merged_expl= merged_expl.explode(["Missing Interactors"]).reset_index(drop=True)
merged_expl = merged_expl.rename(columns={"Missing Interactors":"Missing_Interactor"})
print(f"Size of database after exploding on missing interactors: {len(merged_expl)}")

temp = idmap_merge.copy()
temp = temp.rename(columns={"From":"Missing_Interactor"})
tempcols = [x for x in temp.columns if x!="Missing_Interactor"]
tempcols = {x: f"Missing_Interactor_{x}" for x in tempcols}
temp = temp.rename(columns=tempcols)

merged_expl= pd.merge(
    merged_expl,
    temp,
    on=["Missing_Interactor"],
    how="left"
).reset_index(drop=True)


display(merged_expl.head())


Size of database before exploding on missing interactors: 64
Size of database after exploding on missing interactors: 5158


Unnamed: 0,Fusion ID,Gene1,Gene2,Gene1 Nodes,Gene1 Edges,Gene2 Nodes,Gene2 Edges,Fusion Nodes,Fusion Edges,Number,...,Gene2_isoform_from_desc,Gene2_species,Missing_Interactor_canonical_uniprotkb,Missing_Interactor_uniprotkb_full,Missing_Interactor_uniprot_gene_name,Missing_Interactor_database,Missing_Interactor_sequence,Missing_Interactor_isoform_or_chain_from_uniprotkb,Missing_Interactor_isoform_from_desc,Missing_Interactor_species
0,EU216071,BCR,ABL1,51,234,127,843,108,597,48,...,,Homo sapiens (Human),P12004,P12004-0,PCNA_HUMAN,sp,MFEARLVQGSILKKVLEALKDLINEACWDISSSGVNLQSMDSSHVS...,0,,Homo sapiens (Human)
1,EU216071,BCR,ABL1,51,234,127,843,108,597,48,...,,Homo sapiens (Human),P10809,P10809-0,CH60_HUMAN,sp,MLRLPTVFRQMRPVSRVLAPHLTRAYAKDVKFGADARALMLQGVDL...,0,,Homo sapiens (Human)
2,EU216071,BCR,ABL1,51,234,127,843,108,597,48,...,,Homo sapiens (Human),Q9NWQ8,Q9NWQ8-0,PHAG1_HUMAN,sp,MGPAGSLLGSGQMQITLWGSLAAVAIFFVITFLIFLCSSCDREKKP...,0,,Homo sapiens (Human)
3,EU216071,BCR,ABL1,51,234,127,843,108,597,48,...,,Homo sapiens (Human),P07203,P07203-0,GPX1_HUMAN,sp,MCAARLAAAAAAAQSVYAFSARPLAGGEPVSLGSLRGKVLLIENVA...,0,,Homo sapiens (Human)
4,EU216071,BCR,ABL1,51,234,127,843,108,597,48,...,,Homo sapiens (Human),P46527,P46527-0,CDN1B_HUMAN,sp,MSNVRVSNGSPSLERMDARQAEHPKPSACRNLFGPVDHEELTRDLE...,0,,Homo sapiens (Human)


In [56]:
print(merged_expl.columns)

Index(['Fusion ID', 'Gene1', 'Gene2', 'Gene1 Nodes', 'Gene1 Edges',
       'Gene2 Nodes', 'Gene2 Edges', 'Fusion Nodes', 'Fusion Edges', 'Number',
       'Missing_Interactor', 'Gene1_canonical_uniprotkb',
       'Gene1_uniprotkb_full', 'Gene1_uniprot_gene_name', 'Gene1_database',
       'Gene1_sequence', 'Gene1_isoform_or_chain_from_uniprotkb',
       'Gene1_isoform_from_desc', 'Gene1_species', 'Gene2_canonical_uniprotkb',
       'Gene2_uniprotkb_full', 'Gene2_uniprot_gene_name', 'Gene2_database',
       'Gene2_sequence', 'Gene2_isoform_or_chain_from_uniprotkb',
       'Gene2_isoform_from_desc', 'Gene2_species',
       'Missing_Interactor_canonical_uniprotkb',
       'Missing_Interactor_uniprotkb_full',
       'Missing_Interactor_uniprot_gene_name', 'Missing_Interactor_database',
       'Missing_Interactor_sequence',
       'Missing_Interactor_isoform_or_chain_from_uniprotkb',
       'Missing_Interactor_isoform_from_desc', 'Missing_Interactor_species'],
      dtype='object')


In [58]:
# delete anything where there is no sequence
merged_expl = merged_expl.loc[
    (merged_expl["Gene1_sequence"].notna()) & 
    (merged_expl["Gene2_sequence"].notna()) & 
    (merged_expl["Missing_Interactor_sequence"].notna())
].reset_index(drop=True)
print(f"Size after dropping any row where there is no mapped sequence for either Gene1, Gene2, or missing: {len(merged_expl)}")

Size after dropping any row where there is no mapped sequence for either Gene1, Gene2, or missing: 5246


# Incorporate fusion sequences
Last thing to do is incorporate all possible fusion oncoprotein sequences
I can't just use FusOnDB, I have to combine it somewhere with other versions of my curated database that have the UniProt IDs, so I know I'm doing a proper match. 

In [59]:
fusondb = pd.read_csv("/scratch/pranamlab/sophie/interactome/interactome/data_files/processed/fusondb/fusions_with_alignments.csv")
fusondb

  fusondb = pd.read_csv("/scratch/pranamlab/sophie/interactome/interactome/data_files/processed/fusondb/fusions_with_alignments.csv")


Unnamed: 0,FusionGID,FusionGenes,Fusion_Seq,Fusion_Length,Hgene,Tgene,Level,seq_id,cancers,top_hg_UniProtID,...,Fusion_Seq_Source,HG_pLDDT,TG_pLDDT,HG_Length,TG_Length,Fusion_Noncanonicals,HG_Noncanonicals,TG_Noncanonicals,HG_seq_id,TG_seq_id
0,,A1BG::FGA,MLQTLTPDTHCTGVSATIMSMLVVFLLLWGVTWGPVTEAAIYRDNT...,564,A1BG,FGA,,seq1,non-cancer,P04217,...,,,,495.0,866.0,,,,htseq1,htseq5300
1,,A1BG::FGA,MLQTLTPDTHCTGVSATIMSMLVVFLLLWGVTWGPVTEAAIYRDNT...,786,A1BG,FGA,,seq2,non-cancer,P04217,...,,,,495.0,866.0,,,,htseq1,htseq5300
2,,A1BG::ITIH4,MLQTLTPDTHCTGVSATIMSMLVVFLLLWGVTWGPVTEAAIFYETQ...,534,A1BG,ITIH4,,seq3,non-cancer,P04217,...,,,,495.0,930.0,,,,htseq1,htseq2156
3,4.0,A1BG::ITIH4,MLQTLTPDTHCTGVSATIMSMLVVFLLLWGVTWGPVTEAAIFYETQ...,548,A1BG,ITIH4,2.0,seq4,non-cancer,P04217,...,"AlphaFold,Raw Download",86.52,80.44,495.0,930.0,,,,htseq1,htseq2156
4,,A2M::CCT5,MGKNKLLHPSLVLLLLVLLPTDASVSGKPQYMVLVPSLLHTETTEK...,1474,A2M,CCT5,,seq5,liver hepatocellular carcinoma,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44414,,ZSWIM6::NDUFAF2,MAERGQQPPPAKRLCCRPGGGGGGGGSSGGGGGAGGGYSSACRPGP...,352,ZSWIM6,NDUFAF2,,seq44410,lung squamous cell carcinoma,Q9HCJ5,...,,,,1215.0,169.0,,,,htseq5298,htseq5461
44415,,ZYG11B::JAK1,MEPPRRLSLGAGSGPARRRTQDGGCMPEDQAGAAMKERFYESRCRP...,388,ZYG11B,JAK1,,seq44411,stomach adenocarcinoma,,...,,,,,,,,,,
44416,,ZYX::NCAPH,LSRAGAAGAAVSGVGASGPRRSLRPEAGSGPLPGGVSCLVRPSPLS...,655,ZYX,NCAPH,,seq44412,,,...,,,,,,,,,,
44417,,ZZEF1::ANKFY1,MGNAPSHSSEDEAAAAGGEGWGPHQDWAAVSGTTPGPGVAAPALPP...,580,ZZEF1,ANKFY1,,seq44413,,O43149,...,,,,2961.0,1169.0,,,,htseq5299,htseq220


In [81]:
fusondb[["FusionGenes","Fusion_Seq","HG_Seq","TG_Seq","top_hg_UniProtID","top_tg_UniProtID","top_hg_UniProt_isoform","top_tg_UniProt_isoform"]]

Unnamed: 0,FusionGenes,Fusion_Seq,HG_Seq,TG_Seq,top_hg_UniProtID,top_tg_UniProtID,top_hg_UniProt_isoform,top_tg_UniProt_isoform
0,A1BG::FGA,MLQTLTPDTHCTGVSATIMSMLVVFLLLWGVTWGPVTEAAIYRDNT...,MSMLVVFLLLWGVTWGPVTEAAIFYETQPSLWAESESLLKPLANVT...,MFSMRIVCLVLSVVGTAWTADSGEGDFLAEGGGVRGPRVVERHQSA...,P04217,P02671,4.0,2.0
1,A1BG::FGA,MLQTLTPDTHCTGVSATIMSMLVVFLLLWGVTWGPVTEAAIYRDNT...,MSMLVVFLLLWGVTWGPVTEAAIFYETQPSLWAESESLLKPLANVT...,MFSMRIVCLVLSVVGTAWTADSGEGDFLAEGGGVRGPRVVERHQSA...,P04217,P02671,4.0,2.0
2,A1BG::ITIH4,MLQTLTPDTHCTGVSATIMSMLVVFLLLWGVTWGPVTEAAIFYETQ...,MSMLVVFLLLWGVTWGPVTEAAIFYETQPSLWAESESLLKPLANVT...,MKPPRPVRTCSKVLVLLSLLAIHQTTTAEKNGIDIYSLTVDSRVSS...,P04217,Q14624,4.0,4.0
3,A1BG::ITIH4,MLQTLTPDTHCTGVSATIMSMLVVFLLLWGVTWGPVTEAAIFYETQ...,MSMLVVFLLLWGVTWGPVTEAAIFYETQPSLWAESESLLKPLANVT...,MKPPRPVRTCSKVLVLLSLLAIHQTTTAEKNGIDIYSLTVDSRVSS...,P04217,Q14624,4.0,4.0
4,A2M::CCT5,MGKNKLLHPSLVLLLLVLLPTDASVSGKPQYMVLVPSLLHTETTEK...,,,,,,
...,...,...,...,...,...,...,...,...
44414,ZSWIM6::NDUFAF2,MAERGQQPPPAKRLCCRPGGGGGGGGSSGGGGGAGGGYSSACRPGP...,MAERGQQPPPAKRLCCRPGGGGGGGGSSGGGGGAGGGYSSACRPGP...,MGWSQDLFRALWRSLSREVKEHVGTDQFGNKYYYIPQYKNWRGQTI...,Q9HCJ5,Q8N183,2.0,1.0
44415,ZYG11B::JAK1,MEPPRRLSLGAGSGPARRRTQDGGCMPEDQAGAAMKERFYESRCRP...,,,,,,
44416,ZYX::NCAPH,LSRAGAAGAAVSGVGASGPRRSLRPEAGSGPLPGGVSCLVRPSPLS...,,,,,,
44417,ZZEF1::ANKFY1,MGNAPSHSSEDEAAAAGGEGWGPHQDWAAVSGTTPGPGVAAPALPP...,MGNAPSHSSEDEAAAAGGEGWGPHQDWAAVSGTTPGPGVAAPALPP...,MAEEEVAKLEKHLMLLRQEYVKLQKKLAETEKRCALLAAQANKESS...,O43149,Q9P2R3,6.0,2.0


In [80]:
merged_expl[["Gene1_sequence","Gene1_canonical_uniprotkb","Gene1_uniprotkb_full","Gene2_sequence","Gene2_canonical_uniprotkb","Gene2_uniprotkb_full"]]

Unnamed: 0,Gene1_sequence,Gene1_canonical_uniprotkb,Gene1_uniprotkb_full,Gene2_sequence,Gene2_canonical_uniprotkb,Gene2_uniprotkb_full
0,MVDPVGFAEAWKAQFPDSEPPRMELRSVGDIEQELERCKASIRRLE...,P11274,P11274-0,MLEICLKLVGCKSKKGLSSSSSCYLEEALQRPVASDFEPQGLSEAA...,P00519,P00519-0
1,MVDPVGFAEAWKAQFPDSEPPRMELRSVGDIEQELERCKASIRRLE...,P11274,P11274-0,MLEICLKLVGCKSKKGLSSSSSCYLEEALQRPVASDFEPQGLSEAA...,P00519,P00519-0
2,MVDPVGFAEAWKAQFPDSEPPRMELRSVGDIEQELERCKASIRRLE...,P11274,P11274-0,MLEICLKLVGCKSKKGLSSSSSCYLEEALQRPVASDFEPQGLSEAA...,P00519,P00519-0
3,MVDPVGFAEAWKAQFPDSEPPRMELRSVGDIEQELERCKASIRRLE...,P11274,P11274-0,MLEICLKLVGCKSKKGLSSSSSCYLEEALQRPVASDFEPQGLSEAA...,P00519,P00519-0
4,MVDPVGFAEAWKAQFPDSEPPRMELRSVGDIEQELERCKASIRRLE...,P11274,P11274-0,MLEICLKLVGCKSKKGLSSSSSCYLEEALQRPVASDFEPQGLSEAA...,P00519,P00519-0
...,...,...,...,...,...,...
5241,MAPWPELGDAQPNPDKYLEGAAGQQPTAPDKSKETNKTDNTEAPVT...,O95436,O95436-0,MKNIYCLIPKLVNFATLGCLWISVVQCTVLNSCLKSCVTNLGQQLD...,P08922,P08922-0
5242,MAPWPELGDAQPNPDKYLEGAAGQQPTAPDKSKETNKTDNTEAPVT...,O95436,O95436-0,MKNIYCLIPKLVNFATLGCLWISVVQCTVLNSCLKSCVTNLGQQLD...,P08922,P08922-0
5243,MAPWPELGDAQPNPDKYLEGAAGQQPTAPDKSKETNKTDNTEAPVT...,O95436,O95436-0,MKNIYCLIPKLVNFATLGCLWISVVQCTVLNSCLKSCVTNLGQQLD...,P08922,P08922-0
5244,MAPWPELGDAQPNPDKYLEGAAGQQPTAPDKSKETNKTDNTEAPVT...,O95436,O95436-0,MKNIYCLIPKLVNFATLGCLWISVVQCTVLNSCLKSCVTNLGQQLD...,P08922,P08922-0


In [85]:
merged_fus = pd.merge(
    merged_expl,
    fusondb[["Fusion_Seq","HG_Seq","TG_Seq","top_hg_UniProtID","top_tg_UniProtID","top_hg_UniProt_isoform","top_tg_UniProt_isoform"]].rename(
        columns={"HG_Seq":"Gene1_sequence","TG_Seq":"Gene2_sequence",
                 "top_hg_UniProtID":"Gene1_canonical_uniprotkb","top_tg_UniProtID":"Gene2_canonical_uniprotkb"}
    ),
    on=["Gene1_sequence","Gene2_sequence","Gene1_canonical_uniprotkb","Gene2_canonical_uniprotkb"],
    how="inner"
)
print(f"Length of database before merging back with right fusion, head, and tail: {len(merged_expl)}")
print(f"Length of database when merged back with right fusion, head, and tail: {len(merged_fus)}")

test1 = set(merged_expl["Fusion ID"].unique().tolist())
test2 = set(merged_fus["Fusion ID"].unique().tolist())
test1 = test1-test2
print(f"Total fusion IDs lost in the merge: {len(test1)} (e.g. {[x for x in test1][0:5]})")
merged_fus

Length of database before merging back with right fusion, head, and tail: 5246
Length of database when merged back with right fusion, head, and tail: 25685
Total fusion IDs lost in the merge: 16 (e.g. ['AJ549094', 'AA828778', 'EU093086', 'AY222643', 'AY662674'])


Unnamed: 0,Fusion ID,Gene1,Gene2,Gene1 Nodes,Gene1 Edges,Gene2 Nodes,Gene2 Edges,Fusion Nodes,Fusion Edges,Number,...,Missing_Interactor_uniprotkb_full,Missing_Interactor_uniprot_gene_name,Missing_Interactor_database,Missing_Interactor_sequence,Missing_Interactor_isoform_or_chain_from_uniprotkb,Missing_Interactor_isoform_from_desc,Missing_Interactor_species,Fusion_Seq,top_hg_UniProt_isoform,top_tg_UniProt_isoform
0,EU216071,BCR,ABL1,51,234,127,843,108,597,48,...,P12004-0,PCNA_HUMAN,sp,MFEARLVQGSILKKVLEALKDLINEACWDISSSGVNLQSMDSSHVS...,0,,Homo sapiens (Human),MVDPVGFAEAWKAQFPDSEPPRMELRSVGDIEQELERCKASIRRLE...,2.0,4.0
1,EU216071,BCR,ABL1,51,234,127,843,108,597,48,...,P12004-0,PCNA_HUMAN,sp,MFEARLVQGSILKKVLEALKDLINEACWDISSSGVNLQSMDSSHVS...,0,,Homo sapiens (Human),MVDPVGFAEAWKAQFPDSEPPRMELRSVGDIEQELERCKASIRRLE...,2.0,4.0
2,EU216071,BCR,ABL1,51,234,127,843,108,597,48,...,P12004-0,PCNA_HUMAN,sp,MFEARLVQGSILKKVLEALKDLINEACWDISSSGVNLQSMDSSHVS...,0,,Homo sapiens (Human),MVDPVGFAEAWKAQFPDSEPPRMELRSVGDIEQELERCKASIRRLE...,2.0,4.0
3,EU216071,BCR,ABL1,51,234,127,843,108,597,48,...,P12004-0,PCNA_HUMAN,sp,MFEARLVQGSILKKVLEALKDLINEACWDISSSGVNLQSMDSSHVS...,0,,Homo sapiens (Human),MVDPVGFAEAWKAQFPDSEPPRMELRSVGDIEQELERCKASIRRLE...,2.0,4.0
4,EU216071,BCR,ABL1,51,234,127,843,108,597,48,...,P12004-0,PCNA_HUMAN,sp,MFEARLVQGSILKKVLEALKDLINEACWDISSSGVNLQSMDSSHVS...,0,,Homo sapiens (Human),MVDPVGFAEAWKAQFPDSEPPRMELRSVGDIEQELERCKASIRRLE...,2.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25680,EU236946,SLC34A2,ROS1,1,0,6,8,2,1,6,...,O95436-0,NPT2B_HUMAN,sp,MAPWPELGDAQPNPDKYLEGAAGQQPTAPDKSKETNKTDNTEAPVT...,0,,Homo sapiens (Human),MAAASSPAPAEGALTMAPWPELGDAQPNPDKYLEGAAGQQPTAPDK...,3.0,3.0
25681,EU236946,SLC34A2,ROS1,1,0,6,8,2,1,6,...,O95436-0,NPT2B_HUMAN,sp,MAPWPELGDAQPNPDKYLEGAAGQQPTAPDKSKETNKTDNTEAPVT...,0,,Homo sapiens (Human),MGSKARNLCFPSHKWTMAPWPELGDAQPNPDKYLEGAAGQQPTAPD...,3.0,3.0
25682,EU236946,SLC34A2,ROS1,1,0,6,8,2,1,6,...,O95436-0,NPT2B_HUMAN,sp,MAPWPELGDAQPNPDKYLEGAAGQQPTAPDKSKETNKTDNTEAPVT...,0,,Homo sapiens (Human),MAAASSPAPAEGALTMAPWPELGDAQPNPDKYLEGAAGQQPTAPDK...,3.0,3.0
25683,EU236946,SLC34A2,ROS1,1,0,6,8,2,1,6,...,O95436-0,NPT2B_HUMAN,sp,MAPWPELGDAQPNPDKYLEGAAGQQPTAPDKSKETNKTDNTEAPVT...,0,,Homo sapiens (Human),MAAASSPAPAEGALTMAPWPELGDAQPNPDKYLEGAAGQQPTAPDK...,3.0,3.0


In [87]:
merged_expl.loc[merged_expl["Fusion ID"].isin(test1)][["Fusion ID","Gene1","Gene2"]].drop_duplicates()

Unnamed: 0,Fusion ID,Gene1,Gene2
154,EF428110,PRKAR1A,RARA
793,AA828778,CREBBP,CREBBP
1080,AY222643,CREB3L2,PPARG
2085,L21756,RUNX1,MECOM
2207,EU093086,RUNX1,SH3D19
2370,AF009227,ODZ4,NRG1
3402,AF231997,MLL,GAS7
3763,AJ549094,FUS,CREB3L2
4162,AF362886,TPM3,ALK
4521,AF295356,MSN,ALK


In [88]:
merged_fus.to_csv("data_files/processed/chippi_s9/clean/tables9_with_fusondb.csv",index=False)