# Imports

In [1]:
import pandas as pd
import numpy as np
import ast
import math
import re
import logging
import rootutils
import csv
import os

# Define methods

In [2]:
def extract_pubmed_from_experiment(exp_str):
    """
    Pull PubMed IDs out of a stringified list of experiment records.

    exp_str is parsed with ast.literal_eval; every record containing a
    "pubmed" key contributes int(record["pubmed"]).

    Returns the list of IDs in record order, or None when no ID is found or
    when parsing/conversion fails for any reason (best effort).
    """
    try:
        records = ast.literal_eval(exp_str)
        pubmed_ids = []
        for record in records:
            if "pubmed" in record:
                pubmed_ids.append(int(record["pubmed"]))
    except Exception:
        return None
    if not pubmed_ids:
        return None
    return pubmed_ids

def extract_pubmed_from_source(src_str):
    """
    Extract PubMed IDs from a source string.

    src_str is like 'pubmed:10542231|mint:...|pubmed:40205054'.

    Returns the IDs as a list of ints in order of appearance, or None when
    there are none. Non-string input (e.g. NaN/None from an empty cell) also
    yields None instead of raising, matching extract_pubmed_from_experiment.
    """
    if not isinstance(src_str, str):
        return None
    ids = [int(m) for m in re.findall(r"pubmed:(\d+)", src_str)]
    return ids or None

def normalize_ids(ids):
    """
    Turn a collection of IDs into a canonical '|'-joined string.

    Duplicates are dropped and the remaining IDs are sorted before joining.
    None is passed through unchanged.
    """
    if ids is None:
        return None
    unique_sorted = sorted(set(ids))
    return "|".join(map(str, unique_sorted))

def deduplicate(df):
    """
    Report how many fully duplicated rows df contains and drop them.

    The first occurrence of each row is kept and the index is reset.
    Progress is printed; the de-duplicated frame is returned. An empty frame
    is handled (reported as 100% retained) instead of dividing by zero.
    """
    og_len = len(df)
    # Compute the duplicate mask once; it is needed for both the count and the filter.
    dup_mask = df.duplicated()
    print(f"\tTotal duplicate rows = {int(dup_mask.sum())}")
    df = df.loc[~dup_mask].reset_index(drop=True)
    pct_kept = 100 * len(df) / og_len if og_len else 100.0
    print(f"\tLength after deduplication = {len(df)} ({pct_kept:.2f}% of original)")
    return df

def get_unique_id(row, colA="ID(s) interactor A", colB="ID(s) interactor B"):
    """
    Create a unique ID for the pair of interactors in the row, so that order does not matter.

    The two values are joined with "_" with the smaller one first. Missing
    values (None, float NaN, numpy NaN, pd.NA) are treated as the empty string
    so that they compare cleanly against string IDs.
    """
    def _blank_if_missing(value):
        # is_scalar guard keeps pd.isna from returning an array for list-like cells
        if pd.api.types.is_scalar(value) and pd.isna(value):
            return ""
        return value

    intA = _blank_if_missing(row[colA])
    intB = _blank_if_missing(row[colB])

    first, second = (intA, intB) if intA <= intB else (intB, intA)
    return f"{first}_{second}"

def extract_sorted_mis(s: str) -> str:
    # Find all MI codes like MI:0084
    mis = re.findall(r"MI:\d+", s)
    
    # Sort by the numeric part (after MI:)
    mis_sorted = sorted(mis, key=lambda x: int(x.split(":")[1]))
    
    # Join with |
    return "|".join(mis_sorted) if len(mis_sorted)>0 else None

def extract_sorted_miscores(s: str) -> str:
    # Find all MI codes like MI:0084
    mis = re.findall(r"intact-miscore:\d+(?:\.\d+)?", s)
    
    # Sort by the numeric part (after MI:)
    mis_sorted = sorted(mis, key=lambda x: float(x.split(":")[1]))
    
    # Join with |
    return "|".join(mis_sorted) if len(mis_sorted)>0 else None

def extract_sorted_uniprot(row, interactor="A") -> str:
    """
    Gather the "uniprotkb:..." entries of one interactor from its primary and
    alternative ID columns.

    Both cells are stringified first (so missing values simply yield no
    matches); each entry runs up to the next "|". The unique entries are
    sorted lexicographically and joined with "|". Returns None if there are none.
    """
    found = set()
    for column in (f"ID(s) interactor {interactor}", f"Alt. ID(s) interactor {interactor}"):
        found.update(re.findall(r"(uniprotkb:[^|]+)", str(row[column])))

    if not found:
        return None
    return "|".join(sorted(found))

def extract_sorted_intact_from_scraped_only(row, interactor="A") -> str:
    # Find all MI codes like MI:0084
    if interactor=="A": interactor = "1"
    if interactor=="B": interactor = "2"
    primary_ids = row[f"intactid_{interactor}"]
    matches_primary = re.findall(r"(intact:EBI-\d+)", primary_ids)
    matches = matches_primary
    
    # Sort by the numeric part (after EBI-)
    sorted_ids = sorted(set(matches), key=lambda x: int(x.split("-")[1]))
    
    # Join with |
    return "|".join(sorted_ids) if len(sorted_ids)>0 else None

def extract_sorted_intact(row, interactor="A") -> str:
    # Find all MI codes like MI:0084
    primary_ids = row[f"ID(s) interactor {interactor}"]
    secondary_ids = row[f"Alt. ID(s) interactor {interactor}"]
    matches_primary = re.findall(r"(intact:EBI-\d+)", primary_ids)
    matches_secondary = re.findall(r"(intact:EBI-\d+)", secondary_ids)
    matches = matches_primary + matches_secondary
    
    # Sort by the numeric part (after EBI-)
    sorted_ids = sorted(set(matches), key=lambda x: int(x.split("-")[1]))
    
    # Join with |
    return "|".join(sorted_ids) if len(sorted_ids)>0 else None

def expand_cross_combinations(s: str) -> list[str]:
    """
    Expand "a|b_c|d" into every left x right pairing.

    The string is cut at its first "_"; each side is a '|'-separated group of
    IDs. Every (left, right) pair is emitted as "x_y" with the lexicographically
    smaller ID first. A string without "_" is returned as a one-element list.
    """
    left_group, separator, right_group = s.partition("_")
    if not separator:
        return [s]  # nothing to expand
    return [
        "_".join(sorted((left_id, right_id)))
        for left_id in left_group.split("|")
        for right_id in right_group.split("|")
    ]

def get_expanded_unique_inter_ids(row):
    """
    Build one expanded interaction ID per entry of row["unique_all_intact_combos"].

    Each ID is "<combo>-<unique_all_idmethods_sorted>-<detection_method_mi>-<pubmeds>".
    """
    # Leading "" makes the joined suffix start with "-"
    suffix = "-".join(
        ["", row["unique_all_idmethods_sorted"], row["detection_method_mi"], row["pubmeds"]]
    )
    return [combo + suffix for combo in row["unique_all_intact_combos"]]

def flip_interactors(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of df with the data of every xxx_1 / xxx_2 column pair swapped.

    Column order is unchanged; only the contents of paired columns trade
    places. Only the trailing "_1"/"_2" suffix is rewritten, so names that
    contain "_1" or "_2" elsewhere (e.g. "x_1_2") are handled correctly.
    A suffixed column without a partner is left untouched.
    """
    existing = set(df.columns)
    mapping = {}
    for col in df.columns:
        if col.endswith("_1"):
            partner = col[:-2] + "_2"
        elif col.endswith("_2"):
            partner = col[:-2] + "_1"
        else:
            continue
        if partner in existing:
            mapping[col] = partner

    # Swap via rename, then restore the original column order
    flipped = df.rename(columns=mapping)
    return flipped[df.columns]

def map_back_individual_intact_mypos(row, interactor="1"):
    """
    Keep the IDs from intactid_<interactor> that also appear in the row's
    "_"-separated unique_all_intact_combos value.

    Returns the matching IDs in their original order, or None if none match.
    """
    pair_members = row["unique_all_intact_combos"].split("_")
    kept = []
    for candidate in row[f"intactid_{str(interactor)}"].split("|"):
        if candidate in pair_members:
            kept.append(candidate)
    return kept or None

import numpy as np
import pandas as pd
from pandas.api.types import is_scalar

def _make_hashable(x):
    # Handle containers first (so we don't call pd.isna on arrays)
    if isinstance(x, dict):
        return frozenset((k, _make_hashable(v)) for k, v in sorted(x.items()))
    if isinstance(x, (list, tuple)):
        return tuple(_make_hashable(v) for v in x)
    if isinstance(x, set):
        return frozenset(_make_hashable(v) for v in x)
    if isinstance(x, np.ndarray):
        # convert to Python containers, then recurse
        return tuple(_make_hashable(v) for v in x.tolist())

    # Now only scalars remain: safe to use pd.isna
    if is_scalar(x) and pd.isna(x):
        return ("__NA__",)  # stable sentinel for NaN/None

    return x  # already hashable (int, str, float (non-NaN), Timestamp, etc.)

def cols_with_differences(df, row_indexer):
    """
    List the columns whose values are not all identical across the selected rows.

    row_indexer is anything df.loc accepts (boolean mask, list of index
    labels, slice). Cells are normalised with _make_hashable first, so
    container values and missing values can be compared. Selections of zero
    or one row give an empty list.
    """
    selected = df.loc[row_indexer]
    if len(selected) < 2:
        return []
    return [
        name
        for name in selected.columns
        if selected[name].map(_make_hashable).nunique(dropna=False) > 1
    ]

def collect_all_diff_cols(df, index_col="seq_pair_id"):
    """
    Union, over every index_col group, of the columns that vary inside that group.

    Each group's row labels are handed to cols_with_differences together with
    the full frame. Returns a set of column names.
    """
    varying = set()
    for _, members in df.groupby(index_col):
        # members.index holds the row labels belonging to this group
        varying |= set(cols_with_differences(df, members.index))
    return varying

def calc_feature_mask(df, interact_acs, df_col="Interaction identifier(s)"):
    """
    Boolean Series (indexed like df) marking rows whose '|'-separated df_col
    value contains at least one of interact_acs.

    Missing cells are treated as empty strings and therefore never match
    (unless "" itself is among interact_acs).
    """
    wanted = set(interact_acs)
    # One row per token, still carrying the originating row label
    tokens = df[df_col].fillna('').str.split('|').explode()
    hits = tokens.isin(wanted)
    return hits.groupby(level=0).any()

def norm_pos_str(x):
    """
    Normalise a position value to a consistent string form.

    - missing -> pd.NA
    - text containing "|" or "," -> the stripped text, untouched
    - a single finite number -> "238" for whole values ("238.0", 238.0),
      otherwise str(float(value))
    - anything else -> the stripped text
    """
    if pd.isna(x):
        return pd.NA

    text = str(x).strip()
    # Multi-valued entries are already in their final form
    if any(sep in text for sep in "|,"):
        return text

    try:
        number = float(text)
    except Exception:
        return text  # not numeric: leave as-is
    if not math.isfinite(number):
        return text  # inf/nan spelled out as text: leave as-is
    return str(int(number)) if number.is_integer() else str(number)

def harmonize_nulls_to_nan(df: pd.DataFrame, *, also_blank_strings=True, keep_datetime=False) -> pd.DataFrame:
    """
    Return a copy of df in which every kind of missing value is np.nan.

    Steps:
      1. optionally turn the strings "", "None" and "nan" into pd.NA;
      2. convert_dtypes() so None/NaN/<NA> are unified;
      3. cast extension-dtype columns to object so np.nan can be stored
         (datetime columns are skipped when keep_datetime is True);
      4. replace every remaining missing value with np.nan.
    """
    result = df.copy()

    if also_blank_strings:
        result = result.replace({"": pd.NA, "None": pd.NA, "nan": pd.NA})

    result = result.convert_dtypes()

    for name in result.columns:
        col_dtype = result[name].dtype
        if keep_datetime and pd.api.types.is_datetime64_any_dtype(col_dtype):
            # keep datetimes as datetime64 with NaT
            continue
        if isinstance(col_dtype, pd.api.extensions.ExtensionDtype):
            result[name] = result[name].astype(object)

    missing = result.isna()
    return result.where(~missing, np.nan)

# before we save merged, must correct invalid aas
def find_invalid_chars(seq: str, valid_chars: set) -> set:
    """
    Find and return a set of invalid characters in a sequence.

    Args:
        seq (str): The sequence you wish to search for invalid characters.
        valid_chars (set): A set of valid characters.

    Returns:
        set: A set of characters in the sequence that are not in the set of valid characters.
    """
    unique_chars = set(seq) # set of all characters in the sequence; unique_chars = {A, C} for protein="AAACCC"

    if unique_chars.issubset(valid_chars):  # e.g. unique_chars = {A,C}, and {A,C} is a subset of valid_chars
        return np.nan
    else: # e.g. unique_chars = {A,X}. {A,X} is not a subset of valid_chars because X is not in valid_chars
        l = unique_chars.difference(valid_chars) # e.g. {A,X} - valid_chars = {X}
        l = sorted(list(l))
        return ",".join(l)
    
# One-letter amino acid codes accepted in sequences: A R N D C E Q G H I L K M
# F P S T W Y V, plus 'U', an unnatural residue that is deliberately allowed.
VALID_AAS = set("ARNDCEQGHILKMFPSTWYVU")

def get_subsequence(seq, coords, one_indexed=True, end_inclusive=True):
    """
    Helper method for extracting a subsequence from a full sequence.

    Args:
        seq: the full sequence (anything sliceable).
        coords: one or two coordinate strings, e.g. ["5", "10"] or ["?7"].
            A single coordinate selects that one position. A leading "?" on a
            coordinate is ignored.
        one_indexed: coordinates start at 1 rather than 0.
        end_inclusive: the end coordinate is part of the subsequence.

    Returns:
        The subsequence, or None if the coordinates cannot be interpreted
        (best effort: any ordinary exception yields None).
    """
    try:
        if len(coords) == 1:
            # single position: treat as a closed one-residue range
            coords = [coords[0], coords[0]]
            end_inclusive = True
        start, end = coords
        # if there are question marks, remove them
        if start[0] == "?":
            start = start[1:]
        if end[0] == "?":
            end = end[1:]

        start = int(start)
        end = int(end)

        if one_indexed:
            start -= 1
            end -= 1
        if end_inclusive:
            end += 1

        return seq[start:end]
    except Exception:
        # Deliberately broad, but unlike a bare except it lets
        # KeyboardInterrupt / SystemExit propagate.
        return None
    
import re
from typing import List

def sort_isoforms(uniprotlist: List[str]) -> List[str]:
    """
    Order UniProt-like IDs by base accession and, within one base, by suffix kind:

      1. the bare ID (no suffix)
      2. numeric isoforms: ID-0, ID-1, ID-2, ... (numeric order)
      3. processed forms: ID-PRO_1, ID-PRO_2, ... (numeric order)
      4. any other suffix, alphabetically

    Examples
    --------
    ["P12345-PRO_2","P12345-1","P12345-0","P12345-PRO_10","P12345-2"]
      -> ["P12345-0","P12345-1","P12345-2","P12345-PRO_2","P12345-PRO_10"]

    ["Q9XYZ1","Q9XYZ1-2","Q9XYZ1-PRO_3","Q9XYZ1-1"]
      -> ["Q9XYZ1","Q9XYZ1-1","Q9XYZ1-2","Q9XYZ1-PRO_3"]
    """
    split_pattern = re.compile(r'^(?P<base>[^-]+)(?:-(?P<suffix>.+))?$')
    proteoform_pattern = re.compile(r'^PRO_(\d+)$')

    def rank(identifier: str):
        # Sort key layout: (base, suffix class, numeric value, suffix text)
        parsed = split_pattern.match(identifier)
        if parsed is None:
            # Unparseable string: grouped under itself, after any regular entries
            return (identifier, 99, float('inf'), identifier)

        base, suffix = parsed.group('base', 'suffix')
        if not suffix:
            return (base, 0, -1, "")
        if suffix.isdigit():
            return (base, 1, int(suffix), "")

        proteoform = proteoform_pattern.match(suffix)
        if proteoform is not None:
            return (base, 2, int(proteoform.group(1)), "")

        return (base, 3, float('inf'), suffix)

    return sorted(uniprotlist, key=rank)

# verify that nothing where there's scraped mut info has stuff filled in for both 1 and 2 
def verify_equal_feature_lengths(row, reference_cols, interactor=1):
    """
    Check that the '|'-separated string cells of one interactor all list the
    same number of entries.

    Only the reference_cols ending in "_<interactor>" are inspected. Missing
    cells and non-string cells are ignored. Returns True when at most one
    distinct entry count is found, False otherwise.
    """
    suffix = f"_{str(interactor)}"
    entry_counts = {
        row[col].count("|") + 1
        for col in reference_cols
        if col.endswith(suffix) and not pd.isna(row[col]) and isinstance(row[col], str)
    }
    return len(entry_counts) <= 1
    
# how to find the right matching column? 
def feature_affected_protein_matches_id(row, feature="Mutation"):
    """
    Work out whether the protein affected by a feature (mutation, binding
    site, PTM) is interactor A, interactor B, both, or neither.

    The accession in row["<feature> Affected protein AC"] is compared against
    columns chosen by its prefix:
      - "uniprotkb:" -> uniprot_A_intact / uniprot_B_intact
      - "intact"     -> all_intact_A_sorted / all_intact_B_sorted
      - "dip:"       -> "dip:" + dip_1 / "dip:" + dip_2

    Returns "A", "B", "A,B", or "" (no match, unknown prefix, or a missing
    accession given as None or a float).
    """
    affected_ac = row[f"{feature} Affected protein AC"]

    # don't break on np.nan or None
    if affected_ac is None or type(affected_ac) is float:
        return ""

    if affected_ac.startswith("uniprotkb:"):
        # compare to the uniprot id stored by IntAct; that is the one the feature table refers to
        targets = (row["uniprot_A_intact"], row["uniprot_B_intact"])
    elif affected_ac.startswith("intact"):
        # The scraped intactid_1 / intactid_2 columns were also tried as comparison
        # targets; per the original author's experiments they did not help make matches.
        targets = (row["all_intact_A_sorted"], row["all_intact_B_sorted"])
    elif affected_ac.startswith("dip:"):
        targets = ("dip:" + str(row["dip_1"]), "dip:" + str(row["dip_2"]))
    else:
        return ""

    return ",".join(label for label, target in zip("AB", targets) if affected_ac == target)



In [3]:
def join_unique_nonnull(s: pd.Series,delim=",") -> str | float:
    """Join unique, order-preserving, non-null values; flatten list-like cells."""
    seen, out = set(), []
    for v in s:
        if pd.isna(v):
            continue
        items = v if isinstance(v, (list, tuple, set)) else [v]
        for item in items:
            if pd.isna(item):
                continue
            sv = str(item).strip()
            if sv and sv not in seen:
                seen.add(sv)
                out.append(sv)
    return delim.join(out) if out else np.nan

def take_first(s):
    """
    Aggregator that returns the value in the first row of the group as-is.

    Meant as an alternative to aggregating on 'first': the first row is taken
    whether or not its elements are empty. The group must not be empty.
    """
    first_value = s.iloc[0]
    return first_value

def _is_missing(x):
    try:
        if x is None:
            return True
        if isinstance(x, float) and math.isnan(x):
            return True
        return pd.isna(x)
    except Exception:
        return False

def _collect_row_values(row, cols,delim=","):
    seen = set()
    out = []
    for c in cols:
        v = row.get(c)
        if _is_missing(v) or v in {"", "nan", "None", "<NA>"}:
            continue
        items = v if isinstance(v, (list, tuple, set)) else [v]
        for item in items:
            if _is_missing(item) or item in {"", "nan", "None", "<NA>"}:
                continue
            s = str(item).strip()
            if s and s not in seen:
                seen.add(s)
                out.append(s)
    # return a list (order-preserving, readable). If nothing, use NaN.
    return delim.join(sorted(out)) if out else np.nan

def mi_to_desc_string(mi_string: str, d) -> str | float:
    if pd.isna(mi_string):
        return np.nan

    groups = str(mi_string).split("|")

    converted_groups = []
    for group in groups:
        group = group.strip()
        if not group:
            continue
        codes = [c.strip() for c in group.split(",") if c.strip()]
        descs = [d.get(code, str(np.nan)) for code in codes]
        converted_groups.append(",".join(descs))

    return "|".join(converted_groups) if converted_groups else np.nan

# Load the databases

In [4]:
# Raw IntAct PSI-MITAB table (tab-separated), positive interactions.
intact = pd.read_csv(
    "/scratch/pranamlab/sophie/interactome/interactome/data_files/raw/intact/psimitab/intact.txt",
    sep="\t",
)

In [5]:
# Raw IntAct "micluster" PSI-MITAB table (tab-separated).
intact_clust = pd.read_csv(
    "/scratch/pranamlab/sophie/interactome/interactome/data_files/raw/intact/psimitab/intact-micluster.txt",
    sep="\t",
)

In [6]:
# Explicit dtypes for reading the processed IntAct CSVs: everything is a
# nullable pandas string except the genuinely numeric columns.
# Per-interactor columns are generated from their base name so that the "_1"
# and "_2" variants cannot drift apart or be listed twice.
dtypes = {
    **{
        f"{base}_{side}": "string"
        for base in (
            # amino acid sequences
            "aa",
            # feature positions (stored as strings like "10|20" or "10,20|30")
            "binding_begin", "binding_end",
            "mutation_begin", "mutation_end",
            "chain_seq_start", "chain_seq_end",
            "ptm_begin", "ptm_end",
            # feature MI terms / names / residues (all pipe-joined strings)
            "binding_mi", "binding_name", "binding_short",
            "mutation_mi", "mutation_name", "mutation_short",
            "mutation_new", "mutation_orig",
            "ptm_mi", "ptm_name", "ptm_short",
            "ptm_new", "ptm_orig",
            # gene symbols, types, etc.
            "gene_symbol", "mol_type",
            # sequences (same as aa_1/aa_2 before renaming)
            "protein",
            # ID lists (mostly comma-separated)
            "ensg", "ensp", "enst",
            "go", "interpro", "intactid",
            "rscbpdb", "uniprotkb", "reactome",
            "species_taxid",
            # species / host labels
            "species_label",
            "host_taxid", "host_cell_type", "host_compartment", "host_tissue",
            "host_label_full", "host_label_short",
            # primary ref info
            "primaryref_db", "primaryref_id",
        )
        for side in (1, 2)
    },

    # interaction-level info (no interactor suffix)
    "interaction_intactid": "string",
    "interaction_label": "string",
    "interaction_mi": "string",
    "process_method": "string",

    # experiment blobs (list-of-dicts as repr)
    "experiments": "string",

    # lengths (true numeric, but nullable)
    "length_1": "Int64",
    "length_2": "Int64",

    # year (numeric but may be missing)
    "year": "Int64",
}

# Processed positive PPIs, read with the explicit dtypes defined above.
my_pos = pd.read_csv(
    "/scratch/pranamlab/sophie/interactome/interactome/data_files/processed/intact/intact_processed_positivePPIs_2025-11-17.csv",
    dtype=dtypes,
)

  my_pos = pd.read_csv("/scratch/pranamlab/sophie/interactome/interactome/data_files/processed/intact/intact_processed_positivePPIs_2025-11-17.csv", dtype=dtypes)


In [7]:
# Processed negative PPIs: read with the same explicit dtypes, then normalise
# every missing value to np.nan straight away.
my_neg = harmonize_nulls_to_nan(
    pd.read_csv(
        "/scratch/pranamlab/sophie/interactome/interactome/data_files/processed/intact/intact_processed_negativePPIs_2025-11-17.csv",
        dtype=dtypes,
    )
)

In [8]:
# Raw IntAct PSI-MITAB table (tab-separated), negative interactions.
intact_neg = pd.read_csv(
    "/scratch/pranamlab/sophie/interactome/interactome/data_files/raw/intact/psimitab/intact_negative.txt",
    sep="\t",
)

In [9]:
# Raw IntAct "micluster" PSI-MITAB table (tab-separated), negative interactions.
intact_clust_neg = pd.read_csv(
    "/scratch/pranamlab/sophie/interactome/interactome/data_files/raw/intact/psimitab/intact-micluster_negative.txt",
    sep="\t",
)

# Initial processing on databases

In [10]:
# Show which column headers of intact.txt mention "ID(s)"
[header for header in intact.columns if "ID(s)" in header]

['#ID(s) interactor A',
 'ID(s) interactor B',
 'Alt. ID(s) interactor A',
 'Alt. ID(s) interactor B']

In [11]:
# Rename columns so they match: in the raw files the first header is
# "#ID(s) interactor A" (leading "#"), while the helper functions above look
# up "ID(s) interactor A". Applied to all four raw PSI-MITAB tables.
intact = intact.rename(columns = {"#ID(s) interactor A": "ID(s) interactor A"})
intact_clust = intact_clust.rename(columns = {"#ID(s) interactor A": "ID(s) interactor A"})
intact_neg = intact_neg.rename(columns = {"#ID(s) interactor A": "ID(s) interactor A"})
intact_clust_neg = intact_clust_neg.rename(columns = {"#ID(s) interactor A": "ID(s) interactor A"})

In [12]:
# Change "-" to None.
# The regex is anchored (^...$), so only cells that consist of a single "-"
# are replaced; hyphens inside longer values (e.g. "EBI-123") are untouched.
# NOTE(review): presumably "-" is the empty-field placeholder of the MITAB
# format — confirm. Passing value=None explicitly relies on the pandas >= 1.4
# behaviour of DataFrame.replace (earlier versions ignored an explicit None).
intact = intact.replace(r'^\-$', None, regex=True)
intact_clust = intact_clust.replace(r'^\-$', None, regex=True)
intact_neg = intact_neg.replace(r'^\-$', None, regex=True)
intact_clust_neg = intact_clust_neg.replace(r'^\-$', None, regex=True)

In [13]:
# Harmonize nans: run every table (raw and processed) through
# harmonize_nulls_to_nan so that all missing values are np.nan from here on.
print(f"Harmonizing nulls to nan (everything empty or empty-resembling will be np.nan)")
intact = harmonize_nulls_to_nan(intact)
intact_clust = harmonize_nulls_to_nan(intact_clust)
intact_neg = harmonize_nulls_to_nan(intact_neg)
intact_clust_neg = harmonize_nulls_to_nan(intact_clust_neg)
my_pos = harmonize_nulls_to_nan(my_pos)
# my_neg was already harmonized once right after it was loaded; this second pass is redundant.
my_neg = harmonize_nulls_to_nan(my_neg)

Harmonizing nulls to nan (everything empty or empty-resembling will be np.nan)


  out = out.replace({"": pd.NA, "None": pd.NA, "nan": pd.NA})


In [14]:
# Deduplicate: drop fully identical rows from each of the six tables.
# The summary line below reports the starting size of only three of them
# (intact.txt, intact-micluster.txt and the processed positive set).
print(f"Original database sizes: intact.txt={len(intact)}, intact-micluster.txt: {len(intact_clust)}, processed xml: {len(my_pos)}")
# Check for duplicates (deduplicate() prints the duplicate count and the new length)
print(f"Deduplicating intact.txt")
intact = deduplicate(intact)
print(f"Deduplicating intact-micluster.txt")
intact_clust = deduplicate(intact_clust)
print(f"Deduplicating intact_negative.txt")
intact_neg = deduplicate(intact_neg)
print(f"Deduplicating intact-micluster_negative.txt")
intact_clust_neg = deduplicate(intact_clust_neg)
print(f"Deduplicating our processed dataset from XML")
my_pos = deduplicate(my_pos)
print(f"Deduplicating our processed NEGATIVES dataset from XML")
my_neg = deduplicate(my_neg)

Original database sizes: intact.txt=1726476, intact-micluster.txt: 1136486, processed xml: 746032
Deduplicating intact.txt
	Total duplicate rows = 1090
	Length after deduplication = 1725386 (99.94% of original)
Deduplicating intact-micluster.txt
	Total duplicate rows = 0
	Length after deduplication = 1136486 (100.00% of original)
Deduplicating intact_negative.txt
	Total duplicate rows = 0
	Length after deduplication = 984 (100.00% of original)
Deduplicating intact-micluster_negative.txt
	Total duplicate rows = 0
	Length after deduplication = 931 (100.00% of original)
Deduplicating our processed dataset from XML
	Total duplicate rows = 0
	Length after deduplication = 746032 (100.00% of original)
Deduplicating our processed NEGATIVES dataset from XML
	Total duplicate rows = 0
	Length after deduplication = 969 (100.00% of original)


In [15]:
# Keep only the processed positive interactions where both partners have a sequence
print("Cleaning processed xml: removing interactions without sequences")
my_pos = my_pos.loc[my_pos["aa_1"].notna() & my_pos["aa_2"].notna()].reset_index(drop=True)
print(f"\tNew size: {len(my_pos)}")

Cleaning processed xml: removing interactions without sequences
	New size: 744934


In [16]:
# Keep only the processed negative interactions where both partners have a sequence
print("Cleaning processed negative xml: removing interactions without sequences")
my_neg = my_neg.loc[my_neg["aa_1"].notna() & my_neg["aa_2"].notna()].reset_index(drop=True)
print(f"\tNew size: {len(my_neg)}")

Cleaning processed negative xml: removing interactions without sequences
	New size: 969


In [17]:
import mmap
import os
import re

def xml_contains_term(path: str, term: str, ignore_case: bool = True) -> tuple[bool, int]:
    """
    Return (found_any, count) for a raw substring search in an XML file.
    Uses mmap so the OS streams from disk without reading into Python memory.

    Counting differs slightly between the two modes:
      - ignore_case=True  (default): regex scan, non-overlapping matches,
        ASCII case-insensitive.
      - ignore_case=False: bytes.find loop, overlapping matches are counted.
    The two only disagree for terms that can overlap themselves (e.g. "aa").

    Raises:
        FileNotFoundError: if path does not exist.
    """
    if not os.path.exists(path):
        raise FileNotFoundError(path)

    # mmap cannot map a zero-length file (it raises ValueError); an empty file
    # trivially contains no matches.
    if os.path.getsize(path) == 0:
        return (False, 0)

    # bytes pattern
    term_b = term.encode('utf-8')

    with open(path, 'rb') as f, mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ) as mm:
        if ignore_case:
            # Case-insensitive search via bytes-regex (ASCII-insensitive).
            # This avoids copying the whole file to lower-case it.
            pat = re.compile(re.escape(term_b), flags=re.IGNORECASE)
            count = sum(1 for _ in pat.finditer(mm))
        else:
            # Fast loop using find; advancing by one byte allows overlaps
            count = 0
            idx = mm.find(term_b, 0)
            while idx != -1:
                count += 1
                idx = mm.find(term_b, idx + 1)

    return (count > 0, count)


In [18]:
import os
import pandas as pd
from interactome.data.process.intact import parse_psi30

# Load the four MI subtree tables (MI:0190, MI:0118, MI:0117, MI:0925) from the cv directory.
interaction_milabel_ok, mutation_mi_ok, bindsite_mi_ok, ptm_mi_ok = (
    pd.read_csv(f"/scratch/pranamlab/sophie/interactome/interactome/data_files/processed/intact/cv/mi_{code}_subtree.csv")
    for code in ("0190", "0118", "0117", "0925")
)

# Scan every file in the chosen year's directory for one IntAct accession
# and report the files that mention it, with the number of hits.
year = 2019
to_search = [
    os.path.join(f"/scratch/pranamlab/sophie/interactome/interactome/data_files/raw/intact/psi30/pmid/{year}", x)
    for x in os.listdir(f"/scratch/pranamlab/sophie/interactome/interactome/data_files/raw/intact/psi30/pmid/{year}")
]
for f in to_search:
    found, cnt = xml_contains_term(f, "EBI-21373863")
    if not found:
        continue
    print("\t", f, found, cnt)

	 /scratch/pranamlab/sophie/interactome/interactome/data_files/raw/intact/psi30/pmid/2019/31413325.xml True 1


# Clean intact-micluster.txt

In [19]:
#### Property check: intact-micluster
print("Cleaning intact-micluster.txt")
# (1) ID checks
# 1a. A row whose primary ID is missing must not carry alternative IDs either.
for _side in ("A", "B"):
    _no_primary = intact_clust[f"ID(s) interactor {_side}"].isna()
    _has_alt = intact_clust[f"Alt. ID(s) interactor {_side}"].notna()
    test1 = not (_no_primary & _has_alt).any()
    print(f"\tIn all rows where ID(s) interactor {_side} is empty, Alt. IDs interactor {_side} is empty as well: {test1}")

Cleaning intact-micluster.txt
	In all rows where ID(s) interactor A is empty, Alt. IDs interactor A is empty as well: True
	In all rows where ID(s) interactor B is empty, Alt. IDs interactor B is empty as well: True


In [20]:
# 1b. Every row must have a primary ID for BOTH interactors.
# Rows missing the ID on either side are dropped (previously only side B was
# dropped, although the check is stated for A or B).
for _side in ("A", "B"):
    _col = f"ID(s) interactor {_side}"
    _n_missing = int(intact_clust[_col].isna().sum())
    if _n_missing > 0:
        print(f"\tDropping {_n_missing} rows with no ID for interactor {_side}")
        intact_clust = intact_clust.loc[intact_clust[_col].notna()].reset_index(drop=True)
        print(f"\tNew dataset size: {len(intact_clust)}")
    # Re-check after the (possible) drop so the printed status reflects the final table.
    test1 = bool(intact_clust[_col].notna().all())
    print(f"\tEvery row has something in ID(s) interactor {_side}: {test1}")

	Every row has something in ID(s) interactor A: True
	Dropping 203 rows with no ID for interactor B
	New dataset size: 1136283
	Every row has something in ID(s) interactor B: True


In [21]:
# 1c. Every interactor A and B has at least one intact:EBI- ID.
# Missing cells are replaced by "" before each row is handed to extract_sorted_intact.
for _side in ("A", "B"):
    intact_clust[f"all_intact_{_side}_sorted"] = intact_clust.fillna("").apply(
        lambda row, _s=_side: extract_sorted_intact(row, interactor=_s), axis=1
    )
# at least 1
for _side in ("A", "B"):
    test1 = bool(intact_clust[f"all_intact_{_side}_sorted"].notna().all())
    print(f"\tEvery interactor {_side} has at least one intact ID: {test1}")
# exactly 1: a single "intact:EBI-" token and no "|" separator
for _side in ("A", "B"):
    _ids = intact_clust[f"all_intact_{_side}_sorted"]
    _single_token = (_ids.str.count("intact:EBI-") == 1).all()
    _no_separator = (_ids.str.count("\\|") == 0).all()
    test1 = _single_token & _no_separator
    print(f"\tEvery interactor {_side} has EXACTLY one intact ID: {test1}")

  intact_clust["all_intact_A_sorted"] = intact_clust.fillna("").apply(lambda row: extract_sorted_intact(row,interactor="A"),axis=1)
  intact_clust["all_intact_B_sorted"] = intact_clust.fillna("").apply(lambda row: extract_sorted_intact(row,interactor="B"),axis=1)


	Every interactor A has at least one intact ID: True
	Every interactor B has at least one intact ID: True
	Every interactor A has EXACTLY one intact ID: True
	Every interactor B has EXACTLY one intact ID: True


In [22]:
# (3) Each intact:EBI- ID must map to exactly one primary-ID value and exactly
# one alt-ID value.
# nunique(dropna=False) treats all missing values as ONE value; len(set(...))
# can count separate NaN objects as different values because NaN != NaN.
for _side in ("A", "B"):
    _per_intact_id = intact_clust.groupby(f"all_intact_{_side}_sorted").agg(
        unique_primary_ids=(f"ID(s) interactor {_side}", lambda x: x.nunique(dropna=False)),
        unique_alt_ids=(f"Alt. ID(s) interactor {_side}", lambda x: x.nunique(dropna=False)),
    )
    test1 = bool(
        (_per_intact_id["unique_primary_ids"] == 1).all()
        and (_per_intact_id["unique_alt_ids"] == 1).all()
    )
    # Both sides check primary AND alt IDs, so both messages say so.
    print(f"\tAll intact:EBI- IDs for interactor {_side} correspond to ONE unique set of primary ids and alt ids: {test1}")

	All intact:EBI- IDs for interactor A correspond to ONE unique set of primary ids and alt ids: True
	All intact:EBI- IDs for interactor B correspond to ONE unique set of alt ids: True


In [23]:
# (5) Each row has a UNIQUE set of interactors.
# get_unique_id builds an order-independent key from the two IntAct ID columns.
intact_clust["unique_id"] = intact_clust.apply(
    get_unique_id, axis=1, colA="all_intact_A_sorted", colB="all_intact_B_sorted"
)
test1 = not intact_clust["unique_id"].duplicated().any()
if not test1:
    # Keep the first row of every repeated pair, then verify again.
    intact_clust = intact_clust.drop_duplicates("unique_id").reset_index(drop=True)
    test1 = not intact_clust["unique_id"].duplicated().any()
print(f"\tEach row holds a UNIQUE binary interacting pair, as indicated by their intact-EBI IDs: {test1}")

	Each row holds a UNIQUE binary interacting pair, as indicated by their intact-EBI IDs: True


In [24]:
# (6) Report how many rows carry alt. IDs and how many yield a UniProt ID
for _side in ("A", "B"):
    test1 = int(intact_clust[f"Alt. ID(s) interactor {_side}"].notna().sum())
    print(f"\tInteractor {_side} has {test1} ({100*test1/len(intact_clust):.2f}%) rows with at least one alt. ID")
# get uniprot IDs
intact_clust["uniprot_A"] = intact_clust.apply(extract_sorted_uniprot, axis=1, interactor="A")
intact_clust["uniprot_B"] = intact_clust.apply(extract_sorted_uniprot, axis=1, interactor="B")
for _side in ("A", "B"):
    test1 = int(intact_clust[f"uniprot_{_side}"].notna().sum())
    print(f"\tRows with a UniProt for {_side}: {test1}/{len(intact_clust)} ({100*test1/len(intact_clust):.2f}%)")
_both_present = intact_clust["uniprot_A"].notna() & intact_clust["uniprot_B"].notna()
test1 = int(_both_present.sum())
print(f"\tRows with a UniProt for both: {test1}/{len(intact_clust)} ({100*test1/len(intact_clust):.2f}%)")

	Interactor A has 1115878 (98.20%) rows with at least one alt. ID
	Interactor B has 1106185 (97.35%) rows with at least one alt. ID
	Rows with a UniProt for A: 1083176/1136283 (95.33%)
	Rows with a UniProt for B: 1105208/1136283 (97.27%)
	Rows with a UniProt for both: 1054233/1136283 (92.78%)


In [25]:
# (7) Store the interaction detection methods as a sorted, "|"-joined list of
# MI codes so the column has one canonical form per set of methods.
# Missing values are replaced by "" first (as is done for the intact.txt
# columns), because extract_sorted_mis runs a regex and would fail on NaN;
# "" yields None.
intact_clust["interaction_detection_methods_sorted"] = (
    intact_clust["Interaction detection method(s)"].fillna("").apply(extract_sorted_mis)
)

# Clean intact-micluster_negative.txt

In [26]:
#### Property check: intact-micluster_negative
# The banner names the negative file: this section works on intact_clust_neg.
print("Cleaning intact-micluster_negative.txt")
# (1) ID checks
# 1a. A row whose primary ID is missing must not carry alternative IDs either.
for _side in ("A", "B"):
    _no_primary = intact_clust_neg[f"ID(s) interactor {_side}"].isna()
    _has_alt = intact_clust_neg[f"Alt. ID(s) interactor {_side}"].notna()
    test1 = not (_no_primary & _has_alt).any()
    print(f"\tIn all rows where ID(s) interactor {_side} is empty, Alt. IDs interactor {_side} is empty as well: {test1}")

Cleaning intact-micluster.txt
	In all rows where ID(s) interactor A is empty, Alt. IDs interactor A is empty as well: True
	In all rows where ID(s) interactor B is empty, Alt. IDs interactor B is empty as well: True


In [27]:
# 1b. Every row must have a primary ID for BOTH interactors.
# Rows missing the ID on either side are dropped (previously only side B was
# dropped, although the check is stated for A or B).
for _side in ("A", "B"):
    _col = f"ID(s) interactor {_side}"
    _n_missing = int(intact_clust_neg[_col].isna().sum())
    if _n_missing > 0:
        print(f"\tDropping {_n_missing} rows with no ID for interactor {_side}")
        intact_clust_neg = intact_clust_neg.loc[intact_clust_neg[_col].notna()].reset_index(drop=True)
        print(f"\tNew dataset size: {len(intact_clust_neg)}")
    # Re-check after the (possible) drop so the printed status reflects the final table.
    test1 = bool(intact_clust_neg[_col].notna().all())
    print(f"\tEvery row has something in ID(s) interactor {_side}: {test1}")

	Every row has something in ID(s) interactor A: True
	Every row has something in ID(s) interactor B: True


In [28]:
# 1c. Every interactor A and B has at least one intact:EBI- ID.
# Missing cells are replaced by "" before each row is handed to extract_sorted_intact.
for _side in ("A", "B"):
    intact_clust_neg[f"all_intact_{_side}_sorted"] = intact_clust_neg.fillna("").apply(
        lambda row, _s=_side: extract_sorted_intact(row, interactor=_s), axis=1
    )
# at least 1
for _side in ("A", "B"):
    test1 = bool(intact_clust_neg[f"all_intact_{_side}_sorted"].notna().all())
    print(f"\tEvery interactor {_side} has at least one intact ID: {test1}")
# exactly 1: a single "intact:EBI-" token and no "|" separator
for _side in ("A", "B"):
    _ids = intact_clust_neg[f"all_intact_{_side}_sorted"]
    _single_token = (_ids.str.count("intact:EBI-") == 1).all()
    _no_separator = (_ids.str.count("\\|") == 0).all()
    test1 = _single_token & _no_separator
    print(f"\tEvery interactor {_side} has EXACTLY one intact ID: {test1}")

	Every interactor A has at least one intact ID: True
	Every interactor B has at least one intact ID: True
	Every interactor A has EXACTLY one intact ID: True
	Every interactor B has EXACTLY one intact ID: True


  intact_clust_neg["all_intact_A_sorted"] = intact_clust_neg.fillna("").apply(lambda row: extract_sorted_intact(row,interactor="A"),axis=1)
  intact_clust_neg["all_intact_B_sorted"] = intact_clust_neg.fillna("").apply(lambda row: extract_sorted_intact(row,interactor="B"),axis=1)


In [29]:
# (3) Each intact:EBI- ID must map to exactly one primary-ID value and exactly
# one alt-ID value.
# nunique(dropna=False) treats all missing values as ONE value; len(set(...))
# can count separate NaN objects as different values because NaN != NaN.
for _side in ("A", "B"):
    _per_intact_id = intact_clust_neg.groupby(f"all_intact_{_side}_sorted").agg(
        unique_primary_ids=(f"ID(s) interactor {_side}", lambda x: x.nunique(dropna=False)),
        unique_alt_ids=(f"Alt. ID(s) interactor {_side}", lambda x: x.nunique(dropna=False)),
    )
    test1 = bool(
        (_per_intact_id["unique_primary_ids"] == 1).all()
        and (_per_intact_id["unique_alt_ids"] == 1).all()
    )
    # Both sides check primary AND alt IDs, so both messages say so.
    print(f"\tAll intact:EBI- IDs for interactor {_side} correspond to ONE unique set of primary ids and alt ids: {test1}")

	All intact:EBI- IDs for interactor A correspond to ONE unique set of primary ids and alt ids: True
	All intact:EBI- IDs for interactor B correspond to ONE unique set of alt ids: True


In [30]:
# (5) Each row has a UNIQUE set of interactors.
# get_unique_id builds an order-independent key from the two IntAct ID columns.
intact_clust_neg["unique_id"] = intact_clust_neg.apply(
    get_unique_id, axis=1, colA="all_intact_A_sorted", colB="all_intact_B_sorted"
)
test1 = not intact_clust_neg["unique_id"].duplicated().any()
if not test1:
    # Keep the first row of every repeated pair, then verify again.
    intact_clust_neg = intact_clust_neg.drop_duplicates("unique_id").reset_index(drop=True)
    test1 = not intact_clust_neg["unique_id"].duplicated().any()
print(f"\tEach row holds a UNIQUE binary interacting pair, as indicated by their intact-EBI IDs: {test1}")

	Each row holds a UNIQUE binary interacting pair, as indicated by their intact-EBI IDs: True


In [31]:
# (6) Report how many rows carry alt. IDs and how many yield a UniProt ID
for _side in ("A", "B"):
    test1 = int(intact_clust_neg[f"Alt. ID(s) interactor {_side}"].notna().sum())
    print(f"\tInteractor {_side} has {test1} ({100*test1/len(intact_clust_neg):.2f}%) rows with at least one alt. ID")
# get uniprot IDs
intact_clust_neg["uniprot_A"] = intact_clust_neg.apply(extract_sorted_uniprot, axis=1, interactor="A")
intact_clust_neg["uniprot_B"] = intact_clust_neg.apply(extract_sorted_uniprot, axis=1, interactor="B")
for _side in ("A", "B"):
    test1 = int(intact_clust_neg[f"uniprot_{_side}"].notna().sum())
    print(f"\tRows with a UniProt for {_side}: {test1}/{len(intact_clust_neg)} ({100*test1/len(intact_clust_neg):.2f}%)")
_both_present = intact_clust_neg["uniprot_A"].notna() & intact_clust_neg["uniprot_B"].notna()
test1 = int(_both_present.sum())
print(f"\tRows with a UniProt for both: {test1}/{len(intact_clust_neg)} ({100*test1/len(intact_clust_neg):.2f}%)")

	Interactor A has 842 (90.44%) rows with at least one alt. ID
	Interactor B has 901 (96.78%) rows with at least one alt. ID
	Rows with a UniProt for A: 841/931 (90.33%)
	Rows with a UniProt for B: 901/931 (96.78%)
	Rows with a UniProt for both: 813/931 (87.33%)


In [32]:
# (7) Store the interaction detection methods as a sorted, "|"-joined list of
# MI codes so the column has one canonical form per set of methods.
# Missing values are replaced by "" first (as is done for the intact.txt
# columns), because extract_sorted_mis runs a regex and would fail on NaN;
# "" yields None.
intact_clust_neg["interaction_detection_methods_sorted"] = (
    intact_clust_neg["Interaction detection method(s)"].fillna("").apply(extract_sorted_mis)
)

# Process intact.txt

In [33]:
#### Processing intact.txt
print("Cleaning intact.txt")
# (1) Every row must carry exactly one primary ID for each interactor.
# Rows missing the primary ID on either side are dropped (previously only
# side B was handled; a missing A would also break the "|" check below,
# which negates the result of str.contains).
for _side in ("A", "B"):
    _col = f"ID(s) interactor {_side}"
    _n_missing = int(intact[_col].isna().sum())
    if _n_missing > 0:
        print(f"\tDropping {_n_missing} rows with no ID for interactor {_side}")
        intact = intact.loc[intact[_col].notna()].reset_index(drop=True)
        print(f"\tNew dataset size: {len(intact)}")
    # Re-check after the (possible) drop so the printed status reflects the final table.
    test1 = bool(intact[_col].notna().all())
    print(f"\tEvery row has something in ID(s) interactor {_side}: {test1}")

# A "|" in the primary ID column would mean more than one primary ID.
for _side in ("A", "B"):
    test1 = (~intact[f"ID(s) interactor {_side}"].str.contains("\\|")).all()
    print(f"\tEach row contains ONE primary ID for interactor {_side}: {test1}")

Cleaning intact.txt
	Every row has something in ID(s) interactor A: True
	Dropping 321 rows with no ID for interactor B
	New dataset size: 1725065
	Every row has something in ID(s) interactor B: True
	Each row contains ONE primary ID for interactor A: True
	Each row contains ONE primary ID for interactor B: True


In [34]:
# (2) Getting sorted list of all intact IDs per interactor
# Missing cells are replaced by "" before each row is handed to extract_sorted_intact.
intact["all_intact_A_sorted"] = intact.fillna("").apply(lambda row: extract_sorted_intact(row, interactor="A"), axis=1)
intact["all_intact_B_sorted"] = intact.fillna("").apply(lambda row: extract_sorted_intact(row, interactor="B"), axis=1)

# Rows for which no IntAct ID could be extracted on a side are dropped.
for _side in ("A", "B"):
    _col = f"all_intact_{_side}_sorted"
    _n_missing = int(intact[_col].isna().sum())
    if _n_missing > 0:
        print(f"\tDropping {_n_missing} rows with no intact ID for interactor {_side}")
        intact = intact.loc[intact[_col].notna()].reset_index(drop=True)
        print(f"\tNew dataset size: {len(intact)}")
    test1 = bool(intact[_col].notna().all())
    print(f"\tEach row contains at least ONE IntAct ID (intact:EBI-...) for interactor {_side}: {test1}")

_one_id_each = (intact["all_intact_A_sorted"].str.count("intact:") == 1) & (intact["all_intact_B_sorted"].str.count("intact:") == 1)
test1 = int(_one_id_each.sum())
print(f"\tTotal rows with one IntAct ID for interactor A and B: {test1} ({100*test1/len(intact):.2f}%)")

# The message is about A OR B, so both columns are inspected (the previous
# version only looked at interactor A and under-counted).
_multiple_ids = intact["all_intact_A_sorted"].str.contains("\\|") | intact["all_intact_B_sorted"].str.contains("\\|")
test1 = int(_multiple_ids.sum())
print(f"\tTotal rows with multiple IntAct IDs for interactor A or B: {test1} ({100*test1/len(intact):.2f}%)")

  intact["all_intact_A_sorted"] = intact.fillna("").apply(lambda row: extract_sorted_intact(row,interactor="A"),axis=1)
  intact["all_intact_B_sorted"] = intact.fillna("").apply(lambda row: extract_sorted_intact(row,interactor="B"),axis=1)


	Each row contains at least ONE IntAct ID (intact:EBI-...) for interactor A: True
	Each row contains at least ONE IntAct ID (intact:EBI-...) for interactor B: True
	Total rows with one IntAct ID for interactor A and B: 1312922 (76.11%)
	Total rows with multiple IntAct IDs for interactor A or B: 213457 (12.37%)


In [35]:
# (3) Checking that each row corresponds only to one PubMed ID.
# "pubmeds" holds the deduplicated, sorted PubMed IDs joined by "|"
# (None when the field has no pubmed: entry), so a "|" means several IDs.
intact["pubmeds"] = intact["Publication Identifier(s)"].apply(
    lambda x: normalize_ids(extract_pubmed_from_source(x))
)
test1 = not intact["pubmeds"].fillna("").str.contains("|", regex=False).any()
print(f"\tEach row corresponds to at most one PubMed ID: {test1}")
test1 = int(intact["pubmeds"].isna().sum())
print(f"\tTotal rows corresponding to 0 PubMed IDs: {test1} ({100*test1/len(intact):.2f}%)")

	Each row corresponds to at most one PubMed ID: True
	Total rows corresponding to 0 PubMed IDs: 12483 (0.72%)


In [36]:
# (4) Checking that each row corresponds only to one interaction detection method:
# no "|" separator anywhere, and exactly one "psi-mi:" entry in every row.
_methods = intact["Interaction detection method(s)"]
_no_separator = len(intact.loc[_methods.str.contains("\\|")]) == 0
test1 = _no_separator and (_methods.str.count("psi-mi:") == 1).all()
print(f"\tEach row corresponds to just one interaction detection method: {test1}")

	Each row corresponds to just one interaction detection method: True


In [37]:
# (5) Checking prevalence of participant detection methods
for _side in ("A", "B"):
    _n_methods = intact[f"Identification method participant {_side}"].str.count("psi-mi:")
    test1 = int((_n_methods > 1).sum())
    print(f"\tTotal rows with >1 detection method for interactor {_side}: {test1} ({100*test1/len(intact):.2f}%)")
# Reduce each method column to its sorted, "|"-joined MI codes (None when there are none).
for _side in ("A", "B"):
    intact[f"all_idmethods_{_side}_sorted"] = (
        intact[f"Identification method participant {_side}"].fillna("").apply(extract_sorted_mis)
    )
intact["detection_method_mi"] = intact["Interaction detection method(s)"].fillna("").apply(extract_sorted_mis)
test1 = (intact["detection_method_mi"].str.count("MI:") == 1).all()
print(f"\tOnly ONE experimental detection method for the interaction per row: {test1}")

	Total rows with >1 detection method for interactor A: 6066 (0.35%)
	Total rows with >1 detection method for interactor B: 5946 (0.34%)
	Only ONE experimental detection method for the interaction per row: True


In [38]:
# (6) Get unique miscores
intact["miscore"] = intact["Confidence value(s)"].fillna("").apply(extract_sorted_miscores)
intact = intact.loc[intact["miscore"].notna()].reset_index(drop=True)
print(f"\tDropped rows with no intact-miscore. Remaining rows: {len(intact)}")

# Order-independent keys for the interactor pair and for the pair of
# participant identification methods.
intact["unique_all_intact_sorted"] = intact.apply(lambda row: get_unique_id(row, colA="all_intact_A_sorted", colB="all_intact_B_sorted"), axis=1)
intact["unique_all_idmethods_sorted"] = intact.apply(lambda row: get_unique_id(row, colA="all_idmethods_A_sorted", colB="all_idmethods_B_sorted"), axis=1)

# Missing detection methods / PubMed IDs are treated as "" when building the
# key. Without this, string concatenation with a missing value makes the whole
# key NaN, and groupby then silently leaves those rows out of the check below.
intact["unique_inter_id"] = (
    intact["unique_all_intact_sorted"]
    + "-" + intact["unique_all_idmethods_sorted"]
    + "-" + intact["detection_method_mi"].fillna("")
    + "-" + intact["pubmeds"].fillna("")
)
# miscore has no missing values at this point (filtered above), so nunique
# is the number of distinct scores per key.
_scores_per_id = intact.groupby("unique_inter_id")["miscore"].nunique()
test1 = bool((_scores_per_id == 1).all())
print(f"\tEach unique_inter_id has just one associated miscore: {test1}")

	Dropped rows with no intact-miscore. Remaining rows: 1725065
	Each unique_inter_id has just one associated miscore: True


In [39]:
# (7) Handle expansion
# Rows with no expansion method recorded are labelled "not expanded".
# isinstance (rather than type(x)==float) also recognises float subclasses
# such as numpy.float64 NaN.
intact["Expansion method(s)"] = intact["Expansion method(s)"].apply(
    lambda x: "not expanded" if x is None or (isinstance(x, float) and np.isnan(x)) else x
)
test1 = intact["Expansion method(s)"].value_counts().to_dict()
spoke_str = 'psi-mi:"MI:1060"(spoke expansion)'
# .get(..., 0): a category that does not occur is reported as 0 rows instead
# of raising KeyError.
_n_spoke = test1.get(spoke_str, 0)
_n_not_expanded = test1.get("not expanded", 0)
print(f"\tTotal rows with spoke expansion: {_n_spoke} ({100*_n_spoke/len(intact):.2f}%)")
print(f"\tTotal rows with no expansion: {_n_not_expanded} ({100*_n_not_expanded/len(intact):.2f}%)")


	Total rows with spoke expansion: 916419 (53.12%)
	Total rows with no expansion: 808646 (46.88%)


In [40]:
# (8) Look into IntAct IDs for the interaction.
# Keep only the "|"-separated entries that contain "intact:EBI-".
intact["IntAct Interaction identifier(s)"] = intact["Interaction identifier(s)"].apply(
    lambda x: "|".join(y for y in x.split("|") if "intact:EBI-" in y)
)
# Number of "intact:EBI-" occurrences per row, computed once and reused.
_n_intact_ids = intact["Interaction identifier(s)"].str.count("intact:EBI-")
test1 = (_n_intact_ids == 1).sum()
print(f"Total rows of IntAct with exactly one intact:EBI- interaction identifier for the interaction: {test1}/{len(intact)} ({100*test1/len(intact):.5f}%)")
test1 = (_n_intact_ids == 0).sum()
print(f"Total rows of IntAct with 0 intact:EBI- interaction identifiers for the interaction: {test1}/{len(intact)} ({100*test1/len(intact):.2f}%)")
test1 = (_n_intact_ids > 1).sum()
print(f"Total rows of IntAct with >1 intact:EBI- interaction identifiers for the interaction: {test1}/{len(intact)} ({100*test1/len(intact):.5f}%)")

Total rows of IntAct with exactly one intact:EBI- interaction identifier for the interaction: 1725062/1725065 (99.99983%)
Total rows of IntAct with 0 intact:EBI- interaction identifiers for the interaction: 0/1725065 (0.00%)
Total rows of IntAct with >1 intact:EBI- interaction identifiers for the interaction: 3/1725065 (0.00017%)


In [41]:
# Turn the "|"-joined identifiers into lists, then give each identifier its own row.
_id_col = "IntAct Interaction identifier(s)"
intact[_id_col] = intact[_id_col].str.split("|")  # single-character pattern => literal split
intact = intact.explode(_id_col).reset_index(drop=True)
print(f"Exploded along IntAct Interaction identifier(s). New # rows: {len(intact)}")

Exploded along IntAct Interaction identifier(s). New # rows: 1725072


# Process intact_negative.txt

In [42]:
#### Processing intact_neg.txt
print("Cleaning intact_neg.txt")
# (1) Every row must carry exactly one primary ID for each interactor.
# Rows missing the primary ID on either side are dropped (previously only
# side B was handled; a missing A would also break the "|" check below,
# which negates the result of str.contains).
for _side in ("A", "B"):
    _col = f"ID(s) interactor {_side}"
    _n_missing = int(intact_neg[_col].isna().sum())
    if _n_missing > 0:
        print(f"\tDropping {_n_missing} rows with no ID for interactor {_side}")
        intact_neg = intact_neg.loc[intact_neg[_col].notna()].reset_index(drop=True)
        print(f"\tNew dataset size: {len(intact_neg)}")
    # Re-check after the (possible) drop so the printed status reflects the final table.
    test1 = bool(intact_neg[_col].notna().all())
    print(f"\tEvery row has something in ID(s) interactor {_side}: {test1}")

# A "|" in the primary ID column would mean more than one primary ID.
for _side in ("A", "B"):
    test1 = (~intact_neg[f"ID(s) interactor {_side}"].str.contains("\\|")).all()
    print(f"\tEach row contains ONE primary ID for interactor {_side}: {test1}")

Cleaning intact_neg.txt
	Every row has something in ID(s) interactor A: True
	Every row has something in ID(s) interactor B: True
	Each row contains ONE primary ID for interactor A: True
	Each row contains ONE primary ID for interactor B: True


In [43]:
# (2) Getting sorted list of all intact IDs per interactor
# Missing cells are replaced by "" before each row is handed to extract_sorted_intact.
intact_neg["all_intact_A_sorted"] = intact_neg.fillna("").apply(lambda row: extract_sorted_intact(row, interactor="A"), axis=1)
intact_neg["all_intact_B_sorted"] = intact_neg.fillna("").apply(lambda row: extract_sorted_intact(row, interactor="B"), axis=1)

# Rows for which no IntAct ID could be extracted on a side are dropped.
for _side in ("A", "B"):
    _col = f"all_intact_{_side}_sorted"
    _n_missing = int(intact_neg[_col].isna().sum())
    if _n_missing > 0:
        print(f"\tDropping {_n_missing} rows with no intact ID for interactor {_side}")
        intact_neg = intact_neg.loc[intact_neg[_col].notna()].reset_index(drop=True)
        print(f"\tNew dataset size: {len(intact_neg)}")
    test1 = bool(intact_neg[_col].notna().all())
    print(f"\tEach row contains at least ONE IntAct ID (intact:EBI-...) for interactor {_side}: {test1}")

_one_id_each = (intact_neg["all_intact_A_sorted"].str.count("intact:") == 1) & (intact_neg["all_intact_B_sorted"].str.count("intact:") == 1)
test1 = int(_one_id_each.sum())
print(f"\tTotal rows with one IntAct ID for interactor A and B: {test1} ({100*test1/len(intact_neg):.2f}%)")

# The message is about A OR B, so both columns are inspected (the previous
# version only looked at interactor A and under-counted).
_multiple_ids = intact_neg["all_intact_A_sorted"].str.contains("\\|") | intact_neg["all_intact_B_sorted"].str.contains("\\|")
test1 = int(_multiple_ids.sum())
print(f"\tTotal rows with multiple IntAct IDs for interactor A or B: {test1} ({100*test1/len(intact_neg):.2f}%)")

	Each row contains at least ONE IntAct ID (intact:EBI-...) for interactor A: True
	Each row contains at least ONE IntAct ID (intact:EBI-...) for interactor B: True
	Total rows with one IntAct ID for interactor A and B: 733 (74.49%)
	Total rows with multiple IntAct IDs for interactor A or B: 106 (10.77%)


  intact_neg["all_intact_A_sorted"] = intact_neg.fillna("").apply(lambda row: extract_sorted_intact(row,interactor="A"),axis=1)
  intact_neg["all_intact_B_sorted"] = intact_neg.fillna("").apply(lambda row: extract_sorted_intact(row,interactor="B"),axis=1)


In [44]:
# (3) Checking that each row corresponds only to one PubMed ID.
# "pubmeds" holds the deduplicated, sorted PubMed IDs joined by "|"
# (None when the field has no pubmed: entry), so a "|" means several IDs.
intact_neg["pubmeds"] = intact_neg["Publication Identifier(s)"].apply(
    lambda x: normalize_ids(extract_pubmed_from_source(x))
)
test1 = not intact_neg["pubmeds"].fillna("").str.contains("|", regex=False).any()
print(f"\tEach row corresponds to at most one PubMed ID: {test1}")
test1 = int(intact_neg["pubmeds"].isna().sum())
print(f"\tTotal rows corresponding to 0 PubMed IDs: {test1} ({100*test1/len(intact_neg):.2f}%)")

	Each row corresponds to at most one PubMed ID: True
	Total rows corresponding to 0 PubMed IDs: 0 (0.00%)


In [45]:
# (4) Checking that each row corresponds only to one interaction detection method:
# no "|" separator anywhere, and exactly one "psi-mi:" entry in every row.
_methods = intact_neg["Interaction detection method(s)"]
_no_separator = len(intact_neg.loc[_methods.str.contains("\\|")]) == 0
test1 = _no_separator and (_methods.str.count("psi-mi:") == 1).all()
print(f"\tEach row corresponds to just one interaction detection method: {test1}")

	Each row corresponds to just one interaction detection method: True


In [46]:
# (5) Checking prevalence of participant detection methods
for _side in ("A", "B"):
    _n_methods = intact_neg[f"Identification method participant {_side}"].str.count("psi-mi:")
    test1 = int((_n_methods > 1).sum())
    print(f"\tTotal rows with >1 detection method for interactor {_side}: {test1} ({100*test1/len(intact_neg):.2f}%)")
# Reduce each method column to its sorted, "|"-joined MI codes (None when there are none).
for _side in ("A", "B"):
    intact_neg[f"all_idmethods_{_side}_sorted"] = (
        intact_neg[f"Identification method participant {_side}"].fillna("").apply(extract_sorted_mis)
    )
intact_neg["detection_method_mi"] = intact_neg["Interaction detection method(s)"].fillna("").apply(extract_sorted_mis)
test1 = (intact_neg["detection_method_mi"].str.count("MI:") == 1).all()
print(f"\tOnly ONE experimental detection method for the interaction per row: {test1}")

	Total rows with >1 detection method for interactor A: 0 (0.00%)
	Total rows with >1 detection method for interactor B: 0 (0.00%)
	Only ONE experimental detection method for the interaction per row: True


In [47]:
# (6) Get unique miscores
intact_neg["miscore"] = intact_neg["Confidence value(s)"].fillna("").apply(extract_sorted_miscores)
intact_neg = intact_neg.loc[intact_neg["miscore"].notna()].reset_index(drop=True)
print(f"\tDropped rows with no intact_neg-miscore. Remaining rows: {len(intact_neg)}")

# Order-independent keys for the interactor pair and for the pair of
# participant identification methods.
intact_neg["unique_all_intact_sorted"] = intact_neg.apply(lambda row: get_unique_id(row, colA="all_intact_A_sorted", colB="all_intact_B_sorted"), axis=1)
intact_neg["unique_all_idmethods_sorted"] = intact_neg.apply(lambda row: get_unique_id(row, colA="all_idmethods_A_sorted", colB="all_idmethods_B_sorted"), axis=1)

# Missing detection methods / PubMed IDs are treated as "" when building the
# key. Without this, string concatenation with a missing value makes the whole
# key NaN, and groupby then silently leaves those rows out of the check below.
intact_neg["unique_inter_id"] = (
    intact_neg["unique_all_intact_sorted"]
    + "-" + intact_neg["unique_all_idmethods_sorted"]
    + "-" + intact_neg["detection_method_mi"].fillna("")
    + "-" + intact_neg["pubmeds"].fillna("")
)
# miscore has no missing values at this point (filtered above), so nunique
# is the number of distinct scores per key.
_scores_per_id = intact_neg.groupby("unique_inter_id")["miscore"].nunique()
test1 = bool((_scores_per_id == 1).all())
print(f"\tEach unique_inter_id has just one associated miscore: {test1}")

	Dropped rows with no intact_neg-miscore. Remaining rows: 984
	Each unique_inter_id has just one associated miscore: True


In [48]:
# (7) Handle expansion
# Rows with no expansion method recorded are labelled "not expanded".
# isinstance (rather than type(x)==float) also recognises float subclasses
# such as numpy.float64 NaN.
intact_neg["Expansion method(s)"] = intact_neg["Expansion method(s)"].apply(
    lambda x: "not expanded" if x is None or (isinstance(x, float) and np.isnan(x)) else x
)
test1 = intact_neg["Expansion method(s)"].value_counts().to_dict()
spoke_str = 'psi-mi:"MI:1060"(spoke expansion)'
# .get(..., 0): a category that does not occur (plausible in this small
# table) is reported as 0 rows instead of raising KeyError.
_n_spoke = test1.get(spoke_str, 0)
_n_not_expanded = test1.get("not expanded", 0)
print(f"\tTotal rows with spoke expansion: {_n_spoke} ({100*_n_spoke/len(intact_neg):.2f}%)")
print(f"\tTotal rows with no expansion: {_n_not_expanded} ({100*_n_not_expanded/len(intact_neg):.2f}%)")


	Total rows with spoke expansion: 10 (1.02%)
	Total rows with no expansion: 974 (98.98%)


In [49]:
# (8) Look into IntAct IDs for the interaction.
# Keep only the "|"-separated entries that contain "intact:EBI-".
intact_neg["IntAct Interaction identifier(s)"] = intact_neg["Interaction identifier(s)"].apply(
    lambda x: "|".join(y for y in x.split("|") if "intact:EBI-" in y)
)
# Number of "intact:EBI-" occurrences per row, computed once and reused.
_n_intact_ids = intact_neg["Interaction identifier(s)"].str.count("intact:EBI-")
test1 = (_n_intact_ids == 1).sum()
print(f"Total rows of intact_neg with exactly one intact:EBI- interaction identifier for the interaction: {test1}/{len(intact_neg)} ({100*test1/len(intact_neg):.5f}%)")
test1 = (_n_intact_ids == 0).sum()
print(f"Total rows of intact_neg with 0 intact:EBI- interaction identifiers for the interaction: {test1}/{len(intact_neg)} ({100*test1/len(intact_neg):.2f}%)")
test1 = (_n_intact_ids > 1).sum()
print(f"Total rows of intact_neg with >1 intact:EBI- interaction identifiers for the interaction: {test1}/{len(intact_neg)} ({100*test1/len(intact_neg):.5f}%)")

Total rows of intact_neg with exactly one intact:EBI- interaction identifier for the interaction: 984/984 (100.00000%)
Total rows of intact_neg with 0 intact:EBI- interaction identifiers for the interaction: 0/984 (0.00%)
Total rows of intact_neg with >1 intact:EBI- interaction identifiers for the interaction: 0/984 (0.00000%)


In [50]:
# Turn the pipe-joined IntAct interaction IDs into lists so that each ID ends
# up on its own row after the explode.
interaction_id_lists = intact_neg["IntAct Interaction identifier(s)"].apply(lambda ids: ids.split("|"))
intact_neg["IntAct Interaction identifier(s)"] = interaction_id_lists
intact_neg = intact_neg.explode("IntAct Interaction identifier(s)")
intact_neg = intact_neg.reset_index(drop=True)
print(f"Exploded along IntAct Interaction identifier(s). New # rows: {len(intact_neg)}")

Exploded along IntAct Interaction identifier(s). New # rows: 984


In [51]:
# Preview the first rows of intact_neg after the cleaning steps above.
display(intact_neg.head())

Unnamed: 0,ID(s) interactor A,ID(s) interactor B,Alt. ID(s) interactor A,Alt. ID(s) interactor B,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,...,all_intact_B_sorted,pubmeds,all_idmethods_A_sorted,all_idmethods_B_sorted,detection_method_mi,miscore,unique_all_intact_sorted,unique_all_idmethods_sorted,unique_inter_id,IntAct Interaction identifier(s)
0,uniprotkb:Q9NP97,uniprotkb:O54918-3,intact:EBI-372128|uniprotkb:Q5TC72|uniprotkb:Q...,intact:EBI-526084|intact:EBI-7424890|intact:MI...,psi-mi:dlrb1_human(display_long)|uniprotkb:DYN...,psi-mi:o54918-3(display_long)|uniprotkb:Bcl2l1...,"psi-mi:""MI:0018""(two hybrid)",Puthalakath et al. (1999),pubmed:10198631|mint:MINT-5211354,taxid:9606(human)|taxid:9606(Homo sapiens),...,intact:EBI-526084|intact:EBI-7424890,10198631,MI:0056,MI:0056,MI:0018,intact-miscore:0.51,intact:EBI-372128_intact:EBI-526084|intact:EBI...,MI:0056_MI:0056,intact:EBI-372128_intact:EBI-526084|intact:EBI...,intact:EBI-526131
1,uniprotkb:O54918-3,uniprotkb:Q9NP97,intact:EBI-526084|intact:EBI-7424890|intact:MI...,intact:EBI-372128|uniprotkb:Q5TC72|uniprotkb:Q...,psi-mi:o54918-3(display_long)|uniprotkb:Bcl2l1...,psi-mi:dlrb1_human(display_long)|uniprotkb:DYN...,"psi-mi:""MI:0007""(anti tag coimmunoprecipitation)",Puthalakath et al. (1999),pubmed:10198631|mint:MINT-5211354,taxid:10090(mouse)|taxid:10090(Mus musculus),...,intact:EBI-372128,10198631,MI:0113,MI:0113,MI:0007,intact-miscore:0.51,intact:EBI-372128_intact:EBI-526084|intact:EBI...,MI:0113_MI:0113,intact:EBI-372128_intact:EBI-526084|intact:EBI...,intact:EBI-526268
2,uniprotkb:O54918-3,uniprotkb:Q9NP97,intact:EBI-526084|intact:EBI-7424890|intact:MI...,intact:EBI-372128|uniprotkb:Q5TC72|uniprotkb:Q...,psi-mi:o54918-3(display_long)|uniprotkb:Bcl2l1...,psi-mi:dlrb1_human(display_long)|uniprotkb:DYN...,"psi-mi:""MI:0007""(anti tag coimmunoprecipitation)",Puthalakath et al. (1999),pubmed:10198631|mint:MINT-5211354,taxid:10090(mouse)|taxid:10090(Mus musculus),...,intact:EBI-372128,10198631,MI:0113,MI:0113,MI:0007,intact-miscore:0.51,intact:EBI-372128_intact:EBI-526084|intact:EBI...,MI:0113_MI:0113,intact:EBI-372128_intact:EBI-526084|intact:EBI...,intact:EBI-526288
3,uniprotkb:Q13153,uniprotkb:P21127-12,intact:EBI-1307|uniprotkb:O75561|uniprotkb:Q13...,intact:EBI-1018806,psi-mi:pak1_human(display_long)|uniprotkb:PAK1...,psi-mi:p21127-12(display_long)|uniprotkb:PITSL...,"psi-mi:""MI:0007""(anti tag coimmunoprecipitation)",Chen et al. (2003),pubmed:12624090,taxid:9606(human)|taxid:9606(Homo sapiens),...,intact:EBI-1018806,12624090,MI:0072,MI:0072,MI:0007,intact-miscore:0.40,intact:EBI-1018806_intact:EBI-1307|intact:EBI-...,MI:0072_MI:0072,intact:EBI-1018806_intact:EBI-1307|intact:EBI-...,intact:EBI-604045
4,uniprotkb:Q13153,uniprotkb:P21127,intact:EBI-1307|uniprotkb:O75561|uniprotkb:Q13...,intact:EBI-1298|uniprotkb:O95265|uniprotkb:Q12...,psi-mi:pak1_human(display_long)|uniprotkb:PAK1...,psi-mi:cd11b_human(display_long)|uniprotkb:PIT...,"psi-mi:""MI:0007""(anti tag coimmunoprecipitation)",Chen et al. (2003),pubmed:12624090,taxid:9606(human)|taxid:9606(Homo sapiens),...,intact:EBI-1298,12624090,MI:0072,MI:0072,MI:0007,intact-miscore:0.40,intact:EBI-1298_intact:EBI-1307|intact:EBI-289...,MI:0072_MI:0072,intact:EBI-1298_intact:EBI-1307|intact:EBI-289...,intact:EBI-604055


# XML-scraped negatives (my_neg)

In [52]:
# Keep only the rows that have an amino-acid sequence for both interactors.
my_neg = my_neg.dropna(subset=["aa_1", "aa_2"]).reset_index(drop=True)
print(f"Size after deleting any rows where aa_1 or aa_2 is nan: {len(my_neg)}")

Size after deleting any rows where aa_1 or aa_2 is nan: 969


In [53]:
# Prepare my_neg
print(f"Cleaning my_neg (the negative data we processed directly from xml)")
# Deduplicated, sorted, pipe-joined PubMed IDs parsed from the experiments
# column (None when no ID can be extracted).
my_neg["pubmeds"] = my_neg["experiments"].apply(lambda x: normalize_ids(extract_pubmed_from_experiment(x)))
test1 = len(my_neg.loc[my_neg["pubmeds"].isna()])
# The percentage is taken relative to my_neg, the frame that was just counted.
print(f"\tTotal rows with no valid PubMed ID: {test1} ({100*test1/len(my_neg):.2f}%)")
test1 = len(my_neg.loc[
    (my_neg["intactid_1"].isna()) |
    (my_neg["intactid_2"].isna())
])==0
print(f"\tEvery row has an intact:EBI- ID for both interactor A and B: {test1}")
# Rewrite the comma-separated "EBI-..." IDs as pipe-separated "intact:EBI-..." IDs.
for id_col in ("intactid_1", "intactid_2"):
    my_neg[id_col] = my_neg[id_col].apply(lambda x: x.replace(",","|").replace("EBI-","intact:EBI-"))
# Order-independent key for the interactor pair, then one row per cross-combination key.
my_neg["unique_all_intact_sorted"] = my_neg.apply(lambda row: get_unique_id(row, colA="intactid_1", colB="intactid_2"),axis=1)
my_neg["unique_all_intact_combos"] = my_neg["unique_all_intact_sorted"].apply(lambda x: expand_cross_combinations(x))
my_neg = my_neg.explode("unique_all_intact_combos").reset_index(drop=True)

# Now, do we only have ONE sequence per intactids?
# seq_sort is an order-independent key for the pair of amino-acid sequences.
my_neg["seq_sort"] = my_neg.apply(lambda row: get_unique_id(row, colA="aa_1",colB="aa_2"), axis=1)
# Number of distinct sequence pairs seen for each intact combo ID.
gb1 = my_neg.groupby("unique_all_intact_combos").agg(unique_seqsort=("seq_sort", lambda x: len(set(x)))).reset_index()
test1 = (gb1["unique_seqsort"]>0).all()
print(f"\tAll interactions have at least one pair of sequences: {test1}")
test1 = len(gb1.loc[gb1["unique_seqsort"]==1])
print(f"\tTotal interactions with 1 associated sequence pair: {test1}")
test1 = len(gb1.loc[gb1["unique_seqsort"]>1])
print(f"\tTotal interactions with multiple associated sequence pairs: {test1}")
test1 = len(my_neg.loc[my_neg["unique_all_intact_combos"].duplicated()])
print(f"\tTotal rows that are duplicates of an intact combo ID (e.g. intact:EBI-10000824_intact:EBI-697771): {test1} ({100*test1/len(my_neg):.2f}%)")
test1 = len(my_neg.drop_duplicates("unique_all_intact_combos"))
print(f"\tNew database size if we grouped on intact combo ID: {test1} ({100*test1/len(my_neg):.2f}%)")
test1 = len(my_neg.drop_duplicates("seq_sort"))
print(f"\tNew database size if we grouped on seq_sort: {test1} ({100*test1/len(my_neg):.2f}%)")

Cleaning my_neg (the negative data we processed directly from xml)
	Total rows with no valid PubMed ID: 0 (0.00%)
	Every row has an intact:EBI- ID for both interactor A and B: True
	All interactions have at least one pair of sequences: True
	Total interactions with 1 associated sequence pair: 1814
	Total interactions with multiple associated sequence pairs: 92
	Total rows that are duplicates of an intact combo ID (e.g. intact:EBI-10000824_intact:EBI-697771): 381 (16.66%)
	New database size if we grouped on intact combo ID: 1906 (83.34%)
	New database size if we grouped on seq_sort: 916 (40.05%)


In [54]:
# Build one sorted IntAct-ID column per interactor. The helper is applied
# row-wise to a copy of my_neg in which every missing value has been replaced
# by "" (my_neg itself keeps its NaNs; only the two result columns are added).
# NOTE(review): presumably interactor="A"/"B" selects the *_1 / *_2 columns
# inside extract_sorted_intact_from_scraped_only; confirm against the helper.
my_neg["all_intact_1_sorted"] = my_neg.fillna("").apply(lambda row: extract_sorted_intact_from_scraped_only(row,interactor="A"),axis=1)
my_neg["all_intact_2_sorted"] = my_neg.fillna("").apply(lambda row: extract_sorted_intact_from_scraped_only(row,interactor="B"),axis=1)
#my_neg["unique_id"] = my_neg.apply(lambda x: get_unique_id(x, colA="all_intact_1_sorted", colB="all_intact_2_sorted"), axis=1)
# Preview the two new columns.
display(my_neg[["all_intact_1_sorted","all_intact_2_sorted"]].head())

  my_neg["all_intact_1_sorted"] = my_neg.fillna("").apply(lambda row: extract_sorted_intact_from_scraped_only(row,interactor="A"),axis=1)
  my_neg["all_intact_2_sorted"] = my_neg.fillna("").apply(lambda row: extract_sorted_intact_from_scraped_only(row,interactor="B"),axis=1)


Unnamed: 0,all_intact_1_sorted,all_intact_2_sorted
0,intact:EBI-16432404,intact:EBI-719493
1,intact:EBI-717672,intact:EBI-741515
2,intact:EBI-717672,intact:EBI-740322|intact:EBI-740343
3,intact:EBI-717672,intact:EBI-740322|intact:EBI-740343
4,intact:EBI-717672,intact:EBI-355546|intact:EBI-21352060


# Finish processing negatives

## Merge

In [55]:
###### MERGE!
print(f"Merging expansion and score information from intact into intact-micluster.")
# One list of cross-combination keys per intact_neg row, exploded so that every
# key sits on its own row next to that row's miscore and expansion method.
intact_neg["unique_all_intact_combos"] = intact_neg["unique_all_intact_sorted"].apply(expand_cross_combinations)
merge_db = (
    intact_neg[["unique_all_intact_combos","miscore","Expansion method(s)"]]
    .explode("unique_all_intact_combos")
    .reset_index(drop=True)
)

# Per key: the distinct miscores (kept as a list) and the distinct expansion
# methods (pipe-joined into one string, so several methods stay visible together).
map_db = merge_db.groupby("unique_all_intact_combos").agg(
    unique_scores=("miscore", lambda scores: list(set(scores))),
    unique_expansions=("Expansion method(s)", lambda methods: "|".join(set(methods))),
)
map_dict = map_db.to_dict()
score_lookup = map_dict["unique_scores"]
expansion_lookup = map_dict["unique_expansions"]

# Look both values up by unique_id; a unique_id missing from the map raises KeyError.
intact_clust_neg["unique_scores"] = intact_clust_neg["unique_id"].apply(lambda uid: score_lookup[uid])
intact_clust_neg["unique_expansions"] = intact_clust_neg["unique_id"].apply(lambda uid: expansion_lookup[uid])
has_multiple_scores = intact_clust_neg["unique_scores"].apply(lambda x: len(x)>1)
test1 = len(intact_clust_neg.loc[has_multiple_scores])
print(f"\tTotal rows with more than one miscore: {test1}. Exploding along these rows")
intact_clust_neg = intact_clust_neg.explode("unique_scores")
intact_clust_neg = intact_clust_neg.reset_index(drop=True)

Merging expansion and score information from intact into intact-micluster.
	Total rows with more than one miscore: 0. Exploding along these rows


In [56]:
# (1) Make sure we just have one score per row
# A row fails the check when "intact-miscore" occurs more than once in the value.
repeated_in_intact = intact_clust_neg["unique_scores"].str.count("intact-miscore")>1
test1 = not repeated_in_intact.any()
print(f"\tintact contributed one intact-miscore per row: {test1}")
repeated_in_micluster = intact_clust_neg["Confidence value(s)"].str.count("intact-miscore")>1
test1 = not repeated_in_micluster.any()
print(f"\tintact-micluster contributed one intact-miscore per row: {test1}")

	intact contributed one intact-miscore per row: True
	intact-micluster contributed one intact-miscore per row: True


In [57]:
# (2) Make sure those are the same value if you round
def _rounded_miscore(text):
    """Return the number following "intact-miscore:" in text, rounded to 2 decimals."""
    return round(float(text.split("intact-miscore:")[1]),2)

intact_clust_neg["confidence_val_int"] = intact_clust_neg["Confidence value(s)"].apply(_rounded_miscore)
intact_clust_neg["unique_score_int"] = intact_clust_neg["unique_scores"].apply(_rounded_miscore)
intact_clust_neg["equal_score_int"] = intact_clust_neg["unique_score_int"]==intact_clust_neg["confidence_val_int"]
test1 = int((~intact_clust_neg["equal_score_int"]).sum())
print(f"\tTotal rows where intact and intact-micluster.txt have different confidence scores: {test1} ({100*test1/len(intact_clust_neg):.2f}%)")
# Keep one row per unique_id, preferring a row whose two scores agree
# (equal_score_int sorted descending puts True first).
ordered = intact_clust_neg.sort_values(by=["unique_id", "equal_score_int"], ascending=[True, False])
one_per_id = ordered.drop_duplicates(subset=["unique_id"], keep="first")
intact_clust_neg = one_per_id.reset_index(drop=True)
test1 = int(intact_clust_neg["equal_score_int"].sum())
print(f"\tTotal rows where intact and intact-micluster.txt have the same confidence scores: {test1} ({100*test1/len(intact_clust_neg):.2f}%). Total rows: {len(intact_clust_neg)}")
intact_clust_neg["miscore"] = intact_clust_neg["confidence_val_int"]

	Total rows where intact and intact-micluster.txt have different confidence scores: 0 (0.00%)
	Total rows where intact and intact-micluster.txt have the same confidence scores: 931 (100.00%). Total rows: 931


In [58]:
# (3) Look at how many rows have expansion versus no expansion or some
print(f"\tInvestigating modes of expansion. Only keeping rows where at least once, this interaction was shown WITHOUT expansion")
test1 = intact_clust_neg["unique_expansions"].value_counts().to_dict()
for mode, n_rows in test1.items():
    print(f"\t\tExpansion mode = {mode}. Total rows = {n_rows} ({100*n_rows/len(intact_clust_neg):.2f}%)")
# Split on whether "not expanded" appears among the interaction's expansion modes:
# rows without it go to intact_clust_expand, rows with it stay in intact_clust_neg.
seen_unexpanded = intact_clust_neg["unique_expansions"].str.contains("not expanded")
intact_clust_expand = intact_clust_neg.loc[~seen_unexpanded].reset_index(drop=True)
intact_clust_neg = intact_clust_neg.loc[seen_unexpanded].reset_index(drop=True)
print(f"Total interaction rows remaining: {len(intact_clust_neg)}")
print(f"Unique values in intact_clust_neg expansion methods: {intact_clust_neg['unique_expansions'].unique().tolist()}")


	Investigating modes of expansion. Only keeping rows where at least once, this interaction was shown WITHOUT expansion
		Expansion mode = not expanded. Total rows = 921 (98.93%)
		Expansion mode = psi-mi:"MI:1060"(spoke expansion). Total rows = 10 (1.07%)
Total interaction rows remaining: 921
Unique values in intact_clust_neg expansion methods: ['not expanded']


In [59]:
# Inspect the my_neg rows where at least one interactor has no IntAct ID.
lacks_intact_id = my_neg["intactid_1"].isna() | my_neg["intactid_2"].isna()
temp = my_neg.loc[lacks_intact_id].reset_index(drop=True)
print(len(temp))
print(temp["process_method"].unique().tolist())
display(temp.head())
# Every column of that subset that contains at least one missing value.
na_cols = [col for col in temp.columns if temp[col].isna().any()]
print(f"Looking at subset of dataframe where one Intact ID failed to be mapped.\nOther empty columns: {','.join(na_cols)}")

0
[]


Unnamed: 0,interaction_label,interaction_mi,interaction_intactid,interaction_xml_id,experiments,year,process_method,protein_1,gene_symbol_1,mol_type_1,...,ptm_begin_2,ptm_end_2,ptm_orig_2,ptm_new_2,pubmeds,unique_all_intact_sorted,unique_all_intact_combos,seq_sort,all_intact_1_sorted,all_intact_2_sorted


Looking at subset of dataframe where one Intact ID failed to be mapped.
Other empty columns: 


In [60]:
print(f"Assembling the FINAL database")
# Stack my_neg on top of its interactor-flipped copy and drop exact duplicate rows.
both_orientations = pd.concat([my_neg,flip_interactors(my_neg)])
my_neg = both_orientations.drop_duplicates().reset_index(drop=True)
# Map each interactor back to its individual IntAct IDs, then give every
# candidate ID its own row (first for A, then for B).
my_neg["all_intact_A_sorted"] = my_neg.apply(
    lambda row: map_back_individual_intact_mypos(row,interactor="1"),
    axis=1,
)
my_neg["all_intact_B_sorted"] = my_neg.apply(
    lambda row: map_back_individual_intact_mypos(row,interactor="2"),
    axis=1,
)
for exploded_col in ("all_intact_A_sorted", "all_intact_B_sorted"):
    my_neg = my_neg.explode(exploded_col).reset_index(drop=True)
print(f"\tmy_neg size after flipping and doubling, and expanding by all possible matches for intact A and intact B: {len(my_neg)}")
# Count the rows whose interaction ID field still holds several IDs, per separator.
has_pipe = my_neg["interaction_intactid"].str.contains("\\|")
test1 = len(my_neg.loc[has_pipe])
print(f"\tin the expanded database, total rows with multiple intact IDs pipe-separated: {test1}")
has_comma = my_neg["interaction_intactid"].str.contains(",")
test1 = len(my_neg.loc[has_comma])
print(f"\tin the expanded database, total rows with multiple intact IDs comma-separated: {test1}")

Assembling the FINAL database
	my_neg size after flipping and doubling, and expanding by all possible matches for intact A and intact B: 4610
	in the expanded database, total rows with multiple intact IDs pipe-separated: 0
	in the expanded database, total rows with multiple intact IDs comma-separated: 4204


In [61]:
# need to expand intact_clust by interaction identifiers and match on those
# Split each ID string into a list: on "|" when a pipe is present, otherwise on ",".
my_neg["interaction_intactid"] = my_neg["interaction_intactid"].apply(
    lambda ids: ids.split("|" if "|" in ids else ",")
)

In [62]:
# Give every interaction ID in the list its own row, reporting the size before and after.
rows_before_explode = len(my_neg)
print(f"Before exploding on interaction_intactid, size of my_neg: {rows_before_explode}")
my_neg = my_neg.explode("interaction_intactid")
my_neg = my_neg.reset_index(drop=True)
print(f"After exploding on interaction_intactid, size of my_neg: {len(my_neg)}")

Before exploding on interaction_intactid, size of my_neg: 4610
After exploding on interaction_intactid, size of my_neg: 9462


In [63]:
# (8) Look into IntAct IDs for the interaction
# Keep only the intact:EBI- entries of the pipe-separated identifier list.
intact_clust_neg["IntAct Interaction identifier(s)"] = intact_clust_neg["Interaction identifier(s)"].apply(
    lambda x: "|".join([y for y in x.split("|") if "intact:EBI-" in y])
)
# Count the intact:EBI- occurrences per row once, then report how many rows
# have exactly one, none, or several.
ebi_per_row = intact_clust_neg["Interaction identifier(s)"].str.count("intact:EBI-")
test1 = (ebi_per_row==1).sum()
print(f"Total rows of IntAct with exactly one intact:EBI- interaction identifier for the interaction: {test1}/{len(intact_clust_neg)} ({100*test1/len(intact_clust_neg):.5f}%)")
test1 = (ebi_per_row==0).sum()
print(f"Total rows of IntAct with 0 intact:EBI- interaction identifiers for the interaction: {test1}/{len(intact_clust_neg)} ({100*test1/len(intact_clust_neg):.2f}%)")
test1 = (ebi_per_row>1).sum()
print(f"Total rows of IntAct with >1 intact:EBI- interaction identifiers for the interaction: {test1}/{len(intact_clust_neg)} ({100*test1/len(intact_clust_neg):.5f}%)")

# One row per IntAct interaction ID, with the "intact:" prefix removed.
interaction_id_lists = intact_clust_neg["IntAct Interaction identifier(s)"].apply(lambda ids: ids.split("|"))
intact_clust_neg["IntAct Interaction identifier(s)"] = interaction_id_lists
intact_clust_neg = intact_clust_neg.explode("IntAct Interaction identifier(s)").reset_index(drop=True)
intact_clust_neg["IntAct Interaction identifier(s)"] = intact_clust_neg["IntAct Interaction identifier(s)"].apply(
    lambda prefixed: prefixed.split("intact:")[1]
)
print(f"Exploded along IntAct Interaction identifier(s). New # rows: {len(intact_clust_neg)}")

Total rows of IntAct with exactly one intact:EBI- interaction identifier for the interaction: 877/921 (95.22258%)
Total rows of IntAct with 0 intact:EBI- interaction identifiers for the interaction: 0/921 (0.00%)
Total rows of IntAct with >1 intact:EBI- interaction identifiers for the interaction: 44/921 (4.77742%)
Exploded along IntAct Interaction identifier(s). New # rows: 974


In [64]:
# Left-merge the XML-scraped rows (my_neg) onto intact_clust_neg, matching on
# the interaction ID, the pair key and both individual interactor IDs.
merged_neg = pd.merge(
    intact_clust_neg.rename(columns={"IntAct Interaction identifier(s)":"interaction_intactid"}).drop(columns=["Expansion method(s)"]),
    my_neg.rename(columns={"unique_all_intact_combos":"unique_id"}),
    on=["interaction_intactid", "unique_id","all_intact_A_sorted","all_intact_B_sorted"],
    how="left"
)
print(f"\tResults of a left merge of (intact_clust_neg,my_neg): len {len(merged_neg)}")
test1 = len(merged_neg.loc[(merged_neg["aa_1"].isna()) | (merged_neg["aa_2"].isna())])
print(f"\tTotal rows where there is no associated amino acid sequence {test1} ({100*test1/len(merged_neg):.2f}%)")
# Rows that found no match in my_neg have no sequences; drop them.
merged_neg = merged_neg.loc[(merged_neg["aa_1"].notna()) & (merged_neg["aa_2"].notna())].reset_index(drop=True)
test1 = len(merged_neg.loc[(merged_neg["aa_1"].isna()) | (merged_neg["aa_2"].isna())])
print(f"\tAfter dropping rows with no AA sequence: total rows where there is no associated amino acid sequence {test1}")
# drop duplicates
merged_neg = merged_neg.drop_duplicates().reset_index(drop=True)
print(f"\tTotal rows after dropping duplicates: {len(merged_neg)}")

# Flag interactors whose annotation mentions "no-uniprot-update".
merged_neg["no_uniprot_update_A"] = merged_neg["Annotation(s) interactor A"].fillna("").str.contains("no-uniprot-update")
merged_neg["no_uniprot_update_B"] = merged_neg["Annotation(s) interactor B"].fillna("").str.contains("no-uniprot-update")

# Give every distinct sequence pair a short ID (seqpair1, seqpair2, ...) in order of first appearance.
unique_seqpairs = merged_neg["seq_sort"].unique().tolist()
seq_pair_idmap = dict(zip(unique_seqpairs, [f"seqpair{i}" for i in range(1, len(unique_seqpairs)+1)]))
merged_neg["seq_pair_id"] = merged_neg["seq_sort"].map(seq_pair_idmap)


def _strip_isoform_one(uniprot_id):
    """Remove a trailing "-1" isoform suffix; non-string values are returned unchanged.

    Only a suffix at the very end is removed, so an ID such as "P21127-12"
    is left intact instead of being mangled to "P211272".
    """
    if isinstance(uniprot_id, str):
        return re.sub(r"-1$", "", uniprot_id)
    return uniprot_id


def _strip_all_isoforms(uniprot_id):
    """Keep only the part before the first "-"; non-string values are returned unchanged."""
    if isinstance(uniprot_id, str):
        return uniprot_id.split("-")[0]
    return uniprot_id


# Add columns to help figure out cause of sequence duplication
merged_neg["unique_uniprot_pair"] = merged_neg.apply(lambda row: get_unique_id(row, colA="uniprot_A",colB="uniprot_B"),axis=1)
merged_neg["uniprot_A_noiso1"] = merged_neg["uniprot_A"].apply(_strip_isoform_one)
merged_neg["uniprot_B_noiso1"] = merged_neg["uniprot_B"].apply(_strip_isoform_one)
merged_neg["unique_uniprot_noiso1_pair"] = merged_neg.apply(lambda row: get_unique_id(row, colA="uniprot_A_noiso1",colB="uniprot_B_noiso1"),axis=1)
merged_neg["uniprot_A_noisoforms"] = merged_neg["uniprot_A"].apply(_strip_all_isoforms)
merged_neg["uniprot_B_noisoforms"] = merged_neg["uniprot_B"].apply(_strip_all_isoforms)
merged_neg["unique_uniprot_noisoforms_pair"] = merged_neg.apply(lambda row: get_unique_id(row, colA="uniprot_A_noisoforms",colB="uniprot_B_noisoforms"),axis=1)


	Results of a left merge of (intact_clust_neg,my_neg): len 976
	Total rows where there is no associated amino acid sequence 5 (0.51%)
	After dropping rows with no AA sequence: total rows where there is no associated amino acid sequence 0
	Total rows after dropping duplicates: 970


In [65]:
# Are any of the rows in merged peptides?
# Print the molecule-type breakdown for each interactor in turn.
for mol_type_col in ("mol_type_1", "mol_type_2"):
    print(merged_neg[mol_type_col].value_counts())

mol_type_1
protein    970
Name: count, dtype: int64
mol_type_2
protein    970
Name: count, dtype: int64


## UniProt ID Mapping

In [66]:
def _distinct_accessions(column):
    """Return the distinct comma-separated accessions found in a column, ignoring NaN cells."""
    joined = ",".join(column.dropna().tolist())
    return list(set(joined.split(",")))


# Distinct accessions listed for interactor 1 and for interactor 2, with a short preview of each.
l = _distinct_accessions(merged_neg["uniprotkb_1"])
print(l[0:5])

l2 = _distinct_accessions(merged_neg["uniprotkb_2"])
print(l2[0:5])

# Union of both sides, stored back in l.
l += l2
l = list(set(l))

['O15350-3', 'P67809', 'Q5KQF9', 'Q07817-1', 'A0A0S2Z333']
['Q8WWW0-2', 'P67809', 'Q96EH4', 'Q6Y5D8-1', 'Q14749']


In [67]:
# Show the comma-separated accession lists (uniprotkb_1/2) next to the single
# prefixed IDs (uniprot_A/B) for visual comparison.
display(merged_neg[["uniprotkb_1","uniprot_A","uniprotkb_2","uniprot_B"]])

Unnamed: 0,uniprotkb_1,uniprot_A,uniprotkb_2,uniprot_B
0,Q10173,uniprotkb:Q10173,Q9Y738,uniprotkb:Q9Y738
1,P35240-1,uniprotkb:P35240-1,"P0DPB3-1,Q9P0W5-1",uniprotkb:P0DPB3-1
2,A0A0S2Z6H0,uniprotkb:A0A0S2Z6H0,"Q6A162,Q6IFU5",uniprotkb:Q6A162
3,,,"Q6A162,Q6IFU5",uniprotkb:Q6A162
4,A0A0S2Z5U3,uniprotkb:A0A0S2Z5U3,"Q6A162,Q6IFU5",uniprotkb:Q6A162
...,...,...,...,...
965,"Q86UW1,Q6ZMC7",uniprotkb:Q86UW1,"P54253,Q17S02,Q9UJG2,Q9Y4J1",uniprotkb:P54253
966,O95292-2,uniprotkb:O95292-2,O95292-1,uniprotkb:O95292-1
967,Q8WXU2-2,uniprotkb:Q8WXU2-2,"O00471,B2R6C5,Q8IW24",uniprotkb:O00471
968,"Q96PU8,Q2I375,Q5MJQ1,Q969L9,Q96EJ3,Q96KA3,Q96P...",uniprotkb:Q96PU8,"Q15366,Q6PKG5,Q6IPF4,A8K7X6,F8VYL7,Q32Q82,Q68Y...",uniprotkb:Q15366


In [68]:
# write all the unique UniProtKB IDs to a file so we can do ID mapping
# Take the text after the "uniprotkb:" prefix from both interactor columns.
ids_from_A = merged_neg["uniprot_A"].dropna().str.split("uniprotkb:",expand=True)[1].unique().tolist()
ids_from_B = merged_neg["uniprot_B"].dropna().str.split("uniprotkb:",expand=True)[1].unique().tolist()
all_merged_neg_uniprots = set(ids_from_A + ids_from_B)
# get rid of isoforms and pro's for now; we won't ID-map on these
all_merged_neg_uniprots = {
    accession.split("-")[0]
    for accession in all_merged_neg_uniprots
    if type(accession) is str and accession != ""
}
feature_folder = "data_files/processed/intact/features/"
os.makedirs(feature_folder,exist_ok=True)
uniprot_list_path = os.path.join(feature_folder,"all_merged_neg_uniprots.txt")
# One accession per line, sorted.
with open(uniprot_list_path,"w") as f:
    f.write("\n".join(sorted(all_merged_neg_uniprots)))
print(f"\tWrote {len(all_merged_neg_uniprots)} unique UniProtKB IDs to {uniprot_list_path}")

	Wrote 643 unique UniProtKB IDs to data_files/processed/intact/features/all_merged_neg_uniprots.txt


In [69]:
# is what's in uniprot_A and uniprot_B at least one of the things in uniprotkb_1 and uniprotkb_2?
def _single_id_in_cluster(single_id, cluster_ids):
    """Check whether a prefixed single ID is one of a comma-separated ID list.

    single_id   -- value such as "uniprotkb:Q10173"; the accession is the text
                   after the "uniprotkb:" prefix.
    cluster_ids -- comma-separated accessions such as "Q6A162,Q6IFU5".

    Returns NaN when single_id is not a string (nothing to check), False when
    cluster_ids is not a string, otherwise True/False for exact membership.
    """
    if not isinstance(single_id, str):
        return np.nan
    if not isinstance(cluster_ids, str):
        return False
    # Take the text after the prefix; taking the text before it yields "",
    # which is contained in every string and makes the check vacuous.
    accession = single_id.split("uniprotkb:")[-1]
    return accession in [candidate.strip() for candidate in cluster_ids.split(",")]


merged_neg["uniprot_kb_clust_match_A"] = merged_neg.apply(lambda row: _single_id_in_cluster(row["uniprot_A"], row["uniprotkb_1"]), axis=1)
merged_neg["uniprot_kb_clust_match_B"] = merged_neg.apply(lambda row: _single_id_in_cluster(row["uniprot_B"], row["uniprotkb_2"]), axis=1)
display(merged_neg[[
    "uniprot_A","uniprotkb_1","uniprot_kb_clust_match_A",
    "uniprot_B","uniprotkb_2","uniprot_kb_clust_match_B",
]].head())

# A violation is a row where the check was performed (not NaN) and came back False.
# The match columns hold True/False/NaN objects, hence the explicit == False.
test1 = len(merged_neg.loc[
    (merged_neg["uniprot_kb_clust_match_A"].notna()) &
    (merged_neg["uniprot_kb_clust_match_A"] == False)
])==0
print(f"The single IDs in uniprot_A are always one of the list in unniprotkb_1: {test1}")
test1 = len(merged_neg.loc[
    (merged_neg["uniprot_kb_clust_match_B"].notna()) &
    (merged_neg["uniprot_kb_clust_match_B"] == False)
])==0
print(f"The single IDs in uniprot_B are always one of the list in unniprotkb_2: {test1}")

Unnamed: 0,uniprot_A,uniprotkb_1,uniprot_kb_clust_match_A,uniprot_B,uniprotkb_2,uniprot_kb_clust_match_B
0,uniprotkb:Q10173,Q10173,True,uniprotkb:Q9Y738,Q9Y738,True
1,uniprotkb:P35240-1,P35240-1,True,uniprotkb:P0DPB3-1,"P0DPB3-1,Q9P0W5-1",True
2,uniprotkb:A0A0S2Z6H0,A0A0S2Z6H0,True,uniprotkb:Q6A162,"Q6A162,Q6IFU5",True
3,,,,uniprotkb:Q6A162,"Q6A162,Q6IFU5",True
4,uniprotkb:A0A0S2Z5U3,A0A0S2Z5U3,True,uniprotkb:Q6A162,"Q6A162,Q6IFU5",True


The single IDs in uniprot_A are always one of the list in unniprotkb_1: True
The single IDs in uniprot_B are always one of the list in unniprotkb_2: True


In [70]:
from Bio import SeqIO

In [71]:
idmap_folder = "data_files/processed/intact/idmapping"
idmap_fasta_path = os.path.join(idmap_folder,"negatives_idmapping_2025_12_01.fasta")
idmap_tsv_path = os.path.join(idmap_folder,"negatives_idmapping_2025_12_01.tsv")

# read the fasta
# One [id, sequence, description] row per FASTA record.
fasta_rows = []
for record in SeqIO.parse(idmap_fasta_path, "fasta"):
    fasta_rows.append([record.id,"".join(record.seq),record.description])

# Read the tab-separated mapping table and
# drop columns that are all NaN
idmap_tsv_df = pd.read_csv(idmap_tsv_path,sep="\t")
idmap_tsv_df = idmap_tsv_df.dropna(axis="columns", how='all')
print(f"Length of idmap_tsv_df={len(idmap_tsv_df)}")
idmap_tsv_df.head()

Length of idmap_tsv_df=642


Unnamed: 0,From,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,Propeptide,Signal peptide,Transit peptide,Chain
0,A0A0C3SFZ9,A0A0C3SFZ9,unreviewed,A0A0C3SFZ9_HUMAN,FCH and mu domain containing endocytic adaptor...,FCHO1 hCG_2000568,Homo sapiens (Human),891,,,,
1,A0A0S2Z333,A0A0S2Z333,unreviewed,A0A0S2Z333_HUMAN,Serpin peptidase inhibitor clade G member 1 is...,SERPING1,Homo sapiens (Human),174,,"SIGNAL 1..20; /evidence=""ECO:0000256|SAM:SignalP""",,"CHAIN 21..174; /evidence=""ECO:0000256|SAM:Sign..."
2,A0A0S2Z341,A0A0S2Z341,unreviewed,A0A0S2Z341_HUMAN,Carbonic anhydrase (EC 4.2.1.1),CA9,Homo sapiens (Human),284,,"SIGNAL 1..37; /evidence=""ECO:0000256|SAM:SignalP""",,"CHAIN 38..284; /note=""Carbonic anhydrase""; /ev..."
3,A0A0S2Z368,A0A0S2Z368,unreviewed,A0A0S2Z368_HUMAN,Chloride channel 2 isoform 5,CLCN2,Homo sapiens (Human),85,,,,
4,A0A0S2Z3D2,A0A0S2Z3D2,unreviewed,A0A0S2Z3D2_HUMAN,Bcl-2-like protein 1 (Apoptosis regulator Bcl-X),BCL2L1,Homo sapiens (Human),151,,,,


In [72]:
# Indices are inclusive in these columns from what I have seen on UniProt
peptide_columns = ["Chain","Propeptide","Signal peptide","Transit peptide"]
# Columns that are entirely NaN are dropped from idmap_tsv_df when it is
# loaded, so only the feature columns still present can be processed.
present_peptide_columns = [c for c in peptide_columns if c in idmap_tsv_df.columns]

for c in present_peptide_columns:
    non_null_entries = idmap_tsv_df[c].dropna()
    if non_null_entries.empty:
        # Nothing to split in this column (and no prefix to detect).
        print(f"Investigating column {c} for prefixes to the peptide coordinates.\n\tNo entries found; skipping")
        continue
    # The first space-separated token of each entry is its feature prefix (e.g. "CHAIN").
    unique_prefixes = non_null_entries.str.split(" ",expand=True)[0].unique().tolist()
    # True when every entry starts with the same, non-empty prefix.
    test1 = len(unique_prefixes)==1 and not(unique_prefixes[0]=="")
    print(f"Investigating column {c} for prefixes to the peptide coordinates.\n\tUnique prefixes: {','.join(unique_prefixes)}. Only one: {test1}")
    pref = unique_prefixes[0]
    # find total columns with multiple
    # (str.count takes a regex, so the prefix is escaped to be matched literally)
    test1 = idmap_tsv_df.loc[(idmap_tsv_df[c].notna()) & (idmap_tsv_df[c].str.count(re.escape(pref))>1)]
    print(f"\tTotal rows with multiple {c} entries: {len(test1)}")
    # now going to split by this prefix so I can get a list
    # (one stripped string per feature; empty or missing cells become None)
    idmap_tsv_df[c] = idmap_tsv_df[c].apply(lambda x: [y.strip() for y in x.split(f"{pref} ") if len(y)>0] if (x is not None and type(x)==str and x!="") else None)

# One row per feature entry; exploding several columns in sequence yields every
# combination of their entries for a given protein.
for c in present_peptide_columns:
    idmap_tsv_df = idmap_tsv_df.explode(c).reset_index(drop=True)

Investigating column Chain for prefixes to the peptide coordinates.
	Unique prefixes: CHAIN. Only one: True
	Total rows with multiple Chain entries: 16
Investigating column Propeptide for prefixes to the peptide coordinates.
	Unique prefixes: PROPEP. Only one: True
	Total rows with multiple Propeptide entries: 1
Investigating column Signal peptide for prefixes to the peptide coordinates.
	Unique prefixes: SIGNAL. Only one: True
	Total rows with multiple Signal peptide entries: 0
Investigating column Transit peptide for prefixes to the peptide coordinates.
	Unique prefixes: TRANSIT. Only one: True
	Total rows with multiple Transit peptide entries: 0


In [73]:
idmap_fasta_df = pd.DataFrame(fasta_rows, columns=["uniprot_id_full","sequence","description"])
idmap_fasta_df[["database","uniprotkb","uniprot_gene_name"]] = idmap_fasta_df["uniprot_id_full"].str.split("|",expand=True)
idmap_fasta_df["uniprotkb_iso"] = idmap_fasta_df["uniprotkb"].apply(lambda x: x if "-" in x else f"{x}-0")
idmap_fasta_df["isoform_from_desc"] = "Isoform " +  idmap_fasta_df["description"].str.extract(r'(?i)\bisoform\s+([^\s,;:)\]]+)')[0]
idmap_fasta_df["isoform_from_desc"] = idmap_fasta_df["isoform_from_desc"].apply(lambda x: x if (type(x)==str and x!="Isoform ") else None)
idmap_fasta_df["isoform_from_uniprotkb"] = "Isoform " + idmap_fasta_df["uniprotkb_iso"].apply(lambda x: x.split("-")[1] if (x is not None and type(x)==str and "-" in x) else None)
idmap_fasta_df["isoform_from_uniprotkb"] = idmap_fasta_df["isoform_from_uniprotkb"].apply(lambda x: x if (type(x)==str and x!="Isoform ") else None)
idmap_fasta_df["canonical_uniprotkb"] = idmap_fasta_df["uniprotkb"].apply(lambda x: x.split("-")[0] if (x is not None and type(x)==str) else x)
idmap_fasta_df["uniprotkb"] = "uniprotkb:" + idmap_fasta_df["uniprotkb"]
idmap_fasta_df = idmap_fasta_df.drop(columns=["uniprot_id_full","description"])
display(idmap_fasta_df.head())

# want to group somehow and determine which isoforms have the same sequences as each other, if any 
test1 = idmap_fasta_df.groupby(["canonical_uniprotkb","sequence"]).agg(
    unique_isoforms_from_uniprotkb=("isoform_from_uniprotkb", lambda x: set(x)),
    unique_isoforms_from_desc=("isoform_from_desc", lambda x: set(x)),
).reset_index()
test1["total_isoforms_from_uniprotkb"] = test1["unique_isoforms_from_uniprotkb"].apply(lambda x: len(x) if x is not None else 0)
test1["total_isoforms_from_dec"] = test1["unique_isoforms_from_desc"].apply(lambda x: len(x) if x is not None else 0)
test2 = len(test1.loc[test1["total_isoforms_from_uniprotkb"]>1])
print(f"Total instances where two different isoforms of the same protein in UniProt have the exact same sequence: {test2}")

# Count proteins (canonical accessions) that have both an "Isoform 0" and an
# "Isoform 1" label among their ID-derived isoform labels.
test1 = idmap_fasta_df.groupby(["canonical_uniprotkb"]).agg(
    unique_isoforms_from_uniprotkb=("isoform_from_uniprotkb", lambda x: set(x)),
)
has_iso_0_and_1 = test1["unique_isoforms_from_uniprotkb"].apply(
    lambda isoforms: "Isoform 0" in isoforms and "Isoform 1" in isoforms
)
test1 = len(test1.loc[has_iso_0_and_1])
print(f"Total instances where there are both an Isoform 0 and Isoform 1 for the same protein: {test1}")

# Make sure every canonical accession has exactly one canonical ("Isoform 0") sequence.
# Counting inside a de-duplicated set of labels can never exceed 1, so that only proves
# "at least one". Instead, count the distinct Isoform 0 sequences per accession.
is_iso_0 = idmap_fasta_df["isoform_from_uniprotkb"] == "Isoform 0"
iso_0_rows = idmap_fasta_df.loc[is_iso_0, ["canonical_uniprotkb", "sequence"]].drop_duplicates()
iso_0_count = iso_0_rows["canonical_uniprotkb"].value_counts()
all_canonical_ids = set(idmap_fasta_df["canonical_uniprotkb"].dropna())
# True only if every accession has an Isoform 0 row and none has two different sequences for it.
test1 = (set(iso_0_count.index) == all_canonical_ids) and bool((iso_0_count == 1).all())
print(f"Every uniprotkb has exactly one canonical isoform, which we have named Isoform 0: {test1}")

Unnamed: 0,sequence,database,uniprotkb,uniprot_gene_name,uniprotkb_iso,isoform_from_desc,isoform_from_uniprotkb,canonical_uniprotkb
0,MSYFGEHFWGEKNHGFEVLYHSVKQGPISTKELADFIRERATIEET...,tr,uniprotkb:A0A0C3SFZ9,A0A0C3SFZ9_HUMAN,A0A0C3SFZ9-0,,Isoform 0,A0A0C3SFZ9
1,MASRLTLLTLLLLLLAGVGQLQLSHNLSLVILVPQNLKHRLEDMEQ...,tr,uniprotkb:A0A0S2Z333,A0A0S2Z333_HUMAN,A0A0S2Z333-0,Isoform 4,Isoform 0,A0A0S2Z333
2,MAPLCPSPWLPLLIPAPAPGLTVQLLLSLLLLVPVHPQRLPRMQED...,tr,uniprotkb:A0A0S2Z341,A0A0S2Z341_HUMAN,A0A0S2Z341-0,,Isoform 0,A0A0S2Z341
3,MAAWFPDGIHTDSSTYRIVPGGYAVVGAAALAGAVTHTVSTAVIVF...,tr,uniprotkb:A0A0S2Z368,A0A0S2Z368_HUMAN,A0A0S2Z368-0,Isoform 5,Isoform 0,A0A0S2Z368
4,MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME...,tr,uniprotkb:A0A0S2Z3D2,A0A0S2Z3D2_HUMAN,A0A0S2Z3D2-0,,Isoform 0,A0A0S2Z3D2


Total instances where two different isoforms of the same protein in UniProt have the exact same sequence: 0
Total instances where there are both an Isoform 0 and Isoform 1 for the same protein: 10
Every uniprotkb has exactly one canonical isoform, which we have named Isoform 0: True


In [74]:
# Make the additional sequences from the TSV.
# Build one independent frame per feature type, keeping only the rows where that
# feature column is populated.
idmap_tsv_df_chains = idmap_tsv_df.loc[idmap_tsv_df["Chain"].notna()].reset_index(drop=True).copy()
idmap_tsv_df_propeptides = idmap_tsv_df.loc[idmap_tsv_df["Propeptide"].notna()].reset_index(drop=True).copy()
idmap_tsv_df_sigpeptides = idmap_tsv_df.loc[idmap_tsv_df["Signal peptide"].notna()].reset_index(drop=True).copy()
idmap_tsv_df_transitpeptides = idmap_tsv_df.loc[idmap_tsv_df["Transit peptide"].notna()].reset_index(drop=True).copy()

# Map each canonical accession to the sequence of its "Isoform 0" FASTA row.
# If an accession had several Isoform 0 rows, the last one would win.
canonical_rows = idmap_fasta_df.loc[idmap_fasta_df["isoform_from_uniprotkb"]=="Isoform 0"]
canonical_seq_dict = dict(zip(canonical_rows["canonical_uniprotkb"],canonical_rows["sequence"]))
print(f"\tMade a mapping of UniProt IDs to their canonical sequences: {len(canonical_seq_dict)} entries")

	Made a mapping of UniProt IDs to their canonical sequences: 642 entries


In [75]:
# Process chains.
# Every chain row gets its own ID ("<Entry>-<chain id>") and the matching slice of the
# canonical sequence; the other feature columns are blanked out.
def _chain_coords(annotation):
    """Split the first ';'-separated field of a Chain annotation on '..'."""
    first_field = annotation.split(";")[0]
    return first_field.strip().split("..")


def _chain_id(annotation):
    """Return the unquoted value of the '/id=' qualifier, or None if there is none."""
    if "; /id=" not in annotation:
        return None
    after_id = annotation.split("; /id=")[1]
    return after_id.split(";")[0].strip().strip("\"")


idmap_tsv_df_chains[["Propeptide","Signal peptide","Transit peptide"]] = None
idmap_tsv_df_chains["Chain_coords_1ind"] = idmap_tsv_df_chains["Chain"].apply(_chain_coords)
idmap_tsv_df_chains["Chain_name"] = idmap_tsv_df_chains["Chain"].apply(_chain_id)

_chain_isoform_rows = idmap_tsv_df_chains.loc[idmap_tsv_df_chains["Entry"].str.contains("-")]
test1 = len(_chain_isoform_rows) == 0
print("Processing all of the chains - extracting their sequences and getting their names")
print(f"\tTotal unique chains: {len(idmap_tsv_df_chains)}")
print(f"\tEverything in the IDmap TSV is canonical (no isoform indicated by -): {test1}")

# Look up the full canonical sequence for each entry (None when the entry is unknown).
idmap_tsv_df_chains["canonical_sequence"] = idmap_tsv_df_chains["Entry"].apply(
    lambda entry: canonical_seq_dict.get(entry)
)
test1 = not idmap_tsv_df_chains["canonical_sequence"].isna().any()
print(f"\tAll uniprots could be mapped to a canonical seuqence: {test1}")
test1 = not idmap_tsv_df_chains["Chain_name"].isna().any()
print(f"\tAll chains have a name: {test1}")

idmap_tsv_df_chains["uniprotkb"] = [
    entry + "-" + chain_name
    for entry, chain_name in zip(idmap_tsv_df_chains["Entry"], idmap_tsv_df_chains["Chain_name"])
]
# Coordinates are passed as 1-indexed with an inclusive end.
idmap_tsv_df_chains["Sequence"] = [
    get_subsequence(full_seq, coords, one_indexed=True, end_inclusive=True)
    for full_seq, coords in zip(
        idmap_tsv_df_chains["canonical_sequence"], idmap_tsv_df_chains["Chain_coords_1ind"]
    )
]

_chain_seq_missing = idmap_tsv_df_chains["Sequence"].isna()
test1 = int(_chain_seq_missing.sum())
print(f"\tTotal rows that could not be mapped to a subsequence: {test1} ({100*test1/len(idmap_tsv_df_chains):.2f}%)")
test1 = int((~_chain_seq_missing).sum())
print(f"\tTotal rows that were successfully mapped to a subsequence: {test1} ({100*test1/len(idmap_tsv_df_chains):.2f}%)")
idmap_tsv_df_chains.head()

Processing all of the chains - extracting their sequences and getting their names
	Total unique chains: 585
	Everything in the IDmap TSV is canonical (no isoform indicated by -): True
	All uniprots could be mapped to a canonical seuqence: True
	All chains have a name: True
	Total rows that could not be mapped to a subsequence: 1 (0.17%)
	Total rows that were successfully mapped to a subsequence: 584 (99.83%)


Unnamed: 0,From,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,Propeptide,Signal peptide,Transit peptide,Chain,Chain_coords_1ind,Chain_name,canonical_sequence,uniprotkb,Sequence
0,A0A0S2Z333,A0A0S2Z333,unreviewed,A0A0S2Z333_HUMAN,Serpin peptidase inhibitor clade G member 1 is...,SERPING1,Homo sapiens (Human),174,,,,"21..174; /evidence=""ECO:0000256|SAM:SignalP""; ...","[21, 174]",PRO_5006608196,MASRLTLLTLLLLLLAGVGQLQLSHNLSLVILVPQNLKHRLEDMEQ...,A0A0S2Z333-PRO_5006608196,LQLSHNLSLVILVPQNLKHRLEDMEQALSPSVFKAIMEKLEMSKFQ...
1,A0A0S2Z341,A0A0S2Z341,unreviewed,A0A0S2Z341_HUMAN,Carbonic anhydrase (EC 4.2.1.1),CA9,Homo sapiens (Human),284,,,,"38..284; /note=""Carbonic anhydrase""; /evidence...","[38, 284]",PRO_5006608187,MAPLCPSPWLPLLIPAPAPGLTVQLLLSLLLLVPVHPQRLPRMQED...,A0A0S2Z341-PRO_5006608187,QRLPRMQEDSPLGGGSSGEDDPLGEEDLPSEEDSPREEDPPGEEDL...
2,A0A0S2Z3E8,A0A0S2Z3E8,unreviewed,A0A0S2Z3E8_HUMAN,Fibrinogen alpha chain,FGA,Homo sapiens (Human),436,,,,"20..436; /note=""Fibrinogen alpha chain""; /evid...","[20, 436]",PRO_5006608193,MFSMRIVCLVLSVVGTAWTADSGEGDFLAEGGGVRGPRVVERHQSA...,A0A0S2Z3E8-PRO_5006608193,ADSGEGDFLAEGGGVRGPRVVERHQSACKDSDWPFCSDEDWNYKCP...
3,A0A0S2Z3I5,A0A0S2Z3I5,unreviewed,A0A0S2Z3I5_HUMAN,Betacellulin isoform 2,BTC,Homo sapiens (Human),129,,,,"32..129; /evidence=""ECO:0000256|SAM:SignalP""; ...","[32, 129]",PRO_5006608195,MDRAARCSGASSLPLLLALALGLVILHCVVADGNSTRSPETNGLLC...,A0A0S2Z3I5-PRO_5006608195,DGNSTRSPETNGLLCGDPEENCAATTTQSKRKGHFSRCPKQYKHYC...
4,A0A0S2Z3K0,A0A0S2Z3K0,unreviewed,A0A0S2Z3K0_HUMAN,Collagen type I alpha 2 isoform 5,COL1A2,Homo sapiens (Human),486,,,,"21..486; /evidence=""ECO:0000256|SAM:SignalP""; ...","[21, 486]",PRO_5006608231,MLSFVDTRTLLLLAVTLCLATCQSLQEETVRKGPAGDRGPRGERGP...,A0A0S2Z3K0-PRO_5006608231,TCQSLQEETVRKGPAGDRGPRGERGPPGPPGRDGEDGPTGPPGPPG...


In [76]:
# Process propeptides.
# Every propeptide row gets its own ID ("<Entry>-<propeptide id>") and the matching slice
# of the canonical sequence; the other feature columns are blanked out.
# The log messages below name propeptides (they previously said "chains").
idmap_tsv_df_propeptides[["Chain","Signal peptide","Transit peptide"]] = None
# Coordinates are the first ';'-separated field, split on '..'.
idmap_tsv_df_propeptides["Propeptide_coords_1ind"] = idmap_tsv_df_propeptides["Propeptide"].apply(lambda x: x.split(";")[0].strip().split(".."))
# Name is the unquoted '/id=' qualifier; None when the annotation has no id.
idmap_tsv_df_propeptides["Propeptide_name"] = idmap_tsv_df_propeptides["Propeptide"].apply(lambda x: x.split("; /id=")[1].split(";")[0].strip().strip("\"") if "; /id=" in x else None)
test1 = len(idmap_tsv_df_propeptides.loc[idmap_tsv_df_propeptides["Entry"].str.contains("-")])==0
print("Processing all of the propeptides - extracting their sequences and getting their names")
print(f"\tTotal propeptide rows: {len(idmap_tsv_df_propeptides)}")
print(f"\tEverything in the IDmap TSV is canonical (no isoform indicated by -): {test1}")
idmap_tsv_df_propeptides["canonical_sequence"] = idmap_tsv_df_propeptides["Entry"].apply(lambda x: canonical_seq_dict.get(x))
test1 = len(idmap_tsv_df_propeptides.loc[idmap_tsv_df_propeptides["canonical_sequence"].isna()])==0
print(f"\tAll uniprots could be mapped to a canonical seuqence: {test1}")
test1 = len(idmap_tsv_df_propeptides.loc[idmap_tsv_df_propeptides["Propeptide_name"].isna()])==0
print(f"\tAll propeptides have a name: {test1}")
idmap_tsv_df_propeptides["uniprotkb"] = idmap_tsv_df_propeptides.apply(lambda row: row["Entry"] + "-" + row["Propeptide_name"], axis=1)
# Coordinates are passed as 1-indexed with an inclusive end.
idmap_tsv_df_propeptides["Sequence"] = idmap_tsv_df_propeptides.apply(lambda row: get_subsequence(row["canonical_sequence"],row["Propeptide_coords_1ind"], one_indexed=True, end_inclusive=True), axis=1)
propeptide_seq_missing = idmap_tsv_df_propeptides["Sequence"].isna()
test1 = int(propeptide_seq_missing.sum())
print(f"\tTotal rows that could not be mapped to a subsequence: {test1} ({100*test1/len(idmap_tsv_df_propeptides):.2f}%)")
test1 = int((~propeptide_seq_missing).sum())
print(f"\tTotal rows that were successfully mapped to a subsequence: {test1} ({100*test1/len(idmap_tsv_df_propeptides):.2f}%)")
idmap_tsv_df_propeptides.head()

Processing all of the chains - extracting their sequences and getting their names
	Total unique chains: 18
	Everything in the IDmap TSV is canonical (no isoform indicated by -): True
	All uniprots could be mapped to a canonical seuqence: True
	All chains have a name: True
	Total rows that could not be mapped to a subsequence: 0 (0.00%)
	Total rows that were successfully mapped to a subsequence: 18 (100.00%)


Unnamed: 0,From,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,Propeptide,Signal peptide,Transit peptide,Chain,Propeptide_coords_1ind,Propeptide_name,canonical_sequence,uniprotkb,Sequence
0,O43464,O43464,reviewed,HTRA2_HUMAN,"Serine protease HTRA2, mitochondrial (EC 3.4.2...",HTRA2 OMI PRSS25,Homo sapiens (Human),458,"32..133; /evidence=""ECO:0000269|PubMed:1158362...",,,,"[32, 133]",PRO_0000026945,MAAPRAGRGAGWSLRAWRALGGIRWGRRPRLTPDLRALLTSGTSDP...,O43464-PRO_0000026945,TPDLRALLTSGTSDPRARVTYGTPSLWARLSVGVTEPRACLTSGTP...
1,P01112,P01112,reviewed,RASH_HUMAN,GTPase HRas (EC 3.6.5.2) (H-Ras-1) (Ha-Ras) (T...,HRAS HRAS1,Homo sapiens (Human),189,"187..189; /note=""Removed in mature form""; /id=...",,,,"[187, 189]",PRO_0000042997,MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...,P01112-PRO_0000042997,VLS
2,P01112,P01112,reviewed,RASH_HUMAN,GTPase HRas (EC 3.6.5.2) (H-Ras-1) (Ha-Ras) (T...,HRAS HRAS1,Homo sapiens (Human),189,"187..189; /note=""Removed in mature form""; /id=...",,,,"[187, 189]",PRO_0000042997,MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...,P01112-PRO_0000042997,VLS
3,P04156,P04156,reviewed,PRIO_HUMAN,Major prion protein (PrP) (ASCR) (PrP27-30) (P...,PRNP ALTPRP PRIP PRP,Homo sapiens (Human),253,"231..253; /note=""Removed in mature form""; /evi...",,,,"[231, 253]",PRO_0000025676,MANLGCWMLVLFVATWSDLGLCKKRPKPGGWNTGGSRYPGQGSPGG...,P04156-PRO_0000025676,SMVLFSSPPVILLISFLIFLIVG
4,P42575,P42575,reviewed,CASP2_HUMAN,Caspase-2 (CASP-2) (EC 3.4.22.55) (Neural prec...,CASP2 ICH1 NEDD2,Homo sapiens (Human),452,"2..169; /id=""PRO_0000004541"";",,,,"[2, 169]",PRO_0000004541,MAAPSAGSWSTFQHKELMAADRGRRILGVCGMHPHHQETLKKNRVV...,P42575-PRO_0000004541,AAPSAGSWSTFQHKELMAADRGRRILGVCGMHPHHQETLKKNRVVL...


In [77]:
# Process signal peptides.
# Every signal-peptide row gets its own ID ("<Entry>-<name>") and the matching slice of
# the canonical sequence; the other feature columns are blanked out.
# The log messages below name signal peptides (they previously said "chains").
idmap_tsv_df_sigpeptides[["Chain","Propeptide","Transit peptide"]] = None
# Coordinates are the first ';'-separated field, split on '..'.
idmap_tsv_df_sigpeptides["Sigpeptide_coords_1ind"] = idmap_tsv_df_sigpeptides["Signal peptide"].apply(lambda x: x.split(";")[0].strip().split(".."))
# Use the '/id=' qualifier when present; otherwise make up a name "sigpep<coords>".
idmap_tsv_df_sigpeptides["Sigpeptide_name"] = idmap_tsv_df_sigpeptides["Signal peptide"].apply(
    lambda x: x.split("; /id=")[1].split(";")[0].strip().strip("\"") if "; /id=" in x else "sigpep"+x.split(";")[0].strip()
)
test1 = len(idmap_tsv_df_sigpeptides.loc[idmap_tsv_df_sigpeptides["Entry"].str.contains("-")])==0
print("Processing all of the signal peptides - extracting their sequences and getting their names")
print(f"\tTotal signal peptide rows: {len(idmap_tsv_df_sigpeptides)}")
print(f"\tEverything in the IDmap TSV is canonical (no isoform indicated by -): {test1}")
idmap_tsv_df_sigpeptides["canonical_sequence"] = idmap_tsv_df_sigpeptides["Entry"].apply(lambda x: canonical_seq_dict.get(x))
test1 = len(idmap_tsv_df_sigpeptides.loc[idmap_tsv_df_sigpeptides["canonical_sequence"].isna()])==0
print(f"\tAll uniprots could be mapped to a canonical seuqence: {test1}")
test1 = len(idmap_tsv_df_sigpeptides.loc[idmap_tsv_df_sigpeptides["Sigpeptide_name"].isna()])==0
print(f"\tAll signal peptides have a name: {test1}")
test1 = len(idmap_tsv_df_sigpeptides.loc[idmap_tsv_df_sigpeptides["Sigpeptide_name"].str.contains("sigpep")])
print(f"\t\tTotal signal peptides with made-up name by me: {test1} ({100*test1/len(idmap_tsv_df_sigpeptides):.2f}%)")
test1 = len(idmap_tsv_df_sigpeptides.loc[idmap_tsv_df_sigpeptides["Sigpeptide_name"].str.contains("PRO_")])
print(f"\t\tTotal signal peptides with name from UniProt: {test1} ({100*test1/len(idmap_tsv_df_sigpeptides):.2f}%)")
idmap_tsv_df_sigpeptides["uniprotkb"] = idmap_tsv_df_sigpeptides.apply(lambda row: row["Entry"] + "-" + row["Sigpeptide_name"], axis=1)
# Coordinates are passed as 1-indexed with an inclusive end.
idmap_tsv_df_sigpeptides["Sequence"] = idmap_tsv_df_sigpeptides.apply(lambda row: get_subsequence(row["canonical_sequence"],row["Sigpeptide_coords_1ind"], one_indexed=True, end_inclusive=True), axis=1)
sigpeptide_seq_missing = idmap_tsv_df_sigpeptides["Sequence"].isna()
test1 = int(sigpeptide_seq_missing.sum())
print(f"\tTotal rows that could not be mapped to a subsequence: {test1} ({100*test1/len(idmap_tsv_df_sigpeptides):.2f}%)")
test1 = int((~sigpeptide_seq_missing).sum())
print(f"\tTotal rows that were successfully mapped to a subsequence: {test1} ({100*test1/len(idmap_tsv_df_sigpeptides):.2f}%)")
idmap_tsv_df_sigpeptides.head()

Processing all of the chains - extracting their sequences and getting their names
	Total unique chains: 53
	Everything in the IDmap TSV is canonical (no isoform indicated by -): True
	All uniprots could be mapped to a canonical seuqence: True
	All chains have a name: True
		Total chains with made-up name by me: 53 (100.00%)
		Total chains with name from UniProt: 0 (0.00%)
	Total rows that could not be mapped to a subsequence: 0 (0.00%)
	Total rows that were successfully mapped to a subsequence: 53 (100.00%)


Unnamed: 0,From,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,Propeptide,Signal peptide,Transit peptide,Chain,Sigpeptide_coords_1ind,Sigpeptide_name,canonical_sequence,uniprotkb,Sequence
0,A0A0S2Z333,A0A0S2Z333,unreviewed,A0A0S2Z333_HUMAN,Serpin peptidase inhibitor clade G member 1 is...,SERPING1,Homo sapiens (Human),174,,"1..20; /evidence=""ECO:0000256|SAM:SignalP""",,,"[1, 20]",sigpep1..20,MASRLTLLTLLLLLLAGVGQLQLSHNLSLVILVPQNLKHRLEDMEQ...,A0A0S2Z333-sigpep1..20,MASRLTLLTLLLLLLAGVGQ
1,A0A0S2Z341,A0A0S2Z341,unreviewed,A0A0S2Z341_HUMAN,Carbonic anhydrase (EC 4.2.1.1),CA9,Homo sapiens (Human),284,,"1..37; /evidence=""ECO:0000256|SAM:SignalP""",,,"[1, 37]",sigpep1..37,MAPLCPSPWLPLLIPAPAPGLTVQLLLSLLLLVPVHPQRLPRMQED...,A0A0S2Z341-sigpep1..37,MAPLCPSPWLPLLIPAPAPGLTVQLLLSLLLLVPVHP
2,A0A0S2Z3E8,A0A0S2Z3E8,unreviewed,A0A0S2Z3E8_HUMAN,Fibrinogen alpha chain,FGA,Homo sapiens (Human),436,,"1..19; /evidence=""ECO:0000256|SAM:SignalP""",,,"[1, 19]",sigpep1..19,MFSMRIVCLVLSVVGTAWTADSGEGDFLAEGGGVRGPRVVERHQSA...,A0A0S2Z3E8-sigpep1..19,MFSMRIVCLVLSVVGTAWT
3,A0A0S2Z3I5,A0A0S2Z3I5,unreviewed,A0A0S2Z3I5_HUMAN,Betacellulin isoform 2,BTC,Homo sapiens (Human),129,,"1..31; /evidence=""ECO:0000256|SAM:SignalP""",,,"[1, 31]",sigpep1..31,MDRAARCSGASSLPLLLALALGLVILHCVVADGNSTRSPETNGLLC...,A0A0S2Z3I5-sigpep1..31,MDRAARCSGASSLPLLLALALGLVILHCVVA
4,A0A0S2Z3K0,A0A0S2Z3K0,unreviewed,A0A0S2Z3K0_HUMAN,Collagen type I alpha 2 isoform 5,COL1A2,Homo sapiens (Human),486,,"1..20; /evidence=""ECO:0000256|SAM:SignalP""",,,"[1, 20]",sigpep1..20,MLSFVDTRTLLLLAVTLCLATCQSLQEETVRKGPAGDRGPRGERGP...,A0A0S2Z3K0-sigpep1..20,MLSFVDTRTLLLLAVTLCLA


In [78]:
# Process transit peptides.
# Every transit-peptide row gets its own ID ("<Entry>-<name>") and the matching slice of
# the canonical sequence; the other feature columns are blanked out.
# The log messages below name transit peptides (they previously said "chains").
idmap_tsv_df_transitpeptides[["Chain","Signal peptide","Propeptide"]] = None
# Coordinates are the first ';'-separated field, split on '..'.
idmap_tsv_df_transitpeptides["Transpeptide_coords_1ind"] = idmap_tsv_df_transitpeptides["Transit peptide"].apply(lambda x: x.split(";")[0].strip().split(".."))
# Use the '/id=' qualifier when present; otherwise make up a name "transpep<coords>".
idmap_tsv_df_transitpeptides["Transpeptide_name"] = idmap_tsv_df_transitpeptides["Transit peptide"].apply(
    lambda x: x.split("; /id=")[1].split(";")[0].strip().strip("\"") if "; /id=" in x else "transpep"+x.split(";")[0].strip()
)
test1 = len(idmap_tsv_df_transitpeptides.loc[idmap_tsv_df_transitpeptides["Entry"].str.contains("-")])==0
print("Processing all of the transit peptides - extracting their sequences and getting their names")
print(f"\tTotal transit peptide rows: {len(idmap_tsv_df_transitpeptides)}")
print(f"\tEverything in the IDmap TSV is canonical (no isoform indicated by -): {test1}")
idmap_tsv_df_transitpeptides["canonical_sequence"] = idmap_tsv_df_transitpeptides["Entry"].apply(lambda x: canonical_seq_dict.get(x))
test1 = len(idmap_tsv_df_transitpeptides.loc[idmap_tsv_df_transitpeptides["canonical_sequence"].isna()])==0
print(f"\tAll uniprots could be mapped to a canonical seuqence: {test1}")
test1 = len(idmap_tsv_df_transitpeptides.loc[idmap_tsv_df_transitpeptides["Transpeptide_name"].isna()])==0
print(f"\tAll transit peptides have a name: {test1}")
test1 = len(idmap_tsv_df_transitpeptides.loc[idmap_tsv_df_transitpeptides["Transpeptide_name"].str.contains("transpep")])
print(f"\t\tTotal transit peptides with made-up name by me: {test1} ({100*test1/len(idmap_tsv_df_transitpeptides):.2f}%)")
test1 = len(idmap_tsv_df_transitpeptides.loc[idmap_tsv_df_transitpeptides["Transpeptide_name"].str.contains("PRO_")])
print(f"\t\tTotal transit peptides with name from UniProt: {test1} ({100*test1/len(idmap_tsv_df_transitpeptides):.2f}%)")
idmap_tsv_df_transitpeptides["uniprotkb"] = idmap_tsv_df_transitpeptides.apply(lambda row: row["Entry"] + "-" + row["Transpeptide_name"], axis=1)
# Coordinates are passed as 1-indexed with an inclusive end.
idmap_tsv_df_transitpeptides["Sequence"] = idmap_tsv_df_transitpeptides.apply(lambda row: get_subsequence(row["canonical_sequence"],row["Transpeptide_coords_1ind"], one_indexed=True, end_inclusive=True), axis=1)
transpeptide_seq_missing = idmap_tsv_df_transitpeptides["Sequence"].isna()
test1 = int(transpeptide_seq_missing.sum())
print(f"\tTotal rows that could not be mapped to a subsequence: {test1} ({100*test1/len(idmap_tsv_df_transitpeptides):.2f}%)")
test1 = int((~transpeptide_seq_missing).sum())
print(f"\tTotal rows that were successfully mapped to a subsequence: {test1} ({100*test1/len(idmap_tsv_df_transitpeptides):.2f}%)")
idmap_tsv_df_transitpeptides.head()

Processing all of the chains - extracting their sequences and getting their names
	Total unique chains: 9
	Everything in the IDmap TSV is canonical (no isoform indicated by -): True
	All uniprots could be mapped to a canonical seuqence: True
	All chains have a name: True
		Total chains with made-up name by me: 9 (100.00%)
		Total chains with name from UniProt: 0 (0.00%)
	Total rows that could not be mapped to a subsequence: 0 (0.00%)
	Total rows that were successfully mapped to a subsequence: 9 (100.00%)


Unnamed: 0,From,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,Propeptide,Signal peptide,Transit peptide,Chain,Transpeptide_coords_1ind,Transpeptide_name,canonical_sequence,uniprotkb,Sequence
0,O43464,O43464,reviewed,HTRA2_HUMAN,"Serine protease HTRA2, mitochondrial (EC 3.4.2...",HTRA2 OMI PRSS25,Homo sapiens (Human),458,,,"1..31; /note=""Mitochondrion""",,"[1, 31]",transpep1..31,MAAPRAGRGAGWSLRAWRALGGIRWGRRPRLTPDLRALLTSGTSDP...,O43464-transpep1..31,MAAPRAGRGAGWSLRAWRALGGIRWGRRPRL
1,P07954,P07954,reviewed,FUMH_HUMAN,"Fumarate hydratase, mitochondrial (Fumarase) (...",FH,Homo sapiens (Human),510,,,"1..44; /note=""Mitochondrion""; /evidence=""ECO:0...",,"[1, 44]",transpep1..44,MYRALRLLARSRPLVRAPAAALASAPGLGGAAVPSFWPPNAARMAS...,P07954-transpep1..44,MYRALRLLARSRPLVRAPAAALASAPGLGGAAVPSFWPPNAARM
2,P26440,P26440,reviewed,IVD_HUMAN,"Isovaleryl-CoA dehydrogenase, mitochondrial (I...",IVD,Homo sapiens (Human),426,,,"1..32; /note=""Mitochondrion""; /evidence=""ECO:0...",,"[1, 32]",transpep1..32,MAEMATATRLLGWRVASWRLRPPLAGFVSQRAHSLLPVDDAINGLS...,P26440-transpep1..32,MAEMATATRLLGWRVASWRLRPPLAGFVSQRA
3,Q16854,Q16854,reviewed,DGUOK_HUMAN,"Deoxyguanosine kinase, mitochondrial (EC 2.7.1...",DGUOK DGK,Homo sapiens (Human),277,,,"1..39; /note=""Mitochondrion""; /evidence=""ECO:0...",,"[1, 39]",transpep1..39,MAAGRLFLSRLRAPFSSMAKSPLEGVSSSRGLHAGRGPRRLSIEGN...,Q16854-transpep1..39,MAAGRLFLSRLRAPFSSMAKSPLEGVSSSRGLHAGRGPR
4,Q8IYU8,Q8IYU8,reviewed,MICU2_HUMAN,"Calcium uptake protein 2, mitochondrial (hMICU...",MICU2 EFHA1,Homo sapiens (Human),434,,,"1..22; /note=""Mitochondrion""; /evidence=""ECO:0...",,"[1, 22]",transpep1..22,MAAAAGSCARVAAWGGKLRRGLAVSRQAVRSPGPLAAAVAGAALAG...,Q8IYU8-transpep1..22,MAAAAGSCARVAAWGGKLRRGL


In [79]:
# Start from the entries that carry none of the four feature annotations; they keep
# their Entry as ID and take the full canonical sequence.
_feature_columns = ["Chain", "Propeptide", "Signal peptide", "Transit peptide"]
_has_no_feature = idmap_tsv_df[_feature_columns].isna().all(axis=1)
idmap_tsv_df_combined = idmap_tsv_df.loc[_has_no_feature].reset_index(drop=True).copy()
idmap_tsv_df_combined["uniprotkb"] = idmap_tsv_df_combined["Entry"]
idmap_tsv_df_combined["Sequence"] = idmap_tsv_df_combined["Entry"].apply(
    lambda entry: canonical_seq_dict.get(entry)
)

# Append the chain and propeptide rows, then drop the per-feature helper columns.
# NOTE(review): the signal-peptide and transit-peptide frames are not appended here;
# presumably intentional, confirm.
_helper_columns = [
    "Chain_coords_1ind","Chain_name",
    "Propeptide_coords_1ind","Propeptide_name",
    "canonical_sequence",
]
_stacked = pd.concat([idmap_tsv_df_combined, idmap_tsv_df_chains, idmap_tsv_df_propeptides])
idmap_tsv_df_combined = _stacked.reset_index(drop=True).drop(columns=_helper_columns)
idmap_tsv_df_combined

Unnamed: 0,From,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,Propeptide,Signal peptide,Transit peptide,Chain,uniprotkb,Sequence
0,A0A0C3SFZ9,A0A0C3SFZ9,unreviewed,A0A0C3SFZ9_HUMAN,FCH and mu domain containing endocytic adaptor...,FCHO1 hCG_2000568,Homo sapiens (Human),891,,,,,A0A0C3SFZ9,MSYFGEHFWGEKNHGFEVLYHSVKQGPISTKELADFIRERATIEET...
1,A0A0S2Z368,A0A0S2Z368,unreviewed,A0A0S2Z368_HUMAN,Chloride channel 2 isoform 5,CLCN2,Homo sapiens (Human),85,,,,,A0A0S2Z368,MAAWFPDGIHTDSSTYRIVPGGYAVVGAAALAGAVTHTVSTAVIVF...
2,A0A0S2Z3D2,A0A0S2Z3D2,unreviewed,A0A0S2Z3D2_HUMAN,Bcl-2-like protein 1 (Apoptosis regulator Bcl-X),BCL2L1,Homo sapiens (Human),151,,,,,A0A0S2Z3D2,MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME...
3,A0A0S2Z3F5,A0A0S2Z3F5,unreviewed,A0A0S2Z3F5_HUMAN,BCL2-associated athanogene isoform 2 (BCL2-ass...,BAG1 hCG_30265,Homo sapiens (Human),204,,,,,A0A0S2Z3F5,MKKKTRRRSTRSEELTRSEELTLSEEATWSEEATQSEEATQGEEMN...
4,A0A0S2Z3G1,A0A0S2Z3G1,unreviewed,A0A0S2Z3G1_HUMAN,Actinin alpha 4 isoform 4,ACTN4,Homo sapiens (Human),75,,,,,A0A0S2Z3G1,MVDYHAANQSYQYGPSSAGNGAGGGGSMGDYMPSWRSRASSSPTST...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
683,Q9BZS1,Q9BZS1,reviewed,FOXP3_HUMAN,Forkhead box protein P3 (Scurfin) [Cleaved int...,FOXP3 IPEX JM2,Homo sapiens (Human),431,"418..431; /evidence=""ECO:0000305|PubMed:191178...",,,,Q9BZS1-PRO_0000432432,SQRPSRCSNPTPGP
684,Q9BZS1,Q9BZS1,reviewed,FOXP3_HUMAN,Forkhead box protein P3 (Scurfin) [Cleaved int...,FOXP3 IPEX JM2,Homo sapiens (Human),431,"418..431; /evidence=""ECO:0000305|PubMed:191178...",,,,Q9BZS1-PRO_0000432432,SQRPSRCSNPTPGP
685,Q9BZS1,Q9BZS1,reviewed,FOXP3_HUMAN,Forkhead box protein P3 (Scurfin) [Cleaved int...,FOXP3 IPEX JM2,Homo sapiens (Human),431,"418..431; /evidence=""ECO:0000305|PubMed:191178...",,,,Q9BZS1-PRO_0000432432,SQRPSRCSNPTPGP
686,Q9P0K1,Q9P0K1,reviewed,ADA22_HUMAN,Disintegrin and metalloproteinase domain-conta...,ADAM22 MDC2,Homo sapiens (Human),906,"26..222; /evidence=""ECO:0000250""; /id=""PRO_000...",,,,Q9P0K1-PRO_0000029112,GQAGDASLMELEKRKENRFVERQSIVPLRLIYRSGGEDESRHDALD...


In [80]:
# Reshape the combined TSV frame so its columns line up with the FASTA frame for merging.
# drop() already returns a new frame, so idmap_tsv_df_combined itself is left untouched.
idmap_tsv_df_combined_for_merge = idmap_tsv_df_combined.drop(columns=[
    "From","Protein names","Gene Names","Organism","Length","Chain","Propeptide","Signal peptide","Transit peptide"
])
idmap_tsv_df_combined_for_merge = idmap_tsv_df_combined_for_merge.rename(
    columns = {
        "Entry": "uniprotkb",
        "uniprotkb": "uniprotkb_iso",
        "Reviewed": "database",
        "Entry Name": "uniprot_gene_name",
        "Sequence": "sequence"
    }
)
# IDs without a "-" suffix are the canonical entry: tag them as isoform 0.
idmap_tsv_df_combined_for_merge["uniprotkb_iso"] = idmap_tsv_df_combined_for_merge["uniprotkb_iso"].apply(lambda x: f"{x}-0" if "-" not in x else x)
# Check the actual category values instead of only counting how many categories exist.
# Counting would report False if every row were reviewed, and True for any two labels.
test1 = set(idmap_tsv_df_combined_for_merge["database"].unique()) <= {"reviewed", "unreviewed"}
print(f"\tEverything is either reviewed or unreviewed (no other categories) in idmap tsv: {test1}")
# "reviewed" becomes "sp"; every other value becomes "tr".
idmap_tsv_df_combined_for_merge["database"] = idmap_tsv_df_combined_for_merge["database"].apply(
    lambda x: "sp" if x=="reviewed" else "tr")
idmap_tsv_df_combined_for_merge["uniprotkb"] = "uniprotkb:" + idmap_tsv_df_combined_for_merge["uniprotkb"]
idmap_tsv_df_combined_for_merge

	Everything is either reviewed or unreviewed (no other categories) in idmap tsv: True


Unnamed: 0,uniprotkb,database,uniprot_gene_name,uniprotkb_iso,sequence
0,uniprotkb:A0A0C3SFZ9,tr,A0A0C3SFZ9_HUMAN,A0A0C3SFZ9-0,MSYFGEHFWGEKNHGFEVLYHSVKQGPISTKELADFIRERATIEET...
1,uniprotkb:A0A0S2Z368,tr,A0A0S2Z368_HUMAN,A0A0S2Z368-0,MAAWFPDGIHTDSSTYRIVPGGYAVVGAAALAGAVTHTVSTAVIVF...
2,uniprotkb:A0A0S2Z3D2,tr,A0A0S2Z3D2_HUMAN,A0A0S2Z3D2-0,MSQSNRELVVDFLSYKLSQKGYSWSQFSDVEENRTEAPEGTESEME...
3,uniprotkb:A0A0S2Z3F5,tr,A0A0S2Z3F5_HUMAN,A0A0S2Z3F5-0,MKKKTRRRSTRSEELTRSEELTLSEEATWSEEATQSEEATQGEEMN...
4,uniprotkb:A0A0S2Z3G1,tr,A0A0S2Z3G1_HUMAN,A0A0S2Z3G1-0,MVDYHAANQSYQYGPSSAGNGAGGGGSMGDYMPSWRSRASSSPTST...
...,...,...,...,...,...
683,uniprotkb:Q9BZS1,sp,FOXP3_HUMAN,Q9BZS1-PRO_0000432432,SQRPSRCSNPTPGP
684,uniprotkb:Q9BZS1,sp,FOXP3_HUMAN,Q9BZS1-PRO_0000432432,SQRPSRCSNPTPGP
685,uniprotkb:Q9BZS1,sp,FOXP3_HUMAN,Q9BZS1-PRO_0000432432,SQRPSRCSNPTPGP
686,uniprotkb:Q9P0K1,sp,ADA22_HUMAN,Q9P0K1-PRO_0000029112,GQAGDASLMELEKRKENRFVERQSIVPLRLIYRSGGEDESRHDALD...


In [81]:
# Row counts going into the merge: FASTA rows, TSV rows that are chains or propeptides,
# and their sum. The mask comes from idmap_tsv_df_combined and is applied to
# idmap_tsv_df_combined_for_merge by index label.
_is_chain_or_propeptide = (
    idmap_tsv_df_combined["Chain"].notna() | idmap_tsv_df_combined["Propeptide"].notna()
)
_n_fasta_rows = len(idmap_fasta_df)
_n_feature_rows = len(idmap_tsv_df_combined_for_merge.loc[_is_chain_or_propeptide])
print(_n_fasta_rows)
print(_n_feature_rows)
print(_n_fasta_rows + _n_feature_rows)

1626
603
2229


In [82]:
# make a species map
idmap_species_dict = idmap_tsv_df_combined[["Entry Name","Organism"]]
idmap_species_dict["Entry Name"] = idmap_species_dict["Entry Name"].apply(lambda x: x.split("_")[1] if "_" in x else None)
idmap_species_dict = idmap_species_dict.dropna().drop_duplicates().reset_index(drop=True)
idmap_species_dict = dict(zip(idmap_species_dict["Entry Name"],idmap_species_dict["Organism"]))
print(f"Total unique species: {len(idmap_species_dict)}")

Total unique species: 13


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  idmap_species_dict["Entry Name"] = idmap_species_dict["Entry Name"].apply(lambda x: x.split("_")[1] if "_" in x else None)


In [83]:
# Outer-merge the FASTA-derived rows with the TSV-derived rows (full entries, chains,
# propeptides) on every column they share.
idmap_merge = pd.merge(
    idmap_fasta_df,
    idmap_tsv_df_combined_for_merge,
    on=["uniprotkb","database","uniprot_gene_name","uniprotkb_iso","sequence"],
    how="outer"
)
idmap_merge = idmap_merge.drop(columns=["uniprotkb"])
idmap_merge = idmap_merge.rename(columns={"uniprotkb_iso":"uniprotkb_full"})
# Normalise missing or empty description-derived isoform labels to None.
idmap_merge["isoform_from_desc"] = idmap_merge["isoform_from_desc"].apply(lambda x: x if (type(x)==str and x!="Isoform ") else None)
# Piece between the first and second "-" of the full ID; guarded against non-string
# IDs in the same way as the canonical-accession line below.
idmap_merge["isoform_or_chain_from_uniprotkb"] = idmap_merge["uniprotkb_full"].apply(lambda x: x.split("-")[1] if (type(x)==str and "-" in x) else None)
idmap_merge["canonical_uniprotkb"] = idmap_merge["uniprotkb_full"].apply(lambda x: x.split("-")[0] if (x is not None and type(x)==str) else x)
# .copy() so that adding the "species" column does not write into a slice.
idmap_merge = idmap_merge[["canonical_uniprotkb","uniprotkb_full","uniprot_gene_name","database","sequence","isoform_or_chain_from_uniprotkb","isoform_from_desc"]].copy()
# Species is looked up from the suffix of the entry name (text after the first "_").
idmap_merge["species"] = idmap_merge["uniprot_gene_name"].apply(lambda x: idmap_species_dict[x.split("_")[1]] if (x is not None and type(x)==str and "_" in x and x.split("_")[1] in idmap_species_dict) else None)
idmap_merge = idmap_merge.drop_duplicates().reset_index(drop=True)
test1 = len(idmap_merge)
# Keep only rows that actually have a sequence, and report how many were dropped.
idmap_merge = idmap_merge.loc[idmap_merge["sequence"].notna()].reset_index(drop=True)
print(f"\tTotal unique rows after merge: {test1} ({test1 - len(idmap_merge)} dropped for lacking a sequence, {len(idmap_merge)} kept)")
idmap_merge

	Total


Unnamed: 0,canonical_uniprotkb,uniprotkb_full,uniprot_gene_name,database,sequence,isoform_or_chain_from_uniprotkb,isoform_from_desc,species
0,A0A0C3SFZ9,A0A0C3SFZ9-0,A0A0C3SFZ9_HUMAN,tr,MSYFGEHFWGEKNHGFEVLYHSVKQGPISTKELADFIRERATIEET...,0,,Homo sapiens (Human)
1,A0A0S2Z333,A0A0S2Z333-0,A0A0S2Z333_HUMAN,tr,MASRLTLLTLLLLLLAGVGQLQLSHNLSLVILVPQNLKHRLEDMEQ...,0,Isoform 4,Homo sapiens (Human)
2,A0A0S2Z333,A0A0S2Z333-PRO_5006608196,A0A0S2Z333_HUMAN,tr,LQLSHNLSLVILVPQNLKHRLEDMEQALSPSVFKAIMEKLEMSKFQ...,PRO_5006608196,,Homo sapiens (Human)
3,A0A0S2Z341,A0A0S2Z341-0,A0A0S2Z341_HUMAN,tr,MAPLCPSPWLPLLIPAPAPGLTVQLLLSLLLLVPVHPQRLPRMQED...,0,,Homo sapiens (Human)
4,A0A0S2Z341,A0A0S2Z341-PRO_5006608187,A0A0S2Z341_HUMAN,tr,QRLPRMQEDSPLGGGSSGEDDPLGEEDLPSEEDSPREEDPPGEEDL...,PRO_5006608187,,Homo sapiens (Human)
...,...,...,...,...,...,...,...,...
2213,Q9Y738,Q9Y738-PRO_0000096492,MIS12_SCHPO,sp,MLVELLEFTPLSFIDDVINITNQLLYKGVNGVDKAFSQTRFAKKAP...,PRO_0000096492,,Schizosaccharomyces pombe (strain 972 / ATCC 2...
2214,Q9Z0S9,Q9Z0S9-0,PRAF1_MOUSE,sp,MAAQKDQQKDAEGEGLSATTLLPKLIPSGAGREWLERRRATIRPWG...,0,,Mus musculus (Mouse)
2215,Q9Z0S9,Q9Z0S9-PRO_0000220879,PRAF1_MOUSE,sp,MAAQKDQQKDAEGEGLSATTLLPKLIPSGAGREWLERRRATIRPWG...,PRO_0000220879,,Mus musculus (Mouse)
2216,Q9Z254,Q9Z254-0,GIPC1_RAT,sp,MPLGLGRRKKAPPLVENEEAEPSRSGLGVGEPGPLGGSGAGESQMG...,0,,Rattus norvegicus (Rat)


In [84]:
# Write the merged sequence table (CSV) and the TSV-only combined table (tab-separated)
# into the ID-mapping output folder, creating the folder first if needed.
idmap_savepath = "data_files/processed/intact/idmapping"
os.makedirs(idmap_savepath, exist_ok=True)

_sequences_csv_path = os.path.join(
    idmap_savepath, "negatives_idmapping_2025_12_01_processed_sequences.csv"
)
_tsv_only_path = os.path.join(
    idmap_savepath, "negatives_idmapping_2025_12_01_processed_tsv_only_no_isoforms.tsv"
)
idmap_merge.to_csv(_sequences_csv_path, index=False)
idmap_tsv_df_combined.to_csv(_tsv_only_path, sep="\t", index=False)

In [85]:
# First remove spaces from both sequence columns, then drop rows with invalid characters.
for seq_col in ("aa_1", "aa_2"):
    merged_neg[seq_col] = merged_neg[seq_col].str.replace(" ", "", regex=False)
# Verify BOTH columns (previously only aa_1 was checked); missing sequences count as "no space".
has_space = (
    merged_neg["aa_1"].str.contains(" ", regex=False, na=False)
    | merged_neg["aa_2"].str.contains(" ", regex=False, na=False)
)
test1 = not has_space.any()
print(f"Removed all spaces from sequences in merged_neg: {test1}")

# find_invalid_chars is defined elsewhere in the notebook; the code below treats a
# non-null result as "this sequence has invalid characters".
merged_neg["invalids_aa_1"] = merged_neg["aa_1"].apply(lambda x: find_invalid_chars(x,VALID_AAS))
merged_neg["invalids_aa_2"] = merged_neg["aa_2"].apply(lambda x: find_invalid_chars(x,VALID_AAS))
# Build the "either side is invalid" mask once and reuse it for counting, listing and filtering.
has_invalid = merged_neg["invalids_aa_1"].notna() | merged_neg["invalids_aa_2"].notna()
test1 = int(has_invalid.sum())
print(f"There are {test1} rows where either aa_1 or aa_2 contains an invalid character. {test1}/{len(merged_neg)} = {100*test1/len(merged_neg):.2f}%")

# Collect the distinct invalid characters; presumably each non-null value is a
# comma-separated string (the original joined and split on ","), confirm against find_invalid_chars.
# Empty pieces are skipped so that "nothing invalid" prints [] rather than [''].
invalid_strings = pd.concat(
    [merged_neg.loc[has_invalid, "invalids_aa_1"], merged_neg.loc[has_invalid, "invalids_aa_2"]]
).dropna().unique()
invalid_chars = sorted({ch for joined in invalid_strings for ch in joined.split(",") if ch})
print(f"\tList of unique invalid characters found: {invalid_chars}")

merged_neg = merged_neg.loc[~has_invalid].reset_index(drop=True)
print(f"New size of merged_neg after droping these rows: {len(merged_neg)}")

Removed all spaces from sequences in merged_neg: True
There are 0 rows where either aa_1 or aa_2 contains an invalid character. 0/970 = 0.00%
	List of unique invalid characters found: ['']
New size of merged_neg after droping these rows: 970


## Recombine the UniProt ID mapping with the other merged info

In [86]:
# For the new merge, apply the isoform-0 rule: any string ID without a "-" suffix gets
# "-0" appended, so every ID can be aligned to one specific sequence.
def _with_isoform_zero(accession):
    """Append '-0' to string IDs that contain no '-'; pass everything else through."""
    if type(accession) is not str:
        return accession
    return accession if "-" in accession else f"{accession}-0"


def _without_isoform(accession):
    """Return the part of a string ID before the first '-'; pass other values through."""
    if type(accession) is not str:
        return accession
    return accession.split("-")[0]


merged_neg2 = merged_neg.drop(columns=["uniprot_A_noiso1","uniprot_B_noiso1","unique_uniprot_noiso1_pair"])

# Full IDs (with isoform/chain suffix) and their order-independent pair key.
merged_neg2["uniprot_A_full"] = merged_neg2["uniprot_A"].apply(_with_isoform_zero)
merged_neg2["uniprot_B_full"] = merged_neg2["uniprot_B"].apply(_with_isoform_zero)
merged_neg2["unique_uniprot_pair"] = merged_neg2.apply(
    lambda row: get_unique_id(row, colA="uniprot_A_full", colB="uniprot_B_full"), axis=1
)

# Suffix-free IDs and their order-independent pair key.
merged_neg2["uniprot_A_noisoforms"] = merged_neg2["uniprot_A_full"].apply(_without_isoform)
merged_neg2["uniprot_B_noisoforms"] = merged_neg2["uniprot_B_full"].apply(_without_isoform)
merged_neg2["unique_uniprot_noisoforms_pair"] = merged_neg2.apply(
    lambda row: get_unique_id(row, colA="uniprot_A_noisoforms", colB="uniprot_B_noisoforms"), axis=1
)
merged_neg2

Unnamed: 0,ID(s) interactor A,ID(s) interactor B,Alt. ID(s) interactor A,Alt. ID(s) interactor B,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,...,unique_uniprot_pair,uniprot_A_noisoforms,uniprot_B_noisoforms,unique_uniprot_noisoforms_pair,uniprot_kb_clust_match_A,uniprot_kb_clust_match_B,invalids_aa_1,invalids_aa_2,uniprot_A_full,uniprot_B_full
0,intact:EBI-1002565,intact:EBI-1002822,uniprotkb:Q10173,uniprotkb:Q9Y738,psi-mi:nuf2_schpo|psi-mi:nuf2|uniprotkb:NMS co...,psi-mi:mis12_schpo|psi-mi:mis12|uniprotkb:NMS ...,"psi-mi:""MI:0018""(two hybrid)",Asakawa et al. (2005),pubmed:15728720,taxid:284812(schpo),...,uniprotkb:Q10173-0_uniprotkb:Q9Y738-0,uniprotkb:Q10173,uniprotkb:Q9Y738,uniprotkb:Q10173_uniprotkb:Q9Y738,True,True,,,uniprotkb:Q10173-0,uniprotkb:Q9Y738-0
1,intact:EBI-1014500,intact:EBI-1397518,uniprotkb:P35240-1|ensembl:ENSP00000344666.5,uniprotkb:P0DPB3-1|ensembl:ENSP00000491030.1,psi-mi:p35240-1|psi-mi:NF2|uniprotkb:I|uniprot...,psi-mi:p0dpb3-1|psi-mi:SCHIP1|uniprotkb:Q9P0W5...,"psi-mi:""MI:0007""(anti tag coimmunoprecipitation)",Goutebroze et al. (2000),pubmed:10669747|imex:IM-19739,taxid:9606(human),...,uniprotkb:P0DPB3-1_uniprotkb:P35240-1,uniprotkb:P35240,uniprotkb:P0DPB3,uniprotkb:P0DPB3_uniprotkb:P35240,True,True,,,uniprotkb:P35240-1,uniprotkb:P0DPB3-1
2,intact:EBI-16428984,intact:EBI-10171697,uniprotkb:A0A0S2Z6H0,uniprotkb:Q6A162|ensembl:ENSP00000366984.4,psi-mi:a0a0s2z6h0_human|psi-mi:ZGPAT|uniprotkb...,psi-mi:k1c40_human|psi-mi:KRT40|uniprotkb:Q6IF...,"psi-mi:""MI:0397""(two hybrid array)",Yang et al. (2016),pubmed:26871637|imex:IM-25013|doi:10.1016/j.ce...,taxid:9606(human),...,uniprotkb:A0A0S2Z6H0-0_uniprotkb:Q6A162-0,uniprotkb:A0A0S2Z6H0,uniprotkb:Q6A162,uniprotkb:A0A0S2Z6H0_uniprotkb:Q6A162,True,True,,,uniprotkb:A0A0S2Z6H0-0,uniprotkb:Q6A162-0
3,intact:EBI-16467584,intact:EBI-10171697,,uniprotkb:Q6A162|ensembl:ENSP00000366984.4,"psi-mi:""ccsb isoform id: gad1_3""|psi-mi:EBI-16...",psi-mi:k1c40_human|psi-mi:KRT40|uniprotkb:Q6IF...,"psi-mi:""MI:0397""(two hybrid array)",Yang et al. (2016),pubmed:26871637|imex:IM-25013|doi:10.1016/j.ce...,taxid:9606(human),...,_uniprotkb:Q6A162-0,,uniprotkb:Q6A162,_uniprotkb:Q6A162,,True,,,,uniprotkb:Q6A162-0
4,intact:EBI-16468000,intact:EBI-10171697,uniprotkb:A0A0S2Z5U3,uniprotkb:Q6A162|ensembl:ENSP00000366984.4,psi-mi:a0a0s2z5u3_human|psi-mi:HNRPLL|uniprotk...,psi-mi:k1c40_human|psi-mi:KRT40|uniprotkb:Q6IF...,"psi-mi:""MI:0397""(two hybrid array)",Yang et al. (2016),pubmed:26871637|imex:IM-25013|doi:10.1016/j.ce...,taxid:9606(human),...,uniprotkb:A0A0S2Z5U3-0_uniprotkb:Q6A162-0,uniprotkb:A0A0S2Z5U3,uniprotkb:Q6A162,uniprotkb:A0A0S2Z5U3_uniprotkb:Q6A162,True,True,,,uniprotkb:A0A0S2Z5U3-0,uniprotkb:Q6A162-0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
965,intact:EBI-945738,intact:EBI-930964,uniprotkb:Q86UW1|ensembl:ENSP00000296327.5,uniprotkb:P54253|ensembl:ENSP00000244769.3,psi-mi:osta_human|psi-mi:SLC51A|uniprotkb:Q6ZM...,psi-mi:atx1_human|psi-mi:ATXN1|uniprotkb:Q17S0...,"psi-mi:""MI:0096""(pull down)",Lim et al. (2006),pubmed:16713569|imex:IM-11827|mint:MINT-5218676,taxid:9606(human),...,uniprotkb:P54253-0_uniprotkb:Q86UW1-0,uniprotkb:Q86UW1,uniprotkb:P54253,uniprotkb:P54253_uniprotkb:Q86UW1,True,True,,,uniprotkb:Q86UW1-0,uniprotkb:P54253-0
966,intact:EBI-9350855,intact:EBI-9350848,uniprotkb:O95292-2|ensembl:ENSP00000379147.3,uniprotkb:O95292-1|ensembl:ENSP00000417175.1,psi-mi:o95292-2|psi-mi:VAPB|uniprotkb:VAP-C|un...,psi-mi:o95292-1|psi-mi:VAPB|uniprotkb:VAP-B|un...,"psi-mi:""MI:0007""(anti tag coimmunoprecipitation)",Kukihara et al. (2009),pubmed:19515777|imex:IM-25931,taxid:9606(human),...,uniprotkb:O95292-1_uniprotkb:O95292-2,uniprotkb:O95292,uniprotkb:O95292,uniprotkb:O95292_uniprotkb:O95292,True,True,,,uniprotkb:O95292-2,uniprotkb:O95292-1
967,intact:EBI-9381887,intact:EBI-949824,uniprotkb:Q8WXU2-2|ensembl:ENSP00000403412.2,uniprotkb:O00471|ensembl:ENSP00000484855.1,psi-mi:q8wxu2-2|psi-mi:DNAAF4|uniprotkb:DNAAF4...,psi-mi:exoc5_human|psi-mi:EXOC5|uniprotkb:B2R6...,"psi-mi:""MI:0397""(two hybrid array)",Yang et al. (2016),pubmed:26871637|imex:IM-25013|doi:10.1016/j.ce...,taxid:9606(human),...,uniprotkb:O00471-0_uniprotkb:Q8WXU2-2,uniprotkb:Q8WXU2,uniprotkb:O00471,uniprotkb:O00471_uniprotkb:Q8WXU2,True,True,,,uniprotkb:Q8WXU2-2,uniprotkb:O00471-0
968,intact:EBI-945792,intact:EBI-945799,uniprotkb:Q96PU8|ensembl:ENSP00000355094.3,uniprotkb:Q15366|ensembl:ENSP00000408949.2,psi-mi:qki_human|psi-mi:QKI|uniprotkb:Q2I375|u...,psi-mi:pcbp2_human|psi-mi:PCBP2|uniprotkb:Q6PK...,"psi-mi:""MI:0096""(pull down)",Lim et al. (2006),pubmed:16713569|imex:IM-11827|mint:MINT-5218676,taxid:9606(human),...,uniprotkb:Q15366-0_uniprotkb:Q96PU8-0,uniprotkb:Q96PU8,uniprotkb:Q15366,uniprotkb:Q15366_uniprotkb:Q96PU8,True,True,,,uniprotkb:Q96PU8-0,uniprotkb:Q15366-0


In [87]:
# Give every ID-mapping accession the "uniprotkb:" prefix (left alone if already present)
# before using it as the join key against uniprot_A_full / uniprot_B_full.
idmap_merge["uniprotkb_full"] = idmap_merge["uniprotkb_full"].apply(
    lambda acc: acc if acc.startswith("uniprotkb:") else "uniprotkb:" + acc
)

# Left-join the ID-mapping sequence, gene name and species onto interactor A, then B.
for _side, _aa in (("A", "aa_1"), ("B", "aa_2")):
    _renames = {
        "uniprotkb_full": f"uniprot_{_side}_full",
        "sequence": f"{_aa}_fromidmap",
        "uniprot_gene_name": f"uniprot_gene_name_{_side}",
        "species": f"species_{_side}",
    }
    # Keep only the renamed columns, in the same order as the mapping above.
    _lookup = idmap_merge.rename(columns=_renames)[list(_renames.values())]
    merged_neg2 = merged_neg2.merge(_lookup, on=f"uniprot_{_side}_full", how="left")
merged_neg2

Unnamed: 0,ID(s) interactor A,ID(s) interactor B,Alt. ID(s) interactor A,Alt. ID(s) interactor B,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,...,invalids_aa_1,invalids_aa_2,uniprot_A_full,uniprot_B_full,aa_1_fromidmap,uniprot_gene_name_A,species_A,aa_2_fromidmap,uniprot_gene_name_B,species_B
0,intact:EBI-1002565,intact:EBI-1002822,uniprotkb:Q10173,uniprotkb:Q9Y738,psi-mi:nuf2_schpo|psi-mi:nuf2|uniprotkb:NMS co...,psi-mi:mis12_schpo|psi-mi:mis12|uniprotkb:NMS ...,"psi-mi:""MI:0018""(two hybrid)",Asakawa et al. (2005),pubmed:15728720,taxid:284812(schpo),...,,,uniprotkb:Q10173-0,uniprotkb:Q9Y738-0,MARKHTFPSLKRAEILECIDGLGIPFTAKELDQPTSKAVIPLYEEF...,NUF2_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...,MLVELLEFTPLSFIDDVINITNQLLYKGVNGVDKAFSQTRFAKKAP...,MIS12_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...
1,intact:EBI-1014500,intact:EBI-1397518,uniprotkb:P35240-1|ensembl:ENSP00000344666.5,uniprotkb:P0DPB3-1|ensembl:ENSP00000491030.1,psi-mi:p35240-1|psi-mi:NF2|uniprotkb:I|uniprot...,psi-mi:p0dpb3-1|psi-mi:SCHIP1|uniprotkb:Q9P0W5...,"psi-mi:""MI:0007""(anti tag coimmunoprecipitation)",Goutebroze et al. (2000),pubmed:10669747|imex:IM-19739,taxid:9606(human),...,,,uniprotkb:P35240-1,uniprotkb:P0DPB3-1,,,,,,
2,intact:EBI-16428984,intact:EBI-10171697,uniprotkb:A0A0S2Z6H0,uniprotkb:Q6A162|ensembl:ENSP00000366984.4,psi-mi:a0a0s2z6h0_human|psi-mi:ZGPAT|uniprotkb...,psi-mi:k1c40_human|psi-mi:KRT40|uniprotkb:Q6IF...,"psi-mi:""MI:0397""(two hybrid array)",Yang et al. (2016),pubmed:26871637|imex:IM-25013|doi:10.1016/j.ce...,taxid:9606(human),...,,,uniprotkb:A0A0S2Z6H0-0,uniprotkb:Q6A162-0,MDEESLESALQTYRAQLQQVELALGAGLDSSEQADLRQLQGDLKEL...,A0A0S2Z6H0_HUMAN,Homo sapiens (Human),MTSDCSSTHCSPESCGTASGCAPASSCSVETACLPGTCATSRCQTP...,K1C40_HUMAN,Homo sapiens (Human)
3,intact:EBI-16467584,intact:EBI-10171697,,uniprotkb:Q6A162|ensembl:ENSP00000366984.4,"psi-mi:""ccsb isoform id: gad1_3""|psi-mi:EBI-16...",psi-mi:k1c40_human|psi-mi:KRT40|uniprotkb:Q6IF...,"psi-mi:""MI:0397""(two hybrid array)",Yang et al. (2016),pubmed:26871637|imex:IM-25013|doi:10.1016/j.ce...,taxid:9606(human),...,,,,uniprotkb:Q6A162-0,,,,MTSDCSSTHCSPESCGTASGCAPASSCSVETACLPGTCATSRCQTP...,K1C40_HUMAN,Homo sapiens (Human)
4,intact:EBI-16468000,intact:EBI-10171697,uniprotkb:A0A0S2Z5U3,uniprotkb:Q6A162|ensembl:ENSP00000366984.4,psi-mi:a0a0s2z5u3_human|psi-mi:HNRPLL|uniprotk...,psi-mi:k1c40_human|psi-mi:KRT40|uniprotkb:Q6IF...,"psi-mi:""MI:0397""(two hybrid array)",Yang et al. (2016),pubmed:26871637|imex:IM-25013|doi:10.1016/j.ce...,taxid:9606(human),...,,,uniprotkb:A0A0S2Z5U3-0,uniprotkb:Q6A162-0,MSSSSSSPRETYEEDREYESQAKRLKTEEGEIDYSAEEGENRREAT...,A0A0S2Z5U3_HUMAN,Homo sapiens (Human),MTSDCSSTHCSPESCGTASGCAPASSCSVETACLPGTCATSRCQTP...,K1C40_HUMAN,Homo sapiens (Human)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
965,intact:EBI-945738,intact:EBI-930964,uniprotkb:Q86UW1|ensembl:ENSP00000296327.5,uniprotkb:P54253|ensembl:ENSP00000244769.3,psi-mi:osta_human|psi-mi:SLC51A|uniprotkb:Q6ZM...,psi-mi:atx1_human|psi-mi:ATXN1|uniprotkb:Q17S0...,"psi-mi:""MI:0096""(pull down)",Lim et al. (2006),pubmed:16713569|imex:IM-11827|mint:MINT-5218676,taxid:9606(human),...,,,uniprotkb:Q86UW1-0,uniprotkb:P54253-0,MEPGRTQIKLDPRYTADLLEVLKTNYGIPSACFSQPPTAAQLLRAL...,OSTA_HUMAN,Homo sapiens (Human),MKSNQERSNECLPPKKREIPATSRSSEEKAPTLPSDNHRVEGTAWL...,ATX1_HUMAN,Homo sapiens (Human)
966,intact:EBI-9350855,intact:EBI-9350848,uniprotkb:O95292-2|ensembl:ENSP00000379147.3,uniprotkb:O95292-1|ensembl:ENSP00000417175.1,psi-mi:o95292-2|psi-mi:VAPB|uniprotkb:VAP-C|un...,psi-mi:o95292-1|psi-mi:VAPB|uniprotkb:VAP-B|un...,"psi-mi:""MI:0007""(anti tag coimmunoprecipitation)",Kukihara et al. (2009),pubmed:19515777|imex:IM-25931,taxid:9606(human),...,,,uniprotkb:O95292-2,uniprotkb:O95292-1,MAKVEQVLSLEPQHELKFRGPFTDVVTTNLKLGNPTDRNVCFKVKT...,VAPB_HUMAN,Homo sapiens (Human),,,
967,intact:EBI-9381887,intact:EBI-949824,uniprotkb:Q8WXU2-2|ensembl:ENSP00000403412.2,uniprotkb:O00471|ensembl:ENSP00000484855.1,psi-mi:q8wxu2-2|psi-mi:DNAAF4|uniprotkb:DNAAF4...,psi-mi:exoc5_human|psi-mi:EXOC5|uniprotkb:B2R6...,"psi-mi:""MI:0397""(two hybrid array)",Yang et al. (2016),pubmed:26871637|imex:IM-25013|doi:10.1016/j.ce...,taxid:9606(human),...,,,uniprotkb:Q8WXU2-2,uniprotkb:O00471-0,MPLQVSDYSWQQTKTAVFLSLPLKGVCVRDTDVFCTENYLKVNFPP...,DAAF4_HUMAN,Homo sapiens (Human),MATTAELFEEPFVADEYIERLVWRTPGGGSRGGPEAFDPKRLLEEF...,EXOC5_HUMAN,Homo sapiens (Human)
968,intact:EBI-945792,intact:EBI-945799,uniprotkb:Q96PU8|ensembl:ENSP00000355094.3,uniprotkb:Q15366|ensembl:ENSP00000408949.2,psi-mi:qki_human|psi-mi:QKI|uniprotkb:Q2I375|u...,psi-mi:pcbp2_human|psi-mi:PCBP2|uniprotkb:Q6PK...,"psi-mi:""MI:0096""(pull down)",Lim et al. (2006),pubmed:16713569|imex:IM-11827|mint:MINT-5218676,taxid:9606(human),...,,,uniprotkb:Q96PU8-0,uniprotkb:Q15366-0,MVGEMETKEKPKPTPDYLMQLMNDKKLMSSLPNFCGIFNHLERLLD...,QKI_HUMAN,Homo sapiens (Human),MDTGVIEGGLNVTLTIRLLMHGKEVGSIIGKKGESVKKMREESGAR...,PCBP2_HUMAN,Homo sapiens (Human)


In [88]:
# Test case for merged_neg3: merge on the isoform-free accession, so each interactor is
# joined with every ID-mapping entry that shares the same canonical accession.
def _with_isoform_suffix(acc):
    """Append "-0" to a str accession that has no "-"; return anything else unchanged."""
    if type(acc) is not str or "-" in acc:
        return acc
    return f"{acc}-0"


def _without_isoform_suffix(acc):
    """Return the part of a str accession before the first "-"; non-str values pass through."""
    if type(acc) is not str:
        return acc
    return acc.split("-")[0]


merged_neg3 = merged_neg.copy(deep=True).drop(
    columns=["uniprot_A_noiso1", "uniprot_B_noiso1", "unique_uniprot_noiso1_pair"]
)
for _side in ("A", "B"):
    merged_neg3[f"uniprot_{_side}_full"] = merged_neg3[f"uniprot_{_side}"].apply(_with_isoform_suffix)
merged_neg3["unique_uniprot_pair"] = merged_neg3.apply(
    get_unique_id, axis=1, colA="uniprot_A_full", colB="uniprot_B_full"
)
for _side in ("A", "B"):
    merged_neg3[f"uniprot_{_side}_noisoforms"] = merged_neg3[f"uniprot_{_side}_full"].apply(_without_isoform_suffix)
merged_neg3["unique_uniprot_noisoforms_pair"] = merged_neg3.apply(
    get_unique_id, axis=1, colA="uniprot_A_noisoforms", colB="uniprot_B_noisoforms"
)

# Work on a copy of the ID-mapping table and add a "uniprotkb:"-prefixed canonical accession.
idmap_merge_copy = idmap_merge.copy(deep=True)
idmap_merge_copy["canonical_uniprot"] = idmap_merge_copy["canonical_uniprotkb"].apply(
    lambda acc: acc if acc.startswith("uniprotkb:") else "uniprotkb:" + acc
)

# Left-join on the isoform-free accession for interactor A, then B.
for _side, _aa in (("A", "aa_1"), ("B", "aa_2")):
    _renames = {
        "canonical_uniprot": f"uniprot_{_side}_noisoforms",
        "uniprotkb_full": f"uniprot_{_side}_bestiso",
        "sequence": f"{_aa}_bestiso",
        "uniprot_gene_name": f"uniprot_gene_name_{_side}",
        "species": f"species_{_side}",
    }
    _lookup = idmap_merge_copy.rename(columns=_renames)[list(_renames.values())]
    merged_neg3 = merged_neg3.merge(_lookup, on=[f"uniprot_{_side}_noisoforms"], how="left")
merged_neg3

Unnamed: 0,ID(s) interactor A,ID(s) interactor B,Alt. ID(s) interactor A,Alt. ID(s) interactor B,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,...,uniprot_A_full,uniprot_B_full,uniprot_A_bestiso,aa_1_bestiso,uniprot_gene_name_A,species_A,uniprot_B_bestiso,aa_2_bestiso,uniprot_gene_name_B,species_B
0,intact:EBI-1002565,intact:EBI-1002822,uniprotkb:Q10173,uniprotkb:Q9Y738,psi-mi:nuf2_schpo|psi-mi:nuf2|uniprotkb:NMS co...,psi-mi:mis12_schpo|psi-mi:mis12|uniprotkb:NMS ...,"psi-mi:""MI:0018""(two hybrid)",Asakawa et al. (2005),pubmed:15728720,taxid:284812(schpo),...,uniprotkb:Q10173-0,uniprotkb:Q9Y738-0,uniprotkb:Q10173-0,MARKHTFPSLKRAEILECIDGLGIPFTAKELDQPTSKAVIPLYEEF...,NUF2_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...,uniprotkb:Q9Y738-0,MLVELLEFTPLSFIDDVINITNQLLYKGVNGVDKAFSQTRFAKKAP...,MIS12_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...
1,intact:EBI-1002565,intact:EBI-1002822,uniprotkb:Q10173,uniprotkb:Q9Y738,psi-mi:nuf2_schpo|psi-mi:nuf2|uniprotkb:NMS co...,psi-mi:mis12_schpo|psi-mi:mis12|uniprotkb:NMS ...,"psi-mi:""MI:0018""(two hybrid)",Asakawa et al. (2005),pubmed:15728720,taxid:284812(schpo),...,uniprotkb:Q10173-0,uniprotkb:Q9Y738-0,uniprotkb:Q10173-0,MARKHTFPSLKRAEILECIDGLGIPFTAKELDQPTSKAVIPLYEEF...,NUF2_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...,uniprotkb:Q9Y738-PRO_0000096492,MLVELLEFTPLSFIDDVINITNQLLYKGVNGVDKAFSQTRFAKKAP...,MIS12_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...
2,intact:EBI-1002565,intact:EBI-1002822,uniprotkb:Q10173,uniprotkb:Q9Y738,psi-mi:nuf2_schpo|psi-mi:nuf2|uniprotkb:NMS co...,psi-mi:mis12_schpo|psi-mi:mis12|uniprotkb:NMS ...,"psi-mi:""MI:0018""(two hybrid)",Asakawa et al. (2005),pubmed:15728720,taxid:284812(schpo),...,uniprotkb:Q10173-0,uniprotkb:Q9Y738-0,uniprotkb:Q10173-PRO_0000057994,MARKHTFPSLKRAEILECIDGLGIPFTAKELDQPTSKAVIPLYEEF...,NUF2_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...,uniprotkb:Q9Y738-0,MLVELLEFTPLSFIDDVINITNQLLYKGVNGVDKAFSQTRFAKKAP...,MIS12_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...
3,intact:EBI-1002565,intact:EBI-1002822,uniprotkb:Q10173,uniprotkb:Q9Y738,psi-mi:nuf2_schpo|psi-mi:nuf2|uniprotkb:NMS co...,psi-mi:mis12_schpo|psi-mi:mis12|uniprotkb:NMS ...,"psi-mi:""MI:0018""(two hybrid)",Asakawa et al. (2005),pubmed:15728720,taxid:284812(schpo),...,uniprotkb:Q10173-0,uniprotkb:Q9Y738-0,uniprotkb:Q10173-PRO_0000057994,MARKHTFPSLKRAEILECIDGLGIPFTAKELDQPTSKAVIPLYEEF...,NUF2_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...,uniprotkb:Q9Y738-PRO_0000096492,MLVELLEFTPLSFIDDVINITNQLLYKGVNGVDKAFSQTRFAKKAP...,MIS12_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...
4,intact:EBI-1014500,intact:EBI-1397518,uniprotkb:P35240-1|ensembl:ENSP00000344666.5,uniprotkb:P0DPB3-1|ensembl:ENSP00000491030.1,psi-mi:p35240-1|psi-mi:NF2|uniprotkb:I|uniprot...,psi-mi:p0dpb3-1|psi-mi:SCHIP1|uniprotkb:Q9P0W5...,"psi-mi:""MI:0007""(anti tag coimmunoprecipitation)",Goutebroze et al. (2000),pubmed:10669747|imex:IM-19739,taxid:9606(human),...,uniprotkb:P35240-1,uniprotkb:P0DPB3-1,uniprotkb:P35240-0,MAGAIASRMSFSSLKRKQPKTFTVRIVTMDAEMEFNCEMKWKGKDL...,MERL_HUMAN,Homo sapiens (Human),uniprotkb:P0DPB3-0,MERSGQRVTTWDCDQGKHSDSDYREDGMDLGSDAGSSSSSSRASSQ...,SCHI1_HUMAN,Homo sapiens (Human)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10181,intact:EBI-945869,intact:EBI-945857,uniprotkb:Q9BYW2|ensembl:ENSP00000386759.3,uniprotkb:Q96RK0|ensembl:ENSP00000505728.1,psi-mi:setd2_human|psi-mi:SETD2|uniprotkb:O753...,psi-mi:cic_human|psi-mi:CIC|uniprotkb:Q7LGI1|u...,"psi-mi:""MI:0096""(pull down)",Lim et al. (2006),pubmed:16713569|imex:IM-11827|mint:MINT-5218676,taxid:9606(human),...,uniprotkb:Q9BYW2-0,uniprotkb:Q96RK0-0,uniprotkb:Q9BYW2-2,MKQLQPQPPPKMGDFYDPEHPTPEEEENEAKIENVQKTGFIKGPMF...,SETD2_HUMAN,Homo sapiens (Human),uniprotkb:Q96RK0-PRO_0000048598,MKPMKKACTGLSGPGSGSKSPPATRAKALRRRGAGEGDKPEEEDDE...,CIC_HUMAN,Homo sapiens (Human)
10182,intact:EBI-945869,intact:EBI-945857,uniprotkb:Q9BYW2|ensembl:ENSP00000386759.3,uniprotkb:Q96RK0|ensembl:ENSP00000505728.1,psi-mi:setd2_human|psi-mi:SETD2|uniprotkb:O753...,psi-mi:cic_human|psi-mi:CIC|uniprotkb:Q7LGI1|u...,"psi-mi:""MI:0096""(pull down)",Lim et al. (2006),pubmed:16713569|imex:IM-11827|mint:MINT-5218676,taxid:9606(human),...,uniprotkb:Q9BYW2-0,uniprotkb:Q96RK0-0,uniprotkb:Q9BYW2-2,MKQLQPQPPPKMGDFYDPEHPTPEEEENEAKIENVQKTGFIKGPMF...,SETD2_HUMAN,Homo sapiens (Human),uniprotkb:Q96RK0-2,MYSAHRPLMPASSAASRGLGMFVWTNVEPRSVAVFPWHSLVPFLAP...,CIC_HUMAN,Homo sapiens (Human)
10183,intact:EBI-945869,intact:EBI-945857,uniprotkb:Q9BYW2|ensembl:ENSP00000386759.3,uniprotkb:Q96RK0|ensembl:ENSP00000505728.1,psi-mi:setd2_human|psi-mi:SETD2|uniprotkb:O753...,psi-mi:cic_human|psi-mi:CIC|uniprotkb:Q7LGI1|u...,"psi-mi:""MI:0096""(pull down)",Lim et al. (2006),pubmed:16713569|imex:IM-11827|mint:MINT-5218676,taxid:9606(human),...,uniprotkb:Q9BYW2-0,uniprotkb:Q96RK0-0,uniprotkb:Q9BYW2-3,MKQLQPQPPPKMGDFYDPEHPTPEEEENEAKIENVQKTGFIKGPMF...,SETD2_HUMAN,Homo sapiens (Human),uniprotkb:Q96RK0-0,MKPMKKACTGLSGPGSGSKSPPATRAKALRRRGAGEGDKPEEEDDE...,CIC_HUMAN,Homo sapiens (Human)
10184,intact:EBI-945869,intact:EBI-945857,uniprotkb:Q9BYW2|ensembl:ENSP00000386759.3,uniprotkb:Q96RK0|ensembl:ENSP00000505728.1,psi-mi:setd2_human|psi-mi:SETD2|uniprotkb:O753...,psi-mi:cic_human|psi-mi:CIC|uniprotkb:Q7LGI1|u...,"psi-mi:""MI:0096""(pull down)",Lim et al. (2006),pubmed:16713569|imex:IM-11827|mint:MINT-5218676,taxid:9606(human),...,uniprotkb:Q9BYW2-0,uniprotkb:Q96RK0-0,uniprotkb:Q9BYW2-3,MKQLQPQPPPKMGDFYDPEHPTPEEEENEAKIENVQKTGFIKGPMF...,SETD2_HUMAN,Homo sapiens (Human),uniprotkb:Q96RK0-PRO_0000048598,MKPMKKACTGLSGPGSGSKSPPATRAKALRRRGAGEGDKPEEEDDE...,CIC_HUMAN,Homo sapiens (Human)


In [89]:
# merged_neg4: join on the isoform-free accession AND the sequence itself, so an
# ID-mapping entry is attached only when its sequence equals the one already in
# aa_1 / aa_2.
def _with_isoform_suffix(acc):
    """Append "-0" to a str accession that has no "-"; return anything else unchanged."""
    if type(acc) is not str or "-" in acc:
        return acc
    return f"{acc}-0"


def _without_isoform_suffix(acc):
    """Return the part of a str accession before the first "-"; non-str values pass through."""
    if type(acc) is not str:
        return acc
    return acc.split("-")[0]


merged_neg4 = merged_neg.copy(deep=True).drop(
    columns=["uniprot_A_noiso1", "uniprot_B_noiso1", "unique_uniprot_noiso1_pair"]
)
for _side in ("A", "B"):
    merged_neg4[f"uniprot_{_side}_full"] = merged_neg4[f"uniprot_{_side}"].apply(_with_isoform_suffix)
merged_neg4["unique_uniprot_pair"] = merged_neg4.apply(
    get_unique_id, axis=1, colA="uniprot_A_full", colB="uniprot_B_full"
)
for _side in ("A", "B"):
    merged_neg4[f"uniprot_{_side}_noisoforms"] = merged_neg4[f"uniprot_{_side}_full"].apply(_without_isoform_suffix)
merged_neg4["unique_uniprot_noisoforms_pair"] = merged_neg4.apply(
    get_unique_id, axis=1, colA="uniprot_A_noisoforms", colB="uniprot_B_noisoforms"
)

# Work on a copy of the ID-mapping table and add a "uniprotkb:"-prefixed canonical accession.
idmap_merge_copy = idmap_merge.copy(deep=True)
idmap_merge_copy["canonical_uniprot"] = idmap_merge_copy["canonical_uniprotkb"].apply(
    lambda acc: acc if acc.startswith("uniprotkb:") else "uniprotkb:" + acc
)

# Duplicate the existing sequences under the *_bestiso names used as the second join key.
for _aa in ("aa_1", "aa_2"):
    merged_neg4[f"{_aa}_bestiso"] = merged_neg4[_aa].copy()

# Left-join on (isoform-free accession, sequence) for interactor A, then B.
for _side, _aa in (("A", "aa_1"), ("B", "aa_2")):
    _renames = {
        "canonical_uniprot": f"uniprot_{_side}_noisoforms",
        "uniprotkb_full": f"uniprot_{_side}_bestiso",
        "sequence": f"{_aa}_bestiso",
        "uniprot_gene_name": f"uniprot_gene_name_{_side}",
        "species": f"species_{_side}",
    }
    _lookup = idmap_merge_copy.rename(columns=_renames)[list(_renames.values())]
    merged_neg4 = merged_neg4.merge(
        _lookup, on=[f"uniprot_{_side}_noisoforms", f"{_aa}_bestiso"], how="left"
    )
merged_neg4

Unnamed: 0,ID(s) interactor A,ID(s) interactor B,Alt. ID(s) interactor A,Alt. ID(s) interactor B,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,...,uniprot_A_full,uniprot_B_full,aa_1_bestiso,aa_2_bestiso,uniprot_A_bestiso,uniprot_gene_name_A,species_A,uniprot_B_bestiso,uniprot_gene_name_B,species_B
0,intact:EBI-1002565,intact:EBI-1002822,uniprotkb:Q10173,uniprotkb:Q9Y738,psi-mi:nuf2_schpo|psi-mi:nuf2|uniprotkb:NMS co...,psi-mi:mis12_schpo|psi-mi:mis12|uniprotkb:NMS ...,"psi-mi:""MI:0018""(two hybrid)",Asakawa et al. (2005),pubmed:15728720,taxid:284812(schpo),...,uniprotkb:Q10173-0,uniprotkb:Q9Y738-0,MARKHTFPSLKRAEILECIDGLGIPFTAKELDQPTSKAVIPLYEEF...,MLVELLEFTPLSFIDDVINITNQLLYKGVNGVDKAFSQTRFAKKAP...,uniprotkb:Q10173-0,NUF2_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...,uniprotkb:Q9Y738-0,MIS12_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...
1,intact:EBI-1002565,intact:EBI-1002822,uniprotkb:Q10173,uniprotkb:Q9Y738,psi-mi:nuf2_schpo|psi-mi:nuf2|uniprotkb:NMS co...,psi-mi:mis12_schpo|psi-mi:mis12|uniprotkb:NMS ...,"psi-mi:""MI:0018""(two hybrid)",Asakawa et al. (2005),pubmed:15728720,taxid:284812(schpo),...,uniprotkb:Q10173-0,uniprotkb:Q9Y738-0,MARKHTFPSLKRAEILECIDGLGIPFTAKELDQPTSKAVIPLYEEF...,MLVELLEFTPLSFIDDVINITNQLLYKGVNGVDKAFSQTRFAKKAP...,uniprotkb:Q10173-0,NUF2_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...,uniprotkb:Q9Y738-PRO_0000096492,MIS12_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...
2,intact:EBI-1002565,intact:EBI-1002822,uniprotkb:Q10173,uniprotkb:Q9Y738,psi-mi:nuf2_schpo|psi-mi:nuf2|uniprotkb:NMS co...,psi-mi:mis12_schpo|psi-mi:mis12|uniprotkb:NMS ...,"psi-mi:""MI:0018""(two hybrid)",Asakawa et al. (2005),pubmed:15728720,taxid:284812(schpo),...,uniprotkb:Q10173-0,uniprotkb:Q9Y738-0,MARKHTFPSLKRAEILECIDGLGIPFTAKELDQPTSKAVIPLYEEF...,MLVELLEFTPLSFIDDVINITNQLLYKGVNGVDKAFSQTRFAKKAP...,uniprotkb:Q10173-PRO_0000057994,NUF2_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...,uniprotkb:Q9Y738-0,MIS12_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...
3,intact:EBI-1002565,intact:EBI-1002822,uniprotkb:Q10173,uniprotkb:Q9Y738,psi-mi:nuf2_schpo|psi-mi:nuf2|uniprotkb:NMS co...,psi-mi:mis12_schpo|psi-mi:mis12|uniprotkb:NMS ...,"psi-mi:""MI:0018""(two hybrid)",Asakawa et al. (2005),pubmed:15728720,taxid:284812(schpo),...,uniprotkb:Q10173-0,uniprotkb:Q9Y738-0,MARKHTFPSLKRAEILECIDGLGIPFTAKELDQPTSKAVIPLYEEF...,MLVELLEFTPLSFIDDVINITNQLLYKGVNGVDKAFSQTRFAKKAP...,uniprotkb:Q10173-PRO_0000057994,NUF2_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...,uniprotkb:Q9Y738-PRO_0000096492,MIS12_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...
4,intact:EBI-1014500,intact:EBI-1397518,uniprotkb:P35240-1|ensembl:ENSP00000344666.5,uniprotkb:P0DPB3-1|ensembl:ENSP00000491030.1,psi-mi:p35240-1|psi-mi:NF2|uniprotkb:I|uniprot...,psi-mi:p0dpb3-1|psi-mi:SCHIP1|uniprotkb:Q9P0W5...,"psi-mi:""MI:0007""(anti tag coimmunoprecipitation)",Goutebroze et al. (2000),pubmed:10669747|imex:IM-19739,taxid:9606(human),...,uniprotkb:P35240-1,uniprotkb:P0DPB3-1,MAGAIASRMSFSSLKRKQPKTFTVRIVTMDAEMEFNCEMKWKGKDL...,MERSGQRVTTWDCDQGKHSDSDYREDGMDLGSDAGSSSSSSRASSQ...,uniprotkb:P35240-0,MERL_HUMAN,Homo sapiens (Human),uniprotkb:P0DPB3-0,SCHI1_HUMAN,Homo sapiens (Human)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2031,intact:EBI-945792,intact:EBI-945799,uniprotkb:Q96PU8|ensembl:ENSP00000355094.3,uniprotkb:Q15366|ensembl:ENSP00000408949.2,psi-mi:qki_human|psi-mi:QKI|uniprotkb:Q2I375|u...,psi-mi:pcbp2_human|psi-mi:PCBP2|uniprotkb:Q6PK...,"psi-mi:""MI:0096""(pull down)",Lim et al. (2006),pubmed:16713569|imex:IM-11827|mint:MINT-5218676,taxid:9606(human),...,uniprotkb:Q96PU8-0,uniprotkb:Q15366-0,MVGEMETKEKPKPTPDYLMQLMNDKKLMSSLPNFCGIFNHLERLLD...,MDTGVIEGGLNVTLTIRLLMHGKEVGSIIGKKGESVKKMREESGAR...,uniprotkb:Q96PU8-PRO_0000239373,QKI_HUMAN,Homo sapiens (Human),uniprotkb:Q15366-PRO_0000050090,PCBP2_HUMAN,Homo sapiens (Human)
2032,intact:EBI-945869,intact:EBI-945857,uniprotkb:Q9BYW2|ensembl:ENSP00000386759.3,uniprotkb:Q96RK0|ensembl:ENSP00000505728.1,psi-mi:setd2_human|psi-mi:SETD2|uniprotkb:O753...,psi-mi:cic_human|psi-mi:CIC|uniprotkb:Q7LGI1|u...,"psi-mi:""MI:0096""(pull down)",Lim et al. (2006),pubmed:16713569|imex:IM-11827|mint:MINT-5218676,taxid:9606(human),...,uniprotkb:Q9BYW2-0,uniprotkb:Q96RK0-0,MKQLQPQPPPKMGDFYDPEHPTPEEEENEAKIENVQKTGFIKGPMF...,MKPMKKACTGLSGPGSGSKSPPATRAKALRRRGAGEGDKPEEEDDE...,uniprotkb:Q9BYW2-0,SETD2_HUMAN,Homo sapiens (Human),uniprotkb:Q96RK0-0,CIC_HUMAN,Homo sapiens (Human)
2033,intact:EBI-945869,intact:EBI-945857,uniprotkb:Q9BYW2|ensembl:ENSP00000386759.3,uniprotkb:Q96RK0|ensembl:ENSP00000505728.1,psi-mi:setd2_human|psi-mi:SETD2|uniprotkb:O753...,psi-mi:cic_human|psi-mi:CIC|uniprotkb:Q7LGI1|u...,"psi-mi:""MI:0096""(pull down)",Lim et al. (2006),pubmed:16713569|imex:IM-11827|mint:MINT-5218676,taxid:9606(human),...,uniprotkb:Q9BYW2-0,uniprotkb:Q96RK0-0,MKQLQPQPPPKMGDFYDPEHPTPEEEENEAKIENVQKTGFIKGPMF...,MKPMKKACTGLSGPGSGSKSPPATRAKALRRRGAGEGDKPEEEDDE...,uniprotkb:Q9BYW2-0,SETD2_HUMAN,Homo sapiens (Human),uniprotkb:Q96RK0-PRO_0000048598,CIC_HUMAN,Homo sapiens (Human)
2034,intact:EBI-945869,intact:EBI-945857,uniprotkb:Q9BYW2|ensembl:ENSP00000386759.3,uniprotkb:Q96RK0|ensembl:ENSP00000505728.1,psi-mi:setd2_human|psi-mi:SETD2|uniprotkb:O753...,psi-mi:cic_human|psi-mi:CIC|uniprotkb:Q7LGI1|u...,"psi-mi:""MI:0096""(pull down)",Lim et al. (2006),pubmed:16713569|imex:IM-11827|mint:MINT-5218676,taxid:9606(human),...,uniprotkb:Q9BYW2-0,uniprotkb:Q96RK0-0,MKQLQPQPPPKMGDFYDPEHPTPEEEENEAKIENVQKTGFIKGPMF...,MKPMKKACTGLSGPGSGSKSPPATRAKALRRRGAGEGDKPEEEDDE...,uniprotkb:Q9BYW2-PRO_0000252367,SETD2_HUMAN,Homo sapiens (Human),uniprotkb:Q96RK0-0,CIC_HUMAN,Homo sapiens (Human)


In [90]:
# Display the ID-mapping table that serves as the right-hand side of the merges above.
idmap_merge

Unnamed: 0,canonical_uniprotkb,uniprotkb_full,uniprot_gene_name,database,sequence,isoform_or_chain_from_uniprotkb,isoform_from_desc,species
0,A0A0C3SFZ9,uniprotkb:A0A0C3SFZ9-0,A0A0C3SFZ9_HUMAN,tr,MSYFGEHFWGEKNHGFEVLYHSVKQGPISTKELADFIRERATIEET...,0,,Homo sapiens (Human)
1,A0A0S2Z333,uniprotkb:A0A0S2Z333-0,A0A0S2Z333_HUMAN,tr,MASRLTLLTLLLLLLAGVGQLQLSHNLSLVILVPQNLKHRLEDMEQ...,0,Isoform 4,Homo sapiens (Human)
2,A0A0S2Z333,uniprotkb:A0A0S2Z333-PRO_5006608196,A0A0S2Z333_HUMAN,tr,LQLSHNLSLVILVPQNLKHRLEDMEQALSPSVFKAIMEKLEMSKFQ...,PRO_5006608196,,Homo sapiens (Human)
3,A0A0S2Z341,uniprotkb:A0A0S2Z341-0,A0A0S2Z341_HUMAN,tr,MAPLCPSPWLPLLIPAPAPGLTVQLLLSLLLLVPVHPQRLPRMQED...,0,,Homo sapiens (Human)
4,A0A0S2Z341,uniprotkb:A0A0S2Z341-PRO_5006608187,A0A0S2Z341_HUMAN,tr,QRLPRMQEDSPLGGGSSGEDDPLGEEDLPSEEDSPREEDPPGEEDL...,PRO_5006608187,,Homo sapiens (Human)
...,...,...,...,...,...,...,...,...
2213,Q9Y738,uniprotkb:Q9Y738-PRO_0000096492,MIS12_SCHPO,sp,MLVELLEFTPLSFIDDVINITNQLLYKGVNGVDKAFSQTRFAKKAP...,PRO_0000096492,,Schizosaccharomyces pombe (strain 972 / ATCC 2...
2214,Q9Z0S9,uniprotkb:Q9Z0S9-0,PRAF1_MOUSE,sp,MAAQKDQQKDAEGEGLSATTLLPKLIPSGAGREWLERRRATIRPWG...,0,,Mus musculus (Mouse)
2215,Q9Z0S9,uniprotkb:Q9Z0S9-PRO_0000220879,PRAF1_MOUSE,sp,MAAQKDQQKDAEGEGLSATTLLPKLIPSGAGREWLERRRATIRPWG...,PRO_0000220879,,Mus musculus (Mouse)
2216,Q9Z254,uniprotkb:Q9Z254-0,GIPC1_RAT,sp,MPLGLGRRKKAPPLVENEEAEPSRSGLGVGEPGPLGGSGAGESQMG...,0,,Rattus norvegicus (Rat)


In [91]:
# Here's where it gets interesting.
# Find cases where the provided uniprot matches one isoform and not another, by comparing
# the existing sequence (aa_1 / aa_2) with the one pulled in from the ID mapping.
def _seq_flag(row, own_col, idmap_col, relation):
    """Return relation(own, idmap) when both row values are str; otherwise None."""
    own, mapped = row[own_col], row[idmap_col]
    if type(own) is str and type(mapped) is str:
        return relation(own, mapped)
    return None


# "equals": the two sequences are identical; "isin": the existing sequence is a
# substring of the ID-mapping sequence.
_relations = (
    ("equals", lambda own, mapped: own == mapped),
    ("isin", lambda own, mapped: own in mapped),
)
for _label, _relation in _relations:
    for _side, _aa in (("A", "aa_1"), ("B", "aa_2")):
        merged_neg2[f"aa_intact_{_label}_aa_idmap_{_side}"] = merged_neg2.apply(
            _seq_flag, axis=1, args=(_aa, f"{_aa}_fromidmap", _relation)
        )

# Report, per interactor, how many rows could not be compared (flag missing) and how many could.
for _side in ("A", "B"):
    _flag = merged_neg2[f"aa_intact_equals_aa_idmap_{_side}"]

    test1 = int(_flag.isna().sum())
    print(f"total rows where aa_intact_equals_aa_idmap_{_side} isna: {test1}")

    test1 = int(_flag.notna().sum())
    print(f"total rows where aa_intact_equals_aa_idmap_{_side} notna: {test1}")

total rows where aa_intact_equals_aa_idmap_A isna: 108
total rows where aa_intact_equals_aa_idmap_A notna: 862
total rows where aa_intact_equals_aa_idmap_B isna: 62
total rows where aa_intact_equals_aa_idmap_B notna: 908


In [92]:
# Work on an independent copy (DataFrame.copy is deep by default) so the steps
# below do not modify merged_neg.
merged_neg3 = merged_neg.copy()

In [93]:
def _add_default_isoform(uid):
    """Append "-0" to a plain-`str` ID that has no "-"; return anything else unchanged."""
    if type(uid) is str and "-" not in uid:
        return f"{uid}-0"
    return uid


def _drop_isoform(uid):
    """Keep the text before the first "-" of a plain-`str` ID; return anything else unchanged."""
    if type(uid) is str:
        return uid.split("-")[0]
    return uid


# Drop the *_noiso1 columns, then rebuild the full and isoform-free ID columns and the
# order-independent pair keys from uniprot_A / uniprot_B.
merged_neg3 = merged_neg3.drop(
    columns=["uniprot_A_noiso1", "uniprot_B_noiso1", "unique_uniprot_noiso1_pair"]
)
merged_neg3["uniprot_A_full"] = merged_neg3["uniprot_A"].map(_add_default_isoform)
merged_neg3["uniprot_B_full"] = merged_neg3["uniprot_B"].map(_add_default_isoform)
merged_neg3["unique_uniprot_pair"] = merged_neg3.apply(
    get_unique_id, axis=1, colA="uniprot_A_full", colB="uniprot_B_full"
)
merged_neg3["uniprot_A_noisoforms"] = merged_neg3["uniprot_A_full"].map(_drop_isoform)
merged_neg3["uniprot_B_noisoforms"] = merged_neg3["uniprot_B_full"].map(_drop_isoform)
merged_neg3["unique_uniprot_noisoforms_pair"] = merged_neg3.apply(
    get_unique_id, axis=1, colA="uniprot_A_noisoforms", colB="uniprot_B_noisoforms"
)

In [94]:
def _distinct(values):
    """Collapse one group's values into a list of its distinct entries (set order)."""
    return list(set(values))


# For every (unique_id, seq_pair_id) group, list the distinct IDs and miscores it contains.
_distinct_per_group = {
    "unique_uniprot_A_noisoforms": ("uniprot_A_noisoforms", _distinct),
    "unique_uniprot_B_noisoforms": ("uniprot_B_noisoforms", _distinct),
    "unique_uniprot_A_full": ("uniprot_A_full", _distinct),
    "unique_uniprot_B_full": ("uniprot_B_full", _distinct),
    "unique_miscores": ("miscore", _distinct),
}
gb = merged_neg3.groupby(["unique_id", "seq_pair_id"]).agg(**_distinct_per_group)
gb.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,unique_uniprot_A_noisoforms,unique_uniprot_B_noisoforms,unique_uniprot_A_full,unique_uniprot_B_full,unique_miscores
unique_id,seq_pair_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
intact:EBI-1002565_intact:EBI-1002822,seqpair1,[uniprotkb:Q10173],[uniprotkb:Q9Y738],[uniprotkb:Q10173-0],[uniprotkb:Q9Y738-0],[0.37]
intact:EBI-1014500_intact:EBI-1397518,seqpair2,[uniprotkb:P35240],[uniprotkb:P0DPB3],[uniprotkb:P35240-1],[uniprotkb:P0DPB3-1],[0.4]
intact:EBI-10171697_intact:EBI-16428984,seqpair3,[uniprotkb:A0A0S2Z6H0],[uniprotkb:Q6A162],[uniprotkb:A0A0S2Z6H0-0],[uniprotkb:Q6A162-0],[0.37]
intact:EBI-10171697_intact:EBI-16467584,seqpair4,[None],[uniprotkb:Q6A162],[None],[uniprotkb:Q6A162-0],[0.37]
intact:EBI-10171697_intact:EBI-16468000,seqpair5,[uniprotkb:A0A0S2Z5U3],[uniprotkb:Q6A162],[uniprotkb:A0A0S2Z5U3-0],[uniprotkb:Q6A162-0],[0.37]


In [95]:
# Each check passes when no group holds more than one distinct value in the column.
print("Grouped by unique_id (intact:EBI-1_intact:EBI-2) and seq_pair_id (sequence pair), checking for consistency otherwise. Each group has:")
_consistency_checks = (
    ("unique_uniprot_A_full", "\tAt most 1 unique uniprot A full isoform"),
    ("unique_uniprot_B_full", "\tAt most 1 unique uniprot B full isoform"),
    ("unique_uniprot_A_noisoforms", "\tAt most 1 unique uniprot A ID without isoform"),
    ("unique_uniprot_B_noisoforms", "\tAt most 1 unique uniprot B ID without isoform"),
    ("unique_miscores", "\tAt most 1 unique miscore"),
)
for _list_col, _label in _consistency_checks:
    _has_several = gb[_list_col].map(len) > 1
    test1 = not _has_several.any()
    print(f"{_label}: {test1}")

Grouped by unique_id (intact:EBI-1_intact:EBI-2) and seq_pair_id (sequence pair), checking for consistency otherwise. Each group has:
	At most 1 unique uniprot A full isoform: True
	At most 1 unique uniprot B full isoform: True
	At most 1 unique uniprot A ID without isoform: True
	At most 1 unique uniprot B ID without isoform: True
	At most 1 unique miscore: True


In [96]:
# Keep the first row of every (unique_id, seq_pair_id) pair and only the columns
# needed for the isoform matching that follows.
_pair_key = ["unique_id", "seq_pair_id"]
_kept_cols = [
    "unique_id", "seq_pair_id", "aa_1", "aa_2", "miscore",
    "uniprot_A", "uniprot_B",
    "uniprot_A_full", "uniprot_B_full",
    "uniprot_A_noisoforms", "uniprot_B_noisoforms",
    "unique_uniprot_pair", "unique_uniprot_noisoforms_pair",
]
merged_neg3 = merged_neg3.drop_duplicates(subset=_pair_key).reset_index(drop=True)
merged_neg3 = merged_neg3[_kept_cols]
print(f"After dropping duplicates, merged_neg3 has {len(merged_neg3)} rows")
merged_neg3.head()

After dropping duplicates, merged_neg3 has 918 rows


Unnamed: 0,unique_id,seq_pair_id,aa_1,aa_2,miscore,uniprot_A,uniprot_B,uniprot_A_full,uniprot_B_full,uniprot_A_noisoforms,uniprot_B_noisoforms,unique_uniprot_pair,unique_uniprot_noisoforms_pair
0,intact:EBI-1002565_intact:EBI-1002822,seqpair1,MARKHTFPSLKRAEILECIDGLGIPFTAKELDQPTSKAVIPLYEEF...,MLVELLEFTPLSFIDDVINITNQLLYKGVNGVDKAFSQTRFAKKAP...,0.37,uniprotkb:Q10173,uniprotkb:Q9Y738,uniprotkb:Q10173-0,uniprotkb:Q9Y738-0,uniprotkb:Q10173,uniprotkb:Q9Y738,uniprotkb:Q10173-0_uniprotkb:Q9Y738-0,uniprotkb:Q10173_uniprotkb:Q9Y738
1,intact:EBI-1014500_intact:EBI-1397518,seqpair2,MAGAIASRMSFSSLKRKQPKTFTVRIVTMDAEMEFNCEMKWKGKDL...,MERSGQRVTTWDCDQGKHSDSDYREDGMDLGSDAGSSSSSSRASSQ...,0.4,uniprotkb:P35240-1,uniprotkb:P0DPB3-1,uniprotkb:P35240-1,uniprotkb:P0DPB3-1,uniprotkb:P35240,uniprotkb:P0DPB3,uniprotkb:P0DPB3-1_uniprotkb:P35240-1,uniprotkb:P0DPB3_uniprotkb:P35240
2,intact:EBI-10171697_intact:EBI-16428984,seqpair3,MDEESLESALQTYRAQLQQVELALGAGLDSSEQADLRQLQGDLKEL...,MTSDCSSTHCSPESCGTASGCAPASSCSVETACLPGTCATSRCQTP...,0.37,uniprotkb:A0A0S2Z6H0,uniprotkb:Q6A162,uniprotkb:A0A0S2Z6H0-0,uniprotkb:Q6A162-0,uniprotkb:A0A0S2Z6H0,uniprotkb:Q6A162,uniprotkb:A0A0S2Z6H0-0_uniprotkb:Q6A162-0,uniprotkb:A0A0S2Z6H0_uniprotkb:Q6A162
3,intact:EBI-10171697_intact:EBI-16467584,seqpair4,MASSTPSSSATSSNAGADPNTTNLRPTTYDTWCGVAHGCTRKLGLK...,MTSDCSSTHCSPESCGTASGCAPASSCSVETACLPGTCATSRCQTP...,0.37,,uniprotkb:Q6A162,,uniprotkb:Q6A162-0,,uniprotkb:Q6A162,_uniprotkb:Q6A162-0,_uniprotkb:Q6A162
4,intact:EBI-10171697_intact:EBI-16468000,seqpair5,MSSSSSSPRETYEEDREYESQAKRLKTEEGEIDYSAEEGENRREAT...,MTSDCSSTHCSPESCGTASGCAPASSCSVETACLPGTCATSRCQTP...,0.37,uniprotkb:A0A0S2Z5U3,uniprotkb:Q6A162,uniprotkb:A0A0S2Z5U3-0,uniprotkb:Q6A162-0,uniprotkb:A0A0S2Z5U3,uniprotkb:Q6A162,uniprotkb:A0A0S2Z5U3-0_uniprotkb:Q6A162-0,uniprotkb:A0A0S2Z5U3_uniprotkb:Q6A162


In [97]:
# On the shrunken merged_neg3, left-join the ID map on each partner's isoform-free ID
# so every possible isoform match is attached; the best ones are filtered out later.
def _with_uniprotkb_prefix(accession):
    """Prefix `accession` with "uniprotkb:" unless it already starts with it."""
    if accession.startswith("uniprotkb:"):
        return accession
    return "uniprotkb:" + accession


idmap_merge_copy = idmap_merge.copy()
idmap_merge_copy["canonical_uniprot"] = idmap_merge_copy["canonical_uniprotkb"].map(
    _with_uniprotkb_prefix
)

# ID-map column -> partner-specific column; the dict order is also the column order
# of the lookup table that gets joined.
_idmap_cols_A = {
    "canonical_uniprot": "uniprot_A_noisoforms",
    "uniprotkb_full": "uniprot_A_fromidmap",
    "sequence": "aa_1_fromidmap",
    "uniprot_gene_name": "uniprot_gene_name_A",
    "species": "species_A",
}
_idmap_cols_B = {
    "canonical_uniprot": "uniprot_B_noisoforms",
    "uniprotkb_full": "uniprot_B_fromidmap",
    "sequence": "aa_2_fromidmap",
    "uniprot_gene_name": "uniprot_gene_name_B",
    "species": "species_B",
}
for _renames in (_idmap_cols_A, _idmap_cols_B):
    _lookup = idmap_merge_copy.rename(columns=_renames)[list(_renames.values())]
    merged_neg3 = merged_neg3.merge(
        _lookup,
        on=[_renames["canonical_uniprot"]],
        how="left",
    )
print(f"After merging with idmap to get all possible isoform matches, merged_neg3 has {len(merged_neg3)} rows")

After merging with idmap to get all possible isoform matches, merged_neg3 has 8891 rows


In [98]:
def _str_pair_test(intact_col, idmap_col, test):
    """Build a row function applying `test(intact, idmap)` to two sequence cells.

    The returned function yields None unless both cells are plain `str`.
    """
    def _run(row):
        intact_seq, idmap_seq = row[intact_col], row[idmap_col]
        if type(intact_seq) is str and type(idmap_seq) is str:
            return test(intact_seq, idmap_seq)
        return None
    return _run


# Flag, per candidate ID-map row, whether the IntAct sequence equals the ID-map
# sequence and whether it is contained in it.
_seq_flags = (
    ("aa_intact_equals_aa_idmap_A", "aa_1", "aa_1_fromidmap", lambda intact, idmap: intact == idmap),
    ("aa_intact_equals_aa_idmap_B", "aa_2", "aa_2_fromidmap", lambda intact, idmap: intact == idmap),
    ("aa_intact_isin_aa_idmap_A", "aa_1", "aa_1_fromidmap", lambda intact, idmap: intact in idmap),
    ("aa_intact_isin_aa_idmap_B", "aa_2", "aa_2_fromidmap", lambda intact, idmap: intact in idmap),
)
for _flag_col, _intact_col, _idmap_col, _test in _seq_flags:
    merged_neg3[_flag_col] = merged_neg3.apply(
        _str_pair_test(_intact_col, _idmap_col, _test), axis=1
    )

display(merged_neg3.head())
_comparable_A = merged_neg3["aa_intact_equals_aa_idmap_A"].notna()
display(merged_neg3.loc[_comparable_A].head())

Unnamed: 0,unique_id,seq_pair_id,aa_1,aa_2,miscore,uniprot_A,uniprot_B,uniprot_A_full,uniprot_B_full,uniprot_A_noisoforms,...,uniprot_gene_name_A,species_A,uniprot_B_fromidmap,aa_2_fromidmap,uniprot_gene_name_B,species_B,aa_intact_equals_aa_idmap_A,aa_intact_equals_aa_idmap_B,aa_intact_isin_aa_idmap_A,aa_intact_isin_aa_idmap_B
0,intact:EBI-1002565_intact:EBI-1002822,seqpair1,MARKHTFPSLKRAEILECIDGLGIPFTAKELDQPTSKAVIPLYEEF...,MLVELLEFTPLSFIDDVINITNQLLYKGVNGVDKAFSQTRFAKKAP...,0.37,uniprotkb:Q10173,uniprotkb:Q9Y738,uniprotkb:Q10173-0,uniprotkb:Q9Y738-0,uniprotkb:Q10173,...,NUF2_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...,uniprotkb:Q9Y738-0,MLVELLEFTPLSFIDDVINITNQLLYKGVNGVDKAFSQTRFAKKAP...,MIS12_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...,True,True,True,True
1,intact:EBI-1002565_intact:EBI-1002822,seqpair1,MARKHTFPSLKRAEILECIDGLGIPFTAKELDQPTSKAVIPLYEEF...,MLVELLEFTPLSFIDDVINITNQLLYKGVNGVDKAFSQTRFAKKAP...,0.37,uniprotkb:Q10173,uniprotkb:Q9Y738,uniprotkb:Q10173-0,uniprotkb:Q9Y738-0,uniprotkb:Q10173,...,NUF2_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...,uniprotkb:Q9Y738-PRO_0000096492,MLVELLEFTPLSFIDDVINITNQLLYKGVNGVDKAFSQTRFAKKAP...,MIS12_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...,True,True,True,True
2,intact:EBI-1002565_intact:EBI-1002822,seqpair1,MARKHTFPSLKRAEILECIDGLGIPFTAKELDQPTSKAVIPLYEEF...,MLVELLEFTPLSFIDDVINITNQLLYKGVNGVDKAFSQTRFAKKAP...,0.37,uniprotkb:Q10173,uniprotkb:Q9Y738,uniprotkb:Q10173-0,uniprotkb:Q9Y738-0,uniprotkb:Q10173,...,NUF2_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...,uniprotkb:Q9Y738-0,MLVELLEFTPLSFIDDVINITNQLLYKGVNGVDKAFSQTRFAKKAP...,MIS12_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...,True,True,True,True
3,intact:EBI-1002565_intact:EBI-1002822,seqpair1,MARKHTFPSLKRAEILECIDGLGIPFTAKELDQPTSKAVIPLYEEF...,MLVELLEFTPLSFIDDVINITNQLLYKGVNGVDKAFSQTRFAKKAP...,0.37,uniprotkb:Q10173,uniprotkb:Q9Y738,uniprotkb:Q10173-0,uniprotkb:Q9Y738-0,uniprotkb:Q10173,...,NUF2_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...,uniprotkb:Q9Y738-PRO_0000096492,MLVELLEFTPLSFIDDVINITNQLLYKGVNGVDKAFSQTRFAKKAP...,MIS12_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...,True,True,True,True
4,intact:EBI-1014500_intact:EBI-1397518,seqpair2,MAGAIASRMSFSSLKRKQPKTFTVRIVTMDAEMEFNCEMKWKGKDL...,MERSGQRVTTWDCDQGKHSDSDYREDGMDLGSDAGSSSSSSRASSQ...,0.4,uniprotkb:P35240-1,uniprotkb:P0DPB3-1,uniprotkb:P35240-1,uniprotkb:P0DPB3-1,uniprotkb:P35240,...,MERL_HUMAN,Homo sapiens (Human),uniprotkb:P0DPB3-0,MERSGQRVTTWDCDQGKHSDSDYREDGMDLGSDAGSSSSSSRASSQ...,SCHI1_HUMAN,Homo sapiens (Human),True,True,True,True


Unnamed: 0,unique_id,seq_pair_id,aa_1,aa_2,miscore,uniprot_A,uniprot_B,uniprot_A_full,uniprot_B_full,uniprot_A_noisoforms,...,uniprot_gene_name_A,species_A,uniprot_B_fromidmap,aa_2_fromidmap,uniprot_gene_name_B,species_B,aa_intact_equals_aa_idmap_A,aa_intact_equals_aa_idmap_B,aa_intact_isin_aa_idmap_A,aa_intact_isin_aa_idmap_B
0,intact:EBI-1002565_intact:EBI-1002822,seqpair1,MARKHTFPSLKRAEILECIDGLGIPFTAKELDQPTSKAVIPLYEEF...,MLVELLEFTPLSFIDDVINITNQLLYKGVNGVDKAFSQTRFAKKAP...,0.37,uniprotkb:Q10173,uniprotkb:Q9Y738,uniprotkb:Q10173-0,uniprotkb:Q9Y738-0,uniprotkb:Q10173,...,NUF2_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...,uniprotkb:Q9Y738-0,MLVELLEFTPLSFIDDVINITNQLLYKGVNGVDKAFSQTRFAKKAP...,MIS12_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...,True,True,True,True
1,intact:EBI-1002565_intact:EBI-1002822,seqpair1,MARKHTFPSLKRAEILECIDGLGIPFTAKELDQPTSKAVIPLYEEF...,MLVELLEFTPLSFIDDVINITNQLLYKGVNGVDKAFSQTRFAKKAP...,0.37,uniprotkb:Q10173,uniprotkb:Q9Y738,uniprotkb:Q10173-0,uniprotkb:Q9Y738-0,uniprotkb:Q10173,...,NUF2_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...,uniprotkb:Q9Y738-PRO_0000096492,MLVELLEFTPLSFIDDVINITNQLLYKGVNGVDKAFSQTRFAKKAP...,MIS12_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...,True,True,True,True
2,intact:EBI-1002565_intact:EBI-1002822,seqpair1,MARKHTFPSLKRAEILECIDGLGIPFTAKELDQPTSKAVIPLYEEF...,MLVELLEFTPLSFIDDVINITNQLLYKGVNGVDKAFSQTRFAKKAP...,0.37,uniprotkb:Q10173,uniprotkb:Q9Y738,uniprotkb:Q10173-0,uniprotkb:Q9Y738-0,uniprotkb:Q10173,...,NUF2_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...,uniprotkb:Q9Y738-0,MLVELLEFTPLSFIDDVINITNQLLYKGVNGVDKAFSQTRFAKKAP...,MIS12_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...,True,True,True,True
3,intact:EBI-1002565_intact:EBI-1002822,seqpair1,MARKHTFPSLKRAEILECIDGLGIPFTAKELDQPTSKAVIPLYEEF...,MLVELLEFTPLSFIDDVINITNQLLYKGVNGVDKAFSQTRFAKKAP...,0.37,uniprotkb:Q10173,uniprotkb:Q9Y738,uniprotkb:Q10173-0,uniprotkb:Q9Y738-0,uniprotkb:Q10173,...,NUF2_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...,uniprotkb:Q9Y738-PRO_0000096492,MLVELLEFTPLSFIDDVINITNQLLYKGVNGVDKAFSQTRFAKKAP...,MIS12_SCHPO,Schizosaccharomyces pombe (strain 972 / ATCC 2...,True,True,True,True
4,intact:EBI-1014500_intact:EBI-1397518,seqpair2,MAGAIASRMSFSSLKRKQPKTFTVRIVTMDAEMEFNCEMKWKGKDL...,MERSGQRVTTWDCDQGKHSDSDYREDGMDLGSDAGSSSSSSRASSQ...,0.4,uniprotkb:P35240-1,uniprotkb:P0DPB3-1,uniprotkb:P35240-1,uniprotkb:P0DPB3-1,uniprotkb:P35240,...,MERL_HUMAN,Homo sapiens (Human),uniprotkb:P0DPB3-0,MERSGQRVTTWDCDQGKHSDSDYREDGMDLGSDAGSSSSSSRASSQ...,SCHI1_HUMAN,Homo sapiens (Human),True,True,True,True


In [99]:
# Per (unique_id, seq_pair_id), collect the ID-map IDs whose sequence equals, or
# contains, the IntAct sequence.
def _ids_where_flag_true(flag_col, id_col, out_col):
    """List the distinct `id_col` values of the rows whose `flag_col` is set and true."""
    flag = merged_neg3[flag_col]
    flagged_rows = merged_neg3.loc[(flag.notna()) & (flag)]
    return flagged_rows.groupby(["unique_id", "seq_pair_id"]).agg(
        **{out_col: (id_col, lambda ids: list(set(ids)))}
    )


gb_equal_A = _ids_where_flag_true("aa_intact_equals_aa_idmap_A", "uniprot_A_fromidmap", "uniprot_A_equalseq")
gb_equal_B = _ids_where_flag_true("aa_intact_equals_aa_idmap_B", "uniprot_B_fromidmap", "uniprot_B_equalseq")
gb_isin_A = _ids_where_flag_true("aa_intact_isin_aa_idmap_A", "uniprot_A_fromidmap", "uniprot_A_inseq")
gb_isin_B = _ids_where_flag_true("aa_intact_isin_aa_idmap_B", "uniprot_B_fromidmap", "uniprot_B_inseq")

# Put the four per-group tables side by side, aligned on the group keys.
_per_group_tables = [gb_equal_A, gb_equal_B, gb_isin_A, gb_isin_B]
gb = pd.concat(_per_group_tables, axis=1).reset_index()
gb.head()

Unnamed: 0,unique_id,seq_pair_id,uniprot_A_equalseq,uniprot_B_equalseq,uniprot_A_inseq,uniprot_B_inseq
0,intact:EBI-1002565_intact:EBI-1002822,seqpair1,"[uniprotkb:Q10173-0, uniprotkb:Q10173-PRO_0000...","[uniprotkb:Q9Y738-0, uniprotkb:Q9Y738-PRO_0000...","[uniprotkb:Q10173-0, uniprotkb:Q10173-PRO_0000...","[uniprotkb:Q9Y738-0, uniprotkb:Q9Y738-PRO_0000..."
1,intact:EBI-1014500_intact:EBI-1397518,seqpair2,"[uniprotkb:P35240-0, uniprotkb:P35240-PRO_0000...","[uniprotkb:P0DPB3-PRO_0000288927, uniprotkb:P0...","[uniprotkb:P35240-0, uniprotkb:P35240-PRO_0000...","[uniprotkb:P0DPB3-PRO_0000288927, uniprotkb:P0..."
2,intact:EBI-10171697_intact:EBI-16428984,seqpair3,[uniprotkb:A0A0S2Z6H0-0],"[uniprotkb:Q6A162-0, uniprotkb:Q6A162-PRO_0000...",[uniprotkb:A0A0S2Z6H0-0],"[uniprotkb:Q6A162-0, uniprotkb:Q6A162-PRO_0000..."
3,intact:EBI-10171697_intact:EBI-16468000,seqpair5,[uniprotkb:A0A0S2Z5U3-0],"[uniprotkb:Q6A162-0, uniprotkb:Q6A162-PRO_0000...",[uniprotkb:A0A0S2Z5U3-0],"[uniprotkb:Q6A162-0, uniprotkb:Q6A162-PRO_0000..."
4,intact:EBI-10171697_intact:EBI-16470161,seqpair7,[uniprotkb:A0A0S2Z429-0],"[uniprotkb:Q6A162-0, uniprotkb:Q6A162-PRO_0000...",[uniprotkb:A0A0S2Z429-0],"[uniprotkb:Q6A162-0, uniprotkb:Q6A162-PRO_0000..."


In [100]:
# Left-join the per-pair match lists onto the full table.
merged_neg = merged_neg.merge(gb, how="left", on=["unique_id", "seq_pair_id"])
print(f"\tMerged back the sequence isoforms that equal and contain the sequences provided through IntAct XML. New merged_neg size: {len(merged_neg)}")

	Merged back the sequence isoforms that equal and contain the sequences provided through IntAct XML. New merged_neg size: 970


In [101]:
# List every merged_neg column whose name mentions "uniprot".
[col_name for col_name in merged_neg.columns if "uniprot" in col_name]

['uniprot_A',
 'uniprot_B',
 'uniprotkb_1',
 'uniprotkb_2',
 'no_uniprot_update_A',
 'no_uniprot_update_B',
 'unique_uniprot_pair',
 'uniprot_A_noiso1',
 'uniprot_B_noiso1',
 'unique_uniprot_noiso1_pair',
 'uniprot_A_noisoforms',
 'uniprot_B_noisoforms',
 'unique_uniprot_noisoforms_pair',
 'uniprot_kb_clust_match_A',
 'uniprot_kb_clust_match_B',
 'uniprot_A_equalseq',
 'uniprot_B_equalseq',
 'uniprot_A_inseq',
 'uniprot_B_inseq']

In [102]:
# Check whether every missing sequence match is explained by there being no UniProt
# ID to begin with (partner 1).
_unexplained_A = merged_neg["uniprot_A_equalseq"].isna() & merged_neg["uniprot_A"].notna()
test1 = not _unexplained_A.any()
print(f"Every case where there is no matching sequence for partner 1 is because there was no provided uniprot in the first place: {test1}")

# Same check for partner 2, then list the provided IDs of any unexplained rows.
_unexplained_B = merged_neg["uniprot_B_equalseq"].isna() & merged_neg["uniprot_B"].notna()
test1 = not _unexplained_B.any()
print(f"Every case where there is no matching sequence for partner 2 is because there was no provided uniprot in the first place: {test1}")
test1 = merged_neg.loc[_unexplained_B, "uniprot_B"].unique().tolist()
print(f"\tuniprotkb2s in this position: {test1}")

Every case where there is no matching sequence for partner 1 is because there was no provided uniprot in the first place: True
Every case where there is no matching sequence for partner 2 is because there was no provided uniprot in the first place: False
	uniprotkb2s in this position: ['uniprotkb:Cerebral protein 11']


In [103]:
# Show the partner-B rows that have a provided UniProt ID but no sequence match.
_has_id_no_match_B = merged_neg["uniprot_B_equalseq"].isna() & merged_neg["uniprot_B"].notna()
merged_neg.loc[
    _has_id_no_match_B,
    ["interaction_intactid", "protein_2", "uniprot_B", "uniprot_B_noisoforms", "aa_2", "uniprot_B_equalseq"],
]

Unnamed: 0,interaction_intactid,protein_2,uniprot_B,uniprot_B_noisoforms,aa_2,uniprot_B_equalseq
45,EBI-16475727,clone_11897,uniprotkb:Cerebral protein 11,uniprotkb:Cerebral protein 11,MEEKVAYQSYERARDIQEAVESCLTRVTKLELQQQQQQVVQLEGVE...,
46,EBI-16468245,clone_11897,uniprotkb:Cerebral protein 11,uniprotkb:Cerebral protein 11,MEEKVAYQSYERARDIQEAVESCLTRVTKLELQQQQQQVVQLEGVE...,
47,EBI-16464252,clone_11897,uniprotkb:Cerebral protein 11,uniprotkb:Cerebral protein 11,MEEKVAYQSYERARDIQEAVESCLTRVTKLELQQQQQQVVQLEGVE...,
48,EBI-16475711,clone_11897,uniprotkb:Cerebral protein 11,uniprotkb:Cerebral protein 11,MEEKVAYQSYERARDIQEAVESCLTRVTKLELQQQQQQVVQLEGVE...,
49,EBI-16468229,clone_11897,uniprotkb:Cerebral protein 11,uniprotkb:Cerebral protein 11,MEEKVAYQSYERARDIQEAVESCLTRVTKLELQQQQQQVVQLEGVE...,


In [104]:
# Show the rows where the IntAct-provided ID for partner A is NOT one of the IDs whose
# sequence equals the provided sequence.
#
# uniprot_A holds a single string while uniprot_A_equalseq holds a list (or NaN), so a
# plain `!=` between the two columns is true for every row and filters nothing. Test
# membership instead, after giving IDs without an isoform suffix the same "-0" suffix
# used when uniprot_A_full is built.
def _intact_id_among_matches(provided_id, matched_ids):
    """Return True when `provided_id` (with "-0" appended if it has no "-") is in `matched_ids`.

    Rows with no provided ID or no match list count as not matched.
    """
    if not isinstance(provided_id, str) or not isinstance(matched_ids, list):
        return False
    full_id = provided_id if "-" in provided_id else f"{provided_id}-0"
    return full_id in matched_ids


_id_confirmed_A = pd.Series(
    [
        _intact_id_among_matches(provided_id, matched_ids)
        for provided_id, matched_ids in zip(merged_neg["uniprot_A"], merged_neg["uniprot_A_equalseq"])
    ],
    index=merged_neg.index,
    dtype=bool,
)
merged_neg.loc[~_id_confirmed_A, ["uniprot_A", "uniprot_A_equalseq"]]

Unnamed: 0,uniprot_A,uniprot_A_equalseq
0,uniprotkb:Q10173,"[uniprotkb:Q10173-0, uniprotkb:Q10173-PRO_0000..."
1,uniprotkb:P35240-1,"[uniprotkb:P35240-0, uniprotkb:P35240-PRO_0000..."
2,uniprotkb:A0A0S2Z6H0,[uniprotkb:A0A0S2Z6H0-0]
3,,
4,uniprotkb:A0A0S2Z5U3,[uniprotkb:A0A0S2Z5U3-0]
...,...,...
965,uniprotkb:Q86UW1,"[uniprotkb:Q86UW1-PRO_0000331543, uniprotkb:Q8..."
966,uniprotkb:O95292-2,[uniprotkb:O95292-2]
967,uniprotkb:Q8WXU2-2,[uniprotkb:Q8WXU2-2]
968,uniprotkb:Q96PU8,"[uniprotkb:Q96PU8-0, uniprotkb:Q96PU8-PRO_0000..."


In [105]:
def _full_isoform_id(uid):
    """Return a plain-`str` ID with "-0" appended when it has no "-"; pass anything else through."""
    if type(uid) is not str or "-" in uid:
        return uid
    return f"{uid}-0"


def _isoform_free_id(uid):
    """Return the part of a plain-`str` ID before its first "-"; pass anything else through."""
    return uid.split("-")[0] if type(uid) is str else uid


# Rebuild the full and isoform-free ID columns and both order-independent pair keys.
merged_neg["uniprot_A_full"] = merged_neg["uniprot_A"].apply(_full_isoform_id)
merged_neg["uniprot_B_full"] = merged_neg["uniprot_B"].apply(_full_isoform_id)
merged_neg["unique_uniprot_pair"] = merged_neg.apply(
    get_unique_id, axis=1, colA="uniprot_A_full", colB="uniprot_B_full"
)
merged_neg["uniprot_A_noisoforms"] = merged_neg["uniprot_A_full"].apply(_isoform_free_id)
merged_neg["uniprot_B_noisoforms"] = merged_neg["uniprot_B_full"].apply(_isoform_free_id)
merged_neg["unique_uniprot_noisoforms_pair"] = merged_neg.apply(
    get_unique_id, axis=1, colA="uniprot_A_noisoforms", colB="uniprot_B_noisoforms"
)

In [106]:
def _match_provided_id_A(list_col):
    """Check each row's uniprot_A_full against the IDs listed in `list_col`.

    Returns the exploded list column, the element-wise equality with the row's
    uniprot_A_full, and that equality reduced with `any` per original row label.
    """
    exploded = merged_neg[list_col].explode()
    element_hits = exploded.eq(merged_neg["uniprot_A_full"].loc[exploded.index])
    return exploded, element_hits, element_hits.groupby(level=0).any()


def _count_and_share_A(rows):
    """Format `rows` as "<row count> (<percentage of merged_neg rows>%)"."""
    return f"{len(rows)} ({100*len(rows)/len(merged_neg):.2f}%)"


print(f"Evaluating UniProt validity for partner A")

# Is the provided ID one of the IDs whose sequence equals the provided sequence?
test1, m, equal_mask = _match_provided_id_A("uniprot_A_equalseq")
merged_neg["equal_flag_A"] = equal_mask.reindex(merged_neg.index).fillna(False)
equal_hits = merged_neg.loc[equal_mask]
print(f"\tTotal rows where the provided UniProt ID is one of the ones with the exact right sequence: {_count_and_share_A(equal_hits)}")

# Is the provided ID one of the IDs whose sequence contains the provided sequence?
test1, m, in_mask = _match_provided_id_A("uniprot_A_inseq")
in_hits = merged_neg.loc[in_mask]
print(f"\tTotal rows where the provided UniProt ID is one of the ones which contains the provided sequence: {_count_and_share_A(in_hits)}")

# Either of the two.
mask = equal_mask | in_mask
hits = merged_neg.loc[mask]
merged_neg["equal_or_in_flag_A"] = mask.reindex(merged_neg.index).fillna(False)
print(f"\tTotal rows where the provided UniProt ID by IntAct is in the equal-seq or in-seq list: {_count_and_share_A(hits)}")

misses = merged_neg.loc[~mask]
print(f"\tTotal rows where the provided UniProt ID by IntAct is NOT in the equal-seq or in-seq list: {_count_and_share_A(misses)}")

# Split the misses by whether the row has an equal-sequence list at all.
_no_equalseq_A = merged_neg["uniprot_A_equalseq"].isna()

misses = merged_neg.loc[~mask & _no_equalseq_A]
print(f"\t\tBecause there was no mapped uniprot at all: {_count_and_share_A(misses)}")

misses = merged_neg.loc[~mask & ~_no_equalseq_A]
print(f"\t\tBecause there was no match: {_count_and_share_A(misses)}")

Evaluating UniProt validity for partner A
	Total rows where the provided UniProt ID is one of the ones with the exact right sequence: 862 (88.87%)
	Total rows where the provided UniProt ID is one of the ones which contains the provided sequence: 862 (88.87%)
	Total rows where the provided UniProt ID by IntAct is in the equal-seq or in-seq list: 862 (88.87%)
	Total rows where the provided UniProt ID by IntAct is NOT in the equal-seq or in-seq list: 108 (11.13%)
		Because there was no mapped uniprot at all: 89 (9.18%)
		Because there was no match: 19 (1.96%)


In [107]:
def _match_provided_id_B(list_col):
    """Check each row's uniprot_B_full against the IDs listed in `list_col`.

    Returns the exploded list column, the element-wise equality with the row's
    uniprot_B_full, and that equality reduced with `any` per original row label.
    """
    exploded = merged_neg[list_col].explode()
    element_hits = exploded.eq(merged_neg["uniprot_B_full"].loc[exploded.index])
    return exploded, element_hits, element_hits.groupby(level=0).any()


def _count_and_share_B(rows):
    """Format `rows` as "<row count> (<percentage of merged_neg rows>%)"."""
    return f"{len(rows)} ({100*len(rows)/len(merged_neg):.2f}%)"


print(f"Evaluating UniProt validity for partner B")

# Is the provided ID one of the IDs whose sequence equals the provided sequence?
test1, m, equal_mask = _match_provided_id_B("uniprot_B_equalseq")
merged_neg["equal_flag_B"] = equal_mask.reindex(merged_neg.index).fillna(False)
equal_hits = merged_neg.loc[equal_mask]
print(f"\tTotal rows where the provided UniProt ID is one of the ones with the exact right sequence: {_count_and_share_B(equal_hits)}")

# Is the provided ID one of the IDs whose sequence contains the provided sequence?
test1, m, in_mask = _match_provided_id_B("uniprot_B_inseq")
in_hits = merged_neg.loc[in_mask]
print(f"\tTotal rows where the provided UniProt ID is one of the ones which contains the provided sequence: {_count_and_share_B(in_hits)}")

# Either of the two.
mask = equal_mask | in_mask
hits = merged_neg.loc[mask]
merged_neg["equal_or_in_flag_B"] = mask.reindex(merged_neg.index).fillna(False)
print(f"\tTotal rows where the provided UniProt ID by IntAct is in the equal-seq or in-seq list: {_count_and_share_B(hits)}")

misses = merged_neg.loc[~mask]
print(f"\tTotal rows where the provided UniProt ID by IntAct is NOT in the equal-seq or in-seq list: {_count_and_share_B(misses)}")

# Split the misses by whether the row has an equal-sequence list at all.
_no_equalseq_B = merged_neg["uniprot_B_equalseq"].isna()

misses = merged_neg.loc[~mask & _no_equalseq_B]
print(f"\t\tBecause there was no mapped uniprot at all: {_count_and_share_B(misses)}")

misses = merged_neg.loc[~mask & ~_no_equalseq_B]
print(f"\t\tBecause there was no match: {_count_and_share_B(misses)}")


Evaluating UniProt validity for partner B
	Total rows where the provided UniProt ID is one of the ones with the exact right sequence: 908 (93.61%)
	Total rows where the provided UniProt ID is one of the ones which contains the provided sequence: 908 (93.61%)
	Total rows where the provided UniProt ID by IntAct is in the equal-seq or in-seq list: 908 (93.61%)
	Total rows where the provided UniProt ID by IntAct is NOT in the equal-seq or in-seq list: 62 (6.39%)
		Because there was no mapped uniprot at all: 35 (3.61%)
		Because there was no match: 27 (2.78%)


In [108]:
def _canonical_accessions(isoform_ids):
    """Reduce a list of isoform IDs to the distinct accessions before the first "-".

    Missing cells (None or any float, i.e. the NaN left by the left merge) are
    returned unchanged. The previous `type(x) != float` test treated None and
    numpy float NaN as lists and raised on them.
    """
    if isoform_ids is None or isinstance(isoform_ids, float):
        return isoform_ids
    return list({isoform_id.split("-")[0] for isoform_id in isoform_ids})


# Add a "<column>_canonical" companion for each of the four match-list columns.
for _list_col in (
    "uniprot_A_equalseq",
    "uniprot_A_inseq",
    "uniprot_B_equalseq",
    "uniprot_B_inseq",
):
    merged_neg[f"{_list_col}_canonical"] = merged_neg[_list_col].apply(_canonical_accessions)

In [109]:
# Now check whether the provided ID is at least the right canonical accession, even
# when the isoform is wrong.
# NOTE: test1, m, equal_canonical_mask, equal_hits and misses are reassigned in the
# partner-B half below, so after this cell they hold the partner-B values.
print(f"Making sure UniProt A is at least the right canonical even if it is the wrong isoform")
test1 = merged_neg['uniprot_A_equalseq_canonical'].explode()                      # one row per list element; row labels repeat
m = test1.eq(merged_neg['uniprot_A_noisoforms'].loc[test1.index])             # compare each element to that row's isoform-free ID
equal_canonical_mask = m.groupby(level=0).any()   # collapse back to one True/False per row label
merged_neg['equal_canonical_flag_A'] = (
    equal_canonical_mask
    .reindex(merged_neg.index)
    .fillna(False)
)  # store the row-level flag; labels missing from the mask become False

equal_hits = merged_neg.loc[equal_canonical_mask]                        # rows whose provided ID matches a canonical accession
print(f"\tTotal rows where the provided UniProt ID is the right canonical (but may not be the right isoform): {len(equal_hits)} ({100*len(equal_hits)/len(merged_neg):.2f}%)")

# Non-matching rows with no equal-sequence list at all.
misses = merged_neg.loc[(~equal_canonical_mask) & (merged_neg["uniprot_A_equalseq"].isna())]  
print(f"\tTotal rows where the provided UniProt ID does not exist: {len(misses)} ({100*len(misses)/len(merged_neg):.2f}%)")

# Non-matching rows that do have an equal-sequence list.
misses = merged_neg.loc[(~equal_canonical_mask) & (merged_neg["uniprot_A_equalseq"].notna())]  
print(f"\tTotal rows where the provided UniProt ID exists and is not the right canonical: {len(misses)} ({100*len(misses)/len(merged_neg):.2f}%)")

# Same canonical check for partner B.
print(f"Making sure UniProt B is at least the right canonical even if it is the wrong isoform")
test1 = merged_neg['uniprot_B_equalseq_canonical'].explode()                      # one row per list element; row labels repeat
m = test1.eq(merged_neg['uniprot_B_noisoforms'].loc[test1.index])             # compare each element to that row's isoform-free ID
equal_canonical_mask = m.groupby(level=0).any()   # collapse back to one True/False per row label
merged_neg['equal_canonical_flag_B'] = (
    equal_canonical_mask
    .reindex(merged_neg.index)
    .fillna(False)
)  # store the row-level flag; labels missing from the mask become False

equal_hits = merged_neg.loc[equal_canonical_mask]                        # rows whose provided ID matches a canonical accession
print(f"\tTotal rows where the provided UniProt ID is the right canonical (but may not be the right isoform): {len(equal_hits)} ({100*len(equal_hits)/len(merged_neg):.2f}%)")

# Non-matching rows with no equal-sequence list at all.
misses = merged_neg.loc[(~equal_canonical_mask) & (merged_neg["uniprot_B_equalseq"].isna())]  
print(f"\tTotal rows where the provided UniProt ID does not exist: {len(misses)} ({100*len(misses)/len(merged_neg):.2f}%)")

# Non-matching rows that do have an equal-sequence list.
misses = merged_neg.loc[(~equal_canonical_mask) & (merged_neg["uniprot_B_equalseq"].notna())]  
print(f"\tTotal rows where the provided UniProt ID exists and is not the right canonical: {len(misses)} ({100*len(misses)/len(merged_neg):.2f}%)")

Making sure UniProt A is at least the right canonical even if it is the wrong isoform
	Total rows where the provided UniProt ID is the right canonical (but may not be the right isoform): 881 (90.82%)
	Total rows where the provided UniProt ID does not exist: 89 (9.18%)
	Total rows where the provided UniProt ID exists and is not the right canonical: 0 (0.00%)
Making sure UniProt B is at least the right canonical even if it is the wrong isoform
	Total rows where the provided UniProt ID is the right canonical (but may not be the right isoform): 935 (96.39%)
	Total rows where the provided UniProt ID does not exist: 35 (3.61%)
	Total rows where the provided UniProt ID exists and is not the right canonical: 0 (0.00%)


In [110]:
# Preview the partner-B columns for the first rows of equal_hits.
_equal_hits_cols = [
    "interaction_intactid",
    "uniprot_B",
    "uniprot_B_equalseq",
    "uniprot_B_equalseq_canonical",
    "equal_flag_B",
    "equal_canonical_flag_B",
]
display(equal_hits.head()[_equal_hits_cols])

Unnamed: 0,interaction_intactid,uniprot_B,uniprot_B_equalseq,uniprot_B_equalseq_canonical,equal_flag_B,equal_canonical_flag_B
0,EBI-1002954,uniprotkb:Q9Y738,"[uniprotkb:Q9Y738-0, uniprotkb:Q9Y738-PRO_0000...",[uniprotkb:Q9Y738],True,True
1,EBI-1397610,uniprotkb:P0DPB3-1,"[uniprotkb:P0DPB3-PRO_0000288927, uniprotkb:P0...",[uniprotkb:P0DPB3],False,True
2,EBI-16475309,uniprotkb:Q6A162,"[uniprotkb:Q6A162-0, uniprotkb:Q6A162-PRO_0000...",[uniprotkb:Q6A162],True,True
3,EBI-16467635,uniprotkb:Q6A162,"[uniprotkb:Q6A162-0, uniprotkb:Q6A162-PRO_0000...",[uniprotkb:Q6A162],True,True
4,EBI-16468745,uniprotkb:Q6A162,"[uniprotkb:Q6A162-0, uniprotkb:Q6A162-PRO_0000...",[uniprotkb:Q6A162],True,True


In [111]:
# Preview the partner-B columns for the first rows of misses.
_misses_cols = [
    "interaction_intactid",
    "uniprot_B",
    "uniprot_B_equalseq",
    "uniprot_B_equalseq_canonical",
]
display(misses.head()[_misses_cols])

Unnamed: 0,interaction_intactid,uniprot_B,uniprot_B_equalseq,uniprot_B_equalseq_canonical


In [112]:
# Check, per partner, that no row lists more than one canonical UniProt entry.
# Missing values are replaced by "" so that they count as length 0.
for partner in ("A", "B"):
    n_canonicals = merged_neg[f"uniprot_{partner}_equalseq_canonical"].fillna("").apply(len)
    test1 = not (n_canonicals > 1).any()
    print(f"Each row only has ONE canonical uniprot that matches the provided sequence - partner {partner}: {test1}")

Each row only has ONE canonical uniprot that matches the provided sequence - partner A: True
Each row only has ONE canonical uniprot that matches the provided sequence - partner B: True


In [113]:
# Keep the original uniprot_A / uniprot_B values under an "_intact" suffix so the
# plain names can be reassigned in the following cell.
merged_neg = merged_neg.rename(
    columns={f"uniprot_{partner}": f"uniprot_{partner}_intact" for partner in ("A", "B")}
)

In [114]:
# For each partner: run sort_isoforms over the list-valued match columns (float
# entries, i.e. NaN, are passed through unchanged), then take the first entry of
# the sorted exact-match list as that partner's uniprot ID.
# NOTE(review): sort_isoforms is defined elsewhere; presumably it puts the
# preferred isoform first - confirm.
for partner in ("A", "B"):
    for match_kind in ("equalseq", "inseq"):
        list_col = f"uniprot_{partner}_{match_kind}"
        merged_neg[list_col] = merged_neg[list_col].apply(
            lambda ids: ids if type(ids) is float else sort_isoforms(ids)
        )
    merged_neg[f"uniprot_{partner}"] = merged_neg[f"uniprot_{partner}_equalseq"].apply(
        lambda ids: ids if type(ids) is float else ids[0]
    )

In [115]:
# uniprot_A / uniprot_B were just reassigned, so the keys derived from them are rebuilt here.
# Order-independent key over the full (isoform-bearing) IDs
merged_neg["unique_uniprot_pair"] = merged_neg.apply(
    get_unique_id, axis=1, colA="uniprot_A", colB="uniprot_B"
)
# Accession with everything after the first "-" removed; non-string values are left as they are
for partner in ("A", "B"):
    merged_neg[f"uniprot_{partner}_noisoforms"] = merged_neg[f"uniprot_{partner}"].apply(
        lambda acc: acc.split("-")[0] if type(acc) is str else acc
    )
# Order-independent key over the isoform-free accessions
merged_neg["unique_uniprot_noisoforms_pair"] = merged_neg.apply(
    get_unique_id, axis=1, colA="uniprot_A_noisoforms", colB="uniprot_B_noisoforms"
)

In [116]:
# For each interactor, report how often the assigned uniprot ends in "-0",
# how often it does not, and why rows have no assigned uniprot at all.
for partner in ("A", "B"):
    print(f"More interactor {partner} checks")
    assigned = merged_neg[f"uniprot_{partner}"]
    provided = merged_neg[f"uniprot_{partner}_intact"]

    has_assigned = assigned.notna()
    # float entries (NaN) never count as a match or a mismatch
    is_isoform0 = assigned.apply(lambda acc: False if type(acc) is float else acc.endswith("-0"))
    not_isoform0 = assigned.apply(lambda acc: False if type(acc) is float else not acc.endswith("-0"))

    checks = [
        (f"\tTotal rows where interactor {partner} sequence matches isoform 0 (canonical) of its corresponding uniprot",
         has_assigned & is_isoform0),
        (f"\tTotal rows where interactor {partner} sequence does NOT match isoform 0 (canonical) of its corresponding uniprot",
         has_assigned & not_isoform0),
        ("\tTotal rows where there is no corresponding uniprot",
         ~has_assigned),
        ("\t\tBecause the IntAct-provided Uniprot could not be mapped",
         ~has_assigned & provided.notna()),
        ("\t\tBecause there was no IntAct-provided Uniprot",
         ~has_assigned & provided.isna()),
    ]
    for message, row_mask in checks:
        test1 = merged_neg.loc[row_mask]
        print(f"{message}: {len(test1)} ({100*len(test1)/len(merged_neg):.2f}%)")


More interactor A checks
	Total rows where interactor A sequence matches isoform 0 (canonical) of its corresponding uniprot: 797 (82.16%)
	Total rows where interactor A sequence does NOT match isoform 0 (canonical) of its corresponding uniprot: 84 (8.66%)
	Total rows where there is no corresponding uniprot: 89 (9.18%)
		Because the IntAct-provided Uniprot could not be mapped: 0 (0.00%)
		Because there was no IntAct-provided Uniprot: 89 (9.18%)
More interactor B checks
	Total rows where interactor B sequence matches isoform 0 (canonical) of its corresponding uniprot: 714 (73.61%)
	Total rows where interactor B sequence does NOT match isoform 0 (canonical) of its corresponding uniprot: 221 (22.78%)
	Total rows where there is no corresponding uniprot: 35 (3.61%)
		Because the IntAct-provided Uniprot could not be mapped: 5 (0.52%)
		Because there was no IntAct-provided Uniprot: 30 (3.09%)


In [117]:
# Flag rows whose uniprot ID was assigned: anything that is neither None nor a float (NaN).
for partner in ("A", "B"):
    merged_neg[f"assigned_uniprot_flag_{partner}"] = merged_neg[f"uniprot_{partner}"].apply(
        lambda acc: acc is not None and type(acc) is not float
    )
merged_neg[["assigned_uniprot_flag_A", "assigned_uniprot_flag_B", "uniprot_A", "uniprot_B"]]

Unnamed: 0,assigned_uniprot_flag_A,assigned_uniprot_flag_B,uniprot_A,uniprot_B
0,True,True,uniprotkb:Q10173-0,uniprotkb:Q9Y738-0
1,True,True,uniprotkb:P35240-0,uniprotkb:P0DPB3-0
2,True,True,uniprotkb:A0A0S2Z6H0-0,uniprotkb:Q6A162-0
3,False,True,,uniprotkb:Q6A162-0
4,True,True,uniprotkb:A0A0S2Z5U3-0,uniprotkb:Q6A162-0
...,...,...,...,...
965,True,True,uniprotkb:Q86UW1-0,uniprotkb:P54253-0
966,True,True,uniprotkb:O95292-2,uniprotkb:O95292-0
967,True,True,uniprotkb:Q8WXU2-2,uniprotkb:O00471-0
968,True,True,uniprotkb:Q96PU8-0,uniprotkb:Q15366-0


In [118]:
# Attach the UniProt gene name to each interactor, looked up by the
# "uniprotkb:"-prefixed canonical accession from idmap_merge.
prefixed_accessions = "uniprotkb:" + idmap_merge["canonical_uniprotkb"]
canonical_uniprot_gene_name_map = dict(zip(prefixed_accessions, idmap_merge["uniprot_gene_name"]))
for partner in ("A", "B"):
    merged_neg[f"uniprot_gene_name_{partner}"] = merged_neg[f"uniprot_{partner}_noisoforms"].map(
        canonical_uniprot_gene_name_map
    )

# Coverage report: did every row with a uniprot also receive a gene name?
for partner in ("A", "B"):
    print(f"Interactor {partner} gene name checks")
    has_uniprot = merged_neg[f"uniprot_{partner}"].notna()
    has_gene_name = merged_neg[f"uniprot_gene_name_{partner}"].notna()

    checks = [
        (f"\tTotal rows where interactor {partner} has a uniprot and a gene name",
         has_uniprot & has_gene_name),
        (f"\tTotal rows where interactor {partner} has a uniprot and no gene name",
         has_uniprot & ~has_gene_name),
        ("\tTotal rows where there is no corresponding uniprot",
         ~has_uniprot),
    ]
    for message, row_mask in checks:
        test1 = merged_neg.loc[row_mask]
        print(f"{message}: {len(test1)} ({100*len(test1)/len(merged_neg):.2f}%)")


Interactor A gene name checks
	Total rows where interactor A has a uniprot and a gene name: 881 (90.82%)
	Total rows where interactor A has a uniprot and no gene name: 0 (0.00%)
	Total rows where there is no corresponding uniprot: 89 (9.18%)
Interactor B gene name checks
	Total rows where interactor B has a uniprot and a gene name: 935 (96.39%)
	Total rows where interactor B has a uniprot and no gene name: 0 (0.00%)
	Total rows where there is no corresponding uniprot: 35 (3.61%)


In [119]:
# Investigate sequence pairs (seq_pair_id) that carry more than one unique_id,
# and isolate those whose uniprot pairs still differ once isoforms are ignored.
def _as_set(values):
    """Collapse one group's values into the set of distinct entries."""
    return set(values)


# Distinct IDs / scores seen for every sequence pair
gb = (
    merged_neg.groupby("seq_pair_id")
    .agg(
        unique_A=("ID(s) interactor A", _as_set),
        unique_B=("ID(s) interactor B", _as_set),
        unique_ids=("unique_id", _as_set),
        unique_miscores=("miscore", _as_set),
    )
    .reset_index()
)

# Sequence pairs associated with more than one unique_id
has_multiple_ids = gb["unique_ids"].apply(len) > 1
dups_diff_ids = gb.loc[has_multiple_ids, "seq_pair_id"].tolist()

# One row per (unique_id, seq_pair_id) among those sequence pairs
test1 = merged_neg.loc[merged_neg["seq_pair_id"].isin(dups_diff_ids)]
test1 = test1.drop_duplicates(subset=["unique_id", "seq_pair_id"])
test1 = test1.sort_values(by=["seq_pair_id", "unique_id"])

# NOTE(review): unique_uniprot_noiso1_pair is not recomputed in the visible cells
# after uniprot_A/uniprot_B were reset for merged_neg - confirm it is not stale.
test1 = (
    test1.groupby("seq_pair_id")
    .agg(
        unique_A=("ID(s) interactor A", _as_set),
        unique_B=("ID(s) interactor B", _as_set),
        uniprot_genenames_A=("uniprot_gene_name_A", _as_set),
        uniprot_genenames_B=("uniprot_gene_name_B", _as_set),
        unique_ids=("unique_id", _as_set),
        unique_miscores=("miscore", _as_set),
        unique_uniprot_pairs=("unique_uniprot_pair", _as_set),
        unique_uniprot_noiso1_pairs=("unique_uniprot_noiso1_pair", _as_set),
        unique_uniprot_noisoforms_pairs=("unique_uniprot_noisoforms_pair", _as_set),
    )
    .reset_index()
)

# Keep only groups that differ both with and without isoform suffixes
differs_with_isoforms = test1["unique_uniprot_pairs"].apply(len) > 1
differs_without_isoforms = test1["unique_uniprot_noisoforms_pairs"].apply(len) > 1
test1 = test1.loc[differs_with_isoforms & differs_without_isoforms].reset_index(drop=True)

In [120]:
# Flatten the list-valued uniprot sequence-match columns into comma-joined strings.
def _comma_join(value):
    """Join a list-like with commas; floats (NaN) and strings pass through unchanged."""
    if type(value) is float or type(value) is str:
        return value
    return ",".join(value)


uniprot_cols = [name for name in merged_neg.columns if "uniprot" in name]
subset = [name for name in uniprot_cols if "seq" in name]
print(subset)
display(merged_neg[subset].head())  # before
for seq_col in subset:
    merged_neg[seq_col] = merged_neg[seq_col].apply(_comma_join)
display(merged_neg[subset].head())  # after

['uniprot_A_equalseq', 'uniprot_B_equalseq', 'uniprot_A_inseq', 'uniprot_B_inseq', 'uniprot_A_equalseq_canonical', 'uniprot_A_inseq_canonical', 'uniprot_B_equalseq_canonical', 'uniprot_B_inseq_canonical']


Unnamed: 0,uniprot_A_equalseq,uniprot_B_equalseq,uniprot_A_inseq,uniprot_B_inseq,uniprot_A_equalseq_canonical,uniprot_A_inseq_canonical,uniprot_B_equalseq_canonical,uniprot_B_inseq_canonical
0,"[uniprotkb:Q10173-0, uniprotkb:Q10173-PRO_0000...","[uniprotkb:Q9Y738-0, uniprotkb:Q9Y738-PRO_0000...","[uniprotkb:Q10173-0, uniprotkb:Q10173-PRO_0000...","[uniprotkb:Q9Y738-0, uniprotkb:Q9Y738-PRO_0000...",[uniprotkb:Q10173],[uniprotkb:Q10173],[uniprotkb:Q9Y738],[uniprotkb:Q9Y738]
1,"[uniprotkb:P35240-0, uniprotkb:P35240-PRO_0000...","[uniprotkb:P0DPB3-0, uniprotkb:P0DPB3-PRO_0000...","[uniprotkb:P35240-0, uniprotkb:P35240-PRO_0000...","[uniprotkb:P0DPB3-0, uniprotkb:P0DPB3-PRO_0000...",[uniprotkb:P35240],[uniprotkb:P35240],[uniprotkb:P0DPB3],[uniprotkb:P0DPB3]
2,[uniprotkb:A0A0S2Z6H0-0],"[uniprotkb:Q6A162-0, uniprotkb:Q6A162-PRO_0000...",[uniprotkb:A0A0S2Z6H0-0],"[uniprotkb:Q6A162-0, uniprotkb:Q6A162-PRO_0000...",[uniprotkb:A0A0S2Z6H0],[uniprotkb:A0A0S2Z6H0],[uniprotkb:Q6A162],[uniprotkb:Q6A162]
3,,"[uniprotkb:Q6A162-0, uniprotkb:Q6A162-PRO_0000...",,"[uniprotkb:Q6A162-0, uniprotkb:Q6A162-PRO_0000...",,,[uniprotkb:Q6A162],[uniprotkb:Q6A162]
4,[uniprotkb:A0A0S2Z5U3-0],"[uniprotkb:Q6A162-0, uniprotkb:Q6A162-PRO_0000...",[uniprotkb:A0A0S2Z5U3-0],"[uniprotkb:Q6A162-0, uniprotkb:Q6A162-PRO_0000...",[uniprotkb:A0A0S2Z5U3],[uniprotkb:A0A0S2Z5U3],[uniprotkb:Q6A162],[uniprotkb:Q6A162]


Unnamed: 0,uniprot_A_equalseq,uniprot_B_equalseq,uniprot_A_inseq,uniprot_B_inseq,uniprot_A_equalseq_canonical,uniprot_A_inseq_canonical,uniprot_B_equalseq_canonical,uniprot_B_inseq_canonical
0,"uniprotkb:Q10173-0,uniprotkb:Q10173-PRO_000005...","uniprotkb:Q9Y738-0,uniprotkb:Q9Y738-PRO_000009...","uniprotkb:Q10173-0,uniprotkb:Q10173-PRO_000005...","uniprotkb:Q9Y738-0,uniprotkb:Q9Y738-PRO_000009...",uniprotkb:Q10173,uniprotkb:Q10173,uniprotkb:Q9Y738,uniprotkb:Q9Y738
1,"uniprotkb:P35240-0,uniprotkb:P35240-PRO_000021...","uniprotkb:P0DPB3-0,uniprotkb:P0DPB3-PRO_000028...","uniprotkb:P35240-0,uniprotkb:P35240-PRO_000021...","uniprotkb:P0DPB3-0,uniprotkb:P0DPB3-PRO_000028...",uniprotkb:P35240,uniprotkb:P35240,uniprotkb:P0DPB3,uniprotkb:P0DPB3
2,uniprotkb:A0A0S2Z6H0-0,"uniprotkb:Q6A162-0,uniprotkb:Q6A162-PRO_000031...",uniprotkb:A0A0S2Z6H0-0,"uniprotkb:Q6A162-0,uniprotkb:Q6A162-PRO_000031...",uniprotkb:A0A0S2Z6H0,uniprotkb:A0A0S2Z6H0,uniprotkb:Q6A162,uniprotkb:Q6A162
3,,"uniprotkb:Q6A162-0,uniprotkb:Q6A162-PRO_000031...",,"uniprotkb:Q6A162-0,uniprotkb:Q6A162-PRO_000031...",,,uniprotkb:Q6A162,uniprotkb:Q6A162
4,uniprotkb:A0A0S2Z5U3-0,"uniprotkb:Q6A162-0,uniprotkb:Q6A162-PRO_000031...",uniprotkb:A0A0S2Z5U3-0,"uniprotkb:Q6A162-0,uniprotkb:Q6A162-PRO_000031...",uniprotkb:A0A0S2Z5U3,uniprotkb:A0A0S2Z5U3,uniprotkb:Q6A162,uniprotkb:Q6A162


In [121]:
# Persist the cleaned merged_neg table (row index not written).
merged_neg_clean_path = "data_files/processed/intact/intermediate/merged_neg_clean.csv"
merged_neg.to_csv(merged_neg_clean_path, index=False)

# Finish processing positives

## Merge

In [122]:
###### MERGE!
# Carry miscore and expansion-method information from `intact` over to
# `intact_clust`, keyed on every pairwise IntAct ID combination of an interaction.
print(f"Merging expansion and score information from intact into intact-micluster.")
intact["unique_all_intact_combos"] = intact["unique_all_intact_sorted"].apply(lambda x: expand_cross_combinations(x))
merge_db = intact[["unique_all_intact_combos","miscore","Expansion method(s)"]]
merge_db = merge_db.explode("unique_all_intact_combos").reset_index(drop=True)

# Distinct values are sorted before being stored: iterating a set of strings has no
# stable order between interpreter runs (hash randomization), which would make the
# joined expansion string and the exploded score order irreproducible.
map_db = merge_db.groupby("unique_all_intact_combos").agg(
    unique_scores=("miscore", lambda x: sorted(set(x), key=str)), #keep this a list
    unique_expansions=("Expansion method(s)", lambda x: "|".join(sorted(set(x)))) # not a list because we actually want multiple entries if applicable
)
map_dict = map_db.to_dict()

# Direct dict indexing is deliberate: a unique_id missing from `intact` raises KeyError
intact_clust["unique_scores"] = intact_clust["unique_id"].apply(lambda x: map_dict["unique_scores"][x])
intact_clust["unique_expansions"] = intact_clust["unique_id"].apply(lambda x: map_dict["unique_expansions"][x])
test1 = len(intact_clust.loc[intact_clust["unique_scores"].apply(lambda x: len(x)>1)])
print(f"\tTotal rows with more than one miscore: {test1}. Exploding along these rows")
# One row per (interaction, distinct miscore)
intact_clust = intact_clust.explode("unique_scores").reset_index(drop=True)

Merging expansion and score information from intact into intact-micluster.
	Total rows with more than one miscore: 10. Exploding along these rows


In [123]:
# Spot-check: all merge_db rows for one specific IntAct ID combination.
merge_db[merge_db["unique_all_intact_combos"].eq("intact:EBI-1056089_intact:EBI-473814")]

Unnamed: 0,unique_all_intact_combos,miscore,Expansion method(s)
1399665,intact:EBI-1056089_intact:EBI-473814,intact-miscore:0.80,not expanded
1399667,intact:EBI-1056089_intact:EBI-473814,intact-miscore:0.80,not expanded
1933080,intact:EBI-1056089_intact:EBI-473814,intact-miscore:0.80,not expanded
1933093,intact:EBI-1056089_intact:EBI-473814,intact-miscore:0.80,"psi-mi:""MI:1060""(spoke expansion)"
1933096,intact:EBI-1056089_intact:EBI-473814,intact-miscore:0.80,not expanded
1933110,intact:EBI-1056089_intact:EBI-473814,intact-miscore:0.80,not expanded
1933143,intact:EBI-1056089_intact:EBI-473814,intact-miscore:0.80,not expanded
1933145,intact:EBI-1056089_intact:EBI-473814,intact-miscore:0.80,not expanded
1933171,intact:EBI-1056089_intact:EBI-473814,intact-miscore:0.80,"psi-mi:""MI:1060""(spoke expansion)"
1933207,intact:EBI-1056089_intact:EBI-473814,intact-miscore:0.80,"psi-mi:""MI:1060""(spoke expansion)"


In [124]:
# (1) Each row should mention "intact-miscore" at most once in both score columns
for source_label, score_col in (
    ("intact", "unique_scores"),
    ("intact-micluster", "Confidence value(s)"),
):
    test1 = not (intact_clust[score_col].str.count("intact-miscore") > 1).any()
    print(f"\t{source_label} contributed one intact-miscore per row: {test1}")

	intact contributed one intact-miscore per row: True
	intact-micluster contributed one intact-miscore per row: True


In [125]:
# (2) Compare the two miscore sources after rounding to 2 decimals.
# The "_int" column names hold rounded floats, not integers.
def _parse_miscore(entry):
    """Return the number following 'intact-miscore:' rounded to 2 decimals."""
    score_text = entry.split("intact-miscore:")[1]
    return round(float(score_text), 2)


intact_clust["confidence_val_int"] = intact_clust["Confidence value(s)"].apply(_parse_miscore)
intact_clust["unique_score_int"] = intact_clust["unique_scores"].apply(_parse_miscore)
intact_clust["equal_score_int"] = intact_clust["unique_score_int"].eq(intact_clust["confidence_val_int"])
test1 = int((~intact_clust["equal_score_int"]).sum())
print(f"\tTotal rows where intact and intact-micluster.txt have different confidence scores: {test1} ({100*test1/len(intact_clust):.2f}%)")

# Keep one row per unique_id, preferring a row whose two scores agree
intact_clust = intact_clust.sort_values(by=["unique_id", "equal_score_int"], ascending=[True, False])
intact_clust = intact_clust.drop_duplicates(subset=["unique_id"], keep="first").reset_index(drop=True)
test1 = int(intact_clust["equal_score_int"].sum())
print(f"\tTotal rows where intact and intact-micluster.txt have the same confidence scores: {test1} ({100*test1/len(intact_clust):.2f}%). Total rows: {len(intact_clust)}")
intact_clust["miscore"] = intact_clust["confidence_val_int"]

	Total rows where intact and intact-micluster.txt have different confidence scores: 11 (0.00%)
	Total rows where intact and intact-micluster.txt have the same confidence scores: 1136283 (100.00%). Total rows: 1136283


In [126]:
# (3) Tabulate expansion modes, then split intact_clust on whether the interaction
# was reported at least once as "not expanded".
print(f"\tInvestigating modes of expansion. Only keeping rows where at least once, this interaction was shown WITHOUT expansion")
test1 = intact_clust["unique_expansions"].value_counts().to_dict()
for expansion_mode, n_rows in test1.items():
    print(f"\t\tExpansion mode = {expansion_mode}. Total rows = {n_rows} ({100*n_rows/len(intact_clust):.2f}%)")

seen_unexpanded = intact_clust["unique_expansions"].str.contains("not expanded")
# Rows that only ever appear through expansion are set aside ...
intact_clust_expand = intact_clust.loc[~seen_unexpanded].reset_index(drop=True)
# ... and the rest are kept as the working table
intact_clust = intact_clust.loc[seen_unexpanded].reset_index(drop=True)
print(f"Total interaction rows remaining: {len(intact_clust)}")
print(f"Unique values in intact_clust expansion methods: {intact_clust['unique_expansions'].unique().tolist()}")


	Investigating modes of expansion. Only keeping rows where at least once, this interaction was shown WITHOUT expansion
		Expansion mode = psi-mi:"MI:1060"(spoke expansion). Total rows = 650370 (57.24%)
		Expansion mode = not expanded. Total rows = 459115 (40.40%)
		Expansion mode = psi-mi:"MI:1060"(spoke expansion)|not expanded. Total rows = 26798 (2.36%)
Total interaction rows remaining: 485913
Unique values in intact_clust expansion methods: ['not expanded', 'psi-mi:"MI:1060"(spoke expansion)|not expanded']


In [127]:
# Inspect my_pos rows where either interactor lacks an IntAct ID
missing_intact_id = my_pos["intactid_1"].isna() | my_pos["intactid_2"].isna()
temp = my_pos.loc[missing_intact_id].reset_index(drop=True)
print(len(temp))
print(temp["process_method"].unique().tolist())
display(temp.head())
# Columns of that subset containing at least one missing value
na_cols = temp.columns[temp.isna().any().to_numpy()].tolist()
print(f"Looking at subset of dataframe where one Intact ID failed to be mapped.\nOther empty columns: {','.join(na_cols)}")

0
[]


Unnamed: 0,interaction_label,interaction_mi,interaction_intactid,interaction_xml_id,experiments,year,process_method,protein_1,gene_symbol_1,mol_type_1,...,binding_short_2,binding_begin_2,binding_end_2,ptm_mi_2,ptm_name_2,ptm_short_2,ptm_begin_2,ptm_end_2,ptm_orig_2,ptm_new_2


Looking at subset of dataframe where one Intact ID failed to be mapped.
Other empty columns: 


In [128]:
# Prepare my_pos: extract PubMed IDs, normalize the IntAct IDs, expand every row into
# all pairwise IntAct ID combinations, run sanity checks on sequence pairs, then
# build the flipped + exploded table.
print(f"Cleaning my_pos (the data we processed directly from xml)")
my_pos["pubmeds"] = my_pos["experiments"].apply(lambda x: normalize_ids(extract_pubmed_from_experiment(x)))
test1 = len(my_pos.loc[my_pos["pubmeds"].isna()])
# The rows counted come from my_pos, so the percentage is taken over len(my_pos)
# (previously it was divided by len(intact_clust), an unrelated table).
print(f"\tTotal rows with no valid PubMed ID: {test1} ({100*test1/len(my_pos):.2f}%)")
test1 = len(my_pos.loc[
    (my_pos["intactid_1"].isna()) | 
    (my_pos["intactid_2"].isna())
])==0
print(f"\tEvery row has an intact:EBI- ID for both interactor A and B: {test1}")
# Normalize IDs to pipe-separated, "intact:"-prefixed form
for id_col in ("intactid_1", "intactid_2"):
    my_pos[id_col] = my_pos[id_col].apply(lambda x: x.replace(",","|").replace("EBI-","intact:EBI-"))
my_pos["unique_all_intact_sorted"] = my_pos.apply(lambda row: get_unique_id(row, colA="intactid_1", colB="intactid_2"),axis=1)
my_pos["unique_all_intact_combos"] = my_pos["unique_all_intact_sorted"].apply(lambda x: expand_cross_combinations(x))
my_pos = my_pos.explode("unique_all_intact_combos").reset_index(drop=True)

# Now, do we only have ONE sequence pair per intact ID combination?
my_pos["seq_sort"] = my_pos.apply(lambda row: get_unique_id(row, colA="aa_1",colB="aa_2"), axis=1)
gb1 = my_pos.groupby("unique_all_intact_combos").agg(unique_seqsort=("seq_sort", lambda x: len(set(x)))).reset_index()
test1 = (gb1["unique_seqsort"]>0).all()
print(f"\tAll interactions have at least one pair of sequences: {test1}")
test1 = len(gb1.loc[gb1["unique_seqsort"]==1])
print(f"\tTotal interactions with 1 associated sequence pair: {test1}")
test1 = len(gb1.loc[gb1["unique_seqsort"]>1])
print(f"\tTotal interactions with multiple associated sequence pairs: {test1}")
test1 = len(my_pos.loc[my_pos["unique_all_intact_combos"].duplicated()])
print(f"\tTotal rows that are duplicates of an intact combo ID (e.g. intact:EBI-10000824_intact:EBI-697771): {test1} ({100*test1/len(my_pos):.2f}%)")
test1 = len(my_pos.drop_duplicates("unique_all_intact_combos"))
print(f"\tNew database size if we grouped on intact combo ID: {test1} ({100*test1/len(my_pos):.2f}%)")

# TODO: check for homo-interactions at some point, e.g.
#intact.loc[intact["ID(s) interactor A"]==intact["ID(s) interactor B"]]
print(f"Assembling the FINAL database")
# Append the output of flip_interactors (defined elsewhere) and drop exact duplicate rows
my_pos = pd.concat([my_pos,flip_interactors(my_pos)]).drop_duplicates().reset_index(drop=True)
my_pos["all_intact_A_sorted"] = my_pos.apply(lambda row: map_back_individual_intact_mypos(row,interactor="1"),axis=1)
my_pos["all_intact_B_sorted"] = my_pos.apply(lambda row: map_back_individual_intact_mypos(row,interactor="2"),axis=1)
my_pos = my_pos.explode("all_intact_A_sorted").reset_index(drop=True)
my_pos = my_pos.explode("all_intact_B_sorted").reset_index(drop=True)
print(f"\tmy_pos size after flipping and doubling, and expanding by all possible matches for intact A and intact B: {len(my_pos)}")
# Count rows whose interaction ID field still holds several IDs, by separator
test1 = len(my_pos.loc[my_pos["interaction_intactid"].str.contains("\\|")])
print(f"\tin the expanded database, total rows with multiple intact IDs pipe-separated: {test1}")
test1 = len(my_pos.loc[my_pos["interaction_intactid"].str.contains(",")])
print(f"\tin the expanded database, total rows with multiple intact IDs comma-separated: {test1}")


Cleaning my_pos (the data we processed directly from xml)
	Total rows with no valid PubMed ID: 10438 (2.15%)
	Every row has an intact:EBI- ID for both interactor A and B: True
	All interactions have at least one pair of sequences: True
	Total interactions with 1 associated sequence pair: 630152
	Total interactions with multiple associated sequence pairs: 6014
	Total rows that are duplicates of an intact combo ID (e.g. intact:EBI-10000824_intact:EBI-697771): 549782 (46.36%)
	New database size if we grouped on intact combo ID: 636166 (53.64%)
Assembling the FINAL database
	my_pos size after flipping and doubling, and expanding by all possible matches for intact A and intact B: 2366390
	in the expanded database, total rows with multiple intact IDs pipe-separated: 0
	in the expanded database, total rows with multiple intact IDs comma-separated: 4308


In [129]:
# Turn each my_pos interaction ID string into a list, so it can be exploded and
# matched against the interaction identifiers of intact_clust.
def _split_interaction_ids(joined_ids):
    """Split on "|" when present, otherwise on ","."""
    separator = "|" if "|" in joined_ids else ","
    return joined_ids.split(separator)


my_pos["interaction_intactid"] = my_pos["interaction_intactid"].apply(_split_interaction_ids)

In [130]:
# One row per individual interaction ID; report the table size before and after.
print(f"Before exploding on interaction_intactid, size of my_pos: {my_pos.shape[0]}")
my_pos = my_pos.explode("interaction_intactid", ignore_index=True)
print(f"After exploding on interaction_intactid, size of my_pos: {my_pos.shape[0]}")

Before exploding on interaction_intactid, size of my_pos: 2366390
After exploding on interaction_intactid, size of my_pos: 2372044


In [131]:
# (8) Keep only the intact:EBI- interaction identifiers, report how many each row
# has, then explode to one identifier per row with the "intact:" prefix removed.
def _keep_intact_ids(identifiers):
    """Pipe-join only the identifiers that contain 'intact:EBI-'."""
    return "|".join(token for token in identifiers.split("|") if "intact:EBI-" in token)


intact_clust["IntAct Interaction identifier(s)"] = intact_clust["Interaction identifier(s)"].apply(_keep_intact_ids)

n_intact_ids = intact_clust["Interaction identifier(s)"].str.count("intact:EBI-")
test1 = (n_intact_ids == 1).sum()
print(f"Total rows of IntAct with exactly one intact:EBI- interaction identifier for the interaction: {test1}/{len(intact_clust)} ({100*test1/len(intact_clust):.5f}%)")
test1 = (n_intact_ids == 0).sum()
print(f"Total rows of IntAct with 0 intact:EBI- interaction identifiers for the interaction: {test1}/{len(intact_clust)} ({100*test1/len(intact_clust):.2f}%)")
test1 = (n_intact_ids > 1).sum()
print(f"Total rows of IntAct with >1 intact:EBI- interaction identifiers for the interaction: {test1}/{len(intact_clust)} ({100*test1/len(intact_clust):.5f}%)")

# Back to a list per row, then one identifier per row
intact_clust["IntAct Interaction identifier(s)"] = intact_clust["IntAct Interaction identifier(s)"].apply(
    lambda joined: joined.split("|")
)
intact_clust = intact_clust.explode("IntAct Interaction identifier(s)").reset_index(drop=True)
# "intact:EBI-123" -> "EBI-123"
intact_clust["IntAct Interaction identifier(s)"] = intact_clust["IntAct Interaction identifier(s)"].apply(
    lambda tagged: tagged.split("intact:")[1]
)
print(f"Exploded along IntAct Interaction identifier(s). New # rows: {len(intact_clust)}")

Total rows of IntAct with exactly one intact:EBI- interaction identifier for the interaction: 330151/485913 (67.94447%)
Total rows of IntAct with 0 intact:EBI- interaction identifiers for the interaction: 0/485913 (0.00%)
Total rows of IntAct with >1 intact:EBI- interaction identifiers for the interaction: 155762/485913 (32.05553%)
Exploded along IntAct Interaction identifier(s). New # rows: 879670


In [132]:
# Spot-check: which unique_id carries interaction identifier EBI-30875096?
intact_clust.loc[
    intact_clust["IntAct Interaction identifier(s)"].eq("EBI-30875096"),
    ["unique_id", "IntAct Interaction identifier(s)"],
]

Unnamed: 0,unique_id,IntAct Interaction identifier(s)
97382,intact:EBI-1056089_intact:EBI-473814,EBI-30875096


In [133]:
# Spot-check: merged vs. original expansion annotations for one unique_id.
intact_clust.loc[
    intact_clust["unique_id"].eq("intact:EBI-1056089_intact:EBI-473814"),
    ["unique_expansions", "Expansion method(s)"],
]

Unnamed: 0,unique_expansions,Expansion method(s)
97379,"psi-mi:""MI:1060""(spoke expansion)|not expanded","psi-mi:""MI:1060""(spoke expansion)"
97380,"psi-mi:""MI:1060""(spoke expansion)|not expanded","psi-mi:""MI:1060""(spoke expansion)"
97381,"psi-mi:""MI:1060""(spoke expansion)|not expanded","psi-mi:""MI:1060""(spoke expansion)"
97382,"psi-mi:""MI:1060""(spoke expansion)|not expanded","psi-mi:""MI:1060""(spoke expansion)"
97383,"psi-mi:""MI:1060""(spoke expansion)|not expanded","psi-mi:""MI:1060""(spoke expansion)"
97384,"psi-mi:""MI:1060""(spoke expansion)|not expanded","psi-mi:""MI:1060""(spoke expansion)"
97385,"psi-mi:""MI:1060""(spoke expansion)|not expanded","psi-mi:""MI:1060""(spoke expansion)"
97386,"psi-mi:""MI:1060""(spoke expansion)|not expanded","psi-mi:""MI:1060""(spoke expansion)"
97387,"psi-mi:""MI:1060""(spoke expansion)|not expanded","psi-mi:""MI:1060""(spoke expansion)"
97388,"psi-mi:""MI:1060""(spoke expansion)|not expanded","psi-mi:""MI:1060""(spoke expansion)"


In [134]:
# Join intact_clust with my_pos on interaction ID, pair ID and both per-interactor
# IntAct IDs, keep only rows that received amino-acid sequences, then add
# sequence-pair IDs and uniprot-pair keys used to diagnose duplicated sequences.
def _strip_isoform1_suffix(acc):
    """Remove a trailing "-1" from a string ID; any other value is returned unchanged.

    Only the suffix is removed: a plain str.replace("-1", "") would also corrupt
    IDs such as "P12345-10" (-> "P123450").
    """
    if isinstance(acc, str) and acc.endswith("-1"):
        return acc[:-2]
    return acc


def _strip_all_isoforms(acc):
    """Keep the part of a string ID before the first "-"; other values are returned unchanged."""
    return acc.split("-")[0] if isinstance(acc, str) else acc


merged = pd.merge(
    intact_clust.rename(columns={"IntAct Interaction identifier(s)":"interaction_intactid"}).drop(columns=["Expansion method(s)"]),
    my_pos.rename(columns={"unique_all_intact_combos":"unique_id"}),
    on=["interaction_intactid", "unique_id","all_intact_A_sorted","all_intact_B_sorted"],
    how="left"
)
print(f"\tResults of a left merge of (intact_clust,my_pos): len {len(merged)}")
test1 = len(merged.loc[(merged["aa_1"].isna()) | (merged["aa_2"].isna())])
print(f"\tTotal rows where there is no associated amino acid sequence {test1} ({100*test1/len(merged):.2f}%)")
merged = merged.loc[(merged["aa_1"].notna()) & (merged["aa_2"].notna())].reset_index(drop=True)
test1 = len(merged.loc[(merged["aa_1"].isna()) | (merged["aa_2"].isna())])
print(f"\tAfter dropping rows with no AA sequence: total rows where there is no associated amino acid sequence {test1}")
# drop duplicates
merged = merged.drop_duplicates().reset_index(drop=True)
print(f"\tTotal rows after dropping duplicates: {len(merged)}")

# Flag interactors whose annotation contains "no-uniprot-update"
merged["no_uniprot_update_A"] = merged["Annotation(s) interactor A"].fillna("").str.contains("no-uniprot-update")
merged["no_uniprot_update_B"] = merged["Annotation(s) interactor B"].fillna("").str.contains("no-uniprot-update")

# Number each distinct sequence pair in order of first appearance (seqpair1, seqpair2, ...)
unique_seqpairs = merged["seq_sort"].unique().tolist()
seq_pair_idmap = dict(zip(unique_seqpairs, [f"seqpair{i}" for i in range(1, len(unique_seqpairs)+1)]))
merged["seq_pair_id"] = merged["seq_sort"].map(seq_pair_idmap)

# Add columns to help figure out cause of sequence duplication
merged["unique_uniprot_pair"] = merged.apply(lambda row: get_unique_id(row, colA="uniprot_A",colB="uniprot_B"),axis=1)
merged["uniprot_A_noiso1"] = merged["uniprot_A"].apply(_strip_isoform1_suffix)
merged["uniprot_B_noiso1"] = merged["uniprot_B"].apply(_strip_isoform1_suffix)
merged["unique_uniprot_noiso1_pair"] = merged.apply(lambda row: get_unique_id(row, colA="uniprot_A_noiso1",colB="uniprot_B_noiso1"),axis=1)
merged["uniprot_A_noisoforms"] = merged["uniprot_A"].apply(_strip_all_isoforms)
merged["uniprot_B_noisoforms"] = merged["uniprot_B"].apply(_strip_all_isoforms)
merged["unique_uniprot_noisoforms_pair"] = merged.apply(lambda row: get_unique_id(row, colA="uniprot_A_noisoforms",colB="uniprot_B_noisoforms"),axis=1)


	Results of a left merge of (intact_clust,my_pos): len 881641
	Total rows where there is no associated amino acid sequence 134763 (15.29%)
	After dropping rows with no AA sequence: total rows where there is no associated amino acid sequence 0
	Total rows after dropping duplicates: 746865


In [135]:
# Molecule-type breakdown for both interactors (are there peptides among them?)
for mol_type_col in ("mol_type_1", "mol_type_2"):
    print(merged[mol_type_col].value_counts())

mol_type_1
protein    746623
peptide       242
Name: count, dtype: int64
mol_type_2
protein    746029
peptide       836
Name: count, dtype: int64


In [136]:
# Collect every UniProtKB accession seen for either interactor and write them,
# one per line, to a text file used as input for ID mapping.
prefixed_ids = []
for uniprot_col in ("uniprot_A", "uniprot_B"):
    # text following the "uniprotkb:" prefix
    stripped = merged[uniprot_col].dropna().str.split("uniprotkb:", expand=True)[1]
    prefixed_ids += stripped.unique().tolist()
all_merged_uniprots = set(prefixed_ids)
# Drop isoform / PRO suffixes (everything after the first "-"); ID mapping is not done on those
all_merged_uniprots = {
    acc.split("-")[0]
    for acc in all_merged_uniprots
    if type(acc) is str and acc != ""
}

feature_folder = "data_files/processed/intact/features/"
os.makedirs(feature_folder, exist_ok=True)
uniprots_path = os.path.join(feature_folder, "all_merged_uniprots.txt")
with open(uniprots_path, "w") as f:
    f.write("\n".join(sorted(all_merged_uniprots)))
print(f"\tWrote {len(all_merged_uniprots)} unique UniProtKB IDs to {uniprots_path}")


	Wrote 88505 unique UniProtKB IDs to data_files/processed/intact/features/all_merged_uniprots.txt


## UniProt ID Mapping

In [137]:
from Bio import SeqIO

In [138]:
# Locations of the ID-mapping download (FASTA with sequences, TSV with annotations)
idmap_folder = "data_files/processed/intact/idmapping"
idmap_fasta_path = os.path.join(idmap_folder, "idmapping_2025_11_05.fasta")
idmap_tsv_path = os.path.join(idmap_folder, "idmapping_2025_11_05.tsv")

# Read the FASTA into [id, sequence, description] triples, one per record
fasta_rows = []
for _rec in SeqIO.parse(idmap_fasta_path, "fasta"):
    fasta_rows.append([_rec.id, "".join(_rec.seq), _rec.description])

# Read the tab-separated annotation table and preview it
idmap_tsv_df = pd.read_csv(idmap_tsv_path, sep="\t")
idmap_tsv_df.head()


Unnamed: 0,From,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,Chain,Peptide,Propeptide,Signal peptide,Transit peptide
0,A0A023GQA5,A0A023GQA5,unreviewed,A0A023GQA5_DROME,Prohibitin,Phb1 Cc Dmel\CG10691 fs(2)HH32 l (2) 37Cc l(2)...,Drosophila melanogaster (Fruit fly),276.0,,,,,
1,A0A023GRW2,A0A023GRW2,unreviewed,A0A023GRW2_DROME,IP07931p,CG12541,Drosophila melanogaster (Fruit fly),124.0,,,,,
2,A0A023PXA5,A0A023PXA5,reviewed,YA19A_YEAST,Putative uncharacterized protein YAL019W-A,YAL019W-A,Saccharomyces cerevisiae (strain ATCC 204508 /...,189.0,"CHAIN 1..189; /note=""Putative uncharacterized ...",,,,
3,A0A023PXH6,A0A023PXH6,reviewed,YM172_YEAST,Putative uncharacterized membrane protein YMR1...,YMR172C-A,Saccharomyces cerevisiae (strain ATCC 204508 /...,127.0,"CHAIN 1..127; /note=""Putative uncharacterized ...",,,,
4,A0A023PXI0,A0A023PXI0,reviewed,YM306_YEAST,Putative uncharacterized membrane protein YMR3...,YMR306C-A,Saccharomyces cerevisiae (strain ATCC 204508 /...,129.0,"CHAIN 1..129; /note=""Putative uncharacterized ...",,,,


In [139]:
# Feature columns of the ID-mapping TSV that carry coordinates of cleaved products.
# A non-null cell holds one or more records shaped like "<PREFIX> <start>..<end>; /key=...".
# Indices are inclusive in these columns from what I have seen on UniProt
peptide_columns = ["Chain","Peptide","Propeptide","Signal peptide","Transit peptide"]

for c in peptide_columns:
    non_null = idmap_tsv_df[c].dropna()
    if non_null.empty:
        # An all-empty column has no string values, so there is no prefix to detect or split on
        print(f"Investigating column {c} for prefixes to the peptide coordinates.\n\tColumn is empty, nothing to split")
        continue
    # first space-delimited token of every non-null cell, i.e. the record prefix
    unique_prefixes = non_null.str.split(" ",expand=True)[0].unique().tolist()
    # True when every non-null cell starts with the same, non-empty prefix
    test1 = len(unique_prefixes)==1 and not(unique_prefixes[0]=="")
    print(f"Investigating column {c} for prefixes to the peptide coordinates.\n\tUnique prefixes: {','.join(unique_prefixes)}. Only one: {test1}")
    pref = unique_prefixes[0]
    # find total rows with multiple records; str.count takes a regex, so the prefix is escaped
    test1 = idmap_tsv_df.loc[(idmap_tsv_df[c].notna()) & (idmap_tsv_df[c].str.count(re.escape(pref))>1)]
    print(f"\tTotal rows with multiple {c} entries: {len(test1)}")
    # now going to split by this prefix so I can get a list of records (prefix removed); non-strings become None
    idmap_tsv_df[c] = idmap_tsv_df[c].apply(lambda x: [y.strip() for y in x.split(f"{pref} ") if len(y)>0] if (isinstance(x, str) and x!="") else None)

# One row per list element, column by column.
# NOTE: exploding the columns one after another yields the cross product of an entry's
# features (an entry with 2 chains and 3 peptides becomes 6 rows), so the same feature
# can appear on several rows of idmap_tsv_df.
for c in peptide_columns:
    idmap_tsv_df = idmap_tsv_df.explode(c).reset_index(drop=True)

Investigating column Chain for prefixes to the peptide coordinates.
	Unique prefixes: CHAIN. Only one: True
	Total rows with multiple Chain entries: 1335
Investigating column Peptide for prefixes to the peptide coordinates.
	Unique prefixes: PEPTIDE. Only one: True
	Total rows with multiple Peptide entries: 133
Investigating column Propeptide for prefixes to the peptide coordinates.
	Unique prefixes: PROPEP. Only one: True
	Total rows with multiple Propeptide entries: 163
Investigating column Signal peptide for prefixes to the peptide coordinates.
	Unique prefixes: SIGNAL. Only one: True
	Total rows with multiple Signal peptide entries: 0
Investigating column Transit peptide for prefixes to the peptide coordinates.
	Unique prefixes: TRANSIT. Only one: True
	Total rows with multiple Transit peptide entries: 33


In [140]:
# One row per FASTA record. The record id is split on "|" into database, accession and name.
idmap_fasta_df = pd.DataFrame(fasta_rows, columns=["uniprot_id_full","sequence","description"])
idmap_fasta_df[["database","uniprotkb","uniprot_gene_name"]] = idmap_fasta_df["uniprot_id_full"].str.split("|",expand=True)
# Accessions without a "-" get a "-0" suffix, so the un-suffixed record is labelled "Isoform 0" below
idmap_fasta_df["uniprotkb_iso"] = idmap_fasta_df["uniprotkb"].apply(lambda x: x if "-" in x else f"{x}-0")
# Isoform label from the description: the token that follows the word "isoform" (any case); None when absent
idmap_fasta_df["isoform_from_desc"] = "Isoform " +  idmap_fasta_df["description"].str.extract(r'(?i)\bisoform\s+([^\s,;:)\]]+)')[0]
idmap_fasta_df["isoform_from_desc"] = idmap_fasta_df["isoform_from_desc"].apply(lambda x: x if (isinstance(x, str) and x!="Isoform ") else None)
# Isoform label from the accession suffix (the part after the first "-")
idmap_fasta_df["isoform_from_uniprotkb"] = "Isoform " + idmap_fasta_df["uniprotkb_iso"].apply(lambda x: x.split("-")[1] if (isinstance(x, str) and "-" in x) else None)
idmap_fasta_df["isoform_from_uniprotkb"] = idmap_fasta_df["isoform_from_uniprotkb"].apply(lambda x: x if (isinstance(x, str) and x!="Isoform ") else None)
# Accession with any "-suffix" removed
idmap_fasta_df["canonical_uniprotkb"] = idmap_fasta_df["uniprotkb"].apply(lambda x: x.split("-")[0] if isinstance(x, str) else x)
idmap_fasta_df["uniprotkb"] = "uniprotkb:" + idmap_fasta_df["uniprotkb"]
idmap_fasta_df = idmap_fasta_df.drop(columns=["uniprot_id_full","description"])
display(idmap_fasta_df.head())

# Group by (accession, sequence) to find isoforms of one protein that share the exact same sequence, if any
test1 = idmap_fasta_df.groupby(["canonical_uniprotkb","sequence"]).agg(
    unique_isoforms_from_uniprotkb=("isoform_from_uniprotkb", lambda x: set(x)),
    unique_isoforms_from_desc=("isoform_from_desc", lambda x: set(x)),
).reset_index()
test1["total_isoforms_from_uniprotkb"] = test1["unique_isoforms_from_uniprotkb"].apply(lambda x: len(x))
test1["total_isoforms_from_desc"] = test1["unique_isoforms_from_desc"].apply(lambda x: len(x))
test2 = len(test1.loc[test1["total_isoforms_from_uniprotkb"]>1])
print(f"Total instances where two different isoforms of the same protein in UniProt have the exact same sequence: {test2}")

# Check if there are any cases where there's a blank (Isoform 0) AND an Isoform 1 for the same protein
test1 = idmap_fasta_df.groupby(["canonical_uniprotkb"]).agg(
    unique_isoforms_from_uniprotkb=("isoform_from_uniprotkb", lambda x: list(set(x))),
)
test1 = len(test1.loc[(test1["unique_isoforms_from_uniprotkb"].apply(lambda x: "Isoform 0" in x)) & (test1["unique_isoforms_from_uniprotkb"].apply(lambda x: "Isoform 1" in x))])
print(f"Total instances where there are both an Isoform 0 and Isoform 1 for the same protein: {test1}")

# Make sure that there always is exactly one canonical record per accession.
# Count the "Isoform 0" records themselves: counting inside a de-duplicated set of labels
# can only ever give 0 or 1 and would hide duplicated canonical records.
iso_0_count = (
    (idmap_fasta_df["isoform_from_uniprotkb"]=="Isoform 0")
    .groupby(idmap_fasta_df["canonical_uniprotkb"])
    .sum()
)
test1 = (iso_0_count==1).all()
print(f"Every uniprotkb has exactly one canonical isoform, which we have named Isoform 0: {test1}")



Unnamed: 0,sequence,database,uniprotkb,uniprot_gene_name,uniprotkb_iso,isoform_from_desc,isoform_from_uniprotkb,canonical_uniprotkb
0,MAAQFFNRIGQMGLGVAVLGGVVNSALYNVEGGHRAVIFDRFTGIK...,tr,uniprotkb:A0A023GQA5,A0A023GQA5_DROME,A0A023GQA5-0,,Isoform 0,A0A023GQA5
1,MLFFNRWGKIRMLEPYQPKFQQQHRSSCPLVDLDAVTTHQRSSVSR...,tr,uniprotkb:A0A023GRW2,A0A023GRW2_DROME,A0A023GRW2-0,,Isoform 0,A0A023GRW2
2,MLLSELVATASSLPYTAISIHNNCRVPAARHIHHGCRYFHGPPVMH...,sp,uniprotkb:A0A023PXA5,YA19A_YEAST,A0A023PXA5-0,,Isoform 0,A0A023PXA5
3,MYLCYTCFFLPSYDCKRLFTIVRAYIPARLSCNQPMVLFFTSPSSS...,sp,uniprotkb:A0A023PXH6,YM172_YEAST,A0A023PXH6-0,,Isoform 0,A0A023PXH6
4,MIGTSSLYQLLKITFFFYPYATVLKVGKVGVVVRIVDGAFGPVSLL...,sp,uniprotkb:A0A023PXI0,YM306_YEAST,A0A023PXI0-0,,Isoform 0,A0A023PXI0


Total instances where two different isoforms of the same protein in UniProt have the exact same sequence: 0
Total instances where there are both an Isoform 0 and Isoform 1 for the same protein: 427
Every uniprotkb has exactly one canonical isoform, which we have named Isoform 0: True


In [141]:
def get_subsequence(seq, coords, one_indexed=True, end_inclusive=True):
    """
    Helper method for extracting a subsequence from a full sequence.

    Args:
        seq: the full sequence to slice (normally a str).
        coords: ``[start, end]`` or a single-element ``[position]``. Each bound may be
            a string or an int; one leading "?" on a bound is ignored.
        one_indexed: if True, the bounds count from 1 and are shifted to 0-based.
        end_inclusive: if True, the end bound is part of the subsequence. A
            single-element ``coords`` is always treated as inclusive.

    Returns:
        The sliced subsequence, or None when the bounds cannot be parsed as integers,
        resolve to a negative index, ``coords`` does not have one or two elements,
        or ``seq`` cannot be sliced (e.g. it is None).
    """
    try:
        if len(coords)==1:
            # a lone position means "this residue only"
            coords = [coords[0],coords[0]]
            end_inclusive=True
        start, end = coords

        bounds = []
        for bound in (start, end):
            # str() lets integer bounds through the same parsing path as strings
            bound = str(bound)
            # if there is a leading question mark, remove it
            if bound.startswith("?"):
                bound = bound[1:]
            bounds.append(int(bound))
        start, end = bounds

        if one_indexed:
            start = start - 1
            end = end - 1
        if end_inclusive:
            end = end + 1

        # A negative index would silently wrap around to the end of the sequence
        if start < 0 or end < 0:
            return None

        return seq[start:end]
    except (TypeError, ValueError, IndexError):
        # unparsable bound, wrong number of bounds, or a seq/coords that is not sliceable/sized
        return None

In [142]:
# make the additional sequences from tsv
# One frame per feature type, holding only the rows where that feature is present.
# idmap_tsv_df was exploded on all five feature columns in turn, which repeats a feature
# once for every combination of the entry's other features, so each frame is
# de-duplicated on (From, Entry, feature) to keep a single row per feature.
idmap_tsv_df_chains = idmap_tsv_df.loc[idmap_tsv_df["Chain"].notna()].drop_duplicates(subset=["From","Entry","Chain"]).reset_index(drop=True).copy()
idmap_tsv_df_peptides = idmap_tsv_df.loc[idmap_tsv_df["Peptide"].notna()].drop_duplicates(subset=["From","Entry","Peptide"]).reset_index(drop=True).copy()
idmap_tsv_df_propeptides = idmap_tsv_df.loc[idmap_tsv_df["Propeptide"].notna()].drop_duplicates(subset=["From","Entry","Propeptide"]).reset_index(drop=True).copy()
idmap_tsv_df_sigpeptides = idmap_tsv_df.loc[idmap_tsv_df["Signal peptide"].notna()].drop_duplicates(subset=["From","Entry","Signal peptide"]).reset_index(drop=True).copy()
idmap_tsv_df_transitpeptides = idmap_tsv_df.loc[idmap_tsv_df["Transit peptide"].notna()].drop_duplicates(subset=["From","Entry","Transit peptide"]).reset_index(drop=True).copy()

# make a dictionary from the FASTA df with the canonical isoform: accession -> sequence
canonical_seq_dict = idmap_fasta_df.loc[idmap_fasta_df["isoform_from_uniprotkb"]=="Isoform 0"].reset_index(drop=True)
canonical_seq_dict = dict(zip(canonical_seq_dict["canonical_uniprotkb"],canonical_seq_dict["sequence"]))
print(f"\tMade a mapping of UniProt IDs to their canonical sequences: {len(canonical_seq_dict)} entries")

	Made a mapping of UniProt IDs to their canonical sequences: 87607 entries


In [143]:
# Process chains
# Each chain row becomes its own pseudo-entry: the other feature columns are blanked, the row is
# renamed "<Entry>-<chain id>" and its sequence is cut out of the entry's canonical sequence.
idmap_tsv_df_chains[["Peptide","Propeptide","Signal peptide","Transit peptide"]] = None
# text before the first ";" is "<start>..<end>"; splitting on ".." leaves the bounds as strings
idmap_tsv_df_chains["Chain_coords_1ind"] = idmap_tsv_df_chains["Chain"].apply(lambda x: x.split(";")[0].strip().split(".."))
# value of the /id="..." qualifier with the quotes removed; None when the qualifier is absent
idmap_tsv_df_chains["Chain_name"] = idmap_tsv_df_chains["Chain"].apply(lambda x: x.split("; /id=")[1].split(";")[0].strip().strip("\"") if "; /id=" in x else None)
test1 = len(idmap_tsv_df_chains.loc[idmap_tsv_df_chains["Entry"].str.contains("-")])==0
print(f"Processing all of the chains - extracting their sequences and getting their names")
print(f"\tTotal unique chains: {len(idmap_tsv_df_chains)}")
print(f"\tEverything in the IDmap TSV is canonical (no isoform indicated by -): {test1}")
# None when the entry has no canonical sequence in the FASTA-derived mapping
idmap_tsv_df_chains["canonical_sequence"] = idmap_tsv_df_chains["Entry"].apply(lambda x: canonical_seq_dict.get(x))
test1 = len(idmap_tsv_df_chains.loc[idmap_tsv_df_chains["canonical_sequence"].isna()])==0
print(f"\tAll uniprots could be mapped to a canonical sequence: {test1}")
test1 = len(idmap_tsv_df_chains.loc[idmap_tsv_df_chains["Chain_name"].isna()])==0
print(f"\tAll chains have a name: {test1}")
idmap_tsv_df_chains["uniprotkb"] = idmap_tsv_df_chains.apply(lambda row: row["Entry"] + "-" + row["Chain_name"], axis=1)
# get_subsequence returns None when the bounds cannot be parsed, so unmapped rows show up as NaN/None
idmap_tsv_df_chains["Sequence"] = idmap_tsv_df_chains.apply(lambda row: get_subsequence(row["canonical_sequence"],row["Chain_coords_1ind"], one_indexed=True, end_inclusive=True), axis=1)
test1 = len(idmap_tsv_df_chains.loc[idmap_tsv_df_chains["Sequence"].isna()])
print(f"\tTotal rows that could not be mapped to a subsequence: {test1} ({100*test1/len(idmap_tsv_df_chains):.2f}%)")
test1 = len(idmap_tsv_df_chains.loc[idmap_tsv_df_chains["Sequence"].notna()])
print(f"\tTotal rows that were successfully mapped to a subsequence: {test1} ({100*test1/len(idmap_tsv_df_chains):.2f}%)")
idmap_tsv_df_chains.head()

Processing all of the chains - extracting their sequences and getting their names
	Total unique chains: 56946
	Everything in the IDmap TSV is canonical (no isoform indicated by -): True
	All uniprots could be mapped to a canonical seuqence: True
	All chains have a name: True
	Total rows that could not be mapped to a subsequence: 505 (0.89%)
	Total rows that were successfully mapped to a subsequence: 56441 (99.11%)


Unnamed: 0,From,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,Chain,Peptide,Propeptide,Signal peptide,Transit peptide,Chain_coords_1ind,Chain_name,canonical_sequence,uniprotkb,Sequence
0,A0A023PXA5,A0A023PXA5,reviewed,YA19A_YEAST,Putative uncharacterized protein YAL019W-A,YAL019W-A,Saccharomyces cerevisiae (strain ATCC 204508 /...,189.0,"1..189; /note=""Putative uncharacterized protei...",,,,,"[1, 189]",PRO_0000430976,MLLSELVATASSLPYTAISIHNNCRVPAARHIHHGCRYFHGPPVMH...,A0A023PXA5-PRO_0000430976,MLLSELVATASSLPYTAISIHNNCRVPAARHIHHGCRYFHGPPVMH...
1,A0A023PXH6,A0A023PXH6,reviewed,YM172_YEAST,Putative uncharacterized membrane protein YMR1...,YMR172C-A,Saccharomyces cerevisiae (strain ATCC 204508 /...,127.0,"1..127; /note=""Putative uncharacterized membra...",,,,,"[1, 127]",PRO_0000431055,MYLCYTCFFLPSYDCKRLFTIVRAYIPARLSCNQPMVLFFTSPSSS...,A0A023PXH6-PRO_0000431055,MYLCYTCFFLPSYDCKRLFTIVRAYIPARLSCNQPMVLFFTSPSSS...
2,A0A023PXI0,A0A023PXI0,reviewed,YM306_YEAST,Putative uncharacterized membrane protein YMR3...,YMR306C-A,Saccharomyces cerevisiae (strain ATCC 204508 /...,129.0,"1..129; /note=""Putative uncharacterized membra...",,,,,"[1, 129]",PRO_0000431050,MIGTSSLYQLLKITFFFYPYATVLKVGKVGVVVRIVDGAFGPVSLL...,A0A023PXI0-PRO_0000431050,MIGTSSLYQLLKITFFFYPYATVLKVGKVGVVVRIVDGAFGPVSLL...
3,A0A023PXP4,A0A023PXP4,reviewed,YL235_YEAST,Putative uncharacterized protein YLR235C,YLR235C,Saccharomyces cerevisiae (strain ATCC 204508 /...,132.0,"1..132; /note=""Putative uncharacterized protei...",,,,,"[1, 132]",PRO_0000431045,MLIGAPSNMRLRGALELLWRRLLHGLMQLRLVLKMHICSQLNHAIK...,A0A023PXP4-PRO_0000431045,MLIGAPSNMRLRGALELLWRRLLHGLMQLRLVLKMHICSQLNHAIK...
4,A0A023PYF7,A0A023PYF7,reviewed,YE172_YEAST,Putative uncharacterized protein YER172C-A,YER172C-A,Saccharomyces cerevisiae (strain ATCC 204508 /...,126.0,"1..126; /note=""Putative uncharacterized protei...",,,,,"[1, 126]",PRO_0000431006,MTVLLEHPLGPDSSRILCLALGKNMASKASCTSLSFLLCMATCSKQ...,A0A023PYF7-PRO_0000431006,MTVLLEHPLGPDSSRILCLALGKNMASKASCTSLSFLLCMATCSKQ...


In [144]:
# Process peptides
# Each peptide row becomes its own pseudo-entry: the other feature columns are blanked, the row is
# renamed "<Entry>-<peptide id>" and its sequence is cut out of the entry's canonical sequence.
idmap_tsv_df_peptides[["Chain","Propeptide","Signal peptide","Transit peptide"]] = None
# text before the first ";" is "<start>..<end>"; splitting on ".." leaves the bounds as strings
idmap_tsv_df_peptides["Peptide_coords_1ind"] = idmap_tsv_df_peptides["Peptide"].apply(lambda x: x.split(";")[0].strip().split(".."))
# value of the /id="..." qualifier with the quotes removed; None when the qualifier is absent
idmap_tsv_df_peptides["Peptide_name"] = idmap_tsv_df_peptides["Peptide"].apply(lambda x: x.split("; /id=")[1].split(";")[0].strip().strip("\"") if "; /id=" in x else None)
test1 = len(idmap_tsv_df_peptides.loc[idmap_tsv_df_peptides["Entry"].str.contains("-")])==0
print(f"Processing all of the peptides - extracting their sequences and getting their names")
print(f"\tTotal unique peptides: {len(idmap_tsv_df_peptides)}")
print(f"\tEverything in the IDmap TSV is canonical (no isoform indicated by -): {test1}")
# None when the entry has no canonical sequence in the FASTA-derived mapping
idmap_tsv_df_peptides["canonical_sequence"] = idmap_tsv_df_peptides["Entry"].apply(lambda x: canonical_seq_dict.get(x))
test1 = len(idmap_tsv_df_peptides.loc[idmap_tsv_df_peptides["canonical_sequence"].isna()])==0
print(f"\tAll uniprots could be mapped to a canonical sequence: {test1}")
test1 = len(idmap_tsv_df_peptides.loc[idmap_tsv_df_peptides["Peptide_name"].isna()])==0
print(f"\tAll peptides have a name: {test1}")
idmap_tsv_df_peptides["uniprotkb"] = idmap_tsv_df_peptides.apply(lambda row: row["Entry"] + "-" + row["Peptide_name"], axis=1)
# get_subsequence returns None when the bounds cannot be parsed, so unmapped rows show up as NaN/None
idmap_tsv_df_peptides["Sequence"] = idmap_tsv_df_peptides.apply(lambda row: get_subsequence(row["canonical_sequence"],row["Peptide_coords_1ind"], one_indexed=True, end_inclusive=True), axis=1)
test1 = len(idmap_tsv_df_peptides.loc[idmap_tsv_df_peptides["Sequence"].isna()])
print(f"\tTotal rows that could not be mapped to a subsequence: {test1} ({100*test1/len(idmap_tsv_df_peptides):.2f}%)")
test1 = len(idmap_tsv_df_peptides.loc[idmap_tsv_df_peptides["Sequence"].notna()])
print(f"\tTotal rows that were successfully mapped to a subsequence: {test1} ({100*test1/len(idmap_tsv_df_peptides):.2f}%)")
idmap_tsv_df_peptides.head()

Processing all of the chains - extracting their sequences and getting their names
	Total unique chains: 1640
	Everything in the IDmap TSV is canonical (no isoform indicated by -): True
	All uniprots could be mapped to a canonical seuqence: True
	All chains have a name: True
	Total rows that could not be mapped to a subsequence: 5 (0.30%)
	Total rows that were successfully mapped to a subsequence: 1635 (99.70%)


Unnamed: 0,From,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,Chain,Peptide,Propeptide,Signal peptide,Transit peptide,Peptide_coords_1ind,Peptide_name,canonical_sequence,uniprotkb,Sequence
0,A0A024B7W1,A0A024B7W1,reviewed,POLG_ZIKVF,Genome polyprotein [Cleaved into: Capsid prote...,,Zika virus (isolate ZIKV/Human/French Polynesi...,3423.0,,"2247..2269; /note=""Peptide 2k""; /id=""PRO_00004...",,,,"[2247, 2269]",PRO_0000443030,MKNPKKKSGGFRIVNMLKRGVARVSPFGGLKRLPAGLLLGHGPIRM...,A0A024B7W1-PRO_0000443030,SPQDNQMAIIIMVAVGLLGLITA
1,A0A024B7W1,A0A024B7W1,reviewed,POLG_ZIKVF,Genome polyprotein [Cleaved into: Capsid prote...,,Zika virus (isolate ZIKV/Human/French Polynesi...,3423.0,,"2247..2269; /note=""Peptide 2k""; /id=""PRO_00004...",,,,"[2247, 2269]",PRO_0000443030,MKNPKKKSGGFRIVNMLKRGVARVSPFGGLKRLPAGLLLGHGPIRM...,A0A024B7W1-PRO_0000443030,SPQDNQMAIIIMVAVGLLGLITA
2,A0A024B7W1,A0A024B7W1,reviewed,POLG_ZIKVF,Genome polyprotein [Cleaved into: Capsid prote...,,Zika virus (isolate ZIKV/Human/French Polynesi...,3423.0,,"2247..2269; /note=""Peptide 2k""; /id=""PRO_00004...",,,,"[2247, 2269]",PRO_0000443030,MKNPKKKSGGFRIVNMLKRGVARVSPFGGLKRLPAGLLLGHGPIRM...,A0A024B7W1-PRO_0000443030,SPQDNQMAIIIMVAVGLLGLITA
3,A0A024B7W1,A0A024B7W1,reviewed,POLG_ZIKVF,Genome polyprotein [Cleaved into: Capsid prote...,,Zika virus (isolate ZIKV/Human/French Polynesi...,3423.0,,"2247..2269; /note=""Peptide 2k""; /id=""PRO_00004...",,,,"[2247, 2269]",PRO_0000443030,MKNPKKKSGGFRIVNMLKRGVARVSPFGGLKRLPAGLLLGHGPIRM...,A0A024B7W1-PRO_0000443030,SPQDNQMAIIIMVAVGLLGLITA
4,A0A024B7W1,A0A024B7W1,reviewed,POLG_ZIKVF,Genome polyprotein [Cleaved into: Capsid prote...,,Zika virus (isolate ZIKV/Human/French Polynesi...,3423.0,,"2247..2269; /note=""Peptide 2k""; /id=""PRO_00004...",,,,"[2247, 2269]",PRO_0000443030,MKNPKKKSGGFRIVNMLKRGVARVSPFGGLKRLPAGLLLGHGPIRM...,A0A024B7W1-PRO_0000443030,SPQDNQMAIIIMVAVGLLGLITA


In [145]:
# Process propeptides
# Each propeptide row becomes its own pseudo-entry: the other feature columns are blanked, the row is
# renamed "<Entry>-<propeptide id>" and its sequence is cut out of the entry's canonical sequence.
idmap_tsv_df_propeptides[["Chain","Peptide","Signal peptide","Transit peptide"]] = None
# text before the first ";" is "<start>..<end>"; splitting on ".." leaves the bounds as strings
idmap_tsv_df_propeptides["Propeptide_coords_1ind"] = idmap_tsv_df_propeptides["Propeptide"].apply(lambda x: x.split(";")[0].strip().split(".."))
# value of the /id="..." qualifier with the quotes removed; None when the qualifier is absent
idmap_tsv_df_propeptides["Propeptide_name"] = idmap_tsv_df_propeptides["Propeptide"].apply(lambda x: x.split("; /id=")[1].split(";")[0].strip().strip("\"") if "; /id=" in x else None)
test1 = len(idmap_tsv_df_propeptides.loc[idmap_tsv_df_propeptides["Entry"].str.contains("-")])==0
print(f"Processing all of the propeptides - extracting their sequences and getting their names")
print(f"\tTotal unique propeptides: {len(idmap_tsv_df_propeptides)}")
print(f"\tEverything in the IDmap TSV is canonical (no isoform indicated by -): {test1}")
# None when the entry has no canonical sequence in the FASTA-derived mapping
idmap_tsv_df_propeptides["canonical_sequence"] = idmap_tsv_df_propeptides["Entry"].apply(lambda x: canonical_seq_dict.get(x))
test1 = len(idmap_tsv_df_propeptides.loc[idmap_tsv_df_propeptides["canonical_sequence"].isna()])==0
print(f"\tAll uniprots could be mapped to a canonical sequence: {test1}")
test1 = len(idmap_tsv_df_propeptides.loc[idmap_tsv_df_propeptides["Propeptide_name"].isna()])==0
print(f"\tAll propeptides have a name: {test1}")
idmap_tsv_df_propeptides["uniprotkb"] = idmap_tsv_df_propeptides.apply(lambda row: row["Entry"] + "-" + row["Propeptide_name"], axis=1)
# get_subsequence returns None when the bounds cannot be parsed, so unmapped rows show up as NaN/None
idmap_tsv_df_propeptides["Sequence"] = idmap_tsv_df_propeptides.apply(lambda row: get_subsequence(row["canonical_sequence"],row["Propeptide_coords_1ind"], one_indexed=True, end_inclusive=True), axis=1)
test1 = len(idmap_tsv_df_propeptides.loc[idmap_tsv_df_propeptides["Sequence"].isna()])
print(f"\tTotal rows that could not be mapped to a subsequence: {test1} ({100*test1/len(idmap_tsv_df_propeptides):.2f}%)")
test1 = len(idmap_tsv_df_propeptides.loc[idmap_tsv_df_propeptides["Sequence"].notna()])
print(f"\tTotal rows that were successfully mapped to a subsequence: {test1} ({100*test1/len(idmap_tsv_df_propeptides):.2f}%)")
idmap_tsv_df_propeptides.head()

Processing all of the chains - extracting their sequences and getting their names
	Total unique chains: 2961
	Everything in the IDmap TSV is canonical (no isoform indicated by -): True
	All uniprots could be mapped to a canonical seuqence: True
	All chains have a name: True
	Total rows that could not be mapped to a subsequence: 51 (1.72%)
	Total rows that were successfully mapped to a subsequence: 2910 (98.28%)


Unnamed: 0,From,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,Chain,Peptide,Propeptide,Signal peptide,Transit peptide,Propeptide_coords_1ind,Propeptide_name,canonical_sequence,uniprotkb,Sequence
0,A0A024B7W1,A0A024B7W1,reviewed,POLG_ZIKVF,Genome polyprotein [Cleaved into: Capsid prote...,,Zika virus (isolate ZIKV/Human/French Polynesi...,3423.0,,,"105..122; /note=""ER anchor for capsid protein ...",,,"[105, 122]",PRO_0000443020,MKNPKKKSGGFRIVNMLKRGVARVSPFGGLKRLPAGLLLGHGPIRM...,A0A024B7W1-PRO_0000443020,GADTSVGIVGLLLTTAMA
1,A0A024B7W1,A0A024B7W1,reviewed,POLG_ZIKVF,Genome polyprotein [Cleaved into: Capsid prote...,,Zika virus (isolate ZIKV/Human/French Polynesi...,3423.0,,,"105..122; /note=""ER anchor for capsid protein ...",,,"[105, 122]",PRO_0000443020,MKNPKKKSGGFRIVNMLKRGVARVSPFGGLKRLPAGLLLGHGPIRM...,A0A024B7W1-PRO_0000443020,GADTSVGIVGLLLTTAMA
2,A0A024B7W1,A0A024B7W1,reviewed,POLG_ZIKVF,Genome polyprotein [Cleaved into: Capsid prote...,,Zika virus (isolate ZIKV/Human/French Polynesi...,3423.0,,,"105..122; /note=""ER anchor for capsid protein ...",,,"[105, 122]",PRO_0000443020,MKNPKKKSGGFRIVNMLKRGVARVSPFGGLKRLPAGLLLGHGPIRM...,A0A024B7W1-PRO_0000443020,GADTSVGIVGLLLTTAMA
3,A0A024B7W1,A0A024B7W1,reviewed,POLG_ZIKVF,Genome polyprotein [Cleaved into: Capsid prote...,,Zika virus (isolate ZIKV/Human/French Polynesi...,3423.0,,,"105..122; /note=""ER anchor for capsid protein ...",,,"[105, 122]",PRO_0000443020,MKNPKKKSGGFRIVNMLKRGVARVSPFGGLKRLPAGLLLGHGPIRM...,A0A024B7W1-PRO_0000443020,GADTSVGIVGLLLTTAMA
4,A0A024B7W1,A0A024B7W1,reviewed,POLG_ZIKVF,Genome polyprotein [Cleaved into: Capsid prote...,,Zika virus (isolate ZIKV/Human/French Polynesi...,3423.0,,,"105..122; /note=""ER anchor for capsid protein ...",,,"[105, 122]",PRO_0000443020,MKNPKKKSGGFRIVNMLKRGVARVSPFGGLKRLPAGLLLGHGPIRM...,A0A024B7W1-PRO_0000443020,GADTSVGIVGLLLTTAMA


In [146]:
# Process signal peptides
# Each signal-peptide row becomes its own pseudo-entry: the other feature columns are blanked, the row
# is renamed "<Entry>-<name>" and its sequence is cut out of the entry's canonical sequence.
idmap_tsv_df_sigpeptides[["Chain","Peptide","Propeptide","Transit peptide"]] = None
# text before the first ";" is "<start>..<end>"; splitting on ".." leaves the bounds as strings
idmap_tsv_df_sigpeptides["Sigpeptide_coords_1ind"] = idmap_tsv_df_sigpeptides["Signal peptide"].apply(lambda x: x.split(";")[0].strip().split(".."))
# use the /id="..." qualifier when present; otherwise make up a name "sigpep<start>..<end>" from the coordinates
idmap_tsv_df_sigpeptides["Sigpeptide_name"] = idmap_tsv_df_sigpeptides.apply(lambda x: x["Signal peptide"].split("; /id=")[1].split(";")[0].strip().strip("\"") if "; /id=" in x["Signal peptide"] else "sigpep"+x["Signal peptide"].split(";")[0].strip(), axis=1)
test1 = len(idmap_tsv_df_sigpeptides.loc[idmap_tsv_df_sigpeptides["Entry"].str.contains("-")])==0
print(f"Processing all of the signal peptides - extracting their sequences and getting their names")
print(f"\tTotal unique signal peptides: {len(idmap_tsv_df_sigpeptides)}")
print(f"\tEverything in the IDmap TSV is canonical (no isoform indicated by -): {test1}")
# None when the entry has no canonical sequence in the FASTA-derived mapping
idmap_tsv_df_sigpeptides["canonical_sequence"] = idmap_tsv_df_sigpeptides["Entry"].apply(lambda x: canonical_seq_dict.get(x))
test1 = len(idmap_tsv_df_sigpeptides.loc[idmap_tsv_df_sigpeptides["canonical_sequence"].isna()])==0
print(f"\tAll uniprots could be mapped to a canonical sequence: {test1}")
test1 = len(idmap_tsv_df_sigpeptides.loc[idmap_tsv_df_sigpeptides["Sigpeptide_name"].isna()])==0
print(f"\tAll signal peptides have a name: {test1}")
test1 = len(idmap_tsv_df_sigpeptides.loc[idmap_tsv_df_sigpeptides["Sigpeptide_name"].str.contains("sigpep")])
print(f"\t\tTotal signal peptides with made-up name by me: {test1} ({100*test1/len(idmap_tsv_df_sigpeptides):.2f}%)")
test1 = len(idmap_tsv_df_sigpeptides.loc[idmap_tsv_df_sigpeptides["Sigpeptide_name"].str.contains("PRO_")])
print(f"\t\tTotal signal peptides with name from UniProt: {test1} ({100*test1/len(idmap_tsv_df_sigpeptides):.2f}%)")
idmap_tsv_df_sigpeptides["uniprotkb"] = idmap_tsv_df_sigpeptides.apply(lambda row: row["Entry"] + "-" + row["Sigpeptide_name"], axis=1)
# get_subsequence returns None when the bounds cannot be parsed, so unmapped rows show up as NaN/None
idmap_tsv_df_sigpeptides["Sequence"] = idmap_tsv_df_sigpeptides.apply(lambda row: get_subsequence(row["canonical_sequence"],row["Sigpeptide_coords_1ind"], one_indexed=True, end_inclusive=True), axis=1)
test1 = len(idmap_tsv_df_sigpeptides.loc[idmap_tsv_df_sigpeptides["Sequence"].isna()])
print(f"\tTotal rows that could not be mapped to a subsequence: {test1} ({100*test1/len(idmap_tsv_df_sigpeptides):.2f}%)")
test1 = len(idmap_tsv_df_sigpeptides.loc[idmap_tsv_df_sigpeptides["Sequence"].notna()])
print(f"\tTotal rows that were successfully mapped to a subsequence: {test1} ({100*test1/len(idmap_tsv_df_sigpeptides):.2f}%)")
idmap_tsv_df_sigpeptides.head()

Processing all of the chains - extracting their sequences and getting their names
	Total unique chains: 11663
	Everything in the IDmap TSV is canonical (no isoform indicated by -): True
	All uniprots could be mapped to a canonical seuqence: True
	All chains have a name: True
		Total chains with made-up name by me: 11663 (100.00%)
		Total chains with name from UniProt: 0 (0.00%)
	Total rows that could not be mapped to a subsequence: 35 (0.30%)
	Total rows that were successfully mapped to a subsequence: 11628 (99.70%)


Unnamed: 0,From,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,Chain,Peptide,Propeptide,Signal peptide,Transit peptide,Sigpeptide_coords_1ind,Sigpeptide_name,canonical_sequence,uniprotkb,Sequence
0,A0A023PZE6,A0A023PZE6,reviewed,YD048_YEAST,Putative uncharacterized membrane protein YDR048C,YDR048C,Saccharomyces cerevisiae (strain ATCC 204508 /...,104.0,,,,"1..16; /evidence=""ECO:0000255""",,"[1, 16]",sigpep1..16,MSSLLQLLAVWSQSSSISMNSEVAQTNQKYKDHSFLVSQSFYSPFV...,A0A023PZE6-sigpep1..16,MSSLLQLLAVWSQSSS
1,A0A024A2C9,A0A024A2C9,unreviewed,A0A024A2C9_HAEIF,Lipoprotein binding FH,lph,Haemophilus influenzae,280.0,,,,"1..29; /evidence=""ECO:0000256|SAM:SignalP""",,"[1, 29]",sigpep1..29,MNINLKKFSLTILAALTLTACGSGSGASASNAPTAQPSTPATQPSE...,A0A024A2C9-sigpep1..29,MNINLKKFSLTILAALTLTACGSGSGASA
2,A0A089ZWN7,A0A089ZWN7,unreviewed,A0A089ZWN7_DATST,Chitin-binding lectin,dsa-b,Datura stramonium (Jimsonweed) (Common thornap...,279.0,,,,"1..23; /evidence=""ECO:0000256|SAM:SignalP""",,"[1, 23]",sigpep1..23,MMRMRHTAISLLALALFFLKVSAKLSLPFYLPANETLGLEVGNTSA...,A0A089ZWN7-sigpep1..23,MMRMRHTAISLLALALFFLKVSA
3,A0A096PNU3,A0A096PNU3,unreviewed,A0A096PNU3_MAIZE,Putative lipid-transfer protein DIR1,LOC100281647 ZEAMMB73_Zm00001d019399,Zea mays (Maize),103.0,,,,"1..26; /evidence=""ECO:0000256|SAM:SignalP""",,"[1, 26]",sigpep1..26,MAKAGATAVLVTVLVVLAASAEMAHGVCNLSSAGIRACQPAAAIRN...,A0A096PNU3-sigpep1..26,MAKAGATAVLVTVLVVLAASAEMAHG
4,A0A096QAE9,A0A096QAE9,unreviewed,A0A096QAE9_MAIZE,BURP domain protein RD22 (BURP7),LOC100274899 ZEAMMB73_Zm00001d018837,Zea mays (Maize),525.0,,,,"1..19; /evidence=""ECO:0000256|SAM:SignalP""",,"[1, 19]",sigpep1..19,MARGIILLLLVTPLAISMALPSLSTRIEGSVIGERSNLFSQLHSDN...,A0A096QAE9-sigpep1..19,MARGIILLLLVTPLAISMA


In [147]:
# Process transit peptides
# Each transit-peptide row becomes its own pseudo-entry: the other feature columns are blanked, the row
# is renamed "<Entry>-<name>" and its sequence is cut out of the entry's canonical sequence.
idmap_tsv_df_transitpeptides[["Chain","Peptide","Signal peptide","Propeptide"]] = None
# text before the first ";" is "<start>..<end>"; splitting on ".." leaves the bounds as strings
idmap_tsv_df_transitpeptides["Transpeptide_coords_1ind"] = idmap_tsv_df_transitpeptides["Transit peptide"].apply(lambda x: x.split(";")[0].strip().split(".."))
# use the /id="..." qualifier when present; otherwise make up a name "transpep<start>..<end>" from the coordinates
idmap_tsv_df_transitpeptides["Transpeptide_name"] = idmap_tsv_df_transitpeptides.apply(lambda x: x["Transit peptide"].split("; /id=")[1].split(";")[0].strip().strip("\"") if "; /id=" in x["Transit peptide"] else "transpep"+x["Transit peptide"].split(";")[0].strip(), axis=1)
test1 = len(idmap_tsv_df_transitpeptides.loc[idmap_tsv_df_transitpeptides["Entry"].str.contains("-")])==0
print(f"Processing all of the transit peptides - extracting their sequences and getting their names")
print(f"\tTotal unique transit peptides: {len(idmap_tsv_df_transitpeptides)}")
print(f"\tEverything in the IDmap TSV is canonical (no isoform indicated by -): {test1}")
# None when the entry has no canonical sequence in the FASTA-derived mapping
idmap_tsv_df_transitpeptides["canonical_sequence"] = idmap_tsv_df_transitpeptides["Entry"].apply(lambda x: canonical_seq_dict.get(x))
test1 = len(idmap_tsv_df_transitpeptides.loc[idmap_tsv_df_transitpeptides["canonical_sequence"].isna()])==0
print(f"\tAll uniprots could be mapped to a canonical sequence: {test1}")
test1 = len(idmap_tsv_df_transitpeptides.loc[idmap_tsv_df_transitpeptides["Transpeptide_name"].isna()])==0
print(f"\tAll transit peptides have a name: {test1}")
test1 = len(idmap_tsv_df_transitpeptides.loc[idmap_tsv_df_transitpeptides["Transpeptide_name"].str.contains("transpep")])
print(f"\t\tTotal transit peptides with made-up name by me: {test1} ({100*test1/len(idmap_tsv_df_transitpeptides):.2f}%)")
test1 = len(idmap_tsv_df_transitpeptides.loc[idmap_tsv_df_transitpeptides["Transpeptide_name"].str.contains("PRO_")])
print(f"\t\tTotal transit peptides with name from UniProt: {test1} ({100*test1/len(idmap_tsv_df_transitpeptides):.2f}%)")
idmap_tsv_df_transitpeptides["uniprotkb"] = idmap_tsv_df_transitpeptides.apply(lambda row: row["Entry"] + "-" + row["Transpeptide_name"], axis=1)
# get_subsequence returns None when the bounds cannot be parsed, so unmapped rows show up as NaN/None
idmap_tsv_df_transitpeptides["Sequence"] = idmap_tsv_df_transitpeptides.apply(lambda row: get_subsequence(row["canonical_sequence"],row["Transpeptide_coords_1ind"], one_indexed=True, end_inclusive=True), axis=1)
test1 = len(idmap_tsv_df_transitpeptides.loc[idmap_tsv_df_transitpeptides["Sequence"].isna()])
print(f"\tTotal rows that could not be mapped to a subsequence: {test1} ({100*test1/len(idmap_tsv_df_transitpeptides):.2f}%)")
test1 = len(idmap_tsv_df_transitpeptides.loc[idmap_tsv_df_transitpeptides["Sequence"].notna()])
print(f"\tTotal rows that were successfully mapped to a subsequence: {test1} ({100*test1/len(idmap_tsv_df_transitpeptides):.2f}%)")
idmap_tsv_df_transitpeptides.head()

Processing all of the chains - extracting their sequences and getting their names
	Total unique chains: 1865
	Everything in the IDmap TSV is canonical (no isoform indicated by -): True
	All uniprots could be mapped to a canonical seuqence: True
	All chains have a name: True
		Total chains with made-up name by me: 1865 (100.00%)
		Total chains with name from UniProt: 0 (0.00%)
	Total rows that could not be mapped to a subsequence: 254 (13.62%)
	Total rows that were successfully mapped to a subsequence: 1611 (86.38%)


Unnamed: 0,From,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,Chain,Peptide,Propeptide,Signal peptide,Transit peptide,Transpeptide_coords_1ind,Transpeptide_name,canonical_sequence,uniprotkb,Sequence
0,A0A096LP55,A0A096LP55,reviewed,QCR6L_HUMAN,"Cytochrome b-c1 complex subunit 6-like, mitoch...",UQCRHL,Homo sapiens (Human),91.0,,,,,"1..13; /note=""Mitochondrion""; /evidence=""ECO:0...","[1, 13]",transpep1..13,MGLEDEQKMLTESGDPEEEEEEEEELVDPLTTVREQCEQLEKCVKA...,A0A096LP55-transpep1..13,MGLEDEQKMLTES
1,A0A1D6GDY8,A0A1D6GDY8,reviewed,RH3A_MAIZE,"DEAD-box ATP-dependent RNA helicase 3A, chloro...",RH3A ZEAMMB73_Zm00001d012922,Zea mays (Maize),745.0,,,,,"1..41; /note=""Chloroplast""; /evidence=""ECO:000...","[1, 41]",transpep1..41,MASLVTLPAIAFSNPATASGAVRLRAAAFRCWALRRRGWAVAAAVA...,A0A1D6GDY8-transpep1..41,MASLVTLPAIAFSNPATASGAVRLRAAAFRCWALRRRGWAV
2,A0A1D6LAG9,A0A1D6LAG9,reviewed,CPS1_MAIZE,"Cysteine--tRNA ligase CPS1, chloroplastic/mito...",CPS1 ZEAMMB73_Zm00001d034736,Zea mays (Maize),564.0,,,,,"1..43; /note=""Chloroplast and mitochondrion""; ...","[1, 43]",transpep1..43,MAAAVVVRRAAGLIPLLSSRFGARMPLHRALSQIPPPRFCRLLSQQ...,A0A1D6LAG9-transpep1..43,MAAAVVVRRAAGLIPLLSSRFGARMPLHRALSQIPPPRFCRLL
3,A1L4X0,A1L4X0,reviewed,CLT2_ARATH,"Protein CLT2, chloroplastic (CRT-like transpor...",CLT2 At4g24460 T22A6.290,Arabidopsis thaliana (Mouse-ear cress),431.0,,,,,"1..79; /note=""Chloroplast""; /evidence=""ECO:000...","[1, 79]",transpep1..79,MDTVLMATTPPIRCLHASIPTVFRSPAIYQVSCRSSQLFSYRSTTM...,A1L4X0-transpep1..79,MDTVLMATTPPIRCLHASIPTVFRSPAIYQVSCRSSQLFSYRSTTM...
4,A1XBS5,A1XBS5,reviewed,CBAR1_HUMAN,CBY1-interacting BAR domain-containing protein 1,CIBAR1 FAM92A FAM92A1,Homo sapiens (Human),289.0,,,,,"1..47; /note=""Mitochondrion""; /evidence=""ECO:0...","[1, 47]",transpep1..47,MMRRTLENRNAQTKQLQTAVSNVEKHFGELCQIFAAYVRKTARLRD...,A1XBS5-transpep1..47,MMRRTLENRNAQTKQLQTAVSNVEKHFGELCQIFAAYVRKTARLRDK


In [148]:
# Start from the rows of idmap_tsv_df in which none of the five feature columns
# is annotated; they are keyed by the plain entry accession and receive the
# sequence looked up in canonical_seq_dict (None when the entry is absent).
_feature_cols = ["Chain", "Peptide", "Propeptide", "Signal peptide", "Transit peptide"]
_no_feature_rows = idmap_tsv_df[_feature_cols].isna().all(axis=1)


def _canonical_seq_or_none(entry):
    """Return canonical_seq_dict[entry], or None when entry is not a key."""
    if entry in canonical_seq_dict:
        return canonical_seq_dict[entry]
    return None


idmap_tsv_df_combined = idmap_tsv_df.loc[_no_feature_rows].reset_index(drop=True).copy()
idmap_tsv_df_combined["uniprotkb"] = idmap_tsv_df_combined["Entry"]
idmap_tsv_df_combined["Sequence"] = idmap_tsv_df_combined["Entry"].apply(_canonical_seq_or_none)

# Stack the chain / peptide / propeptide tables underneath, then discard the
# per-feature coordinate/name helper columns and the full canonical sequence.
_stacked_frames = [
    idmap_tsv_df_combined,
    idmap_tsv_df_chains,
    idmap_tsv_df_peptides,
    idmap_tsv_df_propeptides,
]
_helper_cols = [
    "Chain_coords_1ind", "Chain_name",
    "Peptide_coords_1ind", "Peptide_name",
    "Propeptide_coords_1ind", "Propeptide_name",
    "canonical_sequence",
]
idmap_tsv_df_combined = pd.concat(_stacked_frames).reset_index(drop=True)
idmap_tsv_df_combined = idmap_tsv_df_combined.drop(columns=_helper_cols)
idmap_tsv_df_combined

Unnamed: 0,From,Entry,Reviewed,Entry Name,Protein names,Gene Names,Organism,Length,Chain,Peptide,Propeptide,Signal peptide,Transit peptide,uniprotkb,Sequence
0,A0A023GQA5,A0A023GQA5,unreviewed,A0A023GQA5_DROME,Prohibitin,Phb1 Cc Dmel\CG10691 fs(2)HH32 l (2) 37Cc l(2)...,Drosophila melanogaster (Fruit fly),276.0,,,,,,A0A023GQA5,MAAQFFNRIGQMGLGVAVLGGVVNSALYNVEGGHRAVIFDRFTGIK...
1,A0A023GRW2,A0A023GRW2,unreviewed,A0A023GRW2_DROME,IP07931p,CG12541,Drosophila melanogaster (Fruit fly),124.0,,,,,,A0A023GRW2,MLFFNRWGKIRMLEPYQPKFQQQHRSSCPLVDLDAVTTHQRSSVSR...
2,A0A024R0Y4,A0A024R0Y4,unreviewed,A0A024R0Y4_HUMAN,Transcriptional adapter,TADA2A TADA2L hCG_28187,Homo sapiens (Human),443.0,,,,,,A0A024R0Y4,MDRLGSFSNDPSDKPPCRGCSSYLMEPYIKCAECGPPPFFLCLQCF...
3,A0A045IZR3,A0A045IZR3,unreviewed,A0A045IZR3_MYCTX,Methylmalonyl-CoA mutase small subunit (EC 5.4...,mutA A4S10_01581 DKC2_1589 DSJ38_02340 ERS0941...,Mycobacterium tuberculosis,615.0,,,,,,A0A045IZR3,MSIDVPERADLEQVRGRWRNAVAGVLSKSNRTDSAQLGDHPERLLD...
4,A0A060CUY1,A0A060CUY1,unreviewed,A0A060CUY1_MAIZE,BHLH transcription factor (Transcription facto...,bHLH86 LOC100383089 ZEAMMB73_Zm00001d020826,Zea mays (Maize),434.0,,,,,,A0A060CUY1,MDMNESGEKGMEGNASSGIPVDWQTQFSAAAFSCAPPQQQQVPMMD...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96317,Q9Z0X1,Q9Z0X1,reviewed,AIFM1_MOUSE,"Apoptosis-inducing factor 1, mitochondrial (EC...",Aifm1 Aif Pdcd8,Mus musculus (Mouse),612.0,,,"55..101; /note=""Removed in mature form""; /evid...",,,Q9Z0X1-PRO_0000401936,SSGSSGGKMDNSVLVLIVGLSTIGAGAYAYKTIKEDQKRYNERVMGL
96318,Q9Z1W4,Q9Z1W4,reviewed,GDF11_MOUSE,Growth/differentiation factor 11 (GDF-11) (Bon...,Gdf11 Bmp11,Mus musculus (Mouse),405.0,,,"21..296; /evidence=""ECO:0000250""; /id=""PRO_000...",,,Q9Z1W4-PRO_0000033988,RGEAAEGPAAAAAAAAAAAGVGGERSSRPAPSAPPEPDGCPVCVWR...
96319,Q9Z1Y3,Q9Z1Y3,reviewed,CADH2_RAT,Cadherin-2 (Neural cadherin) (N-cadherin) (CD ...,Cdh2,Rattus norvegicus (Rat),906.0,,,"26..159; /evidence=""ECO:0000255""; /id=""PRO_000...",,,Q9Z1Y3-PRO_0000003735,SGELALCKTGFPEDVYSAVLPKTVHEGQPLLNVKFSNCNRKRKVQY...
96320,Q9ZRD6,Q9ZRD6,reviewed,YKT61_ARATH,VAMP-like protein YKT61 (AtYKT61) (Geranylgera...,YKT61 At5g58060 K21L19.5 K21L19_40,Arabidopsis thaliana (Mouse-ear cress),199.0,,,"197..199; /note=""Removed in mature form""; /evi...",,,Q9ZRD6-PRO_0000370846,TIL


In [149]:
# Build a slimmed-down, renamed view of idmap_tsv_df_combined:
#   Entry      -> uniprotkb          (later prefixed with "uniprotkb:")
#   uniprotkb  -> uniprotkb_iso      (IDs without a "-" get the suffix "-0")
#   Reviewed   -> database           ("reviewed" -> "sp", anything else -> "tr")
#   Entry Name -> uniprot_gene_name
#   Sequence   -> sequence
idmap_tsv_df_combined_for_merge = idmap_tsv_df_combined.drop(columns=[
    "From","Protein names","Gene Names","Organism","Length","Chain","Peptide","Propeptide","Signal peptide","Transit peptide"
])
idmap_tsv_df_combined_for_merge = idmap_tsv_df_combined_for_merge.rename(
    columns = {
        "Entry": "uniprotkb",
        "uniprotkb": "uniprotkb_iso",
        "Reviewed": "database",
        "Entry Name": "uniprot_gene_name",
        "Sequence": "sequence"
    }
)
idmap_tsv_df_combined_for_merge["uniprotkb_iso"] = idmap_tsv_df_combined_for_merge["uniprotkb_iso"].apply(
    lambda x: x if "-" in x else f"{x}-0")

# Verify the actual labels rather than just counting distinct values: a count
# of 2 would also pass for two unexpected labels, and would fail if only one of
# the two valid labels happened to be present. Missing values count as a failure
# because they would otherwise be silently mapped to "tr" below.
test1 = bool(idmap_tsv_df_combined_for_merge["database"].isin(["reviewed", "unreviewed"]).all())
print(f"\tEverything is either reviewed or unreviewed (no other categories) in idmap tsv: {test1}")
idmap_tsv_df_combined_for_merge["database"] = idmap_tsv_df_combined_for_merge["database"].apply(
    lambda x: "sp" if x=="reviewed" else "tr")
idmap_tsv_df_combined_for_merge["uniprotkb"] = "uniprotkb:" + idmap_tsv_df_combined_for_merge["uniprotkb"]
idmap_tsv_df_combined_for_merge

	Everything is either reviewed or unreviewed (no other categories) in idmap tsv: True


Unnamed: 0,uniprotkb,database,uniprot_gene_name,uniprotkb_iso,sequence
0,uniprotkb:A0A023GQA5,tr,A0A023GQA5_DROME,A0A023GQA5-0,MAAQFFNRIGQMGLGVAVLGGVVNSALYNVEGGHRAVIFDRFTGIK...
1,uniprotkb:A0A023GRW2,tr,A0A023GRW2_DROME,A0A023GRW2-0,MLFFNRWGKIRMLEPYQPKFQQQHRSSCPLVDLDAVTTHQRSSVSR...
2,uniprotkb:A0A024R0Y4,tr,A0A024R0Y4_HUMAN,A0A024R0Y4-0,MDRLGSFSNDPSDKPPCRGCSSYLMEPYIKCAECGPPPFFLCLQCF...
3,uniprotkb:A0A045IZR3,tr,A0A045IZR3_MYCTX,A0A045IZR3-0,MSIDVPERADLEQVRGRWRNAVAGVLSKSNRTDSAQLGDHPERLLD...
4,uniprotkb:A0A060CUY1,tr,A0A060CUY1_MAIZE,A0A060CUY1-0,MDMNESGEKGMEGNASSGIPVDWQTQFSAAAFSCAPPQQQQVPMMD...
...,...,...,...,...,...
96317,uniprotkb:Q9Z0X1,sp,AIFM1_MOUSE,Q9Z0X1-PRO_0000401936,SSGSSGGKMDNSVLVLIVGLSTIGAGAYAYKTIKEDQKRYNERVMGL
96318,uniprotkb:Q9Z1W4,sp,GDF11_MOUSE,Q9Z1W4-PRO_0000033988,RGEAAEGPAAAAAAAAAAAGVGGERSSRPAPSAPPEPDGCPVCVWR...
96319,uniprotkb:Q9Z1Y3,sp,CADH2_RAT,Q9Z1Y3-PRO_0000003735,SGELALCKTGFPEDVYSAVLPKTVHEGQPLLNVKFSNCNRKRKVQY...
96320,uniprotkb:Q9ZRD6,sp,YKT61_ARATH,Q9ZRD6-PRO_0000370846,TIL


In [150]:
# Size check: rows in the FASTA table, rows of the merge-ready TSV table whose
# source row carries a Chain / Peptide / Propeptide annotation, and their sum.
# The mask comes from idmap_tsv_df_combined (the merge-ready frame no longer has
# those columns) and is applied to the merge-ready frame via its index.
_has_feature = idmap_tsv_df_combined[["Chain", "Peptide", "Propeptide"]].notna().any(axis=1)
_n_fasta_rows = len(idmap_fasta_df)
_n_feature_rows = len(idmap_tsv_df_combined_for_merge.loc[_has_feature])

print(_n_fasta_rows)
print(_n_feature_rows)
print(_n_fasta_rows + _n_feature_rows)


115456
61547
177003


In [151]:
# make a species map: suffix of "Entry Name" (the text after the first "_") -> "Organism"
# Work on an explicit copy: assigning into the column slice directly triggers
# pandas' SettingWithCopyWarning.
_species_table = idmap_tsv_df_combined[["Entry Name","Organism"]].copy()
_species_table["Entry Name"] = _species_table["Entry Name"].apply(lambda x: x.split("_")[1] if "_" in x else None)
# Rows without a usable suffix (or without an organism) are discarded.
_species_table = _species_table.dropna().drop_duplicates().reset_index(drop=True)
# If one suffix is paired with several organism strings, the last pair wins.
idmap_species_dict = dict(zip(_species_table["Entry Name"],_species_table["Organism"]))
print(f"Total unique species: {len(idmap_species_dict)}")

Total unique species: 1179


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  idmap_species_dict["Entry Name"] = idmap_species_dict["Entry Name"].apply(lambda x: x.split("_")[1] if "_" in x else None)


In [152]:
# Outer-merge the FASTA-derived table with the TSV-derived table so that
# records present in only one of them are kept.
idmap_merge = pd.merge(
    idmap_fasta_df,
    idmap_tsv_df_combined_for_merge,
    on=["uniprotkb","database","uniprot_gene_name","uniprotkb_iso","sequence"],
    how="outer"
)
idmap_merge = idmap_merge.drop(columns=["uniprotkb"])
idmap_merge = idmap_merge.rename(columns={"uniprotkb_iso":"uniprotkb_full"})

# Blank out isoform descriptions that are missing or just the bare "Isoform " prefix.
idmap_merge["isoform_from_desc"] = idmap_merge["isoform_from_desc"].apply(
    lambda x: x if (isinstance(x, str) and x!="Isoform ") else None)
# Text between the first and second "-" of the full ID (e.g. "0", "8", "PRO_...").
# Guarded against non-string IDs, like the canonical-ID line below.
idmap_merge["isoform_or_chain_from_uniprotkb"] = idmap_merge["uniprotkb_full"].apply(
    lambda x: x.split("-")[1] if (isinstance(x, str) and "-" in x) else None)
# Text before the first "-" of the full ID; non-string values pass through.
idmap_merge["canonical_uniprotkb"] = idmap_merge["uniprotkb_full"].apply(
    lambda x: x.split("-")[0] if isinstance(x, str) else x)
idmap_merge = idmap_merge[["canonical_uniprotkb","uniprotkb_full","uniprot_gene_name","database","sequence","isoform_or_chain_from_uniprotkb","isoform_from_desc"]]


def _species_from_entry_name(entry_name):
    """Look up the organism for the suffix after "_" in entry_name; None if unavailable."""
    if not isinstance(entry_name, str) or "_" not in entry_name:
        return None
    return idmap_species_dict.get(entry_name.split("_")[1])


idmap_merge["species"] = idmap_merge["uniprot_gene_name"].apply(_species_from_entry_name)
idmap_merge = idmap_merge.drop_duplicates().reset_index(drop=True)

# Report how many rows the sequence filter removes (the count used to be
# computed but never printed).
test1 = len(idmap_merge)
print(f"\tTotal rows after deduplication: {test1}")
idmap_merge = idmap_merge.loc[idmap_merge["sequence"].notna()].reset_index(drop=True)
print(f"\tTotal rows with a sequence (kept): {len(idmap_merge)}")
idmap_merge

	Total


Unnamed: 0,canonical_uniprotkb,uniprotkb_full,uniprot_gene_name,database,sequence,isoform_or_chain_from_uniprotkb,isoform_from_desc,species
0,A0A023GQA5,A0A023GQA5-0,A0A023GQA5_DROME,tr,MAAQFFNRIGQMGLGVAVLGGVVNSALYNVEGGHRAVIFDRFTGIK...,0,,Drosophila melanogaster (Fruit fly)
1,A0A023GRW2,A0A023GRW2-0,A0A023GRW2_DROME,tr,MLFFNRWGKIRMLEPYQPKFQQQHRSSCPLVDLDAVTTHQRSSVSR...,0,,Drosophila melanogaster (Fruit fly)
2,A0A023PXA5,A0A023PXA5-0,YA19A_YEAST,sp,MLLSELVATASSLPYTAISIHNNCRVPAARHIHHGCRYFHGPPVMH...,0,,Saccharomyces cerevisiae (strain ATCC 204508 /...
3,A0A023PXA5,A0A023PXA5-PRO_0000430976,YA19A_YEAST,sp,MLLSELVATASSLPYTAISIHNNCRVPAARHIHHGCRYFHGPPVMH...,PRO_0000430976,,Saccharomyces cerevisiae (strain ATCC 204508 /...
4,A0A023PXH6,A0A023PXH6-0,YM172_YEAST,sp,MYLCYTCFFLPSYDCKRLFTIVRAYIPARLSCNQPMVLFFTSPSSS...,0,,Saccharomyces cerevisiae (strain ATCC 204508 /...
...,...,...,...,...,...,...,...,...
173388,X5M5N0,X5M5N0-8,WNK_CAEEL,sp,MPDSITNGGRPPAPPSSVSSTTASTTGNFGTRRRLVNRIKKVDELH...,8,Isoform g,Caenorhabditis elegans
173389,X5M5N0,X5M5N0-9,WNK_CAEEL,sp,MPDSITNGGRPPAPPSSVSSTTASTTGNFGTRRRLVNRIKKVDELH...,9,Isoform h,Caenorhabditis elegans
173390,Z4YIA7,Z4YIA7-0,Z4YIA7_DANRE,tr,MKMEIRPLLMCFALCVVYATSKPTEKKDRVHHDAPLSSKEHDDGTN...,0,,Danio rerio (Zebrafish) (Brachydanio rerio)
173391,Z4YIA7,Z4YIA7-PRO_5004990248,Z4YIA7_DANRE,tr,TSKPTEKKDRVHHDAPLSSKEHDDGTNFEYDHDAFLGEEEAKTFDD...,PRO_5004990248,,Danio rerio (Zebrafish) (Brachydanio rerio)


In [153]:
# Write both ID-mapping tables into one output directory, creating it if needed.
idmap_savepath = "data_files/processed/intact/idmapping"
os.makedirs(idmap_savepath, exist_ok=True)

_sequences_csv_path = os.path.join(idmap_savepath, "idmapping_2025_11_05_processed_sequences.csv")
_tsv_only_path = os.path.join(idmap_savepath, "idmapping_2025_11_05_processed_tsv_only_no_isoforms.tsv")

# Merged sequence table as comma-separated, TSV-only table as tab-separated; no index column in either.
idmap_merge.to_csv(_sequences_csv_path, index=False)
idmap_tsv_df_combined.to_csv(_tsv_only_path, sep="\t", index=False)


In [154]:
# before we save merged, must correct invalid aas
def find_invalid_chars(seq: str, valid_chars: set) -> "str | float":
    """
    Report which characters of a sequence are not in a set of valid characters.

    Args:
        seq (str): The sequence you wish to search for invalid characters.
        valid_chars (set): A set of valid characters.

    Returns:
        str | float: ``np.nan`` when every character of ``seq`` is valid (this
        includes an empty ``seq``); otherwise the distinct invalid characters,
        sorted and joined with commas, e.g. ``"B,X"``.
    """
    # e.g. seq="AAXB" with valid_chars={"A"} -> {"X", "B"}
    invalid_chars = set(seq).difference(valid_chars)

    if not invalid_chars:
        # NaN (rather than an empty string) lets callers use .notna() / .dropna()
        # on a column of results to find the offending rows.
        return np.nan
    return ",".join(sorted(invalid_chars))
    
# Characters accepted in a sequence: twenty one-letter codes plus 'U'.
VALID_AAS = set("ARNDCEQGHILKMFPSTWYV") | {
    "U",  # this unnatural is allowed
}


In [155]:
# first remove spaces then invalids
for _aa_col in ("aa_1", "aa_2"):
    merged[_aa_col] = merged[_aa_col].str.replace(" ", "", regex=False)
# Verify BOTH sequence columns (previously only aa_1 was inspected).
_still_has_space = (
    merged["aa_1"].str.contains(" ", regex=False, na=False)
    | merged["aa_2"].str.contains(" ", regex=False, na=False)
)
test1 = not _still_has_space.any()
print(f"Removed all spaces from sequences in merged: {test1}")

# NaN = sequence is clean; otherwise a comma-joined list of offending characters.
merged["invalids_aa_1"] = merged["aa_1"].apply(lambda x: find_invalid_chars(x,VALID_AAS))
merged["invalids_aa_2"] = merged["aa_2"].apply(lambda x: find_invalid_chars(x,VALID_AAS))

# Rows where at least one of the two sequences has an invalid character;
# computed once and reused for the count, the report and the filter.
_has_invalid = merged["invalids_aa_1"].notna() | merged["invalids_aa_2"].notna()
test1 = int(_has_invalid.sum())
print(f"There are {test1} rows where either aa_1 or aa_2 contains an invalid character. {test1}/{len(merged)} = {100*test1/len(merged):.2f}%")

# Collect the distinct invalid characters across both columns. Building the set
# directly avoids reporting [''] when there are no invalid rows at all.
_invalid_chars = set()
for _inv_col in ("invalids_aa_1", "invalids_aa_2"):
    for _joined in merged.loc[_has_invalid, _inv_col].dropna().unique():
        _invalid_chars.update(_joined.split(","))
l = sorted(_invalid_chars)
print(f"\tList of unique invalid characters found: {l}")

merged = merged.loc[~_has_invalid].reset_index(drop=True)
print(f"New size of merged after droping these rows: {len(merged)}")



Removed all spaces from sequences in merged: True
There are 1780 rows where either aa_1 or aa_2 contains an invalid character. 1780/746865 = 0.24%
	List of unique invalid characters found: ['B', 'J', 'O', 'X', 'Z']
New size of merged after droping these rows: 745085


In [156]:
# Print every column name of merged in double quotes, one per line, comma-separated.
_quoted_columns = (f'"{col}"' for col in merged.columns)
print(",\n".join(_quoted_columns))

"ID(s) interactor A",
"ID(s) interactor B",
"Alt. ID(s) interactor A",
"Alt. ID(s) interactor B",
"Alias(es) interactor A",
"Alias(es) interactor B",
"Interaction detection method(s)",
"Publication 1st author(s)",
"Publication Identifier(s)",
"Taxid interactor A",
"Taxid interactor B",
"Interaction type(s)",
"Source database(s)",
"Interaction identifier(s)",
"Confidence value(s)",
"Biological role(s) interactor A",
"Biological role(s) interactor B",
"Experimental role(s) interactor A",
"Experimental role(s) interactor B",
"Type(s) interactor A",
"Type(s) interactor B",
"Xref(s) interactor A",
"Xref(s) interactor B",
"Interaction Xref(s)",
"Annotation(s) interactor A",
"Annotation(s) interactor B",
"Interaction annotation(s)",
"Host organism(s)",
"Interaction parameter(s)",
"Creation date",
"Update date",
"Checksum(s) interactor A",
"Checksum(s) interactor B",
"Interaction Checksum(s)",
"Negative",
"all_intact_A_sorted",
"all_intact_B_sorted",
"unique_id",
"uniprot_A",
"uniprot_B",
"int

In [157]:
# Checkpoint: write merged to CSV without the row index.
merged.to_csv(
    path_or_buf="data_files/processed/intact/merged_intermediate_file.csv",
    index=False,
)

## Recombine the UniProt ID-mapping with the rest of the merged info

In [158]:
# Column -> dtype mapping passed to pd.read_csv when the merged intermediate
# file is loaded. Almost every column is read as 'string'; the few exceptions
# are listed explicitly in _non_string_dtypes.

# Columns that exist once per interactor as "<name> interactor A" / "<name> interactor B".
_paired_interactor_cols = [
    "ID(s)",
    "Alt. ID(s)",
    "Alias(es)",
    "Taxid",
    "Biological role(s)",
    "Experimental role(s)",
    "Type(s)",
    "Xref(s)",
    "Annotation(s)",
    "Checksum(s)",
]

# Columns that exist once per row.
_single_string_cols = [
    "Interaction detection method(s)",
    "Publication 1st author(s)",
    "Publication Identifier(s)",
    "Interaction type(s)",
    "Source database(s)",
    "Interaction identifier(s)",
    "Confidence value(s)",
    "Interaction Xref(s)",
    "Interaction annotation(s)",
    "Host organism(s)",
    "Interaction parameter(s)",
    "Creation date",
    "Update date",
    "Interaction Checksum(s)",
    "all_intact_A_sorted",
    "all_intact_B_sorted",
    "unique_id",
    "uniprot_A",
    "uniprot_B",
    "interaction_detection_methods_sorted",
    "unique_scores",
    "unique_expansions",
    "interaction_intactid",
    "interaction_label",
    "interaction_mi",
    "interaction_xml_id",
    "experiments",
    "process_method",
    "pubmeds",
    "unique_all_intact_sorted",
    "seq_sort",
    "seq_pair_id",
    "unique_uniprot_pair",
    "uniprot_A_noiso1",
    "uniprot_B_noiso1",
    "unique_uniprot_noiso1_pair",
    "uniprot_A_noisoforms",
    "uniprot_B_noisoforms",
    "unique_uniprot_noisoforms_pair",
    "invalids_aa_1",
    "invalids_aa_2",
]

# Columns that exist once per interactor as "<name>_1" / "<name>_2".
_numbered_interactor_cols = [
    "protein",
    "gene_symbol",
    "mol_type",
    "species_label",
    "species_taxid",
    "aa",
    "chain_seq_start",
    "chain_seq_end",
    "uniprotkb",
    "ensp",
    "ensg",
    "enst",
    "interpro",
    "reactome",
    "rscbpdb",
    "intactid",
    "primaryref_db",
    "primaryref_id",
    "go",
    "host_taxid",
    "host_label_short",
    "host_label_full",
    "host_cell_type",
    "host_compartment",
    "host_tissue",
    "mutation_mi",
    "mutation_name",
    "mutation_short",
    "mutation_begin",
    "mutation_end",
    "mutation_orig",
    "mutation_new",
    "binding_mi",
    "binding_name",
    "binding_short",
    "binding_begin",
    "binding_end",
    "ptm_mi",
    "ptm_name",
    "ptm_short",
    "ptm_begin",
    "ptm_end",
    "ptm_orig",
    "ptm_new",
]

# The only columns that are NOT read as 'string'.
_non_string_dtypes = {
    "Negative": 'bool',
    "confidence_val_int": 'float',
    "unique_score_int": 'float',
    "equal_score_int": 'bool',
    "miscore": 'float',
    "year": 'int',
    "length_1": 'int',
    "length_2": 'int',
    "no_uniprot_update_A": 'bool',
    "no_uniprot_update_B": 'bool',
}

merged_dtypes = {}
for _name in _paired_interactor_cols:
    for _side in ("A", "B"):
        merged_dtypes[f"{_name} interactor {_side}"] = 'string'
for _name in _single_string_cols:
    merged_dtypes[_name] = 'string'
for _name in _numbered_interactor_cols:
    for _num in ("1", "2"):
        merged_dtypes[f"{_name}_{_num}"] = 'string'
merged_dtypes.update(_non_string_dtypes)

In [159]:
# Reload the intermediate file, reading each column with the dtype given in merged_dtypes.
merged = pd.read_csv(
    filepath_or_buffer="data_files/processed/intact/merged_intermediate_file.csv",
    dtype=merged_dtypes,
)

  merged = pd.read_csv("data_files/processed/intact/merged_intermediate_file.csv", dtype=merged_dtypes)


In [160]:
# harmonize nulls: pass merged through harmonize_nulls_to_nan and keep the result
merged = harmonize_nulls_to_nan(merged)
print("Harmonized all nulls to nans. All empty entries are np.nan now.")

Harmonized all nulls to nans. All empty entries are np.nan now.


In [161]:
# For the new merge, we need to follow the isoform-0 rule so we can finally get everything aligned to its correct sequence
def _with_isoform0(acc):
    """Append "-0" to a str ID that has no "-"; anything else is returned unchanged."""
    if type(acc) is not str:
        return acc
    return acc if "-" in acc else f"{acc}-0"


def _without_isoform(acc):
    """Return the part of a str ID before its first "-"; non-str values pass through."""
    return acc.split("-")[0] if type(acc) is str else acc


merged2 = merged.copy().drop(
    columns=["uniprot_A_noiso1", "uniprot_B_noiso1", "unique_uniprot_noiso1_pair"]
)

# Full IDs (always carrying an isoform/chain suffix) and their order-independent pair key.
for _side in ("A", "B"):
    merged2[f"uniprot_{_side}_full"] = merged2[f"uniprot_{_side}"].apply(_with_isoform0)
merged2["unique_uniprot_pair"] = merged2.apply(
    get_unique_id, axis=1, colA="uniprot_A_full", colB="uniprot_B_full"
)

# Suffix-free IDs and their order-independent pair key.
for _side in ("A", "B"):
    merged2[f"uniprot_{_side}_noisoforms"] = merged2[f"uniprot_{_side}_full"].apply(_without_isoform)
merged2["unique_uniprot_noisoforms_pair"] = merged2.apply(
    get_unique_id, axis=1, colA="uniprot_A_noisoforms", colB="uniprot_B_noisoforms"
)
merged2


Unnamed: 0,ID(s) interactor A,ID(s) interactor B,Alt. ID(s) interactor A,Alt. ID(s) interactor B,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,...,no_uniprot_update_B,seq_pair_id,unique_uniprot_pair,uniprot_A_noisoforms,uniprot_B_noisoforms,unique_uniprot_noisoforms_pair,invalids_aa_1,invalids_aa_2,uniprot_A_full,uniprot_B_full
0,intact:EBI-101707,intact:EBI-100018,uniprotkb:Q86P48,uniprotkb:Q9VE54,psi-mi:atbp_drome|psi-mi:ATbp|uniprotkb:ATbp|u...,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,"psi-mi:""MI:0397""(two hybrid array)",Shokri et al. (2019),doi:10.1016/j.celrep.2019.03.071|pubmed:309954...,taxid:7227(drome),...,False,seqpair1,uniprotkb:Q86P48-0_uniprotkb:Q9VE54-0,uniprotkb:Q86P48,uniprotkb:Q9VE54,uniprotkb:Q86P48_uniprotkb:Q9VE54,,,uniprotkb:Q86P48-0,uniprotkb:Q9VE54-0
1,intact:EBI-100018,intact:EBI-102069,uniprotkb:Q9VE54,uniprotkb:O16844,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,psi-mi:cos_drome|psi-mi:cos|uniprotkb:A1Z6X4|u...,"psi-mi:""MI:0399""(two hybrid fragment pooling a...",Formstecher et al. (2005),pubmed:15710747|imex:IM-16519|mint:MINT-5217543,taxid:7227(drome),...,False,seqpair2,uniprotkb:O16844-0_uniprotkb:Q9VE54-0,uniprotkb:Q9VE54,uniprotkb:O16844,uniprotkb:O16844_uniprotkb:Q9VE54,,,uniprotkb:Q9VE54-0,uniprotkb:O16844-0
2,intact:EBI-104215,intact:EBI-100018,uniprotkb:Q9VTR6,uniprotkb:Q9VE54,psi-mi:q9vtr6_drome|psi-mi:prc|uniprotkb:prc|u...,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,"psi-mi:""MI:0018""(two hybrid)",Giot et al. (2003),pubmed:14605208|imex:IM-16524|mint:MINT-5216804,taxid:7227(drome),...,False,seqpair3,uniprotkb:Q9VE54-0_uniprotkb:Q9VTR6-0,uniprotkb:Q9VTR6,uniprotkb:Q9VE54,uniprotkb:Q9VE54_uniprotkb:Q9VTR6,,,uniprotkb:Q9VTR6-0,uniprotkb:Q9VE54-0
3,intact:EBI-100018,intact:EBI-107089,uniprotkb:Q9VE54,uniprotkb:Q9VWG2,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,psi-mi:q9vwg2_drome|psi-mi:SDS3|uniprotkb:SDS3...,"psi-mi:""MI:0018""(two hybrid)",Giot et al. (2003),pubmed:14605208|imex:IM-16524|mint:MINT-5216804,taxid:7227(drome),...,False,seqpair4,uniprotkb:Q9VE54-0_uniprotkb:Q9VWG2-0,uniprotkb:Q9VE54,uniprotkb:Q9VWG2,uniprotkb:Q9VE54_uniprotkb:Q9VWG2,,,uniprotkb:Q9VE54-0,uniprotkb:Q9VWG2-0
4,intact:EBI-117032,intact:EBI-100018,uniprotkb:Q9VHR4,uniprotkb:Q9VE54,psi-mi:q9vhr4_drome|psi-mi:Dmel\CG7963|uniprot...,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,"psi-mi:""MI:0397""(two hybrid array)",Shokri et al. (2019),doi:10.1016/j.celrep.2019.03.071|pubmed:309954...,taxid:7227(drome),...,False,seqpair5,uniprotkb:Q9VE54-0_uniprotkb:Q9VHR4-0,uniprotkb:Q9VHR4,uniprotkb:Q9VE54,uniprotkb:Q9VE54_uniprotkb:Q9VHR4,,,uniprotkb:Q9VHR4-0,uniprotkb:Q9VE54-0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745080,intact:EBI-999900,intact:EBI-999909,uniprotkb:Q8NBT2,uniprotkb:Q9HBM1,psi-mi:spc24_human|psi-mi:SPC24|uniprotkb:C9JG...,psi-mi:spc25_human|psi-mi:SPC25|uniprotkb:D3DP...,"psi-mi:""MI:1112""(two hybrid prey pooling appro...",Ciferri et al. (2005)|McCleland et al. (2004)|...,pubmed:15961401|pubmed:14738735|pubmed:2649661...,taxid:9606(human),...,False,seqpair427592,uniprotkb:Q8NBT2-0_uniprotkb:Q9HBM1-0,uniprotkb:Q8NBT2,uniprotkb:Q9HBM1,uniprotkb:Q8NBT2_uniprotkb:Q9HBM1,,,uniprotkb:Q8NBT2-0,uniprotkb:Q9HBM1-0
745081,intact:EBI-999900,intact:EBI-999909,uniprotkb:Q8NBT2,uniprotkb:Q9HBM1,psi-mi:spc24_human|psi-mi:SPC24|uniprotkb:C9JG...,psi-mi:spc25_human|psi-mi:SPC25|uniprotkb:D3DP...,"psi-mi:""MI:1112""(two hybrid prey pooling appro...",Ciferri et al. (2005)|McCleland et al. (2004)|...,pubmed:15961401|pubmed:14738735|pubmed:2649661...,taxid:9606(human),...,False,seqpair427592,uniprotkb:Q8NBT2-0_uniprotkb:Q9HBM1-0,uniprotkb:Q8NBT2,uniprotkb:Q9HBM1,uniprotkb:Q8NBT2_uniprotkb:Q9HBM1,,,uniprotkb:Q8NBT2-0,uniprotkb:Q9HBM1-0
745082,intact:EBI-999900,intact:EBI-999909,uniprotkb:Q8NBT2,uniprotkb:Q9HBM1,psi-mi:spc24_human|psi-mi:SPC24|uniprotkb:C9JG...,psi-mi:spc25_human|psi-mi:SPC25|uniprotkb:D3DP...,"psi-mi:""MI:1112""(two hybrid prey pooling appro...",Ciferri et al. (2005)|McCleland et al. (2004)|...,pubmed:15961401|pubmed:14738735|pubmed:2649661...,taxid:9606(human),...,False,seqpair427592,uniprotkb:Q8NBT2-0_uniprotkb:Q9HBM1-0,uniprotkb:Q8NBT2,uniprotkb:Q9HBM1,uniprotkb:Q8NBT2_uniprotkb:Q9HBM1,,,uniprotkb:Q8NBT2-0,uniprotkb:Q9HBM1-0
745083,intact:EBI-999900,intact:EBI-999909,uniprotkb:Q8NBT2,uniprotkb:Q9HBM1,psi-mi:spc24_human|psi-mi:SPC24|uniprotkb:C9JG...,psi-mi:spc25_human|psi-mi:SPC25|uniprotkb:D3DP...,"psi-mi:""MI:1112""(two hybrid prey pooling appro...",Ciferri et al. (2005)|McCleland et al. (2004)|...,pubmed:15961401|pubmed:14738735|pubmed:2649661...,taxid:9606(human),...,False,seqpair427592,uniprotkb:Q8NBT2-0_uniprotkb:Q9HBM1-0,uniprotkb:Q8NBT2,uniprotkb:Q9HBM1,uniprotkb:Q8NBT2_uniprotkb:Q9HBM1,,,uniprotkb:Q8NBT2-0,uniprotkb:Q9HBM1-0


In [162]:
def _ensure_uniprotkb_prefix(acc):
    """Prepend "uniprotkb:" unless the ID already starts with it."""
    return acc if acc.startswith("uniprotkb:") else "uniprotkb:" + acc


# Note: this rewrites idmap_merge["uniprotkb_full"] in place.
idmap_merge["uniprotkb_full"] = idmap_merge["uniprotkb_full"].apply(_ensure_uniprotkb_prefix)

# Left-join the ID-mapping sequence, gene name and species onto each interactor
# (A first, then B), matching on the full ID.
for _side, _num in (("A", "1"), ("B", "2")):
    _key = f"uniprot_{_side}_full"
    _lookup = idmap_merge[["uniprotkb_full", "sequence", "uniprot_gene_name", "species"]].rename(
        columns={
            "uniprotkb_full": _key,
            "sequence": f"aa_{_num}_fromidmap",
            "uniprot_gene_name": f"uniprot_gene_name_{_side}",
            "species": f"species_{_side}",
        }
    )
    merged2 = pd.merge(merged2, _lookup, on=_key, how="left")
merged2

Unnamed: 0,ID(s) interactor A,ID(s) interactor B,Alt. ID(s) interactor A,Alt. ID(s) interactor B,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,...,invalids_aa_1,invalids_aa_2,uniprot_A_full,uniprot_B_full,aa_1_fromidmap,uniprot_gene_name_A,species_A,aa_2_fromidmap,uniprot_gene_name_B,species_B
0,intact:EBI-101707,intact:EBI-100018,uniprotkb:Q86P48,uniprotkb:Q9VE54,psi-mi:atbp_drome|psi-mi:ATbp|uniprotkb:ATbp|u...,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,"psi-mi:""MI:0397""(two hybrid array)",Shokri et al. (2019),doi:10.1016/j.celrep.2019.03.071|pubmed:309954...,taxid:7227(drome),...,,,uniprotkb:Q86P48-0,uniprotkb:Q9VE54-0,MGFPRILSKNNKIYTKLGEFCLSGDSFWIVCHTCQEELQTQDQFWK...,ATBP_DROME,Drosophila melanogaster (Fruit fly),MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,Q9VE54_DROME,Drosophila melanogaster (Fruit fly)
1,intact:EBI-100018,intact:EBI-102069,uniprotkb:Q9VE54,uniprotkb:O16844,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,psi-mi:cos_drome|psi-mi:cos|uniprotkb:A1Z6X4|u...,"psi-mi:""MI:0399""(two hybrid fragment pooling a...",Formstecher et al. (2005),pubmed:15710747|imex:IM-16519|mint:MINT-5217543,taxid:7227(drome),...,,,uniprotkb:Q9VE54-0,uniprotkb:O16844-0,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,Q9VE54_DROME,Drosophila melanogaster (Fruit fly),MEIPIQVAVRIFPHRELKDLLRSFGPTEPKKDAQAVDEGADSKDSE...,COS_DROME,Drosophila melanogaster (Fruit fly)
2,intact:EBI-104215,intact:EBI-100018,uniprotkb:Q9VTR6,uniprotkb:Q9VE54,psi-mi:q9vtr6_drome|psi-mi:prc|uniprotkb:prc|u...,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,"psi-mi:""MI:0018""(two hybrid)",Giot et al. (2003),pubmed:14605208|imex:IM-16524|mint:MINT-5216804,taxid:7227(drome),...,,,uniprotkb:Q9VTR6-0,uniprotkb:Q9VE54-0,MLPFRLGLLLGAVLFVASANGAAIENEVSSLNDLQREKRSGRGYSR...,Q9VTR6_DROME,Drosophila melanogaster (Fruit fly),MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,Q9VE54_DROME,Drosophila melanogaster (Fruit fly)
3,intact:EBI-100018,intact:EBI-107089,uniprotkb:Q9VE54,uniprotkb:Q9VWG2,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,psi-mi:q9vwg2_drome|psi-mi:SDS3|uniprotkb:SDS3...,"psi-mi:""MI:0018""(two hybrid)",Giot et al. (2003),pubmed:14605208|imex:IM-16524|mint:MINT-5216804,taxid:7227(drome),...,,,uniprotkb:Q9VE54-0,uniprotkb:Q9VWG2-0,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,Q9VE54_DROME,Drosophila melanogaster (Fruit fly),MSNYYSLLLQADTYDDESIGDERSEEDTDDASETEFRSPSRYGAMN...,Q9VWG2_DROME,Drosophila melanogaster (Fruit fly)
4,intact:EBI-117032,intact:EBI-100018,uniprotkb:Q9VHR4,uniprotkb:Q9VE54,psi-mi:q9vhr4_drome|psi-mi:Dmel\CG7963|uniprot...,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,"psi-mi:""MI:0397""(two hybrid array)",Shokri et al. (2019),doi:10.1016/j.celrep.2019.03.071|pubmed:309954...,taxid:7227(drome),...,,,uniprotkb:Q9VHR4-0,uniprotkb:Q9VE54-0,MSPPSGEFRCRVCLKQDELLVDIYEIVEEMQVDLCTLLETCGGIKV...,Q9VHR4_DROME,Drosophila melanogaster (Fruit fly),MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,Q9VE54_DROME,Drosophila melanogaster (Fruit fly)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745080,intact:EBI-999900,intact:EBI-999909,uniprotkb:Q8NBT2,uniprotkb:Q9HBM1,psi-mi:spc24_human|psi-mi:SPC24|uniprotkb:C9JG...,psi-mi:spc25_human|psi-mi:SPC25|uniprotkb:D3DP...,"psi-mi:""MI:1112""(two hybrid prey pooling appro...",Ciferri et al. (2005)|McCleland et al. (2004)|...,pubmed:15961401|pubmed:14738735|pubmed:2649661...,taxid:9606(human),...,,,uniprotkb:Q8NBT2-0,uniprotkb:Q9HBM1-0,MAAFRDIEEVSQGLLSLLGANRAEAQQRRLLGRHEQVVERLLETQD...,SPC24_HUMAN,Homo sapiens (Human),MVEDELALFDKSINEFWNKFKSTDTSCQMAGLRDTYKDSIKAFAEK...,SPC25_HUMAN,Homo sapiens (Human)
745081,intact:EBI-999900,intact:EBI-999909,uniprotkb:Q8NBT2,uniprotkb:Q9HBM1,psi-mi:spc24_human|psi-mi:SPC24|uniprotkb:C9JG...,psi-mi:spc25_human|psi-mi:SPC25|uniprotkb:D3DP...,"psi-mi:""MI:1112""(two hybrid prey pooling appro...",Ciferri et al. (2005)|McCleland et al. (2004)|...,pubmed:15961401|pubmed:14738735|pubmed:2649661...,taxid:9606(human),...,,,uniprotkb:Q8NBT2-0,uniprotkb:Q9HBM1-0,MAAFRDIEEVSQGLLSLLGANRAEAQQRRLLGRHEQVVERLLETQD...,SPC24_HUMAN,Homo sapiens (Human),MVEDELALFDKSINEFWNKFKSTDTSCQMAGLRDTYKDSIKAFAEK...,SPC25_HUMAN,Homo sapiens (Human)
745082,intact:EBI-999900,intact:EBI-999909,uniprotkb:Q8NBT2,uniprotkb:Q9HBM1,psi-mi:spc24_human|psi-mi:SPC24|uniprotkb:C9JG...,psi-mi:spc25_human|psi-mi:SPC25|uniprotkb:D3DP...,"psi-mi:""MI:1112""(two hybrid prey pooling appro...",Ciferri et al. (2005)|McCleland et al. (2004)|...,pubmed:15961401|pubmed:14738735|pubmed:2649661...,taxid:9606(human),...,,,uniprotkb:Q8NBT2-0,uniprotkb:Q9HBM1-0,MAAFRDIEEVSQGLLSLLGANRAEAQQRRLLGRHEQVVERLLETQD...,SPC24_HUMAN,Homo sapiens (Human),MVEDELALFDKSINEFWNKFKSTDTSCQMAGLRDTYKDSIKAFAEK...,SPC25_HUMAN,Homo sapiens (Human)
745083,intact:EBI-999900,intact:EBI-999909,uniprotkb:Q8NBT2,uniprotkb:Q9HBM1,psi-mi:spc24_human|psi-mi:SPC24|uniprotkb:C9JG...,psi-mi:spc25_human|psi-mi:SPC25|uniprotkb:D3DP...,"psi-mi:""MI:1112""(two hybrid prey pooling appro...",Ciferri et al. (2005)|McCleland et al. (2004)|...,pubmed:15961401|pubmed:14738735|pubmed:2649661...,taxid:9606(human),...,,,uniprotkb:Q8NBT2-0,uniprotkb:Q9HBM1-0,MAAFRDIEEVSQGLLSLLGANRAEAQQRRLLGRHEQVVERLLETQD...,SPC24_HUMAN,Homo sapiens (Human),MVEDELALFDKSINEFWNKFKSTDTSCQMAGLRDTYKDSIKAFAEK...,SPC25_HUMAN,Homo sapiens (Human)


In [163]:
# Test case for merged3: merge on everything that shares the same canonical
# (isoform-stripped) UniProt accession, once for interactor A and once for B.


def _add_default_isoform(acc):
    """Append "-0" to a str accession that has no "-"; return anything else unchanged."""
    if type(acc) is not str:
        return acc
    return acc if "-" in acc else f"{acc}-0"


def _strip_isoform(acc):
    """Return the text before the first "-" of a str accession; pass non-str through."""
    return acc.split("-")[0] if type(acc) is str else acc


def _ensure_uniprotkb_prefix(acc):
    """Prepend "uniprotkb:" unless the accession already starts with it."""
    return acc if acc.startswith("uniprotkb:") else "uniprotkb:" + acc


merged3 = merged.copy(deep=True).drop(
    columns=["uniprot_A_noiso1", "uniprot_B_noiso1", "unique_uniprot_noiso1_pair"]
)

# Full accessions (always carrying an isoform suffix) and their order-independent pair ID.
merged3["uniprot_A_full"] = merged3["uniprot_A"].apply(_add_default_isoform)
merged3["uniprot_B_full"] = merged3["uniprot_B"].apply(_add_default_isoform)
merged3["unique_uniprot_pair"] = merged3.apply(
    get_unique_id, axis=1, colA="uniprot_A_full", colB="uniprot_B_full"
)

# Isoform-stripped accessions, used below as the join keys, and their pair ID.
merged3["uniprot_A_noisoforms"] = merged3["uniprot_A_full"].apply(_strip_isoform)
merged3["uniprot_B_noisoforms"] = merged3["uniprot_B_full"].apply(_strip_isoform)
merged3["unique_uniprot_noisoforms_pair"] = merged3.apply(
    get_unique_id, axis=1, colA="uniprot_A_noisoforms", colB="uniprot_B_noisoforms"
)

idmap_merge_copy = idmap_merge.copy(deep=True)
idmap_merge_copy["canonical_uniprot"] = idmap_merge_copy["canonical_uniprotkb"].apply(
    _ensure_uniprotkb_prefix
)

# idmap columns renamed for the A side; the left join keeps every merged3 row.
_idmap_side_A = idmap_merge_copy.rename(
    columns={
        "canonical_uniprot": "uniprot_A_noisoforms",
        "uniprotkb_full": "uniprot_A_bestiso",
        "sequence": "aa_1_bestiso",
        "uniprot_gene_name": "uniprot_gene_name_A",
        "species": "species_A",
    }
)[["uniprot_A_noisoforms", "uniprot_A_bestiso", "aa_1_bestiso", "uniprot_gene_name_A", "species_A"]]
merged3 = merged3.merge(_idmap_side_A, on=["uniprot_A_noisoforms"], how="left")

# Same again for the B side.
_idmap_side_B = idmap_merge_copy.rename(
    columns={
        "canonical_uniprot": "uniprot_B_noisoforms",
        "uniprotkb_full": "uniprot_B_bestiso",
        "sequence": "aa_2_bestiso",
        "uniprot_gene_name": "uniprot_gene_name_B",
        "species": "species_B",
    }
)[["uniprot_B_noisoforms", "uniprot_B_bestiso", "aa_2_bestiso", "uniprot_gene_name_B", "species_B"]]
merged3 = merged3.merge(_idmap_side_B, on=["uniprot_B_noisoforms"], how="left")
merged3

Unnamed: 0,ID(s) interactor A,ID(s) interactor B,Alt. ID(s) interactor A,Alt. ID(s) interactor B,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,...,uniprot_A_full,uniprot_B_full,uniprot_A_bestiso,aa_1_bestiso,uniprot_gene_name_A,species_A,uniprot_B_bestiso,aa_2_bestiso,uniprot_gene_name_B,species_B
0,intact:EBI-101707,intact:EBI-100018,uniprotkb:Q86P48,uniprotkb:Q9VE54,psi-mi:atbp_drome|psi-mi:ATbp|uniprotkb:ATbp|u...,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,"psi-mi:""MI:0397""(two hybrid array)",Shokri et al. (2019),doi:10.1016/j.celrep.2019.03.071|pubmed:309954...,taxid:7227(drome),...,uniprotkb:Q86P48-0,uniprotkb:Q9VE54-0,uniprotkb:Q86P48-0,MGFPRILSKNNKIYTKLGEFCLSGDSFWIVCHTCQEELQTQDQFWK...,ATBP_DROME,Drosophila melanogaster (Fruit fly),uniprotkb:Q9VE54-0,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,Q9VE54_DROME,Drosophila melanogaster (Fruit fly)
1,intact:EBI-101707,intact:EBI-100018,uniprotkb:Q86P48,uniprotkb:Q9VE54,psi-mi:atbp_drome|psi-mi:ATbp|uniprotkb:ATbp|u...,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,"psi-mi:""MI:0397""(two hybrid array)",Shokri et al. (2019),doi:10.1016/j.celrep.2019.03.071|pubmed:309954...,taxid:7227(drome),...,uniprotkb:Q86P48-0,uniprotkb:Q9VE54-0,uniprotkb:Q86P48-PRO_0000378614,MGFPRILSKNNKIYTKLGEFCLSGDSFWIVCHTCQEELQTQDQFWK...,ATBP_DROME,Drosophila melanogaster (Fruit fly),uniprotkb:Q9VE54-0,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,Q9VE54_DROME,Drosophila melanogaster (Fruit fly)
2,intact:EBI-100018,intact:EBI-102069,uniprotkb:Q9VE54,uniprotkb:O16844,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,psi-mi:cos_drome|psi-mi:cos|uniprotkb:A1Z6X4|u...,"psi-mi:""MI:0399""(two hybrid fragment pooling a...",Formstecher et al. (2005),pubmed:15710747|imex:IM-16519|mint:MINT-5217543,taxid:7227(drome),...,uniprotkb:Q9VE54-0,uniprotkb:O16844-0,uniprotkb:Q9VE54-0,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,Q9VE54_DROME,Drosophila melanogaster (Fruit fly),uniprotkb:O16844-0,MEIPIQVAVRIFPHRELKDLLRSFGPTEPKKDAQAVDEGADSKDSE...,COS_DROME,Drosophila melanogaster (Fruit fly)
3,intact:EBI-100018,intact:EBI-102069,uniprotkb:Q9VE54,uniprotkb:O16844,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,psi-mi:cos_drome|psi-mi:cos|uniprotkb:A1Z6X4|u...,"psi-mi:""MI:0399""(two hybrid fragment pooling a...",Formstecher et al. (2005),pubmed:15710747|imex:IM-16519|mint:MINT-5217543,taxid:7227(drome),...,uniprotkb:Q9VE54-0,uniprotkb:O16844-0,uniprotkb:Q9VE54-0,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,Q9VE54_DROME,Drosophila melanogaster (Fruit fly),uniprotkb:O16844-PRO_0000307148,MEIPIQVAVRIFPHRELKDLLRSFGPTEPKKDAQAVDEGADSKDSE...,COS_DROME,Drosophila melanogaster (Fruit fly)
4,intact:EBI-104215,intact:EBI-100018,uniprotkb:Q9VTR6,uniprotkb:Q9VE54,psi-mi:q9vtr6_drome|psi-mi:prc|uniprotkb:prc|u...,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,"psi-mi:""MI:0018""(two hybrid)",Giot et al. (2003),pubmed:14605208|imex:IM-16524|mint:MINT-5216804,taxid:7227(drome),...,uniprotkb:Q9VTR6-0,uniprotkb:Q9VE54-0,uniprotkb:Q9VTR6-0,MLPFRLGLLLGAVLFVASANGAAIENEVSSLNDLQREKRSGRGYSR...,Q9VTR6_DROME,Drosophila melanogaster (Fruit fly),uniprotkb:Q9VE54-0,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,Q9VE54_DROME,Drosophila melanogaster (Fruit fly)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6566982,intact:EBI-999900,intact:EBI-999909,uniprotkb:Q8NBT2,uniprotkb:Q9HBM1,psi-mi:spc24_human|psi-mi:SPC24|uniprotkb:C9JG...,psi-mi:spc25_human|psi-mi:SPC25|uniprotkb:D3DP...,"psi-mi:""MI:1112""(two hybrid prey pooling appro...",Ciferri et al. (2005)|McCleland et al. (2004)|...,pubmed:15961401|pubmed:14738735|pubmed:2649661...,taxid:9606(human),...,uniprotkb:Q8NBT2-0,uniprotkb:Q9HBM1-0,uniprotkb:Q8NBT2-0,MAAFRDIEEVSQGLLSLLGANRAEAQQRRLLGRHEQVVERLLETQD...,SPC24_HUMAN,Homo sapiens (Human),uniprotkb:Q9HBM1-PRO_0000249565,MVEDELALFDKSINEFWNKFKSTDTSCQMAGLRDTYKDSIKAFAEK...,SPC25_HUMAN,Homo sapiens (Human)
6566983,intact:EBI-999900,intact:EBI-999909,uniprotkb:Q8NBT2,uniprotkb:Q9HBM1,psi-mi:spc24_human|psi-mi:SPC24|uniprotkb:C9JG...,psi-mi:spc25_human|psi-mi:SPC25|uniprotkb:D3DP...,"psi-mi:""MI:1112""(two hybrid prey pooling appro...",Ciferri et al. (2005)|McCleland et al. (2004)|...,pubmed:15961401|pubmed:14738735|pubmed:2649661...,taxid:9606(human),...,uniprotkb:Q8NBT2-0,uniprotkb:Q9HBM1-0,uniprotkb:Q8NBT2-PRO_0000249559,MAAFRDIEEVSQGLLSLLGANRAEAQQRRLLGRHEQVVERLLETQD...,SPC24_HUMAN,Homo sapiens (Human),uniprotkb:Q9HBM1-0,MVEDELALFDKSINEFWNKFKSTDTSCQMAGLRDTYKDSIKAFAEK...,SPC25_HUMAN,Homo sapiens (Human)
6566984,intact:EBI-999900,intact:EBI-999909,uniprotkb:Q8NBT2,uniprotkb:Q9HBM1,psi-mi:spc24_human|psi-mi:SPC24|uniprotkb:C9JG...,psi-mi:spc25_human|psi-mi:SPC25|uniprotkb:D3DP...,"psi-mi:""MI:1112""(two hybrid prey pooling appro...",Ciferri et al. (2005)|McCleland et al. (2004)|...,pubmed:15961401|pubmed:14738735|pubmed:2649661...,taxid:9606(human),...,uniprotkb:Q8NBT2-0,uniprotkb:Q9HBM1-0,uniprotkb:Q8NBT2-PRO_0000249559,MAAFRDIEEVSQGLLSLLGANRAEAQQRRLLGRHEQVVERLLETQD...,SPC24_HUMAN,Homo sapiens (Human),uniprotkb:Q9HBM1-PRO_0000249565,MVEDELALFDKSINEFWNKFKSTDTSCQMAGLRDTYKDSIKAFAEK...,SPC25_HUMAN,Homo sapiens (Human)
6566985,intact:EBI-999900,intact:EBI-999909,uniprotkb:Q8NBT2,uniprotkb:Q9HBM1,psi-mi:spc24_human|psi-mi:SPC24|uniprotkb:C9JG...,psi-mi:spc25_human|psi-mi:SPC25|uniprotkb:D3DP...,"psi-mi:""MI:1112""(two hybrid prey pooling appro...",Ciferri et al. (2005)|McCleland et al. (2004)|...,pubmed:15961401|pubmed:14738735|pubmed:2649661...,taxid:9606(human),...,uniprotkb:Q8NBT2-0,uniprotkb:Q9HBM1-0,uniprotkb:Q8NBT2-2,MAAFRDIEEVSQGLLSLLGANRAEAQQRRLLGRHEQVVERLLETQD...,SPC24_HUMAN,Homo sapiens (Human),uniprotkb:Q9HBM1-0,MVEDELALFDKSINEFWNKFKSTDTSCQMAGLRDTYKDSIKAFAEK...,SPC25_HUMAN,Homo sapiens (Human)


In [164]:
# Reminder: the original `merged` is indexed by the combination of IntAct IDs
# (the unique_id column), so the task is mapping IntAct IDs to their best
# possible UniProt sequence.
# Count the rows that repeat an earlier (unique_id, seq_pair_id) combination.
print(int(merged.duplicated(subset=["unique_id", "seq_pair_id"]).sum()))

316590


In [165]:
# merged4: like the canonical-accession merge, but each join additionally
# requires the idmap sequence to equal the sequence already on the row.


def _add_default_isoform(acc):
    """Append "-0" to a str accession that has no "-"; return anything else unchanged."""
    if type(acc) is not str:
        return acc
    return acc if "-" in acc else f"{acc}-0"


def _strip_isoform(acc):
    """Return the text before the first "-" of a str accession; pass non-str through."""
    return acc.split("-")[0] if type(acc) is str else acc


def _ensure_uniprotkb_prefix(acc):
    """Prepend "uniprotkb:" unless the accession already starts with it."""
    return acc if acc.startswith("uniprotkb:") else "uniprotkb:" + acc


merged4 = merged.copy(deep=True).drop(
    columns=["uniprot_A_noiso1", "uniprot_B_noiso1", "unique_uniprot_noiso1_pair"]
)

# Full accessions (always carrying an isoform suffix) and their order-independent pair ID.
merged4["uniprot_A_full"] = merged4["uniprot_A"].apply(_add_default_isoform)
merged4["uniprot_B_full"] = merged4["uniprot_B"].apply(_add_default_isoform)
merged4["unique_uniprot_pair"] = merged4.apply(
    get_unique_id, axis=1, colA="uniprot_A_full", colB="uniprot_B_full"
)

# Isoform-stripped accessions, used below as join keys, and their pair ID.
merged4["uniprot_A_noisoforms"] = merged4["uniprot_A_full"].apply(_strip_isoform)
merged4["uniprot_B_noisoforms"] = merged4["uniprot_B_full"].apply(_strip_isoform)
merged4["unique_uniprot_noisoforms_pair"] = merged4.apply(
    get_unique_id, axis=1, colA="uniprot_A_noisoforms", colB="uniprot_B_noisoforms"
)

idmap_merge_copy = idmap_merge.copy(deep=True)
idmap_merge_copy["canonical_uniprot"] = idmap_merge_copy["canonical_uniprotkb"].apply(
    _ensure_uniprotkb_prefix
)

# Copy the row's own sequences into the *_bestiso columns so they can act as
# the second join key against the idmap "sequence" column (renamed below).
merged4["aa_1_bestiso"] = merged4["aa_1"].copy()
merged4["aa_2_bestiso"] = merged4["aa_2"].copy()

# A side: match on stripped accession AND identical sequence; left join keeps all rows.
_idmap_side_A = idmap_merge_copy.rename(
    columns={
        "canonical_uniprot": "uniprot_A_noisoforms",
        "uniprotkb_full": "uniprot_A_bestiso",
        "sequence": "aa_1_bestiso",
        "uniprot_gene_name": "uniprot_gene_name_A",
        "species": "species_A",
    }
)[["uniprot_A_noisoforms", "uniprot_A_bestiso", "aa_1_bestiso", "uniprot_gene_name_A", "species_A"]]
merged4 = merged4.merge(
    _idmap_side_A, on=["uniprot_A_noisoforms", "aa_1_bestiso"], how="left"
)

# B side: same rule.
_idmap_side_B = idmap_merge_copy.rename(
    columns={
        "canonical_uniprot": "uniprot_B_noisoforms",
        "uniprotkb_full": "uniprot_B_bestiso",
        "sequence": "aa_2_bestiso",
        "uniprot_gene_name": "uniprot_gene_name_B",
        "species": "species_B",
    }
)[["uniprot_B_noisoforms", "uniprot_B_bestiso", "aa_2_bestiso", "uniprot_gene_name_B", "species_B"]]
merged4 = merged4.merge(
    _idmap_side_B, on=["uniprot_B_noisoforms", "aa_2_bestiso"], how="left"
)
merged4

Unnamed: 0,ID(s) interactor A,ID(s) interactor B,Alt. ID(s) interactor A,Alt. ID(s) interactor B,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,...,uniprot_A_full,uniprot_B_full,aa_1_bestiso,aa_2_bestiso,uniprot_A_bestiso,uniprot_gene_name_A,species_A,uniprot_B_bestiso,uniprot_gene_name_B,species_B
0,intact:EBI-101707,intact:EBI-100018,uniprotkb:Q86P48,uniprotkb:Q9VE54,psi-mi:atbp_drome|psi-mi:ATbp|uniprotkb:ATbp|u...,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,"psi-mi:""MI:0397""(two hybrid array)",Shokri et al. (2019),doi:10.1016/j.celrep.2019.03.071|pubmed:309954...,taxid:7227(drome),...,uniprotkb:Q86P48-0,uniprotkb:Q9VE54-0,MGFPRILSKNNKIYTKLGEFCLSGDSFWIVCHTCQEELQTQDQFWK...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,uniprotkb:Q86P48-0,ATBP_DROME,Drosophila melanogaster (Fruit fly),uniprotkb:Q9VE54-0,Q9VE54_DROME,Drosophila melanogaster (Fruit fly)
1,intact:EBI-101707,intact:EBI-100018,uniprotkb:Q86P48,uniprotkb:Q9VE54,psi-mi:atbp_drome|psi-mi:ATbp|uniprotkb:ATbp|u...,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,"psi-mi:""MI:0397""(two hybrid array)",Shokri et al. (2019),doi:10.1016/j.celrep.2019.03.071|pubmed:309954...,taxid:7227(drome),...,uniprotkb:Q86P48-0,uniprotkb:Q9VE54-0,MGFPRILSKNNKIYTKLGEFCLSGDSFWIVCHTCQEELQTQDQFWK...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,uniprotkb:Q86P48-PRO_0000378614,ATBP_DROME,Drosophila melanogaster (Fruit fly),uniprotkb:Q9VE54-0,Q9VE54_DROME,Drosophila melanogaster (Fruit fly)
2,intact:EBI-100018,intact:EBI-102069,uniprotkb:Q9VE54,uniprotkb:O16844,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,psi-mi:cos_drome|psi-mi:cos|uniprotkb:A1Z6X4|u...,"psi-mi:""MI:0399""(two hybrid fragment pooling a...",Formstecher et al. (2005),pubmed:15710747|imex:IM-16519|mint:MINT-5217543,taxid:7227(drome),...,uniprotkb:Q9VE54-0,uniprotkb:O16844-0,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,MEIPIQVAVRIFPHRELKDLLRSFGPTEPKKDAQAVDEGADSKDSE...,uniprotkb:Q9VE54-0,Q9VE54_DROME,Drosophila melanogaster (Fruit fly),uniprotkb:O16844-0,COS_DROME,Drosophila melanogaster (Fruit fly)
3,intact:EBI-100018,intact:EBI-102069,uniprotkb:Q9VE54,uniprotkb:O16844,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,psi-mi:cos_drome|psi-mi:cos|uniprotkb:A1Z6X4|u...,"psi-mi:""MI:0399""(two hybrid fragment pooling a...",Formstecher et al. (2005),pubmed:15710747|imex:IM-16519|mint:MINT-5217543,taxid:7227(drome),...,uniprotkb:Q9VE54-0,uniprotkb:O16844-0,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,MEIPIQVAVRIFPHRELKDLLRSFGPTEPKKDAQAVDEGADSKDSE...,uniprotkb:Q9VE54-0,Q9VE54_DROME,Drosophila melanogaster (Fruit fly),uniprotkb:O16844-PRO_0000307148,COS_DROME,Drosophila melanogaster (Fruit fly)
4,intact:EBI-104215,intact:EBI-100018,uniprotkb:Q9VTR6,uniprotkb:Q9VE54,psi-mi:q9vtr6_drome|psi-mi:prc|uniprotkb:prc|u...,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,"psi-mi:""MI:0018""(two hybrid)",Giot et al. (2003),pubmed:14605208|imex:IM-16524|mint:MINT-5216804,taxid:7227(drome),...,uniprotkb:Q9VTR6-0,uniprotkb:Q9VE54-0,MLPFRLGLLLGAVLFVASANGAAIENEVSSLNDLQREKRSGRGYSR...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,uniprotkb:Q9VTR6-0,Q9VTR6_DROME,Drosophila melanogaster (Fruit fly),uniprotkb:Q9VE54-0,Q9VE54_DROME,Drosophila melanogaster (Fruit fly)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1824156,intact:EBI-999900,intact:EBI-999909,uniprotkb:Q8NBT2,uniprotkb:Q9HBM1,psi-mi:spc24_human|psi-mi:SPC24|uniprotkb:C9JG...,psi-mi:spc25_human|psi-mi:SPC25|uniprotkb:D3DP...,"psi-mi:""MI:1112""(two hybrid prey pooling appro...",Ciferri et al. (2005)|McCleland et al. (2004)|...,pubmed:15961401|pubmed:14738735|pubmed:2649661...,taxid:9606(human),...,uniprotkb:Q8NBT2-0,uniprotkb:Q9HBM1-0,MAAFRDIEEVSQGLLSLLGANRAEAQQRRLLGRHEQVVERLLETQD...,MVEDELALFDKSINEFWNKFKSTDTSCQMAGLRDTYKDSIKAFAEK...,uniprotkb:Q8NBT2-PRO_0000249559,SPC24_HUMAN,Homo sapiens (Human),uniprotkb:Q9HBM1-PRO_0000249565,SPC25_HUMAN,Homo sapiens (Human)
1824157,intact:EBI-999900,intact:EBI-999909,uniprotkb:Q8NBT2,uniprotkb:Q9HBM1,psi-mi:spc24_human|psi-mi:SPC24|uniprotkb:C9JG...,psi-mi:spc25_human|psi-mi:SPC25|uniprotkb:D3DP...,"psi-mi:""MI:1112""(two hybrid prey pooling appro...",Ciferri et al. (2005)|McCleland et al. (2004)|...,pubmed:15961401|pubmed:14738735|pubmed:2649661...,taxid:9606(human),...,uniprotkb:Q8NBT2-0,uniprotkb:Q9HBM1-0,MAAFRDIEEVSQGLLSLLGANRAEAQQRRLLGRHEQVVERLLETQD...,MVEDELALFDKSINEFWNKFKSTDTSCQMAGLRDTYKDSIKAFAEK...,uniprotkb:Q8NBT2-0,SPC24_HUMAN,Homo sapiens (Human),uniprotkb:Q9HBM1-0,SPC25_HUMAN,Homo sapiens (Human)
1824158,intact:EBI-999900,intact:EBI-999909,uniprotkb:Q8NBT2,uniprotkb:Q9HBM1,psi-mi:spc24_human|psi-mi:SPC24|uniprotkb:C9JG...,psi-mi:spc25_human|psi-mi:SPC25|uniprotkb:D3DP...,"psi-mi:""MI:1112""(two hybrid prey pooling appro...",Ciferri et al. (2005)|McCleland et al. (2004)|...,pubmed:15961401|pubmed:14738735|pubmed:2649661...,taxid:9606(human),...,uniprotkb:Q8NBT2-0,uniprotkb:Q9HBM1-0,MAAFRDIEEVSQGLLSLLGANRAEAQQRRLLGRHEQVVERLLETQD...,MVEDELALFDKSINEFWNKFKSTDTSCQMAGLRDTYKDSIKAFAEK...,uniprotkb:Q8NBT2-0,SPC24_HUMAN,Homo sapiens (Human),uniprotkb:Q9HBM1-PRO_0000249565,SPC25_HUMAN,Homo sapiens (Human)
1824159,intact:EBI-999900,intact:EBI-999909,uniprotkb:Q8NBT2,uniprotkb:Q9HBM1,psi-mi:spc24_human|psi-mi:SPC24|uniprotkb:C9JG...,psi-mi:spc25_human|psi-mi:SPC25|uniprotkb:D3DP...,"psi-mi:""MI:1112""(two hybrid prey pooling appro...",Ciferri et al. (2005)|McCleland et al. (2004)|...,pubmed:15961401|pubmed:14738735|pubmed:2649661...,taxid:9606(human),...,uniprotkb:Q8NBT2-0,uniprotkb:Q9HBM1-0,MAAFRDIEEVSQGLLSLLGANRAEAQQRRLLGRHEQVVERLLETQD...,MVEDELALFDKSINEFWNKFKSTDTSCQMAGLRDTYKDSIKAFAEK...,uniprotkb:Q8NBT2-PRO_0000249559,SPC24_HUMAN,Homo sapiens (Human),uniprotkb:Q9HBM1-0,SPC25_HUMAN,Homo sapiens (Human)


In [166]:
# Number of merged3 rows whose full A accession differs from the idmap
# accession attached by the merge (a NaN on either side counts as different).
int((merged3["uniprot_A_full"] != merged3["uniprot_A_bestiso"]).sum())

4566923

In [167]:
# Here's where it gets interesting.
# Find cases where the provided uniprot matches one isoform and not another:
# flag, per row and per side, whether the row's sequence equals / is contained
# in the sequence that came from idmap.


def _seq_equal(seq, idmap_seq):
    """True when both sequences are identical."""
    return seq == idmap_seq


def _seq_contained(seq, idmap_seq):
    """True when `seq` is a substring of `idmap_seq`."""
    return seq in idmap_seq


def _compare_if_both_str(row, seq_col, idmap_col, compare):
    """Apply `compare` to the two cells when both are str; otherwise return None."""
    seq, idmap_seq = row[seq_col], row[idmap_col]
    if type(seq) is str and type(idmap_seq) is str:
        return compare(seq, idmap_seq)
    return None


merged2["aa_intact_equals_aa_idmap_A"] = merged2.apply(
    _compare_if_both_str, axis=1, args=("aa_1", "aa_1_fromidmap", _seq_equal)
)
merged2["aa_intact_equals_aa_idmap_B"] = merged2.apply(
    _compare_if_both_str, axis=1, args=("aa_2", "aa_2_fromidmap", _seq_equal)
)
merged2["aa_intact_isin_aa_idmap_A"] = merged2.apply(
    _compare_if_both_str, axis=1, args=("aa_1", "aa_1_fromidmap", _seq_contained)
)
merged2["aa_intact_isin_aa_idmap_B"] = merged2.apply(
    _compare_if_both_str, axis=1, args=("aa_2", "aa_2_fromidmap", _seq_contained)
)
merged2

Unnamed: 0,ID(s) interactor A,ID(s) interactor B,Alt. ID(s) interactor A,Alt. ID(s) interactor B,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,...,aa_1_fromidmap,uniprot_gene_name_A,species_A,aa_2_fromidmap,uniprot_gene_name_B,species_B,aa_intact_equals_aa_idmap_A,aa_intact_equals_aa_idmap_B,aa_intact_isin_aa_idmap_A,aa_intact_isin_aa_idmap_B
0,intact:EBI-101707,intact:EBI-100018,uniprotkb:Q86P48,uniprotkb:Q9VE54,psi-mi:atbp_drome|psi-mi:ATbp|uniprotkb:ATbp|u...,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,"psi-mi:""MI:0397""(two hybrid array)",Shokri et al. (2019),doi:10.1016/j.celrep.2019.03.071|pubmed:309954...,taxid:7227(drome),...,MGFPRILSKNNKIYTKLGEFCLSGDSFWIVCHTCQEELQTQDQFWK...,ATBP_DROME,Drosophila melanogaster (Fruit fly),MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,Q9VE54_DROME,Drosophila melanogaster (Fruit fly),True,True,True,True
1,intact:EBI-100018,intact:EBI-102069,uniprotkb:Q9VE54,uniprotkb:O16844,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,psi-mi:cos_drome|psi-mi:cos|uniprotkb:A1Z6X4|u...,"psi-mi:""MI:0399""(two hybrid fragment pooling a...",Formstecher et al. (2005),pubmed:15710747|imex:IM-16519|mint:MINT-5217543,taxid:7227(drome),...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,Q9VE54_DROME,Drosophila melanogaster (Fruit fly),MEIPIQVAVRIFPHRELKDLLRSFGPTEPKKDAQAVDEGADSKDSE...,COS_DROME,Drosophila melanogaster (Fruit fly),True,True,True,True
2,intact:EBI-104215,intact:EBI-100018,uniprotkb:Q9VTR6,uniprotkb:Q9VE54,psi-mi:q9vtr6_drome|psi-mi:prc|uniprotkb:prc|u...,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,"psi-mi:""MI:0018""(two hybrid)",Giot et al. (2003),pubmed:14605208|imex:IM-16524|mint:MINT-5216804,taxid:7227(drome),...,MLPFRLGLLLGAVLFVASANGAAIENEVSSLNDLQREKRSGRGYSR...,Q9VTR6_DROME,Drosophila melanogaster (Fruit fly),MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,Q9VE54_DROME,Drosophila melanogaster (Fruit fly),True,True,True,True
3,intact:EBI-100018,intact:EBI-107089,uniprotkb:Q9VE54,uniprotkb:Q9VWG2,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,psi-mi:q9vwg2_drome|psi-mi:SDS3|uniprotkb:SDS3...,"psi-mi:""MI:0018""(two hybrid)",Giot et al. (2003),pubmed:14605208|imex:IM-16524|mint:MINT-5216804,taxid:7227(drome),...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,Q9VE54_DROME,Drosophila melanogaster (Fruit fly),MSNYYSLLLQADTYDDESIGDERSEEDTDDASETEFRSPSRYGAMN...,Q9VWG2_DROME,Drosophila melanogaster (Fruit fly),True,True,True,True
4,intact:EBI-117032,intact:EBI-100018,uniprotkb:Q9VHR4,uniprotkb:Q9VE54,psi-mi:q9vhr4_drome|psi-mi:Dmel\CG7963|uniprot...,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,"psi-mi:""MI:0397""(two hybrid array)",Shokri et al. (2019),doi:10.1016/j.celrep.2019.03.071|pubmed:309954...,taxid:7227(drome),...,MSPPSGEFRCRVCLKQDELLVDIYEIVEEMQVDLCTLLETCGGIKV...,Q9VHR4_DROME,Drosophila melanogaster (Fruit fly),MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,Q9VE54_DROME,Drosophila melanogaster (Fruit fly),True,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745080,intact:EBI-999900,intact:EBI-999909,uniprotkb:Q8NBT2,uniprotkb:Q9HBM1,psi-mi:spc24_human|psi-mi:SPC24|uniprotkb:C9JG...,psi-mi:spc25_human|psi-mi:SPC25|uniprotkb:D3DP...,"psi-mi:""MI:1112""(two hybrid prey pooling appro...",Ciferri et al. (2005)|McCleland et al. (2004)|...,pubmed:15961401|pubmed:14738735|pubmed:2649661...,taxid:9606(human),...,MAAFRDIEEVSQGLLSLLGANRAEAQQRRLLGRHEQVVERLLETQD...,SPC24_HUMAN,Homo sapiens (Human),MVEDELALFDKSINEFWNKFKSTDTSCQMAGLRDTYKDSIKAFAEK...,SPC25_HUMAN,Homo sapiens (Human),True,True,True,True
745081,intact:EBI-999900,intact:EBI-999909,uniprotkb:Q8NBT2,uniprotkb:Q9HBM1,psi-mi:spc24_human|psi-mi:SPC24|uniprotkb:C9JG...,psi-mi:spc25_human|psi-mi:SPC25|uniprotkb:D3DP...,"psi-mi:""MI:1112""(two hybrid prey pooling appro...",Ciferri et al. (2005)|McCleland et al. (2004)|...,pubmed:15961401|pubmed:14738735|pubmed:2649661...,taxid:9606(human),...,MAAFRDIEEVSQGLLSLLGANRAEAQQRRLLGRHEQVVERLLETQD...,SPC24_HUMAN,Homo sapiens (Human),MVEDELALFDKSINEFWNKFKSTDTSCQMAGLRDTYKDSIKAFAEK...,SPC25_HUMAN,Homo sapiens (Human),True,True,True,True
745082,intact:EBI-999900,intact:EBI-999909,uniprotkb:Q8NBT2,uniprotkb:Q9HBM1,psi-mi:spc24_human|psi-mi:SPC24|uniprotkb:C9JG...,psi-mi:spc25_human|psi-mi:SPC25|uniprotkb:D3DP...,"psi-mi:""MI:1112""(two hybrid prey pooling appro...",Ciferri et al. (2005)|McCleland et al. (2004)|...,pubmed:15961401|pubmed:14738735|pubmed:2649661...,taxid:9606(human),...,MAAFRDIEEVSQGLLSLLGANRAEAQQRRLLGRHEQVVERLLETQD...,SPC24_HUMAN,Homo sapiens (Human),MVEDELALFDKSINEFWNKFKSTDTSCQMAGLRDTYKDSIKAFAEK...,SPC25_HUMAN,Homo sapiens (Human),True,True,True,True
745083,intact:EBI-999900,intact:EBI-999909,uniprotkb:Q8NBT2,uniprotkb:Q9HBM1,psi-mi:spc24_human|psi-mi:SPC24|uniprotkb:C9JG...,psi-mi:spc25_human|psi-mi:SPC25|uniprotkb:D3DP...,"psi-mi:""MI:1112""(two hybrid prey pooling appro...",Ciferri et al. (2005)|McCleland et al. (2004)|...,pubmed:15961401|pubmed:14738735|pubmed:2649661...,taxid:9606(human),...,MAAFRDIEEVSQGLLSLLGANRAEAQQRRLLGRHEQVVERLLETQD...,SPC24_HUMAN,Homo sapiens (Human),MVEDELALFDKSINEFWNKFKSTDTSCQMAGLRDTYKDSIKAFAEK...,SPC25_HUMAN,Homo sapiens (Human),True,True,True,True


In [168]:
# Rows where the A-side equality flag is missing, i.e. the comparison was not
# made because one of the two sequences was not a str.
merged2.loc[
    merged2["aa_intact_equals_aa_idmap_A"].isna(),
    ["uniprot_A_full", "aa_1", "aa_1_fromidmap"],
]

Unnamed: 0,uniprot_A_full,aa_1,aa_1_fromidmap
66,uniprotkb:Rgk3-0,MLPLRQEAVISAAATWRMRNESRSIINRGDSFRRRRSRSNSLAPSS...,
88,uniprotkb:Pif1A-0,MAENQTKTTSSKGCNTMSRTAHILAPHNGTVVRRTDPGSKLKESFH...,
108,uniprotkb:P39745-1,MPTWIPNNLCAQPTTRNAKPPSNGHPQATQQQSAPGSLAYRNSSNI...,
130,,TSTAGRIVRRAI,
148,uniprotkb:ewg-0,MATTSYRLVVAPAGSQRSSTGNVVVTTTSSGSHSSNGANGGTGGTS...,
...,...,...,...
744792,uniprotkb:O00255-2,MGLKAAQKTLFPLRSIDDVVRLFAAELGREEPDLVLLSLVLGFVEH...,
744793,uniprotkb:O00255-2,MGLKAAQKTLFPLRSIDDVVRLFAAELGREEPDLVLLSLVLGFVEH...,
744878,uniprotkb:Q9JJV2-1,MAGWQSYVDNLMCDGCCQEAAIVGYCDAKYVWAATAGGVFQSITPV...,
744985,uniprotkb:P54256-1,MRPKDQVQSSAGDGTGSGDPATGTPTTQPAADPAPEPSAEPKPAPA...,


In [169]:
# Inspect one example accession from the rows with a missing flag.
merged2.loc[
    merged2["uniprot_A_full"].eq("uniprotkb:P54256-1"),
    ["uniprot_A_full", "aa_1", "aa_1_fromidmap"],
]

Unnamed: 0,uniprot_A_full,aa_1,aa_1_fromidmap
736696,uniprotkb:P54256-1,MRPKDQVQSSAGDGTGSGDPATGTPTTQPAADPAPEPSAEPKPAPA...,
736697,uniprotkb:P54256-1,MRPKDQVQSSAGDGTGSGDPATGTPTTQPAADPAPEPSAEPKPAPA...,
744985,uniprotkb:P54256-1,MRPKDQVQSSAGDGTGSGDPATGTPTTQPAADPAPEPSAEPKPAPA...,
744986,uniprotkb:P54256-1,MRPKDQVQSSAGDGTGSGDPATGTPTTQPAADPAPEPSAEPKPAPA...,


In [170]:
# All idmap FASTA entries recorded under canonical accession P54256,
# ordered by isoform ID and renumbered from 0.
(
    idmap_fasta_df[idmap_fasta_df["canonical_uniprotkb"].eq("P54256")]
    .sort_values(by="uniprotkb_iso")
    .reset_index(drop=True)
)

Unnamed: 0,sequence,database,uniprotkb,uniprot_gene_name,uniprotkb_iso,isoform_from_desc,isoform_from_uniprotkb,canonical_uniprotkb
0,MRPKDQVQSSAGDGTGSGDPATGTPTTQPAADPAPEPSAEPKPAPA...,sp,uniprotkb:P54256,HAP1_RAT,P54256-0,,Isoform 0,P54256
1,MRPKDQVQSSAGDGTGSGDPATGTPTTQPAADPAPEPSAEPKPAPA...,sp,uniprotkb:P54256-2,HAP1_RAT,P54256-2,Isoform A,Isoform 2,P54256


In [171]:
# Start merged3 over from an independent copy of `merged` (copy() is deep by default).
merged3 = merged.copy()

In [172]:
# Rebuild the accession helper columns on the fresh merged3.


def _add_default_isoform(acc):
    """Append "-0" to a str accession that has no "-"; return anything else unchanged."""
    if type(acc) is not str:
        return acc
    return acc if "-" in acc else f"{acc}-0"


def _strip_isoform(acc):
    """Return the text before the first "-" of a str accession; pass non-str through."""
    return acc.split("-")[0] if type(acc) is str else acc


merged3 = merged3.drop(
    columns=["uniprot_A_noiso1", "uniprot_B_noiso1", "unique_uniprot_noiso1_pair"]
)

# Full accessions (always carrying an isoform suffix) and their order-independent pair ID.
merged3["uniprot_A_full"] = merged3["uniprot_A"].apply(_add_default_isoform)
merged3["uniprot_B_full"] = merged3["uniprot_B"].apply(_add_default_isoform)
merged3["unique_uniprot_pair"] = merged3.apply(
    get_unique_id, axis=1, colA="uniprot_A_full", colB="uniprot_B_full"
)

# Isoform-stripped accessions and their order-independent pair ID.
merged3["uniprot_A_noisoforms"] = merged3["uniprot_A_full"].apply(_strip_isoform)
merged3["uniprot_B_noisoforms"] = merged3["uniprot_B_full"].apply(_strip_isoform)
merged3["unique_uniprot_noisoforms_pair"] = merged3.apply(
    get_unique_id, axis=1, colA="uniprot_A_noisoforms", colB="uniprot_B_noisoforms"
)

In [173]:
# For every (unique_id, seq_pair_id) group, collect the distinct accessions
# and miscores so the next step can check that each group is internally
# consistent (at most one distinct value per column).


def _distinct_values(values):
    """Return the distinct values of one group's Series as a list.

    Uses drop_duplicates() rather than list(set(...)): NaN != NaN, so a set
    can keep several NaN entries for a single group and make it look
    inconsistent, whereas drop_duplicates() treats missing values as equal.
    It also keeps first-appearance order, so the result is deterministic.
    """
    return values.drop_duplicates().tolist()


gb = merged3.groupby(["unique_id", "seq_pair_id"]).agg(
    unique_uniprot_A_noisoforms=("uniprot_A_noisoforms", _distinct_values),
    unique_uniprot_B_noisoforms=("uniprot_B_noisoforms", _distinct_values),
    unique_uniprot_A_full=("uniprot_A_full", _distinct_values),
    unique_uniprot_B_full=("uniprot_B_full", _distinct_values),
    unique_miscores=("miscore", _distinct_values),
)
gb.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,unique_uniprot_A_noisoforms,unique_uniprot_B_noisoforms,unique_uniprot_A_full,unique_uniprot_B_full,unique_miscores
unique_id,seq_pair_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
intact:EBI-100018_intact:EBI-101707,seqpair1,[uniprotkb:Q86P48],[uniprotkb:Q9VE54],[uniprotkb:Q86P48-0],[uniprotkb:Q9VE54-0],[0.37]
intact:EBI-100018_intact:EBI-102069,seqpair2,[uniprotkb:Q9VE54],[uniprotkb:O16844],[uniprotkb:Q9VE54-0],[uniprotkb:O16844-0],[0.37]
intact:EBI-100018_intact:EBI-104215,seqpair3,[uniprotkb:Q9VTR6],[uniprotkb:Q9VE54],[uniprotkb:Q9VTR6-0],[uniprotkb:Q9VE54-0],[0.37]
intact:EBI-100018_intact:EBI-107089,seqpair4,[uniprotkb:Q9VE54],[uniprotkb:Q9VWG2],[uniprotkb:Q9VE54-0],[uniprotkb:Q9VWG2-0],[0.37]
intact:EBI-100018_intact:EBI-117032,seqpair5,[uniprotkb:Q9VHR4],[uniprotkb:Q9VE54],[uniprotkb:Q9VHR4-0],[uniprotkb:Q9VE54-0],[0.37]


In [174]:
# Consistency report over `gb`: for each aggregated column, print whether no
# group holds more than one distinct value.
print(f"Grouped by unique_id (intact:EBI-1_intact:EBI-2) and seq_pair_id (sequence pair), checking for consistency otherwise. Each group has:")
for _gb_col, _message in (
    ("unique_uniprot_A_full", "\tAt most 1 unique uniprot A full isoform: "),
    ("unique_uniprot_B_full", "\tAt most 1 unique uniprot B full isoform: "),
    ("unique_uniprot_A_noisoforms", "\tAt most 1 unique uniprot A ID without isoform: "),
    ("unique_uniprot_B_noisoforms", "\tAt most 1 unique uniprot B ID without isoform: "),
    ("unique_miscores", "\tAt most 1 unique miscore: "),
):
    # True when no group's list in this column has more than one entry.
    test1 = not (gb[_gb_col].apply(len) > 1).any()
    print(f"{_message}{test1}")

Grouped by unique_id (intact:EBI-1_intact:EBI-2) and seq_pair_id (sequence pair), checking for consistency otherwise. Each group has:
	At most 1 unique uniprot A full isoform: True
	At most 1 unique uniprot B full isoform: True
	At most 1 unique uniprot A ID without isoform: True
	At most 1 unique uniprot B ID without isoform: True
	At most 1 unique miscore: True


In [175]:
# Keep the first row of every (unique_id, seq_pair_id) combination, renumber
# the index, and narrow the frame to the identifier, sequence and score columns.
_kept_columns = [
    "unique_id", "seq_pair_id", "aa_1", "aa_2", "miscore",
    "uniprot_A", "uniprot_B",
    "uniprot_A_full", "uniprot_B_full",
    "uniprot_A_noisoforms", "uniprot_B_noisoforms",
    "unique_uniprot_pair", "unique_uniprot_noisoforms_pair",
]
_deduplicated = merged3.drop_duplicates(
    subset=["unique_id", "seq_pair_id"]
).reset_index(drop=True)
merged3 = _deduplicated[_kept_columns]
print(f"After dropping duplicates, merged3 has {len(merged3)} rows")
merged3.head()

After dropping duplicates, merged3 has 428495 rows


Unnamed: 0,unique_id,seq_pair_id,aa_1,aa_2,miscore,uniprot_A,uniprot_B,uniprot_A_full,uniprot_B_full,uniprot_A_noisoforms,uniprot_B_noisoforms,unique_uniprot_pair,unique_uniprot_noisoforms_pair
0,intact:EBI-100018_intact:EBI-101707,seqpair1,MGFPRILSKNNKIYTKLGEFCLSGDSFWIVCHTCQEELQTQDQFWK...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,0.37,uniprotkb:Q86P48,uniprotkb:Q9VE54,uniprotkb:Q86P48-0,uniprotkb:Q9VE54-0,uniprotkb:Q86P48,uniprotkb:Q9VE54,uniprotkb:Q86P48-0_uniprotkb:Q9VE54-0,uniprotkb:Q86P48_uniprotkb:Q9VE54
1,intact:EBI-100018_intact:EBI-102069,seqpair2,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,MEIPIQVAVRIFPHRELKDLLRSFGPTEPKKDAQAVDEGADSKDSE...,0.37,uniprotkb:Q9VE54,uniprotkb:O16844,uniprotkb:Q9VE54-0,uniprotkb:O16844-0,uniprotkb:Q9VE54,uniprotkb:O16844,uniprotkb:O16844-0_uniprotkb:Q9VE54-0,uniprotkb:O16844_uniprotkb:Q9VE54
2,intact:EBI-100018_intact:EBI-104215,seqpair3,MLPFRLGLLLGAVLFVASANGAAIENEVSSLNDLQREKRSGRGYSR...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,0.37,uniprotkb:Q9VTR6,uniprotkb:Q9VE54,uniprotkb:Q9VTR6-0,uniprotkb:Q9VE54-0,uniprotkb:Q9VTR6,uniprotkb:Q9VE54,uniprotkb:Q9VE54-0_uniprotkb:Q9VTR6-0,uniprotkb:Q9VE54_uniprotkb:Q9VTR6
3,intact:EBI-100018_intact:EBI-107089,seqpair4,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,MSNYYSLLLQADTYDDESIGDERSEEDTDDASETEFRSPSRYGAMN...,0.37,uniprotkb:Q9VE54,uniprotkb:Q9VWG2,uniprotkb:Q9VE54-0,uniprotkb:Q9VWG2-0,uniprotkb:Q9VE54,uniprotkb:Q9VWG2,uniprotkb:Q9VE54-0_uniprotkb:Q9VWG2-0,uniprotkb:Q9VE54_uniprotkb:Q9VWG2
4,intact:EBI-100018_intact:EBI-117032,seqpair5,MSPPSGEFRCRVCLKQDELLVDIYEIVEEMQVDLCTLLETCGGIKV...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,0.37,uniprotkb:Q9VHR4,uniprotkb:Q9VE54,uniprotkb:Q9VHR4-0,uniprotkb:Q9VE54-0,uniprotkb:Q9VHR4,uniprotkb:Q9VE54,uniprotkb:Q9VE54-0_uniprotkb:Q9VHR4-0,uniprotkb:Q9VE54_uniprotkb:Q9VHR4


In [176]:
# On the shrunken merged3, left-join idmap on the isoform-stripped accession
# for each side. Each row is repeated once per idmap entry sharing that
# accession, giving all possible isoform matches to filter down later.


def _ensure_uniprotkb_prefix(acc):
    """Prepend "uniprotkb:" unless the accession already starts with it."""
    return acc if acc.startswith("uniprotkb:") else "uniprotkb:" + acc


idmap_merge_copy = idmap_merge.copy(deep=True)
idmap_merge_copy["canonical_uniprot"] = idmap_merge_copy["canonical_uniprotkb"].apply(
    _ensure_uniprotkb_prefix
)

# idmap columns renamed for the A side.
_idmap_side_A = idmap_merge_copy.rename(
    columns={
        "canonical_uniprot": "uniprot_A_noisoforms",
        "uniprotkb_full": "uniprot_A_fromidmap",
        "sequence": "aa_1_fromidmap",
        "uniprot_gene_name": "uniprot_gene_name_A",
        "species": "species_A",
    }
)[["uniprot_A_noisoforms", "uniprot_A_fromidmap", "aa_1_fromidmap", "uniprot_gene_name_A", "species_A"]]
merged3 = merged3.merge(_idmap_side_A, on=["uniprot_A_noisoforms"], how="left")

# idmap columns renamed for the B side.
_idmap_side_B = idmap_merge_copy.rename(
    columns={
        "canonical_uniprot": "uniprot_B_noisoforms",
        "uniprotkb_full": "uniprot_B_fromidmap",
        "sequence": "aa_2_fromidmap",
        "uniprot_gene_name": "uniprot_gene_name_B",
        "species": "species_B",
    }
)[["uniprot_B_noisoforms", "uniprot_B_fromidmap", "aa_2_fromidmap", "uniprot_gene_name_B", "species_B"]]
merged3 = merged3.merge(_idmap_side_B, on=["uniprot_B_noisoforms"], how="left")
print(f"After merging with idmap to get all possible isoform matches, merged3 has {len(merged3)} rows")

After merging with idmap to get all possible isoform matches, merged3 has 3193368 rows


In [177]:
def _compare_sequences(left, right, compare):
    """
    Compare two columns element by element.

    Returns a list with ``compare(a, b)`` for every pair where both values are plain
    ``str``; any other pair (None, NaN, ...) yields None so that "not comparable" stays
    distinguishable from "compared and different".
    """
    return [
        compare(a, b) if (type(a) is str and type(b) is str) else None
        for a, b in zip(left, right)
    ]


# Flag, per row and per interactor, whether the IntAct sequence equals / is contained in
# the idmap sequence. Iterating column pairs avoids building a row Series for every one of
# the millions of rows, which DataFrame.apply(axis=1) would do four times over.
# Column creation order is kept: equals_A, equals_B, isin_A, isin_B.
for _label, _compare in (
    ("equals", lambda intact_seq, idmap_seq: intact_seq == idmap_seq),
    ("isin", lambda intact_seq, idmap_seq: intact_seq in idmap_seq),
):
    for _side, _seq_col in (("A", "aa_1"), ("B", "aa_2")):
        merged3[f"aa_intact_{_label}_aa_idmap_{_side}"] = _compare_sequences(
            merged3[_seq_col], merged3[f"{_seq_col}_fromidmap"], _compare
        )

merged3.head()

Unnamed: 0,unique_id,seq_pair_id,aa_1,aa_2,miscore,uniprot_A,uniprot_B,uniprot_A_full,uniprot_B_full,uniprot_A_noisoforms,...,uniprot_gene_name_A,species_A,uniprot_B_fromidmap,aa_2_fromidmap,uniprot_gene_name_B,species_B,aa_intact_equals_aa_idmap_A,aa_intact_equals_aa_idmap_B,aa_intact_isin_aa_idmap_A,aa_intact_isin_aa_idmap_B
0,intact:EBI-100018_intact:EBI-101707,seqpair1,MGFPRILSKNNKIYTKLGEFCLSGDSFWIVCHTCQEELQTQDQFWK...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,0.37,uniprotkb:Q86P48,uniprotkb:Q9VE54,uniprotkb:Q86P48-0,uniprotkb:Q9VE54-0,uniprotkb:Q86P48,...,ATBP_DROME,Drosophila melanogaster (Fruit fly),uniprotkb:Q9VE54-0,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,Q9VE54_DROME,Drosophila melanogaster (Fruit fly),True,True,True,True
1,intact:EBI-100018_intact:EBI-101707,seqpair1,MGFPRILSKNNKIYTKLGEFCLSGDSFWIVCHTCQEELQTQDQFWK...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,0.37,uniprotkb:Q86P48,uniprotkb:Q9VE54,uniprotkb:Q86P48-0,uniprotkb:Q9VE54-0,uniprotkb:Q86P48,...,ATBP_DROME,Drosophila melanogaster (Fruit fly),uniprotkb:Q9VE54-0,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,Q9VE54_DROME,Drosophila melanogaster (Fruit fly),True,True,True,True
2,intact:EBI-100018_intact:EBI-102069,seqpair2,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,MEIPIQVAVRIFPHRELKDLLRSFGPTEPKKDAQAVDEGADSKDSE...,0.37,uniprotkb:Q9VE54,uniprotkb:O16844,uniprotkb:Q9VE54-0,uniprotkb:O16844-0,uniprotkb:Q9VE54,...,Q9VE54_DROME,Drosophila melanogaster (Fruit fly),uniprotkb:O16844-0,MEIPIQVAVRIFPHRELKDLLRSFGPTEPKKDAQAVDEGADSKDSE...,COS_DROME,Drosophila melanogaster (Fruit fly),True,True,True,True
3,intact:EBI-100018_intact:EBI-102069,seqpair2,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,MEIPIQVAVRIFPHRELKDLLRSFGPTEPKKDAQAVDEGADSKDSE...,0.37,uniprotkb:Q9VE54,uniprotkb:O16844,uniprotkb:Q9VE54-0,uniprotkb:O16844-0,uniprotkb:Q9VE54,...,Q9VE54_DROME,Drosophila melanogaster (Fruit fly),uniprotkb:O16844-PRO_0000307148,MEIPIQVAVRIFPHRELKDLLRSFGPTEPKKDAQAVDEGADSKDSE...,COS_DROME,Drosophila melanogaster (Fruit fly),True,True,True,True
4,intact:EBI-100018_intact:EBI-104215,seqpair3,MLPFRLGLLLGAVLFVASANGAAIENEVSSLNDLQREKRSGRGYSR...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,0.37,uniprotkb:Q9VTR6,uniprotkb:Q9VE54,uniprotkb:Q9VTR6-0,uniprotkb:Q9VE54-0,uniprotkb:Q9VTR6,...,Q9VTR6_DROME,Drosophila melanogaster (Fruit fly),uniprotkb:Q9VE54-0,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,Q9VE54_DROME,Drosophila melanogaster (Fruit fly),True,True,True,True


In [178]:
# Display the first rows of merged3 to eyeball the comparison flag columns.
merged3.head()

Unnamed: 0,unique_id,seq_pair_id,aa_1,aa_2,miscore,uniprot_A,uniprot_B,uniprot_A_full,uniprot_B_full,uniprot_A_noisoforms,...,uniprot_gene_name_A,species_A,uniprot_B_fromidmap,aa_2_fromidmap,uniprot_gene_name_B,species_B,aa_intact_equals_aa_idmap_A,aa_intact_equals_aa_idmap_B,aa_intact_isin_aa_idmap_A,aa_intact_isin_aa_idmap_B
0,intact:EBI-100018_intact:EBI-101707,seqpair1,MGFPRILSKNNKIYTKLGEFCLSGDSFWIVCHTCQEELQTQDQFWK...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,0.37,uniprotkb:Q86P48,uniprotkb:Q9VE54,uniprotkb:Q86P48-0,uniprotkb:Q9VE54-0,uniprotkb:Q86P48,...,ATBP_DROME,Drosophila melanogaster (Fruit fly),uniprotkb:Q9VE54-0,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,Q9VE54_DROME,Drosophila melanogaster (Fruit fly),True,True,True,True
1,intact:EBI-100018_intact:EBI-101707,seqpair1,MGFPRILSKNNKIYTKLGEFCLSGDSFWIVCHTCQEELQTQDQFWK...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,0.37,uniprotkb:Q86P48,uniprotkb:Q9VE54,uniprotkb:Q86P48-0,uniprotkb:Q9VE54-0,uniprotkb:Q86P48,...,ATBP_DROME,Drosophila melanogaster (Fruit fly),uniprotkb:Q9VE54-0,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,Q9VE54_DROME,Drosophila melanogaster (Fruit fly),True,True,True,True
2,intact:EBI-100018_intact:EBI-102069,seqpair2,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,MEIPIQVAVRIFPHRELKDLLRSFGPTEPKKDAQAVDEGADSKDSE...,0.37,uniprotkb:Q9VE54,uniprotkb:O16844,uniprotkb:Q9VE54-0,uniprotkb:O16844-0,uniprotkb:Q9VE54,...,Q9VE54_DROME,Drosophila melanogaster (Fruit fly),uniprotkb:O16844-0,MEIPIQVAVRIFPHRELKDLLRSFGPTEPKKDAQAVDEGADSKDSE...,COS_DROME,Drosophila melanogaster (Fruit fly),True,True,True,True
3,intact:EBI-100018_intact:EBI-102069,seqpair2,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,MEIPIQVAVRIFPHRELKDLLRSFGPTEPKKDAQAVDEGADSKDSE...,0.37,uniprotkb:Q9VE54,uniprotkb:O16844,uniprotkb:Q9VE54-0,uniprotkb:O16844-0,uniprotkb:Q9VE54,...,Q9VE54_DROME,Drosophila melanogaster (Fruit fly),uniprotkb:O16844-PRO_0000307148,MEIPIQVAVRIFPHRELKDLLRSFGPTEPKKDAQAVDEGADSKDSE...,COS_DROME,Drosophila melanogaster (Fruit fly),True,True,True,True
4,intact:EBI-100018_intact:EBI-104215,seqpair3,MLPFRLGLLLGAVLFVASANGAAIENEVSSLNDLQREKRSGRGYSR...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,0.37,uniprotkb:Q9VTR6,uniprotkb:Q9VE54,uniprotkb:Q9VTR6-0,uniprotkb:Q9VE54-0,uniprotkb:Q9VTR6,...,Q9VTR6_DROME,Drosophila melanogaster (Fruit fly),uniprotkb:Q9VE54-0,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,Q9VE54_DROME,Drosophila melanogaster (Fruit fly),True,True,True,True


In [179]:
# For every (unique_id, seq_pair_id) collect the distinct idmap accessions whose sequence
# equals / contains the IntAct sequence, separately for interactors A and B.
_pair_keys = ["unique_id", "seq_pair_id"]


def _accessions_where(flag_col, source_col, out_col):
    """Keep rows whose ``flag_col`` is True, group by pair and list unique ``source_col`` values."""
    flagged = (merged3[flag_col].notna()) & (merged3[flag_col])
    return merged3.loc[flagged].groupby(_pair_keys).agg(
        **{out_col: (source_col, lambda ids: list(set(ids)))}
    )


gb_equal_A = _accessions_where("aa_intact_equals_aa_idmap_A", "uniprot_A_fromidmap", "uniprot_A_equalseq")
gb_equal_B = _accessions_where("aa_intact_equals_aa_idmap_B", "uniprot_B_fromidmap", "uniprot_B_equalseq")
gb_isin_A = _accessions_where("aa_intact_isin_aa_idmap_A", "uniprot_A_fromidmap", "uniprot_A_inseq")
gb_isin_B = _accessions_where("aa_intact_isin_aa_idmap_B", "uniprot_B_fromidmap", "uniprot_B_inseq")

# Side-by-side on the shared group index, then turn the keys back into columns.
_per_flag_frames = [gb_equal_A, gb_equal_B, gb_isin_A, gb_isin_B]
gb = pd.concat(_per_flag_frames, axis=1).reset_index()
gb.head()

Unnamed: 0,unique_id,seq_pair_id,uniprot_A_equalseq,uniprot_B_equalseq,uniprot_A_inseq,uniprot_B_inseq
0,intact:EBI-100018_intact:EBI-101707,seqpair1,"[uniprotkb:Q86P48-PRO_0000378614, uniprotkb:Q8...",[uniprotkb:Q9VE54-0],"[uniprotkb:Q86P48-PRO_0000378614, uniprotkb:Q8...",[uniprotkb:Q9VE54-0]
1,intact:EBI-100018_intact:EBI-102069,seqpair2,[uniprotkb:Q9VE54-0],"[uniprotkb:O16844-0, uniprotkb:O16844-PRO_0000...",[uniprotkb:Q9VE54-0],"[uniprotkb:O16844-0, uniprotkb:O16844-PRO_0000..."
2,intact:EBI-100018_intact:EBI-104215,seqpair3,[uniprotkb:Q9VTR6-0],[uniprotkb:Q9VE54-0],[uniprotkb:Q9VTR6-0],[uniprotkb:Q9VE54-0]
3,intact:EBI-100018_intact:EBI-107089,seqpair4,[uniprotkb:Q9VE54-0],[uniprotkb:Q9VWG2-0],[uniprotkb:Q9VE54-0],[uniprotkb:Q9VWG2-0]
4,intact:EBI-100018_intact:EBI-117032,seqpair5,[uniprotkb:Q9VHR4-0],[uniprotkb:Q9VE54-0],[uniprotkb:Q9VHR4-0],[uniprotkb:Q9VE54-0]


In [180]:
# Rebuild gb by placing the four per-flag groupby results side by side (axis=1) and
# moving the group keys back into columns, then preview the first rows.
gb = pd.concat([gb_equal_A, gb_equal_B, gb_isin_A, gb_isin_B], axis=1).reset_index()
gb.head()

Unnamed: 0,unique_id,seq_pair_id,uniprot_A_equalseq,uniprot_B_equalseq,uniprot_A_inseq,uniprot_B_inseq
0,intact:EBI-100018_intact:EBI-101707,seqpair1,"[uniprotkb:Q86P48-PRO_0000378614, uniprotkb:Q8...",[uniprotkb:Q9VE54-0],"[uniprotkb:Q86P48-PRO_0000378614, uniprotkb:Q8...",[uniprotkb:Q9VE54-0]
1,intact:EBI-100018_intact:EBI-102069,seqpair2,[uniprotkb:Q9VE54-0],"[uniprotkb:O16844-0, uniprotkb:O16844-PRO_0000...",[uniprotkb:Q9VE54-0],"[uniprotkb:O16844-0, uniprotkb:O16844-PRO_0000..."
2,intact:EBI-100018_intact:EBI-104215,seqpair3,[uniprotkb:Q9VTR6-0],[uniprotkb:Q9VE54-0],[uniprotkb:Q9VTR6-0],[uniprotkb:Q9VE54-0]
3,intact:EBI-100018_intact:EBI-107089,seqpair4,[uniprotkb:Q9VE54-0],[uniprotkb:Q9VWG2-0],[uniprotkb:Q9VE54-0],[uniprotkb:Q9VWG2-0]
4,intact:EBI-100018_intact:EBI-117032,seqpair5,[uniprotkb:Q9VHR4-0],[uniprotkb:Q9VE54-0],[uniprotkb:Q9VHR4-0],[uniprotkb:Q9VE54-0]


In [181]:
# Left-join the per-pair accession lists onto merged so every original row is kept.
_gb_keys = ["unique_id", "seq_pair_id"]
merged = merged.merge(gb, how="left", on=_gb_keys)
print(f"\tMerged back the sequence isoforms that equal and contain the sequences provided through IntAct XML. New merged size: {len(merged)}")

	Merged back the sequence isoforms that equal and contain the sequences provided through IntAct XML. New merged size: 745085


In [182]:
# Display the column index of merged (checks that the four accession-list columns arrived).
merged.columns

Index(['ID(s) interactor A', 'ID(s) interactor B', 'Alt. ID(s) interactor A',
       'Alt. ID(s) interactor B', 'Alias(es) interactor A',
       'Alias(es) interactor B', 'Interaction detection method(s)',
       'Publication 1st author(s)', 'Publication Identifier(s)',
       'Taxid interactor A',
       ...
       'unique_uniprot_noiso1_pair', 'uniprot_A_noisoforms',
       'uniprot_B_noisoforms', 'unique_uniprot_noisoforms_pair',
       'invalids_aa_1', 'invalids_aa_2', 'uniprot_A_equalseq',
       'uniprot_B_equalseq', 'uniprot_A_inseq', 'uniprot_B_inseq'],
      dtype='object', length=165)

In [183]:
def _with_isoform_suffix(acc):
    """Append "-0" to a string accession that has no "-"; return anything else unchanged."""
    if acc is not None and type(acc) is str and "-" not in acc:
        return f"{acc}-0"
    return acc


def _before_first_dash(acc):
    """Return the part of a string accession before its first "-"; non-strings pass through."""
    if acc is not None and type(acc) is str:
        return acc.split("-")[0]
    return acc


# Accessions with an explicit isoform suffix, plus the order-independent pair key.
for _side in ("A", "B"):
    merged[f"uniprot_{_side}_full"] = merged[f"uniprot_{_side}"].apply(_with_isoform_suffix)
merged["unique_uniprot_pair"] = merged.apply(
    lambda rec: get_unique_id(rec, colA="uniprot_A_full", colB="uniprot_B_full"),
    axis=1,
)

# Same again with the suffix stripped.
for _side in ("A", "B"):
    merged[f"uniprot_{_side}_noisoforms"] = merged[f"uniprot_{_side}_full"].apply(_before_first_dash)
merged["unique_uniprot_noisoforms_pair"] = merged.apply(
    lambda rec: get_unique_id(rec, colA="uniprot_A_noisoforms", colB="uniprot_B_noisoforms"),
    axis=1,
)

In [184]:
# List every column of merged whose name contains "uniprot".
print([x for x in merged.columns if "uniprot" in x])

['uniprot_A', 'uniprot_B', 'uniprotkb_1', 'uniprotkb_2', 'no_uniprot_update_A', 'no_uniprot_update_B', 'unique_uniprot_pair', 'uniprot_A_noiso1', 'uniprot_B_noiso1', 'unique_uniprot_noiso1_pair', 'uniprot_A_noisoforms', 'uniprot_B_noisoforms', 'unique_uniprot_noisoforms_pair', 'uniprot_A_equalseq', 'uniprot_B_equalseq', 'uniprot_A_inseq', 'uniprot_B_inseq', 'uniprot_A_full', 'uniprot_B_full']


In [185]:
# Preview, per pair, the suffixed accession next to its equal-sequence and
# contained-sequence accession lists for both interactors.
merged[["unique_id","seq_pair_id","uniprot_A_full","uniprot_A_equalseq","uniprot_A_inseq","uniprot_B_full","uniprot_B_equalseq","uniprot_B_inseq"]].head()

Unnamed: 0,unique_id,seq_pair_id,uniprot_A_full,uniprot_A_equalseq,uniprot_A_inseq,uniprot_B_full,uniprot_B_equalseq,uniprot_B_inseq
0,intact:EBI-100018_intact:EBI-101707,seqpair1,uniprotkb:Q86P48-0,"[uniprotkb:Q86P48-PRO_0000378614, uniprotkb:Q8...","[uniprotkb:Q86P48-PRO_0000378614, uniprotkb:Q8...",uniprotkb:Q9VE54-0,[uniprotkb:Q9VE54-0],[uniprotkb:Q9VE54-0]
1,intact:EBI-100018_intact:EBI-102069,seqpair2,uniprotkb:Q9VE54-0,[uniprotkb:Q9VE54-0],[uniprotkb:Q9VE54-0],uniprotkb:O16844-0,"[uniprotkb:O16844-0, uniprotkb:O16844-PRO_0000...","[uniprotkb:O16844-0, uniprotkb:O16844-PRO_0000..."
2,intact:EBI-100018_intact:EBI-104215,seqpair3,uniprotkb:Q9VTR6-0,[uniprotkb:Q9VTR6-0],[uniprotkb:Q9VTR6-0],uniprotkb:Q9VE54-0,[uniprotkb:Q9VE54-0],[uniprotkb:Q9VE54-0]
3,intact:EBI-100018_intact:EBI-107089,seqpair4,uniprotkb:Q9VE54-0,[uniprotkb:Q9VE54-0],[uniprotkb:Q9VE54-0],uniprotkb:Q9VWG2-0,[uniprotkb:Q9VWG2-0],[uniprotkb:Q9VWG2-0]
4,intact:EBI-100018_intact:EBI-117032,seqpair5,uniprotkb:Q9VHR4-0,[uniprotkb:Q9VHR4-0],[uniprotkb:Q9VHR4-0],uniprotkb:Q9VE54-0,[uniprotkb:Q9VE54-0],[uniprotkb:Q9VE54-0]


In [186]:
# Partner A: is uniprot_A_full among the accessions whose idmap sequence equals /
# contains the IntAct sequence?
def _n_and_pct(frame):
    """Render a row count and its percentage of ``merged`` for the report lines below."""
    return f"{len(frame)} ({100*len(frame)/len(merged):.2f}%)"


print("Evaluating UniProt validity for partner A")

# One exploded row per candidate accession; the index still points at the source row.
test1 = merged["uniprot_A_equalseq"].explode()
m = test1.eq(merged["uniprot_A_full"].loc[test1.index])
equal_mask = m.groupby(level=0).any()  # True when any candidate equals uniprot_A_full
equal_hits = merged.loc[equal_mask]
print(f"\tTotal rows where the provided UniProt ID is one of the ones with the exact right sequence: {_n_and_pct(equal_hits)}")

# Same comparison against the candidates whose sequence contains the provided one.
test1 = merged["uniprot_A_inseq"].explode()
m = test1.eq(merged["uniprot_A_full"].loc[test1.index])
in_mask = m.groupby(level=0).any()
in_hits = merged.loc[in_mask]
print(f"\tTotal rows where the provided UniProt ID is one of the ones which contains the provided sequence: {_n_and_pct(in_hits)}")

mask = equal_mask | in_mask
hits = merged.loc[mask]
print(f"\tTotal rows where the provided UniProt ID by IntAct is in the equal-seq or in-seq list: {_n_and_pct(hits)}")

misses = merged.loc[~mask]
print(f"\tTotal rows where the provided UniProt ID by IntAct is NOT in the equal-seq or in-seq list: {_n_and_pct(misses)}")

# Split the misses by whether an equal-sequence list exists for the row at all.
_no_candidates = merged["uniprot_A_equalseq"].isna()
misses = merged.loc[(~mask) & _no_candidates]
print(f"\t\tBecause there was no mapped uniprot at all: {_n_and_pct(misses)}")

misses = merged.loc[(~mask) & ~_no_candidates]
print(f"\t\tBecause there was no match: {_n_and_pct(misses)}")


Evaluating UniProt validity for partner A
	Total rows where the provided UniProt ID is one of the ones with the exact right sequence: 730953 (98.10%)
	Total rows where the provided UniProt ID is one of the ones which contains the provided sequence: 730959 (98.10%)
	Total rows where the provided UniProt ID by IntAct is in the equal-seq or in-seq list: 730959 (98.10%)
	Total rows where the provided UniProt ID by IntAct is NOT in the equal-seq or in-seq list: 14126 (1.90%)
		Because there was no mapped uniprot at all: 9818 (1.32%)
		Because there was no match: 4308 (0.58%)


In [187]:
# Partner B: is uniprot_B_full among the accessions whose idmap sequence equals /
# contains the IntAct sequence?
def _n_and_pct(frame):
    """Render a row count and its percentage of ``merged`` for the report lines below."""
    return f"{len(frame)} ({100*len(frame)/len(merged):.2f}%)"


print("Evaluating UniProt validity for partner B")

# One exploded row per candidate accession; the index still points at the source row.
test1 = merged["uniprot_B_equalseq"].explode()
m = test1.eq(merged["uniprot_B_full"].loc[test1.index])
equal_mask = m.groupby(level=0).any()  # True when any candidate equals uniprot_B_full
equal_hits = merged.loc[equal_mask]
print(f"\tTotal rows where the provided UniProt ID is one of the ones with the exact right sequence: {_n_and_pct(equal_hits)}")

# Same comparison against the candidates whose sequence contains the provided one.
test1 = merged["uniprot_B_inseq"].explode()
m = test1.eq(merged["uniprot_B_full"].loc[test1.index])
in_mask = m.groupby(level=0).any()
in_hits = merged.loc[in_mask]
print(f"\tTotal rows where the provided UniProt ID is one of the ones which contains the provided sequence: {_n_and_pct(in_hits)}")

mask = equal_mask | in_mask
hits = merged.loc[mask]
print(f"\tTotal rows where the provided UniProt ID by IntAct is in the equal-seq or in-seq list: {_n_and_pct(hits)}")

misses = merged.loc[~mask]
print(f"\tTotal rows where the provided UniProt ID by IntAct is NOT in the equal-seq or in-seq list: {_n_and_pct(misses)}")

# Split the misses by whether an equal-sequence list exists for the row at all.
_no_candidates = merged["uniprot_B_equalseq"].isna()
misses = merged.loc[(~mask) & _no_candidates]
print(f"\t\tBecause there was no mapped uniprot at all: {_n_and_pct(misses)}")

misses = merged.loc[(~mask) & ~_no_candidates]
print(f"\t\tBecause there was no match: {_n_and_pct(misses)}")


Evaluating UniProt validity for partner B
	Total rows where the provided UniProt ID is one of the ones with the exact right sequence: 728478 (97.77%)
	Total rows where the provided UniProt ID is one of the ones which contains the provided sequence: 728496 (97.77%)
	Total rows where the provided UniProt ID by IntAct is in the equal-seq or in-seq list: 728496 (97.77%)
	Total rows where the provided UniProt ID by IntAct is NOT in the equal-seq or in-seq list: 16589 (2.23%)
		Because there was no mapped uniprot at all: 12474 (1.67%)
		Because there was no match: 4115 (0.55%)


In [188]:
def _strip_to_canonical(ids):
    """Reduce a list of accessions to the distinct parts before the first "-"; floats (NaN) pass through."""
    if type(ids) == float:
        return ids
    return list({acc.split("-")[0] for acc in ids})


# Creation order kept: A_equalseq, A_inseq, B_equalseq, B_inseq.
for _list_col in ("uniprot_A_equalseq", "uniprot_A_inseq", "uniprot_B_equalseq", "uniprot_B_inseq"):
    merged[f"{_list_col}_canonical"] = merged[_list_col].apply(_strip_to_canonical)

In [189]:
# Now let's see if it's still the right canonical accession, for A and then for B.
for _side in ("A", "B"):
    print(f"Making sure UniProt {_side} is at least the right canonical even if it is the wrong isoform")

    # One exploded row per canonical candidate, compared with that row's suffix-free accession.
    test1 = merged[f"uniprot_{_side}_equalseq_canonical"].explode()
    m = test1.eq(merged[f"uniprot_{_side}_noisoforms"].loc[test1.index])
    equal_canonical_mask = m.groupby(level=0).any()  # collapse back to one flag per row

    equal_hits = merged.loc[equal_canonical_mask]
    print(f"\tTotal rows where the provided UniProt ID is the right canonical (but may not be the right isoform): {len(equal_hits)} ({100*len(equal_hits)/len(merged):.2f}%)")

    _no_candidates = merged[f"uniprot_{_side}_equalseq"].isna()

    misses = merged.loc[(~equal_canonical_mask) & _no_candidates]
    print(f"\tTotal rows where the provided UniProt ID does not exist: {len(misses)} ({100*len(misses)/len(merged):.2f}%)")

    misses = merged.loc[(~equal_canonical_mask) & ~_no_candidates]
    print(f"\tTotal rows where the provided UniProt ID exists and is not the right canonical: {len(misses)} ({100*len(misses)/len(merged):.2f}%)")

Making sure UniProt A is at least the right canonical even if it is the wrong isoform
	Total rows where the provided UniProt ID is the right canonical (but may not be the right isoform): 735261 (98.68%)
	Total rows where the provided UniProt ID does not exist: 9824 (1.32%)
	Total rows where the provided UniProt ID exists and is not the right canonical: 0 (0.00%)
Making sure UniProt B is at least the right canonical even if it is the wrong isoform
	Total rows where the provided UniProt ID is the right canonical (but may not be the right isoform): 732593 (98.32%)
	Total rows where the provided UniProt ID does not exist: 12492 (1.68%)
	Total rows where the provided UniProt ID exists and is not the right canonical: 0 (0.00%)


In [190]:
# Are there any rows whose equal-sequence matches span more than one canonical accession?
for _side in ("A", "B"):
    # Missing lists become "" so that len() is defined (and 0) for them.
    _n_canonicals = merged[f"uniprot_{_side}_equalseq_canonical"].fillna("").apply(len)
    test1 = len(merged.loc[_n_canonicals > 1]) == 0
    print(f"Each row only has ONE canonical uniprot that matches the provided sequence - partner {_side}: {test1}")

Each row only has ONE canonical uniprot that matches the provided sequence - partner A: True
Each row only has ONE canonical uniprot that matches the provided sequence - partner B: True


In [191]:
import re
from typing import List

def sort_isoforms(uniprotlist: List[str]) -> List[str]:
    """
    Sort UniProt-like isoform IDs so that, within each base ID, the order is:
      [ID]        (no suffix, if present)
      ID-0, ID-1, ID-2, ...
      ID-PRO_1, ID-PRO_2, ...
      (then any other suffixes, alphabetically)

    The base ID is everything before the first "-"; base IDs themselves are ordered by
    plain string comparison. Numeric and PRO_ suffixes are compared as integers, so
    "PRO_2" sorts before "PRO_10".

    Parameters
    ----------
    uniprotlist : list of str
        IDs to sort. The input is not modified.

    Returns
    -------
    list of str
        A new list with the same elements in sorted order.

    Examples
    --------
    ["P12345-PRO_2","P12345-1","P12345-0","P12345-PRO_10","P12345-2"]
      -> ["P12345-0","P12345-1","P12345-2","P12345-PRO_2","P12345-PRO_10"]

    ["Q9XYZ1","Q9XYZ1-2","Q9XYZ1-PRO_3","Q9XYZ1-1"]
      -> ["Q9XYZ1","Q9XYZ1-1","Q9XYZ1-2","Q9XYZ1-PRO_3"]
    """
    base_re = re.compile(r'^(?P<base>[^-]+)(?:-(?P<suffix>.+))?$')
    pro_re  = re.compile(r'^PRO_(\d+)$')

    def key(s: str):
        # Key layout: (base, suffix-kind rank, numeric value, alphabetical tiebreak).
        m = base_re.match(s)
        if not m:
            # Unparseable string (e.g. empty or starting with "-"): it is ordered by its
            # own text among the base IDs, with the highest rank.
            return (s, 99, float('inf'), s)

        base = m.group('base')
        suf  = m.group('suffix')

        if suf is None or suf == "":
            # Bare ID first within its base group
            return (base, 0, -1, "")

        # Numeric isoform? isdecimal() (not isdigit()) so that only characters int()
        # can actually parse take this branch; e.g. "²" is a digit but int("²") raises.
        if suf.isdecimal():
            return (base, 1, int(suf), "")

        # Processed/proteoform like PRO_#
        pm = pro_re.match(suf)
        if pm:
            return (base, 2, int(pm.group(1)), "")

        # Anything else: keep but alphabetical at the end of its base group
        return (base, 3, float('inf'), suf)

    return sorted(uniprotlist, key=key)


In [192]:
# Keep the IntAct-provided accessions under *_intact names.
_intact_names = {_col: f"{_col}_intact" for _col in ("uniprot_A", "uniprot_B")}
merged = merged.rename(columns=_intact_names)

In [193]:
def _sorted_or_nan(ids):
    """Sort an accession list with sort_isoforms; float entries (NaN) pass through."""
    return ids if type(ids) == float else sort_isoforms(ids)


def _first_or_nan(ids):
    """Return the first element of an accession list; float entries (NaN) pass through."""
    return ids if type(ids) == float else ids[0]


# Put every accession list into isoform order ...
for _list_col in ("uniprot_A_equalseq", "uniprot_B_equalseq", "uniprot_A_inseq", "uniprot_B_inseq"):
    merged[_list_col] = merged[_list_col].apply(_sorted_or_nan)

# ... and take the first equal-sequence accession as the interactor's accession.
for _side in ("A", "B"):
    merged[f"uniprot_{_side}"] = merged[f"uniprot_{_side}_equalseq"].apply(_first_or_nan)

In [194]:
# uniprot_A / uniprot_B were just reassigned, so the columns derived from them
# (pair keys and suffix-free accessions) have to be recomputed.
def _before_first_dash(acc):
    """Return the part of a string accession before its first "-"; non-strings pass through."""
    if acc is not None and type(acc) is str:
        return acc.split("-")[0]
    return acc


merged["unique_uniprot_pair"] = merged.apply(
    lambda rec: get_unique_id(rec, colA="uniprot_A", colB="uniprot_B"),
    axis=1,
)
for _side in ("A", "B"):
    merged[f"uniprot_{_side}_noisoforms"] = merged[f"uniprot_{_side}"].apply(_before_first_dash)
merged["unique_uniprot_noisoforms_pair"] = merged.apply(
    lambda rec: get_unique_id(rec, colA="uniprot_A_noisoforms", colB="uniprot_B_noisoforms"),
    axis=1,
)

In [195]:
# See how often the chosen accession is isoform 0 versus another one, and why some
# rows have no accession at all; the same report is printed for A and then for B.
for _side in ("A", "B"):
    _chosen = merged[f"uniprot_{_side}"]
    _from_intact = merged[f"uniprot_{_side}_intact"]

    print(f"More interactor {_side} checks")

    # Float entries (NaN) never count as ending in "-0".
    _ends_zero = _chosen.apply(lambda acc: type(acc) != float and acc.endswith("-0"))
    test1 = merged.loc[(_chosen.notna()) & _ends_zero]
    print(f"\tTotal rows where interactor {_side} sequence matches isoform 0 (canonical) of its corresponding uniprot: {len(test1)} ({100*len(test1)/len(merged):.2f}%)")

    _ends_other = _chosen.apply(lambda acc: type(acc) != float and not acc.endswith("-0"))
    test1 = merged.loc[(_chosen.notna()) & _ends_other]
    print(f"\tTotal rows where interactor {_side} sequence does NOT match isoform 0 (canonical) of its corresponding uniprot: {len(test1)} ({100*len(test1)/len(merged):.2f}%)")

    test1 = merged.loc[_chosen.isna()]
    print(f"\tTotal rows where there is no corresponding uniprot: {len(test1)} ({100*len(test1)/len(merged):.2f}%)")

    # Split the missing rows by whether IntAct supplied an accession in the first place.
    test1 = merged.loc[(_chosen.isna()) & (_from_intact.notna())]
    print(f"\t\tBecause the IntAct-provided Uniprot could not be mapped: {len(test1)} ({100*len(test1)/len(merged):.2f}%)")

    test1 = merged.loc[(_chosen.isna()) & (_from_intact.isna())]
    print(f"\t\tBecause there was no IntAct-provided Uniprot: {len(test1)} ({100*len(test1)/len(merged):.2f}%)")




More interactor A checks
	Total rows where interactor A sequence matches isoform 0 (canonical) of its corresponding uniprot: 689490 (92.54%)
	Total rows where interactor A sequence does NOT match isoform 0 (canonical) of its corresponding uniprot: 45771 (6.14%)
	Total rows where there is no corresponding uniprot: 9824 (1.32%)
		Because the IntAct-provided Uniprot could not be mapped: 3266 (0.44%)
		Because there was no IntAct-provided Uniprot: 6558 (0.88%)
More interactor B checks
	Total rows where interactor B sequence matches isoform 0 (canonical) of its corresponding uniprot: 688920 (92.46%)
	Total rows where interactor B sequence does NOT match isoform 0 (canonical) of its corresponding uniprot: 43673 (5.86%)
	Total rows where there is no corresponding uniprot: 12492 (1.68%)
		Because the IntAct-provided Uniprot could not be mapped: 3001 (0.40%)
		Because there was no IntAct-provided Uniprot: 9491 (1.27%)


In [196]:
# Now let's map back in the UniProt gene name via the suffix-free accession.
# The "uniprotkb:" prefix is added only where it is missing, the same rule used when
# canonical_uniprot is built for the isoform merge, so an accession that already carries
# the prefix is not double-prefixed (which would silently leave it unmapped).
_canonical_acc = idmap_merge["canonical_uniprotkb"]
_canonical_keys = _canonical_acc.where(
    _canonical_acc.str.startswith("uniprotkb:", na=False),
    "uniprotkb:" + _canonical_acc,
)
canonical_uniprot_gene_name_map = dict(zip(
    _canonical_keys, idmap_merge["uniprot_gene_name"]
))
merged["uniprot_gene_name_A"] = merged["uniprot_A_noisoforms"].map(canonical_uniprot_gene_name_map)
merged["uniprot_gene_name_B"] = merged["uniprot_B_noisoforms"].map(canonical_uniprot_gene_name_map)

# Does everything get mapped? Same three counts for A and then for B.
for _side in ("A", "B"):
    _has_uniprot = merged[f"uniprot_{_side}"].notna()
    _has_gene_name = merged[f"uniprot_gene_name_{_side}"].notna()

    print(f"Interactor {_side} gene name checks")

    test1 = merged.loc[_has_uniprot & _has_gene_name]
    print(f"\tTotal rows where interactor {_side} has a uniprot and a gene name: {len(test1)} ({100*len(test1)/len(merged):.2f}%)")

    test1 = merged.loc[_has_uniprot & ~_has_gene_name]
    print(f"\tTotal rows where interactor {_side} has a uniprot and no gene name: {len(test1)} ({100*len(test1)/len(merged):.2f}%)")

    test1 = merged.loc[~_has_uniprot]
    print(f"\tTotal rows where there is no corresponding uniprot: {len(test1)} ({100*len(test1)/len(merged):.2f}%)")


Interactor A gene name checks
	Total rows where interactor A has a uniprot and a gene name: 735261 (98.68%)
	Total rows where interactor A has a uniprot and no gene name: 0 (0.00%)
	Total rows where there is no corresponding uniprot: 9824 (1.32%)
Interactor B gene name checks
	Total rows where interactor B has a uniprot and a gene name: 732593 (98.32%)
	Total rows where interactor B has a uniprot and no gene name: 0 (0.00%)
	Total rows where there is no corresponding uniprot: 12492 (1.68%)


In [197]:
# Investigate why some seq_pair_id values carry more than one unique_id.
# Are there cases where the accession pairs differ even after ignoring isoforms?
def _distinct(values):
    """Collapse a group's values into a set."""
    return set(values)


_base_aggs = {
    "unique_A": ("ID(s) interactor A", _distinct),
    "unique_B": ("ID(s) interactor B", _distinct),
    "unique_ids": ("unique_id", _distinct),
    "unique_miscores": ("miscore", _distinct),
}
gb = merged.groupby("seq_pair_id").agg(**_base_aggs).reset_index()

# Sequence pairs that map to several unique_ids.
_several_ids = gb["unique_ids"].apply(len) > 1
dups_diff_ids = gb.loc[_several_ids]["seq_pair_id"].tolist()

# One row per (unique_id, seq_pair_id) among those pairs, in a stable order.
test1 = merged.loc[merged["seq_pair_id"].isin(dups_diff_ids)]
test1 = test1.drop_duplicates(subset=["unique_id", "seq_pair_id"])
test1 = test1.sort_values(by=["seq_pair_id", "unique_id"])

_detail_aggs = {
    "unique_A": ("ID(s) interactor A", _distinct),
    "unique_B": ("ID(s) interactor B", _distinct),
    "uniprot_genenames_A": ("uniprot_gene_name_A", _distinct),
    "uniprot_genenames_B": ("uniprot_gene_name_B", _distinct),
    "unique_ids": ("unique_id", _distinct),
    "unique_miscores": ("miscore", _distinct),
    "unique_uniprot_pairs": ("unique_uniprot_pair", _distinct),
    "unique_uniprot_noiso1_pairs": ("unique_uniprot_noiso1_pair", _distinct),
    "unique_uniprot_noisoforms_pairs": ("unique_uniprot_noisoforms_pair", _distinct),
}
test1 = test1.groupby("seq_pair_id").agg(**_detail_aggs).reset_index()

# Keep only sequence pairs with several accession pairs both with and without isoforms.
_several_pairs = test1["unique_uniprot_pairs"].apply(len) > 1
_several_noiso_pairs = test1["unique_uniprot_noisoforms_pairs"].apply(len) > 1
test1 = test1.loc[_several_pairs & _several_noiso_pairs].reset_index(drop=True)

In [198]:
# Write the sequence pairs with conflicting accession pairs to disk for manual review.
_export_cols = [
    "seq_pair_id",
    "unique_uniprot_pairs",
    "unique_uniprot_noiso1_pairs",
    "unique_uniprot_noisoforms_pairs",
    "uniprot_genenames_A",
    "uniprot_genenames_B",
]
test1[_export_cols].to_csv("diff_uniprot_same_sequences.csv", index=False)

# Consider PTMs, mutations, and bind sites

In [199]:
# Now to read bindsites we have to bump the csv field size limit.
# Start from a big number (sys.maxsize may OverflowError on some platforms) and
# divide by ten until the platform accepts it.
limit = 10**9
while True:
    try:
        csv.field_size_limit(limit)
    except OverflowError:
        limit //= 10
    else:
        break

## PTMs

In [200]:
# Load the PTM feature table (tab-separated) into `ptms`.
# NOTE(review): engine="python" is presumably chosen so that the csv field size limit
# raised earlier in the notebook applies to this read — confirm.
import pandas as pd
ptms_path = "data_files/raw/intact/psimitab/features/ptms.tsv"
ptms = pd.read_csv(ptms_path, sep="\t", engine="python")

In [201]:
# Give every string accession the "intact:" prefix; non-string values (e.g. NaN) pass through
ptms["Interaction AC"] = ptms["Interaction AC"].apply(
    lambda ac: ac if (type(ac) != str or ac.startswith("intact:")) else "intact:" + ac
)
ptm_interact_acs = ptms["Interaction AC"].dropna().unique().tolist()
ptm_mask = calc_feature_mask(merged, ptm_interact_acs)

# Reusable row masks for the sanity checks below
ptm_ac_present = ptms["Interaction AC"].notna()
ptm_ac_has_ebi = ptms["Interaction AC"].fillna("").str.contains("EBI-")
ptm_ac_ebi_count = ptms["Interaction AC"].fillna("").str.count("EBI-")

test1 = len(ptms.loc[ptm_ac_present & ~ptm_ac_has_ebi]) == 0
print(f"\tAll rows of the PTMs dataframe either have no interaction ID, or they have one in intact:EBI- format: {test1}")
test1 = len(ptms.loc[ptm_ac_present & (ptm_ac_ebi_count > 1)]) == 0
print(f"\tNo rows of the PTMs dataframe have >1 intact:EBI- ID: {test1}")
test1 = len(ptms.loc[ptm_ac_present & (ptm_ac_ebi_count == 1)])
print(f"\tTotal rows of PTMs with one intact:EBI- ID: {test1} ({100*test1/len(ptms):.2f}%)")
test1 = len(ptms.loc[~ptm_ac_present])
print(f"\tTotal rows of PTMs with no interaction ID: {test1} ({100*test1/len(ptms):.2f}%)")

	All rows of the PTMs dataframe either have no interaction ID, or they have one in intact:EBI- format: True
	No rows of the PTMs dataframe have >1 intact:EBI- ID: True
	Total rows of PTMs with one intact:EBI- ID: 10283 (99.97%)
	Total rows of PTMs with no interaction ID: 3 (0.03%)


## Mutations

In [202]:
mutations_path = "data_files/raw/intact/psimitab/features/mutations.tsv"
mutations = pd.read_csv(mutations_path, sep="\t", engine="python")

In [203]:
# Give every string accession the "intact:" prefix; non-string values (e.g. NaN) pass through
mutations["Interaction AC"] = mutations["Interaction AC"].apply(
    lambda ac: ac if (type(ac) != str or ac.startswith("intact:")) else "intact:" + ac
)
mutation_interact_acs = mutations["Interaction AC"].dropna().unique().tolist()
mutation_mask = calc_feature_mask(merged, mutation_interact_acs)

# Reusable row masks for the sanity checks below
mutation_ac_present = mutations["Interaction AC"].notna()
mutation_ac_has_ebi = mutations["Interaction AC"].fillna("").str.contains("EBI-")
mutation_ac_ebi_count = mutations["Interaction AC"].fillna("").str.count("EBI-")

test1 = len(mutations.loc[mutation_ac_present & ~mutation_ac_has_ebi]) == 0
print(f"\tAll rows of the Mutations dataframe either have no interaction ID, or they have one in intact:EBI- format: {test1}")
test1 = len(mutations.loc[mutation_ac_present & (mutation_ac_ebi_count > 1)]) == 0
print(f"\tNo rows of the Mutations dataframe have >1 intact:EBI- ID: {test1}")
test1 = len(mutations.loc[mutation_ac_present & (mutation_ac_ebi_count == 1)])
print(f"\tTotal rows of Mutations with one intact:EBI- ID: {test1} ({100*test1/len(mutations):.2f}%)")
test1 = len(mutations.loc[~mutation_ac_present])
print(f"\tTotal rows of Mutations with no interaction ID: {test1} ({100*test1/len(mutations):.2f}%)")

	All rows of the Mutations dataframe either have no interaction ID, or they have one in intact:EBI- format: True
	No rows of the Mutations dataframe have >1 intact:EBI- ID: True
	Total rows of Mutations with one intact:EBI- ID: 83620 (99.97%)
	Total rows of Mutations with no interaction ID: 24 (0.03%)


## Binding sites

In [204]:
bindsites_path = "data_files/raw/intact/psimitab/features/bindings_regions.tsv"            
bindsites = pd.read_csv(bindsites_path, sep="\t", engine="python")

In [205]:
# Give every string accession the "intact:" prefix; non-string values (e.g. NaN) pass through
bindsites["Interaction AC"] = bindsites["Interaction AC"].apply(
    lambda ac: ac if (type(ac) != str or ac.startswith("intact:")) else "intact:" + ac
)
bindsite_interact_acs = bindsites["Interaction AC"].dropna().unique().tolist()
bindside_mask = calc_feature_mask(merged, bindsite_interact_acs)

# Reusable row masks for the sanity checks below
bindsite_ac_present = bindsites["Interaction AC"].notna()
bindsite_ac_has_ebi = bindsites["Interaction AC"].fillna("").str.contains("EBI-")
bindsite_ac_ebi_count = bindsites["Interaction AC"].fillna("").str.count("EBI-")

test1 = len(bindsites.loc[bindsite_ac_present & ~bindsite_ac_has_ebi]) == 0
print(f"\tAll rows of the Binding Sites dataframe either have no interaction ID, or they have one in intact:EBI- format: {test1}")
test1 = len(bindsites.loc[bindsite_ac_present & (bindsite_ac_ebi_count > 1)]) == 0
print(f"\tNo rows of the Binding Sites dataframe have >1 intact:EBI- ID: {test1}")
test1 = len(bindsites.loc[bindsite_ac_present & (bindsite_ac_ebi_count == 1)])
print(f"\tTotal rows of Binding Sites with one intact:EBI- ID: {test1} ({100*test1/len(bindsites):.2f}%)")
test1 = len(bindsites.loc[~bindsite_ac_present])
print(f"\tTotal rows of Binding Sites with no interaction ID: {test1} ({100*test1/len(bindsites):.2f}%)")

	All rows of the Binding Sites dataframe either have no interaction ID, or they have one in intact:EBI- format: True
	No rows of the Binding Sites dataframe have >1 intact:EBI- ID: True
	Total rows of Binding Sites with one intact:EBI- ID: 205542 (99.94%)
	Total rows of Binding Sites with no interaction ID: 118 (0.06%)


## Define notable PTM, mutation, and binding-site features

In [206]:
# Dump the distinct feature annotations / types / accessions of the PTM, mutation and
# binding-site tables, plus the interactor and interaction annotations of `merged`,
# to one-value-per-line text files under `feature_folder`.
feature_folder = "data_files/processed/intact/features/"
os.makedirs(feature_folder, exist_ok=True)


def _distinct_values(series):
    """Return the distinct non-null values of ``series`` in first-seen order."""
    return series.dropna().unique().tolist()


def _distinct_tokens(values):
    """Split each string on ``|`` and ``,`` and return the distinct non-empty tokens.

    The result is sorted so that the written files are identical from run to run
    (plain ``list(set(...))`` order depends on string hash randomisation).
    """
    tokens = set()
    for value in values:
        for piece in value.split("|"):
            tokens.update(piece.split(","))
    tokens.discard("")
    return sorted(tokens)


def _without_ebi_prefix(accessions):
    """Drop the accessions that start with ``EBI``."""
    return [ac for ac in accessions if not ac.startswith("EBI")]


def _write_lines(filename, lines):
    """Write ``lines`` newline-separated to ``filename`` inside ``feature_folder``."""
    # Values are computed before this is called, so a failure upstream no longer
    # leaves a truncated, empty file behind.
    with open(os.path.join(feature_folder, filename), "w", encoding="utf-8") as f:
        f.write("\n".join(lines))


# ptm
ptm_features = _distinct_values(ptms["Feature annotation(s)"])
_write_lines("ptm_feature_annotations.txt", ptm_features)
ptm_feature_types = _distinct_values(ptms["Feature type"])
_write_lines("ptm_feature_types.txt", ptm_feature_types)
ptm_feature_ac = _without_ebi_prefix(_distinct_values(ptms["# Feature AC"]))
_write_lines("ptm_feature_ac.txt", ptm_feature_ac)

# mutation
mutation_features = _distinct_values(mutations["Feature annotation(s)"])
_write_lines("mutation_feature_annotations.txt", mutation_features)
mutation_types = _distinct_values(mutations["Feature type"])
_write_lines("mutation_feature_types.txt", mutation_types)
mutation_feature_ac = _without_ebi_prefix(_distinct_values(mutations["# Feature AC"]))
_write_lines("mutation_feature_ac.txt", mutation_feature_ac)

# binding site (the feature type cell can hold several |- or ,-separated values)
bindsite_feature_types = _distinct_tokens(_distinct_values(bindsites["Feature type"]))
_write_lines("bindsite_feature_types.txt", bindsite_feature_types)
bindsite_features = _distinct_values(bindsites["Feature annotation(s)"])
_write_lines("bindsite_feature_annotations.txt", bindsite_features)
bindsite_ac = _without_ebi_prefix(_distinct_values(bindsites["# Feature AC"]))
_write_lines("bindsite_feature_ac.txt", bindsite_ac)

# annotations attached to either interactor
annotations_interactor_A = _distinct_values(merged["Annotation(s) interactor A"])
annotations_interactor_B = _distinct_values(merged["Annotation(s) interactor B"])
annotations_interactors = _distinct_tokens(annotations_interactor_A + annotations_interactor_B)
_write_lines("annotations_interactors.txt", annotations_interactors)

# annotations attached to the interaction itself
annotations = _distinct_tokens(_distinct_values(merged["Interaction annotation(s)"]))
_write_lines("annotations_interactions.txt", annotations)

In [207]:
# Load the labelled ("*_analyzed.csv") versions of the feature vocabularies.
# NOTE(review): this is an absolute, machine-specific path, unlike the relative
# "data_files/..." paths used by the other cells — confirm whether it should be
# relative as well.
# NOTE(review): presumably these CSVs are annotated copies of the text files written
# to data_files/processed/intact/features/ — verify, the link is not visible here.
analyzed_mods_dir = "/scratch/pranamlab/sophie/interactome/interactome/data_files/processed/intact/features_analyzed"
bindsite_types_labeled = pd.read_csv(f"{analyzed_mods_dir}/bindsite_types_analyzed.csv")
mutation_feature_ac_labeled = pd.read_csv(f"{analyzed_mods_dir}/mutation_feature_ac_analyzed.csv")
mutation_feature_annotations_labeled = pd.read_csv(f"{analyzed_mods_dir}/mutation_feature_annotations_analyzed.csv")
mutation_feature_types_labeled = pd.read_csv(f"{analyzed_mods_dir}/mutation_feature_types_analyzed.csv")
ptm_feature_types_labeled = pd.read_csv(f"{analyzed_mods_dir}/ptm_feature_types_analyzed.csv")
ptm_feature_annotations_labeled = pd.read_csv(f"{analyzed_mods_dir}/ptm_feature_annotations_analyzed.csv")

# Get these PTM, binding site, and mutation features back into merged

In [208]:
print("Checking on interaction ID status of merged dataframe")
merged_interaction_ids = merged["Interaction identifier(s)"]

# Rows with no identifier at all
rows_without_id = merged.loc[merged_interaction_ids.isna()]
test1 = len(rows_without_id) == 0
print(f"\tAll interactions have at least one identifier: {test1}")

# Rows whose identifier string contains no "intact:EBI-" entry
rows_without_ebi = merged.loc[merged_interaction_ids.str.count("intact:EBI-") < 1]
test1 = len(rows_without_ebi) == 0
print(f"\tAll interactions have at least one intact:EBI- formatted identifier: {test1}")


Checking on interaction ID status of merged dataframe
	All interactions have at least one identifier: True
	All interactions have at least one intact:EBI- formatted identifier: True


In [209]:
print("Checking on interaction ID status of merged_neg dataframe")
neg_interaction_ids = merged_neg["Interaction identifier(s)"]

# Rows with no identifier at all
neg_rows_without_id = merged_neg.loc[neg_interaction_ids.isna()]
test1 = len(neg_rows_without_id) == 0
print(f"\tAll interactions have at least one identifier: {test1}")

# Rows whose identifier string contains no "intact:EBI-" entry
neg_rows_without_ebi = merged_neg.loc[neg_interaction_ids.str.count("intact:EBI-") < 1]
test1 = len(neg_rows_without_ebi) == 0
print(f"\tAll interactions have at least one intact:EBI- formatted identifier: {test1}")

# Rows whose per-row IntAct ID column still holds more than one "intact:EBI-" entry
neg_rows_multi_ebi = merged_neg.loc[merged_neg["interaction_intactid"].str.count("intact:EBI-") > 1]
test1 = len(neg_rows_multi_ebi)
print(f"\tTotal rows with >1 intact:EBI- formatted identifier: {test1}/{len(merged_neg)}")

Checking on interaction ID status of merged_neg dataframe
	All interactions have at least one identifier: True
	All interactions have at least one intact:EBI- formatted identifier: True
	Total rows with >1 intact:EBI- formatted identifier: 0/970


In [210]:
merged.loc[
    merged["Interaction identifier(s)"].str.count("intact:EBI-")>1
].sort_values(by="Interaction identifier(s)")[["unique_id","uniprot_A","uniprot_B","Interaction identifier(s)"]]

Unnamed: 0,unique_id,uniprot_A,uniprot_B,Interaction identifier(s)
54,intact:EBI-1000337_intact:EBI-1000342,uniprotkb:Q8AWF5-0,uniprotkb:Q8AWF4-0,intact:EBI-1000385|intact:EBI-1000421|intact:E...
56,intact:EBI-1000337_intact:EBI-1000342,uniprotkb:Q8AWF5-0,uniprotkb:Q8AWF4-0,intact:EBI-1000385|intact:EBI-1000421|intact:E...
57,intact:EBI-1000337_intact:EBI-1000342,uniprotkb:Q8AWF5-0,uniprotkb:Q8AWF4-0,intact:EBI-1000385|intact:EBI-1000421|intact:E...
58,intact:EBI-1000337_intact:EBI-1000342,uniprotkb:Q8AWF5-0,uniprotkb:Q8AWF4-0,intact:EBI-1000385|intact:EBI-1000421|intact:E...
55,intact:EBI-1000337_intact:EBI-1000342,uniprotkb:Q8AWF5-0,uniprotkb:Q8AWF4-0,intact:EBI-1000385|intact:EBI-1000421|intact:E...
...,...,...,...,...
745076,intact:EBI-999900_intact:EBI-999909,uniprotkb:Q8NBT2-0,uniprotkb:Q9HBM1-0,intact:EBI-999899|intact:EBI-999976|intact:EBI...
745074,intact:EBI-999900_intact:EBI-999909,uniprotkb:Q8NBT2-0,uniprotkb:Q9HBM1-0,intact:EBI-999899|intact:EBI-999976|intact:EBI...
745083,intact:EBI-999900_intact:EBI-999909,uniprotkb:Q8NBT2-0,uniprotkb:Q9HBM1-0,intact:EBI-999899|intact:EBI-999976|intact:EBI...
745078,intact:EBI-999900_intact:EBI-999909,uniprotkb:Q8NBT2-0,uniprotkb:Q9HBM1-0,intact:EBI-999899|intact:EBI-999976|intact:EBI...


In [211]:
merged_neg.loc[
    merged_neg["Interaction identifier(s)"].str.count("intact:EBI-")>1
].sort_values(by="Interaction identifier(s)")[["unique_id","uniprot_A","uniprot_B","Interaction identifier(s)"]]

Unnamed: 0,unique_id,uniprot_A,uniprot_B,Interaction identifier(s)
123,intact:EBI-10898504_intact:EBI-373586,uniprotkb:P49841-0,uniprotkb:O75952-0,intact:EBI-10900933|intact:EBI-10901155
124,intact:EBI-10898504_intact:EBI-373586,uniprotkb:P49841-0,uniprotkb:O75952-0,intact:EBI-10900933|intact:EBI-10901155
133,intact:EBI-1100098_intact:EBI-1100119,uniprotkb:Q0DJ33-0,uniprotkb:Q40687-0,intact:EBI-1100306|intact:EBI-1100357
134,intact:EBI-1100098_intact:EBI-1100119,uniprotkb:Q0DJ33-0,uniprotkb:Q40687-0,intact:EBI-1100306|intact:EBI-1100357
766,intact:EBI-347088_intact:EBI-432545,uniprotkb:P63104-0,uniprotkb:Q14103-4,intact:EBI-1107667|intact:EBI-1107748
...,...,...,...,...
665,intact:EBI-1760079_intact:EBI-9247467,uniprotkb:Q02410-0,uniprotkb:Q9NRW1-0,intact:EBI-8840419|intact:EBI-8840571|intact:E...
664,intact:EBI-1760079_intact:EBI-9247467,uniprotkb:Q02410-0,uniprotkb:Q9NRW1-0,intact:EBI-8840419|intact:EBI-8840571|intact:E...
663,intact:EBI-1760079_intact:EBI-9247467,uniprotkb:Q02410-0,uniprotkb:Q9NRW1-0,intact:EBI-8840419|intact:EBI-8840571|intact:E...
957,intact:EBI-78473_intact:EBI-9080418,uniprotkb:P03372-0,uniprotkb:Q16881-5,intact:EBI-9080432|intact:EBI-9080610


In [212]:
# Scraped mutation columns: the same seven fields for interactor 1, then for interactor 2
scraped_mut_cols = [
    f"mutation_{field}_{side}"
    for side in (1, 2)
    for field in ("mi", "name", "short", "begin", "end", "orig", "new")
]

In [213]:
# Make sure the following columns are not in list format but are instead pipe-separated
list_cols = ['uniprot_A_equalseq', 'uniprot_B_equalseq', 'uniprot_A_inseq', 'uniprot_B_inseq', 'uniprot_A_equalseq_canonical', 'uniprot_A_inseq_canonical', 'uniprot_B_equalseq_canonical', 'uniprot_B_inseq_canonical']


def _pipe_join_if_list(value):
    """Join a list with "|"; return any other value unchanged."""
    if type(value) == list:
        return "|".join(value)
    return value


for c in list_cols:
    for frame in (merged, merged_neg):
        frame[c] = frame[c].apply(_pipe_join_if_list)

In [214]:
# Work on a deep copy so the mutation explosion below leaves `merged` untouched
merged_expl = merged.copy(deep=True)
print(f"Length of merged: {len(merged_expl)}. Merged is already exploded by IntAct interaction identifier(s)")

# Confirm once more that each row carries exactly one interaction identifier
rows_missing_intactid = merged_expl.loc[merged_expl["interaction_intactid"].isna()]
test1 = len(rows_missing_intactid) == 0
print(f"\tAll rows have one intact interaction ID after exploding: {test1}")

rows_multi_intactid = merged_expl.loc[merged_expl["interaction_intactid"].str.count("EBI-") > 1]
test1 = len(rows_multi_intactid) == 0
print(f"\tNo rows have >1 intact interaction ID after exploding: {test1}")

Length of merged: 745085. Merged is already exploded by IntAct interaction identifier(s)
	All rows have one intact interaction ID after exploding: True
	No rows have >1 intact interaction ID after exploding: True


In [215]:
# Deep copy of the negative set; the checks and the explode below act on the copy only.
merged_neg_expl = merged_neg.copy(deep=True)
print(f"Length of merged_neg_expl: {len(merged_neg_expl)}. Merged is already exploded by IntAct interaction identifier(s)")
# confirm just one more time that there is only one interaction identifier per row
# NOTE(review): these three checks run BEFORE the explode further down, although the
# printed messages say "after exploding".
test1 = len(merged_neg_expl.loc[merged_neg_expl["interaction_intactid"].isna()])==0
print(f"\tAll rows have at least one intact interaction ID after exploding: {test1}")
test1 = len(merged_neg_expl.loc[merged_neg_expl["interaction_intactid"].str.count("EBI-")>1])==0
print(f"\tNo rows have >1 intact interaction ID after exploding: {test1}")
test1 = len(merged_neg_expl.loc[merged_neg_expl["interaction_intactid"].str.count("EBI-")>1])
print(f"\t\tTotal rows with >1 intact interaction ID after: {test1}/{len(merged_neg_expl)} ({100*test1/len(merged_neg_expl):.2f}%)")

# since we're going to match on interaction ID we need to expand
# NOTE(review): the split is on "," whereas "Interaction identifier(s)" is pipe-separated
# in the displayed rows — confirm "," is the right delimiter for interaction_intactid.
# x.split raises AttributeError on a non-string value (e.g. NaN).
merged_neg_expl["interaction_intactid"] = merged_neg_expl["interaction_intactid"].apply(lambda x: x.split(","))
merged_neg_expl = merged_neg_expl.explode("interaction_intactid").reset_index(drop=True)
print(f"Length after exploding: {len(merged_neg_expl)}")
# Same ">1 ID" check again, this time on the exploded frame
test1 = len(merged_neg_expl.loc[merged_neg_expl["interaction_intactid"].str.count("EBI-")>1])==0
print(f"\tNo rows have >1 intact interaction ID after exploding: {test1}")

Length of merged_neg_expl: 970. Merged is already exploded by IntAct interaction identifier(s)
	All rows have at least one intact interaction ID after exploding: True
	No rows have >1 intact interaction ID after exploding: True
		Total rows with >1 intact interaction ID after: 0/970 (0.00%)
Length after exploding: 970
	No rows have >1 intact interaction ID after exploding: True


In [216]:
# Flag rows that carry any scraped mutation value: overall, then per interactor side
merged_expl["scraped_mut_has_info"] = merged_expl[scraped_mut_cols].notna().any(axis=1)
for side in ("1", "2"):
    side_cols = [col for col in scraped_mut_cols if col.endswith(f"_{side}")]
    merged_expl[f"scraped_mut_has_info_{side}"] = merged_expl[side_cols].notna().any(axis=1)

has_both_sides = (
    merged_expl["scraped_mut_has_info"]
    & merged_expl["scraped_mut_has_info_1"]
    & merged_expl["scraped_mut_has_info_2"]
)
test1 = len(merged_expl.loc[has_both_sides, scraped_mut_cols])
print(f"Total rows with mutation entries for both interactor 1 and interactor 2: {test1}/{len(merged_expl)} ({100*test1/len(merged_expl):.2f}%)")

has_any_side = merged_expl["scraped_mut_has_info"]
test1 = len(merged_expl.loc[has_any_side, scraped_mut_cols])
print(f"Total rows with mutation entries for either interactor 1 or interactor 2: {test1}/{len(merged_expl)} ({100*test1/len(merged_expl):.2f}%)")

Total rows with mutation entries for both interactor 1 and interactor 2: 1567/745085 (0.21%)
Total rows with mutation entries for either interactor 1 or interactor 2: 37655/745085 (5.05%)


In [217]:
# Flag rows that carry any scraped mutation value: overall, then per interactor side
merged_neg_expl["scraped_mut_has_info"] = merged_neg_expl[scraped_mut_cols].notna().any(axis=1)
for side in ("1", "2"):
    neg_side_cols = [col for col in scraped_mut_cols if col.endswith(f"_{side}")]
    merged_neg_expl[f"scraped_mut_has_info_{side}"] = merged_neg_expl[neg_side_cols].notna().any(axis=1)

neg_has_both_sides = (
    merged_neg_expl["scraped_mut_has_info"]
    & merged_neg_expl["scraped_mut_has_info_1"]
    & merged_neg_expl["scraped_mut_has_info_2"]
)
test1 = len(merged_neg_expl.loc[neg_has_both_sides, scraped_mut_cols])
print(f"Total rows with mutation entries for both interactor 1 and interactor 2: {test1}/{len(merged_neg_expl)} ({100*test1/len(merged_neg_expl):.2f}%)")

neg_has_any_side = merged_neg_expl["scraped_mut_has_info"]
test1 = len(merged_neg_expl.loc[neg_has_any_side, scraped_mut_cols])
print(f"Total rows with mutation entries for either interactor 1 or interactor 2: {test1}/{len(merged_neg_expl)} ({100*test1/len(merged_neg_expl):.2f}%)")

Total rows with mutation entries for both interactor 1 and interactor 2: 0/970 (0.00%)
Total rows with mutation entries for either interactor 1 or interactor 2: 12/970 (1.24%)


In [218]:
# Per interactor side: rows without scraped mutation info get False; otherwise the flag is
# whatever verify_equal_feature_lengths returns for that row and side.
# NOTE(review): verify_equal_feature_lengths is defined outside this section — presumably it
# checks that the pipe-separated mutation fields of one side all have the same number of
# entries; confirm against its definition.
merged_expl["mutations_expandable_1"] = merged_expl.apply(lambda row: verify_equal_feature_lengths(row, scraped_mut_cols, interactor=1) if row["scraped_mut_has_info_1"] else False, axis=1)
merged_expl["mutations_expandable_2"] = merged_expl.apply(lambda row: verify_equal_feature_lengths(row, scraped_mut_cols, interactor=2) if row["scraped_mut_has_info_2"] else False, axis=1)

In [219]:
# Per interactor side: rows without scraped mutation info get False; otherwise the flag is
# whatever verify_equal_feature_lengths returns for that row and side.
# NOTE(review): verify_equal_feature_lengths is defined outside this section — presumably it
# checks that the pipe-separated mutation fields of one side all have the same number of
# entries; confirm against its definition.
merged_neg_expl["mutations_expandable_1"] = merged_neg_expl.apply(lambda row: verify_equal_feature_lengths(row, scraped_mut_cols, interactor=1) if row["scraped_mut_has_info_1"] else False, axis=1)
merged_neg_expl["mutations_expandable_2"] = merged_neg_expl.apply(lambda row: verify_equal_feature_lengths(row, scraped_mut_cols, interactor=2) if row["scraped_mut_has_info_2"] else False, axis=1)

In [220]:
# Report rows whose mutation info exists but could not be marked expandable.
# mutations_expandable_N is set to False whenever scraped_mut_has_info_N is False, so the
# two flags can only differ when info exists (True) and the row is not expandable (False).
test1 = len(merged_expl.loc[
    (merged_expl["scraped_mut_has_info_1"] != merged_expl["mutations_expandable_1"])
])
print(f"Total rows where interactor 1 mutation info exists but is not expandable: {test1}/{len(merged_expl)} ({100*test1/len(merged_expl):.2f}%)")
# Among those rows, none may have BOTH the original and the new residue present.
# This is trivially True when the count above is 0.
test1 = len(merged_expl.loc[
    (merged_expl["scraped_mut_has_info_1"] != merged_expl["mutations_expandable_1"]) & 
    (merged_expl["mutation_orig_1"].notna() & merged_expl["mutation_new_1"].notna())
])==0
print(f"\tAll of these cases are because before-and-after sequences weren't provided: {test1}")
# Same two checks for interactor 2
test1 = len(merged_expl.loc[
    (merged_expl["scraped_mut_has_info_2"] != merged_expl["mutations_expandable_2"])
])
print(f"Total rows where interactor 2 mutation info exists but is not expandable: {test1}/{len(merged_expl)} ({100*test1/len(merged_expl):.2f}%)")
test1 = len(merged_expl.loc[
    (merged_expl["scraped_mut_has_info_2"] != merged_expl["mutations_expandable_2"]) & 
    (merged_expl["mutation_orig_2"].notna() & merged_expl["mutation_new_2"].notna())
])==0
print(f"\tAll of these cases are because before-and-after sequences weren't provided: {test1}")

Total rows where interactor 1 mutation info exists but is not expandable: 0/745085 (0.00%)
	All of these cases are because before-and-after sequences weren't provided: True
Total rows where interactor 2 mutation info exists but is not expandable: 0/745085 (0.00%)
	All of these cases are because before-and-after sequences weren't provided: True


In [221]:
# Same expandability report as for merged_expl, run on the negative set.
# mutations_expandable_N is set to False whenever scraped_mut_has_info_N is False, so the
# two flags can only differ when info exists (True) and the row is not expandable (False).
print("negative database analysis")
test1 = len(merged_neg_expl.loc[
    (merged_neg_expl["scraped_mut_has_info_1"] != merged_neg_expl["mutations_expandable_1"])
])
print(f"Total rows where interactor 1 mutation info exists but is not expandable: {test1}/{len(merged_neg_expl)} ({100*test1/len(merged_neg_expl):.2f}%)")
# Among those rows, none may have BOTH the original and the new residue present.
# This is trivially True when the count above is 0.
test1 = len(merged_neg_expl.loc[
    (merged_neg_expl["scraped_mut_has_info_1"] != merged_neg_expl["mutations_expandable_1"]) & 
    (merged_neg_expl["mutation_orig_1"].notna() & merged_neg_expl["mutation_new_1"].notna())
])==0
print(f"\tAll of these cases are because before-and-after sequences weren't provided: {test1}")
# Same two checks for interactor 2
test1 = len(merged_neg_expl.loc[
    (merged_neg_expl["scraped_mut_has_info_2"] != merged_neg_expl["mutations_expandable_2"])
])
print(f"Total rows where interactor 2 mutation info exists but is not expandable: {test1}/{len(merged_neg_expl)} ({100*test1/len(merged_neg_expl):.2f}%)")
test1 = len(merged_neg_expl.loc[
    (merged_neg_expl["scraped_mut_has_info_2"] != merged_neg_expl["mutations_expandable_2"]) & 
    (merged_neg_expl["mutation_orig_2"].notna() & merged_neg_expl["mutation_new_2"].notna())
])==0
print(f"\tAll of these cases are because before-and-after sequences weren't provided: {test1}")

negative database analysis
Total rows where interactor 1 mutation info exists but is not expandable: 0/970 (0.00%)
	All of these cases are because before-and-after sequences weren't provided: True
Total rows where interactor 2 mutation info exists but is not expandable: 0/970 (0.00%)
	All of these cases are because before-and-after sequences weren't provided: True


In [222]:
# Prepare the mutation columns for exploding: every cell becomes a list.
scraped_mut_cols_1 = [name for name in scraped_mut_cols if name.endswith("_1")]
scraped_mut_cols_2 = [name for name in scraped_mut_cols if name.endswith("_2")]


def _mutation_cell_to_list(row, col, flag_col):
    """Split a pipe-separated string when the row is flagged expandable; otherwise wrap the value."""
    cell = row[col]
    if row[flag_col] and type(cell) == str:
        return cell.split("|")
    return [cell]


for flag_col, side_cols in (
    ("mutations_expandable_1", scraped_mut_cols_1),
    ("mutations_expandable_2", scraped_mut_cols_2),
):
    for c in side_cols:
        merged_expl[c] = merged_expl.apply(_mutation_cell_to_list, axis=1, args=(c, flag_col))

In [223]:
def _split_if_expandable(row, col, flag_col):
    """Split a pipe-separated string when the row is flagged expandable; otherwise wrap the value."""
    cell = row[col]
    if row[flag_col] and type(cell) == str:
        return cell.split("|")
    return [cell]


# Every mutation cell of the negative set becomes a list, side 1 first, then side 2
for flag_col, side_cols in (
    ("mutations_expandable_1", scraped_mut_cols_1),
    ("mutations_expandable_2", scraped_mut_cols_2),
):
    for c in side_cols:
        merged_neg_expl[c] = merged_neg_expl.apply(_split_if_expandable, axis=1, args=(c, flag_col))

In [224]:
# Find examples of proteins in merged_expl that have the same IntAct ID but multiple sequences
[x for x in merged_expl.columns if "uniprot" in x]

['uniprot_A_intact',
 'uniprot_B_intact',
 'uniprotkb_1',
 'uniprotkb_2',
 'no_uniprot_update_A',
 'no_uniprot_update_B',
 'unique_uniprot_pair',
 'uniprot_A_noiso1',
 'uniprot_B_noiso1',
 'unique_uniprot_noiso1_pair',
 'uniprot_A_noisoforms',
 'uniprot_B_noisoforms',
 'unique_uniprot_noisoforms_pair',
 'uniprot_A_equalseq',
 'uniprot_B_equalseq',
 'uniprot_A_inseq',
 'uniprot_B_inseq',
 'uniprot_A_full',
 'uniprot_B_full',
 'uniprot_A_equalseq_canonical',
 'uniprot_A_inseq_canonical',
 'uniprot_B_equalseq_canonical',
 'uniprot_B_inseq_canonical',
 'uniprot_A',
 'uniprot_B',
 'uniprot_gene_name_A',
 'uniprot_gene_name_B']

In [225]:
def _count_distinct(values):
    """Number of distinct values in a group."""
    return len(set(values))


# Per isoform-free UniProt accession of interactor A: distinct sequences and distinct IntAct ID sets
temp = merged_expl.groupby("uniprot_A_noisoforms").agg(
    n_aa_1=("aa_1", _count_distinct),
    n_intact=("all_intact_A_sorted", _count_distinct),
)
temp.sort_values(by="n_aa_1", ascending=False)

Unnamed: 0_level_0,n_aa_1,n_intact
uniprot_A_noisoforms,Unnamed: 1_level_1,Unnamed: 2_level_1
uniprotkb:P0DTD1,16,16
uniprotkb:P0C6X7,15,15
uniprotkb:Q9WMX2,11,11
uniprotkb:P27958,10,10
uniprotkb:Q99IB8,10,10
...,...,...
uniprotkb:P0AC69,1,1
uniprotkb:P0AC73,1,1
uniprotkb:P0AC81,1,1
uniprotkb:P0AC92,1,1


In [226]:
l = merged_expl["Alias(es) interactor B"].tolist()
l += merged_expl["Alias(es) interactor A"].tolist()
l = "|".join(l)
l = l.split("|")
l = [x.split(":")[0] for x in l if not(x.startswith("psi-mi:"))]
l = pd.Series(l)
l.value_counts()
    

uniprotkb               9633659
ensembl                 1531782
intact                   278930
dip                        1592
genbank_protein_gi          669
uniparc                     198
refseq                      115
ensembl metazoa              98
afcs                         46
mint                         12
flybase                      11
ensembl plants                6
entrezgene/locuslink          1
Name: count, dtype: int64

In [227]:
def extract_db_sources(row, interactor="A"):
    """
    Collect the database prefixes found in an interactor's alternative IDs and aliases.

    Reads "Alt. ID(s) interactor {interactor}" and "Alias(es) interactor {interactor}"
    from ``row``. Each is a pipe-separated list of "db:value" entries. Entries starting
    with "psi-mi:" are ignored; for every other entry the text before the first ":" is
    kept. Returns the distinct prefixes, sorted and joined with "|" ("" if none).

    The two columns are handled independently: a missing (non-string, e.g. NaN) value
    in one column no longer discards, or crashes on, the other column.
    """
    sources = set()
    for column in (f"Alt. ID(s) interactor {interactor}", f"Alias(es) interactor {interactor}"):
        value = row[column]
        if not isinstance(value, str):
            # missing value (NaN/None): nothing to extract from this column
            continue
        sources.update(
            entry.split(":")[0]
            for entry in value.split("|")
            if not entry.startswith("psi-mi:")
        )
    return "|".join(sorted(sources))

merged_expl["DB Sources interactor A"] = merged_expl.apply(lambda row: extract_db_sources(row,interactor="A"),axis=1)
merged_expl["DB Sources interactor B"] = merged_expl.apply(lambda row: extract_db_sources(row,interactor="B"),axis=1)
display(merged_expl[[
    "ID(s) interactor A",
    "Alias(es) interactor A",
    "DB Sources interactor A",
    "ID(s) interactor B",
    "Alias(es) interactor B",
    "DB Sources interactor B",
]].head())
display(merged_expl.loc[
    merged_expl["DB Sources interactor A"]=="intact"
    ].reset_index(drop=True))

Unnamed: 0,ID(s) interactor A,Alias(es) interactor A,DB Sources interactor A,ID(s) interactor B,Alias(es) interactor B,DB Sources interactor B
0,intact:EBI-101707,psi-mi:atbp_drome|psi-mi:ATbp|uniprotkb:ATbp|u...,uniprotkb,intact:EBI-100018,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,uniprotkb
1,intact:EBI-100018,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,uniprotkb,intact:EBI-102069,psi-mi:cos_drome|psi-mi:cos|uniprotkb:A1Z6X4|u...,intact|uniprotkb
2,intact:EBI-104215,psi-mi:q9vtr6_drome|psi-mi:prc|uniprotkb:prc|u...,uniprotkb,intact:EBI-100018,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,uniprotkb
3,intact:EBI-100018,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,uniprotkb,intact:EBI-107089,psi-mi:q9vwg2_drome|psi-mi:SDS3|uniprotkb:SDS3...,uniprotkb
4,intact:EBI-117032,psi-mi:q9vhr4_drome|psi-mi:Dmel\CG7963|uniprot...,uniprotkb,intact:EBI-100018,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,uniprotkb


Unnamed: 0,ID(s) interactor A,ID(s) interactor B,Alt. ID(s) interactor A,Alt. ID(s) interactor B,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,...,uniprot_B,uniprot_gene_name_A,uniprot_gene_name_B,scraped_mut_has_info,scraped_mut_has_info_1,scraped_mut_has_info_2,mutations_expandable_1,mutations_expandable_2,DB Sources interactor A,DB Sources interactor B


In [228]:
merged_expl["DB Sources interactor A"].value_counts()

DB Sources interactor A
ensembl|uniprotkb            322183
uniprotkb                    309583
ensembl|intact|uniprotkb      83344
intact|uniprotkb              23144
                               6557
dip|uniprotkb                   125
uniparc|uniprotkb                77
dip|refseq|uniprotkb             64
dip|intact|uniprotkb              2
ensembl metazoa|uniprotkb         2
dip|mint|uniprotkb                2
ensembl plants|uniprotkb          1
ddbj/embl/genbank                 1
Name: count, dtype: int64

In [229]:
def _distinct_non_float(values):
    """Distinct values of a group, leaving out floats (i.e. NaN placeholders)."""
    return [v for v in set(values) if type(v) != float]


def _n_distinct_non_float(values):
    """How many distinct non-float values a group holds."""
    return len(_distinct_non_float(values))


# Per interactor-B sequence: which IntAct ID sets, UniProt accessions and alias strings share it
temp = merged_expl.groupby("aa_2").agg(
    n_intact=("all_intact_B_sorted", _n_distinct_non_float),
    n_uniprot_B=("uniprot_B", _n_distinct_non_float),
    n_aliases=("Alias(es) interactor B", _n_distinct_non_float),
    uniprot_B=("uniprot_B", _distinct_non_float),
    intact=("all_intact_B_sorted", _distinct_non_float),
    aliases=("Alias(es) interactor B", _distinct_non_float),
).reset_index()
display(temp.sort_values(by=["n_intact", "n_uniprot_B"], ascending=False).head())

# Sequences where the number of IntAct ID sets and of alias strings disagree
temp.loc[temp["n_intact"] != temp["n_aliases"]]

Unnamed: 0,aa_2,n_intact,n_uniprot_B,n_aliases,uniprot_B,intact,aliases
57970,MSLLTEVETPIRNEWGCRCNDSSDPLVVAASIIGILHLILWILDRL...,8,8,8,"[uniprotkb:Q76V12-0, uniprotkb:Q77IM6-0, unipr...","[intact:EBI-40249317, intact:EBI-40249504, int...",[psi-mi:m2_i68a4|psi-mi:M|uniprotkb:Q910G8|uni...
49021,MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFA...,7,1,7,[uniprotkb:P62992-PRO_0000396474],"[intact:EBI-7038538, intact:EBI-413074, intact...",[psi-mi:ubiq_drome|psi-mi:RpL40|psi-mi:RpS27A|...
15291,MDDDIAALVVDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVG...,6,6,6,"[uniprotkb:Q1KLZ0-0, uniprotkb:P60710-0, unipr...","[intact:EBI-353957, intact:EBI-349272, intact:...",[psi-mi:actb_human|psi-mi:ACTB|uniprotkb:P0257...
4266,MAEDADMRNELEEMQRRADQLADESLESTRRMLQLVEESKDAGIRT...,6,5,6,"[uniprotkb:Q17QQ3-0, uniprotkb:P60880-0, unipr...","[intact:EBI-524785, intact:EBI-445288, intact:...",[psi-mi:snp25_mouse|psi-mi:Snap25|uniprotkb:P1...
48524,MQAIKCVVVGDGAVGKTCLLISYTTNAFPGEYIPTVFDNYSANVMV...,6,5,6,"[uniprotkb:P62998-0, uniprotkb:P62999-0, unipr...","[intact:EBI-7002659, intact:EBI-413646, intact...",[psi-mi:p63000-1|psi-mi:RAC1|uniprotkb:P15154-...


Unnamed: 0,aa_2,n_intact,n_uniprot_B,n_aliases,uniprot_B,intact,aliases


In [230]:
merged_expl.loc[merged_expl["scraped_mut_has_info"]][scraped_mut_cols].head()

Unnamed: 0,mutation_mi_1,mutation_name_1,mutation_short_1,mutation_begin_1,mutation_end_1,mutation_orig_1,mutation_new_1,mutation_mi_2,mutation_name_2,mutation_short_2,mutation_begin_2,mutation_end_2,mutation_orig_2,mutation_new_2
107,"[MI:0573, MI:0573]","[mutation disrupting interaction, mutation dis...","[P08069:p.Lys1033Ala, P08069:p.Tyr980Phe]","[1033, 980]","[1033, 980]","[K, Y]","[A, F]",[nan],[nan],[nan],[nan],[nan],[nan],[nan]
124,[nan],[nan],[nan],[nan],[nan],[nan],[nan],[MI:0429],[necessary binding region],[finger_domain],[1],[187],[nan],[nan]
127,[MI:0118],[mutation],[Q16513:p.Leu520Ile],[520],[520],[L],[I],[nan],[nan],[nan],[nan],[nan],[nan],[nan]
128,[MI:0118],[mutation],[Q16513:p.Leu520Ile],[520],[520],[L],[I],[nan],[nan],[nan],[nan],[nan],[nan],[nan]
129,[nan],[nan],[nan],[nan],[nan],[nan],[nan],[MI:0429],[necessary binding region],[finger_domain],[1],[187],[nan],[nan]


In [231]:
merged_neg_expl.loc[merged_neg_expl["scraped_mut_has_info"]][scraped_mut_cols].head()

Unnamed: 0,mutation_mi_1,mutation_name_1,mutation_short_1,mutation_begin_1,mutation_end_1,mutation_orig_1,mutation_new_1,mutation_mi_2,mutation_name_2,mutation_short_2,mutation_begin_2,mutation_end_2,mutation_orig_2,mutation_new_2
109,[nan],[nan],[nan],[nan],[nan],[nan],[nan],[MI:0118],[mutation],[P20340:p.Gln72Leu],[72],[72],[Q],[L]
663,[nan],[nan],[nan],[nan],[nan],[nan],[nan],[MI:0118],[mutation],[Q9NRW1:p.Gln72Leu],[72],[72],[Q],[L]
664,[nan],[nan],[nan],[nan],[nan],[nan],[nan],[MI:0118],[mutation],[Q9NRW1:p.Gln72Leu],[72],[72],[Q],[L]
665,[nan],[nan],[nan],[nan],[nan],[nan],[nan],"[MI:0118, MI:0118]","[mutation, mutation]","[Q9NRW1:p.Gln72Leu, Q9NRW1:p.Thr27Asn]","[72, 27]","[72, 27]","[Q, T]","[L, N]"
749,[MI:0573],[mutation disrupting interaction],[Q61686:p.[Ile165Glu;Tyr168Glu]],"[168,165]","[168,165]","[Y,I]","[E,E]",[nan],[nan],[nan],[nan],[nan],[nan],[nan]


In [232]:
# Explode the list-valued mutation columns: first all interactor-1 columns together, then
# all interactor-2 columns together, so each resulting row holds at most one mutation per side.
# NOTE(review): exploding several columns at once relies on their lists having equal length
# within a row — presumably what the mutations_expandable_* flags established; confirm.
print(f"Going to explode merged_expl by interactor 1, then interactor 2 mutation columns. Length before: {len(merged_expl)}")
merged_expl = merged_expl.explode(scraped_mut_cols_1).reset_index(drop=True)
print(f"\tLength after exploding across 1: {len(merged_expl)}")
merged_expl = merged_expl.explode(scraped_mut_cols_2).reset_index(drop=True)
print(f"\tLength after exploding across 2: {len(merged_expl)}")

Going to explode merged_expl by interactor 1, then interactor 2 mutation columns. Length before: 745085
	Length after exploding across 1: 768198
	Length after exploding across 2: 788905


In [233]:
# Explode the list-valued mutation columns of the negative set: first all interactor-1
# columns together, then all interactor-2 columns together.
# NOTE(review): exploding several columns at once relies on their lists having equal length
# within a row — presumably what the mutations_expandable_* flags established; confirm.
print(f"Going to explode merged_neg_expl by interactor 1, then interactor 2 mutation columns. Length before: {len(merged_neg_expl)}")
merged_neg_expl = merged_neg_expl.explode(scraped_mut_cols_1).reset_index(drop=True)
print(f"\tLength after exploding across 1: {len(merged_neg_expl)}")
merged_neg_expl = merged_neg_expl.explode(scraped_mut_cols_2).reset_index(drop=True)
print(f"\tLength after exploding across 2: {len(merged_neg_expl)}")

Going to explode merged_neg_expl by interactor 1, then interactor 2 mutation columns. Length before: 970
	Length after exploding across 1: 971
	Length after exploding across 2: 972


In [234]:
# if there is a merged_expl row that has mutation info for interactor A and interactor B, then duplicate that row, and delete all the mutation_*_1 info for one and all the mutation_*_2 info for the other 
import pandas as pd
import numpy as np
import re

def split_rows_by_mutation_blocks(merged_expl: pd.DataFrame) -> pd.DataFrame:
    """
    Separate rows that carry mutation data for both sides into one row per side.

    Columns named ``mutation_<anything>_1`` form the side-1 block and columns named
    ``mutation_<anything>_2`` form the side-2 block. A row with at least one non-null
    value in each block is replaced by two rows:
      - one that keeps the side-1 values and has every side-2 column set to NaN
      - one that keeps the side-2 values and has every side-1 column set to NaN
    Rows with data in at most one block pass through unchanged, and all non-mutation
    columns are carried over as-is.

    The input frame is not modified. In the result, the untouched rows come first,
    followed by all side-1 copies and then all side-2 copies, under a fresh 0..n-1 index.
    """
    frame = merged_expl.copy()

    # Sort the mutation columns into their side-specific blocks
    side1_cols, side2_cols = [], []
    for col in frame.columns:
        if re.match(r"^mutation_.*_(1|2)$", col) is None:
            continue
        if col.endswith("_1"):
            side1_cols.append(col)
        elif col.endswith("_2"):
            side2_cols.append(col)

    # A side "has info" when any column of its block is non-null in that row
    side1_present = frame[side1_cols].notna().any(axis=1)
    side2_present = frame[side2_cols].notna().any(axis=1)
    needs_split = side1_present & side2_present

    # Copy that keeps side 1 only
    only_side1 = frame.loc[needs_split].copy()
    only_side1.loc[:, side2_cols] = np.nan

    # Copy that keeps side 2 only
    only_side2 = frame.loc[needs_split].copy()
    only_side2.loc[:, side1_cols] = np.nan

    pieces = [frame.loc[~needs_split], only_side1, only_side2]
    return pd.concat(pieces, ignore_index=True).reset_index(drop=True)

In [235]:
# Replace every row that has mutation info on both sides with two single-side rows,
# then report the new row count.
merged_expl = split_rows_by_mutation_blocks(merged_expl)
print(f"Length of merged_expl after splitting multiple mutation effects for the same row: {len(merged_expl)}")

Length of merged_expl after splitting multiple mutation effects for the same row: 796348


In [236]:
# Apply the same both-sides split to merged_neg_expl, then report the new row count.
# The message names the frame that is actually measured (merged_neg_expl).
merged_neg_expl = split_rows_by_mutation_blocks(merged_neg_expl)
print(f"Length of merged_neg_expl after splitting multiple mutation effects for the same row: {len(merged_neg_expl)}")

Length of my_neg_expl after splitting multiple mutation effects for the same row: 972


In [237]:
# Prepare the aggregated mutation table for merging: every column gets a "Mutation "
# prefix and a join key is derived from "Interaction AC".
mutations_to_merge = mutations.copy(deep=True)
test1 = len(mutations.loc[mutations["Interaction AC"].str.count("intact:EBI-")>1])==0
print(f"\tNo rows have >1 intact interaction ID in mutations dataframe: {test1}")
mutations_to_merge.columns = "Mutation " + mutations_to_merge.columns
# Join key: the text after "intact:" when the string contains exactly one such prefix.
# Strings with zero or several prefixes become None; non-string values (e.g. NaN) pass through.
mutations_to_merge["Mutation interaction_intactid"] = mutations_to_merge["Mutation Interaction AC"].apply(lambda x: x.split("intact:")[-1] if (type(x)==str and x.count("intact:")==1) else (x if type(x)!=str else None))
# pandas.merge matches null keys with each other, so mutation rows without a usable
# interaction ID would be attached to every merged_expl row whose interaction_intactid
# is also null. Drop those key-less rows from the right-hand side before joining;
# mutations_to_merge itself is left untouched for later cells.
_mutations_for_merge = mutations_to_merge.rename(columns={"Mutation interaction_intactid":"interaction_intactid"})
_mutations_for_merge = _mutations_for_merge.loc[_mutations_for_merge["interaction_intactid"].notna()]
merged_expl = pd.merge(
    merged_expl,
    _mutations_for_merge,
    on=["interaction_intactid"],
    how="left"
)

	No rows have >1 intact interaction ID in mutations dataframe: True


In [238]:
# Left-join the aggregated mutation table onto merged_neg_expl by interaction ID.
# pandas.merge matches null keys with each other, so mutation rows without a usable
# interaction ID are dropped from the right-hand side first; otherwise they would be
# attached to every merged_neg_expl row whose interaction_intactid is also null.
_mutations_for_merge = mutations_to_merge.rename(columns={"Mutation interaction_intactid":"interaction_intactid"})
_mutations_for_merge = _mutations_for_merge.loc[_mutations_for_merge["interaction_intactid"].notna()]
merged_neg_expl = pd.merge(
    merged_neg_expl,
    _mutations_for_merge,
    on=["interaction_intactid"],
    how="left"
)

In [239]:
# Columns that come from the aggregated mutation table (named with the "Mutation "
# prefix they receive before the merge). A merged row is treated as having aggregated
# mutation info when any of these columns is non-null.
agg_mut_cols = [
    "Mutation # Feature AC",
    "Mutation Feature short label",
    "Mutation Feature range(s)",
    "Mutation Original sequence",
    "Mutation Resulting sequence",
    "Mutation Feature type",
    "Mutation Feature annotation(s)",
    "Mutation Affected protein AC",
    "Mutation Affected protein symbol",
    "Mutation Affected protein full name",
    "Mutation Affected protein organism",
    "Mutation Interaction participants",
    "Mutation PubMedID",
    "Mutation Figure legend(s)",
    "Mutation Xref ID(s)",
]

In [240]:
# Flag rows where at least one aggregated-mutation column is populated
# (i.e. the row is not null across all of agg_mut_cols).
merged_expl["agg_mut_has_info"] = ~merged_expl[agg_mut_cols].isna().all(axis=1)

In [241]:
# Flag rows where at least one aggregated-mutation column is populated
# (i.e. the row is not null across all of agg_mut_cols).
merged_neg_expl["agg_mut_has_info"] = ~merged_neg_expl[agg_mut_cols].isna().all(axis=1)

In [242]:
# Number of distinct seq_pair_id values among rows that have mutation info from
# both the XML scrape and the aggregated mutation table.
_has_both_sources = merged_expl["scraped_mut_has_info"] & merged_expl["agg_mut_has_info"]
len(merged_expl[_has_both_sources].drop_duplicates(subset=["seq_pair_id"]))

17300

In [243]:
# Number of distinct seq_sort values among merged_neg_expl rows that have mutation
# info from both the XML scrape and the aggregated mutation table.
_has_both_sources = merged_neg_expl["scraped_mut_has_info"] & merged_neg_expl["agg_mut_has_info"]
len(merged_neg_expl[_has_both_sources].drop_duplicates(subset=["seq_sort"]))

7

In [244]:
# Which mutation MI terms occur on rows that have scraped (XML) mutation info but no
# aggregated mutation info? Collect the terms from both sides, then look them up in
# mutation_mi_ok (one row per term id).
_xml_only_rows = merged_expl.loc[
    merged_expl["scraped_mut_has_info"] & ~merged_expl["agg_mut_has_info"]
]
l2 = _xml_only_rows["mutation_mi_2"].dropna().unique().tolist()
l = _xml_only_rows["mutation_mi_1"].dropna().unique().tolist() + l2
mutation_mi_ok.loc[
    mutation_mi_ok["id"].isin(l)
].drop_duplicates(subset=["id"])

Unnamed: 0,label,id,parent_id,parent_ids_all,parent_names_all
0,mutation,MI:0118,,MI:0252,biological feature
3,necessary binding region,MI:0429,MI:0573,"MI:0117, MI:0573, MI:1128, MI:1129","binding-associated region, mutation disrupting..."
4,mutation disrupting interaction strength,MI:1128,MI:0573,MI:0573,mutation disrupting interaction
14,mutation causing an interaction,MI:2227,MI:0118,MI:0118,mutation
15,mutation with complex effect,MI:2333,MI:0118,MI:0118,mutation


In [245]:
# Which mutation MI terms occur on merged_neg_expl rows that have scraped (XML)
# mutation info but no aggregated mutation info? Collect the terms from both sides,
# then look them up in mutation_mi_ok (one row per term id).
_xml_only_rows = merged_neg_expl.loc[
    merged_neg_expl["scraped_mut_has_info"] & ~merged_neg_expl["agg_mut_has_info"]
]
l2 = _xml_only_rows["mutation_mi_2"].dropna().unique().tolist()
l = _xml_only_rows["mutation_mi_1"].dropna().unique().tolist() + l2
mutation_mi_ok.loc[
    mutation_mi_ok["id"].isin(l)
].drop_duplicates(subset=["id"])

Unnamed: 0,label,id,parent_id,parent_ids_all,parent_names_all


In [246]:
# Some sequence strings contain embedded line breaks, e.g. QQQQQQQQQQQQQQQQQQQQQQQQQ\r\nQQQQQQQ
def _strip_linebreaks(value):
    """Remove CR/LF characters and surrounding whitespace from plain strings; leave anything else as-is."""
    if type(value) != str:
        return value
    return value.replace("\r","").replace("\n","").strip()

for c in ["mutation_orig_1","mutation_new_1","mutation_orig_2","mutation_new_2"]:
    merged_expl[c] = merged_expl[c].apply(_strip_linebreaks)

In [247]:
# Some sequence strings contain embedded line breaks, e.g. QQQQQQQQQQQQQQQQQQQQQQQQQ\r\nQQQQQQQ
def _strip_linebreaks(value):
    """Remove CR/LF characters and surrounding whitespace from plain strings; leave anything else as-is."""
    if type(value) != str:
        return value
    return value.replace("\r","").replace("\n","").strip()

for c in ["mutation_orig_1","mutation_new_1","mutation_orig_2","mutation_new_2"]:
    merged_neg_expl[c] = merged_neg_expl[c].apply(_strip_linebreaks)

In [248]:
# Compare the two mutation sources (XML scraping vs. aggregated mutation table) on merged_expl.
# Keep in mind that the aggregated mutation data is NOT binary interactions only; it can also
# describe n-ary interactions.
def _unique_non_null(frame, row_mask, column):
    """Unique non-null values of `column` among the rows of `frame` selected by `row_mask`."""
    return frame.loc[row_mask][column].dropna().unique().tolist()

_scraped = merged_expl["scraped_mut_has_info"]
_agg = merged_expl["agg_mut_has_info"]
_xml_only_mask = _scraped & ~_agg
_agg_only_mask = ~_scraped & _agg
_both_mask = _scraped & _agg

# Per interaction ID
interactions_with_xml_mutation_data_only = _unique_non_null(merged_expl, _xml_only_mask, "interaction_intactid")
interactions_with_agg_mutation_data_only = _unique_non_null(merged_expl, _agg_only_mask, "interaction_intactid")
interactions_with_xml_and_agg_mutation_data = _unique_non_null(merged_expl, _both_mask, "interaction_intactid")
interactions_in_my_pos = my_pos["interaction_intactid"].dropna().unique().tolist()
total_intactids = len(merged_expl["interaction_intactid"].dropna().unique())
print(f"Total interaction IDs with mutation data only from XML scraping: {len(interactions_with_xml_mutation_data_only)}/{total_intactids} = ({100*len(interactions_with_xml_mutation_data_only)/total_intactids:.2f}%)")
print(f"Total interaction IDs with mutation data only from aggregated mutation table: {len(interactions_with_agg_mutation_data_only)}/{total_intactids} = ({100*len(interactions_with_agg_mutation_data_only)/total_intactids:.2f}%)")
print(f"\tTotal that are also in my_pos: {len(set(interactions_with_agg_mutation_data_only).intersection(set(interactions_in_my_pos)))}")
print(f"Total interaction IDs with mutation data from both XML scraping and aggregated mutation table: {len(interactions_with_xml_and_agg_mutation_data)}/{total_intactids} = ({100*len(interactions_with_xml_and_agg_mutation_data)/total_intactids:.2f}%)")

# Per seq_pair_id
seq_pair_ids_with_xml_mutation_data_only = _unique_non_null(merged_expl, _xml_only_mask, "seq_pair_id")
seq_pair_ids_with_agg_mutation_data_only = _unique_non_null(merged_expl, _agg_only_mask, "seq_pair_id")
seq_pair_ids_with_xml_and_agg_mutation_data = _unique_non_null(merged_expl, _both_mask, "seq_pair_id")
total_seq_pair_ids = len(merged_expl["seq_pair_id"].dropna().unique())
print(f"\nTotal seq_pair_ids with mutation data only from XML scraping: {len(seq_pair_ids_with_xml_mutation_data_only)}/{total_seq_pair_ids} = ({100*len(seq_pair_ids_with_xml_mutation_data_only)/total_seq_pair_ids:.2f}%)")
print(f"Total seq_pair_ids with mutation data only from aggregated mutation table: {len(seq_pair_ids_with_agg_mutation_data_only)}/{total_seq_pair_ids} = ({100*len(seq_pair_ids_with_agg_mutation_data_only)/total_seq_pair_ids:.2f}%)")
print(f"Total seq_pair_ids with mutation data from both XML scraping and aggregated mutation table: {len(seq_pair_ids_with_xml_and_agg_mutation_data)}/{total_seq_pair_ids} = ({100*len(seq_pair_ids_with_xml_and_agg_mutation_data)/total_seq_pair_ids:.2f}%)")

Total interaction IDs with mutation data only from XML scraping: 2999/743127 = (0.40%)
Total interaction IDs with mutation data only from aggregated mutation table: 3/743127 = (0.00%)
	Total that are also in my_pos: 3
Total interaction IDs with mutation data from both XML scraping and aggregated mutation table: 33866/743127 = (4.56%)

Total seq_pair_ids with mutation data only from XML scraping: 2315/426539 = (0.54%)
Total seq_pair_ids with mutation data only from aggregated mutation table: 3/426539 = (0.00%)
Total seq_pair_ids with mutation data from both XML scraping and aggregated mutation table: 17300/426539 = (4.06%)


In [249]:
# Compare the two mutation sources (XML scraping vs. aggregated mutation table) on merged_neg_expl.
# Keep in mind that the aggregated mutation data is NOT binary interactions only; it can also
# describe n-ary interactions.
def _unique_non_null(frame, row_mask, column):
    """Unique non-null values of `column` among the rows of `frame` selected by `row_mask`."""
    return frame.loc[row_mask][column].dropna().unique().tolist()

_scraped = merged_neg_expl["scraped_mut_has_info"]
_agg = merged_neg_expl["agg_mut_has_info"]
_xml_only_mask = _scraped & ~_agg
_agg_only_mask = ~_scraped & _agg
_both_mask = _scraped & _agg

# Per interaction ID (these names are rebound here to the merged_neg_expl results)
interactions_with_xml_mutation_data_only = _unique_non_null(merged_neg_expl, _xml_only_mask, "interaction_intactid")
interactions_with_agg_mutation_data_only = _unique_non_null(merged_neg_expl, _agg_only_mask, "interaction_intactid")
interactions_with_xml_and_agg_mutation_data = _unique_non_null(merged_neg_expl, _both_mask, "interaction_intactid")
interactions_in_my_pos = my_pos["interaction_intactid"].dropna().unique().tolist()
total_intactids = len(merged_neg_expl["interaction_intactid"].dropna().unique())
print(f"Total interaction IDs with mutation data only from XML scraping: {len(interactions_with_xml_mutation_data_only)}/{total_intactids} = ({100*len(interactions_with_xml_mutation_data_only)/total_intactids:.2f}%)")
print(f"Total interaction IDs with mutation data only from aggregated mutation table: {len(interactions_with_agg_mutation_data_only)}/{total_intactids} = ({100*len(interactions_with_agg_mutation_data_only)/total_intactids:.2f}%)")
print(f"\tTotal that are also in my_pos: {len(set(interactions_with_agg_mutation_data_only).intersection(set(interactions_in_my_pos)))}")
print(f"Total interaction IDs with mutation data from both XML scraping and aggregated mutation table: {len(interactions_with_xml_and_agg_mutation_data)}/{total_intactids} = ({100*len(interactions_with_xml_and_agg_mutation_data)/total_intactids:.2f}%)")

# Per seq_sort
seq_sorts_with_xml_mutation_data_only = _unique_non_null(merged_neg_expl, _xml_only_mask, "seq_sort")
seq_sorts_with_agg_mutation_data_only = _unique_non_null(merged_neg_expl, _agg_only_mask, "seq_sort")
seq_sorts_with_xml_and_agg_mutation_data = _unique_non_null(merged_neg_expl, _both_mask, "seq_sort")
total_seq_sorts = len(merged_neg_expl["seq_sort"].dropna().unique())
print(f"\nTotal seq_sorts with mutation data only from XML scraping: {len(seq_sorts_with_xml_mutation_data_only)}/{total_seq_sorts} = ({100*len(seq_sorts_with_xml_mutation_data_only)/total_seq_sorts:.2f}%)")
print(f"Total seq_sorts with mutation data only from aggregated mutation table: {len(seq_sorts_with_agg_mutation_data_only)}/{total_seq_sorts} = ({100*len(seq_sorts_with_agg_mutation_data_only)/total_seq_sorts:.2f}%)")
print(f"Total seq_sorts with mutation data from both XML scraping and aggregated mutation table: {len(seq_sorts_with_xml_and_agg_mutation_data)}/{total_seq_sorts} = ({100*len(seq_sorts_with_xml_and_agg_mutation_data)/total_seq_sorts:.2f}%)")

Total interaction IDs with mutation data only from XML scraping: 0/969 = (0.00%)
Total interaction IDs with mutation data only from aggregated mutation table: 0/969 = (0.00%)
	Total that are also in my_pos: 0
Total interaction IDs with mutation data from both XML scraping and aggregated mutation table: 11/969 = (1.14%)

Total seq_sorts with mutation data only from XML scraping: 0/916 = (0.00%)
Total seq_sorts with mutation data only from aggregated mutation table: 0/916 = (0.00%)
Total seq_sorts with mutation data from both XML scraping and aggregated mutation table: 7/916 = (0.76%)


In [250]:
# Sanity checks on the raw mutations table.
# What identifies a row as a mutation row? Having a "# Feature AC" entry.
def _n_missing(column):
    """Number of rows of `mutations` where `column` is null."""
    return len(mutations.loc[mutations[column].isna()])

def _n_containing(column, pattern):
    """Number of rows of `mutations` where `column` (nulls treated as "") matches `pattern`."""
    return len(mutations.loc[mutations[column].fillna("").str.contains(pattern)])

test1 = _n_missing("# Feature AC")==0
print(f"Everything in the mutations dataframe has a # Feature AC entry: {test1}")
test1 = _n_missing("Feature type")==0
print(f"Everything in the mutations dataframe has a Feature type entry: {test1}")
test1 = _n_missing("Feature type")
print(f"\tTotal mutation features without an MI term identifier: {test1}/{len(mutations)} ({100*test1/len(mutations):.2f}%)")
test1 = _n_missing("Feature range(s)")
print(f"\tTotal mutation features without a feature range: {test1}/{len(mutations)} ({100*test1/len(mutations):.2f}%)")
test1 = _n_missing("Affected protein AC")
print(f"\tTotal mutations that do not indicate which protein is affected: {test1}")
test1 = _n_missing("Interaction AC")
print(f"\tTotal mutations that do not indicate which interaction is affected: {test1}")

# Multi-valued "Affected protein AC" entries, by separator
test1 = _n_containing("Affected protein AC", ",")
print(f"\tTotal mutations that have multiple comma-separated entries for affected protein AC: {test1}")
test1 = _n_containing("Affected protein AC", "\\|")
print(f"\tTotal mutations that have multiple pipe-separated entries for affected protein AC: {test1}")



Everything in the mutations dataframe has a # Feature AC entry: True
Everything in the mutations dataframe has a Feature type entry: False
	Total mutation features without an MI term identifier: 12/83644 (0.01%)
	Total mutation features without a feature range: 10/83644 (0.01%)
	Total mutations that do not indicate which protein is affected: 17
	Total mutations that do not indicate which interaction is affected: 24
	Total mutations that have multiple comma-separated entries for affected protein AC: 4
	Total mutations that have multiple pipe-separated entries for affected protein AC: 2


In [251]:
# Break down the "Mutation Affected protein AC" values merged into merged_expl:
# multi-valued entries, then identifier namespace (uniprotkb / intact / other).
_affected_ac = merged_expl["Mutation Affected protein AC"]
_affected_ac_text = _affected_ac.fillna("")  # nulls as "" so .str.contains yields plain booleans

# Check for comma-separated identifiers of affected protein
test1 = len(merged_expl.loc[
    _affected_ac_text.str.contains(",")
])
print(f"\tTotal merged-in mutations that have multiple comma-separated entries for affected protein AC: {test1}")

# Check for pipe-separated identifiers of affected protein
test1 = len(merged_expl.loc[
    _affected_ac_text.str.contains("\\|")
])
print(f"\tTotal merged-in mutations that have multiple pipe-separated entries for affected protein AC: {test1}")

# Check for UniProtKB vs. IntAct identifiers for affected protein.
# _total_with_ac is the denominator for every namespace share below; it is kept in its
# own name because test2 is reused further down for the UniProt-only count.
_total_with_ac = len(merged_expl.loc[
    _affected_ac.notna()
])
_is_uniprot = _affected_ac_text.str.contains("uniprotkb:")
_is_intact = _affected_ac_text.str.contains("intact:EBI-")
test2 = _total_with_ac
test1 = len(merged_expl.loc[_is_uniprot])
print(f"\tTotal merged-in mutations that have uniprotkb identifier for affected protein: {test1}/{test2} ({100*test1/test2:.2f}%)")
# How many uniprots have isoforms? (denominator: rows with a uniprotkb identifier)
test2 = len(merged_expl.loc[_is_uniprot])
test1 = len(merged_expl.loc[
    _is_uniprot &
    _affected_ac_text.str.contains("-")
])
print(f"\t\tFraction where an isoform is present: {test1}/{test2} ({100*test1/test2:.2f}%)")

# Share of all rows with an affected-protein AC (not of the UniProt-only rows)
test1 = len(merged_expl.loc[_is_intact])
print(f"\tTotal merged-in mutations that have IntAct identifier for affected protein: {test1}/{_total_with_ac} ({100*test1/_total_with_ac:.2f}%)")

# Everything that is neither uniprotkb nor intact, counted by identifier prefix
temp = merged_expl.loc[
    _affected_ac.notna() &
    ~_is_uniprot &
    ~_is_intact
]["Mutation Affected protein AC"].apply(lambda x: x.split(":")[0]).value_counts().to_dict()
test1 = sum(temp.values())
print(f"\tTotal merged-in mutations that have other identifiers: {test1}/{_total_with_ac} ({100*test1/_total_with_ac:.2f}%)")
print(f"\t\tBreakdown: {temp}")


	Total merged-in mutations that have multiple comma-separated entries for affected protein AC: 0
	Total merged-in mutations that have multiple pipe-separated entries for affected protein AC: 0
	Total merged-in mutations that have uniprotkb identifier for affected protein: 13174740/13175683 (99.99%)
		Fraction where an isoform is present: 1075551/13174740 (8.16%)
	Total merged-in mutations that have IntAct identifier for affected protein: 773/13174740 (0.01%)
	Total merged-in mutations that have other identifiers: 170/13174740 (0.00%)
		Breakdown: {'dip': 170}


In [252]:
# Break down the "Mutation Affected protein AC" values merged into merged_neg_expl:
# multi-valued entries, then identifier namespace (uniprotkb / intact / other).
_affected_ac = merged_neg_expl["Mutation Affected protein AC"]
_affected_ac_text = _affected_ac.fillna("")  # nulls as "" so .str.contains yields plain booleans

# Check for comma-separated identifiers of affected protein
test1 = len(merged_neg_expl.loc[
    _affected_ac_text.str.contains(",")
])
print(f"\tTotal merged-in mutations that have multiple comma-separated entries for affected protein AC: {test1}")

# Check for pipe-separated identifiers of affected protein
test1 = len(merged_neg_expl.loc[
    _affected_ac_text.str.contains("\\|")
])
print(f"\tTotal merged-in mutations that have multiple pipe-separated entries for affected protein AC: {test1}")

# Check for UniProtKB vs. IntAct identifiers for affected protein.
# _total_with_ac is the denominator for every namespace share below; it is kept in its
# own name because test2 is reused further down for the UniProt-only count.
_total_with_ac = len(merged_neg_expl.loc[
    _affected_ac.notna()
])
_is_uniprot = _affected_ac_text.str.contains("uniprotkb:")
_is_intact = _affected_ac_text.str.contains("intact:EBI-")
test2 = _total_with_ac
test1 = len(merged_neg_expl.loc[_is_uniprot])
print(f"\tTotal merged-in mutations that have uniprotkb identifier for affected protein: {test1}/{test2} ({100*test1/test2:.2f}%)")
# How many uniprots have isoforms? (denominator: rows with a uniprotkb identifier)
test2 = len(merged_neg_expl.loc[_is_uniprot])
test1 = len(merged_neg_expl.loc[
    _is_uniprot &
    _affected_ac_text.str.contains("-")
])
print(f"\t\tFraction where an isoform is present: {test1}/{test2} ({100*test1/test2:.2f}%)")

# Share of all rows with an affected-protein AC (not of the UniProt-only rows)
test1 = len(merged_neg_expl.loc[_is_intact])
print(f"\tTotal merged-in mutations that have IntAct identifier for affected protein: {test1}/{_total_with_ac} ({100*test1/_total_with_ac:.2f}%)")

# Everything that is neither uniprotkb nor intact, counted by identifier prefix
temp = merged_neg_expl.loc[
    _affected_ac.notna() &
    ~_is_uniprot &
    ~_is_intact
]["Mutation Affected protein AC"].apply(lambda x: x.split(":")[0]).value_counts().to_dict()
test1 = sum(temp.values())
print(f"\tTotal merged-in mutations that have other identifiers: {test1}/{_total_with_ac} ({100*test1/_total_with_ac:.2f}%)")
print(f"\t\tBreakdown: {temp}")

	Total merged-in mutations that have multiple comma-separated entries for affected protein AC: 0
	Total merged-in mutations that have multiple pipe-separated entries for affected protein AC: 0
	Total merged-in mutations that have uniprotkb identifier for affected protein: 18/18 (100.00%)
		Fraction where an isoform is present: 5/18 (27.78%)
	Total merged-in mutations that have IntAct identifier for affected protein: 0/18 (0.00%)
	Total merged-in mutations that have other identifiers: 0/18 (0.00%)
		Breakdown: {}


In [253]:
# For rows whose affected protein carries a uniprotkb identifier, check whether that
# identifier equals one of the four interactor ID columns of the same row.
_uniprot_affected = merged_expl["Mutation Affected protein AC"].fillna("").str.contains("uniprotkb:")
test1 = merged_expl.loc[
    _uniprot_affected,
    ["unique_id","uniprot_A_intact","uniprot_B_intact","uniprot_A","uniprot_B","Mutation Affected protein AC", "aa_1","aa_2"],
]
_affected = test1["Mutation Affected protein AC"]

# Rows where the affected protein differs from every interactor ID column
_matches_none = (
    test1["uniprot_A"].ne(_affected)
    & test1["uniprot_B"].ne(_affected)
    & test1["uniprot_A_intact"].ne(_affected)
    & test1["uniprot_B_intact"].ne(_affected)
)
test2 = len(test1.loc[_matches_none])
print(f"\tTotal rows where the Affected protein AC from UniProt does not match the IntAct-provided uniprot A or B, or the corrected ones we calculated: {test2}")

# Rows where the affected protein equals at least one interactor ID column
_matches_any = (
    test1["uniprot_A"].eq(_affected)
    | test1["uniprot_B"].eq(_affected)
    | test1["uniprot_A_intact"].eq(_affected)
    | test1["uniprot_B_intact"].eq(_affected)
)
test2 = len(test1.loc[_matches_any])
print(f"\tTotal rows where the Affected protein AC from UniProt does match the IntAct-provided uniprot A or B, or the corrected ones we calculated: {test2}")


	Total rows where the Affected protein AC from UniProt does not match the IntAct-provided uniprot A or B, or the corrected ones we calculated: 0
	Total rows where the Affected protein AC from UniProt does match the IntAct-provided uniprot A or B, or the corrected ones we calculated: 13174740


In [254]:
# For merged_neg_expl rows whose affected protein carries a uniprotkb identifier, check
# whether that identifier equals one of the four interactor ID columns of the same row.
_uniprot_affected = merged_neg_expl["Mutation Affected protein AC"].fillna("").str.contains("uniprotkb:")
test1 = merged_neg_expl.loc[
    _uniprot_affected,
    ["unique_id","uniprot_A_intact","uniprot_B_intact","uniprot_A","uniprot_B","Mutation Affected protein AC", "aa_1","aa_2"],
]
_affected = test1["Mutation Affected protein AC"]

# Rows where the affected protein differs from every interactor ID column
_matches_none = (
    test1["uniprot_A"].ne(_affected)
    & test1["uniprot_B"].ne(_affected)
    & test1["uniprot_A_intact"].ne(_affected)
    & test1["uniprot_B_intact"].ne(_affected)
)
test2 = len(test1.loc[_matches_none])
print(f"\tTotal rows where the Affected protein AC from UniProt does not match the IntAct-provided uniprot A or B, or the corrected ones we calculated: {test2}")

# Rows where the affected protein equals at least one interactor ID column
_matches_any = (
    test1["uniprot_A"].eq(_affected)
    | test1["uniprot_B"].eq(_affected)
    | test1["uniprot_A_intact"].eq(_affected)
    | test1["uniprot_B_intact"].eq(_affected)
)
test2 = len(test1.loc[_matches_any])
print(f"\tTotal rows where the Affected protein AC from UniProt does match the IntAct-provided uniprot A or B, or the corrected ones we calculated: {test2}")


	Total rows where the Affected protein AC from UniProt does not match the IntAct-provided uniprot A or B, or the corrected ones we calculated: 0
	Total rows where the Affected protein AC from UniProt does match the IntAct-provided uniprot A or B, or the corrected ones we calculated: 18


In [255]:
def parse_mutation_short_for_interactors(s: str):
    """
    Extract the affected protein accession(s) from a mutation_short string.

    mutation_short may hold several "|"-separated entries, e.g.
    Q61686:p.[Ile165Glu;Tyr168Glu]|P20340-1:p.Gln72Leu
    For each entry, the text before its first ":" is taken as the affected protein.

    Args:
        s: the mutation_short string, or a float (NaN) when the value is missing.

    Returns:
        The accessions joined with "|", in input order and without deduplication
        (the example above gives "Q61686|P20340-1"); np.nan when `s` is a float.
    """
    # isinstance also covers float subclasses such as np.float64 NaN values.
    if isinstance(s, float):
        return np.nan
    # str.split takes a literal separator (not a regex), so split on "|" itself;
    # an escaped "\\|" would only match a backslash followed by a pipe.
    return "|".join(part.split(":")[0] for part in s.split("|"))

# Add a mutation short column to get an affected partner column.
# Missing sides are filled with "", so the join can leave a dangling "|" at either end.
merged_neg_expl["mutation_short"] = merged_neg_expl["mutation_short_1"].fillna("") + "|" + merged_neg_expl["mutation_short_2"].fillna("")
# str.strip takes a set of characters (not a regex): pass "|" alone so that a
# leading/trailing backslash is never stripped by accident.
merged_neg_expl["mutation_short"] = merged_neg_expl["mutation_short"].str.strip("|")
# Rows with no mutation on either side end up as "" -> mark them as missing.
merged_neg_expl["mutation_short"] = merged_neg_expl["mutation_short"].apply(lambda x: np.nan if x=="" else x)

# Are there any rows where both proteins were mutated?
# str.contains interprets its pattern as a regex, so here the pipe must be escaped.
# NOTE(review): this counts any row whose combined value contains "|"; confirm that
# mutation_short_1 / mutation_short_2 never contain "|" themselves in this table.
test1 = len(merged_neg_expl.loc[merged_neg_expl["mutation_short"].fillna("").str.contains(r"\|")])
print(f"Total negative mutation rows that have mutations on both proteins: {test1}")

# Derive the affected partner(s) from the combined mutation_short column.
merged_neg_expl["mutation_affected_partner"] = merged_neg_expl["mutation_short"].apply(parse_mutation_short_for_interactors)
display(merged_neg_expl.loc[
    merged_neg_expl["mutation_short"].notna()
][
    ["mutation_short","mutation_affected_partner"]
])

Total negative mutation rows that have mutations on both proteins: 0


Unnamed: 0,mutation_short,mutation_affected_partner
109,P20340:p.Gln72Leu,P20340
663,Q9NRW1:p.Gln72Leu,Q9NRW1
664,Q9NRW1:p.Gln72Leu,Q9NRW1
665,Q9NRW1:p.Gln72Leu,Q9NRW1
666,Q9NRW1:p.Gln72Leu,Q9NRW1
667,Q9NRW1:p.Thr27Asn,Q9NRW1
668,Q9NRW1:p.Thr27Asn,Q9NRW1
752,Q61686:p.[Ile165Glu;Tyr168Glu],Q61686
763,Q61686:p.[Ile165Glu;Tyr168Glu],Q61686
764,Q61686:p.[Ile165Glu;Tyr168Glu],Q61686


In [256]:
# Negative rows that (1) carry a UniProt affected-protein accession, (2) have
# scraped mutation info, and (3) whose affected accession equals at least one of
# the uniprot_A / uniprot_B / uniprot_A_intact / uniprot_B_intact columns.
affected_ac = merged_neg_expl["Mutation Affected protein AC"]
has_uniprot_ac = affected_ac.fillna("").str.contains("uniprotkb:")
has_scraped_info = merged_neg_expl["scraped_mut_has_info"]
matches_an_interactor = (
    (merged_neg_expl["uniprot_A"] == affected_ac)
    | (merged_neg_expl["uniprot_B"] == affected_ac)
    | (merged_neg_expl["uniprot_A_intact"] == affected_ac)
    | (merged_neg_expl["uniprot_B_intact"] == affected_ac)
)
test1 = merged_neg_expl.loc[has_uniprot_ac & has_scraped_info & matches_an_interactor]
print(f"Displaying a few rows where there is a match between scraped-identity and merged-agg identity (total: {len(test1)})")

# Identity columns from both sources, shown side by side for inspection.
identity_cols = [
    "interaction_intactid", "year", "interaction_xml_id",
    "uniprot_A", "uniprot_B", "uniprot_A_intact", "uniprot_B_intact",
    "Mutation Affected protein AC", "Mutation Feature short label",
    "mutation_short", "mutation_affected_partner", "aa_1", "aa_2",
]
display(test1[identity_cols])

Displaying a few rows where there is a match between scraped-identity and merged-agg identity (total: 18)


Unnamed: 0,interaction_intactid,year,interaction_xml_id,uniprot_A,uniprot_B,uniprot_A_intact,uniprot_B_intact,Mutation Affected protein AC,Mutation Feature short label,mutation_short,mutation_affected_partner,aa_1,aa_2
109,EBI-8840423,2013,25,uniprotkb:Q02410-0,uniprotkb:P20340-0,uniprotkb:Q02410-1,uniprotkb:P20340,uniprotkb:P20340,P20340:p.Gln72Leu,P20340:p.Gln72Leu,P20340,MNHLEGSAEVEVTDEAAGGEVNESVEADLEHPEVEEEQQQPPQQQH...,MSTGGDFGNPLRKFKLVFLGEQSVGKTSLITRFMYDSFDNTYQATI...
663,EBI-8840419,2013,18,uniprotkb:Q02410-0,uniprotkb:Q9NRW1-0,uniprotkb:Q02410-1,uniprotkb:Q9NRW1,uniprotkb:Q9NRW1,Q9NRW1:p.Gln72Leu,Q9NRW1:p.Gln72Leu,Q9NRW1,MNHLEGSAEVEVTDEAAGGEVNESVEADLEHPEVEEEQQQPPQQQH...,MSAGGDFGNPLRKFKLVFLGEQSVGKTSLITRFMYDSFDNTYQATI...
664,EBI-8840571,2013,32,uniprotkb:Q02410-0,uniprotkb:Q9NRW1-0,uniprotkb:Q02410-1,uniprotkb:Q9NRW1,uniprotkb:Q9NRW1,Q9NRW1:p.Gln72Leu,Q9NRW1:p.Gln72Leu,Q9NRW1,MNHLEGSAEVEVTDEAAGGEVNESVEADLEHPEVEEEQQQPPQQQH...,MSAGGDFGNPLRKFKLVFLGEQSVGKTSLITRFMYDSFDNTYQATI...
665,EBI-8840307,2013,13,uniprotkb:Q02410-0,uniprotkb:Q9NRW1-0,uniprotkb:Q02410-1,uniprotkb:Q9NRW1,uniprotkb:Q9NRW1,Q9NRW1:p.Gln72Leu,Q9NRW1:p.Gln72Leu,Q9NRW1,MNHLEGSAEVEVTDEAAGGEVNESVEADLEHPEVEEEQQQPPQQQH...,MSAGGDFGNPLRKFKLVFLGEQSVGKTSLITRFMYDSFDNTYQATI...
666,EBI-8840307,2013,13,uniprotkb:Q02410-0,uniprotkb:Q9NRW1-0,uniprotkb:Q02410-1,uniprotkb:Q9NRW1,uniprotkb:Q9NRW1,Q9NRW1:p.Thr27Asn,Q9NRW1:p.Gln72Leu,Q9NRW1,MNHLEGSAEVEVTDEAAGGEVNESVEADLEHPEVEEEQQQPPQQQH...,MSAGGDFGNPLRKFKLVFLGEQSVGKTSLITRFMYDSFDNTYQATI...
667,EBI-8840307,2013,13,uniprotkb:Q02410-0,uniprotkb:Q9NRW1-0,uniprotkb:Q02410-1,uniprotkb:Q9NRW1,uniprotkb:Q9NRW1,Q9NRW1:p.Gln72Leu,Q9NRW1:p.Thr27Asn,Q9NRW1,MNHLEGSAEVEVTDEAAGGEVNESVEADLEHPEVEEEQQQPPQQQH...,MSAGGDFGNPLRKFKLVFLGEQSVGKTSLITRFMYDSFDNTYQATI...
668,EBI-8840307,2013,13,uniprotkb:Q02410-0,uniprotkb:Q9NRW1-0,uniprotkb:Q02410-1,uniprotkb:Q9NRW1,uniprotkb:Q9NRW1,Q9NRW1:p.Thr27Asn,Q9NRW1:p.Thr27Asn,Q9NRW1,MNHLEGSAEVEVTDEAAGGEVNESVEADLEHPEVEEEQQQPPQQQH...,MSAGGDFGNPLRKFKLVFLGEQSVGKTSLITRFMYDSFDNTYQATI...
752,EBI-8596072,2003,11,uniprotkb:Q61686-0,uniprotkb:O54864-0,uniprotkb:Q61686,uniprotkb:O54864,uniprotkb:Q61686,Q61686:p.[Ile165Glu;Tyr168Glu],Q61686:p.[Ile165Glu;Tyr168Glu],Q61686,MGKKTKRTADSSSSEDEEEYVVEKVLDRRMVKGQVEYLLKWKGFSE...,MAENLKGCSVCCKSSWNQLQDLCRLAKLSCPALGVSKKNLYDFEVE...
763,EBI-8596042,2003,4,uniprotkb:Q61686-0,uniprotkb:Q61686-0,uniprotkb:Q61686,uniprotkb:Q61686,uniprotkb:Q61686,Q61686:p.[Ile165Glu;Tyr168Glu],Q61686:p.[Ile165Glu;Tyr168Glu],Q61686,MGKKTKRTADSSSSEDEEEYVVEKVLDRRMVKGQVEYLLKWKGFSE...,MGKKTKRTADSSSSEDEEEYVVEKVLDRRMVKGQVEYLLKWKGFSE...
764,EBI-8596042,2003,4,uniprotkb:Q61686-0,uniprotkb:Q61686-0,uniprotkb:Q61686,uniprotkb:Q61686,uniprotkb:Q61686,Q61686:p.[Ile165Glu;Tyr168Glu],Q61686:p.[Ile165Glu;Tyr168Glu],Q61686,MGKKTKRTADSSSSEDEEEYVVEKVLDRRMVKGQVEYLLKWKGFSE...,MGKKTKRTADSSSSEDEEEYVVEKVLDRRMVKGQVEYLLKWKGFSE...


In [257]:
# Look up one specific interaction id in the negative exploded table and in the
# full merged table, showing the same identifying columns for both.
lookup_cols = ["interaction_intactid", "year", "interaction_xml_id"]
for frame in (merged_neg_expl, merged):
    display(frame.loc[frame["interaction_intactid"] == "EBI-6857409"][lookup_cols])

Unnamed: 0,interaction_intactid,year,interaction_xml_id


Unnamed: 0,interaction_intactid,year,interaction_xml_id
366901,EBI-6857409,2007,29


In [258]:
# Planned process: determine which sequence is affected --> mutate its sequence
# according to the range --> assign labels based on my annotations.
# Preview the columns relevant to that process for rows that have a feature AC.
feature_preview_cols = [
    "unique_id", "uniprot_A", "uniprot_B", "Mutation Affected protein AC",
    "aa_1", "aa_2", "Interaction identifier(s)",
    "Mutation Original sequence", "Mutation Resulting sequence",
    "Mutation Feature type", "Mutation Feature range(s)",
    "Mutation Feature annotation(s)",
    "Mutation Affected protein symbol", "Mutation Affected protein full name",
]
has_feature_ac = merged_expl["Mutation # Feature AC"].notna()
temp = merged_expl.loc[has_feature_ac, feature_preview_cols].reset_index(drop=True)
temp

Unnamed: 0,unique_id,uniprot_A,uniprot_B,Mutation Affected protein AC,aa_1,aa_2,Interaction identifier(s),Mutation Original sequence,Mutation Resulting sequence,Mutation Feature type,Mutation Feature range(s),Mutation Feature annotation(s),Mutation Affected protein symbol,Mutation Affected protein full name
0,intact:EBI-1000553_intact:EBI-475981,uniprotkb:P08069-0,uniprotkb:P29353-2,uniprotkb:P08069,MKSGSGGGSPTSLWGLLFLSAALSLWPTSGEICGPGIDIRNDYQQL...,MNKLSGGGGRRTRVEGGQLGGEEWTRHGSFVNKPTRGWLHPNDKVM...,intact:EBI-2437761|intact:EBI-2437595,Y,F,"psi-mi:""MI:0573""(mutation disrupting interaction)",980-980,-,uniprotkb:IGF1R(gene name),Insulin-like growth factor 1 receptor
1,intact:EBI-1000553_intact:EBI-475981,uniprotkb:P08069-0,uniprotkb:P29353-2,uniprotkb:P08069,MKSGSGGGSPTSLWGLLFLSAALSLWPTSGEICGPGIDIRNDYQQL...,MNKLSGGGGRRTRVEGGQLGGEEWTRHGSFVNKPTRGWLHPNDKVM...,intact:EBI-2437761|intact:EBI-2437595,K,A,"psi-mi:""MI:0573""(mutation disrupting interaction)",1033-1033,-,uniprotkb:IGF1R(gene name),Insulin-like growth factor 1 receptor
2,intact:EBI-1000553_intact:EBI-475981,uniprotkb:P08069-0,uniprotkb:P29353-2,uniprotkb:P08069,MKSGSGGGSPTSLWGLLFLSAALSLWPTSGEICGPGIDIRNDYQQL...,MNKLSGGGGRRTRVEGGQLGGEEWTRHGSFVNKPTRGWLHPNDKVM...,intact:EBI-2437761|intact:EBI-2437595,Y,F,"psi-mi:""MI:0573""(mutation disrupting interaction)",980-980,-,uniprotkb:IGF1R(gene name),Insulin-like growth factor 1 receptor
3,intact:EBI-1000553_intact:EBI-475981,uniprotkb:P08069-0,uniprotkb:P29353-2,uniprotkb:P08069,MKSGSGGGSPTSLWGLLFLSAALSLWPTSGEICGPGIDIRNDYQQL...,MNKLSGGGGRRTRVEGGQLGGEEWTRHGSFVNKPTRGWLHPNDKVM...,intact:EBI-2437761|intact:EBI-2437595,K,A,"psi-mi:""MI:0573""(mutation disrupting interaction)",1033-1033,-,uniprotkb:IGF1R(gene name),Insulin-like growth factor 1 receptor
4,intact:EBI-10006231_intact:EBI-2511350,uniprotkb:Q16513-0,uniprotkb:O92972-PRO_0000278753,uniprotkb:Q16513,MASNPERGEILLTELQGDSRSLPFSENVSAVQKLDFSDTMVQQKLD...,SMSYTWTGALITPCAAEESKLPINPLSNSLLRHHNMVYATTSRSAS...,intact:EBI-9678732|intact:EBI-10026878|intact:...,L,I,"psi-mi:""MI:0118""(mutation)",520-520,-,uniprotkb:PKN2(gene name),Serine/threonine-protein kinase N2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13175678,intact:EBI-992580_intact:EBI-992580,uniprotkb:Q13188-0,uniprotkb:Q13188-0,uniprotkb:Q13188,MEQPPAPKSKLKKLSEDSLTKQPEEVFDVLEKLGEGSYGSVFKAIH...,MEQPPAPKSKLKKLSEDSLTKQPEEVFDVLEKLGEGSYGSVFKAIH...,intact:EBI-16071090|intact:EBI-16071160|intact...,E,A,"psi-mi:""MI:1128""(mutation disrupting interacti...",462-462,-,uniprotkb:STK3(gene name),Serine/threonine-protein kinase 3
13175679,intact:EBI-992580_intact:EBI-992580,uniprotkb:Q13188-0,uniprotkb:Q13188-0,uniprotkb:Q13188,MEQPPAPKSKLKKLSEDSLTKQPEEVFDVLEKLGEGSYGSVFKAIH...,MEQPPAPKSKLKKLSEDSLTKQPEEVFDVLEKLGEGSYGSVFKAIH...,intact:EBI-16071090|intact:EBI-16071160|intact...,Y,A,"psi-mi:""MI:1128""(mutation disrupting interacti...",470-470,-,uniprotkb:STK3(gene name),Serine/threonine-protein kinase 3
13175680,intact:EBI-992580_intact:EBI-992580,uniprotkb:Q13188-0,uniprotkb:Q13188-0,uniprotkb:Q13188,MEQPPAPKSKLKKLSEDSLTKQPEEVFDVLEKLGEGSYGSVFKAIH...,MEQPPAPKSKLKKLSEDSLTKQPEEVFDVLEKLGEGSYGSVFKAIH...,intact:EBI-16071090|intact:EBI-16071160|intact...,Y,A,"psi-mi:""MI:1128""(mutation disrupting interacti...",470-470,-,uniprotkb:STK3(gene name),Serine/threonine-protein kinase 3
13175681,intact:EBI-992580_intact:EBI-992580,uniprotkb:Q13188-0,uniprotkb:Q13188-0,uniprotkb:Q13188,MEQPPAPKSKLKKLSEDSLTKQPEEVFDVLEKLGEGSYGSVFKAIH...,MEQPPAPKSKLKKLSEDSLTKQPEEVFDVLEKLGEGSYGSVFKAIH...,intact:EBI-16071090|intact:EBI-16071160|intact...,L,A,"psi-mi:""MI:1133""(mutation decreasing interacti...",478-478,-,uniprotkb:STK3(gene name),Serine/threonine-protein kinase 3


In [259]:
## Positives
# Left-join the three labeled lookup tables onto merged_expl. Each table is keyed
# by a generic "feature" column, which is renamed to the merged_expl column it
# joins on; the other listed columns are renamed to their final names. Only the
# renamed columns are carried into the merge.
label_joins = [
    # Labels keyed by feature type
    (
        mutation_feature_types_labeled,
        {
            "feature": "Mutation Feature type",
            "original_sequence": "Mutation og_binds_bo_feature_type",
            "mutated_sequence": "Mutation new_binds_bo_feature_type",
        },
    ),
    # Labels keyed by feature annotation(s)
    (
        mutation_feature_annotations_labeled,
        {
            "feature": "Mutation Feature annotation(s)",
            "original_sequence": "Mutation og_binds_bo_annotation",
            "mutated_sequence": "Mutation new_binds_bo_annotation",
            "mutated_binds_to_uniprot": "Mutation new_binds_to_uniprot_bo_annotation",
            "mutated_no_bind_to_uniprot": "Mutation new_nobind_to_uniprot_bo_annotation",
            "mutated_binds_to_genename": "Mutation new_binds_to_gname_bo_annotation",
            "mutated_no_bind_to_genename": "Mutation new_nobind_to_gname_bo_annotation",
        },
    ),
    # Labels keyed by feature AC
    (
        mutation_feature_ac_labeled,
        {
            "feature": "Mutation # Feature AC",
            "original_sequence": "Mutation og_binds_bo_ac",
            "mutated_sequence": "Mutation new_binds_bo_ac",
        },
    ),
]

for label_table, column_names in label_joins:
    join_key = column_names["feature"]
    # dict order == desired column order (join key first, then the label columns)
    labels = label_table.rename(columns=column_names)[list(column_names.values())]
    merged_expl = merged_expl.merge(labels, on=join_key, how="left")

merged_expl.head()


Unnamed: 0,ID(s) interactor A,ID(s) interactor B,Alt. ID(s) interactor A,Alt. ID(s) interactor B,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,...,Mutation og_binds_bo_feature_type,Mutation new_binds_bo_feature_type,Mutation og_binds_bo_annotation,Mutation new_binds_bo_annotation,Mutation new_binds_to_uniprot_bo_annotation,Mutation new_nobind_to_uniprot_bo_annotation,Mutation new_binds_to_gname_bo_annotation,Mutation new_nobind_to_gname_bo_annotation,Mutation og_binds_bo_ac,Mutation new_binds_bo_ac
0,intact:EBI-101707,intact:EBI-100018,uniprotkb:Q86P48,uniprotkb:Q9VE54,psi-mi:atbp_drome|psi-mi:ATbp|uniprotkb:ATbp|u...,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,"psi-mi:""MI:0397""(two hybrid array)",Shokri et al. (2019),doi:10.1016/j.celrep.2019.03.071|pubmed:309954...,taxid:7227(drome),...,,,,,,,,,,
1,intact:EBI-100018,intact:EBI-102069,uniprotkb:Q9VE54,uniprotkb:O16844,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,psi-mi:cos_drome|psi-mi:cos|uniprotkb:A1Z6X4|u...,"psi-mi:""MI:0399""(two hybrid fragment pooling a...",Formstecher et al. (2005),pubmed:15710747|imex:IM-16519|mint:MINT-5217543,taxid:7227(drome),...,,,,,,,,,,
2,intact:EBI-104215,intact:EBI-100018,uniprotkb:Q9VTR6,uniprotkb:Q9VE54,psi-mi:q9vtr6_drome|psi-mi:prc|uniprotkb:prc|u...,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,"psi-mi:""MI:0018""(two hybrid)",Giot et al. (2003),pubmed:14605208|imex:IM-16524|mint:MINT-5216804,taxid:7227(drome),...,,,,,,,,,,
3,intact:EBI-100018,intact:EBI-107089,uniprotkb:Q9VE54,uniprotkb:Q9VWG2,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,psi-mi:q9vwg2_drome|psi-mi:SDS3|uniprotkb:SDS3...,"psi-mi:""MI:0018""(two hybrid)",Giot et al. (2003),pubmed:14605208|imex:IM-16524|mint:MINT-5216804,taxid:7227(drome),...,,,,,,,,,,
4,intact:EBI-117032,intact:EBI-100018,uniprotkb:Q9VHR4,uniprotkb:Q9VE54,psi-mi:q9vhr4_drome|psi-mi:Dmel\CG7963|uniprot...,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,"psi-mi:""MI:0397""(two hybrid array)",Shokri et al. (2019),doi:10.1016/j.celrep.2019.03.071|pubmed:309954...,taxid:7227(drome),...,,,,,,,,,,


In [260]:
# List the columns of `merged` whose name contains the lowercase text "mutation".
[col_name for col_name in merged.columns if "mutation" in col_name]

['mutation_mi_1',
 'mutation_name_1',
 'mutation_short_1',
 'mutation_begin_1',
 'mutation_end_1',
 'mutation_orig_1',
 'mutation_new_1',
 'mutation_mi_2',
 'mutation_name_2',
 'mutation_short_2',
 'mutation_begin_2',
 'mutation_end_2',
 'mutation_orig_2',
 'mutation_new_2']

In [261]:
# Show the interaction id plus every mutation_<field>_<n> column (n = 1, 2)
# for the rows where mutation_mi_1 is present.
mutation_fields = ("mi", "name", "short", "begin", "end", "orig", "new")
mutation_cols = [
    f"mutation_{field}_{suffix}"
    for suffix in (1, 2)
    for field in mutation_fields
]
merged.loc[merged["mutation_mi_1"].notna()][["interaction_intactid"] + mutation_cols]

Unnamed: 0,interaction_intactid,mutation_mi_1,mutation_name_1,mutation_short_1,mutation_begin_1,mutation_end_1,mutation_orig_1,mutation_new_1,mutation_mi_2,mutation_name_2,mutation_short_2,mutation_begin_2,mutation_end_2,mutation_orig_2,mutation_new_2
107,EBI-2437595,MI:0573|MI:0573,mutation disrupting interaction|mutation disru...,P08069:p.Lys1033Ala|P08069:p.Tyr980Phe,1033|980,1033|980,K|Y,A|F,,,,,,,
127,EBI-9678671,MI:0118,mutation,Q16513:p.Leu520Ile,520,520,L,I,,,,,,,
128,EBI-9998408,MI:0118,mutation,Q16513:p.Leu520Ile,520,520,L,I,,,,,,,
355,EBI-15947532,MI:0573|MI:0573,mutation disrupting interaction|mutation disru...,O60566:p.[Leu128Ala;Leu131Ala]|O60566:p.[Tyr14...,"131,128|142,141","131,128|142,141","L,L|L,Y","A,A|A,A",,,,,,,
356,EBI-15947620,MI:0119|MI:0119,mutation decreasing interaction|mutation decre...,O60566:p.[Tyr141Ala;Leu142Ala]|O60566:p.[Leu12...,"142,141|131,128","142,141|131,128","L,Y|L,L","A,A|A,A",,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
745016,EBI-22271845,MI:2226,mutation with no effect,Q96F86:p.Phe213Leu,213,213,F,L,,,,,,,
745018,EBI-22123304,MI:2226|MI:2226|MI:2226|MI:2226|MI:2226,mutation with no effect|mutation with no effec...,Q96F86:p.Glu450Lys|Q96F86:p.Ile442Val|Q96F86:p...,450|442|416|378|267,450|442|416|378|267,E|I|N|I|V,K|V|T|V|M,,,,,,,
745032,EBI-27095251,MI:0429,necessary binding region,daz repeat regions,497,712,,,,,,,,,
745042,EBI-27095585,MI:0429,necessary binding region,region,161,185,,,,,,,,,


In [262]:
# Keep the rows of merged_expl flagged as having mutation info from either
# source (scraped_mut_has_info or agg_mut_has_info), and report their share.
mask = merged_expl["scraped_mut_has_info"] | merged_expl["agg_mut_has_info"]
merged_expl_mut = merged_expl.loc[mask].reset_index(drop=True)

n_mut_rows = len(merged_expl_mut)
n_all_rows = len(merged_expl)
print(f"\tTotal rows of exploded merged that have at least one column of mutation-related data (from XML or aggregated): {n_mut_rows}/{n_all_rows} ({100*n_mut_rows/n_all_rows:.2f}%)")

	Total rows of exploded merged that have at least one column of mutation-related data (from XML or aggregated): 13179124/13886551 (94.91%)


In [263]:
# Drop the name merged_expl; the cells below work with merged_expl_mut instead.
# NOTE(review): presumably done to release memory held by the full exploded table - confirm.
del merged_expl

In [264]:
# Left-join the three labeled lookup tables onto merged_neg_expl. Each table is
# keyed by a generic "feature" column, which is renamed to the merged_neg_expl
# column it joins on; the other listed columns are renamed to their final names.
# Only the renamed columns are carried into the merge.
neg_label_joins = [
    # Labels keyed by feature type
    (
        mutation_feature_types_labeled,
        {
            "feature": "Mutation Feature type",
            "original_sequence": "Mutation og_binds_bo_feature_type",
            "mutated_sequence": "Mutation new_binds_bo_feature_type",
        },
    ),
    # Labels keyed by feature annotation(s)
    (
        mutation_feature_annotations_labeled,
        {
            "feature": "Mutation Feature annotation(s)",
            "original_sequence": "Mutation og_binds_bo_annotation",
            "mutated_sequence": "Mutation new_binds_bo_annotation",
            "mutated_binds_to_uniprot": "Mutation new_binds_to_uniprot_bo_annotation",
            "mutated_no_bind_to_uniprot": "Mutation new_nobind_to_uniprot_bo_annotation",
            "mutated_binds_to_genename": "Mutation new_binds_to_gname_bo_annotation",
            "mutated_no_bind_to_genename": "Mutation new_nobind_to_gname_bo_annotation",
        },
    ),
    # Labels keyed by feature AC
    (
        mutation_feature_ac_labeled,
        {
            "feature": "Mutation # Feature AC",
            "original_sequence": "Mutation og_binds_bo_ac",
            "mutated_sequence": "Mutation new_binds_bo_ac",
        },
    ),
]

for label_table, column_names in neg_label_joins:
    join_key = column_names["feature"]
    # dict order == desired column order (join key first, then the label columns)
    labels = label_table.rename(columns=column_names)[list(column_names.values())]
    merged_neg_expl = merged_neg_expl.merge(labels, on=join_key, how="left")

merged_neg_expl.head()

Unnamed: 0,ID(s) interactor A,ID(s) interactor B,Alt. ID(s) interactor A,Alt. ID(s) interactor B,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,...,Mutation og_binds_bo_feature_type,Mutation new_binds_bo_feature_type,Mutation og_binds_bo_annotation,Mutation new_binds_bo_annotation,Mutation new_binds_to_uniprot_bo_annotation,Mutation new_nobind_to_uniprot_bo_annotation,Mutation new_binds_to_gname_bo_annotation,Mutation new_nobind_to_gname_bo_annotation,Mutation og_binds_bo_ac,Mutation new_binds_bo_ac
0,intact:EBI-1002565,intact:EBI-1002822,uniprotkb:Q10173,uniprotkb:Q9Y738,psi-mi:nuf2_schpo|psi-mi:nuf2|uniprotkb:NMS co...,psi-mi:mis12_schpo|psi-mi:mis12|uniprotkb:NMS ...,"psi-mi:""MI:0018""(two hybrid)",Asakawa et al. (2005),pubmed:15728720,taxid:284812(schpo),...,,,,,,,,,,
1,intact:EBI-1014500,intact:EBI-1397518,uniprotkb:P35240-1|ensembl:ENSP00000344666.5,uniprotkb:P0DPB3-1|ensembl:ENSP00000491030.1,psi-mi:p35240-1|psi-mi:NF2|uniprotkb:I|uniprot...,psi-mi:p0dpb3-1|psi-mi:SCHIP1|uniprotkb:Q9P0W5...,"psi-mi:""MI:0007""(anti tag coimmunoprecipitation)",Goutebroze et al. (2000),pubmed:10669747|imex:IM-19739,taxid:9606(human),...,,,,,,,,,,
2,intact:EBI-16428984,intact:EBI-10171697,uniprotkb:A0A0S2Z6H0,uniprotkb:Q6A162|ensembl:ENSP00000366984.4,psi-mi:a0a0s2z6h0_human|psi-mi:ZGPAT|uniprotkb...,psi-mi:k1c40_human|psi-mi:KRT40|uniprotkb:Q6IF...,"psi-mi:""MI:0397""(two hybrid array)",Yang et al. (2016),pubmed:26871637|imex:IM-25013|doi:10.1016/j.ce...,taxid:9606(human),...,,,,,,,,,,
3,intact:EBI-16467584,intact:EBI-10171697,,uniprotkb:Q6A162|ensembl:ENSP00000366984.4,"psi-mi:""ccsb isoform id: gad1_3""|psi-mi:EBI-16...",psi-mi:k1c40_human|psi-mi:KRT40|uniprotkb:Q6IF...,"psi-mi:""MI:0397""(two hybrid array)",Yang et al. (2016),pubmed:26871637|imex:IM-25013|doi:10.1016/j.ce...,taxid:9606(human),...,,,,,,,,,,
4,intact:EBI-16468000,intact:EBI-10171697,uniprotkb:A0A0S2Z5U3,uniprotkb:Q6A162|ensembl:ENSP00000366984.4,psi-mi:a0a0s2z5u3_human|psi-mi:HNRPLL|uniprotk...,psi-mi:k1c40_human|psi-mi:KRT40|uniprotkb:Q6IF...,"psi-mi:""MI:0397""(two hybrid array)",Yang et al. (2016),pubmed:26871637|imex:IM-25013|doi:10.1016/j.ce...,taxid:9606(human),...,,,,,,,,,,


In [265]:
# Keep the rows of merged_neg_expl flagged as having mutation info from either
# source (scraped_mut_has_info or agg_mut_has_info), and report their share.
mask = merged_neg_expl["scraped_mut_has_info"] | merged_neg_expl["agg_mut_has_info"]
merged_neg_expl_mut = merged_neg_expl.loc[mask].reset_index(drop=True)

n_neg_mut_rows = len(merged_neg_expl_mut)
n_neg_rows = len(merged_neg_expl)
print(f"\tTotal rows of exploded merged that have at least one column of mutation-related data (from XML or aggregated): {n_neg_mut_rows}/{n_neg_rows} ({100*n_neg_mut_rows/n_neg_rows:.2f}%)")

	Total rows of exploded merged that have at least one column of mutation-related data (from XML or aggregated): 18/976 (1.84%)


In [266]:
# Columns to inspect: a few identity/sequence columns followed by all scraped
# and all aggregated mutation columns. Printed one name per line.
identity_cols = [
    "unique_id",
    "uniprot_A",
    "uniprot_B",
    "Mutation Affected protein AC",
    "aa_1",
    "aa_2",
    "Interaction identifier(s)",
]
investigate_cols = identity_cols + scraped_mut_cols + agg_mut_cols
print(*investigate_cols, sep="\n")

unique_id
uniprot_A
uniprot_B
Mutation Affected protein AC
aa_1
aa_2
Interaction identifier(s)
mutation_mi_1
mutation_name_1
mutation_short_1
mutation_begin_1
mutation_end_1
mutation_orig_1
mutation_new_1
mutation_mi_2
mutation_name_2
mutation_short_2
mutation_begin_2
mutation_end_2
mutation_orig_2
mutation_new_2
Mutation # Feature AC
Mutation Feature short label
Mutation Feature range(s)
Mutation Original sequence
Mutation Resulting sequence
Mutation Feature type
Mutation Feature annotation(s)
Mutation Affected protein AC
Mutation Affected protein symbol
Mutation Affected protein full name
Mutation Affected protein organism
Mutation Interaction participants
Mutation PubMedID
Mutation Figure legend(s)
Mutation Xref ID(s)


In [267]:
# Print every column name of merged_expl_mut in sorted order, one per line.
print(*sorted(merged_expl_mut.columns), sep="\n")

Alias(es) interactor A
Alias(es) interactor B
Alt. ID(s) interactor A
Alt. ID(s) interactor B
Annotation(s) interactor A
Annotation(s) interactor B
Biological role(s) interactor A
Biological role(s) interactor B
Checksum(s) interactor A
Checksum(s) interactor B
Confidence value(s)
Creation date
DB Sources interactor A
DB Sources interactor B
Experimental role(s) interactor A
Experimental role(s) interactor B
Host organism(s)
ID(s) interactor A
ID(s) interactor B
Interaction Checksum(s)
Interaction Xref(s)
Interaction annotation(s)
Interaction detection method(s)
Interaction identifier(s)
Interaction parameter(s)
Interaction type(s)
Mutation # Feature AC
Mutation Affected protein AC
Mutation Affected protein full name
Mutation Affected protein organism
Mutation Affected protein symbol
Mutation Feature annotation(s)
Mutation Feature range(s)
Mutation Feature short label
Mutation Feature type
Mutation Figure legend(s)
Mutation Interaction AC
Mutation Interaction participants
Mutation Orig

In [268]:
# For rows with a UniProt affected-protein accession, show the interactor
# identifier columns next to the affected-protein description columns.
has_uniprot_ac = merged_expl_mut["Mutation Affected protein AC"].fillna("").str.contains("uniprotkb:")
id_vs_affected_cols = [
    "all_intact_A_sorted",
    "all_intact_B_sorted",
    "uniprot_A_noisoforms",
    "uniprot_B_noisoforms",
    "Mutation Affected protein AC",
    "Mutation Affected protein symbol",
    "Mutation Affected protein full name",
]
merged_expl_mut.loc[has_uniprot_ac, id_vs_affected_cols]

Unnamed: 0,all_intact_A_sorted,all_intact_B_sorted,uniprot_A_noisoforms,uniprot_B_noisoforms,Mutation Affected protein AC,Mutation Affected protein symbol,Mutation Affected protein full name
0,intact:EBI-475981,intact:EBI-1000553,uniprotkb:P08069,uniprotkb:P29353,uniprotkb:P08069,uniprotkb:IGF1R(gene name),Insulin-like growth factor 1 receptor
1,intact:EBI-475981,intact:EBI-1000553,uniprotkb:P08069,uniprotkb:P29353,uniprotkb:P08069,uniprotkb:IGF1R(gene name),Insulin-like growth factor 1 receptor
2,intact:EBI-475981,intact:EBI-1000553,uniprotkb:P08069,uniprotkb:P29353,uniprotkb:P08069,uniprotkb:IGF1R(gene name),Insulin-like growth factor 1 receptor
3,intact:EBI-475981,intact:EBI-1000553,uniprotkb:P08069,uniprotkb:P29353,uniprotkb:P08069,uniprotkb:IGF1R(gene name),Insulin-like growth factor 1 receptor
5,intact:EBI-2511350,intact:EBI-10006231,uniprotkb:Q16513,uniprotkb:O92972,uniprotkb:Q16513,uniprotkb:PKN2(gene name),Serine/threonine-protein kinase N2
...,...,...,...,...,...,...,...
13179118,intact:EBI-992580,intact:EBI-992580,uniprotkb:Q13188,uniprotkb:Q13188,uniprotkb:Q13188,uniprotkb:STK3(gene name),Serine/threonine-protein kinase 3
13179119,intact:EBI-992580,intact:EBI-992580,uniprotkb:Q13188,uniprotkb:Q13188,uniprotkb:Q13188,uniprotkb:STK3(gene name),Serine/threonine-protein kinase 3
13179120,intact:EBI-992580,intact:EBI-992580,uniprotkb:Q13188,uniprotkb:Q13188,uniprotkb:Q13188,uniprotkb:STK3(gene name),Serine/threonine-protein kinase 3
13179121,intact:EBI-992580,intact:EBI-992580,uniprotkb:Q13188,uniprotkb:Q13188,uniprotkb:Q13188,uniprotkb:STK3(gene name),Serine/threonine-protein kinase 3


In [269]:
# Rows whose intactid_1 value contains a "|" (the pipe is escaped because
# str.contains treats the pattern as a regex), shown with the other
# interactor-A identifier columns.
has_pipe_in_intactid_1 = merged_expl_mut["intactid_1"].str.contains(r"\|")
interactor_a_id_cols = ["ID(s) interactor A", "Alt. ID(s) interactor A", "all_intact_A_sorted", "intactid_1"]
merged_expl_mut.loc[has_pipe_in_intactid_1, interactor_a_id_cols]

Unnamed: 0,ID(s) interactor A,Alt. ID(s) interactor A,all_intact_A_sorted,intactid_1
8,intact:EBI-1001438,uniprotkb:O60566,intact:EBI-1001438,intact:EBI-2551047|intact:EBI-1001438
9,intact:EBI-1001438,uniprotkb:O60566,intact:EBI-1001438,intact:EBI-2551047|intact:EBI-1001438
10,intact:EBI-1001438,uniprotkb:O60566,intact:EBI-1001438,intact:EBI-2551047|intact:EBI-1001438
11,intact:EBI-1001438,uniprotkb:O60566,intact:EBI-1001438,intact:EBI-2551047|intact:EBI-1001438
12,intact:EBI-1001438,uniprotkb:O60566,intact:EBI-1001438,intact:EBI-2551047|intact:EBI-1001438
...,...,...,...,...
13179068,intact:EBI-983809,uniprotkb:Q9H334,intact:EBI-983809,intact:EBI-28984754|intact:EBI-983809
13179069,intact:EBI-983809,uniprotkb:Q9H334,intact:EBI-983809,intact:EBI-28984754|intact:EBI-983809
13179070,intact:EBI-983809,uniprotkb:Q9H334,intact:EBI-983809,intact:EBI-28984754|intact:EBI-983809
13179071,intact:EBI-983809,uniprotkb:Q9H334,intact:EBI-983809,intact:EBI-28984754|intact:EBI-983809


In [270]:
def split_top_level_pipe(s: str):
    """
    Split `s` at its single top-level '|' (a pipe that is not inside any parentheses).

    Returns a tuple (text_before_pipe, text_after_pipe).
    Returns None when the parentheses are unbalanced, when there is no
    top-level '|', or when there is more than one.
    """
    nesting = 0
    cut = None
    for position, char in enumerate(s):
        if char == "(":
            nesting += 1
            continue
        if char == ")":
            nesting -= 1
            # a ')' with no matching '(' before it: unbalanced
            if nesting < 0:
                return None
            continue
        if char != "|" or nesting != 0:
            continue
        # top-level pipe: only the first one is allowed
        if cut is not None:
            return None
        cut = position

    # an unclosed '(' or a missing top-level pipe both mean "cannot split"
    if nesting != 0:
        return None
    if cut is None:
        return None

    return s[:cut], s[cut + 1:]

In [271]:
# Take the first distinct "Mutation Interaction participants" string recorded for
# one specific interactor pair and try the top-level pipe splitter on it.
s = merged_expl_mut.loc[
    merged_expl_mut["unique_id"]=="intact:EBI-25492095_intact:EBI-25492388"
]["Mutation Interaction participants"].unique().tolist()[0]

split_top_level_pipe(s)

('(uniprotkb:P0DTD1(psi-mi:"MI:0326"(protein)), taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-CoV-2))',
 '(uniprotkb:P0DTD1-PRO_0000449621(psi-mi:"MI:0326"(protein)), taxid:2697049(SARS-CoV-2)|taxid:2697049(SARS-CoV-2))')

In [272]:
# Row-wise: store the result of feature_affected_protein_matches_id (defined outside
# this section) as the "Mutation Interactor Matches" label.
# NOTE(review): the value counts printed afterwards show "A", "B", "A,B" and "" --
# presumably "" means no interactor matched; confirm against the helper.
merged_expl_mut["Mutation Interactor Matches"] = merged_expl_mut.apply(lambda row: feature_affected_protein_matches_id(row), axis=1)
merged_expl_mut.head()

Unnamed: 0,ID(s) interactor A,ID(s) interactor B,Alt. ID(s) interactor A,Alt. ID(s) interactor B,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,...,Mutation new_binds_bo_feature_type,Mutation og_binds_bo_annotation,Mutation new_binds_bo_annotation,Mutation new_binds_to_uniprot_bo_annotation,Mutation new_nobind_to_uniprot_bo_annotation,Mutation new_binds_to_gname_bo_annotation,Mutation new_nobind_to_gname_bo_annotation,Mutation og_binds_bo_ac,Mutation new_binds_bo_ac,Mutation Interactor Matches
0,intact:EBI-475981,intact:EBI-1000553,uniprotkb:P08069,uniprotkb:P29353-2,psi-mi:igf1r_human|psi-mi:IGF1R|uniprotkb:Q14C...,psi-mi:p29353-2|psi-mi:SHC1|uniprotkb:SHC1|uni...,"psi-mi:""MI:0047""(far western blotting)|psi-mi:...",Craparo et al. (1995),pubmed:7541045|imex:IM-19397,taxid:9606(human),...,no,,,,,,,,,A
1,intact:EBI-475981,intact:EBI-1000553,uniprotkb:P08069,uniprotkb:P29353-2,psi-mi:igf1r_human|psi-mi:IGF1R|uniprotkb:Q14C...,psi-mi:p29353-2|psi-mi:SHC1|uniprotkb:SHC1|uni...,"psi-mi:""MI:0047""(far western blotting)|psi-mi:...",Craparo et al. (1995),pubmed:7541045|imex:IM-19397,taxid:9606(human),...,no,,,,,,,,,A
2,intact:EBI-475981,intact:EBI-1000553,uniprotkb:P08069,uniprotkb:P29353-2,psi-mi:igf1r_human|psi-mi:IGF1R|uniprotkb:Q14C...,psi-mi:p29353-2|psi-mi:SHC1|uniprotkb:SHC1|uni...,"psi-mi:""MI:0047""(far western blotting)|psi-mi:...",Craparo et al. (1995),pubmed:7541045|imex:IM-19397,taxid:9606(human),...,no,,,,,,,,,A
3,intact:EBI-475981,intact:EBI-1000553,uniprotkb:P08069,uniprotkb:P29353-2,psi-mi:igf1r_human|psi-mi:IGF1R|uniprotkb:Q14C...,psi-mi:p29353-2|psi-mi:SHC1|uniprotkb:SHC1|uni...,"psi-mi:""MI:0047""(far western blotting)|psi-mi:...",Craparo et al. (1995),pubmed:7541045|imex:IM-19397,taxid:9606(human),...,no,,,,,,,,,A
4,intact:EBI-2511350,intact:EBI-10006231,uniprotkb:Q16513,uniprotkb:O92972-PRO_0000278753,psi-mi:pkn2_human|psi-mi:PKN2|uniprotkb:Q9H1W4...,psi-mi:o92972-pro_0000278753,"psi-mi:""MI:0424""(protein kinase assay)|psi-mi:...",Kim et al. (2004),pubmed:15364941|imex:IM-26019,taxid:9606(human),...,,,,,,,,,,


In [273]:
# How many rows carry each "Mutation Interactor Matches" label
merged_expl_mut["Mutation Interactor Matches"].value_counts()

Mutation Interactor Matches
B      8455875
A      4610696
A,B     109112
          3441
Name: count, dtype: int64

In [274]:
# Same labelling as for merged_expl_mut, applied row-wise to merged_neg_expl_mut
# (feature_affected_protein_matches_id is defined outside this section).
merged_neg_expl_mut["Mutation Interactor Matches"] = merged_neg_expl_mut.apply(lambda row: feature_affected_protein_matches_id(row), axis=1)
merged_neg_expl_mut.head()

Unnamed: 0,ID(s) interactor A,ID(s) interactor B,Alt. ID(s) interactor A,Alt. ID(s) interactor B,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,...,Mutation new_binds_bo_feature_type,Mutation og_binds_bo_annotation,Mutation new_binds_bo_annotation,Mutation new_binds_to_uniprot_bo_annotation,Mutation new_nobind_to_uniprot_bo_annotation,Mutation new_binds_to_gname_bo_annotation,Mutation new_nobind_to_gname_bo_annotation,Mutation og_binds_bo_ac,Mutation new_binds_bo_ac,Mutation Interactor Matches
0,intact:EBI-9247467,intact:EBI-1052826,uniprotkb:Q02410-1|ensembl:ENSP00000265381.3,uniprotkb:P20340|ensembl:ENSP00000336850.3,psi-mi:q02410-1|psi-mi:APBA1|uniprotkb:APBA1|u...,psi-mi:rab6a_human|psi-mi:RAB6A|uniprotkb:Q5U0...,"psi-mi:""MI:0096""(pull down)",Thyrock et al. (2013),pubmed:23737971|imex:IM-21673,taxid:9606(human),...,unknown,,,,,,,,,B
1,intact:EBI-9247467,intact:EBI-1760079,uniprotkb:Q02410-1|ensembl:ENSP00000265381.3,uniprotkb:Q9NRW1|ensembl:ENSP00000285208.4,psi-mi:q02410-1|psi-mi:APBA1|uniprotkb:APBA1|u...,psi-mi:rab6b_human|psi-mi:RAB6B|uniprotkb:D3DN...,"psi-mi:""MI:0055""(fluorescent resonance energy ...",Thyrock et al. (2013),pubmed:23737971|imex:IM-21673,taxid:9606(human),...,unknown,,,,,,,,,B
2,intact:EBI-9247467,intact:EBI-1760079,uniprotkb:Q02410-1|ensembl:ENSP00000265381.3,uniprotkb:Q9NRW1|ensembl:ENSP00000285208.4,psi-mi:q02410-1|psi-mi:APBA1|uniprotkb:APBA1|u...,psi-mi:rab6b_human|psi-mi:RAB6B|uniprotkb:D3DN...,"psi-mi:""MI:0055""(fluorescent resonance energy ...",Thyrock et al. (2013),pubmed:23737971|imex:IM-21673,taxid:9606(human),...,unknown,,,,,,,,,B
3,intact:EBI-9247467,intact:EBI-1760079,uniprotkb:Q02410-1|ensembl:ENSP00000265381.3,uniprotkb:Q9NRW1|ensembl:ENSP00000285208.4,psi-mi:q02410-1|psi-mi:APBA1|uniprotkb:APBA1|u...,psi-mi:rab6b_human|psi-mi:RAB6B|uniprotkb:D3DN...,"psi-mi:""MI:0055""(fluorescent resonance energy ...",Thyrock et al. (2013),pubmed:23737971|imex:IM-21673,taxid:9606(human),...,unknown,,,,,,,,,B
4,intact:EBI-9247467,intact:EBI-1760079,uniprotkb:Q02410-1|ensembl:ENSP00000265381.3,uniprotkb:Q9NRW1|ensembl:ENSP00000285208.4,psi-mi:q02410-1|psi-mi:APBA1|uniprotkb:APBA1|u...,psi-mi:rab6b_human|psi-mi:RAB6B|uniprotkb:D3DN...,"psi-mi:""MI:0055""(fluorescent resonance energy ...",Thyrock et al. (2013),pubmed:23737971|imex:IM-21673,taxid:9606(human),...,unknown,,,,,,,,,B


In [275]:
# How many rows of merged_neg_expl_mut carry each "Mutation Interactor Matches" label
merged_neg_expl_mut["Mutation Interactor Matches"].value_counts()

Mutation Interactor Matches
B      9
A      7
A,B    2
Name: count, dtype: int64

In [276]:
# Look at the different databases the results came from.
# The database name is the text before the first ":" in "Mutation Affected protein AC"
# (missing values are treated as ""). Rows with a non-empty match label are counted as
# successful matches, rows with an empty label as unsuccessful ones.
temp = merged_expl_mut.loc[merged_expl_mut["Mutation Interactor Matches"].apply(lambda x: len(x)>0)]
print("Databases yielding successful matches:")
print(temp["Mutation Affected protein AC"].fillna("").str.split(":",expand=True)[0].value_counts())

temp = merged_expl_mut.loc[merged_expl_mut["Mutation Interactor Matches"].apply(lambda x: len(x)==0)]
print("Databases yielding unsuccessful matches:")
print(temp["Mutation Affected protein AC"].fillna("").str.split(":",expand=True)[0].value_counts())

Databases yielding successful matches:
0
uniprotkb    13174740
intact            773
dip               170
Name: count, dtype: int64
Databases yielding unsuccessful matches:
0
    3441
Name: count, dtype: int64


In [277]:
# Look at the different databases the results came from (merged_neg_expl_mut).
# The database name is the text before the first ":" in "Mutation Affected protein AC"
# (missing values are treated as ""). Rows with a non-empty match label are counted as
# successful matches, rows with an empty label as unsuccessful ones.
temp = merged_neg_expl_mut.loc[merged_neg_expl_mut["Mutation Interactor Matches"].apply(lambda x: len(x)>0)]
print("Databases yielding successful matches:")
print(temp["Mutation Affected protein AC"].fillna("").str.split(":",expand=True)[0].value_counts())

temp = merged_neg_expl_mut.loc[merged_neg_expl_mut["Mutation Interactor Matches"].apply(lambda x: len(x)==0)]
# Only break the unsuccessful rows down by database when there are any; otherwise just report 0
if len(temp)>0:
    print("Databases yielding unsuccessful matches:")
    print(temp["Mutation Affected protein AC"].fillna("").str.split(":",expand=True)[0].value_counts())
else:
    print(f"\nDatabases yielding unsuccessful matches: {len(temp)}")

Databases yielding successful matches:
0
uniprotkb    18
Name: count, dtype: int64

Databases yielding unsuccessful matches: 0


In [278]:
# Summarise the "Mutation Interactor Matches" labels of merged_expl_mut:
# for each label print the row count and its share of all rows.
print(f"Matched mutations with their interactors A and/or B based on IDs.")
# no match (empty label)
test1 = len(merged_expl_mut.loc[
    merged_expl_mut["Mutation Interactor Matches"].apply(lambda x: len(x))==0
])
print(f"\tTotal rows where mutated partner could not be found: {test1}/{len(merged_expl_mut)} ({100*test1/len(merged_expl_mut):.2f}%)")
# A only
test1 = len(merged_expl_mut.loc[
    (merged_expl_mut["Mutation Interactor Matches"].apply(lambda x: x=="A" if len(x)==1 else False))
])
print(f"\tTotal rows where mutated partner is A only: {test1}/{len(merged_expl_mut)} ({100*test1/len(merged_expl_mut):.2f}%)")
# B only
test1 = len(merged_expl_mut.loc[
    (merged_expl_mut["Mutation Interactor Matches"].apply(lambda x: x=="B" if len(x)==1 else False))
])
print(f"\tTotal rows where mutated partner is B only: {test1}/{len(merged_expl_mut)} ({100*test1/len(merged_expl_mut):.2f}%)")
# both A and B (label is exactly "A,B")
test1 = len(merged_expl_mut.loc[
    (merged_expl_mut["Mutation Interactor Matches"].apply(lambda x: x=="A,B" if len(x)>1 else False))
])
print(f"\tTotal rows where mutated partner is A and B: {test1}/{len(merged_expl_mut)} ({100*test1/len(merged_expl_mut):.2f}%)")


Matched mutations with their interactors A and/or B based on IDs.
	Total rows where mutated partner could not be found: 3441/13179124 (0.03%)
	Total rows where mutated partner is A only: 4610696/13179124 (34.98%)
	Total rows where mutated partner is B only: 8455875/13179124 (64.16%)
	Total rows where mutated partner is A and B: 109112/13179124 (0.83%)


In [279]:
# Summarise the "Mutation Interactor Matches" labels of merged_neg_expl_mut:
# for each label print the row count and its share of all rows.
print(f"Matched mutations with their interactors A and/or B based on IDs.")
# no match (empty label)
test1 = len(merged_neg_expl_mut.loc[
    merged_neg_expl_mut["Mutation Interactor Matches"].apply(lambda x: len(x))==0
])
print(f"\tTotal rows where mutated partner could not be found: {test1}/{len(merged_neg_expl_mut)} ({100*test1/len(merged_neg_expl_mut):.2f}%)")
# A only
test1 = len(merged_neg_expl_mut.loc[
    (merged_neg_expl_mut["Mutation Interactor Matches"].apply(lambda x: x=="A" if len(x)==1 else False))
])
print(f"\tTotal rows where mutated partner is A only: {test1}/{len(merged_neg_expl_mut)} ({100*test1/len(merged_neg_expl_mut):.2f}%)")
# B only
test1 = len(merged_neg_expl_mut.loc[
    (merged_neg_expl_mut["Mutation Interactor Matches"].apply(lambda x: x=="B" if len(x)==1 else False))
])
print(f"\tTotal rows where mutated partner is B only: {test1}/{len(merged_neg_expl_mut)} ({100*test1/len(merged_neg_expl_mut):.2f}%)")
# both A and B (label is exactly "A,B")
test1 = len(merged_neg_expl_mut.loc[
    (merged_neg_expl_mut["Mutation Interactor Matches"].apply(lambda x: x=="A,B" if len(x)>1 else False))
])
print(f"\tTotal rows where mutated partner is A and B: {test1}/{len(merged_neg_expl_mut)} ({100*test1/len(merged_neg_expl_mut):.2f}%)")


Matched mutations with their interactors A and/or B based on IDs.
	Total rows where mutated partner could not be found: 0/18 (0.00%)
	Total rows where mutated partner is A only: 7/18 (38.89%)
	Total rows where mutated partner is B only: 9/18 (50.00%)
	Total rows where mutated partner is A and B: 2/18 (11.11%)


In [280]:
def check_mut_indices_for_redundancy(s):
    """
    Check that every range endpoint written as "x..y" repeats the same value (x == y).

    `s` is a comma-separated list of "start-end" items, e.g. "1..1-2..2,7-7".
    Returns True when all endpoints are consistent, and also when `s` is None
    or a float (i.e. a missing value). Returns False at the first endpoint
    whose ".."-separated parts differ.
    """
    if s is None or type(s) is float:
        return True

    def _single_valued(endpoint):
        # "5..5" -> {"5"} (consistent); "5..6" -> {"5", "6"} (inconsistent)
        return len(set(endpoint.split(".."))) == 1

    for item in s.split(","):
        pieces = item.split("-")
        start, end = pieces[0], pieces[1]
        if not (_single_valued(start) and _single_valued(end)):
            return False
    return True

def clean_mut_redundant_indices(s):
    """
    Collapse redundant range endpoints, e.g. "1033..1033-1033..1033" -> "1033-1033".

    `s` is a comma-separated list of "start-end" items where each endpoint may be
    written as "x..x". Every item is rewritten as "start-end" with plain integers
    and the items are joined with "," again.

    Returns None when the value cannot be cleaned:
      - `s` is None or a float (missing value),
      - an item has no "-" separator,
      - an endpoint "x..y" has x != y,
      - a collapsed endpoint is not made of digits only (e.g. "?", "n", "c").
    """
    if s is None or isinstance(s, float):
        return None

    fixed = []
    for item in s.split(","):
        pieces = item.split("-")
        if len(pieces) < 2:
            # no "start-end" structure to clean
            return None
        left, right = pieces[0], pieces[1]

        left_parts = left.split("..")
        right_parts = right.split("..")
        # both halves of "x..y" must agree, otherwise the position is ambiguous
        if len(set(left_parts)) != 1 or len(set(right_parts)) != 1:
            return None

        new_item_left = left_parts[0]
        new_item_right = right_parts[0]

        # both endpoints must be numeric; isdigit has to be *called* on each of them
        if not (new_item_left.isdigit() and new_item_right.isdigit()):
            return None

        fixed.append(f"{new_item_left}-{new_item_right}")

    return ",".join(fixed)

In [281]:
# correct ranges: rewrite "x..x-y..y" style entries as "x-y"; values that
# clean_mut_redundant_indices cannot clean become None
merged_expl_mut["Mutation Feature range(s)"] = merged_expl_mut["Mutation Feature range(s)"].apply(lambda x: clean_mut_redundant_indices(x))
# NOTE(review): this preview is not the last expression of the cell, so its result is discarded
merged_expl_mut[["Mutation Feature range(s)"]].head()

# NOTE(review): the consistency check below runs on the column AFTER it was cleaned above.
# Cleaned values are either None or "a-b" items without "..", so the check looks like it
# can only report True at this point; to be meaningful it would have to run on the raw
# column before cleaning -- confirm intent.
merged_expl_mut["Mutation redundant_indices"] = merged_expl_mut["Mutation Feature range(s)"].apply(lambda x: check_mut_indices_for_redundancy(x))
test1 = len(merged_expl_mut.loc[merged_expl_mut["Mutation redundant_indices"]==False])==0
print(f"None of the rows where indices look like 1..1-2..2 have a different index before and after the .. : {test1}")
# the helper column was only needed for the check
merged_expl_mut = merged_expl_mut.drop(columns=["Mutation redundant_indices"])

None of the rows where indices look like 1..1-2..2 have a different index before and after the .. : True


In [282]:
# correct ranges: rewrite "x..x-y..y" style entries as "x-y"; values that
# clean_mut_redundant_indices cannot clean become None
merged_neg_expl_mut["Mutation Feature range(s)"] = merged_neg_expl_mut["Mutation Feature range(s)"].apply(lambda x: clean_mut_redundant_indices(x))
# NOTE(review): this preview is not the last expression of the cell, so its result is discarded
merged_neg_expl_mut[["Mutation Feature range(s)"]].head()

# NOTE(review): the consistency check below runs on the column AFTER it was cleaned above.
# Cleaned values are either None or "a-b" items without "..", so the check looks like it
# can only report True at this point; to be meaningful it would have to run on the raw
# column before cleaning -- confirm intent.
merged_neg_expl_mut["Mutation redundant_indices"] = merged_neg_expl_mut["Mutation Feature range(s)"].apply(lambda x: check_mut_indices_for_redundancy(x))
test1 = len(merged_neg_expl_mut.loc[merged_neg_expl_mut["Mutation redundant_indices"]==False])==0
print(f"None of the rows where indices look like 1..1-2..2 have a different index before and after the .. : {test1}")
# the helper column was only needed for the check
merged_neg_expl_mut = merged_neg_expl_mut.drop(columns=["Mutation redundant_indices"])

None of the rows where indices look like 1..1-2..2 have a different index before and after the .. : True


In [283]:
# make new ranges
def convert_mut_begin_end_into_range(row, partner:int|str = 1):
    """
    Take mutation_begin_1 and mutation_end_1 (or _2) and convert into Mutation Feature range(s) format 
    e.g. mutation_begin_1 = 1033 and mutation_end_1 = 1033 --> Mutation Feature range(s) = "1033-1033"
    """
    partner = str(partner)
    beg_col = f"mutation_begin_{partner}"
    end_col = f"mutation_end_{partner}"
    
    if (row[beg_col] is None) or type(row[beg_col]) in [float,pd._libs.missing.NAType] or (row[end_col] is None) or type(row[end_col])in [float,pd._libs.missing.NAType]:
        return None
    
    # there may be commas. e.g. 91, 94. split on commas and match by index
    try:
        begins = [int(x.strip()) for x in row[beg_col].split(",")]
        ends = [int(x.strip()) for x in row[end_col].split(",")]
    except:
        return None
    
    if len(begins)!=len(ends):
        return None
    ranges = []
    for i in range(len(begins)):
        ranges.append(f"{begins[i]}-{ends[i]}")
    return ",".join(ranges)

In [284]:
# Build "begin-end" range strings from the mutation_begin_*/mutation_end_* columns,
# once for partner 1 and once for partner 2 (None where no range can be built).
merged_expl_mut["mutation_range_1"] = merged_expl_mut.apply(lambda row: convert_mut_begin_end_into_range(row, partner=1), axis=1)
merged_expl_mut["mutation_range_2"] = merged_expl_mut.apply(lambda row: convert_mut_begin_end_into_range(row, partner=2), axis=1)

In [285]:
# Build "begin-end" range strings from the mutation_begin_*/mutation_end_* columns of
# merged_neg_expl_mut, once for partner 1 and once for partner 2 (None where no range can be built).
merged_neg_expl_mut["mutation_range_1"] = merged_neg_expl_mut.apply(lambda row: convert_mut_begin_end_into_range(row, partner=1), axis=1)
merged_neg_expl_mut["mutation_range_2"] = merged_neg_expl_mut.apply(lambda row: convert_mut_begin_end_into_range(row, partner=2), axis=1)

In [286]:
# Inspect rows whose "Mutation Affected protein AC" contains "dip" (missing values are
# treated as ""), comparing the mutation_*_1 / mutation_*_2 columns with the
# "Mutation Feature short label" and "Mutation Feature range(s)" columns.
merged_expl_mut.loc[
    merged_expl_mut["Mutation Affected protein AC"].fillna("").str.contains("dip")
][
    ["mutation_orig_1","mutation_new_1","mutation_range_1","mutation_short_1",
     "mutation_orig_2","mutation_new_2","mutation_range_2","mutation_short_2",
     "Mutation Feature short label","Mutation Feature range(s)"]]

Unnamed: 0,mutation_orig_1,mutation_new_1,mutation_range_1,mutation_short_1,mutation_orig_2,mutation_new_2,mutation_range_2,mutation_short_2,Mutation Feature short label,Mutation Feature range(s)
62782,,,,,"G,G","A,A","7-7,6-6",h3k9:p.[Gly6Ala;Gly7Ala],h3k9:p.Ser4Ala,4-4
62783,,,,,"G,G","A,A","7-7,6-6",h3k9:p.[Gly6Ala;Gly7Ala],h3k9:p.[Gly6Ala;Gly7Ala],"7-7,6-6"
62784,,,,,S,A,4-4,h3k9:p.Ser4Ala,h3k9:p.Ser4Ala,4-4
62785,,,,,S,A,4-4,h3k9:p.Ser4Ala,h3k9:p.[Gly6Ala;Gly7Ala],"7-7,6-6"
62786,,,,,"P,A","A,P","7-7,6-6",h3k27:p.[Ala6Pro;Pro7Ala],h3k27:p.[Ala6Pro;Pro7Ala],"7-7,6-6"
...,...,...,...,...,...,...,...,...,...,...
13100842,,,,,P,S,9-9,ssb-1:p.Pro9Ser,ssb-1:p.Pro9Ser,9-9
13100846,,,,,P,S,9-9,ssb-1:p.Pro9Ser,ssb-1:p.Pro9Ser,9-9
13137758,,,,,I,A,44-44,ile44ala,ile44ala,44-44
13137762,,,,,I,A,44-44,ile44ala,ile44ala,44-44


In [287]:
# Keep a row when "scraped_mut_has_info" is set and EITHER
#   * "agg_mut_has_info" is not set, OR
#   * "agg_mut_has_info" is set and the two sources agree on a matched interactor:
#       - label contains "A": mutation_short_1 / mutation_range_1 equal the
#         "Mutation Feature short label" / "Mutation Feature range(s)" values, or
#       - label contains "B": the same comparison with the *_2 columns.
# The A/B alternatives sit inside their own parentheses on purpose: `&` binds tighter
# than `|`, so without them the B alternative would be OR-ed with everything before it
# and would bypass both *_has_info requirements.
merged_expl_mut_filt = merged_expl_mut.loc[
    ((merged_expl_mut["scraped_mut_has_info"]) & ~(merged_expl_mut["agg_mut_has_info"])) |
    (
        (merged_expl_mut["scraped_mut_has_info"]) &
        (merged_expl_mut["agg_mut_has_info"]) &
        (
            (
                (merged_expl_mut["Mutation Interactor Matches"].str.contains("A")) &
                (merged_expl_mut["mutation_short_1"]==merged_expl_mut["Mutation Feature short label"]) &
                (merged_expl_mut["mutation_range_1"]==merged_expl_mut["Mutation Feature range(s)"])
            ) |
            (
                (merged_expl_mut["Mutation Interactor Matches"].str.contains("B")) &
                (merged_expl_mut["mutation_short_2"]==merged_expl_mut["Mutation Feature short label"]) &
                (merged_expl_mut["mutation_range_2"]==merged_expl_mut["Mutation Feature range(s)"])
            )
        )
    )
]
# row counts before and after filtering
print(len(merged_expl_mut))
print(len(merged_expl_mut_filt))
merged_expl_mut_filt.head()[[
    "interaction_intactid","Mutation Interactor Matches","Mutation Feature short label","mutation_short_1","mutation_short_2"
]]

13179124
93177


Unnamed: 0,interaction_intactid,Mutation Interactor Matches,Mutation Feature short label,mutation_short_1,mutation_short_2
1,EBI-2437595,A,P08069:p.Lys1033Ala,P08069:p.Lys1033Ala,
2,EBI-2437595,A,P08069:p.Tyr980Phe,P08069:p.Tyr980Phe,
4,EBI-9678732,,,,finger_domain
5,EBI-9678671,A,Q16513:p.Leu520Ile,Q16513:p.Leu520Ile,
6,EBI-9998408,A,Q16513:p.Leu520Ile,Q16513:p.Leu520Ile,


In [288]:
# Keep a row when "scraped_mut_has_info" is set and EITHER
#   * "agg_mut_has_info" is not set, OR
#   * "agg_mut_has_info" is set and the two sources agree on a matched interactor:
#       - label contains "A": mutation_short_1 / mutation_range_1 equal the
#         "Mutation Feature short label" / "Mutation Feature range(s)" values, or
#       - label contains "B": the same comparison with the *_2 columns.
# The A/B alternatives sit inside their own parentheses on purpose: `&` binds tighter
# than `|`, so without them the B alternative would be OR-ed with everything before it
# and would bypass both *_has_info requirements.
merged_neg_expl_mut_filt = merged_neg_expl_mut.loc[
    ((merged_neg_expl_mut["scraped_mut_has_info"]) & ~(merged_neg_expl_mut["agg_mut_has_info"])) |
    (
        (merged_neg_expl_mut["scraped_mut_has_info"]) &
        (merged_neg_expl_mut["agg_mut_has_info"]) &
        (
            (
                (merged_neg_expl_mut["Mutation Interactor Matches"].str.contains("A")) &
                (merged_neg_expl_mut["mutation_short_1"]==merged_neg_expl_mut["Mutation Feature short label"]) &
                (merged_neg_expl_mut["mutation_range_1"]==merged_neg_expl_mut["Mutation Feature range(s)"])
            ) |
            (
                (merged_neg_expl_mut["Mutation Interactor Matches"].str.contains("B")) &
                (merged_neg_expl_mut["mutation_short_2"]==merged_neg_expl_mut["Mutation Feature short label"]) &
                (merged_neg_expl_mut["mutation_range_2"]==merged_neg_expl_mut["Mutation Feature range(s)"])
            )
        )
    )
]
# row counts before and after filtering
print(len(merged_neg_expl_mut))
print(len(merged_neg_expl_mut_filt))
merged_neg_expl_mut_filt.head()[[
    "interaction_intactid","Mutation Interactor Matches","Mutation Feature short label","mutation_short_1","mutation_short_2"
]]

18
14


Unnamed: 0,interaction_intactid,Mutation Interactor Matches,Mutation Feature short label,mutation_short_1,mutation_short_2
0,EBI-8840423,B,P20340:p.Gln72Leu,,P20340:p.Gln72Leu
1,EBI-8840419,B,Q9NRW1:p.Gln72Leu,,Q9NRW1:p.Gln72Leu
2,EBI-8840571,B,Q9NRW1:p.Gln72Leu,,Q9NRW1:p.Gln72Leu
3,EBI-8840307,B,Q9NRW1:p.Gln72Leu,,Q9NRW1:p.Gln72Leu
6,EBI-8840307,B,Q9NRW1:p.Thr27Asn,,Q9NRW1:p.Thr27Asn


In [289]:
# What didn't make it into merged_neg_expl_mut_filt?
# Show the filtered frame first, then the unfiltered one, with the same columns,
# so the dropped rows can be spotted by comparing the two tables.
display(merged_neg_expl_mut_filt[[
    "interaction_intactid","Mutation Interactor Matches","Mutation Feature short label","mutation_short_1","mutation_short_2"
]])
display(merged_neg_expl_mut[[
    "interaction_intactid","Mutation Interactor Matches","Mutation Feature short label","mutation_short_1","mutation_short_2"
]])

Unnamed: 0,interaction_intactid,Mutation Interactor Matches,Mutation Feature short label,mutation_short_1,mutation_short_2
0,EBI-8840423,B,P20340:p.Gln72Leu,,P20340:p.Gln72Leu
1,EBI-8840419,B,Q9NRW1:p.Gln72Leu,,Q9NRW1:p.Gln72Leu
2,EBI-8840571,B,Q9NRW1:p.Gln72Leu,,Q9NRW1:p.Gln72Leu
3,EBI-8840307,B,Q9NRW1:p.Gln72Leu,,Q9NRW1:p.Gln72Leu
6,EBI-8840307,B,Q9NRW1:p.Thr27Asn,,Q9NRW1:p.Thr27Asn
7,EBI-8596072,A,Q61686:p.[Ile165Glu;Tyr168Glu],Q61686:p.[Ile165Glu;Tyr168Glu],
8,EBI-8596042,"A,B",Q61686:p.[Ile165Glu;Tyr168Glu],Q61686:p.[Ile165Glu;Tyr168Glu],
9,EBI-8596042,"A,B",Q61686:p.[Ile165Glu;Tyr168Glu],,Q61686:p.[Ile165Glu;Tyr168Glu]
10,EBI-8551826,B,Q13526:p.Trp34Ala,,Q13526:p.Trp34Ala
11,EBI-492194,B,P62834:p.Gly12Val,,P62834:p.Gly12Val


Unnamed: 0,interaction_intactid,Mutation Interactor Matches,Mutation Feature short label,mutation_short_1,mutation_short_2
0,EBI-8840423,B,P20340:p.Gln72Leu,,P20340:p.Gln72Leu
1,EBI-8840419,B,Q9NRW1:p.Gln72Leu,,Q9NRW1:p.Gln72Leu
2,EBI-8840571,B,Q9NRW1:p.Gln72Leu,,Q9NRW1:p.Gln72Leu
3,EBI-8840307,B,Q9NRW1:p.Gln72Leu,,Q9NRW1:p.Gln72Leu
4,EBI-8840307,B,Q9NRW1:p.Thr27Asn,,Q9NRW1:p.Gln72Leu
5,EBI-8840307,B,Q9NRW1:p.Gln72Leu,,Q9NRW1:p.Thr27Asn
6,EBI-8840307,B,Q9NRW1:p.Thr27Asn,,Q9NRW1:p.Thr27Asn
7,EBI-8596072,A,Q61686:p.[Ile165Glu;Tyr168Glu],Q61686:p.[Ile165Glu;Tyr168Glu],
8,EBI-8596042,"A,B",Q61686:p.[Ile165Glu;Tyr168Glu],Q61686:p.[Ile165Glu;Tyr168Glu],
9,EBI-8596042,"A,B",Q61686:p.[Ile165Glu;Tyr168Glu],,Q61686:p.[Ile165Glu;Tyr168Glu]


In [290]:
# Inspect the filtered rows whose "Mutation Affected protein AC" contains "dip"
# (missing values are treated as ""), showing the partner-2 mutation columns next to
# the "Mutation Feature short label" and "Mutation Feature range(s)" columns.
merged_expl_mut_filt.loc[
    merged_expl_mut_filt["Mutation Affected protein AC"].fillna("").str.contains("dip")
][
    ["interaction_intactid","mol_type_2",
     "mutation_orig_2","mutation_new_2","mutation_range_2","mutation_short_2",
     "Mutation Feature short label","Mutation Feature range(s)"]]

Unnamed: 0,interaction_intactid,mol_type_2,mutation_orig_2,mutation_new_2,mutation_range_2,mutation_short_2,Mutation Feature short label,Mutation Feature range(s)
62783,EBI-15643757,peptide,"G,G","A,A","7-7,6-6",h3k9:p.[Gly6Ala;Gly7Ala],h3k9:p.[Gly6Ala;Gly7Ala],"7-7,6-6"
62784,EBI-15643757,peptide,S,A,4-4,h3k9:p.Ser4Ala,h3k9:p.Ser4Ala,4-4
62786,EBI-15643684,peptide,"P,A","A,P","7-7,6-6",h3k27:p.[Ala6Pro;Pro7Ala],h3k27:p.[Ala6Pro;Pro7Ala],"7-7,6-6"
62792,EBI-16026675,peptide,R,A,8-8,h3k9-1:p.Arg8Ala,h3k9-1:p.Arg8Ala,8-8
62796,EBI-16026675,peptide,S,A,10-10,h3k9-1:p.Ser10Ala,h3k9-1:p.Ser10Ala,10-10
...,...,...,...,...,...,...,...,...
13100842,EBI-15712296,peptide,P,S,9-9,ssb-1:p.Pro9Ser,ssb-1:p.Pro9Ser,9-9
13100846,EBI-15712296,peptide,P,S,9-9,ssb-1:p.Pro9Ser,ssb-1:p.Pro9Ser,9-9
13137758,EBI-15607885,protein,I,A,44-44,ile44ala,ile44ala,44-44
13137762,EBI-15607885,protein,I,A,44-44,ile44ala,ile44ala,44-44


In [291]:
# Number of distinct interactions and of distinct sequence pairs left after filtering
print(merged_expl_mut_filt["interaction_intactid"].nunique())
print(merged_expl_mut_filt["seq_pair_id"].nunique())
# seq_pair_id values that occur on more than one row
dup_seq_pair_ids = merged_expl_mut_filt.loc[merged_expl_mut_filt["seq_pair_id"].duplicated()]["seq_pair_id"].unique().tolist()
# Show all rows of those repeated sequence pairs, ordered by pair and interaction.
# scraped_mut_cols / agg_mut_cols are defined outside this section (presumably lists of column names).
merged_expl_mut_filt.loc[merged_expl_mut_filt["seq_pair_id"].isin(dup_seq_pair_ids)].sort_values(
    by=["seq_pair_id","interaction_intactid"])[
        ["seq_pair_id","interaction_intactid","Mutation Interactor Matches","Mutation Feature short label","mutation_short_1","mutation_short_2"] + scraped_mut_cols + agg_mut_cols].reset_index(drop=True)


36789
19134


Unnamed: 0,seq_pair_id,interaction_intactid,Mutation Interactor Matches,Mutation Feature short label,mutation_short_1,mutation_short_2,mutation_mi_1,mutation_name_1,mutation_short_1.1,mutation_begin_1,...,Mutation Feature type,Mutation Feature annotation(s),Mutation Affected protein AC,Mutation Affected protein symbol,Mutation Affected protein full name,Mutation Affected protein organism,Mutation Interaction participants,Mutation PubMedID,Mutation Figure legend(s),Mutation Xref ID(s)
0,seqpair100007,EBI-6924268,B,Q9V637:p.Leu210Ala,,Q9V637:p.Leu210Ala,,,,,...,"psi-mi:""MI:1128""(mutation disrupting interacti...",-,uniprotkb:Q9V637,uniprotkb:PI31(gene name),Proteasome inhibitor PI31 subunit,"taxid:7227(drome)|taxid:7227(""Drosophila melan...","(uniprotkb:Q9VBP3(psi-mi:""MI:0326""(protein)), ...",pubmed:23622245|imex:IM-21136,figure legend:Fig. 1C-G,-
1,seqpair100007,EBI-6924268,B,Q9V637:p.[Arg49Ala;Gly54Ala],,Q9V637:p.[Arg49Ala;Gly54Ala],,,,,...,"psi-mi:""MI:1128""(mutation disrupting interacti...",-,uniprotkb:Q9V637,uniprotkb:PI31(gene name),Proteasome inhibitor PI31 subunit,"taxid:7227(drome)|taxid:7227(""Drosophila melan...","(uniprotkb:Q9VBP3(psi-mi:""MI:0326""(protein)), ...",pubmed:23622245|imex:IM-21136,figure legend:Fig. 1C-G,-
2,seqpair100007,EBI-6924268,B,Q9V637:p.Phe241Ala,,Q9V637:p.Phe241Ala,,,,,...,"psi-mi:""MI:1128""(mutation disrupting interacti...",-,uniprotkb:Q9V637,uniprotkb:PI31(gene name),Proteasome inhibitor PI31 subunit,"taxid:7227(drome)|taxid:7227(""Drosophila melan...","(uniprotkb:Q9VBP3(psi-mi:""MI:0326""(protein)), ...",pubmed:23622245|imex:IM-21136,figure legend:Fig. 1C-G,-
3,seqpair100045,EBI-22137333,A,Q8N8A2:p.Arg80Trp,Q8N8A2:p.Arg80Trp,,MI:1128,mutation disrupting interaction strength,Q8N8A2:p.Arg80Trp,80,...,"psi-mi:""MI:1128""(mutation disrupting interacti...",-,uniprotkb:Q8N8A2,uniprotkb:ANKRD44(gene name),Serine/threonine-protein phosphatase 6 regulat...,taxid:9606(human)|taxid:9606(Homo sapiens),"(uniprotkb:O00308(psi-mi:""MI:0326""(protein)), ...",pubmed:31515488|imex:IM-27438,"figure legend:Data S2, Data S3, Data S4",dbsnp:rs368673568(identity)
4,seqpair100045,EBI-22146465,B,O00308:p.Arg315Cys,,O00308:p.Arg315Cys,,,,,...,"psi-mi:""MI:2226""(mutation with no effect)",-,uniprotkb:O00308,uniprotkb:WWP2(gene name),NEDD4-like E3 ubiquitin-protein ligase WWP2,taxid:9606(human)|taxid:9606(Homo sapiens),"(uniprotkb:O00308(psi-mi:""MI:0326""(protein)), ...",pubmed:31515488|imex:IM-27438,"figure legend:Data S2, Data S3, Data S4",dbsnp:rs377619443(identity)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84664,seqpair9992,EBI-12710366,B,P04075-2:p.Glu261Lys,,P04075-2:p.Glu261Lys,,,,,...,"psi-mi:""MI:1128""(mutation disrupting interacti...","comment:""Allele_ID: 226_709""",uniprotkb:P04075-2,uniprotkb:ALDOA(gene name),Fructose-bisphosphate aldolase A,taxid:9606(human)|taxid:9606(Homo sapiens),"(uniprotkb:P04075-2(psi-mi:""MI:0326""(protein))...",pubmed:25910212|imex:IM-25511,figure legend:Table S3A,-
84665,seqpair9992,EBI-12710366,B,P04075-2:p.Cys393Tyr,,P04075-2:p.Cys393Tyr,,,,,...,"psi-mi:""MI:2226""(mutation with no effect)","comment:""Allele_ID: 226_711""",uniprotkb:P04075-2,uniprotkb:ALDOA(gene name),Fructose-bisphosphate aldolase A,taxid:9606(human)|taxid:9606(Homo sapiens),"(uniprotkb:P04075-2(psi-mi:""MI:0326""(protein))...",pubmed:25910212|imex:IM-25511,figure legend:Table S3A,-
84666,seqpair9992,EBI-12724626,B,P04075-2:p.Asp183Gly,,P04075-2:p.Asp183Gly,,,,,...,"psi-mi:""MI:2226""(mutation with no effect)","comment:""Allele_ID: 226_708""",uniprotkb:P04075-2,uniprotkb:ALDOA(gene name),Fructose-bisphosphate aldolase A,taxid:9606(human)|taxid:9606(Homo sapiens),"(uniprotkb:P04075-2(psi-mi:""MI:0326""(protein))...",pubmed:25910212|imex:IM-25511,figure legend:Table S3A,-
84667,seqpair9992,EBI-12724626,B,P04075-2:p.Glu261Lys,,P04075-2:p.Glu261Lys,,,,,...,"psi-mi:""MI:1128""(mutation disrupting interacti...","comment:""Allele_ID: 226_709""",uniprotkb:P04075-2,uniprotkb:ALDOA(gene name),Fructose-bisphosphate aldolase A,taxid:9606(human)|taxid:9606(Homo sapiens),"(uniprotkb:P04075-2(psi-mi:""MI:0326""(protein))...",pubmed:25910212|imex:IM-25511,figure legend:Table S3A,-


In [292]:
from typing import List, Sequence, Union, Tuple

T = Union[str, List]

def apply_replacements(
    seq: T,
    ranges: List[Tuple[int, int]],        # inclusive [start, end] in ORIGINAL indexing
    repls: List[Sequence],                # replacement sequences (e.g., ["Y"] or list of tokens)
) -> T:
    """
    Replace segments of `seq` at the given inclusive ranges with `repls`.

    Works for strings (char-level) and lists (token-level). The input is never
    mutated; a new string/list is returned.

    Parameters
    ----------
    seq : str or list
        Sequence to edit.
    ranges : list of (start, end)
        0-based, inclusive index pairs expressed in the coordinates of the
        ORIGINAL `seq`. Any integral type is accepted (Python `int`, NumPy
        integer scalars, ...).
    repls : list of sequences
        One replacement per range. Each replacement is expanded with `list(...)`,
        so a string replacement contributes one element per character.

    Returns
    -------
    str or list
        A string when `seq` is a string, otherwise a list.

    Raises
    ------
    ValueError
        If `ranges` and `repls` differ in length, a range has start > end, or
        two ranges share an index.
    TypeError
        If a range endpoint is not an integer.
    IndexError
        If a range endpoint falls outside `seq`.
    """
    import numbers  # local import: only needed for the endpoint type check

    if len(ranges) != len(repls):
        raise ValueError("`ranges` and `repls` must have the same length.")

    # Normalize to list for easy splice; remember if input was string
    is_string = isinstance(seq, str)
    seq_list = list(seq)
    n = len(seq_list)

    # Validate every range and pair it with its replacement
    normalized = []
    for (s, e), r in zip(ranges, repls):
        # numbers.Integral also covers NumPy integer scalars, which is what
        # positions pulled out of a DataFrame usually are.
        if not (isinstance(s, numbers.Integral) and isinstance(e, numbers.Integral)):
            raise TypeError("Range endpoints must be integers.")
        s, e = int(s), int(e)
        if s < 0 or e < 0 or s >= n or e >= n:
            raise IndexError(f"Range [{s}, {e}] is out of bounds for length {n}.")
        if s > e:
            raise ValueError(f"Range start > end: [{s}, {e}].")
        normalized.append((s, e, list(r)))

    # Sort by start, then end; check for overlap in ORIGINAL coordinates
    normalized.sort(key=lambda x: (x[0], x[1]))
    for (prev_s, prev_e, _), (s, e, _) in zip(normalized, normalized[1:]):
        if s <= prev_e:  # overlap if any shared index
            raise ValueError(f"Overlapping ranges detected: [{prev_s},{prev_e}] and [{s},{e}].")

    # Apply from right to left so earlier (left-hand) indices stay valid even
    # when a replacement changes the length of the sequence.
    for s, e, rep in reversed(normalized):
        seq_list[s:e + 1] = rep

    return "".join(seq_list) if is_string else seq_list


In [293]:
# Short labels for which a missing mutation range is tolerated: rows whose
# mutation_short_* value is in this list are not flagged as invalid below.
acceptable_annotations_for_missing_range = [
    "PB1_domain",
    "Spry domain",
    "c-terminal",
    "c-terminal region",
    "c-terminus",
    "c_terminal_pest_domain",
    "c_terminal_region",
    "ecd_region",
    "gk c-terminus",
    "n-terminal region",
    "region"
]

cols = [
    "mutation_begin_1","mutation_end_1","mutation_range_1","mutation_short_1",
    "mutation_begin_2","mutation_end_2","mutation_range_2","mutation_short_2",
]

# The frame was produced by slicing another DataFrame, so assigning columns on
# it raised SettingWithCopyWarning. Take an explicit copy before writing.
merged_expl_mut_filt = merged_expl_mut_filt.copy()

# Map the various "empty" spellings onto pd.NA so isna()/notna() see them.
merged_expl_mut_filt[cols] = (
    merged_expl_mut_filt[cols]
    .replace({"": pd.NA, None: pd.NA, "None": pd.NA, "<NA>": pd.NA, "nan": pd.NA})
)

acc = set(acceptable_annotations_for_missing_range)

# Per interactor: a begin position exists, no range was recorded, and the short
# label is not one of the tolerated annotations.
m2 = (merged_expl_mut_filt["mutation_begin_2"].notna()
      & merged_expl_mut_filt["mutation_range_2"].isna()
      & ~merged_expl_mut_filt["mutation_short_2"].isin(acc))

m1 = (merged_expl_mut_filt["mutation_begin_1"].notna()
      & merged_expl_mut_filt["mutation_range_1"].isna()
      & ~merged_expl_mut_filt["mutation_short_1"].isin(acc))

# A row is invalid if either interactor trips the check; report, then drop.
merged_expl_mut_filt["invalid"] = m1 | m2
print(f"Total invalid rows (mutation begin but no range, and the feature is mutation-related) = {merged_expl_mut_filt['invalid'].sum()}")
merged_expl_mut_filt = merged_expl_mut_filt.loc[~merged_expl_mut_filt["invalid"]].reset_index(drop=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_expl_mut_filt[cols] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_expl_mut_filt["invalid"] = m1 | m2


Total invalid rows (mutation begin but no range, and the feature is mutation-related) = 0


In [294]:
# Short labels for which a missing mutation range is tolerated: rows whose
# mutation_short_* value is in this list are not flagged as invalid below.
acceptable_annotations_for_missing_range = [
    "PB1_domain",
    "Spry domain",
    "c-terminal",
    "c-terminal region",
    "c-terminus",
    "c_terminal_pest_domain",
    "c_terminal_region",
    "ecd_region",
    "gk c-terminus",
    "n-terminal region",
    "region"
]

cols = [
    "mutation_begin_1","mutation_end_1","mutation_range_1","mutation_short_1",
    "mutation_begin_2","mutation_end_2","mutation_range_2","mutation_short_2",
]

# The frame was produced by slicing another DataFrame, so assigning columns on
# it raised SettingWithCopyWarning. Take an explicit copy before writing.
merged_neg_expl_mut_filt = merged_neg_expl_mut_filt.copy()

# Map the various "empty" spellings onto pd.NA so isna()/notna() see them.
merged_neg_expl_mut_filt[cols] = (
    merged_neg_expl_mut_filt[cols]
    .replace({"": pd.NA, None: pd.NA, "None": pd.NA, "<NA>": pd.NA, "nan": pd.NA})
)

acc = set(acceptable_annotations_for_missing_range)

# Per interactor: a begin position exists, no range was recorded, and the short
# label is not one of the tolerated annotations.
m2 = (merged_neg_expl_mut_filt["mutation_begin_2"].notna()
      & merged_neg_expl_mut_filt["mutation_range_2"].isna()
      & ~merged_neg_expl_mut_filt["mutation_short_2"].isin(acc))

m1 = (merged_neg_expl_mut_filt["mutation_begin_1"].notna()
      & merged_neg_expl_mut_filt["mutation_range_1"].isna()
      & ~merged_neg_expl_mut_filt["mutation_short_1"].isin(acc))

# A row is invalid if either interactor trips the check; report, then drop.
merged_neg_expl_mut_filt["invalid"] = m1 | m2
print(f"Total invalid rows (mutation begin but no range, and the feature is mutation-related) = {merged_neg_expl_mut_filt['invalid'].sum()}")
merged_neg_expl_mut_filt = merged_neg_expl_mut_filt.loc[~merged_neg_expl_mut_filt["invalid"]].reset_index(drop=True)

Total invalid rows (mutation begin but no range, and the feature is mutation-related) = 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_neg_expl_mut_filt[cols] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_neg_expl_mut_filt["invalid"] = m1 | m2


In [295]:
# Find rows where the curated match names one interactor but a scraped range
# exists only for the other one; report how many there are, then drop them.
_match_side = merged_expl_mut_filt["Mutation Interactor Matches"]
_has_rng_1 = merged_expl_mut_filt["mutation_range_1"].notna()
_has_rng_2 = merged_expl_mut_filt["mutation_range_2"].notna()

_b_but_only_a = (_match_side == "B") & _has_rng_1 & ~_has_rng_2
_a_but_only_b = (_match_side == "A") & _has_rng_2 & ~_has_rng_1

test1 = len(merged_expl_mut_filt.loc[_b_but_only_a])
print(f"There are {test1} rows where curated-mutation data matches interactor B, but we only pulled data for interactor A.")
test1 = len(merged_expl_mut_filt.loc[_a_but_only_b])
print(f"There are {test1} rows where curated-mutation data matches interactor A, but we only pulled data for interactor B.")

# Both conditions are row-wise, so one combined filter keeps the same rows (and
# the same index) as filtering twice in sequence.
merged_expl_mut_filt = merged_expl_mut_filt.loc[~(_b_but_only_a | _a_but_only_b)]
print(f"Dropped these rows. Remaining: {len(merged_expl_mut_filt)}")

There are 0 rows where curated-mutation data matches interactor B, but we only pulled data for interactor A.
There are 0 rows where curated-mutation data matches interactor A, but we only pulled data for interactor B.
Dropped these rows. Remaining: 93177


In [296]:
# Find rows where the curated match names one interactor but a scraped range
# exists only for the other one; report how many there are, then drop them.
_match_side = merged_neg_expl_mut_filt["Mutation Interactor Matches"]
_has_rng_1 = merged_neg_expl_mut_filt["mutation_range_1"].notna()
_has_rng_2 = merged_neg_expl_mut_filt["mutation_range_2"].notna()

_b_but_only_a = (_match_side == "B") & _has_rng_1 & ~_has_rng_2
_a_but_only_b = (_match_side == "A") & _has_rng_2 & ~_has_rng_1

test1 = len(merged_neg_expl_mut_filt.loc[_b_but_only_a])
print(f"There are {test1} rows where curated-mutation data matches interactor B, but we only pulled data for interactor A.")
test1 = len(merged_neg_expl_mut_filt.loc[_a_but_only_b])
print(f"There are {test1} rows where curated-mutation data matches interactor A, but we only pulled data for interactor B.")

# Both conditions are row-wise, so one combined filter keeps the same rows (and
# the same index) as filtering twice in sequence.
merged_neg_expl_mut_filt = merged_neg_expl_mut_filt.loc[~(_b_but_only_a | _a_but_only_b)]
print(f"Dropped these rows. Remaining: {len(merged_neg_expl_mut_filt)}")

There are 0 rows where curated-mutation data matches interactor B, but we only pulled data for interactor A.
There are 0 rows where curated-mutation data matches interactor A, but we only pulled data for interactor B.
Dropped these rows. Remaining: 14


In [297]:
import numpy as np
import pandas as pd

# Run both frames through harmonize_nulls_to_nan (defined elsewhere in the notebook).
merged_expl_mut_filt, merged_neg_expl_mut_filt = (
    harmonize_nulls_to_nan(frame)
    for frame in (merged_expl_mut_filt, merged_neg_expl_mut_filt)
)

  out = out.replace({"": pd.NA, "None": pd.NA, "nan": pd.NA})
  out = out.replace({"": pd.NA, "None": pd.NA, "nan": pd.NA})


In [298]:
# clean up the mutation columns a bit to allow easier filtering
# (1) if mutation_begin_i, mutation_end_i, mutation_orig_i and mutation_new_i are all NA, then every scraped column for that mutation for that interactor should be NA
# (2) if mutation feature is MI:0429, then every scraped column for that mutation for that interactor should be na
import pandas as pd
import numpy as np
import re

# Scraped mutation columns, one block per interactor (suffix _1 / _2).
MUT_COLS = [
    'mutation_mi_1','mutation_name_1','mutation_short_1','mutation_begin_1',
    'mutation_end_1','mutation_range_1','mutation_orig_1','mutation_new_1',
    'mutation_mi_2','mutation_name_2','mutation_short_2','mutation_begin_2',
    'mutation_end_2','mutation_range_2','mutation_orig_2','mutation_new_2',
]

def null_mutation_blocks(df: pd.DataFrame) -> pd.DataFrame:
    """
    Blank out an interactor's whole scraped-mutation block when it carries no usable mutation.

    For each interactor i in (1, 2), every `*_i` column in MUT_COLS is set to NaN on rows where
      - mutation_begin_i, mutation_end_i, mutation_orig_i and mutation_new_i are all missing, or
      - mutation_mi_i contains the term MI:0429 (case-insensitive; alone or as one entry of a
        ';' / ',' separated list).

    The strings "", "None" and "nan" in text-like MUT_COLS columns are treated as missing.
    Returns a modified copy; `df` itself is left untouched.
    """
    out = df.copy()

    # normalize common sentinels on text-like cols only
    text_cols = out.select_dtypes(include=["object","string"]).columns.intersection(MUT_COLS)
    out[text_cols] = out[text_cols].replace({"": pd.NA, "None": pd.NA, "nan": pd.NA}, regex=False)

    for i in (1, 2):
        core  = [f"mutation_begin_{i}", f"mutation_end_{i}", f"mutation_orig_{i}", f"mutation_new_{i}"]
        block = [c for c in MUT_COLS if c.endswith(f"_{i}")]

        # rows where ALL 4 core fields are missing
        core_mask = out[core].isna().all(axis=1)

        # MI:0429 as a whole list entry. Non-capturing groups (?:...) are used on
        # purpose: with capturing groups pandas warns that the pattern "has match
        # groups" on every call, even though only a boolean match is wanted.
        mi = out[f"mutation_mi_{i}"].astype("string")
        mi_mask = mi.str.strip().str.contains(r'(?:^|[;,]\s*)MI:0429(?:[;,]\s*|$)', flags=re.I, na=False)

        # null out the whole block
        out.loc[core_mask | mi_mask, block] = np.nan

    return out

def debug_core(df, i, idx_here):
    """
    Print a diagnostic dump of the four core mutation fields of one row.

    `i` selects the interactor suffix (columns mutation_begin_i, mutation_end_i,
    mutation_orig_i, mutation_new_i) and `idx_here` is the label handed to
    `df.loc`. Prints the values, their Python type names, the per-field isna()
    flags, the number of non-null fields and finally whether all are missing.
    Returns nothing.
    """
    core_fields = [f"mutation_{part}_{i}" for part in ("begin", "end", "orig", "new")]
    picked = df.loc[idx_here, core_fields]

    print("Values:")
    print(picked.to_dict())

    print("Types:")
    type_names = picked.map(lambda value: type(value).__name__)
    print(type_names.to_dict())

    print("isna():")
    print(picked.isna().to_dict())

    print("non-null count:", picked.notna().sum())

    # True only when every core field is missing
    all_missing = picked.isna().all()
    print(all_missing)

In [299]:
import pandas as pd
import numpy as np
import re

# Scraped mutation columns, one block per interactor (suffix _1 / _2).
MUT_COLS = [
    'mutation_mi_1','mutation_name_1','mutation_short_1','mutation_begin_1',
    'mutation_end_1','mutation_range_1','mutation_orig_1','mutation_new_1',
    'mutation_mi_2','mutation_name_2','mutation_short_2','mutation_begin_2',
    'mutation_end_2','mutation_range_2','mutation_orig_2','mutation_new_2',
]

def fill_mutations_when_AB(df: pd.DataFrame, cols,
                           match_col: str = "Mutation Interactor Matches") -> pd.DataFrame:
    """
    Mirror one interactor's mutation block onto the other when the row matches both.

    If match_col == 'A,B' (any whitespace, any case; accepts 'A,B' or 'B,A') and
      - side 1 has any mutation_*_1 info but side 2 has none -> copy 1 -> 2
      - side 2 has any mutation_*_2 info but side 1 has none -> copy 2 -> 1

    `cols` is any iterable of column names; those ending in "_1" / "_2" form the
    two blocks, and only columns whose "_2" counterpart exists in `df` are copied.
    The strings "", "None" and "nan" in text-like columns count as missing.
    Returns a modified copy; `df` itself is left untouched.
    """
    out = df.copy()
    cols = list(cols)  # accept tuples / Index objects, not just lists

    # Normalize obvious sentinel strings on text-like cols so NA detection works
    text_cols = out.select_dtypes(include=["object","string"]).columns.intersection(cols + [match_col])
    out[text_cols] = out[text_cols].replace({"": pd.NA, "None": pd.NA, "nan": pd.NA}, regex=False)

    # Build paired column lists
    block1 = [c for c in cols if c.endswith("_1")]
    block2 = [c for c in cols if c.endswith("_2")]

    # Pair each side-1 column with its side-2 twin, e.g.
    # ('mutation_begin_1', 'mutation_begin_2'); skip twins missing from the frame
    pairs = [
        (c1, f"{c1[:-2]}_2")
        for c1 in block1
        if f"{c1[:-2]}_2" in out.columns
    ]

    # Rows where interactor matches are A,B (either order, ignore spaces).
    # The group is non-capturing so pandas does not emit its
    # "pattern has match groups" UserWarning.
    ab_mask = (
        out[match_col]
        .astype("string")
        .str.strip()
        .str.contains(r'^(?:A\s*,\s*B|B\s*,\s*A)$', flags=re.I, na=False)
    )

    # "Has info" = any non-null across the block
    has1 = out[block1].notna().any(axis=1)
    has2 = out[block2].notna().any(axis=1)

    # Exactly-one-side-only masks, as positional boolean arrays. Selecting by
    # position (rather than by index label) keeps the copy confined to the
    # flagged rows even if the index contains duplicate labels.
    rows_12 = (ab_mask & has1 & ~has2).to_numpy(dtype=bool)
    rows_21 = (ab_mask & has2 & ~has1).to_numpy(dtype=bool)

    # Copy 1 -> 2
    if rows_12.any():
        for c1, c2 in pairs:
            out.loc[rows_12, c2] = out.loc[rows_12, c1].to_numpy()

    # Copy 2 -> 1
    if rows_21.any():
        for c1, c2 in pairs:
            out.loc[rows_21, c1] = out.loc[rows_21, c2].to_numpy()

    return out

In [300]:
# First mirror one-sided mutation info on 'A,B' rows, then blank out blocks
# that carry no usable mutation.
merged_expl_mut_filt = null_mutation_blocks(
    fill_mutations_when_AB(merged_expl_mut_filt, MUT_COLS)
)

  out[match_col]
  mi_mask = mi.str.strip().str.contains(r'(^|[;,]\s*)MI:0429([;,]\s*|$)', flags=re.I, na=False)
  mi_mask = mi.str.strip().str.contains(r'(^|[;,]\s*)MI:0429([;,]\s*|$)', flags=re.I, na=False)


In [301]:
# First mirror one-sided mutation info on 'A,B' rows, then blank out blocks
# that carry no usable mutation.
merged_neg_expl_mut_filt = null_mutation_blocks(
    fill_mutations_when_AB(merged_neg_expl_mut_filt, MUT_COLS)
)

  out[match_col]
  mi_mask = mi.str.strip().str.contains(r'(^|[;,]\s*)MI:0429([;,]\s*|$)', flags=re.I, na=False)
  mi_mask = mi.str.strip().str.contains(r'(^|[;,]\s*)MI:0429([;,]\s*|$)', flags=re.I, na=False)


In [302]:
# Columns prefixed "Mutation " (the aggregated mutation annotations).
agg_mut_cols = [
    "Mutation # Feature AC",
    "Mutation Affected protein AC",
    "Mutation Affected protein full name",
    "Mutation Affected protein organism",
    "Mutation Affected protein symbol",
    "Mutation Feature annotation(s)",
    "Mutation Feature range(s)",
    "Mutation Feature short label",
    "Mutation Feature type",
    "Mutation Figure legend(s)",
    "Mutation Interaction AC",
    "Mutation Interaction participants",
    "Mutation Interactor Matches",
    "Mutation Original sequence",
    "Mutation PubMedID",
    "Mutation Resulting sequence",
    "Mutation Xref ID(s)",
    "Mutation new_binds_bo_ac",
    "Mutation new_binds_bo_annotation",
    "Mutation new_binds_bo_feature_type",
    "Mutation new_binds_to_gname_bo_annotation",
    "Mutation new_binds_to_uniprot_bo_annotation",
    "Mutation new_nobind_to_gname_bo_annotation",
    "Mutation new_nobind_to_uniprot_bo_annotation",
    "Mutation og_binds_bo_ac",
    "Mutation og_binds_bo_annotation",
    "Mutation og_binds_bo_feature_type",
]

# Scraped per-interactor mutation columns: mutation_<field>_<1|2>, ordered by
# field name with side 1 before side 2.
scraped_mut_cols = [
    f"mutation_{_field}_{_side}"
    for _field in ("begin", "end", "mi", "name", "new", "orig", "range", "short")
    for _side in (1, 2)
]

# uniprot_A* then uniprot_B*, each with the same set of suffixes.
_uniprot_suffixes = (
    "", "_equalseq", "_equalseq_canonical", "_full", "_inseq",
    "_inseq_canonical", "_intact", "_noiso1", "_noisoforms",
)
_uniprot_cols = [
    f"uniprot_{_letter}{_suffix}"
    for _letter in ("A", "B")
    for _suffix in _uniprot_suffixes
]

# Full, ordered set of columns retained downstream.
keep_cols = scraped_mut_cols + agg_mut_cols + _uniprot_cols + [
    "uniprot_gene_name_A",
    "uniprot_gene_name_B",
    "uniprotkb_1",
    "uniprotkb_2",
    "unique_all_intact_sorted",
    "unique_expansions",
    "unique_id",
    "unique_score_int",
    "unique_scores",
    "unique_uniprot_noiso1_pair",
    "unique_uniprot_noisoforms_pair",
    "unique_uniprot_pair",
    "interaction_intactid",
    "intactid_1",
    "intactid_2",
    "dip_1",
    "dip_2",
    "seq_pair_id",
    "length_1",
    "length_2",
    "aa_1",
    "aa_2",
    "invalids_aa_1",
    "invalids_aa_2",
]

In [303]:
# Restrict to keep_cols and append boolean flags saying whether a row has any
# non-null scraped mutation value (overall / side 1 / side 2) and any non-null
# aggregated "Mutation ..." value.
_scraped_side_1 = [x for x in scraped_mut_cols if x.endswith("_1")]
_scraped_side_2 = [x for x in scraped_mut_cols if x.endswith("_2")]

merged_expl_mut_filt = merged_expl_mut_filt[keep_cols].assign(
    scraped_mut_has_info=lambda d: d[scraped_mut_cols].notna().any(axis=1),
    scraped_mut_has_info_1=lambda d: d[_scraped_side_1].notna().any(axis=1),
    scraped_mut_has_info_2=lambda d: d[_scraped_side_2].notna().any(axis=1),
    agg_mut_has_info=lambda d: d[agg_mut_cols].notna().any(axis=1),
)

In [304]:
# Restrict to keep_cols and append boolean flags saying whether a row has any
# non-null scraped mutation value (overall / side 1 / side 2) and any non-null
# aggregated "Mutation ..." value.
_scraped_side_1 = [x for x in scraped_mut_cols if x.endswith("_1")]
_scraped_side_2 = [x for x in scraped_mut_cols if x.endswith("_2")]

merged_neg_expl_mut_filt = merged_neg_expl_mut_filt[keep_cols].assign(
    scraped_mut_has_info=lambda d: d[scraped_mut_cols].notna().any(axis=1),
    scraped_mut_has_info_1=lambda d: d[_scraped_side_1].notna().any(axis=1),
    scraped_mut_has_info_2=lambda d: d[_scraped_side_2].notna().any(axis=1),
    agg_mut_has_info=lambda d: d[agg_mut_cols].notna().any(axis=1),
)

In [305]:
# Collapse list-valued cells into "|"-joined strings; every other value is left
# as-is. isinstance() is used instead of an exact type comparison so list
# subclasses are flattened too.
# NOTE(review): presumably needed because the following drop_duplicates()/groupby
# steps cannot hash list cells - confirm.
for c in merged_expl_mut_filt:
    merged_expl_mut_filt[c] = merged_expl_mut_filt[c].apply(
        lambda x: "|".join(x) if isinstance(x, list) else x
    )

In [306]:
# Collapse list-valued cells into "|"-joined strings; every other value is left
# as-is. isinstance() is used instead of an exact type comparison so list
# subclasses are flattened too.
# NOTE(review): presumably needed because the following drop_duplicates()/groupby
# steps cannot hash list cells - confirm.
for c in merged_neg_expl_mut_filt:
    merged_neg_expl_mut_filt[c] = merged_neg_expl_mut_filt[c].apply(
        lambda x: "|".join(x) if isinstance(x, list) else x
    )

In [307]:
# Remove exact duplicate rows, then rows in which every MUT_COLS value is missing.
merged_expl_mut_filt = merged_expl_mut_filt.drop_duplicates().reset_index(drop=True)
mask = merged_expl_mut_filt[MUT_COLS].isna().all(axis=1)
merged_expl_mut_filt = merged_expl_mut_filt[~mask].reset_index(drop=True)
print(len(merged_expl_mut_filt))

73211


In [308]:
# Remove exact duplicate rows, then rows in which every MUT_COLS value is missing.
merged_neg_expl_mut_filt = merged_neg_expl_mut_filt.drop_duplicates().reset_index(drop=True)
mask = merged_neg_expl_mut_filt[MUT_COLS].isna().all(axis=1)
merged_neg_expl_mut_filt = merged_neg_expl_mut_filt[~mask].reset_index(drop=True)
print(len(merged_neg_expl_mut_filt))

13


In [309]:
# Eyeball the curated match side next to the scraped short labels of both interactors.
_preview_cols = [
    "interaction_intactid",
    "Mutation Interactor Matches",
    "Mutation Feature short label",
    "mutation_short_1",
    "mutation_short_2",
]
display(merged_neg_expl_mut_filt[_preview_cols])

Unnamed: 0,interaction_intactid,Mutation Interactor Matches,Mutation Feature short label,mutation_short_1,mutation_short_2
0,EBI-8840423,B,P20340:p.Gln72Leu,,P20340:p.Gln72Leu
1,EBI-8840419,B,Q9NRW1:p.Gln72Leu,,Q9NRW1:p.Gln72Leu
2,EBI-8840571,B,Q9NRW1:p.Gln72Leu,,Q9NRW1:p.Gln72Leu
3,EBI-8840307,B,Q9NRW1:p.Gln72Leu,,Q9NRW1:p.Gln72Leu
4,EBI-8840307,B,Q9NRW1:p.Thr27Asn,,Q9NRW1:p.Thr27Asn
5,EBI-8596072,A,Q61686:p.[Ile165Glu;Tyr168Glu],Q61686:p.[Ile165Glu;Tyr168Glu],
6,EBI-8596042,"A,B",Q61686:p.[Ile165Glu;Tyr168Glu],Q61686:p.[Ile165Glu;Tyr168Glu],Q61686:p.[Ile165Glu;Tyr168Glu]
7,EBI-8551826,B,Q13526:p.Trp34Ala,,Q13526:p.Trp34Ala
8,EBI-492194,B,P62834:p.Gly12Val,,P62834:p.Gly12Val
9,EBI-7444014,A,Q96LW7:p.Leu65Ala,Q96LW7:p.Leu65Ala,


In [310]:
# Show the rows whose affected-protein accession contains "dip", with the
# scraped side-2 mutation fields next to the curated label and range.
_dip_rows = merged_expl_mut_filt["Mutation Affected protein AC"].fillna("").str.contains("dip")
_dip_view_cols = [
    "interaction_intactid", "dip_2",
    "mutation_orig_2", "mutation_new_2", "mutation_range_2", "mutation_short_2",
    "Mutation Feature short label", "Mutation Feature range(s)",
]
merged_expl_mut_filt.loc[_dip_rows, _dip_view_cols]

Unnamed: 0,interaction_intactid,dip_2,mutation_orig_2,mutation_new_2,mutation_range_2,mutation_short_2,Mutation Feature short label,Mutation Feature range(s)
14920,EBI-15643757,DIP-60258N,"G,G","A,A","7-7,6-6",h3k9:p.[Gly6Ala;Gly7Ala],h3k9:p.[Gly6Ala;Gly7Ala],"7-7,6-6"
14921,EBI-15643757,DIP-60258N,S,A,4-4,h3k9:p.Ser4Ala,h3k9:p.Ser4Ala,4-4
14922,EBI-15643684,DIP-60259N,"P,A","A,P","7-7,6-6",h3k27:p.[Ala6Pro;Pro7Ala],h3k27:p.[Ala6Pro;Pro7Ala],"7-7,6-6"
14924,EBI-16026675,DIP-29604N,R,A,8-8,h3k9-1:p.Arg8Ala,h3k9-1:p.Arg8Ala,8-8
14925,EBI-16026675,DIP-29604N,S,A,10-10,h3k9-1:p.Ser10Ala,h3k9-1:p.Ser10Ala,10-10
...,...,...,...,...,...,...,...,...
53560,EBI-15681090,DIP-24261N,I,A,44-44,ile44ala,ile44ala,44-44
72047,EBI-15643956,DIP-29604N,G,P,12-12,h3k9-1:p.Gly12Pro,h3k9-1:p.Gly12Pro,12-12
72048,EBI-15643956,DIP-29604N,A,R,7-7,h3k9-1:p.Ala7Arg,h3k9-1:p.Ala7Arg,7-7
72059,EBI-15712296,DIP-46146N,P,S,9-9,ssb-1:p.Pro9Ser,ssb-1:p.Pro9Ser,9-9


In [311]:
# Collapse rows that differ only in the accession columns below: group by every
# other column and combine these three with join_unique_nonnull (defined
# elsewhere in the notebook). The columns are kept, not dropped.
to_join = [
    "Mutation # Feature AC",
    "Mutation og_binds_bo_ac",
    "Mutation new_binds_bo_ac",
]

# Grouping keys = all remaining columns, in their current order
all_except_featac = [
    col for col in merged_expl_mut_filt.columns.tolist() if col not in to_join
]

# Same aggregation function for each joined column
agg_spec = dict.fromkeys(to_join, join_unique_nonnull)

display(merged_expl_mut_filt.head())

# dropna=False keeps groups whose key columns contain missing values
_grouped = merged_expl_mut_filt.groupby(all_except_featac, dropna=False, as_index=False)
merged_expl_mut_filt = _grouped.agg(agg_spec)

print(f"Joined Mutation # Feature AC column because it fails to meaningfully separate features. Dropped duplicate rows again. Remaining rows: {len(merged_expl_mut_filt)}")

display(merged_expl_mut_filt.head())

Unnamed: 0,mutation_begin_1,mutation_begin_2,mutation_end_1,mutation_end_2,mutation_mi_1,mutation_mi_2,mutation_name_1,mutation_name_2,mutation_new_1,mutation_new_2,...,length_1,length_2,aa_1,aa_2,invalids_aa_1,invalids_aa_2,scraped_mut_has_info,scraped_mut_has_info_1,scraped_mut_has_info_2,agg_mut_has_info
0,1033,,1033,,MI:0573,,mutation disrupting interaction,,A,,...,1367,473,MKSGSGGGSPTSLWGLLFLSAALSLWPTSGEICGPGIDIRNDYQQL...,MNKLSGGGGRRTRVEGGQLGGEEWTRHGSFVNKPTRGWLHPNDKVM...,,,True,True,False,True
1,980,,980,,MI:0573,,mutation disrupting interaction,,F,,...,1367,473,MKSGSGGGSPTSLWGLLFLSAALSLWPTSGEICGPGIDIRNDYQQL...,MNKLSGGGGRRTRVEGGQLGGEEWTRHGSFVNKPTRGWLHPNDKVM...,,,True,True,False,True
2,520,,520,,MI:0118,,mutation,,I,,...,984,591,MASNPERGEILLTELQGDSRSLPFSENVSAVQKLDFSDTMVQQKLD...,SMSYTWTGALITPCAAEESKLPINPLSNSLLRHHNMVYATTSRSAS...,,,True,True,False,True
3,520,,520,,MI:0118,,mutation,,I,,...,984,591,MASNPERGEILLTELQGDSRSLPFSENVSAVQKLDFSDTMVQQKLD...,SMSYTWTGALITPCAAEESKLPINPLSNSLLRHHNMVYATTSRSAS...,,,True,True,False,True
4,131128,,131128,,MI:0573,,mutation disrupting interaction,,"A,A",,...,1050,2316,MAAVKKEGGALSEAMSLEGDEWELSKENVQPLRQGRIMSTLQGALA...,MDGVSSEANEENDNIERPVRRRHSSILKPPRSPLQDLRGGNERVQE...,,,True,True,False,True


Joined Mutation # Feature AC column because it fails to meaningfully separate features. Dropped duplicate rows again. Remaining rows: 72771


Unnamed: 0,mutation_begin_1,mutation_begin_2,mutation_end_1,mutation_end_2,mutation_mi_1,mutation_mi_2,mutation_name_1,mutation_name_2,mutation_new_1,mutation_new_2,...,aa_2,invalids_aa_1,invalids_aa_2,scraped_mut_has_info,scraped_mut_has_info_1,scraped_mut_has_info_2,agg_mut_has_info,Mutation # Feature AC,Mutation og_binds_bo_ac,Mutation new_binds_bo_ac
0,1,1,134,134,MI:0573,MI:0573,mutation disrupting interaction,mutation disrupting interaction,,,...,MAAAMNLYTCSRSFQDSGGELMDALVPFIKSVSDSPSSSSAASASA...,,,True,True,True,True,EBI-7529161,,
1,1,1,14,14,MI:0119,MI:0119,mutation decreasing interaction,mutation decreasing interaction,.,.,...,MTERRVPFSLLRGPSWDPFRDWYPHSRLFDQAFGLPRLPEEWSQWL...,,,True,True,True,True,"EBI-22050815,EBI-21928482",,
2,1,1,24,24,MI:0119,MI:0119,mutation decreasing interaction,mutation decreasing interaction,.,.,...,MTERRVPFSLLRGPSWDPFRDWYPHSRLFDQAFGLPRLPEEWSQWL...,,,True,True,True,True,"EBI-22050818,EBI-21928487",,
3,1,1,76,76,MI:0382,MI:0382,mutation increasing interaction,mutation increasing interaction,,,...,MIVFVRFNSSHGFPVEVDSDTSIFQLKEVVAKRQGVPADQLRVIFA...,,,True,True,True,True,EBI-6989963,,
4,1,1,82,82,MI:0119,MI:0119,mutation decreasing interaction,mutation decreasing interaction,,,...,MNTEATHDQNEALTTGARLRNAREQLGLSQQAVAERLCLKVSTVRD...,,,True,True,True,True,EBI-8524943,,


In [312]:
# Collapse rows that differ only in the accession columns below: group by every
# other column and combine these three with join_unique_nonnull (defined
# elsewhere in the notebook). The columns are kept, not dropped.
to_join = [
    "Mutation # Feature AC",
    "Mutation og_binds_bo_ac",
    "Mutation new_binds_bo_ac",
]

# Grouping keys = all remaining columns, in their current order
all_except_featac = [
    col for col in merged_neg_expl_mut_filt.columns.tolist() if col not in to_join
]

# Same aggregation function for each joined column
agg_spec = dict.fromkeys(to_join, join_unique_nonnull)

display(merged_neg_expl_mut_filt.head())

# dropna=False keeps groups whose key columns contain missing values
_grouped = merged_neg_expl_mut_filt.groupby(all_except_featac, dropna=False, as_index=False)
merged_neg_expl_mut_filt = _grouped.agg(agg_spec)

print(f"Joined Mutation # Feature AC column because it fails to meaningfully separate features. Dropped duplicate rows again. Remaining rows: {len(merged_neg_expl_mut_filt)}")

display(merged_neg_expl_mut_filt.head())

Unnamed: 0,mutation_begin_1,mutation_begin_2,mutation_end_1,mutation_end_2,mutation_mi_1,mutation_mi_2,mutation_name_1,mutation_name_2,mutation_new_1,mutation_new_2,...,length_1,length_2,aa_1,aa_2,invalids_aa_1,invalids_aa_2,scraped_mut_has_info,scraped_mut_has_info_1,scraped_mut_has_info_2,agg_mut_has_info
0,,72,,72,,MI:0118,,mutation,,L,...,837,208,MNHLEGSAEVEVTDEAAGGEVNESVEADLEHPEVEEEQQQPPQQQH...,MSTGGDFGNPLRKFKLVFLGEQSVGKTSLITRFMYDSFDNTYQATI...,,,True,False,True,True
1,,72,,72,,MI:0118,,mutation,,L,...,837,208,MNHLEGSAEVEVTDEAAGGEVNESVEADLEHPEVEEEQQQPPQQQH...,MSAGGDFGNPLRKFKLVFLGEQSVGKTSLITRFMYDSFDNTYQATI...,,,True,False,True,True
2,,72,,72,,MI:0118,,mutation,,L,...,837,208,MNHLEGSAEVEVTDEAAGGEVNESVEADLEHPEVEEEQQQPPQQQH...,MSAGGDFGNPLRKFKLVFLGEQSVGKTSLITRFMYDSFDNTYQATI...,,,True,False,True,True
3,,72,,72,,MI:0118,,mutation,,L,...,837,208,MNHLEGSAEVEVTDEAAGGEVNESVEADLEHPEVEEEQQQPPQQQH...,MSAGGDFGNPLRKFKLVFLGEQSVGKTSLITRFMYDSFDNTYQATI...,,,True,False,True,True
4,,27,,27,,MI:0118,,mutation,,N,...,837,208,MNHLEGSAEVEVTDEAAGGEVNESVEADLEHPEVEEEQQQPPQQQH...,MSAGGDFGNPLRKFKLVFLGEQSVGKTSLITRFMYDSFDNTYQATI...,,,True,False,True,True


Joined Mutation # Feature AC column because it fails to meaningfully separate features. Dropped duplicate rows again. Remaining rows: 13


Unnamed: 0,mutation_begin_1,mutation_begin_2,mutation_end_1,mutation_end_2,mutation_mi_1,mutation_mi_2,mutation_name_1,mutation_name_2,mutation_new_1,mutation_new_2,...,aa_2,invalids_aa_1,invalids_aa_2,scraped_mut_has_info,scraped_mut_has_info_1,scraped_mut_has_info_2,agg_mut_has_info,Mutation # Feature AC,Mutation og_binds_bo_ac,Mutation new_binds_bo_ac
0,165168,165168.0,165168,165168.0,MI:0573,MI:0573,mutation disrupting interaction,mutation disrupting interaction,"E,E","E,E",...,MGKKTKRTADSSSSEDEEEYVVEKVLDRRMVKGQVEYLLKWKGFSE...,,,True,True,True,True,EBI-8596058,,
1,168165,,168165,,MI:0573,,mutation disrupting interaction,,"E,E",,...,MAENLKGCSVCCKSSWNQLQDLCRLAKLSCPALGVSKKNLYDFEVE...,,,True,True,False,True,EBI-8596088,,
2,27,,27,,MI:0118,,mutation,,N,,...,MNHLEGSAEVEVTDEAAGGEVNESVEADLEHPEVEEEQQQPPQQQH...,,,True,True,False,True,EBI-8840331,,
3,65,,65,,MI:0573,,mutation disrupting interaction,,A,,...,MEPTAPSLTEEDLTEVKKDALENLRVYLCEKIIAERHFDHLRAKKI...,,,True,True,False,True,EBI-7444029,,
4,72,,72,,MI:0118,,mutation,,L,,...,MNHLEGSAEVEVTDEAAGGEVNESVEADLEHPEVEEEQQQPPQQQH...,,,True,True,False,True,EBI-8840592,,


In [313]:
# Distinct "Feature type" values (in value_counts order), reduced to the bare
# MI identifier: keep entries starting with "psi-mi:", take the text between
# that prefix and the first "(", and strip the surrounding double quotes.
_feature_types = mutations["Feature type"].value_counts().reset_index()["Feature type"].unique().tolist()
l = [
    _ft.split("psi-mi:")[1].split("(")[0].strip("\"")
    for _ft in _feature_types
    if _ft.startswith("psi-mi:")
]
l

['MI:2226',
 'MI:1128',
 'MI:0119',
 'MI:0118',
 'MI:0573',
 'MI:1133',
 'MI:2227',
 'MI:0382',
 'MI:1132',
 'MI:1130',
 'MI:1129',
 'MI:1131']

In [314]:
# Show the mutation_mi_ok terms whose id is in `l` first, then the terms that
# are not; one row per id, sorted by id.
_id_in_l = mutation_mi_ok["id"].isin(l)
for _subset in (mutation_mi_ok.loc[_id_in_l], mutation_mi_ok.loc[~_id_in_l]):
    display(_subset.drop_duplicates("id").sort_values(by=["id"], ascending=True))

Unnamed: 0,label,id,parent_id,parent_ids_all,parent_names_all
0,mutation,MI:0118,,MI:0252,biological feature
1,mutation decreasing interaction,MI:0119,MI:0118,MI:0118,mutation
10,mutation increasing interaction,MI:0382,MI:0118,MI:0118,mutation
2,mutation disrupting interaction,MI:0573,MI:0119,MI:0119,mutation decreasing interaction
4,mutation disrupting interaction strength,MI:1128,MI:0573,MI:0573,mutation disrupting interaction
6,mutation disrupting interaction rate,MI:1129,MI:0573,MI:0573,mutation disrupting interaction
8,mutation decreasing interaction rate,MI:1130,MI:0119,MI:0119,mutation decreasing interaction
11,mutation increasing interaction rate,MI:1131,MI:0382,MI:0382,mutation increasing interaction
12,mutation increasing interaction strength,MI:1132,MI:0382,MI:0382,mutation increasing interaction
9,mutation decreasing interaction strength,MI:1133,MI:0119,MI:0119,mutation decreasing interaction


Unnamed: 0,label,id,parent_id,parent_ids_all,parent_names_all
3,necessary binding region,MI:0429,MI:0573,"MI:0117, MI:0573, MI:1128, MI:1129","binding-associated region, mutation disrupting..."
15,mutation with complex effect,MI:2333,MI:0118,MI:0118,mutation


In [315]:
# Sanity check: no row may match one interactor while carrying a scraped range
# only for the other one.
_match_side = merged_expl_mut_filt["Mutation Interactor Matches"]
_rng_1 = merged_expl_mut_filt["mutation_range_1"]
_rng_2 = merged_expl_mut_filt["mutation_range_2"]

_a_with_b_data = (_match_side == "A") & _rng_2.notna() & _rng_1.isna()
test1 = len(merged_expl_mut_filt.loc[_a_with_b_data]) == 0
print(f"No rows where interactor is A but mutation_ data is associated with B: {test1}")

_b_with_a_data = (_match_side == "B") & _rng_1.notna() & _rng_2.isna()
test1 = len(merged_expl_mut_filt.loc[_b_with_a_data]) == 0
print(f"No rows where interactor is B but mutation_ data is associated with A: {test1}")

No rows where interactor is A but mutation_ data is associated with B: True
No rows where interactor is B but mutation_ data is associated with A: True


In [316]:
# Sanity check: no row may match one interactor while carrying a scraped range
# only for the other one.
_match_side = merged_neg_expl_mut_filt["Mutation Interactor Matches"]
_rng_1 = merged_neg_expl_mut_filt["mutation_range_1"]
_rng_2 = merged_neg_expl_mut_filt["mutation_range_2"]

_a_with_b_data = (_match_side == "A") & _rng_2.notna() & _rng_1.isna()
test1 = len(merged_neg_expl_mut_filt.loc[_a_with_b_data]) == 0
print(f"No rows where interactor is A but mutation_ data is associated with B: {test1}")

_b_with_a_data = (_match_side == "B") & _rng_1.notna() & _rng_2.isna()
test1 = len(merged_neg_expl_mut_filt.loc[_b_with_a_data]) == 0
print(f"No rows where interactor is B but mutation_ data is associated with A: {test1}")

No rows where interactor is A but mutation_ data is associated with B: True
No rows where interactor is B but mutation_ data is associated with A: True


In [317]:
# Take "Mutation # Feature AC" out of the aggregated column list, then show the
# match side plus all aggregated and scraped mutation columns for one interaction.
agg_mut_cols = [col for col in agg_mut_cols if col != "Mutation # Feature AC"]
_one_interaction = merged_expl_mut_filt["interaction_intactid"] == "EBI-1001015"
_inspect_cols = ["Mutation Interactor Matches"] + agg_mut_cols + scraped_mut_cols
merged_expl_mut_filt.loc[_one_interaction, _inspect_cols]

Unnamed: 0,Mutation Interactor Matches,Mutation Affected protein AC,Mutation Affected protein full name,Mutation Affected protein organism,Mutation Affected protein symbol,Mutation Feature annotation(s),Mutation Feature range(s),Mutation Feature short label,Mutation Feature type,Mutation Figure legend(s),...,mutation_name_1,mutation_name_2,mutation_new_1,mutation_new_2,mutation_orig_1,mutation_orig_2,mutation_range_1,mutation_range_2,mutation_short_1,mutation_short_2
40280,A,uniprotkb:P00533,Epidermal growth factor receptor,taxid:9606(human)|taxid:9606(Homo sapiens),uniprotkb:EGFR(gene name),-,721-721,P00533:p.Gly721Arg,"psi-mi:""MI:0118""(mutation)",figure legend:4 C,...,mutation,,R,,G,,721-721,,P00533:p.Gly721Arg,
69756,B,uniprotkb:P04626,Receptor tyrosine-protein kinase erbB-2,taxid:9606(human)|taxid:9606(Homo sapiens),uniprotkb:ERBB2(gene name),-,776-777,P04626:p.Gly776_Val777insTyrValMetAla,"psi-mi:""MI:0382""(mutation increasing interaction)",figure legend:4 C,...,,mutation increasing interaction,,GYVMAV,,GV,,776-777,,P04626:p.Gly776_Val777insTyrValMetAla


In [318]:
# Count rows lacking a curated feature range, and verify that none of them is
# flagged as having aggregated mutation data.
_no_range = merged_expl_mut_filt["Mutation Feature range(s)"].isna()

test1 = len(merged_expl_mut_filt.loc[_no_range])
print(f"{test1} rows do not have a Mutation Feature range(s) value.")

_no_range_but_info = _no_range & merged_expl_mut_filt["agg_mut_has_info"]
test1 = len(merged_expl_mut_filt.loc[_no_range_but_info]) == 0
print(f"\tAll of these rows do not have Mutation-aggregated data at all: {test1}")

49 rows do not have a Mutation Feature range(s) value.
	All of these rows do not have Mutation-aggregated data at all: True


In [319]:
# Count rows lacking a curated feature range, and verify that none of them is
# flagged as having aggregated mutation data.
_no_range = merged_neg_expl_mut_filt["Mutation Feature range(s)"].isna()

test1 = len(merged_neg_expl_mut_filt.loc[_no_range])
print(f"{test1} rows do not have a Mutation Feature range(s) value.")

_no_range_but_info = _no_range & merged_neg_expl_mut_filt["agg_mut_has_info"]
test1 = len(merged_neg_expl_mut_filt.loc[_no_range_but_info]) == 0
print(f"\tAll of these rows do not have Mutation-aggregated data at all: {test1}")

0 rows do not have a Mutation Feature range(s) value.
	All of these rows do not have Mutation-aggregated data at all: True


In [320]:
# Consistency checks on the curated columns: the comma-separated ranges,
# original sequences and resulting sequences must have the same number of
# entries, and every range must contain a dash.
_n_ranges = merged_expl_mut_filt["Mutation Feature range(s)"].fillna("").str.split(",").map(len)

_n_original = merged_expl_mut_filt["Mutation Original sequence"].fillna("").str.split(",").map(len)
test1 = len(merged_expl_mut_filt.loc[_n_ranges != _n_original]) == 0
print(f"All rows have the same # of mutation feature ranges as original sequences: {test1}")

_n_resulting = merged_expl_mut_filt["Mutation Resulting sequence"].fillna("").str.split(",").map(len)
test1 = len(merged_expl_mut_filt.loc[_n_ranges != _n_resulting]) == 0
print(f"All rows have the same # of mutation feature ranges as resulting sequences: {test1}")

# Missing ranges are filled with "-" so they do not count as violations
_has_dash = merged_expl_mut_filt["Mutation Feature range(s)"].fillna("-").str.contains("-")
test1 = len(merged_expl_mut_filt.loc[~_has_dash]) == 0
print(f"All rows have a dash - in mutation feature range indicating the span of the feature: {test1}")


All rows have the same # of mutation feature ranges as original sequences: True
All rows have the same # of mutation feature ranges as resulting sequences: True
All rows have a dash - in mutation feature range indicating the span of the feature: True


In [321]:
# Consistency checks on the curated columns: the comma-separated ranges,
# original sequences and resulting sequences must have the same number of
# entries, and every range must contain a dash.
_n_ranges = merged_neg_expl_mut_filt["Mutation Feature range(s)"].fillna("").str.split(",").map(len)

_n_original = merged_neg_expl_mut_filt["Mutation Original sequence"].fillna("").str.split(",").map(len)
test1 = len(merged_neg_expl_mut_filt.loc[_n_ranges != _n_original]) == 0
print(f"All rows have the same # of mutation feature ranges as original sequences: {test1}")

_n_resulting = merged_neg_expl_mut_filt["Mutation Resulting sequence"].fillna("").str.split(",").map(len)
test1 = len(merged_neg_expl_mut_filt.loc[_n_ranges != _n_resulting]) == 0
print(f"All rows have the same # of mutation feature ranges as resulting sequences: {test1}")

# Missing ranges are filled with "-" so they do not count as violations
_has_dash = merged_neg_expl_mut_filt["Mutation Feature range(s)"].fillna("-").str.contains("-")
test1 = len(merged_neg_expl_mut_filt.loc[~_has_dash]) == 0
print(f"All rows have a dash - in mutation feature range indicating the span of the feature: {test1}")


All rows have the same # of mutation feature ranges as original sequences: True
All rows have the same # of mutation feature ranges as resulting sequences: True
All rows have a dash - in mutation feature range indicating the span of the feature: True


In [322]:
import re

def get_mutated_sequence(row, partner="A"):
    """
    Build the mutated sequence for partner 'A' or 'B' from a row.
    Supports multiple edits (comma-separated), substitutions, deletions, insertions.

    Expected columns:
      - "Mutated Partner"  (e.g., "A", "B", "A,B")
      - "mutation_range_1", "mutation_range_2"    (e.g., "173-173" or "120-122, 140-139")
      - "mutation_orig_1", "mutation_orig_2"      (original substrings, comma-separated)
      - "mutation_new_1",  "mutation_new_2"       (new substrings, comma-separated)
      - amino-acid sequence: "aa_1" (A) and "aa_2" (B)

    Conventions:
      - Ranges are 1-based and inclusive ("a-b", or a single position "a").
      - Insertion:   original == "" or "-"  (insert before position a)
      - Deletion:    replacement in {"", "-"} or only dots (".", "..", "...") or includes dots
                     → dots are always stripped from the replacement, and a replacement that is
                       empty or "-" after stripping removes the region (no "-" or "." is ever
                       written into the final sequence).

    Args:
      row: mapping-like object exposing .get() (e.g. a pandas Series or a dict).
      partner: "A" selects the "*_1" columns; any other value selects the "*_2" columns.

    Returns:
      The edited sequence as a str, or None when any of the following holds:
        - the partner is not listed in "Mutated Partner";
        - the range, sequence, original or replacement value is missing (None/NaN),
          or the range/sequence is empty;
        - the numbers of ranges, originals and replacements differ;
        - a range cannot be parsed as integers or starts below position 1;
        - the expected original residues do not match the sequence at the range.
    """
    def _is_missing(value):
        # None, or float NaN (NaN is the only float that is not equal to itself).
        return value is None or (isinstance(value, float) and value != value)

    # Accept "A", "B", "A,B" (any spacing); also handle lists/tuples/sets
    def _has_partner(m, p):
        if isinstance(m, (list, tuple, set)):
            return p in m
        return bool(re.search(rf'(^|,)\s*{re.escape(p)}\s*(,|$)', str(m)))

    matches = row.get("Mutated Partner")
    if _is_missing(matches) or not _has_partner(matches, partner):
        return None

    suffix = "1" if partner == "A" else "2"

    ranges_str = row.get(f"mutation_range_{suffix}")
    if _is_missing(ranges_str) or str(ranges_str).strip() == "":
        return None

    aa_og = row.get(f"aa_{suffix}")
    if not isinstance(aa_og, str) or len(aa_og) == 0:
        return None

    exp_raw = row.get(f"mutation_orig_{suffix}", "")
    rep_raw = row.get(f"mutation_new_{suffix}", "")
    # A missing value would otherwise be stringified ("nan"/"None") and could be
    # spliced into the sequence as if it were residues.
    if _is_missing(exp_raw) or _is_missing(rep_raw):
        return None

    # Tokenize edits
    ranges = [t.strip() for t in str(ranges_str).split(",") if t.strip()]
    exp_tokens = [t.strip() for t in str(exp_raw).split(",")] if ranges else []
    rep_tokens = [t.strip() for t in str(rep_raw).split(",")] if ranges else []

    # Make sure counts line up
    if not (len(ranges) == len(exp_tokens) == len(rep_tokens)):
        return None

    edits = []
    for r, exp, rep in zip(ranges, exp_tokens, rep_tokens):
        # Normalize and drop all whitespace
        exp_norm = re.sub(r"\s+", "", exp)
        # Dots are "delete this position" markers:
        # - mixed letters + dots keep only the letters ("GAG......." -> "GAG")
        # - all dots become "" -> full deletion
        rep_norm = re.sub(r"\s+", "", rep).replace(".", "")
        # A lone "-" also means "nothing"; never write it into the sequence.
        rep_use = "" if rep_norm == "-" else rep_norm

        # Parse range: "a-b" or "a". Non-numeric placeholders are rejected
        # instead of raising out of a DataFrame.apply.
        try:
            if "-" in r:
                a_str, b_str = r.split("-", 1)
                a, b = int(a_str), int(b_str)
            else:
                a = b = int(r)
        except ValueError:
            return None
        # Positions are 1-based; a start below 1 would turn into a negative
        # slice index and silently address the end of the sequence.
        if a < 1:
            return None

        start0 = a - 1
        if exp_norm in ("", "-"):
            # INSERTION: zero-width edit immediately before position a
            end0 = start0
        else:
            # SUBSTITUTION / DELETION: replace the inclusive span a..b
            end0 = b
            # Verify original substring matches expected
            if aa_og[start0:end0] != exp_norm:
                return None

        edits.append((start0, end0, rep_use))

    # Apply edits left→right; the running offset accounts for length changes
    # introduced by the edits already applied.
    s = aa_og
    offset = 0
    for start0, end0, rep_use in sorted(edits):
        s = s[:start0 + offset] + rep_use + s[end0 + offset:]
        offset += len(rep_use) - (end0 - start0)

    return s


In [323]:
import re

def parse_residue_list(residue_str: str) -> list[str]:
    """
    Split a comma-separated residue string into a clean list,
    stripping whitespace and stray quotes.

    Entries that are empty or whitespace-only are skipped. None and any float
    (including NaN and float subclasses such as numpy.float64) are treated as
    "no residues" and yield an empty list.
    """
    # isinstance rather than an exact type comparison, so that float
    # subclasses are also recognised as missing instead of crashing on .split.
    if residue_str is None or isinstance(residue_str, float):
        return []
    return [
        token.strip().strip(" '\"")
        for token in residue_str.split(",")
        if token.strip()
    ]


def check_residue_list_lengths(original_residues: str, mutated_residues: str) -> bool:
    """
    Tell whether the original and mutated residue strings hold the same
    number of entries once each has been split by parse_residue_list.
    """
    n_original, n_mutated = (
        len(parse_residue_list(residues))
        for residues in (original_residues, mutated_residues)
    )
    return n_original == n_mutated


def check_deletion_consistency(short_name: str, mutated_residues: str) -> bool:
    """
    Check that for each mutated residue that is one or more '.' characters,
    the corresponding mutation entry in the short name contains 'del'.

    - short_name: e.g. "p.[Cys244_Arg271del;Met44Cys;Met90Val;...]"
    - mutated_residues: e.g. ".,C,V,C,..." (comma-separated)

    Returns:
      - True when mutated_residues is missing (None/float NaN) or contains no dot.
      - False when short_name is missing (None/float NaN).
      - False (after printing a message) when the number of ';'-separated
        short-name entries differs from the number of mutated residues.
      - Otherwise True only if every all-dots residue is paired with a
        short-name entry containing 'del'.
    """
    if mutated_residues is None or isinstance(mutated_residues, float):
        return True
    if short_name is None or isinstance(short_name, float):
        return False

    # Nothing to verify unless some residue entry contains a dot. This is a
    # plain substring test, so the needle is a literal "." (an escaped regex
    # pattern here would look for a backslash and never match real data).
    if "." not in mutated_residues:
        return True

    # Strip outer "p.[...]" if present
    s = short_name.strip()
    if s.startswith("p.[") and s.endswith("]"):
        s = s[3:-1]

    short_parts = [part.strip() for part in s.split(";") if part.strip()]
    mut_list = parse_residue_list(mutated_residues)

    if len(short_parts) != len(mut_list):
        print(
            f"Length mismatch: {len(short_parts)} mutations in short name, "
            f"but {len(mut_list)} mutated residues."
        )
        return False

    # Only all-dots residues (".", "..", ".....", etc.) must map to a 'del' entry.
    return all(
        "del" in mut_desc
        for mut_res, mut_desc in zip(mut_list, short_parts)
        if re.fullmatch(r"\.+", mut_res)
    )


In [324]:
# make sure that orig and mutated are same # of changes
# Each partner is checked against its own columns ("_1" for A, "_2" for B).
merged_expl_mut_filt["mutation_orig_new_samelen_1"] = merged_expl_mut_filt.apply(
    lambda row: check_residue_list_lengths(row["mutation_orig_1"], row["mutation_new_1"]), axis=1
)
merged_expl_mut_filt["mutation_orig_new_samelen_2"] = merged_expl_mut_filt.apply(
    lambda row: check_residue_list_lengths(row["mutation_orig_2"], row["mutation_new_2"]), axis=1
)

test1 = len(merged_expl_mut_filt.loc[
    ~merged_expl_mut_filt["mutation_orig_new_samelen_1"]
])==0
print(f"In all rows, mutation_orig_1 and mutation_new_1 refer to the same number of features: {test1}")
test1 = len(merged_expl_mut_filt.loc[
    ~merged_expl_mut_filt["mutation_orig_new_samelen_2"]
])==0
print(f"In all rows, mutation_orig_2 and mutation_new_2 refer to the same number of features: {test1}")

# Remove both temporary flag columns so neither leaks into later steps.
merged_expl_mut_filt = merged_expl_mut_filt.drop(columns=["mutation_orig_new_samelen_1","mutation_orig_new_samelen_2"])

In all rows, mutation_orig_1 and mutation_new_1 refer to the same number of features: True
In all rows, mutation_orig_2 and mutation_new_2 refer to the same number of features: True


In [325]:
# make sure that orig and mutated are same # of changes
# Each partner is checked against its own columns ("_1" for A, "_2" for B).
merged_neg_expl_mut_filt["mutation_orig_new_samelen_1"] = merged_neg_expl_mut_filt.apply(
    lambda row: check_residue_list_lengths(row["mutation_orig_1"], row["mutation_new_1"]), axis=1
)
merged_neg_expl_mut_filt["mutation_orig_new_samelen_2"] = merged_neg_expl_mut_filt.apply(
    lambda row: check_residue_list_lengths(row["mutation_orig_2"], row["mutation_new_2"]), axis=1
)

test1 = len(merged_neg_expl_mut_filt.loc[
    ~merged_neg_expl_mut_filt["mutation_orig_new_samelen_1"]
])==0
print(f"In all rows, mutation_orig_1 and mutation_new_1 refer to the same number of features: {test1}")
test1 = len(merged_neg_expl_mut_filt.loc[
    ~merged_neg_expl_mut_filt["mutation_orig_new_samelen_2"]
])==0
print(f"In all rows, mutation_orig_2 and mutation_new_2 refer to the same number of features: {test1}")

# Remove both temporary flag columns so neither leaks into later steps.
merged_neg_expl_mut_filt = merged_neg_expl_mut_filt.drop(columns=["mutation_orig_new_samelen_1","mutation_orig_new_samelen_2"])

In all rows, mutation_orig_1 and mutation_new_1 refer to the same number of features: True
In all rows, mutation_orig_2 and mutation_new_2 refer to the same number of features: True


In [326]:
# Every row should carry a short mutation name for at least one partner.
test1 = not (
    merged_expl_mut_filt["mutation_short_1"].isna()
    & merged_expl_mut_filt["mutation_short_2"].isna()
).any()
print(f"Everything has a value in either mutation_short_1 or mutation_short_2: {test1}")

def get_final_mutated_partner(row):
    """
    Decide which interaction partner(s) the mutation in a row applies to.

    Returns row["Mutation Interactor Matches"] unchanged when it is a string.
    Otherwise the answer is built from the short-name columns: "A" is listed
    when "mutation_short_1" is a string and "B" when "mutation_short_2" is a
    string, joined with "," (so the result is "A", "B", "A,B" or "").
    """
    explicit = row["Mutation Interactor Matches"]
    # isinstance (not an exact type comparison) so str subclasses such as
    # numpy.str_ are accepted as strings too.
    if isinstance(explicit, str):
        return explicit

    inferred = [
        partner
        for partner, short_col in (("A", "mutation_short_1"), ("B", "mutation_short_2"))
        if isinstance(row[short_col], str)
    ]
    return ",".join(inferred)
        
# Row-wise: record which partner(s) each mutation applies to.
merged_expl_mut_filt["Mutated Partner"] = merged_expl_mut_filt.apply(get_final_mutated_partner, axis=1)

Everything has a value in either mutation_short_1 or mutation_short_2: True


In [327]:
# Every row should carry a short mutation name for at least one partner.
test1 = not (
    merged_neg_expl_mut_filt["mutation_short_1"].isna()
    & merged_neg_expl_mut_filt["mutation_short_2"].isna()
).any()
print(f"Everything has a value in either mutation_short_1 or mutation_short_2: {test1}")

def get_final_mutated_partner(row):
    """
    Decide which interaction partner(s) the mutation in a row applies to.

    Returns row["Mutation Interactor Matches"] unchanged when it is a string.
    Otherwise the answer is built from the short-name columns: "A" is listed
    when "mutation_short_1" is a string and "B" when "mutation_short_2" is a
    string, joined with "," (so the result is "A", "B", "A,B" or "").
    """
    explicit = row["Mutation Interactor Matches"]
    # isinstance (not an exact type comparison) so str subclasses such as
    # numpy.str_ are accepted as strings too.
    if isinstance(explicit, str):
        return explicit

    inferred = [
        partner
        for partner, short_col in (("A", "mutation_short_1"), ("B", "mutation_short_2"))
        if isinstance(row[short_col], str)
    ]
    return ",".join(inferred)
        
# Row-wise: record which partner(s) each mutation applies to.
merged_neg_expl_mut_filt["Mutated Partner"] = merged_neg_expl_mut_filt.apply(get_final_mutated_partner, axis=1)

Everything has a value in either mutation_short_1 or mutation_short_2: True


In [328]:
# For each partner (A uses the "*_1" columns, B the "*_2" columns): report how
# complete its mutation annotations are, then drop the rows where the partner
# is flagged as mutated but has neither an original nor a resulting sequence.
for _partner, _sfx in (("A", "1"), ("B", "2")):
    # Masks are rebuilt on every pass because the frame shrinks after each drop.
    # na=False keeps the mask strictly boolean even if a partner value is missing.
    _is_mut = merged_expl_mut_filt["Mutated Partner"].str.contains(_partner, na=False)
    _orig_na = merged_expl_mut_filt[f"mutation_orig_{_sfx}"].isna()
    _new_na = merged_expl_mut_filt[f"mutation_new_{_sfx}"].isna()
    # Having "a beginning and end range" fails as soon as either bound is missing.
    _range_na = (
        merged_expl_mut_filt[f"mutation_begin_{_sfx}"].isna()
        | merged_expl_mut_filt[f"mutation_end_{_sfx}"].isna()
    )

    test0 = int(_is_mut.sum())
    # Every count below is a subset of test0, so a denominator of 1 reports
    # 0.00% instead of raising ZeroDivisionError when no row qualifies.
    _denom = max(test0, 1)
    print(f"There are {test0} rows where Partner {_partner} is mutated")

    test1 = not (_is_mut & _range_na).any()
    print(f"\tEverywhere that Partner {_partner} is mutated, we have a beginning and end range: {test1}")

    test1 = int((_is_mut & ~_orig_na & ~_new_na).sum())
    print(f"\tRows where we also have an original and ending sequence: {test1}/{test0} ({100*test1/_denom:.2f}%)")

    test1 = int((_is_mut & ~_orig_na & _new_na).sum())
    print(f"\tRows where we have an orig but not new sequence: {test1}/{test0} ({100*test1/_denom:.2f}%)")

    _no_seq = _is_mut & _orig_na & _new_na
    test1 = int(_no_seq.sum())
    print(f"\tRows where we have no orig and no new sequence: {test1}/{test0} ({100*test1/_denom:.2f}%)")

    # going to drop these rows
    merged_expl_mut_filt = merged_expl_mut_filt.loc[~_no_seq].reset_index(drop=True)
    print(f"\tSize of DataFrame after dropping these {test1} unhelpful rows: {len(merged_expl_mut_filt)}")

There are 44817 rows where Partner A is mutated
	Everywhere that Partner A is mutated, we have a beginning and end range: True
	Rows where we also have an original and ending sequence: 44054/44817 (98.30%)
	Rows where we have an orig but not new sequence: 0/44817 (0.00%)
	Rows where we have no orig and no new sequence: 763/44817 (1.70%)
	Size of DataFrame after dropping these 763 unhelpful rows: 72008
There are 30197 rows where Partner B is mutated
	Everywhere that Partner B is mutated, we have a beginning and end range: True
	Rows where we also have an original and ending sequence: 29524/30197 (97.77%)
	Rows where we have an orig but not new sequence: 0/30197 (0.00%)
	Rows where we have no orig and no new sequence: 673/30197 (2.23%)
	Size of DataFrame after dropping these 673 unhelpful rows: 71335


In [329]:
# For each partner (A uses the "*_1" columns, B the "*_2" columns): report how
# complete its mutation annotations are, then drop the rows where the partner
# is flagged as mutated but has neither an original nor a resulting sequence.
for _partner, _sfx in (("A", "1"), ("B", "2")):
    # Masks are rebuilt on every pass because the frame shrinks after each drop.
    # na=False keeps the mask strictly boolean even if a partner value is missing.
    _is_mut = merged_neg_expl_mut_filt["Mutated Partner"].str.contains(_partner, na=False)
    _orig_na = merged_neg_expl_mut_filt[f"mutation_orig_{_sfx}"].isna()
    _new_na = merged_neg_expl_mut_filt[f"mutation_new_{_sfx}"].isna()
    # Having "a beginning and end range" fails as soon as either bound is missing.
    _range_na = (
        merged_neg_expl_mut_filt[f"mutation_begin_{_sfx}"].isna()
        | merged_neg_expl_mut_filt[f"mutation_end_{_sfx}"].isna()
    )

    test0 = int(_is_mut.sum())
    # Every count below is a subset of test0, so a denominator of 1 reports
    # 0.00% instead of raising ZeroDivisionError when no row qualifies.
    _denom = max(test0, 1)
    print(f"There are {test0} rows where Partner {_partner} is mutated")

    test1 = not (_is_mut & _range_na).any()
    print(f"\tEverywhere that Partner {_partner} is mutated, we have a beginning and end range: {test1}")

    test1 = int((_is_mut & ~_orig_na & ~_new_na).sum())
    print(f"\tRows where we also have an original and ending sequence: {test1}/{test0} ({100*test1/_denom:.2f}%)")

    test1 = int((_is_mut & ~_orig_na & _new_na).sum())
    print(f"\tRows where we have an orig but not new sequence: {test1}/{test0} ({100*test1/_denom:.2f}%)")

    _no_seq = _is_mut & _orig_na & _new_na
    test1 = int(_no_seq.sum())
    print(f"\tRows where we have no orig and no new sequence: {test1}/{test0} ({100*test1/_denom:.2f}%)")

    # going to drop these rows
    merged_neg_expl_mut_filt = merged_neg_expl_mut_filt.loc[~_no_seq].reset_index(drop=True)
    print(f"\tSize of DataFrame after dropping these {test1} unhelpful rows: {len(merged_neg_expl_mut_filt)}")

There are 6 rows where Partner A is mutated
	Everywhere that Partner A is mutated, we have a beginning and end range: True
	Rows where we also have an original and ending sequence: 6/6 (100.00%)
	Rows where we have an orig but not new sequence: 0/6 (0.00%)
	Rows where we have no orig and no new sequence: 0/6 (0.00%)
	Size of DataFrame after dropping these 0 unhelpful rows: 13
There are 8 rows where Partner B is mutated
	Everywhere that Partner B is mutated, we have a beginning and end range: True
	Rows where we also have an original and ending sequence: 8/8 (100.00%)
	Rows where we have an orig but not new sequence: 0/8 (0.00%)
	Rows where we have no orig and no new sequence: 0/8 (0.00%)
	Size of DataFrame after dropping these 0 unhelpful rows: 13


In [330]:
## Investigate the types of sequences and potential sequence mismatches we could have
# For rows with exactly one mutated partner and agg_mut_has_info set, the
# "Mutation ... sequence" columns are compared against that partner's
# mutation_orig_* / mutation_new_* columns.
for _partner, _sfx in (("A", "1"), ("B", "2")):
    for _scraped_col, _stem, _label in (
        ("Mutation Original sequence", "mutation_orig", "original"),
        ("Mutation Resulting sequence", "mutation_new", "mutated"),
    ):
        test1 = len(merged_expl_mut_filt.loc[
            (merged_expl_mut_filt["Mutated Partner"]==_partner) &
            (merged_expl_mut_filt["agg_mut_has_info"]) &
            (merged_expl_mut_filt[_scraped_col]!=merged_expl_mut_filt[f"{_stem}_{_sfx}"])
        ])==0
        print(f"There are no rows where Mutated Partner is {_partner}, info came from both mutated and scraped, and {_label} sequence does not match: {test1}")


# Figure out the .. situation
_n_rows = len(merged_expl_mut_filt)
for _partner, _sfx in (("A", "1"), ("B", "2")):
    # na=False: rows without a new sequence simply do not count as containing a dot.
    test1 = len(merged_expl_mut_filt.loc[
        (merged_expl_mut_filt["Mutated Partner"]==_partner) &
        (merged_expl_mut_filt[f"mutation_new_{_sfx}"].str.contains(r"\.", na=False))
    ])
    # max(..., 1) avoids ZeroDivisionError on an empty frame (the count is 0 then anyway).
    print(f"Total rows where the new sequence for mutated partner {_partner} is one or multiple dots: {test1}/{_n_rows} ({100*test1/max(_n_rows, 1):.2f}%)")

# Working assumption: dots stand for deletions. This step only verifies that
# the short names agree with that reading; it does not modify the sequences.
for _sfx in ("1", "2"):
    merged_expl_mut_filt[f"mutation_del_consistent_{_sfx}"] = merged_expl_mut_filt.apply(
        lambda row, _sfx=_sfx: check_deletion_consistency(row[f"mutation_short_{_sfx}"], row[f"mutation_new_{_sfx}"]),
        axis=1,
    )

for _sfx in ("1", "2"):
    test1 = len(merged_expl_mut_filt.loc[
        ~merged_expl_mut_filt[f"mutation_del_consistent_{_sfx}"]
    ])==0
    print(f"In all rows, mutation_short_{_sfx} has a deletion everywhere that mutation_new_{_sfx} has 1+ periods (\".\",\"..\", etc): {test1}")
merged_expl_mut_filt = merged_expl_mut_filt.drop(columns=["mutation_del_consistent_1","mutation_del_consistent_2"])


There are no rows where Mutated Partner is A, info came from both mutated and scraped, and original sequence does not match: True
There are no rows where Mutated Partner is A, info came from both mutated and scraped, and mutated sequence does not match: True
There are no rows where Mutated Partner is B, info came from both mutated and scraped, and original sequence does not match: True
There are no rows where Mutated Partner is B, info came from both mutated and scraped, and mutated sequence does not match: True
Total rows where the new sequence for mutated partner A is one or multiple dots: 1084/71335 (1.52%)
Total rows where the new sequence for mutated partner B is one or multiple dots: 200/71335 (0.28%)
In all rows, mutation_short_1 has a deletion everywhere that mutation_new_1 has 1+ periods (".","..", etc): True
In all rows, mutation_short_2 has a deletion everywhere that mutation_new_2 has 1+ periods (".","..", etc): True


In [331]:
## Investigate the types of sequences and potential sequence mismatches we could have
# For rows with exactly one mutated partner and agg_mut_has_info set, the
# "Mutation ... sequence" columns are compared against that partner's
# mutation_orig_* / mutation_new_* columns.
for _partner, _sfx in (("A", "1"), ("B", "2")):
    for _scraped_col, _stem, _label in (
        ("Mutation Original sequence", "mutation_orig", "original"),
        ("Mutation Resulting sequence", "mutation_new", "mutated"),
    ):
        test1 = len(merged_neg_expl_mut_filt.loc[
            (merged_neg_expl_mut_filt["Mutated Partner"]==_partner) &
            (merged_neg_expl_mut_filt["agg_mut_has_info"]) &
            (merged_neg_expl_mut_filt[_scraped_col]!=merged_neg_expl_mut_filt[f"{_stem}_{_sfx}"])
        ])==0
        print(f"There are no rows where Mutated Partner is {_partner}, info came from both mutated and scraped, and {_label} sequence does not match: {test1}")


# Figure out the .. situation
_n_rows = len(merged_neg_expl_mut_filt)
for _partner, _sfx in (("A", "1"), ("B", "2")):
    # na=False: rows without a new sequence simply do not count as containing a dot.
    test1 = len(merged_neg_expl_mut_filt.loc[
        (merged_neg_expl_mut_filt["Mutated Partner"]==_partner) &
        (merged_neg_expl_mut_filt[f"mutation_new_{_sfx}"].str.contains(r"\.", na=False))
    ])
    # max(..., 1) avoids ZeroDivisionError on an empty frame (the count is 0 then anyway).
    print(f"Total rows where the new sequence for mutated partner {_partner} is one or multiple dots: {test1}/{_n_rows} ({100*test1/max(_n_rows, 1):.2f}%)")

# Working assumption: dots stand for deletions. This step only verifies that
# the short names agree with that reading; it does not modify the sequences.
for _sfx in ("1", "2"):
    merged_neg_expl_mut_filt[f"mutation_del_consistent_{_sfx}"] = merged_neg_expl_mut_filt.apply(
        lambda row, _sfx=_sfx: check_deletion_consistency(row[f"mutation_short_{_sfx}"], row[f"mutation_new_{_sfx}"]),
        axis=1,
    )

for _sfx in ("1", "2"):
    test1 = len(merged_neg_expl_mut_filt.loc[
        ~merged_neg_expl_mut_filt[f"mutation_del_consistent_{_sfx}"]
    ])==0
    print(f"In all rows, mutation_short_{_sfx} has a deletion everywhere that mutation_new_{_sfx} has 1+ periods (\".\",\"..\", etc): {test1}")
merged_neg_expl_mut_filt = merged_neg_expl_mut_filt.drop(columns=["mutation_del_consistent_1","mutation_del_consistent_2"])


There are no rows where Mutated Partner is A, info came from both mutated and scraped, and original sequence does not match: True
There are no rows where Mutated Partner is A, info came from both mutated and scraped, and mutated sequence does not match: True
There are no rows where Mutated Partner is B, info came from both mutated and scraped, and original sequence does not match: True
There are no rows where Mutated Partner is B, info came from both mutated and scraped, and mutated sequence does not match: True
Total rows where the new sequence for mutated partner A is one or multiple dots: 0/13 (0.00%)
Total rows where the new sequence for mutated partner B is one or multiple dots: 0/13 (0.00%)
In all rows, mutation_short_1 has a deletion everywhere that mutation_new_1 has 1+ periods (".","..", etc): True
In all rows, mutation_short_2 has a deletion everywhere that mutation_new_2 has 1+ periods (".","..", etc): True


In [332]:
# 1) Clean the replacement sequences: trim, then remove every space, newline
#    and tab. Non-string cells are passed through unchanged.
for _col in ("mutation_new_1", "mutation_new_2"):
    merged_expl_mut_filt[_col] = merged_expl_mut_filt[_col].apply(
        lambda x: x.strip().replace(" ","").replace("\n","").replace("\t","") if type(x) is str else x
    )

# 2) Build the mutated sequence for each partner, row by row.
for _dst, _which in (("mutated_aa_1", "A"), ("mutated_aa_2", "B")):
    merged_expl_mut_filt[_dst] = merged_expl_mut_filt.apply(
        lambda row, _which=_which: get_mutated_sequence(row, partner=_which), axis=1
    )

# 3) Run find_invalid_chars against VALID_AAS on every mutated sequence that
#    is a string; all other cells get NaN. Kept as a separate pass so the new
#    columns are appended in the same order as before.
for _dst, _src in (("invalids_mutated_aa_1", "mutated_aa_1"), ("invalids_mutated_aa_2", "mutated_aa_2")):
    merged_expl_mut_filt[_dst] = merged_expl_mut_filt[_src].apply(
        lambda x: find_invalid_chars(x,VALID_AAS) if type(x) is str else np.nan
    )

# Last expression of the cell: display the columns relevant for a spot check.
merged_expl_mut_filt.loc[:, [
    "unique_id",
    "aa_1",
    "aa_2",
    "Mutation Interactor Matches",
    "Mutation Feature range(s)",
    "Mutation Original sequence",
    "Mutation Resulting sequence",
    "mutated_aa_1",
    "mutated_aa_2",
    "invalids_mutated_aa_1",
    "invalids_mutated_aa_2",
]]

Unnamed: 0,unique_id,aa_1,aa_2,Mutation Interactor Matches,Mutation Feature range(s),Mutation Original sequence,Mutation Resulting sequence,mutated_aa_1,mutated_aa_2,invalids_mutated_aa_1,invalids_mutated_aa_2
0,intact:EBI-352682_intact:EBI-352682,MTERRVPFSLLRGPSWDPFRDWYPHSRLFDQAFGLPRLPEEWSQWL...,MTERRVPFSLLRGPSWDPFRDWYPHSRLFDQAFGLPRLPEEWSQWL...,"A,B",1-14,MTERRVPFSLLRGP,.,SWDPFRDWYPHSRLFDQAFGLPRLPEEWSQWLGGSSWPGYVRPLPP...,SWDPFRDWYPHSRLFDQAFGLPRLPEEWSQWLGGSSWPGYVRPLPP...,,
1,intact:EBI-352682_intact:EBI-352682,MTERRVPFSLLRGPSWDPFRDWYPHSRLFDQAFGLPRLPEEWSQWL...,MTERRVPFSLLRGPSWDPFRDWYPHSRLFDQAFGLPRLPEEWSQWL...,"A,B",1-24,MTERRVPFSLLRGPSWDPFRDWYP,.,HSRLFDQAFGLPRLPEEWSQWLGGSSWPGYVRPLPPAAIESPAVAA...,HSRLFDQAFGLPRLPEEWSQWLGGSSWPGYVRPLPPAAIESPAVAA...,,
2,intact:EBI-1044067_intact:EBI-2431589,DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVV,MSGGGPSGGGPGGSGRARTSSFAEPGGGGGGGGGGPGGSASGPGGT...,A,1-1,D,CD,CDAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVV,,,
3,intact:EBI-297693_intact:EBI-538767,WVTF,MIGGLFIYNHKGEVLISRVYRDDIGRNAVDAFRVNVIHARQQVRSP...,A,1-1,W,A,AVTF,,,
4,intact:EBI-20959097_intact:EBI-2826852,MSSEMEPLLLAWSYFRRRKFQLCADLCTQMLEKSPYDQAAWILKAR...,MSLFKARDWWSTILGDKEEFDQGCLCLANVDNSGNGQDKIIVGSFM...,A,1-1,M,A,ASSEMEPLLLAWSYFRRRKFQLCADLCTQMLEKSPYDQAAWILKAR...,,,
...,...,...,...,...,...,...,...,...,...,...,...
71330,intact:EBI-26444823_intact:EBI-26444832,MAMNYNAKDEVDGGPPCAPGGTAKTRRPDNTAFKQQRLPAWQPILT...,MALSVDSSWHRWQWRVRDGFPHCPSETTPLLSPEKGRQSYNLTQQR...,B,993-993,E,A,,MALSVDSSWHRWQWRVRDGFPHCPSETTPLLSPEKGRQSYNLTQQR...,,
71331,intact:EBI-7340552_intact:EBI-7341579,MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...,MYRAGEPGKRQPGPAPPRVRSVEVARGRAGYGFTLSGQAPCVLSCV...,B,995-995,H,L,,MYRAGEPGKRQPGPAPPRVRSVEVARGRAGYGFTLSGQAPCVLSCV...,,
71332,intact:EBI-20731422_intact:EBI-357481,MPVRKQDTQRALHLLEEYRSKLSQTEDRQLRSSIERVINIFQSNLF...,MATPGPRDIPLLPGSPRRLSPQAGSRGGQGPKHGQQCLKMPGPRAP...,B,998-998,S,E,,MATPGPRDIPLLPGSPRRLSPQAGSRGGQGPKHGQQCLKMPGPRAP...,,
71333,intact:EBI-1044672_intact:EBI-710402,MAEHLELLAEMPMVGRMSTQERLKHAQKRRAQQVKMWAQAEKEAQG...,MATAATEEPFPFHGLLPKKETGAASFLCRYPEYDGRGVLIAVLDTG...,B,998-998,T,R,,MATAATEEPFPFHGLLPKKETGAASFLCRYPEYDGRGVLIAVLDTG...,,


In [333]:
# 1) Clean the replacement sequences: trim, then remove every space, newline
#    and tab. Non-string cells are passed through unchanged.
for _col in ("mutation_new_1", "mutation_new_2"):
    merged_neg_expl_mut_filt[_col] = merged_neg_expl_mut_filt[_col].apply(
        lambda x: x.strip().replace(" ","").replace("\n","").replace("\t","") if type(x) is str else x
    )

# 2) Build the mutated sequence for each partner, row by row.
for _dst, _which in (("mutated_aa_1", "A"), ("mutated_aa_2", "B")):
    merged_neg_expl_mut_filt[_dst] = merged_neg_expl_mut_filt.apply(
        lambda row, _which=_which: get_mutated_sequence(row, partner=_which), axis=1
    )

# 3) Run find_invalid_chars against VALID_AAS on every mutated sequence that
#    is a string; all other cells get NaN. Kept as a separate pass so the new
#    columns are appended in the same order as before.
for _dst, _src in (("invalids_mutated_aa_1", "mutated_aa_1"), ("invalids_mutated_aa_2", "mutated_aa_2")):
    merged_neg_expl_mut_filt[_dst] = merged_neg_expl_mut_filt[_src].apply(
        lambda x: find_invalid_chars(x,VALID_AAS) if type(x) is str else np.nan
    )

# Last expression of the cell: display the columns relevant for a spot check.
merged_neg_expl_mut_filt.loc[:, [
    "unique_id",
    "aa_1",
    "aa_2",
    "Mutation Interactor Matches",
    "Mutation Feature range(s)",
    "Mutation Original sequence",
    "Mutation Resulting sequence",
    "mutated_aa_1",
    "mutated_aa_2",
    "invalids_mutated_aa_1",
    "invalids_mutated_aa_2",
]]

Unnamed: 0,unique_id,aa_1,aa_2,Mutation Interactor Matches,Mutation Feature range(s),Mutation Original sequence,Mutation Resulting sequence,mutated_aa_1,mutated_aa_2,invalids_mutated_aa_1,invalids_mutated_aa_2
0,intact:EBI-307973_intact:EBI-307973,MGKKTKRTADSSSSEDEEEYVVEKVLDRRMVKGQVEYLLKWKGFSE...,MGKKTKRTADSSSSEDEEEYVVEKVLDRRMVKGQVEYLLKWKGFSE...,"A,B","165-165,168-168","I,Y","E,E",MGKKTKRTADSSSSEDEEEYVVEKVLDRRMVKGQVEYLLKWKGFSE...,MGKKTKRTADSSSSEDEEEYVVEKVLDRRMVKGQVEYLLKWKGFSE...,,
1,intact:EBI-302230_intact:EBI-307973,MGKKTKRTADSSSSEDEEEYVVEKVLDRRMVKGQVEYLLKWKGFSE...,MAENLKGCSVCCKSSWNQLQDLCRLAKLSCPALGVSKKNLYDFEVE...,A,"168-168,165-165","Y,I","E,E",MGKKTKRTADSSSSEDEEEYVVEKVLDRRMVKGQVEYLLKWKGFSE...,,,
2,intact:EBI-8851226_intact:EBI-9247467,MSTGGDFGNPLRKFKLVFLGEQSVGKTSLITRFMYDSFDNTYQATI...,MNHLEGSAEVEVTDEAAGGEVNESVEADLEHPEVEEEQQQPPQQQH...,A,27-27,T,N,MSTGGDFGNPLRKFKLVFLGEQSVGKNSLITRFMYDSFDNTYQATI...,,,
3,intact:EBI-7443927_intact:EBI-958922,MTDQTYCDRLVQDTPFLTGHGRLSEQQVDRIILQLNRYYPQILTNK...,MEPTAPSLTEEDLTEVKKDALENLRVYLCEKIIAERHFDHLRAKKI...,A,65-65,L,A,MTDQTYCDRLVQDTPFLTGHGRLSEQQVDRIILQLNRYYPQILTNK...,,,
4,intact:EBI-8851226_intact:EBI-9247467,MSTGGDFGNPLRKFKLVFLGEQSVGKTSLITRFMYDSFDNTYQATI...,MNHLEGSAEVEVTDEAAGGEVNESVEADLEHPEVEEEQQQPPQQQH...,A,72-72,Q,L,MSTGGDFGNPLRKFKLVFLGEQSVGKTSLITRFMYDSFDNTYQATI...,,,
5,intact:EBI-8851226_intact:EBI-9247467,MSTGGDFGNPLRKFKLVFLGEQSVGKTSLITRFMYDSFDNTYQATI...,MNHLEGSAEVEVTDEAAGGEVNESVEADLEHPEVEEEQQQPPQQQH...,A,72-72,Q,L,MSTGGDFGNPLRKFKLVFLGEQSVGKTSLITRFMYDSFDNTYQATI...,,,
6,intact:EBI-476965_intact:EBI-491414,MAAQKDQQKDAEGEGLSATTLLPKLIPSGAGREWLERRRATIRPWG...,MREYKLVVLGSGGVGKSALTVQFVQGIFVEKYDPTIEDSYRKQVEV...,B,12-12,G,V,,MREYKLVVLGSVGVGKSALTVQFVQGIFVEKYDPTIEDSYRKQVEV...,,
7,intact:EBI-1760079_intact:EBI-9247467,MNHLEGSAEVEVTDEAAGGEVNESVEADLEHPEVEEEQQQPPQQQH...,MSAGGDFGNPLRKFKLVFLGEQSVGKTSLITRFMYDSFDNTYQATI...,B,27-27,T,N,,MSAGGDFGNPLRKFKLVFLGEQSVGKNSLITRFMYDSFDNTYQATI...,,
8,intact:EBI-366083_intact:EBI-714158,MEEPQSDPSVEPPLSQETFSDLWKLLPENNVLSPLPSQAMDDLMLS...,MADEEKLPPGWEKRMSRSSGRVYYFNHITNASQWERPSGNSSSGGK...,B,34-34,W,A,,MADEEKLPPGWEKRMSRSSGRVYYFNHITNASQAERPSGNSSSGGK...,,
9,intact:EBI-1052826_intact:EBI-9247467,MNHLEGSAEVEVTDEAAGGEVNESVEADLEHPEVEEEQQQPPQQQH...,MSTGGDFGNPLRKFKLVFLGEQSVGKTSLITRFMYDSFDNTYQATI...,B,72-72,Q,L,,MSTGGDFGNPLRKFKLVFLGEQSVGKTSLITRFMYDSFDNTYQATI...,,


In [334]:
# Rows whose partner-2 mutated sequence has an entry in invalids_mutated_aa_2,
# shown with the fields needed to see where the flagged characters came from.
_inspect_cols = ["mutation_orig_2","mutation_new_2","mutation_range_2","mutated_aa_2","aa_2","invalids_mutated_aa_2"]
merged_expl_mut_filt.loc[merged_expl_mut_filt["invalids_mutated_aa_2"].notna(), _inspect_cols]

Unnamed: 0,mutation_orig_2,mutation_new_2,mutation_range_2,mutated_aa_2,aa_2,invalids_mutated_aa_2
35120,K,X,575-575,MAEMGSKGVTAGKIASNVQKKLTRAQEKVLQKLGKADETKDEQFEQ...,MAEMGSKGVTAGKIASNVQKKLTRAQEKVLQKLGKADETKDEQFEQ...,X
40053,I,B,750-750,MDVNPTLLFLKVPAQNAISTTFPYTGDPPYSHGTGTGYTMDTVNRT...,MDVNPTLLFLKVPAQNAISTTFPYTGDPPYSHGTGTGYTMDTVNRT...,B
44073,MSDNGPQSNQRSAPRITFGGPTDSTDNNQNGGRNGARPKQRRPQGL...,1,1-207,1PARMASGGGETALALLLLDRLNQLESKVSGKGQQQQGQTVTKKSA...,MSDNGPQSNQRSAPRITFGGPTDSTDNNQNGGRNGARPKQRRPQGL...,1
44555,"SI,YG","SXXXXXI,YXXXXXG","107-108,46-47",MAFSVNYDSSFGGYSIHDYLGQWASTFGDVNHTNGNVTDANSGGFY...,MAFSVNYDSSFGGYSIHDYLGQWASTFGDVNHTNGNVTDANSGGFY...,X
44983,PG,PXG,116-117,MRALPICLVALMLSGCSMLSRSPVEPVQSTAPQPKAEPAKPKAPRA...,MRALPICLVALMLSGCSMLSRSPVEPVQSTAPQPKAEPAKPKAPRA...,X
46333,K,KXK,13-13,TVFGLNVWKRYSKXK,TVFGLNVWKRYSK,X
46334,K,KXK,13-13,TVFGLNVWKRYSKXK,TVFGLNVWKRYSK,X
46335,K,KXK,13-13,TVFGLNVWKRYSKXK,TVFGLNVWKRYSK,X
46336,K,KXK,13-13,TVFGLNVWKRYSKXK,TVFGLNVWKRYSK,X
46337,K,KXK,13-13,TVFGLNVWKRYSKXK,TVFGLNVWKRYSK,X


In [335]:
# Distinct non-null values recorded in invalids_mutated_aa_1, most frequent first.
merged_expl_mut_filt["invalids_mutated_aa_1"].dropna().value_counts().index.tolist()

['X', 'B']

In [336]:
# Same inspection for the negative set: rows with an entry in invalids_mutated_aa_2.
_neg_flagged_2 = merged_neg_expl_mut_filt["invalids_mutated_aa_2"].notna()
merged_neg_expl_mut_filt.loc[_neg_flagged_2][
    ["mutation_orig_2","mutation_new_2","mutation_range_2","mutated_aa_2","aa_2","invalids_mutated_aa_2"]
]

Unnamed: 0,mutation_orig_2,mutation_new_2,mutation_range_2,mutated_aa_2,aa_2,invalids_mutated_aa_2


In [337]:
# Check 1: which rows have flagged characters in either MUTATED sequence?
_mutant_flagged = (
    merged_expl_mut_filt["invalids_mutated_aa_1"].notna()
    | merged_expl_mut_filt["invalids_mutated_aa_2"].notna()
)
test1 = merged_expl_mut_filt.loc[_mutant_flagged]
print(f"Total rows where some of the mutated sequences have invalid characters: {len(test1)}")

# Pool the distinct flagged values of both partners.
_flag_values = []
for _col in ("invalids_mutated_aa_1", "invalids_mutated_aa_2"):
    _flag_values.extend(test1[_col].dropna().value_counts().index)
test1 = set(_flag_values)
print(f"Unique invalid characters found: {test1}")

# Check 2: the ORIGINAL sequences should have no flagged characters at all.
_original_flagged = (
    merged_expl_mut_filt["invalids_aa_1"].notna()
    | merged_expl_mut_filt["invalids_aa_2"].notna()
)
test1 = not _original_flagged.any()
print(f"None of the original sequences had invalid characters: {test1}")

Total rows where some of the mutated sequences have invalid characters: 42
Unique invalid characters found: {'B', '1', 'X', '*,X'}
None of the original sequences had invalid characters: True


In [338]:
# Negative set, check 1: rows with flagged characters in either mutated sequence.
_neg_mutant_cols = ["invalids_mutated_aa_1", "invalids_mutated_aa_2"]
test1 = merged_neg_expl_mut_filt.loc[
    merged_neg_expl_mut_filt[_neg_mutant_cols].notna().any(axis=1)
]
print(f"Total rows where some of the mutated sequences have invalid characters: {len(test1)}")
test1 = set(
    list(test1["invalids_mutated_aa_1"].dropna().value_counts().index)
    + list(test1["invalids_mutated_aa_2"].dropna().value_counts().index)
)
print(f"Unique invalid characters found: {test1}")

# Negative set, check 2: the original (unmutated) sequences must be free of
# flagged characters, i.e. both invalids_aa_* columns are entirely null.
test1 = bool(merged_neg_expl_mut_filt[["invalids_aa_1", "invalids_aa_2"]].isna().to_numpy().all())
print(f"None of the original sequences had invalid characters: {test1}")

Total rows where some of the mutated sequences have invalid characters: 0
Unique invalid characters found: set()
None of the original sequences had invalid characters: True


In [339]:
# Keep only the labeled feature types given as PSI-MI terms and pull the MI
# accession out of each one (text between "psi-mi:" and "(", quotes removed).
mutation_mis_labeled = mutation_feature_types_labeled.copy(deep=True)
_is_psi_mi = mutation_mis_labeled["feature"].str.startswith("psi-mi")
mutation_mis_labeled = mutation_mis_labeled.loc[_is_psi_mi]
mutation_mis_labeled["mi"] = mutation_mis_labeled["feature"].apply(
    lambda x: x.split("psi-mi:")[1].split("(")[0].strip("\"")
)

# Append a manual entry for MI:2333, labeled "unknown" for both sequences.
_extra_mi = pd.DataFrame(data=
    {"feature": ["MI:2333"],"original_sequence":["unknown"],"mutated_sequence":["unknown"],"mi":["MI:2333"]}
)
mutation_mis_labeled = pd.concat([mutation_mis_labeled, _extra_mi])

# Every MI used by either interactor must have a prepared label.
_observed_mis = set(
    merged_expl_mut_filt["mutation_mi_1"].dropna().tolist()
    + merged_expl_mut_filt["mutation_mi_2"].dropna().tolist()
)
test1 = _observed_mis.issubset(set(mutation_mis_labeled["mi"].tolist()))
print(f"All MIs for interactor 1 and 2 in DataFrame have yes/no interaction labels prepared: {test1}")

# MI -> binding label for the original and for the mutated sequence.
mutation_mis_og_labeled = dict(zip(mutation_mis_labeled["mi"],mutation_mis_labeled["original_sequence"]))
mutation_mis_new_labeled = dict(zip(mutation_mis_labeled["mi"],mutation_mis_labeled["mutated_sequence"]))


def _label_from_row_mi(row, lookup):
    """Look up the label for mutation_mi_1 when it is a string, else for mutation_mi_2."""
    mi = row["mutation_mi_1"]
    if type(mi) != str:
        mi = row["mutation_mi_2"]
    return lookup.get(mi)


merged_expl_mut_filt["mutation_new_binds_bo_mi"] = merged_expl_mut_filt.apply(
    lambda row: _label_from_row_mi(row, mutation_mis_new_labeled), axis=1
)
merged_expl_mut_filt["mutation_og_binds_bo_mi"] = merged_expl_mut_filt.apply(
    lambda row: _label_from_row_mi(row, mutation_mis_og_labeled), axis=1
)

All MIs for interactor 1 and 2 in DataFrame have yes/no interaction labels prepared: True


In [340]:
# Every MI used in the negative set must have a prepared label; report the
# offenders when that is not the case.
_neg_mis = set(
    merged_neg_expl_mut_filt["mutation_mi_1"].dropna().tolist()
    + merged_neg_expl_mut_filt["mutation_mi_2"].dropna().tolist()
)
_neg_unlabeled = _neg_mis - set(mutation_mis_labeled["mi"].tolist())
test1 = len(_neg_unlabeled)==0
print(f"All MIs for interactor 1 and 2 in DataFrame have yes/no interaction labels prepared: {test1}")
if not test1:
    print(f"\tTotal unique mutation MIs in merged_neg_expl_mut_filt: {len(_neg_mis)}\n\t\t{_neg_mis}")
    test1 = _neg_unlabeled
    print(f"\tNon-matching: {test1}")

# Map each row's MI (mutation_mi_1 when it is a string, else mutation_mi_2) to
# the label of the mutated sequence and of the original sequence.
for _target, _lookup in (
    ("mutation_new_binds_bo_mi", mutation_mis_new_labeled),
    ("mutation_og_binds_bo_mi", mutation_mis_og_labeled),
):
    merged_neg_expl_mut_filt[_target] = merged_neg_expl_mut_filt.apply(
        lambda row, _lk=_lookup: _lk.get(
            row["mutation_mi_1"] if type(row["mutation_mi_1"])==str else row["mutation_mi_2"]
        ),
        axis=1,
    )

display(merged_neg_expl_mut_filt[["mutation_new_binds_bo_mi","mutation_og_binds_bo_mi"]])

All MIs for interactor 1 and 2 in DataFrame have yes/no interaction labels prepared: True


Unnamed: 0,mutation_new_binds_bo_mi,mutation_og_binds_bo_mi
0,no,yes
1,no,yes
2,unknown,unknown
3,no,yes
4,unknown,unknown
5,unknown,unknown
6,unknown,unknown
7,unknown,unknown
8,unknown,unknown
9,unknown,unknown


In [341]:
# Columns whose values are pooled into one per-row label set:
# first for the mutated ("new") sequence, then for the original ("og") sequence.
newbindcols = [
    "Mutation new_binds_bo_ac",
    "Mutation new_binds_bo_annotation",
    "Mutation new_binds_bo_feature_type",
    "Mutation new_binds_to_gname_bo_annotation",
    "Mutation new_binds_to_uniprot_bo_annotation",
    "mutation_new_binds_bo_mi",
]
ogbindcols = [
    "Mutation og_binds_bo_ac",
    "Mutation og_binds_bo_annotation",
    "Mutation og_binds_bo_feature_type",
    "mutation_og_binds_bo_mi",
]


In [342]:
# Pool the binding annotations of each row: first for the mutated sequence,
# then for the original sequence.
for _target, _source_cols in (
    ("Mutated all_new_binds", newbindcols),
    ("Mutated all_og_binds", ogbindcols),
):
    merged_expl_mut_filt[_target] = merged_expl_mut_filt.apply(
        lambda r, _cols=_source_cols: _collect_row_values(r, _cols), axis=1
    )

In [343]:
# Negative set: pool the binding annotations of each row, passing the column
# list to _collect_row_values as its second positional argument.
merged_neg_expl_mut_filt["Mutated all_new_binds"] = merged_neg_expl_mut_filt.apply(
    _collect_row_values, axis=1, args=(newbindcols,)
)
merged_neg_expl_mut_filt["Mutated all_og_binds"] = merged_neg_expl_mut_filt.apply(
    _collect_row_values, axis=1, args=(ogbindcols,)
)

In [344]:
# True only when no row is missing either pooled label column.
_row_missing_label = merged_expl_mut_filt[["Mutated all_new_binds", "Mutated all_og_binds"]].isna().any(axis=1)
test1 = not _row_missing_label.any()
print(f"Could map whether original and mutated sequences are binding for every row: {test1}")

Could map whether original and mutated sequences are binding for every row: True


In [345]:
# Negative set: count rows lacking a pooled label for the mutated or original sequence.
_neg_new_missing = merged_neg_expl_mut_filt["Mutated all_new_binds"].isna()
_neg_og_missing = merged_neg_expl_mut_filt["Mutated all_og_binds"].isna()
test1 = int((_neg_new_missing | _neg_og_missing).sum()) == 0
print(f"Could map whether original and mutated sequences are binding for every row: {test1}")

Could map whether original and mutated sequences are binding for every row: True


In [346]:
# Figure out if any rows have contradicting labels 
def simplify_mut_bind_labels(s):
    """
    Reduce a comma-separated string of labels to a single verdict.

    Returns "yes" when "yes" is present and "no" is not, "no" when "no" is
    present and "yes" is not, and "unknown" in every other case (both present,
    only "unknown", or no recognised label). Tokens are compared exactly, with
    no whitespace stripping.
    """
    tokens = s.split(",")
    says_yes = "yes" in tokens
    says_no = "no" in tokens

    # Exactly one of the two decisive labels present -> that label wins.
    if says_yes != says_no:
        return "yes" if says_yes else "no"
    return "unknown"
    
# Reduce the pooled labels of every row to one verdict, for the mutated ("new")
# and the original ("og") sequence, in the positive and then the negative set.
for _frame in (merged_expl_mut_filt, merged_neg_expl_mut_filt):
    for _kind in ("new", "og"):
        _frame[f"Mutated decisive_entry_{_kind}_binds"] = _frame[f"Mutated all_{_kind}_binds"].apply(
            simplify_mut_bind_labels
        )

In [347]:
# Group by seq_sort 
def _seq_or_blank(seq):
    """Return seq unchanged, or "" when it is None or a float NaN (including NumPy float subclasses)."""
    if seq is None or (isinstance(seq, float) and np.isnan(seq)):
        return ""
    return seq


def get_seqsort_for_mut_pair(row, seq_type="og"):
    """
    Build an order-independent key for the pair of sequences in this interaction.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row) providing "Mutated Partner", "aa_1",
        "aa_2", "mutated_aa_1" and "mutated_aa_2".
    seq_type : "og" selects the original pair; any other value selects the
        mutant pair, where only the partner(s) named in "Mutated Partner"
        ("A", "B" or "A,B") are replaced by their mutated sequence.

    Returns
    -------
    str
        The two sequences joined by "_" with the lexicographically smaller one
        first. Missing sequences (None / NaN) become "". When "Mutated Partner"
        is missing or not one of "A", "B", "A,B", both sides are blank and the
        key is "_".
    """
    mutated_partner = row["Mutated Partner"]

    og_aa_1 = row["aa_1"]
    new_aa_1 = row["mutated_aa_1"]
    og_aa_2 = row["aa_2"]
    new_aa_2 = row["mutated_aa_2"]

    seq_a, seq_b = None, None
    # isinstance (not type(...)==float) so NumPy float NaNs are also treated as missing.
    partner_known = mutated_partner is not None and not isinstance(mutated_partner, float)
    if partner_known and mutated_partner in ("A", "B", "A,B"):
        if seq_type == "og":
            seq_a, seq_b = og_aa_1, og_aa_2
        else:
            # Swap in the mutated sequence only for the partner(s) that carry the mutation.
            seq_a = new_aa_1 if "A" in mutated_partner else og_aa_1
            seq_b = new_aa_2 if "B" in mutated_partner else og_aa_2

    intA = _seq_or_blank(seq_a)
    intB = _seq_or_blank(seq_b)

    # Sort the two sequences so that (A, B) and (B, A) yield the same key.
    if intA <= intB:
        return f"{intA}_{intB}"
    return f"{intB}_{intA}"

In [348]:
# Order-independent sequence-pair keys for the original and the mutant interaction.
for _seq_type in ("og", "new"):
    merged_expl_mut_filt[f"seq_sort_{_seq_type}"] = merged_expl_mut_filt.apply(
        get_seqsort_for_mut_pair, axis=1, seq_type=_seq_type
    )

In [349]:
# Negative set: order-independent sequence-pair keys (original, then mutant).
for _col, _seq_type in (("seq_sort_og", "og"), ("seq_sort_new", "new")):
    merged_neg_expl_mut_filt[_col] = merged_neg_expl_mut_filt.apply(
        lambda row, _st=_seq_type: get_seqsort_for_mut_pair(row, seq_type=_st), axis=1
    )

In [350]:
# aa_1 sequences recorded in my_neg for interaction EBI-8596042.
my_neg.loc[my_neg["interaction_intactid"].eq("EBI-8596042"), "aa_1"].tolist()

['MGKKTKRTADSSSSEDEEEYVVEKVLDRRMVKGQVEYLLKWKGFSEEHNTWEPEKNLDCPELISEFMKKYKKMKEGENNKPREKSEGNKRKSSFSNSADDIKSKKKREQSNDIARGFERGLEPEKIIGATDSCGDLMFLMKWKDTDEADLVLAKEANVKCPQIVIAFYEERLTWHAYPEDAENKEKESAKS',
 'MGKKTKRTADSSSSEDEEEYVVEKVLDRRMVKGQVEYLLKWKGFSEEHNTWEPEKNLDCPELISEFMKKYKKMKEGENNKPREKSEGNKRKSSFSNSADDIKSKKKREQSNDIARGFERGLEPEKIIGATDSCGDLMFLMKWKDTDEADLVLAKEANVKCPQIVIAFYEERLTWHAYPEDAENKEKESAKS']

In [351]:
# Spot-check the residues at 0-based indices 164 and 167
# (i.e. 1-based positions 165 and 168) of this sequence.
s = "MGKKTKRTADSSSSEDEEEYVVEKVLDRRMVKGQVEYLLKWKGFSEEHNTWEPEKNLDCPELISEFMKKYKKMKEGENNKPREKSEGNKRKSSFSNSADDIKSKKKREQSNDIARGFERGLEPEKIIGATDSCGDLMFLMKWKDTDEADLVLAKEANVKCPQIVIAFYEERLTWHAYPEDAENKEKESAKS"
for _idx in (164, 167):
    print(s[_idx])

I
Y


In [352]:
# Per-row overview of the negative set: mutation names, mutated partner and the simplified verdicts.
_overview_cols = [
    "interaction_intactid",
    "mutation_name_1",
    "mutation_name_2",
    "Mutated Partner",
    "Mutated decisive_entry_new_binds",
    "Mutated decisive_entry_og_binds",
]
merged_neg_expl_mut_filt[_overview_cols]

Unnamed: 0,interaction_intactid,mutation_name_1,mutation_name_2,Mutated Partner,Mutated decisive_entry_new_binds,Mutated decisive_entry_og_binds
0,EBI-8596042,mutation disrupting interaction,mutation disrupting interaction,"A,B",no,yes
1,EBI-8596072,mutation disrupting interaction,,A,no,yes
2,EBI-8840304,mutation,,A,unknown,unknown
3,EBI-7444014,mutation disrupting interaction,,A,no,yes
4,EBI-8840574,mutation,,A,unknown,unknown
5,EBI-8840304,mutation,,A,unknown,unknown
6,EBI-492194,,mutation,B,unknown,unknown
7,EBI-8840307,,mutation,B,unknown,unknown
8,EBI-8551826,,mutation,B,unknown,unknown
9,EBI-8840423,,mutation,B,unknown,unknown


In [353]:
# 
# Collapse the per-entry verdicts onto each unique sequence pair, once keyed by
# the original pair and once by the mutant pair. The distinct labels are joined
# from a set, so their order inside the combined string is arbitrary.
gb_og = merged_expl_mut_filt.groupby("seq_sort_og").agg(
    decisive_entry_og_binds=("Mutated decisive_entry_og_binds", lambda x: ",".join(set(x)))
).reset_index()
gb_og["seq_sort_og_id"] = [f"seq_sort_og_{i+1}" for i in range(len(gb_og))]
gb_new = merged_expl_mut_filt.groupby("seq_sort_new").agg(
    decisive_entry_new_binds=("Mutated decisive_entry_new_binds", lambda x: ",".join(set(x)))
).reset_index()
gb_new["seq_sort_new_id"] = [f"seq_sort_new_{i+1}" for i in range(len(gb_new))]

# Preview pairs that carry more than one distinct label.
display(gb_og.loc[gb_og["decisive_entry_og_binds"].str.contains(",")].head())
display(gb_new.loc[gb_new["decisive_entry_new_binds"].str.contains(",")].head())


def _has_yes_and_no(labels):
    """True when the comma-separated labels include both "yes" and "no", in any order."""
    return {"yes", "no"}.issubset(labels.split(","))


# Count contradicting pairs by comparing tokens instead of searching for the
# literal substring "no,yes", which depends on the arbitrary join order above.
test1 = len(gb_og.loc[gb_og["decisive_entry_og_binds"].apply(_has_yes_and_no).astype(bool)])
print(f"Total og pairs that have yes AND no for binding based on mutation annotations: {test1}/{len(gb_og)} ({100*test1/len(gb_og):.2f}%)")
test1 = len(gb_new.loc[gb_new["decisive_entry_new_binds"].apply(_has_yes_and_no).astype(bool)])
# The denominator is the number of mutant pairs, matching the percentage.
print(f"Total new pairs that have yes AND no for binding based on mutation annotations: {test1}/{len(gb_new)} ({100*test1/len(gb_new):.2f}%)")

Unnamed: 0,seq_sort_og,decisive_entry_og_binds,seq_sort_og_id
6,AEMKTYSHRTMPSACTLVMCSSVESGLPGRDGRDGREGPRGEKGDP...,"unknown,yes",seq_sort_og_7
9,AENVTGLFKDCSKVITGLHPTQAPTHLSVDTKFKTEGLCVDIPGIP...,"unknown,yes",seq_sort_og_10
14,AIASEFSSLPSYAAFATAQEAYEQAVANGDSEVVLKKLKKSLNVAK...,"unknown,yes",seq_sort_og_15
30,APIKGVTFGEDTVWEVQGYKNVRITFELDERVDKVLNEKCSVYTVE...,"unknown,yes",seq_sort_og_31
32,APIKGVTFGEDTVWEVQGYKNVRITFELDERVDKVLNEKCSVYTVE...,"no,yes",seq_sort_og_33


Unnamed: 0,seq_sort_new,decisive_entry_new_binds,seq_sort_new_id
43,AENVTGLFKDCSKIITGLHPTQAPTHLSVDIKFKTEGLCVDIPGIP...,"no,yes",seq_sort_new_44
48,AENVTGLFKDCSKIITGLHPTQAPTHLSVDIKFKTEGLCVDIPGIP...,"no,yes",seq_sort_new_49
55,AENVTGLFKDCSKIITGLHPTQAPTHLSVDIKFKTEGLCVDIPGIP...,"no,yes",seq_sort_new_56
60,AENVTGLFKDCSKIITGLHPTQAPTHLSVDIKFKTEGLCVDIPGIP...,"no,yes",seq_sort_new_61
67,AENVTGLFKDCSKIITGLHPTQAPTHLSVDIKFKTEGLCVDIPGIP...,"no,yes",seq_sort_new_68


Total og pairs that have yes AND no for binding based on mutation annotations: 357/16685 (2.14%)
Total new pairs that have yes AND no for binding based on mutation annotations: 1153/16685 (2.20%)


In [354]:
# 
# Negative set: collapse the per-entry verdicts onto each unique sequence pair,
# once keyed by the original pair and once by the mutant pair. The distinct
# labels are joined from a set, so their order in the combined string is arbitrary.
gb_neg_og = merged_neg_expl_mut_filt.groupby("seq_sort_og").agg(
    decisive_entry_og_binds=("Mutated decisive_entry_og_binds", lambda x: ",".join(set(x)))
).reset_index()
gb_neg_og["seq_sort_og_id"] = [f"seq_sort_og_{i+1}" for i in range(len(gb_neg_og))]
gb_neg_new = merged_neg_expl_mut_filt.groupby("seq_sort_new").agg(
    decisive_entry_new_binds=("Mutated decisive_entry_new_binds", lambda x: ",".join(set(x)))
).reset_index()
gb_neg_new["seq_sort_new_id"] = [f"seq_sort_new_{i+1}" for i in range(len(gb_neg_new))]

# Preview pairs that carry more than one distinct label.
display(gb_neg_og.loc[gb_neg_og["decisive_entry_og_binds"].str.contains(",")].head())
display(gb_neg_new.loc[gb_neg_new["decisive_entry_new_binds"].str.contains(",")].head())


def _has_yes_and_no(labels):
    """True when the comma-separated labels include both "yes" and "no", in any order."""
    return {"yes", "no"}.issubset(labels.split(","))


# Token-based conflict count: independent of the arbitrary join order above.
test1 = len(gb_neg_og.loc[gb_neg_og["decisive_entry_og_binds"].apply(_has_yes_and_no).astype(bool)])
print(f"Total og pairs that have yes AND no for binding based on mutation annotations: {test1}/{len(gb_neg_og)} ({100*test1/len(gb_neg_og):.2f}%)")
test1 = len(gb_neg_new.loc[gb_neg_new["decisive_entry_new_binds"].apply(_has_yes_and_no).astype(bool)])
# The denominator is the number of mutant pairs, matching the percentage.
print(f"Total new pairs that have yes AND no for binding based on mutation annotations: {test1}/{len(gb_neg_new)} ({100*test1/len(gb_neg_new):.2f}%)")

Unnamed: 0,seq_sort_og,decisive_entry_og_binds,seq_sort_og_id


Unnamed: 0,seq_sort_new,decisive_entry_new_binds,seq_sort_new_id


Total og pairs that have yes AND no for binding based on mutation annotations: 0/7 (0.00%)
Total new pairs that have yes AND no for binding based on mutation annotations: 0/7 (0.00%)


In [355]:
# How often each combined label string occurs across the mutant sequence pairs.
gb_new.loc[:, "decisive_entry_new_binds"].value_counts()

decisive_entry_new_binds
yes               36193
no                12087
unknown            2644
no,yes             1141
unknown,yes         284
unknown,no           76
unknown,no,yes       12
Name: count, dtype: int64

In [356]:
# Mutant sequence pairs whose entries include both a "yes" and a "no" verdict.
# The combined label string is joined from an unordered set, so compare the
# comma-separated tokens rather than searching for the literal "no,yes".
_is_conflicting = gb_new["decisive_entry_new_binds"].apply(
    lambda s: {"no", "yes"}.issubset(s.split(","))
).astype(bool)
test1 = gb_new.loc[_is_conflicting]["seq_sort_new"].unique().tolist()

# Sequence-pair key -> generated pair id.
gb_og_id_dict = dict(zip(gb_og["seq_sort_og"],gb_og["seq_sort_og_id"]))
gb_new_id_dict = dict(zip(gb_new["seq_sort_new"],gb_new["seq_sort_new_id"]))

# Reduce the combined labels of each sequence pair to one verdict.
gb_og["Mutated decisive_seqpair_og_binds"] = gb_og["decisive_entry_og_binds"].apply(lambda s: simplify_mut_bind_labels(s))
gb_new["Mutated decisive_seqpair_new_binds"] = gb_new["decisive_entry_new_binds"].apply(lambda s: simplify_mut_bind_labels(s))

# Sequence-pair key -> pair-level verdict.
gb_og_dict = dict(zip(gb_og["seq_sort_og"],gb_og["Mutated decisive_seqpair_og_binds"]))
gb_new_dict = dict(zip(gb_new["seq_sort_new"],gb_new["Mutated decisive_seqpair_new_binds"]))

# Map pair ids and pair-level verdicts back onto every row.
merged_expl_mut_filt["seq_sort_og_id"] = merged_expl_mut_filt["seq_sort_og"].map(gb_og_id_dict)
merged_expl_mut_filt["seq_sort_new_id"] = merged_expl_mut_filt["seq_sort_new"].map(gb_new_id_dict)

merged_expl_mut_filt["Mutated decisive_seqpair_og_binds"] = merged_expl_mut_filt["seq_sort_og"].map(gb_og_dict)
merged_expl_mut_filt["Mutated decisive_seqpair_new_binds"] = merged_expl_mut_filt["seq_sort_new"].map(gb_new_dict)

# Columns exported for manual review of the contradicting pairs.
display_cols = [
    "interaction_intactid","Mutated Partner","seq_sort_new","seq_sort_new_id","Mutated all_new_binds","Mutated decisive_entry_new_binds","Mutated decisive_seqpair_new_binds",
    "Mutation # Feature AC",
    "Mutation Affected protein AC",
    "Mutation Affected protein full name",
    "Mutation Affected protein organism",
    "Mutation Affected protein symbol",
    "Mutation Feature annotation(s)",
    "Mutation Feature range(s)",
    "Mutation Feature short label",
    "Mutation Feature type",
    "Mutation Figure legend(s)",
    "Mutation Interaction AC",
    "Mutation Interaction participants",
    "Mutation Interactor Matches",
    "Mutation Original sequence",
    "Mutation PubMedID",
    "Mutation Resulting sequence",
    "Mutation Xref ID(s)",
    "Mutation new_binds_bo_ac",
    "Mutation new_binds_bo_annotation",
    "Mutation new_binds_bo_feature_type",
    "Mutation new_binds_to_gname_bo_annotation",
    "Mutation new_binds_to_uniprot_bo_annotation",
    "Mutation new_nobind_to_gname_bo_annotation",
    "Mutation new_nobind_to_uniprot_bo_annotation",
    "Mutation og_binds_bo_ac",
    "Mutation og_binds_bo_annotation",
    "Mutation og_binds_bo_feature_type",
    "mutation_new_binds_bo_mi",
    "mutation_og_binds_bo_mi",
    "mutation_mi_1","mutation_mi_2"
]

# Write every row belonging to a contradicting mutant pair, grouped by pair key.
merged_expl_mut_filt.loc[
    merged_expl_mut_filt["seq_sort_new"].isin(test1)
].sort_values(by=["seq_sort_new"])[display_cols].to_csv("contradicting_mutant_labels_nov19_2025.csv",index=False)

In [357]:
# Display the full table for visual inspection.
merged_expl_mut_filt

Unnamed: 0,mutation_begin_1,mutation_begin_2,mutation_end_1,mutation_end_2,mutation_mi_1,mutation_mi_2,mutation_name_1,mutation_name_2,mutation_new_1,mutation_new_2,...,Mutated all_new_binds,Mutated all_og_binds,Mutated decisive_entry_new_binds,Mutated decisive_entry_og_binds,seq_sort_og,seq_sort_new,seq_sort_og_id,seq_sort_new_id,Mutated decisive_seqpair_og_binds,Mutated decisive_seqpair_new_binds
0,1,1,14,14,MI:0119,MI:0119,mutation decreasing interaction,mutation decreasing interaction,.,.,...,yes,yes,yes,yes,MTERRVPFSLLRGPSWDPFRDWYPHSRLFDQAFGLPRLPEEWSQWL...,SWDPFRDWYPHSRLFDQAFGLPRLPEEWSQWLGGSSWPGYVRPLPP...,seq_sort_og_16524,seq_sort_new_52431,yes,yes
1,1,1,24,24,MI:0119,MI:0119,mutation decreasing interaction,mutation decreasing interaction,.,.,...,yes,yes,yes,yes,MTERRVPFSLLRGPSWDPFRDWYPHSRLFDQAFGLPRLPEEWSQWL...,HSRLFDQAFGLPRLPEEWSQWLGGSSWPGYVRPLPPAAIESPAVAA...,seq_sort_og_16524,seq_sort_new_753,yes,yes
2,1,,1,,MI:0118,,mutation,,CD,,...,unknown,unknown,unknown,unknown,DAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVV_MSGGG...,CDAEFRHDSGYEVHHQKLVFFAEDVGSNKGAIIGLMVGGVV_MSGG...,seq_sort_og_138,seq_sort_new_478,unknown,unknown
3,1,,1,,MI:0119,,mutation decreasing interaction,,A,,...,yes,yes,yes,yes,MIGGLFIYNHKGEVLISRVYRDDIGRNAVDAFRVNVIHARQQVRSP...,AVTF_MIGGLFIYNHKGEVLISRVYRDDIGRNAVDAFRVNVIHARQ...,seq_sort_og_12085,seq_sort_new_474,yes,yes
4,1,,1,,MI:2226,,mutation with no effect,,A,,...,"unknown,yes","unknown,yes",yes,yes,MSLFKARDWWSTILGDKEEFDQGCLCLANVDNSGNGQDKIIVGSFM...,ASSEMEPLLLAWSYFRRRKFQLCADLCTQMLEKSPYDQAAWILKAR...,seq_sort_og_16003,seq_sort_new_440,yes,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71330,,993,,993,,MI:2226,,mutation with no effect,,A,...,yes,yes,yes,yes,MALSVDSSWHRWQWRVRDGFPHCPSETTPLLSPEKGRQSYNLTQQR...,MALSVDSSWHRWQWRVRDGFPHCPSETTPLLSPEKGRQSYNLTQQR...,seq_sort_og_3865,seq_sort_new_13816,yes,yes
71331,,995,,995,,MI:0119,,mutation decreasing interaction,,L,...,yes,yes,yes,yes,MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...,MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...,seq_sort_og_16529,seq_sort_new_52058,yes,yes
71332,,998,,998,,MI:1133,,mutation decreasing interaction strength,,E,...,yes,yes,yes,yes,MATPGPRDIPLLPGSPRRLSPQAGSRGGQGPKHGQQCLKMPGPRAP...,MATPGPRDIPLLPGSPRRLSPQAGSRGGQGPKHGQQCLKMPGPRAP...,seq_sort_og_6386,seq_sort_new_21037,yes,yes
71333,,998,,998,,MI:2226,,mutation with no effect,,R,...,yes,yes,yes,yes,MAEHLELLAEMPMVGRMSTQERLKHAQKRRAQQVKMWAQAEKEAQG...,MAEHLELLAEMPMVGRMSTQERLKHAQKRRAQQVKMWAQAEKEAQG...,seq_sort_og_2397,seq_sort_new_10173,yes,yes


In [358]:
# Negative set: mutant sequence pairs whose entries include both a "yes" and a
# "no" verdict. The combined label string is joined from an unordered set, so
# compare the comma-separated tokens rather than searching for "no,yes".
_neg_is_conflicting = gb_neg_new["decisive_entry_new_binds"].apply(
    lambda s: {"no", "yes"}.issubset(s.split(","))
).astype(bool)
test1 = gb_neg_new.loc[_neg_is_conflicting]["seq_sort_new"].unique().tolist()

# Sequence-pair key -> generated pair id.
gb_neg_og_id_dict = dict(zip(gb_neg_og["seq_sort_og"],gb_neg_og["seq_sort_og_id"]))
gb_neg_new_id_dict = dict(zip(gb_neg_new["seq_sort_new"],gb_neg_new["seq_sort_new_id"]))

# Reduce the combined labels of each sequence pair to one verdict.
gb_neg_og["Mutated decisive_seqpair_og_binds"] = gb_neg_og["decisive_entry_og_binds"].apply(lambda s: simplify_mut_bind_labels(s))
gb_neg_new["Mutated decisive_seqpair_new_binds"] = gb_neg_new["decisive_entry_new_binds"].apply(lambda s: simplify_mut_bind_labels(s))

# Sequence-pair key -> pair-level verdict.
gb_neg_og_dict = dict(zip(gb_neg_og["seq_sort_og"],gb_neg_og["Mutated decisive_seqpair_og_binds"]))
gb_neg_new_dict = dict(zip(gb_neg_new["seq_sort_new"],gb_neg_new["Mutated decisive_seqpair_new_binds"]))

# Map pair ids and pair-level verdicts back onto every row.
merged_neg_expl_mut_filt["seq_sort_og_id"] = merged_neg_expl_mut_filt["seq_sort_og"].map(gb_neg_og_id_dict)
merged_neg_expl_mut_filt["seq_sort_new_id"] = merged_neg_expl_mut_filt["seq_sort_new"].map(gb_neg_new_id_dict)

merged_neg_expl_mut_filt["Mutated decisive_seqpair_og_binds"] = merged_neg_expl_mut_filt["seq_sort_og"].map(gb_neg_og_dict)
merged_neg_expl_mut_filt["Mutated decisive_seqpair_new_binds"] = merged_neg_expl_mut_filt["seq_sort_new"].map(gb_neg_new_dict)

# Columns exported for manual review of the contradicting pairs.
display_cols = [
    "interaction_intactid","Mutated Partner","seq_sort_new","seq_sort_new_id","Mutated all_new_binds","Mutated decisive_entry_new_binds","Mutated decisive_seqpair_new_binds",
    "Mutation # Feature AC",
    "Mutation Affected protein AC",
    "Mutation Affected protein full name",
    "Mutation Affected protein organism",
    "Mutation Affected protein symbol",
    "Mutation Feature annotation(s)",
    "Mutation Feature range(s)",
    "Mutation Feature short label",
    "Mutation Feature type",
    "Mutation Figure legend(s)",
    "Mutation Interaction AC",
    "Mutation Interaction participants",
    "Mutation Interactor Matches",
    "Mutation Original sequence",
    "Mutation PubMedID",
    "Mutation Resulting sequence",
    "Mutation Xref ID(s)",
    "Mutation new_binds_bo_ac",
    "Mutation new_binds_bo_annotation",
    "Mutation new_binds_bo_feature_type",
    "Mutation new_binds_to_gname_bo_annotation",
    "Mutation new_binds_to_uniprot_bo_annotation",
    "Mutation new_nobind_to_gname_bo_annotation",
    "Mutation new_nobind_to_uniprot_bo_annotation",
    "Mutation og_binds_bo_ac",
    "Mutation og_binds_bo_annotation",
    "Mutation og_binds_bo_feature_type",
    "mutation_new_binds_bo_mi",
    "mutation_og_binds_bo_mi",
    "mutation_mi_1","mutation_mi_2"
]

# Write to a file of its own: the previous path was identical to the one used
# for the positive-set export and silently overwrote it.
merged_neg_expl_mut_filt.loc[
    merged_neg_expl_mut_filt["seq_sort_new"].isin(test1)
].sort_values(by=["seq_sort_new"])[display_cols].to_csv("contradicting_mutant_labels_neg_nov19_2025.csv",index=False)

In [359]:
# Mutant sequence pairs with both a "yes" and a "no" verdict. Tokens are
# compared because the combined label string is joined from an unordered set,
# so a literal "no,yes" substring search is order-dependent.
_is_conflicting = gb_new["decisive_entry_new_binds"].apply(
    lambda s: {"no", "yes"}.issubset(s.split(","))
).astype(bool)
test1 = gb_new.loc[_is_conflicting]["seq_sort_new"].unique().tolist()

# All rows that belong to one of those pairs.
temp = merged_expl_mut_filt.loc[
    merged_expl_mut_filt["seq_sort_new"].isin(test1)
].reset_index(drop=True)

# One MI string per row: "mi_1,mi_2" with missing sides dropped.
temp["mutation_mi"] = temp["mutation_mi_1"].fillna("") + "," + temp["mutation_mi_2"].fillna("")
temp["mutation_mi"] = temp["mutation_mi"].fillna("").str.strip(",")

# Per pair, list the MI-derived labels and the MIs of its rows, "|"-separated.
temp = temp.groupby("seq_sort_new").agg(
    mutation_new_binds_bo_mi=("mutation_new_binds_bo_mi", lambda x: "|".join([str(y) for y in list(x)])),
    mutation_mi=("mutation_mi", lambda x: "|".join([str(y) for y in list(x)])),
).reset_index()
temp

Unnamed: 0,seq_sort_new,mutation_new_binds_bo_mi,mutation_mi
0,AENVTGLFKDCSKIITGLHPTQAPTHLSVDIKFKTEGLCVDIPGIP...,no|yes|yes,MI:1128|MI:1133|MI:1133
1,AENVTGLFKDCSKIITGLHPTQAPTHLSVDIKFKTEGLCVDIPGIP...,no|yes|yes,MI:1128|MI:1133|MI:1133
2,AENVTGLFKDCSKIITGLHPTQAPTHLSVDIKFKTEGLCVDIPGIP...,no|no|yes,MI:1128|MI:1128|MI:1133
3,AENVTGLFKDCSKIITGLHPTQAPTHLSVDIKFKTEGLCVDIPGIP...,no|no|yes,MI:1128|MI:1128|MI:1133
4,AENVTGLFKDCSKIITGLHPTQAPTHLSVDIKFKTEGLCVDIPGIP...,no|yes|yes,MI:1128|MI:1133|MI:1133
...,...,...,...
1148,MVARGRTDEISTDVSEANSEHSLMITETSAPFRSIFSHSGKVANAG...,no|yes,"MI:1128,MI:1128|MI:1133,MI:1133"
1149,MVARGRTDEISTDVSEANSEHSLMITETSAPFRSIFSHSGKVANAG...,no|yes,"MI:1128,MI:1128|MI:1133,MI:1133"
1150,MVARGRTDEISTDVSEANSEHSLMITETSSPFRSIFSHSGKVANAG...,no|yes,"MI:1128,MI:1128|MI:2226,MI:2226"
1151,MVDSVYRTRSLGVAAEGLPDQYADGEAARVWQLYIGDTRSRTAEYK...,no|no|no|yes|yes,"MI:1128,MI:1128|MI:1128,MI:1128|MI:1128,MI:112..."


In [360]:
# Negative set: mutant sequence pairs with both a "yes" and a "no" verdict.
# Tokens are compared because the combined label string is joined from an
# unordered set, so a literal "no,yes" substring search is order-dependent.
_neg_is_conflicting = gb_neg_new["decisive_entry_new_binds"].apply(
    lambda s: {"no", "yes"}.issubset(s.split(","))
).astype(bool)
test1 = gb_neg_new.loc[_neg_is_conflicting]["seq_sort_new"].unique().tolist()

# All rows that belong to one of those pairs.
temp = merged_neg_expl_mut_filt.loc[
    merged_neg_expl_mut_filt["seq_sort_new"].isin(test1)
].reset_index(drop=True)

# One MI string per row: "mi_1,mi_2" with missing sides dropped.
temp["mutation_mi"] = temp["mutation_mi_1"].fillna("") + "," + temp["mutation_mi_2"].fillna("")
temp["mutation_mi"] = temp["mutation_mi"].fillna("").str.strip(",")

# Per pair, list the MI-derived labels and the MIs of its rows, "|"-separated.
temp = temp.groupby("seq_sort_new").agg(
    mutation_new_binds_bo_mi=("mutation_new_binds_bo_mi", lambda x: "|".join([str(y) for y in list(x)])),
    mutation_mi=("mutation_mi", lambda x: "|".join([str(y) for y in list(x)])),
).reset_index()
temp

Unnamed: 0,seq_sort_new,mutation_new_binds_bo_mi,mutation_mi


In [361]:
# Investigate what makes these rows conflict: do they carry different MIs?
# Select the conflicting pairs by comparing label tokens. The combined label
# string is joined from an unordered set, so searching for "no,yes" / "yes,no"
# would miss orderings such as "no,unknown,yes".
_is_conflicting = gb_new["decisive_entry_new_binds"].apply(
    lambda s: {"no", "yes"}.issubset(s.split(","))
).astype(bool)
test1 = gb_new.loc[_is_conflicting]["seq_sort_new"].unique().tolist()
temp = merged_expl_mut_filt.loc[
    merged_expl_mut_filt["seq_sort_new"].isin(test1)
].reset_index(drop=True)

# One MI string per row ("mi_1,mi_2", missing sides dropped), then one row per
# pair listing the MI-derived labels and MIs of its entries, "|"-separated.
temp["mutation_mi"] = temp["mutation_mi_1"].fillna("") + "," + temp["mutation_mi_2"].fillna("")
temp["mutation_mi"] = temp["mutation_mi"].fillna("").str.strip(",")
temp = temp.groupby("seq_sort_new").agg(
    mutation_new_binds_bo_mi=("mutation_new_binds_bo_mi", lambda x: "|".join([str(y) for y in list(x)])),
    mutation_mi=("mutation_mi", lambda x: "|".join([str(y) for y in list(x)])),
).reset_index()

# Distinct decisive labels ("no"/"yes") among the MI-derived labels of each
# pair, sorted, so a pair with both always reads "no,yes".
temp["mutation_new_binds_bo_mi_unique_conflicting"] = temp["mutation_new_binds_bo_mi"].apply(
    lambda x: ",".join(
        sorted(
            set(
                " ".join(  # join all descriptions
                    x.replace("|", ",").split(",")
                ).split()  # split on whitespace
            ).intersection(set(["no","yes"]))
        )
    )
)

# True when every conflicting pair also conflicts at the MI-label level.
test1 = bool((temp["mutation_new_binds_bo_mi_unique_conflicting"] == "no,yes").all())

print(f"There are {len(temp)} unique sequence pairs (where at least one seq is mutated) where there are conflicting labels about whether or not binding has occurred.")
print(f"\tIn every case where we have contradicting labels for whether a mutated sequence interacts with another sequence, it is because of conflicting MIs: {test1}")

# Human-readable description per MI: the text inside the parentheses of the
# feature string, with a fixed description for the manually added MI:2333.
mutation_mis_labeled["description"] = mutation_mis_labeled["feature"].apply(lambda x: x.split("(")[1].split(")")[0] if "(" in x else x)
mutation_mis_labeled["description"] = mutation_mis_labeled.apply(lambda row: row["description"] if row["mi"]!="MI:2333" else "mutation with complex effect", axis=1)
mutation_mis_labeled_mi_desc_dict = dict(zip(mutation_mis_labeled["mi"],mutation_mis_labeled["description"]))

temp["mutation_desc"] = temp["mutation_mi"].apply(lambda x: mi_to_desc_string(x,mutation_mis_labeled_mi_desc_dict))

# Export the grouped view and preview pairs where a row has MIs on both partners.
temp.sort_values(by=["seq_sort_new"]).to_csv("contradicting_mutant_labels_grouped_by_seq_new_nov19_2025.csv",index=False)
temp.loc[temp["mutation_mi"].str.contains(",")].head()


There are 1153 unique sequence pairs (where at least one seq is mutated) where there are conflicting labels about whether or not binding has occurred.
	In every case where we have contradicting labels for whether a mutated sequence interacts with another sequence, it is because of conflicting MIs: True


Unnamed: 0,seq_sort_new,mutation_new_binds_bo_mi,mutation_mi,mutation_new_binds_bo_mi_unique_conflicting,mutation_desc
157,MADQAPFDTDVNTLTRFVMEEGRKARGTGELTQLLNSLCTAVKAIS...,no|no|yes|yes|yes,"MI:1128,MI:1128|MI:1128,MI:1128|MI:2226,MI:222...","no,yes","mutation disrupting interaction strength,mutat..."
158,MADQAPFDTDVNTLTRFVMEEGRKARGTGELTQLLNSLCTAVKAIS...,no|no|no|no|no|yes,"MI:1128,MI:1128|MI:1128,MI:1128|MI:1128,MI:112...","no,yes","mutation disrupting interaction strength,mutat..."
160,MADQAPFDTDVNTLTRFVMEEGRKARGTGELTQLLNSLCTAVKAIS...,no|no|yes|yes|yes,"MI:1128,MI:1128|MI:1128,MI:1128|MI:2226,MI:222...","no,yes","mutation disrupting interaction strength,mutat..."
162,MADQAPFDTDVNTLTRFVMEEGRKARGTGELTQLLNSLCTAVKAIS...,no|no|yes|yes|yes,"MI:1128,MI:1128|MI:1128,MI:1128|MI:2226,MI:222...","no,yes","mutation disrupting interaction strength,mutat..."
202,MAERESGLSGGAASPPAASPFLGLHIASPPNFRLTHDISLEEFEDE...,yes|no|no,"MI:0119,MI:0119|MI:0573,MI:0573|MI:0573,MI:0573","no,yes","mutation decreasing interaction,mutation decre..."


In [362]:
# Why do some original-sequence pairs carry contradictory binding labels?
# Pull every row of the affected pairs and compare their mutation MI codes.
_entry_labels = gb_og["decisive_entry_og_binds"]
_conflicting_seqs = gb_og.loc[
    _entry_labels.str.contains("yes,no") | _entry_labels.str.contains("no,yes")
]["seq_sort_og"].unique().tolist()

_in_conflict = merged_expl_mut_filt["seq_sort_og"].isin(_conflicting_seqs)
temp2 = merged_expl_mut_filt.loc[_in_conflict].reset_index(drop=True)

# One "mi_1,mi_2" string per row; stripping commas removes the separator
# left dangling when either side is missing.
_mi_pair = temp2["mutation_mi_1"].fillna("") + "," + temp2["mutation_mi_2"].fillna("")
temp2["mutation_mi"] = _mi_pair.fillna("").str.strip(",")


def _pipe_join(values):
    """Join a group's values into one '|'-separated string, in row order."""
    return "|".join(str(value) for value in values)


def _binding_labels(joined):
    """Return the distinct 'no'/'yes' tokens of *joined*, sorted and comma-joined."""
    tokens = joined.replace("|", ",").replace(",", " ").split()
    return ",".join(sorted({"no", "yes"}.intersection(tokens)))


def _feature_description(feature):
    """Return the text inside the first '(...)' of *feature*, else *feature*."""
    if "(" not in feature:
        return feature
    return feature.split("(")[1].split(")")[0]


# Collapse to one row per sequence pair.
temp2 = temp2.groupby("seq_sort_og").agg(
    mutation_og_binds_bo_mi=("mutation_og_binds_bo_mi", _pipe_join),
    mutation_mi=("mutation_mi", _pipe_join),
).reset_index()
temp2["mutation_og_binds_bo_mi_unique_conflicting"] = temp2["mutation_og_binds_bo_mi"].apply(_binding_labels)

# test1 is True when every grouped pair shows both "no" and "yes" MI-derived labels.
_unique_labels = temp2["mutation_og_binds_bo_mi_unique_conflicting"]
_has_both = _unique_labels.str.contains("no,yes") | _unique_labels.str.contains("yes,no")
test1 = len(temp2.loc[~_has_both]) == 0

print(f"There are {len(temp2)} unique sequence pairs where there are conflicting labels about whether or not binding has occurred.")
print(f"\tIn every case where we have contradicting labels for whether an original sequence interacts with another sequence, it is because of conflicting MIs: {test1}")

# Human-readable description per MI code; MI:2333 is set explicitly.
mutation_mis_labeled["description"] = mutation_mis_labeled["feature"].apply(_feature_description)
mutation_mis_labeled["description"] = mutation_mis_labeled.apply(
    lambda row: "mutation with complex effect" if row["mi"] == "MI:2333" else row["description"],
    axis=1,
)
mutation_mis_labeled_mi_desc_dict = {
    mi: desc
    for mi, desc in zip(mutation_mis_labeled["mi"], mutation_mis_labeled["description"])
}

temp2["mutation_desc"] = temp2["mutation_mi"].apply(
    lambda mi_string: mi_to_desc_string(mi_string, mutation_mis_labeled_mi_desc_dict)
)

temp2.sort_values(by=["seq_sort_og"]).to_csv("contradicting_mutant_labels_grouped_by_seq_og_nov19_2025.csv",index=False)
temp2.loc[temp2["mutation_mi"].str.contains(",")].head()

There are 357 unique sequence pairs where there are conflicting labels about whether or not binding has occurred.
	In every case where we have contradicting labels for whether an original sequence interacts with another sequence, it is because of conflicting MIs: True


Unnamed: 0,seq_sort_og,mutation_og_binds_bo_mi,mutation_mi,mutation_og_binds_bo_mi_unique_conflicting,mutation_desc
251,MEEEIAALVIDNGSGMCKAGFAGDDAPRAVFPSIVGRPRHQGVMVG...,yes|yes|yes|yes|no|yes|yes|no,"MI:1128,MI:1128|MI:1128,MI:1128|MI:2226,MI:222...","no,yes","mutation disrupting interaction strength,mutat..."
312,MKYILVTGGVISGIGKGIIASSIGTILKSCGLRVTAIKIDPYINID...,yes|no,"MI:2226,MI:2226|MI:2227,MI:2227","no,yes","mutation with no effect,mutation with no effec..."
355,MTLQCTKSAGPWKMVVWDEDGFQGRRHEFTAECPSVLELGFETVRS...,yes|yes|no|no|no|no|no|no,"MI:1132,MI:1132|MI:1132,MI:1132|MI:2227,MI:222...","no,yes","mutation increasing interaction strength,mutat..."


In [363]:
# Pool the conflicting pairs from both groupings (temp: by seq_sort_new,
# temp2: by seq_sort_og) and summarise which mutation effects co-occur.
temp3 = pd.concat(
    [
        temp[["seq_sort_new", "mutation_mi", "mutation_desc"]],
        temp2[["seq_sort_og", "mutation_mi", "mutation_desc"]],
    ]
).reset_index(drop=True)


def _unique_mis(mi_string):
    """Return the distinct MI codes in *mi_string*, sorted and comma-joined.

    '|' and ',' are both treated as separators; literal 'nan' tokens are dropped.
    """
    codes = set(mi_string.replace("|", ",").split(","))
    return ",".join(sorted(code for code in codes if code != "nan"))


temp3["mutation_unique_mi"] = temp3["mutation_mi"].apply(_unique_mis)

# Keep only pairs that still have an MI code.
_no_mi = temp3["mutation_unique_mi"].isin(["nan", ""])
temp3 = temp3.loc[~_no_mi].reset_index(drop=True)

# Hyphenate "no effect" so it survives whitespace tokenisation as one keyword.
temp3["mutation_desc"] = temp3["mutation_desc"].fillna("").apply(
    lambda desc: desc.replace("no effect","no-effect")
)
keywords = {"no-effect", "causing", "increasing", "decreasing", "disrupting"}


def _effect_keywords(description):
    """Return the keywords present in *description*, sorted and comma-joined.

    Comma vs pipe grouping is ignored; the text is tokenised on whitespace.
    """
    words = description.replace("|", ",").replace(",", " ").split()
    return ",".join(sorted(keywords.intersection(words)))


temp3["mutation_keywords"] = temp3["mutation_desc"].apply(_effect_keywords)
_keyword_counts = temp3["mutation_keywords"].value_counts()
print(_keyword_counts)

# A comma in the keyword string means at least two distinct keywords.
test1 = _keyword_counts.reset_index()["mutation_keywords"].str.contains(",").all()
print(f"\nEvery sequence pair where we have conflicting binding labels has at least two different binding-related keywords (from: disrupting,decreasing,no-effect,increasing,causing): {test1}")

mutation_keywords
decreasing,disrupting                                 665
disrupting,no-effect                                  425
causing,disrupting,no-effect                          151
causing,increasing                                    127
causing,no-effect                                      47
causing,disrupting                                     34
disrupting,increasing                                  18
decreasing,disrupting,no-effect                        16
causing,increasing,no-effect                            9
decreasing,disrupting,increasing                        4
causing,decreasing,disrupting,no-effect                 3
causing,decreasing                                      3
causing,decreasing,disrupting,increasing,no-effect      2
causing,decreasing,disrupting,increasing                2
causing,decreasing,increasing,no-effect                 1
causing,decreasing,no-effect                            1
causing,decreasing,disrupting                         

In [364]:
# Same conflict investigation as for merged_expl_mut_filt, but run on
# gb_neg_new / merged_neg_expl_mut_filt: which mutated-sequence pairs carry
# contradictory binding labels, and do their mutation MI codes differ?
_entry_labels = gb_neg_new["decisive_entry_new_binds"]
_conflicting_seqs = gb_neg_new.loc[
    _entry_labels.str.contains("no,yes") | _entry_labels.str.contains("yes,no")
]["seq_sort_new"].unique().tolist()

_in_conflict = merged_neg_expl_mut_filt["seq_sort_new"].isin(_conflicting_seqs)
temp = merged_neg_expl_mut_filt.loc[_in_conflict].reset_index(drop=True)

# One "mi_1,mi_2" string per row; stripping commas removes the separator
# left dangling when either side is missing.
_mi_pair = temp["mutation_mi_1"].fillna("") + "," + temp["mutation_mi_2"].fillna("")
temp["mutation_mi"] = _mi_pair.fillna("").str.strip(",")


def _pipe_join(values):
    """Join a group's values into one '|'-separated string, in row order."""
    return "|".join(str(value) for value in values)


def _binding_labels(joined):
    """Return the distinct 'no'/'yes' tokens of *joined*, sorted and comma-joined."""
    tokens = joined.replace("|", ",").replace(",", " ").split()
    return ",".join(sorted({"no", "yes"}.intersection(tokens)))


def _feature_description(feature):
    """Return the text inside the first '(...)' of *feature*, else *feature*."""
    if "(" not in feature:
        return feature
    return feature.split("(")[1].split(")")[0]


# Collapse to one row per sequence pair.
temp = temp.groupby("seq_sort_new").agg(
    mutation_new_binds_bo_mi=("mutation_new_binds_bo_mi", _pipe_join),
    mutation_mi=("mutation_mi", _pipe_join),
).reset_index()

temp["mutation_new_binds_bo_mi_unique_conflicting"] = temp["mutation_new_binds_bo_mi"].apply(_binding_labels)

# test1 is True when every grouped pair shows both "no" and "yes" MI-derived
# labels (and therefore also True when there are no conflicting pairs at all).
_unique_labels = temp["mutation_new_binds_bo_mi_unique_conflicting"]
_has_both = _unique_labels.str.contains("no,yes") | _unique_labels.str.contains("yes,no")
test1 = len(temp.loc[~_has_both]) == 0

print(f"There are {len(temp)} unique sequence pairs (where at least one seq is mutated) where there are conflicting labels about whether or not binding has occurred.")
print(f"\tIn every case where we have contradicting labels for whether a mutated sequence interacts with another sequence, it is because of conflicting MIs: {test1}")

# Human-readable description per MI code; MI:2333 is set explicitly.
mutation_mis_labeled["description"] = mutation_mis_labeled["feature"].apply(_feature_description)
mutation_mis_labeled["description"] = mutation_mis_labeled.apply(
    lambda row: "mutation with complex effect" if row["mi"] == "MI:2333" else row["description"],
    axis=1,
)
mutation_mis_labeled_mi_desc_dict = {
    mi: desc
    for mi, desc in zip(mutation_mis_labeled["mi"], mutation_mis_labeled["description"])
}

temp["mutation_desc"] = temp["mutation_mi"].apply(
    lambda mi_string: mi_to_desc_string(mi_string, mutation_mis_labeled_mi_desc_dict)
)

# Written to a "_neg_" file: this cell previously reused the path
# "contradicting_mutant_labels_grouped_by_seq_new_nov19_2025.csv", which the
# earlier merged_expl_mut_filt cell also writes, so that export was overwritten.
temp.sort_values(by=["seq_sort_new"]).to_csv("contradicting_mutant_labels_neg_grouped_by_seq_new_nov19_2025.csv",index=False)
temp.loc[temp["mutation_mi"].str.contains(",")].head()


There are 0 unique sequence pairs (where at least one seq is mutated) where there are conflicting labels about whether or not binding has occurred.
	In every case where we have contradicting labels for whether a mutated sequence interacts with another sequence, it is because of conflicting MIs: True


Unnamed: 0,seq_sort_new,mutation_new_binds_bo_mi,mutation_mi,mutation_new_binds_bo_mi_unique_conflicting,mutation_desc


In [365]:
# Same conflict investigation as for merged_expl_mut_filt, but run on
# gb_neg_og / merged_neg_expl_mut_filt: which original-sequence pairs carry
# contradictory binding labels, and do their mutation MI codes differ?
_entry_labels = gb_neg_og["decisive_entry_og_binds"]
_conflicting_seqs = gb_neg_og.loc[
    _entry_labels.str.contains("yes,no") | _entry_labels.str.contains("no,yes")
]["seq_sort_og"].unique().tolist()

_in_conflict = merged_neg_expl_mut_filt["seq_sort_og"].isin(_conflicting_seqs)
temp2 = merged_neg_expl_mut_filt.loc[_in_conflict].reset_index(drop=True)

# One "mi_1,mi_2" string per row; stripping commas removes the separator
# left dangling when either side is missing.
_mi_pair = temp2["mutation_mi_1"].fillna("") + "," + temp2["mutation_mi_2"].fillna("")
temp2["mutation_mi"] = _mi_pair.fillna("").str.strip(",")


def _pipe_join(values):
    """Join a group's values into one '|'-separated string, in row order."""
    return "|".join(str(value) for value in values)


def _binding_labels(joined):
    """Return the distinct 'no'/'yes' tokens of *joined*, sorted and comma-joined."""
    tokens = joined.replace("|", ",").replace(",", " ").split()
    return ",".join(sorted({"no", "yes"}.intersection(tokens)))


def _feature_description(feature):
    """Return the text inside the first '(...)' of *feature*, else *feature*."""
    if "(" not in feature:
        return feature
    return feature.split("(")[1].split(")")[0]


# Collapse to one row per sequence pair.
temp2 = temp2.groupby("seq_sort_og").agg(
    mutation_og_binds_bo_mi=("mutation_og_binds_bo_mi", _pipe_join),
    mutation_mi=("mutation_mi", _pipe_join),
).reset_index()
temp2["mutation_og_binds_bo_mi_unique_conflicting"] = temp2["mutation_og_binds_bo_mi"].apply(_binding_labels)

# test1 is True when every grouped pair shows both "no" and "yes" MI-derived
# labels (and therefore also True when there are no conflicting pairs at all).
_unique_labels = temp2["mutation_og_binds_bo_mi_unique_conflicting"]
_has_both = _unique_labels.str.contains("no,yes") | _unique_labels.str.contains("yes,no")
test1 = len(temp2.loc[~_has_both]) == 0

print(f"There are {len(temp2)} unique sequence pairs where there are conflicting labels about whether or not binding has occurred.")
print(f"\tIn every case where we have contradicting labels for whether an original sequence interacts with another sequence, it is because of conflicting MIs: {test1}")

# Human-readable description per MI code; MI:2333 is set explicitly.
mutation_mis_labeled["description"] = mutation_mis_labeled["feature"].apply(_feature_description)
mutation_mis_labeled["description"] = mutation_mis_labeled.apply(
    lambda row: "mutation with complex effect" if row["mi"] == "MI:2333" else row["description"],
    axis=1,
)
mutation_mis_labeled_mi_desc_dict = {
    mi: desc
    for mi, desc in zip(mutation_mis_labeled["mi"], mutation_mis_labeled["description"])
}

temp2["mutation_desc"] = temp2["mutation_mi"].apply(
    lambda mi_string: mi_to_desc_string(mi_string, mutation_mis_labeled_mi_desc_dict)
)

# Written to a "_neg_" file: this cell previously reused the path
# "contradicting_mutant_labels_grouped_by_seq_og_nov19_2025.csv", which the
# earlier merged_expl_mut_filt cell also writes, so that export was overwritten.
temp2.sort_values(by=["seq_sort_og"]).to_csv("contradicting_mutant_labels_neg_grouped_by_seq_og_nov19_2025.csv",index=False)
temp2.loc[temp2["mutation_mi"].str.contains(",")].head()

There are 0 unique sequence pairs where there are conflicting labels about whether or not binding has occurred.
	In every case where we have contradicting labels for whether an original sequence interacts with another sequence, it is because of conflicting MIs: True


Unnamed: 0,seq_sort_og,mutation_og_binds_bo_mi,mutation_mi,mutation_og_binds_bo_mi_unique_conflicting,mutation_desc


In [366]:
# Pool the conflicting pairs from both groupings (temp: by seq_sort_new,
# temp2: by seq_sort_og) and summarise which mutation effects co-occur.
temp3 = pd.concat(
    [
        temp[["seq_sort_new", "mutation_mi", "mutation_desc"]],
        temp2[["seq_sort_og", "mutation_mi", "mutation_desc"]],
    ]
).reset_index(drop=True)


def _unique_mis(mi_string):
    """Return the distinct MI codes in *mi_string*, sorted and comma-joined.

    '|' and ',' are both treated as separators; literal 'nan' tokens are dropped.
    """
    codes = set(mi_string.replace("|", ",").split(","))
    return ",".join(sorted(code for code in codes if code != "nan"))


temp3["mutation_unique_mi"] = temp3["mutation_mi"].apply(_unique_mis)

# Keep only pairs that still have an MI code.
_no_mi = temp3["mutation_unique_mi"].isin(["nan", ""])
temp3 = temp3.loc[~_no_mi].reset_index(drop=True)

# Hyphenate "no effect" so it survives whitespace tokenisation as one keyword.
temp3["mutation_desc"] = temp3["mutation_desc"].fillna("").apply(
    lambda desc: desc.replace("no effect","no-effect")
)
keywords = {"no-effect", "causing", "increasing", "decreasing", "disrupting"}


def _effect_keywords(description):
    """Return the keywords present in *description*, sorted and comma-joined.

    Comma vs pipe grouping is ignored; the text is tokenised on whitespace.
    """
    words = description.replace("|", ",").replace(",", " ").split()
    return ",".join(sorted(keywords.intersection(words)))


temp3["mutation_keywords"] = temp3["mutation_desc"].apply(_effect_keywords)
_keyword_counts = temp3["mutation_keywords"].value_counts()
print(_keyword_counts)

# A comma in the keyword string means at least two distinct keywords.
# NOTE: .all() over an empty table is True, so an empty temp3 also prints True.
test1 = _keyword_counts.reset_index()["mutation_keywords"].str.contains(",").all()
print(f"\nEvery sequence pair where we have conflicting binding labels has at least two different binding-related keywords (from: disrupting,decreasing,no-effect,increasing,causing): {test1}")

Series([], Name: count, dtype: int64)

Every sequence pair where we have conflicting binding labels has at least two different binding-related keywords (from: disrupting,decreasing,no-effect,increasing,causing): True


In [367]:
def _feature_description(feature):
    """Return the text inside the first '(...)' of *feature*, else *feature*."""
    if "(" not in feature:
        return feature
    return feature.split("(")[1].split(")")[0]


# Derive a readable description for each MI term; MI:2333 is set explicitly.
mutation_mis_labeled["description"] = mutation_mis_labeled["feature"].apply(_feature_description)
mutation_mis_labeled["description"] = mutation_mis_labeled.apply(
    lambda row: "mutation with complex effect" if row["mi"] == "MI:2333" else row["description"],
    axis=1,
)
mutation_mis_labeled

Unnamed: 0,feature,original_sequence,mutated_sequence,comments,mi,description
0,"psi-mi:""MI:2226""(mutation with no effect)",yes,yes,,MI:2226,mutation with no effect
1,"psi-mi:""MI:0573""(mutation disrupting interaction)",yes,no,disrupting means no interaction,MI:0573,mutation disrupting interaction
2,"psi-mi:""MI:1128""(mutation disrupting interacti...",yes,no,disrupting means no interaction,MI:1128,mutation disrupting interaction strength
3,"psi-mi:""MI:1133""(mutation decreasing interacti...",yes,yes,,MI:1133,mutation decreasing interaction strength
4,"psi-mi:""MI:0119""(mutation decreasing interaction)",yes,yes,,MI:0119,mutation decreasing interaction
5,"psi-mi:""MI:2227""(mutation causing an interaction)",no,yes,,MI:2227,mutation causing an interaction
6,"psi-mi:""MI:1132""(mutation increasing interacti...",yes,yes,,MI:1132,mutation increasing interaction strength
7,"psi-mi:""MI:0118""(mutation)",unknown,unknown,some annotations with MI:0118 have comments in...,MI:0118,mutation
8,"psi-mi:""MI:1129""(mutation disrupting interacti...",yes,no,disrupting means no interaction,MI:1129,mutation disrupting interaction rate
9,"psi-mi:""MI:1130""(mutation decreasing interacti...",yes,yes,,MI:1130,mutation decreasing interaction rate


In [368]:
# Show how often each label occurs in the two "Mutated all_*_binds" columns
# of merged_expl_mut_filt.
for _heading, _column in (
    ("Positive:\nValue counts for Mutated all_new_binds", "Mutated all_new_binds"),
    ("\nValue counts for Mutated all_og_binds", "Mutated all_og_binds"),
):
    print(_heading)
    print(merged_expl_mut_filt[_column].value_counts())

Positive:
Value counts for Mutated all_new_binds
Mutated all_new_binds
yes                    27420
unknown,yes            19545
no                     11581
no,unknown              8265
unknown                 4501
no,yes                    20
VPS18,mixed,unknown        2
calmodulin,mixed,no        1
Name: count, dtype: int64

Value counts for Mutated all_og_binds
Mutated all_og_binds
yes               38179
unknown,yes       26583
unknown            4501
no,unknown         1236
no                  834
no,unknown,yes        1
no,yes                1
Name: count, dtype: int64


In [369]:
# Show how often each label occurs in the two "Mutated all_*_binds" columns
# of merged_neg_expl_mut_filt.
for _heading, _column in (
    ("Negative:\nValue counts for Mutated all_new_binds", "Mutated all_new_binds"),
    ("\nValue counts for Mutated all_og_binds", "Mutated all_og_binds"),
):
    print(_heading)
    print(merged_neg_expl_mut_filt[_column].value_counts())

Negative:
Value counts for Mutated all_new_binds
Mutated all_new_binds
unknown    10
no          3
Name: count, dtype: int64

Value counts for Mutated all_og_binds
Mutated all_og_binds
unknown    10
yes         3
Name: count, dtype: int64


In [370]:
# Show how often each label occurs in the two "Mutated decisive_entry_*_binds"
# columns of merged_expl_mut_filt.
for _heading, _column in (
    ("\nPositive:\nValue counts for Mutated decisive_entry_new_binds", "Mutated decisive_entry_new_binds"),
    ("\nValue counts for Mutated decisive_entry_og_binds", "Mutated decisive_entry_og_binds"),
):
    print(_heading)
    print(merged_expl_mut_filt[_column].value_counts())


Positive:
Value counts for Mutated decisive_entry_new_binds
Mutated decisive_entry_new_binds
yes        46965
no         19847
unknown     4523
Name: count, dtype: int64

Value counts for Mutated decisive_entry_og_binds
Mutated decisive_entry_og_binds
yes        64762
unknown     4503
no          2070
Name: count, dtype: int64


In [371]:
# Show how often each label occurs in the two "Mutated decisive_entry_*_binds"
# columns of merged_neg_expl_mut_filt.
for _heading, _column in (
    ("\nNegative:\nValue counts for Mutated decisive_entry_new_binds", "Mutated decisive_entry_new_binds"),
    ("\nValue counts for Mutated decisive_entry_og_binds", "Mutated decisive_entry_og_binds"),
):
    print(_heading)
    print(merged_neg_expl_mut_filt[_column].value_counts())


Negative:
Value counts for Mutated decisive_entry_new_binds
Mutated decisive_entry_new_binds
unknown    10
no          3
Name: count, dtype: int64

Value counts for Mutated decisive_entry_og_binds
Mutated decisive_entry_og_binds
unknown    10
yes         3
Name: count, dtype: int64


In [372]:
# Show how often each label occurs in the two "Mutated decisive_seqpair_*_binds"
# columns of merged_expl_mut_filt.
for _heading, _column in (
    ("\nPositive:\nValue counts for Mutated decisive_seqpair_new_binds", "Mutated decisive_seqpair_new_binds"),
    ("\nValue counts for Mutated decisive_seqpair_og_binds", "Mutated decisive_seqpair_og_binds"),
):
    print(_heading)
    print(merged_expl_mut_filt[_column].value_counts())


Positive:
Value counts for Mutated decisive_seqpair_new_binds
Mutated decisive_seqpair_new_binds
yes        45250
no         17975
unknown     8110
Name: count, dtype: int64

Value counts for Mutated decisive_seqpair_og_binds
Mutated decisive_seqpair_og_binds
yes        64455
unknown     5578
no          1302
Name: count, dtype: int64


In [373]:
# Show how often each label occurs in the two "Mutated decisive_seqpair_*_binds"
# columns of merged_neg_expl_mut_filt.
for _heading, _column in (
    ("\nNegative:\nValue counts for Mutated decisive_seqpair_new_binds", "Mutated decisive_seqpair_new_binds"),
    ("\nValue counts for Mutated decisive_seqpair_og_binds", "Mutated decisive_seqpair_og_binds"),
):
    print(_heading)
    print(merged_expl_mut_filt[_column].value_counts()) if False else print(merged_neg_expl_mut_filt[_column].value_counts())


Negative:
Value counts for Mutated decisive_seqpair_new_binds
Mutated decisive_seqpair_new_binds
unknown    10
no          3
Name: count, dtype: int64

Value counts for Mutated decisive_seqpair_og_binds
Mutated decisive_seqpair_og_binds
unknown    10
yes         3
Name: count, dtype: int64


In [374]:
# With "Mutated decisive_seqpair_new_binds" and "Mutated decisive_seqpair_og_binds"
# in place, positive and negative binding pairs can be derived from this table.
# Before doing that, persist the relevant columns to disk.
import os

os.makedirs("data_files/processed/intact/clean", exist_ok=True)

_mutation_export_cols = [
    # identifiers
    "interaction_intactid", "unique_id", "seq_pair_id",
    # aggregated "Mutated ..." columns
    "Mutated Partner",
    "Mutated decisive_seqpair_new_binds", "Mutated decisive_seqpair_og_binds",
    "Mutated decisive_entry_new_binds", "Mutated decisive_entry_og_binds",
    "Mutated all_new_binds", "Mutated all_og_binds",
    "agg_mut_has_info",
    # "Mutation ..." columns
    "Mutation Affected protein AC",
    "Mutation # Feature AC",
    "Mutation Feature annotation(s)",
    "Mutation Feature short label",
    "Mutation Feature type",
    "Mutation Figure legend(s)",
    "Mutation Interaction AC",
    "Mutation Interaction participants",
    "Mutation Interactor Matches",
    "Mutation Original sequence",
    "Mutation PubMedID",
    "Mutation Resulting sequence",
    "Mutation Xref ID(s)",
    "Mutation new_binds_bo_ac",
    "Mutation new_binds_bo_annotation",
    "Mutation new_binds_bo_feature_type",
    "Mutation new_binds_to_gname_bo_annotation",
    "Mutation new_binds_to_uniprot_bo_annotation",
    "Mutation new_nobind_to_gname_bo_annotation",
    "Mutation new_nobind_to_uniprot_bo_annotation",
    "Mutation og_binds_bo_ac",
    "Mutation og_binds_bo_annotation",
    "Mutation og_binds_bo_feature_type",
    # per-interactor (_1 / _2) columns
    "aa_1", "aa_2",
    "intactid_1", "intactid_2",
    "dip_1", "dip_2",
    "length_1", "length_2",
    "mutated_aa_1", "mutated_aa_2",
    "mutation_begin_1", "mutation_begin_2",
    "mutation_end_1", "mutation_end_2",
    "mutation_mi_1", "mutation_mi_2",
    "mutation_name_1", "mutation_name_2",
    "mutation_new_1", "mutation_new_2",
    "mutation_new_binds_bo_mi", "mutation_og_binds_bo_mi",
    "mutation_orig_1", "mutation_orig_2",
    "mutation_range_1", "mutation_range_2",
    "mutation_short_1", "mutation_short_2",
    # UniProt accessions and pair-level keys
    "uniprot_A", "uniprot_A_intact",
    "uniprot_B", "uniprot_B_intact",
    "unique_expansions", "unique_uniprot_pair",
]
merged_expl_mut_filt[_mutation_export_cols].to_csv(
    "data_files/processed/intact/clean/mutations_dec11_2025.csv",
    index=False,
)

# Display the rows whose mutated sequence pair is decisively labelled as binding.
_new_binds_yes = merged_expl_mut_filt["Mutated decisive_seqpair_new_binds"] == "yes"
merged_expl_mut_filt.loc[_new_binds_yes]

Unnamed: 0,mutation_begin_1,mutation_begin_2,mutation_end_1,mutation_end_2,mutation_mi_1,mutation_mi_2,mutation_name_1,mutation_name_2,mutation_new_1,mutation_new_2,...,Mutated all_new_binds,Mutated all_og_binds,Mutated decisive_entry_new_binds,Mutated decisive_entry_og_binds,seq_sort_og,seq_sort_new,seq_sort_og_id,seq_sort_new_id,Mutated decisive_seqpair_og_binds,Mutated decisive_seqpair_new_binds
0,1,1,14,14,MI:0119,MI:0119,mutation decreasing interaction,mutation decreasing interaction,.,.,...,yes,yes,yes,yes,MTERRVPFSLLRGPSWDPFRDWYPHSRLFDQAFGLPRLPEEWSQWL...,SWDPFRDWYPHSRLFDQAFGLPRLPEEWSQWLGGSSWPGYVRPLPP...,seq_sort_og_16524,seq_sort_new_52431,yes,yes
1,1,1,24,24,MI:0119,MI:0119,mutation decreasing interaction,mutation decreasing interaction,.,.,...,yes,yes,yes,yes,MTERRVPFSLLRGPSWDPFRDWYPHSRLFDQAFGLPRLPEEWSQWL...,HSRLFDQAFGLPRLPEEWSQWLGGSSWPGYVRPLPPAAIESPAVAA...,seq_sort_og_16524,seq_sort_new_753,yes,yes
3,1,,1,,MI:0119,,mutation decreasing interaction,,A,,...,yes,yes,yes,yes,MIGGLFIYNHKGEVLISRVYRDDIGRNAVDAFRVNVIHARQQVRSP...,AVTF_MIGGLFIYNHKGEVLISRVYRDDIGRNAVDAFRVNVIHARQ...,seq_sort_og_12085,seq_sort_new_474,yes,yes
4,1,,1,,MI:2226,,mutation with no effect,,A,,...,"unknown,yes","unknown,yes",yes,yes,MSLFKARDWWSTILGDKEEFDQGCLCLANVDNSGNGQDKIIVGSFM...,ASSEMEPLLLAWSYFRRRKFQLCADLCTQMLEKSPYDQAAWILKAR...,seq_sort_og_16003,seq_sort_new_440,yes,yes
5,1,,1,,MI:2226,,mutation with no effect,,A,,...,"unknown,yes","unknown,yes",yes,yes,MAEERVATRTQFPVSTESQKPRQKKAPEFPILEKQNWLIHLHYIRK...,ALKAAAKRPELSGKNTISNNSDMAEVKSMFREVLPKQGPLFVEDIM...,seq_sort_og_2339,seq_sort_new_151,yes,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
71330,,993,,993,,MI:2226,,mutation with no effect,,A,...,yes,yes,yes,yes,MALSVDSSWHRWQWRVRDGFPHCPSETTPLLSPEKGRQSYNLTQQR...,MALSVDSSWHRWQWRVRDGFPHCPSETTPLLSPEKGRQSYNLTQQR...,seq_sort_og_3865,seq_sort_new_13816,yes,yes
71331,,995,,995,,MI:0119,,mutation decreasing interaction,,L,...,yes,yes,yes,yes,MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...,MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQVVI...,seq_sort_og_16529,seq_sort_new_52058,yes,yes
71332,,998,,998,,MI:1133,,mutation decreasing interaction strength,,E,...,yes,yes,yes,yes,MATPGPRDIPLLPGSPRRLSPQAGSRGGQGPKHGQQCLKMPGPRAP...,MATPGPRDIPLLPGSPRRLSPQAGSRGGQGPKHGQQCLKMPGPRAP...,seq_sort_og_6386,seq_sort_new_21037,yes,yes
71333,,998,,998,,MI:2226,,mutation with no effect,,R,...,yes,yes,yes,yes,MAEHLELLAEMPMVGRMSTQERLKHAQKRRAQQVKMWAQAEKEAQG...,MAEHLELLAEMPMVGRMSTQERLKHAQKRRAQQVKMWAQAEKEAQG...,seq_sort_og_2397,seq_sort_new_10173,yes,yes


In [375]:
# With "Mutated decisive_seqpair_new_binds" and "Mutated decisive_seqpair_og_binds"
# in place, positive and negative binding pairs can be derived from this table.
# Before doing that, persist the relevant columns of merged_neg_expl_mut_filt.
import os

os.makedirs("data_files/processed/intact/clean", exist_ok=True)

_mutation_export_cols = [
    # identifiers
    "interaction_intactid", "unique_id", "seq_pair_id",
    # aggregated "Mutated ..." columns
    "Mutated Partner",
    "Mutated decisive_seqpair_new_binds", "Mutated decisive_seqpair_og_binds",
    "Mutated decisive_entry_new_binds", "Mutated decisive_entry_og_binds",
    "Mutated all_new_binds", "Mutated all_og_binds",
    "agg_mut_has_info",
    # "Mutation ..." columns
    "Mutation Affected protein AC",
    "Mutation # Feature AC",
    "Mutation Feature annotation(s)",
    "Mutation Feature short label",
    "Mutation Feature type",
    "Mutation Figure legend(s)",
    "Mutation Interaction AC",
    "Mutation Interaction participants",
    "Mutation Interactor Matches",
    "Mutation Original sequence",
    "Mutation PubMedID",
    "Mutation Resulting sequence",
    "Mutation Xref ID(s)",
    "Mutation new_binds_bo_ac",
    "Mutation new_binds_bo_annotation",
    "Mutation new_binds_bo_feature_type",
    "Mutation new_binds_to_gname_bo_annotation",
    "Mutation new_binds_to_uniprot_bo_annotation",
    "Mutation new_nobind_to_gname_bo_annotation",
    "Mutation new_nobind_to_uniprot_bo_annotation",
    "Mutation og_binds_bo_ac",
    "Mutation og_binds_bo_annotation",
    "Mutation og_binds_bo_feature_type",
    # per-interactor (_1 / _2) columns
    "aa_1", "aa_2",
    "intactid_1", "intactid_2",
    "dip_1", "dip_2",
    "length_1", "length_2",
    "mutated_aa_1", "mutated_aa_2",
    "mutation_begin_1", "mutation_begin_2",
    "mutation_end_1", "mutation_end_2",
    "mutation_mi_1", "mutation_mi_2",
    "mutation_name_1", "mutation_name_2",
    "mutation_new_1", "mutation_new_2",
    "mutation_new_binds_bo_mi", "mutation_og_binds_bo_mi",
    "mutation_orig_1", "mutation_orig_2",
    "mutation_range_1", "mutation_range_2",
    "mutation_short_1", "mutation_short_2",
    # UniProt accessions and pair-level keys
    "uniprot_A", "uniprot_A_intact",
    "uniprot_B", "uniprot_B_intact",
    "unique_expansions", "unique_uniprot_pair",
]
merged_neg_expl_mut_filt[_mutation_export_cols].to_csv(
    "data_files/processed/intact/clean/mutations_neg_dec11_2025.csv",
    index=False,
)

In [376]:
# Looking for the smallest set of identifiers that is unique per row of merged.
# "unique_id": concatenation of the two intact ids of the interactors
# "interaction_intactid": the intact id of this interaction (one evidence piece of these two interactors interacting)
# "seq_pair_id": unique combination of two sequences
for _id_col in ("unique_id", "interaction_intactid", "seq_pair_id"):
    # duplicated() flags every repeat after the first occurrence of a value
    test1 = int(merged.duplicated([_id_col]).sum())
    print(f"Rows in merged with duplicate {_id_col}: {test1}")

Rows in merged with duplicate unique_id: 316590
Rows in merged with duplicate interaction_intactid: 1958
Rows in merged with duplicate seq_pair_id: 318546


In [377]:
# Looking for the smallest set of identifiers that is unique per row of merged_neg.
# "unique_id": concatenation of the two intact ids of the interactors
# "interaction_intactid": the intact id of this interaction (one evidence piece of these two interactors interacting)
# "seq_pair_id": unique combination of two sequences
for _id_col in ("unique_id", "interaction_intactid", "seq_pair_id"):
    # duplicated() flags every repeat after the first occurrence of a value
    test1 = int(merged_neg.duplicated([_id_col]).sum())
    print(f"Rows in merged_neg with duplicate {_id_col}: {test1}")

Rows in merged_neg with duplicate unique_id: 52
Rows in merged_neg with duplicate interaction_intactid: 1
Rows in merged_neg with duplicate seq_pair_id: 54


In [378]:
# Binding-feature columns; in the simplified tables they are renamed with an
# "all_" prefix.
binding_cols = [
    "binding_mi_1", "binding_name_1", "binding_short_1", "binding_begin_1", "binding_end_1",
    "binding_mi_2", "binding_name_2", "binding_short_2", "binding_begin_2", "binding_end_2",
]

# Columns retained in the simplified tables: everything below plus the
# binding-feature columns, which come last.
keepcols = [
    "Negative",
    "aa_1", "aa_2",
    "invalids_aa_1", "invalids_aa_2",
    "all_intact_A_sorted", "all_intact_B_sorted",
    "chain_seq_end_1", "chain_seq_end_2",
    "chain_seq_start_1", "chain_seq_start_2",
    "confidence_val_int",
    "ensg_1", "ensg_2",
    "ensp_1", "ensp_2",
    "enst_1", "enst_2",
    "equal_score_int",
    "gene_symbol_1", "gene_symbol_2",
    "go_1", "go_2",
    "dip_1", "dip_2",
    "host_cell_type_1", "host_cell_type_2",
    "host_compartment_1", "host_compartment_2",
    "host_label_full_1", "host_label_full_2",
    "host_label_short_1", "host_label_short_2",
    "host_taxid_1", "host_taxid_2",
    "host_tissue_1", "host_tissue_2",
    "intactid_1", "intactid_2",
    "interaction_detection_methods_sorted",
    "interaction_intactid",
    "interaction_label",
    "interaction_mi",
    "interaction_xml_id",
    "interpro_1", "interpro_2",
    "length_1", "length_2",
    "miscore",
    "mol_type_1", "mol_type_2",
    "no_uniprot_update_A", "no_uniprot_update_B",
    "primaryref_db_1", "primaryref_db_2",
    "primaryref_id_1", "primaryref_id_2",
    "protein_1", "protein_2",
    "pubmeds",
    "reactome_1", "reactome_2",
    "rscbpdb_1", "rscbpdb_2",
    "seq_pair_id",
    "seq_sort",
    "species_label_1", "species_label_2",
    "species_taxid_1", "species_taxid_2",
    "uniprot_A",
    "uniprot_A_equalseq",
    "uniprot_A_equalseq_canonical",
    "uniprot_A_full",
    "uniprot_A_inseq",
    "uniprot_A_inseq_canonical",
    "uniprot_A_intact",
    "uniprot_A_noiso1",
    "uniprot_A_noisoforms",
    "uniprot_B",
    "uniprot_B_equalseq",
    "uniprot_B_equalseq_canonical",
    "uniprot_B_full",
    "uniprot_B_inseq",
    "uniprot_B_inseq_canonical",
    "uniprot_B_intact",
    "uniprot_B_noiso1",
    "uniprot_B_noisoforms",
    "uniprot_gene_name_A", "uniprot_gene_name_B",
    "uniprotkb_1", "uniprotkb_2",
    "unique_all_intact_sorted",
    "unique_expansions",
    "unique_id",
    "unique_score_int",
    "unique_scores",
    "unique_uniprot_noiso1_pair",
    "unique_uniprot_noisoforms_pair",
    "unique_uniprot_pair",
    "year",
] + binding_cols

binding_col_change = {col: "all_" + col for col in binding_cols}

# Project onto keepcols, drop rows that became exact duplicates, then apply
# the "all_" prefix to the binding columns.
simplemerged = (
    merged[keepcols]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns=binding_col_change)
)
simplemerged_neg = (
    merged_neg[keepcols]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns=binding_col_change)
)

print(f"Length of simplemerged: {len(simplemerged)}")
print(f"Length of simplemerged_neg: {len(simplemerged_neg)}")


Length of simplemerged: 744614
Length of simplemerged_neg: 970


In [379]:
# Columns that are aggregated with join_unique_nonnull instead of being used
# as grouping keys: rows that agree on every *other* column are collapsed into
# one row.
need_pipejoin = [
    "interaction_xml_id",
    "reactome_1",
    "reactome_2",
    "host_label_full_1",
    "host_label_full_2",
    "host_label_short_1",
    "host_label_short_2",
    "host_taxid_1",
    "host_taxid_2",
    "host_cell_type_1",
    "host_cell_type_2",
    "interaction_label",
    "interaction_mi",
    "pubmeds",
]
need_pipejoin += [
    f"all_binding_{field}_{side}"
    for side in (1, 2)
    for field in ("mi", "name", "short", "begin", "end")
]

simplemerged = harmonize_nulls_to_nan(simplemerged)

# Every remaining column acts as a grouping key.
pipejoin_lookup = set(need_pipejoin)
all_except_featac = [c for c in simplemerged.columns if c not in pipejoin_lookup]

# One aggregation function per pipe-joined column, in list order.
agg_spec = dict.fromkeys(need_pipejoin, join_unique_nonnull)

display(simplemerged.head())
# dropna=False keeps groups whose key columns contain missing values.
simplemerged = simplemerged.groupby(
    all_except_featac, dropna=False, as_index=False
).agg(agg_spec)

print(f"Pipe-joined values in some columns that were unmeaningfully separating the same interaction. New pos PPI db size: {len(simplemerged)}")

display(simplemerged.head())

  out = out.replace({"": pd.NA, "None": pd.NA, "nan": pd.NA})


Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,all_binding_mi_1,all_binding_name_1,all_binding_short_1,all_binding_begin_1,all_binding_end_1,all_binding_mi_2,all_binding_name_2,all_binding_short_2,all_binding_begin_2,all_binding_end_2
0,False,MGFPRILSKNNKIYTKLGEFCLSGDSFWIVCHTCQEELQTQDQFWK...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,,,intact:EBI-101707,intact:EBI-100018,,,,...,,,,,,,,,,
1,False,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,MEIPIQVAVRIFPHRELKDLLRSFGPTEPKKDAQAVDEGADSKDSE...,,,intact:EBI-100018,intact:EBI-102069,,,,...,MI:0117,binding-associated region,region,1207.0,1783.0,MI:0117,binding-associated region,region,,
2,False,MLPFRLGLLLGAVLFVASANGAAIENEVSSLNDLQREKRSGRGYSR...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,,,intact:EBI-104215,intact:EBI-100018,,,,...,,,,,,,,,,
3,False,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,MSNYYSLLLQADTYDDESIGDERSEEDTDDASETEFRSPSRYGAMN...,,,intact:EBI-100018,intact:EBI-107089,,,,...,,,,,,,,,,
4,False,MSPPSGEFRCRVCLKQDELLVDIYEIVEEMQVDLCTLLETCGGIKV...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,,,intact:EBI-117032,intact:EBI-100018,,,,...,,,,,,,,,,


Pipe-joined values in some columns that were unmeaningfully separating the same interaction. New pos PPI db size: 743130


Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,all_binding_mi_1,all_binding_name_1,all_binding_short_1,all_binding_begin_1,all_binding_end_1,all_binding_mi_2,all_binding_name_2,all_binding_short_2,all_binding_begin_2,all_binding_end_2
0,False,AAAAARPAGGSARRWGRPGRCGLLAAGPKRVRSEPGGRLPERSLGP...,MTVFRQENVDDYYDTGEELGSGQFAVVKKCREKSTGLQYAAKFIKK...,,,intact:EBI-20589573,intact:EBI-358616,,,,...,,,,,,,,,,
1,False,AAADWKPGYVMPVLYKYLESPLERVNLWNYGKPITLPTGCMMNVAK...,MAKWGEGDPRWIVEERADATNVNNWHWTERDASNWSTDKLKTLFLA...,,,intact:EBI-25507607,intact:EBI-448610,7176.0,,6878.0,...,,,,,,,,,,
2,False,AAADWKPGYVMPVLYKYLESPLERVNLWNYGKPITLPTGCMMNVAK...,MANDPLEGFHEVNLASPTSPDLLGVCDPGTQEQTTSPSVIYRPHPS...,,,intact:EBI-25507607,intact:EBI-16730154,7176.0,,6878.0,...,,,,,,,,,,
3,False,AAADWKPGYVMPVLYKYLESPLERVNLWNYGKPITLPTGCMMNVAK...,MAQYGHPSPLGMAAREELYSKVTPRRNRQQRPGTIKHGSALDVLLS...,,,intact:EBI-25507607,intact:EBI-1380492,7176.0,,6878.0,...,,,,,,,,,,
4,False,AAADWKPGYVMPVLYKYLESPLERVNLWNYGKPITLPTGCMMNVAK...,MAVALLEEWCKIMGVDVQKSLLVVDIPVDCGEPEIQTVLQEALKCV...,,,intact:EBI-25507607,intact:EBI-25508298,7176.0,,6878.0,...,,,,,,,,,,


In [380]:
# Collapse rows of the negative table (simplemerged_neg) that differ only in
# the columns listed below; those columns are aggregated with
# join_unique_nonnull while every other column is a grouping key.
need_pipejoin = ["interaction_xml_id",
                 "reactome_1","reactome_2",
                 "host_label_full_1",
                "host_label_full_2",
                "host_label_short_1",
                "host_label_short_2",
                "host_taxid_1",
                "host_taxid_2",
                "host_cell_type_1","host_cell_type_2",
                "interaction_label", "interaction_mi","pubmeds"
] + [
    "all_binding_mi_1", 
"all_binding_name_1", 
"all_binding_short_1", 
"all_binding_begin_1", 
"all_binding_end_1", 
"all_binding_mi_2", 
"all_binding_name_2", 
"all_binding_short_2",
"all_binding_begin_2", 
"all_binding_end_2"
]

simplemerged_neg = harmonize_nulls_to_nan(simplemerged_neg)

# Grouping keys: every column that is not pipe-joined.
all_except_featac = [c for c in simplemerged_neg.columns if c not in need_pipejoin]

agg_spec = {c: join_unique_nonnull for c in need_pipejoin}

display(simplemerged_neg.head())
# dropna=False keeps groups whose key columns contain missing values.
simplemerged_neg = (
    simplemerged_neg
    .groupby(all_except_featac, dropna=False, as_index=False)
    .agg(agg_spec)
)

# This cell operates on the negative table, so the message reports the "neg"
# size (it previously said "pos", copied from the positive-table cell).
print(f"Pipe-joined values in some columns that were unmeaningfully separating the same interaction. New neg PPI db size: {len(simplemerged_neg)}")

display(simplemerged_neg.head())

  out = out.replace({"": pd.NA, "None": pd.NA, "nan": pd.NA})


Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,all_binding_mi_1,all_binding_name_1,all_binding_short_1,all_binding_begin_1,all_binding_end_1,all_binding_mi_2,all_binding_name_2,all_binding_short_2,all_binding_begin_2,all_binding_end_2
0,True,MARKHTFPSLKRAEILECIDGLGIPFTAKELDQPTSKAVIPLYEEF...,MLVELLEFTPLSFIDDVINITNQLLYKGVNGVDKAFSQTRFAKKAP...,,,intact:EBI-1002565,intact:EBI-1002822,,,,...,,,,,,,,,,
1,True,MAGAIASRMSFSSLKRKQPKTFTVRIVTMDAEMEFNCEMKWKGKDL...,MERSGQRVTTWDCDQGKHSDSDYREDGMDLGSDAGSSSSSSRASSQ...,,,intact:EBI-1014500,intact:EBI-1397518,,,,...,,,,,,,,,,
2,True,MDEESLESALQTYRAQLQQVELALGAGLDSSEQADLRQLQGDLKEL...,MTSDCSSTHCSPESCGTASGCAPASSCSVETACLPGTCATSRCQTP...,,,intact:EBI-16428984,intact:EBI-10171697,,,,...,,,,,,,,,,
3,True,MASSTPSSSATSSNAGADPNTTNLRPTTYDTWCGVAHGCTRKLGLK...,MTSDCSSTHCSPESCGTASGCAPASSCSVETACLPGTCATSRCQTP...,,,intact:EBI-16467584,intact:EBI-10171697,,,,...,,,,,,,,,,
4,True,MSSSSSSPRETYEEDREYESQAKRLKTEEGEIDYSAEEGENRREAT...,MTSDCSSTHCSPESCGTASGCAPASSCSVETACLPGTCATSRCQTP...,,,intact:EBI-16468000,intact:EBI-10171697,,,,...,,,,,,,,,,


Pipe-joined values in some columns that were unmeaningfully separating the same interaction. New pos PPI db size: 969


Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,all_binding_mi_1,all_binding_name_1,all_binding_short_1,all_binding_begin_1,all_binding_end_1,all_binding_mi_2,all_binding_name_2,all_binding_short_2,all_binding_begin_2,all_binding_end_2
0,True,IAAPGPALCLFDVDGTLTAPRQKITKEMDDFLQKLRQKIKIGVVGG...,MCSLPVPREPLRRVAVTGGTHGNEMSGVYLARHWLHAPAELQRASF...,,,intact:EBI-16472255,intact:EBI-3916242,,,,...,,,,,,,,,,
1,True,ISGMEATVLSPSLCSRPSQSGKTSHMGLLEV,MNSSTSTMSEEPDALSVVNQLRDLAADPLNRRAIVQDQGCLPGLIL...,,,intact:EBI-16467021,intact:EBI-3506974,,,,...,,,,,,,,,,
2,True,ISGMEATVLSPSLCSRPSQSGKTSHMGLLEV,MPLEQRSQHCKPEEGLEAQGEALGLVGAQAPATEEQETASSSSTLV...,,,intact:EBI-16467021,intact:EBI-749530,,,,...,,,,,,,,,,
3,True,LNYMPGTASLIEDIDKKHLVLLRDGRTLIGFLRSIDQFGLGKGE,MADDVDQQQTTNTVEEPLDLIRLSLDERIYVKMRNDRELRGRLHAY...,,,intact:EBI-16434023,intact:EBI-348239,,,,...,,,,,,,,,,
4,True,MAAAAGSCARVAAWGGKLRRGLAVSRQAVRSPGPLAAAVAGAALAG...,MTGAEIEPSAQAKPEKKAGEEVIAGPERENDVPLVVRPKVRTQATT...,,,intact:EBI-3197790,intact:EBI-473189,,,,...,,,,,,,,,,


In [381]:
## AFTER cleaning simplemerged a bit, run this again!
# Looking for the smallest set of identifier columns that is unique per row of simplemerged.
# "unique_id": concatenation of the two intact ids of the interactors
# "interaction_intactid": the intact id of this interaction (one evidence piece of these two interactors interacting)
# "seq_pair_id": unique combination of two sequences
candidate_keys = [
    ["unique_id"],
    ["interaction_intactid"],
    ["seq_pair_id"],
    ["unique_id", "seq_pair_id"],
    ["unique_id", "interaction_intactid"],
    ["seq_pair_id", "interaction_intactid"],
]
for key_cols in candidate_keys:
    # duplicated() flags every row whose key repeats an earlier row's key.
    test1 = len(simplemerged.loc[simplemerged.duplicated(key_cols)])
    print(f"Rows in simplemerged with duplicate {'+'.join(key_cols)}: {test1}")

Rows in simplemerged with duplicate unique_id: 314635
Rows in simplemerged with duplicate interaction_intactid: 3
Rows in simplemerged with duplicate seq_pair_id: 316591
Rows in simplemerged with duplicate unique_id+seq_pair_id: 314635
Rows in simplemerged with duplicate unique_id+interaction_intactid: 0
Rows in simplemerged with duplicate seq_pair_id+interaction_intactid: 0


In [382]:
## AFTER cleaning simplemerged_neg a bit, run this again!
# Looking for the smallest set of identifier columns that is unique per row of simplemerged_neg.
# "unique_id": concatenation of the two intact ids of the interactors
# "interaction_intactid": the intact id of this interaction (one evidence piece of these two interactors interacting)
# "seq_pair_id": unique combination of two sequences
candidate_keys = [
    ["unique_id"],
    ["interaction_intactid"],
    ["seq_pair_id"],
    ["unique_id", "seq_pair_id"],
    ["unique_id", "interaction_intactid"],
    ["seq_pair_id", "interaction_intactid"],
]
for key_cols in candidate_keys:
    # duplicated() flags every row whose key repeats an earlier row's key.
    test1 = len(simplemerged_neg.loc[simplemerged_neg.duplicated(key_cols)])
    print(f"Rows in simplemerged_neg with duplicate {'+'.join(key_cols)}: {test1}")

Rows in simplemerged_neg with duplicate unique_id: 51
Rows in simplemerged_neg with duplicate interaction_intactid: 0
Rows in simplemerged_neg with duplicate seq_pair_id: 53
Rows in simplemerged_neg with duplicate unique_id+seq_pair_id: 51
Rows in simplemerged_neg with duplicate unique_id+interaction_intactid: 0
Rows in simplemerged_neg with duplicate seq_pair_id+interaction_intactid: 0


In [383]:
## Negative pairs: (mutated-partner)'s mutated_sequence plus (other partner)'s og_sequence, anywhere Mutated decisive_seqpair_new_binds is false
## Positive pairs: (mutated-partner)'s mutated_sequence plus (other partner)'s og_sequence, anywhere Mutated decisive_seqpair_new_binds is true
# Write the first 100 rows whose decisive "new binds" verdict is "no" to a
# scratch CSV, keeping only the identifier, verdict, sequence and mutation columns.
inspect_cols = [
    "interaction_intactid",
    "unique_id",
    "seq_pair_id",
    "Mutated Partner",
    "Mutated decisive_seqpair_new_binds",
    "Mutated decisive_seqpair_og_binds",
    "Mutated all_new_binds",
    "Mutated all_og_binds",
    "agg_mut_has_info",
    "aa_1",
    "aa_2",
    "mutated_aa_1",
    "mutation_mi_1",
    "mutation_short_1",
    "mutation_name_1",
    "mutation_begin_1",
    "mutation_end_1",
    "mutation_range_1",
    "mutation_orig_1",
    "mutation_new_1",
    "mutated_aa_2",
    "mutation_mi_2",
    "mutation_short_2",
    "mutation_name_2",
    "mutation_begin_2",
    "mutation_end_2",
    "mutation_range_2",
    "mutation_orig_2",
    "mutation_new_2",
    "mutation_new_binds_bo_mi",
    "mutation_og_binds_bo_mi",
]
new_does_not_bind = merged_expl_mut_filt["Mutated decisive_seqpair_new_binds"] == "no"
(
    merged_expl_mut_filt.loc[new_does_not_bind, inspect_cols]
    .reset_index(drop=True)
    .head(100)
    .to_csv("temp_dec11_2025.csv", index=False)
)

In [384]:
## Positive pairs: (mutated-partner)'s mutated_sequence plus (other partner)'s og_sequence, anywhere Mutated decisive_seqpair_new_binds is true
## Positive pairs: (mutated-partner)'s original_sequence plus (other partner)'s og_sequence, anywhere Mutated decisive_seqpair_og_binds is true

# A row is excluded when either invalids_mutated_aa_* column is non-null.
has_invalid_mutated_seq = (
    merged_expl_mut_filt["invalids_mutated_aa_1"].notna()
    | merged_expl_mut_filt["invalids_mutated_aa_2"].notna()
)

# Mutated pair binds ("yes") and neither mutated sequence is flagged invalid.
pos_new_from_mutation_data = merged_expl_mut_filt.loc[
    (merged_expl_mut_filt["Mutated decisive_seqpair_new_binds"] == "yes")
    & ~has_invalid_mutated_seq
].reset_index(drop=True)
print(f"Total rows where Mutated decisive_seqpair_new_binds==yes AND mutated sequence has all valid characters: {len(pos_new_from_mutation_data)}")

# Sanity check: no flagged row survived the filter above.
test1 = len(pos_new_from_mutation_data.loc[
    (pos_new_from_mutation_data["invalids_mutated_aa_1"].notna()) |
    (pos_new_from_mutation_data["invalids_mutated_aa_2"].notna())
]) == 0
print(f"\tNo invalid characters in mutated sequences: {test1}")

# (A bare `merged_expl_mut_filt.loc[...]` expression used to sit here; its
# result was neither assigned nor displayed, so it has been removed.)

# Original (unmutated) pair binds.
pos_og_from_mutation_data = merged_expl_mut_filt.loc[
    (merged_expl_mut_filt["Mutated decisive_seqpair_og_binds"] == "yes")
].reset_index(drop=True)
print(f"Total rows where Mutated decisive_seqpair_og_binds==yes: {len(pos_og_from_mutation_data)}")

Total rows where Mutated decisive_seqpair_new_binds==yes AND mutated sequence has all valid characters: 45237
	No invalid characters in mutated sequences: True
Total rows where Mutated decisive_seqpair_og_binds==yes: 64455


In [385]:
# Display the seq_sort_og_id column of pos_og_from_mutation_data (cell output only; nothing is assigned).
pos_og_from_mutation_data["seq_sort_og_id"]

0        seq_sort_og_16524
1        seq_sort_og_16524
2        seq_sort_og_12085
3        seq_sort_og_16003
4         seq_sort_og_2339
               ...        
64450     seq_sort_og_3865
64451    seq_sort_og_16529
64452     seq_sort_og_6386
64453     seq_sort_og_2397
64454     seq_sort_og_5376
Name: seq_sort_og_id, Length: 64455, dtype: object

In [386]:
## Positive pairs: (mutated-partner)'s mutated_sequence plus (other partner)'s og_sequence, anywhere Mutated decisive_seqpair_new_binds is true
## Positive pairs: (mutated-partner)'s original_sequence plus (other partner)'s og_sequence, anywhere Mutated decisive_seqpair_og_binds is true
# Same selection as for merged_expl_mut_filt, applied to merged_neg_expl_mut_filt.

# A row is excluded when either invalids_mutated_aa_* column is non-null.
has_invalid_mutated_seq_neg = (
    merged_neg_expl_mut_filt["invalids_mutated_aa_1"].notna()
    | merged_neg_expl_mut_filt["invalids_mutated_aa_2"].notna()
)

# Mutated pair binds ("yes") and neither mutated sequence is flagged invalid.
pos_new_from_mutation_neg_data = merged_neg_expl_mut_filt.loc[
    (merged_neg_expl_mut_filt["Mutated decisive_seqpair_new_binds"] == "yes")
    & ~has_invalid_mutated_seq_neg
].reset_index(drop=True)
print(f"Total rows where Mutated decisive_seqpair_new_binds==yes AND mutated sequence has all valid characters: {len(pos_new_from_mutation_neg_data)}")

# Sanity check: no flagged row survived the filter above.
test1 = len(pos_new_from_mutation_neg_data.loc[
    (pos_new_from_mutation_neg_data["invalids_mutated_aa_1"].notna()) |
    (pos_new_from_mutation_neg_data["invalids_mutated_aa_2"].notna())
]) == 0
print(f"\tNo invalid characters in mutated sequences: {test1}")

# (A bare `merged_neg_expl_mut_filt.loc[...]` expression used to sit here; its
# result was neither assigned nor displayed, so it has been removed.)

# Original (unmutated) pair binds.
pos_og_from_mutation_neg_data = merged_neg_expl_mut_filt.loc[
    (merged_neg_expl_mut_filt["Mutated decisive_seqpair_og_binds"] == "yes")
].reset_index(drop=True)
print(f"Total rows where Mutated decisive_seqpair_og_binds==yes: {len(pos_og_from_mutation_neg_data)}")

Total rows where Mutated decisive_seqpair_new_binds==yes AND mutated sequence has all valid characters: 0
	No invalid characters in mutated sequences: True
Total rows where Mutated decisive_seqpair_og_binds==yes: 3


In [387]:
# Cross-check using the aggregated "all_*_binds" strings: count rows that
# contain "yes" and contain neither ",no" nor "no,".
all_new_binds_col = merged_expl_mut_filt["Mutated all_new_binds"]
new_yes_without_no = (
    all_new_binds_col.str.contains("yes")
    & ~all_new_binds_col.str.contains(",no")
    & ~all_new_binds_col.str.contains("no,")
)
# Neither mutated sequence carries an invalid-character flag.
mutated_seqs_valid = (
    merged_expl_mut_filt["invalids_mutated_aa_1"].isna()
    & merged_expl_mut_filt["invalids_mutated_aa_2"].isna()
)
test1 = len(merged_expl_mut_filt.loc[new_yes_without_no & mutated_seqs_valid])
print(f"Total rows where Mutated all_new_binds has yes and does not have no, AND mutated sequences are all valid: {test1}")

all_og_binds_col = merged_expl_mut_filt["Mutated all_og_binds"]
og_yes_without_no = (
    all_og_binds_col.str.contains("yes")
    & ~all_og_binds_col.str.contains(",no")
    & ~all_og_binds_col.str.contains("no,")
)
test1 = len(
    merged_expl_mut_filt.loc[
        og_yes_without_no,
        ["Mutated all_new_binds", "Mutated decisive_seqpair_og_binds"],
    ]
)
print(f"Total rows where Mutated all_og_binds has yes and does not have no: {test1}")

Total rows where Mutated all_new_binds has yes and does not have no, AND mutated sequences are all valid: 46952
Total rows where Mutated all_og_binds has yes and does not have no: 64762


In [388]:
# Cross-check on the negative table using the aggregated "all_*_binds"
# strings: count rows that contain "yes" and contain neither ",no" nor "no,".
all_new_binds_col_neg = merged_neg_expl_mut_filt["Mutated all_new_binds"]
new_yes_without_no_neg = (
    all_new_binds_col_neg.str.contains("yes")
    & ~all_new_binds_col_neg.str.contains(",no")
    & ~all_new_binds_col_neg.str.contains("no,")
)
# Neither mutated sequence carries an invalid-character flag.
mutated_seqs_valid_neg = (
    merged_neg_expl_mut_filt["invalids_mutated_aa_1"].isna()
    & merged_neg_expl_mut_filt["invalids_mutated_aa_2"].isna()
)
test1 = len(merged_neg_expl_mut_filt.loc[new_yes_without_no_neg & mutated_seqs_valid_neg])
print(f"Total rows where Mutated all_new_binds has yes and does not have no, AND mutated sequences are all valid: {test1}")

all_og_binds_col_neg = merged_neg_expl_mut_filt["Mutated all_og_binds"]
og_yes_without_no_neg = (
    all_og_binds_col_neg.str.contains("yes")
    & ~all_og_binds_col_neg.str.contains(",no")
    & ~all_og_binds_col_neg.str.contains("no,")
)
test1 = len(
    merged_neg_expl_mut_filt.loc[
        og_yes_without_no_neg,
        ["Mutated all_new_binds", "Mutated decisive_seqpair_og_binds"],
    ]
)
print(f"Total rows where Mutated all_og_binds has yes and does not have no: {test1}")

Total rows where Mutated all_new_binds has yes and does not have no, AND mutated sequences are all valid: 0
Total rows where Mutated all_og_binds has yes and does not have no: 3


In [389]:
# Rows whose decisive verdict for the original sequence pair is "unknown".
og_verdict_unknown = merged_expl_mut_filt["Mutated decisive_seqpair_og_binds"].eq("unknown")
unknown_og_from_mutation_data = merged_expl_mut_filt[og_verdict_unknown].reset_index(drop=True)
print(f"Total rows where Mutated decisive_seqpair_og_binds==unknown: {len(unknown_og_from_mutation_data)}")

Total rows where Mutated decisive_seqpair_og_binds==unknown: 5578


In [390]:
# Rows of the negative table whose decisive verdict for the original sequence pair is "unknown".
og_verdict_unknown_neg = merged_neg_expl_mut_filt["Mutated decisive_seqpair_og_binds"].eq("unknown")
unknown_og_from_mutation_neg_data = merged_neg_expl_mut_filt[og_verdict_unknown_neg].reset_index(drop=True)
print(f"Total rows where Mutated decisive_seqpair_og_binds==unknown: {len(unknown_og_from_mutation_neg_data)}")

Total rows where Mutated decisive_seqpair_og_binds==unknown: 10


In [391]:
## Negative pairs: (mutated-partner)'s mutated_sequence plus (other partner)'s og_sequence, anywhere Mutated decisive_seqpair_new_binds is "no"
## Negative pairs: (mutated-partner)'s original_sequence plus (other partner)'s og_sequence, anywhere Mutated decisive_seqpair_og_binds is "no"

# Neither mutated sequence carries an invalid-character flag.
mutated_seqs_clean = (
    merged_expl_mut_filt["invalids_mutated_aa_1"].isna()
    & merged_expl_mut_filt["invalids_mutated_aa_2"].isna()
)
new_verdict_no = merged_expl_mut_filt["Mutated decisive_seqpair_new_binds"].eq("no")
neg_new_from_mutation_data = merged_expl_mut_filt[
    new_verdict_no & mutated_seqs_clean
].reset_index(drop=True)
print(f"Total rows where Mutated decisive_seqpair_new_binds==no AND mutated sequence has all valid characters: {len(neg_new_from_mutation_data)}")

# Sanity check: the selection contains no flagged mutated sequence.
flagged_rows = neg_new_from_mutation_data.loc[
    neg_new_from_mutation_data["invalids_mutated_aa_1"].notna()
    | neg_new_from_mutation_data["invalids_mutated_aa_2"].notna()
]
test1 = len(flagged_rows) == 0
print(f"\tNo invalid characters in mutated sequences: {test1}")

og_verdict_no = merged_expl_mut_filt["Mutated decisive_seqpair_og_binds"].eq("no")
neg_og_from_mutation_data = merged_expl_mut_filt[og_verdict_no].reset_index(drop=True)
print(f"Total rows where Mutated decisive_seqpair_og_binds==no: {len(neg_og_from_mutation_data)}")


Total rows where Mutated decisive_seqpair_new_binds==no AND mutated sequence has all valid characters: 17970
	No invalid characters in mutated sequences: True
Total rows where Mutated decisive_seqpair_og_binds==no: 1302


In [392]:
## Negative pairs: (mutated-partner)'s mutated_sequence plus (other partner)'s og_sequence, anywhere Mutated decisive_seqpair_new_binds is "no"
## Negative pairs: (mutated-partner)'s original_sequence plus (other partner)'s og_sequence, anywhere Mutated decisive_seqpair_og_binds is "no"
# Same selection as for merged_expl_mut_filt, applied to merged_neg_expl_mut_filt.

# Neither mutated sequence carries an invalid-character flag.
mutated_seqs_clean_neg = (
    merged_neg_expl_mut_filt["invalids_mutated_aa_1"].isna()
    & merged_neg_expl_mut_filt["invalids_mutated_aa_2"].isna()
)
new_verdict_no_neg = merged_neg_expl_mut_filt["Mutated decisive_seqpair_new_binds"].eq("no")
neg_new_from_mutation_neg_data = merged_neg_expl_mut_filt[
    new_verdict_no_neg & mutated_seqs_clean_neg
].reset_index(drop=True)
print(f"Total rows where Mutated decisive_seqpair_new_binds==no AND mutated sequence has all valid characters: {len(neg_new_from_mutation_neg_data)}")

# Sanity check: the selection contains no flagged mutated sequence.
flagged_rows_neg = neg_new_from_mutation_neg_data.loc[
    neg_new_from_mutation_neg_data["invalids_mutated_aa_1"].notna()
    | neg_new_from_mutation_neg_data["invalids_mutated_aa_2"].notna()
]
test1 = len(flagged_rows_neg) == 0
print(f"\tNo invalid characters in mutated sequences: {test1}")

og_verdict_no_neg = merged_neg_expl_mut_filt["Mutated decisive_seqpair_og_binds"].eq("no")
neg_og_from_mutation_neg_data = merged_neg_expl_mut_filt[og_verdict_no_neg].reset_index(drop=True)
print(f"Total rows where Mutated decisive_seqpair_og_binds==no: {len(neg_og_from_mutation_neg_data)}")


Total rows where Mutated decisive_seqpair_new_binds==no AND mutated sequence has all valid characters: 3
	No invalid characters in mutated sequences: True
Total rows where Mutated decisive_seqpair_og_binds==no: 0


In [393]:
# Cross-check using the aggregated "all_*_binds" strings: count rows that do
# not contain "yes" and that are exactly "no" or contain ",no" / "no,".
all_new_binds_col = merged_expl_mut_filt["Mutated all_new_binds"]
new_mentions_no = (
    (all_new_binds_col == "no")
    | all_new_binds_col.str.contains(",no")
    | all_new_binds_col.str.contains("no,")
)
new_no_without_yes = ~all_new_binds_col.str.contains("yes") & new_mentions_no
# Neither mutated sequence carries an invalid-character flag.
mutated_seqs_valid = (
    merged_expl_mut_filt["invalids_mutated_aa_1"].isna()
    & merged_expl_mut_filt["invalids_mutated_aa_2"].isna()
)
test1 = len(merged_expl_mut_filt.loc[new_no_without_yes & mutated_seqs_valid])
print(f"Total rows where Mutated all_new_binds has no and does not have yes, and mutated sequenes are all valid: {test1}")

all_og_binds_col = merged_expl_mut_filt["Mutated all_og_binds"]
og_mentions_no = (
    (all_og_binds_col == "no")
    | all_og_binds_col.str.contains(",no")
    | all_og_binds_col.str.contains("no,")
)
og_no_without_yes = ~all_og_binds_col.str.contains("yes") & og_mentions_no
test1 = len(
    merged_expl_mut_filt.loc[
        og_no_without_yes,
        ["Mutated all_new_binds", "Mutated decisive_seqpair_og_binds"],
    ]
)
print(f"Total rows where Mutated all_og_binds has no and does not have yes: {test1}")

Total rows where Mutated all_new_binds has no and does not have yes, and mutated sequenes are all valid: 19842
Total rows where Mutated all_og_binds has no and does not have yes: 2070


In [394]:
# Cross-check on the negative table using the aggregated "all_*_binds"
# strings: count rows that do not contain "yes" and that are exactly "no" or
# contain ",no" / "no,".
all_new_binds_col_neg = merged_neg_expl_mut_filt["Mutated all_new_binds"]
new_mentions_no_neg = (
    (all_new_binds_col_neg == "no")
    | all_new_binds_col_neg.str.contains(",no")
    | all_new_binds_col_neg.str.contains("no,")
)
new_no_without_yes_neg = ~all_new_binds_col_neg.str.contains("yes") & new_mentions_no_neg
# Neither mutated sequence carries an invalid-character flag.
mutated_seqs_valid_neg = (
    merged_neg_expl_mut_filt["invalids_mutated_aa_1"].isna()
    & merged_neg_expl_mut_filt["invalids_mutated_aa_2"].isna()
)
test1 = len(merged_neg_expl_mut_filt.loc[new_no_without_yes_neg & mutated_seqs_valid_neg])
print(f"Total rows where Mutated all_new_binds has no and does not have yes, and mutated sequenes are all valid: {test1}")

all_og_binds_col_neg = merged_neg_expl_mut_filt["Mutated all_og_binds"]
og_mentions_no_neg = (
    (all_og_binds_col_neg == "no")
    | all_og_binds_col_neg.str.contains(",no")
    | all_og_binds_col_neg.str.contains("no,")
)
og_no_without_yes_neg = ~all_og_binds_col_neg.str.contains("yes") & og_mentions_no_neg
test1 = len(
    merged_neg_expl_mut_filt.loc[
        og_no_without_yes_neg,
        ["Mutated all_new_binds", "Mutated decisive_seqpair_og_binds"],
    ]
)
print(f"Total rows where Mutated all_og_binds has no and does not have yes: {test1}")

Total rows where Mutated all_new_binds has no and does not have yes, and mutated sequenes are all valid: 3
Total rows where Mutated all_og_binds has no and does not have yes: 0


In [395]:
def convert_mut_cols_to_ppi(row):
    """
    Convert a row's mutation columns into the regular PPI columns.

    For each interactor (columns suffixed ``_1``/``A`` and ``_2``/``B``) the
    result starts from the row's own sequence, length, invalid-character,
    UniProt and chain columns. When the interactor's letter occurs in
    ``row["Mutated Partner"]`` the values are replaced instead:

    * ``aa_*`` / ``invalids_aa_*`` come from ``mutated_aa_*`` /
      ``invalids_mutated_aa_*``;
    * ``length_*`` is the length of the mutated sequence (NaN when the
      mutated sequence is not a string);
    * ``uniprot_*`` gets a ``_mutated`` suffix (``None`` when the accession
      is not a string);
    * the chain start/end and the derived ``uniprot_*_...`` columns are set
      to NaN.

    A ``Mutated Partner`` value that is not a string (None, NaN, ``pd.NA``,
    ...) means "no partner is mutated" and the row's own values are returned.

    Args:
        row: mapping-like row (e.g. a ``pandas.Series`` from
            ``DataFrame.apply(axis=1)``) holding the columns named above.

    Returns:
        ``pandas.Series`` with 24 entries: the 12 interactor-1/A columns
        followed by the 12 interactor-2/B columns.
    """
    mutated_partner = row["Mutated Partner"]
    # isinstance(str) rejects every flavour of "missing" at once; the previous
    # type(...)==float test let pd.NA through to the `in` checks, which raised
    # a TypeError.
    if not isinstance(mutated_partner, str):
        mutated_partner = ""

    converted = {}
    # (numeric suffix, letter suffix) of each interactor, in output order.
    for num, letter in (("1", "A"), ("2", "B")):
        identity_cols = [
            f"aa_{num}",
            f"length_{num}",
            f"invalids_aa_{num}",
            f"uniprot_{letter}",
        ]
        # Columns that are blanked to NaN for a mutated partner.
        mapping_cols = [
            f"chain_seq_start_{num}",
            f"chain_seq_end_{num}",
            f"uniprot_{letter}_equalseq",
            f"uniprot_{letter}_equalseq_canonical",
            f"uniprot_{letter}_full",
            f"uniprot_{letter}_inseq",
            f"uniprot_{letter}_inseq_canonical",
            f"uniprot_{letter}_noiso1",
        ]

        if letter not in mutated_partner:
            # Not mutated: pass the row's own values through unchanged.
            for col in identity_cols + mapping_cols:
                converted[col] = row[col]
            continue

        mutated_seq = row[f"mutated_aa_{num}"]
        accession = row[f"uniprot_{letter}"]
        converted[f"aa_{num}"] = mutated_seq
        # A missing mutated sequence (NaN/None) has no length; avoid len(float).
        converted[f"length_{num}"] = (
            len(mutated_seq) if isinstance(mutated_seq, str) else np.nan
        )
        converted[f"invalids_aa_{num}"] = row[f"invalids_mutated_aa_{num}"]
        converted[f"uniprot_{letter}"] = (
            f"{accession}_mutated" if isinstance(accession, str) else None
        )
        for col in mapping_cols:
            converted[col] = np.nan

    return pd.Series(converted)
    

In [396]:
# Columns to carry back into simplemerged: the same names, in the same order,
# as the keys of the Series returned by convert_mut_cols_to_ppi (twelve for
# interactor 1/A, then twelve for interactor 2/B).
change_cols = [
    template.format(n=num, x=letter)
    for num, letter in ((1, "A"), (2, "B"))
    for template in (
        "aa_{n}",
        "length_{n}",
        "invalids_aa_{n}",
        "uniprot_{x}",
        "chain_seq_start_{n}",
        "chain_seq_end_{n}",
        "uniprot_{x}_equalseq",
        "uniprot_{x}_equalseq_canonical",
        "uniprot_{x}_full",
        "uniprot_{x}_inseq",
        "uniprot_{x}_inseq_canonical",
        "uniprot_{x}_noiso1",
    )
]

In [397]:
# Attach mutation information for "original pair binds" rows to simplemerged.
# Steps: select the rows, inner-merge them with simplemerged on every shared
# column, verify no unseen sequence pair appears, then left-merge the
# mutation-specific columns back onto simplemerged (result stored in `temp`).

# Rows whose decisive verdict for the original sequence pair is "yes",
# tagged so later steps know the *original* (unmutated) partner is described.
pos_og_from_mutation_data = merged_expl_mut_filt.loc[
    (merged_expl_mut_filt["Mutated decisive_seqpair_og_binds"]=="yes")
].reset_index(drop=True)
pos_og_from_mutation_data["Mutated Partner Status"] = ["original"]*len(pos_og_from_mutation_data)
print(f"Total rows where Mutated decisive_seqpair_og_binds==yes: {len(pos_og_from_mutation_data)}")

# Columns present in both frames (set intersection, so the order is arbitrary).
common_cols = list(set(pos_og_from_mutation_data.columns).intersection(set(simplemerged.columns)))

# Merge on every shared column; no column is excluded (the original note
# "merge on all of these except ..." was left unfinished).
# NOTE(review): an inner merge on all shared columns needs exact equality in
# each of them, including columns that were pipe-joined earlier - confirm
# this is the intended match criterion.
pos_og_from_mutation_data = pd.merge(
    simplemerged,
    pos_og_from_mutation_data, 
    on=common_cols, 
    how="inner"
)

# Check that nothing new was added: every sequence pair in the merged mutation
# rows should already exist in simplemerged.
# seq_sort is rebuilt here from aa_1/aa_2 with get_unique_id for both frames
# (overwriting any existing seq_sort column in simplemerged).
pos_og_from_mutation_data["seq_sort"] = pos_og_from_mutation_data.apply(lambda row: get_unique_id(row, colA="aa_1",colB="aa_2"), axis=1)
simplemerged["seq_sort"] = simplemerged.apply(lambda row: get_unique_id(row, colA="aa_1",colB="aa_2"), axis=1)
# test1 first holds the list of known seq_sort values, then is reused for the
# boolean result: True when no merged row has a seq_sort outside that list.
test1 = simplemerged["seq_sort"].unique().tolist()
test1 = len(pos_og_from_mutation_data.loc[
    ~pos_og_from_mutation_data["seq_sort"].isin(test1)
])==0
print(f"As expected, no new sequence pairs arose from pos_og_from_mutation_data: {test1}. Size of simplemerged: {len(simplemerged)}")

# Keep only the join keys plus the mutation-specific columns, de-duplicated.
temp = pos_og_from_mutation_data[[
        "unique_id",
        "interaction_intactid",
        "seq_pair_id",
        "Mutated decisive_seqpair_og_binds",
        "Mutated Partner","Mutated Partner Status", "mutation_short_1","mutation_short_2",
        "seq_sort",
        "seq_sort_og",
        "seq_sort_og_id"]].drop_duplicates().reset_index(drop=True)

## Creation of simplemerged_mut
# Goal: get mutation data into simplemerged_mut, starting with the positive OGs.
# Left merge: every simplemerged row is kept; a row is repeated once per
# distinct matching mutation record, so the result can be longer than
# simplemerged.
temp = pd.merge(
    simplemerged,
    temp, 
    on=["unique_id",
        "interaction_intactid",
        "seq_pair_id","seq_sort"], 
    how="left"
).reset_index(drop=True)
print(f"Merged in some mutation-related information. New size of dataframe = {len(temp)}")


Total rows where Mutated decisive_seqpair_og_binds==yes: 64455
As expected, no new sequence pairs arose from pos_og_from_mutation_data: True. Size of simplemerged: 743130
Merged in some mutation-related information. New size of dataframe = 779638


In [398]:
## Positive pairs found in the negatives table: (mutated-partner)'s original_sequence plus (other partner)'s og_sequence, anywhere Mutated decisive_seqpair_og_binds is "yes"
## (This cell only handles the original-sequence case; the selection below is og_binds=="yes", not "no".)
# Rows of the negative table whose decisive verdict for the original sequence
# pair is "yes", tagged as describing the *original* (unmutated) partner.
pos_og_from_mutation_neg_data = merged_neg_expl_mut_filt.loc[
    merged_neg_expl_mut_filt["Mutated decisive_seqpair_og_binds"]=="yes"
].reset_index(drop=True)
pos_og_from_mutation_neg_data["Mutated Partner Status"] = ["original"]*len(pos_og_from_mutation_neg_data)
print(f"Total rows where Mutated decisive_seqpair_og_binds==yes: {len(pos_og_from_mutation_neg_data)}")

# Columns present in both frames (set intersection, so the order is arbitrary).
common_cols = list(set(pos_og_from_mutation_neg_data.columns).intersection(set(simplemerged_neg.columns)))

# Merge on every shared column; no column is excluded (the original note
# "merge on all of these except ..." was left unfinished).
pos_og_from_mutation_neg_data = pd.merge(
    simplemerged_neg,
    pos_og_from_mutation_neg_data, 
    on=common_cols, 
    how="inner"
)

# Count how many of these "binds" rows share a sequence pair with the negative
# table. Unlike the positive-table cell, the isin mask is NOT negated here:
# test1 ends up as the number of rows whose seq_sort IS present in simplemerged_neg.
# seq_sort is rebuilt from aa_1/aa_2 with get_unique_id for both frames.
pos_og_from_mutation_neg_data["seq_sort"] = pos_og_from_mutation_neg_data.apply(lambda row: get_unique_id(row, colA="aa_1",colB="aa_2"), axis=1)
simplemerged_neg["seq_sort"] = simplemerged_neg.apply(lambda row: get_unique_id(row, colA="aa_1",colB="aa_2"), axis=1)
test1 = simplemerged_neg["seq_sort"].unique().tolist()
test1 = len(pos_og_from_mutation_neg_data.loc[
    pos_og_from_mutation_neg_data["seq_sort"].isin(test1)
])
print(f"Total positive OG interactions that are currently negative in our neg-PPI data: {test1}")

print(f"Joining in positive data from the negatives database: simplemerged_neg")
# Keep only the join keys plus the mutation-specific columns, de-duplicated.
temp = pos_og_from_mutation_neg_data[[
        "unique_id",
        "interaction_intactid",
        "seq_pair_id",
        "Mutated decisive_seqpair_og_binds",
        "Mutated Partner", "Mutated Partner Status", "mutation_short_1","mutation_short_2",
        "seq_sort",
        "seq_sort_og",
        "seq_sort_og_id"]].drop_duplicates().reset_index(drop=True)

# Left merge onto the negative table, then keep only rows that actually
# received mutation information (non-null "Mutated Partner Status").
temp = pd.merge(
    simplemerged_neg,
    temp, 
    on=["unique_id",
        "interaction_intactid",
        "seq_pair_id","seq_sort"], 
    how="left"
)
temp = temp.loc[temp["Mutated Partner Status"].notna()].drop_duplicates().reset_index(drop=True)

print(f"\tSize of simplemerged_neg_mut_pos with just original negatives: {len(temp)}")

Total rows where Mutated decisive_seqpair_og_binds==yes: 3
Total positive OG interactions that are currently negative in our neg-PPI data: 3
Joining in positive data from the negatives database: simplemerged_neg
	Size of simplemerged_neg_mut_pos with just original negatives: 3


In [399]:
# Columns present in both frames, and (sorted) the columns that only the mutation frame has.
common_cols = list(set(pos_og_from_mutation_data.columns) & set(simplemerged.columns))
l = sorted(set(pos_og_from_mutation_data.columns).difference(common_cols))
# Print the mutation-only columns as a quoted, comma-separated list (one per line).
print(",\n".join(f'"{x}"' for x in l))

"Mutated Partner",
"Mutated Partner Status",
"Mutated all_new_binds",
"Mutated all_og_binds",
"Mutated decisive_entry_new_binds",
"Mutated decisive_entry_og_binds",
"Mutated decisive_seqpair_new_binds",
"Mutated decisive_seqpair_og_binds",
"Mutation # Feature AC",
"Mutation Affected protein AC",
"Mutation Affected protein full name",
"Mutation Affected protein organism",
"Mutation Affected protein symbol",
"Mutation Feature annotation(s)",
"Mutation Feature range(s)",
"Mutation Feature short label",
"Mutation Feature type",
"Mutation Figure legend(s)",
"Mutation Interaction AC",
"Mutation Interaction participants",
"Mutation Interactor Matches",
"Mutation Original sequence",
"Mutation PubMedID",
"Mutation Resulting sequence",
"Mutation Xref ID(s)",
"Mutation new_binds_bo_ac",
"Mutation new_binds_bo_annotation",
"Mutation new_binds_bo_feature_type",
"Mutation new_binds_to_gname_bo_annotation",
"Mutation new_binds_to_uniprot_bo_annotation",
"Mutation new_nobind_to_gname_bo_annotation",
"

In [400]:
# Number of distinct combinations of the identifier columns plus the og-binding annotation columns.
len(
    pos_og_from_mutation_data.drop_duplicates(
        subset=[
            "unique_id",
            "interaction_intactid",
            "seq_pair_id",
            "Mutated decisive_seqpair_og_binds",
            "seq_sort_og",
            "seq_sort_og_id",
        ]
    )
)

27781

In [401]:
# Number of distinct (unique_id, interaction_intactid, seq_pair_id) combinations.
len(
    pos_og_from_mutation_data.drop_duplicates(
        subset=["unique_id", "interaction_intactid", "seq_pair_id"]
    )
)

27781

In [402]:
# Positive pairs built from the MUTATED sequence: rows where the mutated pair is flagged as
# binding and neither mutated sequence has an entry in the invalids_mutated_aa_* columns.
new_binds_yes = merged_expl_mut_filt["Mutated decisive_seqpair_new_binds"] == "yes"
mutated_seqs_valid = (
    merged_expl_mut_filt["invalids_mutated_aa_1"].isna()
    & merged_expl_mut_filt["invalids_mutated_aa_2"].isna()
)
pos_new_from_mutation_data = merged_expl_mut_filt[new_binds_yes & mutated_seqs_valid].reset_index(drop=True)
pos_new_from_mutation_data["Mutated Partner Status"] = "mutated"
print(f"Total rows where Mutated decisive_seqpair_new_binds==yes AND mutated sequence has all valid characters: {len(pos_new_from_mutation_data)}")

# Double-check the filter: no selected row may still carry an invalid-character entry.
still_invalid = (
    pos_new_from_mutation_data["invalids_mutated_aa_1"].notna()
    | pos_new_from_mutation_data["invalids_mutated_aa_2"].notna()
)
test1 = not still_invalid.any()
print(f"\tNo invalid characters in mutated sequences: {test1}")


# Every column shared by the two frames is used as a join key.
common_cols = list(set(pos_new_from_mutation_data.columns) & set(simplemerged.columns))

# Inner join: keep only the mutation rows that line up with an existing simplemerged row.
pos_new_from_mutation_data = simplemerged.merge(
    pos_new_from_mutation_data,
    how="inner",
    on=common_cols,
)

# Overwrite the change_cols columns row by row with the output of convert_mut_cols_to_ppi
# (helper defined elsewhere in the notebook; presumably swaps in the mutated-sequence values - confirm).
pos_new_from_mutation_data[change_cols] = pos_new_from_mutation_data.apply(convert_mut_cols_to_ppi, axis=1)

# Recompute the order-independent sequence-pair key on both frames, then count the rows
# whose key does NOT exist in simplemerged yet, i.e. genuinely new sequence pairs.
pos_new_from_mutation_data["seq_sort"] = pos_new_from_mutation_data.apply(
    get_unique_id, axis=1, colA="aa_1", colB="aa_2"
)
simplemerged["seq_sort"] = simplemerged.apply(
    get_unique_id, axis=1, colA="aa_1", colB="aa_2"
)
known_seq_keys = simplemerged["seq_sort"].unique().tolist()
test1 = int((~pos_new_from_mutation_data["seq_sort"].isin(known_seq_keys)).sum())
print(f"Total new interactions (by seq-pair; new sequence 1 + sequence 2) added from mutation data: {test1}")

Total rows where Mutated decisive_seqpair_new_binds==yes AND mutated sequence has all valid characters: 45237
	No invalid characters in mutated sequences: True
Total new interactions (by seq-pair; new sequence 1 + sequence 2) added from mutation data: 45237


In [403]:
# Show the two seq-pair binding flags (mutated vs. original) for the negatives-database mutation rows.
display(
    merged_neg_expl_mut_filt.loc[
        :, ["Mutated decisive_seqpair_new_binds", "Mutated decisive_seqpair_og_binds"]
    ]
)

Unnamed: 0,Mutated decisive_seqpair_new_binds,Mutated decisive_seqpair_og_binds
0,no,yes
1,no,yes
2,unknown,unknown
3,no,yes
4,unknown,unknown
5,unknown,unknown
6,unknown,unknown
7,unknown,unknown
8,unknown,unknown
9,unknown,unknown


In [404]:
# Positive pairs built from the MUTATED sequence, taken from the NEGATIVES database: rows where
# the mutated pair is flagged as binding and neither mutated sequence has an entry in the
# invalids_mutated_aa_* columns.
new_binds_yes = merged_neg_expl_mut_filt["Mutated decisive_seqpair_new_binds"] == "yes"
mutated_seqs_valid = (
    merged_neg_expl_mut_filt["invalids_mutated_aa_1"].isna()
    & merged_neg_expl_mut_filt["invalids_mutated_aa_2"].isna()
)
pos_new_from_mutation_neg_data = merged_neg_expl_mut_filt[new_binds_yes & mutated_seqs_valid].reset_index(drop=True)
pos_new_from_mutation_neg_data["Mutated Partner Status"] = "mutated"
print(f"Total rows where Mutated decisive_seqpair_new_binds==yes AND mutated sequence has all valid characters: {len(pos_new_from_mutation_neg_data)}")

# Double-check the filter: no selected row may still carry an invalid-character entry.
still_invalid = (
    pos_new_from_mutation_neg_data["invalids_mutated_aa_1"].notna()
    | pos_new_from_mutation_neg_data["invalids_mutated_aa_2"].notna()
)
test1 = not still_invalid.any()
print(f"\tNo invalid characters in mutated sequences: {test1}")


# Every column shared by the two frames is used as a join key.
common_cols = list(set(pos_new_from_mutation_neg_data.columns) & set(simplemerged_neg.columns))

# Inner join: keep only the mutation rows that line up with an existing simplemerged_neg row.
pos_new_from_mutation_neg_data = simplemerged_neg.merge(
    pos_new_from_mutation_neg_data,
    how="inner",
    on=common_cols,
)

# Only rewrite the change_cols columns when there is at least one row to convert.
if len(pos_new_from_mutation_neg_data):
    pos_new_from_mutation_neg_data[change_cols] = pos_new_from_mutation_neg_data.apply(
        convert_mut_cols_to_ppi, axis=1
    )

# Recompute the order-independent sequence-pair key on both frames, then count the rows
# whose key does NOT exist in simplemerged_neg yet.
pos_new_from_mutation_neg_data["seq_sort"] = pos_new_from_mutation_neg_data.apply(
    get_unique_id, axis=1, colA="aa_1", colB="aa_2"
)
simplemerged_neg["seq_sort"] = simplemerged_neg.apply(
    get_unique_id, axis=1, colA="aa_1", colB="aa_2"
)
known_neg_seq_keys = simplemerged_neg["seq_sort"].unique().tolist()
test1 = int((~pos_new_from_mutation_neg_data["seq_sort"].isin(known_neg_seq_keys)).sum())
print(f"Total new interactions (by seq-pair; new sequence 1 + sequence 2) added from mutation data: {test1}")

Total rows where Mutated decisive_seqpair_new_binds==yes AND mutated sequence has all valid characters: 0
	No invalid characters in mutated sequences: True
Total new interactions (by seq-pair; new sequence 1 + sequence 2) added from mutation data: 0


In [405]:
## Unknown pairs (positives database): the mutated partner's ORIGINAL sequence with the other
## partner's original sequence, wherever "Mutated decisive_seqpair_og_binds" is "unknown".
og_unknown = merged_expl_mut_filt["Mutated decisive_seqpair_og_binds"] == "unknown"
unknown_og_from_mutation_data = merged_expl_mut_filt[og_unknown].reset_index(drop=True)
unknown_og_from_mutation_data["Mutated Partner Status"] = "original"
print(f"Total rows where Mutated decisive_seqpair_og_binds==unknown: {len(unknown_og_from_mutation_data)}")

# Every column shared by the two frames is used as a join key.
common_cols = list(set(unknown_og_from_mutation_data.columns) & set(simplemerged.columns))

# Inner join: keep only the mutation rows that line up with an existing simplemerged row.
unknown_og_from_mutation_data = simplemerged.merge(
    unknown_og_from_mutation_data,
    how="inner",
    on=common_cols,
)

# Recompute the order-independent sequence-pair key on both frames, then count the rows
# whose key is already present in simplemerged (i.e. currently treated as positive).
unknown_og_from_mutation_data["seq_sort"] = unknown_og_from_mutation_data.apply(
    get_unique_id, axis=1, colA="aa_1", colB="aa_2"
)
simplemerged["seq_sort"] = simplemerged.apply(
    get_unique_id, axis=1, colA="aa_1", colB="aa_2"
)
known_seq_keys = simplemerged["seq_sort"].unique().tolist()
test1 = int(unknown_og_from_mutation_data["seq_sort"].isin(known_seq_keys).sum())
print(f"Total unknown OG interactions that are currently positive in our PPI data: {test1}")

Total rows where Mutated decisive_seqpair_og_binds==unknown: 5578
Total unknown OG interactions that are currently positive in our PPI data: 5578


In [406]:
## Unknown pairs (negatives database): the mutated partner's ORIGINAL sequence with the other
## partner's original sequence, wherever "Mutated decisive_seqpair_og_binds" is "unknown".
og_unknown = merged_neg_expl_mut_filt["Mutated decisive_seqpair_og_binds"] == "unknown"
unknown_og_from_mutation_neg_data = merged_neg_expl_mut_filt[og_unknown].reset_index(drop=True)
unknown_og_from_mutation_neg_data["Mutated Partner Status"] = "original"
print(f"Total rows where Mutated decisive_seqpair_og_binds==unknown: {len(unknown_og_from_mutation_neg_data)}")

# Every column shared by the two frames is used as a join key.
common_cols = list(set(unknown_og_from_mutation_neg_data.columns) & set(simplemerged_neg.columns))

# Inner join: keep only the mutation rows that line up with an existing simplemerged_neg row.
unknown_og_from_mutation_neg_data = simplemerged_neg.merge(
    unknown_og_from_mutation_neg_data,
    how="inner",
    on=common_cols,
)

# Recompute the order-independent sequence-pair key on both frames, then count the rows
# whose key is already present in simplemerged_neg (i.e. currently treated as negative).
unknown_og_from_mutation_neg_data["seq_sort"] = unknown_og_from_mutation_neg_data.apply(
    get_unique_id, axis=1, colA="aa_1", colB="aa_2"
)
simplemerged_neg["seq_sort"] = simplemerged_neg.apply(
    get_unique_id, axis=1, colA="aa_1", colB="aa_2"
)
known_neg_seq_keys = simplemerged_neg["seq_sort"].unique().tolist()
test1 = int(unknown_og_from_mutation_neg_data["seq_sort"].isin(known_neg_seq_keys).sum())
print(f"Total unknown OG interactions that are currently negative in our PPI data: {test1}")

Total rows where Mutated decisive_seqpair_og_binds==unknown: 10
Total unknown OG interactions that are currently negative in our PPI data: 10


In [407]:
## Unknown pairs (positives database): the mutated partner's MUTATED sequence with the other
## partner's original sequence, wherever "Mutated decisive_seqpair_new_binds" is "unknown".
new_unknown = merged_expl_mut_filt["Mutated decisive_seqpair_new_binds"] == "unknown"
unknown_new_from_mutation_data = merged_expl_mut_filt[new_unknown].reset_index(drop=True)
unknown_new_from_mutation_data["Mutated Partner Status"] = "mutated"
print(f"Total rows where Mutated decisive_seqpair_new_binds==unknown: {len(unknown_new_from_mutation_data)}")

# Every column shared by the two frames is used as a join key.
common_cols = list(set(unknown_new_from_mutation_data.columns) & set(simplemerged.columns))

# Inner join: keep only the mutation rows that line up with an existing simplemerged row.
unknown_new_from_mutation_data = simplemerged.merge(
    unknown_new_from_mutation_data,
    how="inner",
    on=common_cols,
)

# Overwrite the change_cols columns row by row with the output of convert_mut_cols_to_ppi
# (helper defined elsewhere in the notebook; presumably swaps in the mutated-sequence values - confirm).
unknown_new_from_mutation_data[change_cols] = unknown_new_from_mutation_data.apply(
    convert_mut_cols_to_ppi, axis=1
)

# Recompute the order-independent sequence-pair key on both frames, then count the rows
# whose key is already present in simplemerged (i.e. currently treated as positive).
unknown_new_from_mutation_data["seq_sort"] = unknown_new_from_mutation_data.apply(
    get_unique_id, axis=1, colA="aa_1", colB="aa_2"
)
simplemerged["seq_sort"] = simplemerged.apply(
    get_unique_id, axis=1, colA="aa_1", colB="aa_2"
)
known_seq_keys = simplemerged["seq_sort"].unique().tolist()
test1 = int(unknown_new_from_mutation_data["seq_sort"].isin(known_seq_keys).sum())
print(f"Total unknown mutated interactions that are currently positive in our PPI data: {test1}")

Total rows where Mutated decisive_seqpair_new_binds==unknown: 8110
Total unknown mutated interactions that are currently positive in our PPI data: 0


In [408]:
## Unknown pairs (negatives database): the mutated partner's MUTATED sequence with the other
## partner's original sequence, wherever "Mutated decisive_seqpair_new_binds" is "unknown".
unknown_new_from_mutation_neg_data = merged_neg_expl_mut_filt.loc[
    merged_neg_expl_mut_filt["Mutated decisive_seqpair_new_binds"]=="unknown"
].reset_index(drop=True)
unknown_new_from_mutation_neg_data["Mutated Partner Status"] = ["mutated"]*len(unknown_new_from_mutation_neg_data)
print(f"Total rows where Mutated decisive_seqpair_new_binds==unknown: {len(unknown_new_from_mutation_neg_data)}")

# Every column shared by the two frames is used as a join key.
common_cols = list(set(unknown_new_from_mutation_neg_data.columns).intersection(set(simplemerged_neg.columns)))

# Inner join: keep only the mutation rows that line up with an existing simplemerged_neg row.
unknown_new_from_mutation_neg_data = pd.merge(
    simplemerged_neg,
    unknown_new_from_mutation_neg_data,
    on=common_cols,
    how="inner"
)

# Overwrite the change_cols columns with the output of convert_mut_cols_to_ppi (helper defined
# elsewhere in the notebook). The negatives database is tiny, so the join can come back empty;
# skip the multi-column assignment in that case, mirroring the guard already used for
# pos_new_from_mutation_neg_data. NOTE(review): assumes the assignment fails on an empty frame.
if len(unknown_new_from_mutation_neg_data)>0:
    unknown_new_from_mutation_neg_data[
        change_cols
    ] = unknown_new_from_mutation_neg_data.apply(lambda row: convert_mut_cols_to_ppi(row),axis=1)

# Recompute the order-independent sequence-pair key on both frames, then count the rows
# whose key is already present in simplemerged_neg (i.e. currently treated as negative).
unknown_new_from_mutation_neg_data["seq_sort"] = unknown_new_from_mutation_neg_data.apply(lambda row: get_unique_id(row, colA="aa_1",colB="aa_2"), axis=1)
simplemerged_neg["seq_sort"] = simplemerged_neg.apply(lambda row: get_unique_id(row, colA="aa_1",colB="aa_2"), axis=1)
test1 = simplemerged_neg["seq_sort"].unique().tolist()
test1 = len(unknown_new_from_mutation_neg_data.loc[
    unknown_new_from_mutation_neg_data["seq_sort"].isin(test1)
])
print(f"Total unknown mutated interactions that are currently negative in our PPI data: {test1}")

Total rows where Mutated decisive_seqpair_new_binds==unknown: 10
Total unknown mutated interactions that are currently negative in our PPI data: 0


In [409]:
# Assemble simplemerged_mut_unknown: every "unknown" mutation row, from both the positives
# database (simplemerged) and the negatives database (simplemerged_neg).
mutation_info_cols = [
    "unique_id",
    "interaction_intactid",
    "seq_pair_id",
    "Mutated decisive_seqpair_og_binds",
    "Mutated Partner",
    "Mutated Partner Status",
    "mutation_short_1",
    "mutation_short_2",
    "seq_sort",
    "seq_sort_og",
    "seq_sort_og_id",
]
mutation_merge_keys = ["unique_id", "interaction_intactid", "seq_pair_id", "seq_sort"]

print("Working with unknown data from positives: simplemerged")
# Original-sequence unknowns: attach the mutation info to simplemerged and keep matched rows only.
temp = unknown_og_from_mutation_data[mutation_info_cols].drop_duplicates().reset_index(drop=True)
simplemerged_mut_unknown = simplemerged.merge(temp, how="left", on=mutation_merge_keys)
simplemerged_mut_unknown = (
    simplemerged_mut_unknown[simplemerged_mut_unknown["Mutated Partner Status"].notna()]
    .drop_duplicates()
    .reset_index(drop=True)
)

print(f"\tMerged in mutation-related information when UNKNOWN if original sequence binds. Size of mutated unknown og sequence dataframe = {len(simplemerged_mut_unknown)}")

# Mutated-sequence unknowns are stacked underneath.
simplemerged_mut_unknown = (
    pd.concat([simplemerged_mut_unknown, unknown_new_from_mutation_data])
    .drop_duplicates()
    .reset_index(drop=True)
)

print(f"\tMerged in mutation-related information when UNKNOWN if new sequence binds. Size of mutated unknown mutated sequence dataframe = {len(simplemerged_mut_unknown)}")

# Same two steps for the negatives database.
print("Working with unknown data from negatives: simplemerged_neg")
temp = unknown_og_from_mutation_neg_data[mutation_info_cols].drop_duplicates().reset_index(drop=True)
simplemerged_neg_mut_unknown = simplemerged_neg.merge(temp, how="left", on=mutation_merge_keys)
simplemerged_neg_mut_unknown = (
    simplemerged_neg_mut_unknown[simplemerged_neg_mut_unknown["Mutated Partner Status"].notna()]
    .drop_duplicates()
    .reset_index(drop=True)
)

print(f"\tMerged in mutation-related information from the NEGATIVES database when UNKNOWN if original sequence binds. Size of mutated unknown og sequence dataframe = {len(simplemerged_neg_mut_unknown)}")

simplemerged_neg_mut_unknown = (
    pd.concat([simplemerged_neg_mut_unknown, unknown_new_from_mutation_neg_data])
    .drop_duplicates()
    .reset_index(drop=True)
)

print(f"\tMerged in mutation-related information from the NEGATIVES database when UNKNOWN if new sequence binds. Size of mutated unknown mutated sequence dataframe = {len(simplemerged_neg_mut_unknown)}")

# Finally combine the positives-derived and negatives-derived unknowns into one frame.
simplemerged_mut_unknown = (
    pd.concat([simplemerged_mut_unknown, simplemerged_neg_mut_unknown])
    .drop_duplicates()
    .reset_index(drop=True)
)

print(f"Combined simplemerged_neg_mut_unknown with simplemerged_mut_unknown. Size of mutated unknown mutated sequence dataframe = {len(simplemerged_mut_unknown)}")

Working with unknown data from positives: simplemerged
	Merged in mutation-related information when UNKNOWN if original sequence binds. Size of mutated unknown og sequence dataframe = 5575
	Merged in mutation-related information when UNKNOWN if new sequence binds. Size of mutated unknown mutated sequence dataframe = 13685
Working with unknown data from negatives: simplemerged_neg
	Merged in mutation-related information from the NEGATIVES database when UNKNOWN if original sequence binds. Size of mutated unknown og sequence dataframe = 10
	Merged in mutation-related information from the NEGATIVES database when UNKNOWN if new sequence binds. Size of mutated unknown mutated sequence dataframe = 20
Combined simplemerged_neg_mut_unknown with simplemerged_mut_unknown. Size of mutated unknown mutated sequence dataframe = 13705


In [410]:
# Eyeball ten random rows of the identifier / mutation-annotation columns.
simplemerged_mut_unknown.loc[
    :,
    [
        "unique_id",
        "interaction_intactid",
        "seq_pair_id",
        "Mutated decisive_seqpair_og_binds",
        "Mutated decisive_seqpair_new_binds",
        "Mutated Partner",
        "Mutated Partner Status",
        "mutation_short_1",
        "mutation_short_2",
        "seq_sort",
        "seq_sort_og",
        "seq_sort_og_id",
    ],
].sample(n=10)

Unnamed: 0,unique_id,interaction_intactid,seq_pair_id,Mutated decisive_seqpair_og_binds,Mutated decisive_seqpair_new_binds,Mutated Partner,Mutated Partner Status,mutation_short_1,mutation_short_2,seq_sort,seq_sort_og,seq_sort_og_id
4723,intact:EBI-296087_intact:EBI-365980,EBI-63962426,seqpair260740,unknown,,B,original,,P15056:p.Val600Glu,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,MAALSGGGGGGAEPGQALFNGDMEPEAGAGAGAAASSAADPAIPEE...,seq_sort_og_1209
2914,intact:EBI-1391211_intact:EBI-25474821,EBI-27021086,seqpair119737,unknown,,A,original,P0DTC2:p.[Arg682_Arg685delinsGlySerAlaSer;Lys9...,,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFRSS...,seq_sort_og_10893
10211,intact:EBI-22304327_intact:EBI-25648504,EBI-25648433,seqpair209859,unknown,unknown,B,mutated,,A0A0M5L610:p.[Gln164Pro;Glu168Lys;Arg184Leu],MAAALFVLLGFALLGTHGASGAAGTVFTTVEDLGSKILLTCSLNDS...,MAAALFVLLGFALLGTHGASGAAGTVFTTVEDLGSKILLTCSLNDS...,seq_sort_og_472
6832,intact:EBI-466029_intact:EBI-745901,EBI-25965131,seqpair373791,yes,unknown,A,mutated,P42858:p.Gln18[20],,MAATDIARQVGEGCRTVPLAGHVGFDSLPDQLVNKSVSQGFCFNIL...,MAATDIARQVGEGCRTVPLAGHVGFDSLPDQLVNKSVSQGFCFNIL...,seq_sort_og_1610
12942,intact:EBI-750444_intact:EBI-79893,EBI-10700715,seqpair414141,unknown,unknown,B,mutated,,Q92569:p.Arg383Lys,MSSAPAPGPAPASLTLWDEEDFQGRRCRLLSDCANVCERGGLPRVR...,MSSAPAPGPAPASLTLWDEEDFQGRRCRLLSDCANVCERGGLPRVR...,seq_sort_og_16251
1821,intact:EBI-4397791_intact:EBI-466029,EBI-9051484,seqpair365408,unknown,,A,original,P42858:p.Gln18[55],,MATLEKLMKAFESLKSFQQQQQQQQQQQQQQQQQQQQQPPPPPPPP...,MATLEKLMKAFESLKSFQQQQQQQQQQQQQQQQQQQQQPPPPPPPP...,seq_sort_og_6124
11893,intact:EBI-349854_intact:EBI-723716,EBI-40235747,seqpair285200,unknown,unknown,A,mutated,P13569:p.Phe508del,,MASSAASSEHFEKLHEIFRGLHEDLQGVPERLLGTAGTEEKKKLIR...,MASSAASSEHFEKLHEIFRGLHEDLQGVPERLLGTAGTEEKKKLIR...,seq_sort_og_5446
349,intact:EBI-12512266_intact:EBI-12595681,EBI-12595757,seqpair100835,unknown,,B,original,,Q764M6:p.[Ser394Glu;Ser396Glu;Ser400Glu;Thr402...,MAENLLDGPPNPKRAKLSSPGFSANDSTDFGSLFDLENDLPDELIP...,MAENLLDGPPNPKRAKLSSPGFSANDSTDFGSLFDLENDLPDELIP...,seq_sort_og_2555
12111,intact:EBI-287394_intact:EBI-6174091,EBI-16094866,seqpair256064,yes,unknown,A,mutated,P60953-2:p.Gly12Val,,MQTIKCVVVGDVAVGKTCLLISYTTNKFPSEYVPTVFDNYAVTVMI...,MQTIKCVVVGDGAVGKTCLLISYTTNKFPSEYVPTVFDNYAVTVMI...,seq_sort_og_14977
8778,intact:EBI-192200_intact:EBI-868840,EBI-38259158,seqpair191343,yes,unknown,B,mutated,,Q24478:p.Leu118Ala,MDAQPSYPDLATLCRLCLKEHQDAYAIFDEDDTQLSIPVRLMACVA...,MDAQPSYPDLATLCRLCLKEHQDAYAIFDEDDTQLSIPVRLMACVA...,seq_sort_og_6980


In [411]:
## Negative pairs found in the POSITIVES database: the mutated partner's ORIGINAL sequence with the
## other partner's original sequence, wherever "Mutated decisive_seqpair_og_binds" is "no".
og_binds_no = merged_expl_mut_filt["Mutated decisive_seqpair_og_binds"] == "no"
neg_og_from_mutation_data = merged_expl_mut_filt[og_binds_no].reset_index(drop=True)
neg_og_from_mutation_data["Mutated Partner Status"] = "original"
print(f"Total rows where Mutated decisive_seqpair_og_binds==no: {len(neg_og_from_mutation_data)}")

# Every column shared by the two frames is used as a join key.
common_cols = list(set(neg_og_from_mutation_data.columns) & set(simplemerged.columns))

# Inner join: keep only the mutation rows that line up with an existing simplemerged row.
neg_og_from_mutation_data = simplemerged.merge(
    neg_og_from_mutation_data,
    how="inner",
    on=common_cols,
)

# Recompute the order-independent sequence-pair key on both frames, then count the rows
# whose key is already present in simplemerged (i.e. currently treated as positive).
neg_og_from_mutation_data["seq_sort"] = neg_og_from_mutation_data.apply(
    get_unique_id, axis=1, colA="aa_1", colB="aa_2"
)
simplemerged["seq_sort"] = simplemerged.apply(
    get_unique_id, axis=1, colA="aa_1", colB="aa_2"
)
known_seq_keys = simplemerged["seq_sort"].unique().tolist()
test1 = int(neg_og_from_mutation_data["seq_sort"].isin(known_seq_keys).sum())
print(f"Total negative OG interactions that are currently positive in our PPI data: {test1}")

print("Joining in negative data from the positives database: simplemerged")
# Identifier + mutation-annotation columns only, de-duplicated.
mutation_info_cols = [
    "unique_id",
    "interaction_intactid",
    "seq_pair_id",
    "Mutated decisive_seqpair_og_binds",
    "Mutated Partner",
    "Mutated Partner Status",
    "mutation_short_1",
    "mutation_short_2",
    "seq_sort",
    "seq_sort_og",
    "seq_sort_og_id",
]
mutation_info = neg_og_from_mutation_data[mutation_info_cols].drop_duplicates().reset_index(drop=True)

# Left-join onto simplemerged, then keep only rows that actually received mutation info.
temp = simplemerged.merge(
    mutation_info,
    how="left",
    on=["unique_id", "interaction_intactid", "seq_pair_id", "seq_sort"],
)
temp = temp[temp["Mutated Partner Status"].notna()].drop_duplicates().reset_index(drop=True)

print(f"\tSize of simplemerged_mut_neg with just original negatives: {len(temp)}")

Total rows where Mutated decisive_seqpair_og_binds==no: 1302
Total negative OG interactions that are currently positive in our PPI data: 1302
Joining in negative data from the positives database: simplemerged
	Size of simplemerged_mut_neg with just original negatives: 1302


In [412]:
# Negative pairs found in the NEGATIVES database: rows of merged_neg_expl_mut_filt whose
# original (un-mutated) sequence pair is flagged as non-binding
# ("Mutated decisive_seqpair_og_binds" == "no").
og_binds_no = merged_neg_expl_mut_filt["Mutated decisive_seqpair_og_binds"] == "no"
neg_og_from_mutation_neg_data = merged_neg_expl_mut_filt[og_binds_no].reset_index(drop=True)
neg_og_from_mutation_neg_data["Mutated Partner Status"] = "original"
print(f"Total rows where Mutated decisive_seqpair_og_binds==no: {len(neg_og_from_mutation_neg_data)}")

# Every column shared by the two frames is used as a join key.
common_cols = list(set(neg_og_from_mutation_neg_data.columns) & set(simplemerged_neg.columns))

# Inner join: keep only the mutation rows that line up with an existing simplemerged_neg row.
neg_og_from_mutation_neg_data = simplemerged_neg.merge(
    neg_og_from_mutation_neg_data,
    how="inner",
    on=common_cols,
)

# Recompute the order-independent sequence-pair key on both frames and verify that every
# selected row's key is already present in simplemerged_neg (no new sequence pairs expected).
neg_og_from_mutation_neg_data["seq_sort"] = neg_og_from_mutation_neg_data.apply(
    get_unique_id, axis=1, colA="aa_1", colB="aa_2"
)
simplemerged_neg["seq_sort"] = simplemerged_neg.apply(
    get_unique_id, axis=1, colA="aa_1", colB="aa_2"
)
known_neg_seq_keys = simplemerged_neg["seq_sort"].unique().tolist()
test1 = bool(neg_og_from_mutation_neg_data["seq_sort"].isin(known_neg_seq_keys).all())
print(f"As expected, no new sequence pairs arose from neg_og_from_mutation_neg_data: {test1}. Size of simplemerged_neg: {len(simplemerged_neg)}")

# Identifier + mutation-annotation columns only, de-duplicated.
mutation_info_cols = [
    "unique_id",
    "interaction_intactid",
    "seq_pair_id",
    "Mutated decisive_seqpair_og_binds",
    "Mutated Partner",
    "Mutated Partner Status",
    "mutation_short_1",
    "mutation_short_2",
    "seq_sort",
    "seq_sort_og",
    "seq_sort_og_id",
]
mutation_info = neg_og_from_mutation_neg_data[mutation_info_cols].drop_duplicates().reset_index(drop=True)

## Creation of simplemerged_neg_mut
# Goal: get mutation data into simplemerged_neg_mut, starting with the negative OGs.
# The left merge keeps every simplemerged_neg row; unmatched rows get NaN in the mutation columns.
temp = simplemerged_neg.merge(
    mutation_info,
    how="left",
    on=["unique_id", "interaction_intactid", "seq_pair_id", "seq_sort"],
).reset_index(drop=True)
print(f"Merged in some mutation-related information. New size of dataframe = {len(temp)}")

Total rows where Mutated decisive_seqpair_og_binds==no: 0
As expected, no new sequence pairs arose from neg_og_from_mutation_neg_data: True. Size of simplemerged_neg: 969
Merged in some mutation-related information. New size of dataframe = 969


In [413]:
# Export the mutation annotations and sequences of the negative original-sequence
# interactions (neg_og_from_mutation_data) to a CSV file in the working directory.
# Fix: the column list previously named "mutation_new_2" twice and never "mutation_new_1",
# which duplicated one column in the CSV and dropped partner 1's new residues.
# NOTE(review): assumes a "mutation_new_1" column exists alongside "mutation_new_2" - confirm.
neg_og_from_mutation_data[
    [
        "mutation_mi_1", "mutation_mi_2",
        "mutation_range_1", "mutation_range_2",
        "mutation_orig_1", "mutation_new_1",
        "mutation_orig_2", "mutation_new_2",
        "Mutation Feature annotation(s)",
        "Mutation Feature range(s)",
        "Mutation Feature short label",
        "Mutation Feature type",
        "Mutation Figure legend(s)",
        "Mutation # Feature AC",
        "aa_1", "aa_2", "mutated_aa_1", "mutated_aa_2",
    ]
].to_csv("negative_mutation_interactions_dec11_2025.csv", index=False)

In [414]:
## Negative pairs found in the POSITIVES database: the mutated partner's MUTATED sequence with the
## other partner's original sequence, wherever "Mutated decisive_seqpair_new_binds" is "no"
## and neither mutated sequence has an entry in the invalids_mutated_aa_* columns.
new_binds_no = merged_expl_mut_filt["Mutated decisive_seqpair_new_binds"] == "no"
mutated_seqs_valid = (
    merged_expl_mut_filt["invalids_mutated_aa_1"].isna()
    & merged_expl_mut_filt["invalids_mutated_aa_2"].isna()
)
neg_new_from_mutation_data = merged_expl_mut_filt[new_binds_no & mutated_seqs_valid].reset_index(drop=True)
neg_new_from_mutation_data["Mutated Partner Status"] = "mutated"
print(f"Total rows where Mutated decisive_seqpair_new_binds==no: {len(neg_new_from_mutation_data)}")

# Every column shared by the two frames is used as a join key.
common_cols = list(set(neg_new_from_mutation_data.columns) & set(simplemerged.columns))

# Inner join: keep only the mutation rows that line up with an existing simplemerged row.
neg_new_from_mutation_data = simplemerged.merge(
    neg_new_from_mutation_data,
    how="inner",
    on=common_cols,
)

# Overwrite the change_cols columns row by row with the output of convert_mut_cols_to_ppi
# (helper defined elsewhere in the notebook; presumably swaps in the mutated-sequence values - confirm).
neg_new_from_mutation_data[change_cols] = neg_new_from_mutation_data.apply(
    convert_mut_cols_to_ppi, axis=1
)

# Recompute the order-independent sequence-pair key on both frames, then count the rows
# whose key is already present in simplemerged (i.e. currently treated as positive).
neg_new_from_mutation_data["seq_sort"] = neg_new_from_mutation_data.apply(
    get_unique_id, axis=1, colA="aa_1", colB="aa_2"
)
simplemerged["seq_sort"] = simplemerged.apply(
    get_unique_id, axis=1, colA="aa_1", colB="aa_2"
)
known_seq_keys = simplemerged["seq_sort"].unique().tolist()
test1 = int(neg_new_from_mutation_data["seq_sort"].isin(known_seq_keys).sum())
print(f"Total negative MUTATED interactions that are currently positive in our PPI data: {test1}")



Total rows where Mutated decisive_seqpair_new_binds==no: 17970
Total negative MUTATED interactions that are currently positive in our PPI data: 0


In [415]:
## Negative pairs found in the NEGATIVES database: the mutated partner's MUTATED sequence with the
## other partner's original sequence, wherever "Mutated decisive_seqpair_new_binds" is "no"
## and neither mutated sequence has an entry in the invalids_mutated_aa_* columns.
neg_new_from_mutation_neg_data = merged_neg_expl_mut_filt.loc[
    (merged_neg_expl_mut_filt["Mutated decisive_seqpair_new_binds"]=="no") &
    # but the new sequence is not invalid!!
    ~(
        (merged_neg_expl_mut_filt["invalids_mutated_aa_1"].notna()) |
        (merged_neg_expl_mut_filt["invalids_mutated_aa_2"].notna())
    )
].reset_index(drop=True)
neg_new_from_mutation_neg_data["Mutated Partner Status"] = ["mutated"]*len(neg_new_from_mutation_neg_data)
print(f"Total rows where Mutated decisive_seqpair_new_binds==no: {len(neg_new_from_mutation_neg_data)}")

# Every column shared by the two frames is used as a join key.
common_cols = list(set(neg_new_from_mutation_neg_data.columns).intersection(set(simplemerged_neg.columns)))

# Inner join: keep only the mutation rows that line up with an existing simplemerged_neg row.
neg_new_from_mutation_neg_data = pd.merge(
    simplemerged_neg,
    neg_new_from_mutation_neg_data,
    on=common_cols,
    how="inner"
)

# Overwrite the change_cols columns with the output of convert_mut_cols_to_ppi (helper defined
# elsewhere in the notebook). The negatives database is tiny, so the join can come back empty;
# skip the multi-column assignment in that case, mirroring the guard already used for
# pos_new_from_mutation_neg_data. NOTE(review): assumes the assignment fails on an empty frame.
if len(neg_new_from_mutation_neg_data)>0:
    neg_new_from_mutation_neg_data[
        change_cols
    ] = neg_new_from_mutation_neg_data.apply(lambda row: convert_mut_cols_to_ppi(row),axis=1)

# Recompute the order-independent sequence-pair key on both frames, then count the rows
# whose key is already present in simplemerged_neg (i.e. currently treated as negative).
neg_new_from_mutation_neg_data["seq_sort"] = neg_new_from_mutation_neg_data.apply(lambda row: get_unique_id(row, colA="aa_1",colB="aa_2"), axis=1)
simplemerged_neg["seq_sort"] = simplemerged_neg.apply(lambda row: get_unique_id(row, colA="aa_1",colB="aa_2"), axis=1)
test1 = simplemerged_neg["seq_sort"].unique().tolist()
test1 = len(neg_new_from_mutation_neg_data.loc[
    neg_new_from_mutation_neg_data["seq_sort"].isin(test1)
])
# Message fixed: this check runs against simplemerged_neg, i.e. the NEGATIVE PPI data.
print(f"Total negative MUTATED interactions that are currently negative in our PPI data: {test1}")



Total rows where Mutated decisive_seqpair_new_binds==no: 3
Total negative MUTATED interactions that are currently positive in our PPI data: 0


In [416]:
# Render both positive-from-negatives frames, mutated-sequence rows first, then original-sequence rows.
for frame_to_show in (pos_new_from_mutation_neg_data, pos_og_from_mutation_neg_data):
    display(frame_to_show)

Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,Mutated all_og_binds,Mutated decisive_entry_new_binds,Mutated decisive_entry_og_binds,seq_sort_og,seq_sort_new,seq_sort_og_id,seq_sort_new_id,Mutated decisive_seqpair_og_binds,Mutated decisive_seqpair_new_binds,Mutated Partner Status


Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,Mutated all_og_binds,Mutated decisive_entry_new_binds,Mutated decisive_entry_og_binds,seq_sort_og,seq_sort_new,seq_sort_og_id,seq_sort_new_id,Mutated decisive_seqpair_og_binds,Mutated decisive_seqpair_new_binds,Mutated Partner Status
0,True,MGKKTKRTADSSSSEDEEEYVVEKVLDRRMVKGQVEYLLKWKGFSE...,MAENLKGCSVCCKSSWNQLQDLCRLAKLSCPALGVSKKNLYDFEVE...,,,intact:EBI-307973,intact:EBI-302230,,,,...,yes,no,yes,MAENLKGCSVCCKSSWNQLQDLCRLAKLSCPALGVSKKNLYDFEVE...,MAENLKGCSVCCKSSWNQLQDLCRLAKLSCPALGVSKKNLYDFEVE...,seq_sort_og_3,seq_sort_new_3,yes,no,original
1,True,MGKKTKRTADSSSSEDEEEYVVEKVLDRRMVKGQVEYLLKWKGFSE...,MGKKTKRTADSSSSEDEEEYVVEKVLDRRMVKGQVEYLLKWKGFSE...,,,intact:EBI-307973,intact:EBI-307973,,,,...,yes,no,yes,MGKKTKRTADSSSSEDEEEYVVEKVLDRRMVKGQVEYLLKWKGFSE...,MGKKTKRTADSSSSEDEEEYVVEKVLDRRMVKGQVEYLLKWKGFSE...,seq_sort_og_5,seq_sort_new_5,yes,no,original
2,True,MTDQTYCDRLVQDTPFLTGHGRLSEQQVDRIILQLNRYYPQILTNK...,MEPTAPSLTEEDLTEVKKDALENLRVYLCEKIIAERHFDHLRAKKI...,,,intact:EBI-7443927,intact:EBI-958922,,,,...,yes,no,yes,MEPTAPSLTEEDLTEVKKDALENLRVYLCEKIIAERHFDHLRAKKI...,MEPTAPSLTEEDLTEVKKDALENLRVYLCEKIIAERHFDHLRAKKI...,seq_sort_og_4,seq_sort_new_4,yes,no,original


In [417]:
# Preview the (interaction_intactid, unique_id) columns, then list rows that repeat that pair.
# A notebook only renders the last expression of a cell, so the preview must be displayed
# explicitly; as a bare expression its result was computed and thrown away.
display(simplemerged[["interaction_intactid","unique_id"]].head())
simplemerged.loc[simplemerged.duplicated(["interaction_intactid","unique_id"])]

Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,all_binding_mi_1,all_binding_name_1,all_binding_short_1,all_binding_begin_1,all_binding_end_1,all_binding_mi_2,all_binding_name_2,all_binding_short_2,all_binding_begin_2,all_binding_end_2


In [418]:
# The positives found while processing the negative dataset must be flagged as non-negative.
for _frame in (pos_new_from_mutation_neg_data, pos_og_from_mutation_neg_data):
    _frame["Negative"] = False

# Collect the interaction_intactid + unique_id keys of the "og" positive frames. Rows of
# simplemerged with these keys are removed first because the frames are concatenated back in below.
temp = pd.concat([
    pos_og_from_mutation_neg_data,
    pos_og_from_mutation_data,
]).assign(temp=lambda frame: frame["interaction_intactid"] + "_" + frame["unique_id"])[["temp"]]
exclude_combo = temp["temp"].unique().tolist()
print(len(exclude_combo))
print("Merging the newly found positive ogs and news from each dataset with original simplemerged.")
print("First, deleting any rows that have the same interaction_intactid and unique_id, as these will be merged back in.")
print(f"Total to be excluded: {len(exclude_combo)}")

# Work on a deep copy so simplemerged itself is left untouched.
simplemerged_mut = simplemerged.copy(deep=True)
test1 = len(simplemerged_mut)
simplemerged_mut["temp"] = simplemerged_mut["interaction_intactid"] + "_" + simplemerged_mut["unique_id"]
simplemerged_mut = (
    simplemerged_mut.loc[~simplemerged_mut["temp"].isin(exclude_combo)]
    .reset_index(drop=True)
    .drop(columns=["temp"])  # the helper key column is only needed for the filter
)
print(f"Total simplemerged_mut rows that were excluded: {test1-len(simplemerged_mut)}. Length now: {len(simplemerged_mut)}")

# Append every mutation-derived positive frame, then drop exact duplicate rows.
simplemerged_mut = pd.concat(
    [
        simplemerged_mut,
        pos_new_from_mutation_data,
        pos_new_from_mutation_neg_data,
        pos_og_from_mutation_neg_data,
        pos_og_from_mutation_data,
    ],
    ignore_index=True,
)
simplemerged_mut = simplemerged_mut.drop_duplicates().reset_index(drop=True)
print(f"Size of simplemerged_mut after we concatenate new positives found from mutations (both from positive and negative initial datasets): {len(simplemerged_mut)}")

## Effectively combine this information. Remove every row whose seq_sort key appears in any
## negative or unknown mutation-derived frame (from either initial dataset).
neg_seq_pairs = [
    pair
    for frame in (
        neg_new_from_mutation_data,
        neg_og_from_mutation_data,
        unknown_og_from_mutation_data,
        unknown_new_from_mutation_data,
        neg_new_from_mutation_neg_data,
        neg_og_from_mutation_neg_data,
        unknown_og_from_mutation_neg_data,
        unknown_new_from_mutation_neg_data,
    )
    for pair in frame["seq_sort"].unique().tolist()
]
print(f"Size of simplemerged_mut before we remove erroneous positives (should be negative or unknown): {len(simplemerged_mut)}")
_is_contradicted = simplemerged_mut["seq_sort"].isin(neg_seq_pairs)
simplemerged_mut = simplemerged_mut.loc[~_is_contradicted]
print(f"Size of simplemerged_mut after we remove erroneous positives (should be negative or unknown): {len(simplemerged_mut)}")


27784
Merging the newly found positive ogs and news from each dataset with original simplemerged.
First, deleting any rows that have the same interaction_intactid and unique_id, as these will be merged back in.
Total to be excluded: 27784
Total simplemerged_mut rows that were excluded: 27781. Length now: 715349
Size of simplemerged_mut after we concatenate new positives found from mutations (both from positive and negative initial datasets): 825044
Size of simplemerged_mut before we remove erroneous positives (should be negative or unknown): 825044
Size of simplemerged_mut after we remove erroneous positives (should be negative or unknown): 817896


In [419]:
# Mark the negatives discovered while processing the positive PPI data as Negative.
neg_new_from_mutation_data["Negative"] = True
neg_og_from_mutation_data["Negative"] = True
# Build simplemerged_neg_mut, starting with nothing that is re-added from the "og" negative frames below:
# collect their interaction_intactid + unique_id keys so matching rows can be dropped first.
temp = pd.concat([
    neg_og_from_mutation_neg_data,
    neg_og_from_mutation_data
])
temp["temp"] = temp["interaction_intactid"] + "_" + temp["unique_id"]
temp = temp[["temp"]]
exclude_combo = temp["temp"].unique().tolist()
# The messages below used to say "positive"/"positives" (copied from the positive-table cell);
# this cell assembles the negatives table, so they now say so.
print("Merging the newly found negative ogs and news from each dataset with original simplemerged_neg.")
print("First, deleting any rows that have the same interaction_intactid and unique_id, as these will be merged back in.")
print(f"Total to be excluded: {len(exclude_combo)}")
# Work on a deep copy so simplemerged_neg itself is left untouched.
simplemerged_neg_mut = simplemerged_neg.copy(deep=True)
test1 = len(simplemerged_neg_mut)
simplemerged_neg_mut["temp"] = simplemerged_neg_mut["interaction_intactid"] + "_" + simplemerged_neg_mut["unique_id"]
simplemerged_neg_mut = simplemerged_neg_mut.loc[~simplemerged_neg_mut["temp"].isin(exclude_combo)].reset_index(drop=True)
# The helper key column is only needed for the filter above.
simplemerged_neg_mut = simplemerged_neg_mut.drop(columns=["temp"])
print(f"Total simplemerged_neg_mut rows that were excluded: {test1-len(simplemerged_neg_mut)}. Length now: {len(simplemerged_neg_mut)}")

# Append every mutation-derived negative frame, then drop exact duplicate rows.
simplemerged_neg_mut = pd.concat([
    simplemerged_neg_mut,
    neg_new_from_mutation_data,
    neg_new_from_mutation_neg_data,
    neg_og_from_mutation_neg_data,
    neg_og_from_mutation_data
]).reset_index(drop=True)
simplemerged_neg_mut = simplemerged_neg_mut.drop_duplicates().reset_index(drop=True)
print(f"Size of simplemerged_neg_mut after we concatenate new negatives found from mutations (both from positive and negative initial datasets): {len(simplemerged_neg_mut)}")

# Drop any row of simplemerged_neg_mut whose seq_sort key appears in a positive or unknown
# mutation-derived frame (from either initial dataset): those pairs should not be listed as negatives.
pos_seq_pairs = pos_new_from_mutation_data["seq_sort"].unique().tolist() + pos_og_from_mutation_data["seq_sort"].unique().tolist() + unknown_og_from_mutation_data["seq_sort"].unique().tolist() + unknown_new_from_mutation_data["seq_sort"].unique().tolist()
pos_seq_pairs += (pos_new_from_mutation_neg_data["seq_sort"].unique().tolist() + pos_og_from_mutation_neg_data["seq_sort"].unique().tolist() + unknown_og_from_mutation_neg_data["seq_sort"].unique().tolist() + unknown_new_from_mutation_neg_data["seq_sort"].unique().tolist())

print(f"Size of simplemerged_neg_mut before we remove erroneous negatives (should be positive or unknown): {len(simplemerged_neg_mut)}")
simplemerged_neg_mut = simplemerged_neg_mut.loc[
    ~simplemerged_neg_mut["seq_sort"].isin(pos_seq_pairs)
]
print(f"Size of simplemerged_neg_mut after we remove erroneous negatives (should be positive or unknown): {len(simplemerged_neg_mut)}")

Merging the newly found positive ogs and news from each dataset with original simplemerged_neg.
First, deleting any rows that have the same interaction_intactid and unique_id, as these will be merged back in.
Total to be excluded: 1209
Total simplemerged_neg_mut rows that were excluded: 0. Length now: 969
Size of simplemerged_neg_mut after we concatenate new positives found from mutations (both from positive and negative initial datasets): 20244
Size of simplemerged_neg_mut before we remove erroneous negatives (should be positive or unknown): 20244
Size of simplemerged_neg_mut after we remove erroneous negatives (should be positive or unknown): 20222


In [420]:
# Rows of simplemerged_mut that have no "Mutated Partner" value yet share an
# interaction_intactid with pos_og_from_mutation_data.
l = pos_og_from_mutation_data["interaction_intactid"].unique().tolist()
_no_partner = simplemerged_mut["Mutated Partner"].isna()
_known_interaction = simplemerged_mut["interaction_intactid"].isin(l)
simplemerged_mut.loc[_no_partner & _known_interaction]

Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,Mutated all_og_binds,Mutated decisive_entry_new_binds,Mutated decisive_entry_og_binds,seq_sort_og,seq_sort_new,seq_sort_og_id,seq_sort_new_id,Mutated decisive_seqpair_og_binds,Mutated decisive_seqpair_new_binds,Mutated Partner Status


In [421]:
# Spot-check a single interaction (EBI-10425432) in the mutation-derived negatives.
_is_target = neg_new_from_mutation_data["interaction_intactid"].eq("EBI-10425432")
neg_new_from_mutation_data[_is_target]

Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,Mutated all_og_binds,Mutated decisive_entry_new_binds,Mutated decisive_entry_og_binds,seq_sort_og,seq_sort_new,seq_sort_og_id,seq_sort_new_id,Mutated decisive_seqpair_og_binds,Mutated decisive_seqpair_new_binds,Mutated Partner Status
706,True,MAAQKDQQKDAEAEGLSGTTLLPKLIPSGAGREWLERRRATIRPWS...,MAAILGDTIMVAKGLVKLTQAAVETHLQHLGIGGELIMAARALQST...,,,intact:EBI-712367,intact:EBI-745535,,,,...,"unknown,yes",no,yes,MAAILGDTIMVAKGLVKLTQAAVETHLQHLGIGGELIMAARALQST...,MAAILGDTIMVAKGLVKLTQAAVETHLQHLGIGGELIMAARALQST...,seq_sort_og_836,seq_sort_new_4344,yes,no,mutated


In [422]:
# Spot-check the same interaction (EBI-10425432) in the simplemerged_neg table.
_is_target = simplemerged_neg["interaction_intactid"].eq("EBI-10425432")
simplemerged_neg[_is_target]

Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,all_binding_mi_1,all_binding_name_1,all_binding_short_1,all_binding_begin_1,all_binding_end_1,all_binding_mi_2,all_binding_name_2,all_binding_short_2,all_binding_begin_2,all_binding_end_2


In [423]:
# Report the row count of each mutation-derived negative frame and preview its first rows.
for _label, _frame in (
    ("mutated sequences from processing positive PPI data", neg_new_from_mutation_data),
    ("mutated sequences from processing negative PPI data", neg_new_from_mutation_neg_data),
    ("original sequences from processing positive PPI data", neg_og_from_mutation_data),
    ("original sequences from processing negative PPI data", neg_og_from_mutation_neg_data),
):
    print(f"Total negative {_label}: {len(_frame)}")
    display(_frame.head())

Total negative mutated sequences from processing positive PPI data: 17970


Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,Mutated all_og_binds,Mutated decisive_entry_new_binds,Mutated decisive_entry_og_binds,seq_sort_og,seq_sort_new,seq_sort_og_id,seq_sort_new_id,Mutated decisive_seqpair_og_binds,Mutated decisive_seqpair_new_binds,Mutated Partner Status
0,True,ADTCPEVKVVGLEGSDKLTILRGCPGLPGAPGPKGEAGVIGERGER...,MHLLAILFCALWSAVLAENSDDYDLMYVNLDNEIDNGLHPTEDPTP...,,,intact:EBI-11784425,intact:EBI-11574553,,,,...,yes,no,yes,ADTCPEVKVVGLEGSDKLTILRGCPGLPGAPGPKGEAGVIGERGER...,ADTCPEVKVVGLEGSDKLTILRGCPGLPGAPGPKGEAGVIGERGER...,seq_sort_og_6,seq_sort_new_14,yes,no,mutated
1,True,AENVTGLFKDCSKVITGLHPTQAPTHLSVDTKFKTEGLCVDIPGIP...,AGNATEVPANSTVLSFCAFAVDAAKAYKDYLASGGQPITNCVKMLC...,,,intact:EBI-25475920,intact:EBI-25475880,6452.0,,5926.0,...,yes,no,yes,AENVTGLFKDCSKVITGLHPTQAPTHLSVDTKFKTEGLCVDIPGIP...,AENVTGLFKDCSKVITGLHPTQAPTHLSVDTKFKTEGLCVDIPGIP...,seq_sort_og_10,seq_sort_new_71,yes,no,mutated
2,True,AENVTGLFKDCSKVITGLHPTQAPTHLSVDTKFKTEGLCVDIPGIP...,AGNATEVPANSTVLSFCAFAVDAAKAYKDYLASGGQPITNCVKMLC...,,,intact:EBI-25475920,intact:EBI-25475880,6452.0,,5926.0,...,yes,no,yes,AENVTGLFKDCSKVITGLHPTQAPTHLSVDTKFKTEGLCVDIPGIP...,AENVTGLFKDCSKVITGLHPTQAPTHLSVDTKFKTEGLCVDIPGIP...,seq_sort_og_10,seq_sort_new_73,yes,no,mutated
3,True,AENVTGLFKDCSKVITGLHPTQAPTHLSVDTKFKTEGLCVDIPGIP...,AGNATEVPANSTVLSFCAFAVDAAKAYKDYLASGGQPITNCVKMLC...,,,intact:EBI-25475920,intact:EBI-25475880,6452.0,,5926.0,...,yes,no,yes,AENVTGLFKDCSKVITGLHPTQAPTHLSVDTKFKTEGLCVDIPGIP...,AENVTGLFKDCSKVITGLHPTQAPTHLSVDTKFKTEGLCVDIPGIP...,seq_sort_og_10,seq_sort_new_74,yes,no,mutated
4,True,AGNATEVPANSTVLSFCAFAVDPAKAYKDYLASGGQPITNCVKMLC...,AENVTGLFKDCSKIITGLHPTQAPTHLSVDIKFKTEGLCVDIPGIP...,,,intact:EBI-25487277,intact:EBI-25487328,,6429.0,,...,yes,no,yes,AENVTGLFKDCSKIITGLHPTQAPTHLSVDIKFKTEGLCVDIPGIP...,AENVTGLFKDCSKIITGLHPTQAPTHLSVDIKFKTEGLCVDIPGIP...,seq_sort_og_8,seq_sort_new_65,yes,no,mutated


Total negative mutated sequences from processing negative PPI data: 3


Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,Mutated all_og_binds,Mutated decisive_entry_new_binds,Mutated decisive_entry_og_binds,seq_sort_og,seq_sort_new,seq_sort_og_id,seq_sort_new_id,Mutated decisive_seqpair_og_binds,Mutated decisive_seqpair_new_binds,Mutated Partner Status
0,True,MGKKTKRTADSSSSEDEEEYVVEKVLDRRMVKGQVEYLLKWKGFSE...,MAENLKGCSVCCKSSWNQLQDLCRLAKLSCPALGVSKKNLYDFEVE...,,,intact:EBI-307973,intact:EBI-302230,,,,...,yes,no,yes,MAENLKGCSVCCKSSWNQLQDLCRLAKLSCPALGVSKKNLYDFEVE...,MAENLKGCSVCCKSSWNQLQDLCRLAKLSCPALGVSKKNLYDFEVE...,seq_sort_og_3,seq_sort_new_3,yes,no,mutated
1,True,MGKKTKRTADSSSSEDEEEYVVEKVLDRRMVKGQVEYLLKWKGFSE...,MGKKTKRTADSSSSEDEEEYVVEKVLDRRMVKGQVEYLLKWKGFSE...,,,intact:EBI-307973,intact:EBI-307973,,,,...,yes,no,yes,MGKKTKRTADSSSSEDEEEYVVEKVLDRRMVKGQVEYLLKWKGFSE...,MGKKTKRTADSSSSEDEEEYVVEKVLDRRMVKGQVEYLLKWKGFSE...,seq_sort_og_5,seq_sort_new_5,yes,no,mutated
2,True,MTDQTYCDRLVQDTPFLTGHGRLSEQQVDRIILQLNRYYPQILTNK...,MEPTAPSLTEEDLTEVKKDALENLRVYLCEKIIAERHFDHLRAKKI...,,,intact:EBI-7443927,intact:EBI-958922,,,,...,yes,no,yes,MEPTAPSLTEEDLTEVKKDALENLRVYLCEKIIAERHFDHLRAKKI...,MEPTAPSLTEEDLTEVKKDALENLRVYLCEKIIAERHFDHLRAKKI...,seq_sort_og_4,seq_sort_new_4,yes,no,mutated


Total negative original sequences from processing positive PPI data: 1302


Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,Mutated all_og_binds,Mutated decisive_entry_new_binds,Mutated decisive_entry_og_binds,seq_sort_og,seq_sort_new,seq_sort_og_id,seq_sort_new_id,Mutated decisive_seqpair_og_binds,Mutated decisive_seqpair_new_binds,Mutated Partner Status
0,True,MAAAAAAATTAACSSGSAGTDAAGASGLQQPPPQPQPQPAAAAPAQ...,MSDVAIVKEGWLHKRGEYIKTWRPRYFLLKNDGTFIGYKERPQDVD...,,,intact:EBI-604615,intact:EBI-296087,,,,...,no,yes,no,MAAAAAAATTAACSSGSAGTDAAGASGLQQPPPQPQPQPAAAAPAQ...,MAAAAAAATTAACSSGSAGTDAAGASGLQQPPPQPQPQPAAAAPAQ...,seq_sort_og_263,seq_sort_new_908,no,yes,original
1,True,MAAAAASHLNLDALREVLECPICMESFTEEQLRPKLLHCGHTICRQ...,MKSNQERSNECLPPKKREIPATSRSSEEKAPTLPSDNHRVEGTAWL...,,,intact:EBI-742790,intact:EBI-930964,,,,...,"no,unknown",yes,no,MAAAAASHLNLDALREVLECPICMESFTEEQLRPKLLHCGHTICRQ...,MAAAAASHLNLDALREVLECPICMESFTEEQLRPKLLHCGHTICRQ...,seq_sort_og_334,seq_sort_new_1105,no,yes,original
2,True,MAAAAASHLNLDALREVLECPICMESFTEEQLRPKLLHCGHTICRQ...,MKSNQERSNECLPPKKREIPATSRSSEEKAPTLPSDNHRVEGTAWL...,,,intact:EBI-742790,intact:EBI-930964,,,,...,"no,unknown",yes,no,MAAAAASHLNLDALREVLECPICMESFTEEQLRPKLLHCGHTICRQ...,MAAAAASHLNLDALREVLECPICMESFTEEQLRPKLLHCGHTICRQ...,seq_sort_og_334,seq_sort_new_1105,no,yes,original
3,True,MAAAAASHLNLDALREVLECPICMESFTEEQLRPKLLHCGHTICRQ...,MKSNQERSNECLPPKKREIPATSRSSEEKAPTLPSDNHRVEGTAWL...,,,intact:EBI-742790,intact:EBI-930964,,,,...,"no,unknown",yes,no,MAAAAASHLNLDALREVLECPICMESFTEEQLRPKLLHCGHTICRQ...,MAAAAASHLNLDALREVLECPICMESFTEEQLRPKLLHCGHTICRQ...,seq_sort_og_334,seq_sort_new_1105,no,yes,original
4,True,MAAAAGNRASSSGFPGARATSPEAGGGGGALKASSAPAAAAGLLRE...,MSDVAIVKEGWLHKRGEYIKTWRPRYFLLKNDGTFIGYKERPQDVD...,,,intact:EBI-49776,intact:EBI-296087,,,,...,no,yes,no,MAAAAGNRASSSGFPGARATSPEAGGGGGALKASSAPAAAAGLLRE...,MAAAAGNRASSSGFPGARATSPEAGGGGGALKASSAPAAAAGLLRE...,seq_sort_og_352,seq_sort_new_1140,no,yes,original


Total negative original sequences from processing negative PPI data: 0


Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,Mutated all_og_binds,Mutated decisive_entry_new_binds,Mutated decisive_entry_og_binds,seq_sort_og,seq_sort_new,seq_sort_og_id,seq_sort_new_id,Mutated decisive_seqpair_og_binds,Mutated decisive_seqpair_new_binds,Mutated Partner Status


In [424]:
# Number of rows in the negatives table that have a "Mutated Partner" value.
_has_partner = simplemerged_neg_mut["Mutated Partner"].notna()
print(int(_has_partner.sum()))

19275


In [425]:
# Render the current negatives table (the notebook displays a cell's last expression).
simplemerged_neg_mut

Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,Mutated all_og_binds,Mutated decisive_entry_new_binds,Mutated decisive_entry_og_binds,seq_sort_og,seq_sort_new,seq_sort_og_id,seq_sort_new_id,Mutated decisive_seqpair_og_binds,Mutated decisive_seqpair_new_binds,Mutated Partner Status
0,True,IAAPGPALCLFDVDGTLTAPRQKITKEMDDFLQKLRQKIKIGVVGG...,MCSLPVPREPLRRVAVTGGTHGNEMSGVYLARHWLHAPAELQRASF...,,,intact:EBI-16472255,intact:EBI-3916242,,,,...,,,,,,,,,,
1,True,ISGMEATVLSPSLCSRPSQSGKTSHMGLLEV,MNSSTSTMSEEPDALSVVNQLRDLAADPLNRRAIVQDQGCLPGLIL...,,,intact:EBI-16467021,intact:EBI-3506974,,,,...,,,,,,,,,,
2,True,ISGMEATVLSPSLCSRPSQSGKTSHMGLLEV,MPLEQRSQHCKPEEGLEAQGEALGLVGAQAPATEEQETASSSSTLV...,,,intact:EBI-16467021,intact:EBI-749530,,,,...,,,,,,,,,,
3,True,LNYMPGTASLIEDIDKKHLVLLRDGRTLIGFLRSIDQFGLGKGE,MADDVDQQQTTNTVEEPLDLIRLSLDERIYVKMRNDRELRGRLHAY...,,,intact:EBI-16434023,intact:EBI-348239,,,,...,,,,,,,,,,
4,True,MAAAAGSCARVAAWGGKLRRGLAVSRQAVRSPGPLAAAVAGAALAG...,MTGAEIEPSAQAKPEKKAGEEVIAGPERENDVPLVVRPKVRTQATT...,,,intact:EBI-3197790,intact:EBI-473189,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20239,True,MWSWKCLLFWAVLVTATLCTARPSPTLPEQAQPWGAPVEVESFLVH...,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,,,intact:EBI-1028277,intact:EBI-297353,,,,...,no,yes,no,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,seq_sort_og_15319,seq_sort_new_47467,no,yes,original
20240,True,MWSWKCLLFWAVLVTATLCTARPSPTLPEQAQPWGAPVEVESFLVH...,MTSRRWFHPNITGVEAENLLLTRGVDGSFLARPSKSNPGDFTLSVR...,,,intact:EBI-1028277,intact:EBI-297779,,,,...,no,yes,no,MTSRRWFHPNITGVEAENLLLTRGVDGSFLARPSKSNPGDFTLSVR...,MTSRRWFHPNITGVEAENLLLTRGVDGSFLARPSKSNPGDFTLSVR...,seq_sort_og_16587,seq_sort_new_52219,no,yes,original
20241,True,MWVTKLLPALLLQHVLLHLLLLPIAIPYAEGQRKRRNTIHEFKKSA...,MTSRRWFHPNITGVEAENLLLTRGVDGSFLARPSKSNPGDFTLSVR...,,,intact:EBI-1039104,intact:EBI-297779,,,,...,no,yes,no,MTSRRWFHPNITGVEAENLLLTRGVDGSFLARPSKSNPGDFTLSVR...,MTSRRWFHPNITGVEAENLLLTRGVDGSFLARPSKSNPGDFTLSVR...,seq_sort_og_16588,seq_sort_new_52220,no,yes,original
20242,True,MYGSARSVGKVEPSSQSPGRSPRLPRSPRLGHRRTNSTGGSSGSSV...,MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVY...,,,intact:EBI-1044755,intact:EBI-696162,,,,...,no,yes,no,MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVY...,MTAIIKEIVSRNKRRYQEDGFDLDLTYIYPNIIAMGFPAERLEGVY...,seq_sort_og_16481,seq_sort_new_51936,no,yes,original


In [426]:
# Append the mutation-derived negative frames to the negatives table, then de-duplicate.
# NOTE(review): simplemerged_neg_mut is filtered on seq_sort before this cell; appending these
# frames again afterwards could bring filtered rows back - confirm this is intended.
simplemerged_neg_mut = pd.concat(
    [
        simplemerged_neg_mut,
        neg_new_from_mutation_data,
        neg_new_from_mutation_neg_data,
        neg_og_from_mutation_neg_data,
        neg_og_from_mutation_data,
    ],
    ignore_index=True,
)

# NOTE(review): the return value is discarded; if harmonize_nulls_to_nan returns a new frame
# rather than editing in place, this call has no effect - verify against the helper.
harmonize_nulls_to_nan(simplemerged_neg_mut)
simplemerged_neg_mut = simplemerged_neg_mut.drop_duplicates(ignore_index=True)
print(f"Size of negatives database after combining with mutation-derived negatives: {len(simplemerged_neg_mut)}")

  out = out.replace({"": pd.NA, "None": pd.NA, "nan": pd.NA})


Size of negatives database after combining with mutation-derived negatives: 20222


In [427]:
# eventually we must get rid of the current negatives that should be positives
# Rendered for inspection: the "og" pairs from the negative dataset that the mutation data treats as positive.
pos_og_from_mutation_neg_data

Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,Mutated all_og_binds,Mutated decisive_entry_new_binds,Mutated decisive_entry_og_binds,seq_sort_og,seq_sort_new,seq_sort_og_id,seq_sort_new_id,Mutated decisive_seqpair_og_binds,Mutated decisive_seqpair_new_binds,Mutated Partner Status
0,False,MGKKTKRTADSSSSEDEEEYVVEKVLDRRMVKGQVEYLLKWKGFSE...,MAENLKGCSVCCKSSWNQLQDLCRLAKLSCPALGVSKKNLYDFEVE...,,,intact:EBI-307973,intact:EBI-302230,,,,...,yes,no,yes,MAENLKGCSVCCKSSWNQLQDLCRLAKLSCPALGVSKKNLYDFEVE...,MAENLKGCSVCCKSSWNQLQDLCRLAKLSCPALGVSKKNLYDFEVE...,seq_sort_og_3,seq_sort_new_3,yes,no,original
1,False,MGKKTKRTADSSSSEDEEEYVVEKVLDRRMVKGQVEYLLKWKGFSE...,MGKKTKRTADSSSSEDEEEYVVEKVLDRRMVKGQVEYLLKWKGFSE...,,,intact:EBI-307973,intact:EBI-307973,,,,...,yes,no,yes,MGKKTKRTADSSSSEDEEEYVVEKVLDRRMVKGQVEYLLKWKGFSE...,MGKKTKRTADSSSSEDEEEYVVEKVLDRRMVKGQVEYLLKWKGFSE...,seq_sort_og_5,seq_sort_new_5,yes,no,original
2,False,MTDQTYCDRLVQDTPFLTGHGRLSEQQVDRIILQLNRYYPQILTNK...,MEPTAPSLTEEDLTEVKKDALENLRVYLCEKIIAERHFDHLRAKKI...,,,intact:EBI-7443927,intact:EBI-958922,,,,...,yes,no,yes,MEPTAPSLTEEDLTEVKKDALENLRVYLCEKIIAERHFDHLRAKKI...,MEPTAPSLTEEDLTEVKKDALENLRVYLCEKIIAERHFDHLRAKKI...,seq_sort_og_4,seq_sort_new_4,yes,no,original


In [428]:
# Reproducible 10-row sample of the identifier and mutation bookkeeping columns.
_preview_cols = [
    "unique_id",
    "interaction_intactid",
    "seq_pair_id",
    "Mutated decisive_seqpair_og_binds",
    "Mutated decisive_seqpair_new_binds",
    "Mutated Partner",
    "Mutated Partner Status",
    "mutation_short_1",
    "mutation_short_2",
    "seq_sort",
    "seq_sort_og",
    "seq_sort_og_id",
]
simplemerged_neg_mut[_preview_cols].sample(n=10, random_state=42)

Unnamed: 0,unique_id,interaction_intactid,seq_pair_id,Mutated decisive_seqpair_og_binds,Mutated decisive_seqpair_new_binds,Mutated Partner,Mutated Partner Status,mutation_short_1,mutation_short_2,seq_sort,seq_sort_og,seq_sort_og_id
8746,intact:EBI-27189_intact:EBI-28714,EBI-8509044,seqpair241802,yes,no,A,mutated,Q06708:p.Leu149Arg,,MEKSIAKGLSDKLYEKRKAAALELEKLVKQCVLEGDYDRIDKIIDE...,MEKSIAKGLSDKLYEKRKAAALELEKLVKQCVLEGDYDRIDKIIDE...,seq_sort_og_9449
8064,intact:EBI-10226858_intact:EBI-985879,EBI-26095341,seqpair11837,yes,no,A,mutated,P37840:p.Glu46Lys,,MDVFMKGLSKAKEGVVAAAEKTKQGVAEAAGKTKEGVLYVGSKTKK...,MDVFMKGLSKAKEGVVAAAEKTKQGVAEAAGKTKEGVLYVGSKTKE...,seq_sort_og_8370
10650,intact:EBI-21776319_intact:EBI-517127,EBI-25746121,seqpair207333,yes,no,A,mutated,P83105:p.Ser326Ala,,MIRPQLRTAGLGRCLLPGLLLLLVPVLWAGAEKLHTQPSCPAVCQP...,MIRPQLRTAGLGRCLLPGLLLLLVPVLWAGAEKLHTQPSCPAVCQP...,seq_sort_og_12150
4189,intact:EBI-466029_intact:EBI-721293,EBI-26270380,seqpair373729,yes,no,A,mutated,P42858:p.Gln18_Gln38del,,MAANYSSTSTRREHVKVKTSSQPGFLERLSETSGGMFVGLMAFLLS...,MAANYSSTSTRREHVKVKTSSQPGFLERLSETSGGMFVGLMAFLLS...,seq_sort_og_1325
16605,intact:EBI-16359_intact:EBI-770,EBI-11436582,seqpair155864,yes,no,A,mutated,Q12330:p.Asp47Ala,,MSESSDISAMQPVNPKPFLKGLVNHRVGVKLKFNSTEYRGTLVSTD...,MSESSDISAMQPVNPKPFLKGLVNHRVGVKLKFNSTEYRGTLVSTD...,seq_sort_og_15751
18789,intact:EBI-2073_intact:EBI-4110,EBI-16157448,seqpair199213,yes,no,B,mutated,,P21192:p.Pro286Ala,MDNVVDPWYINPSGFAKDTQDEEYVQHHDNVNPTIPPPDNYILNNE...,MDNVVDPWYINPSGFAKDTQDEEYVQHHDNVNPTIPPPDNYILNNE...,seq_sort_og_7927
15793,intact:EBI-2559016_intact:EBI-603614,EBI-21993556,seqpair230586,yes,no,B,mutated,,Q6NZI2:p.Glu176Argfs*99,MEDPTLYIVERPLPGYPDAEAPEPSSAGAQAAEEPSGAGSEELIKS...,MEDPTLYIVERPLPGYPDAEAPEPSSAGAQAAEEPSGAGSEELIKS...,seq_sort_og_8728
12918,intact:EBI-10175300_intact:EBI-355607,EBI-12689087,seqpair6725,yes,no,A,mutated,P06753:p.Arg168Gly,,MMEAIKKKMQMLKLDKENALDRAEQAEAEQKQAEERSKQLEDELAA...,MMEAIKKKMQMLKLDKENALDRAEQAEAEQKQAEERSKQLEDELAA...,seq_sort_og_13490
14720,intact:EBI-466029_intact:EBI-726271,EBI-26273651,seqpair373748,unknown,no,B,mutated,,P42858:p.Gln18[79],MATLEKLMKAFESLKSFQQQQQQQQQQQQQQQQQQQQQQQQQQQQQ...,MATLEKLMKAFESLKSFQQQQQQQQQQQQQQQQQQQQQPPPPPPPP...,seq_sort_og_6172
9690,intact:EBI-1401_intact:EBI-1404,EBI-8849756,seqpair121289,yes,no,A,mutated,P06240:p.[Cys20Ala;Cys23Ala],,MCRAISLRRLLLLLLQLSQLLAVTQGKTLVLGKEGESAELPCESSQ...,MCRAISLRRLLLLLLQLSQLLAVTQGKTLVLGKEGESAELPCESSQ...,seq_sort_og_6859


In [429]:
# Count distinct sequence pairs in the positive table, then tally interaction_mi values
# over distinct (seq_sort, interaction_mi) combinations.
_n_pairs = simplemerged_mut["seq_sort"].nunique()
print(f"Total unique sequence pairs in positive database: {_n_pairs}")
test0 = simplemerged_mut.drop_duplicates(subset=["seq_sort","interaction_mi"])
test1 = test0["interaction_mi"].value_counts()
print(f"Distribution of interaction_mis among sequence pairs (total unique seq_sort+mi={len(test0)}): {test1}")

Total unique sequence pairs in positive database: 460430
Distribution of interaction_mis among sequence pairs (total unique seq_sort+mi=473598): interaction_mi
MI:0915            397912
MI:0407             30317
MI:0914             23625
MI:2364             11519
MI:0403              5568
MI:0217              2334
MI:0570               456
MI:0203               446
MI:0194               260
MI:0195               199
MI:0414               149
MI:0213               131
MI:1110               104
MI:0192                81
MI:0557                58
MI:0220                56
MI:0871                54
MI:0197                45
MI:0408                40
MI:0945                38
MI:0844                38
MI:0204                29
MI:0559                28
MI:0210                25
MI:0216                21
MI:0566                10
MI:1126                 7
MI:1127                 6
MI:1148                 5
MI:1310                 5
MI:2280                 4
MI:0985                 4
MI:1143 

In [430]:
# Count distinct sequence pairs in the negative table, then tally interaction_mi values
# over distinct (seq_sort, interaction_mi) combinations.
_n_pairs = simplemerged_neg_mut["seq_sort"].nunique()
print(f"Total unique sequence pairs in negative database: {_n_pairs}")
test0 = simplemerged_neg_mut.drop_duplicates(subset=["seq_sort","interaction_mi"])
test1 = test0["interaction_mi"].value_counts()
print(f"Distribution of interaction_mis among sequence pairs (total unique seq_sort+mi={len(test0)}): {test1}")

Total unique sequence pairs in negative database: 13766
Distribution of interaction_mis among sequence pairs (total unique seq_sort+mi=14220): interaction_mi
MI:0915    9868
MI:0407    2808
MI:2364     514
MI:0217     409
MI:0403     217
MI:0570      87
MI:0203      66
MI:0213      65
MI:0914      37
MI:0194      28
MI:0414      23
MI:0192      17
MI:0871      15
MI:0559      14
MI:0195       8
MI:0566       8
MI:0204       7
MI:0216       6
MI:0844       6
MI:1127       5
MI:0197       3
MI:0210       3
MI:0945       3
MI:0220       2
MI:0556       1
Name: count, dtype: int64


In [431]:
# Sanity check: True only when no row has a value in any of the four "invalids" columns.
_invalid_cols = [
    "invalids_aa_1",
    "invalids_aa_2",
    "invalids_mutated_aa_1",
    "invalids_mutated_aa_2",
]
_row_has_invalid = simplemerged_mut[_invalid_cols].notna().any(axis=1)
test1 = not _row_has_invalid.any()
print(f"Nothing in the filtered simplemerged_mut database has invalids: {test1}")

Nothing in the filtered simplemerged_mut database has invalids: False


In [432]:
# Sanity check: True only when no row has a value in any of the four "invalids" columns.
_invalid_cols = [
    "invalids_aa_1",
    "invalids_aa_2",
    "invalids_mutated_aa_1",
    "invalids_mutated_aa_2",
]
_row_has_invalid = simplemerged_neg_mut[_invalid_cols].notna().any(axis=1)
test1 = not _row_has_invalid.any()
print(f"Nothing in the filtered negatives database has invalids: {test1}")

Nothing in the filtered negatives database has invalids: True


In [433]:
# Before saving: for each candidate key (single column or pair), count the rows of
# simplemerged_mut that repeat an earlier row's key.
for _keys in (
    ["unique_id"],
    ["interaction_intactid"],
    ["seq_pair_id"],
    ["unique_id","seq_pair_id"],
    ["unique_id","interaction_intactid"],
    ["seq_pair_id","interaction_intactid"],
):
    test1 = int(simplemerged_mut.duplicated(_keys).sum())
    print(f"Rows in simplemerged_mut with duplicate {'+'.join(_keys)}: {test1}")

Rows in simplemerged_mut with duplicate unique_id: 390963
Rows in simplemerged_mut with duplicate interaction_intactid: 79246
Rows in simplemerged_mut with duplicate seq_pair_id: 392878
Rows in simplemerged_mut with duplicate unique_id+seq_pair_id: 390962
Rows in simplemerged_mut with duplicate unique_id+interaction_intactid: 79244
Rows in simplemerged_mut with duplicate seq_pair_id+interaction_intactid: 79244


In [434]:
# Before saving: for each candidate key (single column or pair), count the rows of
# simplemerged_neg_mut that repeat an earlier row's key.
for _keys in (
    ["unique_id"],
    ["interaction_intactid"],
    ["seq_pair_id"],
    ["unique_id","seq_pair_id"],
    ["unique_id","interaction_intactid"],
    ["seq_pair_id","interaction_intactid"],
):
    test1 = int(simplemerged_neg_mut.duplicated(_keys).sum())
    print(f"Rows in simplemerged_neg_mut with duplicate {'+'.join(_keys)}: {test1}")

Rows in simplemerged_neg_mut with duplicate unique_id: 12798
Rows in simplemerged_neg_mut with duplicate interaction_intactid: 7271
Rows in simplemerged_neg_mut with duplicate seq_pair_id: 12870
Rows in simplemerged_neg_mut with duplicate unique_id+seq_pair_id: 12797
Rows in simplemerged_neg_mut with duplicate unique_id+interaction_intactid: 7271
Rows in simplemerged_neg_mut with duplicate seq_pair_id+interaction_intactid: 7271


In [435]:
# Show only the identifier columns and the mutated-partner marker of the positive table.
_id_cols = ["unique_id", "seq_pair_id", "interaction_intactid", "Mutated Partner"]
simplemerged_mut[_id_cols]

Unnamed: 0,unique_id,seq_pair_id,interaction_intactid,Mutated Partner
0,intact:EBI-20589573_intact:EBI-358616,seqpair197876,EBI-20589590,
1,intact:EBI-25507607_intact:EBI-448610,seqpair228746,EBI-25507641,
2,intact:EBI-16730154_intact:EBI-25507607,seqpair160363,EBI-25508294,
3,intact:EBI-1380492_intact:EBI-25507607,seqpair117000,EBI-25507637,
4,intact:EBI-25507607_intact:EBI-25508298,seqpair228743,EBI-25508313,
...,...,...,...,...
825039,intact:EBI-9159693_intact:EBI-9159704,seqpair424756,EBI-9159800,A
825040,intact:EBI-25473602_intact:EBI-25473661,seqpair226866,EBI-25473663,B
825041,intact:EBI-25473602_intact:EBI-25473661,seqpair226866,EBI-25473663,B
825042,intact:EBI-25473602_intact:EBI-25473661,seqpair226866,EBI-25473663,B


In [436]:
# Print every column name of simplemerged_mut, double-quoted and sorted,
# one per line with a trailing comma between entries.
print(",\n".join(sorted('"{}"'.format(col) for col in simplemerged_mut.columns)))

"Mutated Partner Status",
"Mutated Partner",
"Mutated all_new_binds",
"Mutated all_og_binds",
"Mutated decisive_entry_new_binds",
"Mutated decisive_entry_og_binds",
"Mutated decisive_seqpair_new_binds",
"Mutated decisive_seqpair_og_binds",
"Mutation # Feature AC",
"Mutation Affected protein AC",
"Mutation Affected protein full name",
"Mutation Affected protein organism",
"Mutation Affected protein symbol",
"Mutation Feature annotation(s)",
"Mutation Feature range(s)",
"Mutation Feature short label",
"Mutation Feature type",
"Mutation Figure legend(s)",
"Mutation Interaction AC",
"Mutation Interaction participants",
"Mutation Interactor Matches",
"Mutation Original sequence",
"Mutation PubMedID",
"Mutation Resulting sequence",
"Mutation Xref ID(s)",
"Mutation new_binds_bo_ac",
"Mutation new_binds_bo_annotation",
"Mutation new_binds_bo_feature_type",
"Mutation new_binds_to_gname_bo_annotation",
"Mutation new_binds_to_uniprot_bo_annotation",
"Mutation new_nobind_to_gname_bo_annotation",
"

In [437]:
# Show the rows whose second-interactor mutation label contains a literal "|"
# (missing labels are treated as empty strings).
simplemerged_mut[
    simplemerged_mut["mutation_short_2"].fillna("").str.contains("|", regex=False)
]

Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,Mutated all_og_binds,Mutated decisive_entry_new_binds,Mutated decisive_entry_og_binds,seq_sort_og,seq_sort_new,seq_sort_og_id,seq_sort_new_id,Mutated decisive_seqpair_og_binds,Mutated decisive_seqpair_new_binds,Mutated Partner Status


In [438]:
# Build a combined mutation label ("mutation_short") for simplemerged_mut by
# joining the per-interactor labels mutation_short_1 and mutation_short_2 with
# "|", so that it can be used as part of a row key together with unique_id,
# seq_pair_id and interaction_intactid.

# "|" is only an unambiguous separator if it never occurs inside a label.
_has_pipe = (
    simplemerged_mut["mutation_short_1"].fillna("").str.contains("|", regex=False)
    | simplemerged_mut["mutation_short_2"].fillna("").str.contains("|", regex=False)
)
test1 = not _has_pipe.any()
print(f"Can join mutation labels with pipe because pipe is not used in them: {test1}")

# Missing labels become "", so a row with only one label yields "x|" or "|x".
# str.strip() takes a *set of characters* (not a regex): pass just "|" so that
# only the dangling separator is removed and nothing else is stripped.
simplemerged_mut["mutation_short"] = (
    simplemerged_mut["mutation_short_1"].fillna("")
    + "|"
    + simplemerged_mut["mutation_short_2"].fillna("")
).str.strip("|")

# Count rows that repeat an earlier row on the full four-column key.
# The label names all four columns that are actually checked.
test1 = int(
    simplemerged_mut.duplicated(
        ["unique_id", "seq_pair_id", "interaction_intactid", "mutation_short"]
    ).sum()
)
print(f"Rows in simplemerged_mut with duplicate unique_id+seq_pair_id+interaction_intactid+mutation_short: {test1}")

Can join mutation labels with pipe because pipe is not used in them: True
Rows in simplemerged_mut with duplicate seq_pair_id+interaction_intactid: 42224


In [439]:
# Build a combined mutation label ("mutation_short") for simplemerged_neg_mut
# by joining mutation_short_1 and mutation_short_2 with "|", so that it can be
# used as part of a row key together with unique_id, seq_pair_id and
# interaction_intactid.

# "|" is only an unambiguous separator if it never occurs inside a label.
_has_pipe = (
    simplemerged_neg_mut["mutation_short_1"].fillna("").str.contains("|", regex=False)
    | simplemerged_neg_mut["mutation_short_2"].fillna("").str.contains("|", regex=False)
)
test1 = not _has_pipe.any()
print(f"Can join mutation labels with pipe because pipe is not used in them: {test1}")

# Missing labels become "", so a row with only one label yields "x|" or "|x".
# str.strip() takes a *set of characters* (not a regex): pass just "|" so that
# only the dangling separator is removed and nothing else is stripped.
simplemerged_neg_mut["mutation_short"] = (
    simplemerged_neg_mut["mutation_short_1"].fillna("")
    + "|"
    + simplemerged_neg_mut["mutation_short_2"].fillna("")
).str.strip("|")

# Count rows that repeat an earlier row on the full four-column key.
# The label names all four columns that are actually checked.
test1 = int(
    simplemerged_neg_mut.duplicated(
        ["unique_id", "seq_pair_id", "interaction_intactid", "mutation_short"]
    ).sum()
)
print(f"Rows in simplemerged_neg_mut with duplicate unique_id+seq_pair_id+interaction_intactid+mutation_short: {test1}")

Can join mutation labels with pipe because pipe is not used in them: True
Rows in simplemerged_neg_mut with duplicate seq_pair_id+interaction_intactid: 33


In [440]:
# Show the interaction id and both per-interactor labels for every row where
# either label contains a literal "|" (missing labels count as empty strings).
simplemerged_neg_mut.loc[
    simplemerged_neg_mut["mutation_short_1"].fillna("").str.contains("|", regex=False)
    | simplemerged_neg_mut["mutation_short_2"].fillna("").str.contains("|", regex=False),
    ["interaction_intactid", "mutation_short_1", "mutation_short_2"],
]

Unnamed: 0,interaction_intactid,mutation_short_1,mutation_short_2


In [441]:
# Build a combined mutation label ("mutation_short") for simplemerged_neg_mut
# by joining mutation_short_1 and mutation_short_2 with "|", so that it can be
# used as part of a row key together with unique_id, seq_pair_id and
# interaction_intactid.

# "|" is only an unambiguous separator if it never occurs inside a label.
_has_pipe = (
    simplemerged_neg_mut["mutation_short_1"].fillna("").str.contains("|", regex=False)
    | simplemerged_neg_mut["mutation_short_2"].fillna("").str.contains("|", regex=False)
)
test1 = not _has_pipe.any()
print(f"Can join mutation labels with pipe because pipe is not used in them: {test1}")

# Same check restricted to rows that carry a "Mutated Partner Status" value.
test1 = not (
    _has_pipe & simplemerged_neg_mut["Mutated Partner Status"].notna()
).any()
print(f"\tWe are going to pipe-join anyways because these are already separated mutation events from the raw scraping. {test1}")

# Missing labels become "", so a row with only one label yields "x|" or "|x".
# str.strip() takes a *set of characters* (not a regex): pass just "|" so that
# only the dangling separator is removed and nothing else is stripped.
simplemerged_neg_mut["mutation_short"] = (
    simplemerged_neg_mut["mutation_short_1"].fillna("")
    + "|"
    + simplemerged_neg_mut["mutation_short_2"].fillna("")
).str.strip("|")

# Count rows that repeat an earlier row on the full four-column key.
# The label names all four columns that are actually checked.
test1 = int(
    simplemerged_neg_mut.duplicated(
        ["unique_id", "seq_pair_id", "interaction_intactid", "mutation_short"]
    ).sum()
)
print(f"Rows in simplemerged_neg_mut with duplicate unique_id+seq_pair_id+interaction_intactid+mutation_short: {test1}")

Can join mutation labels with pipe because pipe is not used in them: True
	We are going to pipe-join anyways because these are already separated mutation events from the raw scraping. True
Rows in simplemerged_neg_mut with duplicate seq_pair_id+interaction_intactid: 33


In [442]:
# Build a combined mutation label ("mutation_short") for
# simplemerged_mut_unknown by joining mutation_short_1 and mutation_short_2
# with "|", so that it can be used as part of a row key together with
# unique_id, seq_pair_id and interaction_intactid.

# "|" is only an unambiguous separator if it never occurs inside a label.
_has_pipe = (
    simplemerged_mut_unknown["mutation_short_1"].fillna("").str.contains("|", regex=False)
    | simplemerged_mut_unknown["mutation_short_2"].fillna("").str.contains("|", regex=False)
)
test1 = not _has_pipe.any()
print(f"Can join mutation labels with pipe because pipe is not used in them: {test1}")

# Same check restricted to rows that carry a "Mutated Partner Status" value.
test1 = not (
    _has_pipe & simplemerged_mut_unknown["Mutated Partner Status"].notna()
).any()
print(f"\tWe are going to pipe-join anyways because these are already separated mutation events from the raw scraping. {test1}")

# Missing labels become "", so a row with only one label yields "x|" or "|x".
# str.strip() takes a *set of characters* (not a regex): pass just "|" so that
# only the dangling separator is removed and nothing else is stripped.
simplemerged_mut_unknown["mutation_short"] = (
    simplemerged_mut_unknown["mutation_short_1"].fillna("")
    + "|"
    + simplemerged_mut_unknown["mutation_short_2"].fillna("")
).str.strip("|")

# Count rows that repeat an earlier row on the full four-column key.
# The label names all four columns that are actually checked.
test1 = int(
    simplemerged_mut_unknown.duplicated(
        ["unique_id", "seq_pair_id", "interaction_intactid", "mutation_short"]
    ).sum()
)
print(f"Rows in simplemerged_mut_unknown with duplicate unique_id+seq_pair_id+interaction_intactid+mutation_short: {test1}")

Can join mutation labels with pipe because pipe is not used in them: True
	We are going to pipe-join anyways because these are already separated mutation events from the raw scraping. True
Rows in simplemerged_mut_unknown with duplicate seq_pair_id+interaction_intactid: 2907


In [443]:
# Collect every row of simplemerged_mut that shares its
# (unique_id, seq_pair_id, interaction_intactid, mutation_short) key with at
# least one other row, ordered by that key.
temp = (
    simplemerged_mut[
        simplemerged_mut.duplicated(
            ["unique_id", "seq_pair_id", "interaction_intactid", "mutation_short"],
            keep=False,
        )
    ]
    .sort_values(by=["unique_id", "seq_pair_id", "interaction_intactid", "mutation_short"])
    .reset_index(drop=True)
)

# Columns that are removed before the rows of a group are compared.
specialcase_dup_cols = [
    "mutated_aa_1",
    "mutated_aa_2",
    "Mutated Partner",
    "scraped_mut_has_info_1",
    "scraped_mut_has_info_2",
    "aa_1",
    "aa_2",
    "uniprot_A",
    "uniprot_A_equalseq",
    "uniprot_A_equalseq_canonical",
    "uniprot_A_full",
    "uniprot_A_inseq",
    "uniprot_A_inseq_canonical",
    "uniprot_A_noiso1",
    "uniprot_B",
    "uniprot_B_equalseq",
    "uniprot_B_equalseq_canonical",
    "uniprot_B_full",
    "uniprot_B_inseq",
    "uniprot_B_inseq_canonical",
    "uniprot_B_noiso1",
]
dup_cols = [
    "Mutation # Feature AC",
    "Mutation Affected protein AC",
    "Mutation Affected protein full name",
    "Mutation Affected protein organism",
    "Mutation Affected protein symbol",
    "Mutation Feature annotation(s)",
    "Mutation Feature range(s)",
    "Mutation Feature short label",
    "Mutation Feature type",
    "Mutation Figure legend(s)",
    "Mutation Interaction AC",
    "Mutation Interaction participants",
    "Mutation Interactor Matches",
    "Mutation Original sequence",
    "Mutation PubMedID",
    "Mutation Resulting sequence",
    "Mutation Xref ID(s)",
    "Mutation new_binds_bo_ac",
    "Mutation new_binds_bo_annotation",
    "Mutation new_binds_bo_feature_type",
    "Mutation new_binds_to_gname_bo_annotation",
    "Mutation new_binds_to_uniprot_bo_annotation",
    "Mutation new_nobind_to_gname_bo_annotation",
    "Mutation new_nobind_to_uniprot_bo_annotation",
    "Mutation og_binds_bo_ac",
    "Mutation og_binds_bo_annotation",
    "Mutation og_binds_bo_feature_type",
    "mutation_begin_1",
    "mutation_begin_2",
    "mutation_end_1",
    "mutation_end_2",
    "mutation_mi_1",
    "mutation_mi_2",
    "mutation_name_1",
    "mutation_name_2",
    "mutation_new_1",
    "mutation_new_2",
    "mutation_new_binds_bo_mi",
    "mutation_og_binds_bo_mi",
    "mutation_orig_1",
    "mutation_orig_2",
    "mutation_orig_new_samelen_2",
    "mutation_range_1",
    "mutation_range_2",
    "mutation_short_1",
    "mutation_short_2",
    "Mutated all_new_binds",
    "Mutated all_og_binds",
    "Mutated decisive_entry_new_binds",
    "Mutated decisive_entry_og_binds",
] + specialcase_dup_cols

# With those columns gone, keep=False discards every row that has an exact
# twin; anything left over differs from its group mates in some other column.
temp = (
    temp.drop(columns=dup_cols)
    .drop_duplicates(keep=False)
    .sort_values(by=["seq_pair_id", "interaction_intactid", "mutation_short"])
    .reset_index(drop=True)
)

# True only when no such leftover row exists.
test1 = (len(temp) == 0)
print(test1)

False


In [444]:
# Collect every row of simplemerged_mut that shares its
# (unique_id, seq_pair_id, interaction_intactid, mutation_short) key with at
# least one other row, ordered by that key.
temp = (
    simplemerged_mut[
        simplemerged_mut.duplicated(
            ["unique_id", "seq_pair_id", "interaction_intactid", "mutation_short"],
            keep=False,
        )
    ]
    .sort_values(by=["unique_id", "seq_pair_id", "interaction_intactid", "mutation_short"])
    .reset_index(drop=True)
)

# Three column lists; all of them are dropped from `temp` below.
specialcase_dup_cols = [
    "mutated_aa_1",
    "mutated_aa_2",
    "Mutated Partner",
    "scraped_mut_has_info_1",
    "scraped_mut_has_info_2",
    "aa_1",
    "aa_2",
    "uniprot_A",
    "uniprot_A_equalseq",
    "uniprot_A_equalseq_canonical",
    "uniprot_A_full",
    "uniprot_A_inseq",
    "uniprot_A_inseq_canonical",
    "uniprot_A_noiso1",
    "uniprot_B",
    "uniprot_B_equalseq",
    "uniprot_B_equalseq_canonical",
    "uniprot_B_full",
    "uniprot_B_inseq",
    "uniprot_B_inseq_canonical",
    "uniprot_B_noiso1",
]
keep_all_info_dup_cols = [
    "Mutation new_binds_bo_ac",
    "Mutation new_binds_bo_annotation",
    "Mutation new_binds_bo_feature_type",
    "Mutation new_binds_to_gname_bo_annotation",
    "Mutation new_binds_to_uniprot_bo_annotation",
    "Mutation new_nobind_to_gname_bo_annotation",
    "Mutation new_nobind_to_uniprot_bo_annotation",
    "Mutation og_binds_bo_ac",
    "Mutation og_binds_bo_annotation",
    "Mutation og_binds_bo_feature_type",
    "Mutated all_new_binds",
    "Mutated all_og_binds",
    "Mutated decisive_entry_new_binds",
    "Mutated decisive_entry_og_binds",
    "Mutation Feature type",
    "Mutation Figure legend(s)",
    "Mutation # Feature AC",
    "Mutation Feature annotation(s)",
    "mutation_mi_1",
    "mutation_mi_2",
    "mutation_new_binds_bo_mi",
    "mutation_og_binds_bo_mi",
]
# NOTE: "mutation_orig_new_samelen_1" is not part of this list (its entry was
# commented out); only the "_2" variant is included.
dup_cols = (
    [
        "Mutation Affected protein AC",
        "Mutation Affected protein full name",
        "Mutation Affected protein organism",
        "Mutation Affected protein symbol",
        "Mutation Feature range(s)",
        "Mutation Feature short label",
        "Mutation Interaction AC",
        "Mutation Interaction participants",
        "Mutation Interactor Matches",
        "Mutation Original sequence",
        "Mutation PubMedID",
        "Mutation Resulting sequence",
        "Mutation Xref ID(s)",
        "mutation_begin_1",
        "mutation_begin_2",
        "mutation_end_1",
        "mutation_end_2",
        "mutation_name_1",
        "mutation_name_2",
        "mutation_new_1",
        "mutation_new_2",
        "mutation_orig_1",
        "mutation_orig_2",
        "mutation_orig_new_samelen_2",
        "mutation_range_1",
        "mutation_range_2",
        "mutation_short_1",
        "mutation_short_2",
    ]
    + keep_all_info_dup_cols
    + specialcase_dup_cols
)

# Drop those columns, then keep one representative of each set of identical
# rows and order the result by sequence pair, interaction and mutation label.
temp = (
    temp.drop(columns=dup_cols)
    .drop_duplicates()
    .sort_values(by=["seq_pair_id", "interaction_intactid", "mutation_short"])
    .reset_index(drop=True)
)

# Spot-check: how many remaining rows belong to these two interactions.
l = ["EBI-22091507", "EBI-22091555"]
print(int(temp["interaction_intactid"].isin(l).sum()))
temp

2


Unnamed: 0,Negative,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,chain_seq_start_2,confidence_val_int,...,invalids_mutated_aa_1,invalids_mutated_aa_2,seq_sort_og,seq_sort_new,seq_sort_og_id,seq_sort_new_id,Mutated decisive_seqpair_og_binds,Mutated decisive_seqpair_new_binds,Mutated Partner Status,mutation_short
0,False,,,intact:EBI-1245329,intact:EBI-743923,,,,,0.67,...,,,MASASSSRAGVALPFEKSQLTLKVVSAKPKVHNRQPRINSYVEVAV...,MASASSSRAGVALPFEKSQLTLKVVSAKPKVHNRQPRINSYVEVAV...,seq_sort_og_4761,seq_sort_new_16214,yes,yes,mutated,O00308:p.Arg315Cys
1,False,,,intact:EBI-1245329,intact:EBI-743923,,,,,0.67,...,,,MASASSSRAGVALPFEKSQLTLKVVSAKPKVHNRQPRINSYVEVAV...,MASASSSRAGVALPFEKSQLTLKVVSAKPKVHNRQPRINSYVEVAV...,seq_sort_og_4761,seq_sort_new_16214,yes,yes,original,O00308:p.Arg315Cys
2,False,,,intact:EBI-10194262,intact:EBI-11522780,,,,,0.56,...,,,MLFPISMSATEFLLASVIFCLVFWVIRASRPQVPKGLKNPPGPWGW...,MLFPISMSATEFLLASVIFCLVFWVIRASRPQVPKGLKNPPGPWGW...,seq_sort_og_12927,seq_sort_new_39440,yes,yes,mutated,P04798:p.Ile462Val
3,False,,,intact:EBI-10194262,intact:EBI-11522780,,,,,0.56,...,,,MLFPISMSATEFLLASVIFCLVFWVIRASRPQVPKGLKNPPGPWGW...,MLFPISMSATEFLLASVIFCLVFWVIRASRPQVPKGLKNPPGPWGW...,seq_sort_og_12927,seq_sort_new_39440,yes,yes,original,P04798:p.Ile462Val
4,False,,,intact:EBI-10194262,intact:EBI-11522780,,,,,0.56,...,,,MLFPISMSATEFLLASVIFCLVFWVIRASRPQVPKGLKNPPGPWGW...,MLFPISMSATEFLLASVIFCLVFWVIRASRPQVPKGLKNPPGPWGW...,seq_sort_og_12927,seq_sort_new_39439,yes,yes,mutated,P04798:p.Thr461Asn
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83948,False,,,intact:EBI-2952751,intact:EBI-10194102,,,,,0.72,...,,,MARRKPEGSSFNMTHLSMAMAFSFPPVASGQLHPQLGNTQHQTELG...,MARRKPEGSSFNMTHLSMAMAFSFPPVASGQLHPQLGNTQHQTELG...,seq_sort_og_4642,seq_sort_new_15907,yes,yes,original,P04075-2:p.Asp183Gly
83949,False,,,intact:EBI-2952751,intact:EBI-10194102,,,,,0.72,...,,,MARRKPEGSSFNMTHLSMAMAFSFPPVASGQLHPQLGNTQHQTELG...,MARRKPEGSSFNMTHLSMAMAFSFPPVASGQLHPQLGNTQHQTELG...,seq_sort_og_4642,seq_sort_new_15905,yes,yes,mutated,P04075-2:p.Cys393Tyr
83950,False,,,intact:EBI-2952751,intact:EBI-10194102,,,,,0.72,...,,,MARRKPEGSSFNMTHLSMAMAFSFPPVASGQLHPQLGNTQHQTELG...,MARRKPEGSSFNMTHLSMAMAFSFPPVASGQLHPQLGNTQHQTELG...,seq_sort_og_4642,seq_sort_new_15905,yes,yes,original,P04075-2:p.Cys393Tyr
83951,False,,,intact:EBI-8659,intact:EBI-12430,,,,,0.37,...,,,MASETFEFQAEITQLMSLIINTVYSNKEIFLRELISNASDALDKIR...,MASETFEFQAEITQLMSLIINTVYSNKEIFLRALISNASDALDKIR...,seq_sort_og_4954,seq_sort_new_16572,yes,yes,mutated,P02829:p.Glu33Ala


In [445]:
# Some rows are essentially the same record but carry different mutation
# feature annotations. They represent the same mutation because they have the
# same range, original sequence and resulting sequence, yet they differ in
# their MI terms and comments.
# Compare the total number of rows with the number of distinct index keys.
indexcols = ["unique_id", "seq_pair_id", "interaction_intactid", "mutation_short"]
print(simplemerged_mut.shape[0])
print(simplemerged_mut.drop_duplicates(subset=indexcols).shape[0])

817896
775672


In [446]:
# Collapse simplemerged_mut to one row per
# (unique_id, seq_pair_id, interaction_intactid, mutation_short) key.
# Columns listed in keep_all_info_dup_cols are aggregated with
# join_unique_nonnull; every other column is aggregated with take_first.
# NOTE(review): both aggregators and harmonize_nulls_to_nan are defined
# elsewhere in the notebook - presumably "pipe-join distinct non-null values",
# "first value" and "normalise null-like values to NaN"; confirm there.
need_pipejoin_cols = keep_all_info_dup_cols
keep_first_cols = dup_cols + specialcase_dup_cols

simplemerged_mut = harmonize_nulls_to_nan(simplemerged_mut)

groupby_cols = ["unique_id", "seq_pair_id", "interaction_intactid", "mutation_short"]

# Fail early if any key column is absent from the frame.
missing = [key for key in groupby_cols if key not in simplemerged_mut.columns]
assert not missing, f"Missing groupby cols: {missing}"

# Everything that is not part of the key gets an aggregation rule.
non_group_cols = [name for name in simplemerged_mut.columns if name not in groupby_cols]

agg_spec: dict[str, object] = {}
for c in non_group_cols:
    # keep_first_cols and all remaining columns share the same rule.
    agg_spec[c] = join_unique_nonnull if c in need_pipejoin_cols else take_first

# dropna=False keeps groups whose key contains missing values.
simplemerged_mut = simplemerged_mut.groupby(
    groupby_cols, dropna=False, as_index=False
).agg(agg_spec)

print(f"Grouped on {groupby_cols}. New db size: {len(simplemerged_mut)}")
display(simplemerged_mut.head())

  out = out.replace({"": pd.NA, "None": pd.NA, "nan": pd.NA})


Grouped on ['unique_id', 'seq_pair_id', 'interaction_intactid', 'mutation_short']. New db size: 775672


Unnamed: 0,unique_id,seq_pair_id,interaction_intactid,mutation_short,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,...,Mutated all_og_binds,Mutated decisive_entry_new_binds,Mutated decisive_entry_og_binds,seq_sort_og,seq_sort_new,seq_sort_og_id,seq_sort_new_id,Mutated decisive_seqpair_og_binds,Mutated decisive_seqpair_new_binds,Mutated Partner Status
0,intact:EBI-100018_intact:EBI-101707,seqpair1,EBI-22074159,,False,MGFPRILSKNNKIYTKLGEFCLSGDSFWIVCHTCQEELQTQDQFWK...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,,,intact:EBI-101707,...,,,,,,,,,,
1,intact:EBI-100018_intact:EBI-102069,seqpair2,EBI-502739,,False,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,MEIPIQVAVRIFPHRELKDLLRSFGPTEPKKDAQAVDEGADSKDSE...,,,intact:EBI-100018,...,,,,,,,,,,
2,intact:EBI-100018_intact:EBI-104215,seqpair3,EBI-263347,,False,MLPFRLGLLLGAVLFVASANGAAIENEVSSLNDLQREKRSGRGYSR...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,,,intact:EBI-104215,...,,,,,,,,,,
3,intact:EBI-100018_intact:EBI-107089,seqpair4,EBI-235587,,False,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,MSNYYSLLLQADTYDDESIGDERSEEDTDDASETEFRSPSRYGAMN...,,,intact:EBI-100018,...,,,,,,,,,,
4,intact:EBI-100018_intact:EBI-117032,seqpair5,EBI-22074151,,False,MSPPSGEFRCRVCLKQDELLVDIYEIVEEMQVDLCTLLETCGGIKV...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,,,intact:EBI-117032,...,,,,,,,,,,


In [447]:
# Inspect two specific interactions after collapsing: sequences, interactor
# ids, per-interactor mutation labels and the mutation-matching columns.
l = ["EBI-22091507", "EBI-22091555"]
simplemerged_mut.loc[
    simplemerged_mut["interaction_intactid"].isin(l),
    [
        "interaction_intactid",
        "aa_1",
        "aa_2",
        "all_intact_A_sorted",
        "all_intact_B_sorted",
        "mutation_short_1",
        "mutation_short_2",
        "Mutation Affected protein AC",
        "Mutated Partner",
        "Mutation Interactor Matches",
    ],
]

Unnamed: 0,interaction_intactid,aa_1,aa_2,all_intact_A_sorted,all_intact_B_sorted,mutation_short_1,mutation_short_2,Mutation Affected protein AC,Mutated Partner,Mutation Interactor Matches
754385,EBI-22091507,MTLQCTKSAGPWKMVVWDEDGFQGRRHEFTAECPSVLELGFETVRS...,MTLQCTKSAGPWKMVVWDEDGFQGRRHEFTAECPSVLELGFETVRS...,intact:EBI-7519711,intact:EBI-7519711,P53673:p.Gly64Trp,,,A,
754388,EBI-22091555,MTLQCTKSAGPWKMVVWDEDGFQGRRHEFTAECPSVLELGFETVRS...,MTLQCTKSAGPWKMVVWDEDGFQGRRHEFTAECPSVLELGFETVRS...,intact:EBI-7519711,intact:EBI-7519711,P53673:p.Gly64Trp,,,A,


In [448]:
# Report, for several candidate keys, how many rows of simplemerged_neg_mut
# repeat an earlier row on that key. test1 ends up holding the last count.
for dup_key in (
    ["unique_id"],
    ["interaction_intactid"],
    ["seq_pair_id"],
    ["unique_id", "seq_pair_id"],
    ["unique_id", "interaction_intactid"],
    ["seq_pair_id", "interaction_intactid"],
    ["seq_pair_id", "interaction_intactid", "mutation_short"],
):
    test1 = len(simplemerged_neg_mut.loc[simplemerged_neg_mut.duplicated(dup_key)])
    print(f"Rows in simplemerged_neg_mut with duplicate {'+'.join(dup_key)}: {test1}")

Rows in simplemerged_neg_mut with duplicate unique_id: 12798
Rows in simplemerged_neg_mut with duplicate interaction_intactid: 7271
Rows in simplemerged_neg_mut with duplicate seq_pair_id: 12870
Rows in simplemerged_neg_mut with duplicate unique_id+seq_pair_id: 12797
Rows in simplemerged_neg_mut with duplicate unique_id+interaction_intactid: 7271
Rows in simplemerged_neg_mut with duplicate seq_pair_id+interaction_intactid: 7271
Rows in simplemerged_neg_mut with duplicate seq_pair_id+interaction_intactid+mutation_short: 33


In [449]:
# Report, for several candidate keys, how many rows of
# simplemerged_mut_unknown repeat an earlier row on that key.
# test1 ends up holding the last count.
for dup_key in (
    ["unique_id"],
    ["interaction_intactid"],
    ["seq_pair_id"],
    ["unique_id", "seq_pair_id"],
    ["unique_id", "interaction_intactid"],
    ["seq_pair_id", "interaction_intactid"],
    ["seq_pair_id", "interaction_intactid", "mutation_short"],
):
    test1 = len(simplemerged_mut_unknown.loc[simplemerged_mut_unknown.duplicated(dup_key)])
    print(f"Rows in simplemerged_mut_unknown with duplicate {'+'.join(dup_key)}: {test1}")

Rows in simplemerged_mut_unknown with duplicate unique_id: 10689
Rows in simplemerged_mut_unknown with duplicate interaction_intactid: 6348
Rows in simplemerged_mut_unknown with duplicate seq_pair_id: 10713
Rows in simplemerged_mut_unknown with duplicate unique_id+seq_pair_id: 10688
Rows in simplemerged_mut_unknown with duplicate unique_id+interaction_intactid: 6347
Rows in simplemerged_mut_unknown with duplicate seq_pair_id+interaction_intactid: 6347
Rows in simplemerged_mut_unknown with duplicate seq_pair_id+interaction_intactid+mutation_short: 2907


In [450]:
# Collapse simplemerged_neg_mut to one row per
# (unique_id, seq_pair_id, interaction_intactid, mutation_short) key.
# Columns listed in keep_all_info_dup_cols are aggregated with
# join_unique_nonnull; every other column is aggregated with take_first.
# NOTE(review): both aggregators and harmonize_nulls_to_nan are defined
# elsewhere in the notebook - presumably "pipe-join distinct non-null values",
# "first value" and "normalise null-like values to NaN"; confirm there.
need_pipejoin_cols = keep_all_info_dup_cols
keep_first_cols = dup_cols + specialcase_dup_cols

simplemerged_neg_mut = harmonize_nulls_to_nan(simplemerged_neg_mut)

groupby_cols = ["unique_id", "seq_pair_id", "interaction_intactid", "mutation_short"]

# Fail early if any key column is absent from the frame.
missing = [key for key in groupby_cols if key not in simplemerged_neg_mut.columns]
assert not missing, f"Missing groupby cols: {missing}"

# Everything that is not part of the key gets an aggregation rule.
non_group_cols = [name for name in simplemerged_neg_mut.columns if name not in groupby_cols]

agg_spec: dict[str, object] = {}
for c in non_group_cols:
    # keep_first_cols and all remaining columns share the same rule.
    agg_spec[c] = join_unique_nonnull if c in need_pipejoin_cols else take_first

# dropna=False keeps groups whose key contains missing values.
simplemerged_neg_mut = simplemerged_neg_mut.groupby(
    groupby_cols, dropna=False, as_index=False
).agg(agg_spec)

print(f"Grouped on {groupby_cols}. New db size: {len(simplemerged_neg_mut)}")
display(simplemerged_neg_mut.head())

  out = out.replace({"": pd.NA, "None": pd.NA, "nan": pd.NA})


Grouped on ['unique_id', 'seq_pair_id', 'interaction_intactid', 'mutation_short']. New db size: 20189


Unnamed: 0,unique_id,seq_pair_id,interaction_intactid,mutation_short,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,...,Mutated all_og_binds,Mutated decisive_entry_new_binds,Mutated decisive_entry_og_binds,seq_sort_og,seq_sort_new,seq_sort_og_id,seq_sort_new_id,Mutated decisive_seqpair_og_binds,Mutated decisive_seqpair_new_binds,Mutated Partner Status
0,intact:EBI-1000553_intact:EBI-475981,seqpair85,EBI-2437595,P08069:p.Lys1033Ala,True,MKSGSGGGSPTSLWGLLFLSAALSLWPTSGEICGPGIDIRNDYQQL...,MNKLSGGGGRRTRVEGGQLGGEEWTRHGSFVNKPTRGWLHPNDKVM...,,,intact:EBI-475981,...,yes,no,yes,MKSGSGGGSPTSLWGLLFLSAALSLWPTSGEICGPGIDIRNDYQQL...,MKSGSGGGSPTSLWGLLFLSAALSLWPTSGEICGPGIDIRNDYQQL...,seq_sort_og_12604,seq_sort_new_38713,yes,no,mutated
1,intact:EBI-1000553_intact:EBI-475981,seqpair85,EBI-2437595,P08069:p.Tyr980Phe,True,MKSGSGGGSPTSLWGLLFLSAALSLWPTSGEICGPGIDIRNDYQQL...,MNKLSGGGGRRTRVEGGQLGGEEWTRHGSFVNKPTRGWLHPNDKVM...,,,intact:EBI-475981,...,yes,no,yes,MKSGSGGGSPTSLWGLLFLSAALSLWPTSGEICGPGIDIRNDYQQL...,MKSGSGGGSPTSLWGLLFLSAALSLWPTSGEICGPGIDIRNDYQQL...,seq_sort_og_12604,seq_sort_new_38712,yes,no,mutated
2,intact:EBI-1001438_intact:EBI-10973816,seqpair215,EBI-15947498,Q8NG31-2:p.[Ile213Ala;Phe215Ala;Phe218Ala],True,MAAVKKEGGALSEAMSLEGDEWELSKENVQPLRQGRIMSTLQGALA...,MDGVSSEANEENDNIERPVRRRHSSILKPPRSPLQDLRGGNERVQE...,,,intact:EBI-1001438,...,yes,no,yes,MAAVKKEGGALSEAMSLEGDEWELSKENVQPLRQGRIMSTLQGALA...,MAAVKKEGGALSEAMSLEGDEWELSKENVQPLRQGRIMSTLQGALA...,seq_sort_og_1700,seq_sort_new_5944,yes,no,mutated
3,intact:EBI-1001438_intact:EBI-10973816,seqpair215,EBI-15947498,Q8NG31-2:p.[Phe215Ala;Ile219Ala],True,MAAVKKEGGALSEAMSLEGDEWELSKENVQPLRQGRIMSTLQGALA...,MDGVSSEANEENDNIERPVRRRHSSILKPPRSPLQDLRGGNERVQE...,,,intact:EBI-1001438,...,yes,no,yes,MAAVKKEGGALSEAMSLEGDEWELSKENVQPLRQGRIMSTLQGALA...,MAAVKKEGGALSEAMSLEGDEWELSKENVQPLRQGRIMSTLQGALA...,seq_sort_og_1700,seq_sort_new_5946,yes,no,mutated
4,intact:EBI-1001438_intact:EBI-10973816,seqpair215,EBI-15947498,Q8NG31-2:p.[Phe215Ala;Phe218Ala],True,MAAVKKEGGALSEAMSLEGDEWELSKENVQPLRQGRIMSTLQGALA...,MDGVSSEANEENDNIERPVRRRHSSILKPPRSPLQDLRGGNERVQE...,,,intact:EBI-1001438,...,yes,no,yes,MAAVKKEGGALSEAMSLEGDEWELSKENVQPLRQGRIMSTLQGALA...,MAAVKKEGGALSEAMSLEGDEWELSKENVQPLRQGRIMSTLQGALA...,seq_sort_og_1700,seq_sort_new_5945,yes,no,mutated


In [451]:
# Re-run the duplicate counts on simplemerged_neg_mut after it has been
# collapsed: for each candidate key, count the rows that repeat an earlier
# row on that key. test1 ends up holding the last count.
for dup_key in (
    ["unique_id"],
    ["interaction_intactid"],
    ["seq_pair_id"],
    ["unique_id", "seq_pair_id"],
    ["unique_id", "interaction_intactid"],
    ["seq_pair_id", "interaction_intactid"],
    ["seq_pair_id", "interaction_intactid", "mutation_short"],
):
    test1 = len(simplemerged_neg_mut.loc[simplemerged_neg_mut.duplicated(dup_key)])
    print(f"Rows in simplemerged_neg_mut with duplicate {'+'.join(dup_key)}: {test1}")

Rows in simplemerged_neg_mut with duplicate unique_id: 12765
Rows in simplemerged_neg_mut with duplicate interaction_intactid: 7238
Rows in simplemerged_neg_mut with duplicate seq_pair_id: 12837
Rows in simplemerged_neg_mut with duplicate unique_id+seq_pair_id: 12764
Rows in simplemerged_neg_mut with duplicate unique_id+interaction_intactid: 7238
Rows in simplemerged_neg_mut with duplicate seq_pair_id+interaction_intactid: 7238
Rows in simplemerged_neg_mut with duplicate seq_pair_id+interaction_intactid+mutation_short: 0


In [452]:
# List the column names of simplemerged_neg_mut that contain "decisive".
list(filter(lambda name: "decisive" in name, simplemerged_neg_mut.columns))

['Mutated decisive_entry_new_binds',
 'Mutated decisive_entry_og_binds',
 'Mutated decisive_seqpair_og_binds',
 'Mutated decisive_seqpair_new_binds']

In [453]:
# Collapse simplemerged_mut_unknown to one row per
# (unique_id, seq_pair_id, interaction_intactid, mutation_short) key.
# Columns listed in keep_all_info_dup_cols are aggregated with
# join_unique_nonnull; every other column is aggregated with take_first.
# NOTE(review): both aggregators and harmonize_nulls_to_nan are defined
# elsewhere in the notebook - presumably "pipe-join distinct non-null values",
# "first value" and "normalise null-like values to NaN"; confirm there.
need_pipejoin_cols = keep_all_info_dup_cols
keep_first_cols = dup_cols + specialcase_dup_cols

simplemerged_mut_unknown = harmonize_nulls_to_nan(simplemerged_mut_unknown)

groupby_cols = ["unique_id", "seq_pair_id", "interaction_intactid", "mutation_short"]

# Fail early if any key column is absent from the frame.
missing = [key for key in groupby_cols if key not in simplemerged_mut_unknown.columns]
assert not missing, f"Missing groupby cols: {missing}"

# Everything that is not part of the key gets an aggregation rule.
non_group_cols = [name for name in simplemerged_mut_unknown.columns if name not in groupby_cols]

agg_spec: dict[str, object] = {}
for c in non_group_cols:
    # keep_first_cols and all remaining columns share the same rule.
    agg_spec[c] = join_unique_nonnull if c in need_pipejoin_cols else take_first

# dropna=False keeps groups whose key contains missing values.
simplemerged_mut_unknown = simplemerged_mut_unknown.groupby(
    groupby_cols, dropna=False, as_index=False
).agg(agg_spec)

print(f"Grouped on {groupby_cols}. New db size: {len(simplemerged_mut_unknown)}")
display(simplemerged_mut_unknown.head())

  out = out.replace({"": pd.NA, "None": pd.NA, "nan": pd.NA})


Grouped on ['unique_id', 'seq_pair_id', 'interaction_intactid', 'mutation_short']. New db size: 10798


Unnamed: 0,unique_id,seq_pair_id,interaction_intactid,mutation_short,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,...,invalids_mutated_aa_2,mutation_new_binds_bo_mi,mutation_og_binds_bo_mi,Mutated all_new_binds,Mutated all_og_binds,Mutated decisive_entry_new_binds,Mutated decisive_entry_og_binds,seq_sort_new,seq_sort_new_id,Mutated decisive_seqpair_new_binds
0,intact:EBI-10006231_intact:EBI-2511350,seqpair97,EBI-9678671,Q16513:p.Leu520Ile,False,MASNPERGEILLTELQGDSRSLPFSENVSAVQKLDFSDTMVQQKLD...,SMSYTWTGALITPCAAEESKLPINPLSNSLLRHHNMVYATTSRSAS...,,,intact:EBI-2511350,...,,unknown,unknown,unknown,unknown,unknown,unknown,,,
1,intact:EBI-10006231_intact:EBI-2511350,seqpair97,EBI-9998408,Q16513:p.Leu520Ile,False,MASNPERGEILLTELQGDSRSLPFSENVSAVQKLDFSDTMVQQKLD...,SMSYTWTGALITPCAAEESKLPINPLSNSLLRHHNMVYATTSRSAS...,,,intact:EBI-2511350,...,,unknown,unknown,unknown,unknown,unknown,unknown,,,
2,intact:EBI-1001438_intact:EBI-10973816,seqpair215,EBI-15947469,O60566:p.[Leu128Ala;Leu131Ala],False,MAAVKKEGGALSEAMSLEGDEWELSKENVQPLRQGRIMSTLQGALA...,MDGVSSEANEENDNIERPVRRRHSSILKPPRSPLQDLRGGNERVQE...,,,intact:EBI-1001438,...,,yes,yes,yes,yes,yes,yes,MAAVKKEGGALSEAMSLEGDEWELSKENVQPLRQGRIMSTLQGALA...,seq_sort_new_5937,unknown
3,intact:EBI-1001438_intact:EBI-10973816,seqpair215,EBI-15947469,O60566:p.[Tyr141Ala;Leu142Ala],False,MAAVKKEGGALSEAMSLEGDEWELSKENVQPLRQGRIMSTLQGALA...,MDGVSSEANEENDNIERPVRRRHSSILKPPRSPLQDLRGGNERVQE...,,,intact:EBI-1001438,...,,yes,yes,yes,yes,yes,yes,MAAVKKEGGALSEAMSLEGDEWELSKENVQPLRQGRIMSTLQGALA...,seq_sort_new_5938,unknown
4,intact:EBI-1001438_intact:EBI-10973816,seqpair215,EBI-15947532,O60566:p.[Leu128Ala;Leu131Ala],False,MAAVKKEGGALSEAMSLEGDEWELSKENVQPLRQGRIMSTLQGALA...,MDGVSSEANEENDNIERPVRRRHSSILKPPRSPLQDLRGGNERVQE...,,,intact:EBI-1001438,...,,no,yes,no,yes,no,yes,MAAVKKEGGALSEAMSLEGDEWELSKENVQPLRQGRIMSTLQGALA...,seq_sort_new_5937,unknown


In [454]:
# Sanity check before saving: for several key combinations, count the rows of
# simplemerged_mut_unknown that repeat an earlier row on those columns
# (DataFrame.duplicated marks every occurrence after the first).
duplicate_key_sets = [
    ["unique_id"],
    ["interaction_intactid"],
    ["seq_pair_id"],
    ["unique_id", "seq_pair_id"],
    ["unique_id", "interaction_intactid"],
    ["seq_pair_id", "interaction_intactid"],
    # The original subset listed "mutation_short" twice; once is enough.
    ["seq_pair_id", "interaction_intactid", "mutation_short"],
]
for key_cols in duplicate_key_sets:
    test1 = int(simplemerged_mut_unknown.duplicated(key_cols).sum())
    print(f"Rows in simplemerged_mut_unknown with duplicate {'+'.join(key_cols)}: {test1}")

Rows in simplemerged_mut_unknown with duplicate unique_id: 7782
Rows in simplemerged_mut_unknown with duplicate interaction_intactid: 3441
Rows in simplemerged_mut_unknown with duplicate seq_pair_id: 7806
Rows in simplemerged_mut_unknown with duplicate unique_id+seq_pair_id: 7781
Rows in simplemerged_mut_unknown with duplicate unique_id+interaction_intactid: 3440
Rows in simplemerged_mut_unknown with duplicate seq_pair_id+interaction_intactid: 3440
Rows in simplemerged_mut_unknown with duplicate seq_pair_id+interaction_intactid+mutation_short: 0


In [455]:
# Checkpoint: write every working table to the intermediate directory so the
# notebook can be resumed from this point later.
savedir = "/scratch/pranamlab/sophie/interactome/interactome/data_files/processed/intact/intermediate"
os.makedirs(savedir,exist_ok=True)


def _save_checkpoint(table, stem):
    """Write `table` (without its index) to <savedir>/<stem>_post_mutation_processing_dec11_2025.csv."""
    table.to_csv(f"{savedir}/{stem}_post_mutation_processing_dec11_2025.csv",index=False)


# IntAct tables (positives, clustered positives, negatives, clustered negatives)
_save_checkpoint(intact, "intact")
_save_checkpoint(intact_clust, "intact_clust")
_save_checkpoint(intact_neg, "intact_neg")
_save_checkpoint(intact_clust_neg, "intact_clust_neg")
# merged table and the my_pos / my_neg tables
_save_checkpoint(merged, "merged")
_save_checkpoint(my_pos, "my_pos")
_save_checkpoint(my_neg, "my_neg")
# simplemerged: merged, but with mutant data merged in and bad mutant data
# removed; it does not carry every column that merged had.
_save_checkpoint(simplemerged_mut, "simplemerged_mut")
_save_checkpoint(simplemerged_mut_unknown, "simplemerged_mut_unknown")
# simplemerged_neg_mut is stored under the "my_neg_with_mut" file stem.
_save_checkpoint(simplemerged_neg_mut, "my_neg_with_mut")


# PTMs

## Intermediate file load: only run this section if you need to reload the intermediate files

In [456]:
# Reading bindsites needs a larger csv field-size limit than the default.
# Start from a big number and divide by 10 until the platform accepts it
# (csv.field_size_limit raises OverflowError when the value does not fit).
limit = 10**9
while True:
    try:
        csv.field_size_limit(limit)
    except OverflowError:
        limit //= 10
    else:
        break

In [457]:
def _with_intact_prefix(interaction_ac):
    """Return string accessions prefixed with "intact:" (unless already prefixed); other values pass through."""
    if type(interaction_ac) is str and not interaction_ac.startswith("intact:"):
        return "intact:" + interaction_ac
    return interaction_ac


# Raw IntAct feature tables (tab-separated, parsed with the python engine).
ptms_path = "data_files/raw/intact/psimitab/features/ptms.tsv"
ptms = pd.read_csv(ptms_path, sep="\t", engine="python")
ptms["Interaction AC"] = ptms["Interaction AC"].apply(_with_intact_prefix)

mutations_path = "data_files/raw/intact/psimitab/features/mutations.tsv"
mutations = pd.read_csv(mutations_path, sep="\t", engine="python")
mutations["Interaction AC"] = mutations["Interaction AC"].apply(_with_intact_prefix)

bindsites_path = "data_files/raw/intact/psimitab/features/bindings_regions.tsv"
bindsites = pd.read_csv(bindsites_path, sep="\t", engine="python")
bindsites["Interaction AC"] = bindsites["Interaction AC"].apply(_with_intact_prefix)

# Previously analyzed/labelled feature tables.
analyzed_mods_dir = "/scratch/pranamlab/sophie/interactome/interactome/data_files/processed/intact/features_analyzed"
bindsite_types_labeled = pd.read_csv(f"{analyzed_mods_dir}/bindsite_types_analyzed.csv")
mutation_feature_ac_labeled = pd.read_csv(f"{analyzed_mods_dir}/mutation_feature_ac_analyzed.csv")
mutation_feature_annotations_labeled = pd.read_csv(f"{analyzed_mods_dir}/mutation_feature_annotations_analyzed.csv")
mutation_feature_types_labeled = pd.read_csv(f"{analyzed_mods_dir}/mutation_feature_types_analyzed.csv")
ptm_feature_types_labeled = pd.read_csv(f"{analyzed_mods_dir}/ptm_feature_types_analyzed.csv")
ptm_feature_annotations_labeled = pd.read_csv(f"{analyzed_mods_dir}/ptm_feature_annotations_analyzed.csv")

# Controlled-vocabulary subtree tables, one per MI root term.
cv_dir = "/scratch/pranamlab/sophie/interactome/interactome/data_files/processed/intact/cv"
interaction_milabel_ok = pd.read_csv(f"{cv_dir}/mi_0190_subtree.csv")
mutation_mi_ok = pd.read_csv(f"{cv_dir}/mi_0118_subtree.csv")
bindsite_mi_ok = pd.read_csv(f"{cv_dir}/mi_0117_subtree.csv")
ptm_mi_ok = pd.read_csv(f"{cv_dir}/mi_0925_subtree.csv")

In [458]:
# Reload the intermediate checkpoint tables (default dtype inference) so work
# can resume from the saved state.
savedir = "/scratch/pranamlab/sophie/interactome/interactome/data_files/processed/intact/intermediate"
os.makedirs(savedir,exist_ok=True)


def _read_checkpoint(stem):
    """Read <savedir>/<stem>_post_mutation_processing_dec11_2025.csv into a DataFrame."""
    return pd.read_csv(f"{savedir}/{stem}_post_mutation_processing_dec11_2025.csv")


intact = _read_checkpoint("intact")
intact_clust = _read_checkpoint("intact_clust")

merged = _read_checkpoint("merged")

# NOTE(review): merged_neg is read from the very same "merged" file as `merged`,
# so the two frames start out identical - confirm whether a separate negatives
# file was intended here.
merged_neg = _read_checkpoint("merged")

my_pos = _read_checkpoint("my_pos")
my_neg = _read_checkpoint("my_neg")

simplemerged_mut = _read_checkpoint("simplemerged_mut")

simplemerged_mut_unknown = _read_checkpoint("simplemerged_mut_unknown")

# simplemerged_neg_mut lives under the "my_neg_with_mut" file stem.
simplemerged_neg_mut = _read_checkpoint("my_neg_with_mut")



  intact = pd.read_csv(f"{savedir}/intact_post_mutation_processing_dec11_2025.csv")
  intact_clust = pd.read_csv(f"{savedir}/intact_clust_post_mutation_processing_dec11_2025.csv")
  merged = pd.read_csv(f"{savedir}/merged_post_mutation_processing_dec11_2025.csv")
  merged_neg = pd.read_csv(f"{savedir}/merged_post_mutation_processing_dec11_2025.csv")
  my_pos = pd.read_csv(f"{savedir}/my_pos_post_mutation_processing_dec11_2025.csv")
  my_neg = pd.read_csv(f"{savedir}/my_neg_post_mutation_processing_dec11_2025.csv")
  simplemerged_mut = pd.read_csv(f"{savedir}/simplemerged_mut_post_mutation_processing_dec11_2025.csv")
  simplemerged_mut_unknown = pd.read_csv(f"{savedir}/simplemerged_mut_unknown_post_mutation_processing_dec11_2025.csv")
  simplemerged_neg_mut = pd.read_csv(f"{savedir}/my_neg_with_mut_post_mutation_processing_dec11_2025.csv")


In [459]:
def _string_schema(columns, overrides):
    """Map every name in `columns` to "string", then apply the `overrides` mapping on top."""
    schema = {name: "string" for name in columns}
    schema.update(overrides)
    return schema


# intact: "miscore" is deliberately left as a string here.
intact_dtypes = _string_schema(intact.columns, {"Negative": "bool"})

intact_clust_dtypes = _string_schema(
    intact_clust.columns,
    {"Negative": "bool", "miscore": "float", "equal_score_int": "bool"},
)

# simplemerged and merged share the same typed columns. The *_mut_has_info
# flags of simplemerged are not overridden and therefore stay strings.
_interaction_overrides = {
    "Negative": "bool",
    "length_1": "int",
    "length_2": "int",
    "miscore": "float",
    "year": "int",
    "confidence_val_int": "float",
    "unique_score_int": "float",
}
simplemerged_dtypes = _string_schema(simplemerged_mut.columns, _interaction_overrides)
merged_dtypes = _string_schema(merged.columns, _interaction_overrides)

my_pos_dtypes = _string_schema(
    my_pos.columns,
    {"length_1": "int", "length_2": "int", "year": "int"},
)

# my_neg: "Negative" is not overridden, so it stays a string.
my_neg_dtypes = _string_schema(
    my_neg.columns,
    {"miscore": "float", "length_1": "int", "length_2": "int", "year": "int"},
)


In [461]:
# Re-read the checkpoint files, this time with the explicit dtype maps built above.
# Note that merged_neg is not re-read in this cell.
savedir = "/scratch/pranamlab/sophie/interactome/interactome/data_files/processed/intact/intermediate"
os.makedirs(savedir,exist_ok=True)


def _read_typed_checkpoint(stem, dtypes):
    """Read <savedir>/<stem>_post_mutation_processing_dec11_2025.csv using the given dtype mapping."""
    return pd.read_csv(
        f"{savedir}/{stem}_post_mutation_processing_dec11_2025.csv",
        dtype=dtypes,
    )


intact = _read_typed_checkpoint("intact", intact_dtypes)
intact_clust = _read_typed_checkpoint("intact_clust", intact_clust_dtypes)

merged = _read_typed_checkpoint("merged", merged_dtypes)

my_pos = _read_typed_checkpoint("my_pos", my_pos_dtypes)
my_neg = _read_typed_checkpoint("my_neg", my_neg_dtypes)

# The three simplemerged-style tables all use simplemerged_dtypes.
simplemerged_mut = _read_typed_checkpoint("simplemerged_mut", simplemerged_dtypes)

simplemerged_mut_unknown = _read_typed_checkpoint("simplemerged_mut_unknown", simplemerged_dtypes)

simplemerged_neg_mut = _read_typed_checkpoint("my_neg_with_mut", simplemerged_dtypes)



In [462]:
# Pass every reloaded table through harmonize_nulls_to_nan (defined elsewhere
# in this notebook), rebinding each name to the returned frame.
# NOTE(review): presumably this unifies the different null markers to NaN -
# confirm against the helper's definition.
intact = harmonize_nulls_to_nan(intact)
intact_clust = harmonize_nulls_to_nan(intact_clust)
merged = harmonize_nulls_to_nan(merged)
merged_neg = harmonize_nulls_to_nan(merged_neg)
my_pos = harmonize_nulls_to_nan(my_pos)
my_neg = harmonize_nulls_to_nan(my_neg)
simplemerged_mut = harmonize_nulls_to_nan(simplemerged_mut)
simplemerged_mut_unknown = harmonize_nulls_to_nan(simplemerged_mut_unknown)
simplemerged_neg_mut = harmonize_nulls_to_nan(simplemerged_neg_mut)


In [463]:
# Turn the *_mut_has_info columns into real booleans; missing values become False.
should_be_bool = ['scraped_mut_has_info', 'scraped_mut_has_info_1', 'scraped_mut_has_info_2', 'agg_mut_has_info']


def _parse_has_info_flag(value):
    """
    Convert one cell of a *_has_info column to a Python bool.

    - real booleans are returned as-is
    - strings are parsed by content: "", "false", "0", "nan" and "none"
      (case-insensitive, surrounding whitespace ignored) are False, anything
      else is True. A plain bool("False") would be True, because every
      non-empty string is truthy.
    - None, NaN and pd.NA are False (bool(pd.NA) would raise TypeError)
    - any other value falls back to its ordinary truthiness
    """
    if isinstance(value, (bool, np.bool_)):
        return bool(value)
    if isinstance(value, str):
        return value.strip().lower() not in ("", "false", "0", "nan", "none")
    if value is None or value is pd.NA:
        return False
    if isinstance(value, float) and np.isnan(value):
        return False
    return bool(value)


for flag_col in should_be_bool:
    for df in [simplemerged_mut, simplemerged_mut_unknown, simplemerged_neg_mut, my_pos, my_neg]:
        # Not every table carries every flag column.
        if flag_col in df:
            df[flag_col] = df[flag_col].apply(_parse_has_info_flag)

In [464]:
# Tally how many rows of simplemerged_mut have each scraped_mut_has_info value.
simplemerged_mut["scraped_mut_has_info"].value_counts()

scraped_mut_has_info
False    708214
True      67458
Name: count, dtype: int64

## Data processing

In [465]:
# Work on a deep copy so the PTM processing below never touches `merged` itself.
merged_expl = merged.copy(deep=True)
print(f"Length of merged: {len(merged_expl)}. Merged is already exploded by IntAct interaction identifier(s)")

# Double-check the one-interaction-ID-per-row invariant:
# (1) no row is missing its interaction ID
id_check_mask = merged_expl["interaction_intactid"].isna()
test1 = not id_check_mask.any()
print(f"\tAll rows have one intact interaction ID after exploding: {test1}")
# (2) no row contains more than one "EBI-" identifier
id_check_mask = merged_expl["interaction_intactid"].str.count("EBI-") > 1
test1 = not id_check_mask.any()
print(f"\tNo rows have >1 intact interaction ID after exploding: {test1}")

Length of merged: 745085. Merged is already exploded by IntAct interaction identifier(s)
	All rows have one intact interaction ID after exploding: True
	No rows have >1 intact interaction ID after exploding: True


In [466]:
# Work on a deep copy so the PTM processing below never touches `merged_neg` itself.
merged_neg_expl = merged_neg.copy(deep=True)
print(f"Length of merged_neg: {len(merged_neg_expl)}. Merged is already exploded by IntAct interaction identifier(s)")

# Double-check the one-interaction-ID-per-row invariant:
# (1) no row is missing its interaction ID
id_check_mask = merged_neg_expl["interaction_intactid"].isna()
test1 = not id_check_mask.any()
print(f"\tAll rows have one intact interaction ID after exploding: {test1}")
# (2) no row contains more than one "EBI-" identifier
id_check_mask = merged_neg_expl["interaction_intactid"].str.count("EBI-") > 1
test1 = not id_check_mask.any()
print(f"\tNo rows have >1 intact interaction ID after exploding: {test1}")

Length of merged_neg: 745085. Merged is already exploded by IntAct interaction identifier(s)
	All rows have one intact interaction ID after exploding: True
	No rows have >1 intact interaction ID after exploding: True


In [467]:
# Print each merged_expl column whose name contains "ptm", double-quoted and
# one per line (handy for pasting into a list literal).
print("\n".join(f'"{name}"' for name in merged_expl.columns if "ptm" in name))

"ptm_mi_1"
"ptm_name_1"
"ptm_short_1"
"ptm_begin_1"
"ptm_end_1"
"ptm_orig_1"
"ptm_new_1"
"ptm_mi_2"
"ptm_name_2"
"ptm_short_2"
"ptm_begin_2"
"ptm_end_2"
"ptm_orig_2"
"ptm_new_2"


In [468]:
# Scraped PTM columns: the seven fields for interactor 1 first, then the same
# seven fields for interactor 2.
scraped_ptm_cols = [
    f"ptm_{field}_{side}"
    for side in (1, 2)
    for field in ("mi", "name", "short", "begin", "end", "orig", "new")
]

In [469]:
# Flag rows that carry any scraped PTM value: overall, then per interactor.
merged_expl["scraped_ptm_has_info"] = merged_expl[scraped_ptm_cols].notna().any(axis=1)
for side in ("1", "2"):
    side_cols = [col for col in scraped_ptm_cols if col.endswith(f"_{side}")]
    merged_expl[f"scraped_ptm_has_info_{side}"] = merged_expl[side_cols].notna().any(axis=1)

# Rows with PTM data on both interactors.
has_ptm_on_both = (
    merged_expl["scraped_ptm_has_info"]
    & merged_expl["scraped_ptm_has_info_1"]
    & merged_expl["scraped_ptm_has_info_2"]
)
test1 = int(has_ptm_on_both.sum())
print(f"Total rows with PTM entries for both interactor 1 and interactor 2: {test1}/{len(merged_expl)} ({100*test1/len(merged_expl):.2f}%)")
# Rows with PTM data on at least one interactor.
test1 = int(merged_expl["scraped_ptm_has_info"].sum())
print(f"Total rows with PTM entries for either interactor 1 or interactor 2: {test1}/{len(merged_expl)} ({100*test1/len(merged_expl):.2f}%)")

Total rows with PTM entries for both interactor 1 and interactor 2: 485/745085 (0.07%)
Total rows with PTM entries for either interactor 1 or interactor 2: 6455/745085 (0.87%)


In [470]:
# Flag rows that carry any scraped PTM value: overall, then per interactor.
merged_neg_expl["scraped_ptm_has_info"] = merged_neg_expl[scraped_ptm_cols].notna().any(axis=1)
for side in ("1", "2"):
    side_cols = [col for col in scraped_ptm_cols if col.endswith(f"_{side}")]
    merged_neg_expl[f"scraped_ptm_has_info_{side}"] = merged_neg_expl[side_cols].notna().any(axis=1)

# Rows with PTM data on both interactors.
has_ptm_on_both = (
    merged_neg_expl["scraped_ptm_has_info"]
    & merged_neg_expl["scraped_ptm_has_info_1"]
    & merged_neg_expl["scraped_ptm_has_info_2"]
)
test1 = int(has_ptm_on_both.sum())
print(f"Total rows with PTM entries for both interactor 1 and interactor 2: {test1}/{len(merged_neg_expl)} ({100*test1/len(merged_neg_expl):.2f}%)")
# Rows with PTM data on at least one interactor.
test1 = int(merged_neg_expl["scraped_ptm_has_info"].sum())
print(f"Total rows with PTM entries for either interactor 1 or interactor 2: {test1}/{len(merged_neg_expl)} ({100*test1/len(merged_neg_expl):.2f}%)")

Total rows with PTM entries for both interactor 1 and interactor 2: 485/745085 (0.07%)
Total rows with PTM entries for either interactor 1 or interactor 2: 6455/745085 (0.87%)


In [471]:
# For each interactor, record whether its PTM fields pass
# verify_equal_feature_lengths; rows without PTM info for that interactor are
# simply marked False.
# NOTE(review): presumably the helper checks that the "|"-separated PTM fields
# hold the same number of entries - confirm against its definition.
for side in (1, 2):
    merged_expl[f"ptms_expandable_{side}"] = merged_expl.apply(
        lambda row, side=side: (
            verify_equal_feature_lengths(row, scraped_ptm_cols, interactor=side)
            if row[f"scraped_ptm_has_info_{side}"]
            else False
        ),
        axis=1,
    )

In [472]:
# For each interactor, record whether its PTM fields pass
# verify_equal_feature_lengths; rows without PTM info for that interactor are
# simply marked False.
# NOTE(review): presumably the helper checks that the "|"-separated PTM fields
# hold the same number of entries - confirm against its definition.
for side in (1, 2):
    merged_neg_expl[f"ptms_expandable_{side}"] = merged_neg_expl.apply(
        lambda row, side=side: (
            verify_equal_feature_lengths(row, scraped_ptm_cols, interactor=side)
            if row[f"scraped_ptm_has_info_{side}"]
            else False
        ),
        axis=1,
    )

In [473]:
# Report, per interactor, how many rows have PTM info that is not expandable,
# and whether any of those rows nevertheless carry both the original and the
# resulting sequence.
for side in (1, 2):
    not_expandable = (
        merged_expl[f"scraped_ptm_has_info_{side}"] != merged_expl[f"ptms_expandable_{side}"]
    )
    test1 = int(not_expandable.sum())
    print(f"Total rows where interactor {side} PTM info exists but is not expandable: {test1}/{len(merged_expl)} ({100*test1/len(merged_expl):.2f}%)")

    has_both_sequences = (
        merged_expl[f"ptm_orig_{side}"].notna() & merged_expl[f"ptm_new_{side}"].notna()
    )
    test1 = not (not_expandable & has_both_sequences).any()
    print(f"\tAll of these cases are because before-and-after sequences weren't provided: {test1}")

Total rows where interactor 1 PTM info exists but is not expandable: 0/745085 (0.00%)
	All of these cases are because before-and-after sequences weren't provided: True
Total rows where interactor 2 PTM info exists but is not expandable: 0/745085 (0.00%)
	All of these cases are because before-and-after sequences weren't provided: True


In [474]:
# Report, per interactor, how many rows have PTM info that is not expandable,
# and whether any of those rows nevertheless carry both the original and the
# resulting sequence.
for side in (1, 2):
    not_expandable = (
        merged_neg_expl[f"scraped_ptm_has_info_{side}"] != merged_neg_expl[f"ptms_expandable_{side}"]
    )
    test1 = int(not_expandable.sum())
    print(f"Total rows where interactor {side} PTM info exists but is not expandable: {test1}/{len(merged_neg_expl)} ({100*test1/len(merged_neg_expl):.2f}%)")

    has_both_sequences = (
        merged_neg_expl[f"ptm_orig_{side}"].notna() & merged_neg_expl[f"ptm_new_{side}"].notna()
    )
    test1 = not (not_expandable & has_both_sequences).any()
    print(f"\tAll of these cases are because before-and-after sequences weren't provided: {test1}")

Total rows where interactor 1 PTM info exists but is not expandable: 0/745085 (0.00%)
	All of these cases are because before-and-after sequences weren't provided: True
Total rows where interactor 2 PTM info exists but is not expandable: 0/745085 (0.00%)
	All of these cases are because before-and-after sequences weren't provided: True


In [475]:
# Show the PTM feature rows whose "Feature range(s)" value is the literal "?-?".
ptms.loc[ptms["Feature range(s)"]=="?-?"]

Unnamed: 0,# Feature AC,Feature short label,Feature range(s),Original sequence,Resulting sequence,Feature type,Feature annotation(s),Affected protein AC,Affected protein symbol,Affected protein full name,Affected protein organism,Interaction participants,PubMedID,Figure legend(s),Interaction AC,Xref ID(s)
6,EBI-10696222,possible_phophosite,?-?,-,-,"psi-mi:""MI:0170""(phosphorylated residue)",-,uniprotkb:P46108-2,uniprotkb:CRK(gene name),Adapter molecule crk,taxid:9606(human)|taxid:9606(Homo sapiens),"(uniprotkb:O60496(psi-mi:""MI:0326""(protein)), ...",pubmed:25814554|imex:IM-22632,figure legend:Suppl. table S3,intact:EBI-10696209,-
9,EBI-10696220,possible_phophosite,?-?,-,-,"psi-mi:""MI:0170""(phosphorylated residue)",-,uniprotkb:O60496,uniprotkb:DOK2(gene name),Docking protein 2,taxid:9606(human)|taxid:9606(Homo sapiens),"(uniprotkb:O60496(psi-mi:""MI:0326""(protein)), ...",pubmed:25814554|imex:IM-22632,figure legend:Suppl. table S3,intact:EBI-10696209,-
16,EBI-26450950,tyr-?,?-?,-,-,"psi-mi:""MI:0178""(O4'-phospho-L-tyrosine)",-,uniprotkb:Q8NDB2,uniprotkb:BANK1(gene name),B-cell scaffold protein with ankyrin repeats,taxid:9606(human)|taxid:9606(Homo sapiens),"(uniprotkb:P07948(psi-mi:""MI:0326""(protein)), ...",pubmed:11782428|imex:IM-28511,figure legend:Fig. 5A,intact:EBI-26450945,-
25,EBI-11290674,possible_phospho_res,?-?,-,-,"psi-mi:""MI:0170""(phosphorylated residue)",-,uniprotkb:Q01147,uniprotkb:Creb1(gene name),Cyclic AMP-responsive element-binding protein 1,taxid:10090(mouse)|taxid:10090(Mus musculus),"(ensembl:ENSMUSG00000039521(psi-mi:""MI:0250""(g...",pubmed:17591856|imex:IM-24534,figure legend:5B,intact:EBI-11290665,-
26,EBI-8557398,sumoylated lysine,?-?,-,-,"psi-mod:""MOD:01149""(sumoylated lysine)",resulting-ptm:resulting-ptm,uniprotkb:Q92844,uniprotkb:TANK(gene name),TRAF family member-associated NF-kappa-B activ...,taxid:9606(human)|taxid:9606(Homo sapiens),"(uniprotkb:Q9UHD2(psi-mi:""MI:0326""(protein)), ...",pubmed:21212807|imex:IM-15406,figure legend:f1a,intact:EBI-8557380,mint:MINT-8151326(identity)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10275,EBI-984462,region,?-?,-,-,"psi-mi:""MI:0176""(O-phospho-L-serine)",-,uniprotkb:P09803,uniprotkb:Cdh1(gene name),Cadherin-1,taxid:10090(mouse)|taxid:10090(Mus musculus),"(uniprotkb:Q02248(psi-mi:""MI:0326""(protein)), ...",pubmed:16293619|imex:IM-14500,figure legend:3 and Table 1,intact:EBI-984457,-
10277,EBI-10692940,possible_phophosite,?-?,-,-,"psi-mi:""MI:0170""(phosphorylated residue)",-,uniprotkb:Q9UKG1,uniprotkb:APPL1(gene name),DCC-interacting protein 13-alpha,taxid:9606(human)|taxid:9606(Homo sapiens),"(uniprotkb:Q9UKG1(psi-mi:""MI:0326""(protein)), ...",pubmed:25814554|imex:IM-22632,figure legend:Suppl. table S3,intact:EBI-10692929,-
10282,EBI-6374967,region,?-?,-,-,"psi-mod:""MOD:00018""(L-histidine residue)",-,uniprotkb:Q12948,uniprotkb:FOXC1(gene name),Forkhead box protein C1,taxid:9606(human)|taxid:9606(Homo sapiens),"(uniprotkb:Q07021(psi-mi:""MI:0326""(protein)), ...",pubmed:18676636|imex:IM-17919,figure legend:1A,intact:EBI-6374962,-
10284,EBI-9825471,decreasing_phosres,?-?,-,-,"psi-mi:""MI:0170""(phosphorylated residue)",ptm decreasing an interaction,uniprotkb:G3H996,uniprotkb:Ctnnb1(gene name),-,"taxid:10029(crigr)|taxid:10029(""Cricetulus gri...","(uniprotkb:G3HL00(psi-mi:""MI:0326""(protein)), ...",pubmed:21118991|imex:IM-22907,figure legend:Fig.4C,intact:EBI-9819035,-


In [476]:
# Prepare the scraped PTM columns for explode(): every cell becomes a list.
scraped_ptm_cols_1 = [x for x in scraped_ptm_cols if x.endswith("_1")]
scraped_ptm_cols_2 = [x for x in scraped_ptm_cols if x.endswith("_2")]


def _split_ptm_field(row, column, flag_column):
    """Split a "|"-joined string into its parts when the row is flagged expandable; otherwise wrap the value in a one-element list."""
    value = row[column]
    if row[flag_column] and type(value) is str:
        return value.split("|")
    return [value]


for flag_column, side_columns in (
    ("ptms_expandable_1", scraped_ptm_cols_1),
    ("ptms_expandable_2", scraped_ptm_cols_2),
):
    for c in side_columns:
        merged_expl[c] = merged_expl.apply(_split_ptm_field, axis=1, args=(c, flag_column))

In [477]:
def _split_ptm_field(row, column, flag_column):
    """Split a "|"-joined string into its parts when the row is flagged expandable; otherwise wrap the value in a one-element list."""
    value = row[column]
    if row[flag_column] and type(value) is str:
        return value.split("|")
    return [value]


# Prepare the scraped PTM columns of merged_neg_expl for explode(): every cell becomes a list.
for flag_column, side_columns in (
    ("ptms_expandable_1", scraped_ptm_cols_1),
    ("ptms_expandable_2", scraped_ptm_cols_2),
):
    for c in side_columns:
        merged_neg_expl[c] = merged_neg_expl.apply(_split_ptm_field, axis=1, args=(c, flag_column))

In [478]:
# Preview the scraped PTM columns (now holding lists) for the first rows that have scraped PTM info.
merged_expl.loc[merged_expl["scraped_ptm_has_info"]][scraped_ptm_cols].head()

Unnamed: 0,ptm_mi_1,ptm_name_1,ptm_short_1,ptm_begin_1,ptm_end_1,ptm_orig_1,ptm_new_1,ptm_mi_2,ptm_name_2,ptm_short_2,ptm_begin_2,ptm_end_2,ptm_orig_2,ptm_new_2
370,[MI:0925],"[observed-ptm,monoacetylated residue]",[acres],[668],[668],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan]
398,[MI:0639],"[resulting-ptm,observed-ptm,monoacetylated res...",[acetylated residue],[250],[250],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan]
497,[nan],[nan],[nan],[nan],[nan],[nan],[nan],[MI:0925],"[observed-ptm,O-phospho-L-threonine]",[pT257],[257],[257],[nan],[nan]
853,[nan],[nan],[nan],[nan],[nan],[nan],[nan],[MI:0925],"[observed-ptm,N6-glycyl-L-lysine]",[Poly-Ub],[0],[0],[nan],[nan]
863,[MI:0925],"[observed-ptm,N6-glycyl-L-lysine]",[Poly-Ub],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan],[nan]


In [479]:
# One row per PTM entry: explode the interactor-1 columns together, then the
# interactor-2 columns together, renumbering the index after each step.
print(f"Going to explode merged_expl by interactor 1, then interactor 2 PTM columns. Length before: {len(merged_expl)}")
for side, side_columns in ((1, scraped_ptm_cols_1), (2, scraped_ptm_cols_2)):
    merged_expl = merged_expl.explode(side_columns).reset_index(drop=True)
    print(f"\tLength after exploding across {side}: {len(merged_expl)}")

Going to explode merged_expl by interactor 1, then interactor 2 PTM columns. Length before: 745085
	Length after exploding across 1: 745310
	Length after exploding across 2: 746131


In [480]:
# One row per PTM entry: explode the interactor-1 columns together, then the
# interactor-2 columns together, renumbering the index after each step.
print(f"Going to explode merged_neg_expl by interactor 1, then interactor 2 PTM columns. Length before: {len(merged_neg_expl)}")
for side, side_columns in ((1, scraped_ptm_cols_1), (2, scraped_ptm_cols_2)):
    merged_neg_expl = merged_neg_expl.explode(side_columns).reset_index(drop=True)
    print(f"\tLength after exploding across {side}: {len(merged_neg_expl)}")

Going to explode merged_neg_expl by interactor 1, then interactor 2 PTM columns. Length before: 745085
	Length after exploding across 1: 745310
	Length after exploding across 2: 746131


In [481]:
# if there is a merged_expl row that has ptm info for interactor A and interactor B, then duplicate that row, and delete all the ptm_*_1 info for one and all the ptm_*_2 info for the other 
import pandas as pd
import numpy as np
import re

def split_rows_by_ptm_blocks(merged_expl: pd.DataFrame) -> pd.DataFrame:
    """
    Separate rows that carry PTM info for both interactors into one row per interactor.

    A row counts as two-sided when at least one ptm_*_1 column AND at least one
    ptm_*_2 column is non-null. Each such row is replaced by two copies:
      - one that keeps the ptm_*_1 values and has every ptm_*_2 column set to NaN
      - one that keeps the ptm_*_2 values and has every ptm_*_1 column set to NaN
    Every other column is carried over unchanged. The input frame is not
    modified. The result lists the untouched rows first, then all side-1
    copies, then all side-2 copies, with a fresh 0..n-1 index.
    """
    frame = merged_expl.copy()

    # Columns named ptm_<anything>_1 or ptm_<anything>_2, grouped by interactor.
    side_pattern = re.compile(r"^ptm_.*_(1|2)$")
    ptm_columns = [name for name in frame.columns if side_pattern.match(name)]
    side1_columns = [name for name in ptm_columns if name.endswith("_1")]
    side2_columns = [name for name in ptm_columns if name.endswith("_2")]

    # A row is two-sided when each interactor has at least one non-null PTM field.
    two_sided = (
        frame[side1_columns].notna().any(axis=1)
        & frame[side2_columns].notna().any(axis=1)
    )

    # Copy that keeps interactor 1 only.
    only_side1 = frame.loc[two_sided].copy()
    only_side1.loc[:, side2_columns] = np.nan

    # Copy that keeps interactor 2 only.
    only_side2 = frame.loc[two_sided].copy()
    only_side2.loc[:, side1_columns] = np.nan

    return pd.concat(
        [frame.loc[~two_sided], only_side1, only_side2],
        ignore_index=True,
    )

In [482]:
# Give rows that have PTM info on both interactors one row per interactor.
merged_expl = split_rows_by_ptm_blocks(merged_expl)
print(f"Length of merged_expl after splitting multiple PTM effects for the same row: {len(merged_expl)}")

Length of merged_expl after splitting multiple PTM effects for the same row: 746676


In [483]:
# Give rows that have PTM info on both interactors one row per interactor.
merged_neg_expl = split_rows_by_ptm_blocks(merged_neg_expl)
print(f"Length of merged_neg_expl after splitting multiple PTM effects for the same row: {len(merged_neg_expl)}")

Length of merged_neg_expl after splitting multiple PTM effects for the same row: 746676


In [484]:
# Build the PTM feature table that gets joined onto the interaction tables.
ptms_to_merge = ptms.copy(deep=True)

# Each PTM row should reference at most one interaction accession.
multi_id_rows = ptms["Interaction AC"].str.count("intact:EBI-") > 1
test1 = not multi_id_rows.any()
print(f"\tNo rows have >1 intact interaction ID in ptms dataframe: {test1}")

# Prefix every column with "PTM " so the names stay distinguishable after the merge.
ptms_to_merge.columns = "PTM " + ptms_to_merge.columns


def _bare_intact_id(interaction_ac):
    """Strip the "intact:" prefix from a string holding exactly one; other strings pass through; floats become None."""
    if type(interaction_ac) is str:
        if interaction_ac.count("intact:") == 1:
            return interaction_ac.split("intact:")[-1]
        return interaction_ac
    return None if type(interaction_ac) is float else interaction_ac


ptms_to_merge["PTM interaction_intactid"] = ptms_to_merge["PTM Interaction AC"].apply(_bare_intact_id)

	No rows have >1 intact interaction ID in ptms dataframe: True


In [485]:
# Left-join the aggregated PTM feature rows onto merged_expl by interaction ID;
# rows without a matching PTM feature keep NaN in the "PTM ..." columns.
merged_expl = merged_expl.merge(
    ptms_to_merge.rename(columns={"PTM interaction_intactid": "interaction_intactid"}),
    how="left",
    on=["interaction_intactid"],
)

In [486]:
# Left-join the aggregated PTM feature rows onto merged_neg_expl by interaction ID;
# rows without a matching PTM feature keep NaN in the "PTM ..." columns.
merged_neg_expl = merged_neg_expl.merge(
    ptms_to_merge.rename(columns={"PTM interaction_intactid": "interaction_intactid"}),
    how="left",
    on=["interaction_intactid"],
)

In [487]:
# Columns contributed by the aggregated PTM feature table: each name is the
# feature-table column name with a "PTM " prefix.
agg_ptm_cols = [
    f"PTM {feature_column}"
    for feature_column in (
        "# Feature AC",
        "Feature short label",
        "Feature range(s)",
        "Original sequence",
        "Resulting sequence",
        "Feature type",
        "Feature annotation(s)",
        "Affected protein AC",
        "Affected protein symbol",
        "Affected protein full name",
        "Affected protein organism",
        "Interaction participants",
        "PubMedID",
        "Figure legend(s)",
        "Interaction AC",
        "Xref ID(s)",
    )
]

In [488]:
# Flag rows that received at least one aggregated "PTM ..." value from the
# merge, then show the True/False tallies for both tables.
for frame in (merged_expl, merged_neg_expl):
    frame["agg_ptm_has_info"] = frame[agg_ptm_cols].notna().any(axis=1)

for frame in (merged_expl, merged_neg_expl):
    display(frame["agg_ptm_has_info"].value_counts().reset_index())

Unnamed: 0,agg_ptm_has_info,count
0,False,738637
1,True,13369


Unnamed: 0,agg_ptm_has_info,count
0,False,738637
1,True,13369


In [489]:
# Number of distinct seq_pair_id values among rows that have both scraped and aggregated PTM info.
len(merged_expl.loc[
    (merged_expl["scraped_ptm_has_info"]) &
    (merged_expl["agg_ptm_has_info"])
].drop_duplicates(subset=["seq_pair_id"]))

3338

In [490]:
# Rows whose PTM info comes only from the scraped columns (no aggregated match).
scraped_only_rows = merged_expl.loc[
    merged_expl["scraped_ptm_has_info"] & ~merged_expl["agg_ptm_has_info"]
]
# Distinct PTM MI ids seen on either interactor of those rows.
l2 = scraped_only_rows["ptm_mi_2"].dropna().unique().tolist()
l = scraped_only_rows["ptm_mi_1"].dropna().unique().tolist() + l2
# Look those ids up in the PTM MI table, one row per id.
ptm_mi_ok.loc[ptm_mi_ok["id"].isin(l)].drop_duplicates(subset=["id"])

Unnamed: 0,label,id,parent_id,parent_ids_all,parent_names_all
1,prerequisite-ptm,MI:0638,MI:0925,MI:0925,observed-ptm
2,resulting-ptm,MI:0639,MI:0925,MI:0925,observed-ptm
4,ptm decreasing an interaction,MI:1223,MI:0925,MI:0925,observed-ptm
6,ptm disrupting an interaction,MI:1225,MI:0925,MI:0925,observed-ptm


In [491]:
# Rows of merged_neg_expl flagged by scraped_ptm_has_info but not by agg_ptm_has_info.
_scraped_only_neg_rows = merged_neg_expl.loc[
    merged_neg_expl["scraped_ptm_has_info"] & ~merged_neg_expl["agg_ptm_has_info"]
]
# Distinct non-null values of each ptm_mi column on those rows, pooled into one list.
l = _scraped_only_neg_rows["ptm_mi_1"].dropna().unique().tolist()
l2 = _scraped_only_neg_rows["ptm_mi_2"].dropna().unique().tolist()
l = l + l2
# Look the pooled IDs up in ptm_mi_ok (l already contains l2), one row per id.
ptm_mi_ok.loc[ptm_mi_ok["id"].isin(l)].drop_duplicates(subset=["id"])

Unnamed: 0,label,id,parent_id,parent_ids_all,parent_names_all
1,prerequisite-ptm,MI:0638,MI:0925,MI:0925,observed-ptm
2,resulting-ptm,MI:0639,MI:0925,MI:0925,observed-ptm
4,ptm decreasing an interaction,MI:1223,MI:0925,MI:0925,observed-ptm
6,ptm disrupting an interaction,MI:1225,MI:0925,MI:0925,observed-ptm


In [492]:
# Some sequence strings contain embedded line breaks (e.g. QQQQ...\r\nQQQQQQQ):
# remove every carriage return / newline and trim the ends. Non-string values are left as-is.
for seq_col in ("ptm_orig_1", "ptm_new_1", "ptm_orig_2", "ptm_new_2"):
    merged_expl[seq_col] = merged_expl[seq_col].apply(
        lambda seq: seq.replace("\r", "").replace("\n", "").strip() if type(seq) is str else seq
    )

In [493]:
# Remove every carriage return / newline from the PTM sequence strings of merged_neg_expl
# and trim the ends. Non-string values are left as-is.
for seq_col in ("ptm_orig_1", "ptm_new_1", "ptm_orig_2", "ptm_new_2"):
    merged_neg_expl[seq_col] = merged_neg_expl[seq_col].apply(
        lambda seq: seq.replace("\r", "").replace("\n", "").strip() if type(seq) is str else seq
    )

In [494]:
# Keep in mind for everything below: the aggregated ptm data is NOT limited to binary
# interactions; it can also describe n-ary interactions.

# Row masks for the three provenance categories in merged_expl.
_scraped_flag = merged_expl["scraped_ptm_has_info"]
_agg_flag = merged_expl["agg_ptm_has_info"]
_xml_only_rows = _scraped_flag & ~_agg_flag
_agg_only_rows = ~_scraped_flag & _agg_flag
_xml_and_agg_rows = _scraped_flag & _agg_flag

# --- Distinct interaction IDs per category ---
interactions_with_xml_ptm_data_only = merged_expl.loc[_xml_only_rows, "interaction_intactid"].dropna().unique().tolist()
interactions_with_agg_ptm_data_only = merged_expl.loc[_agg_only_rows, "interaction_intactid"].dropna().unique().tolist()
interactions_with_xml_and_agg_ptm_data = merged_expl.loc[_xml_and_agg_rows, "interaction_intactid"].dropna().unique().tolist()
interactions_in_my_pos = my_pos["interaction_intactid"].dropna().unique().tolist()
total_intactids = merged_expl["interaction_intactid"].nunique()

_n_xml_only = len(interactions_with_xml_ptm_data_only)
_n_agg_only = len(interactions_with_agg_ptm_data_only)
_n_both = len(interactions_with_xml_and_agg_ptm_data)
_n_agg_only_in_my_pos = len(set(interactions_with_agg_ptm_data_only) & set(interactions_in_my_pos))
print(f"Total interaction IDs with ptm data only from XML scraping: {_n_xml_only}/{total_intactids} = ({100*_n_xml_only/total_intactids:.2f}%)")
print(f"Total interaction IDs with ptm data only from aggregated ptm table: {_n_agg_only}/{total_intactids} = ({100*_n_agg_only/total_intactids:.2f}%)")
print(f"\tTotal that are also in my_pos: {_n_agg_only_in_my_pos}")
print(f"Total interaction IDs with ptm data from both XML scraping and aggregated ptm table: {_n_both}/{total_intactids} = ({100*_n_both/total_intactids:.2f}%)")

# --- Distinct seq_pair_ids per category ---
seq_pair_ids_with_xml_ptm_data_only = merged_expl.loc[_xml_only_rows, "seq_pair_id"].dropna().unique().tolist()
seq_pair_ids_with_agg_ptm_data_only = merged_expl.loc[_agg_only_rows, "seq_pair_id"].dropna().unique().tolist()
seq_pair_ids_with_xml_and_agg_ptm_data = merged_expl.loc[_xml_and_agg_rows, "seq_pair_id"].dropna().unique().tolist()
total_seq_pair_ids = merged_expl["seq_pair_id"].nunique()

_n_xml_only = len(seq_pair_ids_with_xml_ptm_data_only)
_n_agg_only = len(seq_pair_ids_with_agg_ptm_data_only)
_n_both = len(seq_pair_ids_with_xml_and_agg_ptm_data)
print(f"\nTotal seq_pair_ids with ptm data only from XML scraping: {_n_xml_only}/{total_seq_pair_ids} = ({100*_n_xml_only/total_seq_pair_ids:.2f}%)")
print(f"Total seq_pair_ids with ptm data only from aggregated ptm table: {_n_agg_only}/{total_seq_pair_ids} = ({100*_n_agg_only/total_seq_pair_ids:.2f}%)")
print(f"Total seq_pair_ids with ptm data from both XML scraping and aggregated ptm table: {_n_both}/{total_seq_pair_ids} = ({100*_n_both/total_seq_pair_ids:.2f}%)")

Total interaction IDs with ptm data only from XML scraping: 7/743127 = (0.00%)
Total interaction IDs with ptm data only from aggregated ptm table: 0/743127 = (0.00%)
	Total that are also in my_pos: 0
Total interaction IDs with ptm data from both XML scraping and aggregated ptm table: 6410/743127 = (0.86%)

Total seq_pair_ids with ptm data only from XML scraping: 6/426539 = (0.00%)
Total seq_pair_ids with ptm data only from aggregated ptm table: 0/426539 = (0.00%)
Total seq_pair_ids with ptm data from both XML scraping and aggregated ptm table: 3338/426539 = (0.78%)


In [495]:
# Keep in mind for everything below: the aggregated ptm data is NOT limited to binary
# interactions; it can also describe n-ary interactions.

# Row masks for the three provenance categories in merged_neg_expl.
_scraped_flag = merged_neg_expl["scraped_ptm_has_info"]
_agg_flag = merged_neg_expl["agg_ptm_has_info"]
_xml_only_rows = _scraped_flag & ~_agg_flag
_agg_only_rows = ~_scraped_flag & _agg_flag
_xml_and_agg_rows = _scraped_flag & _agg_flag

# --- Distinct interaction IDs per category ---
interactions_with_xml_ptm_data_only = merged_neg_expl.loc[_xml_only_rows, "interaction_intactid"].dropna().unique().tolist()
interactions_with_agg_ptm_data_only = merged_neg_expl.loc[_agg_only_rows, "interaction_intactid"].dropna().unique().tolist()
interactions_with_xml_and_agg_ptm_data = merged_neg_expl.loc[_xml_and_agg_rows, "interaction_intactid"].dropna().unique().tolist()
interactions_in_my_pos = my_pos["interaction_intactid"].dropna().unique().tolist()
total_intactids = merged_neg_expl["interaction_intactid"].nunique()

_n_xml_only = len(interactions_with_xml_ptm_data_only)
_n_agg_only = len(interactions_with_agg_ptm_data_only)
_n_both = len(interactions_with_xml_and_agg_ptm_data)
_n_agg_only_in_my_pos = len(set(interactions_with_agg_ptm_data_only) & set(interactions_in_my_pos))
print(f"Total interaction IDs with ptm data only from XML scraping: {_n_xml_only}/{total_intactids} = ({100*_n_xml_only/total_intactids:.2f}%)")
print(f"Total interaction IDs with ptm data only from aggregated ptm table: {_n_agg_only}/{total_intactids} = ({100*_n_agg_only/total_intactids:.2f}%)")
print(f"\tTotal that are also in my_pos: {_n_agg_only_in_my_pos}")
print(f"Total interaction IDs with ptm data from both XML scraping and aggregated ptm table: {_n_both}/{total_intactids} = ({100*_n_both/total_intactids:.2f}%)")

# --- Distinct seq_pair_ids per category ---
seq_pair_ids_with_xml_ptm_data_only = merged_neg_expl.loc[_xml_only_rows, "seq_pair_id"].dropna().unique().tolist()
seq_pair_ids_with_agg_ptm_data_only = merged_neg_expl.loc[_agg_only_rows, "seq_pair_id"].dropna().unique().tolist()
seq_pair_ids_with_xml_and_agg_ptm_data = merged_neg_expl.loc[_xml_and_agg_rows, "seq_pair_id"].dropna().unique().tolist()
total_seq_pair_ids = merged_neg_expl["seq_pair_id"].nunique()

_n_xml_only = len(seq_pair_ids_with_xml_ptm_data_only)
_n_agg_only = len(seq_pair_ids_with_agg_ptm_data_only)
_n_both = len(seq_pair_ids_with_xml_and_agg_ptm_data)
print(f"\nTotal seq_pair_ids with ptm data only from XML scraping: {_n_xml_only}/{total_seq_pair_ids} = ({100*_n_xml_only/total_seq_pair_ids:.2f}%)")
print(f"Total seq_pair_ids with ptm data only from aggregated ptm table: {_n_agg_only}/{total_seq_pair_ids} = ({100*_n_agg_only/total_seq_pair_ids:.2f}%)")
print(f"Total seq_pair_ids with ptm data from both XML scraping and aggregated ptm table: {_n_both}/{total_seq_pair_ids} = ({100*_n_both/total_seq_pair_ids:.2f}%)")

Total interaction IDs with ptm data only from XML scraping: 7/743127 = (0.00%)
Total interaction IDs with ptm data only from aggregated ptm table: 0/743127 = (0.00%)
	Total that are also in my_pos: 0
Total interaction IDs with ptm data from both XML scraping and aggregated ptm table: 6410/743127 = (0.86%)

Total seq_pair_ids with ptm data only from XML scraping: 6/426539 = (0.00%)
Total seq_pair_ids with ptm data only from aggregated ptm table: 0/426539 = (0.00%)
Total seq_pair_ids with ptm data from both XML scraping and aggregated ptm table: 3338/426539 = (0.78%)


In [496]:
# Write out the rows that have aggregated PTM info but no scraped PTM info,
# keeping the identifier columns plus both sets of PTM columns.
_agg_only_export_rows = ~merged_expl["scraped_ptm_has_info"] & merged_expl["agg_ptm_has_info"]
_export_columns = ["interaction_intactid", "uniprot_A", "uniprot_B"] + scraped_ptm_cols + agg_ptm_cols
merged_expl.loc[_agg_only_export_rows, _export_columns].to_csv(
    "potentially_missed_ptms_from_scrape_dec11_2025.csv",
    index=False,
)

In [497]:
# How do we determine whether a row of the ptms table is a usable PTM feature row?
# Check which of the key columns are populated. The printed labels now say "ptm"
# rather than "mutation", and the Feature type line is labelled as a missing
# feature type, since that is the column it counts.
n_ptms = len(ptms)

test1 = int(ptms["# Feature AC"].isna().sum()) == 0
print(f"Everything in the ptms dataframe has a # Feature AC entry: {test1}")

# The Feature type null count feeds both the yes/no line and the fraction line.
n_missing_feature_type = int(ptms["Feature type"].isna().sum())
test1 = n_missing_feature_type == 0
print(f"Everything in the ptms dataframe has a Feature type entry: {test1}")
test1 = n_missing_feature_type
print(f"\tTotal ptm features without a feature type: {test1}/{n_ptms} ({100*test1/n_ptms:.2f}%)")

test1 = int(ptms["Feature range(s)"].isna().sum())
print(f"\tTotal ptm features without a feature range: {test1}/{n_ptms} ({100*test1/n_ptms:.2f}%)")

test1 = int(ptms["Affected protein AC"].isna().sum())
print(f"\tTotal ptms that do not indicate which protein is affected: {test1}")
test1 = int(ptms["Interaction AC"].isna().sum())
print(f"\tTotal ptms that do not indicate which interaction is affected: {test1}")

# Look for multi-valued affected-protein entries; nulls are treated as empty strings
# and the separators are matched literally.
affected_protein_ac = ptms["Affected protein AC"].fillna("")
test1 = int(affected_protein_ac.str.contains(",", regex=False).sum())
print(f"\tTotal ptms that have multiple comma-separated entries for affected protein AC: {test1}")
test1 = int(affected_protein_ac.str.contains("|", regex=False).sum())
print(f"\tTotal ptms that have multiple pipe-separated entries for affected protein AC: {test1}")

Everything in the ptms dataframe has a # Feature AC entry: True
Everything in the ptms dataframe has a Feature type entry: False
	Total mutation features without an MI term identifier: 1/10286 (0.01%)
	Total mutation features without a feature range: 1/10286 (0.01%)
	Total ptms that do not indicate which protein is affected: 2
	Total ptms that do not indicate which interaction is affected: 3
	Total ptms that have multiple comma-separated entries for affected protein AC: 0
	Total ptms that have multiple pipe-separated entries for affected protein AC: 0


In [498]:
# Summarise which identifier namespaces appear in "PTM Affected protein AC" of merged_expl.
#
# Fix: the IntAct and "other identifiers" lines previously divided by the uniprotkb
# count, because the scratch denominator had been overwritten for the isoform line.
# All three namespace lines now share one denominator (rows with a non-null affected
# protein AC); only the isoform line is a fraction of the uniprotkb rows.
_affected_ac_present = merged_expl["PTM Affected protein AC"].notna()
_affected_ac = merged_expl["PTM Affected protein AC"].fillna("")  # nulls -> "" so .str works
_is_uniprot = _affected_ac.str.contains("uniprotkb:", regex=False)
_is_intact = _affected_ac.str.contains("intact:EBI-", regex=False)

# Check for comma-separated identifiers of affected protein
test1 = int(_affected_ac.str.contains(",", regex=False).sum())
print(f"\tTotal merged-in ptms that have multiple comma-separated entries for affected protein AC: {test1}")

# Check for pipe-separated identifiers of affected protein
test1 = int(_affected_ac.str.contains("|", regex=False).sum())
print(f"\tTotal merged-in ptms that have multiple pipe-separated entries for affected protein AC: {test1}")

# Check for UniProtKB vs. IntAct identifiers for affected protein
n_with_affected_ac = int(_affected_ac_present.sum())
n_uniprot = int(_is_uniprot.sum())
print(f"\tTotal merged-in ptms that have uniprotkb identifier for affected protein: {n_uniprot}/{n_with_affected_ac} ({100*n_uniprot/n_with_affected_ac:.2f}%)")

# How many uniprots have isoforms? (a "-" in the AC, as a fraction of the uniprotkb rows)
n_isoform = int((_is_uniprot & _affected_ac.str.contains("-", regex=False)).sum())
print(f"\t\tFraction where an isoform is present: {n_isoform}/{n_uniprot} ({100*n_isoform/n_uniprot:.2f}%)")

n_intact = int(_is_intact.sum())
print(f"\tTotal merged-in ptms that have IntAct identifier for affected protein: {n_intact}/{n_with_affected_ac} ({100*n_intact/n_with_affected_ac:.2f}%)")

# Everything else: tally by the namespace prefix before the first ":".
temp = merged_expl.loc[
    _affected_ac_present & ~_is_uniprot & ~_is_intact,
    "PTM Affected protein AC",
].apply(lambda x: x.split(":")[0]).value_counts().to_dict()
test1 = sum(temp.values())
print(f"\tTotal merged-in ptms that have other identifiers: {test1}/{n_with_affected_ac} ({100*test1/n_with_affected_ac:.2f}%)")
print(f"\t\tBreakdown: {temp}")



	Total merged-in ptms that have multiple comma-separated entries for affected protein AC: 0
	Total merged-in ptms that have multiple pipe-separated entries for affected protein AC: 0
	Total merged-in ptms that have uniprotkb identifier for affected protein: 12144/13369 (90.84%)
		Fraction where an isoform is present: 707/12144 (5.82%)
	Total merged-in ptms that have IntAct identifier for affected protein: 244/12144 (2.01%)
	Total merged-in ptms that have other identifiers: 981/12144 (8.08%)
		Breakdown: {'dip': 981}


In [499]:
# Summarise which identifier namespaces appear in "PTM Affected protein AC" of merged_neg_expl.
#
# Fix: the IntAct and "other identifiers" lines previously divided by the uniprotkb
# count, because the scratch denominator had been overwritten for the isoform line.
# All three namespace lines now share one denominator (rows with a non-null affected
# protein AC); only the isoform line is a fraction of the uniprotkb rows.
_neg_affected_ac_present = merged_neg_expl["PTM Affected protein AC"].notna()
_neg_affected_ac = merged_neg_expl["PTM Affected protein AC"].fillna("")  # nulls -> "" so .str works
_neg_is_uniprot = _neg_affected_ac.str.contains("uniprotkb:", regex=False)
_neg_is_intact = _neg_affected_ac.str.contains("intact:EBI-", regex=False)

# Check for comma-separated identifiers of affected protein
print("merged_neg:")
test1 = int(_neg_affected_ac.str.contains(",", regex=False).sum())
print(f"\tTotal merged-in ptms that have multiple comma-separated entries for affected protein AC: {test1}")

# Check for pipe-separated identifiers of affected protein
test1 = int(_neg_affected_ac.str.contains("|", regex=False).sum())
print(f"\tTotal merged-in ptms that have multiple pipe-separated entries for affected protein AC: {test1}")

# Check for UniProtKB vs. IntAct identifiers for affected protein
n_with_affected_ac = int(_neg_affected_ac_present.sum())
n_uniprot = int(_neg_is_uniprot.sum())
print(f"\tTotal merged-in ptms that have uniprotkb identifier for affected protein: {n_uniprot}/{n_with_affected_ac} ({100*n_uniprot/n_with_affected_ac:.2f}%)")

# How many uniprots have isoforms? (a "-" in the AC, as a fraction of the uniprotkb rows)
n_isoform = int((_neg_is_uniprot & _neg_affected_ac.str.contains("-", regex=False)).sum())
print(f"\t\tFraction where an isoform is present: {n_isoform}/{n_uniprot} ({100*n_isoform/n_uniprot:.2f}%)")

n_intact = int(_neg_is_intact.sum())
print(f"\tTotal merged-in ptms that have IntAct identifier for affected protein: {n_intact}/{n_with_affected_ac} ({100*n_intact/n_with_affected_ac:.2f}%)")

# Everything else: tally by the namespace prefix before the first ":".
temp = merged_neg_expl.loc[
    _neg_affected_ac_present & ~_neg_is_uniprot & ~_neg_is_intact,
    "PTM Affected protein AC",
].apply(lambda x: x.split(":")[0]).value_counts().to_dict()
test1 = sum(temp.values())
print(f"\tTotal merged-in ptms that have other identifiers: {test1}/{n_with_affected_ac} ({100*test1/n_with_affected_ac:.2f}%)")
print(f"\t\tBreakdown: {temp}")



merged_neg:
	Total merged-in ptms that have multiple comma-separated entries for affected protein AC: 0
	Total merged-in ptms that have multiple pipe-separated entries for affected protein AC: 0
	Total merged-in ptms that have uniprotkb identifier for affected protein: 12144/13369 (90.84%)
		Fraction where an isoform is present: 707/12144 (5.82%)
	Total merged-in ptms that have IntAct identifier for affected protein: 244/12144 (2.01%)
	Total merged-in ptms that have other identifiers: 981/12144 (8.08%)
		Breakdown: {'dip': 981}


In [500]:
# Take the merged_expl rows whose affected-protein AC is a uniprotkb ID containing "-",
# restricted to the identifier and sequence columns.
_pos_affected_ac = merged_expl["PTM Affected protein AC"].fillna("")
_pos_dashed_uniprot = merged_expl.loc[
    _pos_affected_ac.str.contains("uniprotkb:") & _pos_affected_ac.str.contains("-"),
    ["unique_id", "uniprot_A_intact", "uniprot_B_intact", "uniprot_A", "uniprot_B", "PTM Affected protein AC", "aa_1", "aa_2"],
]
# A row counts as a mismatch when the AC equals none of the four interactor ID columns.
_pos_ac = _pos_dashed_uniprot["PTM Affected protein AC"]
_pos_matches_some_interactor = (
    (_pos_dashed_uniprot["uniprot_A"] == _pos_ac)
    | (_pos_dashed_uniprot["uniprot_B"] == _pos_ac)
    | (_pos_dashed_uniprot["uniprot_A_intact"] == _pos_ac)
    | (_pos_dashed_uniprot["uniprot_B_intact"] == _pos_ac)
)
test1 = len(_pos_dashed_uniprot.loc[~_pos_matches_some_interactor])
print(f"\tTotal rows where the Affected protein AC from UniProt does not match the IntAct-provided uniprot A or B, or the corrected ones we calculated: {test1}")



	Total rows where the Affected protein AC from UniProt does not match the IntAct-provided uniprot A or B, or the corrected ones we calculated: 0


In [501]:
# Take the merged_neg_expl rows whose affected-protein AC is a uniprotkb ID containing "-",
# restricted to the identifier and sequence columns.
_neg_ac_filled = merged_neg_expl["PTM Affected protein AC"].fillna("")
_neg_dashed_uniprot = merged_neg_expl.loc[
    _neg_ac_filled.str.contains("uniprotkb:") & _neg_ac_filled.str.contains("-"),
    ["unique_id", "uniprot_A_intact", "uniprot_B_intact", "uniprot_A", "uniprot_B", "PTM Affected protein AC", "aa_1", "aa_2"],
]
# A row counts as a mismatch when the AC equals none of the four interactor ID columns.
_neg_ac = _neg_dashed_uniprot["PTM Affected protein AC"]
_neg_matches_some_interactor = (
    (_neg_dashed_uniprot["uniprot_A"] == _neg_ac)
    | (_neg_dashed_uniprot["uniprot_B"] == _neg_ac)
    | (_neg_dashed_uniprot["uniprot_A_intact"] == _neg_ac)
    | (_neg_dashed_uniprot["uniprot_B_intact"] == _neg_ac)
)
test1 = len(_neg_dashed_uniprot.loc[~_neg_matches_some_interactor])
print(f"\tTotal rows where the Affected protein AC from UniProt does not match the IntAct-provided uniprot A or B, or the corrected ones we calculated: {test1}")



	Total rows where the Affected protein AC from UniProt does not match the IntAct-provided uniprot A or B, or the corrected ones we calculated: 0


In [502]:
# Planned process: determine which sequence is affected --> mutate its sequence according
# to the range --> assign labels based on my annotations.
# Preview the rows that carry a PTM feature accession, limited to the columns relevant to that plan.
_ptm_preview_columns = [
    "unique_id", "uniprot_A", "uniprot_B", "PTM Affected protein AC", "aa_1", "aa_2",
    "Interaction identifier(s)",
    "PTM Original sequence", "PTM Resulting sequence", "PTM Feature type", "PTM Feature range(s)",
    "PTM Feature annotation(s)",
    "PTM Affected protein symbol", "PTM Affected protein full name",
]
temp = merged_expl.loc[
    merged_expl["PTM # Feature AC"].notna(),
    _ptm_preview_columns,
].reset_index(drop=True)
temp

Unnamed: 0,unique_id,uniprot_A,uniprot_B,PTM Affected protein AC,aa_1,aa_2,Interaction identifier(s),PTM Original sequence,PTM Resulting sequence,PTM Feature type,PTM Feature range(s),PTM Feature annotation(s),PTM Affected protein symbol,PTM Affected protein full name
0,intact:EBI-1001438_intact:EBI-296306,uniprotkb:O60566-0,uniprotkb:P45481-0,uniprotkb:O60566,MAAVKKEGGALSEAMSLEGDEWELSKENVQPLRQGRIMSTLQGALA...,MAENLLDGPPNPKRAKLSSPGFSANDNTDFGSLFDLENDLPDELIP...,intact:EBI-9832689|intact:EBI-9832504|intact:E...,K,-,"psi-mod:""MOD:00394""(monoacetylated residue)",668-668,-,uniprotkb:BUB1B(gene name),Mitotic checkpoint serine/threonine-protein ki...
1,intact:EBI-1001438_intact:EBI-477430,uniprotkb:O60566-0,uniprotkb:Q92831-0,uniprotkb:O60566,MAAVKKEGGALSEAMSLEGDEWELSKENVQPLRQGRIMSTLQGALA...,MSEAGGAGPGGCGAGAGAGAGPGALPPQPAALPPAPPQGSPCAAAA...,intact:EBI-6984492|intact:EBI-6984518|intact:E...,K,-,"psi-mod:""MOD:00394""(monoacetylated residue)",250-250,resulting-ptm:resulting-ptm,uniprotkb:BUB1B(gene name),Mitotic checkpoint serine/threonine-protein ki...
2,intact:EBI-1002205_intact:EBI-15986737,uniprotkb:O94235-0,uniprotkb:O59757-0,uniprotkb:O59757,MSKRNPPVTNIADLVSDSSLDEDSLSFLEELQDPELYFKNDTFSSK...,MPTSPRRNSIATTDNVIGRNKSRKRPHSLGGPGALQELKEHTNPAK...,intact:EBI-15986772,T,-,"psi-mi:""MI:0177""(O-phospho-L-threonine)",257-257,-,uniprotkb:spc7(gene name),Outer kinetochore KNL1 complex subunit spc7
3,intact:EBI-1003422_intact:EBI-15824480,uniprotkb:Q5U349-0,uniprotkb:Q07820-0,uniprotkb:Q07820,MSQLSSTLKRYTESSRYTDAPYAKSGYGTYTPSSYGANLAASFLEK...,MFGLKRNAVIGLNLYCGGAGLGAGSGGATRPGGRLLATEKEASARR...,intact:EBI-15824534,-,-,"psi-mi:""MI:0189""(N6-glycyl-L-lysine)",?-?,comment:MCL1 is polyubiquitinated.,uniprotkb:MCL1(gene name),Induced myeloid leukemia cell differentiation ...
4,intact:EBI-1003422_intact:EBI-302524,uniprotkb:Q07820-0,uniprotkb:Q93008-0,uniprotkb:Q07820,MFGLKRNAVIGLNLYCGGAGLGAGSGGATRPGGRLLATEKEASARR...,MTATTRGSPVGGNDNQGQAPDGQSQPPLQQNQTSSPDSSNENSPAT...,intact:EBI-15824630|intact:EBI-15824784|intact...,-,-,"psi-mi:""MI:0189""(N6-glycyl-L-lysine)",?-?,comment:MCL1 is polyubiquitinated.,uniprotkb:MCL1(gene name),Induced myeloid leukemia cell differentiation ...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13364,intact:EBI-9090282_intact:EBI-968198,uniprotkb:P27986-2,uniprotkb:O15524-0,uniprotkb:P27986-2,MYNTVWNMEDLDLEYAKTDINCGTDLMFYIEMDPPALPPKPPKPTT...,MVAHNQVAADNAVSTAAEPRRRPEPSSSSSSSPAAPARPRPCPAVP...,intact:EBI-10704987|intact:EBI-10695667,-,-,"psi-mi:""MI:0170""(phosphorylated residue)",?-?,-,uniprotkb:PIK3R1(gene name),Phosphatidylinositol 3-kinase regulatory subun...
13365,intact:EBI-910_intact:EBI-968198,uniprotkb:O15524-0,uniprotkb:P46109-0,uniprotkb:P46109,MVAHNQVAADNAVSTAAEPRRRPEPSSSSSSSPAAPARPRPCPAVP...,MSSARFDSSDRSAWYMGPVSRQEAQTRLQGQRHGMFLVRDSSTCPG...,intact:EBI-10704998|intact:EBI-10696315,-,-,"psi-mi:""MI:0170""(phosphorylated residue)",?-?,-,uniprotkb:CRKL(gene name),Crk-like protein
13366,intact:EBI-910_intact:EBI-968198,uniprotkb:O15524-0,uniprotkb:P46109-0,uniprotkb:O15524,MVAHNQVAADNAVSTAAEPRRRPEPSSSSSSSPAAPARPRPCPAVP...,MSSARFDSSDRSAWYMGPVSRQEAQTRLQGQRHGMFLVRDSSTCPG...,intact:EBI-10704998|intact:EBI-10696315,-,-,"psi-mi:""MI:0170""(phosphorylated residue)",?-?,-,uniprotkb:SOCS1(gene name),Suppressor of cytokine signaling 1
13367,intact:EBI-958408_intact:EBI-958408,uniprotkb:P48551-0,uniprotkb:P48551-0,uniprotkb:P48551,MLLSQNAFIFRSLNLVLMVYISLVFGISYDSPDYTDESCTFKISLR...,MLLSQNAFIFRSLNLVLMVYISLVFGISYDSPDYTDESCTFKISLR...,intact:EBI-15481918,-,-,"psi-mod:""MOD:00689""(disulfide crosslinked resi...",?-?,-,uniprotkb:IFNAR2(gene name),Interferon alpha/beta receptor 2


In [503]:
## Positives: attach the labels from the labeled feature-type table, keyed on "PTM Feature type"
_feature_type_label_names = {
    "feature": "PTM Feature type",
    "original_sequence": "PTM og_binds_bo_feature_type",
    "ptm_sequence": "PTM new_binds_bo_feature_type",
}
merged_expl = merged_expl.merge(
    ptm_feature_types_labeled.rename(columns=_feature_type_label_names)[
        list(_feature_type_label_names.values())
    ],
    how="left",
    on="PTM Feature type",
)
## Positives: attach the labels from the labeled feature-annotation table, keyed on "PTM Feature annotation(s)"
_feature_annotation_label_names = {
    "feature": "PTM Feature annotation(s)",
    "original_sequence": "PTM og_binds_bo_annotation",
    "ptm_sequence": "PTM new_binds_bo_annotation",
}
merged_expl = merged_expl.merge(
    ptm_feature_annotations_labeled.rename(columns=_feature_annotation_label_names)[
        list(_feature_annotation_label_names.values())
    ],
    how="left",
    on="PTM Feature annotation(s)",
)
merged_expl.head()

Unnamed: 0,ID(s) interactor A,ID(s) interactor B,Alt. ID(s) interactor A,Alt. ID(s) interactor B,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,...,PTM Interaction participants,PTM PubMedID,PTM Figure legend(s),PTM Interaction AC,PTM Xref ID(s),agg_ptm_has_info,PTM og_binds_bo_feature_type,PTM new_binds_bo_feature_type,PTM og_binds_bo_annotation,PTM new_binds_bo_annotation
0,intact:EBI-101707,intact:EBI-100018,uniprotkb:Q86P48,uniprotkb:Q9VE54,psi-mi:atbp_drome|psi-mi:ATbp|uniprotkb:ATbp|u...,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,"psi-mi:""MI:0397""(two hybrid array)",Shokri et al. (2019),doi:10.1016/j.celrep.2019.03.071|pubmed:309954...,taxid:7227(drome),...,,,,,,False,,,,
1,intact:EBI-100018,intact:EBI-102069,uniprotkb:Q9VE54,uniprotkb:O16844,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,psi-mi:cos_drome|psi-mi:cos|uniprotkb:A1Z6X4|u...,"psi-mi:""MI:0399""(two hybrid fragment pooling a...",Formstecher et al. (2005),pubmed:15710747|imex:IM-16519|mint:MINT-5217543,taxid:7227(drome),...,,,,,,False,,,,
2,intact:EBI-104215,intact:EBI-100018,uniprotkb:Q9VTR6,uniprotkb:Q9VE54,psi-mi:q9vtr6_drome|psi-mi:prc|uniprotkb:prc|u...,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,"psi-mi:""MI:0018""(two hybrid)",Giot et al. (2003),pubmed:14605208|imex:IM-16524|mint:MINT-5216804,taxid:7227(drome),...,,,,,,False,,,,
3,intact:EBI-100018,intact:EBI-107089,uniprotkb:Q9VE54,uniprotkb:Q9VWG2,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,psi-mi:q9vwg2_drome|psi-mi:SDS3|uniprotkb:SDS3...,"psi-mi:""MI:0018""(two hybrid)",Giot et al. (2003),pubmed:14605208|imex:IM-16524|mint:MINT-5216804,taxid:7227(drome),...,,,,,,False,,,,
4,intact:EBI-117032,intact:EBI-100018,uniprotkb:Q9VHR4,uniprotkb:Q9VE54,psi-mi:q9vhr4_drome|psi-mi:Dmel\CG7963|uniprot...,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,"psi-mi:""MI:0397""(two hybrid array)",Shokri et al. (2019),doi:10.1016/j.celrep.2019.03.071|pubmed:309954...,taxid:7227(drome),...,,,,,,False,,,,


In [504]:
## Negatives: attach the labels from the labeled feature-type table, keyed on "PTM Feature type"
_neg_feature_type_label_names = {
    "feature": "PTM Feature type",
    "original_sequence": "PTM og_binds_bo_feature_type",
    "ptm_sequence": "PTM new_binds_bo_feature_type",
}
merged_neg_expl = merged_neg_expl.merge(
    ptm_feature_types_labeled.rename(columns=_neg_feature_type_label_names)[
        list(_neg_feature_type_label_names.values())
    ],
    how="left",
    on="PTM Feature type",
)
## Negatives: attach the labels from the labeled feature-annotation table, keyed on "PTM Feature annotation(s)"
_neg_feature_annotation_label_names = {
    "feature": "PTM Feature annotation(s)",
    "original_sequence": "PTM og_binds_bo_annotation",
    "ptm_sequence": "PTM new_binds_bo_annotation",
}
merged_neg_expl = merged_neg_expl.merge(
    ptm_feature_annotations_labeled.rename(columns=_neg_feature_annotation_label_names)[
        list(_neg_feature_annotation_label_names.values())
    ],
    how="left",
    on="PTM Feature annotation(s)",
)
merged_neg_expl.head()

Unnamed: 0,ID(s) interactor A,ID(s) interactor B,Alt. ID(s) interactor A,Alt. ID(s) interactor B,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,...,PTM Interaction participants,PTM PubMedID,PTM Figure legend(s),PTM Interaction AC,PTM Xref ID(s),agg_ptm_has_info,PTM og_binds_bo_feature_type,PTM new_binds_bo_feature_type,PTM og_binds_bo_annotation,PTM new_binds_bo_annotation
0,intact:EBI-101707,intact:EBI-100018,uniprotkb:Q86P48,uniprotkb:Q9VE54,psi-mi:atbp_drome|psi-mi:ATbp|uniprotkb:ATbp|u...,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,"psi-mi:""MI:0397""(two hybrid array)",Shokri et al. (2019),doi:10.1016/j.celrep.2019.03.071|pubmed:309954...,taxid:7227(drome),...,,,,,,False,,,,
1,intact:EBI-100018,intact:EBI-102069,uniprotkb:Q9VE54,uniprotkb:O16844,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,psi-mi:cos_drome|psi-mi:cos|uniprotkb:A1Z6X4|u...,"psi-mi:""MI:0399""(two hybrid fragment pooling a...",Formstecher et al. (2005),pubmed:15710747|imex:IM-16519|mint:MINT-5217543,taxid:7227(drome),...,,,,,,False,,,,
2,intact:EBI-104215,intact:EBI-100018,uniprotkb:Q9VTR6,uniprotkb:Q9VE54,psi-mi:q9vtr6_drome|psi-mi:prc|uniprotkb:prc|u...,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,"psi-mi:""MI:0018""(two hybrid)",Giot et al. (2003),pubmed:14605208|imex:IM-16524|mint:MINT-5216804,taxid:7227(drome),...,,,,,,False,,,,
3,intact:EBI-100018,intact:EBI-107089,uniprotkb:Q9VE54,uniprotkb:Q9VWG2,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,psi-mi:q9vwg2_drome|psi-mi:SDS3|uniprotkb:SDS3...,"psi-mi:""MI:0018""(two hybrid)",Giot et al. (2003),pubmed:14605208|imex:IM-16524|mint:MINT-5216804,taxid:7227(drome),...,,,,,,False,,,,
4,intact:EBI-117032,intact:EBI-100018,uniprotkb:Q9VHR4,uniprotkb:Q9VE54,psi-mi:q9vhr4_drome|psi-mi:Dmel\CG7963|uniprot...,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,"psi-mi:""MI:0397""(two hybrid array)",Shokri et al. (2019),doi:10.1016/j.celrep.2019.03.071|pubmed:309954...,taxid:7227(drome),...,,,,,,False,,,,


In [505]:
# Keep only the merged_expl rows flagged by either PTM indicator column, with a fresh index.
mask = merged_expl["scraped_ptm_has_info"] | merged_expl["agg_ptm_has_info"]
merged_expl_ptm = merged_expl[mask].reset_index(drop=True)
_n_ptm_rows = len(merged_expl_ptm)
_n_all_rows = len(merged_expl)
print(f"\tTotal rows of exploded merged that have at least one column of ptm-related data (from XML or aggregated): {_n_ptm_rows}/{_n_all_rows} ({100*_n_ptm_rows/_n_all_rows:.2f}%)")

	Total rows of exploded merged that have at least one column of ptm-related data (from XML or aggregated): 13376/752006 (1.78%)


In [506]:
# Keep only the merged_neg_expl rows flagged by either PTM indicator column, with a fresh index.
mask = merged_neg_expl["scraped_ptm_has_info"] | merged_neg_expl["agg_ptm_has_info"]
merged_neg_expl_ptm = merged_neg_expl[mask].reset_index(drop=True)
_n_neg_ptm_rows = len(merged_neg_expl_ptm)
_n_neg_all_rows = len(merged_neg_expl)
print(f"\tTotal rows of exploded merged_neg that have at least one column of ptm-related data (from XML or aggregated): {_n_neg_ptm_rows}/{_n_neg_all_rows} ({100*_n_neg_ptm_rows/_n_neg_all_rows:.2f}%)")

	Total rows of exploded merged_neg that have at least one column of ptm-related data (from XML or aggregated): 13376/752006 (1.78%)


In [507]:
# Drop the names of both full exploded frames in one statement.
del merged_expl, merged_neg_expl

In [508]:
# Call feature_affected_protein_matches_id on every row with feature="PTM"
# (apply forwards the extra keyword to the function) and store the result.
merged_expl_ptm["PTM Interactor Matches"] = merged_expl_ptm.apply(
    feature_affected_protein_matches_id,
    axis=1,
    feature="PTM",
)
merged_expl_ptm.head()

Unnamed: 0,ID(s) interactor A,ID(s) interactor B,Alt. ID(s) interactor A,Alt. ID(s) interactor B,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,...,PTM PubMedID,PTM Figure legend(s),PTM Interaction AC,PTM Xref ID(s),agg_ptm_has_info,PTM og_binds_bo_feature_type,PTM new_binds_bo_feature_type,PTM og_binds_bo_annotation,PTM new_binds_bo_annotation,PTM Interactor Matches
0,intact:EBI-1001438,intact:EBI-296306,uniprotkb:O60566,uniprotkb:P45481,psi-mi:bub1b_human|psi-mi:BUB1B|uniprotkb:O605...,psi-mi:cbp_mouse|psi-mi:Crebbp|uniprotkb:E9QPH...,"psi-mi:""MI:0007""(anti tag coimmunoprecipitatio...",North et al. (2014),pubmed:24825348|imex:IM-23516,taxid:9606(human),...,pubmed:24825348|imex:IM-23516,figure legend:f3d,intact:EBI-9832689,-,True,unknown,unknown,,,A
1,intact:EBI-1001438,intact:EBI-477430,uniprotkb:O60566,uniprotkb:Q92831,psi-mi:bub1b_human|psi-mi:BUB1B|uniprotkb:O605...,psi-mi:kat2b_human|psi-mi:KAT2B|uniprotkb:Q6NS...,"psi-mi:""MI:0415""(enzymatic study)|psi-mi:""MI:0...",Choi et al. (2009)|North et al. (2014),imex:IM-15488|pubmed:19407811|mint:MINT-729973...,taxid:9606(human),...,pubmed:19407811|imex:IM-15488,figure legend:sf5,intact:EBI-6984518,mint:MINT-7300005(identity),True,unknown,unknown,yes,unknown,A
2,intact:EBI-15986737,intact:EBI-1002205,uniprotkb:O94235,uniprotkb:O59757,psi-mi:mps1_schpo|psi-mi:mph1|uniprotkb:Q9P7Z5...,psi-mi:knl1_schpo|psi-mi:spc7|uniprotkb:NMS co...,"psi-mi:""MI:0424""(protein kinase assay)",Yamagishi et al. (2012),pubmed:22660415|doi:10.1038/ncb2515|imex:IM-25168,taxid:284812(schpo),...,pubmed:22660415|imex:IM-25168,-,intact:EBI-15986772,-,True,unknown,unknown,,,B
3,intact:EBI-15824480,intact:EBI-1003422,uniprotkb:Q5U349,uniprotkb:Q07820,psi-mi:ubp2_rat|psi-mi:Usp2|uniprotkb:Q9QXL3|u...,psi-mi:mcl1_human|psi-mi:MCL1|uniprotkb:Q9HD91...,"psi-mi:""MI:0415""(enzymatic study)",Schwickart et al. (2010),pubmed:20023629|doi:10.1038/nature08646|imex:I...,taxid:10116(rat),...,pubmed:20023629|imex:IM-14556,-,intact:EBI-15824534,-,True,unknown,unknown,unknown,unknown,B
4,intact:EBI-1003422,intact:EBI-302524,uniprotkb:Q07820,uniprotkb:Q93008,psi-mi:mcl1_human|psi-mi:MCL1|uniprotkb:Q9HD91...,psi-mi:usp9x_human|psi-mi:USP9X|uniprotkb:O755...,"psi-mi:""MI:0006""(anti bait coimmunoprecipitati...",Schwickart et al. (2010),pubmed:20023629|doi:10.1038/nature08646|imex:I...,taxid:9606(human),...,pubmed:20023629|imex:IM-14556,-,intact:EBI-15824847,-,True,unknown,unknown,unknown,unknown,A


In [509]:
# Compute "PTM Interactor Matches" for every negative-PPI row by calling
# feature_affected_protein_matches_id row-wise in PTM mode, then preview the frame.
merged_neg_expl_ptm["PTM Interactor Matches"] = merged_neg_expl_ptm.apply(
    feature_affected_protein_matches_id,
    axis=1,
    feature="PTM",
)
merged_neg_expl_ptm.head()

Unnamed: 0,ID(s) interactor A,ID(s) interactor B,Alt. ID(s) interactor A,Alt. ID(s) interactor B,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,...,PTM PubMedID,PTM Figure legend(s),PTM Interaction AC,PTM Xref ID(s),agg_ptm_has_info,PTM og_binds_bo_feature_type,PTM new_binds_bo_feature_type,PTM og_binds_bo_annotation,PTM new_binds_bo_annotation,PTM Interactor Matches
0,intact:EBI-1001438,intact:EBI-296306,uniprotkb:O60566,uniprotkb:P45481,psi-mi:bub1b_human|psi-mi:BUB1B|uniprotkb:O605...,psi-mi:cbp_mouse|psi-mi:Crebbp|uniprotkb:E9QPH...,"psi-mi:""MI:0007""(anti tag coimmunoprecipitatio...",North et al. (2014),pubmed:24825348|imex:IM-23516,taxid:9606(human),...,pubmed:24825348|imex:IM-23516,figure legend:f3d,intact:EBI-9832689,-,True,unknown,unknown,,,A
1,intact:EBI-1001438,intact:EBI-477430,uniprotkb:O60566,uniprotkb:Q92831,psi-mi:bub1b_human|psi-mi:BUB1B|uniprotkb:O605...,psi-mi:kat2b_human|psi-mi:KAT2B|uniprotkb:Q6NS...,"psi-mi:""MI:0415""(enzymatic study)|psi-mi:""MI:0...",Choi et al. (2009)|North et al. (2014),imex:IM-15488|pubmed:19407811|mint:MINT-729973...,taxid:9606(human),...,pubmed:19407811|imex:IM-15488,figure legend:sf5,intact:EBI-6984518,mint:MINT-7300005(identity),True,unknown,unknown,yes,unknown,A
2,intact:EBI-15986737,intact:EBI-1002205,uniprotkb:O94235,uniprotkb:O59757,psi-mi:mps1_schpo|psi-mi:mph1|uniprotkb:Q9P7Z5...,psi-mi:knl1_schpo|psi-mi:spc7|uniprotkb:NMS co...,"psi-mi:""MI:0424""(protein kinase assay)",Yamagishi et al. (2012),pubmed:22660415|doi:10.1038/ncb2515|imex:IM-25168,taxid:284812(schpo),...,pubmed:22660415|imex:IM-25168,-,intact:EBI-15986772,-,True,unknown,unknown,,,B
3,intact:EBI-15824480,intact:EBI-1003422,uniprotkb:Q5U349,uniprotkb:Q07820,psi-mi:ubp2_rat|psi-mi:Usp2|uniprotkb:Q9QXL3|u...,psi-mi:mcl1_human|psi-mi:MCL1|uniprotkb:Q9HD91...,"psi-mi:""MI:0415""(enzymatic study)",Schwickart et al. (2010),pubmed:20023629|doi:10.1038/nature08646|imex:I...,taxid:10116(rat),...,pubmed:20023629|imex:IM-14556,-,intact:EBI-15824534,-,True,unknown,unknown,unknown,unknown,B
4,intact:EBI-1003422,intact:EBI-302524,uniprotkb:Q07820,uniprotkb:Q93008,psi-mi:mcl1_human|psi-mi:MCL1|uniprotkb:Q9HD91...,psi-mi:usp9x_human|psi-mi:USP9X|uniprotkb:O755...,"psi-mi:""MI:0006""(anti bait coimmunoprecipitati...",Schwickart et al. (2010),pubmed:20023629|doi:10.1038/nature08646|imex:I...,taxid:9606(human),...,pubmed:20023629|imex:IM-14556,-,intact:EBI-15824847,-,True,unknown,unknown,unknown,unknown,A


In [510]:
# Investigate database breakdown
# Look at the different databases the results came from: the database name is the
# text before the first ":" in "PTM Affected protein AC" (missing values count as "").
print("Investigating positive-PPIs merged with PTM data")
temp = merged_expl_ptm.loc[merged_expl_ptm["PTM Interactor Matches"].apply(lambda x: len(x)>0)]
print("\nDatabases yielding successful matches:")
print(temp["PTM Affected protein AC"].fillna("").str.split(":",expand=True)[0].value_counts())

temp = merged_expl_ptm.loc[merged_expl_ptm["PTM Interactor Matches"].apply(lambda x: len(x)==0)]
print("\nDatabases yielding unsuccessful matches:")
# Guard the empty case, as the negative-PPI breakdown does: with no unmatched rows
# the expanded split has no column 0 to select, so print a plain 0 instead.
if len(temp)>0:
    print(temp["PTM Affected protein AC"].fillna("").str.split(":",expand=True)[0].value_counts())
else:
    print(0)

Investigating positive-PPIs merged with PTM data

Databases yielding successful matches:
0
uniprotkb    12144
dip            981
intact         244
Name: count, dtype: int64

Databases yielding unsuccessful matches:
0
    7
Name: count, dtype: int64


In [511]:
# Database breakdown for the negative-PPI / PTM merge.
# The database name is the text before the first ":" in "PTM Affected protein AC".
print("Investigating negative-PPIs merged_neg with PTM data")
neg_match_lengths = merged_neg_expl_ptm["PTM Interactor Matches"].apply(len)

# Rows where at least one interactor matched.
temp = merged_neg_expl_ptm.loc[neg_match_lengths > 0]
print("\nDatabases yielding successful matches:")
neg_matched_db = temp["PTM Affected protein AC"].fillna("").str.split(":", expand=True)[0]
print(neg_matched_db.value_counts())

# Rows where no interactor matched; print 0 when there are none.
temp = merged_neg_expl_ptm.loc[neg_match_lengths == 0]
print("\nDatabases yielding unsuccessful matches:")
if len(temp) == 0:
    print(0)
else:
    neg_unmatched_db = temp["PTM Affected protein AC"].fillna("").str.split(":", expand=True)[0]
    print(neg_unmatched_db.value_counts())

Investigating negative-PPIs merged_neg with PTM data

Databases yielding successful matches:
0
uniprotkb    12144
dip            981
intact         244
Name: count, dtype: int64

Databases yielding unsuccessful matches:
0
    7
Name: count, dtype: int64


In [512]:
# Tally positive-PPI rows by their "PTM Interactor Matches" value
# (the output below shows "A", "B", "A,B" and the empty string for no match).
merged_expl_ptm["PTM Interactor Matches"].value_counts()

PTM Interactor Matches
B      9710
A      3341
A,B     318
          7
Name: count, dtype: int64

In [513]:
# Tally negative-PPI rows by their "PTM Interactor Matches" value
# (the output below shows "A", "B", "A,B" and the empty string for no match).
merged_neg_expl_ptm["PTM Interactor Matches"].value_counts()

PTM Interactor Matches
B      9710
A      3341
A,B     318
          7
Name: count, dtype: int64

In [514]:
# Summarise, for the positive-PPI merge, how many PTM rows matched interactor
# A only, B only, both, or neither.
print("Positive-PPIs: Matched ptms with their interactors A and/or B based on IDs.")
pos_match_values = merged_expl_ptm["PTM Interactor Matches"]
pos_row_total = len(merged_expl_ptm)
pos_unmatched = pos_match_values.apply(len) == 0

# no match
test1 = len(merged_expl_ptm.loc[pos_unmatched])
print(f"\tTotal rows where ptm partner could not be found: {test1}/{pos_row_total} ({100*test1/pos_row_total:.2f}%)")
# True when every unmatched row also lacks an affected-protein accession.
test1 = len(merged_expl_ptm.loc[pos_unmatched & merged_expl_ptm["PTM Affected protein AC"].notna()]) == 0
print(f"\t\tIn all cases, merged PTM database has no Affected protein AC: {test1}")

# A only
test1 = len(merged_expl_ptm.loc[pos_match_values.apply(lambda x: len(x) == 1 and x == "A")])
print(f"\tTotal rows where ptm partner is A only: {test1}/{pos_row_total} ({100*test1/pos_row_total:.2f}%)")

# B only
test1 = len(merged_expl_ptm.loc[pos_match_values.apply(lambda x: len(x) == 1 and x == "B")])
print(f"\tTotal rows where ptm partner is B only: {test1}/{pos_row_total} ({100*test1/pos_row_total:.2f}%)")

# A and B
test1 = len(merged_expl_ptm.loc[pos_match_values.apply(lambda x: x == "A,B")])
print(f"\tTotal rows where ptm partner is A and B: {test1}/{pos_row_total} ({100*test1/pos_row_total:.2f}%)")


Positive-PPIs: Matched ptms with their interactors A and/or B based on IDs.
	Total rows where ptm partner could not be found: 7/13376 (0.05%)
		In all cases, merged PTM database has no Affected protein AC: True
	Total rows where ptm partner is A only: 3341/13376 (24.98%)
	Total rows where ptm partner is B only: 9710/13376 (72.59%)
	Total rows where ptm partner is A and B: 318/13376 (2.38%)


In [515]:
# Summarise, for the negative-PPI merge, how many PTM rows matched interactor
# A only, B only, both, or neither.
print("Negative-PPIs: Matched ptms with their interactors A and/or B based on IDs.")
neg_match_values = merged_neg_expl_ptm["PTM Interactor Matches"]
neg_row_total = len(merged_neg_expl_ptm)
neg_unmatched = neg_match_values.apply(len) == 0

# no match
test1 = len(merged_neg_expl_ptm.loc[neg_unmatched])
print(f"\tTotal rows where ptm partner could not be found: {test1}/{neg_row_total} ({100*test1/neg_row_total:.2f}%)")
# True when every unmatched row also lacks an affected-protein accession.
test1 = len(merged_neg_expl_ptm.loc[neg_unmatched & merged_neg_expl_ptm["PTM Affected protein AC"].notna()]) == 0
print(f"\t\tIn all cases, merged_neg PTM database has no Affected protein AC: {test1}")

# A only
test1 = len(merged_neg_expl_ptm.loc[neg_match_values.apply(lambda x: len(x) == 1 and x == "A")])
print(f"\tTotal rows where ptm partner is A only: {test1}/{neg_row_total} ({100*test1/neg_row_total:.2f}%)")

# B only
test1 = len(merged_neg_expl_ptm.loc[neg_match_values.apply(lambda x: len(x) == 1 and x == "B")])
print(f"\tTotal rows where ptm partner is B only: {test1}/{neg_row_total} ({100*test1/neg_row_total:.2f}%)")

# A and B
test1 = len(merged_neg_expl_ptm.loc[neg_match_values.apply(lambda x: x == "A,B")])
print(f"\tTotal rows where ptm partner is A and B: {test1}/{neg_row_total} ({100*test1/neg_row_total:.2f}%)")


Negative-PPIs: Matched ptms with their interactors A and/or B based on IDs.
	Total rows where ptm partner could not be found: 7/13376 (0.05%)
		In all cases, merged_neg PTM database has no Affected protein AC: True
	Total rows where ptm partner is A only: 3341/13376 (24.98%)
	Total rows where ptm partner is B only: 9710/13376 (72.59%)
	Total rows where ptm partner is A and B: 318/13376 (2.38%)


In [516]:
# Show the positive-PPI rows whose "PTM Interactor Matches" is empty, restricted to
# the identifying columns plus the scraped and aggregated PTM columns.
merged_expl_ptm.loc[
    merged_expl_ptm["PTM Interactor Matches"].apply(len) == 0,
    [
        "interaction_intactid", "year",
        "intactid_1", "intactid_2",
        "aa_1", "aa_2",
        "length_1", "length_2",
    ] + scraped_ptm_cols + agg_ptm_cols,
]

Unnamed: 0,interaction_intactid,year,intactid_1,intactid_2,aa_1,aa_2,length_1,length_2,ptm_mi_1,ptm_name_1,...,PTM Feature annotation(s),PTM Affected protein AC,PTM Affected protein symbol,PTM Affected protein full name,PTM Affected protein organism,PTM Interaction participants,PTM PubMedID,PTM Figure legend(s),PTM Interaction AC,PTM Xref ID(s)
600,EBI-8735113,1999,intact:EBI-80597,intact:EBI-1047946,MASGADSKGDDLSTAILKQKNRPNRLIVDEAINEDNSVVSLSQPKM...,MTSRLRALGGRINNIRTSELPKEKTRSEVICSIHFLDGVVQTFKVT...,806,913,,,...,,,,,,,,,,
4094,EBI-15098123,2002,intact:EBI-1644164,intact:EBI-347088,MAEAPASPAPLSPLEVELDPEFEPQSRPRSCTWPLQRPELQASPAK...,MDKNELVQKAKLAEQAERYDDMAACMKSVTEQGAELSNEERNLLSV...,673,245,MI:1225,"disrupting-ptm,mutation disrupting interaction",...,,,,,,,,,,
4108,EBI-8436077,2010,intact:EBI-712311,intact:EBI-16715157|intact:EBI-73837,MDEKVFTKELDQWIEQLNECKQLSESQVKSLCEKAKEILTKESNVQ...,MVSSQKLEKPIEMGSSEPLPIADGDRRRKKKRRGRATDSLPGKFED...,309,347,,,...,,,,,,,,,,
7244,EBI-9678126,2012,intact:EBI-28981518|intact:EBI-726858,intact:EBI-1642546|intact:EBI-351018,MANFLLPRGTSSFRRFTRESLAAIEKRMAEKQARGSTTLQESREGL...,MASTTTCTRFTDEYQLFEELGKGAFSVVRRCMKIPTGQEYAAKIIN...,2016,499,MI:0639,"resulting-ptm,sufficient binding region",...,,,,,,,,,,
7246,EBI-9674467,2012,intact:EBI-28981518|intact:EBI-726858,intact:EBI-1642546|intact:EBI-351018,MANFLLPRGTSSFRRFTRESLAAIEKRMAEKQARGSTTLQESREGL...,MASTTTCTRFTDEYQLFEELGKGAFSVVRRCMKIPTGQEYAAKIIN...,2016,499,MI:0639,"resulting-ptm,sufficient binding region",...,,,,,,,,,,
8462,EBI-7936521,2008,MINT-6798492|intact:EBI-7936357,intact:EBI-4314481,MNKIYKVKKNAAGHLVACSEFAKGHTKKAVLGSLLIVGILGMATTA...,MGHLSAPLHRVRVPWQGLLLTASLLTFWNPPTTAQLTTESMPFNVA...,913,526,MI:0639,"resulting-ptm,mutation disrupting interaction",...,,,,,,,,,,
10037,EBI-21944249,2017,intact:EBI-9916016,intact:EBI-21356892|intact:EBI-720984,MSTPARRRLMRDFKRLQEDPPVGVSGAPSENNIMQWNAVIFGPEGT...,MPLFFRKRKPSEEARKRLEYQMCLAKEAGADDILDISKCELSEIPF...,152,723,,,...,,,,,,,,,,


In [517]:
def check_ptm_indices_for_redundancy(s):
    """
    Tell whether every range in a comma-separated range string is internally consistent.

    Each item looks like "start-end", where either side may be written as
    "x..y". A side is consistent when all of its ".."-separated pieces are the
    same value (e.g. "1..1"). Returns True when all items are consistent, or
    when the input is None or a float (a missing value); returns False as soon
    as one side mixes different values (e.g. "1..2").
    """
    # Missing values have nothing to check.
    if s is None or type(s) == float:
        return True

    for item in s.split(","):
        pieces = item.split("-")
        start_side, end_side = pieces[0], pieces[1]
        start_uniform = len(set(start_side.split(".."))) == 1
        end_uniform = len(set(end_side.split(".."))) == 1
        if not (start_uniform and end_uniform):
            return False
    return True

def clean_ptm_redundant_indices(s):
    """
    Collapse redundant range notation such as "1..1-2..2" into "1-2".

    The input is a comma-separated list of "start-end" items, where each side
    may be written as "x..y". An item is accepted only when each side repeats
    a single value and that value consists of digits.

    Returns the cleaned, comma-joined string, or None when the input is not a
    string (None, NaN, pd.NA, ...) or when any item is malformed: no "-"
    separator, differing values around "..", or a non-numeric bound such as "?".
    """
    if not isinstance(s, str):
        return None

    fixed = []
    for item in s.split(","):
        parts = item.split("-")
        if len(parts) < 2:
            # No "-" separator: not a start-end range.
            return None
        left, right = parts[0], parts[1]

        # Both sides must repeat one value around "..".
        if len(set(left.split(".."))) != 1 or len(set(right.split(".."))) != 1:
            return None

        new_item_left = left.split("..")[0]
        new_item_right = right.split("..")[0]

        # Both bounds must be numeric. isdigit must be *called* on each side;
        # a bare method reference is always truthy and validates nothing.
        if not (new_item_left.isdigit() and new_item_right.isdigit()):
            return None

        fixed.append(f"{new_item_left}-{new_item_right}")

    return ",".join(fixed)

In [518]:
# correct ranges: collapse "x..x-y..y" style entries to "x-y" (unparseable ones become None)
merged_expl_ptm["PTM Feature range(s)"] = merged_expl_ptm["PTM Feature range(s)"].apply(clean_ptm_redundant_indices)
display(merged_expl_ptm[["PTM Feature range(s)"]].head())

# Sanity check via a temporary flag column, dropped again at the end of the cell.
# NOTE(review): the check runs on the already-cleaned column, where
# clean_ptm_redundant_indices has replaced every inconsistent range with None,
# so it looks like it can no longer fail here -- confirm whether it should run first.
merged_expl_ptm["PTM redundant_indices"] = merged_expl_ptm["PTM Feature range(s)"].apply(check_ptm_indices_for_redundancy)
test1 = len(merged_expl_ptm.loc[merged_expl_ptm["PTM redundant_indices"].eq(False)]) == 0
print(f"None of the rows where indices look like 1..1-2..2 have a different index before and after the .. : {test1}")
merged_expl_ptm = merged_expl_ptm.drop(columns="PTM redundant_indices")

Unnamed: 0,PTM Feature range(s)
0,668-668
1,250-250
2,257-257
3,
4,


None of the rows where indices look like 1..1-2..2 have a different index before and after the .. : True


In [519]:
# Show the rows of ptms whose "Feature range(s)" contains a literal "?".
# Missing ranges are treated as "" so the mask has no NaN; "\\?" escapes the
# question mark because str.contains interprets the pattern as a regex by default.
ptms.loc[ptms["Feature range(s)"].fillna("").str.contains("\\?")]

Unnamed: 0,# Feature AC,Feature short label,Feature range(s),Original sequence,Resulting sequence,Feature type,Feature annotation(s),Affected protein AC,Affected protein symbol,Affected protein full name,Affected protein organism,Interaction participants,PubMedID,Figure legend(s),Interaction AC,Xref ID(s)
6,EBI-10696222,possible_phophosite,?-?,-,-,"psi-mi:""MI:0170""(phosphorylated residue)",-,uniprotkb:P46108-2,uniprotkb:CRK(gene name),Adapter molecule crk,taxid:9606(human)|taxid:9606(Homo sapiens),"(uniprotkb:O60496(psi-mi:""MI:0326""(protein)), ...",pubmed:25814554|imex:IM-22632,figure legend:Suppl. table S3,intact:EBI-10696209,-
9,EBI-10696220,possible_phophosite,?-?,-,-,"psi-mi:""MI:0170""(phosphorylated residue)",-,uniprotkb:O60496,uniprotkb:DOK2(gene name),Docking protein 2,taxid:9606(human)|taxid:9606(Homo sapiens),"(uniprotkb:O60496(psi-mi:""MI:0326""(protein)), ...",pubmed:25814554|imex:IM-22632,figure legend:Suppl. table S3,intact:EBI-10696209,-
16,EBI-26450950,tyr-?,?-?,-,-,"psi-mi:""MI:0178""(O4'-phospho-L-tyrosine)",-,uniprotkb:Q8NDB2,uniprotkb:BANK1(gene name),B-cell scaffold protein with ankyrin repeats,taxid:9606(human)|taxid:9606(Homo sapiens),"(uniprotkb:P07948(psi-mi:""MI:0326""(protein)), ...",pubmed:11782428|imex:IM-28511,figure legend:Fig. 5A,intact:EBI-26450945,-
25,EBI-11290674,possible_phospho_res,?-?,-,-,"psi-mi:""MI:0170""(phosphorylated residue)",-,uniprotkb:Q01147,uniprotkb:Creb1(gene name),Cyclic AMP-responsive element-binding protein 1,taxid:10090(mouse)|taxid:10090(Mus musculus),"(ensembl:ENSMUSG00000039521(psi-mi:""MI:0250""(g...",pubmed:17591856|imex:IM-24534,figure legend:5B,intact:EBI-11290665,-
26,EBI-8557398,sumoylated lysine,?-?,-,-,"psi-mod:""MOD:01149""(sumoylated lysine)",resulting-ptm:resulting-ptm,uniprotkb:Q92844,uniprotkb:TANK(gene name),TRAF family member-associated NF-kappa-B activ...,taxid:9606(human)|taxid:9606(Homo sapiens),"(uniprotkb:Q9UHD2(psi-mi:""MI:0326""(protein)), ...",pubmed:21212807|imex:IM-15406,figure legend:f1a,intact:EBI-8557380,mint:MINT-8151326(identity)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10275,EBI-984462,region,?-?,-,-,"psi-mi:""MI:0176""(O-phospho-L-serine)",-,uniprotkb:P09803,uniprotkb:Cdh1(gene name),Cadherin-1,taxid:10090(mouse)|taxid:10090(Mus musculus),"(uniprotkb:Q02248(psi-mi:""MI:0326""(protein)), ...",pubmed:16293619|imex:IM-14500,figure legend:3 and Table 1,intact:EBI-984457,-
10277,EBI-10692940,possible_phophosite,?-?,-,-,"psi-mi:""MI:0170""(phosphorylated residue)",-,uniprotkb:Q9UKG1,uniprotkb:APPL1(gene name),DCC-interacting protein 13-alpha,taxid:9606(human)|taxid:9606(Homo sapiens),"(uniprotkb:Q9UKG1(psi-mi:""MI:0326""(protein)), ...",pubmed:25814554|imex:IM-22632,figure legend:Suppl. table S3,intact:EBI-10692929,-
10282,EBI-6374967,region,?-?,-,-,"psi-mod:""MOD:00018""(L-histidine residue)",-,uniprotkb:Q12948,uniprotkb:FOXC1(gene name),Forkhead box protein C1,taxid:9606(human)|taxid:9606(Homo sapiens),"(uniprotkb:Q07021(psi-mi:""MI:0326""(protein)), ...",pubmed:18676636|imex:IM-17919,figure legend:1A,intact:EBI-6374962,-
10284,EBI-9825471,decreasing_phosres,?-?,-,-,"psi-mi:""MI:0170""(phosphorylated residue)",ptm decreasing an interaction,uniprotkb:G3H996,uniprotkb:Ctnnb1(gene name),-,"taxid:10029(crigr)|taxid:10029(""Cricetulus gri...","(uniprotkb:G3HL00(psi-mi:""MI:0326""(protein)), ...",pubmed:21118991|imex:IM-22907,figure legend:Fig.4C,intact:EBI-9819035,-


In [520]:
# correct ranges: collapse "x..x-y..y" style entries to "x-y" (unparseable ones become None)
merged_neg_expl_ptm["PTM Feature range(s)"] = merged_neg_expl_ptm["PTM Feature range(s)"].apply(clean_ptm_redundant_indices)
display(merged_neg_expl_ptm[["PTM Feature range(s)"]].head())

# Sanity check via a temporary flag column, dropped again at the end of the cell.
# NOTE(review): the check runs on the already-cleaned column, where
# clean_ptm_redundant_indices has replaced every inconsistent range with None,
# so it looks like it can no longer fail here -- confirm whether it should run first.
merged_neg_expl_ptm["PTM redundant_indices"] = merged_neg_expl_ptm["PTM Feature range(s)"].apply(check_ptm_indices_for_redundancy)
test1 = len(merged_neg_expl_ptm.loc[merged_neg_expl_ptm["PTM redundant_indices"].eq(False)]) == 0
print(f"None of the rows where indices look like 1..1-2..2 have a different index before and after the .. : {test1}")
merged_neg_expl_ptm = merged_neg_expl_ptm.drop(columns="PTM redundant_indices")

Unnamed: 0,PTM Feature range(s)
0,668-668
1,250-250
2,257-257
3,
4,


None of the rows where indices look like 1..1-2..2 have a different index before and after the .. : True


In [521]:
# Count "?" placeholders in the raw PTM table, then confirm that none remain in
# the cleaned range columns of the positive and negative merges.
raw_qmark_mask = ptms["Feature range(s)"].fillna("").str.contains("\\?")
test1 = len(ptms.loc[raw_qmark_mask])
print(f"In original ptms database, {test1} rows had ? in their Feature Range(s)")

pos_qmark_mask = merged_expl_ptm["PTM Feature range(s)"].fillna("").str.contains("\\?")
test1 = len(merged_expl_ptm.loc[pos_qmark_mask]) == 0
print(f"Processed merged_expl_ptm has no ? in its Feature Range(s): {test1}")

neg_qmark_mask = merged_neg_expl_ptm["PTM Feature range(s)"].fillna("").str.contains("\\?")
test1 = len(merged_neg_expl_ptm.loc[neg_qmark_mask]) == 0
print(f"Processed merged_neg_expl_ptm has no ? in its Feature Range(s): {test1}")

In original ptms database, 1845 rows had ? in their Feature Range(s)
Processed merged_expl_ptm has no ? in its Feature Range(s): True
Processed merged_neg_expl_ptm has no ? in its Feature Range(s): True


In [522]:
# make new ranges
def convert_ptm_begin_end_into_range(row, partner:int|str = 1):
    """
    Take ptm_begin_1 and ptm_end_1 (or _2) and convert into Mutation Feature range(s) format 
    e.g. ptm_begin_1 = 1033 and ptm_end_1 = 1033 --> Mutation Feature range(s) = "1033-1033"
    """
    partner = str(partner)
    beg_col = f"ptm_begin_{partner}"
    end_col = f"ptm_end_{partner}"
    
    if (row[beg_col] is None) or type(row[beg_col]) in [float,pd._libs.missing.NAType] or (row[end_col] is None) or type(row[end_col])in [float,pd._libs.missing.NAType]:
        return None
    
    # there may be commas. e.g. 91, 94. split on commas and match by index
    try:
        begins = [int(x.strip()) for x in row[beg_col].split(",")]
        ends = [int(x.strip()) for x in row[end_col].split(",")]
    except:
        return None
    
    if len(begins)!=len(ends):
        return None
    ranges = []
    for i in range(len(begins)):
        ranges.append(f"{begins[i]}-{ends[i]}")
    return ",".join(ranges)

In [523]:
# Derive "begin-end" range strings for both interactors of the positive-PPI merge.
merged_expl_ptm["ptm_range_1"] = merged_expl_ptm.apply(
    convert_ptm_begin_end_into_range, axis=1, partner=1
)
merged_expl_ptm["ptm_range_2"] = merged_expl_ptm.apply(
    convert_ptm_begin_end_into_range, axis=1, partner=2
)

In [524]:
# Derive "begin-end" range strings for both interactors of the negative-PPI merge.
merged_neg_expl_ptm["ptm_range_1"] = merged_neg_expl_ptm.apply(
    convert_ptm_begin_end_into_range, axis=1, partner=1
)
merged_neg_expl_ptm["ptm_range_2"] = merged_neg_expl_ptm.apply(
    convert_ptm_begin_end_into_range, axis=1, partner=2
)

In [525]:
# Preview the first positive-PPI rows that have both a derived interactor-1 range
# and a curated PTM feature type (rows missing either value are dropped first).
merged_expl_ptm[["ptm_range_1","PTM Feature type"]].dropna().head()

Unnamed: 0,ptm_range_1,PTM Feature type
0,668-668,"psi-mod:""MOD:00394""(monoacetylated residue)"
1,250-250,"psi-mod:""MOD:00394""(monoacetylated residue)"
10,1214-1214,"psi-mi:""MI:0170""(phosphorylated residue)"
25,17-17,"psi-mi:""MI:0177""(O-phospho-L-threonine)"
30,16-16,"psi-mi:""MI:0176""(O-phospho-L-serine)"


In [526]:
# Preview the derived interactor-2 range for the first negative-PPI rows
# (missing values are kept, so empty entries are expected).
merged_neg_expl_ptm[["ptm_range_2"]].head()

Unnamed: 0,ptm_range_2
0,
1,
2,257-257
3,0-0
4,


In [527]:
# Compare scraped and curated PTM fields side by side for rows whose affected
# protein accession contains "dip".
merged_expl_ptm.loc[
    merged_expl_ptm["PTM Affected protein AC"].fillna("").str.contains("dip"),
    [
        "ptm_orig_1", "ptm_new_1", "ptm_range_1", "ptm_short_1",
        "ptm_orig_2", "ptm_new_2", "ptm_range_2", "ptm_short_2",
        "PTM Feature short label", "PTM Feature range(s)",
    ],
]

Unnamed: 0,ptm_orig_1,ptm_new_1,ptm_range_1,ptm_short_1,ptm_orig_2,ptm_new_2,ptm_range_2,ptm_short_2,PTM Feature short label,PTM Feature range(s)
484,,,,,,,5-5,arg-5,arg-5,5-5
485,,,,,,,3-3,arg-3,arg-3,3-3
486,,,,,,,4-4,Arg4,Arg4,4-4
509,,,,,,,2-2,R2me2,R2me2,2-2
510,,,,,,,3-3,R3me2,R3me2,3-3
...,...,...,...,...,...,...,...,...,...,...
12385,,,,,,,63-63,K63,K11,11-11
12386,,,,,,,63-63,K63,K48,48-48
12387,,,,,,,48-48,K48,K63,63-63
12389,,,,,,,48-48,K48,K11,11-11


In [528]:
# Keep positive-PPI rows whose scraped PTM either
#   (1) has no curated (aggregated) PTM attached at all, or
#   (2) has a curated PTM that agrees with it on a matched interactor:
#       same short label and same range on side 1 (match "A") or side 2 (match "B").
merged_expl_ptm_filt = merged_expl_ptm.loc[
    ((merged_expl_ptm["scraped_ptm_has_info"]) & ~(merged_expl_ptm["agg_ptm_has_info"])) |
    (
        (merged_expl_ptm["scraped_ptm_has_info"]) & 
        (merged_expl_ptm["agg_ptm_has_info"]) & 
        # `&` binds tighter than `|`, so the A-side and B-side agreement tests must
        # share one parenthesised group; otherwise the B-side test alone would admit
        # a row regardless of the two has-info flags.
        (
            (
                (merged_expl_ptm["PTM Interactor Matches"].str.contains("A")) & 
                (merged_expl_ptm["ptm_short_1"]==merged_expl_ptm["PTM Feature short label"]) & 
                (merged_expl_ptm["ptm_range_1"]==merged_expl_ptm["PTM Feature range(s)"])
            ) | 
            (
                (merged_expl_ptm["PTM Interactor Matches"].str.contains("B")) & 
                (merged_expl_ptm["ptm_short_2"]==merged_expl_ptm["PTM Feature short label"]) &
                (merged_expl_ptm["ptm_range_2"]==merged_expl_ptm["PTM Feature range(s)"])
            )
        )
    )
]
# Row counts before and after filtering.
print(len(merged_expl_ptm))
print(len(merged_expl_ptm_filt))
merged_expl_ptm_filt.head()[[
    "interaction_intactid","PTM Interactor Matches","PTM Feature type","PTM Feature short label","ptm_short_1","ptm_short_2","PTM Feature range(s)","ptm_range_1","ptm_range_2"
]]

13376
6818


Unnamed: 0,interaction_intactid,PTM Interactor Matches,PTM Feature type,PTM Feature short label,ptm_short_1,ptm_short_2,PTM Feature range(s),ptm_range_1,ptm_range_2
0,EBI-9832689,A,"psi-mod:""MOD:00394""(monoacetylated residue)",acres,acres,,668-668,668-668,
1,EBI-6984518,A,"psi-mod:""MOD:00394""(monoacetylated residue)",acetylated residue,acetylated residue,,250-250,250-250,
2,EBI-15986772,B,"psi-mi:""MI:0177""(O-phospho-L-threonine)",pT257,,pT257,257-257,,257-257
10,EBI-6967736,A,"psi-mi:""MI:0170""(phosphorylated residue)",phosphorylated,phosphorylated,,1214-1214,1214-1214,
11,EBI-7156123,B,"psi-mi:""MI:0170""(phosphorylated residue)",phosphorylated,,phosphorylated,1175-1175,,1175-1175


In [529]:
# Keep negative-PPI rows whose scraped PTM either
#   (1) has no curated (aggregated) PTM attached at all, or
#   (2) has a curated PTM that agrees with it on a matched interactor:
#       same short label and same range on side 1 (match "A") or side 2 (match "B").
merged_neg_expl_ptm_filt = merged_neg_expl_ptm.loc[
    ((merged_neg_expl_ptm["scraped_ptm_has_info"]) & ~(merged_neg_expl_ptm["agg_ptm_has_info"])) |
    (
        (merged_neg_expl_ptm["scraped_ptm_has_info"]) & 
        (merged_neg_expl_ptm["agg_ptm_has_info"]) & 
        # `&` binds tighter than `|`, so the A-side and B-side agreement tests must
        # share one parenthesised group; otherwise the B-side test alone would admit
        # a row regardless of the two has-info flags.
        (
            (
                (merged_neg_expl_ptm["PTM Interactor Matches"].str.contains("A")) & 
                (merged_neg_expl_ptm["ptm_short_1"]==merged_neg_expl_ptm["PTM Feature short label"]) & 
                (merged_neg_expl_ptm["ptm_range_1"]==merged_neg_expl_ptm["PTM Feature range(s)"])
            ) | 
            (
                (merged_neg_expl_ptm["PTM Interactor Matches"].str.contains("B")) & 
                (merged_neg_expl_ptm["ptm_short_2"]==merged_neg_expl_ptm["PTM Feature short label"]) &
                (merged_neg_expl_ptm["ptm_range_2"]==merged_neg_expl_ptm["PTM Feature range(s)"])
            )
        )
    )
]
# Row counts before and after filtering.
print(len(merged_neg_expl_ptm))
print(len(merged_neg_expl_ptm_filt))
merged_neg_expl_ptm_filt.head()[[
    "interaction_intactid","PTM Interactor Matches","PTM Feature type","PTM Feature short label","ptm_short_1","ptm_short_2","PTM Feature range(s)","ptm_range_1","ptm_range_2"
]]

13376
6145


Unnamed: 0,interaction_intactid,PTM Interactor Matches,PTM Feature type,PTM Feature short label,ptm_short_1,ptm_short_2,PTM Feature range(s),ptm_range_1,ptm_range_2
2,EBI-15986772,B,"psi-mi:""MI:0177""(O-phospho-L-threonine)",pT257,,pT257,257-257,,257-257
11,EBI-7156123,B,"psi-mi:""MI:0170""(phosphorylated residue)",phosphorylated,,phosphorylated,1175-1175,,1175-1175
12,EBI-16882331,B,"psi-mi:""MI:0178""(O4'-phospho-L-tyrosine)",tyr-419,,tyr-419,419-419,,419-419
14,EBI-16880960,B,"psi-mi:""MI:0178""(O4'-phospho-L-tyrosine)",tyr-419,,tyr-419,419-419,,419-419
15,EBI-8523333,B,"psi-mi:""MI:0170""(phosphorylated residue)",phosphorylated,,phosphorylated,1175-1175,,1175-1175


In [530]:
# Count filtered rows that lack a curated PTM range, for both datasets.
test1 = int(merged_expl_ptm_filt["PTM Feature range(s)"].isna().sum())
print(f"Total rows in merged_expl_ptm_filt with no PTM Feature range(s): {test1}")
test1 = int(merged_neg_expl_ptm_filt["PTM Feature range(s)"].isna().sum())
print(f"Total rows in merged_neg_expl_ptm_filt with no PTM Feature range(s): {test1}")

# Same count for the curated PTM short label.
pos_label_missing = merged_expl_ptm_filt["PTM Feature short label"].isna()
test1 = int(pos_label_missing.sum())
print(f"\nTotal rows in merged_expl_ptm_filt with no PTM Feature short label: {test1}")
test1 = int(merged_neg_expl_ptm_filt["PTM Feature short label"].isna().sum())
print(f"Total rows in merged_neg_expl_ptm_filt with no PTM Feature short label: {test1}")

# Show the positive rows without a curated label next to their scraped labels.
display(merged_expl_ptm_filt.loc[pos_label_missing, ["PTM Feature short label", "ptm_short_1", "ptm_short_2"]])

Total rows in merged_expl_ptm_filt with no PTM Feature range(s): 7
Total rows in merged_neg_expl_ptm_filt with no PTM Feature range(s): 7

Total rows in merged_expl_ptm_filt with no PTM Feature short label: 7
Total rows in merged_neg_expl_ptm_filt with no PTM Feature short label: 7


Unnamed: 0,PTM Feature short label,ptm_short_1,ptm_short_2
600,,,P26045:p.[Tyr676Phe;Asp811Ala]
4094,,O43524:p.[Thr32Ala;Ser253Ala;Ser315Ala],
4108,,,region
7244,,region,
7246,,region,
8462,,Q8GH87:p.Ala640Glu,
10037,,,Q6UWE0:p.Leu708_Ser723delinsArgCysAlaAlaArgThr...


In [531]:
# Inspect interactor-2 PTM fields of filtered rows whose affected protein
# accession contains "dip".
merged_expl_ptm_filt.loc[
    merged_expl_ptm_filt["PTM Affected protein AC"].fillna("").str.contains("dip"),
    [
        "interaction_intactid", "mol_type_2",
        "ptm_orig_2", "ptm_new_2", "ptm_range_2", "ptm_short_2",
        "PTM Feature short label", "PTM Feature range(s)",
    ],
]

Unnamed: 0,interaction_intactid,mol_type_2,ptm_orig_2,ptm_new_2,ptm_range_2,ptm_short_2,PTM Feature short label,PTM Feature range(s)
484,EBI-15576932,peptide,,,5-5,arg-5,arg-5,5-5
485,EBI-15576955,peptide,,,3-3,arg-3,arg-3,3-3
486,EBI-15576996,peptide,,,4-4,Arg4,Arg4,4-4
509,EBI-15883680,peptide,,,2-2,R2me2,R2me2,2-2
510,EBI-15883793,peptide,,,3-3,R3me2,R3me2,3-3
...,...,...,...,...,...,...,...,...
8524,EBI-16182526,protein,,,,,K27Ub,27-27
8529,EBI-16182526,protein,,,,,K63Ub,63-63
12381,EBI-15963927,protein,,,11-11,K11,K11,11-11
12383,EBI-15963927,protein,,,63-63,K63,K63,63-63


In [532]:
# Compare the number of distinct interactions with the number of distinct sequence pairs.
filt_seq_pair_ids = merged_expl_ptm_filt["seq_pair_id"]
print(merged_expl_ptm_filt["interaction_intactid"].nunique())
print(filt_seq_pair_ids.nunique())

# Sequence pairs that occur on more than one row.
dup_seq_pair_ids = filt_seq_pair_ids[filt_seq_pair_ids.duplicated()].unique().tolist()

# List all rows of those repeated pairs, ordered by pair and then by interaction.
dup_view_cols = [
    "seq_pair_id", "interaction_intactid", "PTM Interactor Matches",
    "PTM Feature short label", "ptm_short_1", "ptm_short_2",
] + scraped_ptm_cols + agg_ptm_cols
(
    merged_expl_ptm_filt.loc[filt_seq_pair_ids.isin(dup_seq_pair_ids)]
    .sort_values(by=["seq_pair_id", "interaction_intactid"])[dup_view_cols]
    .reset_index(drop=True)
)

5395
2545


Unnamed: 0,seq_pair_id,interaction_intactid,PTM Interactor Matches,PTM Feature short label,ptm_short_1,ptm_short_2,ptm_mi_1,ptm_name_1,ptm_short_1.1,ptm_begin_1,...,PTM Feature annotation(s),PTM Affected protein AC,PTM Affected protein symbol,PTM Affected protein full name,PTM Affected protein organism,PTM Interaction participants,PTM PubMedID,PTM Figure legend(s),PTM Interaction AC,PTM Xref ID(s)
0,seqpair100340,EBI-15634428,B,lys-4,,lys-4,,,,,...,-,dip:DIP-29329N,-,Histone H3 N-terminal Peptide,"taxid:-2(chemical synthesis)|taxid:-2(""Chemica...","(uniprotkb:P41229(psi-mi:""MI:0326""(protein)), ...",pubmed:17468742|imex:IM-21849,-,intact:EBI-15634428,-
1,seqpair100340,EBI-15634428,B,K4me2,,K4me2,,,,,...,-,dip:DIP-29329N,-,Histone H3 N-terminal Peptide,"taxid:-2(chemical synthesis)|taxid:-2(""Chemica...","(uniprotkb:P41229(psi-mi:""MI:0326""(protein)), ...",pubmed:17468742|imex:IM-21849,-,intact:EBI-15634428,-
2,seqpair100340,EBI-15634428,B,K4me3,,K4me3,,,,,...,-,dip:DIP-29329N,-,Histone H3 N-terminal Peptide,"taxid:-2(chemical synthesis)|taxid:-2(""Chemica...","(uniprotkb:P41229(psi-mi:""MI:0326""(protein)), ...",pubmed:17468742|imex:IM-21849,-,intact:EBI-15634428,-
3,seqpair100354,EBI-1246588,B,lys4,,lys4,,,,,...,-,uniprotkb:P68432,-,Histone H3.1,"taxid:9913(bovin)|taxid:9913(""Bos taurus (Bovi...","(uniprotkb:P41229(psi-mi:""MI:0326""(protein)), ...",pubmed:17320160|imex:IM-11878,figure legend:2,intact:EBI-1246588,-
4,seqpair100354,EBI-1246588,B,lys4,,lys4,,,,,...,-,uniprotkb:P68432,-,Histone H3.1,"taxid:9913(bovin)|taxid:9913(""Bos taurus (Bovi...","(uniprotkb:P41229(psi-mi:""MI:0326""(protein)), ...",pubmed:17320160|imex:IM-11878,figure legend:2,intact:EBI-1246588,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5475,seqpair95045,EBI-9462936,B,phosphotyrosine,,phosphotyrosine,,,,,...,-,uniprotkb:Q13480,uniprotkb:GAB1(gene name),GRB2-associated-binding protein 1,taxid:9606(human)|taxid:9606(Homo sapiens),"(uniprotkb:Q9H6Q3(psi-mi:""MI:0326""(protein)), ...",pubmed:24728074|imex:IM-22269,"figure legend:Supplementary table 1, supplemen...",intact:EBI-9462936,-
5476,seqpair95398,EBI-22079913,B,n-formylmet,,n-formylmet,,,,,...,-,uniprotkb:P02741-PRO_0000023526,uniprotkb:CRP(gene name),C-reactive protein,taxid:9606(human)|taxid:9606(Homo sapiens),"(uniprotkb:P08603(psi-mi:""MI:0326""(protein)), ...",pubmed:17244159|imex:IM-27413,"figure legend:5A, 8",intact:EBI-22079913,-
5477,seqpair95398,EBI-22079938,B,n-formylmet,,n-formylmet,,,,,...,-,uniprotkb:P02741-PRO_0000023526,uniprotkb:CRP(gene name),C-reactive protein,taxid:9606(human)|taxid:9606(Homo sapiens),"(uniprotkb:P08603(psi-mi:""MI:0326""(protein)), ...",pubmed:17244159|imex:IM-27413,figure legend:5B,intact:EBI-22079938,-
5478,seqpair994,EBI-16880960,B,tyr-419,,tyr-419,,,,,...,-,uniprotkb:P12931,uniprotkb:SRC(gene name),Proto-oncogene tyrosine-protein kinase Src,taxid:9606(human)|taxid:9606(Homo sapiens),"(uniprotkb:P35968(psi-mi:""MI:0326""(protein)), ...",pubmed:12509223|imex:IM-26377,figure legend:Fig. 3A,intact:EBI-16880960,-


In [533]:
# Find rows where the curated PTM matched one interactor but the scraped range
# exists only for the other interactor, report both counts, then drop those rows.
pos_only_side1 = merged_expl_ptm_filt["ptm_range_1"].notna() & merged_expl_ptm_filt["ptm_range_2"].isna()
pos_only_side2 = merged_expl_ptm_filt["ptm_range_2"].notna() & merged_expl_ptm_filt["ptm_range_1"].isna()
pos_b_match_a_data = (merged_expl_ptm_filt["PTM Interactor Matches"] == "B") & pos_only_side1
pos_a_match_b_data = (merged_expl_ptm_filt["PTM Interactor Matches"] == "A") & pos_only_side2

test1 = len(merged_expl_ptm_filt.loc[pos_b_match_a_data])
print(f"There are {test1} rows where curated-ptm data matches interactor B, but we only pulled data for interactor A.")
test1 = len(merged_expl_ptm_filt.loc[pos_a_match_b_data])
print(f"There are {test1} rows where curated-ptm data matches interactor A, but we only pulled data for interactor B.")

# Both conditions are evaluated per row, so one combined filter removes the same rows
# as two successive filters.
merged_expl_ptm_filt = merged_expl_ptm_filt.loc[~(pos_b_match_a_data | pos_a_match_b_data)]
print(f"Dropped these rows. Remaining: {len(merged_expl_ptm_filt)}")

There are 0 rows where curated-ptm data matches interactor B, but we only pulled data for interactor A.
There are 0 rows where curated-ptm data matches interactor A, but we only pulled data for interactor B.
Dropped these rows. Remaining: 6818


In [534]:
# Find rows where the curated PTM matched one interactor but the scraped range
# exists only for the other interactor, report both counts, then drop those rows.
neg_only_side1 = merged_neg_expl_ptm_filt["ptm_range_1"].notna() & merged_neg_expl_ptm_filt["ptm_range_2"].isna()
neg_only_side2 = merged_neg_expl_ptm_filt["ptm_range_2"].notna() & merged_neg_expl_ptm_filt["ptm_range_1"].isna()
neg_b_match_a_data = (merged_neg_expl_ptm_filt["PTM Interactor Matches"] == "B") & neg_only_side1
neg_a_match_b_data = (merged_neg_expl_ptm_filt["PTM Interactor Matches"] == "A") & neg_only_side2

test1 = len(merged_neg_expl_ptm_filt.loc[neg_b_match_a_data])
print(f"There are {test1} rows where curated-ptm data matches interactor B, but we only pulled data for interactor A.")
test1 = len(merged_neg_expl_ptm_filt.loc[neg_a_match_b_data])
print(f"There are {test1} rows where curated-ptm data matches interactor A, but we only pulled data for interactor B.")

# Both conditions are evaluated per row, so one combined filter removes the same rows
# as two successive filters.
merged_neg_expl_ptm_filt = merged_neg_expl_ptm_filt.loc[~(neg_b_match_a_data | neg_a_match_b_data)]
print(f"Dropped these rows. Remaining: {len(merged_neg_expl_ptm_filt)}")

There are 0 rows where curated-ptm data matches interactor B, but we only pulled data for interactor A.
There are 0 rows where curated-ptm data matches interactor A, but we only pulled data for interactor B.
Dropped these rows. Remaining: 6145


In [535]:
# Pass both filtered frames through harmonize_nulls_to_nan (defined elsewhere in
# this notebook) and keep the returned frames.
# NOTE(review): the warning lines echoed below suggest the helper replaces "",
# "None" and "nan" strings with pd.NA -- confirm against its definition.
merged_expl_ptm_filt = harmonize_nulls_to_nan(merged_expl_ptm_filt)
merged_neg_expl_ptm_filt = harmonize_nulls_to_nan(merged_neg_expl_ptm_filt)

  out = out.replace({"": pd.NA, "None": pd.NA, "nan": pd.NA})
  out = out.replace({"": pd.NA, "None": pd.NA, "nan": pd.NA})


In [536]:
# Scraped PTM columns: the five per-interactor fields for side 1, then side 2,
# followed by the two derived range columns.
PTM_COLS = [
    f"ptm_{field}_{side}"
    for side in (1, 2)
    for field in ("mi", "name", "short", "begin", "end")
] + ["ptm_range_1", "ptm_range_2"]

In [537]:
def fill_ptms_when_AB(df: pd.DataFrame, cols,
                           match_col: str = "PTM Interactor Matches") -> pd.DataFrame:
    """
    Mirror one-sided PTM annotations onto the other side for rows matched to both interactors.

    For rows whose ``match_col`` value is 'A,B' or 'B,A' (case-insensitive,
    whitespace around the comma and at either end ignored):
      - side 1 has any non-null ``*_1`` column but side 2 has none -> copy 1 -> 2
      - side 2 has any non-null ``*_2`` column but side 1 has none -> copy 2 -> 1
    Rows with info on both sides, on neither side, or with any other match
    value are left as they are.

    Args:
        df: Input frame. It is copied; the caller's frame is never modified.
        cols: Iterable of column names (list, tuple, Index, ...) ending in
            ``_1`` or ``_2``. They are expected to exist in ``df``.
        match_col: Column holding the interactor-match label.

    Returns:
        A new DataFrame. As a side effect of NA detection, the sentinel strings
        "", "None" and "nan" in the text-like ``cols``/``match_col`` columns are
        replaced by ``pd.NA`` in the returned frame.
    """
    out = df.copy()
    cols = list(cols)  # accept any iterable, not only a list

    # Normalize obvious sentinel strings on text-like cols so NA detection works
    text_cols = out.select_dtypes(include=["object", "string"]).columns.intersection(cols + [match_col])
    if len(text_cols) > 0:
        out[text_cols] = out[text_cols].replace({"": pd.NA, "None": pd.NA, "nan": pd.NA}, regex=False)

    # Split the requested columns into the side-1 and side-2 blocks
    block1 = [c for c in cols if c.endswith("_1")]
    block2 = [c for c in cols if c.endswith("_2")]

    # Pair each side-1 column with its side-2 twin, e.g. ('ptm_begin_1', 'ptm_begin_2').
    # A side-1 column whose twin is missing from the frame is simply not copied.
    pairs = []
    for c1 in block1:
        c2 = f"{c1[:-2]}_2"
        if c2 in out.columns:
            pairs.append((c1, c2))

    # Rows where interactor matches are A,B (either order, ignore spaces).
    # The group is non-capturing: a capturing group makes str.contains emit
    # a "pattern ... has match groups" UserWarning on every call.
    ab_mask = (
        out[match_col]
        .astype("string")
        .str.strip()
        .str.contains(r'^(?:A\s*,\s*B|B\s*,\s*A)$', flags=re.I, na=False)
    )

    # "Has info" = any non-null across the block
    has1 = out[block1].notna().any(axis=1)
    has2 = out[block2].notna().any(axis=1)

    # Exactly-one-side-only masks (both computed before any copying happens)
    mask_copy_12 = ab_mask & has1 & ~has2
    mask_copy_21 = ab_mask & has2 & ~has1

    # Copy 1 -> 2 (.values sidesteps index/column-label alignment)
    if mask_copy_12.any():
        idx = mask_copy_12[mask_copy_12].index
        for c1, c2 in pairs:
            out.loc[idx, c2] = out.loc[idx, c1].values

    # Copy 2 -> 1
    if mask_copy_21.any():
        idx = mask_copy_21[mask_copy_21].index
        for c1, c2 in pairs:
            out.loc[idx, c1] = out.loc[idx, c2].values

    return out

In [538]:
# For rows matched to both interactors, mirror one-sided PTM info onto the
# empty side in the positive and the negative table alike.
merged_expl_ptm_filt, merged_neg_expl_ptm_filt = (
    fill_ptms_when_AB(frame, PTM_COLS, match_col="PTM Interactor Matches")
    for frame in (merged_expl_ptm_filt, merged_neg_expl_ptm_filt)
)

  out[match_col]
  out[match_col]


In [539]:
# Inspect which vocabulary prefix each curated "PTM Feature type" value carries.
ptm_feature_type = merged_expl_ptm_filt["PTM Feature type"]
ptm_type_present = ptm_feature_type.notna()
ptm_type_is_psimod = ptm_feature_type.fillna("").str.contains("psi-mod:\"MOD:")
ptm_type_is_psimi = ptm_feature_type.fillna("").str.contains("psi-mi:\"MI:")

# Rows that have a feature type which is not a psi-mod term ...
display(merged_expl_ptm_filt.loc[ptm_type_present & ~ptm_type_is_psimod][["PTM Feature type"]])

# ... and, among those, rows that are not psi-mi terms either (cell output).
merged_expl_ptm_filt.loc[
    ptm_type_present & ~ptm_type_is_psimod & ~ptm_type_is_psimi
][["PTM Feature type"]]

Unnamed: 0,PTM Feature type
2,"psi-mi:""MI:0177""(O-phospho-L-threonine)"
10,"psi-mi:""MI:0170""(phosphorylated residue)"
11,"psi-mi:""MI:0170""(phosphorylated residue)"
12,"psi-mi:""MI:0178""(O4'-phospho-L-tyrosine)"
14,"psi-mi:""MI:0178""(O4'-phospho-L-tyrosine)"
...,...
12909,"psi-mi:""MI:0178""(O4'-phospho-L-tyrosine)"
13175,"psi-mi:""MI:0170""(phosphorylated residue)"
13193,"psi-mi:""MI:0170""(phosphorylated residue)"
13322,"psi-mi:""MI:0179""(uncategorized protein modific..."


Unnamed: 0,PTM Feature type


In [540]:
# Side-by-side view of the curated feature type and the scraped PTM fields.
ptm_overview_cols = ["interaction_intactid", "year", "interaction_xml_id", "PTM Feature type"] + scraped_ptm_cols
merged_expl_ptm_filt[ptm_overview_cols]

Unnamed: 0,interaction_intactid,year,interaction_xml_id,PTM Feature type,ptm_mi_1,ptm_name_1,ptm_short_1,ptm_begin_1,ptm_end_1,ptm_orig_1,ptm_new_1,ptm_mi_2,ptm_name_2,ptm_short_2,ptm_begin_2,ptm_end_2,ptm_orig_2,ptm_new_2
0,EBI-9832689,2014,22,"psi-mod:""MOD:00394""(monoacetylated residue)",MI:0925,"observed-ptm,monoacetylated residue",acres,668,668,,,,,,,,,
1,EBI-6984518,2009,88,"psi-mod:""MOD:00394""(monoacetylated residue)",MI:0639,"resulting-ptm,observed-ptm,monoacetylated residue",acetylated residue,250,250,,,,,,,,,
2,EBI-15986772,2012,29,"psi-mi:""MI:0177""(O-phospho-L-threonine)",,,,,,,,MI:0925,"observed-ptm,O-phospho-L-threonine",pT257,257,257,,
10,EBI-6967736,2006,41,"psi-mi:""MI:0170""(phosphorylated residue)",MI:0925,"observed-ptm,phosphorylated residue",phosphorylated,1214,1214,,,,,,,,,
11,EBI-7156123,2004,4,"psi-mi:""MI:0170""(phosphorylated residue)",,,,,,,,MI:0925,"observed-ptm,phosphorylated residue",phosphorylated,1175,1175,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13349,EBI-21944475,2008,4,"psi-mod:""MOD:00798""(half cystine)",,,,,,,,MI:0925,"observed-ptm,half cystine",Cys in intermolecular disufide bond,426,426,,
13352,EBI-21944475,2008,4,"psi-mod:""MOD:00798""(half cystine)",,,,,,,,MI:0925,"observed-ptm,half cystine",Cys in intermolecular disufide bond,431,431,,
13359,EBI-21944475,2008,4,"psi-mod:""MOD:00798""(half cystine)",,,,,,,,MI:0925,"observed-ptm,half cystine",Cys in intermolecular disufide bond,426,426,,
13362,EBI-21944475,2008,4,"psi-mod:""MOD:00798""(half cystine)",,,,,,,,MI:0925,"observed-ptm,half cystine",Cys in intermolecular disufide bond,431,431,,


In [541]:
# Coverage report, positive set: how often a curated feature type is present,
# and how often it is accompanied by a curated feature range.
temp = ptms.loc[ptms["Feature type"].notna()]
test1 = len(temp)
print(f"Fraction of original intact-compiled PTMs database that has a PTM Feature type: {test1}/{len(ptms)} ({100*test1/len(ptms):.2f}%)")

pos_has_type = merged_expl_ptm_filt["PTM Feature type"].notna()
pos_has_range = merged_expl_ptm_filt["PTM Feature range(s)"].notna()
# Shared denominator: rows flagged as having any scraped PTM info.
test2 = len(merged_expl_ptm_filt.loc[merged_expl_ptm_filt["scraped_ptm_has_info"]])

test1 = int((pos_has_type & pos_has_range).sum())
print(f"Fraction of XML-compiled database that has a feature type and a feature range: {test1}/{test2} ({100*test1/test2:.2f}%)")
test1 = int((pos_has_type & ~pos_has_range).sum())
print(f"Fraction of XML-compiled database that has a feature type and no feature range: {test1}/{test2} ({100*test1/test2:.2f}%)")

Fraction of original intact-compiled PTMs database that has a PTM Feature type: 10285/10286 (99.99%)
Fraction of XML-compiled database that has a feature type and a feature range: 6811/6818 (99.90%)
Fraction of XML-compiled database that has a feature type and no feature range: 0/6818 (0.00%)


In [542]:
# Coverage report, negative set: how often a curated feature type is present,
# and how often it is accompanied by a curated feature range.
temp = ptms.loc[ptms["Feature type"].notna()]
test1 = len(temp)
print(f"Fraction of original intact-compiled PTMs database that has a PTM Feature type: {test1}/{len(ptms)} ({100*test1/len(ptms):.2f}%)")

neg_has_type = merged_neg_expl_ptm_filt["PTM Feature type"].notna()
neg_has_range = merged_neg_expl_ptm_filt["PTM Feature range(s)"].notna()
# Shared denominator: rows flagged as having any scraped PTM info.
test2 = len(merged_neg_expl_ptm_filt.loc[merged_neg_expl_ptm_filt["scraped_ptm_has_info"]])

test1 = int((neg_has_type & neg_has_range).sum())
print(f"Fraction of XML-compiled database that has a feature type and a feature range: {test1}/{test2} ({100*test1/test2:.2f}%)")
test1 = int((neg_has_type & ~neg_has_range).sum())
print(f"Fraction of XML-compiled database that has a feature type and no feature range: {test1}/{test2} ({100*test1/test2:.2f}%)")

Fraction of original intact-compiled PTMs database that has a PTM Feature type: 10285/10286 (99.99%)
Fraction of XML-compiled database that has a feature type and a feature range: 6138/6145 (99.89%)
Fraction of XML-compiled database that has a feature type and no feature range: 0/6145 (0.00%)


In [543]:
# How often are BOTH an original and a resulting sequence recorded for a PTM?
# In the curated table "-" stands for missing; a single-token resulting
# sequence has its commas stripped, and anything left empty counts as missing.
temp = ptms.replace("-", np.nan)
temp["Resulting sequence"] = (
    temp["Resulting sequence"]
    .apply(lambda x: x.replace(",", "") if (type(x) == str and len(x.split()) == 1) else x)
    .apply(lambda x: np.nan if x == "" else x)
)

ptms_has_both_seqs = temp["Original sequence"].notna() & temp["Resulting sequence"].notna()
test1 = int(ptms_has_both_seqs.sum())
print(f"Fraction of original intact-compiled PTMs database that has an original and new sequence: {test1}/{len(ptms)} ({100*test1/len(ptms):.2f}%)")

# Scraped table (negative set): a row counts if either side has both sequences.
neg_side1_both_seqs = merged_neg_expl_ptm_filt["ptm_orig_1"].notna() & merged_neg_expl_ptm_filt["ptm_new_1"].notna()
neg_side2_both_seqs = merged_neg_expl_ptm_filt["ptm_orig_2"].notna() & merged_neg_expl_ptm_filt["ptm_new_2"].notna()
test1 = int((neg_side1_both_seqs | neg_side2_both_seqs).sum())
test2 = len(merged_neg_expl_ptm_filt.loc[merged_neg_expl_ptm_filt["scraped_ptm_has_info"]])
print(f"Fraction of XML-compiled database that has an original and new sequence: {test1}/{test2} ({100*test1/test2:.2f}%)")
print("Because there are not often before and after sequences provided, we will not be relying on this data in PTM curation")

Fraction of original intact-compiled PTMs database that has an original and new sequence: 23/10286 (0.22%)
Fraction of XML-compiled database that has an original and new sequence: 43/6145 (0.70%)
Because there are not often before and after sequences provided, we will not be relying on this data in PTM curation


In [544]:
# Preview the first rows whose curated PTM is matched to both interactors.
matches_both_sides = merged_expl_ptm_filt["PTM Interactor Matches"] == "A,B"
(
    merged_expl_ptm_filt
    .loc[matches_both_sides, ["PTM Interactor Matches", "unique_id"] + PTM_COLS]
    .reset_index(drop=True)
    .head()
)

Unnamed: 0,PTM Interactor Matches,unique_id,ptm_mi_1,ptm_name_1,ptm_short_1,ptm_begin_1,ptm_end_1,ptm_mi_2,ptm_name_2,ptm_short_2,ptm_begin_2,ptm_end_2,ptm_range_1,ptm_range_2
0,"A,B",intact:EBI-1055635_intact:EBI-1055635,MI:0925,"observed-ptm,phosphorylated residue",phosphorylated,713,713,MI:0925,"observed-ptm,phosphorylated residue",phosphorylated,713,713,713-713,713-713
1,"A,B",intact:EBI-1055635_intact:EBI-1055635,MI:0925,"observed-ptm,phosphorylated residue",phosphorylated,713,713,MI:0925,"observed-ptm,phosphorylated residue",phosphorylated,713,713,713-713,713-713
2,"A,B",intact:EBI-1057697_intact:EBI-1057697,MI:0925,"observed-ptm,phosphorylated residue",phosphorylated,701,701,MI:0925,"observed-ptm,phosphorylated residue",phosphorylated,701,701,701-701,701-701
3,"A,B",intact:EBI-1057697_intact:EBI-1057697,MI:0925,"observed-ptm,phosphorylated residue",phosphorylated,701,701,MI:0925,"observed-ptm,phosphorylated residue",phosphorylated,701,701,701-701,701-701
4,"A,B",intact:EBI-1057697_intact:EBI-1057697,MI:0925,"observed-ptm,phosphorylated residue",phosphorylated,701,701,MI:0925,"observed-ptm,phosphorylated residue",phosphorylated,701,701,701-701,701-701


In [None]:
# Columns that carry the "PTM " prefix in the merged tables.
agg_ptm_cols = [
    "PTM # Feature AC",
    "PTM Affected protein AC",
    "PTM Affected protein full name",
    "PTM Affected protein organism",
    "PTM Affected protein symbol",
    "PTM Feature annotation(s)",
    "PTM Feature range(s)",
    "PTM Feature short label",
    "PTM Feature type",
    "PTM Figure legend(s)",
    "PTM Interaction AC",
    "PTM Interaction participants",
    "PTM Interactor Matches",
    "PTM Original sequence",
    "PTM PubMedID",
    "PTM Resulting sequence",
    "PTM Xref ID(s)",
    "PTM new_binds_bo_annotation",
    "PTM new_binds_bo_feature_type",
    "PTM og_binds_bo_annotation",
    "PTM og_binds_bo_feature_type",
]

# Scraped PTM columns: every field for side 1 and side 2, alphabetical by
# field (ptm_begin_1, ptm_begin_2, ptm_end_1, ...).
scraped_ptm_cols = [
    f"ptm_{field}_{side}"
    for field in ("begin", "end", "mi", "name", "new", "orig", "range", "short")
    for side in (1, 2)
]

# Everything retained in the final PTM tables: both PTM column groups plus
# protein identifiers, pair identifiers and sequence information.
keep_cols = [
    *scraped_ptm_cols,
    *agg_ptm_cols,
    # UniProt identifiers, interactor A
    "uniprot_A",
    "uniprot_A_equalseq",
    "uniprot_A_equalseq_canonical",
    "uniprot_A_full",
    "uniprot_A_inseq",
    "uniprot_A_inseq_canonical",
    "uniprot_A_intact",
    "uniprot_A_noiso1",
    "uniprot_A_noisoforms",
    # UniProt identifiers, interactor B
    "uniprot_B",
    "uniprot_B_equalseq",
    "uniprot_B_equalseq_canonical",
    "uniprot_B_full",
    "uniprot_B_inseq",
    "uniprot_B_inseq_canonical",
    "uniprot_B_intact",
    "uniprot_B_noiso1",
    "uniprot_B_noisoforms",
    "uniprot_gene_name_A",
    "uniprot_gene_name_B",
    "uniprotkb_1",
    "uniprotkb_2",
    # Pair-level identifiers and scores
    "unique_all_intact_sorted",
    "unique_expansions",
    "unique_id",
    "unique_score_int",
    "unique_scores",
    "unique_uniprot_noiso1_pair",
    "unique_uniprot_noisoforms_pair",
    "unique_uniprot_pair",
    # Interaction / interactor accessions
    "interaction_intactid",
    "intactid_1",
    "intactid_2",
    "dip_1",
    "dip_2",
    # Sequence information
    "seq_pair_id",
    "length_1",
    "length_2",
    "aa_1",
    "aa_2",
    "invalids_aa_1",
    "invalids_aa_2",
]

In [None]:
# Restrict the positive table to the retained columns. Take an explicit copy:
# adding columns to the bare result of df[keep_cols] is the chained-assignment
# pattern that pandas reports with SettingWithCopyWarning.
merged_expl_ptm_filt = merged_expl_ptm_filt[keep_cols].copy()

# Row-level flags: is any scraped PTM column non-null (overall / side 1 / side 2),
# and is any "PTM ..." column non-null?
merged_expl_ptm_filt["scraped_ptm_has_info"] = merged_expl_ptm_filt[scraped_ptm_cols].notna().any(axis=1)
merged_expl_ptm_filt["scraped_ptm_has_info_1"] = merged_expl_ptm_filt[[x for x in scraped_ptm_cols if x.endswith("_1")]].notna().any(axis=1)
merged_expl_ptm_filt["scraped_ptm_has_info_2"] = merged_expl_ptm_filt[[x for x in scraped_ptm_cols if x.endswith("_2")]].notna().any(axis=1)
merged_expl_ptm_filt["agg_ptm_has_info"] = merged_expl_ptm_filt[agg_ptm_cols].notna().any(axis=1)

In [None]:
# Restrict the negative table to the retained columns. Take an explicit copy:
# adding columns to the bare result of df[keep_cols] is the chained-assignment
# pattern that pandas reports with SettingWithCopyWarning.
merged_neg_expl_ptm_filt = merged_neg_expl_ptm_filt[keep_cols].copy()

# Row-level flags: is any scraped PTM column non-null (overall / side 1 / side 2),
# and is any "PTM ..." column non-null?
merged_neg_expl_ptm_filt["scraped_ptm_has_info"] = merged_neg_expl_ptm_filt[scraped_ptm_cols].notna().any(axis=1)
merged_neg_expl_ptm_filt["scraped_ptm_has_info_1"] = merged_neg_expl_ptm_filt[[x for x in scraped_ptm_cols if x.endswith("_1")]].notna().any(axis=1)
merged_neg_expl_ptm_filt["scraped_ptm_has_info_2"] = merged_neg_expl_ptm_filt[[x for x in scraped_ptm_cols if x.endswith("_2")]].notna().any(axis=1)
merged_neg_expl_ptm_filt["agg_ptm_has_info"] = merged_neg_expl_ptm_filt[agg_ptm_cols].notna().any(axis=1)

In [550]:
# Flatten list-valued cells into "|"-joined strings in every column of both
# tables; non-list values pass through untouched.
for ptm_frame in (merged_expl_ptm_filt, merged_neg_expl_ptm_filt):
    for column in ptm_frame:
        ptm_frame[column] = ptm_frame[column].apply(
            lambda value: "|".join(value) if type(value) == list else value
        )

In [551]:
# Remove exact duplicate rows, then rows in which every PTM_COLS entry is null.
merged_expl_ptm_filt = merged_expl_ptm_filt.drop_duplicates().reset_index(drop=True)
mask = merged_expl_ptm_filt[PTM_COLS].isna().all(axis=1)  # True = no scraped PTM info
merged_expl_ptm_filt = merged_expl_ptm_filt[~mask].reset_index(drop=True)
print(len(merged_expl_ptm_filt))

6640


In [552]:
# Remove exact duplicate rows, then rows in which every PTM_COLS entry is null.
merged_neg_expl_ptm_filt = merged_neg_expl_ptm_filt.drop_duplicates().reset_index(drop=True)
mask = merged_neg_expl_ptm_filt[PTM_COLS].isna().all(axis=1)  # True = no scraped PTM info
merged_neg_expl_ptm_filt = merged_neg_expl_ptm_filt[~mask].reset_index(drop=True)
print(len(merged_neg_expl_ptm_filt))

5972


In [553]:
# Rows whose curated affected-protein accession mentions "dip": compare the
# side-2 scraped PTM fields with the curated short label and range.
affected_ac_has_dip = merged_expl_ptm_filt["PTM Affected protein AC"].fillna("").str.contains("dip")
merged_expl_ptm_filt.loc[
    affected_ac_has_dip,
    [
        "interaction_intactid", "dip_2",
        "ptm_orig_2", "ptm_new_2", "ptm_range_2", "ptm_short_2",
        "PTM Feature short label", "PTM Feature range(s)",
    ],
]

Unnamed: 0,interaction_intactid,dip_2,ptm_orig_2,ptm_new_2,ptm_range_2,ptm_short_2,PTM Feature short label,PTM Feature range(s)
383,EBI-15576932,DIP-60197N,,,5-5,arg-5,arg-5,5-5
384,EBI-15576955,DIP-61145N,,,3-3,arg-3,arg-3,3-3
385,EBI-15576996,DIP-61144N,,,4-4,Arg4,Arg4,4-4
403,EBI-15883680,DIP-59457N,,,2-2,R2me2,R2me2,2-2
404,EBI-15883793,DIP-59458N,,,3-3,R3me2,R3me2,3-3
...,...,...,...,...,...,...,...,...
4869,EBI-16182526,DIP-37451N,,,,,K27Ub,27-27
4870,EBI-16182526,DIP-37451N,,,,,K63Ub,63-63
6557,EBI-15963927,DIP-24261N,,,11-11,K11,K11,11-11
6558,EBI-15963927,DIP-24261N,,,63-63,K63,K63,63-63


In [554]:
# "PTM # Feature AC" does not meaningfully separate features: group rows on
# every other column and merge their feature ACs with join_unique_nonnull
# (helper defined elsewhere in this notebook).
to_join = ["PTM # Feature AC"]
all_except_featac = [col for col in merged_expl_ptm_filt.columns if col not in to_join]
agg_spec = dict.fromkeys(to_join, join_unique_nonnull)

display(merged_expl_ptm_filt.head())

# dropna=False keeps groups whose key columns contain nulls.
merged_expl_ptm_filt = merged_expl_ptm_filt.groupby(
    all_except_featac, dropna=False, as_index=False
).agg(agg_spec)

print(f"Joined PTM # Feature AC column because it fails to meaningfully separate features. Dropped duplicate rows again. Remaining rows: {len(merged_expl_ptm_filt)}")

display(merged_expl_ptm_filt.head())

Unnamed: 0,ptm_begin_1,ptm_begin_2,ptm_end_1,ptm_end_2,ptm_mi_1,ptm_mi_2,ptm_name_1,ptm_name_2,ptm_new_1,ptm_new_2,...,all_binding_end_1,all_binding_mi_2,all_binding_name_2,all_binding_short_2,all_binding_begin_2,all_binding_end_2,scraped_ptm_has_info,scraped_ptm_has_info_1,scraped_ptm_has_info_2,agg_ptm_has_info
0,668.0,,668.0,,MI:0925,,"observed-ptm,monoacetylated residue",,,,...,,,,,,,True,True,False,True
1,250.0,,250.0,,MI:0639,,"resulting-ptm,observed-ptm,monoacetylated residue",,,,...,,MI:0117,binding-associated region,binding site,352,832,True,True,False,True
2,,257.0,,257.0,,MI:0925,,"observed-ptm,O-phospho-L-threonine",,,...,,MI:0442|MI:0442|MI:0442,sufficient binding region|sufficient binding r...,Region 1-100|Region 331-436|Region 531-877,1|331|531,100|436|877,True,False,True,True
3,1214.0,,1214.0,,MI:0925,,"observed-ptm,phosphorylated residue",,,,...,1220.0,,,,,,True,True,False,True
4,,1175.0,,1175.0,,MI:0925,,"observed-ptm,phosphorylated residue",,,...,,MI:0117,binding-associated region,binding site,1168,1181,True,False,True,True


Joined PTM # Feature AC column because it fails to meaningfully separate features. Dropped duplicate rows again. Remaining rows: 6607


Unnamed: 0,ptm_begin_1,ptm_begin_2,ptm_end_1,ptm_end_2,ptm_mi_1,ptm_mi_2,ptm_name_1,ptm_name_2,ptm_new_1,ptm_new_2,...,all_binding_mi_2,all_binding_name_2,all_binding_short_2,all_binding_begin_2,all_binding_end_2,scraped_ptm_has_info,scraped_ptm_has_info_1,scraped_ptm_has_info_2,agg_ptm_has_info,PTM # Feature AC
0,1,,1,,MI:0639,,"resulting-ptm,guanylated residue",,,,...,,,,,,True,True,False,True,EBI-27070440
1,1,,1,,MI:0639,,"resulting-ptm,uridylated residue",,,,...,,,,,,True,True,False,True,EBI-27070971
2,1,,124,,MI:0925,,"observed-ptm,O4'-phospho-L-tyrosine",,,,...,,,,,,True,True,False,True,EBI-2265204
3,1,,196,,MI:0925,,"observed-ptm,S-myristoylated residue",,,,...,MI:0442,sufficient binding region,region,1.0,190.0,True,True,False,True,EBI-21198720
4,1,,517,,MI:0925,,"observed-ptm,galactosylated residue",,,,...,,,,,,True,True,False,True,EBI-9210027


In [555]:
# "PTM # Feature AC" does not meaningfully separate features: group rows on
# every other column and merge their feature ACs with join_unique_nonnull
# (helper defined elsewhere in this notebook).
to_join = ["PTM # Feature AC"]
all_except_featac = [col for col in merged_neg_expl_ptm_filt.columns if col not in to_join]
agg_spec = dict.fromkeys(to_join, join_unique_nonnull)

display(merged_neg_expl_ptm_filt.head())

# dropna=False keeps groups whose key columns contain nulls.
merged_neg_expl_ptm_filt = merged_neg_expl_ptm_filt.groupby(
    all_except_featac, dropna=False, as_index=False
).agg(agg_spec)

print(f"Joined PTM # Feature AC column because it fails to meaningfully separate features. Dropped duplicate rows again. Remaining rows: {len(merged_neg_expl_ptm_filt)}")

display(merged_neg_expl_ptm_filt.head())

Unnamed: 0,ptm_begin_1,ptm_begin_2,ptm_end_1,ptm_end_2,ptm_mi_1,ptm_mi_2,ptm_name_1,ptm_name_2,ptm_new_1,ptm_new_2,...,all_binding_end_1,all_binding_mi_2,all_binding_name_2,all_binding_short_2,all_binding_begin_2,all_binding_end_2,scraped_ptm_has_info,scraped_ptm_has_info_1,scraped_ptm_has_info_2,agg_ptm_has_info
0,,257,,257,,MI:0925,,"observed-ptm,O-phospho-L-threonine",,,...,,MI:0442|MI:0442|MI:0442,sufficient binding region|sufficient binding r...,Region 1-100|Region 331-436|Region 531-877,1|331|531,100|436|877,True,False,True,True
1,,1175,,1175,,MI:0925,,"observed-ptm,phosphorylated residue",,,...,,MI:0117,binding-associated region,binding site,1168,1181,True,False,True,True
2,,419,,419,,MI:0638,,"prerequisite-ptm,O4'-phospho-L-tyrosine",,,...,,,,,,,True,False,True,True
3,,419,,419,,MI:0638,,"prerequisite-ptm,O4'-phospho-L-tyrosine",,,...,,MI:0442,sufficient binding region,SH3-SH2_regions,57,258,True,False,True,True
4,,1175,,1175,,MI:0925,,"observed-ptm,phosphorylated residue",,,...,,MI:0117,binding-associated region,binding site,1168,1181,True,False,True,True


Joined PTM # Feature AC column because it fails to meaningfully separate features. Dropped duplicate rows again. Remaining rows: 5940


Unnamed: 0,ptm_begin_1,ptm_begin_2,ptm_end_1,ptm_end_2,ptm_mi_1,ptm_mi_2,ptm_name_1,ptm_name_2,ptm_new_1,ptm_new_2,...,all_binding_mi_2,all_binding_name_2,all_binding_short_2,all_binding_begin_2,all_binding_end_2,scraped_ptm_has_info,scraped_ptm_has_info_1,scraped_ptm_has_info_2,agg_ptm_has_info,PTM # Feature AC
0,640.0,,640.0,,MI:0639,,"resulting-ptm,mutation disrupting interaction",,E,,...,,,,,,True,True,False,False,
1,1.0,,1.0,,MI:0639,,"resulting-ptm,guanylated residue",,,,...,,,,,,True,True,False,True,EBI-27070440
2,1.0,,1.0,,MI:0639,,"resulting-ptm,uridylated residue",,,,...,,,,,,True,True,False,True,EBI-27070971
3,13.0,,13.0,,MI:1224,,"ptm increasing an interaction,O4'-sulfo-L-tyro...",,,,...,MI:0442,sufficient binding region,Region 24-659,24.0,659.0,True,True,False,True,EBI-16173334
4,10.0,,10.0,,MI:0638,,"prerequisite-ptm,N6,N6-dimethyl-L-lysine",,,,...,MI:0442,sufficient binding region,BAH domain,44.0,166.0,True,True,False,True,EBI-15973544


In [556]:
# Count rows whose joined "PTM # Feature AC" holds more than one accession.
# A literal comma is used as the marker (regex=False makes that explicit).
# NOTE(review): assumes join_unique_nonnull joins values with "," -- confirm.
# The count and print used to be copy-pasted twice verbatim; once is enough.
test1 = len(merged_expl_ptm_filt.loc[
    merged_expl_ptm_filt["PTM # Feature AC"].fillna("").str.contains(",", regex=False)
])
print(f"Total merged_expl_ptm_filt rows with > 1 PTM # Feature AC (entry contains commas ,): {test1}")

Total merged_expl_ptm_filt rows with > 1 PTM # Feature AC (entry contains commas ,): 30
Total merged_expl_ptm_filt rows with > 1 PTM # Feature AC (entry contains commas ,): 30


In [557]:
# Count rows whose joined "PTM # Feature AC" holds more than one accession.
# A literal comma is used as the marker (regex=False makes that explicit).
# NOTE(review): assumes join_unique_nonnull joins values with "," -- confirm.
# The count and print used to be copy-pasted twice verbatim; once is enough.
test1 = len(merged_neg_expl_ptm_filt.loc[
    merged_neg_expl_ptm_filt["PTM # Feature AC"].fillna("").str.contains(",", regex=False)
])
print(f"Total merged_neg_expl_ptm_filt rows with > 1 PTM # Feature AC (entry contains commas ,): {test1}")

Total merged_neg_expl_ptm_filt rows with > 1 PTM # Feature AC (entry contains commas ,): 29
Total merged_neg_expl_ptm_filt rows with > 1 PTM # Feature AC (entry contains commas ,): 29


In [558]:
def get_final_ptm_partner(row):
    """
    Decide which interactor(s) the PTM in this row belongs to.

    The curated "PTM Interactor Matches" value is returned as-is whenever it
    is a string. Otherwise the answer is inferred from the scraped short
    labels: a string in "ptm_short_1" contributes "A", a string in
    "ptm_short_2" contributes "B". The parts are comma-joined, so the result
    is "A", "B", "A,B", or "" when neither label is a string.
    """
    curated = row["PTM Interactor Matches"]
    if type(curated) is str:
        return curated

    side_labels = (("ptm_short_1", "A"), ("ptm_short_2", "B"))
    return ",".join(label for col, label in side_labels if type(row[col]) is str)
        
# Store the resolved partner label in a "PTM Partner" column of both tables.
for ptm_frame in (merged_expl_ptm_filt, merged_neg_expl_ptm_filt):
    ptm_frame["PTM Partner"] = ptm_frame.apply(get_final_ptm_partner, axis=1)

In [559]:
# Show scraped "PTM" rows whose name mentions a mutation on either side.
# fillna("") keeps the masks strictly boolean when a name is missing, the same
# way the cells that later drop these rows build their masks.
ptm_name_mentions_mutation = (
    merged_expl_ptm_filt["ptm_name_1"].fillna("").str.contains("mutation")
    | merged_expl_ptm_filt["ptm_name_2"].fillna("").str.contains("mutation")
)

# ptm_short_1 / ptm_short_2 are listed explicitly and are also part of
# scraped_ptm_cols; dict.fromkeys drops the repeats (keeping first-seen order)
# so the view has no duplicated columns.
mutation_view_cols = list(dict.fromkeys(
    ["interaction_intactid", "PTM Partner", "ptm_short_1", "ptm_short_2",
     "PTM Feature range(s)", "PTM Feature type"] + scraped_ptm_cols
))
merged_expl_ptm_filt.loc[ptm_name_mentions_mutation, mutation_view_cols]

Unnamed: 0,interaction_intactid,PTM Partner,ptm_short_1,ptm_short_2,PTM Feature range(s),PTM Feature type,ptm_begin_1,ptm_begin_2,ptm_end_1,ptm_end_2,...,ptm_name_1,ptm_name_2,ptm_new_1,ptm_new_2,ptm_orig_1,ptm_orig_2,ptm_range_1,ptm_range_2,ptm_short_1.1,ptm_short_2.1
506,EBI-15098123,A,O43524:p.[Thr32Ala;Ser253Ala;Ser315Ala],,,,32253315,,32253315,,...,"disrupting-ptm,mutation disrupting interaction",,"A,A,A",,"T,S,S",,"32-32,253-253,315-315",,O43524:p.[Thr32Ala;Ser253Ala;Ser315Ala],
900,EBI-7936521,A,Q8GH87:p.Ala640Glu,,,,640,,640,,...,"resulting-ptm,mutation disrupting interaction",,E,,A,,640-640,,Q8GH87:p.Ala640Glu,


In [560]:
# Look up two specific interactions in the mutation table.
simplemerged_neg_mut.loc[
    simplemerged_neg_mut["interaction_intactid"].isin(["EBI-15098123", "EBI-7936521"]),
    ["interaction_intactid", "Mutated Partner", "mutation_short"],
]

Unnamed: 0,interaction_intactid,Mutated Partner,mutation_short
6326,EBI-15098123,A,O43524:p.[Thr32Ala;Ser253Ala;Ser315Ala]
14880,EBI-7936521,A,Q8GH87:p.Ala640Glu


In [561]:
# Keep only rows where neither side's PTM name mentions "mutation".
pos_name_1_is_mutation = merged_expl_ptm_filt["ptm_name_1"].fillna("").str.contains("mutation")
pos_name_2_is_mutation = merged_expl_ptm_filt["ptm_name_2"].fillna("").str.contains("mutation")
merged_expl_ptm_filt = merged_expl_ptm_filt.loc[
    ~pos_name_1_is_mutation & ~pos_name_2_is_mutation
].reset_index(drop=True)
print(f"Dropped rows where the supposed PTM is actually a mutation. Remaining size of merged_expl_ptm_filt: {len(merged_expl_ptm_filt)}")

Dropped rows where the supposed PTM is actually a mutation. Remaining size of merged_expl_ptm_filt: 6603


In [562]:
# Keep only rows where neither side's PTM name mentions "mutation".
neg_name_1_is_mutation = merged_neg_expl_ptm_filt["ptm_name_1"].fillna("").str.contains("mutation")
neg_name_2_is_mutation = merged_neg_expl_ptm_filt["ptm_name_2"].fillna("").str.contains("mutation")
merged_neg_expl_ptm_filt = merged_neg_expl_ptm_filt.loc[
    ~neg_name_1_is_mutation & ~neg_name_2_is_mutation
].reset_index(drop=True)
print(f"Dropped rows where the supposed PTM is actually a mutation. Remaining size of merged_neg_expl_ptm_filt: {len(merged_neg_expl_ptm_filt)}")

Dropped rows where the supposed PTM is actually a mutation. Remaining size of merged_neg_expl_ptm_filt: 5936


In [563]:
# Rows where at least one side records both an original and a new sequence.
side_1_has_orig_and_new = merged_expl_ptm_filt["ptm_orig_1"].notna() & merged_expl_ptm_filt["ptm_new_1"].notna()
side_2_has_orig_and_new = merged_expl_ptm_filt["ptm_orig_2"].notna() & merged_expl_ptm_filt["ptm_new_2"].notna()
merged_expl_ptm_filt.loc[
    side_1_has_orig_and_new | side_2_has_orig_and_new,
    ["PTM Partner", "PTM Feature range(s)", "PTM Feature type"] + scraped_ptm_cols,
]

Unnamed: 0,PTM Partner,PTM Feature range(s),PTM Feature type,ptm_begin_1,ptm_begin_2,ptm_end_1,ptm_end_2,ptm_mi_1,ptm_mi_2,ptm_name_1,ptm_name_2,ptm_new_1,ptm_new_2,ptm_orig_1,ptm_orig_2,ptm_range_1,ptm_range_2,ptm_short_1,ptm_short_2
273,A,172-172,"psi-mod:""MOD:00905""(modified L-cysteine residue)",172.0,,172.0,,MI:0925,,"observed-ptm,modified L-cysteine residue",,S,,C,,172-172,,cys-172,
458,"A,B",289-289,"psi-mod:""MOD:00689""(disulfide crosslinked resi...",289.0,289.0,289.0,289.0,MI:0925,MI:0925,"observed-ptm,disulfide crosslinked residues","observed-ptm,disulfide crosslinked residues",C,,Q,,289-289,289-289,Cys289,Cys289
459,"A,B",289-289,"psi-mod:""MOD:00689""(disulfide crosslinked resi...",289.0,289.0,289.0,289.0,MI:0925,MI:0925,"observed-ptm,disulfide crosslinked residues","observed-ptm,disulfide crosslinked residues",,C,,Q,289-289,289-289,Cys289,Cys289
466,"A,B",293-293,"psi-mod:""MOD:00689""(disulfide crosslinked resi...",293.0,293.0,293.0,293.0,MI:0925,MI:0925,"observed-ptm,disulfide crosslinked residues","observed-ptm,disulfide crosslinked residues",C,,R,,293-293,293-293,Cys293,Cys293
467,"A,B",293-293,"psi-mod:""MOD:00689""(disulfide crosslinked resi...",293.0,293.0,293.0,293.0,MI:0925,MI:0925,"observed-ptm,disulfide crosslinked residues","observed-ptm,disulfide crosslinked residues",,C,,R,293-293,293-293,Cys293,Cys293
807,A,540-540,"psi-mi:""MI:0178""(O4'-phospho-L-tyrosine)",540.0,,540.0,,MI:0925,,"observed-ptm,O4'-phospho-L-tyrosine",,pY,,Y,,540-540,,tyr-540,
2577,B,1595-1595,"psi-mod:""MOD:01163""(guanylated residue)",,1595.0,,1595.0,,MI:0639,,"resulting-ptm,guanylated residue",,A,,K,,1595-1595,,lys-1595
3306,B,259-259,"psi-mi:""MI:0170""(phosphorylated residue)",,259.0,,259.0,,MI:0925,,"observed-ptm,phosphorylated residue",,S,,S,,259-259,,ser-259
3307,B,259-259,"psi-mi:""MI:0170""(phosphorylated residue)",,259.0,,259.0,,MI:0925,,"observed-ptm,phosphorylated residue",,S,,S,,259-259,,ser-259
3901,B,365-365,"psi-mi:""MI:0170""(phosphorylated residue)",,365.0,,365.0,,MI:0925,,"observed-ptm,phosphorylated residue",,S,,S,,365-365,,ser-365


In [564]:
# Rows whose curated feature range contains a comma, i.e. lists several ranges.
feature_range_has_comma = merged_expl_ptm_filt["PTM Feature range(s)"].fillna("").str.contains(",")
merged_expl_ptm_filt.loc[
    feature_range_has_comma,
    ["PTM Partner", "PTM Feature range(s)", "PTM Feature type"] + scraped_ptm_cols,
]

Unnamed: 0,PTM Partner,PTM Feature range(s),PTM Feature type,ptm_begin_1,ptm_begin_2,ptm_end_1,ptm_end_2,ptm_mi_1,ptm_mi_2,ptm_name_1,ptm_name_2,ptm_new_1,ptm_new_2,ptm_orig_1,ptm_orig_2,ptm_range_1,ptm_range_2,ptm_short_1,ptm_short_2
5,A,"1-1,3-3","psi-mod:""MOD:00181""(O4'-sulfo-L-tyrosine)",13,,13,,MI:1224,,"ptm increasing an interaction,O4'-sulfo-L-tyro...",,,,,,"1-1,3-3",,tyr-1;tyr-3,
130,A,"1228-1228,1149-1149,1085-1085","psi-mod:""MOD:00033""(crosslinked residues)",122811491085,,122811491085,,MI:0925,,"observed-ptm,crosslinked residues",,,,,,"1228-1228,1149-1149,1085-1085",,CL-Res,
136,A,"1234-1234,1230-1230,1235-1235","psi-mi:""MI:0170""(phosphorylated residue)",123412301235,,123412301235,,MI:0925,,"observed-ptm,phosphorylated residue",,,,,,"1234-1234,1230-1230,1235-1235",,region,
173,A,"131-131,144-144","psi-mi:""MI:0176""(O-phospho-L-serine)",131144,,131144,,MI:0925,,"observed-ptm,O-phospho-L-serine",,,,,,"131-131,144-144",,ser-131;ser-144,
186,A,"1414-1414,1094-1094","psi-mod:""MOD:00033""(crosslinked residues)",14141094,,14141094,,MI:0925,,"observed-ptm,crosslinked residues",,,,,,"1414-1414,1094-1094",,K1094/1414,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6044,B,"80-80,76-76,69-69,5-5,45-45,33-33","psi-mi:""MI:0170""(phosphorylated residue)",,80766954533,,80766954533,,MI:0925,,"observed-ptm,phosphorylated residue",,,,,,"80-80,76-76,69-69,5-5,45-45,33-33",,pT5/pT33/pT45/pS69/pS76/pS80
6086,B,"826-826,356-356,795-795,807-807,811-811,821-82...","psi-mi:""MI:0170""(phosphorylated residue)",,826356795807811821612608780788373,,826356795807811821612608780788373,,MI:0925,,"observed-ptm,phosphorylated residue",,,,,,"826-826,356-356,795-795,807-807,811-811,821-82...",,phosphorylated
6404,B,"9-9,6-6","psi-mi:""MI:0526""(N-acetylated L-lysine)",,96,,96,,MI:0925,,"observed-ptm,N-acetylated L-lysine",,,,,,"9-9,6-6",,K6/9ac
6405,B,"9-9,6-6,3-3,27-27,24-24,21-21,15-15","psi-mod:""MOD:00039""(4-hydroxy-L-proline)",,96327242115,,96327242115,,MI:0925,,"observed-ptm,4-hydroxy-L-proline",,,,,,"9-9,6-6,3-3,27-27,24-24,21-21,15-15",,Hydroxyproline


In [565]:
import re
import pandas as pd

def get_ptm_sequence(row, partner="A"):
    """
    Build the PTM-annotated sequence for partner 'A' or 'B' from a row.

    Parameters:
      - row: mapping-like object exposing ``.get`` (a pandas Series from
        ``DataFrame.apply(..., axis=1)`` or a plain dict).
      - partner: "A" reads the ``*_1`` columns; any other value reads the
        ``*_2`` columns.

    Expected columns:
      - "PTM Partner"         (e.g., "A", "B", "A,B")
      - "ptm_range_1", "ptm_range_2"    (e.g., "173-173" or "120-122, 140-140")
      - "ptm_orig_1", "ptm_orig_2"      (original substrings, comma-separated; may be NaN/blank)
      - "ptm_new_1",  "ptm_new_2"       (new substrings, comma-separated; may be NaN/blank)
      - "PTM Feature type"              (pipe-separated if multiple PTMs: "phospho|acetyl")
      - amino-acid sequence: "aa_1" (A) and "aa_2" (B)

    Behavior:
      - Only substitutions of existing residues (no insertions/deletions).
      - For each PTM range, we replace aa[a:b] with:
          "<PTM Feature type>"              if no ptm_new
          "<PTM Feature type|ptm_new>"      if ptm_new is present
      - A single orig/new/feature value is broadcast to every range; any other
        count mismatch fails the row.
      - If ptm_orig is present (non-NaN/non-empty, not "-"), we check it matches
        aa[a:b]. If it does not match, we return None for that row.
      - A range that is not numeric (e.g. "n-n", "?-?"), is reversed, or falls
        outside the sequence fails the row instead of raising or silently
        splicing at the wrong position.
      - Exact duplicate edits are applied once; edits whose ranges overlap
        fail the row, because they cannot be spliced unambiguously.

    Returns:
      The annotated sequence (str), or None when the partner does not carry a
      PTM in this row, required data is missing, or any check above fails.
    """

    matches = row.get("PTM Partner")
    if matches is None:
        return None

    feature_col = "PTM Feature type"

    # Partner A uses the "_1" columns, anything else the "_2" columns.
    suffix = "1" if partner == "A" else "2"
    ptm_range_col = f"ptm_range_{suffix}"
    orig_col = f"ptm_orig_{suffix}"
    new_col = f"ptm_new_{suffix}"
    aa_col = f"aa_{suffix}"

    # Accept "A", "B", "A,B" (any spacing); also handle lists/tuples/sets
    def _has_partner(m, p):
        if isinstance(m, (list, tuple, set)):
            return p in m
        return bool(re.search(rf'(^|,)\s*{re.escape(p)}\s*(,|$)', str(m)))

    if not _has_partner(matches, partner):
        return None

    def _is_missing(val):
        """True for None and float NaN (how pandas encodes empty cells)."""
        return val is None or (isinstance(val, float) and pd.isna(val))

    ranges_str = row.get(ptm_range_col)
    if _is_missing(ranges_str) or str(ranges_str).strip() == "":
        return None

    aa_og = row.get(aa_col)
    if not isinstance(aa_og, str) or len(aa_og) == 0:
        return None
    seq_len = len(aa_og)

    def _safe_str(val):
        """Return '' if val is NaN/None, else str(val)."""
        return "" if _is_missing(val) else str(val)

    # Tokenize ranges (comma-separated)
    ranges = [t.strip() for t in str(ranges_str).split(",") if t.strip()]
    n = len(ranges)
    if n == 0:
        return None

    def _tokens_for_ranges(raw, sep):
        """Split raw into one token per range; None when the counts disagree."""
        raw = raw.strip()
        if raw == "":
            return [""] * n
        tokens = [t.strip() for t in raw.split(sep)]
        if len(tokens) == 1 and n > 1:
            # Broadcast a single value to all ranges
            return tokens * n
        return tokens if len(tokens) == n else None

    # orig/new are comma-separated per PTM; feature types are pipe-separated
    exp_tokens = _tokens_for_ranges(_safe_str(row.get(orig_col, "")), ",")
    rep_tokens = _tokens_for_ranges(_safe_str(row.get(new_col, "")), ",")
    feat_tokens = _tokens_for_ranges(_safe_str(row.get(feature_col, "")), "|")

    if exp_tokens is None or rep_tokens is None or feat_tokens is None:
        return None

    edits = []

    for r, exp, rep, feat in zip(ranges, exp_tokens, rep_tokens, feat_tokens):
        exp_norm = re.sub(r"\s+", "", exp or "")
        rep_norm = re.sub(r"\s+", "", rep or "")
        feat_norm = feat.strip()

        # Parse range: "a-b" or "a"
        if "-" in r:
            a_str, b_str = r.split("-", 1)
        else:
            a_str = b_str = r
        try:
            a, b = int(a_str), int(b_str)
        except ValueError:
            # Undetermined / fuzzy positions cannot be mapped onto residues
            return None

        # Positions are 1-based inclusive and must lie inside the sequence;
        # otherwise the slices below would wrap around or run off the end.
        if not (1 <= a <= b <= seq_len):
            return None

        # 1-based inclusive indices -> 0-based slice [start0:end0)
        start0 = a - 1
        end0 = b

        # Check that ptm_orig matches the existing sequence, *only if provided*
        if exp_norm not in ("", "-") and aa_og[start0:end0] != exp_norm:
            return None

        # Build replacement annotation: <Feature>, <Feature|new> or <new>
        if feat_norm and rep_norm:
            rep_use = f"<{feat_norm}|{rep_norm}>"
        elif feat_norm:
            rep_use = f"<{feat_norm}>"
        elif rep_norm:
            rep_use = f"<{rep_norm}>"
        else:
            # Nothing to insert; skip this edit and move on to the next
            continue

        edits.append((start0, end0, rep_use))

    if not edits:
        # no useful edits after parsing
        return None

    # Apply edits left→right, copying the untouched stretches between them.
    ordered = sorted(set(edits))
    pieces = []
    cursor = 0
    for start0, end0, rep_use in ordered:
        if start0 < cursor:
            # Overlaps the previous edit: no unambiguous way to splice both
            return None
        pieces.append(aa_og[cursor:start0])
        pieces.append(rep_use)
        cursor = end0
    pieces.append(aa_og[cursor:])

    return "".join(pieces)


In [566]:
# Show the PTM feature range(s) stored for interaction EBI-8436077
merged_expl_ptm_filt.loc[
    merged_expl_ptm_filt["interaction_intactid"].eq("EBI-8436077"),
    ["PTM Feature range(s)"],
]

Unnamed: 0,PTM Feature range(s)
6602,


In [567]:
# Remove surrounding whitespace plus any inner spaces/newlines/tabs from the
# replacement sequences; non-string cells (e.g. NaN) are left untouched.
for _col in ("ptm_new_1", "ptm_new_2"):
    merged_expl_ptm_filt[_col] = merged_expl_ptm_filt[_col].apply(
        lambda x: x.strip().translate(str.maketrans("", "", " \n\t")) if type(x) is str else x
    )

# Build the PTM-annotated sequence for each interactor
for _col, _partner in (("ptm_aa_1", "A"), ("ptm_aa_2", "B")):
    merged_expl_ptm_filt[_col] = merged_expl_ptm_filt.apply(
        lambda row: get_ptm_sequence(row, partner=_partner), axis=1
    )

# Display the inputs next to the generated sequences
merged_expl_ptm_filt[[
    "interaction_intactid",
    "PTM Partner",
    "ptm_range_1",
    "ptm_range_2",
    "ptm_orig_1",
    "ptm_orig_2",
    "ptm_new_1",
    "ptm_new_2",
    "aa_1",
    "aa_2",
    "ptm_aa_1",
    "ptm_aa_2",
]]

Unnamed: 0,interaction_intactid,PTM Partner,ptm_range_1,ptm_range_2,ptm_orig_1,ptm_orig_2,ptm_new_1,ptm_new_2,aa_1,aa_2,ptm_aa_1,ptm_aa_2
0,EBI-27070431,A,1-1,,,,,,SKMSDVKCTSVVLLSVLQQLRVESSSKLWAQCVQLHNDILLAKDTT...,SADAQSFLNRVCGVSAARLTPCGTGTSTDVVYRAFDIYNDKVAGFA...,"<psi-mod:""MOD:01163""(guanylated residue)>KMSDV...",
1,EBI-27070959,A,1-1,,,,,,SKMSDVKCTSVVLLSVLQQLRVESSSKLWAQCVQLHNDILLAKDTT...,SADAQSFLNRVCGVSAARLTPCGTGTSTDVVYRAFDIYNDKVAGFA...,"<psi-mod:""MOD:01166""(uridylated residue)>KMSDV...",
2,EBI-2117618,A,1-124,,,,,,MRNEMHLQFSARSENESFARVTVAAFVAQLDPTTDELTEIKTVVSE...,MDVDVKQGQSPIKDQEMKELIRRSQEGDQEARDEIIEKNMRLVWSV...,"<psi-mi:""MI:0178""(O4'-phospho-L-tyrosine)>EVNK...",
3,EBI-21198714,A,1-196,,,,,,MGRPLLLPLLPLLLPPAFLQPSGSTGSGPSYLYGVTQPKHLSASMG...,MATPLPPPSPRHLRLLRLLLSGLVLGAALRGAAAGHPDVAACPGSL...,"<psi-mod:""MOD:00655""(S-myristoylated residue)>...",
4,EBI-9210021,A,1-517,,,,,,MLRPEISSTSPSAPAVSPSSGETRSPQGPRYNFGLQETPQSRPSVQ...,MATPAVPVSAPPATPTPVPAAAPASVPAPTPAPAAAPVPAAAPASS...,"<psi-mod:""MOD:00728""(galactosylated residue)>Q...",
...,...,...,...,...,...,...,...,...,...,...,...,...
6598,EBI-7889284,B,,998-998,,,,,MGKEQELLEAARTGHLPAVEKLLSGKRLSSGFGGGGGGGSGGGGGG...,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...
6599,EBI-7889390,B,,998-998,,,,,MSVPSSLSQSAINANSHGGPALSLPLPLHAAHNQLLNAKLQATAVG...,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...
6600,EBI-7889463,B,,998-998,,,,,MSVPSSLSQSAINANSHGGPALSLPLPLHAAHNQLLNAKLQATAVG...,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...
6601,EBI-8674215,B,,999-999,,,,,MAVWIQAQQLQGEALHQMQALYGQHFPIEVRHYLSQWIESQAWDSV...,MATGGRRGAAAAPLLVAVAALLLGAAGHLYPGEVCPGMDIRNNLTR...,,MATGGRRGAAAAPLLVAVAALLLGAAGHLYPGEVCPGMDIRNNLTR...


In [568]:
# Same clean-up as for the positive table: drop surrounding whitespace plus
# inner spaces/newlines/tabs; non-string cells (e.g. NaN) are left untouched.
for _col in ("ptm_new_1", "ptm_new_2"):
    merged_neg_expl_ptm_filt[_col] = merged_neg_expl_ptm_filt[_col].apply(
        lambda x: x.strip().translate(str.maketrans("", "", " \n\t")) if type(x) is str else x
    )

# Build the PTM-annotated sequence for each interactor
for _col, _partner in (("ptm_aa_1", "A"), ("ptm_aa_2", "B")):
    merged_neg_expl_ptm_filt[_col] = merged_neg_expl_ptm_filt.apply(
        lambda row: get_ptm_sequence(row, partner=_partner), axis=1
    )

# Display the inputs next to the generated sequences
merged_neg_expl_ptm_filt[[
    "interaction_intactid",
    "PTM Partner",
    "ptm_range_1",
    "ptm_range_2",
    "ptm_orig_1",
    "ptm_orig_2",
    "ptm_new_1",
    "ptm_new_2",
    "aa_1",
    "aa_2",
    "ptm_aa_1",
    "ptm_aa_2",
]]

Unnamed: 0,interaction_intactid,PTM Partner,ptm_range_1,ptm_range_2,ptm_orig_1,ptm_orig_2,ptm_new_1,ptm_new_2,aa_1,aa_2,ptm_aa_1,ptm_aa_2
0,EBI-27070431,A,1-1,,,,,,SKMSDVKCTSVVLLSVLQQLRVESSSKLWAQCVQLHNDILLAKDTT...,SADAQSFLNRVCGVSAARLTPCGTGTSTDVVYRAFDIYNDKVAGFA...,"<psi-mod:""MOD:01163""(guanylated residue)>KMSDV...",
1,EBI-27070959,A,1-1,,,,,,SKMSDVKCTSVVLLSVLQQLRVESSSKLWAQCVQLHNDILLAKDTT...,SADAQSFLNRVCGVSAARLTPCGTGTSTDVVYRAFDIYNDKVAGFA...,"<psi-mod:""MOD:01166""(uridylated residue)>KMSDV...",
2,EBI-16173311,A,"1-1,3-3",,,,,,YIYTQ,MGVLRVYVILILVGFCVQIVVVNSQNLTCNSNDLKALEGFMRGLES...,"<psi-mod:""MOD:00181""(O4'-sulfo-L-tyrosine)>I<p...",
3,EBI-15973539,A,10-10,,,,,,GKGGAKRHRKVLRDNIQGI,MSRYITRLSMRRTYKWNGRPVGEDRKLRRQYYGSMSISVDGRTEDV...,"GKGGAKRHR<psi-mi:""MI:0166""(N6,N6-dimethyl-L-ly...",
4,EBI-7818275,A,10-10,,,,,,MARTKQTARKSTGGKAPRKQLATKAARKSAPATGGVKKPHRYRPGT...,MATDAALRRLLRLHRTEIAVAVDSAFPLLHALADHDVVPEDKFQET...,"MARTKQTAR<psi-mi:""MI:0167""(N6,N6,N6-trimethyl-...",
...,...,...,...,...,...,...,...,...,...,...,...,...
5931,EBI-7889240,B,,998-998,,,,,MSEVLPADSGVDTLAVFMASSGTTDVTNRNSPATPPNTLNLRSSHN...,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...
5932,EBI-7889284,B,,998-998,,,,,MGKEQELLEAARTGHLPAVEKLLSGKRLSSGFGGGGGGGSGGGGGG...,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...
5933,EBI-7889390,B,,998-998,,,,,MSVPSSLSQSAINANSHGGPALSLPLPLHAAHNQLLNAKLQATAVG...,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...
5934,EBI-7889463,B,,998-998,,,,,MSVPSSLSQSAINANSHGGPALSLPLPLHAAHNQLLNAKLQATAVG...,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...


In [569]:
# Distinct MI ids found in ptm_mi_1 followed by those found in ptm_mi_2
# (an id present in both columns appears twice), for each table.
l_xml = [
    mi
    for col in ("ptm_mi_1", "ptm_mi_2")
    for mi in merged_expl_ptm_filt[col].dropna().unique().tolist()
]
print(l_xml)
l_xml_neg = [
    mi
    for col in ("ptm_mi_1", "ptm_mi_2")
    for mi in merged_neg_expl_ptm_filt[col].dropna().unique().tolist()
]
print(l_xml_neg)

['MI:0639', 'MI:0925', 'MI:1224', 'MI:0638', 'MI:1233', 'MI:1223', 'MI:1225', 'MI:0925', 'MI:0639', 'MI:1223', 'MI:0638', 'MI:1225', 'MI:1224', 'MI:1233']
['MI:0639', 'MI:1224', 'MI:0638', 'MI:0925', 'MI:1233', 'MI:1223', 'MI:1225', 'MI:0925', 'MI:0639', 'MI:1223', 'MI:0638', 'MI:1225', 'MI:1224', 'MI:1233']


In [570]:
# MI ids of every "psi-mi:" feature type, most frequent feature type first.
# e.g. 'psi-mi:"MI:0170"(phosphorylated residue)' -> 'MI:0170'
l = [
    feature_type.split("psi-mi:")[1].split("(")[0].strip("\"")
    for feature_type in ptms["Feature type"].value_counts().index.tolist()
    if feature_type.startswith("psi-mi:")
]
l

['MI:0170',
 'MI:0178',
 'MI:0176',
 'MI:0177',
 'MI:0166',
 'MI:0167',
 'MI:0526',
 'MI:0165',
 'MI:0179',
 'MI:0189',
 'MI:0527',
 'MI:0120',
 'MI:0181',
 'MI:2202',
 'MI:2435',
 'MI:2276',
 'MI:0160',
 'MI:0252',
 'MI:0911',
 'MI:0551',
 'MI:0236',
 'MI:0132',
 'MI:0124',
 'MI:0119',
 'MI:0442',
 'MI:0140',
 'MI:1257']

In [571]:
# For each source of MI ids, show the labeled MI terms it shares with
# ptm_mi_ok (one row per id, ordered by id).
for _message, _ids in (
    ("Display overlap between labeled MIs and the MIs in aggregated PTM database:", l),
    ("Display overlap between labeled MIs and the MIs in XML-scraped PTM database:", l_xml),
):
    print(_message)
    display(
        ptm_mi_ok[ptm_mi_ok["id"].isin(_ids)]
        .drop_duplicates(subset="id")
        .sort_values("id")
    )

Display overlap between labeled MIs and the MIs in aggregated PTM database:


Unnamed: 0,label,id,parent_id,parent_ids_all,parent_names_all


Display overlap between labeled MIs and the MIs in XML-scraped PTM database:


Unnamed: 0,label,id,parent_id,parent_ids_all,parent_names_all
1,prerequisite-ptm,MI:0638,MI:0925,MI:0925,observed-ptm
2,resulting-ptm,MI:0639,MI:0925,MI:0925,observed-ptm
0,observed-ptm,MI:0925,,MI:0668,feature attribute name
4,ptm decreasing an interaction,MI:1223,MI:0925,MI:0925,observed-ptm
5,ptm increasing an interaction,MI:1224,MI:0925,MI:0925,observed-ptm
6,ptm disrupting an interaction,MI:1225,MI:0925,MI:0925,observed-ptm
3,resulting-cleavage,MI:1233,MI:0639,MI:0639,resulting-ptm


In [572]:
# Same overlap report, using the MI ids scraped for the negative table.
for _message, _ids in (
    ("Display overlap between labeled MIs and the MIs in aggregated PTM database:", l),
    ("Display overlap between labeled MIs and the MIs in XML-scraped PTM database (neg):", l_xml_neg),
):
    print(_message)
    display(
        ptm_mi_ok[ptm_mi_ok["id"].isin(_ids)]
        .drop_duplicates(subset="id")
        .sort_values("id")
    )

Display overlap between labeled MIs and the MIs in aggregated PTM database:


Unnamed: 0,label,id,parent_id,parent_ids_all,parent_names_all


Display overlap between labeled MIs and the MIs in XML-scraped PTM database (neg):


Unnamed: 0,label,id,parent_id,parent_ids_all,parent_names_all
1,prerequisite-ptm,MI:0638,MI:0925,MI:0925,observed-ptm
2,resulting-ptm,MI:0639,MI:0925,MI:0925,observed-ptm
0,observed-ptm,MI:0925,,MI:0668,feature attribute name
4,ptm decreasing an interaction,MI:1223,MI:0925,MI:0925,observed-ptm
5,ptm increasing an interaction,MI:1224,MI:0925,MI:0925,observed-ptm
6,ptm disrupting an interaction,MI:1225,MI:0925,MI:0925,observed-ptm
3,resulting-cleavage,MI:1233,MI:0639,MI:0639,resulting-ptm


In [573]:
# Sanity check: the interactor named in "PTM Interactor Matches" should be the
# one whose ptm_range_* column is filled in.
_matched = merged_expl_ptm_filt["PTM Interactor Matches"]
_has_range_1 = merged_expl_ptm_filt["ptm_range_1"].notna()
_has_range_2 = merged_expl_ptm_filt["ptm_range_2"].notna()

test1 = not ((_matched == "A") & _has_range_2 & ~_has_range_1).any()
print(f"No rows where interactor is A but ptm_ data is associated with B: {test1}")
test1 = not ((_matched == "B") & _has_range_1 & ~_has_range_2).any()
print(f"No rows where interactor is B but ptm_ data is associated with A: {test1}")

No rows where interactor is A but ptm_ data is associated with B: True
No rows where interactor is B but ptm_ data is associated with A: True


In [574]:
# Sanity check (negative table): the interactor named in
# "PTM Interactor Matches" should be the one whose ptm_range_* column is filled in.
_matched = merged_neg_expl_ptm_filt["PTM Interactor Matches"]
_has_range_1 = merged_neg_expl_ptm_filt["ptm_range_1"].notna()
_has_range_2 = merged_neg_expl_ptm_filt["ptm_range_2"].notna()

test1 = not ((_matched == "A") & _has_range_2 & ~_has_range_1).any()
print(f"No rows where interactor is A but ptm_ data is associated with B: {test1}")
test1 = not ((_matched == "B") & _has_range_1 & ~_has_range_2).any()
print(f"No rows where interactor is B but ptm_ data is associated with A: {test1}")

No rows where interactor is A but ptm_ data is associated with B: True
No rows where interactor is B but ptm_ data is associated with A: True


In [575]:
# Count rows without a PTM feature range, then confirm none of them is
# flagged as having aggregated PTM info.
_no_range = merged_expl_ptm_filt["PTM Feature range(s)"].isna()

test1 = int(_no_range.sum())
print(f"{test1} rows do not have a PTM Feature range(s) value.")
test1 = not (_no_range & merged_expl_ptm_filt["agg_ptm_has_info"]).any()
print(f"\tAll of these rows do not have PTM-aggregated data at all: {test1}")

3 rows do not have a PTM Feature range(s) value.
	All of these rows do not have PTM-aggregated data at all: True


In [576]:
# Negative table: count rows without a PTM feature range, then confirm none
# of them is flagged as having aggregated PTM info.
_no_range = merged_neg_expl_ptm_filt["PTM Feature range(s)"].isna()

test1 = int(_no_range.sum())
print(f"{test1} rows do not have a PTM Feature range(s) value.")
test1 = not (_no_range & merged_neg_expl_ptm_filt["agg_ptm_has_info"]).any()
print(f"\tAll of these rows do not have PTM-aggregated data at all: {test1}")

3 rows do not have a PTM Feature range(s) value.
	All of these rows do not have PTM-aggregated data at all: True


In [577]:
# Number of comma-separated entries per row (a missing value counts as one
# empty entry), compared between the range column and each sequence column.
_n_ranges = merged_expl_ptm_filt["PTM Feature range(s)"].fillna("").str.split(",").apply(len)

_n_original = merged_expl_ptm_filt["PTM Original sequence"].fillna("").str.split(",").apply(len)
test1 = bool((_n_ranges == _n_original).all())
print(f"All rows have the same # of ptm feature ranges as original sequences: {test1}")

_n_resulting = merged_expl_ptm_filt["PTM Resulting sequence"].fillna("").str.split(",").apply(len)
test1 = bool((_n_ranges == _n_resulting).all())
print(f"All rows have the same # of ptm feature ranges as resulting sequences: {test1}")

# Missing ranges are treated as passing (filled with "-")
test1 = bool(merged_expl_ptm_filt["PTM Feature range(s)"].fillna("-").str.contains("-").all())
print(f"All rows have a dash - in ptm feature range indicating the span of the feature: {test1}")



All rows have the same # of ptm feature ranges as original sequences: True
All rows have the same # of ptm feature ranges as resulting sequences: True
All rows have a dash - in ptm feature range indicating the span of the feature: True


In [578]:
# Negative table: number of comma-separated entries per row (a missing value
# counts as one empty entry), compared between the range column and each
# sequence column.
_n_ranges = merged_neg_expl_ptm_filt["PTM Feature range(s)"].fillna("").str.split(",").apply(len)

_n_original = merged_neg_expl_ptm_filt["PTM Original sequence"].fillna("").str.split(",").apply(len)
test1 = bool((_n_ranges == _n_original).all())
print(f"All rows have the same # of ptm feature ranges as original sequences: {test1}")

_n_resulting = merged_neg_expl_ptm_filt["PTM Resulting sequence"].fillna("").str.split(",").apply(len)
test1 = bool((_n_ranges == _n_resulting).all())
print(f"All rows have the same # of ptm feature ranges as resulting sequences: {test1}")

# Missing ranges are treated as passing (filled with "-")
test1 = bool(merged_neg_expl_ptm_filt["PTM Feature range(s)"].fillna("-").str.contains("-").all())
print(f"All rows have a dash - in ptm feature range indicating the span of the feature: {test1}")



All rows have the same # of ptm feature ranges as original sequences: True
All rows have the same # of ptm feature ranges as resulting sequences: True
All rows have a dash - in ptm feature range indicating the span of the feature: True


In [579]:
# Hand-assigned labels per PTM-role MI term:
# (MI id, label stored in "original_sequence", label stored in "ptm_sequence")
_mi_labels = [
    ("MI:0925", "unknown", "unknown"),
    ("MI:0638", "no", "yes"),
    ("MI:0639", "yes", "unknown"),
    ("MI:1233", "yes", "unknown"),
    ("MI:1223", "yes", "yes"),
    ("MI:1224", "yes", "yes"),
    ("MI:1225", "yes", "no"),
]
d_og = {mi: og_label for mi, og_label, _new_label in _mi_labels}
d_new = {mi: new_label for mi, _og_label, new_label in _mi_labels}

# Attach both labels to a copy of the MI table (ids without a label get NaN)
ptm_mi_ok_labeled = ptm_mi_ok.copy(deep=True)
ptm_mi_ok_labeled = ptm_mi_ok_labeled.assign(
    original_sequence=ptm_mi_ok_labeled["id"].map(d_og),
    ptm_sequence=ptm_mi_ok_labeled["id"].map(d_new),
)
ptm_mi_ok_labeled

Unnamed: 0,label,id,parent_id,parent_ids_all,parent_names_all,original_sequence,ptm_sequence
0,observed-ptm,MI:0925,,MI:0668,feature attribute name,unknown,unknown
1,prerequisite-ptm,MI:0638,MI:0925,MI:0925,observed-ptm,no,yes
2,resulting-ptm,MI:0639,MI:0925,MI:0925,observed-ptm,yes,unknown
3,resulting-cleavage,MI:1233,MI:0639,MI:0639,resulting-ptm,yes,unknown
4,ptm decreasing an interaction,MI:1223,MI:0925,MI:0925,observed-ptm,yes,yes
5,ptm increasing an interaction,MI:1224,MI:0925,MI:0925,observed-ptm,yes,yes
6,ptm disrupting an interaction,MI:1225,MI:0925,MI:0925,observed-ptm,yes,no


In [580]:
# Keep only the labeled feature types that are PSI-MI terms. The prefix
# includes the colon so that every kept value can be split on "psi-mi:" below,
# and na=False keeps missing features out of the mask instead of breaking it.
_is_psi_mi = ptm_feature_types_labeled["feature"].str.startswith("psi-mi:", na=False)
# Explicit copy: we add a column next, and assigning into a filtered slice
# would otherwise trigger pandas' SettingWithCopyWarning.
ptm_mis_labeled = ptm_feature_types_labeled.loc[_is_psi_mi].copy(deep=True)
# e.g. 'psi-mi:"MI:0170"(phosphorylated residue)' -> 'MI:0170'
ptm_mis_labeled["mi"] = ptm_mis_labeled["feature"].apply(lambda x: x.split("psi-mi:")[1].split("(")[0].strip("\""))
# Append the labeled PTM-role MI terms, aligned on the "mi"/"feature" column names
ptm_mis_labeled = pd.concat([
    ptm_mis_labeled,
    ptm_mi_ok_labeled.rename(columns={"id":"mi","label":"feature"})
])
ptm_mis_labeled

Unnamed: 0,feature,original_sequence,ptm_sequence,comments,mi,parent_id,parent_ids_all,parent_names_all
1,"psi-mi:""MI:0236""(32p radiolabel)",unknown,unknown,"from the label, do not know if the PTM is prer...",MI:0236,,,
2,"psi-mi:""MI:0170""(phosphorylated residue)",unknown,unknown,"from the label, do not know if the PTM is prer...",MI:0170,,,
3,"psi-mi:""MI:0176""(O-phospho-L-serine)",unknown,unknown,"from the label, do not know if the PTM is prer...",MI:0176,,,
4,"psi-mi:""MI:0120""(protein modification)",unknown,unknown,"from the label, do not know if the PTM is prer...",MI:0120,,,
5,"psi-mi:""MI:0178""(O4'-phospho-L-tyrosine)",unknown,unknown,"from the label, do not know if the PTM is prer...",MI:0178,,,
6,"psi-mi:""MI:0177""(O-phospho-L-threonine)",unknown,unknown,"from the label, do not know if the PTM is prer...",MI:0177,,,
7,"psi-mi:""MI:0166""(N6,N6-dimethyl-L-lysine)",unknown,unknown,"from the label, do not know if the PTM is prer...",MI:0166,,,
10,"psi-mi:""MI:0527""(""monoadenosine diphosphoribos...",unknown,unknown,"from the label, do not know if the PTM is prer...",MI:0527,,,
11,"psi-mi:""MI:0252""(biological feature)",unknown,unknown,"from the label, do not know if the PTM is prer...",MI:0252,,,
13,"psi-mi:""MI:0165""(N6-methyl-L-lysine)",unknown,unknown,"from the label, do not know if the PTM is prer...",MI:0165,,,


In [581]:
# Every MI id used in the table must have a label row in ptm_mis_labeled
_mis_in_table = set(merged_expl_ptm_filt["ptm_mi_1"].dropna().tolist()) | set(merged_expl_ptm_filt["ptm_mi_2"].dropna().tolist())
test1 = _mis_in_table.issubset(ptm_mis_labeled["mi"].tolist())
print(f"All MIs for interactor 1 and 2 in DataFrame have yes/no interaction labels prepared: {test1}.")

# MI id -> label lookups (for a repeated id the last row wins)
ptm_mis_og_labeled = dict(zip(ptm_mis_labeled["mi"],ptm_mis_labeled["original_sequence"]))
ptm_mis_new_labeled = dict(zip(ptm_mis_labeled["mi"],ptm_mis_labeled["ptm_sequence"]))


def _mi_label_for_row(row, labels):
    """Return labels[ptm_mi_1] when ptm_mi_1 is a string, else labels[ptm_mi_2] (None if absent)."""
    mi = row["ptm_mi_1"]
    if type(mi) is not str:
        mi = row["ptm_mi_2"]
    return labels.get(mi)


for _target, _labels in (
    ("ptm_new_binds_bo_mi", ptm_mis_new_labeled),
    ("ptm_og_binds_bo_mi", ptm_mis_og_labeled),
):
    merged_expl_ptm_filt[_target] = merged_expl_ptm_filt.apply(
        lambda row: _mi_label_for_row(row, _labels), axis=1
    )

All MIs for interactor 1 and 2 in DataFrame have yes/no interaction labels prepared: True.


In [582]:
print("From negative database")
# Every MI id used in the negative table must have a label row in ptm_mis_labeled
_mis_in_table = set(merged_neg_expl_ptm_filt["ptm_mi_1"].dropna().tolist()) | set(merged_neg_expl_ptm_filt["ptm_mi_2"].dropna().tolist())
test1 = _mis_in_table.issubset(ptm_mis_labeled["mi"].tolist())
print(f"All MIs for interactor 1 and 2 in DataFrame have yes/no interaction labels prepared: {test1}.")

# MI id -> label lookups (for a repeated id the last row wins)
ptm_mis_og_labeled = dict(zip(ptm_mis_labeled["mi"],ptm_mis_labeled["original_sequence"]))
ptm_mis_new_labeled = dict(zip(ptm_mis_labeled["mi"],ptm_mis_labeled["ptm_sequence"]))


def _mi_label_for_row(row, labels):
    """Return labels[ptm_mi_1] when ptm_mi_1 is a string, else labels[ptm_mi_2] (None if absent)."""
    mi = row["ptm_mi_1"]
    if type(mi) is not str:
        mi = row["ptm_mi_2"]
    return labels.get(mi)


for _target, _labels in (
    ("ptm_new_binds_bo_mi", ptm_mis_new_labeled),
    ("ptm_og_binds_bo_mi", ptm_mis_og_labeled),
):
    merged_neg_expl_ptm_filt[_target] = merged_neg_expl_ptm_filt.apply(
        lambda row: _mi_label_for_row(row, _labels), axis=1
    )

From negative database
All MIs for interactor 1 and 2 in DataFrame have yes/no interaction labels prepared: True.


In [583]:
## Positive / negative / unknown binding labels per row
# These labels are not used to augment the negative dataset; they are used to
# remove entries from the positives, and to make sure none of these entries
# gets added as a negative during negative scraping.

# Columns holding the individual label sources, combined below into one
# assignment per row.
newbindcols = [
    "PTM new_binds_bo_annotation",
    "PTM new_binds_bo_feature_type",
    "ptm_new_binds_bo_mi",
]
ogbindcols = [
    "PTM og_binds_bo_annotation",
    "PTM og_binds_bo_feature_type",
    "ptm_og_binds_bo_mi",
]

for _frame in (merged_expl_ptm_filt, merged_neg_expl_ptm_filt):
    for _target, _source_cols in (
        ("PTM all_new_binds", newbindcols),
        ("PTM all_og_binds", ogbindcols),
    ):
        _frame[_target] = _frame.apply(
            lambda r: _collect_row_values(r, _source_cols), axis=1
        )

In [584]:
# Preview the combined labels next to their source columns for both tables
for _frame in (merged_expl_ptm_filt, merged_neg_expl_ptm_filt):
    display(_frame[["PTM all_new_binds", "PTM all_og_binds", *newbindcols, *ogbindcols]].head())

Unnamed: 0,PTM all_new_binds,PTM all_og_binds,PTM new_binds_bo_annotation,PTM new_binds_bo_feature_type,ptm_new_binds_bo_mi,PTM og_binds_bo_annotation,PTM og_binds_bo_feature_type,ptm_og_binds_bo_mi
0,unknown,"unknown,yes",,unknown,unknown,,unknown,yes
1,unknown,"unknown,yes",,unknown,unknown,,unknown,yes
2,unknown,unknown,,unknown,unknown,,unknown,unknown
3,unknown,unknown,,unknown,unknown,,unknown,unknown
4,unknown,unknown,,unknown,unknown,,unknown,unknown


Unnamed: 0,PTM all_new_binds,PTM all_og_binds,PTM new_binds_bo_annotation,PTM new_binds_bo_feature_type,ptm_new_binds_bo_mi,PTM og_binds_bo_annotation,PTM og_binds_bo_feature_type,ptm_og_binds_bo_mi
0,unknown,"unknown,yes",,unknown,unknown,,unknown,yes
1,unknown,"unknown,yes",,unknown,unknown,,unknown,yes
2,"unknown,yes","unknown,yes",,unknown,yes,,unknown,yes
3,"unknown,yes","no,unknown",,unknown,yes,,unknown,no
4,"unknown,yes","no,unknown",yes,unknown,yes,no,unknown,no


In [585]:
# Frequency of each combined label set in the positive table
for _col in ("PTM all_new_binds", "PTM all_og_binds"):
    display(merged_expl_ptm_filt[_col].value_counts())

PTM all_new_binds
unknown        5255
unknown,yes    1268
no,unknown       79
yes               1
Name: count, dtype: int64

PTM all_og_binds
unknown           4832
no,unknown         981
unknown,yes        776
no,unknown,yes      11
yes                  2
no                   1
Name: count, dtype: int64

In [586]:
# Frequency of each combined label set in the negative table
for _col in ("PTM all_new_binds", "PTM all_og_binds"):
    display(merged_neg_expl_ptm_filt[_col].value_counts())

PTM all_new_binds
unknown        4702
unknown,yes    1159
no,unknown       74
yes               1
Name: count, dtype: int64

PTM all_og_binds
unknown           4318
no,unknown         900
unknown,yes        704
no,unknown,yes      11
yes                  2
no                   1
Name: count, dtype: int64

In [587]:
# Both combined-label columns must be filled in on every row
test1 = not merged_expl_ptm_filt[["PTM all_new_binds", "PTM all_og_binds"]].isna().any(axis=1).any()
print(f"Could map whether original and ptm sequences are binding for every row: {test1}")

Could map whether original and ptm sequences are binding for every row: True


In [588]:
# Negative table: both combined-label columns must be filled in on every row
test1 = not merged_neg_expl_ptm_filt[["PTM all_new_binds", "PTM all_og_binds"]].isna().any(axis=1).any()
print(f"Could map whether original and ptm sequences are binding for every row: {test1}")

Could map whether original and ptm sequences are binding for every row: True


In [589]:
# Figure out if any rows have contradicting labels 
def simplify_ptm_bind_labels(s):
    """
    Collapse a comma-separated set of labels into a single catchall label.

    Only the tokens "yes", "no" and "unknown" are considered. The result is
    "yes" when "yes" is present without "no", "no" when "no" is present
    without "yes", and "unknown" otherwise (contradiction, only "unknown",
    or no recognised token at all).
    """
    seen = {"yes", "no", "unknown"} & set(s.split(","))
    says_yes = "yes" in seen
    says_no = "no" in seen
    # Exactly one of yes/no is decisive; both or neither is not.
    if says_yes != says_no:
        return "yes" if says_yes else "no"
    return "unknown"

In [590]:
# Reduce each combined label set to one decisive label (positive table)
for _source, _target in (
    ("PTM all_new_binds", "PTM decisive_entry_new_binds"),
    ("PTM all_og_binds", "PTM decisive_entry_og_binds"),
):
    merged_expl_ptm_filt[_target] = merged_expl_ptm_filt[_source].apply(simplify_ptm_bind_labels)

In [591]:
# Reduce each combined label set to one decisive label (negative table)
for _source, _target in (
    ("PTM all_new_binds", "PTM decisive_entry_new_binds"),
    ("PTM all_og_binds", "PTM decisive_entry_og_binds"),
):
    merged_neg_expl_ptm_filt[_target] = merged_neg_expl_ptm_filt[_source].apply(simplify_ptm_bind_labels)

In [592]:
# Each row needs a value in at least one of ptm_short_1 / ptm_short_2
_short_cols = ["ptm_short_1", "ptm_short_2"]

test1 = not merged_expl_ptm_filt[_short_cols].isna().all(axis=1).any()
print(f"Positive db: Everything has a value in either ptm_short_1 or ptm_short_2: {test1}")

test1 = not merged_neg_expl_ptm_filt[_short_cols].isna().all(axis=1).any()
print(f"Negative db: Everything has a value in either ptm_short_1 or ptm_short_2: {test1}")

Positive db: Everything has a value in either ptm_short_1 or ptm_short_2: True
Negative db: Everything has a value in either ptm_short_1 or ptm_short_2: True


In [593]:
# Group by seq_sort 
def get_seqsort_for_ptm_pair(row, seq_type="og"):
    """
    Build an order-independent key for the pair of sequences in this interaction.

    Args:
        row: mapping (e.g. a DataFrame row) with keys "PTM Partner", "aa_1",
             "ptm_aa_1", "aa_2" and "ptm_aa_2".
        seq_type: "og" to key on the original sequences of both partners; any
                  other value keys on the ptm interaction, i.e. the modified
                  sequence for each partner named in "PTM Partner" and the
                  original sequence for the other partner.

    Returns:
        "<seq>_<seq>" with the two sequences in sorted order, so swapping the
        partners yields the same key. A missing sequence contributes an empty
        string; when "PTM Partner" is missing or is not one of "A", "B", "A,B"
        the key is "_".
    """
    def _blank_if_missing(value):
        # isinstance (not type()==float) so numpy float NaN is caught as well;
        # otherwise the string comparison below raises TypeError on a NaN
        if value is None or (isinstance(value, float) and np.isnan(value)):
            return ""
        return value

    # Figure out which partner is ptm
    ptm_partner = row["PTM Partner"]

    # Get original and ptm sequences
    og_aa_1 = row["aa_1"]
    new_aa_1 = row["ptm_aa_1"]

    og_aa_2 = row["aa_2"]
    new_aa_2 = row["ptm_aa_2"]

    # Assemble sequence pair based on whether we want the ptm interaction or the original interaction
    seqpair = [None, None]
    if isinstance(ptm_partner, str) and ptm_partner in ("A,B", "A", "B"):
        if seq_type == "og":
            # The original pair does not depend on which partner was modified
            seqpair = [og_aa_1, og_aa_2]
        else:
            # Swap in the modified sequence only for the partner(s) carrying the ptm
            seqpair = [
                new_aa_1 if ptm_partner in ("A,B", "A") else og_aa_1,
                new_aa_2 if ptm_partner in ("A,B", "B") else og_aa_2,
            ]

    intA = _blank_if_missing(seqpair[0])
    intB = _blank_if_missing(seqpair[1])

    # Sort the two sequences so that partner order does not matter
    if intA <= intB:
        return f"{intA}_{intB}"
    return f"{intB}_{intA}"

In [594]:
# Key every row on its original sequence pair ...
merged_expl_ptm_filt["seq_sort_og"] = merged_expl_ptm_filt.apply(get_seqsort_for_ptm_pair, axis=1, seq_type="og")
merged_neg_expl_ptm_filt["seq_sort_og"] = merged_neg_expl_ptm_filt.apply(get_seqsort_for_ptm_pair, axis=1, seq_type="og")

# ... and on its ptm sequence pair
merged_expl_ptm_filt["seq_sort_new"] = merged_expl_ptm_filt.apply(get_seqsort_for_ptm_pair, axis=1, seq_type="ptm")
merged_neg_expl_ptm_filt["seq_sort_new"] = merged_neg_expl_ptm_filt.apply(get_seqsort_for_ptm_pair, axis=1, seq_type="ptm")

In [595]:
# 
# Positive set: one row per original sequence pair, listing every distinct per-entry label seen for it
gb_og = merged_expl_ptm_filt.groupby("seq_sort_og").agg(
    decisive_entry_og_binds=("PTM decisive_entry_og_binds", lambda x: ",".join(set(x)))
).reset_index()
gb_og["seq_sort_og_id"] = [f"seq_sort_og_{i+1}" for i in range(len(gb_og))]

display(gb_og.loc[gb_og["decisive_entry_og_binds"].str.contains(",")].head())

# The labels were joined from a set, so their order is arbitrary: test for "yes" and "no"
# independently instead of as neighbours (otherwise e.g. "yes,unknown,no" is missed).
# Word boundaries keep the "no" inside "unknown" from matching.
test1 = len(gb_og.loc[
    (gb_og["decisive_entry_og_binds"].str.contains(r"\byes\b")) & 
    (gb_og["decisive_entry_og_binds"].str.contains(r"\bno\b")) 
    ])
print(f"Positive db: Total og pairs that have yes AND no for binding based on ptm annotations: {test1}/{len(gb_og)} ({100*test1/len(gb_og):.2f}%)")

Unnamed: 0,seq_sort_og,decisive_entry_og_binds,seq_sort_og_id
9,APTKVTFGDDTVIEVQGYKSVNITFELDERIDKVLNEKCSAYTVEL...,"unknown,yes",seq_sort_og_10
21,ARKSTGGK_MASESETLNPSARIMTFYPTMEEFRNFSRYIAYIESQ...,"unknown,yes",seq_sort_og_22
38,ARTKQTARKSTGGKAPRKQLA_MEPGSDDFLPPPECPVFEPSWAEF...,"unknown,yes",seq_sort_og_39
49,ARTKQTARKSTGGKAPRKQLA_MSLPQWCPPHSTLKRNPTTGEDVY...,"unknown,yes",seq_sort_og_50
50,ARTKQTARKSTGGKAPRKQLA_MSLQMVTVGHNIALIQPGFSLMNF...,"unknown,no,yes",seq_sort_og_51


Positive db: Total og pairs that have yes AND no for binding based on ptm annotations: 6/2541 (0.24%)


In [596]:
# Show the original sequence pairs labelled both "yes" and "no".
# Label order within the joined string is arbitrary, so each label is tested on its own;
# word boundaries keep the "no" inside "unknown" from matching.
gb_og.loc[
    (gb_og["decisive_entry_og_binds"].str.contains(r"\byes\b")) & 
    (gb_og["decisive_entry_og_binds"].str.contains(r"\bno\b")) 
]

Unnamed: 0,seq_sort_og,decisive_entry_og_binds,seq_sort_og_id
50,ARTKQTARKSTGGKAPRKQLA_MSLQMVTVGHNIALIQPGFSLMNF...,"unknown,no,yes",seq_sort_og_51
278,MAASRRSQHHHHHHQQQLQPAPGASAPPPPPPPPLSPGLAPGTTPA...,"no,yes",seq_sort_og_279
678,MANIAVQRIKREFKEVLKSEETSKNQIKVDLVDENFTELRGEIAGP...,"no,yes",seq_sort_og_679
988,MAVPFVEDWDLVQTLGEGAYGEVQLAVNRVTEEAVAVKIVDMKRAV...,"no,yes",seq_sort_og_989
995,MAVTITLKTLQQQTFKIRMEPDETVKVLKEKIEAEKGRDAFPVAGQ...,"no,yes",seq_sort_og_996
1851,MGSNKSKPKDASQRRRSLEPSENVHGAGGAFPASQTPSKPASADGH...,"no,yes",seq_sort_og_1852


In [597]:
# 
# Positive set: one row per ptm sequence pair, listing every distinct per-entry label seen for it
gb_new = merged_expl_ptm_filt.groupby("seq_sort_new").agg(
    decisive_entry_new_binds=("PTM decisive_entry_new_binds", lambda x: ",".join(set(x)))
).reset_index()
gb_new["seq_sort_new_id"] = [f"seq_sort_new_{i+1}" for i in range(len(gb_new))]

display(gb_new.loc[gb_new["decisive_entry_new_binds"].str.contains(",")].head())

# The labels were joined from a set, so their order is arbitrary: test for "yes" and "no"
# independently instead of as neighbours (otherwise e.g. "yes,unknown,no" is missed).
# Word boundaries keep the "no" inside "unknown" from matching.
test1 = len(gb_new.loc[
    (gb_new["decisive_entry_new_binds"].str.contains(r"\byes\b")) & 
    (gb_new["decisive_entry_new_binds"].str.contains(r"\bno\b")) 
    ])
print(f"Positive db: Total new pairs that have yes AND no for binding based on ptm annotations: {test1}/{len(gb_new)} ({100*test1/len(gb_new):.2f}%)")

Unnamed: 0,seq_sort_new,decisive_entry_new_binds,seq_sort_new_id
14,"<psi-mod:""MOD:00181""(O4'-sulfo-L-tyrosine)>I<p...","unknown,yes",seq_sort_new_15
81,"ART<psi-mi:""MI:0165""(N6-methyl-L-lysine)>QTARK...","no,yes",seq_sort_new_82
85,"ART<psi-mi:""MI:0165""(N6-methyl-L-lysine)>QTARK...","no,yes",seq_sort_new_86
95,"ART<psi-mi:""MI:0166""(N6,N6-dimethyl-L-lysine)>...","no,yes",seq_sort_new_96
98,"ART<psi-mi:""MI:0166""(N6,N6-dimethyl-L-lysine)>...","no,yes",seq_sort_new_99


Positive db: Total new pairs that have yes AND no for binding based on ptm annotations: 12/4632 (0.26%)


In [598]:
# 
# Negative set: one row per original sequence pair, listing every distinct per-entry label seen for it
gb_neg_og = merged_neg_expl_ptm_filt.groupby("seq_sort_og").agg(
    decisive_entry_og_binds=("PTM decisive_entry_og_binds", lambda x: ",".join(set(x)))
).reset_index()
gb_neg_og["seq_sort_og_id"] = [f"seq_sort_og_{i+1}" for i in range(len(gb_neg_og))]

display(gb_neg_og.loc[gb_neg_og["decisive_entry_og_binds"].str.contains(",")].head())

# The labels were joined from a set, so their order is arbitrary: test for "yes" and "no"
# independently instead of as neighbours (otherwise e.g. "yes,unknown,no" is missed).
# Word boundaries keep the "no" inside "unknown" from matching.
test1 = len(gb_neg_og.loc[
    (gb_neg_og["decisive_entry_og_binds"].str.contains(r"\byes\b")) & 
    (gb_neg_og["decisive_entry_og_binds"].str.contains(r"\bno\b")) 
    ])
print(f"Negatives db: Total og pairs that have yes AND no for binding based on ptm annotations: {test1}/{len(gb_neg_og)} ({100*test1/len(gb_neg_og):.2f}%)")

Unnamed: 0,seq_sort_og,decisive_entry_og_binds,seq_sort_og_id
9,APTKVTFGDDTVIEVQGYKSVNITFELDERIDKVLNEKCSAYTVEL...,"unknown,yes",seq_sort_og_10
20,ARKSTGGK_MASESETLNPSARIMTFYPTMEEFRNFSRYIAYIESQ...,"unknown,yes",seq_sort_og_21
36,ARTKQTARKSTGGKAPRKQLA_MEPGSDDFLPPPECPVFEPSWAEF...,"unknown,yes",seq_sort_og_37
45,ARTKQTARKSTGGKAPRKQLA_MSLPQWCPPHSTLKRNPTTGEDVY...,"unknown,yes",seq_sort_og_46
46,ARTKQTARKSTGGKAPRKQLA_MSLQMVTVGHNIALIQPGFSLMNF...,"unknown,no,yes",seq_sort_og_47


Negatives db: Total og pairs that have yes AND no for binding based on ptm annotations: 6/2234 (0.27%)


In [599]:
# 
# Negative set: one row per ptm sequence pair, listing every distinct per-entry label seen for it
gb_neg_new = merged_neg_expl_ptm_filt.groupby("seq_sort_new").agg(
    decisive_entry_new_binds=("PTM decisive_entry_new_binds", lambda x: ",".join(set(x)))
).reset_index()
gb_neg_new["seq_sort_new_id"] = [f"seq_sort_new_{i+1}" for i in range(len(gb_neg_new))]

display(gb_neg_new.loc[gb_neg_new["decisive_entry_new_binds"].str.contains(",")].head())

# The labels were joined from a set, so their order is arbitrary: test for "yes" and "no"
# independently instead of as neighbours (otherwise e.g. "yes,unknown,no" is missed).
# Word boundaries keep the "no" inside "unknown" from matching.
test1 = len(gb_neg_new.loc[
    (gb_neg_new["decisive_entry_new_binds"].str.contains(r"\byes\b")) & 
    (gb_neg_new["decisive_entry_new_binds"].str.contains(r"\bno\b")) 
    ])
print(f"Negatives db: Total new pairs that have yes AND no for binding based on ptm annotations: {test1}/{len(gb_neg_new)} ({100*test1/len(gb_neg_new):.2f}%)")

Unnamed: 0,seq_sort_new,decisive_entry_new_binds,seq_sort_new_id
11,"<psi-mod:""MOD:00181""(O4'-sulfo-L-tyrosine)>I<p...","unknown,yes",seq_sort_new_12
74,"ART<psi-mi:""MI:0165""(N6-methyl-L-lysine)>QTARK...","no,yes",seq_sort_new_75
78,"ART<psi-mi:""MI:0165""(N6-methyl-L-lysine)>QTARK...","no,yes",seq_sort_new_79
88,"ART<psi-mi:""MI:0166""(N6,N6-dimethyl-L-lysine)>...","no,yes",seq_sort_new_89
91,"ART<psi-mi:""MI:0166""(N6,N6-dimethyl-L-lysine)>...","no,yes",seq_sort_new_92


Negatives db: Total new pairs that have yes AND no for binding based on ptm annotations: 11/4158 (0.26%)


In [600]:
# Original sequence pairs (positive set) labelled both "yes" and "no".
# Each label is tested on its own because the order of labels in the joined string is not
# guaranteed; word boundaries keep the "no" inside "unknown" from matching.
test1 = gb_og.loc[
    (gb_og["decisive_entry_og_binds"].str.contains(r"\byes\b")) & 
    (gb_og["decisive_entry_og_binds"].str.contains(r"\bno\b")) 
    ]["seq_sort_og"].unique().tolist()

# Lookup: sequence pair -> its seq_sort_og_<n> identifier
gb_og_id_dict = dict(zip(gb_og["seq_sort_og"],gb_og["seq_sort_og_id"]))

# Reduce the per-entry labels of each pair to one pair-level label (yes + no -> unknown)
gb_og["PTM decisive_seqpair_og_binds"] = gb_og["decisive_entry_og_binds"].apply(lambda s: simplify_ptm_bind_labels(s))

# Lookup: sequence pair -> pair-level label
gb_og_dict = dict(zip(gb_og["seq_sort_og"],gb_og["PTM decisive_seqpair_og_binds"]))

# Broadcast the pair-level id and label back onto every row of the pair
merged_expl_ptm_filt["seq_sort_og_id"] = merged_expl_ptm_filt["seq_sort_og"].map(gb_og_id_dict)

merged_expl_ptm_filt["PTM decisive_seqpair_og_binds"] = merged_expl_ptm_filt["seq_sort_og"].map(gb_og_dict)

# Columns exported for manual review of the contradictory pairs
display_cols = [
    "interaction_intactid","seq_sort_og","seq_sort_og_id","PTM Partner","PTM all_og_binds","PTM decisive_entry_og_binds","PTM decisive_seqpair_og_binds",
    "PTM # Feature AC",
"PTM Affected protein AC",
"PTM Affected protein full name",
"PTM Affected protein organism",
"PTM Affected protein symbol",
"PTM Feature annotation(s)",
"PTM Feature range(s)",
"PTM Feature short label",
"PTM Feature type",
"PTM Figure legend(s)",
"PTM Interaction AC",
"PTM Interaction participants",
"PTM Interactor Matches",
"PTM Original sequence",
"PTM PubMedID",
"PTM Resulting sequence",
"PTM Xref ID(s)",
"PTM new_binds_bo_annotation",
"PTM new_binds_bo_feature_type",
"PTM og_binds_bo_annotation",
"PTM og_binds_bo_feature_type",
"ptm_new_binds_bo_mi",
"ptm_og_binds_bo_mi",
"ptm_mi_1","ptm_mi_2"
]
# Save every row that belongs to a contradictory pair, grouped together by pair
merged_expl_ptm_filt.loc[
    merged_expl_ptm_filt["seq_sort_og"].isin(test1)
].sort_values(by=["seq_sort_og"])[display_cols].to_csv("contradicting_ptm_labels_dec2_2025.csv",index=False)

In [601]:
# Positive set: rows of every sequence pair listed in test1, kept together by sorting on the pair key
merged_expl_ptm_filt[merged_expl_ptm_filt["seq_sort_og"].isin(test1)].sort_values("seq_sort_og")[display_cols]

Unnamed: 0,interaction_intactid,seq_sort_og,seq_sort_og_id,PTM Partner,PTM all_og_binds,PTM decisive_entry_og_binds,PTM decisive_seqpair_og_binds,PTM # Feature AC,PTM Affected protein AC,PTM Affected protein full name,...,PTM Resulting sequence,PTM Xref ID(s),PTM new_binds_bo_annotation,PTM new_binds_bo_feature_type,PTM og_binds_bo_annotation,PTM og_binds_bo_feature_type,ptm_new_binds_bo_mi,ptm_og_binds_bo_mi,ptm_mi_1,ptm_mi_2
4306,EBI-15671703,ARTKQTARKSTGGKAPRKQLA_MSLQMVTVGHNIALIQPGFSLMNF...,seq_sort_og_51,B,"unknown,yes",yes,unknown,EBI-15671713,dip:DIP-29329N,Histone H3 N-terminal Peptide,...,-,-,,unknown,,unknown,yes,yes,,MI:1224
4312,EBI-15671703,ARTKQTARKSTGGKAPRKQLA_MSLQMVTVGHNIALIQPGFSLMNF...,seq_sort_og_51,B,"unknown,yes",yes,unknown,EBI-15671719,dip:DIP-29329N,Histone H3 N-terminal Peptide,...,-,-,,unknown,,unknown,yes,yes,,MI:1224
4311,EBI-15671703,ARTKQTARKSTGGKAPRKQLA_MSLQMVTVGHNIALIQPGFSLMNF...,seq_sort_og_51,B,"unknown,yes",yes,unknown,EBI-15671713,dip:DIP-29329N,Histone H3 N-terminal Peptide,...,-,-,,unknown,,unknown,yes,yes,,MI:1224
4310,EBI-15671703,ARTKQTARKSTGGKAPRKQLA_MSLQMVTVGHNIALIQPGFSLMNF...,seq_sort_og_51,B,"unknown,yes",yes,unknown,EBI-15671716,dip:DIP-29329N,Histone H3 N-terminal Peptide,...,-,-,,unknown,,unknown,yes,yes,,MI:1224
4074,EBI-15671758,ARTKQTARKSTGGKAPRKQLA_MSLQMVTVGHNIALIQPGFSLMNF...,seq_sort_og_51,B,"no,unknown",no,unknown,EBI-15671768,dip:DIP-29329N,Histone H3 N-terminal Peptide,...,-,-,,unknown,,unknown,yes,no,,MI:0638
4075,EBI-15671784,ARTKQTARKSTGGKAPRKQLA_MSLQMVTVGHNIALIQPGFSLMNF...,seq_sort_og_51,B,"no,unknown",no,unknown,EBI-15671806,dip:DIP-29329N,Histone H3 N-terminal Peptide,...,-,-,,unknown,,unknown,yes,no,,MI:0638
4151,EBI-15671680,ARTKQTARKSTGGKAPRKQLA_MSLQMVTVGHNIALIQPGFSLMNF...,seq_sort_og_51,B,unknown,unknown,unknown,EBI-15671688,dip:DIP-29329N,Histone H3 N-terminal Peptide,...,-,-,,unknown,,unknown,unknown,unknown,,MI:0925
4152,EBI-15671733,ARTKQTARKSTGGKAPRKQLA_MSLQMVTVGHNIALIQPGFSLMNF...,seq_sort_og_51,B,unknown,unknown,unknown,EBI-15671743,dip:DIP-29329N,Histone H3 N-terminal Peptide,...,-,-,,unknown,,unknown,unknown,unknown,,MI:0925
4299,EBI-15671703,ARTKQTARKSTGGKAPRKQLA_MSLQMVTVGHNIALIQPGFSLMNF...,seq_sort_og_51,B,"unknown,yes",yes,unknown,EBI-15671716,dip:DIP-29329N,Histone H3 N-terminal Peptide,...,-,-,,unknown,,unknown,yes,yes,,MI:1224
4301,EBI-15671703,ARTKQTARKSTGGKAPRKQLA_MSLQMVTVGHNIALIQPGFSLMNF...,seq_sort_og_51,B,"unknown,yes",yes,unknown,EBI-15671713,dip:DIP-29329N,Histone H3 N-terminal Peptide,...,-,-,,unknown,,unknown,yes,yes,,MI:1224


In [602]:
# Original sequence pairs (negative set) labelled both "yes" and "no".
# Each label is tested on its own because the order of labels in the joined string is not
# guaranteed; word boundaries keep the "no" inside "unknown" from matching.
test1 = gb_neg_og.loc[
    (gb_neg_og["decisive_entry_og_binds"].str.contains(r"\byes\b")) & 
    (gb_neg_og["decisive_entry_og_binds"].str.contains(r"\bno\b")) 
    ]["seq_sort_og"].unique().tolist()

# Lookup: sequence pair -> its seq_sort_og_<n> identifier
gb_neg_og_id_dict = dict(zip(gb_neg_og["seq_sort_og"],gb_neg_og["seq_sort_og_id"]))

# Reduce the per-entry labels of each pair to one pair-level label (yes + no -> unknown)
gb_neg_og["PTM decisive_seqpair_og_binds"] = gb_neg_og["decisive_entry_og_binds"].apply(lambda s: simplify_ptm_bind_labels(s))

# Lookup: sequence pair -> pair-level label
gb_neg_og_dict = dict(zip(gb_neg_og["seq_sort_og"],gb_neg_og["PTM decisive_seqpair_og_binds"]))

# Broadcast the pair-level id and label back onto every row of the pair
merged_neg_expl_ptm_filt["seq_sort_og_id"] = merged_neg_expl_ptm_filt["seq_sort_og"].map(gb_neg_og_id_dict)

merged_neg_expl_ptm_filt["PTM decisive_seqpair_og_binds"] = merged_neg_expl_ptm_filt["seq_sort_og"].map(gb_neg_og_dict)

# Columns exported for manual review of the contradictory pairs
display_cols = [
    "interaction_intactid","seq_sort_og","seq_sort_og_id","PTM Partner","PTM all_og_binds","PTM decisive_entry_og_binds","PTM decisive_seqpair_og_binds",
    "PTM # Feature AC",
"PTM Affected protein AC",
"PTM Affected protein full name",
"PTM Affected protein organism",
"PTM Affected protein symbol",
"PTM Feature annotation(s)",
"PTM Feature range(s)",
"PTM Feature short label",
"PTM Feature type",
"PTM Figure legend(s)",
"PTM Interaction AC",
"PTM Interaction participants",
"PTM Interactor Matches",
"PTM Original sequence",
"PTM PubMedID",
"PTM Resulting sequence",
"PTM Xref ID(s)",
"PTM new_binds_bo_annotation",
"PTM new_binds_bo_feature_type",
"PTM og_binds_bo_annotation",
"PTM og_binds_bo_feature_type",
"ptm_new_binds_bo_mi",
"ptm_og_binds_bo_mi",
"ptm_mi_1","ptm_mi_2"
]
# Save every row that belongs to a contradictory pair, grouped together by pair
merged_neg_expl_ptm_filt.loc[
    merged_neg_expl_ptm_filt["seq_sort_og"].isin(test1)
].sort_values(by=["seq_sort_og"])[display_cols].to_csv("contradicting_ptm_labels_fromnegs_dec2_2025.csv",index=False)

In [603]:
# Negative set: rows of every sequence pair listed in test1, kept together by sorting on the pair key
merged_neg_expl_ptm_filt[merged_neg_expl_ptm_filt["seq_sort_og"].isin(test1)].sort_values("seq_sort_og")[display_cols]

Unnamed: 0,interaction_intactid,seq_sort_og,seq_sort_og_id,PTM Partner,PTM all_og_binds,PTM decisive_entry_og_binds,PTM decisive_seqpair_og_binds,PTM # Feature AC,PTM Affected protein AC,PTM Affected protein full name,...,PTM Resulting sequence,PTM Xref ID(s),PTM new_binds_bo_annotation,PTM new_binds_bo_feature_type,PTM og_binds_bo_annotation,PTM og_binds_bo_feature_type,ptm_new_binds_bo_mi,ptm_og_binds_bo_mi,ptm_mi_1,ptm_mi_2
3852,EBI-15671703,ARTKQTARKSTGGKAPRKQLA_MSLQMVTVGHNIALIQPGFSLMNF...,seq_sort_og_47,B,"unknown,yes",yes,unknown,EBI-15671713,dip:DIP-29329N,Histone H3 N-terminal Peptide,...,-,-,,unknown,,unknown,yes,yes,,MI:1224
3858,EBI-15671703,ARTKQTARKSTGGKAPRKQLA_MSLQMVTVGHNIALIQPGFSLMNF...,seq_sort_og_47,B,"unknown,yes",yes,unknown,EBI-15671719,dip:DIP-29329N,Histone H3 N-terminal Peptide,...,-,-,,unknown,,unknown,yes,yes,,MI:1224
3857,EBI-15671703,ARTKQTARKSTGGKAPRKQLA_MSLQMVTVGHNIALIQPGFSLMNF...,seq_sort_og_47,B,"unknown,yes",yes,unknown,EBI-15671713,dip:DIP-29329N,Histone H3 N-terminal Peptide,...,-,-,,unknown,,unknown,yes,yes,,MI:1224
3856,EBI-15671703,ARTKQTARKSTGGKAPRKQLA_MSLQMVTVGHNIALIQPGFSLMNF...,seq_sort_og_47,B,"unknown,yes",yes,unknown,EBI-15671716,dip:DIP-29329N,Histone H3 N-terminal Peptide,...,-,-,,unknown,,unknown,yes,yes,,MI:1224
3632,EBI-15671758,ARTKQTARKSTGGKAPRKQLA_MSLQMVTVGHNIALIQPGFSLMNF...,seq_sort_og_47,B,"no,unknown",no,unknown,EBI-15671768,dip:DIP-29329N,Histone H3 N-terminal Peptide,...,-,-,,unknown,,unknown,yes,no,,MI:0638
3633,EBI-15671784,ARTKQTARKSTGGKAPRKQLA_MSLQMVTVGHNIALIQPGFSLMNF...,seq_sort_og_47,B,"no,unknown",no,unknown,EBI-15671806,dip:DIP-29329N,Histone H3 N-terminal Peptide,...,-,-,,unknown,,unknown,yes,no,,MI:0638
3704,EBI-15671680,ARTKQTARKSTGGKAPRKQLA_MSLQMVTVGHNIALIQPGFSLMNF...,seq_sort_og_47,B,unknown,unknown,unknown,EBI-15671688,dip:DIP-29329N,Histone H3 N-terminal Peptide,...,-,-,,unknown,,unknown,unknown,unknown,,MI:0925
3705,EBI-15671733,ARTKQTARKSTGGKAPRKQLA_MSLQMVTVGHNIALIQPGFSLMNF...,seq_sort_og_47,B,unknown,unknown,unknown,EBI-15671743,dip:DIP-29329N,Histone H3 N-terminal Peptide,...,-,-,,unknown,,unknown,unknown,unknown,,MI:0925
3845,EBI-15671703,ARTKQTARKSTGGKAPRKQLA_MSLQMVTVGHNIALIQPGFSLMNF...,seq_sort_og_47,B,"unknown,yes",yes,unknown,EBI-15671716,dip:DIP-29329N,Histone H3 N-terminal Peptide,...,-,-,,unknown,,unknown,yes,yes,,MI:1224
3847,EBI-15671703,ARTKQTARKSTGGKAPRKQLA_MSLQMVTVGHNIALIQPGFSLMNF...,seq_sort_og_47,B,"unknown,yes",yes,unknown,EBI-15671713,dip:DIP-29329N,Histone H3 N-terminal Peptide,...,-,-,,unknown,,unknown,yes,yes,,MI:1224


In [604]:
# ptm sequence pairs (positive set) labelled both "yes" and "no".
# Each label is tested on its own because the order of labels in the joined string is not
# guaranteed; word boundaries keep the "no" inside "unknown" from matching.
test1 = gb_new.loc[
    (gb_new["decisive_entry_new_binds"].str.contains(r"\byes\b")) & 
    (gb_new["decisive_entry_new_binds"].str.contains(r"\bno\b")) 
    ]["seq_sort_new"].unique().tolist()

# Lookup: sequence pair -> its seq_sort_new_<n> identifier
gb_new_id_dict = dict(zip(gb_new["seq_sort_new"],gb_new["seq_sort_new_id"]))

# Reduce the per-entry labels of each pair to one pair-level label (yes + no -> unknown)
gb_new["PTM decisive_seqpair_new_binds"] = gb_new["decisive_entry_new_binds"].apply(lambda s: simplify_ptm_bind_labels(s))

# Lookup: sequence pair -> pair-level label
gb_new_dict = dict(zip(gb_new["seq_sort_new"],gb_new["PTM decisive_seqpair_new_binds"]))

# Broadcast the pair-level id and label back onto every row of the pair
merged_expl_ptm_filt["seq_sort_new_id"] = merged_expl_ptm_filt["seq_sort_new"].map(gb_new_id_dict)

merged_expl_ptm_filt["PTM decisive_seqpair_new_binds"] = merged_expl_ptm_filt["seq_sort_new"].map(gb_new_dict)

In [605]:
# ptm sequence pairs (negative set) labelled both "yes" and "no".
# Each label is tested on its own because the order of labels in the joined string is not
# guaranteed; word boundaries keep the "no" inside "unknown" from matching.
test1 = gb_neg_new.loc[
    (gb_neg_new["decisive_entry_new_binds"].str.contains(r"\byes\b")) & 
    (gb_neg_new["decisive_entry_new_binds"].str.contains(r"\bno\b")) 
    ]["seq_sort_new"].unique().tolist()

# Lookup: sequence pair -> its seq_sort_new_<n> identifier
gb_neg_new_id_dict = dict(zip(gb_neg_new["seq_sort_new"],gb_neg_new["seq_sort_new_id"]))

# Reduce the per-entry labels of each pair to one pair-level label (yes + no -> unknown)
gb_neg_new["PTM decisive_seqpair_new_binds"] = gb_neg_new["decisive_entry_new_binds"].apply(lambda s: simplify_ptm_bind_labels(s))

# Lookup: sequence pair -> pair-level label
gb_neg_new_dict = dict(zip(gb_neg_new["seq_sort_new"],gb_neg_new["PTM decisive_seqpair_new_binds"]))

# Broadcast the pair-level id and label back onto every row of the pair
merged_neg_expl_ptm_filt["seq_sort_new_id"] = merged_neg_expl_ptm_filt["seq_sort_new"].map(gb_neg_new_id_dict)

merged_neg_expl_ptm_filt["PTM decisive_seqpair_new_binds"] = merged_neg_expl_ptm_filt["seq_sort_new"].map(gb_neg_new_dict)

In [606]:
# investigate what makes these rows conflict
# do they have MIs that are different? 
# Select pairs labelled both "yes" and "no"; each label is tested on its own because the order
# of labels in the joined string is not guaranteed (word boundaries skip the "no" in "unknown")
test1 = gb_og.loc[
    (gb_og["decisive_entry_og_binds"].str.contains(r"\bno\b")) & 
    (gb_og["decisive_entry_og_binds"].str.contains(r"\byes\b"))
    ]["seq_sort_og"].unique().tolist()
temp = merged_expl_ptm_filt.loc[
    merged_expl_ptm_filt["seq_sort_og"].isin(test1)
].reset_index(drop=True)
# Combine both partners' ptm MI codes into one comma-separated string per row
temp["ptm_mi"] = temp["ptm_mi_1"].fillna("") + "," + temp["ptm_mi_2"].fillna("")
temp["ptm_mi"] = temp["ptm_mi"].fillna("").str.strip(",")
# One row per pair; "|" separates the values contributed by the individual entries
temp = temp.groupby("seq_sort_og").agg(
    ptm_og_binds_bo_mi=("ptm_og_binds_bo_mi", lambda x: "|".join([str(y) for y in list(x)])),
    ptm_mi=("ptm_mi", lambda x: "|".join([str(y) for y in list(x)])),
).reset_index()
# Which of the decisive MI-derived labels ("no", "yes") occur for the pair, sorted and de-duplicated
temp["ptm_og_binds_bo_mi_unique_conflicting"] = temp["ptm_og_binds_bo_mi"].apply(
    lambda x: ",".join(
        sorted(
            set(
                " ".join(  # join all descriptions
                    x.replace("|", ",").split(",")
                ).split()  # split on whitespace
            ).intersection(set(["no","yes"]))
        )
    )
)
# The column above is sorted, so a pair with both labels reads exactly "no,yes"
test1 = bool(temp["ptm_og_binds_bo_mi_unique_conflicting"].eq("no,yes").all())

print(f"There are {len(temp)} unique sequence pairs (where at least one seq is ptm) where there are conflicting labels about whether or not binding has occurred.")
print(f"\tIn every case where we have contradicting labels for whether a ptm sequence interacts with another sequence, it is because of conflicting MIs: {test1}")
# Description = text inside the first parentheses of the feature string, or the whole string if there are none
ptm_mis_labeled["description"] = ptm_mis_labeled["feature"].apply(lambda x: x.split("(")[1].split(")")[0] if "(" in x else x)
ptm_mis_labeled_mi_desc_dict = dict(zip(ptm_mis_labeled["mi"],ptm_mis_labeled["description"]))

temp["ptm_desc"] = temp["ptm_mi"].apply(lambda x: mi_to_desc_string(x,ptm_mis_labeled_mi_desc_dict))

temp.sort_values(by=["seq_sort_og"]).to_csv("contradicting_ptm_labels_grouped_by_seq_og_nov19_2025.csv",index=False)
temp.loc[temp["ptm_mi"].str.contains("\\|")].head()


There are 6 unique sequence pairs (where at least one seq is ptm) where there are conflicting labels about whether or not binding has occurred.
	In every case where we have contradicting labels for whether a ptm sequence interacts with another sequence, it is because of conflicting MIs: True


Unnamed: 0,seq_sort_og,ptm_og_binds_bo_mi,ptm_mi,ptm_og_binds_bo_mi_unique_conflicting,ptm_desc
0,ARTKQTARKSTGGKAPRKQLA_MSLQMVTVGHNIALIQPGFSLMNF...,no|no|unknown|unknown|yes|yes|yes|yes|yes|yes|...,MI:0638|MI:0638|MI:0925|MI:0925|MI:1224|MI:122...,"no,yes",prerequisite-ptm|prerequisite-ptm|observed-ptm...
1,MAASRRSQHHHHHHQQQLQPAPGASAPPPPPPPPLSPGLAPGTTPA...,no|yes,MI:0638|MI:0639,"no,yes",prerequisite-ptm|resulting-ptm
2,MANIAVQRIKREFKEVLKSEETSKNQIKVDLVDENFTELRGEIAGP...,no|yes,MI:0638|MI:1224,"no,yes",prerequisite-ptm|ptm increasing an interaction
3,MAVPFVEDWDLVQTLGEGAYGEVQLAVNRVTEEAVAVKIVDMKRAV...,no|yes,MI:0638|MI:0639,"no,yes",prerequisite-ptm|resulting-ptm
4,MAVTITLKTLQQQTFKIRMEPDETVKVLKEKIEAEKGRDAFPVAGQ...,no|yes|yes|yes|yes,MI:0638|MI:1224|MI:1224|MI:1224|MI:1224,"no,yes",prerequisite-ptm|ptm increasing an interaction...


In [607]:
# get unique groupings of ptm effects
# Take an explicit copy so the assignments below do not write to a slice of temp
# (this is what raised SettingWithCopyWarning)
temp3 = temp[["seq_sort_og","ptm_mi", "ptm_desc"]].copy()
# Distinct MI codes of the pair: "|" separates entries, "," separates codes within an entry
temp3["ptm_unique_mi"] = temp3["ptm_mi"].apply(
    lambda x: ",".join(sorted(mi for mi in set(x.replace("|", ",").split(",")) if mi != "nan"))
)
# Drop pairs that have no MI code at all
temp3 = temp3.loc[
    (temp3["ptm_unique_mi"]!="nan") & 
    (temp3["ptm_unique_mi"]!="")
].reset_index(drop=True)
# Hyphenate "no effect" so it survives the whitespace split below as a single keyword
temp3["ptm_desc"] = temp3["ptm_desc"].fillna("").apply(lambda x: x.replace("no effect","no-effect"))
keywords = set(["no-effect","causing","increasing","decreasing","disrupting","prerequisite-ptm","resulting-ptm"])
# Extract keywords, ignoring comma vs pipe grouping
temp3["ptm_keywords"] = temp3["ptm_desc"].apply(
    lambda x: ",".join(
        sorted(
            set(
                x.replace("|", ",").replace(",", " ").split()  # split on separators and whitespace
            ).intersection(keywords)
        )
    )
)
print(temp3["ptm_keywords"].value_counts())

# Checked on the column itself: the column naming of value_counts().reset_index() differs between pandas versions
test1 = temp3["ptm_keywords"].str.contains(",").all()
print(f"\nEvery sequence pair where we have conflicting binding labels has at least two different binding-related keywords (from: disrupting,decreasing,no-effect,increasing,causing): {test1}")

ptm_keywords
increasing,prerequisite-ptm       3
prerequisite-ptm,resulting-ptm    3
Name: count, dtype: int64

Every sequence pair where we have conflicting binding labels has at least two different binding-related keywords (from: disrupting,decreasing,no-effect,increasing,causing): True


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp3["ptm_unique_mi"] = temp3["ptm_mi"].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp3["ptm_unique_mi"] = temp3["ptm_mi"].apply(


In [608]:
# investigate what makes these rows conflict
# do they have MIs that are different? 
# Select pairs labelled both "yes" and "no"; each label is tested on its own because the order
# of labels in the joined string is not guaranteed (word boundaries skip the "no" in "unknown")
test1 = gb_new.loc[
    (gb_new["decisive_entry_new_binds"].str.contains(r"\bno\b")) & 
    (gb_new["decisive_entry_new_binds"].str.contains(r"\byes\b"))
    ]["seq_sort_new"].unique().tolist()
temp = merged_expl_ptm_filt.loc[
    merged_expl_ptm_filt["seq_sort_new"].isin(test1)
].reset_index(drop=True)
# Combine both partners' ptm MI codes into one comma-separated string per row
temp["ptm_mi"] = temp["ptm_mi_1"].fillna("") + "," + temp["ptm_mi_2"].fillna("")
temp["ptm_mi"] = temp["ptm_mi"].fillna("").str.strip(",")
# One row per pair; "|" separates the values contributed by the individual entries
temp = temp.groupby("seq_sort_new").agg(
    ptm_new_binds_bo_mi=("ptm_new_binds_bo_mi", lambda x: "|".join([str(y) for y in list(x)])),
    ptm_mi=("ptm_mi", lambda x: "|".join([str(y) for y in list(x)])),
).reset_index()
# Which of the decisive MI-derived labels ("no", "yes") occur for the pair, sorted and de-duplicated
temp["ptm_new_binds_bo_mi_unique_conflicting"] = temp["ptm_new_binds_bo_mi"].apply(
    lambda x: ",".join(
        sorted(
            set(
                " ".join(  # join all descriptions
                    x.replace("|", ",").split(",")
                ).split()  # split on whitespace
            ).intersection(set(["no","yes"]))
        )
    )
)
# The column above is sorted, so a pair with both labels reads exactly "no,yes"
test1 = bool(temp["ptm_new_binds_bo_mi_unique_conflicting"].eq("no,yes").all())

print(f"There are {len(temp)} unique sequence pairs (where at least one seq is ptm) where there are conflicting labels about whether or not binding has occurred.")
print(f"\tIn every case where we have contradicting labels for whether a ptm sequence interacts with another sequence, it is because of conflicting MIs: {test1}")
# Description = text inside the first parentheses of the feature string, or the whole string if there are none
ptm_mis_labeled["description"] = ptm_mis_labeled["feature"].apply(lambda x: x.split("(")[1].split(")")[0] if "(" in x else x)
ptm_mis_labeled_mi_desc_dict = dict(zip(ptm_mis_labeled["mi"],ptm_mis_labeled["description"]))

temp["ptm_desc"] = temp["ptm_mi"].apply(lambda x: mi_to_desc_string(x,ptm_mis_labeled_mi_desc_dict))

temp.sort_values(by=["seq_sort_new"]).to_csv("contradicting_ptm_labels_grouped_by_seq_new_nov19_2025.csv",index=False)
temp.loc[temp["ptm_mi"].str.contains("\\|")].head()


There are 12 unique sequence pairs (where at least one seq is ptm) where there are conflicting labels about whether or not binding has occurred.
	In every case where we have contradicting labels for whether a ptm sequence interacts with another sequence, it is because of conflicting MIs: True


Unnamed: 0,seq_sort_new,ptm_new_binds_bo_mi,ptm_mi,ptm_new_binds_bo_mi_unique_conflicting,ptm_desc
0,"ART<psi-mi:""MI:0165""(N6-methyl-L-lysine)>QTARK...",yes|yes|no|no|no,MI:1223|MI:1223|MI:1225|MI:1225|MI:1225,"no,yes",ptm decreasing an interaction|ptm decreasing a...
1,"ART<psi-mi:""MI:0165""(N6-methyl-L-lysine)>QTARK...",yes|yes|yes|no|no,MI:1223|MI:1223|MI:1223|MI:1225|MI:1225,"no,yes",ptm decreasing an interaction|ptm decreasing a...
2,"ART<psi-mi:""MI:0166""(N6,N6-dimethyl-L-lysine)>...",yes|no,MI:1223|MI:1225,"no,yes",ptm decreasing an interaction|ptm disrupting a...
3,"ART<psi-mi:""MI:0166""(N6,N6-dimethyl-L-lysine)>...",yes|yes|no|no|no|no|no,MI:1223|MI:1223|MI:1225|MI:1225|MI:1225|MI:122...,"no,yes",ptm decreasing an interaction|ptm decreasing a...
4,"ART<psi-mi:""MI:0166""(N6,N6-dimethyl-L-lysine)>...",yes|no,MI:1223|MI:1225,"no,yes",ptm decreasing an interaction|ptm disrupting a...


In [609]:
# get unique groupings of ptm effects
# Take an explicit copy so the assignments below do not write to a slice of temp
# (this is what raised SettingWithCopyWarning)
temp3 = temp[["seq_sort_new","ptm_mi", "ptm_desc"]].copy()
# Distinct MI codes of the pair: "|" separates entries, "," separates codes within an entry
temp3["ptm_unique_mi"] = temp3["ptm_mi"].apply(
    lambda x: ",".join(sorted(mi for mi in set(x.replace("|", ",").split(",")) if mi != "nan"))
)
# Drop pairs that have no MI code at all
temp3 = temp3.loc[
    (temp3["ptm_unique_mi"]!="nan") & 
    (temp3["ptm_unique_mi"]!="")
].reset_index(drop=True)
# Hyphenate "no effect" so it survives the whitespace split below as a single keyword
temp3["ptm_desc"] = temp3["ptm_desc"].fillna("").apply(lambda x: x.replace("no effect","no-effect"))
keywords = set(["no-effect","causing","increasing","decreasing","disrupting","prerequisite-ptm","resulting-ptm"])
# Extract keywords, ignoring comma vs pipe grouping
temp3["ptm_keywords"] = temp3["ptm_desc"].apply(
    lambda x: ",".join(
        sorted(
            set(
                x.replace("|", ",").replace(",", " ").split()  # split on separators and whitespace
            ).intersection(keywords)
        )
    )
)
print(temp3["ptm_keywords"].value_counts())

# Checked on the column itself: the column naming of value_counts().reset_index() differs between pandas versions
test1 = temp3["ptm_keywords"].str.contains(",").all()
print(f"\nEvery sequence pair where we have conflicting binding labels has at least two different binding-related keywords (from: disrupting,decreasing,no-effect,increasing,causing): {test1}")

ptm_keywords
decreasing,disrupting    11
disrupting,increasing     1
Name: count, dtype: int64

Every sequence pair where we have conflicting binding labels has at least two different binding-related keywords (from: disrupting,decreasing,no-effect,increasing,causing): True


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp3["ptm_unique_mi"] = temp3["ptm_mi"].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp3["ptm_unique_mi"] = temp3["ptm_mi"].apply(


In [610]:
# investigate what makes these rows conflict
# do they have MIs that are different? 
# Select pairs labelled both "yes" and "no"; each label is tested on its own because the order
# of labels in the joined string is not guaranteed (word boundaries skip the "no" in "unknown")
test1 = gb_neg_og.loc[
    (gb_neg_og["decisive_entry_og_binds"].str.contains(r"\bno\b")) & 
    (gb_neg_og["decisive_entry_og_binds"].str.contains(r"\byes\b"))
    ]["seq_sort_og"].unique().tolist()
temp = merged_neg_expl_ptm_filt.loc[
    merged_neg_expl_ptm_filt["seq_sort_og"].isin(test1)
].reset_index(drop=True)
if len(temp)>0:
    # Combine both partners' ptm MI codes into one comma-separated string per row
    temp["ptm_mi"] = temp["ptm_mi_1"].fillna("") + "," + temp["ptm_mi_2"].fillna("")
    temp["ptm_mi"] = temp["ptm_mi"].fillna("").str.strip(",")
    # One row per pair; "|" separates the values contributed by the individual entries
    temp = temp.groupby("seq_sort_og").agg(
        ptm_og_binds_bo_mi=("ptm_og_binds_bo_mi", lambda x: "|".join([str(y) for y in list(x)])),
        ptm_mi=("ptm_mi", lambda x: "|".join([str(y) for y in list(x)])),
    ).reset_index()
    # Which of the decisive MI-derived labels ("no", "yes") occur for the pair, sorted and de-duplicated
    temp["ptm_og_binds_bo_mi_unique_conflicting"] = temp["ptm_og_binds_bo_mi"].apply(
        lambda x: ",".join(
            sorted(
                set(
                    " ".join(  # join all descriptions
                        x.replace("|", ",").split(",")
                    ).split()  # split on whitespace
                ).intersection(set(["no","yes"]))
            )
        )
    )
    # The column above is sorted, so a pair with both labels reads exactly "no,yes"
    test1 = bool(temp["ptm_og_binds_bo_mi_unique_conflicting"].eq("no,yes").all())

    print(f"There are {len(temp)} unique sequence pairs (where at least one seq is ptm) where there are conflicting labels about whether or not binding has occurred.")
    print(f"\tIn every case where we have contradicting labels for whether a ptm sequence interacts with another sequence, it is because of conflicting MIs: {test1}")
    # Description = text inside the first parentheses of the feature string, or the whole string if there are none
    ptm_mis_labeled["description"] = ptm_mis_labeled["feature"].apply(lambda x: x.split("(")[1].split(")")[0] if "(" in x else x)
    ptm_mis_labeled_mi_desc_dict = dict(zip(ptm_mis_labeled["mi"],ptm_mis_labeled["description"]))

    temp["ptm_desc"] = temp["ptm_mi"].apply(lambda x: mi_to_desc_string(x,ptm_mis_labeled_mi_desc_dict))

    # Separate "fromnegs" file name: the positive-set summary is written to
    # contradicting_ptm_labels_grouped_by_seq_og_nov19_2025.csv and must not be overwritten here
    temp.sort_values(by=["seq_sort_og"]).to_csv("contradicting_ptm_labels_grouped_by_seq_og_fromnegs_nov19_2025.csv",index=False)
    # A bare expression inside an if-block is not rendered by the notebook, so display it explicitly
    display(temp.loc[temp["ptm_mi"].str.contains("\\|")].head())


There are 6 unique sequence pairs (where at least one seq is ptm) where there are conflicting labels about whether or not binding has occurred.
	In every case where we have contradicting labels for whether a ptm sequence interacts with another sequence, it is because of conflicting MIs: True


In [611]:
if len(temp)>0:
    # For each conflicting sequence pair (keyed on the original sequences), list
    # the distinct PTM MI codes and the distinct PTM effect keywords that occur
    # anywhere in its pooled "ptm_mi" / "ptm_desc" strings.
    # .copy() gives an independent frame, so the column assignments below do not
    # write into a slice of `temp` (this is what raised SettingWithCopyWarning).
    temp3 = temp[["seq_sort_og","ptm_mi", "ptm_desc"]].copy()

    # Treat both "|" and "," as delimiters, drop literal "nan" tokens,
    # de-duplicate and sort. (A redundant first assignment that only split on
    # "," and was immediately overwritten has been removed.)
    temp3["ptm_unique_mi"] = temp3["ptm_mi"].apply(
        lambda s: ",".join(
            sorted(tok for tok in set(s.replace("|", ",").split(",")) if tok != "nan")
        )
    )
    # Keep only the pairs that carry at least one MI code.
    temp3 = temp3.loc[
        (temp3["ptm_unique_mi"]!="nan") &
        (temp3["ptm_unique_mi"]!="")
    ].reset_index(drop=True)
    # Hyphenate "no effect" so it survives the whitespace split below as a single token.
    temp3["ptm_desc"] = temp3["ptm_desc"].fillna("").apply(lambda x: x.replace("no effect","no-effect"))
    keywords = set(["no-effect","causing","increasing","decreasing","disrupting","prerequisite-ptm","resulting-ptm"])
    # Extract keywords, ignoring comma vs pipe grouping: turn both delimiters into
    # whitespace, split into words, and keep only the words that are keywords.
    temp3["ptm_keywords"] = temp3["ptm_desc"].apply(
        lambda x: ",".join(
            sorted(
                set(
                    " ".join(  # join all descriptions
                        x.replace("|", ",").split(",")
                    ).split()  # split on whitespace
                ).intersection(keywords)
            )
        )
    )
    print(temp3["ptm_keywords"].value_counts())

    # A "," in the joined keyword string means at least two distinct keywords were
    # found. Checking the column directly is equivalent to checking its unique
    # values and does not rely on how value_counts().reset_index() names columns.
    test1 = temp3["ptm_keywords"].str.contains(",").all()
    print(f"\nEvery sequence pair where we have conflicting binding labels has at least two different binding-related keywords (from: disrupting,decreasing,no-effect,increasing,causing): {test1}")

ptm_keywords
increasing,prerequisite-ptm       3
prerequisite-ptm,resulting-ptm    3
Name: count, dtype: int64

Every sequence pair where we have conflicting binding labels has at least two different binding-related keywords (from: disrupting,decreasing,no-effect,increasing,causing): True


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp3["ptm_unique_mi"] = temp3["ptm_mi"].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp3["ptm_unique_mi"] = temp3["ptm_mi"].apply(


In [612]:
# Look into why some sequence pairs (keyed on the new sequences) carry both a
# "yes" and a "no" binding label: do their MI annotations differ?
test1 = (
    gb_neg_new.loc[
        gb_neg_new["decisive_entry_new_binds"].str.contains("no,yes")
        | gb_neg_new["decisive_entry_new_binds"].str.contains("yes,no")
    ]["seq_sort_new"]
    .unique()
    .tolist()
)
temp = merged_neg_expl_ptm_filt[
    merged_neg_expl_ptm_filt["seq_sort_new"].isin(test1)
].reset_index(drop=True)
if len(temp)>0:
    # One combined MI string per row: "<ptm_mi_1>,<ptm_mi_2>" with missing sides
    # replaced by "" and any leading/trailing commas removed.
    temp["ptm_mi"] = (
        temp["ptm_mi_1"].fillna("") + "," + temp["ptm_mi_2"].fillna("")
    ).fillna("").str.strip(",")

    # Collapse to one row per sequence pair, pipe-joining the per-row strings.
    _pipe_join = lambda values: "|".join(map(str, values))
    temp = temp.groupby("seq_sort_new").agg(
        ptm_new_binds_bo_mi=("ptm_new_binds_bo_mi", _pipe_join),
        ptm_mi=("ptm_mi", _pipe_join),
    ).reset_index()

    # Which of "no" / "yes" appear as whole words anywhere in the pooled labels.
    temp["ptm_new_binds_bo_mi_unique_conflicting"] = temp["ptm_new_binds_bo_mi"].apply(
        lambda pooled: ",".join(
            sorted({"no", "yes"} & set(pooled.replace("|", ",").replace(",", " ").split()))
        )
    )
    _conflict_col = temp["ptm_new_binds_bo_mi_unique_conflicting"]
    # True when every grouped pair shows both labels.
    test1 = bool((_conflict_col.str.contains("no,yes") | _conflict_col.str.contains("yes,no")).all())

    print(f"There are {len(temp)} unique sequence pairs (where at least one seq is ptm) where there are conflicting labels about whether or not binding has occurred.")
    print(f"\tIn every case where we have contradicting labels for whether a ptm sequence interacts with another sequence, it is because of conflicting MIs: {test1}")

    # Description = text inside the first pair of parentheses of the feature
    # string, or the whole string when it has no "(".
    ptm_mis_labeled["description"] = ptm_mis_labeled["feature"].apply(
        lambda feat: feat.split("(")[1].split(")")[0] if "(" in feat else feat
    )
    ptm_mis_labeled_mi_desc_dict = dict(zip(ptm_mis_labeled["mi"],ptm_mis_labeled["description"]))

    temp["ptm_desc"] = temp["ptm_mi"].apply(
        lambda mi_string: mi_to_desc_string(mi_string, ptm_mis_labeled_mi_desc_dict)
    )

    temp.sort_values(by=["seq_sort_new"]).to_csv("contradicting_ptm_labels_grouped_by_seq_new_nov19_2025.csv",index=False)
    # Rows whose pooled MI string spans more than one entry (not displayed,
    # because this expression sits inside the if-block).
    temp[temp["ptm_mi"].str.contains("\\|")].head()


There are 11 unique sequence pairs (where at least one seq is ptm) where there are conflicting labels about whether or not binding has occurred.
	In every case where we have contradicting labels for whether a ptm sequence interacts with another sequence, it is because of conflicting MIs: True


In [613]:
if len(temp)>0:
    # For each conflicting sequence pair (keyed on the new sequences), list the
    # distinct PTM MI codes and the distinct PTM effect keywords that occur
    # anywhere in its pooled "ptm_mi" / "ptm_desc" strings.
    # .copy() gives an independent frame, so the column assignments below do not
    # write into a slice of `temp` (this is what raised SettingWithCopyWarning).
    temp3 = temp[["seq_sort_new","ptm_mi", "ptm_desc"]].copy()

    # Treat both "|" and "," as delimiters, drop literal "nan" tokens,
    # de-duplicate and sort. (A redundant first assignment that only split on
    # "," and was immediately overwritten has been removed.)
    temp3["ptm_unique_mi"] = temp3["ptm_mi"].apply(
        lambda s: ",".join(
            sorted(tok for tok in set(s.replace("|", ",").split(",")) if tok != "nan")
        )
    )
    # Keep only the pairs that carry at least one MI code.
    temp3 = temp3.loc[
        (temp3["ptm_unique_mi"]!="nan") &
        (temp3["ptm_unique_mi"]!="")
    ].reset_index(drop=True)
    # Hyphenate "no effect" so it survives the whitespace split below as a single token.
    temp3["ptm_desc"] = temp3["ptm_desc"].fillna("").apply(lambda x: x.replace("no effect","no-effect"))
    keywords = set(["no-effect","causing","increasing","decreasing","disrupting","prerequisite-ptm","resulting-ptm"])
    # Extract keywords, ignoring comma vs pipe grouping: turn both delimiters into
    # whitespace, split into words, and keep only the words that are keywords.
    temp3["ptm_keywords"] = temp3["ptm_desc"].apply(
        lambda x: ",".join(
            sorted(
                set(
                    " ".join(  # join all descriptions
                        x.replace("|", ",").split(",")
                    ).split()  # split on whitespace
                ).intersection(keywords)
            )
        )
    )
    print(temp3["ptm_keywords"].value_counts())

    # A "," in the joined keyword string means at least two distinct keywords were
    # found. Checking the column directly is equivalent to checking its unique
    # values and does not rely on how value_counts().reset_index() names columns.
    test1 = temp3["ptm_keywords"].str.contains(",").all()
    print(f"\nEvery sequence pair where we have conflicting binding labels has at least two different binding-related keywords (from: disrupting,decreasing,no-effect,increasing,causing): {test1}")

ptm_keywords
decreasing,disrupting    11
Name: count, dtype: int64

Every sequence pair where we have conflicting binding labels has at least two different binding-related keywords (from: disrupting,decreasing,no-effect,increasing,causing): True


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp3["ptm_unique_mi"] = temp3["ptm_mi"].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp3["ptm_unique_mi"] = temp3["ptm_mi"].apply(


In [614]:
# Distribution of the pooled PTM binding labels ("all_*_binds") in the positive table.
print(
    f"From positive database:",
    "Value counts for PTM all_new_binds",
    merged_expl_ptm_filt["PTM all_new_binds"].value_counts(),
    "\nValue counts for PTM all_og_binds",
    merged_expl_ptm_filt["PTM all_og_binds"].value_counts(),
    sep="\n",
)

From positive database:
Value counts for PTM all_new_binds
PTM all_new_binds
unknown        5255
unknown,yes    1268
no,unknown       79
yes               1
Name: count, dtype: int64

Value counts for PTM all_og_binds
PTM all_og_binds
unknown           4832
no,unknown         981
unknown,yes        776
no,unknown,yes      11
yes                  2
no                   1
Name: count, dtype: int64


In [615]:
# Distribution of the pooled PTM binding labels ("all_*_binds") in the negative table.
print(
    f"From negative database:",
    "Value counts for PTM all_new_binds",
    merged_neg_expl_ptm_filt["PTM all_new_binds"].value_counts(),
    "\nValue counts for PTM all_og_binds",
    merged_neg_expl_ptm_filt["PTM all_og_binds"].value_counts(),
    sep="\n",
)

From negative database:
Value counts for PTM all_new_binds
PTM all_new_binds
unknown        4702
unknown,yes    1159
no,unknown       74
yes               1
Name: count, dtype: int64

Value counts for PTM all_og_binds
PTM all_og_binds
unknown           4318
no,unknown         900
unknown,yes        704
no,unknown,yes      11
yes                  2
no                   1
Name: count, dtype: int64


In [616]:
# Distribution of the per-entry decisive PTM binding labels in the positive table.
print(
    "From positive database:",
    "\nValue counts for PTM decisive_entry_new_binds",
    merged_expl_ptm_filt["PTM decisive_entry_new_binds"].value_counts(),
    "\nValue counts for PTM decisive_entry_og_binds",
    merged_expl_ptm_filt["PTM decisive_entry_og_binds"].value_counts(),
    sep="\n",
)

From positive database:

Value counts for PTM decisive_entry_new_binds
PTM decisive_entry_new_binds
unknown    5255
yes        1269
no           79
Name: count, dtype: int64

Value counts for PTM decisive_entry_og_binds
PTM decisive_entry_og_binds
unknown    4843
no          982
yes         778
Name: count, dtype: int64


In [617]:
# Distribution of the per-entry decisive PTM binding labels in the negative table.
print(
    "From negative database:",
    "\nValue counts for PTM decisive_entry_new_binds",
    merged_neg_expl_ptm_filt["PTM decisive_entry_new_binds"].value_counts(),
    "\nValue counts for PTM decisive_entry_og_binds",
    merged_neg_expl_ptm_filt["PTM decisive_entry_og_binds"].value_counts(),
    sep="\n",
)

From negative database:

Value counts for PTM decisive_entry_new_binds
PTM decisive_entry_new_binds
unknown    4702
yes        1160
no           74
Name: count, dtype: int64

Value counts for PTM decisive_entry_og_binds
PTM decisive_entry_og_binds
unknown    4329
no          901
yes         706
Name: count, dtype: int64


In [618]:
# Distribution of the per-sequence-pair decisive label (original sequences), positive table.
print(
    "From positive database:",
    "\nValue counts for PTM decisive_seqpair_og_binds",
    merged_expl_ptm_filt["PTM decisive_seqpair_og_binds"].value_counts(),
    sep="\n",
)

From positive database:

Value counts for PTM decisive_seqpair_og_binds
PTM decisive_seqpair_og_binds
unknown    4446
no         1247
yes         910
Name: count, dtype: int64


In [619]:
# Distribution of the per-sequence-pair decisive label (original sequences), negative table.
print(
    "From negative database:",
    "\nValue counts for PTM decisive_seqpair_og_binds",
    merged_neg_expl_ptm_filt["PTM decisive_seqpair_og_binds"].value_counts(),
    sep="\n",
)

From negative database:

Value counts for PTM decisive_seqpair_og_binds
PTM decisive_seqpair_og_binds
unknown    3990
no         1131
yes         815
Name: count, dtype: int64


In [620]:
# Distribution of the per-sequence-pair decisive label (new sequences), positive table.
print(
    "From positive database:",
    "\nValue counts for PTM decisive_seqpair_new_binds",
    merged_expl_ptm_filt["PTM decisive_seqpair_new_binds"].value_counts(),
    sep="\n",
)

From positive database:

Value counts for PTM decisive_seqpair_new_binds
PTM decisive_seqpair_new_binds
unknown    5101
yes        1432
no           70
Name: count, dtype: int64


In [621]:
# Distribution of the per-sequence-pair decisive label (new sequences), negative table.
print(
    "From negative database:",
    "\nValue counts for PTM decisive_seqpair_new_binds",
    merged_neg_expl_ptm_filt["PTM decisive_seqpair_new_binds"].value_counts(),
    sep="\n",
)

From negative database:

Value counts for PTM decisive_seqpair_new_binds
PTM decisive_seqpair_new_binds
unknown    4565
yes        1306
no           65
Name: count, dtype: int64


In [635]:
# perfect! now that we have PTM decisive_seqpair_new_binds and PTM decisive_seqpair_og_binds, we can come up with some positive and negative-binding pairs from here
# first let's save all of this
import os
os.makedirs("data_files/processed/intact/clean",exist_ok=True)
# Columns written to the cleaned positive PTM table.
# "PTM decisive_seqpair_new_binds" was missing from this list even though the
# note above says both seq-pair decisive columns are being saved; it is now
# exported next to "PTM decisive_seqpair_og_binds". All other columns and their
# order are unchanged.
merged_expl_ptm_filt[[
"interaction_intactid",
"unique_id",
"seq_pair_id",
"PTM Partner",
"PTM decisive_seqpair_og_binds",
"PTM decisive_seqpair_new_binds",
"PTM decisive_entry_new_binds",
"PTM decisive_entry_og_binds",
"PTM all_new_binds",
"PTM all_og_binds",
"agg_ptm_has_info",
"PTM Affected protein AC",
"PTM # Feature AC",
"PTM Feature annotation(s)",
"PTM Feature short label",
"PTM Feature type",
"PTM Figure legend(s)",
"PTM Interaction AC",
"PTM Interaction participants",
"PTM Interactor Matches",
"PTM Original sequence",
"PTM PubMedID",
"PTM Resulting sequence",
"PTM Xref ID(s)",
"PTM new_binds_bo_annotation",
"PTM new_binds_bo_feature_type",
"PTM og_binds_bo_annotation",
"PTM og_binds_bo_feature_type",
"aa_1",
"aa_2",
"intactid_1",
"intactid_2",
"dip_1",
"dip_2",
"length_1",
"length_2",
"ptm_begin_1",
"ptm_begin_2",
"ptm_end_1",
"ptm_end_2",
"ptm_mi_1",
"ptm_mi_2",
"ptm_name_1",
"ptm_name_2",
"ptm_new_1",
"ptm_new_2",
"ptm_new_binds_bo_mi",
"ptm_og_binds_bo_mi",
"ptm_orig_1",
"ptm_orig_2",
"ptm_range_1",
"ptm_range_2",
"ptm_short_1",
"ptm_short_2",
"ptm_aa_1",
"ptm_aa_2",
"uniprot_A",
"uniprot_A_intact",
"uniprot_B",
"uniprot_B_intact",
"unique_expansions",
"unique_uniprot_pair"
]].to_csv("data_files/processed/intact/clean/ptms_dec11_2025.csv",index=False)
# Cell output: rows whose sequence pair (original sequences) is decisively labelled as binding.
merged_expl_ptm_filt.loc[
    merged_expl_ptm_filt["PTM decisive_seqpair_og_binds"]=="yes"
]

Unnamed: 0,ptm_begin_1,ptm_begin_2,ptm_end_1,ptm_end_2,ptm_mi_1,ptm_mi_2,ptm_name_1,ptm_name_2,ptm_new_1,ptm_new_2,...,PTM all_new_binds,PTM all_og_binds,PTM decisive_entry_new_binds,PTM decisive_entry_og_binds,seq_sort_og,seq_sort_new,seq_sort_og_id,PTM decisive_seqpair_og_binds,seq_sort_new_id,PTM decisive_seqpair_new_binds
0,1,,1,,MI:0639,,"resulting-ptm,guanylated residue",,,,...,unknown,"unknown,yes",unknown,yes,SADAQSFLNRVCGVSAARLTPCGTGTSTDVVYRAFDIYNDKVAGFA...,"<psi-mod:""MOD:01163""(guanylated residue)>KMSDV...",seq_sort_og_2539,yes,seq_sort_new_35,unknown
1,1,,1,,MI:0639,,"resulting-ptm,uridylated residue",,,,...,unknown,"unknown,yes",unknown,yes,SADAQSFLNRVCGVSAARLTPCGTGTSTDVVYRAFDIYNDKVAGFA...,"<psi-mod:""MOD:01166""(uridylated residue)>KMSDV...",seq_sort_og_2539,yes,seq_sort_new_37,unknown
5,13,,13,,MI:1224,,"ptm increasing an interaction,O4'-sulfo-L-tyro...",,,,...,"unknown,yes","unknown,yes",yes,yes,MGVLRVYVILILVGFCVQIVVVNSQNLTCNSNDLKALEGFMRGLES...,"<psi-mod:""MOD:00181""(O4'-sulfo-L-tyrosine)>I<p...",seq_sort_og_1871,yes,seq_sort_new_15,yes
10,100,,100,,MI:0639,,"resulting-ptm,N-acetylated L-lysine",,,,...,unknown,"unknown,yes",unknown,yes,MSDNGPQNQRNAPRITFGGPSDSTGSNQNGERSGARSKQRRPQGLP...,MSDNGPQNQRNAPRITFGGPSDSTGSNQNGERSGARSKQRRPQGLP...,seq_sort_og_2392,yes,seq_sort_new_4346,unknown
11,100,,100,,MI:0639,,"resulting-ptm,N-acetylated L-lysine",,,,...,unknown,"unknown,yes",unknown,yes,MAEPSQAPTPAPAAQPRPLQSPAPAPTPTPAPSPASAPIPTPTPAP...,MAEPSQAPTPAPAAQPRPLQSPAPAPTPTPAPSPASAPIPTPTPAP...,seq_sort_og_452,yes,seq_sort_new_796,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6452,,929,,929,,MI:0639,,"resulting-ptm,observed-ptm,uncategorized prote...",,,...,unknown,"unknown,yes",unknown,yes,MEDRLHMDNGLVPQKIVSVHLQDSTLKEVKDQVSNKQAQILEPKPE...,MEDRLHMDNGLVPQKIVSVHLQDSTLKEVKDQVSNKQAQILEPKPE...,seq_sort_og_1352,yes,seq_sort_new_2379,unknown
6507,,94,,94,,MI:0639,,"resulting-ptm,3-hydroxy-L-asparagine",,,...,unknown,"unknown,yes",unknown,yes,MAATAAEAVASGSGEPREEAGALGPAWDESQLRSYSFPTRPIPRLS...,MAATAAEAVASGSGEPREEAGALGPAWDESQLRSYSFPTRPIPRLS...,seq_sort_og_286,yes,seq_sort_new_520,unknown
6511,,95,,95,,MI:0639,,"resulting-ptm,methylated arginine",,,...,unknown,"unknown,yes",unknown,yes,MAAAEAANCIMENFVATLANGMSLQPPLEEVSCGQAESSEKPNAED...,MAAAEAANCIMENFVATLANGMSLQPPLEEVSCGQAESSEKPNAED...,seq_sort_og_198,yes,seq_sort_new_378,unknown
6512,,95,,95,,MI:0639,,"resulting-ptm,observed-ptm,uncategorized prote...",,,...,unknown,"unknown,yes",unknown,yes,MERPSLRALLLGAAGLLLLLLPLSSSSSSDTCGPCEPASCPPLPPL...,MERPSLRALLLGAAGLLLLLLPLSSSSSSDTCGPCEPASCPPLPPL...,seq_sort_og_1566,yes,seq_sort_new_2782,unknown


In [634]:
# perfect! now that we have PTM decisive_seqpair_new_binds and PTM decisive_seqpair_og_binds, we can come up with some positive and negative-binding pairs from here
# first let's save all of this
import os
os.makedirs("data_files/processed/intact/clean",exist_ok=True)
# Columns written to the cleaned negative PTM table.
# "PTM decisive_seqpair_new_binds" was missing from this list even though the
# note above says both seq-pair decisive columns are being saved; it is now
# exported next to "PTM decisive_seqpair_og_binds". All other columns and their
# order are unchanged.
merged_neg_expl_ptm_filt[[
"interaction_intactid",
"unique_id",
"seq_pair_id",
"PTM Partner",
"PTM decisive_seqpair_og_binds",
"PTM decisive_seqpair_new_binds",
"PTM decisive_entry_new_binds",
"PTM decisive_entry_og_binds",
"PTM all_new_binds",
"PTM all_og_binds",
"agg_ptm_has_info",
"PTM Affected protein AC",
"PTM # Feature AC",
"PTM Feature annotation(s)",
"PTM Feature short label",
"PTM Feature type",
"PTM Figure legend(s)",
"PTM Interaction AC",
"PTM Interaction participants",
"PTM Interactor Matches",
"PTM Original sequence",
"PTM PubMedID",
"PTM Resulting sequence",
"PTM Xref ID(s)",
"PTM new_binds_bo_annotation",
"PTM new_binds_bo_feature_type",
"PTM og_binds_bo_annotation",
"PTM og_binds_bo_feature_type",
"aa_1",
"aa_2",
"intactid_1",
"intactid_2",
"dip_1",
"dip_2",
"length_1",
"length_2",
"ptm_begin_1",
"ptm_begin_2",
"ptm_end_1",
"ptm_end_2",
"ptm_mi_1",
"ptm_mi_2",
"ptm_name_1",
"ptm_name_2",
"ptm_new_1",
"ptm_new_2",
"ptm_new_binds_bo_mi",
"ptm_og_binds_bo_mi",
"ptm_orig_1",
"ptm_orig_2",
"ptm_range_1",
"ptm_range_2",
"ptm_short_1",
"ptm_short_2",
"ptm_aa_1",
"ptm_aa_2",
"uniprot_A",
"uniprot_A_intact",
"uniprot_B",
"uniprot_B_intact",
"unique_expansions",
"unique_uniprot_pair"
]].to_csv("data_files/processed/intact/clean/ptms_neg_dec11_2025.csv",index=False)
# Cell output: rows whose sequence pair (original sequences) is decisively labelled as binding.
merged_neg_expl_ptm_filt.loc[
    merged_neg_expl_ptm_filt["PTM decisive_seqpair_og_binds"]=="yes"
]

Unnamed: 0,ptm_begin_1,ptm_begin_2,ptm_end_1,ptm_end_2,ptm_mi_1,ptm_mi_2,ptm_name_1,ptm_name_2,ptm_new_1,ptm_new_2,...,PTM all_new_binds,PTM all_og_binds,PTM decisive_entry_new_binds,PTM decisive_entry_og_binds,seq_sort_og,seq_sort_new,seq_sort_og_id,PTM decisive_seqpair_og_binds,seq_sort_new_id,PTM decisive_seqpair_new_binds
0,1,,1,,MI:0639,,"resulting-ptm,guanylated residue",,,,...,unknown,"unknown,yes",unknown,yes,SADAQSFLNRVCGVSAARLTPCGTGTSTDVVYRAFDIYNDKVAGFA...,"<psi-mod:""MOD:01163""(guanylated residue)>KMSDV...",seq_sort_og_2233,yes,seq_sort_new_30,unknown
1,1,,1,,MI:0639,,"resulting-ptm,uridylated residue",,,,...,unknown,"unknown,yes",unknown,yes,SADAQSFLNRVCGVSAARLTPCGTGTSTDVVYRAFDIYNDKVAGFA...,"<psi-mod:""MOD:01166""(uridylated residue)>KMSDV...",seq_sort_og_2233,yes,seq_sort_new_32,unknown
2,13,,13,,MI:1224,,"ptm increasing an interaction,O4'-sulfo-L-tyro...",,,,...,"unknown,yes","unknown,yes",yes,yes,MGVLRVYVILILVGFCVQIVVVNSQNLTCNSNDLKALEGFMRGLES...,"<psi-mod:""MOD:00181""(O4'-sulfo-L-tyrosine)>I<p...",seq_sort_og_1638,yes,seq_sort_new_12,yes
6,100,,100,,MI:0639,,"resulting-ptm,N-acetylated L-lysine",,,,...,unknown,"unknown,yes",unknown,yes,MSDNGPQNQRNAPRITFGGPSDSTGSNQNGERSGARSKQRRPQGLP...,MSDNGPQNQRNAPRITFGGPSDSTGSNQNGERSGARSKQRRPQGLP...,seq_sort_og_2100,yes,seq_sort_new_3902,unknown
7,100,,100,,MI:0639,,"resulting-ptm,N-acetylated L-lysine",,,,...,unknown,"unknown,yes",unknown,yes,MAEPSQAPTPAPAAQPRPLQSPAPAPTPTPAPSPASAPIPTPTPAP...,MAEPSQAPTPAPAAQPRPLQSPAPAPTPTPAPSPASAPIPTPTPAP...,seq_sort_og_400,yes,seq_sort_new_731,unknown
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5790,,929,,929,,MI:0639,,"resulting-ptm,observed-ptm,uncategorized prote...",,,...,unknown,"unknown,yes",unknown,yes,MEDRLHMDNGLVPQKIVSVHLQDSTLKEVKDQVSNKQAQILEPKPE...,MEDRLHMDNGLVPQKIVSVHLQDSTLKEVKDQVSNKQAQILEPKPE...,seq_sort_og_1181,yes,seq_sort_new_2140,unknown
5791,,929,,929,,MI:0639,,"resulting-ptm,observed-ptm,uncategorized prote...",,,...,unknown,"unknown,yes",unknown,yes,MEDRLHMDNGLVPQKIVSVHLQDSTLKEVKDQVSNKQAQILEPKPE...,MEDRLHMDNGLVPQKIVSVHLQDSTLKEVKDQVSNKQAQILEPKPE...,seq_sort_og_1181,yes,seq_sort_new_2140,unknown
5849,,95,,95,,MI:0639,,"resulting-ptm,methylated arginine",,,...,unknown,"unknown,yes",unknown,yes,MAAAEAANCIMENFVATLANGMSLQPPLEEVSCGQAESSEKPNAED...,MAAAEAANCIMENFVATLANGMSLQPPLEEVSCGQAESSEKPNAED...,seq_sort_og_184,yes,seq_sort_new_355,unknown
5850,,95,,95,,MI:0639,,"resulting-ptm,observed-ptm,uncategorized prote...",,,...,unknown,"unknown,yes",unknown,yes,MERPSLRALLLGAAGLLLLLLPLSSSSSSDTCGPCEPASCPPLPPL...,MERPSLRALLLGAAGLLLLLLPLSSSSSSDTCGPCEPASCPPLPPL...,seq_sort_og_1380,yes,seq_sort_new_2522,unknown


In [624]:
# Load the cleaned positive mutation table with every column as text, except
# the two sequence-length columns, which are integers.
mutations_clean_path = "/scratch/pranamlab/sophie/interactome/interactome/data_files/processed/intact/clean/mutations_dec11_2025.csv"
# Read only the header row (nrows=0) to learn the column names. The previous
# version parsed the whole file once just for this, with dtype inference on.
mutations_clean_dtypes = {k: "string" for k in pd.read_csv(mutations_clean_path, nrows=0).columns}
mutations_clean_dtypes["length_1"] = "int"
mutations_clean_dtypes["length_2"] = "int"
mutations_clean = pd.read_csv(mutations_clean_path, dtype=mutations_clean_dtypes)


def _flag_to_bool(value):
    """Convert one cell of a text-typed flag column to a real bool.

    Missing values (NaN / pd.NA / None) become False. Strings that spell a
    false or missing value ("False", "0", "", "nan", ...) become False; any
    other string becomes True. Plain ``bool(value)`` is not usable here because
    the columns are read as text and ``bool("False")`` is True.
    """
    if pd.isna(value):
        return False
    if isinstance(value, str):
        return value.strip().lower() not in {"", "false", "0", "0.0", "nan", "none", "<na>"}
    return bool(value)


# make the mut_has_info columns equal to bools when they aren't nans
should_be_bool = ['scraped_mut_has_info', 'scraped_mut_has_info_1', 'scraped_mut_has_info_2', 'agg_mut_has_info']
for x in should_be_bool:
    for df in [mutations_clean]:
        # Only convert the flag columns that are actually present in this table.
        if x in df:
            df[x] = df[x].apply(_flag_to_bool)

  mutations_clean = pd.read_csv("/scratch/pranamlab/sophie/interactome/interactome/data_files/processed/intact/clean/mutations_dec11_2025.csv")


In [625]:
# Load the cleaned negative mutation table with every column as text, except
# the two sequence-length columns, which are integers.
mutations_neg_clean_path = "/scratch/pranamlab/sophie/interactome/interactome/data_files/processed/intact/clean/mutations_neg_dec11_2025.csv"
# Read only the header row (nrows=0) to learn the column names. The previous
# version parsed the whole file once just for this, with dtype inference on.
mutations_neg_clean_dtypes = {k: "string" for k in pd.read_csv(mutations_neg_clean_path, nrows=0).columns}
mutations_neg_clean_dtypes["length_1"] = "int"
mutations_neg_clean_dtypes["length_2"] = "int"
mutations_neg_clean = pd.read_csv(mutations_neg_clean_path, dtype=mutations_neg_clean_dtypes)


def _flag_to_bool(value):
    """Convert one cell of a text-typed flag column to a real bool.

    Missing values (NaN / pd.NA / None) become False. Strings that spell a
    false or missing value ("False", "0", "", "nan", ...) become False; any
    other string becomes True. Plain ``bool(value)`` is not usable here because
    the columns are read as text and ``bool("False")`` is True.
    """
    if pd.isna(value):
        return False
    if isinstance(value, str):
        return value.strip().lower() not in {"", "false", "0", "0.0", "nan", "none", "<na>"}
    return bool(value)


# make the mut_has_info columns equal to bools when they aren't nans
should_be_bool = ['scraped_mut_has_info', 'scraped_mut_has_info_1', 'scraped_mut_has_info_2', 'agg_mut_has_info']
for x in should_be_bool:
    for df in [mutations_neg_clean]:
        # Only convert the flag columns that are actually present in this table.
        if x in df:
            df[x] = df[x].apply(_flag_to_bool)

In [626]:
# Unique IntAct interaction ids in each PTM table and each mutation table.
merged_expl_ptm_filt_interaction_intactids = merged_expl_ptm_filt["interaction_intactid"].unique().tolist()
merged_neg_expl_ptm_filt_interaction_intactids = merged_neg_expl_ptm_filt["interaction_intactid"].unique().tolist()
merged_expl_mut_filt_interaction_intactids = mutations_clean["interaction_intactid"].unique().tolist()
merged_neg_expl_mut_filt_interaction_intactids = mutations_neg_clean["interaction_intactid"].unique().tolist()

# How many interaction ids does each PTM table share with each mutation table?
# positive PTM vs. positive mutations
test1 = set(merged_expl_ptm_filt_interaction_intactids) & set(merged_expl_mut_filt_interaction_intactids)
print(f"From positive merged-ptm database: total intact ids that also have mutation annotations in mutations_clean= {len(test1)}")
# positive PTM vs. negative mutations
test1 = set(merged_expl_ptm_filt_interaction_intactids) & set(merged_neg_expl_mut_filt_interaction_intactids)
print(f"From positive merged-ptm database: total intact ids that also have mutation annotations in mutations_neg_clean= {len(test1)}")
# negative PTM vs. positive mutations
test1 = set(merged_neg_expl_ptm_filt_interaction_intactids) & set(merged_expl_mut_filt_interaction_intactids)
print(f"From negative merged-ptm database: total intact ids that also have mutation annotations in mutations_clean= {len(test1)}")
# negative PTM vs. negative mutations
test1 = set(merged_neg_expl_ptm_filt_interaction_intactids) & set(merged_neg_expl_mut_filt_interaction_intactids)
print(f"From negative merged-ptm database: total intact ids that also have mutation annotations in mutations_neg_clean= {len(test1)}")

From positive merged-ptm database: total intact ids that also have mutation annotations in mutations_clean= 598
From positive merged-ptm database: total intact ids that also have mutation annotations in mutations_neg_clean= 0
From negative merged-ptm database: total intact ids that also have mutation annotations in mutations_clean= 517
From negative merged-ptm database: total intact ids that also have mutation annotations in mutations_neg_clean= 0


In [627]:
# Names of the scraped mutation columns: one per (field, interactor side)
# combination, ordered field-by-field with side 1 before side 2.
scraped_mut_cols = [
    f"mutation_{field}_{side}"
    for field in ("begin", "end", "mi", "name", "new", "orig", "range", "short")
    for side in (1, 2)
]

In [628]:
# Interaction ids that appear in both the positive PTM table and the positive mutation table.
test1 = set(merged_expl_ptm_filt_interaction_intactids) & set(merged_expl_mut_filt_interaction_intactids)
# Keep only the scraped mutation / PTM columns that exist in `merged`.
rel_scraped_mut_cols = [col for col in scraped_mut_cols if col in merged.columns]
rel_scraped_ptm_cols = [col for col in scraped_ptm_cols if col in merged.columns]
# Rows of `merged` for those interactions, restricted to the id columns plus
# the scraped annotation columns.
temp = merged.loc[
    merged["interaction_intactid"].isin(test1),
    ["interaction_intactid","interaction_xml_id","year"] + rel_scraped_mut_cols + rel_scraped_ptm_cols,
].reset_index(drop=True)
print(f"There are {len(temp)} rows that have both mutation and PTM annotations.")
display(temp.head())
temp.to_csv("rows_with_ptm_and_mutation_dec2_2025.csv",index=False)

There are 605 rows that have both mutation and PTM annotations.


Unnamed: 0,interaction_intactid,interaction_xml_id,year,mutation_begin_1,mutation_begin_2,mutation_end_1,mutation_end_2,mutation_mi_1,mutation_mi_2,mutation_name_1,...,ptm_mi_1,ptm_mi_2,ptm_name_1,ptm_name_2,ptm_new_1,ptm_new_2,ptm_orig_1,ptm_orig_2,ptm_short_1,ptm_short_2
0,EBI-15986772,29,2012,,"77,422,413,395,366,338,257,221|552,529,507,453",,"77,422,413,395,366,338,257,221|552,529,507,453",,MI:1129|MI:1129,,...,,MI:0925,,"observed-ptm,O-phospho-L-threonine",,,,,,pT257
1,EBI-21956996,56,2019,16|17,,16|17,,MI:2226|MI:1130,,mutation with no effect|mutation decreasing in...,...,MI:0639,,"resulting-ptm,O-phospho-L-threonine",,,,,,thr-17,
2,EBI-21955812,49,2019,16|17,,16|17,,MI:1129|MI:1130,,mutation disrupting interaction rate|mutation ...,...,MI:0639,,"resulting-ptm,O-phospho-L-serine",,,,,,ser-16,
3,EBI-10900650,23,2014,,422|418|418|418|418,,422|418|418|418|418,,MI:1133|MI:1128|MI:1132|MI:1133|MI:1132,,...,,MI:0925,,"observed-ptm,O-phospho-L-serine",,,,,,ser-422
4,EBI-10900604,16,2014,67,,67,,MI:1129,,mutation disrupting interaction rate,...,,MI:0639,,"resulting-ptm,O-phospho-L-serine",,,,,,ser-422


In [629]:
# Interaction ids that appear in both the negative PTM table and the negative mutation table.
test1 = set(merged_neg_expl_ptm_filt_interaction_intactids) & set(merged_neg_expl_mut_filt_interaction_intactids)
# Keep only the scraped mutation / PTM columns that exist in `merged_neg`.
rel_scraped_mut_cols = [col for col in scraped_mut_cols if col in merged_neg.columns]
rel_scraped_ptm_cols = [col for col in scraped_ptm_cols if col in merged_neg.columns]
# Rows of `merged_neg` for those interactions, restricted to the id columns
# plus the scraped annotation columns.
temp = merged_neg.loc[
    merged_neg["interaction_intactid"].isin(test1),
    ["interaction_intactid","interaction_xml_id","year"] + rel_scraped_mut_cols + rel_scraped_ptm_cols,
].reset_index(drop=True)
print(f"There are {len(temp)} rows that have both mutation and PTM annotations.")
display(temp.head())
temp.to_csv("rows_with_ptm_and_mutation_neg_dec2_2025.csv",index=False)

There are 0 rows that have both mutation and PTM annotations.


Unnamed: 0,interaction_intactid,interaction_xml_id,year,mutation_begin_1,mutation_begin_2,mutation_end_1,mutation_end_2,mutation_mi_1,mutation_mi_2,mutation_name_1,...,ptm_mi_1,ptm_mi_2,ptm_name_1,ptm_name_2,ptm_new_1,ptm_new_2,ptm_orig_1,ptm_orig_2,ptm_short_1,ptm_short_2


In [630]:
# Binding-feature columns: five fields for interactor 1, then the same five for interactor 2.
binding_cols = [
    f"binding_{field}_{side}"
    for side in (1, 2)
    for field in ("mi", "name", "short", "begin", "end")
]

# Columns kept in the simplified interaction tables (binding columns last).
keepcols = [
    "Negative",
    "aa_1", "aa_2",
    "invalids_aa_1", "invalids_aa_2",
    "all_intact_A_sorted", "all_intact_B_sorted",
    "chain_seq_end_1", "chain_seq_end_2",
    "chain_seq_start_1", "chain_seq_start_2",
    "confidence_val_int",
    "ensg_1", "ensg_2",
    "ensp_1", "ensp_2",
    "enst_1", "enst_2",
    "equal_score_int",
    "gene_symbol_1", "gene_symbol_2",
    "go_1", "go_2",
    "dip_1", "dip_2",
    "host_cell_type_1", "host_cell_type_2",
    "host_compartment_1", "host_compartment_2",
    "host_label_full_1", "host_label_full_2",
    "host_label_short_1", "host_label_short_2",
    "host_taxid_1", "host_taxid_2",
    "host_tissue_1", "host_tissue_2",
    "intactid_1", "intactid_2",
    "interaction_detection_methods_sorted",
    "interaction_intactid",
    "interaction_label",
    "interaction_mi",
    "interaction_xml_id",
    "interpro_1", "interpro_2",
    "length_1", "length_2",
    "miscore",
    "mol_type_1", "mol_type_2",
    "no_uniprot_update_A", "no_uniprot_update_B",
    "primaryref_db_1", "primaryref_db_2",
    "primaryref_id_1", "primaryref_id_2",
    "protein_1", "protein_2",
    "pubmeds",
    "reactome_1", "reactome_2",
    "rscbpdb_1", "rscbpdb_2",
    "seq_pair_id",
    "seq_sort",
    "species_label_1", "species_label_2",
    "species_taxid_1", "species_taxid_2",
    "uniprot_A",
    "uniprot_A_equalseq",
    "uniprot_A_equalseq_canonical",
    "uniprot_A_full",
    "uniprot_A_inseq",
    "uniprot_A_inseq_canonical",
    "uniprot_A_intact",
    "uniprot_A_noiso1",
    "uniprot_A_noisoforms",
    "uniprot_B",
    "uniprot_B_equalseq",
    "uniprot_B_equalseq_canonical",
    "uniprot_B_full",
    "uniprot_B_inseq",
    "uniprot_B_inseq_canonical",
    "uniprot_B_intact",
    "uniprot_B_noiso1",
    "uniprot_B_noisoforms",
    "uniprot_gene_name_A", "uniprot_gene_name_B",
    "uniprotkb_1", "uniprotkb_2",
    "unique_all_intact_sorted",
    "unique_expansions",
    "unique_id",
    "unique_score_int",
    "unique_scores",
    "unique_uniprot_noiso1_pair",
    "unique_uniprot_noisoforms_pair",
    "unique_uniprot_pair",
    "year",
] + binding_cols

# Each binding column gets an "all_" prefix in the simplified tables.
binding_col_change = {name: f"all_{name}" for name in binding_cols}

# Restrict to the kept columns, drop rows that are exact duplicates on them,
# renumber the index, then apply the "all_" prefix to the binding columns.
simplemerged = (
    merged[keepcols]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns=binding_col_change)
)
simplemerged_neg = (
    merged_neg[keepcols]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename(columns=binding_col_change)
)

print(f"Length of simplemerged: {len(simplemerged)}")
print(f"Length of simplemerged_neg: {len(simplemerged_neg)}")


Length of simplemerged: 744614
Length of simplemerged_neg: 744614


In [638]:
# Columns whose values get pipe-joined when rows are otherwise identical:
# a fixed set of interaction/host/reference columns plus the "all_binding_*" columns.
need_pipejoin = [
    "interaction_xml_id",
    "reactome_1", "reactome_2",
    "host_label_full_1", "host_label_full_2",
    "host_label_short_1", "host_label_short_2",
    "host_taxid_1", "host_taxid_2",
    "host_cell_type_1", "host_cell_type_2",
    "interaction_label", "interaction_mi", "pubmeds",
] + [
    f"all_binding_{field}_{side}"
    for side in (1, 2)
    for field in ("mi", "name", "short", "begin", "end")
]

simplemerged = harmonize_nulls_to_nan(simplemerged)

# Every remaining column is a grouping key; each pipe-join column is
# aggregated with join_unique_nonnull.
all_except_featac = [col for col in simplemerged.columns if col not in need_pipejoin]
agg_spec = dict.fromkeys(need_pipejoin, join_unique_nonnull)

display(simplemerged.head())
# dropna=False keeps groups whose key columns contain missing values.
grouped_simplemerged = simplemerged.groupby(all_except_featac, dropna=False, as_index=False)
simplemerged = grouped_simplemerged.agg(agg_spec)

print(f"Pipe-joined values in some columns that were unmeaningfully separating the same interaction. New pos PPI db size: {len(simplemerged)}")

display(simplemerged.head())

  out = out.replace({"": pd.NA, "None": pd.NA, "nan": pd.NA})


Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,all_binding_mi_1,all_binding_name_1,all_binding_short_1,all_binding_begin_1,all_binding_end_1,all_binding_mi_2,all_binding_name_2,all_binding_short_2,all_binding_begin_2,all_binding_end_2
0,False,MGFPRILSKNNKIYTKLGEFCLSGDSFWIVCHTCQEELQTQDQFWK...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,,,intact:EBI-101707,intact:EBI-100018,,,,...,,,,,,,,,,
1,False,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,MEIPIQVAVRIFPHRELKDLLRSFGPTEPKKDAQAVDEGADSKDSE...,,,intact:EBI-100018,intact:EBI-102069,,,,...,MI:0117,binding-associated region,region,1207.0,1783.0,MI:0117,binding-associated region,region,,
2,False,MLPFRLGLLLGAVLFVASANGAAIENEVSSLNDLQREKRSGRGYSR...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,,,intact:EBI-104215,intact:EBI-100018,,,,...,,,,,,,,,,
3,False,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,MSNYYSLLLQADTYDDESIGDERSEEDTDDASETEFRSPSRYGAMN...,,,intact:EBI-100018,intact:EBI-107089,,,,...,,,,,,,,,,
4,False,MSPPSGEFRCRVCLKQDELLVDIYEIVEEMQVDLCTLLETCGGIKV...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,,,intact:EBI-117032,intact:EBI-100018,,,,...,,,,,,,,,,


Pipe-joined values in some columns that were unmeaningfully separating the same interaction. New pos PPI db size: 743130


Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,all_binding_mi_1,all_binding_name_1,all_binding_short_1,all_binding_begin_1,all_binding_end_1,all_binding_mi_2,all_binding_name_2,all_binding_short_2,all_binding_begin_2,all_binding_end_2
0,False,AAAAARPAGGSARRWGRPGRCGLLAAGPKRVRSEPGGRLPERSLGP...,MTVFRQENVDDYYDTGEELGSGQFAVVKKCREKSTGLQYAAKFIKK...,,,intact:EBI-20589573,intact:EBI-358616,,,,...,,,,,,,,,,
1,False,AAADWKPGYVMPVLYKYLESPLERVNLWNYGKPITLPTGCMMNVAK...,MAKWGEGDPRWIVEERADATNVNNWHWTERDASNWSTDKLKTLFLA...,,,intact:EBI-25507607,intact:EBI-448610,7176.0,,6878.0,...,,,,,,,,,,
2,False,AAADWKPGYVMPVLYKYLESPLERVNLWNYGKPITLPTGCMMNVAK...,MANDPLEGFHEVNLASPTSPDLLGVCDPGTQEQTTSPSVIYRPHPS...,,,intact:EBI-25507607,intact:EBI-16730154,7176.0,,6878.0,...,,,,,,,,,,
3,False,AAADWKPGYVMPVLYKYLESPLERVNLWNYGKPITLPTGCMMNVAK...,MAQYGHPSPLGMAAREELYSKVTPRRNRQQRPGTIKHGSALDVLLS...,,,intact:EBI-25507607,intact:EBI-1380492,7176.0,,6878.0,...,,,,,,,,,,
4,False,AAADWKPGYVMPVLYKYLESPLERVNLWNYGKPITLPTGCMMNVAK...,MAVALLEEWCKIMGVDVQKSLLVVDIPVDCGEPEIQTVLQEALKCV...,,,intact:EBI-25507607,intact:EBI-25508298,7176.0,,6878.0,...,,,,,,,,,,


In [639]:
# Columns whose values are combined per group (the summary message below calls
# this "pipe-joining"); rows that differ only in these columns are collapsed.
need_pipejoin = [
    "interaction_xml_id",
    "reactome_1",
    "reactome_2",
    "host_label_full_1",
    "host_label_full_2",
    "host_label_short_1",
    "host_label_short_2",
    "host_taxid_1",
    "host_taxid_2",
    "host_cell_type_1",
    "host_cell_type_2",
    "interaction_label",
    "interaction_mi",
    "pubmeds",
    "all_binding_mi_1",
    "all_binding_name_1",
    "all_binding_short_1",
    "all_binding_begin_1",
    "all_binding_end_1",
    "all_binding_mi_2",
    "all_binding_name_2",
    "all_binding_short_2",
    "all_binding_begin_2",
    "all_binding_end_2",
]

simplemerged_neg = harmonize_nulls_to_nan(simplemerged_neg)

# Every other column acts as a grouping key.
all_except_featac = [c for c in simplemerged_neg.columns if c not in need_pipejoin]

# Same aggregator for each of the columns being combined.
agg_spec = {c: join_unique_nonnull for c in need_pipejoin}

display(simplemerged_neg.head())

# dropna=False keeps groups whose key columns contain missing values.
simplemerged_neg = (
    simplemerged_neg
    .groupby(all_except_featac, dropna=False, as_index=False)
    .agg(agg_spec)
)

# This cell processes the negative frame, so the summary is labelled "neg"
# (it previously said "pos", copied from the positive-frame cell).
print(f"Pipe-joined values in some columns that were unmeaningfully separating the same interaction. New neg PPI db size: {len(simplemerged_neg)}")

display(simplemerged_neg.head())

  out = out.replace({"": pd.NA, "None": pd.NA, "nan": pd.NA})


Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,all_binding_mi_1,all_binding_name_1,all_binding_short_1,all_binding_begin_1,all_binding_end_1,all_binding_mi_2,all_binding_name_2,all_binding_short_2,all_binding_begin_2,all_binding_end_2
0,False,MGFPRILSKNNKIYTKLGEFCLSGDSFWIVCHTCQEELQTQDQFWK...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,,,intact:EBI-101707,intact:EBI-100018,,,,...,,,,,,,,,,
1,False,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,MEIPIQVAVRIFPHRELKDLLRSFGPTEPKKDAQAVDEGADSKDSE...,,,intact:EBI-100018,intact:EBI-102069,,,,...,MI:0117,binding-associated region,region,1207.0,1783.0,MI:0117,binding-associated region,region,,
2,False,MLPFRLGLLLGAVLFVASANGAAIENEVSSLNDLQREKRSGRGYSR...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,,,intact:EBI-104215,intact:EBI-100018,,,,...,,,,,,,,,,
3,False,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,MSNYYSLLLQADTYDDESIGDERSEEDTDDASETEFRSPSRYGAMN...,,,intact:EBI-100018,intact:EBI-107089,,,,...,,,,,,,,,,
4,False,MSPPSGEFRCRVCLKQDELLVDIYEIVEEMQVDLCTLLETCGGIKV...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,,,intact:EBI-117032,intact:EBI-100018,,,,...,,,,,,,,,,


Pipe-joined values in some columns that were unmeaningfully separating the same interaction. New pos PPI db size: 743130


Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,all_binding_mi_1,all_binding_name_1,all_binding_short_1,all_binding_begin_1,all_binding_end_1,all_binding_mi_2,all_binding_name_2,all_binding_short_2,all_binding_begin_2,all_binding_end_2
0,False,AAAAARPAGGSARRWGRPGRCGLLAAGPKRVRSEPGGRLPERSLGP...,MTVFRQENVDDYYDTGEELGSGQFAVVKKCREKSTGLQYAAKFIKK...,,,intact:EBI-20589573,intact:EBI-358616,,,,...,,,,,,,,,,
1,False,AAADWKPGYVMPVLYKYLESPLERVNLWNYGKPITLPTGCMMNVAK...,MAKWGEGDPRWIVEERADATNVNNWHWTERDASNWSTDKLKTLFLA...,,,intact:EBI-25507607,intact:EBI-448610,7176.0,,6878.0,...,,,,,,,,,,
2,False,AAADWKPGYVMPVLYKYLESPLERVNLWNYGKPITLPTGCMMNVAK...,MANDPLEGFHEVNLASPTSPDLLGVCDPGTQEQTTSPSVIYRPHPS...,,,intact:EBI-25507607,intact:EBI-16730154,7176.0,,6878.0,...,,,,,,,,,,
3,False,AAADWKPGYVMPVLYKYLESPLERVNLWNYGKPITLPTGCMMNVAK...,MAQYGHPSPLGMAAREELYSKVTPRRNRQQRPGTIKHGSALDVLLS...,,,intact:EBI-25507607,intact:EBI-1380492,7176.0,,6878.0,...,,,,,,,,,,
4,False,AAADWKPGYVMPVLYKYLESPLERVNLWNYGKPITLPTGCMMNVAK...,MAVALLEEWCKIMGVDVQKSLLVVDIPVDCGEPEIQTVLQEALKCV...,,,intact:EBI-25507607,intact:EBI-25508298,7176.0,,6878.0,...,,,,,,,,,,


In [640]:
## AFTER cleaning simplemerged a bit, run this again!
# Looking for the smallest set of columns that uniquely identifies a row of simplemerged.
# "unique_id": concatenation of the two intact ids of the interactors
# "interaction_intactid": the intact id of this interaction (one evidence piece of these two interactors interacting)
# "seq_pair_id": unique combination of two sequences
# For each candidate key, count the rows that repeat an earlier row's key.
for _dup_keys in (
    ["unique_id"],
    ["interaction_intactid"],
    ["seq_pair_id"],
    ["unique_id", "seq_pair_id"],
    ["unique_id", "interaction_intactid"],
    ["seq_pair_id", "interaction_intactid"],
):
    test1 = len(simplemerged.loc[simplemerged.duplicated(_dup_keys)])
    print(f"Rows in simplemerged with duplicate {'+'.join(_dup_keys)}: {test1}")

Rows in simplemerged with duplicate unique_id: 314635
Rows in simplemerged with duplicate interaction_intactid: 3
Rows in simplemerged with duplicate seq_pair_id: 316591
Rows in simplemerged with duplicate unique_id+seq_pair_id: 314635
Rows in simplemerged with duplicate unique_id+interaction_intactid: 0
Rows in simplemerged with duplicate seq_pair_id+interaction_intactid: 0


In [641]:
## AFTER cleaning simplemerged_neg a bit, run this again!
# Looking for the smallest set of columns that uniquely identifies a row of simplemerged_neg.
# "unique_id": concatenation of the two intact ids of the interactors
# "interaction_intactid": the intact id of this interaction (one evidence piece of these two interactors interacting)
# "seq_pair_id": unique combination of two sequences
# For each candidate key, count the rows that repeat an earlier row's key.
for _dup_keys in (
    ["unique_id"],
    ["interaction_intactid"],
    ["seq_pair_id"],
    ["unique_id", "seq_pair_id"],
    ["unique_id", "interaction_intactid"],
    ["seq_pair_id", "interaction_intactid"],
):
    test1 = len(simplemerged_neg.loc[simplemerged_neg.duplicated(_dup_keys)])
    print(f"Rows in simplemerged_neg with duplicate {'+'.join(_dup_keys)}: {test1}")

Rows in simplemerged_neg with duplicate unique_id: 314635
Rows in simplemerged_neg with duplicate interaction_intactid: 3
Rows in simplemerged_neg with duplicate seq_pair_id: 316591
Rows in simplemerged_neg with duplicate unique_id+seq_pair_id: 314635
Rows in simplemerged_neg with duplicate unique_id+interaction_intactid: 0
Rows in simplemerged_neg with duplicate seq_pair_id+interaction_intactid: 0


In [642]:
def convert_ptm_cols_to_ppi(row):
    """
    Build the PPI-style sequence/annotation columns for one row, swapping in the
    PTM sequence for whichever interactor(s) row["PTM Partner"] names.

    "A" refers to interactor 1 (aa_1 / uniprot_A ...) and "B" to interactor 2
    (aa_2 / uniprot_B ...). Membership is tested with `in`, so a value that
    contains both letters updates both sides. For each named side:
      * aa_N becomes ptm_aa_N and length_N is recomputed (0 if it is not a string)
      * uniprot_X gets a "_ptm" suffix (None if it is not a string)
      * chain_seq_start_N, chain_seq_end_N and the uniprot_X_* columns become NaN
    Sides that are not named keep their original values.

    Args:
        row: mapping-like object (e.g. a DataFrame row) providing the 22 output
            columns, "PTM Partner", and "ptm_aa_1"/"ptm_aa_2" for named sides.

    Returns:
        pd.Series with the 11 interactor-1 entries followed by the 11
        interactor-2 entries, in the same fixed order as `change_cols`.
    """
    ptm_partner = row["PTM Partner"]
    # Only a string can name a partner. Anything else (NaN, None, pd.NA, numpy
    # scalars, ...) means "no PTM partner" and must not reach the `in` test,
    # which would raise TypeError on a non-iterable value.
    if not isinstance(ptm_partner, str):
        ptm_partner = ""

    out = {}
    for num, letter in (("1", "A"), ("2", "B")):
        aa_key = f"aa_{num}"
        length_key = f"length_{num}"
        uniprot_key = f"uniprot_{letter}"
        # Columns that are blanked when the PTM sequence is swapped in.
        reset_keys = [
            f"chain_seq_start_{num}",
            f"chain_seq_end_{num}",
            f"uniprot_{letter}_equalseq",
            f"uniprot_{letter}_equalseq_canonical",
            f"uniprot_{letter}_full",
            f"uniprot_{letter}_inseq",
            f"uniprot_{letter}_inseq_canonical",
            f"uniprot_{letter}_noiso1",
        ]

        # Defaults: carry the row's current values through unchanged.
        side = {key: row[key] for key in [aa_key, length_key, uniprot_key, *reset_keys]}

        if letter in ptm_partner:
            ptm_seq = row[f"ptm_aa_{num}"]
            side[aa_key] = ptm_seq
            side[length_key] = len(ptm_seq) if isinstance(ptm_seq, str) else 0
            original_uniprot = row[uniprot_key]
            side[uniprot_key] = (
                original_uniprot + "_ptm" if isinstance(original_uniprot, str) else None
            )
            for key in reset_keys:
                side[key] = np.nan

        out.update(side)

    return pd.Series(out)
    
# Names of the columns produced by convert_ptm_cols_to_ppi, listed in the same
# order as the Series it returns: the interactor-1 / "A" columns first, then
# the matching interactor-2 / "B" columns.
change_cols = [
    template.format(n=n, x=x)
    for n, x in (("1", "A"), ("2", "B"))
    for template in (
        "aa_{n}",
        "length_{n}",
        "uniprot_{x}",
        "chain_seq_start_{n}",
        "chain_seq_end_{n}",
        "uniprot_{x}_equalseq",
        "uniprot_{x}_equalseq_canonical",
        "uniprot_{x}_full",
        "uniprot_{x}_inseq",
        "uniprot_{x}_inseq_canonical",
        "uniprot_{x}_noiso1",
    )
]
    

In [643]:
# PTM-table rows whose original sequence pair is flagged as binding ("yes").
pos_og_from_ptm_data = merged_expl_ptm_filt[
    merged_expl_ptm_filt["PTM decisive_seqpair_og_binds"] == "yes"
].reset_index(drop=True)
pos_og_from_ptm_data["PTM Partner Status"] = "original"
print(f"Total rows where PTM decisive_seqpair_og_binds==yes: {len(pos_og_from_ptm_data)}")

# Join back onto simplemerged using every column the two frames share.
common_cols = list(set(pos_og_from_ptm_data.columns) & set(simplemerged.columns))
pos_og_from_ptm_data = simplemerged.merge(
    pos_og_from_ptm_data,
    on=common_cols,
    how="inner",
)

# Order-independent sequence-pair key on both frames.
pos_og_from_ptm_data["seq_sort"] = pos_og_from_ptm_data.apply(
    get_unique_id, axis=1, colA="aa_1", colB="aa_2"
)
simplemerged["seq_sort"] = simplemerged.apply(
    get_unique_id, axis=1, colA="aa_1", colB="aa_2"
)

# Sanity check: nothing new should appear; every pair must already be in simplemerged.
test1 = bool(
    pos_og_from_ptm_data["seq_sort"].isin(simplemerged["seq_sort"].unique()).all()
)
print(f"As expected, no new sequence pairs arose from pos_og_from_ptm_data: {test1}")

# Keep only the identifying keys plus the PTM annotations to attach.
temp = (
    pos_og_from_ptm_data.loc[:, [
        "unique_id",
        "interaction_intactid",
        "seq_pair_id",
        "PTM decisive_seqpair_og_binds",
        "PTM Partner",
        "PTM Partner Status",
        "ptm_short_1",
        "ptm_short_2",
        "seq_sort",
        "seq_sort_og",
        "seq_sort_og_id",
    ]]
    .drop_duplicates()
    .reset_index(drop=True)
)

# Left join: every simplemerged row is kept; a row is repeated when temp holds
# more than one annotation row for its key.
simplemerged_ptm = simplemerged.merge(
    temp,
    on=["unique_id", "interaction_intactid", "seq_pair_id", "seq_sort"],
    how="left",
).reset_index(drop=True)
print(f"Merged in some PTM-related information. New size of dataframe = {len(simplemerged_ptm)}")


Total rows where PTM decisive_seqpair_og_binds==yes: 910
As expected, no new sequence pairs arose from pos_og_from_ptm_data: True
Merged in some PTM-related information. New size of dataframe = 743301


In [644]:
# Bare expression: the notebook renders merged_neg_expl_ptm_filt for visual inspection.
merged_neg_expl_ptm_filt

Unnamed: 0,ptm_begin_1,ptm_begin_2,ptm_end_1,ptm_end_2,ptm_mi_1,ptm_mi_2,ptm_name_1,ptm_name_2,ptm_new_1,ptm_new_2,...,PTM all_new_binds,PTM all_og_binds,PTM decisive_entry_new_binds,PTM decisive_entry_og_binds,seq_sort_og,seq_sort_new,seq_sort_og_id,PTM decisive_seqpair_og_binds,seq_sort_new_id,PTM decisive_seqpair_new_binds
0,1,,1,,MI:0639,,"resulting-ptm,guanylated residue",,,,...,unknown,"unknown,yes",unknown,yes,SADAQSFLNRVCGVSAARLTPCGTGTSTDVVYRAFDIYNDKVAGFA...,"<psi-mod:""MOD:01163""(guanylated residue)>KMSDV...",seq_sort_og_2233,yes,seq_sort_new_30,unknown
1,1,,1,,MI:0639,,"resulting-ptm,uridylated residue",,,,...,unknown,"unknown,yes",unknown,yes,SADAQSFLNRVCGVSAARLTPCGTGTSTDVVYRAFDIYNDKVAGFA...,"<psi-mod:""MOD:01166""(uridylated residue)>KMSDV...",seq_sort_og_2233,yes,seq_sort_new_32,unknown
2,13,,13,,MI:1224,,"ptm increasing an interaction,O4'-sulfo-L-tyro...",,,,...,"unknown,yes","unknown,yes",yes,yes,MGVLRVYVILILVGFCVQIVVVNSQNLTCNSNDLKALEGFMRGLES...,"<psi-mod:""MOD:00181""(O4'-sulfo-L-tyrosine)>I<p...",seq_sort_og_1638,yes,seq_sort_new_12,yes
3,10,,10,,MI:0638,,"prerequisite-ptm,N6,N6-dimethyl-L-lysine",,,,...,"unknown,yes","no,unknown",yes,no,GKGGAKRHRKVLRDNIQGI_MSRYITRLSMRRTYKWNGRPVGEDRK...,"GKGGAKRHR<psi-mi:""MI:0166""(N6,N6-dimethyl-L-ly...",seq_sort_og_139,no,seq_sort_new_288,yes
4,10,,10,,MI:0638,,"prerequisite-ptm,observed-ptm,N6,N6,N6-trimeth...",,,,...,"unknown,yes","no,unknown",yes,no,MARTKQTARKSTGGKAPRKQLATKAARKSAPATGGVKKPHRYRPGT...,"MARTKQTAR<psi-mi:""MI:0167""(N6,N6,N6-trimethyl-...",seq_sort_og_672,no,seq_sort_new_1335,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5931,,998,,998,,MI:0925,,"observed-ptm,phosphorylated residue",,,...,unknown,unknown,unknown,unknown,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,seq_sort_og_2042,unknown,seq_sort_new_3800,unknown
5932,,998,,998,,MI:0925,,"observed-ptm,phosphorylated residue",,,...,unknown,unknown,unknown,unknown,MGKEQELLEAARTGHLPAVEKLLSGKRLSSGFGGGGGGGSGGGGGG...,MGKEQELLEAARTGHLPAVEKLLSGKRLSSGFGGGGGGGSGGGGGG...,seq_sort_og_1546,unknown,seq_sort_new_2829,unknown
5933,,998,,998,,MI:0925,,"observed-ptm,phosphorylated residue",,,...,unknown,unknown,unknown,unknown,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,seq_sort_og_2048,unknown,seq_sort_new_3804,unknown
5934,,998,,998,,MI:0925,,"observed-ptm,phosphorylated residue",,,...,unknown,unknown,unknown,unknown,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,MRPSGTAGAALLALLAALCPASRALEEKKVCQGTSNKLTQLGTFED...,seq_sort_og_2048,unknown,seq_sort_new_3804,unknown


In [645]:
## Positive pairs taken from the negatives table: (ptm-partner)'s original_sequence
## plus (other partner)'s og_sequence, wherever PTM decisive_seqpair_og_binds is "yes"
pos_og_from_ptm_neg_data = merged_neg_expl_ptm_filt[
    merged_neg_expl_ptm_filt["PTM decisive_seqpair_og_binds"] == "yes"
].reset_index(drop=True)
pos_og_from_ptm_neg_data["PTM Partner Status"] = "original"
print(f"Total rows where PTM decisive_seqpair_og_binds==yes: {len(pos_og_from_ptm_neg_data)}")

# Join back onto simplemerged_neg using every column the two frames share.
common_cols = list(set(pos_og_from_ptm_neg_data.columns) & set(simplemerged_neg.columns))
pos_og_from_ptm_neg_data = simplemerged_neg.merge(
    pos_og_from_ptm_neg_data,
    on=common_cols,
    how="inner",
)

# Order-independent sequence-pair key on both frames.
pos_og_from_ptm_neg_data["seq_sort"] = pos_og_from_ptm_neg_data.apply(
    get_unique_id, axis=1, colA="aa_1", colB="aa_2"
)
simplemerged_neg["seq_sort"] = simplemerged_neg.apply(
    get_unique_id, axis=1, colA="aa_1", colB="aa_2"
)

# Count the selected rows whose sequence pair is present in simplemerged_neg.
test1 = int(
    pos_og_from_ptm_neg_data["seq_sort"].isin(simplemerged_neg["seq_sort"].unique()).sum()
)
print(f"Total positive OG interactions that are currently negative in our neg-PPI data: {test1}")

print(f"Joining in positive data from the negatives database: simplemerged_neg")
# Keep only the identifying keys plus the PTM annotations to attach.
temp = (
    pos_og_from_ptm_neg_data.loc[:, [
        "unique_id",
        "interaction_intactid",
        "seq_pair_id",
        "PTM decisive_seqpair_og_binds",
        "PTM Partner",
        "PTM Partner Status",
        "ptm_short_1",
        "ptm_short_2",
        "seq_sort",
        "seq_sort_og",
        "seq_sort_og_id",
    ]]
    .drop_duplicates()
    .reset_index(drop=True)
)

simplemerged_neg_ptm_pos = simplemerged_neg.merge(
    temp,
    on=["unique_id", "interaction_intactid", "seq_pair_id", "seq_sort"],
    how="left",
)
# Only rows that actually received a PTM annotation are kept.
simplemerged_neg_ptm_pos = (
    simplemerged_neg_ptm_pos
    .dropna(subset=["PTM Partner Status"])
    .drop_duplicates()
    .reset_index(drop=True)
)

print(f"\tSize of simplemerged_neg_ptm_pos with just original negatives: {len(simplemerged_neg_ptm_pos)}")

Total rows where PTM decisive_seqpair_og_binds==yes: 815
Total positive OG interactions that are currently negative in our neg-PPI data: 815
Joining in positive data from the negatives database: simplemerged_neg
	Size of simplemerged_neg_ptm_pos with just original negatives: 660


In [646]:
# PTM-table rows whose new (PTM-carrying) sequence pair is flagged as binding ("yes").
pos_new_from_ptm_data = merged_expl_ptm_filt.loc[
    # new binds
    (merged_expl_ptm_filt["PTM decisive_seqpair_new_binds"]=="yes")
].reset_index(drop=True)
pos_new_from_ptm_data["PTM Partner Status"] = ["ptm"]*len(pos_new_from_ptm_data)
print(f"Total rows where PTM decisive_seqpair_new_binds==yes: {len(pos_new_from_ptm_data)}")

# Join back onto simplemerged using every column the two frames share.
common_cols = list(set(pos_new_from_ptm_data.columns).intersection(set(simplemerged.columns)))

pos_new_from_ptm_data = pd.merge(
    simplemerged,
    pos_new_from_ptm_data,
    on=common_cols,
    how="inner"
)

# Swap the PTM sequence (and dependent columns) in for the PTM partner.
# Guarded like the matching negatives-table cells: on an empty frame the
# row-wise apply does not return the expected columns and the assignment fails.
if len(pos_new_from_ptm_data)>0:
    pos_new_from_ptm_data[
        change_cols
    ] = pos_new_from_ptm_data.apply(lambda row: convert_ptm_cols_to_ppi(row),axis=1)

# Recompute the order-independent sequence-pair key now that aa_1/aa_2 may have
# changed, then count the pairs that are not yet present in simplemerged.
pos_new_from_ptm_data["seq_sort"] = pos_new_from_ptm_data.apply(lambda row: get_unique_id(row, colA="aa_1",colB="aa_2"), axis=1)
simplemerged["seq_sort"] = simplemerged.apply(lambda row: get_unique_id(row, colA="aa_1",colB="aa_2"), axis=1)
test1 = simplemerged["seq_sort"].unique().tolist()
test1 = len(pos_new_from_ptm_data.loc[
    ~pos_new_from_ptm_data["seq_sort"].isin(test1)
])
print(f"Total new interactions (by seq-pair; new sequence 1 + sequence 2) added from ptm data: {test1}")

Total rows where PTM decisive_seqpair_new_binds==yes: 1432
Total new interactions (by seq-pair; new sequence 1 + sequence 2) added from ptm data: 1432


In [647]:
# Negatives-table PTM rows whose new (PTM-carrying) sequence pair is flagged as binding ("yes").
pos_new_from_ptm_neg_data = merged_neg_expl_ptm_filt[
    merged_neg_expl_ptm_filt["PTM decisive_seqpair_new_binds"] == "yes"
].reset_index(drop=True)
pos_new_from_ptm_neg_data["PTM Partner Status"] = "ptm"
print(f"Total rows where PTM decisive_seqpair_new_binds==yes: {len(pos_new_from_ptm_neg_data)}")

# Join back onto simplemerged_neg using every column the two frames share.
common_cols = list(set(pos_new_from_ptm_neg_data.columns) & set(simplemerged_neg.columns))
pos_new_from_ptm_neg_data = simplemerged_neg.merge(
    pos_new_from_ptm_neg_data,
    on=common_cols,
    how="inner",
)

# Swap the PTM sequence (and dependent columns) in for the PTM partner;
# skipped when the join produced no rows.
if len(pos_new_from_ptm_neg_data) > 0:
    pos_new_from_ptm_neg_data[change_cols] = pos_new_from_ptm_neg_data.apply(
        convert_ptm_cols_to_ppi, axis=1
    )

# Recompute the order-independent sequence-pair key, then count the pairs that
# are not yet present in simplemerged_neg.
pos_new_from_ptm_neg_data["seq_sort"] = pos_new_from_ptm_neg_data.apply(
    get_unique_id, axis=1, colA="aa_1", colB="aa_2"
)
simplemerged_neg["seq_sort"] = simplemerged_neg.apply(
    get_unique_id, axis=1, colA="aa_1", colB="aa_2"
)
test1 = int(
    (~pos_new_from_ptm_neg_data["seq_sort"].isin(simplemerged_neg["seq_sort"].unique())).sum()
)
print(f"Total new interactions (by seq-pair; new sequence 1 + sequence 2) added from ptm data: {test1}")

Total rows where PTM decisive_seqpair_new_binds==yes: 1306
Total new interactions (by seq-pair; new sequence 1 + sequence 2) added from ptm data: 1306


In [648]:
## Negative pairs taken from the positives table: (ptm-partner)'s original_sequence
## plus (other partner)'s og_sequence, wherever PTM decisive_seqpair_og_binds is "no"
neg_og_from_ptm_data = merged_expl_ptm_filt[
    merged_expl_ptm_filt["PTM decisive_seqpair_og_binds"] == "no"
].reset_index(drop=True)
neg_og_from_ptm_data["PTM Partner Status"] = "original"
print(f"Total rows where PTM decisive_seqpair_og_binds==no: {len(neg_og_from_ptm_data)}")

# Join back onto simplemerged using every column the two frames share.
common_cols = list(set(neg_og_from_ptm_data.columns) & set(simplemerged.columns))
neg_og_from_ptm_data = simplemerged.merge(
    neg_og_from_ptm_data,
    on=common_cols,
    how="inner",
)

# Order-independent sequence-pair key on both frames.
neg_og_from_ptm_data["seq_sort"] = neg_og_from_ptm_data.apply(
    get_unique_id, axis=1, colA="aa_1", colB="aa_2"
)
simplemerged["seq_sort"] = simplemerged.apply(
    get_unique_id, axis=1, colA="aa_1", colB="aa_2"
)

# Count the selected rows whose sequence pair is present in simplemerged.
test1 = int(
    neg_og_from_ptm_data["seq_sort"].isin(simplemerged["seq_sort"].unique()).sum()
)
print(f"Total negative OG interactions that are currently positive in our PPI data: {test1}")

print(f"Joining in negative data from the positives database: simplemerged")
# Keep only the identifying keys plus the PTM annotations to attach.
temp = (
    neg_og_from_ptm_data.loc[:, [
        "unique_id",
        "interaction_intactid",
        "seq_pair_id",
        "PTM decisive_seqpair_og_binds",
        "PTM Partner",
        "PTM Partner Status",
        "ptm_short_1",
        "ptm_short_2",
        "seq_sort",
        "seq_sort_og",
        "seq_sort_og_id",
    ]]
    .drop_duplicates()
    .reset_index(drop=True)
)

simplemerged_ptm_neg = simplemerged.merge(
    temp,
    on=["unique_id", "interaction_intactid", "seq_pair_id", "seq_sort"],
    how="left",
)
# Only rows that actually received a PTM annotation are kept.
simplemerged_ptm_neg = (
    simplemerged_ptm_neg
    .dropna(subset=["PTM Partner Status"])
    .drop_duplicates()
    .reset_index(drop=True)
)

print(f"\tSize of simplemerged_ptm_neg with just original negatives: {len(simplemerged_ptm_neg)}")

Total rows where PTM decisive_seqpair_og_binds==no: 1247
Total negative OG interactions that are currently positive in our PPI data: 1247
Joining in negative data from the positives database: simplemerged
	Size of simplemerged_ptm_neg with just original negatives: 1061


In [649]:
# Negatives-table PTM rows whose original sequence pair is flagged as not binding ("no").
neg_og_from_ptm_neg_data = merged_neg_expl_ptm_filt[
    merged_neg_expl_ptm_filt["PTM decisive_seqpair_og_binds"] == "no"
].reset_index(drop=True)
neg_og_from_ptm_neg_data["PTM Partner Status"] = "original"
print(f"Total rows where PTM decisive_seqpair_og_binds==no: {len(neg_og_from_ptm_neg_data)}")

# Join back onto simplemerged_neg using every column the two frames share.
common_cols = list(set(neg_og_from_ptm_neg_data.columns) & set(simplemerged_neg.columns))
neg_og_from_ptm_neg_data = simplemerged_neg.merge(
    neg_og_from_ptm_neg_data,
    on=common_cols,
    how="inner",
)

# Order-independent sequence-pair key on both frames.
neg_og_from_ptm_neg_data["seq_sort"] = neg_og_from_ptm_neg_data.apply(
    get_unique_id, axis=1, colA="aa_1", colB="aa_2"
)
simplemerged_neg["seq_sort"] = simplemerged_neg.apply(
    get_unique_id, axis=1, colA="aa_1", colB="aa_2"
)

# Sanity check: nothing new should appear; every pair must already be in simplemerged_neg.
test1 = bool(
    neg_og_from_ptm_neg_data["seq_sort"].isin(simplemerged_neg["seq_sort"].unique()).all()
)
print(f"As expected, no new sequence pairs arose from neg_og_from_ptm_neg_data: {test1}. Size of simplemerged_neg: {len(simplemerged_neg)}")

# Keep only the identifying keys plus the PTM annotations to attach.
temp = (
    neg_og_from_ptm_neg_data.loc[:, [
        "unique_id",
        "interaction_intactid",
        "seq_pair_id",
        "PTM decisive_seqpair_og_binds",
        "PTM Partner",
        "PTM Partner Status",
        "ptm_short_1",
        "ptm_short_2",
        "seq_sort",
        "seq_sort_og",
        "seq_sort_og_id",
    ]]
    .drop_duplicates()
    .reset_index(drop=True)
)

## Creation of simplemerged_neg_ptm
# Goal: attach the PTM annotations of the negative OG rows to simplemerged_neg.
# Left join: every simplemerged_neg row is kept; a row is repeated when temp
# holds more than one annotation row for its key.
simplemerged_neg_ptm = simplemerged_neg.merge(
    temp,
    on=["unique_id", "interaction_intactid", "seq_pair_id", "seq_sort"],
    how="left",
).reset_index(drop=True)
print(f"Merged in some ptm-related information. New size of dataframe = {len(simplemerged_neg_ptm)}")

Total rows where PTM decisive_seqpair_og_binds==no: 1131
As expected, no new sequence pairs arose from neg_og_from_ptm_neg_data: True. Size of simplemerged_neg: 743130
Merged in some ptm-related information. New size of dataframe = 743239


In [650]:
## Negative pairs taken from the positives table: (ptm-partner)'s ptm_sequence
## plus (other partner)'s og_sequence, wherever PTM decisive_seqpair_new_binds is "no"
neg_new_from_ptm_data = merged_expl_ptm_filt.loc[
    (merged_expl_ptm_filt["PTM decisive_seqpair_new_binds"]=="no")
].reset_index(drop=True)
neg_new_from_ptm_data["PTM Partner Status"] = ["ptm"]*len(neg_new_from_ptm_data)
print(f"Total rows where PTM decisive_seqpair_new_binds==no: {len(neg_new_from_ptm_data)}")

# Join back onto simplemerged using every column the two frames share.
common_cols = list(set(neg_new_from_ptm_data.columns).intersection(set(simplemerged.columns)))

neg_new_from_ptm_data = pd.merge(
    simplemerged,
    neg_new_from_ptm_data,
    on=common_cols,
    how="inner"
)

# Swap the PTM sequence (and dependent columns) in for the PTM partner.
# Guarded like the matching negatives-table cells: on an empty frame the
# row-wise apply does not return the expected columns and the assignment fails.
if len(neg_new_from_ptm_data)>0:
    neg_new_from_ptm_data[
        change_cols
    ] = neg_new_from_ptm_data.apply(lambda row: convert_ptm_cols_to_ppi(row),axis=1)

# Recompute the order-independent sequence-pair key now that aa_1/aa_2 may have
# changed, then count the pairs that are already present in simplemerged.
neg_new_from_ptm_data["seq_sort"] = neg_new_from_ptm_data.apply(lambda row: get_unique_id(row, colA="aa_1",colB="aa_2"), axis=1)
simplemerged["seq_sort"] = simplemerged.apply(lambda row: get_unique_id(row, colA="aa_1",colB="aa_2"), axis=1)
test1 = simplemerged["seq_sort"].unique().tolist()
test1 = len(neg_new_from_ptm_data.loc[
    neg_new_from_ptm_data["seq_sort"].isin(test1)
])
print(f"Total negative PTM interactions that are currently positive in our PPI data: {test1}")
print(f"The above number should be 0 because all the PTM sequences have <ptm token> characters in them")



Total rows where PTM decisive_seqpair_new_binds==no: 70
Total negative PTM interactions that are currently positive in our PPI data: 0
The above number should be 0 because all the PTM sequences have <ptm token> characters in them


In [651]:
## Negative pairs taken from the negatives table: (ptm-partner)'s ptm_sequence
## plus (other partner)'s og_sequence, wherever PTM decisive_seqpair_new_binds is "no"
neg_new_from_ptm_neg_data = merged_neg_expl_ptm_filt.loc[
    (merged_neg_expl_ptm_filt["PTM decisive_seqpair_new_binds"]=="no")
].reset_index(drop=True)
neg_new_from_ptm_neg_data["PTM Partner Status"] = ["ptm"]*len(neg_new_from_ptm_neg_data)
print(f"Total rows where PTM decisive_seqpair_new_binds==no: {len(neg_new_from_ptm_neg_data)}")

# Join back onto simplemerged_neg using every column the two frames share.
common_cols = list(set(neg_new_from_ptm_neg_data.columns).intersection(set(simplemerged_neg.columns)))

neg_new_from_ptm_neg_data = pd.merge(
    simplemerged_neg,
    neg_new_from_ptm_neg_data,
    on=common_cols,
    how="inner"
)

# Swap the PTM sequence (and dependent columns) in for the PTM partner and
# recompute the sequence-pair key; skipped when the join produced no rows.
if len(neg_new_from_ptm_neg_data)>0:
    neg_new_from_ptm_neg_data[
        change_cols
    ] = neg_new_from_ptm_neg_data.apply(lambda row: convert_ptm_cols_to_ppi(row),axis=1)

    neg_new_from_ptm_neg_data["seq_sort"] = neg_new_from_ptm_neg_data.apply(lambda row: get_unique_id(row, colA="aa_1",colB="aa_2"), axis=1)
simplemerged_neg["seq_sort"] = simplemerged_neg.apply(lambda row: get_unique_id(row, colA="aa_1",colB="aa_2"), axis=1)

# Count the PTM pairs that are already present in simplemerged_neg.
test1 = simplemerged_neg["seq_sort"].unique().tolist()
test1 = len(neg_new_from_ptm_neg_data.loc[
    neg_new_from_ptm_neg_data["seq_sort"].isin(test1)
])
# The rows are PTM rows checked against the negative frame, so the label says
# so (it previously read "MUTATED ... positive in our PPI data").
print(f"Total negative PTM interactions that are currently negative in our neg-PPI data: {test1}")



Total rows where PTM decisive_seqpair_new_binds==no: 65
Total negative MUTATED interactions that are currently positive in our PPI data: 0


In [652]:
## Unknown pairs from the positive dataset: (ptm-partner)'s original_sequence plus (other partner)'s
## og_sequence, wherever PTM decisive_seqpair_og_binds is "unknown"
unknown_og_from_ptm_data = merged_expl_ptm_filt[
    merged_expl_ptm_filt["PTM decisive_seqpair_og_binds"].eq("unknown")
].reset_index(drop=True)
unknown_og_from_ptm_data["PTM Partner Status"] = ["original" for _ in range(len(unknown_og_from_ptm_data))]
print(f"Total rows where PTM decisive_seqpair_og_binds==unknown: {len(unknown_og_from_ptm_data)}")

# Join keys: every column the PTM rows have in common with simplemerged
common_cols = list(set(unknown_og_from_ptm_data.columns) & set(simplemerged.columns))

# Keep only the PTM rows that line up with an existing simplemerged row on all shared columns
unknown_og_from_ptm_data = simplemerged.merge(
    unknown_og_from_ptm_data,
    on=common_cols,
    how="inner",
)

# Sanity check: these are original sequences, so their pair IDs should already exist in simplemerged
unknown_og_from_ptm_data["seq_sort"] = unknown_og_from_ptm_data.apply(
    get_unique_id, axis=1, colA="aa_1", colB="aa_2"
)
simplemerged["seq_sort"] = simplemerged.apply(get_unique_id, axis=1, colA="aa_1", colB="aa_2")
test1 = len(
    unknown_og_from_ptm_data[
        unknown_og_from_ptm_data["seq_sort"].isin(simplemerged["seq_sort"].unique().tolist())
    ]
)
print(f"Total unknown OG interactions that are currently positive in our PPI data: {test1}")

Total rows where PTM decisive_seqpair_og_binds==unknown: 4446
Total unknown OG interactions that are currently positive in our PPI data: 4446


In [653]:
## Unknown pairs from the NEGATIVE dataset: (ptm-partner)'s original_sequence plus (other partner)'s
## og_sequence, wherever PTM decisive_seqpair_og_binds is "unknown"
unknown_og_from_ptm_neg_data = merged_neg_expl_ptm_filt[
    merged_neg_expl_ptm_filt["PTM decisive_seqpair_og_binds"].eq("unknown")
].reset_index(drop=True)
unknown_og_from_ptm_neg_data["PTM Partner Status"] = ["original" for _ in range(len(unknown_og_from_ptm_neg_data))]
print(f"Total rows where PTM decisive_seqpair_og_binds==unknown: {len(unknown_og_from_ptm_neg_data)}")

# Join keys: every column the PTM rows have in common with simplemerged_neg
common_cols = list(set(unknown_og_from_ptm_neg_data.columns) & set(simplemerged_neg.columns))

# Keep only the PTM rows that line up with an existing simplemerged_neg row on all shared columns
unknown_og_from_ptm_neg_data = simplemerged_neg.merge(
    unknown_og_from_ptm_neg_data,
    on=common_cols,
    how="inner",
)

# Sanity check: these are original sequences, so their pair IDs should already exist in simplemerged_neg
unknown_og_from_ptm_neg_data["seq_sort"] = unknown_og_from_ptm_neg_data.apply(
    get_unique_id, axis=1, colA="aa_1", colB="aa_2"
)
simplemerged_neg["seq_sort"] = simplemerged_neg.apply(get_unique_id, axis=1, colA="aa_1", colB="aa_2")
test1 = len(
    unknown_og_from_ptm_neg_data[
        unknown_og_from_ptm_neg_data["seq_sort"].isin(simplemerged_neg["seq_sort"].unique().tolist())
    ]
)
print(f"Total unknown OG interactions that are currently negative in our PPI data: {test1}")

Total rows where PTM decisive_seqpair_og_binds==unknown: 3990
Total unknown OG interactions that are currently negative in our PPI data: 3990


In [654]:
## Unknown pairs from the positive dataset (merged_expl_ptm_filt):
## (ptm-partner)'s ptm_sequence plus (other partner)'s og_sequence, anywhere PTM decisive_seqpair_new_binds is "unknown"
unknown_new_from_ptm_data = merged_expl_ptm_filt.loc[
    merged_expl_ptm_filt["PTM decisive_seqpair_new_binds"]=="unknown"
].reset_index(drop=True)
# Tag every selected row as describing the PTM-carrying form of the partner
unknown_new_from_ptm_data["PTM Partner Status"] = ["ptm"]*len(unknown_new_from_ptm_data)
print(f"Total rows where PTM decisive_seqpair_new_binds==unknown: {len(unknown_new_from_ptm_data)}")

# Columns shared by the selected PTM rows and simplemerged
common_cols = list(set(unknown_new_from_ptm_data.columns).intersection(set(simplemerged.columns)))

# Inner-join on every shared column, so only PTM rows with an exact counterpart in simplemerged survive
unknown_new_from_ptm_data = pd.merge(
    simplemerged,
    unknown_new_from_ptm_data, 
    on=common_cols, 
    how="inner"
)

if len(unknown_new_from_ptm_data)>0:
    # Overwrite the change_cols columns with what convert_ptm_cols_to_ppi returns for each row
    unknown_new_from_ptm_data[
        change_cols
    ] = unknown_new_from_ptm_data.apply(lambda row: convert_ptm_cols_to_ppi(row),axis=1)

    # Order-independent pair ID built from aa_1/aa_2, computed after change_cols were overwritten
    unknown_new_from_ptm_data["seq_sort"] = unknown_new_from_ptm_data.apply(lambda row: get_unique_id(row, colA="aa_1",colB="aa_2"), axis=1)
elif "seq_sort" not in unknown_new_from_ptm_data.columns:
    # Empty merge result: still provide the column, because the check below and later cells index it
    unknown_new_from_ptm_data["seq_sort"] = pd.Series(dtype="object")

# Count how many of the PTM-derived pairs share a seq_sort ID with the existing positive PPI data.
# NOTE(review): presumably this should be 0 because PTM sequences carry modification tokens - confirm.
simplemerged["seq_sort"] = simplemerged.apply(lambda row: get_unique_id(row, colA="aa_1",colB="aa_2"), axis=1)
test1 = simplemerged["seq_sort"].unique().tolist()
test1 = len(unknown_new_from_ptm_data.loc[
    unknown_new_from_ptm_data["seq_sort"].isin(test1)
])
print(f"Total unknown ptm interactions that are currently positive in our PPI data: {test1}")

Total rows where PTM decisive_seqpair_new_binds==unknown: 5101
Total unknown ptm interactions that are currently positive in our PPI data: 0


In [655]:
## Unknown pairs from the NEGATIVE dataset (merged_neg_expl_ptm_filt):
## (ptm-partner)'s ptm_sequence plus (other partner)'s og_sequence, anywhere PTM decisive_seqpair_new_binds is "unknown"
unknown_new_from_ptm_neg_data = merged_neg_expl_ptm_filt.loc[
    merged_neg_expl_ptm_filt["PTM decisive_seqpair_new_binds"]=="unknown"
].reset_index(drop=True)
# Tag every selected row as describing the PTM-carrying form of the partner
unknown_new_from_ptm_neg_data["PTM Partner Status"] = ["ptm"]*len(unknown_new_from_ptm_neg_data)
print(f"Total rows where PTM decisive_seqpair_new_binds==unknown: {len(unknown_new_from_ptm_neg_data)}")

# Columns shared by the selected PTM rows and simplemerged_neg
common_cols = list(set(unknown_new_from_ptm_neg_data.columns).intersection(set(simplemerged_neg.columns)))

# Inner-join on every shared column, so only PTM rows with an exact counterpart in simplemerged_neg survive
unknown_new_from_ptm_neg_data = pd.merge(
    simplemerged_neg,
    unknown_new_from_ptm_neg_data, 
    on=common_cols, 
    how="inner"
)

# Guarded like the sibling PTM cells: the change_cols assignment is skipped when the merge is empty
if len(unknown_new_from_ptm_neg_data)>0:
    # Overwrite the change_cols columns with what convert_ptm_cols_to_ppi returns for each row
    unknown_new_from_ptm_neg_data[
        change_cols
    ] = unknown_new_from_ptm_neg_data.apply(lambda row: convert_ptm_cols_to_ppi(row),axis=1)

    # Order-independent pair ID built from aa_1/aa_2, computed after change_cols were overwritten
    unknown_new_from_ptm_neg_data["seq_sort"] = unknown_new_from_ptm_neg_data.apply(lambda row: get_unique_id(row, colA="aa_1",colB="aa_2"), axis=1)
elif "seq_sort" not in unknown_new_from_ptm_neg_data.columns:
    # Empty merge result: still provide the column, because the check below and later cells index it
    unknown_new_from_ptm_neg_data["seq_sort"] = pd.Series(dtype="object")

# Count how many of the PTM-derived pairs share a seq_sort ID with the existing negative PPI data.
# NOTE(review): presumably this should be 0 because PTM sequences carry modification tokens - confirm.
simplemerged_neg["seq_sort"] = simplemerged_neg.apply(lambda row: get_unique_id(row, colA="aa_1",colB="aa_2"), axis=1)
test1 = simplemerged_neg["seq_sort"].unique().tolist()
test1 = len(unknown_new_from_ptm_neg_data.loc[
    unknown_new_from_ptm_neg_data["seq_sort"].isin(test1)
])
print(f"Total unknown ptm interactions that are currently negative in our PPI data: {test1}")

Total rows where PTM decisive_seqpair_new_binds==unknown: 4565
Total unknown ptm interactions that are currently negative in our PPI data: 0


In [656]:
# Build the table of "unknown" PTM rows: positives first, then negatives, then both stacked together.

# PTM annotation columns pulled from the unknown-og frames, and the keys used to join them back
_unknown_og_cols = [
    "unique_id",
    "interaction_intactid",
    "seq_pair_id",
    "PTM decisive_seqpair_og_binds",
    "PTM Partner", "PTM Partner Status", "ptm_short_1", "ptm_short_2",
    "seq_sort",
    "seq_sort_og",
    "seq_sort_og_id",
]
_unknown_og_keys = ["unique_id", "interaction_intactid", "seq_pair_id", "seq_sort"]

# --- positives ---
print("Working with unknown data from positives: simplemerged")
temp = unknown_og_from_ptm_data[_unknown_og_cols].drop_duplicates().reset_index(drop=True)

# Left-join the annotations onto simplemerged, then keep only the rows that actually received one
simplemerged_ptm_unknown = simplemerged.merge(temp, on=_unknown_og_keys, how="left")
simplemerged_ptm_unknown = (
    simplemerged_ptm_unknown[simplemerged_ptm_unknown["PTM Partner Status"].notna()]
    .drop_duplicates()
    .reset_index(drop=True)
)

print(f"\tMerged in ptm-related information when UNKNOWN if original sequence binds. Size of ptm unknown og sequence dataframe = {len(simplemerged_ptm_unknown)}")

# Append the rows whose PTM-sequence binding is unknown
simplemerged_ptm_unknown = (
    pd.concat([simplemerged_ptm_unknown, unknown_new_from_ptm_data], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

print(f"\tMerged in ptm-related information when UNKNOWN if new sequence binds. Size of ptm unknown ptm sequence dataframe = {len(simplemerged_ptm_unknown)}")

# --- negatives ---
print("Working with unknown data from negatives: simplemerged_neg")
temp = unknown_og_from_ptm_neg_data[_unknown_og_cols].drop_duplicates().reset_index(drop=True)

simplemerged_neg_ptm_unknown = simplemerged_neg.merge(temp, on=_unknown_og_keys, how="left")
simplemerged_neg_ptm_unknown = (
    simplemerged_neg_ptm_unknown[simplemerged_neg_ptm_unknown["PTM Partner Status"].notna()]
    .drop_duplicates()
    .reset_index(drop=True)
)

print(f"\tMerged in ptm-related information from the NEGATIVES database when UNKNOWN if original sequence binds. Size of ptm unknown og sequence dataframe = {len(simplemerged_neg_ptm_unknown)}")

simplemerged_neg_ptm_unknown = (
    pd.concat([simplemerged_neg_ptm_unknown, unknown_new_from_ptm_neg_data], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

print(f"\tMerged in ptm-related information from the NEGATIVES database when UNKNOWN if new sequence binds. Size of ptm unknown ptm sequence dataframe = {len(simplemerged_neg_ptm_unknown)}")

# --- combined: positives-side rows followed by negatives-side rows ---
simplemerged_ptm_unknown = (
    pd.concat([simplemerged_ptm_unknown, simplemerged_neg_ptm_unknown], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)

print(f"Combined simplemerged_neg_ptm_unknown with simplemerged_ptm_unknown. Size of ptm unknown ptm sequence dataframe = {len(simplemerged_ptm_unknown)}")

Working with unknown data from positives: simplemerged
	Merged in ptm-related information when UNKNOWN if original sequence binds. Size of ptm unknown og sequence dataframe = 4285
	Merged in ptm-related information when UNKNOWN if new sequence binds. Size of ptm unknown ptm sequence dataframe = 9363
Working with unknown data from negatives: simplemerged_neg
	Merged in ptm-related information from the NEGATIVES database when UNKNOWN if original sequence binds. Size of ptm unknown og sequence dataframe = 3835
	Merged in ptm-related information from the NEGATIVES database when UNKNOWN if new sequence binds. Size of ptm unknown ptm sequence dataframe = 8383
Combined simplemerged_neg_ptm_unknown with simplemerged_ptm_unknown. Size of ptm unknown ptm sequence dataframe = 17746


In [657]:
# Display the negatives-side "unknown" PTM table (simplemerged_neg_ptm_unknown) for visual inspection
simplemerged_neg_ptm_unknown

Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,ptm_aa_2,ptm_new_binds_bo_mi,ptm_og_binds_bo_mi,PTM all_new_binds,PTM all_og_binds,PTM decisive_entry_new_binds,PTM decisive_entry_og_binds,seq_sort_new,seq_sort_new_id,PTM decisive_seqpair_new_binds
0,False,ADTCPEVKVVGLEGSDKLTILRGCPGLPGAPGPKGEAGVIGERGER...,MHLLAILFCALWSAVLAENSDDYDLMYVNLDNEIDNGLHPTEDPTP...,,,intact:EBI-11784425,intact:EBI-11574553,326,,30,...,,,,,,,,,,
1,False,ADTCPEVKVVGLEGSDKLTILRGCPGLPGAPGPKGEAGVIGERGER...,MHLLAILFCALWSAVLAENSDDYDLMYVNLDNEIDNGLHPTEDPTP...,,,intact:EBI-11784425,intact:EBI-11574553,326,,30,...,,,,,,,,,,
2,False,APIKGVTFGEDTVWEVQGYKNVRITFELDERVDKVLNEKCSVYTVE...,MFQAAERPQEWAMEGPRDGLKKERLLDDRHDSGLDSMKDEEYEQMV...,,,intact:EBI-25474079,intact:EBI-307386,2740,,819,...,,,,,,,,,,
3,False,APIKGVTFGEDTVWEVQGYKNVRITFELDERVDKVLNEKCSVYTVE...,MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFA...,,,intact:EBI-25474079,intact:EBI-3390054,2740,,819,...,,,,,,,,,,
4,False,APIKGVTFGEDTVWEVQGYKNVRITFELDERVDKVLNEKCSVYTVE...,MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFA...,,,intact:EBI-25474079,intact:EBI-3390054,2740,,819,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8378,False,SGFRKMAFPSGKVEGCMVQVTCGTTTLNGLWLDDVVYCPRHVICTS...,MNRHLWKSQLCEMVQPSGGPAADQDVLGEESPLGKPAMLHLPSEQG...,,,intact:EBI-25475864,intact:EBI-27121550,3569.0,,3264.0,...,MNRHLWKSQLCEMVQPSGGPAADQDVLGEESPLGKPAMLHLPSEQG...,unknown,yes,unknown,"unknown,yes",unknown,yes,MNRHLWKSQLCEMVQPSGGPAADQDVLGEESPLGKPAMLHLPSEQG...,seq_sort_new_3493,unknown
8379,False,"<psi-mod:""MOD:01163""(guanylated residue)>KMSDV...",SADAQSFLNRVCGVSAARLTPCGTGTSTDVVYRAFDIYNDKVAGFA...,,,intact:EBI-25475871,intact:EBI-25475885,,5324.0,,...,,unknown,yes,unknown,"unknown,yes",unknown,yes,"<psi-mod:""MOD:01163""(guanylated residue)>KMSDV...",seq_sort_new_30,unknown
8380,False,"S<psi-mod:""MOD:01163""(guanylated residue)>MSDV...",SADAQSFLNRVCGVSAARLTPCGTGTSTDVVYRAFDIYNDKVAGFA...,,,intact:EBI-25475871,intact:EBI-25475885,,5324.0,,...,,unknown,yes,unknown,"unknown,yes",unknown,yes,"S<psi-mod:""MOD:01163""(guanylated residue)>MSDV...",seq_sort_new_4155,unknown
8381,False,"<psi-mod:""MOD:01166""(uridylated residue)>KMSDV...",SADAQSFLNRVCGVSAARLTPCGTGTSTDVVYRAFDIYNDKVAGFA...,,,intact:EBI-25475871,intact:EBI-25475885,,5324.0,,...,,unknown,yes,unknown,"unknown,yes",unknown,yes,"<psi-mod:""MOD:01166""(uridylated residue)>KMSDV...",seq_sort_new_32,unknown


In [658]:
## Drop from simplemerged_ptm every row whose seq_sort pair appears in one of the
## negative or unknown PTM frames (from either the positive or the negative dataset)
neg_seq_pairs = []
for _ptm_frame in (
    neg_new_from_ptm_data,
    neg_og_from_ptm_data,
    unknown_og_from_ptm_data,
    unknown_new_from_ptm_data,
    neg_new_from_ptm_neg_data,
    neg_og_from_ptm_neg_data,
    unknown_og_from_ptm_neg_data,
    unknown_new_from_ptm_neg_data,
):
    # Collect each frame's distinct pair IDs; repeats across frames are harmless for isin
    neg_seq_pairs.extend(_ptm_frame["seq_sort"].unique().tolist())

print(f"Size of simplemerged_ptm before we remove erroneous positives (should be negative or unknown): {len(simplemerged_ptm)}")
simplemerged_ptm = simplemerged_ptm[~simplemerged_ptm["seq_sort"].isin(neg_seq_pairs)]
print(f"Size of simplemerged_ptm after we remove erroneous positives (should be negative or unknown): {len(simplemerged_ptm)}")

Size of simplemerged_ptm before we remove erroneous positives (should be negative or unknown): 743301
Size of simplemerged_ptm after we remove erroneous positives (should be negative or unknown): 736009


In [659]:
# Drop from simplemerged_neg_ptm every row whose seq_sort pair appears in one of the
# positive or unknown PTM frames (from either the positive or the negative dataset)
pos_seq_pairs = []
for _ptm_frame in (
    pos_new_from_ptm_data,
    pos_og_from_ptm_data,
    unknown_og_from_ptm_data,
    unknown_new_from_ptm_data,
    pos_new_from_ptm_neg_data,
    pos_og_from_ptm_neg_data,
    unknown_og_from_ptm_neg_data,
    unknown_new_from_ptm_neg_data,
):
    # Collect each frame's distinct pair IDs; repeats across frames are harmless for isin
    pos_seq_pairs.extend(_ptm_frame["seq_sort"].unique().tolist())

print(f"Size of simplemerged_neg_ptm before we remove erroneous negatives (should be positive or unknown): {len(simplemerged_neg_ptm)}")
simplemerged_neg_ptm = simplemerged_neg_ptm[~simplemerged_neg_ptm["seq_sort"].isin(pos_seq_pairs)]
print(f"Size of simplemerged_neg_ptm after we remove erroneous negatives (should be positive or unknown): {len(simplemerged_neg_ptm)}")

Size of simplemerged_neg_ptm before we remove erroneous negatives (should be positive or unknown): 743239
Size of simplemerged_neg_ptm after we remove erroneous negatives (should be positive or unknown): 736116


In [660]:
# Column counts of both frames, then the columns that exist only in neg_og_from_ptm_data
print(neg_og_from_ptm_data.shape[1])
print(simplemerged.shape[1])
print(set(neg_og_from_ptm_data.columns).difference(set(simplemerged.columns)))

168
111
{'ptm_end_2', 'PTM new_binds_bo_feature_type', 'PTM PubMedID', 'seq_sort_og', 'ptm_orig_2', 'ptm_mi_2', 'PTM decisive_entry_new_binds', 'scraped_ptm_has_info', 'PTM Affected protein full name', 'PTM Interaction AC', 'ptm_name_2', 'ptm_new_binds_bo_mi', 'PTM Feature type', 'ptm_short_2', 'PTM og_binds_bo_feature_type', 'ptm_orig_1', 'scraped_ptm_has_info_2', 'scraped_ptm_has_info_1', 'PTM all_og_binds', 'PTM # Feature AC', 'PTM Affected protein organism', 'seq_sort_new', 'ptm_name_1', 'ptm_og_binds_bo_mi', 'PTM og_binds_bo_annotation', 'seq_sort_new_id', 'ptm_new_2', 'ptm_short_1', 'PTM Feature range(s)', 'PTM Figure legend(s)', 'PTM Affected protein AC', 'PTM all_new_binds', 'ptm_begin_1', 'PTM decisive_entry_og_binds', 'PTM decisive_seqpair_og_binds', 'seq_sort_og_id', 'PTM Partner Status', 'ptm_mi_1', 'PTM new_binds_bo_annotation', 'PTM Feature short label', 'ptm_aa_1', 'PTM Interactor Matches', 'PTM Resulting sequence', 'ptm_end_1', 'agg_ptm_has_info', 'PTM Partner', 'ptm_be

In [661]:
# Show the simplemerged_neg rows whose interaction_intactid also occurs in neg_og_from_ptm_data
l = neg_og_from_ptm_data["interaction_intactid"].unique().tolist()
display(simplemerged_neg[simplemerged_neg["interaction_intactid"].isin(l)])

# Show the simplemerged rows whose interaction_intactid also occurs in pos_og_from_ptm_neg_data
l = pos_og_from_ptm_neg_data["interaction_intactid"].unique().tolist()
display(simplemerged[simplemerged["interaction_intactid"].isin(l)])

Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,all_binding_mi_1,all_binding_name_1,all_binding_short_1,all_binding_begin_1,all_binding_end_1,all_binding_mi_2,all_binding_name_2,all_binding_short_2,all_binding_begin_2,all_binding_end_2
1043,False,ARTKQTARKSTGGKAPRKQLA,MEQVAEGARVTAVPVSAADSTEELAEVEEGVGVVGEDNDAAARGAE...,,,intact:EBI-15634422,intact:EBI-2653928,,,,...,,,,,,,,,,
2775,False,GKGGAKRHRKVLRDNIQGI,MSRYITRLSMRRTYKWNGRPVGEDRKLRRQYYGSMSISVDGRTEDV...,,,intact:EBI-15973242,intact:EBI-15973521,,,,...,,,,,,MI:0442,sufficient binding region,BAH domain,44,166
3134,False,IVGGEDANVQDHPFTVALVTPDGQQFCGGTLAAPNKVVTAAHCTVG...,MPSSVSWGILLLAGLCCLVPVSLAEDPQGDAAQKTDTSHHDQDHPT...,,,intact:EBI-7572597,intact:EBI-986224,,,,...,,,,,,,,,,
4467,False,MAAAAASGAGGAAGAGTGGAGPAGRLLPPPAPGSPAAPAAVSPAAG...,MDKNELVQKAKLAEQAERYDDMAACMKSVTEQGAELSNEERNLLSV...,,,intact:EBI-1181460,intact:EBI-347088,,,,...,MI:0442,sufficient binding region,t-loop region,214,228,,,,,
4604,False,MAAAAATAVGPGAGSAGVAGPGGAGPCATVSVFPGARLLTIGDANG...,GKAPRKQLATKAARKSAPATG,,,intact:EBI-4414343,intact:EBI-1179609,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
723071,False,MVSLTFKNFKKEKVPLDLEPSNTILETKTKLAQSISCEESQIKLIY...,MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFA...,,,intact:EBI-14668,intact:EBI-3390054,,,,...,,,,,,,,,,
725277,False,MVTKPSHNLRREHKWLKETATLQEDKDFVFQAIQKHIANKRPKTNS...,MENITQPTQQSTQATQRFLIEKFSQEQIGENIVCRVICTTGQIPIR...,,,intact:EBI-17059,intact:EBI-17843,,,,...,MI:0117,binding-associated region,binding site,446,456,MI:0117,binding-associated region,binding site,22,162
728886,False,MWIIEAEGDILKGKSRILFPGTYIVGRNVSDDSSHIQVISKSISKR...,MEDTQAIDWDVEEEEETEQSSESLRCNVEPVGRLHIFSGAHGPEKD...,,,intact:EBI-2125045,intact:EBI-495644,,,,...,MI:0442,sufficient binding region,region,1,324,MI:0442,sufficient binding region,region,325,334
737563,False,MYNTVWSMDRDDADWREVMMPYSTELIFYIEMDPPALPPKPPKPMT...,MGTRDDEYDYLFKVVLIGDSGVGKSNLLSRFTRNEFNLESKSTIGV...,,,intact:EBI-79893,intact:EBI-745098,,,,...,,,,,,MI:0442,sufficient binding region,region,92,106


Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,all_binding_mi_1,all_binding_name_1,all_binding_short_1,all_binding_begin_1,all_binding_end_1,all_binding_mi_2,all_binding_name_2,all_binding_short_2,all_binding_begin_2,all_binding_end_2
886,False,APTKVTFGDDTVIEVQGYKSVNITFELDERIDKVLNEKCSAYTVEL...,MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHL...,,,intact:EBI-25492388,intact:EBI-25475847,2763,180,819,...,MI:0442,sufficient binding region,core_region,179,1329,MI:0442,sufficient binding region,region,1,179
909,False,APTKVTFGDDTVIEVQGYKSVNITFELDERIDKVLNEKCSAYTVEL...,MKPATGLWVWVSLLVAAGTVQPSDSQSVCAGTENKLSSLSDLEQQY...,,,intact:EBI-25492388,intact:EBI-80371,2763,,819,...,MI:0442,sufficient binding region,PL-PRO region,746,1060,MI:0442,sufficient binding region,region,136,151
911,False,APTKVTFGDDTVIEVQGYKSVNITFELDERIDKVLNEKCSAYTVEL...,MPNPRPGKPSAPSLALGPSPGASPSWRAAPKASDLLGARGPGGTFQ...,,,intact:EBI-25492388,intact:EBI-983719,2763,,819,...,MI:0442,sufficient binding region,PL-PRO region,746,1060,MI:0442,sufficient binding region,region,42,59
912,False,APTKVTFGDDTVIEVQGYKSVNITFELDERIDKVLNEKCSAYTVEL...,MPNPRPGKPSAPSLALGPSPGASPSWRAAPKASDLLGARGPGGTFQ...,,,intact:EBI-25492388,intact:EBI-983719,2763,,819,...,MI:0442,sufficient binding region,PL-PRO region,746,1060,MI:0442,sufficient binding region,region,1,260
913,False,APTKVTFGDDTVIEVQGYKSVNITFELDERIDKVLNEKCSAYTVEL...,MPNPRPGKPSAPSLALGPSPGASPSWRAAPKASDLLGARGPGGTFQ...,,,intact:EBI-25492388,intact:EBI-983719,2763,,819,...,MI:0442,sufficient binding region,PL-PRO region,746,1060,MI:0442,sufficient binding region,region,1,260
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
741571,False,SGFRKMAFPSGKVEGCMVQVTCGTTTLNGLWLDDVVYCPRHVICTS...,MNRHLWKSQLCEMVQPSGGPAADQDVLGEESPLGKPAMLHLPSEQG...,,,intact:EBI-25475864,intact:EBI-27121550,3569,,3264,...,,,,,,MI:0442,sufficient binding region,region,222,241
741940,False,SKMSDVKCTSVVLLSVLQQLRVESSSKLWAQCVQLHNDILLAKDTT...,SADAQSFLNRVCGVSAARLTPCGTGTSTDVVYRAFDIYNDKVAGFA...,,,intact:EBI-25475871,intact:EBI-25475885,3942,5324,3860,...,,,,,,,,,,
741941,False,SKMSDVKCTSVVLLSVLQQLRVESSSKLWAQCVQLHNDILLAKDTT...,SADAQSFLNRVCGVSAARLTPCGTGTSTDVVYRAFDIYNDKVAGFA...,,,intact:EBI-25475871,intact:EBI-25475885,3942,5324,3860,...,,,,,,,,,,
743048,False,YIYTQ,MGVLRVYVILILVGFCVQIVVVNSQNLTCNSNDLKALEGFMRGLES...,,,intact:EBI-16173029,intact:EBI-16172869,,,,...,,,,,,MI:0442,sufficient binding region,Region 24-659,24,659


Databases that we have at this point
* simplemerged_ptm: positive ogs from positive dataset, merged back into simplemerged
* simplemerged_neg_ptm: negative ogs from negative dataset, merged back into simplemerged
* simplemerged_ptm_unknown (already encapsulates all of the unknowns below, and information from simplemerged/simplemerged_neg when relevant)

* simplemerged_ptm_neg: negatives from simplemerged (neg_og_from_ptm_data), merged back in. Why? because original sequences are affected. Must combine with the rows that already exist about them.
* simplemerged_neg_ptm_pos: positives from simplemerged_neg (pos_og_from_ptm_neg_data), merged back in.  Why? because original sequences are affected. Must combine with the rows that already exist about them.

Extracted pos/neg/unknown
* pos_new_from_ptm_data
* pos_og_from_ptm_data
* pos_new_from_ptm_neg_data
* pos_og_from_ptm_neg_data
* neg_new_from_ptm_data
* neg_og_from_ptm_data
* neg_new_from_ptm_neg_data
* neg_og_from_ptm_neg_data
* unknown_og_from_ptm_data: unknowns from simplemerged, merged back in. Why? Because original sequences are affected.
* unknown_new_from_ptm_data: unknowns from simplemerged, merged back in. Why? Because original sequences are affected.
* unknown_og_from_ptm_neg_data
* unknown_new_from_ptm_neg_data

In [662]:
# These two frames come from the negative dataset but are concatenated into simplemerged_ptm below,
# so their Negative flag is cleared first
pos_new_from_ptm_neg_data["Negative"] = False
pos_og_from_ptm_neg_data["Negative"] = False

# Build simplemerged_ptm from simplemerged, first removing rows that will be re-added from the PTM frames.
# Rows are identified by the combined key "<interaction_intactid>_<unique_id>".
temp = pd.concat([pos_og_from_ptm_neg_data, pos_og_from_ptm_data])
temp = (temp["interaction_intactid"] + "_" + temp["unique_id"]).to_frame("temp")
exclude_combo = temp["temp"].unique().tolist()
print(len(exclude_combo))
print("Merging the newly found positive ogs and news from each dataset with original simplemerged.")
print("First, deleting any rows that have the same interaction_intactid and unique_id, as these will be merged back in.")
print(f"Total to be excluded: {len(exclude_combo)}")

simplemerged_ptm = simplemerged.copy(deep=True)
test1 = len(simplemerged_ptm)
# Temporary key column, used only for the exclusion filter and dropped right after
simplemerged_ptm = simplemerged_ptm.assign(
    temp=simplemerged_ptm["interaction_intactid"] + "_" + simplemerged_ptm["unique_id"]
)
simplemerged_ptm = (
    simplemerged_ptm[~simplemerged_ptm["temp"].isin(exclude_combo)]
    .reset_index(drop=True)
    .drop(columns=["temp"])
)
print(f"Total simplemerged_ptm rows that were excluded: {test1-len(simplemerged_ptm)}. Length now: {len(simplemerged_ptm)}")

# Stack the PTM-derived positive frames under the remaining rows and remove exact duplicate rows
simplemerged_ptm = (
    pd.concat(
        [
            simplemerged_ptm,
            pos_new_from_ptm_data,
            pos_new_from_ptm_neg_data,
            pos_og_from_ptm_neg_data,
            pos_og_from_ptm_data,
        ]
    )
    .reset_index(drop=True)
    .drop_duplicates()
    .reset_index(drop=True)
)
print(f"Size of simplemerged_ptm after we concatenate new positives found from ptms (both from positive and negative initial datasets): {len(simplemerged_ptm)}")

## Finally drop every row whose seq_sort pair appears in one of the negative or unknown PTM frames
neg_seq_pairs = []
for _ptm_frame in (
    neg_new_from_ptm_data,
    neg_og_from_ptm_data,
    unknown_og_from_ptm_data,
    unknown_new_from_ptm_data,
    neg_new_from_ptm_neg_data,
    neg_og_from_ptm_neg_data,
    unknown_og_from_ptm_neg_data,
    unknown_new_from_ptm_neg_data,
):
    neg_seq_pairs.extend(_ptm_frame["seq_sort"].unique().tolist())

print(f"Size of simplemerged_ptm before we remove erroneous positives (should be negative or unknown): {len(simplemerged_ptm)}")
simplemerged_ptm = simplemerged_ptm[~simplemerged_ptm["seq_sort"].isin(neg_seq_pairs)]
print(f"Size of simplemerged_ptm after we remove erroneous positives (should be negative or unknown): {len(simplemerged_ptm)}")


584
Merging the newly found positive ogs and news from each dataset with original simplemerged.
First, deleting any rows that have the same interaction_intactid and unique_id, as these will be merged back in.
Total to be excluded: 584
Total simplemerged_ptm rows that were excluded: 584. Length now: 742546
Size of simplemerged_ptm after we concatenate new positives found from ptms (both from positive and negative initial datasets): 746999
Size of simplemerged_ptm before we remove erroneous positives (should be negative or unknown): 746999
Size of simplemerged_ptm after we remove erroneous positives (should be negative or unknown): 739707


In [663]:
# Row count and first five rows of each negative table.

# PTM sequences flagged as negative, found while processing the positive PPI data
print(f"Total negative ptm sequences from processing positive PPI data: {len(neg_new_from_ptm_data)}")
display(neg_new_from_ptm_data.head(n=5))

# PTM sequences flagged as negative, found while processing the negative PPI data
print(f"Total negative ptm sequences from processing negative PPI data: {len(neg_new_from_ptm_neg_data)}")
display(neg_new_from_ptm_neg_data.head(n=5))

# Original sequences flagged as negative (positive PPI data); note this one reads simplemerged_ptm_neg
print(f"Total negative original sequences from processing positive PPI data: {len(simplemerged_ptm_neg)}")
display(simplemerged_ptm_neg.head(n=5))

# Original sequences flagged as negative (negative PPI data)
print(f"Total negative original sequences from processing negative PPI data: {len(neg_og_from_ptm_neg_data)}")
display(neg_og_from_ptm_neg_data.head(n=5))

Total negative ptm sequences from processing positive PPI data: 70


Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,PTM all_og_binds,PTM decisive_entry_new_binds,PTM decisive_entry_og_binds,seq_sort_og,seq_sort_new,seq_sort_og_id,PTM decisive_seqpair_og_binds,seq_sort_new_id,PTM decisive_seqpair_new_binds,PTM Partner Status
0,False,KVEQAVETEPEPELRQQTEWQSGQRWELALGRFWDYLRWVQTLSEQ...,MAEPRQEFEVMEDHAGTYGLGDRKDQGGYTMHQDQEGDTDAGLKES...,,,intact:EBI-9209835,intact:EBI-366182,317.0,,19.0,...,"unknown,yes",no,yes,KVEQAVETEPEPELRQQTEWQSGQRWELALGRFWDYLRWVQTLSEQ...,KVEQAVETEPEPELRQQTEWQSGQRWELALGRFWDYLRWVQTLSEQ...,seq_sort_og_172,yes,seq_sort_new_338,no,ptm
1,False,MAAGGGGGSSKASSSSASSAGALESSLDRKFQSVTNTMESIQGLSS...,"SPSYSPT<psi-mi:""MI:0176""(O-phospho-L-serine)>P...",,,intact:EBI-5280390,intact:EBI-16112696,,,,...,"unknown,yes",no,yes,MAAGGGGGSSKASSSSASSAGALESSLDRKFQSVTNTMESIQGLSS...,MAAGGGGGSSKASSSSASSAGALESSLDRKFQSVTNTMESIQGLSS...,seq_sort_og_232,yes,seq_sort_new_420,no,ptm
2,False,MAASAAAASAAAASAASGSPGPGEGSAGGEKRSTAPSAAASASASA...,"STGGKAPR<psi-mi:""MI:0526""(N-acetylated L-lysin...",,,intact:EBI-78139,intact:EBI-16203811,,,,...,"unknown,yes",no,yes,MAASAAAASAAAASAASGSPGPGEGSAGGEKRSTAPSAAASASASA...,MAASAAAASAAAASAASGSPGPGEGSAGGEKRSTAPSAAASASASA...,seq_sort_og_273,yes,seq_sort_new_505,no,ptm
3,False,MAASAAAASAAAASAASGSPGPGEGSAGGEKRSTAPSAAASASASA...,"STGGKAPR<psi-mi:""MI:0526""(N-acetylated L-lysin...",,,intact:EBI-78139,intact:EBI-16203811,,,,...,"unknown,yes",no,yes,MAASAAAASAAAASAASGSPGPGEGSAGGEKRSTAPSAAASASASA...,MAASAAAASAAAASAASGSPGPGEGSAGGEKRSTAPSAAASASASA...,seq_sort_og_273,yes,seq_sort_new_505,no,ptm
4,False,MAMQMQLEANADTSVEEESFGPQPISRLEQCGINANDVKKLEEAGF...,MPIGSKERPTFFEIFKTRCNKADLGPISLNWFEELSSEAPPYNSEP...,,,intact:EBI-15557721,intact:EBI-79792,,,,...,"unknown,yes",no,yes,MAMQMQLEANADTSVEEESFGPQPISRLEQCGINANDVKKLEEAGF...,MAMQMQLEANADTSVEEESFGPQPISRLEQCGINANDVKKLEEAGF...,seq_sort_og_674,yes,seq_sort_new_1235,no,ptm


Total negative ptm sequences from processing negative PPI data: 65


Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,PTM all_og_binds,PTM decisive_entry_new_binds,PTM decisive_entry_og_binds,seq_sort_og,seq_sort_new,seq_sort_og_id,PTM decisive_seqpair_og_binds,seq_sort_new_id,PTM decisive_seqpair_new_binds,PTM Partner Status
0,False,KVEQAVETEPEPELRQQTEWQSGQRWELALGRFWDYLRWVQTLSEQ...,MAEPRQEFEVMEDHAGTYGLGDRKDQGGYTMHQDQEGDTDAGLKES...,,,intact:EBI-9209835,intact:EBI-366182,317.0,,19.0,...,"unknown,yes",no,yes,KVEQAVETEPEPELRQQTEWQSGQRWELALGRFWDYLRWVQTLSEQ...,KVEQAVETEPEPELRQQTEWQSGQRWELALGRFWDYLRWVQTLSEQ...,seq_sort_og_161,yes,seq_sort_new_320,no,ptm
1,False,MAAGGGGGSSKASSSSASSAGALESSLDRKFQSVTNTMESIQGLSS...,"SPSYSPT<psi-mi:""MI:0176""(O-phospho-L-serine)>P...",,,intact:EBI-5280390,intact:EBI-16112696,,,,...,"unknown,yes",no,yes,MAAGGGGGSSKASSSSASSAGALESSLDRKFQSVTNTMESIQGLSS...,MAAGGGGGSSKASSSSASSAGALESSLDRKFQSVTNTMESIQGLSS...,seq_sort_og_215,yes,seq_sort_new_394,no,ptm
2,False,MAASAAAASAAAASAASGSPGPGEGSAGGEKRSTAPSAAASASASA...,"STGGKAPR<psi-mi:""MI:0526""(N-acetylated L-lysin...",,,intact:EBI-78139,intact:EBI-16203811,,,,...,"unknown,yes",no,yes,MAASAAAASAAAASAASGSPGPGEGSAGGEKRSTAPSAAASASASA...,MAASAAAASAAAASAASGSPGPGEGSAGGEKRSTAPSAAASASASA...,seq_sort_og_242,yes,seq_sort_new_462,no,ptm
3,False,MAASAAAASAAAASAASGSPGPGEGSAGGEKRSTAPSAAASASASA...,"STGGKAPR<psi-mi:""MI:0526""(N-acetylated L-lysin...",,,intact:EBI-78139,intact:EBI-16203811,,,,...,"unknown,yes",no,yes,MAASAAAASAAAASAASGSPGPGEGSAGGEKRSTAPSAAASASASA...,MAASAAAASAAAASAASGSPGPGEGSAGGEKRSTAPSAAASASASA...,seq_sort_og_242,yes,seq_sort_new_462,no,ptm
4,False,MAMQMQLEANADTSVEEESFGPQPISRLEQCGINANDVKKLEEAGF...,MPIGSKERPTFFEIFKTRCNKADLGPISLNWFEELSSEAPPYNSEP...,,,intact:EBI-15557721,intact:EBI-79792,,,,...,"unknown,yes",no,yes,MAMQMQLEANADTSVEEESFGPQPISRLEQCGINANDVKKLEEAGF...,MAMQMQLEANADTSVEEESFGPQPISRLEQCGINANDVKKLEEAGF...,seq_sort_og_592,yes,seq_sort_new_1118,no,ptm


Total negative original sequences from processing positive PPI data: 1061


Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,all_binding_short_2,all_binding_begin_2,all_binding_end_2,PTM decisive_seqpair_og_binds,PTM Partner,PTM Partner Status,ptm_short_1,ptm_short_2,seq_sort_og,seq_sort_og_id
0,False,ARTKQTARKSTGGKAPRKQLA,MEQVAEGARVTAVPVSAADSTEELAEVEEGVGVVGEDNDAAARGAE...,,,intact:EBI-15634422,intact:EBI-2653928,,,,...,,,,no,A,original,lys-9,,ARTKQTARKSTGGKAPRKQLA_MEQVAEGARVTAVPVSAADSTEEL...,seq_sort_og_40
1,False,GKGGAKRHRKVLRDNIQGI,MSRYITRLSMRRTYKWNGRPVGEDRKLRRQYYGSMSISVDGRTEDV...,,,intact:EBI-15973242,intact:EBI-15973521,,,,...,BAH domain,44.0,166.0,no,A,original,lys-10,,GKGGAKRHRKVLRDNIQGI_MSRYITRLSMRRTYKWNGRPVGEDRK...,seq_sort_og_145
2,False,IVGGEDANVQDHPFTVALVTPDGQQFCGGTLAAPNKVVTAAHCTVG...,MPSSVSWGILLLAGLCCLVPVSLAEDPQGDAAQKTDTSHHDQDHPT...,,,intact:EBI-7572597,intact:EBI-986224,,,,...,,,,no,B,original,,other modification,IVGGEDANVQDHPFTVALVTPDGQQFCGGTLAAPNKVVTAAHCTVG...,seq_sort_og_156
3,False,MAAAAASGAGGAAGAGTGGAGPAGRLLPPPAPGSPAAPAAVSPAAG...,MDKNELVQKAKLAEQAERYDDMAACMKSVTEQGAELSNEERNLLSV...,,,intact:EBI-1181460,intact:EBI-347088,,,,...,,,,no,A,original,thr-163,,MAAAAASGAGGAAGAGTGGAGPAGRLLPPPAPGSPAAPAAVSPAAG...,seq_sort_og_189
4,False,MAAAAATAVGPGAGSAGVAGPGGAGPCATVSVFPGARLLTIGDANG...,GKAPRKQLATKAARKSAPATG,,,intact:EBI-4414343,intact:EBI-1179609,,,,...,,,,no,B,original,,methylated residue,GKAPRKQLATKAARKSAPATG_MAAAAATAVGPGAGSAGVAGPGGA...,seq_sort_og_138


Total negative original sequences from processing negative PPI data: 1131


Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,PTM all_og_binds,PTM decisive_entry_new_binds,PTM decisive_entry_og_binds,seq_sort_og,seq_sort_new,seq_sort_og_id,PTM decisive_seqpair_og_binds,seq_sort_new_id,PTM decisive_seqpair_new_binds,PTM Partner Status
0,False,ARTKQTARKSTGGKAPRKQLA,MEQVAEGARVTAVPVSAADSTEELAEVEEGVGVVGEDNDAAARGAE...,,,intact:EBI-15634422,intact:EBI-2653928,,,,...,"no,unknown",yes,no,ARTKQTARKSTGGKAPRKQLA_MEQVAEGARVTAVPVSAADSTEEL...,"ARTKQTAR<psi-mod:""MOD:01683""(monomethylated L-...",seq_sort_og_38,no,seq_sort_new_204,yes,original
1,False,GKGGAKRHRKVLRDNIQGI,MSRYITRLSMRRTYKWNGRPVGEDRKLRRQYYGSMSISVDGRTEDV...,,,intact:EBI-15973242,intact:EBI-15973521,,,,...,"no,unknown",yes,no,GKGGAKRHRKVLRDNIQGI_MSRYITRLSMRRTYKWNGRPVGEDRK...,"GKGGAKRHR<psi-mi:""MI:0166""(N6,N6-dimethyl-L-ly...",seq_sort_og_139,no,seq_sort_new_288,yes,original
2,False,IVGGEDANVQDHPFTVALVTPDGQQFCGGTLAAPNKVVTAAHCTVG...,MPSSVSWGILLLAGLCCLVPVSLAEDPQGDAAQKTDTSHHDQDHPT...,,,intact:EBI-7572597,intact:EBI-986224,,,,...,"no,unknown",yes,no,IVGGEDANVQDHPFTVALVTPDGQQFCGGTLAAPNKVVTAAHCTVG...,IVGGEDANVQDHPFTVALVTPDGQQFCGGTLAAPNKVVTAAHCTVG...,seq_sort_og_148,no,seq_sort_new_303,yes,original
3,False,MAAAAATAVGPGAGSAGVAGPGGAGPCATVSVFPGARLLTIGDANG...,GKAPRKQLATKAARKSAPATG,,,intact:EBI-4414343,intact:EBI-1179609,,,,...,"no,unknown",yes,no,GKAPRKQLATKAARKSAPATG_MAAAAATAVGPGAGSAGVAGPGGA...,"GKAPRKQLATKAARKS<psi-mod:""MOD:00427""(methylate...",seq_sort_og_132,no,seq_sort_new_279,yes,original
4,False,MAAAAATAVGPGAGSAGVAGPGGAGPCATVSVFPGARLLTIGDANG...,GKAPRKQLATKAARKSAPATG,,,intact:EBI-4414343,intact:EBI-1179609,,,,...,"no,unknown",yes,no,GKAPRKQLATKAARKSAPATG_MAAAAATAVGPGAGSAGVAGPGGA...,"GKAPRKQLATKAARKS<psi-mod:""MOD:00427""(methylate...",seq_sort_og_132,no,seq_sort_new_279,yes,original


In [664]:
# Every PTM-derived frame merged below is a negative (non-binding) example.
for _frame in (neg_new_from_ptm_data, neg_og_from_ptm_data):
    _frame["Negative"] = True

# Build simplemerged_neg_ptm. Rows of simplemerged_neg whose
# interaction_intactid + unique_id combination also appears in the PTM-derived
# "original" frames are removed first, because those frames are concatenated
# back in further down.
temp = pd.concat([neg_og_from_ptm_neg_data, neg_og_from_ptm_data])
temp = (temp["interaction_intactid"] + "_" + temp["unique_id"]).to_frame("temp")
exclude_combo = temp["temp"].unique().tolist()
# NOTE(review): the messages below say "positive" although this cell assembles
# the negative table; the strings are left untouched.
print("Merging the newly found positive ogs and news from each dataset with original simplemerged_neg.")
print("First, deleting any rows that have the same interaction_intactid and unique_id, as these will be merged back in.")
print(f"Total to be excluded: {len(exclude_combo)}")

test1 = len(simplemerged_neg)
simplemerged_neg_ptm = (
    simplemerged_neg.copy(deep=True)
    # temporary combined key, dropped again at the end of the chain
    .assign(temp=lambda d: d["interaction_intactid"] + "_" + d["unique_id"])
    .loc[lambda d: ~d["temp"].isin(exclude_combo)]
    .reset_index(drop=True)
    .drop(columns=["temp"])
)
print(f"Total simplemerged_neg_ptm rows that were excluded: {test1-len(simplemerged_neg_ptm)}. Length now: {len(simplemerged_neg_ptm)}")

# Append the PTM-derived negatives and drop exact duplicate rows.
simplemerged_neg_ptm = (
    pd.concat([
        simplemerged_neg_ptm,
        neg_new_from_ptm_data,
        neg_new_from_ptm_neg_data,
        neg_og_from_ptm_neg_data,
        neg_og_from_ptm_data,
    ])
    .reset_index(drop=True)
    .drop_duplicates()
    .reset_index(drop=True)
)
print(f"Size of simplemerged_neg_ptm after we concatenate new positives found from ptms (both from positive and negative initial datasets): {len(simplemerged_neg_ptm)}")

# Collect every seq_sort value that the PTM processing labelled positive or
# unknown; a negative row with one of these values must not stay in the table.
pos_seq_pairs = []
for _frame in (
    pos_new_from_ptm_data,
    pos_og_from_ptm_data,
    unknown_og_from_ptm_data,
    unknown_new_from_ptm_data,
    pos_new_from_ptm_neg_data,
    pos_og_from_ptm_neg_data,
    unknown_og_from_ptm_neg_data,
    unknown_new_from_ptm_neg_data,
):
    pos_seq_pairs.extend(_frame["seq_sort"].unique().tolist())

print(f"Size of simplemerged_neg_ptm before we remove erroneous negatives (should be positive or unknown): {len(simplemerged_neg_ptm)}")
_is_pos_or_unknown = simplemerged_neg_ptm["seq_sort"].isin(pos_seq_pairs)
# The index is intentionally not reset here, matching the original cell.
simplemerged_neg_ptm = simplemerged_neg_ptm.loc[~_is_pos_or_unknown]
print(f"Size of simplemerged_neg_ptm after we remove erroneous negatives (should be positive or unknown): {len(simplemerged_neg_ptm)}")

Merging the newly found positive ogs and news from each dataset with original simplemerged_neg.
First, deleting any rows that have the same interaction_intactid and unique_id, as these will be merged back in.
Total to be excluded: 948
Total simplemerged_neg_ptm rows that were excluded: 948. Length now: 742182
Size of simplemerged_neg_ptm after we concatenate new positives found from ptms (both from positive and negative initial datasets): 744691
Size of simplemerged_neg_ptm before we remove erroneous negatives (should be positive or unknown): 744691
Size of simplemerged_neg_ptm after we remove erroneous negatives (should be positive or unknown): 737568


In [665]:
# Build one combined PTM label (`ptm_short`) per row of simplemerged_ptm from
# the per-partner labels, then count rows that repeat the same identifying key.

# "|" is the join separator, so first confirm that no label already contains a
# literal pipe (regex=False matches the character itself).
test1 = not (
    simplemerged_ptm["ptm_short_1"].fillna("").str.contains("|", regex=False)
    | simplemerged_ptm["ptm_short_2"].fillna("").str.contains("|", regex=False)
).any()
print(f"Can join ptm labels with pipe because pipe is not used in them: {test1}")

# Join the two labels and trim the separator left over when one side is empty.
# str.strip takes a set of characters, not a regex: the previous "\\|" argument
# also stripped backslashes, so only "|" is passed now.
simplemerged_ptm["ptm_short"] = (
    simplemerged_ptm["ptm_short_1"].fillna("") + "|" + simplemerged_ptm["ptm_short_2"].fillna("")
).str.strip("|")

# Bare expression in the middle of a cell: evaluated (raises KeyError if a
# column is missing) but not rendered. The second column used to repeat
# all_intact_A_sorted; it now selects the B side.
simplemerged_ptm[["all_intact_A_sorted","all_intact_B_sorted","unique_id","seq_pair_id","interaction_intactid","ptm_short_1","ptm_short_2","ptm_short","PTM Partner"]]

# Rows that repeat an earlier row on the full key. Note that unique_id is part
# of the key even though the message below does not mention it.
test1 = int(
    simplemerged_ptm.duplicated(["unique_id","seq_pair_id","interaction_intactid","ptm_short"]).sum()
)
print(f"Rows in simplemerged_ptm with duplicate seq_pair_id+interaction_intactid+ptm_short: {test1}")

Can join ptm labels with pipe because pipe is not used in them: True
Rows in simplemerged_ptm with duplicate seq_pair_id+interaction_intactid+ptm_short: 2788


In [666]:
# Build one combined PTM label (`ptm_short`) per row of simplemerged_neg_ptm
# from the per-partner labels, then count rows that repeat the same key.

# "|" is the join separator, so first confirm that no label already contains a
# literal pipe (regex=False matches the character itself).
test1 = not (
    simplemerged_neg_ptm["ptm_short_1"].fillna("").str.contains("|", regex=False)
    | simplemerged_neg_ptm["ptm_short_2"].fillna("").str.contains("|", regex=False)
).any()
print(f"Can join ptm labels with pipe because pipe is not used in them: {test1}")

# Join the two labels and trim the separator left over when one side is empty.
# str.strip takes a set of characters, not a regex: the previous "\\|" argument
# also stripped backslashes, so only "|" is passed now.
simplemerged_neg_ptm["ptm_short"] = (
    simplemerged_neg_ptm["ptm_short_1"].fillna("") + "|" + simplemerged_neg_ptm["ptm_short_2"].fillna("")
).str.strip("|")

# Bare expression in the middle of a cell: evaluated (raises KeyError if a
# column is missing) but not rendered. The second column used to repeat
# all_intact_A_sorted; it now selects the B side.
simplemerged_neg_ptm[["all_intact_A_sorted","all_intact_B_sorted","unique_id","seq_pair_id","interaction_intactid","ptm_short_1","ptm_short_2","ptm_short","PTM Partner"]]

# Rows that repeat an earlier row on the full key. Note that unique_id is part
# of the key even though the message below does not mention it.
test1 = int(
    simplemerged_neg_ptm.duplicated(["unique_id","seq_pair_id","interaction_intactid","ptm_short"]).sum()
)
print(f"Rows in simplemerged_neg_ptm with duplicate seq_pair_id+interaction_intactid+ptm_short: {test1}")

Can join ptm labels with pipe because pipe is not used in them: True
Rows in simplemerged_neg_ptm with duplicate seq_pair_id+interaction_intactid+ptm_short: 1409


In [667]:
# Build one combined PTM label (`ptm_short`) per row of simplemerged_ptm_unknown
# from the per-partner labels, then count rows that repeat the same key.

# "|" is the join separator, so first confirm that no label already contains a
# literal pipe (regex=False matches the character itself).
test1 = not (
    simplemerged_ptm_unknown["ptm_short_1"].fillna("").str.contains("|", regex=False)
    | simplemerged_ptm_unknown["ptm_short_2"].fillna("").str.contains("|", regex=False)
).any()
print(f"Can join ptm labels with pipe because pipe is not used in them: {test1}")

# Join the two labels and trim the separator left over when one side is empty.
# str.strip takes a set of characters, not a regex: the previous "\\|" argument
# also stripped backslashes, so only "|" is passed now.
simplemerged_ptm_unknown["ptm_short"] = (
    simplemerged_ptm_unknown["ptm_short_1"].fillna("") + "|" + simplemerged_ptm_unknown["ptm_short_2"].fillna("")
).str.strip("|")

# Bare expression in the middle of a cell: evaluated (raises KeyError if a
# column is missing) but not rendered. The second column used to repeat
# all_intact_A_sorted; it now selects the B side.
simplemerged_ptm_unknown[["all_intact_A_sorted","all_intact_B_sorted","unique_id","seq_pair_id","interaction_intactid","ptm_short_1","ptm_short_2","ptm_short","PTM Partner"]]

# Rows that repeat an earlier row on the full key. Note that unique_id is part
# of the key even though the message below does not mention it.
test1 = int(
    simplemerged_ptm_unknown.duplicated(["unique_id","seq_pair_id","interaction_intactid","ptm_short"]).sum()
)
print(f"Rows in simplemerged_ptm_unknown with duplicate seq_pair_id+interaction_intactid+ptm_short: {test1}")

Can join ptm labels with pipe because pipe is not used in them: True
Rows in simplemerged_ptm_unknown with duplicate seq_pair_id+interaction_intactid+ptm_short: 12882


In [668]:
# Merge pos_og_from_ptm_data back with simplemerged

In [669]:
# For the mutation table and then the PTM table, list the column names that
# contain "decisive" or "all".
for _table in (simplemerged_mut, simplemerged_ptm):
    print([col for col in _table.columns if any(tag in col for tag in ("decisive", "all"))])

['all_intact_A_sorted', 'all_intact_B_sorted', 'unique_all_intact_sorted', 'all_binding_mi_1', 'all_binding_name_1', 'all_binding_short_1', 'all_binding_begin_1', 'all_binding_end_1', 'all_binding_mi_2', 'all_binding_name_2', 'all_binding_short_2', 'all_binding_begin_2', 'all_binding_end_2', 'Mutated all_new_binds', 'Mutated all_og_binds', 'Mutated decisive_entry_new_binds', 'Mutated decisive_entry_og_binds', 'Mutated decisive_seqpair_og_binds', 'Mutated decisive_seqpair_new_binds']
['all_intact_A_sorted', 'all_intact_B_sorted', 'unique_all_intact_sorted', 'all_binding_mi_1', 'all_binding_name_1', 'all_binding_short_1', 'all_binding_begin_1', 'all_binding_end_1', 'all_binding_mi_2', 'all_binding_name_2', 'all_binding_short_2', 'all_binding_begin_2', 'all_binding_end_2', 'PTM all_new_binds', 'PTM all_og_binds', 'PTM decisive_entry_new_binds', 'PTM decisive_entry_og_binds', 'PTM decisive_seqpair_og_binds', 'PTM decisive_seqpair_new_binds']


In [670]:
# Count the rows of `ptms` whose "Feature annotation(s)" value is neither the
# "-" placeholder nor missing, then print the total row count for comparison.
_annotation = ptms["Feature annotation(s)"]
_has_annotation = (_annotation != "-") & (_annotation.notna())
print(ptms.loc[_has_annotation].shape[0])
print(ptms.shape[0])

1105
10286


In [671]:
# Show the interaction/pair identifiers next to the confidence value, first for
# the PTM table and then for the mutation table.
_score_cols = ["interaction_intactid", "unique_id", "confidence_val_int"]
for _table in (simplemerged_ptm, simplemerged_mut):
    display(_table[_score_cols])

Unnamed: 0,interaction_intactid,unique_id,confidence_val_int
0,EBI-20589590,intact:EBI-20589573_intact:EBI-358616,0.44
1,EBI-25507641,intact:EBI-25507607_intact:EBI-448610,0.37
2,EBI-25508294,intact:EBI-16730154_intact:EBI-25507607,0.37
3,EBI-25507637,intact:EBI-1380492_intact:EBI-25507607,0.37
4,EBI-25508313,intact:EBI-25507607_intact:EBI-25508298,0.37
...,...,...,...
746994,EBI-28965837,intact:EBI-29014783_intact:EBI-29020361,0.65
746995,EBI-28966017,intact:EBI-29020361_intact:EBI-29036734,0.65
746996,EBI-28966029,intact:EBI-29020361_intact:EBI-29036734,0.65
746997,EBI-16173124,intact:EBI-16172869_intact:EBI-16173029,0.61


Unnamed: 0,interaction_intactid,unique_id,confidence_val_int
0,EBI-22074159,intact:EBI-100018_intact:EBI-101707,0.37
1,EBI-502739,intact:EBI-100018_intact:EBI-102069,0.37
2,EBI-263347,intact:EBI-100018_intact:EBI-104215,0.37
3,EBI-235587,intact:EBI-100018_intact:EBI-107089,0.37
4,EBI-22074151,intact:EBI-100018_intact:EBI-117032,0.37
...,...,...,...
775667,EBI-24735616,intact:EBI-999900_intact:EBI-999909,0.93
775668,EBI-25025071,intact:EBI-999900_intact:EBI-999909,0.93
775669,EBI-999899,intact:EBI-999900_intact:EBI-999909,0.93
775670,EBI-999928,intact:EBI-999900_intact:EBI-999909,0.93


## Resolve any disagreeing PTM-mutation behavior 

In [672]:
# Purpose: inner-join the PTM table (simplemerged_ptm) with the mutation table
# (simplemerged_mut) on the columns they share, producing simplemerged_ptm_and_mut.
#
# how do we merge the rows where there are mutations and PTMs at the same time? and maybe we should investigate some of these?
# simplemerged_ptm and simplemerged are not the same
# simplemerged_ptm is subsetted from merged, like simplemerged is subsetted from merged

# Cast the two score columns of the PTM table to float before they can take part in the join.
# NOTE(review): presumably simplemerged_mut already stores these as float — confirm.
simplemerged_ptm["confidence_val_int"] = simplemerged_ptm["confidence_val_int"].astype(float)
simplemerged_ptm["unique_score_int"] = simplemerged_ptm["unique_score_int"].astype(float)

# c1 / c2: column names of each table; c3 starts as the columns present in both.
c1 = set(list(simplemerged_mut.columns))
c2 = set(list(simplemerged_ptm.columns))
c3 = c1.intersection(c2)
# These are the columns from before that will differ depending on whether a mutation is present.
# The simplemerged_mut values are the ones kept: the columns are dropped from the PTM side below.
change_cols = ["aa_1",
 "length_1",
 "invalids_aa_1",
 "uniprot_A",
 "chain_seq_start_1",
 "chain_seq_end_1",
 "uniprot_A_equalseq",
 "uniprot_A_equalseq_canonical",
 "uniprot_A_full",
 "uniprot_A_inseq",
 "uniprot_A_inseq_canonical",
 "uniprot_A_noiso1",
 "aa_2",
 "length_2",
 "invalids_aa_2",
 "uniprot_B",
 "chain_seq_start_2",
 "chain_seq_end_2",
 "uniprot_B_equalseq",
 "uniprot_B_equalseq_canonical",
 "uniprot_B_full",
 "uniprot_B_inseq",
 "uniprot_B_inseq_canonical",
 "uniprot_B_noiso1"]
# Second group of columns excluded from the join keys.
# NOTE(review): the name suggests these hold pipe-joined values that may differ
# between the two tables — confirm against where they are built.
need_pipejoin = ["interaction_xml_id",
                 "reactome_1","reactome_2",
                 "host_label_full_1",
                "host_label_full_2",
                "host_label_short_1",
                "host_label_short_2",
                "host_taxid_1",
                "host_taxid_2",
                "host_cell_type_1","host_cell_type_2",
                "interaction_label", "interaction_mi","pubmeds"
] + [
    "all_binding_mi_1", 
"all_binding_name_1", 
"all_binding_short_1", 
"all_binding_begin_1", 
"all_binding_end_1", 
"all_binding_mi_2", 
"all_binding_name_2", 
"all_binding_short_2",
"all_binding_begin_2", 
"all_binding_end_2"
]
# Should be joining on "confidence_val_int","unique_score_int" but trying without
# NOTE(review): neither score column is added to noc1 below, so if both tables
# have them they remain join keys — the comment above may be stale.
# noc1: every column that is dropped from the PTM side and not used as a join key.
noc1 = list(set(change_cols+need_pipejoin)) + ["seq_sort_og", "seq_sort_og_id","seq_sort","seq_sort_new","seq_sort_new_id"]

# c3 is now the final join-key list: shared columns minus the excluded ones.
# It is reused by the later comparison cell.
c3 = list(c3 - set(noc1))
print(len(c3))
    
# drop(columns=noc1) requires every name in noc1 to exist in simplemerged_ptm.
# how="inner" keeps only rows that match on all key columns.
# NOTE(review): the next cell's output shows fewer rows with a non-null
# "PTM Partner" after the merge (2883) than in simplemerged_ptm (4453) —
# confirm that dropping those rows is intended.
simplemerged_ptm_and_mut = pd.merge(
    simplemerged_ptm.drop(columns=noc1), 
    simplemerged_mut,
    on=c3,
    how="inner"
)
print(f"Length of simplemerged_ptm_and_mut: {len(simplemerged_ptm_and_mut)}")

62
Length of simplemerged_ptm_and_mut: 769753


In [673]:
# Count rows with a non-null "PTM Partner": first in the merged table, then in
# the PTM table it was built from.
for _table in (simplemerged_ptm_and_mut, simplemerged_ptm):
    print(int(_table["PTM Partner"].notna().sum()))

2883
4453


In [674]:
# interaction_intactid values of rows that carry a PTM partner / a mutated partner.
ptm_partner_yes = simplemerged_ptm.loc[
    simplemerged_ptm["PTM Partner"].notna(), "interaction_intactid"
].tolist()
mutated_parter_yes = simplemerged_mut.loc[
    simplemerged_mut["Mutated Partner"].notna(), "interaction_intactid"
].tolist()

# IDs present in both lists: print how many there are, then one example
# (sets are unordered, so which ID is printed is arbitrary).
_shared_ids = set(ptm_partner_yes) & set(mutated_parter_yes)
print(len(_shared_ids))
print(list(_shared_ids)[0])

346
EBI-7972588


In [675]:
# Inspect a single interaction in both tables to see where they differ.
_probe_id = "EBI-16040496"
_ptm_rows = simplemerged_ptm.loc[simplemerged_ptm["interaction_intactid"] == _probe_id]
_mut_rows = simplemerged_mut.loc[simplemerged_mut["interaction_intactid"] == _probe_id]
display(_ptm_rows)
display(_mut_rows)

# Stack PTM rows on top of mutation rows, renumbered from 0.
temp = pd.concat([_ptm_rows, _mut_rows]).reset_index(drop=True)
# NOTE(review): in the recorded output the PTM table contributes two rows for
# this ID, so positions 0 and 1 are both PTM rows — confirm that [0,1] is the
# intended pair to compare.
x = cols_with_differences(temp, [0,1])
# Restrict the differing columns to those used as merge keys (c3).
print(set(x).intersection(c3))

Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,PTM decisive_entry_new_binds,PTM decisive_entry_og_binds,seq_sort_og,seq_sort_new,seq_sort_og_id,PTM decisive_seqpair_og_binds,seq_sort_new_id,PTM decisive_seqpair_new_binds,PTM Partner Status,ptm_short
743345,False,MKKGGVRSYRRSSTSKRSVIDDDSEPELPSMTKEAIASHKADSGSS...,"ARTKQTAR<psi-mi:""MI:0167""(N6,N6,N6-trimethyl-L...",,,intact:EBI-926939,intact:EBI-15732116,,,,...,yes,no,ARTKQTARKSTGGKAPRK_MKKGGVRSYRRSSTSKRSVIDDDSEPE...,"ARTKQTAR<psi-mi:""MI:0167""(N6,N6,N6-trimethyl-L...",seq_sort_og_69,no,seq_sort_new_205,yes,ptm,lys-9
744724,False,MKKGGVRSYRRSSTSKRSVIDDDSEPELPSMTKEAIASHKADSGSS...,"ARTKQTAR<psi-mi:""MI:0167""(N6,N6,N6-trimethyl-L...",,,intact:EBI-926939,intact:EBI-15732116,,,,...,yes,no,ARTKQTARKSTGGKAPRK_MKKGGVRSYRRSSTSKRSVIDDDSEPE...,"ARTKQTAR<psi-mi:""MI:0167""(N6,N6,N6-trimethyl-L...",seq_sort_og_65,no,seq_sort_new_194,yes,ptm,lys-9


Unnamed: 0,unique_id,seq_pair_id,interaction_intactid,mutation_short,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,...,Mutated all_og_binds,Mutated decisive_entry_new_binds,Mutated decisive_entry_og_binds,seq_sort_og,seq_sort_new,seq_sort_og_id,seq_sort_new_id,Mutated decisive_seqpair_og_binds,Mutated decisive_seqpair_new_binds,Mutated Partner Status
283688,intact:EBI-15732116_intact:EBI-926939,seqpair143109,EBI-16040496,P40381:p.Glu74_Glu80delinsAlaAlaAlaAlaAlaAlaAla,False,MKKGGVRSYRRSSTSKRSVIDDDSEPELPSMTKEAIASHKADSGSS...,ARTKQTARKSTGGKAPRK,,,intact:EBI-926939,...,yes,no,yes,ARTKQTARKSTGGKAPRK_MKKGGVRSYRRSSTSKRSVIDDDSEPE...,ARTKQTARKSTGGKAPRK_MKKGGVRSYRRSSTSKRSVIDDDSEPE...,seq_sort_og_84,seq_sort_new_387,yes,no,original
283689,intact:EBI-15732116_intact:EBI-926939,seqpair143109,EBI-16040496,P40381:p.Trp104Ala,False,MKKGGVRSYRRSSTSKRSVIDDDSEPELPSMTKEAIASHKADSGSS...,ARTKQTARKSTGGKAPRK,,,intact:EBI-926939,...,yes,no,yes,ARTKQTARKSTGGKAPRK_MKKGGVRSYRRSSTSKRSVIDDDSEPE...,ARTKQTARKSTGGKAPRK_MKKGGVRSYRRSSTSKRSVIDDDSEPE...,seq_sort_og_84,seq_sort_new_389,yes,no,original
283690,intact:EBI-15732116_intact:EBI-926939,seqpair143109,EBI-16040496,P40381:p.[Arg93Ala;Lys94Ala],False,MKKGGVRSYRRSSTSKRSVIDDDSEPELPSMTKEAIASHKADSGSS...,ARTKQTARKSTGGKAPRK,,,intact:EBI-926939,...,yes,yes,yes,ARTKQTARKSTGGKAPRK_MKKGGVRSYRRSSTSKRSVIDDDSEPE...,ARTKQTARKSTGGKAPRK_MKKGGVRSYRRSSTSKRSVIDDDSEPE...,seq_sort_og_84,seq_sort_new_388,yes,yes,mutated
283691,intact:EBI-15732116_intact:EBI-926939,seqpair143109,EBI-16040496,P40381:p.[Cys121Ser;Cys124Ser;Cys310Ser],False,MKKGGVRSYRRSSTSKRSVIDDDSEPELPSMTKEAIASHKADSGSS...,ARTKQTARKSTGGKAPRK,,,intact:EBI-926939,...,yes,yes,yes,ARTKQTARKSTGGKAPRK_MKKGGVRSYRRSSTSKRSVIDDDSEPE...,ARTKQTARKSTGGKAPRK_MKKGGVRSYRRSSTSKRSVIDDDSEPE...,seq_sort_og_84,seq_sort_new_390,yes,yes,mutated


{'no_uniprot_update_B', 'species_taxid_1', 'no_uniprot_update_A', 'species_taxid_2', 'equal_score_int'}


In [676]:
# Number of rows in the negative mutation table with a non-null "Mutated Partner".
int(simplemerged_neg_mut["Mutated Partner"].notna().sum())

19242

In [677]:
# Frequency of each label in the two "PTM all_*_binds" columns.
for _col in ("PTM all_new_binds", "PTM all_og_binds"):
    display(merged_expl_ptm_filt[_col].value_counts())

PTM all_new_binds
unknown        5255
unknown,yes    1268
no,unknown       79
yes               1
Name: count, dtype: int64

PTM all_og_binds
unknown           4832
no,unknown         981
unknown,yes        776
no,unknown,yes      11
yes                  2
no                   1
Name: count, dtype: int64

In [678]:
# Sanity checks on the merged PTM + mutation table.

# A "_x" / "_y" suffix would mean a column existed on both sides of the merge
# without being a join key.
test1 = not any(col.endswith(("_x", "_y")) for col in simplemerged_ptm_and_mut.columns)
print(f"Only unique columns from the merge: {test1}")
print(f"Length of merged database: {len(simplemerged_ptm_and_mut)}")
# Bare expression in the middle of a cell: evaluated but not rendered.
simplemerged_ptm_and_mut.head()

# Rows annotated on either side. The filter uses the "Mutated Partner" and
# "PTM Partner" columns, although the message refers to the "... Status" columns.
_is_annotated = (
    simplemerged_ptm_and_mut["Mutated Partner"].notna()
    | simplemerged_ptm_and_mut["PTM Partner"].notna()
)
test1 = int(_is_annotated.sum())
print(f"Total rows in merged database that are annotated with a Mutated Partner Status or PTM Partner Status: {test1}")

Only unique columns from the merge: True
Length of merged database: 769753
Total rows in merged database that are annotated with a Mutated Partner Status or PTM Partner Status: 68119


In [679]:
# Preview merged rows by annotation type: mutation only filter, PTM only
# filter, then rows carrying both.
_preview_cols = [
    "interaction_intactid", "unique_id", "seq_pair_id", "seq_sort",
    "Mutated Partner Status", "mutation_short", "PTM Partner Status", "ptm_short",
]
_has_mutation = simplemerged_ptm_and_mut["Mutated Partner Status"].notna()
_has_ptm = simplemerged_ptm_and_mut["PTM Partner Status"].notna()

test1 = simplemerged_ptm_and_mut.loc[_has_mutation]
print(f"Displaying a few rows where there is a mutation associated. Total = {len(test1)}")
display(test1[_preview_cols].head())

test1 = simplemerged_ptm_and_mut.loc[_has_ptm]
print(f"\nDisplaying a few rows where there is a PTM associated. Total = {len(test1)}")
display(test1[_preview_cols].head())

test1 = simplemerged_ptm_and_mut.loc[_has_mutation & _has_ptm]
print(f"\nDisplaying a few rows where there is both a mutation and PTM associated. Total = {len(test1)}")
display(test1[_preview_cols].head())

Displaying a few rows where there is a mutation associated. Total = 66382


Unnamed: 0,interaction_intactid,unique_id,seq_pair_id,seq_sort,Mutated Partner Status,mutation_short,PTM Partner Status,ptm_short
80,EBI-27022005,intact:EBI-25474821_intact:EBI-27021977,seqpair227051,AEMKTYSHRTMPSACTLVMCSSVESGLPGRDGRDGREGPRGEKGDP...,original,P35247-PRO_0000017465:p.Pro180Ser,,
82,EBI-27022099,intact:EBI-25474821_intact:EBI-27021977,seqpair227051,AEMKTYSHRTMPSACTLVMCSSVESGLPGRDGRDGREGPRGEKGDP...,original,P35247-PRO_0000017465:p.Pro180Ser,,
83,EBI-27081739,intact:EBI-25474821_intact:EBI-27021977,seqpair227051,AEMKTYSHRTMPSACTLVMCSSVESGLPGRDGRDGREGPRGEKGDP...,mutated,P0DTC2:p.Asn501Tyr,,
84,EBI-27081739,intact:EBI-25474821_intact:EBI-27021977,seqpair227051,AEMKTYSHRTMPSACTLVMCSSVESGLPGRDGRDGREGPRGEKGDP...,mutated,P0DTC2:p.Asp614Gly,,
85,EBI-27081739,intact:EBI-25474821_intact:EBI-27021977,seqpair227051,AEMKTYSHRTMPSACTLVMCSSVESGLPGRDGRDGREGPRGEKGDP...,mutated,P0DTC2:p.[Glu484Lys;Asp614Gly],,



Displaying a few rows where there is a PTM associated. Total = 2883


Unnamed: 0,interaction_intactid,unique_id,seq_pair_id,seq_sort,Mutated Partner Status,mutation_short,PTM Partner Status,ptm_short
766870,EBI-16158286,intact:EBI-15634422_intact:EBI-2653928,seqpair141260,ARTKQTARKSTGGKAPRKQLA_MEQVAEGARVTAVPVSAADSTEEL...,,,ptm,lys-9
766871,EBI-15973539,intact:EBI-15973242_intact:EBI-15973521,seqpair148654,GKGGAKRHRKVLRDNIQGI_MSRYITRLSMRRTYKWNGRPVGEDRK...,,,ptm,lys-10
766872,EBI-7572609,intact:EBI-7572597_intact:EBI-986224,seqpair415387,IVGGEDANVQDHPFTVALVTPDGQQFCGGTLAAPNKVVTAAHCTVG...,,,ptm,other modification
766873,EBI-1181856,intact:EBI-1181460_intact:EBI-347088,seqpair68760,MAAAAASGAGGAAGAGTGGAGPAGRLLPPPAPGSPAAPAAVSPAAG...,,,ptm,thr-163
766874,EBI-7615104,intact:EBI-1179609_intact:EBI-4414343,seqpair68448,GKAPRKQLATKAARKSAPATG_MAAAAATAVGPGAGSAGVAGPGGA...,,,ptm,methylated residue



Displaying a few rows where there is both a mutation and PTM associated. Total = 1146


Unnamed: 0,interaction_intactid,unique_id,seq_pair_id,seq_sort,Mutated Partner Status,mutation_short,PTM Partner Status,ptm_short
766890,EBI-476976,intact:EBI-350145_intact:EBI-476965,seqpair285548,MAAQKDQQKDAEGEGLSATTLLPKLIPSGAGREWLERRRATIRPWG...,original,P01112:p.Gly12Val,ptm,ser-189
766891,EBI-476976,intact:EBI-350145_intact:EBI-476965,seqpair285548,MAAQKDQQKDAEGEGLSATTLLPKLIPSGAGREWLERRRATIRPWG...,mutated,P01112:p.Ser189delinsLysLysLysLysLysLys,ptm,ser-189
766904,EBI-16105659,intact:EBI-11658528_intact:EBI-1571482,seqpair63309,MADSTFLAPELSDTESMGEETVRFQELLLKASKELQQAQTARPDST...,original,Q9CQJ2:p.Lys57Glu,ptm,ser-486
766905,EBI-16105659,intact:EBI-11658528_intact:EBI-1571482,seqpair63309,MADSTFLAPELSDTESMGEETVRFQELLLKASKELQQAQTARPDST...,original,Q9CQJ2:p.Lys64Glu,ptm,ser-486
766906,EBI-16105659,intact:EBI-11658528_intact:EBI-1571482,seqpair63309,MADSTFLAPELSDTESMGEETVRFQELLLKASKELQQAQTARPDST...,original,Q9CQJ2:p.Lys57Glu,ptm,ser-488


In [680]:
# Same preview as the preceding cell: merged rows with a mutation annotation,
# with a PTM annotation, and with both.
_status_view = [
    "interaction_intactid", "unique_id", "seq_pair_id", "seq_sort",
    "Mutated Partner Status", "mutation_short", "PTM Partner Status", "ptm_short",
]
_mut_mask = simplemerged_ptm_and_mut["Mutated Partner Status"].notna()
_ptm_mask = simplemerged_ptm_and_mut["PTM Partner Status"].notna()

test1 = simplemerged_ptm_and_mut.loc[_mut_mask]
print(f"Displaying a few rows where there is a mutation associated. Total = {len(test1)}")
display(test1[_status_view].head())

test1 = simplemerged_ptm_and_mut.loc[_ptm_mask]
print(f"\nDisplaying a few rows where there is a PTM associated. Total = {len(test1)}")
display(test1[_status_view].head())

test1 = simplemerged_ptm_and_mut.loc[_mut_mask & _ptm_mask]
print(f"\nDisplaying a few rows where there is both a mutation and PTM associated. Total = {len(test1)}")
display(test1[_status_view].head())

Displaying a few rows where there is a mutation associated. Total = 66382


Unnamed: 0,interaction_intactid,unique_id,seq_pair_id,seq_sort,Mutated Partner Status,mutation_short,PTM Partner Status,ptm_short
80,EBI-27022005,intact:EBI-25474821_intact:EBI-27021977,seqpair227051,AEMKTYSHRTMPSACTLVMCSSVESGLPGRDGRDGREGPRGEKGDP...,original,P35247-PRO_0000017465:p.Pro180Ser,,
82,EBI-27022099,intact:EBI-25474821_intact:EBI-27021977,seqpair227051,AEMKTYSHRTMPSACTLVMCSSVESGLPGRDGRDGREGPRGEKGDP...,original,P35247-PRO_0000017465:p.Pro180Ser,,
83,EBI-27081739,intact:EBI-25474821_intact:EBI-27021977,seqpair227051,AEMKTYSHRTMPSACTLVMCSSVESGLPGRDGRDGREGPRGEKGDP...,mutated,P0DTC2:p.Asn501Tyr,,
84,EBI-27081739,intact:EBI-25474821_intact:EBI-27021977,seqpair227051,AEMKTYSHRTMPSACTLVMCSSVESGLPGRDGRDGREGPRGEKGDP...,mutated,P0DTC2:p.Asp614Gly,,
85,EBI-27081739,intact:EBI-25474821_intact:EBI-27021977,seqpair227051,AEMKTYSHRTMPSACTLVMCSSVESGLPGRDGRDGREGPRGEKGDP...,mutated,P0DTC2:p.[Glu484Lys;Asp614Gly],,



Displaying a few rows where there is a PTM associated. Total = 2883


Unnamed: 0,interaction_intactid,unique_id,seq_pair_id,seq_sort,Mutated Partner Status,mutation_short,PTM Partner Status,ptm_short
766870,EBI-16158286,intact:EBI-15634422_intact:EBI-2653928,seqpair141260,ARTKQTARKSTGGKAPRKQLA_MEQVAEGARVTAVPVSAADSTEEL...,,,ptm,lys-9
766871,EBI-15973539,intact:EBI-15973242_intact:EBI-15973521,seqpair148654,GKGGAKRHRKVLRDNIQGI_MSRYITRLSMRRTYKWNGRPVGEDRK...,,,ptm,lys-10
766872,EBI-7572609,intact:EBI-7572597_intact:EBI-986224,seqpair415387,IVGGEDANVQDHPFTVALVTPDGQQFCGGTLAAPNKVVTAAHCTVG...,,,ptm,other modification
766873,EBI-1181856,intact:EBI-1181460_intact:EBI-347088,seqpair68760,MAAAAASGAGGAAGAGTGGAGPAGRLLPPPAPGSPAAPAAVSPAAG...,,,ptm,thr-163
766874,EBI-7615104,intact:EBI-1179609_intact:EBI-4414343,seqpair68448,GKAPRKQLATKAARKSAPATG_MAAAAATAVGPGAGSAGVAGPGGA...,,,ptm,methylated residue



Displaying a few rows where there is both a mutation and PTM associated. Total = 1146


Unnamed: 0,interaction_intactid,unique_id,seq_pair_id,seq_sort,Mutated Partner Status,mutation_short,PTM Partner Status,ptm_short
766890,EBI-476976,intact:EBI-350145_intact:EBI-476965,seqpair285548,MAAQKDQQKDAEGEGLSATTLLPKLIPSGAGREWLERRRATIRPWG...,original,P01112:p.Gly12Val,ptm,ser-189
766891,EBI-476976,intact:EBI-350145_intact:EBI-476965,seqpair285548,MAAQKDQQKDAEGEGLSATTLLPKLIPSGAGREWLERRRATIRPWG...,mutated,P01112:p.Ser189delinsLysLysLysLysLysLys,ptm,ser-189
766904,EBI-16105659,intact:EBI-11658528_intact:EBI-1571482,seqpair63309,MADSTFLAPELSDTESMGEETVRFQELLLKASKELQQAQTARPDST...,original,Q9CQJ2:p.Lys57Glu,ptm,ser-486
766905,EBI-16105659,intact:EBI-11658528_intact:EBI-1571482,seqpair63309,MADSTFLAPELSDTESMGEETVRFQELLLKASKELQQAQTARPDST...,original,Q9CQJ2:p.Lys64Glu,ptm,ser-486
766906,EBI-16105659,intact:EBI-11658528_intact:EBI-1571482,seqpair63309,MADSTFLAPELSDTESMGEETVRFQELLLKASKELQQAQTARPDST...,original,Q9CQJ2:p.Lys57Glu,ptm,ser-488


In [681]:
# Purpose: compare, per interaction_intactid, the label (positive / negative /
# unknown) assigned by the mutation processing with the one assigned by the PTM
# processing, and collect the IDs on which the two disagree.
#
# Get intact IDs that have mutation and PTM data
# Each list holds the unique interaction_intactid values of rows whose
# "... Partner Status" column is non-null in the corresponding table.
# Positive
simplemerged_mut_interaction_intactids = simplemerged_mut.loc[
    simplemerged_mut["Mutated Partner Status"].notna()
]["interaction_intactid"].unique().tolist()
simplemerged_ptm_interaction_intactids = simplemerged_ptm.loc[
    simplemerged_ptm["PTM Partner Status"].notna()
]["interaction_intactid"].unique().tolist()
# Negative
my_neg_with_mut_interaction_intactids = simplemerged_neg_mut.loc[
    simplemerged_neg_mut["Mutated Partner Status"].notna()
]["interaction_intactid"].unique().tolist()
# NOTE(review): this reads `simplemerged_ptm_neg`, while the cells above build
# and extend `simplemerged_neg_ptm` — confirm these are meant to be the same table.
simplemerged_ptm_neg_interaction_intactids = simplemerged_ptm_neg.loc[
    simplemerged_ptm_neg["PTM Partner Status"].notna()
]["interaction_intactid"].unique().tolist()
# Unknown
simplemerged_mut_unknown_interaction_intactids = simplemerged_mut_unknown.loc[
    simplemerged_mut_unknown["Mutated Partner Status"].notna()
]["interaction_intactid"].unique().tolist()
simplemerged_ptm_unknown_interaction_intactids = simplemerged_ptm_unknown.loc[
    simplemerged_ptm_unknown["PTM Partner Status"].notna()
]["interaction_intactid"].unique().tolist()

### Shared IDs that have mutation AND PTM Data
# Agreement
print("The following print statements reflect disagreement at the interaction_intactid level, which may not be true conflicts.")
print("Agreeing labels between mutation and PTM:")
# Pos-Pos intersection: 
test1 = set(simplemerged_mut_interaction_intactids).intersection(set(simplemerged_ptm_interaction_intactids))
print(f"\tThere are {len(test1)} unique interaction_ids shared between positive mutation-binding (new or original) and positive PTM-binding (original).")

# Keep only the decision columns that actually exist in the merged table.
decision_cols = ['Mutated all_new_binds', 'Mutated all_og_binds', 'Mutated decisive_entry_new_binds', 'Mutated decisive_seqpair_new_binds', 'PTM decisive_entry_og_binds', 'PTM decisive_seqpair_og_binds',]
decision_cols = [x for x in decision_cols if x in simplemerged_ptm_and_mut.columns]
# Bare expression in the middle of a cell: evaluated but not rendered, so this
# preview of the pos-pos rows produces no output.
simplemerged_ptm_and_mut.loc[
    simplemerged_ptm_and_mut["interaction_intactid"].isin(test1)
][[
    "interaction_intactid","unique_id","mutation_short",
] + decision_cols].head()

# Neg-neg intersection
test1 = set(my_neg_with_mut_interaction_intactids).intersection(set(simplemerged_ptm_neg_interaction_intactids))
print(f"\tThere are {len(test1)} unique interaction_ids shared between negative mutation-binding (new or original) and negative PTM-binding (original).")

# Unknown-unknown intersection
test1 = set(simplemerged_mut_unknown_interaction_intactids).intersection(set(simplemerged_ptm_unknown_interaction_intactids))
print(f"\tThere are {len(test1)} unique interaction_ids shared between unknown mutation-binding (new or original) and unknown PTM-binding (original).")

# Disagreement
# Accumulates the IDs from every mismatching label pair below; an ID can be
# appended more than once, so the list is de-duplicated at the end.
disagreeing_interaction_intactid_list = []
print("Disagreeing labels between mutation and PTM:")
# Pos-Neg (Mut-PTM) intersection: 
test1 = set(simplemerged_mut_interaction_intactids).intersection(set(simplemerged_ptm_neg_interaction_intactids))
disagreeing_interaction_intactid_list += list(test1)
print(f"\tThere are {len(test1)} unique interaction_ids where Mutation label is positive, PTM is negative.")

# Neg-Pos (Mut-PTM) intersection: 
test1 = set(my_neg_with_mut_interaction_intactids).intersection(set(simplemerged_ptm_interaction_intactids))
disagreeing_interaction_intactid_list += list(test1)
print(f"\tThere are {len(test1)} unique interaction_ids where Mutation label is negative, PTM is positive.")

# Unknown-Pos (Mut-PTM) intersection: 
test1 = set(simplemerged_mut_unknown_interaction_intactids).intersection(set(simplemerged_ptm_interaction_intactids))
disagreeing_interaction_intactid_list += list(test1)
print(f"\tThere are {len(test1)} unique interaction_ids where Mutation label is unknown, PTM is positive.")

# Unknown-Neg (Mut-PTM) intersection: 
test1 = set(simplemerged_mut_unknown_interaction_intactids).intersection(set(simplemerged_ptm_neg_interaction_intactids))
disagreeing_interaction_intactid_list += list(test1)
print(f"\tThere are {len(test1)} unique interaction_ids where Mutation label is unknown, PTM is negative.")

# Pos-Unknown (Mut-PTM) intersection: 
test1 = set(simplemerged_mut_interaction_intactids).intersection(set(simplemerged_ptm_unknown_interaction_intactids))
disagreeing_interaction_intactid_list += list(test1)
print(f"\tThere are {len(test1)} unique interaction_ids where Mutation label is positive, PTM is unknown.")

# Neg-Unknown (Mut-PTM) intersection: 
test1 = set(my_neg_with_mut_interaction_intactids).intersection(set(simplemerged_ptm_unknown_interaction_intactids))
disagreeing_interaction_intactid_list += list(test1)
print(f"\tThere are {len(test1)} unique interaction_ids where Mutation label is negative, PTM is unknown.")

# De-duplicate; the resulting list order is arbitrary because it goes through a set.
disagreeing_interaction_intactid_list = list(set(disagreeing_interaction_intactid_list))
print(f"TOTAL disagreeing intact id pairs: {len(disagreeing_interaction_intactid_list)}")

The following print statements reflect disagreement at the interaction_intactid level, which may not be true conflicts.
Agreeing labels between mutation and PTM:
	There are 346 unique interaction_ids shared between positive mutation-binding (new or original) and positive PTM-binding (original).
	There are 102 unique interaction_ids shared between negative mutation-binding (new or original) and negative PTM-binding (original).
	There are 103 unique interaction_ids shared between unknown mutation-binding (new or original) and unknown PTM-binding (original).
Disagreeing labels between mutation and PTM:
	There are 198 unique interaction_ids where Mutation label is positive, PTM is negative.
	There are 188 unique interaction_ids where Mutation label is negative, PTM is positive.
	There are 71 unique interaction_ids where Mutation label is unknown, PTM is positive.
	There are 39 unique interaction_ids where Mutation label is unknown, PTM is negative.
	There are 309 unique interaction_ids whe

In [682]:
# Print, one number per line, how many rows have a missing value in each
# key column. The order of the pairs below is the order of the printed counts.
_na_checks = [
    (simplemerged_mut, "seq_sort_og"),
    (simplemerged_mut, "Mutated Partner Status"),
    (simplemerged_ptm, "seq_sort_og"),
    (simplemerged_ptm, "PTM Partner Status"),
    (simplemerged_ptm, "seq_sort"),
    (simplemerged_mut, "seq_sort"),
    (simplemerged_ptm_and_mut, "seq_sort"),
]
for _frame, _col in _na_checks:
    # Summing the boolean mask counts the rows where the column is NaN.
    print(_frame[_col].isna().sum())

708214
708214
735254
735254
0
0
0


In [683]:
# Compare mutation-derived and PTM-derived labels at the seq_sort level.
# For each table, collect the unique seq_sort values of rows that carry a
# partner-status annotation, then report how the positive / negative / unknown
# groups overlap between the mutation tables and the PTM tables.
def _seq_sorts_with_status(frame, status_col):
    """Return the unique seq_sort values of rows where status_col is not null."""
    annotated = frame[status_col].notna()
    return frame.loc[annotated, "seq_sort"].unique().tolist()


# Positive
simplemerged_mut_seq_sorts = _seq_sorts_with_status(simplemerged_mut, "Mutated Partner Status")
simplemerged_ptm_seq_sorts = _seq_sorts_with_status(simplemerged_ptm, "PTM Partner Status")
# Negative
my_neg_with_mut_seq_sorts = _seq_sorts_with_status(simplemerged_neg_mut, "Mutated Partner Status")
simplemerged_ptm_neg_seq_sorts = _seq_sorts_with_status(simplemerged_ptm_neg, "PTM Partner Status")
# Unknown
simplemerged_mut_unknown_seq_sorts = _seq_sorts_with_status(simplemerged_mut_unknown, "Mutated Partner Status")
simplemerged_ptm_unknown_seq_sorts = _seq_sorts_with_status(simplemerged_ptm_unknown, "PTM Partner Status")

# Set views keyed by label, so every comparison below is a plain intersection.
_mut_seq_sort_sets = {
    "positive": set(simplemerged_mut_seq_sorts),
    "negative": set(my_neg_with_mut_seq_sorts),
    "unknown": set(simplemerged_mut_unknown_seq_sorts),
}
_ptm_seq_sort_sets = {
    "positive": set(simplemerged_ptm_seq_sorts),
    "negative": set(simplemerged_ptm_neg_seq_sorts),
    "unknown": set(simplemerged_ptm_unknown_seq_sorts),
}

### Shared seq_sorts that have mutation AND PTM data
# Agreement
print("The following print statements reflect disagreement at the seq_sort level, which reflect all true conflicts.")
print("Agreeing labels between mutation and PTM:")

# Pos-Pos intersection
test1 = _mut_seq_sort_sets["positive"] & _ptm_seq_sort_sets["positive"]
print(f"\tThere are {len(test1)} unique seq_sorts shared between positive mutation-binding (new or original) and positive PTM-binding (original).")

# Keep only the decision columns that actually exist in the merged table.
_candidate_decision_cols = [
    'Mutated all_new_binds',
    'Mutated all_og_binds',
    'Mutated decisive_entry_new_binds',
    'Mutated decisive_seqpair_new_binds',
    'PTM decisive_entry_og_binds',
    'PTM decisive_seqpair_og_binds',
]
decision_cols = [x for x in _candidate_decision_cols if x in simplemerged_ptm_and_mut.columns]

# Preview of the pos-pos rows. It is not the last expression of the cell, so
# the notebook does not display it; the result is discarded.
_preview_cols = ["seq_sort", "unique_id", "mutation_short"] + decision_cols
simplemerged_ptm_and_mut.loc[
    simplemerged_ptm_and_mut["seq_sort"].isin(test1), _preview_cols
].head()

# Neg-neg and unknown-unknown intersections
for _label in ("negative", "unknown"):
    test1 = _mut_seq_sort_sets[_label] & _ptm_seq_sort_sets[_label]
    print(f"\tThere are {len(test1)} unique seq_sorts shared between {_label} mutation-binding (new or original) and {_label} PTM-binding (original).")

# Disagreement: every (mutation label, PTM label) combination whose labels differ.
disagreeing_seq_sort_list = []
print("Disagreeing labels between mutation and PTM:")
_conflicting_label_pairs = [
    ("positive", "negative"),
    ("negative", "positive"),
    ("unknown", "positive"),
    ("unknown", "negative"),
    ("positive", "unknown"),
    ("negative", "unknown"),
]
for _mut_label, _ptm_label in _conflicting_label_pairs:
    test1 = _mut_seq_sort_sets[_mut_label] & _ptm_seq_sort_sets[_ptm_label]
    disagreeing_seq_sort_list.extend(test1)
    print(f"\tThere are {len(test1)} unique seq_sorts where Mutation label is {_mut_label}, PTM is {_ptm_label}.")

# A seq_sort can appear in several conflict categories; count each one once.
disagreeing_seq_sort_list = list(set(disagreeing_seq_sort_list))
print(f"TOTAL disagreeing seq_sort pairs (all will be moved into UNKNOWN): {len(disagreeing_seq_sort_list)}")


The following print statements reflect disagreement at the seq_sort level, which reflect all true conflicts.
Agreeing labels between mutation and PTM:
	There are 103 unique seq_sorts shared between positive mutation-binding (new or original) and positive PTM-binding (original).
	There are 1 unique seq_sorts shared between negative mutation-binding (new or original) and negative PTM-binding (original).
	There are 49 unique seq_sorts shared between unknown mutation-binding (new or original) and unknown PTM-binding (original).
Disagreeing labels between mutation and PTM:
	There are 82 unique seq_sorts where Mutation label is positive, PTM is negative.
	There are 4 unique seq_sorts where Mutation label is negative, PTM is positive.
	There are 10 unique seq_sorts where Mutation label is unknown, PTM is positive.
	There are 4 unique seq_sorts where Mutation label is unknown, PTM is negative.
	There are 184 unique seq_sorts where Mutation label is positive, PTM is unknown.
	There are 1 unique

In [684]:
# Compare mutation-derived and PTM-derived labels at the seq_sort_og level.
# For each table, collect the unique seq_sort_og values of rows that carry a
# partner-status annotation, then report how the positive / negative / unknown
# groups overlap between the mutation tables and the PTM tables.
def _seq_sort_ogs_with_status(frame, status_col):
    """Return the unique seq_sort_og values of rows where status_col is not null."""
    annotated = frame[status_col].notna()
    return frame.loc[annotated, "seq_sort_og"].unique().tolist()


# Positive
simplemerged_mut_seq_sort_ogs = _seq_sort_ogs_with_status(simplemerged_mut, "Mutated Partner Status")
simplemerged_ptm_seq_sort_ogs = _seq_sort_ogs_with_status(simplemerged_ptm, "PTM Partner Status")
# Negative
my_neg_with_mut_seq_sort_ogs = _seq_sort_ogs_with_status(simplemerged_neg_mut, "Mutated Partner Status")
simplemerged_ptm_neg_seq_sort_ogs = _seq_sort_ogs_with_status(simplemerged_ptm_neg, "PTM Partner Status")
# Unknown
simplemerged_mut_unknown_seq_sort_ogs = _seq_sort_ogs_with_status(simplemerged_mut_unknown, "Mutated Partner Status")
simplemerged_ptm_unknown_seq_sort_ogs = _seq_sort_ogs_with_status(simplemerged_ptm_unknown, "PTM Partner Status")

# Set views keyed by label, so every comparison below is a plain intersection.
_mut_seq_sort_og_sets = {
    "positive": set(simplemerged_mut_seq_sort_ogs),
    "negative": set(my_neg_with_mut_seq_sort_ogs),
    "unknown": set(simplemerged_mut_unknown_seq_sort_ogs),
}
_ptm_seq_sort_og_sets = {
    "positive": set(simplemerged_ptm_seq_sort_ogs),
    "negative": set(simplemerged_ptm_neg_seq_sort_ogs),
    "unknown": set(simplemerged_ptm_unknown_seq_sort_ogs),
}

### Shared seq_sort_ogs that have mutation AND PTM data
# Agreement
print("The following print statements reflect disagreement at the seq_sort_og level, which reflect some true conflicts + some misleading")
print("Agreeing labels between mutation and PTM:")

# Pos-Pos intersection
test1 = _mut_seq_sort_og_sets["positive"] & _ptm_seq_sort_og_sets["positive"]
print(f"\tThere are {len(test1)} unique seq_sort_ogs shared between positive mutation-binding (new or original) and positive PTM-binding (original).")

# Keep only the decision columns that actually exist in the merged table.
_candidate_decision_cols = [
    'Mutated all_new_binds',
    'Mutated all_og_binds',
    'Mutated decisive_entry_new_binds',
    'Mutated decisive_seqpair_new_binds',
    'PTM decisive_entry_og_binds',
    'PTM decisive_seqpair_og_binds',
]
decision_cols = [x for x in _candidate_decision_cols if x in simplemerged_ptm_and_mut.columns]

# Preview of the pos-pos rows. It is not the last expression of the cell, so
# the notebook does not display it; the result is discarded.
_preview_cols = ["seq_sort_og", "unique_id", "mutation_short"] + decision_cols
simplemerged_ptm_and_mut.loc[
    simplemerged_ptm_and_mut["seq_sort_og"].isin(test1), _preview_cols
].head()

# Neg-neg and unknown-unknown intersections
for _label in ("negative", "unknown"):
    test1 = _mut_seq_sort_og_sets[_label] & _ptm_seq_sort_og_sets[_label]
    print(f"\tThere are {len(test1)} unique seq_sort_ogs shared between {_label} mutation-binding (new or original) and {_label} PTM-binding (original).")

# Disagreement: every (mutation label, PTM label) combination whose labels differ.
disagreeing_seq_sort_og_list = []
print("Disagreeing labels between mutation and PTM:")
_conflicting_label_pairs = [
    ("positive", "negative"),
    ("negative", "positive"),
    ("unknown", "positive"),
    ("unknown", "negative"),
    ("positive", "unknown"),
    ("negative", "unknown"),
]
for _mut_label, _ptm_label in _conflicting_label_pairs:
    test1 = _mut_seq_sort_og_sets[_mut_label] & _ptm_seq_sort_og_sets[_ptm_label]
    disagreeing_seq_sort_og_list.extend(test1)
    print(f"\tThere are {len(test1)} unique seq_sort_ogs where Mutation label is {_mut_label}, PTM is {_ptm_label}.")

# A seq_sort_og can appear in several conflict categories; count each one once.
disagreeing_seq_sort_og_list = list(set(disagreeing_seq_sort_og_list))
print(f"TOTAL disagreeing seq_sort_og pairs (all will be moved into UNKNOWN): {len(disagreeing_seq_sort_og_list)}")


The following print statements reflect disagreement at the seq_sort_og level, which reflect some true conflicts + some misleading
Agreeing labels between mutation and PTM:
	There are 263 unique seq_sort_ogs shared between positive mutation-binding (new or original) and positive PTM-binding (original).
	There are 76 unique seq_sort_ogs shared between negative mutation-binding (new or original) and negative PTM-binding (original).
	There are 111 unique seq_sort_ogs shared between unknown mutation-binding (new or original) and unknown PTM-binding (original).
Disagreeing labels between mutation and PTM:
	There are 112 unique seq_sort_ogs where Mutation label is positive, PTM is negative.
	There are 182 unique seq_sort_ogs where Mutation label is negative, PTM is positive.
	There are 58 unique seq_sort_ogs where Mutation label is unknown, PTM is positive.
	There are 27 unique seq_sort_ogs where Mutation label is unknown, PTM is negative.
	There are 402 unique seq_sort_ogs where Mutation lab

In [685]:
# CHECK: wherever Mutated Partner Status / PTM Partner Status is not null,
# seq_sort_og must also be not null (we're relying on this).
# One line is printed per table; True means no violating row was found.
_seq_sort_og_checks = [
    (simplemerged_mut, "Mutated Partner Status", "positive", "Mutated Partner (original or mutated)"),
    (simplemerged_ptm, "PTM Partner Status", "positive", "PTM Partner (original)"),
    (simplemerged_neg_mut, "Mutated Partner Status", "negative", "Mutated Partner (original or mutated)"),
    (simplemerged_ptm_neg, "PTM Partner Status", "negative", "PTM Partner (original)"),
    (simplemerged_mut_unknown, "Mutated Partner Status", "unknown", "Mutated Partner (original or mutated)"),
    (simplemerged_ptm_unknown, "PTM Partner Status", "unknown", "PTM Partner (original)"),
]
for _frame, _status_col, _ppi_label, _partner_desc in _seq_sort_og_checks:
    # A violation is a row that has a status annotation but no seq_sort_og.
    _violations = _frame[_status_col].notna() & _frame["seq_sort_og"].isna()
    test1 = not _violations.any()
    print(f"All {_ppi_label}-PPI rows with a {_partner_desc} also have an associated seq_sort_og: {test1}")


All positive-PPI rows with a Mutated Partner (original or mutated) also have an associated seq_sort_og: True
All positive-PPI rows with a PTM Partner (original) also have an associated seq_sort_og: True
All negative-PPI rows with a Mutated Partner (original or mutated) also have an associated seq_sort_og: True
All negative-PPI rows with a PTM Partner (original) also have an associated seq_sort_og: True
All unknown-PPI rows with a Mutated Partner (original or mutated) also have an associated seq_sort_og: True
All unknown-PPI rows with a PTM Partner (original) also have an associated seq_sort_og: True


In [686]:
# CHECK: wherever Mutated Partner Status / PTM Partner Status is not null,
# seq_sort must also be not null (we're relying on this).
# One line is printed per table; True means no violating row was found.
_seq_sort_checks = [
    (simplemerged_mut, "Mutated Partner Status", "positive", "Mutated Partner (original or mutated)"),
    (simplemerged_ptm, "PTM Partner Status", "positive", "PTM Partner (original or PTM)"),
    (simplemerged_neg_mut, "Mutated Partner Status", "negative", "Mutated Partner (original or mutated)"),
    (simplemerged_ptm_neg, "PTM Partner Status", "negative", "PTM Partner (original or PTM)"),
    (simplemerged_mut_unknown, "Mutated Partner Status", "unknown", "Mutated Partner (original or mutated)"),
    (simplemerged_ptm_unknown, "PTM Partner Status", "unknown", "PTM Partner (original or PTM)"),
]
for _frame, _status_col, _ppi_label, _partner_desc in _seq_sort_checks:
    # A violation is a row that has a status annotation but no seq_sort.
    _violations = _frame[_status_col].notna() & _frame["seq_sort"].isna()
    test1 = not _violations.any()
    print(f"All {_ppi_label}-PPI rows with a {_partner_desc} also have an associated seq_sort: {test1}")


All positive-PPI rows with a Mutated Partner (original or mutated) also have an associated seq_sort: True
All positive-PPI rows with a PTM Partner (original or PTM) also have an associated seq_sort: True
All negative-PPI rows with a Mutated Partner (original or mutated) also have an associated seq_sort: True
All negative-PPI rows with a PTM Partner (original or PTM) also have an associated seq_sort: True
All unknown-PPI rows with a Mutated Partner (original or mutated) also have an associated seq_sort: True
All unknown-PPI rows with a PTM Partner (original or PTM) also have an associated seq_sort: True


In [687]:
# Check out cases where there wasn't a feature and we weren't able to make a new sequence:
# are there any rows whose PTM Partner Status is "ptm" but whose seq_sort is identical to
# seq_sort_og?
# Flag, per row, whether the original and the final sequence-pair keys are equal.
simplemerged_ptm["seq_sort_old_equals_new"] = simplemerged_ptm["seq_sort_og"] == simplemerged_ptm["seq_sort"]

# Rows with status "ptm" whose key did not change; kept in a variable so they can be inspected.
ptm_rows_with_unchanged_seq = simplemerged_ptm.loc[
    (simplemerged_ptm["PTM Partner Status"] == "ptm")
    & (simplemerged_ptm["seq_sort_old_equals_new"])
][["PTM Feature type", "seq_sort_og", "seq_sort_new", "seq_sort", "seq_sort_old_equals_new"]]

# test1 is True when no such row exists. The original cell computed this value
# but never reported it, so the outcome of the check was invisible.
test1 = len(ptm_rows_with_unchanged_seq) == 0
print(f"No rows with PTM Partner Status 'ptm' have seq_sort equal to seq_sort_og: {test1}")

In [688]:
## Now that we know what the conflicting sequence pairs are, we need to find a way to move them all from their respective databases into unknown.
# Also we have to combine in the case that things agree. 
# We also have to combine in the case that things don't agree so we can document the conflict. 
# There are going to be cases where seq_sort agrees. Maybe we should keep this

In [689]:
## Combine the positive PTM table and the positive mutation table into one frame.
# Context: now that the conflicting sequence pairs are known, they eventually have to be moved
# from their respective tables into unknown. Rows have to be combined both when the labels agree
# and when they don't agree, so the conflict can be documented.
#
# Open question: how do we merge rows that have mutations and PTMs at the same time? Maybe some
# of these should be investigated.
# Note: simplemerged_ptm and simplemerged are not the same; simplemerged_ptm is subsetted from
# merged, like simplemerged is subsetted from merged.

# Columns present in both tables are the candidates for merge keys.
c1 = set(simplemerged_mut.columns)
c2 = set(simplemerged_ptm.columns)
c3 = c1 & c2

# Per-interactor columns that differ depending on whether a mutation was applied.
# The simplemerged (mutation-side) entries are the ones to keep for these.
change_cols = [
    name
    for idx, chain in (("1", "A"), ("2", "B"))
    for name in (
        f"aa_{idx}",
        f"length_{idx}",
        f"invalids_aa_{idx}",
        f"uniprot_{chain}",
        f"chain_seq_start_{idx}",
        f"chain_seq_end_{idx}",
        f"uniprot_{chain}_equalseq",
        f"uniprot_{chain}_equalseq_canonical",
        f"uniprot_{chain}_full",
        f"uniprot_{chain}_inseq",
        f"uniprot_{chain}_inseq_canonical",
        f"uniprot_{chain}_noiso1",
    )
]

# Annotation columns named "need_pipejoin" by the author; they are excluded from the merge keys.
need_pipejoin = [
    "interaction_xml_id",
    "reactome_1", "reactome_2",
    "host_label_full_1", "host_label_full_2",
    "host_label_short_1", "host_label_short_2",
    "host_taxid_1", "host_taxid_2",
    "host_cell_type_1", "host_cell_type_2",
    "interaction_label", "interaction_mi", "pubmeds",
    "all_binding_mi_1",
    "all_binding_name_1",
    "all_binding_short_1",
    "all_binding_begin_1",
    "all_binding_end_1",
    "all_binding_mi_2",
    "all_binding_name_2",
    "all_binding_short_2",
    "all_binding_begin_2",
    "all_binding_end_2",
]

# Everything that must NOT be a merge key; these columns are dropped from the PTM side
# so that only the mutation table's copy survives the merge.
noc1 = [
    *set(change_cols + need_pipejoin),
    "Negative", "seq_sort_og", "seq_sort_og_id", "seq_sort_new", "seq_sort_new_id",
]

c3 = list(c3 - set(noc1))
print(f"seq_sort_og in list of columns to merge on: {'seq_sort_og' in c3}")
print(f"seq_sort in list of columns to merge on: {'seq_sort' in c3}")
print(f"Total columns to merge on: {len(c3)}")

# Outer merge so that rows found in only one of the two tables are kept as well.
simplemerged_ptm_and_mut = simplemerged_ptm.drop(columns=noc1).merge(
    simplemerged_mut,
    on=c3,
    how="outer",
)
print(f"Length of simplemerged_ptm_and_mut directly after joining: {len(simplemerged_ptm_and_mut)}")

seq_sort_og in list of columns to merge on: False
seq_sort in list of columns to merge on: True
Total columns to merge on: 62
Length of simplemerged_ptm_and_mut directly after joining: 801620


In [690]:
## Combine the negative PTM table and the negative mutation table into one frame.
# Context: now that the conflicting sequence pairs are known, they eventually have to be moved
# from their respective tables into unknown. Rows have to be combined both when the labels agree
# and when they don't agree, so the conflict can be documented.
#
# Open question: how do we merge rows that have mutations and PTMs at the same time? Maybe some
# of these should be investigated.
# Note: simplemerged_ptm and simplemerged are not the same; simplemerged_ptm is subsetted from
# merged, like simplemerged is subsetted from merged.

# Columns present in both tables are the candidates for merge keys.
c1 = set(simplemerged_neg_mut.columns)
c2 = set(simplemerged_neg_ptm.columns)
c3 = c1 & c2

# Per-interactor columns that differ depending on whether a mutation was applied.
# The simplemerged (mutation-side) entries are the ones to keep for these.
change_cols = [
    name
    for idx, chain in (("1", "A"), ("2", "B"))
    for name in (
        f"aa_{idx}",
        f"length_{idx}",
        f"invalids_aa_{idx}",
        f"uniprot_{chain}",
        f"chain_seq_start_{idx}",
        f"chain_seq_end_{idx}",
        f"uniprot_{chain}_equalseq",
        f"uniprot_{chain}_equalseq_canonical",
        f"uniprot_{chain}_full",
        f"uniprot_{chain}_inseq",
        f"uniprot_{chain}_inseq_canonical",
        f"uniprot_{chain}_noiso1",
    )
]

# Annotation columns named "need_pipejoin" by the author; they are excluded from the merge keys.
need_pipejoin = [
    "interaction_xml_id",
    "reactome_1", "reactome_2",
    "host_label_full_1", "host_label_full_2",
    "host_label_short_1", "host_label_short_2",
    "host_taxid_1", "host_taxid_2",
    "host_cell_type_1", "host_cell_type_2",
    "interaction_label", "interaction_mi", "pubmeds",
    "all_binding_mi_1",
    "all_binding_name_1",
    "all_binding_short_1",
    "all_binding_begin_1",
    "all_binding_end_1",
    "all_binding_mi_2",
    "all_binding_name_2",
    "all_binding_short_2",
    "all_binding_begin_2",
    "all_binding_end_2",
]

# Everything that must NOT be a merge key; these columns are dropped from the PTM side
# so that only the mutation table's copy survives the merge.
noc1 = [
    *set(change_cols + need_pipejoin),
    "Negative", "seq_sort_og", "seq_sort_og_id", "seq_sort_new", "seq_sort_new_id",
]

c3 = list(c3 - set(noc1))
print(f"seq_sort_og in list of columns to merge on: {'seq_sort_og' in c3}")
print(f"seq_sort in list of columns to merge on: {'seq_sort' in c3}")
print(f"Total columns to merge on: {len(c3)}")

# Outer merge so that rows found in only one of the two tables are kept as well.
simplemerged_neg_ptm_and_mut = simplemerged_neg_ptm.drop(columns=noc1).merge(
    simplemerged_neg_mut,
    on=c3,
    how="outer",
)
print(f"Length of simplemerged_neg_ptm_and_mut directly after joining: {len(simplemerged_neg_ptm_and_mut)}")

seq_sort_og in list of columns to merge on: False
seq_sort in list of columns to merge on: True
Total columns to merge on: 62
Length of simplemerged_neg_ptm_and_mut directly after joining: 757756


In [691]:
## Build the combined UNKNOWN table of PTM + mutation rows.
# Now that we know what the conflicting sequence pairs are, they have to be moved from their
# respective tables into unknown. Two sources feed the result:
#   1. the outer merge of the unknown-PTM and unknown-mutation tables (labels agree), and
#   2. the inner merges of every cross-label combination (PTM and mutation rows that match on
#      all merge keys in c3 but come from differently-labelled tables), so that the conflict
#      is documented.
#
# Open question: how do we merge rows that have mutations and PTMs at the same time? Maybe some
# of these should be investigated.
# Note: simplemerged_ptm and simplemerged are not the same; simplemerged_ptm is subsetted from
# merged, like simplemerged is subsetted from merged.

# Columns present in both unknown tables are the candidates for merge keys.
c1 = set(list(simplemerged_mut_unknown.columns))
c2 = set(list(simplemerged_ptm_unknown.columns))
c3 = c1.intersection(c2)

# Per-interactor columns that differ depending on whether a mutation was applied.
# The simplemerged (mutation-side) entries are the ones to keep for these.
change_cols = [
    "aa_1",
    "length_1",
    "invalids_aa_1",
    "uniprot_A",
    "chain_seq_start_1",
    "chain_seq_end_1",
    "uniprot_A_equalseq",
    "uniprot_A_equalseq_canonical",
    "uniprot_A_full",
    "uniprot_A_inseq",
    "uniprot_A_inseq_canonical",
    "uniprot_A_noiso1",
    "aa_2",
    "length_2",
    "invalids_aa_2",
    "uniprot_B",
    "chain_seq_start_2",
    "chain_seq_end_2",
    "uniprot_B_equalseq",
    "uniprot_B_equalseq_canonical",
    "uniprot_B_full",
    "uniprot_B_inseq",
    "uniprot_B_inseq_canonical",
    "uniprot_B_noiso1",
]

# Annotation columns named "need_pipejoin" by the author; they are excluded from the merge keys.
need_pipejoin = [
    "interaction_xml_id",
    "reactome_1", "reactome_2",
    "host_label_full_1",
    "host_label_full_2",
    "host_label_short_1",
    "host_label_short_2",
    "host_taxid_1",
    "host_taxid_2",
    "host_cell_type_1", "host_cell_type_2",
    "interaction_label", "interaction_mi", "pubmeds",
] + [
    "all_binding_mi_1",
    "all_binding_name_1",
    "all_binding_short_1",
    "all_binding_begin_1",
    "all_binding_end_1",
    "all_binding_mi_2",
    "all_binding_name_2",
    "all_binding_short_2",
    "all_binding_begin_2",
    "all_binding_end_2",
]

# Everything that must NOT be a merge key; these columns are dropped from the PTM side of each
# merge so that only the mutation table's copy survives.
noc1 = list(set(change_cols + need_pipejoin)) + ["Negative", "seq_sort_og", "seq_sort_og_id", "seq_sort_new", "seq_sort_new_id"]

c3 = list(c3 - set(noc1))
print(f"seq_sort_og in list of columns to merge on: {'seq_sort_og' in c3}")
print(f"seq_sort in list of columns to merge on: {'seq_sort' in c3}")
print(f"Total columns to merge on: {len(c3)}")

# Do an outer merge so we keep everything from both unknown tables.
simplemerged_ptm_and_mut_unknown = pd.merge(
    simplemerged_ptm_unknown.drop(columns=noc1),
    simplemerged_mut_unknown,
    on=c3,
    how="outer"
)
print(f"Length of simplemerged_ptm_and_mut_unknown directly after joining: {len(simplemerged_ptm_and_mut_unknown)}")

# NOTE(review): not used anywhere in this cell, and noc1 already contains "seq_sort_og".
# Kept only in case a later cell references the name.
noc1_cross_merge = noc1 + ["seq_sort_og"]

# Cross-label combinations. Inner merges: only rows that match on every key in c3 in BOTH
# tables are conflicts worth recording.
# PTM pos + Mut neg
to_concat_ptm_pos_mut_neg = pd.merge(
    simplemerged_ptm.drop(columns=noc1),
    simplemerged_neg_mut,
    on=c3,
    how="inner"
)

# PTM neg + Mut pos
to_concat_ptm_neg_mut_pos = pd.merge(
    simplemerged_neg_ptm.drop(columns=noc1),
    simplemerged_mut,
    on=c3,
    how="inner"
)

# PTM pos + Mut unknown
to_concat_ptm_pos_mut_unknown = pd.merge(
    simplemerged_ptm.drop(columns=noc1),
    simplemerged_mut_unknown,
    on=c3,
    how="inner"
)

# PTM neg + Mut unknown
to_concat_ptm_neg_mut_unknown = pd.merge(
    simplemerged_neg_ptm.drop(columns=noc1),
    simplemerged_mut_unknown,
    on=c3,
    how="inner"
)

# PTM unknown + Mut neg
# FIX: this result used to be stored in to_concat_ptm_unknown_mut_pos and was overwritten by
# the next merge, so the unknown-PTM / negative-mutation conflicts never reached the output.
to_concat_ptm_unknown_mut_neg = pd.merge(
    simplemerged_ptm_unknown.drop(columns=noc1),
    simplemerged_neg_mut,
    on=c3,
    how="inner"
)

# PTM unknown + Mut pos
to_concat_ptm_unknown_mut_pos = pd.merge(
    simplemerged_ptm_unknown.drop(columns=noc1),
    simplemerged_mut,
    on=c3,
    how="inner"
)

# Append all six cross-label combinations to the unknown table, then remove exact duplicate rows.
simplemerged_ptm_and_mut_unknown = pd.concat([
    simplemerged_ptm_and_mut_unknown,
    to_concat_ptm_pos_mut_neg,
    to_concat_ptm_neg_mut_pos,
    to_concat_ptm_pos_mut_unknown,
    to_concat_ptm_neg_mut_unknown,
    to_concat_ptm_unknown_mut_neg,
    to_concat_ptm_unknown_mut_pos
])
simplemerged_ptm_and_mut_unknown = simplemerged_ptm_and_mut_unknown.drop_duplicates()
simplemerged_ptm_and_mut_unknown = simplemerged_ptm_and_mut_unknown.reset_index(drop=True)

print(f"Length of simplemerged_ptm_and_mut_unknown after adding all the other mismatches: {len(simplemerged_ptm_and_mut_unknown)}")

seq_sort_og in list of columns to merge on: False
seq_sort in list of columns to merge on: True
Total columns to merge on: 62
Length of simplemerged_ptm_and_mut_unknown directly after joining: 28529
Length of simplemerged_ptm_and_mut_unknown after adding all the other mismatches: 40816


In [692]:
# Make sure all the agreeing ones are in there
# Get intact IDs that have mutation and PTM data
# Positive
simplemerged_mut_seq_sorts = simplemerged_mut.loc[
    simplemerged_mut["Mutated Partner Status"].notna()
]["seq_sort"].unique().tolist()
simplemerged_ptm_seq_sorts = simplemerged_ptm.loc[
    simplemerged_ptm["PTM Partner Status"].notna()
]["seq_sort"].unique().tolist()
# Negative
my_neg_with_mut_seq_sorts = simplemerged_neg_mut.loc[
    simplemerged_neg_mut["Mutated Partner Status"].notna()
]["seq_sort"].unique().tolist()
simplemerged_ptm_neg_seq_sorts = simplemerged_ptm_neg.loc[
    simplemerged_ptm_neg["PTM Partner Status"].notna()
]["seq_sort"].unique().tolist()
# Unknown
simplemerged_mut_unknown_seq_sorts = simplemerged_mut_unknown.loc[
    simplemerged_mut_unknown["Mutated Partner Status"].notna()
]["seq_sort"].unique().tolist()
simplemerged_ptm_unknown_seq_sorts = simplemerged_ptm_unknown.loc[
    simplemerged_ptm_unknown["PTM Partner Status"].notna()
]["seq_sort"].unique().tolist()

### Shared IDs that have mutation AND PTM Data
# Agreement
print("The following print statements reflect disagreement at the seq_sort level, which reflect all true conflicts.")
print("Agreeing labels between mutation and PTM:")
# Pos-Pos intersection: 
test1 = set(simplemerged_mut_seq_sorts).intersection(set(simplemerged_ptm_seq_sorts))
print(f"\tThere are {len(test1)} unique seq_sorts shared between positive mutation-binding (new or original) and positive PTM-binding (original).")
test2 = len(simplemerged_ptm_and_mut.loc[
    simplemerged_ptm_and_mut["seq_sort"].isin(test1)
]["seq_sort"].unique().tolist())
print(f"\t\tTotal seq_sorts present in simplemerged_ptm_and_mut: {test2}/{len(test1)}")
test2 = len(simplemerged_neg_ptm_and_mut.loc[
    simplemerged_neg_ptm_and_mut["seq_sort"].isin(test1)
]["seq_sort"].unique().tolist())
print(f"\t\tTotal seq_sorts present in simplemerged_neg_ptm_and_mut: {test2}/{len(test1)}")
test2 = len(simplemerged_ptm_and_mut_unknown.loc[
    simplemerged_ptm_and_mut_unknown["seq_sort"].isin(test1)
]["seq_sort"].unique().tolist())
print(f"\t\tTotal seq_sorts present in simplemerged_ptm_and_mut_unknown: {test2}/{len(test1)}")

decision_cols = ['Mutated all_new_binds', 'Mutated all_og_binds', 'Mutated decisive_entry_new_binds', 'Mutated decisive_seqpair_new_binds', 'PTM decisive_entry_og_binds', 'PTM decisive_seqpair_og_binds',]
decision_cols = [x for x in decision_cols if x in simplemerged_ptm_and_mut.columns]

# Neg-neg intersection
test1 = set(my_neg_with_mut_seq_sorts).intersection(set(simplemerged_ptm_neg_seq_sorts))
print(f"\tThere are {len(test1)} unique seq_sorts shared between negative mutation-binding (new or original) and negative PTM-binding (original).")
test2 = len(simplemerged_ptm_and_mut.loc[
    simplemerged_ptm_and_mut["seq_sort"].isin(test1)
]["seq_sort"].unique().tolist())
print(f"\t\tTotal seq_sorts present in simplemerged_ptm_and_mut: {test2}/{len(test1)}")
test2 = len(simplemerged_neg_ptm_and_mut.loc[
    simplemerged_neg_ptm_and_mut["seq_sort"].isin(test1)
]["seq_sort"].unique().tolist())
print(f"\t\tTotal seq_sorts present in simplemerged_neg_ptm_and_mut: {test2}/{len(test1)}")
test2 = len(simplemerged_ptm_and_mut_unknown.loc[
    simplemerged_ptm_and_mut_unknown["seq_sort"].isin(test1)
]["seq_sort"].unique().tolist())
print(f"\t\tTotal seq_sorts present in simplemerged_ptm_and_mut_unknown: {test2}/{len(test1)}")

# Unknown-unknown intersection
test1 = set(simplemerged_mut_unknown_seq_sorts).intersection(set(simplemerged_ptm_unknown_seq_sorts))
print(f"\tThere are {len(test1)} unique seq_sorts shared between unknown mutation-binding (new or original) and unknown PTM-binding (original).")
test2 = len(simplemerged_ptm_and_mut.loc[
    simplemerged_ptm_and_mut["seq_sort"].isin(test1)
]["seq_sort"].unique().tolist())
print(f"\t\tTotal seq_sorts present in simplemerged_ptm_and_mut: {test2}/{len(test1)}")
test2 = len(simplemerged_neg_ptm_and_mut.loc[
    simplemerged_neg_ptm_and_mut["seq_sort"].isin(test1)
]["seq_sort"].unique().tolist())
print(f"\t\tTotal seq_sorts present in simplemerged_neg_ptm_and_mut: {test2}/{len(test1)}")
test2 = len(simplemerged_ptm_and_mut_unknown.loc[
    simplemerged_ptm_and_mut_unknown["seq_sort"].isin(test1)
]["seq_sort"].unique().tolist())
print(f"\t\tTotal seq_sorts present in simplemerged_ptm_and_mut_unknown: {test2}/{len(test1)}")

# Disgreement
disagreeing_seq_sort_list = []
print("Disagreeing labels between mutation and PTM:")
# Pos-Neg (Mut-PTM) intersection: 
test1 = set(simplemerged_mut_seq_sorts).intersection(set(simplemerged_ptm_neg_seq_sorts))
disagreeing_seq_sort_list += list(test1)
print(f"\tThere are {len(test1)} unique seq_sorts where Mutation label is positive, PTM is negative.")
test2 = len(simplemerged_ptm_and_mut.loc[
    simplemerged_ptm_and_mut["seq_sort"].isin(test1)
]["seq_sort"].unique().tolist())
print(f"\t\tTotal seq_sorts present in simplemerged_ptm_and_mut: {test2}/{len(test1)}")
test2 = len(simplemerged_neg_ptm_and_mut.loc[
    simplemerged_neg_ptm_and_mut["seq_sort"].isin(test1)
]["seq_sort"].unique().tolist())
print(f"\t\tTotal seq_sorts present in simplemerged_neg_ptm_and_mut: {test2}/{len(test1)}")
test2 = len(simplemerged_ptm_and_mut_unknown.loc[
    simplemerged_ptm_and_mut_unknown["seq_sort"].isin(test1)
]["seq_sort"].unique().tolist())
print(f"\t\tTotal seq_sorts present in simplemerged_ptm_and_mut_unknown: {test2}/{len(test1)}")

# Neg-Pos (Mut-PTM) intersection: 
test1 = set(my_neg_with_mut_seq_sorts).intersection(set(simplemerged_ptm_seq_sorts))
disagreeing_seq_sort_list += list(test1)
print(f"\tThere are {len(test1)} unique seq_sorts where Mutation label is negative, PTM is positive.")
test2 = len(simplemerged_ptm_and_mut.loc[
    simplemerged_ptm_and_mut["seq_sort"].isin(test1)
]["seq_sort"].unique().tolist())
print(f"\t\tTotal seq_sorts present in simplemerged_ptm_and_mut: {test2}/{len(test1)}")
test2 = len(simplemerged_neg_ptm_and_mut.loc[
    simplemerged_neg_ptm_and_mut["seq_sort"].isin(test1)
]["seq_sort"].unique().tolist())
print(f"\t\tTotal seq_sorts present in simplemerged_neg_ptm_and_mut: {test2}/{len(test1)}")
test2 = len(simplemerged_ptm_and_mut_unknown.loc[
    simplemerged_ptm_and_mut_unknown["seq_sort"].isin(test1)
]["seq_sort"].unique().tolist())
print(f"\t\tTotal seq_sorts present in simplemerged_ptm_and_mut_unknown: {test2}/{len(test1)}")

# Unknown-Pos (Mut-PTM) intersection: 
test1 = set(simplemerged_mut_unknown_seq_sorts).intersection(set(simplemerged_ptm_seq_sorts))
disagreeing_seq_sort_list += list(test1)
print(f"\tThere are {len(test1)} unique seq_sorts where Mutation label is unknown, PTM is positive.")
test2 = len(simplemerged_ptm_and_mut.loc[
    simplemerged_ptm_and_mut["seq_sort"].isin(test1)
]["seq_sort"].unique().tolist())
print(f"\t\tTotal seq_sorts present in simplemerged_ptm_and_mut: {test2}/{len(test1)}")
test2 = len(simplemerged_neg_ptm_and_mut.loc[
    simplemerged_neg_ptm_and_mut["seq_sort"].isin(test1)
]["seq_sort"].unique().tolist())
print(f"\t\tTotal seq_sorts present in simplemerged_neg_ptm_and_mut: {test2}/{len(test1)}")
test2 = len(simplemerged_ptm_and_mut_unknown.loc[
    simplemerged_ptm_and_mut_unknown["seq_sort"].isin(test1)
]["seq_sort"].unique().tolist())
print(f"\t\tTotal seq_sorts present in simplemerged_ptm_and_mut_unknown: {test2}/{len(test1)}")

# Unknown-Neg (Mut-PTM) intersection: 
test1 = set(simplemerged_mut_unknown_seq_sorts).intersection(set(simplemerged_ptm_neg_seq_sorts))
disagreeing_seq_sort_list += list(test1)
print(f"\tThere are {len(test1)} unique seq_sorts where Mutation label is unknown, PTM is negative.")
test2 = len(simplemerged_ptm_and_mut.loc[
    simplemerged_ptm_and_mut["seq_sort"].isin(test1)
]["seq_sort"].unique().tolist())
print(f"\t\tTotal seq_sorts present in simplemerged_ptm_and_mut: {test2}/{len(test1)}")
test2 = len(simplemerged_neg_ptm_and_mut.loc[
    simplemerged_neg_ptm_and_mut["seq_sort"].isin(test1)
]["seq_sort"].unique().tolist())
print(f"\t\tTotal seq_sorts present in simplemerged_neg_ptm_and_mut: {test2}/{len(test1)}")
test2 = len(simplemerged_ptm_and_mut_unknown.loc[
    simplemerged_ptm_and_mut_unknown["seq_sort"].isin(test1)
]["seq_sort"].unique().tolist())
print(f"\t\tTotal seq_sorts present in simplemerged_ptm_and_mut_unknown: {test2}/{len(test1)}")

# Pos-Unknown (Mut-PTM) intersection: 
test1 = set(simplemerged_mut_seq_sorts).intersection(set(simplemerged_ptm_unknown_seq_sorts))
disagreeing_seq_sort_list += list(test1)
print(f"\tThere are {len(test1)} unique seq_sorts where Mutation label is positive, PTM is unknown.")
test2 = len(simplemerged_ptm_and_mut.loc[
    simplemerged_ptm_and_mut["seq_sort"].isin(test1)
]["seq_sort"].unique().tolist())
print(f"\t\tTotal seq_sorts present in simplemerged_ptm_and_mut: {test2}/{len(test1)}")
test2 = len(simplemerged_neg_ptm_and_mut.loc[
    simplemerged_neg_ptm_and_mut["seq_sort"].isin(test1)
]["seq_sort"].unique().tolist())
print(f"\t\tTotal seq_sorts present in simplemerged_neg_ptm_and_mut: {test2}/{len(test1)}")
test2 = len(simplemerged_ptm_and_mut_unknown.loc[
    simplemerged_ptm_and_mut_unknown["seq_sort"].isin(test1)
]["seq_sort"].unique().tolist())
print(f"\t\tTotal seq_sorts present in simplemerged_ptm_and_mut_unknown: {test2}/{len(test1)}")

# Neg-Unknown (Mut-PTM) intersection: 
test1 = set(my_neg_with_mut_seq_sorts).intersection(set(simplemerged_ptm_unknown_seq_sorts))
disagreeing_seq_sort_list += list(test1)
print(f"\tThere are {len(test1)} unique seq_sorts where Mutation label is negative, PTM is unknown.")
test2 = len(simplemerged_ptm_and_mut.loc[
    simplemerged_ptm_and_mut["seq_sort"].isin(test1)
]["seq_sort"].unique().tolist())
print(f"\t\tTotal seq_sorts present in simplemerged_ptm_and_mut: {test2}/{len(test1)}")
test2 = len(simplemerged_neg_ptm_and_mut.loc[
    simplemerged_neg_ptm_and_mut["seq_sort"].isin(test1)
]["seq_sort"].unique().tolist())
print(f"\t\tTotal seq_sorts present in simplemerged_neg_ptm_and_mut: {test2}/{len(test1)}")
test2 = len(simplemerged_ptm_and_mut_unknown.loc[
    simplemerged_ptm_and_mut_unknown["seq_sort"].isin(test1)
]["seq_sort"].unique().tolist())
print(f"\t\tTotal seq_sorts present in simplemerged_ptm_and_mut_unknown: {test2}/{len(test1)}")

disagreeing_seq_sort_list = list(set(disagreeing_seq_sort_list))
print(f"TOTAL disagreeing seq_sort pairs (all will be moved into UNKNOWN): {len(disagreeing_seq_sort_list)}")


The following print statements reflect disagreement at the seq_sort level, which reflect all true conflicts.
Agreeing labels between mutation and PTM:
	There are 103 unique seq_sorts shared between positive mutation-binding (new or original) and positive PTM-binding (original).
		Total seq_sorts present in simplemerged_ptm_and_mut: 103/103
		Total seq_sorts present in simplemerged_neg_ptm_and_mut: 0/103
		Total seq_sorts present in simplemerged_ptm_and_mut_unknown: 0/103
	There are 1 unique seq_sorts shared between negative mutation-binding (new or original) and negative PTM-binding (original).
		Total seq_sorts present in simplemerged_ptm_and_mut: 0/1
		Total seq_sorts present in simplemerged_neg_ptm_and_mut: 1/1
		Total seq_sorts present in simplemerged_ptm_and_mut_unknown: 0/1
	There are 49 unique seq_sorts shared between unknown mutation-binding (new or original) and unknown PTM-binding (original).
		Total seq_sorts present in simplemerged_ptm_and_mut: 0/49
		Total seq_sorts presen

In [693]:
### Remove the disagreeing labels!!
# Rows of simplemerged_ptm_and_mut whose seq_sort appears in disagreeing_seq_sort_list are
# set aside in new_unknowns_from_pos; every other row stays in simplemerged_ptm_and_mut.
print(f"Length of simplemerged_ptm_and_mut BEFORE dropping seq_sort_ogs that have conflicting Mutation and PTM labels: {len(simplemerged_ptm_and_mut)}")

# Compute the membership mask once and reuse it for both halves of the split.
pos_conflict_mask = simplemerged_ptm_and_mut["seq_sort"].isin(disagreeing_seq_sort_list)
new_unknowns_from_pos = simplemerged_ptm_and_mut[pos_conflict_mask].reset_index(drop=True)
simplemerged_ptm_and_mut = simplemerged_ptm_and_mut[~pos_conflict_mask].reset_index(drop=True)

print(f"Length of simplemerged_ptm_and_mut after dropping seq_sort_ogs that have conflicting Mutation and PTM labels: {len(simplemerged_ptm_and_mut)}")

Length of simplemerged_ptm_and_mut BEFORE dropping seq_sort_ogs that have conflicting Mutation and PTM labels: 801620
Length of simplemerged_ptm_and_mut after dropping seq_sort_ogs that have conflicting Mutation and PTM labels: 799352


In [694]:
### Remove the disagreeing labels!!
# Rows of simplemerged_neg_ptm_and_mut whose seq_sort appears in disagreeing_seq_sort_list
# are set aside in new_unknowns_from_neg; every other row stays in simplemerged_neg_ptm_and_mut.
print(f"Length of simplemerged_neg_ptm_and_mut BEFORE dropping seq_sort_ogs that have conflicting Mutation and PTM labels: {len(simplemerged_neg_ptm_and_mut)}")

# Compute the membership mask once and reuse it for both halves of the split.
neg_conflict_mask = simplemerged_neg_ptm_and_mut["seq_sort"].isin(disagreeing_seq_sort_list)
new_unknowns_from_neg = simplemerged_neg_ptm_and_mut[neg_conflict_mask].reset_index(drop=True)
simplemerged_neg_ptm_and_mut = simplemerged_neg_ptm_and_mut[~neg_conflict_mask].reset_index(drop=True)

print(f"Length of simplemerged_neg_ptm_and_mut after dropping seq_sort_ogs that have conflicting Mutation and PTM labels: {len(simplemerged_neg_ptm_and_mut)}")

Length of simplemerged_neg_ptm_and_mut BEFORE dropping seq_sort_ogs that have conflicting Mutation and PTM labels: 757756
Length of simplemerged_neg_ptm_and_mut after dropping seq_sort_ogs that have conflicting Mutation and PTM labels: 756275


In [695]:
# Post-removal sanity check: recompute the Mutation/PTM label overlaps at the seq_sort level
# and report, for each overlap, how many of its seq_sorts are present in each merged table.
# NOTE: the per-label seq_sort lists below are rebuilt from the single-source tables
# (simplemerged_mut, simplemerged_ptm, ...), not from the merged tables, so the intersection
# sizes printed here do not change when rows are dropped from the merged tables; only the
# "present in <merged table>" counts can change.
print(f"Re-running these tests after removing all disagreeing seq_sorts from these databases. We should see none of them.")

# Unique seq_sorts that carry mutation / PTM information, per binding label.
# Positive
simplemerged_mut_seq_sorts = simplemerged_mut.loc[
    simplemerged_mut["Mutated Partner Status"].notna()
]["seq_sort"].unique().tolist()
simplemerged_ptm_seq_sorts = simplemerged_ptm.loc[
    simplemerged_ptm["PTM Partner Status"].notna()
]["seq_sort"].unique().tolist()
# Negative
my_neg_with_mut_seq_sorts = simplemerged_neg_mut.loc[
    simplemerged_neg_mut["Mutated Partner Status"].notna()
]["seq_sort"].unique().tolist()
simplemerged_ptm_neg_seq_sorts = simplemerged_ptm_neg.loc[
    simplemerged_ptm_neg["PTM Partner Status"].notna()
]["seq_sort"].unique().tolist()
# Unknown
simplemerged_mut_unknown_seq_sorts = simplemerged_mut_unknown.loc[
    simplemerged_mut_unknown["Mutated Partner Status"].notna()
]["seq_sort"].unique().tolist()
simplemerged_ptm_unknown_seq_sorts = simplemerged_ptm_unknown.loc[
    simplemerged_ptm_unknown["PTM Partner Status"].notna()
]["seq_sort"].unique().tolist()

# Label -> seq_sort list lookups, so every overlap below is derived from a (mutation label,
# PTM label) pair instead of a hand-copied block per case.
mut_seq_sorts_by_label = {
    "positive": simplemerged_mut_seq_sorts,
    "negative": my_neg_with_mut_seq_sorts,
    "unknown": simplemerged_mut_unknown_seq_sorts,
}
ptm_seq_sorts_by_label = {
    "positive": simplemerged_ptm_seq_sorts,
    "negative": simplemerged_ptm_neg_seq_sorts,
    "unknown": simplemerged_ptm_unknown_seq_sorts,
}

# Merged tables whose membership is reported for every overlap, in print order.
merged_tables = (
    ("simplemerged_ptm_and_mut", simplemerged_ptm_and_mut),
    ("simplemerged_neg_ptm_and_mut", simplemerged_neg_ptm_and_mut),
    ("simplemerged_ptm_and_mut_unknown", simplemerged_ptm_and_mut_unknown),
)

### Shared IDs that have mutation AND PTM Data
# Agreement: the same label on the mutation side and on the PTM side.
print("The following print statements reflect disagreement at the seq_sort level, which reflect all true conflicts.")
print("Agreeing labels between mutation and PTM:")
for label in ("positive", "negative", "unknown"):
    test1 = set(mut_seq_sorts_by_label[label]).intersection(set(ptm_seq_sorts_by_label[label]))
    print(f"\tThere are {len(test1)} unique seq_sorts shared between {label} mutation-binding (new or original) and {label} PTM-binding (original).")
    for table_name, table in merged_tables:
        # Number of distinct seq_sorts from this overlap that occur in the merged table.
        test2 = len(table.loc[table["seq_sort"].isin(test1)]["seq_sort"].unique().tolist())
        print(f"\t\tTotal seq_sorts present in {table_name}: {test2}/{len(test1)}")

# Keep only the decision columns that actually exist in the merged positive table.
decision_cols = ['Mutated all_new_binds', 'Mutated all_og_binds', 'Mutated decisive_entry_new_binds', 'Mutated decisive_seqpair_new_binds', 'PTM decisive_entry_og_binds', 'PTM decisive_seqpair_og_binds',]
decision_cols = [x for x in decision_cols if x in simplemerged_ptm_and_mut.columns]

# Disagreement: every (mutation label, PTM label) pair whose labels differ. Each case
# computes its OWN intersection inside the loop, so no case can silently reuse the previous
# case's set (the Neg-Unknown case used to report and collect the Pos-Unknown set).
disagreement_cases = (
    ("positive", "negative"),
    ("negative", "positive"),
    ("unknown", "positive"),
    ("unknown", "negative"),
    ("positive", "unknown"),
    ("negative", "unknown"),
)
disagreeing_seq_sort_list = []
print("Disagreeing labels between mutation and PTM:")
for mut_label, ptm_label in disagreement_cases:
    test1 = set(mut_seq_sorts_by_label[mut_label]).intersection(set(ptm_seq_sorts_by_label[ptm_label]))
    disagreeing_seq_sort_list += list(test1)
    print(f"\tThere are {len(test1)} unique seq_sorts where Mutation label is {mut_label}, PTM is {ptm_label}.")
    for table_name, table in merged_tables:
        test2 = len(table.loc[table["seq_sort"].isin(test1)]["seq_sort"].unique().tolist())
        print(f"\t\tTotal seq_sorts present in {table_name}: {test2}/{len(test1)}")

# A seq_sort can fall into more than one disagreement case; count each one once.
disagreeing_seq_sort_list = list(set(disagreeing_seq_sort_list))
print(f"TOTAL disagreeing seq_sort pairs (all will be moved into UNKNOWN): {len(disagreeing_seq_sort_list)}")


Re-running these tests after removing all disagreeing seq_sorts from these databases. We should see none of them.
The following print statements reflect disagreement at the seq_sort level, which reflect all true conflicts.
Agreeing labels between mutation and PTM:
	There are 103 unique seq_sorts shared between positive mutation-binding (new or original) and positive PTM-binding (original).
		Total seq_sorts present in simplemerged_ptm_and_mut: 103/103
		Total seq_sorts present in simplemerged_neg_ptm_and_mut: 0/103
		Total seq_sorts present in simplemerged_ptm_and_mut_unknown: 0/103
	There are 1 unique seq_sorts shared between negative mutation-binding (new or original) and negative PTM-binding (original).
		Total seq_sorts present in simplemerged_ptm_and_mut: 0/1
		Total seq_sorts present in simplemerged_neg_ptm_and_mut: 1/1
		Total seq_sorts present in simplemerged_ptm_and_mut_unknown: 0/1
	There are 49 unique seq_sorts shared between unknown mutation-binding (new or original) and un

In [696]:
# Preview rows of simplemerged_ptm_and_mut that carry a mutation annotation, a PTM
# annotation, or both, printing the row count and the first few rows of each subset.
preview_cols = [
    "interaction_intactid",
    "unique_id",
    "seq_pair_id",
    "seq_sort",
    "Mutated Partner Status",
    "mutation_short",
    "PTM Partner Status",
    "ptm_short",
]
has_mutation = simplemerged_ptm_and_mut["Mutated Partner Status"].notna()
has_ptm = simplemerged_ptm_and_mut["PTM Partner Status"].notna()

# Mutation annotated
test1 = simplemerged_ptm_and_mut[has_mutation]
print(f"Displaying a few rows where there is a mutation associated. Total = {len(test1)}")
display(test1[preview_cols].head())

# PTM annotated
test1 = simplemerged_ptm_and_mut[has_ptm]
print(f"\nDisplaying a few rows where there is a PTM associated. Total = {len(test1)}")
display(test1[preview_cols].head())

# Both annotated on the same row
test1 = simplemerged_ptm_and_mut[has_mutation & has_ptm]
print(f"\nDisplaying a few rows where there is both a mutation and PTM associated. Total = {len(test1)}")
display(test1[preview_cols].head())

Displaying a few rows where there is a mutation associated. Total = 66629


Unnamed: 0,interaction_intactid,unique_id,seq_pair_id,seq_sort,Mutated Partner Status,mutation_short,PTM Partner Status,ptm_short
136,EBI-63730944,intact:EBI-743549_intact:EBI-7779316,seqpair410510,MENEKENLFCEPHKRGLMKTPLKESTTANIVLAEIQPDFGPLTTPT...,mutated,O43791:p.Phe102Cys,,
139,EBI-63734654,intact:EBI-743549_intact:EBI-7779316,seqpair410510,MENEKENLFCEPHKRGLMKTPLKESTTANIVLAEIQPDFGPLTTPT...,mutated,O43791:p.Phe133Leu,,
140,EBI-63741722,intact:EBI-743549_intact:EBI-7779316,seqpair410510,MENEKENLFCEPHKRGLMKTPLKESTTANIVLAEIQPDFGPLTTPT...,mutated,O43791:p.Tyr87Cys,,
143,EBI-63943274,intact:EBI-743549_intact:EBI-7779316,seqpair410510,MENEKENLFCEPHKRGLMKTPLKESTTANIVLAEIQPDFGPLTTPT...,mutated,O43791:p.Phe133Val,,
145,EBI-63718351,intact:EBI-302489_intact:EBI-7779316,seqpair265579,MENEKENLFCEPHKRGLMKTPLKESTTANIVLAEIQPDFGPLTTPT...,mutated,P51532:p.Gly1232Ser,,



Displaying a few rows where there is a PTM associated. Total = 4454


Unnamed: 0,interaction_intactid,unique_id,seq_pair_id,seq_sort,Mutated Partner Status,mutation_short,PTM Partner Status,ptm_short
601,EBI-8082520,intact:EBI-747353_intact:EBI-974488,seqpair412199,MAGTSAPGSKRRSEPPAPRPGPPPGTGHPPSKRARGFSAAAAPDPD...,,,ptm,phosphorylated
2693,EBI-8537517,intact:EBI-129424_intact:EBI-522090,seqpair106585,MANVESMIVEEKTQVKQIDREKTCPMLLRVFCSTGRHHSVSEYMFG...,,,ptm,phosphorylated
3477,EBI-7702055,intact:EBI-1055635_intact:EBI-7702062,seqpair38479,MGFSSELCSPQGHGVLQQMQEAELRLLEGMRKWMAQRVKSDREYAG...,,,ptm,phosphorylated
3478,EBI-7702055,intact:EBI-1055635_intact:EBI-7702062,seqpair38479,MGFSSELCSPQGHGVLQQMQEAELRLLEGMRKWMAQRVKSDREYAG...,,,ptm,phosphorylated
4520,EBI-8082540,intact:EBI-621404_intact:EBI-974488,seqpair388040,MAVPFVEDWDLVQTLGEGAYGEVQLAVNRVTEEAVAVKIVDMKRAV...,,,ptm,phosphorylated



Displaying a few rows where there is both a mutation and PTM associated. Total = 250


Unnamed: 0,interaction_intactid,unique_id,seq_pair_id,seq_sort,Mutated Partner Status,mutation_short,PTM Partner Status,ptm_short
272834,EBI-8839164,intact:EBI-6405522_intact:EBI-8839139,seqpair390795,MTDMNPDIEKDQTSDEVTVETTSVFRADFLSELDAPAQAGTESAVS...,original,P9WJA9:p.Asn117Ala,original,thr22
272835,EBI-8839164,intact:EBI-6405522_intact:EBI-8839139,seqpair390795,MTDMNPDIEKDQTSDEVTVETTSVFRADFLSELDAPAQAGTESAVS...,original,P9WJA9:p.Lys141Glu,original,thr22
272836,EBI-8839164,intact:EBI-6405522_intact:EBI-8839139,seqpair390795,MTDMNPDIEKDQTSDEVTVETTSVFRADFLSELDAPAQAGTESAVS...,original,P9WJA9:p.Ser95Ala,original,thr22
283092,EBI-13646290,intact:EBI-13646303_intact:EBI-7147442,seqpair115564,MKMMLVRRFRVLILMVFLVACALHIALDLLPRLERRGARPSGEPGC...,original,Q8IXL6:p.Asp478Ala,original,thr-193
283093,EBI-13646290,intact:EBI-13646303_intact:EBI-7147442,seqpair115564,MKMMLVRRFRVLILMVFLVACALHIALDLLPRLERRGARPSGEPGC...,original,Q8IXL6:p.Asp478Ala,original,ser-194


In [697]:
# Count rows of simplemerged_ptm_and_mut for selected combinations of
# "Mutated Partner Status" and "PTM Partner Status", then check that the two
# decisive_seqpair_og_binds columns agree on rows where both statuses are "original".
print(f"For simplemerged_ptm_and_mut:")
pos_mut_status = simplemerged_ptm_and_mut["Mutated Partner Status"]
pos_ptm_status = simplemerged_ptm_and_mut["PTM Partner Status"]

# Mixed combinations: one side "original" while the other side is modified.
test1 = simplemerged_ptm_and_mut[(pos_mut_status == "original") & (pos_ptm_status == "ptm")]
print(f"\nTotal rows where mutated partner status is original and PTM partner status is ptm: {len(test1)}")
test1 = simplemerged_ptm_and_mut[(pos_mut_status == "mutated") & (pos_ptm_status == "original")]
print(f"\nTotal rows where mutated partner status is mutated and PTM partner status is original: {len(test1)}")

# One side modified while the other side has no status at all.
test1 = simplemerged_ptm_and_mut[pos_mut_status.isna() & (pos_ptm_status == "ptm")]
print(f"\nTotal rows where mutated partner status is NaN and PTM partner status is ptm: {len(test1)}")
test1 = simplemerged_ptm_and_mut[(pos_mut_status == "mutated") & pos_ptm_status.isna()]
print(f"\nTotal rows where mutated partner status is mutated and PTM partner status is NaN: {len(test1)}")

# Both sides "original": the two seq-pair level decisions are compared element-wise.
test1 = simplemerged_ptm_and_mut[(pos_mut_status == "original") & (pos_ptm_status == "original")]
print(f"\nTotal rows where mutated partner status is original and PTM partner status is original: {len(test1)}")
test2 = test1["PTM decisive_seqpair_og_binds"].eq(test1["Mutated decisive_seqpair_og_binds"]).all()
print(f"\tPTM decisive_seqpair_og_binds and Mutated decisive_seqpair_og_binds always agree here: {test2}")
pos_preview_cols = [
    "interaction_intactid",
    "unique_id",
    "seq_pair_id",
    "seq_sort",
    "Mutated Partner Status",
    "mutation_short",
    "PTM Partner Status",
    "ptm_short",
    "Mutated decisive_seqpair_og_binds",
    "PTM decisive_seqpair_og_binds",
]
display(test1[pos_preview_cols].head())

For simplemerged_ptm_and_mut:

Total rows where mutated partner status is original and PTM partner status is ptm: 0

Total rows where mutated partner status is mutated and PTM partner status is original: 0

Total rows where mutated partner status is NaN and PTM partner status is ptm: 2732

Total rows where mutated partner status is mutated and PTM partner status is NaN: 45133

Total rows where mutated partner status is original and PTM partner status is original: 250
	PTM decisive_seqpair_og_binds and Mutated decisive_seqpair_og_binds always agree here: True


Unnamed: 0,interaction_intactid,unique_id,seq_pair_id,seq_sort,Mutated Partner Status,mutation_short,PTM Partner Status,ptm_short,Mutated decisive_seqpair_og_binds,PTM decisive_seqpair_og_binds
272834,EBI-8839164,intact:EBI-6405522_intact:EBI-8839139,seqpair390795,MTDMNPDIEKDQTSDEVTVETTSVFRADFLSELDAPAQAGTESAVS...,original,P9WJA9:p.Asn117Ala,original,thr22,yes,yes
272835,EBI-8839164,intact:EBI-6405522_intact:EBI-8839139,seqpair390795,MTDMNPDIEKDQTSDEVTVETTSVFRADFLSELDAPAQAGTESAVS...,original,P9WJA9:p.Lys141Glu,original,thr22,yes,yes
272836,EBI-8839164,intact:EBI-6405522_intact:EBI-8839139,seqpair390795,MTDMNPDIEKDQTSDEVTVETTSVFRADFLSELDAPAQAGTESAVS...,original,P9WJA9:p.Ser95Ala,original,thr22,yes,yes
283092,EBI-13646290,intact:EBI-13646303_intact:EBI-7147442,seqpair115564,MKMMLVRRFRVLILMVFLVACALHIALDLLPRLERRGARPSGEPGC...,original,Q8IXL6:p.Asp478Ala,original,thr-193,yes,yes
283093,EBI-13646290,intact:EBI-13646303_intact:EBI-7147442,seqpair115564,MKMMLVRRFRVLILMVFLVACALHIALDLLPRLERRGARPSGEPGC...,original,Q8IXL6:p.Asp478Ala,original,ser-194,yes,yes


In [698]:
# Count rows of simplemerged_neg_ptm_and_mut for selected combinations of
# "Mutated Partner Status" and "PTM Partner Status", then check that the two
# decisive_seqpair_og_binds columns agree on rows where both statuses are "original".
print(f"For simplemerged_neg_ptm_and_mut:")
neg_mut_status = simplemerged_neg_ptm_and_mut["Mutated Partner Status"]
neg_ptm_status = simplemerged_neg_ptm_and_mut["PTM Partner Status"]

# Mixed combinations: one side "original" while the other side is modified.
test1 = simplemerged_neg_ptm_and_mut[(neg_mut_status == "original") & (neg_ptm_status == "ptm")]
print(f"\nTotal rows where mutated partner status is original and PTM partner status is ptm: {len(test1)}")
test1 = simplemerged_neg_ptm_and_mut[(neg_mut_status == "mutated") & (neg_ptm_status == "original")]
print(f"\nTotal rows where mutated partner status is mutated and PTM partner status is original: {len(test1)}")

# One side modified while the other side has no status at all.
test1 = simplemerged_neg_ptm_and_mut[neg_mut_status.isna() & (neg_ptm_status == "ptm")]
print(f"\nTotal rows where mutated partner status is NaN and PTM partner status is ptm: {len(test1)}")
test1 = simplemerged_neg_ptm_and_mut[(neg_mut_status == "mutated") & neg_ptm_status.isna()]
print(f"\nTotal rows where mutated partner status is mutated and PTM partner status is NaN: {len(test1)}")

# Both sides "original": the two seq-pair level decisions are compared element-wise.
test1 = simplemerged_neg_ptm_and_mut[(neg_mut_status == "original") & (neg_ptm_status == "original")]
print(f"\nTotal rows where mutated partner status is original and PTM partner status is original: {len(test1)}")
test2 = test1["PTM decisive_seqpair_og_binds"].eq(test1["Mutated decisive_seqpair_og_binds"]).all()
print(f"\tPTM decisive_seqpair_og_binds and Mutated decisive_seqpair_og_binds always agree here: {test2}")
neg_preview_cols = [
    "interaction_intactid",
    "unique_id",
    "seq_pair_id",
    "seq_sort",
    "Mutated Partner Status",
    "mutation_short",
    "PTM Partner Status",
    "ptm_short",
    "Mutated decisive_seqpair_og_binds",
    "PTM decisive_seqpair_og_binds",
]
display(test1[neg_preview_cols].head())

For simplemerged_neg_ptm_and_mut:

Total rows where mutated partner status is original and PTM partner status is ptm: 0

Total rows where mutated partner status is mutated and PTM partner status is original: 0

Total rows where mutated partner status is NaN and PTM partner status is ptm: 135

Total rows where mutated partner status is mutated and PTM partner status is NaN: 17940

Total rows where mutated partner status is original and PTM partner status is original: 2
	PTM decisive_seqpair_og_binds and Mutated decisive_seqpair_og_binds always agree here: True


Unnamed: 0,interaction_intactid,unique_id,seq_pair_id,seq_sort,Mutated Partner Status,mutation_short,PTM Partner Status,ptm_short,Mutated decisive_seqpair_og_binds,PTM decisive_seqpair_og_binds
288180,EBI-22223546,intact:EBI-726075_intact:EBI-8796752,seqpair405301,MAKKTYDLLFKLLLIGDSGVGKTCVLFRFSDDAFNTTFISTIGIDF...,original,Q9ULR3:p.Asp288Ala,original,thr-73,no,no
288181,EBI-22223546,intact:EBI-726075_intact:EBI-8796752,seqpair405301,MAKKTYDLLFKLLLIGDSGVGKTCVLFRFSDDAFNTTFISTIGIDF...,original,Q9ULR3:p.Asp288Glu,original,thr-73,no,no


In [699]:
# Report how "Mutated Partner Status" and "PTM Partner Status" co-occur in the unknown set.
print(f"For simplemerged_ptm_and_mut_unknown:")
_mut_status = simplemerged_ptm_and_mut_unknown["Mutated Partner Status"]
_ptm_status = simplemerged_ptm_and_mut_unknown["PTM Partner Status"]

# Each entry: (mask on mutated status, mask on PTM status, wording of the report line).
# The original/original case stays last so test1 ends up holding those rows.
_status_checks = [
    (_mut_status == "original", _ptm_status == "ptm",
     "mutated partner status is original and PTM partner status is ptm"),
    (_mut_status == "mutated", _ptm_status == "original",
     "mutated partner status is mutated and PTM partner status is original"),
    (_mut_status.isna(), _ptm_status == "ptm",
     "mutated partner status is NaN and PTM partner status is ptm"),
    (_mut_status == "mutated", _ptm_status.isna(),
     "mutated partner status is mutated and PTM partner status is NaN"),
    (_mut_status == "original", _ptm_status == "original",
     "mutated partner status is original and PTM partner status is original"),
]
for _mut_mask, _ptm_mask, _wording in _status_checks:
    test1 = simplemerged_ptm_and_mut_unknown.loc[_mut_mask & _ptm_mask]
    print(f"\nTotal rows where {_wording}: {len(test1)}")

# Count agreement / disagreement between the two decision columns on the original/original rows.
_ptm_decision = test1["PTM decisive_seqpair_og_binds"]
_mut_decision = test1["Mutated decisive_seqpair_og_binds"]
test2 = (_ptm_decision == _mut_decision).sum()
print(f"\tTotal rows where PTM decisive_seqpair_og_binds and Mutated decisive_seqpair_og_binds agree: {test2}")
test2 = (_ptm_decision != _mut_decision).sum()
print(f"\tTotal rows where PTM decisive_seqpair_og_binds and Mutated decisive_seqpair_og_binds disagree: {test2} (expected: {len(new_unknowns_from_pos)+len(new_unknowns_from_neg)})")

_preview_cols = [
    "interaction_intactid", "unique_id", "seq_pair_id", "seq_sort",
    "Mutated Partner Status", "mutation_short",
    "PTM Partner Status", "ptm_short",
    "Mutated decisive_seqpair_og_binds", "PTM decisive_seqpair_og_binds",
]
display(test1[_preview_cols].head())

For simplemerged_ptm_and_mut_unknown:

Total rows where mutated partner status is original and PTM partner status is ptm: 0

Total rows where mutated partner status is mutated and PTM partner status is original: 0

Total rows where mutated partner status is NaN and PTM partner status is ptm: 9626

Total rows where mutated partner status is mutated and PTM partner status is NaN: 5215

Total rows where mutated partner status is original and PTM partner status is original: 886
	Total rows where PTM decisive_seqpair_og_binds and Mutated decisive_seqpair_og_binds agree: 147
	Total rows where PTM decisive_seqpair_og_binds and Mutated decisive_seqpair_og_binds disagree: 739 (expected: 3749)


Unnamed: 0,interaction_intactid,unique_id,seq_pair_id,seq_sort,Mutated Partner Status,mutation_short,PTM Partner Status,ptm_short,Mutated decisive_seqpair_og_binds,PTM decisive_seqpair_og_binds
2009,EBI-22180273,intact:EBI-22180293_intact:EBI-886,seqpair209563,MAATAYEHLKLHITPEKFYVEACDDGADDVLIIDRVSTEVTLAVKK...,original,p.Ala134Val,original,tyr-136,unknown,unknown
2355,EBI-22180165,intact:EBI-22154522_intact:EBI-517684,seqpair209540,MAASAKRKQEEKHLKMLRDMTGLPHNRKCFDCDQRGPTYVNMTVGS...,original,Q13480:p.Pro320Ser,original,tyr-317,unknown,unknown
2816,EBI-5277045,intact:EBI-5277069_intact:EBI-722425,seqpair380413,MELPAVGEHVFAVESIEKKRIRKGRVEYLVKWRGWSPKYNTWEPEE...,original,Q9H3R0:p.[His190Gly;Glu192Ala],original,lys-191,unknown,unknown
3232,EBI-22079913,intact:EBI-1223708_intact:EBI-22033103,seqpair95398,MRLLAKIICLMLWAICVAEDCNELPPRRNTEILTGSWSDQTYPEGT...,original,P02741-PRO_0000023526:p.[Cys36Ala;Cys97Ala],original,n-formylmet,unknown,unknown
3236,EBI-22079938,intact:EBI-1223708_intact:EBI-22033103,seqpair95398,MRLLAKIICLMLWAICVAEDCNELPPRRNTEILTGSWSDQTYPEGT...,original,P02741-PRO_0000023526:p.[Cys36Ala;Cys97Ala],original,n-formylmet,unknown,unknown


In [700]:
# Show unknown-set rows where both decision columns are filled in but disagree.
_ptm_decision = simplemerged_ptm_and_mut_unknown["PTM decisive_seqpair_og_binds"]
_mut_decision = simplemerged_ptm_and_mut_unknown["Mutated decisive_seqpair_og_binds"]
_conflicting = _ptm_decision.notna() & _mut_decision.notna() & (_ptm_decision != _mut_decision)
simplemerged_ptm_and_mut_unknown.loc[
    _conflicting,
    [
        "interaction_intactid",
        "seq_sort",
        "PTM Partner Status",
        "Mutated Partner Status",
        "PTM decisive_seqpair_og_binds",
        "Mutated decisive_seqpair_og_binds",
    ],
]

Unnamed: 0,interaction_intactid,seq_sort,PTM Partner Status,Mutated Partner Status,PTM decisive_seqpair_og_binds,Mutated decisive_seqpair_og_binds
29827,EBI-25689545,MGNAAAAKKGSEQESVKEFLAKAKEDFLKKWETPSQNTAQLDQFDR...,original,original,yes,no
29846,EBI-476976,MAAQKDQQKDAEGEGLSATTLLPKLIPSGAGREWLERRRATIRPWG...,original,original,no,yes
29871,EBI-8546510,MAGNVKKSSGAGGGSGSGGSGSGGLIGLMKDAFQPHHHHHHHLSPH...,original,original,no,yes
29872,EBI-8546510,MAGNVKKSSGAGGGSGSGGSGSGGLIGLMKDAFQPHHHHHHHLSPH...,original,original,no,yes
29877,EBI-15973246,GKGGAKRHRKVLRDNIQGI_MAHYPTRLKTRKTYSWVGRPLLDRKL...,original,original,no,yes
...,...,...,...,...,...,...
40410,EBI-1382486,MSGRGKGGKGLGKGGAKRHRKVLRDNIQGITKPAIRRLARRGGVKR...,original,original,unknown,yes
40417,EBI-15948241,MGSQKRLVQRVERKLEQTVGDAFARIFGGSIVPQEVEALLRREAAD...,original,original,unknown,yes
40463,EBI-16200180,MQIFVKTLTGKTITLEVESSDTIDNVKSKIQDKEGIPPDQQRLIFA...,original,original,unknown,yes
40464,EBI-16200180,MQIFVKTLTGKTITLEVESSDTIDNVKSKIQDKEGIPPDQQRLIFA...,original,original,unknown,yes


In [701]:
# seq_sort values present in both collections, then the unknown-set rows carrying them.
test1 = set(my_neg_with_mut_seq_sorts) & set(simplemerged_ptm_unknown_seq_sorts)
_in_both = simplemerged_ptm_and_mut_unknown["seq_sort"].isin(test1)
simplemerged_ptm_and_mut_unknown.loc[
    _in_both,
    [
        "interaction_intactid",
        "seq_sort",
        "PTM Partner Status",
        "Mutated Partner Status",
        "PTM decisive_seqpair_og_binds",
        "Mutated decisive_seqpair_og_binds",
    ],
]

Unnamed: 0,interaction_intactid,seq_sort,PTM Partner Status,Mutated Partner Status,PTM decisive_seqpair_og_binds,Mutated decisive_seqpair_og_binds
17424,EBI-10888004,MAWRCPRMGRVPLAWCLALCGWACMAPRGTQAEESPFVGNPGNITG...,original,,unknown,
17425,EBI-10888004,MAWRCPRMGRVPLAWCLALCGWACMAPRGTQAEESPFVGNPGNITG...,original,,unknown,
17428,EBI-10888004,MAWRCPRMGRVPLAWCLALCGWACMAPRGTQAEESPFVGNPGNITG...,original,,unknown,
17429,EBI-10888004,MAWRCPRMGRVPLAWCLALCGWACMAPRGTQAEESPFVGNPGNITG...,original,,unknown,
17431,EBI-10888037,MAWRCPRMGRVPLAWCLALCGWACMAPRGTQAEESPFVGNPGNITG...,original,,unknown,
17433,EBI-10888037,MAWRCPRMGRVPLAWCLALCGWACMAPRGTQAEESPFVGNPGNITG...,original,,unknown,
17436,EBI-10893779,MAWRCPRMGRVPLAWCLALCGWACMAPRGTQAEESPFVGNPGNITG...,original,,unknown,
17437,EBI-10893779,MAWRCPRMGRVPLAWCLALCGWACMAPRGTQAEESPFVGNPGNITG...,original,,unknown,
17440,EBI-10893779,MAWRCPRMGRVPLAWCLALCGWACMAPRGTQAEESPFVGNPGNITG...,original,,unknown,
17441,EBI-10893779,MAWRCPRMGRVPLAWCLALCGWACMAPRGTQAEESPFVGNPGNITG...,original,,unknown,


In [702]:
# Every column whose name contains "mutation", and the rows with a value in at least one of them.
c = [x for x in merged_neg.columns if "mutation" in x]
mask = merged_neg[c].notna().any(axis=1)
_rows_with_mutation_info = merged_neg.loc[mask, ["interaction_intactid"] + c]
print(len(_rows_with_mutation_info))
_rows_with_mutation_info.head()

37655


Unnamed: 0,interaction_intactid,mutation_mi_1,mutation_name_1,mutation_short_1,mutation_begin_1,mutation_end_1,mutation_orig_1,mutation_new_1,mutation_mi_2,mutation_name_2,mutation_short_2,mutation_begin_2,mutation_end_2,mutation_orig_2,mutation_new_2
107,EBI-2437595,MI:0573|MI:0573,mutation disrupting interaction|mutation disru...,P08069:p.Lys1033Ala|P08069:p.Tyr980Phe,1033|980,1033|980,K|Y,A|F,,,,,,,
124,EBI-9678732,,,,,,,,MI:0429,necessary binding region,finger_domain,1.0,187.0,,
127,EBI-9678671,MI:0118,mutation,Q16513:p.Leu520Ile,520,520,L,I,,,,,,,
128,EBI-9998408,MI:0118,mutation,Q16513:p.Leu520Ile,520,520,L,I,,,,,,,
129,EBI-9678713,,,,,,,,MI:0429,necessary binding region,finger_domain,1.0,187.0,,


In [703]:
# Write the three combined PTM + mutation tables to the intermediate directory.
savedir = "/scratch/pranamlab/sophie/interactome/interactome/data_files/processed/intact/intermediate"
os.makedirs(savedir,exist_ok=True)

for _stem, _frame in (
    ("simplemerged_ptm_and_mut", simplemerged_ptm_and_mut),
    ("simplemerged_neg_ptm_and_mut", simplemerged_neg_ptm_and_mut),
    ("simplemerged_ptm_and_mut_unknown", simplemerged_ptm_and_mut_unknown),
):
    _frame.to_csv(f"{savedir}/{_stem}_dec11_2025.csv", index=False)

In [704]:
# Number of distinct seq_sort values in each of the three tables.
for _label, _frame in (
    ("positive", simplemerged_ptm_and_mut),
    ("negative", simplemerged_neg_ptm_and_mut),
    ("unknown", simplemerged_ptm_and_mut_unknown),
):
    # dropna=False keeps a missing value counted as one distinct entry, as len(unique()) does.
    print(f"Unique {_label} sequences: {_frame['seq_sort'].nunique(dropna=False)}")

Unique positive sequences: 463404
Unique negative sequences: 437236
Unique unknown sequences: 10363


In [705]:
# Interaction type (MI code and label) for every row of the positive set.
simplemerged_ptm_and_mut.loc[:, ["interaction_mi", "interaction_label"]]

Unnamed: 0,interaction_mi,interaction_label
0,MI:1110,predicted interaction
1,MI:1110,predicted interaction
2,MI:1110,predicted interaction
3,MI:1110,predicted interaction
4,MI:1110,predicted interaction
...,...,...
799347,MI:0407,direct interaction
799348,MI:0407,direct interaction
799349,MI:0407,direct interaction
799350,MI:0407,direct interaction


In [706]:
# Interaction-type counts in the positive set, one row per distinct seq_sort (first occurrence kept).
_pos_one_per_seq = simplemerged_ptm_and_mut.drop_duplicates(subset="seq_sort")
_pos_one_per_seq["interaction_mi"].value_counts()

interaction_mi
MI:0915    391186
MI:0407     25917
MI:0914     23007
MI:2364      7916
MI:0403      2739
MI:0217      1817
MI:0203       381
MI:0570       373
MI:0194       206
MI:0195       162
MI:0414       106
MI:1110       104
MI:0213        97
MI:0192        57
MI:0557        56
MI:0871        38
MI:0408        35
MI:0945        34
MI:0197        33
MI:0844        25
MI:0559        25
MI:0220        22
MI:0216        20
MI:0204        18
MI:0210        13
MI:0566         8
MI:1127         5
MI:1126         5
MI:1310         5
MI:1148         2
MI:2280         2
MI:0201         2
MI:0985         2
MI:0569         1
MI:0556         1
MI:0199         1
MI:1327         1
MI:0193         1
MI:1143         1
MI:0701         1
MI:1237         1
MI:0902         1
MI:0567         1
Name: count, dtype: int64

In [707]:
# Interaction-type counts in the negative set, one row per distinct seq_sort (first occurrence kept).
_neg_one_per_seq = simplemerged_neg_ptm_and_mut.drop_duplicates(subset="seq_sort")
_neg_one_per_seq["interaction_mi"].value_counts()

interaction_mi
MI:0915    9444
MI:0407    2636
MI:0217     388
MI:0403     127
MI:0570      81
MI:2364      68
MI:0203      66
MI:0213      62
MI:0914      34
MI:0194      25
MI:0414      22
MI:0192      17
MI:0871      15
MI:0559      14
MI:0566       7
MI:0216       6
MI:0204       6
MI:0195       6
MI:1127       5
MI:0197       3
MI:0844       3
MI:0945       3
MI:0210       3
MI:0220       2
MI:0556       1
Name: count, dtype: int64

# Binding sites

## Intermediate file load

In [708]:
# Reading the binding-site table needs a larger csv field size limit.
# Start from a big number (sys.maxsize may OverflowError on some platforms) and
# divide by ten until the platform accepts the value.
limit = 10**9
while True:
    try:
        csv.field_size_limit(limit)
    except OverflowError:
        limit //= 10
    else:
        break

In [709]:
def _with_intact_prefix(ac):
    """Return ac prefixed with 'intact:' when it is a str lacking that prefix; otherwise return it unchanged."""
    if type(ac) is str and not ac.startswith("intact:"):
        return "intact:" + ac
    return ac


# Raw feature tables (tab-separated); "Interaction AC" is normalised to carry the "intact:" prefix.
ptms_path = "data_files/raw/intact/psimitab/features/ptms.tsv"
ptms = pd.read_csv(ptms_path, sep="\t", engine="python")
ptms["Interaction AC"] = ptms["Interaction AC"].apply(_with_intact_prefix)

mutations_path = "data_files/raw/intact/psimitab/features/mutations.tsv"
mutations = pd.read_csv(mutations_path, sep="\t", engine="python")
mutations["Interaction AC"] = mutations["Interaction AC"].apply(_with_intact_prefix)

bindsites_path = "data_files/raw/intact/psimitab/features/bindings_regions.tsv"
bindsites = pd.read_csv(bindsites_path, sep="\t", engine="python")
bindsites["Interaction AC"] = bindsites["Interaction AC"].apply(_with_intact_prefix)

# Labelled feature tables.
analyzed_mods_dir = "/scratch/pranamlab/sophie/interactome/interactome/data_files/processed/intact/features_analyzed"
bindsite_types_labeled = pd.read_csv(f"{analyzed_mods_dir}/bindsite_types_analyzed.csv")
mutation_feature_ac_labeled = pd.read_csv(f"{analyzed_mods_dir}/mutation_feature_ac_analyzed.csv")
mutation_feature_annotations_labeled = pd.read_csv(f"{analyzed_mods_dir}/mutation_feature_annotations_analyzed.csv")
mutation_feature_types_labeled = pd.read_csv(f"{analyzed_mods_dir}/mutation_feature_types_analyzed.csv")
ptm_feature_types_labeled = pd.read_csv(f"{analyzed_mods_dir}/ptm_feature_types_analyzed.csv")
ptm_feature_annotations_labeled = pd.read_csv(f"{analyzed_mods_dir}/ptm_feature_annotations_analyzed.csv")

# MI subtree tables, one per root term.
_cv_dir = "/scratch/pranamlab/sophie/interactome/interactome/data_files/processed/intact/cv"
interaction_milabel_ok = pd.read_csv(f"{_cv_dir}/mi_0190_subtree.csv")
mutation_mi_ok = pd.read_csv(f"{_cv_dir}/mi_0118_subtree.csv")
bindsite_mi_ok = pd.read_csv(f"{_cv_dir}/mi_0117_subtree.csv")
ptm_mi_ok = pd.read_csv(f"{_cv_dir}/mi_0925_subtree.csv")

In [710]:
# Load the intermediate files back in (no dtypes given, so pandas infers them).
savedir = "/scratch/pranamlab/sophie/interactome/interactome/data_files/processed/intact/intermediate"
os.makedirs(savedir,exist_ok=True)


def _read_untyped(stem):
    """Read '<savedir>/<stem>_dec11_2025.csv' with default dtype inference."""
    return pd.read_csv(f"{savedir}/{stem}_dec11_2025.csv")


# intact / intact-clust
intact = _read_untyped("intact_post_mutation_processing")
intact_clust = _read_untyped("intact_clust_post_mutation_processing")

# merged
merged = _read_untyped("merged_post_mutation_processing")
# NOTE(review): merged_neg is read from the same file as merged — confirm this is intended.
merged_neg = _read_untyped("merged_post_mutation_processing")

# my_pos / my_neg
my_pos = _read_untyped("my_pos_post_mutation_processing")
my_neg = _read_untyped("my_neg_post_mutation_processing")

# combined PTM + mutation tables
simplemerged_ptm_and_mut = _read_untyped("simplemerged_ptm_and_mut")
simplemerged_ptm_and_mut_unknown = _read_untyped("simplemerged_ptm_and_mut_unknown")
simplemerged_neg_ptm_and_mut = _read_untyped("simplemerged_neg_ptm_and_mut")


  intact = pd.read_csv(f"{savedir}/intact_post_mutation_processing_dec11_2025.csv")
  intact_clust = pd.read_csv(f"{savedir}/intact_clust_post_mutation_processing_dec11_2025.csv")
  merged = pd.read_csv(f"{savedir}/merged_post_mutation_processing_dec11_2025.csv")
  merged_neg = pd.read_csv(f"{savedir}/merged_post_mutation_processing_dec11_2025.csv")
  my_pos = pd.read_csv(f"{savedir}/my_pos_post_mutation_processing_dec11_2025.csv")
  my_neg = pd.read_csv(f"{savedir}/my_neg_post_mutation_processing_dec11_2025.csv")
  simplemerged_ptm_and_mut = pd.read_csv(f"{savedir}/simplemerged_ptm_and_mut_dec11_2025.csv")
  simplemerged_ptm_and_mut_unknown = pd.read_csv(f"{savedir}/simplemerged_ptm_and_mut_unknown_dec11_2025.csv")
  simplemerged_neg_ptm_and_mut = pd.read_csv(f"{savedir}/simplemerged_neg_ptm_and_mut_dec11_2025.csv")


In [711]:
def _string_dtypes(frame, **overrides):
    """Map every column of frame to "string", then apply the given per-column overrides."""
    return {**dict.fromkeys(frame.columns, "string"), **overrides}


# Per-table dtype maps: every column is "string" unless overridden here.
intact_dtypes = _string_dtypes(intact, Negative="bool")
#intact_dtypes["miscore"] = "float"

intact_clust_dtypes = _string_dtypes(
    intact_clust,
    Negative="bool",
    miscore="float",
    equal_score_int="bool",
)

simplemerged_dtypes = _string_dtypes(
    simplemerged_ptm_and_mut,
    miscore="float",
    year="int",
    confidence_val_int="float",
    unique_score_int="float",
)
#simplemerged_dtypes["scraped_mut_has_info"] = "bool"
#simplemerged_dtypes["scraped_mut_has_info_1"] = "bool"
#simplemerged_dtypes["scraped_mut_has_info_2"] = "bool"
#simplemerged_dtypes["agg_mut_has_info"] = "bool"

merged_dtypes = _string_dtypes(
    merged,
    Negative="bool",
    length_1="int",
    length_2="int",
    miscore="float",
    year="int",
    confidence_val_int="float",
    unique_score_int="float",
)

my_pos_dtypes = _string_dtypes(my_pos, length_1="int", length_2="int", year="int")

#my_neg_dtypes["Negative"] = "bool"
my_neg_dtypes = _string_dtypes(
    my_neg,
    miscore="float",
    length_1="int",
    length_2="int",
    year="int",
)


In [712]:
# Load the intermediate files again, this time with explicit per-table dtypes.
savedir = "/scratch/pranamlab/sophie/interactome/interactome/data_files/processed/intact/intermediate"
os.makedirs(savedir,exist_ok=True)


def _read_typed(stem, dtypes):
    """Read '<savedir>/<stem>_dec11_2025.csv' using the given column -> dtype map."""
    return pd.read_csv(f"{savedir}/{stem}_dec11_2025.csv", dtype=dtypes)


# intact / intact-clust
intact = _read_typed("intact_post_mutation_processing", intact_dtypes)
intact_clust = _read_typed("intact_clust_post_mutation_processing", intact_clust_dtypes)

# merged
merged = _read_typed("merged_post_mutation_processing", merged_dtypes)
# NOTE(review): merged_neg is read from the same file as merged — confirm this is intended.
merged_neg = _read_typed("merged_post_mutation_processing", merged_dtypes)

# my_pos / my_neg
my_pos = _read_typed("my_pos_post_mutation_processing", my_pos_dtypes)
my_neg = _read_typed("my_neg_post_mutation_processing", my_neg_dtypes)

# combined PTM + mutation tables share one dtype map
simplemerged_ptm_and_mut = _read_typed("simplemerged_ptm_and_mut", simplemerged_dtypes)
simplemerged_ptm_and_mut_unknown = _read_typed("simplemerged_ptm_and_mut_unknown", simplemerged_dtypes)
simplemerged_neg_ptm_and_mut = _read_typed("simplemerged_neg_ptm_and_mut", simplemerged_dtypes)


In [713]:
def _seq_length(seq):
    """Length of seq, or 0 when seq is missing."""
    return len(seq) if pd.notna(seq) else 0


# Add the Negative flag and the two sequence-length columns to each combined table.
for _frame, _is_negative in (
    (simplemerged_ptm_and_mut, False),
    (simplemerged_neg_ptm_and_mut, True),
    (simplemerged_ptm_and_mut_unknown, False),
):
    _frame["Negative"] = _is_negative
    for _side in ("1", "2"):
        _frame[f"length_{_side}"] = _frame[f"aa_{_side}"].apply(_seq_length)

In [714]:
# Run harmonize_nulls_to_nan over every loaded table and rebind each name to the result.
(
    intact,
    intact_clust,
    merged,
    merged_neg,
    my_pos,
    my_neg,
    simplemerged_ptm_and_mut,
    simplemerged_ptm_and_mut_unknown,
    simplemerged_neg_ptm_and_mut,
) = [
    harmonize_nulls_to_nan(_frame)
    for _frame in (
        intact,
        intact_clust,
        merged,
        merged_neg,
        my_pos,
        my_neg,
        simplemerged_ptm_and_mut,
        simplemerged_ptm_and_mut_unknown,
        simplemerged_neg_ptm_and_mut,
    )
]


## Data processing

In [715]:
# Work on a deep copy of merged and re-check that each row carries exactly one interaction ID.
merged_expl = merged.copy(deep=True)
print(f"Length of merged: {len(merged_expl)}. Merged is already exploded by IntAct interaction identifier(s)")
_interaction_ids = merged_expl["interaction_intactid"]
# no row may be missing its interaction ID
test1 = int(_interaction_ids.isna().sum()) == 0
print(f"\tAll rows have one intact interaction ID after exploding: {test1}")
# no row may mention "EBI-" more than once
test1 = int((_interaction_ids.str.count("EBI-") > 1).sum()) == 0
print(f"\tNo rows have >1 intact interaction ID after exploding: {test1}")

Length of merged: 745085. Merged is already exploded by IntAct interaction identifier(s)
	All rows have one intact interaction ID after exploding: True
	No rows have >1 intact interaction ID after exploding: True


In [716]:
# Scraped binding-site columns: five fields for interactor 1, then the same five for interactor 2.
scraped_binding_cols = [
    f"binding_{_field}_{_side}"
    for _side in (1, 2)
    for _field in ("mi", "name", "short", "begin", "end")
]

In [717]:
# Flag rows that have any scraped binding-site value: overall, for interactor 1, for interactor 2.
_binding_side_1 = [x for x in scraped_binding_cols if x.endswith("_1")]
_binding_side_2 = [x for x in scraped_binding_cols if x.endswith("_2")]
merged_expl["scraped_binding_has_info"] = merged_expl[scraped_binding_cols].notna().any(axis=1)
merged_expl["scraped_binding_has_info_1"] = merged_expl[_binding_side_1].notna().any(axis=1)
merged_expl["scraped_binding_has_info_2"] = merged_expl[_binding_side_2].notna().any(axis=1)

_n_rows = len(merged_expl)
_has_any = merged_expl["scraped_binding_has_info"]
_has_both = _has_any & merged_expl["scraped_binding_has_info_1"] & merged_expl["scraped_binding_has_info_2"]

test1 = int(_has_both.sum())
print(f"Total rows with binding site entries for both interactor 1 and interactor 2: {test1}/{_n_rows} ({100*test1/_n_rows:.2f}%)")
test1 = int(_has_any.sum())
print(f"Total rows with binding site entries for either interactor 1 or interactor 2: {test1}/{_n_rows} ({100*test1/_n_rows:.2f}%)")

Total rows with binding site entries for both interactor 1 and interactor 2: 43771/745085 (5.87%)
Total rows with binding site entries for either interactor 1 or interactor 2: 138159/745085 (18.54%)


In [718]:
# Per interactor: the result of verify_equal_feature_lengths for rows that have scraped
# binding info on that side, and False for rows that have none.
for _side in (1, 2):
    merged_expl[f"binding_expandable_{_side}"] = merged_expl.apply(
        lambda row: (
            verify_equal_feature_lengths(row, scraped_binding_cols, interactor=_side)
            if row[f"scraped_binding_has_info_{_side}"]
            else False
        ),
        axis=1,
    )

In [719]:
# Count rows where the has-info flag and the expandable flag differ, once per interactor.
for _side in (1, 2):
    _flags_differ = merged_expl[f"scraped_binding_has_info_{_side}"] != merged_expl[f"binding_expandable_{_side}"]
    test1 = int(_flags_differ.sum())
    print(f"Total rows where interactor {_side} binding site info exists but is not expandable: {test1}/{len(merged_expl)} ({100*test1/len(merged_expl):.2f}%)")

Total rows where interactor 1 binding site info exists but is not expandable: 0/745085 (0.00%)
Total rows where interactor 2 binding site info exists but is not expandable: 0/745085 (0.00%)


In [720]:
# Turn every scraped binding cell into a list so the columns can be exploded later.
scraped_binding_cols_1 = [x for x in scraped_binding_cols if x.endswith("_1")]
scraped_binding_cols_2 = [x for x in scraped_binding_cols if x.endswith("_2")]


def _binding_cell_as_list(row, col, expandable_flag):
    """Split a str cell on "|" when the row's expandable flag is set; otherwise wrap the cell in a one-item list."""
    value = row[col]
    if row[expandable_flag] and type(value) is str:
        return value.split("|")
    return [value]


for _side_cols, _expandable_flag in (
    (scraped_binding_cols_1, "binding_expandable_1"),
    (scraped_binding_cols_2, "binding_expandable_2"),
):
    for c in _side_cols:
        merged_expl[c] = merged_expl.apply(_binding_cell_as_list, axis=1, args=(c, _expandable_flag))

In [721]:
# Explode the list-valued binding columns: interactor 1's columns together, then interactor 2's.
print(f"Going to explode merged_expl by interactor 1, then interactor 2 binding site columns. Length before: {len(merged_expl)}")
for _side, _side_cols in ((1, scraped_binding_cols_1), (2, scraped_binding_cols_2)):
    merged_expl = merged_expl.explode(_side_cols).reset_index(drop=True)
    print(f"\tLength after exploding across {_side}: {len(merged_expl)}")

Going to explode merged_expl by interactor 1, then interactor 2 binding site columns. Length before: 745085
	Length after exploding across 1: 746801
	Length after exploding across 2: 750095


In [722]:
# If a merged_expl row has binding info for both interactor 1 and interactor 2, duplicate that row:
# one copy keeps only the binding_*_1 info, the other keeps only the binding_*_2 info.
import pandas as pd
import numpy as np
import re

def split_rows_by_binding_blocks(merged_expl: pd.DataFrame, binding_cols=None) -> pd.DataFrame:
    """
    Split rows that carry binding info for both interactors into two rows.

    If a row has a non-null value in ANY side-1 binding column AND ANY side-2
    binding column, it is replaced by two copies:
      - copy A: keeps the *_1 binding columns, *_2 binding columns set to NaN
      - copy B: keeps the *_2 binding columns, *_1 binding columns set to NaN
    All other columns are preserved. The input frame is not modified.

    Parameters
    ----------
    merged_expl : DataFrame to split.
    binding_cols : optional list of column names to treat as binding feature
        columns (each ending in "_1" or "_2"). When None, columns matching
        ``^binding_.*_(1|2)$`` are used, except the ``binding_expandable_*``
        helper flags.

    Returns
    -------
    New DataFrame with a fresh 0..n-1 index: unsplit rows first, then all
    copy-A rows, then all copy-B rows.
    """
    df = merged_expl.copy()

    if binding_cols is None:
        # binding_expandable_1/2 are boolean helper flags, not feature data. They are never
        # null, so counting them as "info" would mark every row as having both sides and
        # duplicate the whole frame (and NaN would later be written into bool columns).
        binding_cols = [
            c for c in df.columns
            if re.match(r"^binding_.*_(1|2)$", c) and not c.startswith("binding_expandable_")
        ]
    block1 = [c for c in binding_cols if c.endswith("_1")]
    block2 = [c for c in binding_cols if c.endswith("_2")]

    # "Has info" = any non-null value in that side's block
    has1 = df[block1].notna().any(axis=1)
    has2 = df[block2].notna().any(axis=1)
    both = has1 & has2

    # Rows that don't need splitting
    base = df.loc[~both]

    # Rows that do need splitting -> two independent copies
    left = df.loc[both].copy()
    right = df.loc[both].copy()

    # Left copy keeps side 1, right copy keeps side 2
    if block2:
        left.loc[:, block2] = np.nan
    if block1:
        right.loc[:, block1] = np.nan

    # ignore_index already yields a clean 0..n-1 index
    return pd.concat([base, left, right], ignore_index=True)

In [723]:
# Apply the per-interactor split and report the resulting number of rows.
merged_expl = split_rows_by_binding_blocks(merged_expl)
_rows_after_split = len(merged_expl)
print(f"Length of merged_expl after splitting multiple bind site effects for the same row: {_rows_after_split}")

  left.loc[:, block2] = np.nan
  right.loc[:, block1] = np.nan


Length of merged_expl after splitting multiple bind site effects for the same row: 1500190


In [724]:
def _bare_interaction_id(ac):
    """Strip the 'intact:' prefix from a str holding exactly one such prefix; other strs give None, non-strs pass through."""
    if type(ac) is not str:
        return ac
    return ac.split("intact:")[-1] if ac.count("intact:") == 1 else None


# Left-join the aggregated binding-site table onto merged_expl by interaction ID.
binding_to_merge = bindsites.copy(deep=True)
_multi_id_rows = bindsites["Interaction AC"].str.count("intact:EBI-") > 1
test1 = int(_multi_id_rows.sum()) == 0
print(f"\tNo rows have >1 intact interaction ID in binding dataframe: {test1}")
# Prefix every aggregated column with "Binding " before adding the join key.
binding_to_merge.columns = "Binding " + binding_to_merge.columns
binding_to_merge["Binding interaction_intactid"] = binding_to_merge["Binding Interaction AC"].apply(_bare_interaction_id)
merged_expl = merged_expl.merge(
    binding_to_merge.rename(columns={"Binding interaction_intactid":"interaction_intactid"}),
    on=["interaction_intactid"],
    how="left",
)

	No rows have >1 intact interaction ID in binding dataframe: True


In [725]:
# Columns of the aggregated binding-site table, each carrying the "Binding " prefix.
agg_binding_cols = [
    "Binding " + _name
    for _name in (
        "# Feature AC",
        "Feature short label",
        "Feature range(s)",
        "Original sequence",
        "Resulting sequence",
        "Feature type",
        "Feature annotation(s)",
        "Affected protein AC",
        "Affected protein symbol",
        "Affected protein full name",
        "Affected protein organism",
        "Interaction participants",
        "PubMedID",
        "Figure legend(s)",
        "Interaction AC",
        "Xref ID(s)",
    )
]

In [726]:
# True where at least one aggregated binding column holds a value (i.e. not every one is null).
merged_expl["agg_binding_has_info"] = ~merged_expl[agg_binding_cols].isna().all(axis=1)

In [727]:
# Rows flagged by both sources (scraped and aggregated), counted once per seq_pair_id.
_flagged_by_both = merged_expl["scraped_binding_has_info"] & merged_expl["agg_binding_has_info"]
merged_expl.loc[_flagged_by_both].drop_duplicates(subset=["seq_pair_id"]).shape[0]

73764

In [728]:
# MI codes seen on rows that have scraped binding info but no aggregated binding info,
# looked up in the binding-site MI subtree table.
_scraped_only_rows = merged_expl.loc[
    merged_expl["scraped_binding_has_info"] & ~merged_expl["agg_binding_has_info"]
]
l = _scraped_only_rows["binding_mi_1"].dropna().unique().tolist()
l2 = _scraped_only_rows["binding_mi_2"].dropna().unique().tolist()
l = l+l2
# l already contains l2, so membership in l is the same as membership in l+l2
_code_in_subtree = bindsite_mi_ok["id"].isin(l)
bindsite_mi_ok.loc[_code_in_subtree].drop_duplicates(subset=["id"])

Unnamed: 0,label,id,parent_id,parent_ids_all,parent_names_all
2,sufficient binding region,MI:0442,MI:0117,MI:0117,binding-associated region


In [729]:
# For what's below, keep in mind that the aggregated binding data is NOT binary interactions only.
# It can also be n-ary interactions.
_has_scraped = merged_expl["scraped_binding_has_info"]
_has_agg = merged_expl["agg_binding_has_info"]
_scraped_only = _has_scraped & ~_has_agg
_agg_only = ~_has_scraped & _has_agg
_scraped_and_agg = _has_scraped & _has_agg


def _distinct_non_null(row_mask, column):
    """Distinct non-null values of merged_expl[column] over the rows selected by row_mask, as a list."""
    return merged_expl.loc[row_mask][column].dropna().unique().tolist()


# --- by interaction ID ---
interactions_with_xml_binding_data_only = _distinct_non_null(_scraped_only, "interaction_intactid")
interactions_with_agg_binding_data_only = _distinct_non_null(_agg_only, "interaction_intactid")
interactions_with_xml_and_agg_binding_data = _distinct_non_null(_scraped_and_agg, "interaction_intactid")
interactions_in_my_pos = my_pos["interaction_intactid"].dropna().unique().tolist()
total_intactids = len(merged_expl["interaction_intactid"].dropna().unique())
print(f"Total interaction IDs with binding data only from XML scraping: {len(interactions_with_xml_binding_data_only)}/{total_intactids} = ({100*len(interactions_with_xml_binding_data_only)/total_intactids:.2f}%)")
print(f"Total interaction IDs with binding data only from aggregated binding table: {len(interactions_with_agg_binding_data_only)}/{total_intactids} = ({100*len(interactions_with_agg_binding_data_only)/total_intactids:.2f}%)")
_agg_only_in_my_pos = set(interactions_with_agg_binding_data_only) & set(interactions_in_my_pos)
print(f"\tTotal that are also in my_pos: {len(_agg_only_in_my_pos)}")
print(f"Total interaction IDs with binding data from both XML scraping and aggregated binding table: {len(interactions_with_xml_and_agg_binding_data)}/{total_intactids} = ({100*len(interactions_with_xml_and_agg_binding_data)/total_intactids:.2f}%)")

# --- by seq_pair_id (same caveat about n-ary interactions applies) ---
seq_pair_ids_with_xml_binding_data_only = _distinct_non_null(_scraped_only, "seq_pair_id")
seq_pair_ids_with_agg_binding_data_only = _distinct_non_null(_agg_only, "seq_pair_id")
seq_pair_ids_with_xml_and_agg_binding_data = _distinct_non_null(_scraped_and_agg, "seq_pair_id")
total_seq_pair_ids = len(merged_expl["seq_pair_id"].dropna().unique())
print(f"\nTotal seq_pair_ids with binding data only from XML scraping: {len(seq_pair_ids_with_xml_binding_data_only)}/{total_seq_pair_ids} = ({100*len(seq_pair_ids_with_xml_binding_data_only)/total_seq_pair_ids:.2f}%)")
print(f"Total seq_pair_ids with binding data only from aggregated binding table: {len(seq_pair_ids_with_agg_binding_data_only)}/{total_seq_pair_ids} = ({100*len(seq_pair_ids_with_agg_binding_data_only)/total_seq_pair_ids:.2f}%)")
print(f"Total seq_pair_ids with binding data from both XML scraping and aggregated binding table: {len(seq_pair_ids_with_xml_and_agg_binding_data)}/{total_seq_pair_ids} = ({100*len(seq_pair_ids_with_xml_and_agg_binding_data)/total_seq_pair_ids:.2f}%)")

Total interaction IDs with binding data only from XML scraping: 6/743127 = (0.00%)
Total interaction IDs with binding data only from aggregated binding table: 0/743127 = (0.00%)
	Total that are also in my_pos: 0
Total interaction IDs with binding data from both XML scraping and aggregated binding table: 136618/743127 = (18.38%)

Total seq_pair_ids with binding data only from XML scraping: 4/426539 = (0.00%)
Total seq_pair_ids with binding data only from aggregated binding table: 0/426539 = (0.00%)
Total seq_pair_ids with binding data from both XML scraping and aggregated binding table: 73764/426539 = (17.29%)


In [730]:
# Show original/resulting sequences for rows whose original sequence is neither null nor the literal "-"
_orig_seq = merged_expl["Binding Original sequence"]
merged_expl.loc[
    _orig_seq.notna() & (_orig_seq != "-"),
    ["Binding Original sequence", "Binding Resulting sequence"],
]

Unnamed: 0,Binding Original sequence,Binding Resulting sequence
39,MALKRHKHHIGTRYIEVYRASGEDFLAIAGGASNEAQAFLSKGAQV...,-
40,MALKRHKHHIGTRYIEVYRASGEDFLAIAGGASNEAQAFLSKGAQV...,-
43,MSPPHHENRLFGIHLGLNLGGGGHHHHHHHPPPPVHHYHPPPPVHH...,-
44,MSPPHHENRLFGIHLGLNLGGGGHHHHHHHPPPPVHHYHPPPPVHH...,-
123,MNKLSGGGGRRTRVEGGQLGGEEWTRHGSFVNKPTRGWLHPNDKVM...,-
...,...,...
1696282,MQTDSLSPSPNPVSPVPLNNPTSAPRYGTVIPNRIFVGGIDFKTNE...,-
1696294,DYLSFT,-
1696308,LAAMLFILMNWYYRTVHKRKLKAVVAGSTGNQGLMDILDMPNTNKY...,-
1696310,LAAMLFILMNWYYRTVHKRKLKAVVAGSTGNQGLMDILDMPNTNKY...,-


In [731]:
# One row per unique_id, with the number of distinct seq_pair_id values observed for it
gbuid = (
    merged_expl
    .groupby("unique_id")
    .agg(total_seq_ids=("seq_pair_id", lambda ids: len(set(ids))))
    .reset_index()
)
# Sum of the per-unique_id distinct counts
print(sum(gbuid["total_seq_ids"]))
gbuid.sort_values(by=["total_seq_ids"], ascending=False)

428495


Unnamed: 0,unique_id,total_seq_ids
428494,intact:EBI-999900_intact:EBI-999909,1
0,intact:EBI-100018_intact:EBI-101707,1
1,intact:EBI-100018_intact:EBI-102069,1
2,intact:EBI-100018_intact:EBI-104215,1
3,intact:EBI-100018_intact:EBI-107089,1
...,...,...
18,intact:EBI-100018_intact:EBI-499767,1
19,intact:EBI-100018_intact:EBI-86917,1
20,intact:EBI-100028_intact:EBI-109208,1
21,intact:EBI-100028_intact:EBI-126963,1


In [732]:
# Sanity checks on the bindsites table: null counts for key columns, then
# rows whose "Affected protein AC" holds comma- or pipe-separated entries.
n_bindsites = len(bindsites)
null_counts = {
    col: len(bindsites.loc[bindsites[col].isna()])
    for col in ("# Feature AC", "Feature type", "Feature range(s)", "Affected protein AC", "Interaction AC")
}

test1 = null_counts["# Feature AC"] == 0
print(f"Everything in the bindsites dataframe has a # Feature AC entry: {test1}")
test1 = null_counts["Feature type"] == 0
print(f"Everything in the bindsites dataframe has a Feature type entry: {test1}")
test1 = null_counts["Feature type"]
print(f"\tTotal mutation features without an MI term identifier: {test1}/{n_bindsites} ({100*test1/n_bindsites:.2f}%)")
test1 = null_counts["Feature range(s)"]
print(f"\tTotal mutation features without a feature range: {test1}/{n_bindsites} ({100*test1/n_bindsites:.2f}%)")
test1 = null_counts["Affected protein AC"]
print(f"\tTotal bindsites that do not indicate which protein is affected: {test1}")
test1 = null_counts["Interaction AC"]
print(f"\tTotal bindsites that do not indicate which interaction is affected: {test1}")

# Nulls are replaced by "" so the substring tests below yield plain booleans
affected_ac_text = bindsites["Affected protein AC"].fillna("")
test1 = len(bindsites.loc[affected_ac_text.str.contains(",")])
print(f"\tTotal bindsites that have multiple comma-separated entries for affected protein AC: {test1}")
test1 = len(bindsites.loc[affected_ac_text.str.contains("\\|")])
print(f"\tTotal bindsites that have multiple pipe-separated entries for affected protein AC: {test1}")

Everything in the bindsites dataframe has a # Feature AC entry: True
Everything in the bindsites dataframe has a Feature type entry: False
	Total mutation features without an MI term identifier: 72/205660 (0.04%)
	Total mutation features without a feature range: 50/205660 (0.02%)
	Total bindsites that do not indicate which protein is affected: 84
	Total bindsites that do not indicate which interaction is affected: 118
	Total bindsites that have multiple comma-separated entries for affected protein AC: 22
	Total bindsites that have multiple pipe-separated entries for affected protein AC: 22


In [733]:
# Check for comma-separated identifiers of affected protein
# Profile the identifiers stored in "Binding Affected protein AC" of the merged frame.
affected_ac = merged_expl["Binding Affected protein AC"]
affected_ac_filled = affected_ac.fillna("")  # nulls -> "" so .str.contains yields plain booleans
is_uniprot = affected_ac_filled.str.contains("uniprotkb:")
is_intact = affected_ac_filled.str.contains("intact:EBI-")

# Denominator for every "share of all merged-in bindings" percentage below:
# rows that carry any affected-protein AC at all. Kept in its own variable so it
# is not clobbered by the isoform sub-count (previously the IntAct and "other"
# percentages were divided by the UniProtKB count by mistake).
total_with_ac = len(merged_expl.loc[affected_ac.notna()])

# Check for comma-separated identifiers of affected protein
test1 = len(merged_expl.loc[affected_ac_filled.str.contains(",")])
print(f"\tTotal merged-in bindings that have multiple comma-separated entries for affected protein AC: {test1}")

# Check for pipe-separated identifiers of affected protein
test1 = len(merged_expl.loc[affected_ac_filled.str.contains("\\|")])
print(f"\tTotal merged-in bindings that have multiple pipe-separated entries for affected protein AC: {test1}")

# Check for UniProtKB vs. IntAct identifiers for affected protein
test1 = len(merged_expl.loc[is_uniprot])
print(f"\tTotal merged-in bindings that have uniprotkb identifier for affected protein: {test1}/{total_with_ac} ({100*test1/total_with_ac:.2f}%)")

# How many of the UniProtKB rows contain "-" in the AC (treated here as an isoform marker)?
# This sub-count is deliberately relative to the UniProtKB rows only.
test2 = test1
test1 = len(merged_expl.loc[is_uniprot & affected_ac_filled.str.contains("-")])
print(f"\t\tFraction where an isoform is present: {test1}/{test2} ({100*test1/test2:.2f}%)")

test1 = len(merged_expl.loc[is_intact])
print(f"\tTotal merged-in bindings that have IntAct identifier for affected protein: {test1}/{total_with_ac} ({100*test1/total_with_ac:.2f}%)")

# Everything that is neither UniProtKB nor IntAct, counted by database prefix (text before the first ":")
temp = merged_expl.loc[
    affected_ac.notna() & ~is_uniprot & ~is_intact
]["Binding Affected protein AC"].apply(lambda x: x.split(":")[0]).value_counts().to_dict()
test1 = sum(temp.values())
print(f"\tTotal merged-in bindings that have other identifiers: {test1}/{total_with_ac} ({100*test1/total_with_ac:.2f}%)")
print(f"\t\tBreakdown: {temp}")


	Total merged-in bindings that have multiple comma-separated entries for affected protein AC: 0
	Total merged-in bindings that have multiple pipe-separated entries for affected protein AC: 0
	Total merged-in bindings that have uniprotkb identifier for affected protein: 479908/482462 (99.47%)
		Fraction where an isoform is present: 65330/479908 (13.61%)
	Total merged-in bindings that have IntAct identifier for affected protein: 2276/479908 (0.47%)
	Total merged-in bindings that have other identifiers: 278/479908 (0.06%)
		Breakdown: {'dip': 166, 'afcs': 92, 'uniparc': 10, 'refseq': 6, 'entrezgene/locuslink': 2, 'ddbj/embl/genbank': 2}


In [734]:
# Among rows whose affected-protein AC is a UniProtKB identifier containing "-",
# count those where the AC equals none of the four interactor UniProt columns.
_ac_col = "Binding Affected protein AC"
_ac_text = merged_expl[_ac_col].fillna("")
test1 = merged_expl.loc[
    _ac_text.str.contains("uniprotkb:") & _ac_text.str.contains("-")
][
    ["unique_id", "uniprot_A_intact", "uniprot_B_intact", "uniprot_A", "uniprot_B", _ac_col, "aa_1", "aa_2"]
]

# A row counts as unmatched only if every candidate column differs from the AC
_unmatched = pd.Series(True, index=test1.index)
for _candidate in ("uniprot_A", "uniprot_B", "uniprot_A_intact", "uniprot_B_intact"):
    _unmatched = _unmatched & (test1[_candidate] != test1[_ac_col])

test1 = len(test1.loc[_unmatched].reset_index(drop=True))
print(f"\tTotal rows where the Affected protein AC from UniProt does not match the IntAct-provided uniprot A or B, or the corrected ones we calculated: {test1}")


	Total rows where the Affected protein AC from UniProt does not match the IntAct-provided uniprot A or B, or the corrected ones we calculated: 0


In [735]:
# Preview the labelled feature-type table (columns shown in the output: feature, original_sequence, without_binding_site, comments)
bindsite_types_labeled.head()


Unnamed: 0,feature,original_sequence,without_binding_site,comments
0,"psi-mi:""MI:0442""(sufficient binding region)",yes,unknown,
1,"psi-mi:""MI:0429""(necessary binding region)",yes,no,
2,"psi-mi:""MI:0117""(binding-associated region)",yes,unknown,
3,"psi-mi:""MI:1125""(direct binding region)",yes,unknown,
4,"(uniprotkb:Q14524(psi-mi:""MI:0326""(protein))",yes,unknown,assume yes for original sequence binding here!...


In [736]:
## Merge in feature types labeled
# Rename the label columns into the "Binding ..." namespace and keep only the
# join key plus the two label columns, then left-join onto merged_expl.
_feature_type_labels = bindsite_types_labeled.rename(
    columns={
        "feature": "Binding Feature type",
        "original_sequence": "Binding og_binds_bo_feature_type",
        "without_binding_site": "Binding new_binds_bo_feature_type",
    }
)[
    [
        "Binding Feature type",
        "Binding og_binds_bo_feature_type",
        "Binding new_binds_bo_feature_type",
    ]
]
merged_expl = merged_expl.merge(_feature_type_labels, on="Binding Feature type", how="left")

In [737]:
# Keep only the rows that carry binding information from at least one source (XML scrape or aggregated table)
mask = merged_expl["scraped_binding_has_info"] | merged_expl["agg_binding_has_info"]
merged_expl_binding = merged_expl[mask].reset_index(drop=True)
_n_binding, _n_all = len(merged_expl_binding), len(merged_expl)
print(f"\tTotal rows of exploded merged that have at least one column of binding-related data (from XML or aggregated): {_n_binding}/{_n_all} ({100*_n_binding/_n_all:.2f}%)")

	Total rows of exploded merged that have at least one column of binding-related data (from XML or aggregated): 482478/1696330 (28.44%)


In [738]:
# Drop the name of the full exploded frame; the cells that follow work on merged_expl_binding
# (presumably done to release memory — the frame had ~1.7M rows per the count printed above)
del merged_expl

In [739]:
# Label each row with the interactor(s) that the binding feature's affected protein matches.
# feature_affected_protein_matches_id is defined elsewhere in this notebook; the values
# observed in later cells are "A", "B", "A,B", or "" when nothing matched.
merged_expl_binding["Binding Interactor Matches"] = merged_expl_binding.apply(lambda row: feature_affected_protein_matches_id(row,feature="Binding"), axis=1)
merged_expl_binding.head()

Unnamed: 0,ID(s) interactor A,ID(s) interactor B,Alt. ID(s) interactor A,Alt. ID(s) interactor B,Alias(es) interactor A,Alias(es) interactor B,Interaction detection method(s),Publication 1st author(s),Publication Identifier(s),Taxid interactor A,...,Binding Affected protein organism,Binding Interaction participants,Binding PubMedID,Binding Figure legend(s),Binding Interaction AC,Binding Xref ID(s),agg_binding_has_info,Binding og_binds_bo_feature_type,Binding new_binds_bo_feature_type,Binding Interactor Matches
0,intact:EBI-100018,intact:EBI-102069,uniprotkb:Q9VE54,uniprotkb:O16844,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,psi-mi:cos_drome|psi-mi:cos|uniprotkb:A1Z6X4|u...,"psi-mi:""MI:0399""(two hybrid fragment pooling a...",Formstecher et al. (2005),pubmed:15710747|imex:IM-16519|mint:MINT-5217543,taxid:7227(drome),...,"taxid:7227(drome)|taxid:7227(""Drosophila melan...","(uniprotkb:Q9VE54(psi-mi:""MI:0326""(protein)), ...",pubmed:15710747|imex:IM-16519,-,intact:EBI-502739,-,True,yes,unknown,A
1,intact:EBI-100018,intact:EBI-102069,uniprotkb:Q9VE54,uniprotkb:O16844,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,psi-mi:cos_drome|psi-mi:cos|uniprotkb:A1Z6X4|u...,"psi-mi:""MI:0399""(two hybrid fragment pooling a...",Formstecher et al. (2005),pubmed:15710747|imex:IM-16519|mint:MINT-5217543,taxid:7227(drome),...,"taxid:7227(drome)|taxid:7227(""Drosophila melan...","(uniprotkb:Q9VE54(psi-mi:""MI:0326""(protein)), ...",pubmed:15710747|imex:IM-16519,-,intact:EBI-502739,-,True,yes,unknown,B
2,intact:EBI-100018,intact:EBI-128453,uniprotkb:Q9VE54,uniprotkb:Q9VGI8,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,psi-mi:blm_drome|psi-mi:Blm|uniprotkb:Q9Y062|u...,"psi-mi:""MI:0399""(two hybrid fragment pooling a...",Formstecher et al. (2005),pubmed:15710747|imex:IM-16519|mint:MINT-5217543,taxid:7227(drome),...,"taxid:7227(drome)|taxid:7227(""Drosophila melan...","(uniprotkb:Q9VE54(psi-mi:""MI:0326""(protein)), ...",pubmed:15710747|imex:IM-16519,-,intact:EBI-509490,-,True,yes,unknown,A
3,intact:EBI-100018,intact:EBI-128453,uniprotkb:Q9VE54,uniprotkb:Q9VGI8,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,psi-mi:blm_drome|psi-mi:Blm|uniprotkb:Q9Y062|u...,"psi-mi:""MI:0399""(two hybrid fragment pooling a...",Formstecher et al. (2005),pubmed:15710747|imex:IM-16519|mint:MINT-5217543,taxid:7227(drome),...,"taxid:7227(drome)|taxid:7227(""Drosophila melan...","(uniprotkb:Q9VE54(psi-mi:""MI:0326""(protein)), ...",pubmed:15710747|imex:IM-16519,-,intact:EBI-509490,-,True,yes,unknown,B
4,intact:EBI-152952,intact:EBI-100018,uniprotkb:Q9VML8,uniprotkb:Q9VE54,psi-mi:q9vml8_drome|psi-mi:rau|uniprotkb:rau|u...,psi-mi:q9ve54_drome|psi-mi:CG7696|uniprotkb:CG...,"psi-mi:""MI:0399""(two hybrid fragment pooling a...",Formstecher et al. (2005),pubmed:15710747|imex:IM-16519|mint:MINT-5217543,taxid:7227(drome),...,"taxid:7227(drome)|taxid:7227(""Drosophila melan...","(uniprotkb:Q9VML8(psi-mi:""MI:0326""(protein)), ...",pubmed:15710747|imex:IM-16519,-,intact:EBI-512367,-,True,yes,unknown,B


In [740]:
# Break down the source database (prefix before the first ":" of the affected-protein AC)
# separately for rows that matched an interactor and rows that did not.
print("Investigating positive-PPIs merged with Binding data")
_has_match = merged_expl_binding["Binding Interactor Matches"].apply(lambda m: len(m) > 0)
_no_match = merged_expl_binding["Binding Interactor Matches"].apply(lambda m: len(m) == 0)

for _header, _selector in (
    ("\nDatabases yielding successful matches:", _has_match),
    ("\nDatabases yielding unsuccessful matches:", _no_match),
):
    temp = merged_expl_binding.loc[_selector]
    print(_header)
    print(temp["Binding Affected protein AC"].fillna("").str.split(":", expand=True)[0].value_counts())

Investigating positive-PPIs merged with Binding data

Databases yielding successful matches:
0
uniprotkb    479908
intact         2276
dip             166
Name: count, dtype: int64

Databases yielding unsuccessful matches:
0
afcs                    92
                        16
uniparc                 10
refseq                   6
ddbj/embl/genbank        2
entrezgene/locuslink     2
Name: count, dtype: int64


In [741]:
# List every column whose name contains "intact"
print([x for x in merged_expl_binding.columns if "intact" in x])

['all_intact_A_sorted', 'all_intact_B_sorted', 'uniprot_A_intact', 'uniprot_B_intact', 'interaction_intactid', 'intactid_1', 'intactid_2', 'unique_all_intact_sorted']


In [742]:
# Inspect the unmatched rows whose affected protein is identified by an entrezgene/locuslink accession
_unmatched_rows = merged_expl_binding["Binding Interactor Matches"].apply(lambda x: len(x)==0)
_entrez_rows = merged_expl_binding["Binding Affected protein AC"].str.contains("entrezgene/locuslink")
_inspect_cols = [
    "ID(s) interactor A",
    "ID(s) interactor B",
    "interaction_intactid",
    "interaction_xml_id",
    "year",
    'Alt. ID(s) interactor A',
    'Alt. ID(s) interactor B',
    'Binding Affected protein AC',
] + agg_binding_cols
merged_expl_binding.loc[_unmatched_rows & _entrez_rows][_inspect_cols]

Unnamed: 0,ID(s) interactor A,ID(s) interactor B,interaction_intactid,interaction_xml_id,year,Alt. ID(s) interactor A,Alt. ID(s) interactor B,Binding Affected protein AC,Binding # Feature AC,Binding Feature short label,...,Binding Feature annotation(s),Binding Affected protein AC.1,Binding Affected protein symbol,Binding Affected protein full name,Binding Affected protein organism,Binding Interaction participants,Binding PubMedID,Binding Figure legend(s),Binding Interaction AC,Binding Xref ID(s)
39426,intact:EBI-21395984,intact:EBI-1104674,EBI-21373863,2831,2019,,uniprotkb:P10909,entrezgene/locuslink:221262,EBI-21395988,region,...,-,entrezgene/locuslink:221262,-,C6orf185,taxid:9606(human)|taxid:9606(Homo sapiens),"(entrezgene/locuslink:221262(psi-mi:""MI:0326""(...",pubmed:31413325|imex:IM-26801,figure legend:Table 1,intact:EBI-21373863,-
280665,intact:EBI-21395984,intact:EBI-1104674,EBI-21373863,2831,2019,,uniprotkb:P10909,entrezgene/locuslink:221262,EBI-21395988,region,...,-,entrezgene/locuslink:221262,-,C6orf185,taxid:9606(human)|taxid:9606(Homo sapiens),"(entrezgene/locuslink:221262(psi-mi:""MI:0326""(...",pubmed:31413325|imex:IM-26801,figure legend:Table 1,intact:EBI-21373863,-


In [743]:
# Frequency of each match label ("A", "B", "A,B", or empty string for no match)
merged_expl_binding["Binding Interactor Matches"].value_counts()

Binding Interactor Matches
B      216534
A      213600
A,B     52216
          128
Name: count, dtype: int64

In [744]:
# Summarise how binding features were matched to interactor A and/or B.
print("Positive-PPIs: Matched bindings with their interactors A and/or B based on IDs.")
_match_labels = merged_expl_binding["Binding Interactor Matches"]
_n_rows = len(merged_expl_binding)

# no match
_is_unmatched = _match_labels.apply(lambda x: len(x)) == 0
test1 = len(merged_expl_binding.loc[_is_unmatched])
print(f"\tTotal rows where binding partner could not be found: {test1}/{_n_rows} ({100*test1/_n_rows:.2f}%)")
# True only if no unmatched row carries an Affected protein AC
test1 = len(merged_expl_binding.loc[
    _is_unmatched & merged_expl_binding["Binding Affected protein AC"].notna()
]) == 0
print(f"\t\tIn all cases, merged Binding database has no Affected protein AC: {test1}")

# A only
_is_a_only = _match_labels.apply(lambda x: x=="A" if len(x)==1 else False)
test1 = len(merged_expl_binding.loc[_is_a_only])
print(f"\tTotal rows where binding partner is A only: {test1}/{_n_rows} ({100*test1/_n_rows:.2f}%)")

# B only
_is_b_only = _match_labels.apply(lambda x: x=="B" if len(x)==1 else False)
test1 = len(merged_expl_binding.loc[_is_b_only])
print(f"\tTotal rows where binding partner is B only: {test1}/{_n_rows} ({100*test1/_n_rows:.2f}%)")

# both A and B
_is_both = _match_labels.apply(lambda x: x=="A,B")
test1 = len(merged_expl_binding.loc[_is_both])
print(f"\tTotal rows where binding partner is A and B: {test1}/{_n_rows} ({100*test1/_n_rows:.2f}%)")


Positive-PPIs: Matched bindings with their interactors A and/or B based on IDs.
	Total rows where binding partner could not be found: 128/482478 (0.03%)
		In all cases, merged Binding database has no Affected protein AC: False
	Total rows where binding partner is A only: 213600/482478 (44.27%)
	Total rows where binding partner is B only: 216534/482478 (44.88%)
	Total rows where binding partner is A and B: 52216/482478 (10.82%)


In [745]:
def check_binding_indices_for_redundancy(s):
    """
    Report whether every range in ``s`` repeats the same index on both sides of "..".

    ``s`` is a comma-separated string of items shaped like "1..1-2..2". For each
    item, the text before the first "-" and the text after it are split on "..";
    each side must collapse to exactly one distinct value.

    Returns True when ``s`` is None or a float (treated as missing), or when all
    items pass. Returns False at the first item where either side holds
    differing values.
    """
    if s is None or type(s) is float:
        return True

    for item in s.split(","):
        pieces = item.split("-")
        start_side, end_side = pieces[0], pieces[1]
        start_is_uniform = len(set(start_side.split(".."))) == 1
        end_is_uniform = len(set(end_side.split(".."))) == 1
        if not (start_is_uniform and end_is_uniform):
            return False
    return True

def clean_binding_redundant_indices(s):
    """
    Collapse redundant "a..a-b..b" feature ranges into plain "a-b" ranges.

    ``s`` is a comma-separated string of range items. Each item is split on "-"
    into a start side and an end side; each side may itself be written as
    "x..x". An item is accepted only if both sides collapse to a single value
    and both values consist of digits only.

    Returns:
        The cleaned ranges joined with "," (e.g. "1..1-2..2,5-9" -> "1-2,5-9"),
        or None when ``s`` is not a string (None, NaN, ...) or when any item
        cannot be reduced to a numeric "start-end" pair (differing indices
        around "..", non-numeric bounds such as "?", or no "-" separator).
    """
    if not isinstance(s, str):
        return None

    fixed = []
    for item in s.split(","):
        bounds = item.split("-")
        if len(bounds) < 2:
            # No "-" separator: not a start-end range
            return None
        left, right = bounds[0], bounds[1]

        left_values = left.split("..")
        right_values = right.split("..")
        # Both sides must repeat one and the same index around ".."
        if len(set(left_values)) != 1 or len(set(right_values)) != 1:
            return None

        start, end = left_values[0], right_values[0]
        # Both bounds must be numeric. Note that isdigit has to be *called* on
        # both bounds; an uncalled method is always truthy, which would let
        # ranges such as "1317-?" through.
        if not (start.isdigit() and end.isdigit()):
            return None

        fixed.append(f"{start}-{end}")

    return ",".join(fixed)

In [746]:
# correct ranges: collapse "a..a-b..b" into "a-b" (the cleaner returns None when it cannot)
_range_col = "Binding Feature range(s)"
merged_expl_binding[_range_col] = merged_expl_binding[_range_col].apply(clean_binding_redundant_indices)
display(merged_expl_binding[[_range_col]].head())

# Re-check the cleaned ranges through a temporary flag column, then remove that column again
merged_expl_binding["Binding redundant_indices"] = merged_expl_binding[_range_col].apply(check_binding_indices_for_redundancy)
_failed_check = merged_expl_binding["Binding redundant_indices"] == False
test1 = len(merged_expl_binding.loc[_failed_check]) == 0
print(f"None of the rows where indices look like 1..1-2..2 have a different index before and after the .. : {test1}")
merged_expl_binding = merged_expl_binding.drop(columns=["Binding redundant_indices"])

Unnamed: 0,Binding Feature range(s)
0,1207-1783
1,
2,1140-1542
3,
4,1207-1512


None of the rows where indices look like 1..1-2..2 have a different index before and after the .. : True


In [747]:
# Ranges containing "?" have an unknown bound: count them, show a sample, then blank them out.
_merged_range_col = "Binding Feature range(s)"

_bindsites_has_qmark = bindsites["Feature range(s)"].fillna("").str.contains("\\?")
test1 = len(bindsites.loc[_bindsites_has_qmark])
print(f"In original bindsites database, {test1} rows had ? in their Feature Range(s)")

_merged_has_qmark = merged_expl_binding[_merged_range_col].fillna("").str.contains("\\?")
test1 = merged_expl_binding.loc[_merged_has_qmark]
print(f"Total rows in merged_expl_binding with ? in its Feature Range(s): {len(test1)}")
display(test1[[_merged_range_col]].head())

# Only string values containing "?" are replaced; everything else is passed through unchanged
merged_expl_binding[_merged_range_col] = merged_expl_binding[_merged_range_col].apply(
    lambda value: np.nan if (type(value)==str and "?" in value) else value
)
_still_has_qmark = merged_expl_binding[_merged_range_col].fillna("").str.contains("\\?")
test1 = len(merged_expl_binding.loc[_still_has_qmark]) == 0
print(f"After changing any range containing ? to np.nan, there are no rows with ? in its Feature Range(s): {test1}")

In original bindsites database, 13844 rows had ? in their Feature Range(s)
Total rows in merged_expl_binding with ? in its Feature Range(s): 112


Unnamed: 0,Binding Feature range(s)
6201,1317-?
32388,190-?
34885,2-?
48024,2-?
63999,12-?


After changing any range containing ? to np.nan, there are no rows with ? in its Feature Range(s): True


In [748]:
# make new ranges
def convert_binding_begin_end_into_range(row, partner:int|str = 1):
    """
    Take binding_begin_1 and binding_end_1 (or _2) and convert into Mutation Feature range(s) format 
    e.g. binding_begin_1 = 1033 and binding_end_1 = 1033 --> Mutation Feature range(s) = "1033-1033"
    """
    partner = str(partner)
    beg_col = f"binding_begin_{partner}"
    end_col = f"binding_end_{partner}"
    
    if (row[beg_col] is None) or type(row[beg_col]) in [float,pd._libs.missing.NAType] or (row[end_col] is None) or type(row[end_col])in [float,pd._libs.missing.NAType]:
        return None
    
    # there may be commas. e.g. 91, 94. split on commas and match by index
    try:
        begins = [int(x.strip()) for x in row[beg_col].split(",")]
        ends = [int(x.strip()) for x in row[end_col].split(",")]
    except:
        return None
    
    if len(begins)!=len(ends):
        return None
    ranges = []
    for i in range(len(begins)):
        ranges.append(f"{begins[i]}-{ends[i]}")
    return ",".join(ranges)

In [749]:
# Build binding_range_1 / binding_range_2 from the scraped begin/end columns of each interactor
for _partner in (1, 2):
    merged_expl_binding[f"binding_range_{_partner}"] = merged_expl_binding.apply(
        convert_binding_begin_end_into_range, axis=1, partner=_partner
    )

In [750]:
# Preview rows where both the derived range for interactor 1 and the feature type are present
merged_expl_binding[["binding_range_1","Binding Feature type"]].dropna().head()

Unnamed: 0,binding_range_1,Binding Feature type
0,1207-1783,"psi-mi:""MI:0117""(binding-associated region)"
1,1207-1783,"psi-mi:""MI:0117""(binding-associated region)"
2,1140-1542,"psi-mi:""MI:0117""(binding-associated region)"
3,1140-1542,"psi-mi:""MI:0117""(binding-associated region)"
4,1-430,"psi-mi:""MI:0117""(binding-associated region)"


In [751]:
# Keep rows that either
#   (1) have binding info from the XML scrape only, or
#   (2) have info from both sources AND the aggregated feature agrees with the
#       scraped feature of a matched interactor (same short label and same range).
_has_scraped = merged_expl_binding["scraped_binding_has_info"]
_has_agg = merged_expl_binding["agg_binding_has_info"]
_match_labels = merged_expl_binding["Binding Interactor Matches"]
_agg_label = merged_expl_binding["Binding Feature short label"]
_agg_range = merged_expl_binding["Binding Feature range(s)"]

_agrees_with_a = (
    (_match_labels.str.contains("A")) &
    (merged_expl_binding["binding_short_1"] == _agg_label) &
    (merged_expl_binding["binding_range_1"] == _agg_range)
)
_agrees_with_b = (
    (_match_labels.str.contains("B")) &
    (merged_expl_binding["binding_short_2"] == _agg_label) &
    (merged_expl_binding["binding_range_2"] == _agg_range)
)

# "&" binds tighter than "|", so the A/B alternatives must be parenthesised as a
# group; otherwise the B branch escapes the has-info conditions of case (2).
merged_expl_binding_filt = merged_expl_binding.loc[
    (_has_scraped & ~_has_agg) |
    (_has_scraped & _has_agg & (_agrees_with_a | _agrees_with_b))
]
print(f"Length of merged_expl_binding before filtering for scraped-agg consistency: {len(merged_expl_binding)}")
print(f"Length after: {len(merged_expl_binding_filt)}")
merged_expl_binding_filt.head()[[
    "interaction_intactid","Binding Interactor Matches","Binding Feature type","Binding Feature short label","binding_short_1","binding_short_2","Binding Feature range(s)","binding_range_1","binding_range_2"
]]

Length of merged_expl_binding before filtering for scraped-agg consistency: 482478
Length after: 185817


Unnamed: 0,interaction_intactid,Binding Interactor Matches,Binding Feature type,Binding Feature short label,binding_short_1,binding_short_2,Binding Feature range(s),binding_range_1,binding_range_2
0,EBI-502739,A,"psi-mi:""MI:0117""(binding-associated region)",region,region,,1207-1783,1207-1783,
2,EBI-509490,A,"psi-mi:""MI:0117""(binding-associated region)",region,region,,1140-1542,1140-1542,
5,EBI-512367,A,"psi-mi:""MI:0117""(binding-associated region)",region,region,,1-430,1-430,
6,EBI-513529,A,"psi-mi:""MI:0117""(binding-associated region)",region,region,,1-320,1-320,
9,EBI-513536,A,"psi-mi:""MI:0117""(binding-associated region)",region,region,,1-320,1-320,


In [752]:
# How many filtered rows lack the aggregated feature range, and how many lack the aggregated short label?
_range_missing = merged_expl_binding_filt["Binding Feature range(s)"].isna()
test1 = len(merged_expl_binding_filt.loc[_range_missing])
print(f"Total rows in merged_expl_binding_filt with no Binding Feature range(s): {test1}")

_label_missing = merged_expl_binding_filt["Binding Feature short label"].isna()
test1 = len(merged_expl_binding_filt.loc[_label_missing])
print(f"\nTotal rows in merged_expl_binding_filt with no Binding Feature short label: {test1}")

# Show the scraped short labels next to the missing aggregated one
display(
    merged_expl_binding_filt.loc[
        _label_missing,
        ["Binding Feature short label", "binding_short_1", "binding_short_2"],
    ]
)

Total rows in merged_expl_binding_filt with no Binding Feature range(s): 16

Total rows in merged_expl_binding_filt with no Binding Feature short label: 16


Unnamed: 0,Binding Feature short label,binding_short_1,binding_short_2
18414,,region,
40383,,,
40384,,,
40385,,,
107368,,,
149427,,region,
149428,,region,
149434,,region,
259653,,,
281622,,,region


In [753]:
# Drop rows where the aggregated feature matches exactly one interactor but the
# scraped range exists only for the *other* interactor.
_labels = merged_expl_binding_filt["Binding Interactor Matches"]
_has_range_1 = merged_expl_binding_filt["binding_range_1"].notna()
_has_range_2 = merged_expl_binding_filt["binding_range_2"].notna()

_matches_b_but_only_a_scraped = (_labels == "B") & _has_range_1 & ~_has_range_2
_matches_a_but_only_b_scraped = (_labels == "A") & _has_range_2 & ~_has_range_1

test1 = len(merged_expl_binding_filt.loc[_matches_b_but_only_a_scraped])
print(f"There are {test1} rows where curated-binding data matches interactor B, but we only pulled data for interactor A.")
test1 = len(merged_expl_binding_filt.loc[_matches_a_but_only_b_scraped])
print(f"There are {test1} rows where curated-binding data matches interactor A, but we only pulled data for interactor B.")

# The two conditions are mutually exclusive per row, so removing them in one step
# leaves the same rows as removing them one after the other
merged_expl_binding_filt = merged_expl_binding_filt.loc[
    ~(_matches_b_but_only_a_scraped | _matches_a_but_only_b_scraped)
]
print(f"Dropped these rows. Remaining: {len(merged_expl_binding_filt)}")

There are 0 rows where curated-binding data matches interactor B, but we only pulled data for interactor A.
There are 0 rows where curated-binding data matches interactor A, but we only pulled data for interactor B.
Dropped these rows. Remaining: 185817


In [754]:
# Normalise null-like values with the helper defined elsewhere in this notebook
# (the warning captured below shows it replacing "", "None" and "nan" with pd.NA)
merged_expl_binding_filt = harmonize_nulls_to_nan(merged_expl_binding_filt)

  out = out.replace({"": pd.NA, "None": pd.NA, "nan": pd.NA})


In [755]:
def fill_bindsites_when_AB(df: pd.DataFrame, cols,
                           match_col: str = "Binding Interactor Matches") -> pd.DataFrame:
    """
    Mirror one-sided binding-site annotations onto the other side for 'A,B' rows.

    For rows where ``match_col`` is 'A,B' or 'B,A' (case-insensitive, optional
    whitespace around the comma):
      - side 1 (``*_1`` columns) has any info but side 2 has none -> copy 1 -> 2
      - side 2 (``*_2`` columns) has any info but side 1 has none -> copy 2 -> 1
    Rows with info on both sides, or on neither, are left untouched.

    Args:
        df: input frame; it is not modified, a copy is returned.
        cols: iterable of column names ending in ``_1`` / ``_2``. Every name
            must exist in ``df``.
        match_col: column holding the curated interactor match ('A', 'B', 'A,B', ...).

    Returns:
        A copy of ``df`` in which "", "None" and "nan" strings in the text-like
        ``cols`` / ``match_col`` columns are replaced by ``pd.NA`` and the
        one-sided 'A,B' rows are filled symmetrically.
    """
    cols = list(cols)  # accept any iterable, not only a list
    out = df.copy()

    # Normalize obvious sentinel strings on text-like cols so NA detection works
    text_cols = out.select_dtypes(include=["object", "string"]).columns.intersection(cols + [match_col])
    out[text_cols] = out[text_cols].replace({"": pd.NA, "None": pd.NA, "nan": pd.NA}, regex=False)

    # Split the requested columns into the side-1 and side-2 blocks
    block1 = [c for c in cols if c.endswith("_1")]
    block2 = [c for c in cols if c.endswith("_2")]

    # Pair each side-1 column with its side-2 twin,
    # e.g. 'binding_begin_1' <-> 'binding_begin_2'
    pairs = []
    for c1 in block1:
        c2 = f"{c1[:-2]}_2"
        if c2 in out.columns:
            pairs.append((c1, c2))

    # Rows where interactor matches are A,B (either order, ignore spaces).
    # The group is non-capturing: a capturing group makes pandas emit a
    # "pattern ... has match groups" UserWarning from str.contains.
    ab_mask = (
        out[match_col]
        .astype("string")
        .str.strip()
        .str.contains(r'^(?:A\s*,\s*B|B\s*,\s*A)$', flags=re.I, na=False)
    )

    # "Has info" = any non-null across the block
    has1 = out[block1].notna().any(axis=1)
    has2 = out[block2].notna().any(axis=1)

    # Exactly-one-side-only masks
    mask_copy_12 = ab_mask & has1 & ~has2
    mask_copy_21 = ab_mask & has2 & ~has1

    # Copy 1 -> 2
    if mask_copy_12.any():
        idx = mask_copy_12[mask_copy_12].index
        for c1, c2 in pairs:
            out.loc[idx, c2] = out.loc[idx, c1].values

    # Copy 2 -> 1
    if mask_copy_21.any():
        idx = mask_copy_21[mask_copy_21].index
        for c1, c2 in pairs:
            out.loc[idx, c1] = out.loc[idx, c2].values

    return out

In [756]:
# Scraped binding-site columns, one per attribute and interactor side (_1 / _2).
BINDING_COLS = [
"binding_mi_1",
"binding_name_1",
"binding_short_1",
"binding_begin_1",
"binding_end_1",
"binding_mi_2",
"binding_name_2",
"binding_short_2",
"binding_begin_2",
"binding_end_2",
"binding_range_1",
"binding_range_2"
]

# For rows curated as matching both interactors, mirror a one-sided annotation onto the empty side.
merged_expl_binding_filt = fill_bindsites_when_AB(merged_expl_binding_filt, BINDING_COLS, match_col="Binding Interactor Matches")

  out[match_col]


In [757]:
# Coverage report: how many records carry a feature type, with and without a feature range.
temp = bindsites.loc[bindsites["Feature type"].notna()]
test1 = len(temp)
print(f"Fraction of original intact-compiled binding sites database that has a Binding Feature type: {test1}/{len(bindsites)} ({100*test1/len(bindsites):.2f}%)")

_type_present = merged_expl_binding_filt["Binding Feature type"].notna()
_range_present = merged_expl_binding_filt["Binding Feature range(s)"].notna()
# Shared denominator: rows flagged as having scraped binding info.
test2 = len(merged_expl_binding_filt.loc[merged_expl_binding_filt["scraped_binding_has_info"]])

test1 = len(merged_expl_binding_filt.loc[_type_present & _range_present])
print(f"Fraction of XML-compiled database that has a feature type and a feature range: {test1}/{test2} ({100*test1/test2:.2f}%)")
test1 = len(merged_expl_binding_filt.loc[_type_present & ~_range_present])
print(f"Fraction of XML-compiled database that has a feature type and no feature range: {test1}/{test2} ({100*test1/test2:.2f}%)")

Fraction of original intact-compiled binding sites database that has a Binding Feature type: 205588/205660 (99.96%)
Fraction of XML-compiled database that has a feature type and a feature range: 185801/185817 (99.99%)
Fraction of XML-compiled database that has a feature type and no feature range: 0/185817 (0.00%)


In [758]:
# Preview the first rows whose curated match is exactly "A,B", showing only the
# match column, the pair id and the scraped binding columns.
merged_expl_binding_filt.loc[
    merged_expl_binding_filt["Binding Interactor Matches"]=="A,B"
].reset_index(drop=True)[["Binding Interactor Matches","unique_id"] + BINDING_COLS].head()

Unnamed: 0,Binding Interactor Matches,unique_id,binding_mi_1,binding_name_1,binding_short_1,binding_begin_1,binding_end_1,binding_mi_2,binding_name_2,binding_short_2,binding_begin_2,binding_end_2,binding_range_1,binding_range_2
0,"A,B",intact:EBI-1006038_intact:EBI-1006038,MI:0442,sufficient binding region,region,974,1265,MI:0442,sufficient binding region,region,974,1265,974-1265,974-1265
1,"A,B",intact:EBI-1009303_intact:EBI-1009303,MI:0442,sufficient binding region,region,1,195,MI:0442,sufficient binding region,region,1,195,1-195,1-195
2,"A,B",intact:EBI-1009303_intact:EBI-1009303,MI:0442,sufficient binding region,region,1,195,MI:0442,sufficient binding region,region,1,195,1-195,1-195
3,"A,B",intact:EBI-1009412_intact:EBI-1009412,MI:0442,sufficient binding region,region,28,179,MI:0442,sufficient binding region,region,28,179,28-179,28-179
4,"A,B",intact:EBI-1009412_intact:EBI-1009412,MI:0442,sufficient binding region,region,28,179,MI:0442,sufficient binding region,region,28,179,28-179,28-179


In [759]:
# Columns prefixed "Binding " (the aggregated / curated binding-site annotations).
agg_binding_cols = [
"Binding # Feature AC",
"Binding Affected protein AC",
"Binding Affected protein full name",
"Binding Affected protein organism",
"Binding Affected protein symbol",
"Binding Feature annotation(s)",
"Binding Feature range(s)",
"Binding Feature short label",
"Binding Feature type",
"Binding Figure legend(s)",
"Binding Interaction AC",
"Binding Interaction participants",
"Binding Interactor Matches",
"Binding Original sequence",
"Binding PubMedID",
"Binding Resulting sequence",
"Binding Xref ID(s)",
"Binding new_binds_bo_feature_type",
"Binding og_binds_bo_feature_type",
]
# Lower-case "binding_*" columns (the scraped annotations), one per attribute and side.
scraped_binding_cols = [
"binding_begin_1",
"binding_begin_2",
"binding_end_1",
"binding_end_2",
"binding_mi_1",
"binding_mi_2",
"binding_name_1",
"binding_name_2",
"binding_range_1",
"binding_range_2",
"binding_short_1",
"binding_short_2",
]
# Full set of columns to retain: both binding blocks plus identifier, pairing and sequence columns.
keep_cols = scraped_binding_cols + agg_binding_cols + [
"uniprot_A",
"uniprot_A_equalseq",
"uniprot_A_equalseq_canonical",
"uniprot_A_full",
"uniprot_A_inseq",
"uniprot_A_inseq_canonical",
"uniprot_A_intact",
"uniprot_A_noiso1",
"uniprot_A_noisoforms",
"uniprot_B",
"uniprot_B_equalseq",
"uniprot_B_equalseq_canonical",
"uniprot_B_full",
"uniprot_B_inseq",
"uniprot_B_inseq_canonical",
"uniprot_B_intact",
"uniprot_B_noiso1",
"uniprot_B_noisoforms",
"uniprot_gene_name_A",
"uniprot_gene_name_B",
"uniprotkb_1",
"uniprotkb_2",
"unique_all_intact_sorted",
"unique_expansions",
"unique_id",
"unique_score_int",
"unique_scores",
"unique_uniprot_noiso1_pair",
"unique_uniprot_noisoforms_pair",
"unique_uniprot_pair",
"interaction_intactid",
"intactid_1",
"intactid_2",
"dip_1", "dip_2",
"seq_pair_id",
"length_1",
"length_2",
"aa_1","aa_2",
"invalids_aa_1", "invalids_aa_2"
]

In [760]:
# Restrict to the retained columns. The explicit .copy() makes this an independent
# frame, so the flag columns added below are not assigned onto a selection of the
# previous frame (the pattern that triggers pandas' SettingWithCopyWarning).
merged_expl_binding_filt = merged_expl_binding_filt[keep_cols].copy()

# Row-level flags: True when at least one column of the given group is non-null.
_scraped_side_1_cols = [x for x in scraped_binding_cols if x.endswith("_1")]
_scraped_side_2_cols = [x for x in scraped_binding_cols if x.endswith("_2")]
merged_expl_binding_filt["scraped_binding_has_info"] = merged_expl_binding_filt[scraped_binding_cols].notna().any(axis=1)
merged_expl_binding_filt["scraped_binding_has_info_1"] = merged_expl_binding_filt[_scraped_side_1_cols].notna().any(axis=1)
merged_expl_binding_filt["scraped_binding_has_info_2"] = merged_expl_binding_filt[_scraped_side_2_cols].notna().any(axis=1)
merged_expl_binding_filt["agg_binding_has_info"] = merged_expl_binding_filt[agg_binding_cols].notna().any(axis=1)

In [761]:
def _pipe_join_if_list(value):
    """Return a list cell as one '|'-joined string; pass any other value through unchanged."""
    if type(value) is list:
        return "|".join(value)
    return value


# Flatten list-valued cells in every column.
for c in merged_expl_binding_filt:
    merged_expl_binding_filt[c] = merged_expl_binding_filt[c].apply(_pipe_join_if_list)

In [762]:
# Remove exact duplicate rows, then rows with no value in any scraped binding column.
merged_expl_binding_filt = (
    merged_expl_binding_filt
    .drop_duplicates()
    .reset_index(drop=True)
)
# True where every BINDING_COLS entry of the row is null.
mask = ~merged_expl_binding_filt[BINDING_COLS].notna().any(axis=1)
merged_expl_binding_filt = merged_expl_binding_filt.loc[~mask].reset_index(drop=True)
print(len(merged_expl_binding_filt))

172811


In [763]:
# "Binding # Feature AC" does not meaningfully separate features, so rows that are
# identical in every other column are collapsed into one row and their Feature AC
# values are combined with `join_unique_nonnull` (helper defined outside this section).
to_join = [
    "Binding # Feature AC",
]

# Group keys: every column except the one(s) being joined.
all_except_featac = [c for c in merged_expl_binding_filt.columns if c not in to_join]

# Aggregation spec: apply the join helper to each joined column.
agg_spec = {c: join_unique_nonnull for c in to_join}

display(merged_expl_binding_filt.head())
# dropna=False keeps groups whose key columns contain nulls.
merged_expl_binding_filt = (
    merged_expl_binding_filt
    .groupby(all_except_featac, dropna=False, as_index=False)
    .agg(agg_spec)
)

print(f"Joined Binding # Feature AC column because it fails to meaningfully separate features. Dropped duplicate rows again. Remaining rows: {len(merged_expl_binding_filt)}")

display(merged_expl_binding_filt.head())

Unnamed: 0,binding_begin_1,binding_begin_2,binding_end_1,binding_end_2,binding_mi_1,binding_mi_2,binding_name_1,binding_name_2,binding_range_1,binding_range_2,...,length_1,length_2,aa_1,aa_2,invalids_aa_1,invalids_aa_2,scraped_binding_has_info,scraped_binding_has_info_1,scraped_binding_has_info_2,agg_binding_has_info
0,1207,,1783,,MI:0117,,binding-associated region,,1207-1783,,...,1784,1201,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,MEIPIQVAVRIFPHRELKDLLRSFGPTEPKKDAQAVDEGADSKDSE...,,,True,True,False,True
1,1140,,1542,,MI:0117,,binding-associated region,,1140-1542,,...,1784,1487,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,MSKKPVAQRKQLTLSSFIGLDGNSQSQPKSRAASVRSKPPAVYNPI...,,,True,True,False,True
2,1,,430,,MI:0117,,binding-associated region,,1-430,,...,430,1784,MLKHVQISPLRNRSDSVSLRSSSHASSCASSMCGSPEPPAELQRTP...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,,,True,True,False,True
3,1,,320,,MI:0117,,binding-associated region,,1-320,,...,635,1784,MSPFGSKKNRSLSVRVSTFDSELEFKLEPRASGQDLFDLVCRTIGL...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,,,True,True,False,True
4,1,,320,,MI:0117,,binding-associated region,,1-320,,...,635,1784,MSPFGSKKNRSLSVRVSTFDSELEFKLEPRASGQDLFDLVCRTIGL...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,,,True,True,False,True


Joined Binding # Feature AC column because it fails to meaningfully separate features. Dropped duplicate rows again. Remaining rows: 171606


Unnamed: 0,binding_begin_1,binding_begin_2,binding_end_1,binding_end_2,binding_mi_1,binding_mi_2,binding_name_1,binding_name_2,binding_range_1,binding_range_2,...,length_2,aa_1,aa_2,invalids_aa_1,invalids_aa_2,scraped_binding_has_info,scraped_binding_has_info_1,scraped_binding_has_info_2,agg_binding_has_info,Binding # Feature AC
0,1,1,10,10,MI:0429,MI:0429,necessary binding region,necessary binding region,1-10,1-10,...,776,MADPGVCCFITKILCAHGGRMTLEELLGEIRLPEAQLYELLETAGP...,MADPGVCCFITKILCAHGGRMTLEELLGEIRLPEAQLYELLETAGP...,,,True,True,True,True,"EBI-15974323,EBI-15974313"
1,1,1,100,100,MI:0117,MI:0117,binding-associated region,binding-associated region,1-100,1-100,...,1431,MELPKEMEEYFSMLQREIDKAYEIAKKARAQGKDPSLDVEIPQASD...,MELPKEMEEYFSMLQREIDKAYEIAKKARAQGKDPSLDVEIPQASD...,,,True,True,True,True,EBI-8553278
2,1,1,100,100,MI:0117,MI:0117,binding-associated region,binding-associated region,1-100,1-100,...,1431,MELPKEMEEYFSMLQREIDKAYEIAKKARAQGKDPSLDVEIPQASD...,MELPKEMEEYFSMLQREIDKAYEIAKKARAQGKDPSLDVEIPQASD...,,,True,True,True,True,EBI-8553311
3,1,1,100,100,MI:0117,MI:0117,binding-associated region,binding-associated region,1-100,1-100,...,1431,MELPKEMEEYFSMLQREIDKAYEIAKKARAQGKDPSLDVEIPQASD...,MELPKEMEEYFSMLQREIDKAYEIAKKARAQGKDPSLDVEIPQASD...,,,True,True,True,True,EBI-8553339
4,1,1,100,100,MI:0429,MI:0429,necessary binding region,necessary binding region,1-100,1-100,...,304,MASQPAGGSPPKPWEKEGNTSGPNPFRPPSNTSTAGSVEASGTANP...,MASQPAGGSPPKPWEKEGNTSGPNPFRPPSNTSTAGSVEASGTANP...,,,True,True,True,True,EBI-2369843


In [764]:
# Count rows whose "Binding # Feature AC" holds more than one accession, i.e. the
# joined value contains a comma. regex=False: the comma is a literal, not a pattern.
test1 = len(merged_expl_binding_filt.loc[
    merged_expl_binding_filt["Binding # Feature AC"].fillna("").str.contains(",", regex=False)
])
print(f"Total merged_expl_binding_filt rows with > 1 Binding # Feature AC (entry contains commas ,): {test1}")

Total merged_expl_binding_filt rows with > 1 Binding # Feature AC (entry contains commas ,): 1187
Total merged_expl_binding_filt rows with > 1 Binding # Feature AC (entry contains commas ,): 1187


In [765]:
# MI codes seen in the scraped columns: unique values of side 1 followed by unique
# values of side 2. The two lists are concatenated without de-duplication, so a
# code present on both sides appears twice.
l_xml = merged_expl_binding_filt["binding_mi_1"].dropna().unique().tolist() + merged_expl_binding_filt["binding_mi_2"].dropna().unique().tolist()
l_xml

['MI:0429',
 'MI:0117',
 'MI:0442',
 'MI:1125',
 'MI:0429',
 'MI:0117',
 'MI:0442',
 'MI:1125']

In [766]:
# Distinct "Feature type" values, ordered as value_counts() returns them.
_feature_types = bindsites["Feature type"].value_counts().reset_index()["Feature type"].unique().tolist()
# Keep the psi-mi entries and reduce each to its bare code: the text between
# 'psi-mi:' and the first '(', with surrounding double quotes removed.
l = [
    ft.split("psi-mi:")[1].split("(")[0].strip("\"")
    for ft in _feature_types
    if ft.startswith("psi-mi:")
]
l

['MI:0442', 'MI:0117', 'MI:0429', 'MI:1125']

In [767]:
# Show which labeled MI terms occur in each binding-site source.
# `l` holds the MI codes parsed from bindsites["Feature type"]; `l_xml` holds the
# codes from the scraped binding_mi_1 / binding_mi_2 columns.
print("Display overlap between labeled MIs and the MIs in aggregated binding-site database:")
display(bindsite_mi_ok.loc[bindsite_mi_ok["id"].isin(l)].drop_duplicates("id").sort_values(by=["id"],ascending=True))

print("Display overlap between labeled MIs and the MIs in XML-scraped binding-site database:")
display(bindsite_mi_ok.loc[bindsite_mi_ok["id"].isin(l_xml)].drop_duplicates("id").sort_values(by=["id"],ascending=True))

Display overlap between labeled MIs and the MIs in aggregated PTM database:


Unnamed: 0,label,id,parent_id,parent_ids_all,parent_names_all
0,binding-associated region,MI:0117,,MI:0252,biological feature
1,necessary binding region,MI:0429,MI:0117,"MI:0117, MI:0573, MI:1128, MI:1129","binding-associated region, mutation disrupting..."
2,sufficient binding region,MI:0442,MI:0117,MI:0117,binding-associated region
3,direct binding region,MI:1125,MI:0442,MI:0442,sufficient binding region


Display overlap between labeled MIs and the MIs in XML-scraped PTM database:


Unnamed: 0,label,id,parent_id,parent_ids_all,parent_names_all
0,binding-associated region,MI:0117,,MI:0252,biological feature
1,necessary binding region,MI:0429,MI:0117,"MI:0117, MI:0573, MI:1128, MI:1129","binding-associated region, mutation disrupting..."
2,sufficient binding region,MI:0442,MI:0117,MI:0117,binding-associated region
3,direct binding region,MI:1125,MI:0442,MI:0442,sufficient binding region


In [768]:
# Consistency check: a row curated for one interactor must not carry a scraped
# range only on the other interactor's side.
_curated_side = merged_expl_binding_filt["Binding Interactor Matches"]
_has_range_1 = merged_expl_binding_filt["binding_range_1"].notna()
_has_range_2 = merged_expl_binding_filt["binding_range_2"].notna()

test1 = not ((_curated_side == "A") & _has_range_2 & ~_has_range_1).any()
print(f"No rows where interactor is A but binding_ data is associated with B: {test1}")
test1 = not ((_curated_side == "B") & _has_range_1 & ~_has_range_2).any()
print(f"No rows where interactor is B but binding_ data is associated with A: {test1}")

No rows where interactor is A but binding_ data is associated with B: True
No rows where interactor is B but binding_ data is associated with A: True


In [769]:
# Rows lacking a curated feature range, and whether any of them still has aggregated data.
_range_missing = merged_expl_binding_filt["Binding Feature range(s)"].isna()
test1 = int(_range_missing.sum())
print(f"{test1} rows do not have a Binding Feature range(s) value.")
test1 = not (_range_missing & merged_expl_binding_filt["agg_binding_has_info"]).any()
print(f"\tAll of these rows do not have Binding-aggregated data at all: {test1}")

8 rows do not have a Binding Feature range(s) value.
	All of these rows do not have Binding-aggregated data at all: True


In [770]:
def _comma_item_counts(column_name):
    """Per-row count of comma-separated items in a column; a missing value counts as one empty item."""
    return merged_expl_binding_filt[column_name].fillna("").str.split(",").apply(len)


_n_ranges = _comma_item_counts("Binding Feature range(s)")

test1 = not (_n_ranges != _comma_item_counts("Binding Original sequence")).any()
print(f"All rows have the same # of binding feature ranges as original sequences: {test1}")
test1 = not (_n_ranges != _comma_item_counts("Binding Resulting sequence")).any()
print(f"All rows have the same # of binding feature ranges as resulting sequences: {test1}")
# Missing ranges are filled with "-" so they pass this check.
_has_dash = merged_expl_binding_filt["Binding Feature range(s)"].fillna("-").str.contains("-")
test1 = not (~_has_dash).any()
print(f"All rows have a dash - in binding feature range indicating the span of the feature: {test1}")


All rows have the same # of binding feature ranges as original sequences: True
All rows have the same # of binding feature ranges as resulting sequences: True
All rows have a dash - in binding feature range indicating the span of the feature: True


In [771]:
# Display the MI-term reference table (label, id and parent terms) for inspection.
bindsite_mi_ok

Unnamed: 0,label,id,parent_id,parent_ids_all,parent_names_all
0,binding-associated region,MI:0117,,MI:0252,biological feature
1,necessary binding region,MI:0429,MI:0117,"MI:0117, MI:0573, MI:1128, MI:1129","binding-associated region, mutation disrupting..."
2,sufficient binding region,MI:0442,MI:0117,MI:0117,binding-associated region
3,direct binding region,MI:1125,MI:0442,MI:0442,sufficient binding region


In [772]:
# Hand-assigned labels per MI term.
# d_og feeds the "original_sequence" column; all four terms are "yes".
d_og = dict.fromkeys(["MI:0117", "MI:0429", "MI:0442", "MI:1125"], "yes")
# d_new feeds the "bindsite_sequence" column; only MI:0429 is "no".
d_new = {
    "MI:0117": "unknown",
    "MI:0429": "no",
    "MI:0442": "unknown",
    "MI:1125": "unknown",
}

bindsite_mi_ok_labeled = bindsite_mi_ok.copy(deep=True)
for _label_col, _mi_to_label in (("original_sequence", d_og), ("bindsite_sequence", d_new)):
    bindsite_mi_ok_labeled[_label_col] = bindsite_mi_ok_labeled["id"].map(_mi_to_label)
bindsite_mi_ok_labeled

Unnamed: 0,label,id,parent_id,parent_ids_all,parent_names_all,original_sequence,bindsite_sequence
0,binding-associated region,MI:0117,,MI:0252,biological feature,yes,unknown
1,necessary binding region,MI:0429,MI:0117,"MI:0117, MI:0573, MI:1128, MI:1129","binding-associated region, mutation disrupting...",yes,no
2,sufficient binding region,MI:0442,MI:0117,MI:0117,binding-associated region,yes,unknown
3,direct binding region,MI:1125,MI:0442,MI:0442,sufficient binding region,yes,unknown


In [773]:
# Build one MI-keyed label table from two sources:
#   1. the psi-mi rows of `bindsite_types_labeled` (labels in `original_sequence`
#      and `without_binding_site`), and
#   2. `bindsite_mi_ok_labeled` (labels in `original_sequence` and `bindsite_sequence`).
# .copy() after the .loc filter makes the frame independent before "mi" is added.
bindsite_mis_labeled = bindsite_types_labeled.copy(deep=True)
bindsite_mis_labeled = bindsite_mis_labeled.loc[bindsite_mis_labeled["feature"].str.startswith("psi-mi")].copy()
# Bare MI code: the text between 'psi-mi:' and the first '(', without the double quotes.
bindsite_mis_labeled["mi"] = bindsite_mis_labeled["feature"].apply(lambda x: x.split("psi-mi:")[1].split("(")[0].strip("\""))
# `bindsite_sequence` holds the same kind of label as `without_binding_site`, so it
# is renamed to share that column. Without the rename the concat leaves
# `without_binding_site` null on every row from the second source, and an
# MI -> without_binding_site lookup built from this table with dict(zip(...))
# (last row per MI wins) would return null for every MI.
bindsite_mis_labeled = pd.concat([
    bindsite_mis_labeled,
    bindsite_mi_ok_labeled.rename(columns={
        "id": "mi",
        "label": "feature",
        "bindsite_sequence": "without_binding_site",
    })
])
bindsite_mis_labeled

Unnamed: 0,feature,original_sequence,without_binding_site,comments,mi,parent_id,parent_ids_all,parent_names_all,bindsite_sequence
0,"psi-mi:""MI:0442""(sufficient binding region)",yes,unknown,,MI:0442,,,,
1,"psi-mi:""MI:0429""(necessary binding region)",yes,no,,MI:0429,,,,
2,"psi-mi:""MI:0117""(binding-associated region)",yes,unknown,,MI:0117,,,,
3,"psi-mi:""MI:1125""(direct binding region)",yes,unknown,,MI:1125,,,,
0,binding-associated region,yes,,,MI:0117,,MI:0252,biological feature,unknown
1,necessary binding region,yes,,,MI:0429,MI:0117,"MI:0117, MI:0573, MI:1128, MI:1129","binding-associated region, mutation disrupting...",no
2,sufficient binding region,yes,,,MI:0442,MI:0117,MI:0117,binding-associated region,unknown
3,direct binding region,yes,,,MI:1125,MI:0442,MI:0442,sufficient binding region,unknown


In [774]:
# Sanity check: every MI code that occurs in the scraped columns must have a label
# row. The difference is taken as (codes in the frame) - (labeled codes), so an
# unlabeled code in the data makes the check fail.
_mis_in_df = set(merged_expl_binding_filt["binding_mi_1"].dropna().tolist() + merged_expl_binding_filt["binding_mi_2"].dropna().tolist())
test1 = len(_mis_in_df - set(bindsite_mis_labeled["mi"].tolist()))==0
print(f"All MIs for interactor 1 and 2 in DataFrame have yes/no interaction labels prepared: {test1}.")

# MI -> label lookups. When an MI occurs on several rows of the label table,
# dict(zip(...)) keeps the value from the last such row.
bindsite_mis_og_labeled = dict(zip(bindsite_mis_labeled["mi"],bindsite_mis_labeled["original_sequence"]))
bindsite_mis_new_labeled = dict(zip(bindsite_mis_labeled["mi"],bindsite_mis_labeled["without_binding_site"]))


def _mi_label_for_row(row, lookup):
    """Return lookup's label for side 1's MI code when it is a string, otherwise for side 2's."""
    mi = row["binding_mi_1"] if isinstance(row["binding_mi_1"], str) else row["binding_mi_2"]
    return lookup.get(mi)


merged_expl_binding_filt["binding_new_binds_bo_mi"] = merged_expl_binding_filt.apply(
    lambda row: _mi_label_for_row(row, bindsite_mis_new_labeled), axis=1)
merged_expl_binding_filt["binding_og_binds_bo_mi"] = merged_expl_binding_filt.apply(
    lambda row: _mi_label_for_row(row, bindsite_mis_og_labeled), axis=1)

All MIs for interactor 1 and 2 in DataFrame have yes/no interaction labels prepared: True.


In [775]:
## Get positive, negative, and unknown labels for each row
# We won't augment the negative dataset with this, but we will take things out of the positives,
# and we will make sure that none of these things are added as negatives when we do negative scraping.

# Now combine the per-source labels to get one combined entry per row.
# NOTE(review): "Binding new_binds_bo_annotation" and "Binding og_binds_bo_annotation" are not
# in `keep_cols`; confirm that `_collect_row_values` tolerates columns absent from the row.
newbindcols = [
"Binding new_binds_bo_annotation",
"Binding new_binds_bo_feature_type",
"binding_new_binds_bo_mi"]
ogbindcols = ["Binding og_binds_bo_annotation",
"Binding og_binds_bo_feature_type","binding_og_binds_bo_mi"]


# `_collect_row_values` is defined outside this section; it is applied row by row
# to the listed columns.
merged_expl_binding_filt["Binding all_new_binds"] = (
    merged_expl_binding_filt.apply(lambda r: _collect_row_values(r, newbindcols), axis=1)
)
merged_expl_binding_filt["Binding all_og_binds"] = (
    merged_expl_binding_filt.apply(lambda r: _collect_row_values(r, ogbindcols), axis=1)
)

In [776]:
# Frequency of each combined label entry (value_counts omits missing values by default).
display(merged_expl_binding_filt["Binding all_new_binds"].value_counts())
display(merged_expl_binding_filt["Binding all_og_binds"].value_counts())

Binding all_new_binds
unknown    167626
no           3972
Name: count, dtype: int64

Binding all_og_binds
yes    171606
Name: count, dtype: int64

In [777]:
# Figure out if any rows have contradicting labels 
def simplify_binding_bind_labels(s):
    """
    Collapse a comma-separated set of yes/no/unknown labels into one catch-all label.

    Args:
        s: comma-separated label string (e.g. "yes,unknown"), or a missing
           value (None, NaN, pd.NA or any other non-string).

    Returns:
        "yes"     if "yes" is present and "no" is not;
        "no"      if "no" is present and "yes" is not;
        "unknown" otherwise: contradicting labels ("yes" and "no" together),
                  only "unknown", no recognized label, or a missing value.
    """
    # Any non-string is treated as missing; calling .split on it would raise.
    if not isinstance(s, str):
        return "unknown"

    # Strip each token so "yes, no" is read the same way as "yes,no".
    found = {token.strip() for token in s.split(",")} & {"yes", "no", "unknown"}
    says_yes = "yes" in found
    says_no = "no" in found

    if says_yes and not says_no:
        return "yes"
    if says_no and not says_yes:
        return "no"
    return "unknown"
    
# Reduce each combined label entry to a single decisive label.
_new_binds_entries = merged_expl_binding_filt["Binding all_new_binds"]
_og_binds_entries = merged_expl_binding_filt["Binding all_og_binds"]
merged_expl_binding_filt["Binding decisive_entry_new_binds"] = _new_binds_entries.apply(simplify_binding_bind_labels)
merged_expl_binding_filt["Binding decisive_entry_og_binds"] = _og_binds_entries.apply(simplify_binding_bind_labels)

In [778]:
# Check that no row is missing the short label on both sides at once.
_short_1_missing = merged_expl_binding_filt["binding_short_1"].isna()
_short_2_missing = merged_expl_binding_filt["binding_short_2"].isna()
test1 = not (_short_1_missing & _short_2_missing).any()
print(f"Everything has a value in either binding_short_1 or binding_short_2: {test1}")

def get_final_bindingated_partner(row):
    """
    Decide which interactor(s) carry the binding site for this row.

    The curated "Binding Interactor Matches" value is returned when it is a
    string. Otherwise the answer is derived from which short-label columns hold
    a string: "A" for binding_short_1, "B" for binding_short_2, joined with ","
    (an empty string when neither does).
    """
    curated = row["Binding Interactor Matches"]
    if type(curated) is str:
        return curated

    side_columns = (("A", "binding_short_1"), ("B", "binding_short_2"))
    return ",".join(side for side, column in side_columns if type(row[column]) is str)
    
# Store the resolved binding-site side(s) for every row.
merged_expl_binding_filt["Binding Site Partner"] = merged_expl_binding_filt.apply(
    get_final_bindingated_partner,
    axis=1,
)


Everything has a value in either binding_short_1 or binding_short_2: True


In [779]:
# Inspect the scraped binding columns next to the two decisive labels.
merged_expl_binding_filt[scraped_binding_cols + ["Binding decisive_entry_new_binds","Binding decisive_entry_og_binds"]]

Unnamed: 0,binding_begin_1,binding_begin_2,binding_end_1,binding_end_2,binding_mi_1,binding_mi_2,binding_name_1,binding_name_2,binding_range_1,binding_range_2,binding_short_1,binding_short_2,Binding decisive_entry_new_binds,Binding decisive_entry_og_binds
0,1,1,10,10,MI:0429,MI:0429,necessary binding region,necessary binding region,1-10,1-10,Region 1-10,Region 1-10,no,yes
1,1,1,100,100,MI:0117,MI:0117,binding-associated region,binding-associated region,1-100,1-100,binding site,binding site,unknown,yes
2,1,1,100,100,MI:0117,MI:0117,binding-associated region,binding-associated region,1-100,1-100,binding site,binding site,unknown,yes
3,1,1,100,100,MI:0117,MI:0117,binding-associated region,binding-associated region,1-100,1-100,binding site,binding site,unknown,yes
4,1,1,100,100,MI:0429,MI:0429,necessary binding region,necessary binding region,1-100,1-100,n-terminal,n-terminal,no,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171601,,999,,1661,,MI:0429,,necessary binding region,,999-1661,,region,no,yes
171602,,999,,1661,,MI:0429,,necessary binding region,,999-1661,,region,no,yes
171603,,999,,1661,,MI:0429,,necessary binding region,,999-1661,,region,no,yes
171604,,,,,MI:0442,,sufficient binding region,,,,region,,unknown,yes


In [780]:
# Coverage report after aggregation: how many records carry a feature type, with
# and without a feature range.
temp = bindsites.loc[bindsites["Feature type"].notna()]
test1 = len(temp)
print(f"Fraction of original intact-compiled binding sites database that has a Binding Feature type: {test1}/{len(bindsites)} ({100*test1/len(bindsites):.2f}%)")

_type_present = merged_expl_binding_filt["Binding Feature type"].notna()
_range_present = merged_expl_binding_filt["Binding Feature range(s)"].notna()
# Shared denominator: rows flagged as having scraped binding info.
test2 = len(merged_expl_binding_filt.loc[merged_expl_binding_filt["scraped_binding_has_info"]])

test1 = len(merged_expl_binding_filt.loc[_type_present & _range_present])
print(f"Fraction of XML-compiled database that has a feature type and a feature range: {test1}/{test2} ({100*test1/test2:.2f}%)")
test1 = len(merged_expl_binding_filt.loc[_type_present & ~_range_present])
print(f"Fraction of XML-compiled database that has a feature type and no feature range: {test1}/{test2} ({100*test1/test2:.2f}%)")

Fraction of original intact-compiled binding sites database that has a Binding Feature type: 205588/205660 (99.96%)
Fraction of XML-compiled database that has a feature type and a feature range: 171598/171606 (100.00%)
Fraction of XML-compiled database that has a feature type and no feature range: 0/171606 (0.00%)


In [781]:
def precompute_worst_substitutions(blosum_matrix, valid_aas=None):
    """
    Map every residue of the matrix alphabet to its lowest-scoring replacement.

    Args:
        blosum_matrix: object exposing an ``alphabet`` attribute and supporting
            ``blosum_matrix[res, aa]`` score lookups.
        valid_aas: optional container of allowed residues; when given, both the
            keys and the replacement candidates are limited to it.

    Returns:
        dict {residue: worst_substitution}. Score ties go to the alphabetically
        smallest candidate. A residue with no other candidate maps to itself.

    Raises:
        ValueError: if the matrix has no ``alphabet`` attribute (or it is None).
    """
    alphabet = getattr(blosum_matrix, "alphabet", None)
    if alphabet is None:
        raise ValueError("BLOSUM matrix missing `.alphabet` attribute.")

    # Residues eligible both as dict keys and as substitution targets.
    allowed = [aa for aa in alphabet if valid_aas is None or aa in valid_aas]

    worst_sub = {}
    for res in allowed:
        others = [aa for aa in allowed if aa != res]
        if others:
            # Order by (score, letter): lowest score first, ties broken alphabetically.
            worst_sub[res] = min(others, key=lambda aa: (blosum_matrix[res, aa], aa))
        else:
            # No alternative residue available -> identity.
            worst_sub[res] = res

    return worst_sub


In [782]:
import re
import random
import pandas as pd


def get_binding_site_mutated_sequence(
    row,
    partner="A",
    mode="delete",
    worst_sub=None,   # precomputed dict: residue -> worst substitution
):
    """
    Build the binding-site-mutant sequence for partner 'A' or 'B'.

    Args:
        row: mapping-like record supporting ``.get`` (dict or pandas Series).
            Keys read: "Binding Site Partner", "binding_range_1"/"aa_1" (partner
            A) or "binding_range_2"/"aa_2" (any other partner value), and
            "Binding Original sequence".
        partner: "A" selects the side-1 columns; anything else selects side 2.
        mode: one of
            - "delete":  remove the residues of the binding site
            - "shuffle": randomly permute the residues of each site window
            - "blosum":  replace each residue using the ``worst_sub`` lookup
        worst_sub: dict residue -> replacement; required for mode "blosum".
            Residues missing from the dict are left unchanged.

    Returns:
        The mutated sequence string, or None when no mutation can be produced:
          - the row has no partner value or it does not include ``partner``;
          - the range string or the sequence is missing or empty;
          - any range is out of bounds (1-based start/end must lie in
            [1, len(sequence)] with end >= start), or no chunk parses as a range;
          - "Binding Original sequence" is usable (not None/NaN/blank/"-"/",")
            and differs from the sequence read over the annotated ranges;
          - shuffle: a window has one residue, is a single repeated residue, or
            10 shuffles failed to change it;
          - shuffle/blosum: the result equals the original sequence.
        NOTE: in delete mode a range covering the whole sequence yields "".

    Raises:
        ValueError: mode "blosum" without ``worst_sub``, or an unknown mode
            (raised only after the checks above have passed).

    Overlapping or duplicated ranges are merged before mutating, so every
    annotated position is mutated exactly once. Applied independently they would
    make delete mode remove residues outside the site, and blosum mode substitute
    a residue twice (possibly back to the original).
    """

    matches = row.get("Binding Site Partner")
    if matches is None:
        return None

    # Column selection based on partner
    if partner == "A":
        binding_range_col, aa_col = "binding_range_1", "aa_1"
    else:
        binding_range_col, aa_col = "binding_range_2", "aa_2"

    # Accept A/B/A,B syntax (or an actual collection of partner letters)
    def _has_partner(m, p):
        if isinstance(m, (list, tuple, set)):
            return p in m
        return bool(re.search(rf'(^|,)\s*{re.escape(p)}\s*(,|$)', str(m)))

    if not _has_partner(matches, partner):
        return None

    # Extract data
    ranges_str = row.get(binding_range_col)
    if (
        ranges_str is None
        or (isinstance(ranges_str, float) and pd.isna(ranges_str))
        or str(ranges_str).strip() == ""
    ):
        return None

    aa_og = row.get(aa_col)
    if not isinstance(aa_og, str) or len(aa_og) == 0:
        return None

    seq_len = len(aa_og)

    # Parse ranges: "120-122, 140-140" -> [(119, 121), (139, 139)] (0-based, inclusive).
    # Chunks that are not of the form digits-digits are skipped.
    ranges = []
    for chunk in str(ranges_str).split(","):
        chunk = chunk.strip()
        if not chunk:
            continue
        m = re.fullmatch(r"(\d+)-(\d+)", chunk)
        if not m:
            continue

        start_1b = int(m.group(1))
        end_1b = int(m.group(2))

        # 1-based validity check: start >= 1, end >= start, end <= seq_len.
        # Any out-of-bounds range makes the whole annotation invalid.
        if start_1b < 1 or end_1b < start_1b or end_1b > seq_len:
            return None

        ranges.append((start_1b - 1, end_1b - 1))

    if not ranges:
        return None

    # ------------------------------------------------------------------
    # Check: "Binding Original sequence" vs aa_og over these ranges
    # Only if it is a 'valid string':
    #   - not None / NaN
    #   - not just whitespace
    #   - not "-" or ","
    # The comparison uses the ranges exactly as annotated (not merged).
    # ------------------------------------------------------------------
    orig_binding = row.get("Binding Original sequence")

    def _normalize_pieces_str(s):
        # Normalize comma-separated pieces by stripping spaces around each part
        return ",".join(part.strip() for part in str(s).split(","))

    valid_orig_str = False
    if isinstance(orig_binding, str):
        stripped = orig_binding.strip()
        if stripped != "" and stripped != "-" and stripped != ",":
            valid_orig_str = True
    elif orig_binding is not None and not pd.isna(orig_binding):
        # Non-string but non-NaN/None: treat as "valid" for comparison
        valid_orig_str = True

    if valid_orig_str:
        # Compute the original subsequences from aa_og over the ranges
        computed_str = ",".join(aa_og[start:end + 1] for (start, end) in ranges)
        if _normalize_pieces_str(orig_binding) != _normalize_pieces_str(computed_str):
            # Mismatch between annotated original sequence and actual sequence
            return None

    # Merge overlapping / duplicate ranges so each position is mutated once.
    # Ranges that merely touch (e.g. 1-5 and 6-10) stay separate windows.
    merged = []
    for start, end in sorted(ranges):
        if merged and start <= merged[-1][1]:
            prev_start, prev_end = merged[-1]
            merged[-1] = (prev_start, max(prev_end, end))
        else:
            merged.append((start, end))

    # ------------------
    #      DELETE
    # ------------------
    if mode == "delete":
        seq = aa_og
        # Right-to-left so earlier deletions do not shift later indices.
        for start, end in reversed(merged):
            seq = seq[:start] + seq[end + 1:]
        return seq

    # ------------------
    #     SHUFFLE
    # ------------------
    if mode == "shuffle":
        seq_list = list(aa_og)

        for start, end in merged:
            window = seq_list[start:end + 1]

            # fail if length is 1 (no shuffle possible)
            if len(window) <= 1:
                return None
            # fail if all residues identical (any permutation == original)
            if len(set(window)) == 1:
                return None

            orig_window = window[:]
            # Try limited times to get a different ordering
            for _ in range(10):
                random.shuffle(window)
                if window != orig_window:
                    break

            if window == orig_window:
                return None  # Failed to change

            seq_list[start:end + 1] = window

        shuffled = "".join(seq_list)
        return None if shuffled == aa_og else shuffled

    # ------------------
    #      BLOSUM
    # ------------------
    if mode == "blosum":
        if worst_sub is None:
            raise ValueError("BLOSUM mode requires `worst_sub` lookup dict.")

        seq_list = list(aa_og)
        for start, end in merged:
            for i in range(start, end + 1):
                res = seq_list[i]
                if res in worst_sub:   # substitution allowed only if in dict
                    seq_list[i] = worst_sub[res]
        blosum_mut = "".join(seq_list)
        if blosum_mut == aa_og:
            return None

        return blosum_mut

    raise ValueError(f"Unknown mode {mode}")


In [783]:
from Bio.Align import substitution_matrices
# Load BLOSUM62 and derive, a single time, the lookup of worst substitutions per residue
blosum62 = substitution_matrices.load("BLOSUM62")
worst_sub = precompute_worst_substitutions(blosum62, valid_aas=VALID_AAS)

# (column tag, mode passed to the mutator, extra keyword arguments) per mutation strategy;
# only the BLOSUM strategy is handed the substitution lookup
_bindsite_strategies = (
    ("delete", "delete", {}),
    ("shuffle", "shuffle", {}),
    ("blosum62", "blosum", {"worst_sub": worst_sub}),
)
for _tag, _mode, _extra in _bindsite_strategies:
    # partner A fills the *_aa_1 column, partner B fills the *_aa_2 column
    for _partner, _idx in (("A", 1), ("B", 2)):
        merged_expl_binding_filt[f"bindsite_{_tag}_aa_{_idx}"] = merged_expl_binding_filt.apply(
            get_binding_site_mutated_sequence,
            axis=1,
            partner=_partner,
            mode=_mode,
            **_extra,
        )

In [784]:
# Sanity check, run once per interactor: on rows where that interactor is a binding site
# partner, none of its binding-site-mutated sequences (delete / shuffle / blosum62) should be
# identical to its original sequence. Prints an overall verdict, then one count per strategy.
for _partner, _idx in (("A", "1"), ("B", "2")):
    _is_partner = merged_expl_binding_filt["Binding Site Partner"].str.contains(_partner, na=False)
    _og_seq = merged_expl_binding_filt[f"aa_{_idx}"]
    _same_delete = _og_seq == merged_expl_binding_filt[f"bindsite_delete_aa_{_idx}"]
    _same_shuffle = _og_seq == merged_expl_binding_filt[f"bindsite_shuffle_aa_{_idx}"]
    _same_blosum = _og_seq == merged_expl_binding_filt[f"bindsite_blosum62_aa_{_idx}"]

    # Overall verdict: True when no row matches under any strategy
    test1 = len(merged_expl_binding_filt.loc[
        _is_partner & (_same_delete | _same_shuffle | _same_blosum)
    ]) == 0
    print(f"There is no case where aa_{_idx} matches any of its binding-site-mutated versions when interactor {_partner} is the binding site partner: {test1}")

    # Per-strategy number of offending rows
    for _label, _same in (
        ("deleted", _same_delete),
        ("shuffle", _same_shuffle),
        ("blosum62", _same_blosum),
    ):
        test1 = len(merged_expl_binding_filt.loc[_is_partner & _same])
        print(f"\tRows where {_label} sequence matches: {test1}")


There is no case where aa_1 matches any of its binding-site-mutated versions when interactor A is the binding site partner: True
	Rows where deleted sequence matches: 0
	Rows where shuffle sequence matches: 0
	Rows where blosum62 sequence matches: 0
There is no case where aa_2 matches any of its binding-site-mutated versions when interactor B is the binding site partner: True
	Rows where deleted sequence matches: 0
	Rows where shuffle sequence matches: 0
	Rows where blosum62 sequence matches: 0


In [785]:
import re
import pandas as pd

def compute_range_length(range_str):
    """
    Add up the lengths of every 'start-end' range in a comma-separated string.

    Example inputs: '1-10' (length 10) or '120-122, 140-140' (length 3 + 1 = 4).
    Each well-formed range contributes end - start + 1.

    Pieces that are not of the form '<digits>-<digits>', and reversed ranges such
    as '10-5', are ignored. Returns None when the input is None, a float NaN,
    blank, or contains no usable range at all.
    """
    is_missing = range_str is None or (isinstance(range_str, float) and pd.isna(range_str))
    if is_missing:
        return None

    text = str(range_str)
    if text.strip() == "":
        return None

    span_lengths = []
    for piece in text.split(","):
        hit = re.fullmatch(r"(\d+)-(\d+)", piece.strip())
        if hit is None:
            # empty piece or not a plain 'start-end' pair
            continue

        first, last = int(hit.group(1)), int(hit.group(2))
        if last >= first:
            span_lengths.append(last - first + 1)

    return sum(span_lengths) if span_lengths else None


# Total annotated binding-range length for each interactor (suffix 1 and suffix 2)
for _idx in (1, 2):
    merged_expl_binding_filt[f"binding_range_length_{_idx}"] = (
        merged_expl_binding_filt[f"binding_range_{_idx}"].apply(compute_range_length)
    )

In [786]:
# Coverage report, run once per interactor (A -> *_1 columns, B -> *_2 columns):
# among rows where that interactor is a binding site partner, count how many have
#   - at least one binding-site-mutated sequence,
#   - a deleted / shuffled / BLOSUM62-substituted sequence respectively,
#   - an annotated binding range longer than one residue.
# The summary line names the interactor actually being reported (the B section used to
# print "A"). test0 / test1 keep their original roles: denominator / latest numerator.
for _partner, _idx in (("A", "1"), ("B", "2")):
    _is_partner = merged_expl_binding_filt["Binding Site Partner"].str.contains(_partner, na=False)
    _has_delete = merged_expl_binding_filt[f"bindsite_delete_aa_{_idx}"].notna()
    _has_shuffle = merged_expl_binding_filt[f"bindsite_shuffle_aa_{_idx}"].notna()
    _has_blosum = merged_expl_binding_filt[f"bindsite_blosum62_aa_{_idx}"].notna()
    _long_range = merged_expl_binding_filt[f"binding_range_length_{_idx}"] > 1

    # Denominator: every row where this interactor is a binding site partner
    test0 = len(merged_expl_binding_filt.loc[_is_partner])

    test1 = len(merged_expl_binding_filt.loc[
        _is_partner & (_has_delete | _has_shuffle | _has_blosum)
    ])
    # Guard the percentage so an empty selection prints "nan" instead of raising
    _pct = 100 * test1 / test0 if test0 else float("nan")
    print(f"Total rows (out of all where {_partner} is a binding site partner) that have at least one binding-site-mutated sequence for {_partner}: {test1}/{test0} ({_pct:.2f}%)")

    for _label, _mask in (
        ("Deleted binding site", _has_delete),
        ("Shuffled binding site", _has_shuffle),
        ("BLOSUM62-substituted binding site", _has_blosum),
        ("Total rows where binding range length > 1", _long_range),
    ):
        test1 = len(merged_expl_binding_filt.loc[_is_partner & _mask])
        _pct = 100 * test1 / test0 if test0 else float("nan")
        print(f"\t{_label}: {test1}/{test0} ({_pct:.2f}%)")



Total rows (out of all where A is a binding site partner) that have at least one binding-site-mutated sequence for A: 88356/88556 (99.77%)
	Deleted binding site: 88356/88556 (99.77%)
	Shuffled binding site: 88166/88556 (99.56%)
	BLOSUM62-substituted binding site: 88356/88556 (99.77%)
	Total rows where binding range length > 1: 88377/88556 (99.80%)
Total rows (out of all where A is a binding site partner) that have at least one binding-site-mutated sequence for A: 86548/86738 (99.78%)
	Deleted binding site: 86548/86738 (99.78%)
	Shuffled binding site: 86362/86738 (99.57%)
	BLOSUM62-substituted binding site: 86548/86738 (99.78%)
	Total rows where binding range length > 1: 86560/86738 (99.79%)


In [787]:
# Inspect rows where B is a binding site partner and B's binding annotation is made of
# several comma-separated ranges. The filter looks at binding_range_2 so that it matches
# the B-side columns being displayed; na=False keeps rows without a range out of the mask.
merged_expl_binding_filt.loc[
    (merged_expl_binding_filt["Binding Site Partner"].str.contains("B", na=False)) &
    (merged_expl_binding_filt["binding_range_2"].str.contains(",", na=False))
][["binding_range_2","aa_2","length_2","bindsite_blosum62_aa_2","Binding Original sequence"]]

Unnamed: 0,binding_range_2,aa_2,length_2,bindsite_blosum62_aa_2,Binding Original sequence
15944,"1-105,114-566",MRRQWGSAMRAAEQAGCMVSASRAGQPEAGPWSCSGVILSRSPGLV...,566,DCCCDIWWDCWWCCWIEDDWWWCWICFCWIFDWEWIDGDWCWFIDD...,MRRQWGSAMRAAEQAGCMVSASRAGQPEAGPWSCSGVILSRSPGLV...
15945,"1-105,114-566",MRRQWGSAMRAAEQAGCMVSASRAGQPEAGPWSCSGVILSRSPGLV...,566,DCCCDIWWDCWWCCWIEDDWWWCWICFCWIFDWEWIDGDWCWFIDD...,MRRQWGSAMRAAEQAGCMVSASRAGQPEAGPWSCSGVILSRSPGLV...
15947,"1-21,133-409",MMKKNNSAKRGPQDGNQQPAPPEKVGWVRKFCGKGIFREIWKNRYV...,409,DDCCWWWWCCIFCLIWCCFWFPEKVGWVRKFCGKGIFREIWKNRYV...,","
15950,"1-113,140-161",MSVPGPYQAATGPSSAPSAPPSYEETVAVNSYYPTPPAPMPGPTTG...,161,DWDFIFDCWWFIFWWWFWWFFWDCCFDWDWWDDFFFFWFDFIFFFI...,MSVPGPYQAATGPSSAPSAPPSYEETVAVNSYYPTPPAPMPGPTTG...
15953,"1-22,1447-1665",MPPLLAPLLCLALLPALAARGPRCSQPGETCLNGGKCEAANGTEAC...,2555,DFFDDWFDDEDWDDFWDWWCIFRCSQPGETCLNGGKCEAANGTEAC...,"MPPLLAPLLCLALLPALAARGP,EACELPECQEDAGNKVCSLQCNN..."
15954,"1-22,1447-1665",MPPLLAPLLCLALLPALAARGPRCSQPGETCLNGGKCEAANGTEAC...,2555,DFFDDWFDDEDWDDFWDWWCIFRCSQPGETCLNGGKCEAANGTEAC...,"MPPLLAPLLCLALLPALAARGP,EACELPECQEDAGNKVCSLQCNN..."
15955,"1-22,1447-1665",MPPLLAPLLCLALLPALAARGPRCSQPGETCLNGGKCEAANGTEAC...,2555,DFFDDWFDDEDWDDFWDWWCIFRCSQPGETCLNGGKCEAANGTEAC...,"MPPLLAPLLCLALLPALAARGP,EACELPECQEDAGNKVCSLQCNN..."
15956,"1-22,1447-1734",MPPLLAPLLCLALLPALAARGPRCSQPGETCLNGGKCEAANGTEAC...,2555,DFFDDWFDDEDWDDFWDWWCIFRCSQPGETCLNGGKCEAANGTEAC...,"MPPLLAPLLCLALLPALAARGP,EACELPECQEDAGNKVCSLQCNN..."
15967,"1-150,175-329",MSAFDTNPFADPVDVNPFQDPSVTQLTNAPQGGLAEFNPFSETNAA...,329,DWWPLFWFPWLFDLDWFPCLFWDFCDFWWFCIIDWCPWFPWCFWWW...,MSAFDTNPFADPVDVNPFQDPSVTQLTNAPQGGLAEFNPFSETNAA...
15969,"1-43,181-419",MSDNGPQNQRNAPRITFGGPSDSTGSNQNGERSGARSKQRRPQGLP...,419,DWLWIFCWCCWWFCGFPIIFWLWFIWWCWICCWIWCWCCCCFCGLP...,"MSDNGPQNQRNAPRITFGGPSDSTGSNQNGERSGARSKQRRPQ,QA..."


In [788]:
# Rows where B is a binding site partner but at least one of B's three binding-site-mutated
# sequences (delete / shuffle / blosum62) is missing
merged_expl_binding_filt.loc[
    merged_expl_binding_filt["Binding Site Partner"].str.contains("B", na=False)
    & merged_expl_binding_filt[
        ["bindsite_delete_aa_2", "bindsite_shuffle_aa_2", "bindsite_blosum62_aa_2"]
    ].isna().any(axis=1),
    ["binding_range_2", "aa_2", "Binding Original sequence"],
]

Unnamed: 0,binding_range_2,aa_2,Binding Original sequence
15982,"1-1,22-1036",MKMASTRCKLARYLEDLEDVDLKKFKMHLEDYPPQKGCIPLPRGQT...,"M,LKKFKMHLEDYPPQKGCIPLPRGQTEKADHVDLATLMIDFNGEE..."
16838,102-102,MYGKIIFVLLLSEIVSISALSTTEVAMHTSTSSSVTKSYISSQTND...,G
16839,102-102,MYGKIIFVLLLSEIVSISALSTTEVAMHTSTSSSVTKSYISSQTND...,G
16976,103-103,MRCGGGARACRRACRCWLSGYAGPADGTQQPDAPEHAVAREALVDL...,-
17275,105-105,MRNSYRFLASSLSVVVSLLLIPEDVCEKIIGGNEVTPHSRPYMVLL...,-
...,...,...,...
170933,97-97,MKLIAASLRRLSLAVLTVLLVVSSFAVFTPSASAETYTVKLGSDKG...,-
171117,976-976,MRGARGAWDFLCVLLLLLRVQTGSSQPSVSPGEPSPPSIHPGKSDL...,V
171118,976-976,MRGARGAWDFLCVLLLLLRVQTGSSQPSVSPGEPSPPSIHPGKSDL...,V
171222,98-98,MKLIAASLRRLSLAVLTVLLVVSSFAVFTPSASAETYTVKLGSDKG...,-


In [789]:
# Show the scraped and aggregated binding columns next to the six binding-site-mutated
# sequence columns (delete, shuffle, blosum62 — each for interactor 1 and 2)
merged_expl_binding_filt[
    scraped_binding_cols
    + agg_binding_cols
    + [
        f"bindsite_{_tag}_aa_{_idx}"
        for _tag in ("delete", "shuffle", "blosum62")
        for _idx in (1, 2)
    ]
]

Unnamed: 0,binding_begin_1,binding_begin_2,binding_end_1,binding_end_2,binding_mi_1,binding_mi_2,binding_name_1,binding_name_2,binding_range_1,binding_range_2,...,Binding Resulting sequence,Binding Xref ID(s),Binding new_binds_bo_feature_type,Binding og_binds_bo_feature_type,bindsite_delete_aa_1,bindsite_delete_aa_2,bindsite_shuffle_aa_1,bindsite_shuffle_aa_2,bindsite_blosum62_aa_1,bindsite_blosum62_aa_2
0,1,1,10,10,MI:0429,MI:0429,necessary binding region,necessary binding region,1-10,1-10,...,-,-,no,yes,TKILCAHGGRMTLEELLGEIRLPEAQLYELLETAGPDRFVLLETGG...,TKILCAHGGRMTLEELLGEIRLPEAQLYELLETAGPDRFVLLETGG...,CCGMVAPDFITKILCAHGGRMTLEELLGEIRLPEAQLYELLETAGP...,CFAIGCPDMVTKILCAHGGRMTLEELLGEIRLPEAQLYELLETAGP...,DWLFIDEEPGTKILCAHGGRMTLEELLGEIRLPEAQLYELLETAGP...,DWLFIDEEPGTKILCAHGGRMTLEELLGEIRLPEAQLYELLETAGP...
1,1,1,100,100,MI:0117,MI:0117,binding-associated region,binding-associated region,1-100,1-100,...,-,mint:MINT-8082044(identity),unknown,yes,QAVRTALAILTEGVVSAPIEGIASVKIKRNTWSDNSEYLALYYAGP...,QAVRTALAILTEGVVSAPIEGIASVKIKRNTWSDNSEYLALYYAGP...,IKKDEDEPDGIKKAGLIVMVRGEIAIAEFEMDVLSMRMGRGLSKQE...,MPVSLIIGLSIKKMAQADFPVAEKKGERVIAERLYRKSLSSDLAEF...,DCDFCCDCCDPWDDCCCGLCWDCGWCCWCWCICLFWDLDCGFCWWL...,DCDFCCDCCDPWDDCCCGLCWDCGWCCWCWCICLFWDLDCGFCWWL...
2,1,1,100,100,MI:0117,MI:0117,binding-associated region,binding-associated region,1-100,1-100,...,-,mint:MINT-8082058(identity),unknown,yes,QAVRTALAILTEGVVSAPIEGIASVKIKRNTWSDNSEYLALYYAGP...,QAVRTALAILTEGVVSAPIEGIASVKIKRNTWSDNSEYLALYYAGP...,ADGVFDGEKIGVMAAYYVDLIEVQEDRGSPGGPAGLAFKLEKSSLK...,DGLPAPYDAVAEMIFYYSVSKEEIEDRGKLDKSVKEDLVMPPKIEK...,DCDFCCDCCDPWDDCCCGLCWDCGWCCWCWCICLFWDLDCGFCWWL...,DCDFCCDCCDPWDDCCCGLCWDCGWCCWCWCICLFWDLDCGFCWWL...
3,1,1,100,100,MI:0117,MI:0117,binding-associated region,binding-associated region,1-100,1-100,...,-,mint:MINT-8082083(identity),unknown,yes,QAVRTALAILTEGVVSAPIEGIASVKIKRNTWSDNSEYLALYYAGP...,QAVRTALAILTEGVVSAPIEGIASVKIKRNTWSDNSEYLALYYAGP...,YVVGGRKMVGLREAKAPLIIKKIPEQEVDDGEKKVQDLEVAEMSEF...,IYEEDIMILDEASMSDVLMEAADQYRKEEPEDKKKALSKGGEEELP...,DCDFCCDCCDPWDDCCCGLCWDCGWCCWCWCICLFWDLDCGFCWWL...,DCDFCCDCCDPWDDCCCGLCWDCGWCCWCWCICLFWDLDCGFCWWL...
4,1,1,100,100,MI:0429,MI:0429,necessary binding region,necessary binding region,1-100,1-100,...,-,-,no,yes,GTYGSALGGYGSSYGGGMYGGSSMYRGGYGGGGLYGSSGMYGGGAM...,GTYGSALGGYGSSYGGGMYGGSSMYRGGYGGGGLYGSSGMYGGGAM...,GGNPVWAGMGAVPGGNVASEGQPMTGPWGNNTPQGTNESSLGGVSY...,TEPNPQSAQGANGNSEARMRSSSSTMVTFLNYNGSWSNMGGPTKMT...,DWWCFWIIWFFCFDCCCIWFWIFWFPCFFWWFWFWIWDCWWIFWWF...,DWWCFWIIWFFCFDCCCIWFWIFWFPCFFWWFWFWIWDCWWIFWWF...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171601,,999,,1661,,MI:0429,,necessary binding region,,999-1661,...,-,-,no,yes,,MLGAPDESSVRVAVRIRPQLAKEKIEGCHICTSVTPGEPQVFLGKD...,,MLGAPDESSVRVAVRIRPQLAKEKIEGCHICTSVTPGEPQVFLGKD...,,MLGAPDESSVRVAVRIRPQLAKEKIEGCHICTSVTPGEPQVFLGKD...
171602,,999,,1661,,MI:0429,,necessary binding region,,999-1661,...,-,-,no,yes,,MLGAPDESSVRVAVRIRPQLAKEKIEGCHICTSVTPGEPQVFLGKD...,,MLGAPDESSVRVAVRIRPQLAKEKIEGCHICTSVTPGEPQVFLGKD...,,MLGAPDESSVRVAVRIRPQLAKEKIEGCHICTSVTPGEPQVFLGKD...
171603,,999,,1661,,MI:0429,,necessary binding region,,999-1661,...,-,-,no,yes,,MLGAPDESSVRVAVRIRPQLAKEKIEGCHICTSVTPGEPQVFLGKD...,,MLGAPDESSVRVAVRIRPQLAKEKIEGCHICTSVTPGEPQVFLGKD...,,MLGAPDESSVRVAVRIRPQLAKEKIEGCHICTSVTPGEPQVFLGKD...
171604,,,,,MI:0442,,sufficient binding region,,,,...,,,,,,,,,,


In [790]:
# Reshape to long format: one full copy of the table per mutation strategy, in which that
# strategy's two sequence columns become the generic "bindsite_mutated_aa_1/2" columns and
# the other strategies' columns are removed. The copies are then stacked on top of each other.
_BINDSITE_TAGS = ("delete", "shuffle", "blosum62")


def _single_strategy_frame(frame, tag, status):
    """Return a copy of `frame` keeping only strategy `tag`, labelled with `status`."""
    unwanted = [
        f"bindsite_{other}_aa_{idx}"
        for other in _BINDSITE_TAGS
        if other != tag
        for idx in (1, 2)
    ]
    generic_names = {
        f"bindsite_{tag}_aa_{idx}": f"bindsite_mutated_aa_{idx}" for idx in (1, 2)
    }
    out = frame.copy(deep=True).drop(columns=unwanted).rename(columns=generic_names)
    out["Binding Site Partner Status"] = status
    return out


merged_expl_binding_filt_delete = _single_strategy_frame(
    merged_expl_binding_filt, "delete", "bindsite_deletion"
)
merged_expl_binding_filt_shuffle = _single_strategy_frame(
    merged_expl_binding_filt, "shuffle", "bindsite_shuffle"
)
merged_expl_binding_filt_blosum = _single_strategy_frame(
    merged_expl_binding_filt, "blosum62", "bindsite_blosum62"
)

# Stack deletion, shuffle and BLOSUM62 rows (in that order) under a fresh 0..n-1 index
merged_expl_binding_filt = pd.concat(
    [
        merged_expl_binding_filt_delete,
        merged_expl_binding_filt_shuffle,
        merged_expl_binding_filt_blosum,
    ],
    ignore_index=True,
)

merged_expl_binding_filt[["bindsite_mutated_aa_1","bindsite_mutated_aa_2","Binding Site Partner Status"]].head()


Unnamed: 0,bindsite_mutated_aa_1,bindsite_mutated_aa_2,Binding Site Partner Status
0,TKILCAHGGRMTLEELLGEIRLPEAQLYELLETAGPDRFVLLETGG...,TKILCAHGGRMTLEELLGEIRLPEAQLYELLETAGPDRFVLLETGG...,bindsite_deletion
1,QAVRTALAILTEGVVSAPIEGIASVKIKRNTWSDNSEYLALYYAGP...,QAVRTALAILTEGVVSAPIEGIASVKIKRNTWSDNSEYLALYYAGP...,bindsite_deletion
2,QAVRTALAILTEGVVSAPIEGIASVKIKRNTWSDNSEYLALYYAGP...,QAVRTALAILTEGVVSAPIEGIASVKIKRNTWSDNSEYLALYYAGP...,bindsite_deletion
3,QAVRTALAILTEGVVSAPIEGIASVKIKRNTWSDNSEYLALYYAGP...,QAVRTALAILTEGVVSAPIEGIASVKIKRNTWSDNSEYLALYYAGP...,bindsite_deletion
4,GTYGSALGGYGSSYGGGMYGGSSMYRGGYGGGGLYGSSGMYGGGAM...,GTYGSALGGYGSSYGGGMYGGSSMYRGGYGGGGLYGSSGMYGGGAM...,bindsite_deletion


In [791]:
# Report the table size after the expansion and how many rows each mutation strategy has
_expanded_size = len(merged_expl_binding_filt)
print(f"Size of merged_expl_binding_filt after expanding for different types of mutated binding site sequences: {_expanded_size}")
_status_counts = merged_expl_binding_filt["Binding Site Partner Status"].value_counts()
display(_status_counts)

Size of merged_expl_binding_filt after expanding for different types of mutated binding site sequences: 514818


Binding Site Partner Status
bindsite_deletion    171606
bindsite_shuffle     171606
bindsite_blosum62    171606
Name: count, dtype: int64

In [792]:
# Group by seq_sort
def _seq_or_blank(value):
    """Return `value`, or "" when it is a missing marker (None, pd.NA or a float NaN)."""
    if value is None or value is pd.NA:
        return ""
    # isinstance also covers float subclasses such as np.float64
    if isinstance(value, float) and np.isnan(value):
        return ""
    return value


def get_seqsort_for_binding_site_pair(row, seq_type="og"):
    """
    Build an order-independent key for the pair of sequences in this row.

    Parameters:
        row: mapping / Series with "Binding Site Partner" ("A", "B" or "A,B"),
            "aa_1", "aa_2", "bindsite_mutated_aa_1" and "bindsite_mutated_aa_2".
        seq_type: "og" pairs the two original sequences; any other value pairs the
            binding-site-mutated sequence of each binding site partner with the
            original sequence of the other interactor.

    Returns:
        "<smaller>_<larger>" with the two sequences in lexicographic order. A missing
        sequence is replaced by "". When the partner annotation is missing or is not
        one of "A", "B", "A,B", both sides are blank and the key is "_".
    """
    partner = row["Binding Site Partner"]

    # Only a string annotation can name a partner; NaN / None / pd.NA mean "no annotation"
    if not isinstance(partner, str) or partner not in ("A,B", "A", "B"):
        seq_a, seq_b = None, None
    elif seq_type == "og":
        seq_a, seq_b = row["aa_1"], row["aa_2"]
    else:
        # Swap in the mutated sequence only for the interactor(s) holding the binding site
        seq_a = row["bindsite_mutated_aa_1"] if partner in ("A,B", "A") else row["aa_1"]
        seq_b = row["bindsite_mutated_aa_2"] if partner in ("A,B", "B") else row["aa_2"]

    seq_a = _seq_or_blank(seq_a)
    seq_b = _seq_or_blank(seq_b)

    # Sort the two sequences so that (A, B) and (B, A) give the same key
    if seq_a <= seq_b:
        return f"{seq_a}_{seq_b}"
    return f"{seq_b}_{seq_a}"

In [793]:
# Order-independent keys for the original pair ("og") and the binding-site-mutated pair ("new")
for _kind in ("og", "new"):
    merged_expl_binding_filt[f"seq_sort_{_kind}"] = merged_expl_binding_filt.apply(
        get_seqsort_for_binding_site_pair, axis=1, seq_type=_kind
    )

In [794]:
# Collapse to one row per unique original sequence pair and collect every distinct
# per-entry "og binds" label seen for that pair. Labels are sorted so the joined string
# is deterministic (plain set order is not), and missing labels are left out.
gb_og = merged_expl_binding_filt.groupby("seq_sort_og").agg(
    decisive_entry_og_binds=("Binding decisive_entry_og_binds", lambda x: ",".join(sorted(set(x.dropna()))))
).reset_index()
gb_og["seq_sort_og_id"] = [f"seq_sort_og_{i+1}" for i in range(len(gb_og))]

# Pairs carrying more than one distinct label
display(gb_og.loc[gb_og["decisive_entry_og_binds"].str.contains(",")].head())

# A pair is contradictory when both "yes" and "no" are among its labels. Testing the
# label set (rather than the substrings "yes,no" / "no,yes") also catches combinations
# such as "no,unknown,yes".
test1 = int(
    gb_og["decisive_entry_og_binds"]
    .apply(lambda s: {"yes", "no"} <= set(s.split(",")))
    .sum()
)
_pct = 100 * test1 / len(gb_og) if len(gb_og) else float("nan")
print(f"Total og pairs that have yes AND no for binding based on binding annotations: {test1}/{len(gb_og)} ({_pct:.2f}%)")

Unnamed: 0,seq_sort_og,decisive_entry_og_binds,seq_sort_og_id


Total og pairs that have yes AND no for binding based on binding annotations: 0/69683 (0.00%)


In [795]:
# Collapse to one row per unique binding-site-mutated sequence pair and collect every
# distinct per-entry "new binds" label seen for that pair. Labels are sorted so the joined
# string is deterministic (plain set order is not), and missing labels are left out.
gb_new = merged_expl_binding_filt.groupby("seq_sort_new").agg(
    decisive_entry_new_binds=("Binding decisive_entry_new_binds", lambda x: ",".join(sorted(set(x.dropna()))))
).reset_index()
gb_new["seq_sort_new_id"] = [f"seq_sort_new_{i+1}" for i in range(len(gb_new))]

# Pairs carrying more than one distinct label
display(gb_new.loc[gb_new["decisive_entry_new_binds"].str.contains(",")].head())

# A pair is contradictory when both "yes" and "no" are among its labels. Testing the
# label set (rather than the substrings "yes,no" / "no,yes") also catches combinations
# such as "no,unknown,yes".
test1 = int(
    gb_new["decisive_entry_new_binds"]
    .apply(lambda s: {"yes", "no"} <= set(s.split(",")))
    .sum()
)
_pct = 100 * test1 / len(gb_new) if len(gb_new) else float("nan")
print(f"Positive db: Total new pairs that have yes AND no for binding based on binding annotations: {test1}/{len(gb_new)} ({_pct:.2f}%)")

Unnamed: 0,seq_sort_new,decisive_entry_new_binds,seq_sort_new_id
222,AARDDDEDDYPLLSDVEMEMESEADKIISEHLLKFKDINGLVEQNV...,"unknown,no",seq_sort_new_223
2523,APITAYSQQTRGLLGCIITSLTGRDKNQDCICDCDDWFWFCWPDWF...,"unknown,no",seq_sort_new_2524
2527,APITAYSQQTRGLLGCIITSLTGRDKNQSMETTMRSPVFTDNSSPP...,"unknown,no",seq_sort_new_2528
4984,CEQAVYQTILEEDVEDPVYQYIVFEAGHEPIRDPETEENIYQVPTS...,"unknown,no",seq_sort_new_4985
5014,CFCDFIIWWICFFWIDDIDDFFIWCCWGCDGWFWIWDCGWWFWDWE...,"unknown,no",seq_sort_new_5015


Positive db: Total new pairs that have yes AND no for binding based on binding annotations: 0/394893 (0.00%)


In [796]:
# Distribution of the aggregated binding labels after the expansion. The headings name the
# "Binding ..." columns that are actually printed (they previously said "PTM").
print("From positive database:")
print("Value counts for Binding all_new_binds")
print(merged_expl_binding_filt["Binding all_new_binds"].value_counts())
print("\nValue counts for Binding all_og_binds")
print(merged_expl_binding_filt["Binding all_og_binds"].value_counts())

From positive database:
Value counts for PTM all_new_binds
Binding all_new_binds
unknown    502878
no          11916
Name: count, dtype: int64

Value counts for PTM all_og_binds
Binding all_og_binds
yes    514818
Name: count, dtype: int64


In [797]:
# Figure out if any rows have contradicting labels
def simplify_bindingsite_bind_labels(s):
    """
    Reduce a comma-separated list of labels to a single catch-all label.

    "yes" is returned when "yes" is present and "no" is not; "no" when "no" is
    present and "yes" is not. Everything else — both present, neither present,
    or a non-string input — gives "unknown".
    """
    if type(s) is not str:
        return "unknown"

    # Only the decisive labels matter; "unknown" and unrecognised tokens never change the verdict
    decisive = set(s.split(",")) & {"yes", "no"}
    if decisive == {"yes"}:
        return "yes"
    if decisive == {"no"}:
        return "no"
    return "unknown"

In [798]:
# Original sequence pairs whose collected labels contain "yes,no" or "no,yes"
_og_labels = gb_og["decisive_entry_og_binds"]
_og_conflict = (_og_labels.str.contains("yes,no")) | (_og_labels.str.contains("no,yes"))
test1 = gb_og.loc[_og_conflict, "seq_sort_og"].unique().tolist()

# One decisive label per original sequence pair
gb_og["Binding decisive_seqpair_og_binds"] = _og_labels.apply(simplify_bindingsite_bind_labels)

# Lookups keyed by the pair key: pair key -> pair id, pair key -> decisive label
_og_by_key = gb_og.set_index("seq_sort_og")
gb_og_id_dict = _og_by_key["seq_sort_og_id"].to_dict()
gb_og_dict = _og_by_key["Binding decisive_seqpair_og_binds"].to_dict()

# Bring both back onto every row of the expanded table
merged_expl_binding_filt["seq_sort_og_id"] = merged_expl_binding_filt["seq_sort_og"].map(gb_og_id_dict)
merged_expl_binding_filt["Binding decisive_seqpair_og_binds"] = merged_expl_binding_filt["seq_sort_og"].map(gb_og_dict)

In [799]:
# Mutated sequence pairs whose collected labels contain "yes,no" or "no,yes"
test1 = gb_new.loc[
    (gb_new["decisive_entry_new_binds"].str.contains("yes,no")) | 
    (gb_new["decisive_entry_new_binds"].str.contains("no,yes")) 
    ]["seq_sort_new"].unique().tolist()

# pair key -> pair id
gb_new_id_dict = dict(zip(gb_new["seq_sort_new"],gb_new["seq_sort_new_id"]))

# One decisive label per mutated sequence pair. Uses simplify_bindingsite_bind_labels, the
# same helper applied to the original pairs, so both label columns follow identical rules.
gb_new["Binding decisive_seqpair_new_binds"] = gb_new["decisive_entry_new_binds"].apply(simplify_bindingsite_bind_labels)

# pair key -> decisive label
gb_new_dict = dict(zip(gb_new["seq_sort_new"],gb_new["Binding decisive_seqpair_new_binds"]))

# Bring both back onto every row of the expanded table
merged_expl_binding_filt["seq_sort_new_id"] = merged_expl_binding_filt["seq_sort_new"].map(gb_new_id_dict)

merged_expl_binding_filt["Binding decisive_seqpair_new_binds"] = merged_expl_binding_filt["seq_sort_new"].map(gb_new_dict)

In [800]:
print("From positive database:")
print("\nValue counts for Binding decisive_seqpair_og_binds")
print(merged_expl_binding_filt["Binding decisive_seqpair_og_binds"].value_counts())

print("\nValue counts for Binding decisive_seqpair_new_binds")
print(merged_expl_binding_filt["Binding decisive_seqpair_new_binds"].value_counts())

From positive database:

Value counts for Binding decisive_seqpair_og_binds
Binding decisive_seqpair_og_binds
yes    514818
Name: count, dtype: int64

Value counts for Binding decisive_seqpair_new_binds
Binding decisive_seqpair_new_binds
unknown    501948
no          12870
Name: count, dtype: int64


In [801]:
# Prefix every scraped binding column that exists in `merged` with "all_", then list the
# resulting "all_binding*" column names, one per line
merged_binding_cols = [col for col in scraped_binding_cols if col in merged.columns]
rename_dict = dict(zip(merged_binding_cols, (f"all_{col}" for col in merged_binding_cols)))
merged = merged.rename(columns=rename_dict)
print(*(col for col in merged.columns if col.startswith("all_binding")), sep=",\n")

all_binding_mi_1,
all_binding_name_1,
all_binding_short_1,
all_binding_begin_1,
all_binding_end_1,
all_binding_mi_2,
all_binding_name_2,
all_binding_short_2,
all_binding_begin_2,
all_binding_end_2


In [802]:
# Keep a fixed set of interaction / interactor columns plus every "all_binding*" column,
# drop exact duplicate rows, renumber the index and print the resulting row count
merged_binding_cols = [col for col in merged.columns if col.startswith("all_binding")]

_simplemerged_base_cols = [
    "Negative", "aa_1", "aa_2", "invalids_aa_1", "invalids_aa_2",
    "all_intact_A_sorted", "all_intact_B_sorted",
    "chain_seq_end_1", "chain_seq_end_2", "chain_seq_start_1", "chain_seq_start_2",
    "confidence_val_int",
    "ensg_1", "ensg_2", "ensp_1", "ensp_2", "enst_1", "enst_2",
    "equal_score_int",
    "gene_symbol_1", "gene_symbol_2",
    "go_1", "go_2",
    "dip_1", "dip_2",
    "host_cell_type_1", "host_cell_type_2",
    "host_compartment_1", "host_compartment_2",
    "host_label_full_1", "host_label_full_2",
    "host_label_short_1", "host_label_short_2",
    "host_taxid_1", "host_taxid_2",
    "host_tissue_1", "host_tissue_2",
    "intactid_1", "intactid_2",
    "interaction_detection_methods_sorted",
    "interaction_intactid", "interaction_label", "interaction_mi", "interaction_xml_id",
    "interpro_1", "interpro_2",
    "length_1", "length_2",
    "miscore",
    "mol_type_1", "mol_type_2",
    "no_uniprot_update_A", "no_uniprot_update_B",
    "primaryref_db_1", "primaryref_db_2",
    "primaryref_id_1", "primaryref_id_2",
    "protein_1", "protein_2",
    "pubmeds",
    "reactome_1", "reactome_2",
    "rscbpdb_1", "rscbpdb_2",
    "seq_pair_id", "seq_sort",
    "species_label_1", "species_label_2",
    "species_taxid_1", "species_taxid_2",
    "uniprot_A", "uniprot_A_equalseq", "uniprot_A_equalseq_canonical",
    "uniprot_A_full", "uniprot_A_inseq", "uniprot_A_inseq_canonical",
    "uniprot_A_intact", "uniprot_A_noiso1", "uniprot_A_noisoforms",
    "uniprot_B", "uniprot_B_equalseq", "uniprot_B_equalseq_canonical",
    "uniprot_B_full", "uniprot_B_inseq", "uniprot_B_inseq_canonical",
    "uniprot_B_intact", "uniprot_B_noiso1", "uniprot_B_noisoforms",
    "uniprot_gene_name_A", "uniprot_gene_name_B",
    "uniprotkb_1", "uniprotkb_2",
    "unique_all_intact_sorted", "unique_expansions", "unique_id",
    "unique_score_int", "unique_scores",
    "unique_uniprot_noiso1_pair", "unique_uniprot_noisoforms_pair", "unique_uniprot_pair",
    "year",
]

simplemerged = merged[_simplemerged_base_cols + merged_binding_cols].drop_duplicates(ignore_index=True)
print(len(simplemerged))


744614


In [803]:
# Columns whose differing values should not split one interaction into several rows;
# their values are combined per group with join_unique_nonnull instead
need_pipejoin = [
    "interaction_xml_id",
    "reactome_1", "reactome_2",
    "host_label_full_1", "host_label_full_2",
    "host_label_short_1", "host_label_short_2",
    "host_taxid_1", "host_taxid_2",
    "host_cell_type_1", "host_cell_type_2",
    "interaction_label", "interaction_mi", "pubmeds",
] + merged_binding_cols

simplemerged = harmonize_nulls_to_nan(simplemerged)

# Every remaining column acts as a grouping key
_pipejoin_lookup = set(need_pipejoin)
all_except_featac = [col for col in simplemerged.columns if col not in _pipejoin_lookup]

# Same aggregation function for each pipe-joined column
agg_spec = dict.fromkeys(need_pipejoin, join_unique_nonnull)

display(simplemerged.head())

# dropna=False keeps groups whose key columns contain missing values
_grouped = simplemerged.groupby(all_except_featac, dropna=False, as_index=False)
simplemerged = _grouped.agg(agg_spec)

print(f"Pipe-joined values in some columns that were unmeaningfully separating the same interaction. New pos PPI db size: {len(simplemerged)}")

display(simplemerged.head())

  out = out.replace({"": pd.NA, "None": pd.NA, "nan": pd.NA})


Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,all_binding_mi_1,all_binding_name_1,all_binding_short_1,all_binding_begin_1,all_binding_end_1,all_binding_mi_2,all_binding_name_2,all_binding_short_2,all_binding_begin_2,all_binding_end_2
0,False,MGFPRILSKNNKIYTKLGEFCLSGDSFWIVCHTCQEELQTQDQFWK...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,,,intact:EBI-101707,intact:EBI-100018,,,,...,,,,,,,,,,
1,False,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,MEIPIQVAVRIFPHRELKDLLRSFGPTEPKKDAQAVDEGADSKDSE...,,,intact:EBI-100018,intact:EBI-102069,,,,...,MI:0117,binding-associated region,region,1207.0,1783.0,MI:0117,binding-associated region,region,,
2,False,MLPFRLGLLLGAVLFVASANGAAIENEVSSLNDLQREKRSGRGYSR...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,,,intact:EBI-104215,intact:EBI-100018,,,,...,,,,,,,,,,
3,False,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,MSNYYSLLLQADTYDDESIGDERSEEDTDDASETEFRSPSRYGAMN...,,,intact:EBI-100018,intact:EBI-107089,,,,...,,,,,,,,,,
4,False,MSPPSGEFRCRVCLKQDELLVDIYEIVEEMQVDLCTLLETCGGIKV...,MVDNSVQCPVCTLYLHAGMNLSDHLETHPKEQVIKALVQMTIVGNG...,,,intact:EBI-117032,intact:EBI-100018,,,,...,,,,,,,,,,


Pipe-joined values in some columns that were unmeaningfully separating the same interaction. New pos PPI db size: 743130


Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,all_binding_mi_1,all_binding_name_1,all_binding_short_1,all_binding_begin_1,all_binding_end_1,all_binding_mi_2,all_binding_name_2,all_binding_short_2,all_binding_begin_2,all_binding_end_2
0,False,AAAAARPAGGSARRWGRPGRCGLLAAGPKRVRSEPGGRLPERSLGP...,MTVFRQENVDDYYDTGEELGSGQFAVVKKCREKSTGLQYAAKFIKK...,,,intact:EBI-20589573,intact:EBI-358616,,,,...,,,,,,,,,,
1,False,AAADWKPGYVMPVLYKYLESPLERVNLWNYGKPITLPTGCMMNVAK...,MAKWGEGDPRWIVEERADATNVNNWHWTERDASNWSTDKLKTLFLA...,,,intact:EBI-25507607,intact:EBI-448610,7176.0,,6878.0,...,,,,,,,,,,
2,False,AAADWKPGYVMPVLYKYLESPLERVNLWNYGKPITLPTGCMMNVAK...,MANDPLEGFHEVNLASPTSPDLLGVCDPGTQEQTTSPSVIYRPHPS...,,,intact:EBI-25507607,intact:EBI-16730154,7176.0,,6878.0,...,,,,,,,,,,
3,False,AAADWKPGYVMPVLYKYLESPLERVNLWNYGKPITLPTGCMMNVAK...,MAQYGHPSPLGMAAREELYSKVTPRRNRQQRPGTIKHGSALDVLLS...,,,intact:EBI-25507607,intact:EBI-1380492,7176.0,,6878.0,...,,,,,,,,,,
4,False,AAADWKPGYVMPVLYKYLESPLERVNLWNYGKPITLPTGCMMNVAK...,MAVALLEEWCKIMGVDVQKSLLVVDIPVDCGEPEIQTVLQEALKCV...,,,intact:EBI-25507607,intact:EBI-25508298,7176.0,,6878.0,...,,,,,,,,,,


In [804]:
# Preview the merged binding-feature columns for rows that have a binding MI
# recorded for interactor 1.
has_binding_mi_1 = simplemerged["all_binding_mi_1"].notna()
simplemerged.loc[has_binding_mi_1, merged_binding_cols].head()

Unnamed: 0,all_binding_mi_1,all_binding_name_1,all_binding_short_1,all_binding_begin_1,all_binding_end_1,all_binding_mi_2,all_binding_name_2,all_binding_short_2,all_binding_begin_2,all_binding_end_2
11,MI:0442,sufficient binding region,region,653,1099,MI:0442,sufficient binding region,region,653.0,1099.0
12,MI:0442,sufficient binding region,region,653,1099,,,,,
13,MI:0442,sufficient binding region,region,653,1099,,,,,
30,MI:0442,sufficient binding region,region,3,264,MI:0442,sufficient binding region,region,305.0,550.0
31,MI:0442,sufficient binding region,region,3,264,MI:0442,sufficient binding region,region,306.0,550.0


In [805]:
## AFTER cleaning simplemerged a bit, run this again!
# Look for the smallest set of identifier columns that is unique per row.
#   "unique_id": concatenation of the two intact ids of the interactors
#   "interaction_intactid": the intact id of this interaction (one evidence
#       piece of these two interactors interacting)
#   "seq_pair_id": unique combination of two sequences
duplicate_key_sets = [
    ["unique_id"],
    ["interaction_intactid"],
    ["seq_pair_id"],
    ["unique_id", "seq_pair_id"],
    ["unique_id", "interaction_intactid"],
    ["seq_pair_id", "interaction_intactid"],
]
for key_cols in duplicate_key_sets:
    # duplicated() flags every repeat of a key after its first occurrence.
    test1 = len(simplemerged.loc[simplemerged.duplicated(key_cols)])
    print(f"Rows in simplemerged with duplicate {'+'.join(key_cols)}: {test1}")

Rows in simplemerged with duplicate unique_id: 314635
Rows in simplemerged with duplicate interaction_intactid: 3
Rows in simplemerged with duplicate seq_pair_id: 316591
Rows in simplemerged with duplicate unique_id+seq_pair_id: 314635
Rows in simplemerged with duplicate unique_id+interaction_intactid: 0
Rows in simplemerged with duplicate seq_pair_id+interaction_intactid: 0


In [806]:
# Suffixes of the per-interactor "uniprot_<A|B>_<suffix>" columns that are
# blanked out when that interactor's sequence is replaced by its mutant.
_UNIPROT_MATCH_SUFFIXES = (
    "equalseq",
    "equalseq_canonical",
    "full",
    "inseq",
    "inseq_canonical",
    "noiso1",
)


def _bindsite_partner_fields(row, letter, num, mutated):
    """Build the output fields for one interactor (letter "A"/"B", num "1"/"2")."""
    accession = row[f"uniprot_{letter}"]
    chain_keys = (f"chain_seq_start_{num}", f"chain_seq_end_{num}")
    match_keys = [f"uniprot_{letter}_{suffix}" for suffix in _UNIPROT_MATCH_SUFFIXES]

    # Defaults: pass the row's current values through unchanged. Insertion
    # order here defines the order of the returned Series.
    fields = {
        f"aa_{num}": row[f"aa_{num}"],
        f"length_{num}": row[f"length_{num}"],
        f"uniprot_{letter}": accession,
    }
    for key in chain_keys:
        fields[key] = row[key]
    for key in match_keys:
        fields[key] = row[key]

    if mutated:
        mutant_seq = row[f"bindsite_mutated_aa_{num}"]
        fields[f"aa_{num}"] = mutant_seq
        fields[f"length_{num}"] = len(mutant_seq) if isinstance(mutant_seq, str) else 0
        fields[f"uniprot_{letter}"] = (
            f"{accession}_bindsite_mut" if isinstance(accession, str) else None
        )
        # Chain coordinates and UniProt sequence-match annotations described
        # the original sequence, so they are cleared for the mutant.
        for key in (*chain_keys, *match_keys):
            fields[key] = np.nan

    return fields


def convert_bindsite_cols_to_ppi(row):
    """
    Convert bindsite columns to ppi.

    For each interactor named in ``row["Binding Site Partner"]`` ("A" and/or
    "B" appearing in the string), the sequence is replaced with
    ``bindsite_mutated_aa_<1|2>``, the length is recomputed (0 when the mutant
    sequence is not a string), the UniProt accession gets a "_bindsite_mut"
    suffix (None when the accession is not a string), and the chain
    coordinates and UniProt match columns are set to NaN. Interactors that
    are not named keep their current values.

    Args:
        row: A row (pandas Series or other mapping) holding the per-interactor
            columns, "Binding Site Partner", and the
            ``bindsite_mutated_aa_<1|2>`` column of any named partner.

    Returns:
        pandas.Series with 22 entries: the 11 interactor-A fields followed by
        the 11 interactor-B fields, in the same order as ``change_cols``.

    Raises:
        KeyError: if a required column is missing from ``row``.
    """
    partner = row["Binding Site Partner"]
    # Only a real string can name a partner. Missing values may arrive as
    # NaN, None or pd.NA; the substring test below would raise on pd.NA.
    names_partner = isinstance(partner, str)

    converted = {}
    for letter, num in (("A", "1"), ("B", "2")):
        converted.update(
            _bindsite_partner_fields(
                row, letter, num, mutated=names_partner and letter in partner
            )
        )
    return pd.Series(converted)
    
# Columns of the merged frame that receive the output of
# convert_bindsite_cols_to_ppi. The order matches the Series that function
# returns: all interactor-A fields first, then all interactor-B fields.
change_cols = [
    template.format(letter=letter, num=num)
    for letter, num in (("A", "1"), ("B", "2"))
    for template in (
        "aa_{num}",
        "length_{num}",
        "uniprot_{letter}",
        "chain_seq_start_{num}",
        "chain_seq_end_{num}",
        "uniprot_{letter}_equalseq",
        "uniprot_{letter}_equalseq_canonical",
        "uniprot_{letter}_full",
        "uniprot_{letter}_inseq",
        "uniprot_{letter}_inseq_canonical",
        "uniprot_{letter}_noiso1",
    )
]
    

In [807]:
## Negative pairs: binding-site rows whose NEW (binding-site-mutated) sequence
## pair is recorded as not binding, i.e. "Binding decisive_seqpair_new_binds" == "no".
neg_new_from_binding_data = merged_expl_binding_filt.loc[
    merged_expl_binding_filt["Binding decisive_seqpair_new_binds"] == "no"
].reset_index(drop=True)
neg_new_from_binding_data["Binding Partner Status"] = ["binding"]*len(neg_new_from_binding_data)
print(f"Total rows where Binding decisive_seqpair_new_binds==no: {len(neg_new_from_binding_data)}")

# what columns are in common?
common_cols = list(set(neg_new_from_binding_data.columns).intersection(set(simplemerged.columns)))

# Inner-merge on every shared column to attach simplemerged's columns to the
# binding-site rows.
neg_new_from_binding_data = pd.merge(
    simplemerged,
    neg_new_from_binding_data,
    on=common_cols,
    how="inner"
)

# Swap in the binding-site-mutated sequence/identifiers for the named partner(s).
neg_new_from_binding_data[
    change_cols
] = neg_new_from_binding_data.apply(convert_bindsite_cols_to_ppi, axis=1)

# seq_sort must be (re)computed AFTER the sequences were swapped and BEFORE it
# is used below. Otherwise the counts rely on a stale seq_sort carried over
# from the merge (built from the unmutated sequences), or fail with a KeyError
# when no such column exists yet.
neg_new_from_binding_data["seq_sort"] = neg_new_from_binding_data.apply(lambda row: get_unique_id(row, colA="aa_1",colB="aa_2"), axis=1)
simplemerged["seq_sort"] = simplemerged.apply(lambda row: get_unique_id(row, colA="aa_1",colB="aa_2"), axis=1)
print(f"Total unique sequences where Binding decisive_seqpair_new_binds==no: {len(neg_new_from_binding_data.drop_duplicates('seq_sort'))}")

# check whether any of the new negative sequence pairs already exist as positives
test1 = simplemerged["seq_sort"].unique().tolist()
test1 = len(neg_new_from_binding_data.loc[
    neg_new_from_binding_data["seq_sort"].isin(test1)
])
print(f"Total negative Binding interactions that are currently positive in our PPI data: {test1}")

# check if we already had these in negative (that's fine, just less exciting!)
test0 = simplemerged_neg_ptm_and_mut["seq_sort"].unique().tolist()
test1 = len(neg_new_from_binding_data.loc[
    neg_new_from_binding_data["seq_sort"].isin(test0)
])
print(f"Total negative Binding interaction rows that are currently NEGATIVE in our filtered PPI data: {test1}")
test1 = len(neg_new_from_binding_data.loc[
    neg_new_from_binding_data["seq_sort"].isin(test0)
].drop_duplicates("seq_sort"))
print(f"Total unique negative Binding interaction SEQUENCES that are currently NEGATIVE in our filtered PPI data: {test1}")

# check overlap with the unknown set (these rows are negatives, so the label
# says "negative", not "unknown")
test1 = simplemerged_ptm_and_mut_unknown["seq_sort"].unique().tolist()
test1 = len(neg_new_from_binding_data.loc[
    neg_new_from_binding_data["seq_sort"].isin(test1)
])
print(f"Total negative Binding interactions that are currently in our unknown PPI+mut data: {test1}")



Total rows where Binding decisive_seqpair_new_binds==no: 12870
Total unique sequences where Binding decisive_seqpair_new_binds==no: 2837
Total negative Binding interactions that are currently positive in our PPI data: 1
Total negative Binding interaction rows that are currently NEGATIVE in our filtered PPI data: 11
Total unique negative Binding interaction SEQUENCES that are currently NEGATIVE in our filtered PPI data: 7
Total unknown binding interactions that are currently in our unknown negative PPI+mut data: 0


In [808]:
# Inspect the new negative sequence pairs that also occur in the positive set.
test1 = simplemerged["seq_sort"].unique().tolist()
neg_overlap_rows = neg_new_from_binding_data.loc[
    neg_new_from_binding_data["seq_sort"].isin(test1)
]
overlapping_neg = neg_overlap_rows["seq_sort"].tolist()

print(overlapping_neg)

# Positive rows sharing a sequence pair with a new negative ...
positive_overlap_rows = simplemerged.loc[simplemerged["seq_sort"].isin(overlapping_neg)]
display(positive_overlap_rows)

# ... and the matching negative rows, limited to sequences + binding columns.
binding_view_cols = ["aa_1", "aa_2"] + scraped_binding_cols + agg_binding_cols
display(neg_overlap_rows[binding_view_cols])

# Raises IndexError when there is no overlap at all.
og_seq = neg_overlap_rows["aa_2"].tolist()[0]

print(f"original: {og_seq}")
print(f"deleted binding site: {overlapping_neg[0]}")

['MDEPPFSEAALEQALGEPCDLDAALLTDIEDMLQLINNQDSDFPGLFDPPYAGSGAGGTDPASPDTSSPGSLSPPPATLSSSLEAFLSGPQAAPSPLSPPQPAPTPLKMYPSMPAFSPGPGIKEESVPLSILQTPTPQPLPGALLPQSFPAPAPPQFSSTPVLGYPSPPGGFSTGSPPGNTQQPLPGLPLASPPGVPPVSLHTQVQSVVPQQLLTVTAAPTAAPVTTTVTSQIQQVPVLLQPHFIKADSLLLTAMKTDGATVKAAGLSPLVSGTTVQTGPLPTLVSGGTILATVPLVVDAEKLPINRLAAGSKAPASAQSRGEKRTAHNAIEKRYRSSINDKIIELKDLVVGTEAKLNKSAVLRKAIDYIRFLQHSNQKLKQENLSLRTAVHKSKSLKDLVSACGSGGNTDVLMEGVKTEVEDTLTPPPSDAGSPFQSSPLSLGSRGSGSGGSGSDSEPDSPVFEDSKAKPEQRPSLHSRGMLDRSRLAL_METPSQRRATRSGAQASSTPLSPTRITRLQEKEDLQELNDRLAVYIDRVRSLETENAGLRLRITESEEVVSREVSGIKAAYEAELGDARKTLDSVAKERARLQLELSKVREEFKELKARNTKKEGDLIAAQARLKDLEALLNSKEAALSTALSEKRTLEGELHDLRGQVAKLEAALGEAKKQLQDEMLRRVDAENRLQTMKEELDFQKNIYSEELRETKRRHETRLVEIDNGKQREFESRLADALQELRAQHEDQVEQYKKELEKTYSAKLDNARQSAERNSNLVGAAHEELQQSRIRIDSLSAQLSQLQKQLAAKEAKLRDLEDSLARERDTSRRLLAEKEREMAEMRARMQQQLDEYQELLDIKLALDMEIHAYRKLLEGEEERLRLSPSPTSQRSRGRASSHSSQTQGGGSVTKKRKLESTESRSSFSQHARTSGRVAVEEVDEEGKFVRLRNKSNEDQSMGNWQIKRQNGDDPLLTYRFPPKFTLKAGQVVTIWAAGAGATHS

Unnamed: 0,Negative,aa_1,aa_2,invalids_aa_1,invalids_aa_2,all_intact_A_sorted,all_intact_B_sorted,chain_seq_end_1,chain_seq_end_2,chain_seq_start_1,...,all_binding_mi_1,all_binding_name_1,all_binding_short_1,all_binding_begin_1,all_binding_end_1,all_binding_mi_2,all_binding_name_2,all_binding_short_2,all_binding_begin_2,all_binding_end_2
180957,False,MDEPPFSEAALEQALGEPCDLDAALLTDIEDMLQLINNQDSDFPGL...,METPSQRRATRSGAQASSTPLSPTRITRLQEKEDLQELNDRLAVYI...,,,intact:EBI-22057616,intact:EBI-9034379,490,,1,...,MI:0442,sufficient binding region,region,227,487,,,,,
180958,False,MDEPPFSEAALEQALGEPCDLDAALLTDIEDMLQLINNQDSDFPGL...,METPSQRRATRSGAQASSTPLSPTRITRLQEKEDLQELNDRLAVYI...,,,intact:EBI-22057616,intact:EBI-9034379,490,,1,...,MI:0442,sufficient binding region,region,227,487,,,,,


Unnamed: 0,aa_1,aa_2,binding_begin_1,binding_begin_2,binding_end_1,binding_end_2,binding_mi_1,binding_mi_2,binding_name_1,binding_name_2,...,Binding Figure legend(s),Binding Interaction AC,Binding Interaction participants,Binding Interactor Matches,Binding Original sequence,Binding PubMedID,Binding Resulting sequence,Binding Xref ID(s),Binding new_binds_bo_feature_type,Binding og_binds_bo_feature_type
5165,METPSQRRATRSGAQASSTPLSPTRITRLQEKEDLQELNDRLAVYI...,MDEPPFSEAALEQALGEPCDLDAALLTDIEDMLQLINNQDSDFPGL...,607,,656,,MI:0429,,necessary binding region,,...,"figure legend:Fig. 1d, 1e, 3b",intact:EBI-22057623,"(uniprotkb:P02545-1(psi-mi:""MI:0326""(protein))...",A,VGGPISSGSSASSVTVTRSYRSVGGSGGGSFGDNLVTRSYLLGNSS...,pubmed:21993218|imex:IM-27410,-,-,no,yes


original: MDEPPFSEAALEQALGEPCDLDAALLTDIEDMLQLINNQDSDFPGLFDPPYAGSGAGGTDPASPDTSSPGSLSPPPATLSSSLEAFLSGPQAAPSPLSPPQPAPTPLKMYPSMPAFSPGPGIKEESVPLSILQTPTPQPLPGALLPQSFPAPAPPQFSSTPVLGYPSPPGGFSTGSPPGNTQQPLPGLPLASPPGVPPVSLHTQVQSVVPQQLLTVTAAPTAAPVTTTVTSQIQQVPVLLQPHFIKADSLLLTAMKTDGATVKAAGLSPLVSGTTVQTGPLPTLVSGGTILATVPLVVDAEKLPINRLAAGSKAPASAQSRGEKRTAHNAIEKRYRSSINDKIIELKDLVVGTEAKLNKSAVLRKAIDYIRFLQHSNQKLKQENLSLRTAVHKSKSLKDLVSACGSGGNTDVLMEGVKTEVEDTLTPPPSDAGSPFQSSPLSLGSRGSGSGGSGSDSEPDSPVFEDSKAKPEQRPSLHSRGMLDRSRLAL
deleted binding site: MDEPPFSEAALEQALGEPCDLDAALLTDIEDMLQLINNQDSDFPGLFDPPYAGSGAGGTDPASPDTSSPGSLSPPPATLSSSLEAFLSGPQAAPSPLSPPQPAPTPLKMYPSMPAFSPGPGIKEESVPLSILQTPTPQPLPGALLPQSFPAPAPPQFSSTPVLGYPSPPGGFSTGSPPGNTQQPLPGLPLASPPGVPPVSLHTQVQSVVPQQLLTVTAAPTAAPVTTTVTSQIQQVPVLLQPHFIKADSLLLTAMKTDGATVKAAGLSPLVSGTTVQTGPLPTLVSGGTILATVPLVVDAEKLPINRLAAGSKAPASAQSRGEKRTAHNAIEKRYRSSINDKIIELKDLVVGTEAKLNKSAVLRKAIDYIRFLQHSNQKLKQENLSLRTAVHKSKSLKDLVSACGSGGNTDVLMEGVKTEVEDTLTPPPSDAGSPFQSSPLSLGSRGSGSGGSGSDSEPDSPVFEDSKAKPEQRPSL

In [809]:
## Unknown pairs: binding-site rows where "Binding decisive_seqpair_new_binds" is "unknown"
is_unknown = merged_expl_binding_filt["Binding decisive_seqpair_new_binds"] == "unknown"
unknown_new_from_binding_data = merged_expl_binding_filt.loc[is_unknown].reset_index(drop=True)
unknown_new_from_binding_data["Binding Partner Status"] = ["binding"] * len(unknown_new_from_binding_data)
print(f"Total rows where Binding decisive_seqpair_new_binds==unknown: {len(unknown_new_from_binding_data)}")

# Merge on every column the two frames share.
common_cols = list(set(unknown_new_from_binding_data.columns) & set(simplemerged.columns))
unknown_new_from_binding_data = pd.merge(
    simplemerged,
    unknown_new_from_binding_data,
    on=common_cols,
    how="inner",
)

if len(unknown_new_from_binding_data) > 0:
    # Swap in the binding-site-mutated sequence/identifiers, then rebuild the
    # order-independent sequence-pair key from the swapped sequences.
    unknown_new_from_binding_data[change_cols] = unknown_new_from_binding_data.apply(
        convert_bindsite_cols_to_ppi, axis=1
    )
    unknown_new_from_binding_data["seq_sort"] = unknown_new_from_binding_data.apply(
        lambda row: get_unique_id(row, colA="aa_1", colB="aa_2"), axis=1
    )

simplemerged["seq_sort"] = simplemerged.apply(
    lambda row: get_unique_id(row, colA="aa_1", colB="aa_2"), axis=1
)

# Overlap with the positive PPI set: row count, then distinct sequence pairs.
test0 = simplemerged["seq_sort"].unique().tolist()
in_positive = unknown_new_from_binding_data.loc[
    unknown_new_from_binding_data["seq_sort"].isin(test0)
]
test1 = len(in_positive)
print(f"Total unknown binding interactions that are currently positive in our PPI data: {test1}")
test1 = len(in_positive.drop_duplicates("seq_sort"))
print(f"\tTotal sequences: {test1}")

# Overlap (row counts) with the positive, negative and unknown PPI+mut sets.
for reference, label in (
    (simplemerged_ptm_and_mut,
     "Total unknown binding interactions that are currently positive in our filtered PPI+mut data"),
    (simplemerged_neg_ptm_and_mut,
     "Total unknown binding interactions that are currently in our filtered negative PPI+mut data"),
    (simplemerged_ptm_and_mut_unknown,
     "Total unknown binding interactions that are currently in our unknown negative PPI+mut data"),
):
    reference_pairs = reference["seq_sort"].unique().tolist()
    test1 = len(unknown_new_from_binding_data.loc[
        unknown_new_from_binding_data["seq_sort"].isin(reference_pairs)
    ])
    print(f"{label}: {test1}")

Total rows where Binding decisive_seqpair_new_binds==unknown: 501948
Total unknown binding interactions that are currently positive in our PPI data: 3
	Total sequences: 3
Total unknown binding interactions that are currently positive in our filtered PPI+mut data: 9
Total unknown binding interactions that are currently in our filtered negative PPI+mut data: 3
Total unknown binding interactions that are currently in our unknown negative PPI+mut data: 6


In [810]:
# Remove from the positive set every sequence pair that the binding-site data
# marks as negative.
neg_seq_pairs = neg_new_from_binding_data["seq_sort"].unique().tolist()
print(f"Size of simplemerged_ptm_and_mut before we remove erroneous positives (should be negative or unknown): {len(simplemerged_ptm_and_mut)}")

is_known_negative = simplemerged_ptm_and_mut["seq_sort"].isin(neg_seq_pairs)
simplemerged_ptm_and_mut_and_bindsite = simplemerged_ptm_and_mut.loc[~is_known_negative]
print(f"Size of simplemerged_ptm_and_mut_and_bindsite after we remove erroneous positives (should be negative or unknown): {len(simplemerged_ptm_and_mut_and_bindsite)}")

Size of simplemerged_ptm_and_mut before we remove erroneous positives (should be negative or unknown): 799352
Size of simplemerged_ptm_and_mut_and_bindsite after we remove erroneous positives (should be negative or unknown): 799348


In [811]:
# Add the binding-site-derived negatives to the negative dataset.
neg_new_from_binding_data["Negative"] = True
simplemerged_neg_ptm_and_mut_and_bindsite = pd.concat(
    [simplemerged_neg_ptm_and_mut, neg_new_from_binding_data],
    ignore_index=True,
)
# Drop rows that are identical across all columns, then renumber.
simplemerged_neg_ptm_and_mut_and_bindsite = simplemerged_neg_ptm_and_mut_and_bindsite.drop_duplicates().reset_index(drop=True)
# The message previously said "new positives found from ptms" (copy-paste);
# this cell adds negatives found from binding sites.
print(f"Size of simplemerged_neg_ptm_and_mut_and_bindsite after we concatenate new negatives found from binding sites: {len(simplemerged_neg_ptm_and_mut_and_bindsite)}")

Size of simplemerged_neg_ptm_and_mut_and_bindsite after we concatenate new positives found from ptms (both from positive and negative initial datasets): 768483


In [812]:
# add the other binding site mutants to the unknowns dataset
# NOTE(review): unknown rows are flagged Negative=False here; confirm that is
# the intended encoding for "unknown".
unknown_new_from_binding_data["Negative"] = False
simplemerged_ptm_and_mut_and_bindsite_unknown = pd.concat(
    [simplemerged_ptm_and_mut_unknown, unknown_new_from_binding_data],
    ignore_index=True,
)
# Drop rows that are identical across all columns, then renumber.
simplemerged_ptm_and_mut_and_bindsite_unknown = simplemerged_ptm_and_mut_and_bindsite_unknown.drop_duplicates().reset_index(drop=True)
# The message previously said "new positives found from ptms" (copy-paste);
# this cell adds unknowns found from binding sites.
print(f"Size of simplemerged_ptm_and_mut_and_bindsite_unknown after we concatenate new unknowns found from binding sites: {len(simplemerged_ptm_and_mut_and_bindsite_unknown)}")

Size of simplemerged_ptm_and_mut_and_bindsite_unknown after we concatenate new positives found from ptms (both from positive and negative initial datasets): 534448


In [813]:
### Save the datasets as they are now
# Write the positive, negative and unknown datasets (after the binding-site
# step) to the intermediate directory.
savedir = "/scratch/pranamlab/sophie/interactome/interactome/data_files/processed/intact/intermediate"
os.makedirs(savedir, exist_ok=True)

for frame, stem in (
    (simplemerged_ptm_and_mut_and_bindsite, "simplemerged_ptm_and_mut_and_bindsite"),
    (simplemerged_neg_ptm_and_mut_and_bindsite, "simplemerged_neg_ptm_and_mut_and_bindsite"),
    (simplemerged_ptm_and_mut_and_bindsite_unknown, "simplemerged_ptm_and_mut_and_bindsite_unknown"),
):
    frame.to_csv(f"{savedir}/{stem}_dec11_2025.csv", index=False)

In [814]:
# Count the binding-related columns in each dataset.
# NOTE(review): the positive dataset is matched on the "all_bind" prefix while
# the unknown/negative datasets are matched on "bind" - confirm this is intended.
print(sum(1 for col in simplemerged_ptm_and_mut_and_bindsite.columns if col.startswith(("Bind", "all_bind"))))
print(sum(1 for col in simplemerged_ptm_and_mut_and_bindsite_unknown.columns if col.startswith(("Bind", "bind"))))
print(sum(1 for col in simplemerged_neg_ptm_and_mut_and_bindsite.columns if col.startswith(("Bind", "bind"))))

10
46
46


# Final filtering on Interaction MI
* Save one version restricted to interaction types in the physical association (MI:0915) subtree
* Save one version restricted to interaction types in the direct interaction (MI:0407) subtree
* Make sure there is NO sequence-pair overlap between the positive, negative, and unknown datasets

## Data load

In [4]:
# Load the controlled-vocabulary subtree for MI:0915 (physical association).
mi_0915_path = "/scratch/pranamlab/sophie/interactome/interactome/data_files/processed/intact/cv/mi_0915_subtree.csv"
mi_0915_subtree = pd.read_csv(mi_0915_path)
print(f"Total size of MI:0915 (physical association) subtree: {len(mi_0915_subtree)}")
mi_0915_subtree.head()

Total size of MI:0915 (physical association) subtree: 72


Unnamed: 0,label,id,parent_id,parent_ids_all,parent_names_all
0,physical association,MI:0915,,MI:0914,association
1,direct interaction,MI:0407,MI:0915,MI:0915,physical association
2,covalent binding,MI:0195,MI:0407,MI:0407,direct interaction
3,disulfide bond,MI:0408,MI:0195,MI:0195,covalent binding
4,transglutamination reaction,MI:0556,MI:0195,MI:0195,covalent binding


In [5]:
# Load the controlled-vocabulary subtree for MI:0407 (direct interaction).
mi_0407_path = "/scratch/pranamlab/sophie/interactome/interactome/data_files/processed/intact/cv/mi_0407_subtree.csv"
mi_0407_subtree = pd.read_csv(mi_0407_path)
print(f"Total size of MI:0407 (direct interaction) subtree: {len(mi_0407_subtree)}")
mi_0407_subtree.head()

Total size of MI:0407 (direct interaction) subtree: 71


Unnamed: 0,label,id,parent_id,parent_ids_all,parent_names_all
0,direct interaction,MI:0407,,MI:0915,physical association
1,covalent binding,MI:0195,MI:0407,MI:0407,direct interaction
2,disulfide bond,MI:0408,MI:0195,MI:0195,covalent binding
3,transglutamination reaction,MI:0556,MI:0195,MI:0195,covalent binding
4,enzymatic reaction,MI:0414,MI:0407,MI:0407,direct interaction


In [6]:
# Reading the bindsites table needs a larger csv field size limit.
# Start from a big number (sys.maxsize may OverflowError on some platforms)
# and divide by 10 until the platform accepts the value.
limit = 10**9
while True:
    try:
        csv.field_size_limit(limit)
    except OverflowError:
        limit //= 10
    else:
        break

In [7]:
def _prefix_intact(ac):
    """Return `ac` with an "intact:" prefix; non-strings and already-prefixed values pass through."""
    if type(ac) is str and not ac.startswith("intact:"):
        return "intact:" + ac
    return ac


# Raw feature tables (tab-separated, read with the python engine), with
# "Interaction AC" normalised to carry the "intact:" prefix.
ptms_path = "data_files/raw/intact/psimitab/features/ptms.tsv"
ptms = pd.read_csv(ptms_path, sep="\t", engine="python")
ptms["Interaction AC"] = ptms["Interaction AC"].apply(_prefix_intact)

mutations_path = "data_files/raw/intact/psimitab/features/mutations.tsv"
mutations = pd.read_csv(mutations_path, sep="\t", engine="python")
mutations["Interaction AC"] = mutations["Interaction AC"].apply(_prefix_intact)

bindsites_path = "data_files/raw/intact/psimitab/features/bindings_regions.tsv"
bindsites = pd.read_csv(bindsites_path, sep="\t", engine="python")
bindsites["Interaction AC"] = bindsites["Interaction AC"].apply(_prefix_intact)

# Analyzed/labeled feature tables.
analyzed_mods_dir = "/scratch/pranamlab/sophie/interactome/interactome/data_files/processed/intact/features_analyzed"
bindsite_types_labeled = pd.read_csv(f"{analyzed_mods_dir}/bindsite_types_analyzed.csv")
mutation_feature_ac_labeled = pd.read_csv(f"{analyzed_mods_dir}/mutation_feature_ac_analyzed.csv")
mutation_feature_annotations_labeled = pd.read_csv(f"{analyzed_mods_dir}/mutation_feature_annotations_analyzed.csv")
mutation_feature_types_labeled = pd.read_csv(f"{analyzed_mods_dir}/mutation_feature_types_analyzed.csv")
ptm_feature_types_labeled = pd.read_csv(f"{analyzed_mods_dir}/ptm_feature_types_analyzed.csv")
ptm_feature_annotations_labeled = pd.read_csv(f"{analyzed_mods_dir}/ptm_feature_annotations_analyzed.csv")

# MI subtrees of accepted terms for interactions, mutations, binding sites and PTMs.
cv_dir = "/scratch/pranamlab/sophie/interactome/interactome/data_files/processed/intact/cv"
interaction_milabel_ok = pd.read_csv(f"{cv_dir}/mi_0190_subtree.csv")
mutation_mi_ok = pd.read_csv(f"{cv_dir}/mi_0118_subtree.csv")
bindsite_mi_ok = pd.read_csv(f"{cv_dir}/mi_0117_subtree.csv")
ptm_mi_ok = pd.read_csv(f"{cv_dir}/mi_0925_subtree.csv")

In [8]:
# Reload the intermediate files written by earlier steps so work can resume
# from here. This first pass uses pandas' inferred dtypes.
savedir = "/scratch/pranamlab/sophie/interactome/interactome/data_files/processed/intact/intermediate"
os.makedirs(savedir, exist_ok=True)


def _load_intermediate(stem):
    """Read `<savedir>/<stem>_dec11_2025.csv` with inferred dtypes."""
    return pd.read_csv(f"{savedir}/{stem}_dec11_2025.csv")


intact = _load_intermediate("intact_post_mutation_processing")
intact_clust = _load_intermediate("intact_clust_post_mutation_processing")

merged = _load_intermediate("merged_post_mutation_processing")
# NOTE(review): merged_neg is read from the same file as merged - confirm
# whether a separate negative file was intended.
merged_neg = _load_intermediate("merged_post_mutation_processing")

my_pos = _load_intermediate("my_pos_post_mutation_processing")
my_neg = _load_intermediate("my_neg_post_mutation_processing")

simplemerged_ptm_and_mut_and_bindsite = _load_intermediate("simplemerged_ptm_and_mut_and_bindsite")
simplemerged_ptm_and_mut_and_bindsite_unknown = _load_intermediate("simplemerged_ptm_and_mut_and_bindsite_unknown")
simplemerged_neg_ptm_and_mut_and_bindsite = _load_intermediate("simplemerged_neg_ptm_and_mut_and_bindsite")


  intact = pd.read_csv(f"{savedir}/intact_post_mutation_processing_dec11_2025.csv")
  intact_clust = pd.read_csv(f"{savedir}/intact_clust_post_mutation_processing_dec11_2025.csv")
  merged = pd.read_csv(f"{savedir}/merged_post_mutation_processing_dec11_2025.csv")
  merged_neg = pd.read_csv(f"{savedir}/merged_post_mutation_processing_dec11_2025.csv")
  my_pos = pd.read_csv(f"{savedir}/my_pos_post_mutation_processing_dec11_2025.csv")
  my_neg = pd.read_csv(f"{savedir}/my_neg_post_mutation_processing_dec11_2025.csv")
  simplemerged_ptm_and_mut_and_bindsite = pd.read_csv(f"{savedir}/simplemerged_ptm_and_mut_and_bindsite_dec11_2025.csv")
  simplemerged_ptm_and_mut_and_bindsite_unknown = pd.read_csv(f"{savedir}/simplemerged_ptm_and_mut_and_bindsite_unknown_dec11_2025.csv")
  simplemerged_neg_ptm_and_mut_and_bindsite = pd.read_csv(f"{savedir}/simplemerged_neg_ptm_and_mut_and_bindsite_dec11_2025.csv")


In [9]:
def _dtype_map(columns, overrides):
    """Map every column to "string", then apply the explicit per-column overrides."""
    dtypes = {col: "string" for col in columns}
    dtypes.update(overrides)
    return dtypes


# Shared overrides for the merged-style tables.
_MERGED_OVERRIDES = {
    "Negative": "bool",
    "length_1": "int",
    "length_2": "int",
    "miscore": "float",
    "year": "int",
    "confidence_val_int": "float",
    "unique_score_int": "float",
}

# A float override for intact "miscore" is currently disabled.
intact_dtypes = _dtype_map(intact.columns, {"Negative": "bool"})

intact_clust_dtypes = _dtype_map(
    intact_clust.columns,
    {"Negative": "bool", "miscore": "float", "equal_score_int": "bool"},
)

# simplemerged pos and the other two.
# This one map is used to read the positive, unknown AND negative files, so it
# is built from the union of their columns. Built from the positive frame
# alone, columns that exist only in the unknown/negative files had no declared
# dtype and were left to type inference when those files were read.
simplemerged_all_columns = list(dict.fromkeys(
    col
    for frame in (
        simplemerged_ptm_and_mut_and_bindsite,
        simplemerged_ptm_and_mut_and_bindsite_unknown,
        simplemerged_neg_ptm_and_mut_and_bindsite,
    )
    for col in frame.columns
))
# bool overrides for scraped_mut_has_info, scraped_mut_has_info_1,
# scraped_mut_has_info_2 and agg_mut_has_info are currently disabled.
simplemerged_dtypes = _dtype_map(simplemerged_all_columns, _MERGED_OVERRIDES)

merged_dtypes = _dtype_map(merged.columns, _MERGED_OVERRIDES)

my_pos_dtypes = _dtype_map(
    my_pos.columns,
    {"length_1": "int", "length_2": "int", "year": "int"},
)

# A bool override for my_neg "Negative" is currently disabled.
my_neg_dtypes = _dtype_map(
    my_neg.columns,
    {"miscore": "float", "length_1": "int", "length_2": "int", "year": "int"},
)


In [10]:
# Reload the intermediate files, this time with the explicit dtype maps.
savedir = "/scratch/pranamlab/sophie/interactome/interactome/data_files/processed/intact/intermediate"
os.makedirs(savedir, exist_ok=True)


def _load_intermediate_typed(stem, dtypes):
    """Read `<savedir>/<stem>_dec11_2025.csv` using the given column dtypes."""
    return pd.read_csv(f"{savedir}/{stem}_dec11_2025.csv", dtype=dtypes)


intact = _load_intermediate_typed("intact_post_mutation_processing", intact_dtypes)
intact_clust = _load_intermediate_typed("intact_clust_post_mutation_processing", intact_clust_dtypes)

merged = _load_intermediate_typed("merged_post_mutation_processing", merged_dtypes)
# NOTE(review): merged_neg is read from the same file as merged - confirm
# whether a separate negative file was intended.
merged_neg = _load_intermediate_typed("merged_post_mutation_processing", merged_dtypes)

my_pos = _load_intermediate_typed("my_pos_post_mutation_processing", my_pos_dtypes)
my_neg = _load_intermediate_typed("my_neg_post_mutation_processing", my_neg_dtypes)

# The positive, unknown and negative datasets all use simplemerged_dtypes.
simplemerged_ptm_and_mut_and_bindsite = _load_intermediate_typed(
    "simplemerged_ptm_and_mut_and_bindsite", simplemerged_dtypes
)
simplemerged_ptm_and_mut_and_bindsite_unknown = _load_intermediate_typed(
    "simplemerged_ptm_and_mut_and_bindsite_unknown", simplemerged_dtypes
)
simplemerged_neg_ptm_and_mut_and_bindsite = _load_intermediate_typed(
    "simplemerged_neg_ptm_and_mut_and_bindsite", simplemerged_dtypes
)


  simplemerged_ptm_and_mut_and_bindsite_unknown = pd.read_csv(f"{savedir}/simplemerged_ptm_and_mut_and_bindsite_unknown_dec11_2025.csv",dtype=simplemerged_dtypes)
  simplemerged_neg_ptm_and_mut_and_bindsite = pd.read_csv(f"{savedir}/simplemerged_neg_ptm_and_mut_and_bindsite_dec11_2025.csv",dtype=simplemerged_dtypes)


## Processing

In [11]:
# Preview the MI:0407 subtree table next to the interaction-type columns of the
# simplemerged table, to see how the two line up.
display(mi_0407_subtree.head())
_interaction_cols = ["interaction_mi", "interaction_label"]
display(simplemerged_ptm_and_mut_and_bindsite[_interaction_cols].head())

Unnamed: 0,label,id,parent_id,parent_ids_all,parent_names_all
0,direct interaction,MI:0407,,MI:0915,physical association
1,covalent binding,MI:0195,MI:0407,MI:0407,direct interaction
2,disulfide bond,MI:0408,MI:0195,MI:0195,covalent binding
3,transglutamination reaction,MI:0556,MI:0195,MI:0195,covalent binding
4,enzymatic reaction,MI:0414,MI:0407,MI:0407,direct interaction


Unnamed: 0,interaction_mi,interaction_label
0,MI:1110,predicted interaction
1,MI:1110,predicted interaction
2,MI:1110,predicted interaction
3,MI:1110,predicted interaction
4,MI:1110,predicted interaction


In [12]:
# Filter to direct interactions only, based on the MI:0407 subtree.
mi_0407_ids = mi_0407_subtree["id"].unique().tolist()
print(f"Total MI:0407 ids: {len(mi_0407_ids)}")

# For each of the three tables keep the rows whose interaction type is in the
# subtree AND whose aa_1 / aa_2 are both non-null. All three conditions are
# evaluated on the same source table in a single step, so the aa filter always
# lands on the MI-filtered result and the source tables are left unmodified.
mi0407_simplemerged_ptm_and_mut_and_bindsite = simplemerged_ptm_and_mut_and_bindsite.loc[
    simplemerged_ptm_and_mut_and_bindsite["interaction_mi"].isin(mi_0407_ids)
    & simplemerged_ptm_and_mut_and_bindsite["aa_1"].notna()
    & simplemerged_ptm_and_mut_and_bindsite["aa_2"].notna()
].reset_index(drop=True)
print(f"Size of mi0407_simplemerged_ptm_and_mut_and_bindsite: {len(mi0407_simplemerged_ptm_and_mut_and_bindsite)}")
print(f"\tUnique seq pairs: {len(mi0407_simplemerged_ptm_and_mut_and_bindsite.drop_duplicates('seq_sort'))}")

mi0407_simplemerged_ptm_and_mut_and_bindsite_unknown = simplemerged_ptm_and_mut_and_bindsite_unknown.loc[
    simplemerged_ptm_and_mut_and_bindsite_unknown["interaction_mi"].isin(mi_0407_ids)
    & simplemerged_ptm_and_mut_and_bindsite_unknown["aa_1"].notna()
    & simplemerged_ptm_and_mut_and_bindsite_unknown["aa_2"].notna()
].reset_index(drop=True)
print(f"Size of mi0407_simplemerged_ptm_and_mut_and_bindsite_unknown: {len(mi0407_simplemerged_ptm_and_mut_and_bindsite_unknown)}")
print(f"\tUnique seq pairs: {len(mi0407_simplemerged_ptm_and_mut_and_bindsite_unknown.drop_duplicates('seq_sort'))}")

mi0407_simplemerged_neg_ptm_and_mut_and_bindsite = simplemerged_neg_ptm_and_mut_and_bindsite.loc[
    simplemerged_neg_ptm_and_mut_and_bindsite["interaction_mi"].isin(mi_0407_ids)
    & simplemerged_neg_ptm_and_mut_and_bindsite["aa_1"].notna()
    & simplemerged_neg_ptm_and_mut_and_bindsite["aa_2"].notna()
].reset_index(drop=True)
print(f"Size of mi0407_simplemerged_neg_ptm_and_mut_and_bindsite: {len(mi0407_simplemerged_neg_ptm_and_mut_and_bindsite)}")
print(f"\tUnique seq pairs: {len(mi0407_simplemerged_neg_ptm_and_mut_and_bindsite.drop_duplicates('seq_sort'))}")

## check for overlap
pos = mi0407_simplemerged_ptm_and_mut_and_bindsite["seq_sort"].unique().tolist()
neg = mi0407_simplemerged_neg_ptm_and_mut_and_bindsite["seq_sort"].unique().tolist()
unknown = mi0407_simplemerged_ptm_and_mut_and_bindsite_unknown["seq_sort"].unique().tolist()

overlap_pos_neg = set(pos).intersection(set(neg))
overlap_pos_unknown = set(pos).intersection(set(unknown))
overlap_neg_unknown = set(neg).intersection(set(unknown))
print(f"\nTotal overlapping seq_pairs between pos and neg: {len(overlap_pos_neg)}")
print(f"Total overlapping seq_pairs between pos and unknown: {len(overlap_pos_unknown)}")
print(f"Total overlapping seq_pairs between neg and unknown: {len(overlap_neg_unknown)}")

## Let's remove the overlapping pairs: pos/neg rows whose seq_sort also appears in
## another table are moved into the unknown table.
# Masks are computed once, before any table is reassigned, and reused for both the
# "move to unknown" and the "drop from pos/neg" steps.
_pos_is_bad = mi0407_simplemerged_ptm_and_mut_and_bindsite["seq_sort"].isin(
    overlap_pos_neg | overlap_pos_unknown
)
_neg_is_bad = mi0407_simplemerged_neg_ptm_and_mut_and_bindsite["seq_sort"].isin(
    overlap_pos_neg | overlap_neg_unknown
)
pos_bad = mi0407_simplemerged_ptm_and_mut_and_bindsite.loc[_pos_is_bad]
neg_bad = mi0407_simplemerged_neg_ptm_and_mut_and_bindsite.loc[_neg_is_bad]

mi0407_simplemerged_ptm_and_mut_and_bindsite_unknown = pd.concat([
    mi0407_simplemerged_ptm_and_mut_and_bindsite_unknown,
    pos_bad,
    neg_bad
]).drop_duplicates().reset_index(drop=True)
mi0407_simplemerged_ptm_and_mut_and_bindsite = mi0407_simplemerged_ptm_and_mut_and_bindsite.loc[
    ~_pos_is_bad
].reset_index(drop=True)
mi0407_simplemerged_neg_ptm_and_mut_and_bindsite = mi0407_simplemerged_neg_ptm_and_mut_and_bindsite.loc[
    ~_neg_is_bad
].reset_index(drop=True)

## Now redo the stats
print(f"{'-'*100}\nDropped bad seq_sorts.\nMI:0407 (direct interaction) Final database sizes:")
print(f"Size of POSITIVE database, mi0407_simplemerged_ptm_and_mut_and_bindsite: {len(mi0407_simplemerged_ptm_and_mut_and_bindsite)}")
print(f"\tUnique seq pairs: {len(mi0407_simplemerged_ptm_and_mut_and_bindsite.drop_duplicates('seq_sort'))}")
print(f"Size of UNKNOWN database, mi0407_simplemerged_ptm_and_mut_and_bindsite_unknown: {len(mi0407_simplemerged_ptm_and_mut_and_bindsite_unknown)}")
print(f"\tUnique seq pairs: {len(mi0407_simplemerged_ptm_and_mut_and_bindsite_unknown.drop_duplicates('seq_sort'))}")
print(f"Size of NEGATIVE mi0407_simplemerged_neg_ptm_and_mut_and_bindsite: {len(mi0407_simplemerged_neg_ptm_and_mut_and_bindsite)}")
print(f"\tUnique seq pairs: {len(mi0407_simplemerged_neg_ptm_and_mut_and_bindsite.drop_duplicates('seq_sort'))}")

## check for overlap again on the cleaned tables
pos = mi0407_simplemerged_ptm_and_mut_and_bindsite["seq_sort"].unique().tolist()
neg = mi0407_simplemerged_neg_ptm_and_mut_and_bindsite["seq_sort"].unique().tolist()
unknown = mi0407_simplemerged_ptm_and_mut_and_bindsite_unknown["seq_sort"].unique().tolist()

overlap_pos_neg = set(pos).intersection(set(neg))
overlap_pos_unknown = set(pos).intersection(set(unknown))
overlap_neg_unknown = set(neg).intersection(set(unknown))
print(f"\nTotal overlapping seq_pairs between pos and neg: {len(overlap_pos_neg)}")
print(f"Total overlapping seq_pairs between pos and unknown: {len(overlap_pos_unknown)}")
print(f"Total overlapping seq_pairs between neg and unknown: {len(overlap_neg_unknown)}")

# How many are protein-peptide?
# test0 = unique seq pairs in the table, test1 = unique seq pairs where at least
# one partner has mol_type "peptide". Only the first line is preceded by a blank line.
for _prefix, _label, _table in (
    ("\n", "POSITIVE", mi0407_simplemerged_ptm_and_mut_and_bindsite),
    ("", "UNKNOWN", mi0407_simplemerged_ptm_and_mut_and_bindsite_unknown),
    ("", "NEGATIVE", mi0407_simplemerged_neg_ptm_and_mut_and_bindsite),
):
    test0 = len(_table.drop_duplicates("seq_sort"))
    test1 = len(_table.loc[
        (_table["mol_type_1"]=="peptide") |
        (_table["mol_type_2"]=="peptide")
    ].drop_duplicates("seq_sort"))
    print(f"{_prefix}Total {_label} peptide-protein pairs: {test1}/{test0} ({(test1/test0)*100:.2f}%)")

Total MI:0407 ids: 71
Size of mi0407_simplemerged_ptm_and_mut_and_bindsite: 50901
	Unique seq pairs: 33684
Size of mi0407_simplemerged_ptm_and_mut_and_bindsite_unknown: 171834
	Unique seq pairs: 141683
Size of mi0407_simplemerged_neg_ptm_and_mut_and_bindsite: 6083
	Unique seq pairs: 5376

Total overlapping seq_pairs between pos and neg: 3
Total overlapping seq_pairs between pos and unknown: 1092
Total overlapping seq_pairs between neg and unknown: 27
----------------------------------------------------------------------------------------------------
Dropped bad seq_sorts.
MI:0407 (physical interaction) Final database sizes:
Size of POSITIVE database, mi0407_simplemerged_ptm_and_mut_and_bindsite: 48233
	Unique seq pairs: 32590
Size of UNKNOWN database, mi0407_simplemerged_ptm_and_mut_and_bindsite_unknown: 174546
	Unique seq pairs: 141685
Size of NEGATIVE mi0407_simplemerged_neg_ptm_and_mut_and_bindsite: 6039
	Unique seq pairs: 5347

Total overlapping seq_pairs between pos and neg: 0
Tot

In [13]:
# Show the all_binding_name_1 column of the filtered MI:0407 positive table
mi0407_simplemerged_ptm_and_mut_and_bindsite.loc[:, "all_binding_name_1"]

0        binding-associated region
1                             <NA>
2        binding-associated region
3                             <NA>
4                             <NA>
                   ...            
48228    sufficient binding region
48229    sufficient binding region
48230    sufficient binding region
48231    sufficient binding region
48232    sufficient binding region
Name: all_binding_name_1, Length: 48233, dtype: string

In [14]:
# Filter to physical associations, based on the MI:0915 subtree.
mi_0915_ids = mi_0915_subtree["id"].unique().tolist()
print(f"Total MI:0915 ids: {len(mi_0915_ids)}")

# For each of the three tables keep the rows whose interaction type is in the
# subtree AND whose aa_1 / aa_2 are both non-null. All three conditions are
# evaluated on the same source table in a single step, so the aa filter always
# lands on the MI-filtered result and the source tables are left unmodified.
mi0915_simplemerged_ptm_and_mut_and_bindsite = simplemerged_ptm_and_mut_and_bindsite.loc[
    simplemerged_ptm_and_mut_and_bindsite["interaction_mi"].isin(mi_0915_ids)
    & simplemerged_ptm_and_mut_and_bindsite["aa_1"].notna()
    & simplemerged_ptm_and_mut_and_bindsite["aa_2"].notna()
].reset_index(drop=True)
print(f"Size of mi0915_simplemerged_ptm_and_mut_and_bindsite: {len(mi0915_simplemerged_ptm_and_mut_and_bindsite)}")
print(f"\tUnique seq pairs: {len(mi0915_simplemerged_ptm_and_mut_and_bindsite.drop_duplicates('seq_sort'))}")

mi0915_simplemerged_ptm_and_mut_and_bindsite_unknown = simplemerged_ptm_and_mut_and_bindsite_unknown.loc[
    simplemerged_ptm_and_mut_and_bindsite_unknown["interaction_mi"].isin(mi_0915_ids)
    & simplemerged_ptm_and_mut_and_bindsite_unknown["aa_1"].notna()
    & simplemerged_ptm_and_mut_and_bindsite_unknown["aa_2"].notna()
].reset_index(drop=True)
print(f"Size of mi0915_simplemerged_ptm_and_mut_and_bindsite_unknown: {len(mi0915_simplemerged_ptm_and_mut_and_bindsite_unknown)}")
print(f"\tUnique seq pairs: {len(mi0915_simplemerged_ptm_and_mut_and_bindsite_unknown.drop_duplicates('seq_sort'))}")

mi0915_simplemerged_neg_ptm_and_mut_and_bindsite = simplemerged_neg_ptm_and_mut_and_bindsite.loc[
    simplemerged_neg_ptm_and_mut_and_bindsite["interaction_mi"].isin(mi_0915_ids)
    & simplemerged_neg_ptm_and_mut_and_bindsite["aa_1"].notna()
    & simplemerged_neg_ptm_and_mut_and_bindsite["aa_2"].notna()
].reset_index(drop=True)
print(f"Size of mi0915_simplemerged_neg_ptm_and_mut_and_bindsite: {len(mi0915_simplemerged_neg_ptm_and_mut_and_bindsite)}")
print(f"\tUnique seq pairs: {len(mi0915_simplemerged_neg_ptm_and_mut_and_bindsite.drop_duplicates('seq_sort'))}")

## check for overlap
pos = mi0915_simplemerged_ptm_and_mut_and_bindsite["seq_sort"].unique().tolist()
neg = mi0915_simplemerged_neg_ptm_and_mut_and_bindsite["seq_sort"].unique().tolist()
unknown = mi0915_simplemerged_ptm_and_mut_and_bindsite_unknown["seq_sort"].unique().tolist()

overlap_pos_neg = set(pos).intersection(set(neg))
overlap_pos_unknown = set(pos).intersection(set(unknown))
overlap_neg_unknown = set(neg).intersection(set(unknown))
print(f"\nTotal overlapping seq_pairs between pos and neg: {len(overlap_pos_neg)}")
print(f"Total overlapping seq_pairs between pos and unknown: {len(overlap_pos_unknown)}")
print(f"Total overlapping seq_pairs between neg and unknown: {len(overlap_neg_unknown)}")

## Let's remove the overlapping pairs: pos/neg rows whose seq_sort also appears in
## another table are moved into the unknown table.
# Masks are computed once, before any table is reassigned, and reused for both the
# "move to unknown" and the "drop from pos/neg" steps.
_pos_is_bad = mi0915_simplemerged_ptm_and_mut_and_bindsite["seq_sort"].isin(
    overlap_pos_neg | overlap_pos_unknown
)
_neg_is_bad = mi0915_simplemerged_neg_ptm_and_mut_and_bindsite["seq_sort"].isin(
    overlap_pos_neg | overlap_neg_unknown
)
pos_bad = mi0915_simplemerged_ptm_and_mut_and_bindsite.loc[_pos_is_bad]
neg_bad = mi0915_simplemerged_neg_ptm_and_mut_and_bindsite.loc[_neg_is_bad]

mi0915_simplemerged_ptm_and_mut_and_bindsite_unknown = pd.concat([
    mi0915_simplemerged_ptm_and_mut_and_bindsite_unknown,
    pos_bad,
    neg_bad
]).drop_duplicates().reset_index(drop=True)
mi0915_simplemerged_ptm_and_mut_and_bindsite = mi0915_simplemerged_ptm_and_mut_and_bindsite.loc[
    ~_pos_is_bad
].reset_index(drop=True)
mi0915_simplemerged_neg_ptm_and_mut_and_bindsite = mi0915_simplemerged_neg_ptm_and_mut_and_bindsite.loc[
    ~_neg_is_bad
].reset_index(drop=True)

## Now redo the stats
print(f"{'-'*100}\nDropped bad seq_sorts.\nMI:0915 (physical interaction) Final database sizes:")
print(f"Size of POSITIVE database, mi0915_simplemerged_ptm_and_mut_and_bindsite: {len(mi0915_simplemerged_ptm_and_mut_and_bindsite)}")
print(f"\tUnique seq pairs: {len(mi0915_simplemerged_ptm_and_mut_and_bindsite.drop_duplicates('seq_sort'))}")
print(f"Size of UNKNOWN database, mi0915_simplemerged_ptm_and_mut_and_bindsite_unknown: {len(mi0915_simplemerged_ptm_and_mut_and_bindsite_unknown)}")
print(f"\tUnique seq pairs: {len(mi0915_simplemerged_ptm_and_mut_and_bindsite_unknown.drop_duplicates('seq_sort'))}")
print(f"Size of NEGATIVE mi0915_simplemerged_neg_ptm_and_mut_and_bindsite: {len(mi0915_simplemerged_neg_ptm_and_mut_and_bindsite)}")
print(f"\tUnique seq pairs: {len(mi0915_simplemerged_neg_ptm_and_mut_and_bindsite.drop_duplicates('seq_sort'))}")

## check for overlap again on the cleaned tables
pos = mi0915_simplemerged_ptm_and_mut_and_bindsite["seq_sort"].unique().tolist()
neg = mi0915_simplemerged_neg_ptm_and_mut_and_bindsite["seq_sort"].unique().tolist()
unknown = mi0915_simplemerged_ptm_and_mut_and_bindsite_unknown["seq_sort"].unique().tolist()

overlap_pos_neg = set(pos).intersection(set(neg))
overlap_pos_unknown = set(pos).intersection(set(unknown))
overlap_neg_unknown = set(neg).intersection(set(unknown))
print(f"\nTotal overlapping seq_pairs between pos and neg: {len(overlap_pos_neg)}")
print(f"Total overlapping seq_pairs between pos and unknown: {len(overlap_pos_unknown)}")
print(f"Total overlapping seq_pairs between neg and unknown: {len(overlap_neg_unknown)}")

# How many are protein-peptide?
# test0 = unique seq pairs in the table, test1 = unique seq pairs where at least
# one partner has mol_type "peptide". Only the first line is preceded by a blank line.
for _prefix, _label, _table in (
    ("\n", "POSITIVE", mi0915_simplemerged_ptm_and_mut_and_bindsite),
    ("", "UNKNOWN", mi0915_simplemerged_ptm_and_mut_and_bindsite_unknown),
    ("", "NEGATIVE", mi0915_simplemerged_neg_ptm_and_mut_and_bindsite),
):
    test0 = len(_table.drop_duplicates("seq_sort"))
    test1 = len(_table.loc[
        (_table["mol_type_1"]=="peptide") |
        (_table["mol_type_2"]=="peptide")
    ].drop_duplicates("seq_sort"))
    print(f"{_prefix}Total {_label} peptide-protein pairs: {test1}/{test0} ({(test1/test0)*100:.2f}%)")


Total MI:0915 ids: 72
Size of mi0915_simplemerged_ptm_and_mut_and_bindsite: 732666
	Unique seq pairs: 423577
Size of mi0915_simplemerged_ptm_and_mut_and_bindsite_unknown: 512002
	Unique seq pairs: 383597
Size of mi0915_simplemerged_neg_ptm_and_mut_and_bindsite: 31306
	Unique seq pairs: 23378

Total overlapping seq_pairs between pos and neg: 84
Total overlapping seq_pairs between pos and unknown: 1452
Total overlapping seq_pairs between neg and unknown: 275
----------------------------------------------------------------------------------------------------
Dropped bad seq_sorts.
MI:0915 (physical interaction) Final database sizes:
Size of POSITIVE database, mi0915_simplemerged_ptm_and_mut_and_bindsite: 728611
	Unique seq pairs: 422043
Size of UNKNOWN database, mi0915_simplemerged_ptm_and_mut_and_bindsite_unknown: 517010
	Unique seq pairs: 383679
Size of NEGATIVE mi0915_simplemerged_neg_ptm_and_mut_and_bindsite: 30353
	Unique seq pairs: 23021

Total overlapping seq_pairs between pos and 

In [15]:
# List every column of the MI:0915 positive table, double-quoted and sorted,
# one per line and comma-separated.
_quoted_columns = sorted(
    f'"{_col}"' for _col in mi0915_simplemerged_ptm_and_mut_and_bindsite.columns
)
print(",\n".join(_quoted_columns))

"Mutated Partner Status",
"Mutated Partner",
"Mutated all_new_binds",
"Mutated all_og_binds",
"Mutated decisive_entry_new_binds",
"Mutated decisive_entry_og_binds",
"Mutated decisive_seqpair_new_binds",
"Mutated decisive_seqpair_og_binds",
"Mutation # Feature AC",
"Mutation Affected protein AC",
"Mutation Affected protein full name",
"Mutation Affected protein organism",
"Mutation Affected protein symbol",
"Mutation Feature annotation(s)",
"Mutation Feature range(s)",
"Mutation Feature short label",
"Mutation Feature type",
"Mutation Figure legend(s)",
"Mutation Interaction AC",
"Mutation Interaction participants",
"Mutation Interactor Matches",
"Mutation Original sequence",
"Mutation PubMedID",
"Mutation Resulting sequence",
"Mutation Xref ID(s)",
"Mutation new_binds_bo_ac",
"Mutation new_binds_bo_annotation",
"Mutation new_binds_bo_feature_type",
"Mutation new_binds_to_gname_bo_annotation",
"Mutation new_binds_to_uniprot_bo_annotation",
"Mutation new_nobind_to_gname_bo_annotation",
"

In [16]:
# How many are protein-peptide?
# Select the MI:0407 positive rows where either partner has mol_type "peptide",
# print the number of distinct seq_sort values among them, then preview a few columns.
_is_peptide_1 = mi0407_simplemerged_ptm_and_mut_and_bindsite["mol_type_1"] == "peptide"
_is_peptide_2 = mi0407_simplemerged_ptm_and_mut_and_bindsite["mol_type_2"] == "peptide"
test1 = mi0407_simplemerged_ptm_and_mut_and_bindsite.loc[_is_peptide_1 | _is_peptide_2]

_n_unique_pairs = len(test1.drop_duplicates(subset="seq_sort"))
print(_n_unique_pairs)

_preview_cols = [
    "interaction_intactid",
    "mol_type_1",
    "mol_type_2",
    "aa_1",
    "length_1",
    "aa_2",
    "length_2",
]
test1.head()[_preview_cols]

618


Unnamed: 0,interaction_intactid,mol_type_1,mol_type_2,aa_1,length_1,aa_2,length_2
880,EBI-9037892,peptide,protein,PLARTLSVAGLPGKK,15,MSSRNIEWEHFEEREKGHRSPRGSGGSHSGSRGNGIVPSPAHSAHC...,810
1643,EBI-15669673,peptide,protein,QDKEYYKVKEPG,12,MTLRCLEPSGNGGEGTRSQWGTAGSAEEPSPQAARLAKALRELGQT...,198
1644,EBI-15669753,peptide,protein,NGNNYVYIDPT,11,MTLRCLEPSGNGGEGTRSQWGTAGSAEEPSPQAARLAKALRELGQT...,198
1690,EBI-5274719,peptide,protein,WDCLDNRIGRRQCVKL,16,MEQRGQNAPAASGARKRHGPGPREARGARPGPRVPKTLVLVVAAVL...,440
1691,EBI-5274707,peptide,protein,WDCLDNRIGKRQCVRL,16,MEQRGQNAPAASGARKRHGPGPREARGARPGPRVPKTLVLVVAAVL...,440


In [17]:
# Save the cleaned positive / negative / unknown tables for both MI subtrees.
savedir = "data_files/processed/intact/clean"
# DataFrame.to_csv does not create missing directories, so make sure the target
# exists before writing.
os.makedirs(savedir, exist_ok=True)

# MI:0915 subtree
mi0915_simplemerged_ptm_and_mut_and_bindsite.to_csv(f"{savedir}/mi0915_ptm_and_mut_and_bindsite.csv",index=False)
mi0915_simplemerged_neg_ptm_and_mut_and_bindsite.to_csv(f"{savedir}/mi0915_neg_ptm_and_mut_and_bindsite.csv",index=False)
mi0915_simplemerged_ptm_and_mut_and_bindsite_unknown.to_csv(f"{savedir}/mi0915_unknown_ptm_and_mut_and_bindsite.csv",index=False)

# MI:0407 subtree
mi0407_simplemerged_ptm_and_mut_and_bindsite.to_csv(f"{savedir}/mi0407_ptm_and_mut_and_bindsite.csv",index=False)
mi0407_simplemerged_neg_ptm_and_mut_and_bindsite.to_csv(f"{savedir}/mi0407_neg_ptm_and_mut_and_bindsite.csv",index=False)
mi0407_simplemerged_ptm_and_mut_and_bindsite_unknown.to_csv(f"{savedir}/mi0407_unknown_ptm_and_mut_and_bindsite.csv",index=False)

# Look for prevalence of structure-based methods

In [827]:
# MI codes of the two detection methods being counted
cryo = "MI:0040"
xray_crystallography = "MI:0114"

# Evaluate each substring test once and reuse it. regex=False because the MI
# codes are literal text, not patterns.
_methods = merged["interaction_detection_methods_sorted"]
_has_cryo = _methods.str.contains(cryo, regex=False)
_has_xray = _methods.str.contains(xray_crystallography, regex=False)

# Rows listing at least one of the two methods. Printed explicitly: a notebook
# cell only displays its final expression, so a bare len(...) here would be lost.
print(f"Rows with {cryo} or {xray_crystallography}: {len(merged.loc[_has_cryo | _has_xray])}")

# Rows listing both methods and nothing else (exactly one "|" separator)
len(merged.loc[
    _has_cryo &
    _has_xray &
    (_methods.str.count("\\|")==1)
])

1

In [828]:
# Show the rows whose detection-method list contains both codes and has exactly
# one "|" separator.
_methods = merged["interaction_detection_methods_sorted"]
_both_methods_only = (
    _methods.str.contains(cryo)
    & _methods.str.contains(xray_crystallography)
    & (_methods.str.count("\\|") == 1)
)
_shown_cols = ["interaction_intactid", "interaction_detection_methods_sorted", "uniprot_A", "uniprot_B"]
merged.loc[_both_methods_only][_shown_cols]

Unnamed: 0,interaction_intactid,interaction_detection_methods_sorted,uniprot_A,uniprot_B
728094,EBI-7808554,MI:0040|MI:0114,uniprotkb:Q9WZU0-0,uniprotkb:Q9WZU0-0


In [829]:
import time

# Block this cell for a fixed wall-clock duration, waking up periodically.
# The duration lives in one place so the loop and the final message cannot drift apart.
WAIT_MINUTES = 15
POLL_SECONDS = 5

# monotonic() is used for the deadline so system clock adjustments cannot
# shorten or lengthen the wait.
end_time = time.monotonic() + (WAIT_MINUTES * 60)

# Loop until the deadline has passed
while time.monotonic() < end_time:
    # Place any code that should run repeatedly here, e.g. my_function()
    time.sleep(POLL_SECONDS)  # small delay to avoid excessive CPU usage

print(f"Loop finished after {WAIT_MINUTES} minutes.")

### SOPHIE NOTE TO SELF: YOU'RE STOPPING HERE SO YOU CAN EDIT THE MATCHES METHOD

KeyboardInterrupt: 