In [1]:
import omnipath
from omnipath._core.downloader._downloader import Downloader

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import liana as li

In [3]:
import pandas as pd
import numpy as np

In [4]:
# Shared omnipath Downloader instance; the helper functions in this notebook
# call its `maybe_download` method to fetch remote tables.
dnwld = Downloader()

In [5]:
# Remote HomoloGene files hosted in the oganm/homologene GitHub repository.
# Taxonomy table: one row per organism (columns `tax_id` and `name_txt`).
RAW_TAXA_URL = "https://raw.githubusercontent.com/oganm/homologene/master/data-raw/taxData.tsv"
# Homology table: homology group ID (HID), gene symbol, taxonomy ID and gene ID.
HOMOLOGENE_URL = "https://raw.githubusercontent.com/oganm/homologene/master/data-raw/homologene2.tsv"
# Prefix string for complexes; not referenced by any other visible cell.
CPLEX_PREFIX = 'COMPLEX:'

In [6]:
def show_homologene():
    """
    Fetch the HomoloGene taxonomy table.

    Returns
    -------
    Whatever ``pd.read_table`` produces for the file at ``RAW_TAXA_URL``,
    obtained through the shared ``dnwld`` downloader. The table lists the
    available organisms (columns ``tax_id`` and ``name_txt``).

    """
    taxa_table = dnwld.maybe_download(RAW_TAXA_URL, callback=pd.read_table)
    return taxa_table

In [7]:
# Display the organisms (NCBI taxonomy ID and name) listed in the taxonomy table.
show_homologene()

Unnamed: 0,tax_id,name_txt
0,10090,Mus musculus
1,10116,Rattus norvegicus
2,28985,Kluyveromyces lactis
3,318829,Magnaporthe oryzae
4,33169,Eremothecium gossypii
5,3702,Arabidopsis thaliana
6,4530,Oryza sativa
7,4896,Schizosaccharomyces pombe
8,4932,Saccharomyces cerevisiae
9,5141,Neurospora crassa


In [8]:
# omnipath.clear_cache()

Homologene Download function

In [9]:
def _get_homologene_raw():
    """
    Download the raw HomoloGene table and normalise its layout.

    Every column is cast to ``str``, the original column names are replaced
    by snake_case ones, and the homology group ID (``hid``) becomes the index.

    Returns
    -------
    A pandas DataFrame indexed by ``hid`` with the renamed columns
    ``genesymbol``, ``gene_id`` and ``ncbi_taxid``.

    """
    column_names = {"Gene.Symbol": "genesymbol",
                    "Gene.ID": 'gene_id',
                    "Taxonomy": 'ncbi_taxid',
                    "HID": 'hid'}

    raw_table = dnwld.maybe_download(HOMOLOGENE_URL, callback=pd.read_table)
    # Cast before renaming so that IDs are compared as strings downstream.
    homologene = raw_table.astype(str).rename(columns=column_names)
    homologene = homologene.set_index("hid")

    return homologene

In [10]:
# Load the full HomoloGene table (all organisms), indexed by homology group ID.
homologene = _get_homologene_raw()

In [11]:
# Preview: rows sharing the same `hid` are homologues of one another across organisms.
homologene.head()

Unnamed: 0_level_0,genesymbol,ncbi_taxid,gene_id
hid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3,ACADM,9606,34
3,ACADM,9598,469356
3,ACADM,9544,705168
3,ACADM,9615,490207
3,ACADM,9913,505968


In [12]:
def download_homologene(source_organism, target_organism, id_type='genesymbol'):
    """
    Download homologene information for a given source and target organism.

    Parameters
    ----------
    source_organism : str or int
        NCBI taxonomy ID of the source organism (e.g. '9606').
    target_organism : str or int
        NCBI taxonomy ID of the target organism (e.g. '10090').
    id_type : str
        Type of ID to use for homology conversion. Can be one of 'genesymbol', 'gene_id'.

    Returns
    -------
    A pandas DataFrame with columns 'source' and 'target', holding one row per
    pair of source/target identifiers that share a homology group. One-to-many
    and many-to-many homologues therefore appear as several rows.

    """

    homologene = _get_homologene_raw()

    # The raw table is cast to `str`, so taxonomy IDs must be compared as
    # strings; coercing here stops an integer ID from silently matching nothing.
    source_taxid = str(source_organism)
    target_taxid = str(target_organism)
    taxids = homologene['ncbi_taxid']

    source_df = homologene.loc[taxids == source_taxid, [id_type]]
    target_df = homologene.loc[taxids == target_taxid, [id_type]]

    # Both frames are indexed by homology group ID (`hid`); joining on the
    # index pairs every source gene with every target gene of the same group.
    mapping = pd.merge(source_df, target_df,
                       right_index=True, left_index=True,
                       suffixes=('_source', '_target'),
                       how='inner')
    mapping = mapping.rename({f'{id_type}_source': 'source',
                              f'{id_type}_target': 'target'}, axis=1)

    # Drop the `hid` index; only the identifier pairs are returned.
    return mapping.reset_index(drop=True)[['source', 'target']]

In [13]:
# Load the ligand-receptor resource via liana, using the call's defaults.
resource = li.resource.select_resource()
# Optional filter, disabled: keep only receptors whose name contains '_'.
# resource = resource[resource['receptor'].str.contains('_')]

Simplified functions, to be applied to a single column at a time

In [14]:
import pandas as pd
from itertools import product

In [15]:
# define a function to replace list elements with dictionary values
def _replace_subunits(lst, my_dict, one_to_many=False):
    result = []
    for x in lst:
        if x in my_dict:
            value = my_dict[x]
            
            if not isinstance(value, list):
                value = [value]
            
            if (not one_to_many) & (len(my_dict[x]) > 1):
                result.append(np.nan)
            else:
                result.append(value)
        else:
            result.append(np.nan)
    return result

In [16]:
def generate_orthologs(df, receptor_col, subunits_col):
    """
    Group a DataFrame by a receptor column, and generate all possible combinations of subunits
    within each group.

    Args:
        df (pd.DataFrame): the input DataFrame, with one row per subunit.
        receptor_col (str): the name of the column containing the receptor IDs.
        subunits_col (str): the name of the column containing, for each subunit,
            an iterable of candidate replacement IDs (or a null value).

    Returns:
        pd.DataFrame: a DataFrame with columns 'source' (the receptor ID) and
        'target' (one '_'-joined combination of candidates), with one row per
        possible combination. Receptors with a null entry for any subunit are
        omitted, since a full complex cannot be assembled for them.
    """
    complexes = []
    for name, group in df.groupby(receptor_col):
        candidates_per_subunit = group[subunits_col]

        # Skip incomplete groups: a null entry cannot be expanded into candidates.
        if candidates_per_subunit.isnull().any():
            continue

        # Cartesian product: pick one candidate per subunit, in row order.
        subunit_lists = [list(candidates) for candidates in candidates_per_subunit]
        for combination in product(*subunit_lists):
            complexes.append((name, '_'.join(combination)))

    # Create output DataFrame
    return pd.DataFrame(complexes, columns=['source', 'target'])

In [17]:
# Symbol mapping from NCBI taxid 9606 (human) to 10090 (Mus musculus),
# indexed by the source symbol. The index may contain duplicates (1:many).
map_df = download_homologene('9606', '10090').set_index('source')

In [18]:
# Preview the source -> target symbol pairs.
map_df.head()

Unnamed: 0_level_0,target
source,Unnamed: 1_level_1
ETFA,Etfa
CLDN4,Cldn4
HAUS2,Haus2
EFHC1,Efhc1
LRRC8D,Lrrc8d


In [19]:
# Collapse the mapping into {source symbol: [target symbols, ...]} so that
# one-to-many homologues show up as lists with more than one element.
map_dict = map_df.groupby(level=0)["target"].apply(list).to_dict()

In [20]:
# one_to_many = map_df[(map_df.groupby(level=0).count()['target']>=2)].index
# # receptor contains any of the one_to_many
# resource = resource[resource['receptor'].str.contains('|'.join(one_to_many))]

In [21]:
# Resource columns that hold (possibly complex) identifiers.
columns = ['ligand', 'receptor']
# Column processed by the following cells ('receptor').
col = columns[1]

In [41]:
# One row per unique value of `col`, with that value used as the index.
cplex_table = resource[[col]].drop_duplicates().set_index(col)
# cplex_table = resource[col]?
# Split each identifier on '_' into its subunits (single proteins give a 1-element list).
cplex_table['subunits'] = cplex_table.index.str.split('_')

In [42]:
# Preview only (nothing is assigned): map every subunit to its homologues,
# with one_to_many=False so that ambiguous subunits become NaN.
cplex_table['subunits'].apply(_replace_subunits, args=(map_dict, False,))

receptor
PTPRC      [[Ptprc]]
MET          [[Met]]
CD44        [[Cd44]]
LRP1        [[Lrp1]]
CD47        [[Cd47]]
             ...    
TMIGD3         [nan]
IL20RA    [[Il20ra]]
AMHR2      [[Amhr2]]
ACTR2      [[Actr2]]
IFNAR2    [[Ifnar2]]
Name: subunits, Length: 1059, dtype: object

In [43]:
# Overwrite the subunit lists with their mapped homologues.
# NOTE(review): this reads and writes the same column, so running the cell a
# second time would map the already-mapped values again.
cplex_table['subunits'] = cplex_table['subunits'].apply(_replace_subunits, args=(map_dict, False,))

In [44]:
# Inspect the table after mapping.
# NOTE(review): the saved output shows a separate `receptor` column next to the
# original subunits, which the assignment above would not produce; it looks
# stale, so re-run to confirm.
cplex_table

Unnamed: 0_level_0,subunits,receptor
receptor,Unnamed: 1_level_1,Unnamed: 2_level_1
PTPRC,[PTPRC],[[Ptprc]]
MET,[MET],[[Met]]
CD44,[CD44],[[Cd44]]
LRP1,[LRP1],[[Lrp1]]
CD47,[CD47],[[Cd47]]
...,...,...
TMIGD3,[TMIGD3],[nan]
IL20RA,[IL20RA],[[Il20ra]]
AMHR2,[AMHR2],[[Amhr2]]
ACTR2,[ACTR2],[[Actr2]]


In [45]:
# Number of distinct receptor identifiers in the resource.
resource['receptor'].nunique()

1059

In [46]:
# Long format: one row per (identifier, subunit), dropping subunits without a mapping.
# NOTE(review): because unmapped subunits are dropped here, a complex that lost
# only some of its subunits still reaches the next step as a partial complex.
df = cplex_table['subunits'].explode().reset_index().dropna()

In [39]:
# Rebuild the identifiers from their mapped subunits and show them sorted by source.
# NOTE(review): this cell's execution count (39) is lower than that of the
# preceding cells (41-46), so the saved output may not reflect them.
generate_orthologs(df, col, 'subunits').sort_values('source')

Unnamed: 0,source,target
0,A1BG,A1bg
1,ABCA1,Abca1
2,ACKR1,Ackr1
3,ACKR2,Ackr2
4,ACKR3,Ackr3
...,...,...
981,VLDLR,Vldlr
982,VSIG10L,Vsig10l
983,VSIR,Vsir
984,XCR1,Xcr1


Homology Conversion

In [None]:
def _explode_complexes(resource: pd.DataFrame,
                       target: str ='source',
                       source: str ='target') -> pd.DataFrame:
    resource['key'] = resource[SOURCE] + '|' + resource[TARGET]
    resource = (resource.set_index('interaction')
                .apply(lambda x: x.str.split('_'))
                .explode([TARGET])
                .explode(SOURCE)
                .reset_index()
                )
    resource[[f'{SOURCE}_complex', f'{TARGET}_complex']] = resource[
        'interaction'].str.split('|', expand=True)

    return resource

In [None]:
def _rebuild_complexes(df, group_cols, target_col):
    df = resource.copy()
    df_grouped = resource.groupby(group_cols)['target'].agg(lambda x: '_'.join(map(str, x))).reset_index()
    df = df.drop('target', axis=1).merge(df_grouped, on=list(np.setdiff1d(group_cols, target_col)), how='inner')
    df = df.drop(f'{target_col}_complex', axis=1).rename({'target':f'{target_col}_complex'}, axis=1)
    
    return df

In [None]:
# Mapping from taxid 9606 to taxid 10090, kept as plain 'source'/'target' columns for merging.
map_df = download_homologene('9606', '10090')

In [None]:
# Inspect the mapping table.
map_df

In [None]:
# Reload the resource so the conversion starts from an unmodified table.
resource = li.resource.select_resource()

In [None]:
# Split complexes into subunits with liana's helper; the later cells rely on the
# 'interaction' column being present in its result.
resource = li.resource.explode_complexes(resource)
# Disabled alternative: turn the mapping into a plain dict.
# map_df = dict(zip(map_df['source'], map_df['target']))

In [None]:
# Inspect the exploded resource.
resource

In [None]:
# Attach the translated symbol ('target') to each receptor subunit; the inner
# join drops subunits without a mapping and repeats rows for 1:many mappings.
resource = resource.merge(map_df, left_on='receptor', right_on='source', how='inner').drop(['source'], axis=1)
# Re-join the translated receptor subunits per (interaction, ligand).
resource = _rebuild_complexes(resource, ['interaction', 'ligand'], 'receptor')

In [None]:
# Inspect the resource after translating the receptor side.
resource

In [None]:
# Same procedure for the ligand side: attach the translated symbol to each
# ligand subunit (inner join drops unmapped subunits) ...
resource = resource.merge(map_df, left_on='ligand', right_on='source', how='inner').drop(['source'], axis=1)
# ... and re-join the translated ligand subunits per (interaction, receptor).
resource = _rebuild_complexes(resource, ['interaction', 'receptor'], 'ligand')

In [None]:
# Inspect the resource after translating the ligand side.
resource

In [None]:
# remove duplicates (1:many mappings)
# Only the first row of each interaction is kept (pandas' default), after which
# the helper 'interaction' column is dropped.
resource = resource.drop_duplicates(['interaction']).drop(['interaction'], axis=1)

In [None]:
# Inspect the converted, de-duplicated resource.
resource

Only keep full complexes

In [None]:
# Rebuild the taxid 9606 -> 10090 mapping for the "full complexes" experiment.
map_df = download_homologene('9606', '10090')

In [None]:
# Start again from an unmodified resource, split complexes into subunits, and
# attach the translated symbol ('target') to each receptor subunit.
resource = li.resource.select_resource()
resource = li.resource.explode_complexes(resource)
# The inner join drops receptor subunits that have no mapping.
resource = resource.merge(map_df, left_on='receptor', right_on='source', how='inner').drop(['source'], axis=1)

In [None]:
# Columns identifying the rows that belong to one receptor complex.
group_cols = ['interaction', 'ligand']
# Side of the interaction being rebuilt.
target = 'receptor'

In [None]:
# Inspect the exploded resource with the translated receptor subunits attached.
resource

In [None]:
# Working copy of the merged resource (not used again in the visible cells).
df = resource.copy()

# Join the translated subunits of each (interaction, ligand) group with '_'.
df_grouped = resource.groupby(group_cols)['target'].agg(lambda x: '_'.join(map(str, x))).reset_index()

In [None]:
# Inspect the rebuilt receptor complexes per (interaction, ligand).
df_grouped

In [None]:
# cplex_table['subunits2']