In [1]:
from collections import Counter

import numpy as np
import pandas as pd

from glypy.io import iupac
from glypy.algorithms import subtree_search

In [2]:
# Vocabulary lists used by the descriptor helpers.
# NOTE: element order matters -- count_items() walks these lists in reverse,
# so entries placed later in a list are counted (and stripped) first.

# Monosaccharide names counted as descriptors.
monomers = [
    'Glc', 'Gal', 'Man', 'Fuc', 'Kdn',
    'GlcNAc', 'GalNAc', 'GlcA', 'Neu5Ac', 'Neu5Gc',
]

# Every monomer name prefixed with '[' (matched against '[' + glycan).
terminal = ['[' + name for name in monomers]

# Bare modification tags (presumably S = sulfate, P = phosphate -- confirm).
modifications = ['3S', '4S', '6S', '6P']

# Modification tag(s) attached to a monomer, as they are searched for in the
# glycan string.
modified = [
    '(6P)Man',
    '(6S)Glc', '(6P)Glc',
    '(3S)GlcA',
    '(3S)Gal', '(4S)Gal', '4S(3S)Gal', '6S(3S)Gal', '(6S)(4S)Gal', '(6P)Gal',
    '(3S)GalNAc', '(4S)GalNAc', '(6S)GalNAc', '(6S)(4S)GalNAc',
    '(3S)GlcNAc', '(6S)GlcNAc', '(6P)GlcNAc',
]

In [3]:
# Helper Functions

def pre_process(glycan):
    """Return `glycan` with every 'KDN' rewritten as 'Kdn'.

    'Kdn' is the spelling used in the module-level `monomers` list.
    """
    return glycan.replace('KDN', 'Kdn')

def post_process(glycan, subtree):
    """Rewrite a serialized subtree string back into the notebook's naming.

    glycan:  the full glycan string the subtree was taken from; only inspected
             to decide how '?' placeholders are filled in.
    subtree: the subtree string to clean up (in this notebook it is the
             output of `iupac.dumps`).

    Returns the rewritten subtree string.
    """
    # Applied in this order: '2NAc' -> 'NAc', 'Neu5N' -> 'Neu5', 'a-Glc' -> 'GlcA'.
    renames = (('2NAc', 'NAc'), ('Neu5N', 'Neu5'), ('a-Glc', 'GlcA'))
    for old, new in renames:
        subtree = subtree.replace(old, new)

    # Fill every '?' with the letter that follows '5Ac(' / '5Gc(' in the
    # parent glycan; an 'a' match takes precedence over a 'b' match.
    if '5Ac(a' in glycan or '5Gc(a' in glycan:
        return subtree.replace('?', 'a')
    if '5Ac(b' in glycan or '5Gc(b' in glycan:
        return subtree.replace('?', 'b')
    return subtree

def is_modified(glycan):
    """Return True when `glycan` contains at least one tag from the
    module-level `modifications` list, False otherwise."""
    return any(tag in glycan for tag in modifications)

def remove_modification(glycan):
    """Return `glycan` with every tag from the module-level `modifications`
    list deleted.

    For each tag the parenthesised form (e.g. '(6S)') is removed first, then
    any remaining bare occurrence (e.g. '6S').
    """
    for tag in modifications:
        glycan = glycan.replace('(' + tag + ')', '').replace(tag, '')
    return glycan

def get_subtrees(glycan, k=2):
    """List the treelets of size `k` of a glycan as cleaned-up strings.

    glycan: glycan string parsed with `iupac.loads(..., dialect='simple')`.
    k:      size argument forwarded to `subtree_search.treelets`
            (presumably the number of monosaccharides per treelet -- confirm
            against the glypy documentation). Default: 2

    Returns a list with one string per treelet; duplicates are kept
    (`distinct=False`). Each string is serialized with `iupac.dumps` and then
    passed through `post_process`.
    """
    parsed = iupac.loads(glycan, dialect='simple')
    return [
        post_process(glycan, iupac.dumps(piece, dialect='simple'))
        for piece in subtree_search.treelets(parsed, k, distinct=False)
    ]

def count_items(string, items):
    """Count occurrences of each item in `string`, last item first.

    string: text to search.
    items:  sequence of substrings; it is traversed in reverse order.

    Returns a dict mapping each item that occurs at least once to its count.
    After an item is counted, all its occurrences are deleted from the working
    copy of the text, so items earlier in the sequence cannot match inside it
    (e.g. 'Glc' is not counted inside an already-counted 'GlcNAc').
    """
    found = {}
    remaining = string
    for token in reversed(items):
        hits = remaining.count(token)
        if not hits:
            continue
        found[token] = hits
        remaining = remaining.replace(token, '')
    return found

def count_subtrees(subtrees):
    """Count how many times each distinct subtree string occurs.

    subtrees: iterable of hashable items (here: subtree strings).

    Returns a plain dict mapping each distinct item to its number of
    occurrences; an empty input gives an empty dict.
    """
    # Counter tallies in a single pass, instead of calling list.count() once
    # per distinct element; convert back to a plain dict for callers.
    return dict(Counter(subtrees))

In [13]:
# Main Functions

def remove_linker(glycan):
    """Removes linker from IUPAC.

    The linker is taken to be everything from the last '(' to the end of the
    string, e.g. 'Gal(b1-4)GlcNAc(b-Sp0' -> 'Gal(b1-4)GlcNAc'.

    glycan: glycan string, expected to end in an unclosed '(<linker>' part.

    Returns the string without that trailing part. A string containing no
    '(' at all has no linker to strip and is returned unchanged (previously
    this case produced an empty string).
    """
    head, sep, _ = glycan.rpartition('(')
    # `sep` is empty when no '(' was found; keep the input intact in that case.
    return head if sep else glycan

def get_descriptors(glycan, depth=3, use_terminal=True):
    """Get descriptors for CFG glycan with counts.

    glycan: glycan string (linker already removed).
    depth: Number of monomers in the largest subtree searched. Default: 3
    use_terminal: Use terminal monosaccharide as descriptors. Default: True

    Returns a dict mapping descriptor string -> count. Descriptors are
    collected in this order (later updates overwrite equal keys):
      1. monomer names and bare modification tags;
      2. if depth > 1 and the glycan is modified: modified-monomer patterns,
         after which the modification tags are stripped from the glycan;
      3. if depth > 1: subtrees of every size from 2 to `depth`;
      4. if use_terminal: monomers that directly follow a '[' in
         '[' + glycan.
    """
    glycan = pre_process(glycan)
    descriptors = dict(count_items(glycan, monomers + modifications))

    if depth > 1:
        if is_modified(glycan):
            descriptors.update(count_items(glycan, modified))
            # Subtree search below runs on the tag-free string.
            glycan = remove_modification(glycan)
        for size in range(2, depth + 1):
            descriptors.update(count_subtrees(get_subtrees(glycan, size)))

    if use_terminal:
        # Prefix '[' so the first residue matches the '[...' patterns too.
        descriptors.update(count_items('[' + glycan, terminal))

    return descriptors

In [14]:
# Build count fingerprints for the CFG glycans (linker stripped first).
cfg_data = pd.read_csv('Data/IUPAC.csv').dropna()
cfg_glycans = list(map(remove_linker, cfg_data['IUPAC']))

# Pass 1: one descriptor dict per glycan, plus every descriptor name seen.
all_descs, all_dicts, fingprs = [], [], []
for glycan in cfg_glycans:
    desc_dict = get_descriptors(glycan, depth=3, use_terminal=True)
    all_descs.extend(desc_dict)
    all_dicts.append(desc_dict)

# Sorted unique descriptor names define the fingerprint columns.
descs = sorted(set(all_descs))

# Pass 2: expand each dict into a dense row; absent descriptors count as 0.
for desc_dict in all_dicts:
    fingpr = [desc_dict.get(desc, 0) for desc in descs]
    fingprs.append(fingpr)

cfg_fingprs = pd.DataFrame(fingprs, columns=descs)
cfg_fingprs.insert(0, 'IUPAC', cfg_glycans)
cfg_fingprs.head()

Unnamed: 0,IUPAC,(3S)Gal,(3S)GalNAc,(3S)GlcA,(3S)GlcNAc,(4S)Gal,(4S)GalNAc,(6P)Gal,(6P)Glc,(6P)GlcNAc,...,[Fuc,[Gal,[GalNAc,[Glc,[GlcA,[GlcNAc,[Kdn,[Man,[Neu5Ac,[Neu5Gc
0,Gal,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
1,Glc,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
2,Man,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,GalNAc,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,GalNAc,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [None]:
# Fingerprint the GlyGen glycans using the descriptor columns (`descs`)
# derived from the CFG data; descriptors outside `descs` are not recorded.
glygen_data = pd.read_csv('GlyGen/Extracted.csv')
glygen_glycans = glygen_data['IUPAC'].tolist()

fingprs = []
for glycan in glygen_glycans:
    desc_dict = get_descriptors(glycan, depth=3, use_terminal=True)
    # Dense row in `descs` order; absent descriptors count as 0.
    fingpr = [desc_dict.get(desc, 0) for desc in descs]
    fingprs.append(fingpr)

glygen_fingprs = pd.DataFrame(fingprs, columns=descs)
glygen_fingprs.insert(0, 'IUPAC', glygen_glycans)
glygen_fingprs.head()