In [None]:
import pandas as pd
import re
from glyles import Glycan, convert, convert_generator
from IPython.display import Image
import networkx as nx
import matplotlib.pyplot as plt
import pickle
from data_functions import *

# TODO: add a heatmap cell for different combinations of p_threshold and n_threshold

In [None]:
## Function that checks whether a glycan string is readable by the glyles package

def readable_glycans(glycan):
    """
    Check whether a glycan string can be handled by the glyles package.

    glycan: glycan string in IUPAC-condensed notation
    returns True when the string parses into a Glycan and its tree can be
    rendered to a PNG, False when any Exception is raised along the way.

    Side effects: calls save_dot("viz.dot") on every successful parse, prints
    the module-level counter ``current_index`` whenever it is a multiple of
    1000, and then increments it. ``current_index`` must be defined before
    the first call.
    """
    global current_index

    try:
        try:
            # Parsing and rendering together form the readability check;
            # the resulting image object itself is discarded.
            parsed = Glycan(glycan, tree_only=True)
            Image(parsed.save_dot("viz.dot").create_png())
            outcome = True
        except Exception:
            outcome = False

        # Progress report, emitted for readable and unreadable glycans alike.
        if current_index % 1000 == 0:
            print(current_index)
        return outcome
    finally:
        # Count the call no matter how it ended.
        current_index += 1

In [None]:
## Data filtering
# Step I: load the raw table, rewrite wildcard linkages and flag which
# glycans can be read by the glyles package.

data = pd.read_csv('./data/v8_sugarbase.csv', sep=",", quotechar='"')
# data = pd.read_csv('./data/toy_data.csv', sep=",", quotechar='"')  # toy-data variant

# Per the original notes, replace_wildcards rewrites
#   (?1-?) -> (a1-999)
#   (a1-?) -> (a1-998)
data['glycan'] = data['glycan'].apply(replace_wildcards)

# Boolean 'readable' column: True for readable glycans, False otherwise.
# readable_glycans tracks its progress in the global current_index,
# so the counter is reset right before the pass over the column.
current_index = 0
data['readable'] = data['glycan'].apply(readable_glycans)

# Split into readable / non-readable tables, each with a fresh 0..n-1 index.
filtered_sugarbase = data.loc[data['readable']].copy().reset_index(drop=True)
nonread_sugarbase = data.loc[~data['readable']].copy().reset_index(drop=True)

print(filtered_sugarbase.shape, filtered_sugarbase, sep="\n")

In [None]:
# Table sizes before and after the readability filter, one per line.
print(data.shape, filtered_sugarbase.shape, sep="\n")

In [None]:
## STEP II: build the data structure and keep filtering the data.
# Glycans that cannot be parsed with the glyles package (bad notation)
# are removed by the loop in the following cell; this cell only resets
# the state that loop works with.

# dic is handed to dfs_traversal by the extraction loop; wild_dic is
# initialised alongside it.
dic, wild_dic = {}, {}

# Failure counter and progress counter for the extraction loop.
other_unrecognized_glycans = i = 0

In [None]:
# Loop over the glycan column and collect the data structure of every glycan.
# Rows whose parse tree cannot be built or traversed are counted in
# other_unrecognized_glycans and removed from filtered_sugarbase.
#
# The frame's own index labels are iterated (.items()) instead of enumerate
# positions: DataFrame.drop works on labels, so positions are only correct
# while the index is exactly 0..n-1 (e.g. not when this cell is re-run after
# rows were already dropped). Failing labels are collected and dropped once
# after the loop, which avoids copying the frame on every failure.
#
# NOTE: nodes_edges_info is only bound once at least one glycan has been
# traversed successfully.
failed_labels = []

for index, gly in filtered_sugarbase['glycan'].items():
    try:
        # Attempt to create a Glycan object and read its parse tree.
        glycan = Glycan(gly, tree_only=True)
        pt = glycan.parse_tree
        node_data = pt.nodes(data=True)
        monomer_objects = [attrs['type'] for _, attrs in node_data]
        result = []
        visited = set()

        # Start the traversal from the first node without incoming edges.
        roots = [node for node in pt.nodes if pt.in_degree(node) == 0]
        root = roots[0]

        # dic is shared across iterations; the value returned by the last
        # successful call is what later cells use as the data structure.
        nodes_edges_info = dfs_traversal(pt, root, visited, result, dic, monomer_objects)

    except Exception:
        # Deliberate best effort: any failure marks the glycan as unrecognized.
        other_unrecognized_glycans += 1
        failed_labels.append(index)

    i += 1
    if i % 1000 == 0:
        print(i)

filtered_sugarbase = filtered_sugarbase.drop(index=failed_labels)

In [None]:
# Step II summary, one value per line: size of the collected data structure,
# number of glycans that failed, and the remaining table with its shape.
print(
    len(nodes_edges_info),  # 14162
    other_unrecognized_glycans,
    filtered_sugarbase.shape,
    filtered_sugarbase,
    sep="\n",
)

In [None]:
# Persist the Step II results: the data structure as a pickle and the
# filtered table as CSV (row index not written).
with open('./data/data_structure.pkl', 'wb') as f:
    f.write(pickle.dumps(nodes_edges_info))

# Toy-data variant of the pickle export:
# with open('./data/toy_data_structure.pkl', 'wb') as f:
#     pickle.dump(nodes_edges_info, f)

filtered_sugarbase.to_csv('./data/filtered_v8_sugarbase.csv', index=False)
# Toy-data variant of the CSV export:
# filtered_sugarbase.to_csv('./data/toy_filtered_v8_sugarbase.csv',  index=False)

In [None]:
### for debugging purpose
# check_data = pd.read_csv('./data/filtered_v8_sugarbase.csv', sep=",", quotechar='"')
# check_data = pd.read_csv('./data/toy_filtered_v8_sugarbase.csv', sep=",", quotechar='"')
# print(check_data.shape)

In [None]:
# Step III: work with the data structure collected in Step II.

column_name = 'glycan'  # name of the column that holds the glycan strings

# Work on a shallow copy so the Step II result itself is left untouched.
loaded_data_struct = nodes_edges_info.copy()

# Keep the entries whose key contains neither placeholder ("999" / "998")
# but whose value mentions at least one of them.
wildcard_dict = {
    key: value
    for key, value in loaded_data_struct.items()
    if not ("999" in str(key) or "998" in str(key))
    and ("999" in str(value) or "998" in str(value))
}

print(len(wildcard_dict))  # 2390





In [None]:
# Discard bond configurations (wildcards) that do not occur often enough
# in the data structure.
# NOTE(review): the exact meaning of the two thresholds is defined by
# new_wildcard_dict in data_functions - confirm there before tuning.
p_threshold, n_threshold = 0.70, 10

wild_card_dict = new_wildcard_dict(
    wildcard_dict,
    p_threshold,
    n_threshold,
)
print(len(wild_card_dict))  # 278

# TODO (from the original notes): copy new_wildcard_dict here

In [None]:
# Remove wildcards and replace them with known ones (i.e. update unknowns with knowns).
# NOTE: the result overwrites its own input, so re-running this cell feeds the
# already-processed dictionary back into remove_wildcards_from_datastructure.
wild_card_dict = remove_wildcards_from_datastructure(wild_card_dict)
#print(wild_card_dict)
print(len(wild_card_dict)) # 278

In [None]:
# Update the data structure of each glycan as much as possible.
#
# For every glycan that still contains a wildcard placeholder ("999"/"998"):
#   1. update_glycans tries to resolve its structure using wild_card_dict;
#   2. update_graph applies the result to the parse tree;
#   3. the glycan string is rebuilt from the tree and re-parsed as a check;
#      only a string that parses and traverses cleanly replaces the original.
# Rows for which step 1 or step 2 yields nothing are removed. A rebuilt string
# that fails the check in step 3 only increments other_unrecognized_glycans;
# the row keeps its original string.
#
# Rows to remove are collected and dropped once after the loop instead of
# calling DataFrame.drop (which copies the frame) for every failing row.
print(filtered_sugarbase.shape)
count_in = 0
count_out = 0
other_unrecognized_glycans = 0
i = 0
rectified = 0
unresolved_labels = []  # index labels of rows that could not be resolved

for index, gly in filtered_sugarbase['glycan'].items():
    dic = {}
    i += 1
    if i % 1000 == 0:
        print(i)

    # Glycans without a placeholder need no rectification.
    if "999" not in gly and "998" not in gly:
        continue

    glycan = Glycan(gly, tree_only=True)
    pt = glycan.parse_tree
    node_data = pt.nodes(data=True)
    monomer_objects = [attrs['type'] for _, attrs in node_data]

    # Update the glycan string's data structure.
    d = update_glycans(pt, gly, wild_card_dict, index, monomer_objects)
    if not d:
        unresolved_labels.append(index)
        continue

    roots = [node for node in pt.nodes if pt.in_degree(node) == 0]
    root = roots[0]
    visited = set()
    # Update the tree once its structure is updated.
    pt = update_graph(pt, d, visited, root, monomer_objects)
    if not pt:
        # The graph could not be resolved.
        unresolved_labels.append(index)
        continue

    # Counted as soon as the graph is resolved, i.e. before the rebuilt
    # string is re-validated below.
    rectified += 1
    string = Monomer(monomer_objects[root]).get_name(full=True)
    visited = set()
    # Get the glycan string back by traversing its tree.
    glycan_string = dfs_traversal_for_string(pt, root, visited, string, monomer_objects)

    # Check whether the reconstructed string is valid.
    try:
        glycan = Glycan(glycan_string, tree_only=True)
        pt = glycan.parse_tree
        node_data = pt.nodes(data=True)
        monomer_objects = [attrs['type'] for _, attrs in node_data]
        result = []
        visited = set()

        roots = [node for node in pt.nodes if pt.in_degree(node) == 0]
        root = roots[0]

        # The traversal is run as part of the validity check.
        info = dfs_traversal(pt, root, visited, result, dic, monomer_objects)

        filtered_sugarbase.at[index, 'glycan'] = glycan_string

    except Exception:
        # Deliberate best effort: keep the original string, count the failure.
        other_unrecognized_glycans += 1

filtered_sugarbase = filtered_sugarbase.drop(index=unresolved_labels)

print(filtered_sugarbase.shape)
print(filtered_sugarbase)
print(rectified)

In [None]:
# Save the rectified table as CSV; index=False keeps the row index out of the file.
#filtered_sugarbase.to_csv('toy_rectified_v8_sugarbase.csv',  index=False) # toy-data variant
filtered_sugarbase.to_csv('./data/rectified_v8_sugarbase.csv',  index=False) # save filtered data

In [None]:
# Renumber the rows 0..n-1 after the removals above, then show the table
# followed by its shape.
filtered_sugarbase.reset_index(inplace=True, drop=True)
print(filtered_sugarbase, filtered_sugarbase.shape, sep="\n")

In [None]:
# This is the first step after completing the data processing part.
# Re-filter the data by removing those glycans for which the glyles package
# cannot build a Glycan or produce a SMILES string.
#
# Failing rows are collected and dropped once after the loop instead of
# calling DataFrame.drop (which copies the frame) for every failing row.

print(filtered_sugarbase.shape)
non_convertibles = 0
i = 0
non_convertible_labels = []  # index labels of rows to remove

for index, gly in filtered_sugarbase["glycan"].items():
    i += 1
    if i % 1000 == 0:
        print(i)
    try:
        glycan = Glycan(gly, tree_only=True)
        # Only success or failure matters here; the SMILES string is discarded.
        glycan.get_smiles()
    except Exception:
        # Deliberate best effort: any failure marks the glycan as non-convertible.
        non_convertible_labels.append(index)
        non_convertibles += 1

filtered_sugarbase = filtered_sugarbase.drop(index=non_convertible_labels)

print(non_convertibles)
print(filtered_sugarbase)
print(filtered_sugarbase.shape)


# filtered_sugarbase.to_csv('toy_refiltered_rectified_v8_sugarbase.csv',  index=False) # save filtered data

In [None]:
# Give the re-filtered table a contiguous 0..n-1 index again and report
# its contents and shape.
filtered_sugarbase.reset_index(inplace=True, drop=True)
print(
    filtered_sugarbase,
    filtered_sugarbase.shape,
    sep="\n",
)

In [None]:
# Second step after the data processing part:
# convert the IUPAC-condensed notations into SMILES and separate the glycans
# whose 'smiles' entry came back as an empty string.
print(filtered_sugarbase.shape)

# iupac_to_smiles receives the glycan string together with the row's index label.
filtered_sugarbase['smiles'] = filtered_sugarbase.apply(
    lambda row: iupac_to_smiles(row['glycan'], row.name),
    axis=1,
)

print(filtered_sugarbase, filtered_sugarbase.shape, sep="\n")

# Non-empty SMILES -> converted table; empty SMILES -> non-convertible table.
new_df = filtered_sugarbase.loc[filtered_sugarbase['smiles'].ne('')]
fancy_df = filtered_sugarbase.loc[filtered_sugarbase['smiles'].eq('')]
print(new_df.shape, fancy_df.shape, sep="\n")

new_df.to_csv('./data/iupac_to_smiles.csv', index=False)  # save converted data
fancy_df.to_csv('./data/non_iupac_to_smiles.csv', index=False)  # save non-convertible data
# Toy-data variants:
# new_df.to_csv('./data/toy_iupac_to_smiles.csv',  index=False) # save converted data
# fancy_df.to_csv('./data/toy_non_iupac_to_smiles.csv',  index=False) # save non-convertible data
