In [4]:
import ast
import importlib
import os
import shutil

import pandas as pd

## raw data

In [25]:
# Read the full site table (one row per protein) and preview a reproducible random sample.
raw_data = pd.read_csv(filepath_or_buffer='./data/_all_sites_oglcnac.csv')
raw_data.sample(n=10, random_state=0)

Unnamed: 0,UniprotKB ID,Entry name,organism,full name,oglcnacscore,oglcnac sites,phosphorylation sites,PMIDS,sequence
5792,P59923,ZN445_HUMAN,Homo sapiens,Zinc finger protein 445,12.289763,T919,,35254053,MPPGRWHAAYPAQAQSSRERGRLQTVKKEEEDESYTPVQAARPQTL...
16761,X2JIM3,X2JIM3_DROME,Drosophila melanogaster,,11.65895,,,33925313,MMEQSVRNQTTMSKTTNRNRTAGGIEAPSIANASATSTASASALAN...
13633,Q9DC40,TELO2_MOUSE,Mus musculus,Telomere length regulation protein TEL2 homolog,12.704362,,S457;S486;S488;S492;S837,36288343,MDPALSAVRLTVQEAIHILSSSEDAGHILSTLGTLKRYLGGTEDPV...
51,A0A0A0MQM6,A0A0A0MQM6_MOUSE,Mus musculus,POU domain protein,12.55059,,,34887587,VQSAIPQTQLMLAGGQITGLTLTPAQQQLLLQQAQAQAQLLAAAVQ...
8636,Q4E0L0,Q4E0L0_TRYCC,Trypanosoma cruzi,,7.585376,,,30984116,MSHFSREATKLLRAAEDALSGRRPAALSLPVEQQESWDRRDGALNC...
9372,Q62059,CSPG2_MOUSE,Mus musculus,Versican core protein,9.761602,,S2585;S2586,33300544,MLINMKGILWMCSTLLLTHALHQAKMETSPPVKGSLSGKVVLPCHF...
15675,Q9Z180,SETBP_MOUSE,Mus musculus,SET-binding protein,11.752367,T1098;S1271;S1276;S1277;S1326;S1343,,34418053,MEPREMLSSCRQRGSESEFLQGSSSRSPPAPGCSGEPLKGISVGGE...
1977,O00311,CDC7_HUMAN,Homo sapiens,Cell division cycle 7-related protein kinase,6.005927,,S27;T503,30379171,MEASLGIQMDEPMAFSPQRDRFQAEGSLKKNEQNFKLAGVKKDIEK...
10067,Q6ZSJ9,SHSA6_HUMAN,Homo sapiens,Protein shisa-6,7.248596,,S391;S397;S409;T433;T477,28411811;30379171,MALRRLLLLLLLSLESLDLLPSVHGARGRAANRTLSAGGAAVGGRR...
13481,Q9CXY9,GPI8_MOUSE,Mus musculus,GPI-anchor transamidase,11.956056,S123,,30059200,MAAPCFLTLRVATLAALALLSLGSSAAGHIEDQAEQFFRSGHTNNW...


In [16]:
# Summarise column dtypes and non-null counts; the 'oglcnac sites' count shows
# how many rows actually carry O-GlcNAc site annotations.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16762 entries, 0 to 16761
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   UniprotKB ID           16762 non-null  object 
 1   Entry name             16758 non-null  object 
 2   organism               16758 non-null  object 
 3   full name              13872 non-null  object 
 4   oglcnacscore           16758 non-null  float64
 5   oglcnac sites          4583 non-null   object 
 6   phosphorylation sites  8233 non-null   object 
 7   PMIDS                  16758 non-null  object 
 8   sequence               16758 non-null  object 
dtypes: float64(1), object(8)
memory usage: 1.2+ MB


## all o-glcnacylated data

In [10]:
# Keep only the proteins that have at least one annotated O-GlcNAc site
# (rows whose 'oglcnac sites' cell is not missing).
has_oglcnac_sites = raw_data['oglcnac sites'].notna()
oglcnac_data = raw_data.loc[has_oglcnac_sites]
oglcnac_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4583 entries, 6 to 16753
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   UniprotKB ID           4583 non-null   object 
 1   Entry name             4583 non-null   object 
 2   organism               4583 non-null   object 
 3   full name              4443 non-null   object 
 4   oglcnacscore           4583 non-null   float64
 5   oglcnac sites          4583 non-null   object 
 6   phosphorylation sites  3136 non-null   object 
 7   PMIDS                  4583 non-null   object 
 8   sequence               4583 non-null   object 
dtypes: float64(1), object(8)
memory usage: 358.0+ KB


**Mammalian species**:

1. HUMAN: Homo sapiens. - This refers to us, humans. Many studies, especially in the latter stages of medical research, are done on human cells, tissues, or volunteers to understand human-specific mechanisms and for clinical trials. - Humans are the most common and widespread species of primate. They are characterized by their hairlessness, bipedalism, and high intelligence. Humans have a large brain and resulting cognitive skills that enable them to thrive in varied environments and develop complex societies and civilizations. Anatomically modern humans emerged around 300,000 years ago in Africa.

1. MOUSE: Mus musculus. - The laboratory mouse is one of the most widely used model organisms in biology and genetics. Mice share about 95% of their genes with humans, reproduce quickly, and are small and easy to keep, making them an excellent model for human disease. - The house mouse is a small rodent, known for its pointed snout, small rounded ears, a body-length scaly tail, and a high breeding rate. The common house mouse is also popular as a pet. In some places, certain kinds of field mice are locally common.

1. PIG: Sus scrofa domesticus. - Pigs are used in research for various purposes. They are particularly valuable in translational research because of their physiological and anatomical similarities with humans, especially in the cardiovascular system. - Pigs are domesticated, omnivorous mammals. They are variously considered a subspecies of Sus scrofa (the wild boar or Eurasian boar) or a distinct species. The pig’s head-plus-body length ranges from 0.9 to 1.8 m (3 to 6 ft), and adult pigs typically weigh between 50 and 350 kg (110 and 770 lb).

1. RAT: Rattus norvegicus. - Rats are another common model organism, particularly in neuroscience, because their brains and nervous systems are more complex than those of mice and, in some respects, closer to those of humans. - Rats are medium-sized, long-tailed rodents. The best-known rat species are the black rat (Rattus rattus) and the brown rat (Rattus norvegicus). This group, generally known as the Old World rats or true rats, originated in Asia. Rats are bigger than most Old World mice, which are their relatives, but seldom weigh over 500 grams (17½ oz) in the wild.

## o-glcnacylated mammalian data

In [22]:
def entry_class(x, species=('HUMAN', 'MOUSE', 'PIG', 'RAT')):
    """Return True if an entry name belongs to one of the selected species.

    Entry names in this table look like ``CDC7_HUMAN``: a protein mnemonic,
    an underscore, then a species code. Only the part after the last
    underscore is compared, so a code that merely ends in the same letters
    (e.g. ``..._MYRAT``) is not mistaken for ``RAT``.

    Parameters
    ----------
    x : str
        Entry name. Non-string values (e.g. NaN for a missing name) are
        treated as "not a selected species" instead of raising.
    species : iterable of str, optional
        Species codes to accept. Defaults to the four mammalian codes used
        in this notebook.

    Returns
    -------
    bool
    """
    if not isinstance(x, str):
        # Missing entry names arrive as float NaN when read by pandas.
        return False

    # Text after the last underscore; the whole string if there is none.
    species_code = x.strip().rsplit('_', 1)[-1]
    return species_code in species

# Restrict the O-GlcNAcylated proteins to the mammalian entries and keep only
# the columns needed downstream.
is_mammalian = oglcnac_data['Entry name'].apply(entry_class)
kept_columns = ['UniprotKB ID', 'organism', 'oglcnacscore', 'oglcnac sites', 'sequence']
mammalian_data = oglcnac_data.loc[is_mammalian, kept_columns].reset_index(drop=True)

# Visual sanity check: the distinct organisms should be the four mammalian
# species only (human, mouse, pig, rat).
mammalian_species = mammalian_data['organism'].unique()

print('Organisms:', dict(enumerate(mammalian_species)))
mammalian_data.sample(10, random_state=0)

Organisms: {0: 'Homo sapiens', 1: 'Mus musculus', 2: 'Rattus norvegicus', 3: 'Sus scrofa'}


Unnamed: 0,UniprotKB ID,organism,oglcnacscore,oglcnac sites,sequence
1412,P53621,Homo sapiens,22.64697,S489;T821,MLTKFETKSARVKGLSFHPKRPWILTSLHNGVIQLWDYRMCTLIDK...
1496,P61019,Homo sapiens,11.106489,S121,MAYAYLFKYIIIGDTGVGKSCLLLQFTDKRFQPVHDLTIGVEFGAR...
2091,Q2T9K0,Homo sapiens,11.981591,T369;T371,MGEAPSPAPALWDWDYLDRCFARHRVCISFGLWICASSCWIAAHAL...
3401,Q92993,Homo sapiens,13.761853,S119 (Q92993-3),MAEVGEIIEGCRLPVLRRNQDNEDEWPLAEILSVKDISGRKLFYVH...
4049,Q9NSY1,Homo sapiens,13.155974,S367,MKKFSRMPKSEGGSGGGAAGGGAGGAGAGAGCGSGGSSVGVRVFAV...
3714,Q9BVC6,Homo sapiens,7.447188,T55,MAASSISSPWGKHVFKAILMVLVALILLHSALAQSRRDFAPPGQQK...
3333,Q91Z67,Mus musculus,20.655088,S990,MTSPAKFKKDKEIIAEYDTQVKEIRAQLTEQMKCLDQQCELRVQLL...
1389,P51957,Homo sapiens,13.756163,S766,MPLAAYCYLRVVGKGSYGEVTLVKHRRDGKQYVIKKLNLRNASSRE...
1922,Q14584,Homo sapiens,6.005927,S208;S308,MLENYKNLATVGYQLFKPSLISWLEQEESRTVQRGDFQASEWKVQL...
3352,Q92542,Homo sapiens,13.163917,S419;S437;S445;T505;S708,MATAGGGSGADPGSRGLLRLLSFCVLLAGLCRGNSVERKIYIPLNK...


In [26]:
def get_int(list_site_str):
    """Convert site labels such as ``'S489'`` into integer positions.

    Each label is one residue letter (e.g. S or T) followed by the position,
    optionally followed by a space-separated annotation, as in
    ``'S119 (Q92993-3)'``. Only the leading residue+position field is parsed,
    whatever the total length of the label.

    Parameters
    ----------
    list_site_str : iterable of str
        Site labels, typically the result of splitting a cell on ``';'``.

    Returns
    -------
    list of int
        Positions in input order. Empty or whitespace-only labels (e.g. from
        a trailing ``';'``) are skipped.

    Raises
    ------
    ValueError
        If the text after the residue letter is not an integer.
    """
    positions = []
    for label in list_site_str:
        label = label.strip()
        if not label:
            continue
        # First whitespace-delimited field is the residue letter plus position;
        # anything after it is an annotation.
        # NOTE(review): the annotation looks like an isoform accession, so such
        # positions may refer to the isoform rather than the stored sequence -
        # confirm before indexing into `sequence`.
        residue_field = label.split()[0]
        positions.append(int(residue_field[1:]))
    return positions
    
# Build a cleaned copy of the mammalian table:
#   * 'oglcnac sites': 'S489;T821' -> [489, 821] (residue letters dropped)
#   * 'sequence': every space character removed
mammalian_int = mammalian_data.assign(**{
    'oglcnac sites': mammalian_data['oglcnac sites'].map(lambda sites: get_int(sites.split(';'))),
    'sequence': mammalian_data['sequence'].map(lambda seq: seq.replace(' ', '')),
})
mammalian_int.sample(10, random_state=0)

Unnamed: 0,UniprotKB ID,organism,oglcnacscore,oglcnac sites,sequence
1412,P53621,Homo sapiens,22.64697,"[489, 821]",MLTKFETKSARVKGLSFHPKRPWILTSLHNGVIQLWDYRMCTLIDK...
1496,P61019,Homo sapiens,11.106489,[121],MAYAYLFKYIIIGDTGVGKSCLLLQFTDKRFQPVHDLTIGVEFGAR...
2091,Q2T9K0,Homo sapiens,11.981591,"[369, 371]",MGEAPSPAPALWDWDYLDRCFARHRVCISFGLWICASSCWIAAHAL...
3401,Q92993,Homo sapiens,13.761853,[119],MAEVGEIIEGCRLPVLRRNQDNEDEWPLAEILSVKDISGRKLFYVH...
4049,Q9NSY1,Homo sapiens,13.155974,[367],MKKFSRMPKSEGGSGGGAAGGGAGGAGAGAGCGSGGSSVGVRVFAV...
3714,Q9BVC6,Homo sapiens,7.447188,[55],MAASSISSPWGKHVFKAILMVLVALILLHSALAQSRRDFAPPGQQK...
3333,Q91Z67,Mus musculus,20.655088,[990],MTSPAKFKKDKEIIAEYDTQVKEIRAQLTEQMKCLDQQCELRVQLL...
1389,P51957,Homo sapiens,13.756163,[766],MPLAAYCYLRVVGKGSYGEVTLVKHRRDGKQYVIKKLNLRNASSRE...
1922,Q14584,Homo sapiens,6.005927,"[208, 308]",MLENYKNLATVGYQLFKPSLISWLEQEESRTVQRGDFQASEWKVQL...
3352,Q92542,Homo sapiens,13.163917,"[419, 437, 445, 505, 708]",MATAGGGSGADPGSRGLLRLLSFCVLLAGLCRGNSVERKIYIPLNK...


## 

## save dataset

In [27]:
# Persist the cleaned table in two formats under ./data/.
# CSV is plain text, so the list-valued 'oglcnac sites' column is written as
# its string form (e.g. "[489, 821]") and comes back as text unless parsed on load.
mammalian_int.to_csv('./data/oglcnacome_sites.csv')
# The pickle stores the Python objects as-is, so the lists survive a round trip.
mammalian_int.to_pickle('./data/oglcnacome_sites.pkl')

In [35]:
# Reload the CSV export. CSV stores the list-valued 'oglcnac sites' column as
# text such as "[489, 821]", so parse it back into real lists of ints on read;
# otherwise the column silently holds strings and differs from the pickle reload.
data_load = pd.read_csv(
    './data/oglcnacome_sites.csv',
    index_col=0,
    converters={'oglcnac sites': ast.literal_eval},
)
data_load.sample(10, random_state=0)

Unnamed: 0,UniprotKB ID,organism,oglcnacscore,oglcnac sites,sequence
1412,P53621,Homo sapiens,22.64697,"[489, 821]",MLTKFETKSARVKGLSFHPKRPWILTSLHNGVIQLWDYRMCTLIDK...
1496,P61019,Homo sapiens,11.106489,[121],MAYAYLFKYIIIGDTGVGKSCLLLQFTDKRFQPVHDLTIGVEFGAR...
2091,Q2T9K0,Homo sapiens,11.981591,"[369, 371]",MGEAPSPAPALWDWDYLDRCFARHRVCISFGLWICASSCWIAAHAL...
3401,Q92993,Homo sapiens,13.761853,[119],MAEVGEIIEGCRLPVLRRNQDNEDEWPLAEILSVKDISGRKLFYVH...
4049,Q9NSY1,Homo sapiens,13.155974,[367],MKKFSRMPKSEGGSGGGAAGGGAGGAGAGAGCGSGGSSVGVRVFAV...
3714,Q9BVC6,Homo sapiens,7.447188,[55],MAASSISSPWGKHVFKAILMVLVALILLHSALAQSRRDFAPPGQQK...
3333,Q91Z67,Mus musculus,20.655088,[990],MTSPAKFKKDKEIIAEYDTQVKEIRAQLTEQMKCLDQQCELRVQLL...
1389,P51957,Homo sapiens,13.756163,[766],MPLAAYCYLRVVGKGSYGEVTLVKHRRDGKQYVIKKLNLRNASSRE...
1922,Q14584,Homo sapiens,6.005927,"[208, 308]",MLENYKNLATVGYQLFKPSLISWLEQEESRTVQRGDFQASEWKVQL...
3352,Q92542,Homo sapiens,13.163917,"[419, 437, 445, 505, 708]",MATAGGGSGADPGSRGLLRLLSFCVLLAGLCRGNSVERKIYIPLNK...


In [34]:
# Reload the pickled table; this rebinds `data_load`, replacing the frame read from the CSV.
# NOTE: unpickling can execute arbitrary code - only load pickle files you created yourself.
data_load = pd.read_pickle('./data/oglcnacome_sites.pkl')
data_load.sample(10, random_state=0)

Unnamed: 0,UniprotKB ID,organism,oglcnacscore,oglcnac sites,sequence
1412,P53621,Homo sapiens,22.64697,"[489, 821]",MLTKFETKSARVKGLSFHPKRPWILTSLHNGVIQLWDYRMCTLIDK...
1496,P61019,Homo sapiens,11.106489,[121],MAYAYLFKYIIIGDTGVGKSCLLLQFTDKRFQPVHDLTIGVEFGAR...
2091,Q2T9K0,Homo sapiens,11.981591,"[369, 371]",MGEAPSPAPALWDWDYLDRCFARHRVCISFGLWICASSCWIAAHAL...
3401,Q92993,Homo sapiens,13.761853,[119],MAEVGEIIEGCRLPVLRRNQDNEDEWPLAEILSVKDISGRKLFYVH...
4049,Q9NSY1,Homo sapiens,13.155974,[367],MKKFSRMPKSEGGSGGGAAGGGAGGAGAGAGCGSGGSSVGVRVFAV...
3714,Q9BVC6,Homo sapiens,7.447188,[55],MAASSISSPWGKHVFKAILMVLVALILLHSALAQSRRDFAPPGQQK...
3333,Q91Z67,Mus musculus,20.655088,[990],MTSPAKFKKDKEIIAEYDTQVKEIRAQLTEQMKCLDQQCELRVQLL...
1389,P51957,Homo sapiens,13.756163,[766],MPLAAYCYLRVVGKGSYGEVTLVKHRRDGKQYVIKKLNLRNASSRE...
1922,Q14584,Homo sapiens,6.005927,"[208, 308]",MLENYKNLATVGYQLFKPSLISWLEQEESRTVQRGDFQASEWKVQL...
3352,Q92542,Homo sapiens,13.163917,"[419, 437, 445, 505, 708]",MATAGGGSGADPGSRGLLRLLSFCVLLAGLCRGNSVERKIYIPLNK...
