In [1]:
import pandas as pd
pd.set_option('display.max_rows', 200)
import numpy as np

import time
import random

from os import getcwd 
from os.path import exists

version = 'v4'
update = False

getcwd() # current working directory

'D:\\project\\MIT_glyco'

In [2]:
### data processing ###
### grouping the positive sites with same protein names ###

# load O-GlcNAcylated site data
columns = ['protein', 'positive_site', 'sequence']
mauli_sites = pd.read_csv("./OGP-master/all_sites.csv", 
                          header=None, names=['protein', 'positive_site', 'sequence'])
protein_list = list(mauli_sites.protein.unique()) # all kinds of proteins in Mauli's dataset
display(mauli_sites)

Unnamed: 0,protein,positive_site,sequence
0,A2ABU4,430,MTLPHSPGSAGEPQASQTVQVHRLEHRQEEEQKEERQHSLQMGSSV...
1,A2AGT5,568,MGDDSEWLKLPVDQKCEHKLWKARLSGYEEALKIFQKIKDEKSPEW...
2,A2AHJ4,264,MAAAPTQIEAELYYLIARFLQSGPCNKSAQVLVQELEEHQLIPRRL...
3,A2AHJ4,279,MAAAPTQIEAELYYLIARFLQSGPCNKSAQVLVQELEEHQLIPRRL...
4,A2AKB9,87,MFPFGPHSPGGDETAGAEEPPPLGGPAAASRPPSPAPRPASPQRGA...
...,...,...,...
531,Q9UPN6,614,MEAVKTFNSELYSLNDYKPPISKAKMTQITKAAIKAIKFYKHVVQS...
532,Q9UQ35,2236,MYNGIGLPTPRGSGTNGYVQRNLSLVRGRRGERPDYKGEEELRRLE...
533,Q9Y2X9,891,MKIGSGFLSGGGGTGSSGGSGSGGGGSGGGGGGGSSGRRAEMEPTF...
534,Q9Y520,2693,MSEKSGQSTKAKDGKKYATLSLFNTYKGKSLETQKTTARHGLQSLG...


In [3]:
##########
save_name = f'{version}_positive_sites.csv'
new_columns = ['protein', 'positive_sites', 'sequence']
group_sites = pd.DataFrame([], columns=new_columns)
##########
if not exists(save_name) or update:
    for name in protein_list:
        protein = mauli_sites[mauli_sites['protein']==name]
        sites = [x for x in protein.positive_site.values]
        sequence = protein.sequence.iloc[0]
        temp_df = pd.DataFrame([[name, sites, sequence]], columns = new_columns)
        group_sites = pd.concat([group_sites, temp_df], axis=0)
        
    group_sites = group_sites.reset_index(drop=True)
    group_sites.to_csv(save_name, index=False)
else:
    group_sites = pd.read_csv(save_name)
    
display(group_sites)

Unnamed: 0,protein,positive_sites,sequence
0,A2ABU4,[430],MTLPHSPGSAGEPQASQTVQVHRLEHRQEEEQKEERQHSLQMGSSV...
1,A2AGT5,[568],MGDDSEWLKLPVDQKCEHKLWKARLSGYEEALKIFQKIKDEKSPEW...
2,A2AHJ4,"[264, 279]",MAAAPTQIEAELYYLIARFLQSGPCNKSAQVLVQELEEHQLIPRRL...
3,A2AKB9,"[87, 99, 100, 364]",MFPFGPHSPGGDETAGAEEPPPLGGPAAASRPPSPAPRPASPQRGA...
4,E9Q1P8,[208],MAAAVAVAAASRRQSCYLCDLPRMPWAMIWDFTEPVCRGCVNYEGA...
...,...,...,...
270,Q9P2N5,[738],MLIEDVDALKSWLAKLLEPICDADPSALANYVVALVKKDKPEKELK...
271,Q9UPN6,"[615, 614]",MEAVKTFNSELYSLNDYKPPISKAKMTQITKAAIKAIKFYKHVVQS...
272,Q9UQ35,[2236],MYNGIGLPTPRGSGTNGYVQRNLSLVRGRRGERPDYKGEEELRRLE...
273,Q9Y2X9,[891],MKIGSGFLSGGGGTGSSGGSGSGGGGSGGGGGGGSSGRRAEMEPTF...


In [4]:
### data processing ###
### Integrate separated Mauli's secondary structure data into a data sheet ###
flex_path = "./OGP-master/dynamine_results" # directory of flexibility results
angle_path = "./OGP-master/spider3_results" # directory of angle related information

pass_list = ["P24622_2", "Q91YE8_2"] #these proteins have positive sites which are out of bound
for x in pass_list:
    protein_list.remove(x) 
    
save_path = './protein_secondary'
for name in protein_list:
    save_name = f"{save_path}/{version}_secondary_{name}.csv"
    if not exists(save_name) or update:
        # load flexibility data
        flex_data = pd.read_fwf(f"{flex_path}/{name}_backbone.pred", header=None)
        flex_data.columns = ["values"]
        flex_data = flex_data[11:].reset_index(drop=True)
        flex_data['SEQ'] = flex_data['values'].apply(lambda x: x.split()[0])
        flex_data['flexibility'] = flex_data['values'].apply(lambda x: x.split()[1])
        flex_data = flex_data[['SEQ', 'flexibility']]
        
        # load angle data
        angle_data = pd.read_csv(f"{angle_path}/{name}.spd33")
        # split values from raw angle data
        angle_columns = angle_data.columns[0].split()
        angle_data.columns = ["values"]
        for c, column in enumerate(angle_columns):
            angle_data[column] = angle_data['values'].apply(lambda x: x.split()[c])
            
        # concatenate and add a feature of positivie sites 
        sencondary_data = pd.concat([angle_data.iloc[:,1:], flex_data.iloc[:,1:]], axis=1)
        sencondary_data.index.name = name
        
        # save the dataframe
        sencondary_data.to_csv(save_name, index=True)
        
    else:
        sencondary_data = pd.read_csv(save_name)
        
display(sencondary_data)

Unnamed: 0,Q9Y520,#,SEQ,SS,ASA,Phi,Psi,Theta(i-1=>i+1),Tau(i-2=>i+2),HSE_alpha_up,HSE_alpha_down,P(C),P(H),P(E),flexibility
0,0,1,M,C,155.0,-95.8,133.1,115.4,165.4,3.1,3.3,0.988,0.007,0.005,0.422
1,1,2,S,C,103.7,-97.1,140.5,116.6,-157.5,1.4,3.8,0.952,0.018,0.029,0.402
2,2,3,E,C,151.2,-89.7,125.5,111.2,-135.5,2.9,7.7,0.918,0.031,0.051,0.414
3,3,4,K,C,151.6,-89.7,115.7,109.8,-160.5,3.5,6.9,0.915,0.038,0.047,0.434
4,4,5,S,C,92.5,-87.4,115.3,109.1,-128.5,2.4,7.0,0.939,0.034,0.028,0.453
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2891,2891,2892,E,C,110.1,-95.9,129.5,114.1,-151.5,8.1,12.4,0.686,0.253,0.062,0.580
2892,2892,2893,E,C,129.9,-89.6,124.2,111.0,-140.1,5.4,11.0,0.713,0.227,0.061,0.540
2893,2893,2894,T,C,106.7,-91.4,100.3,109.1,-163.0,3.1,9.1,0.753,0.168,0.080,0.473
2894,2894,2895,K,C,160.4,-89.3,66.8,107.1,136.5,2.7,7.7,0.902,0.057,0.042,0.420


In [5]:
### This part generates sequential data within the fixed-length window ###
'''
Features used in Mauli's model
1) Side chain length: 0, 1, 2, 3, 4, 5, 6 or 7 
where 0 is No Residue, 1 is Glycine, 2 Very Small, 3 Small, 4 Normal, 5 Long, 6 Cycle and 7 Proline from positions −1 to +5
2) Non-polar aliphatic amino acids from positions −3 to −1: 0, 1, 2 or 3
3) Polar positively charged residues from positions −7 to −5: 0, 1, 2 or 3
4) Number of serines and threonines in the -/+10 residue window
5) Flexibility: continuous value from 0 to 1 where 0 is flexible and 1 rigid
6) Secondary structure: 0, 1 or 2 where 0 is not structured, 1 is alpha helix and 2 is beta strand
7) Presence of a proline in +1: 0 or 1 (no or yes)
8) Secondary structure according to phi and psi angles (0 other, 1 beta or 2 alpha)
9) Nature of the site: 0 or 1 where 0 is serine and 1 threonine
'''

amino_acid = {"A":1, "R":2, "N":3, "D":4, "C":5, 
              "E":6, "Q":7, "G":8, "H":9, "I":10, 
              "L":11, "K":12, "M":13, "F":14, "P":15, 
              "S":16, "T":17, "W":18, "Y":19, "V":20}
'''
1)  Alanine (Ala, A)
2)  Arginine (Arg, R)
3)  Asparagine (Asn, N)
4)  Aspartic acid (Asp, D)
5)  Cysteine (Cys, C)
6)  Glutamic acid (Glu, E)
7)  Glutamine (Gln, Q)
8)  Glycine (Gly, G)
9)  Histidine (His, H)
10) Isoleucine (Ile, I)
11) Leucine (Leu, L)
12) Lysine (Lys, K)
13) Methionine (Met, M)
14) Phenylalanine (Phe, F)
15) Proline (Pro, P)
16) Serine (Ser, S)
17) Threonine (Thr, T)
18) Tryptophan (Trp, W)
19) Tyrosine (Tyr, Y)
20) Valine (Val, V)
'''

# def letter_to_token(letter):
#     if letter in dictionary.keys():
#         return dictionary[letter]
#     else:
#         return 0

def make_window(protein_ss, index, start=-10, end=10, marking=False):
    start_index = min(max(index+start, 0), len(protein_ss))
    end_index   = max(min(index+end+1, len(protein_ss)), 0)
#     if marking:
#         sequence = protein_ss['SEQ'].iloc[window_start:window_end].copy()
#         sequence.iloc[index-window_start] = f'"{sequence.iloc[index-window_start]}"'
#         sequence = sequence.sum()        
#     else:
#         sequence = protein_ss['SEQ'].iloc[window_start:window_end].sum()

    window = protein_ss['SEQ'].iloc[start_index:end_index].sum()
    return window

def side_chain(letter):
    if (letter == "G"): #1
        return 'gly' 
    elif (letter == "V" or letter == "A"): #2 Asn, Gln
        return 'very_small'
    elif (letter == "S" or letter == "I" or letter == "L" or letter == "T" or letter == "C"): #3 Ser, Thr, Ile, Leu, Cys
        return 'small'
    elif (letter == "D" or letter == "E" or letter == "N" or letter == "Q" or letter == "M"): #4 Asp, Asn, Glu, Gln, Met
        return 'normal'
    elif (letter == "R" or letter == "K"): #5 Arg, Lys
        return 'long'
    elif (letter == "F" or letter == "W" or letter == "Y" or letter == "H"): #6 Phe, Trp, Tyr, His
        return 'cycle'
    elif (letter == "P"): #7
        return 'pro'
    else:
        return 'None' #0
    
def nonpolar_aliphatic(protein_ss, index, start=-3, end=-1): # Non-polar aliphatic AA from -3 ro -1(Ala, Val, Leu, Ile, Pro)
    window = make_window(protein_ss, index, start, end)
    if window:
        nA = window.count("A")
        nV = window.count("V")
        nL = window.count("L")
        nI = window.count("I")
        nP = window.count("P")
        return nA + nV + nL + nI + nP
    else:
        return 0
    

def positively_charged(protein_ss, index, start=-7, end=-5): # count the number of positively charged AA from -7 to -5 (Ard, Lys, His)
    window = make_window(protein_ss, index, start, end)
    if window:
        nR = window.count("R")
        nK = window.count("K")
        nH = window.count("H")
        return nR + nK + nH
    else:
        return 0
    
def S_and_T(protein_ss, index, start=-10, end=10): # Number of serines and threonines in the -/+10 residue window
    window = make_window(protein_ss, index, start, end)
    if window:
        nS = window.count("S")
        nT = window.count("T")
        return nS + nT

    else:
        return 0
    
def is_proline_after(protein_ss, index, after=1): # check whether there is a proline after the site
    if index+after >=0 and index+after <= len(protein_ss)-1:
        return int(protein_ss['SEQ'].iloc[index+after] == 'P')
    else:
        return int(False)

def phi_psi(protein_ss, index):
    phi, psi = protein_ss[['Phi','Psi']].iloc[index]
    if phi > -160 and phi < -50:
        if psi > 100 and psi < 180:
            return "alpha"
        elif psi > -60 and psi < 20:
            return "beta"
        else:
            return "other"
    else:
        return "other"

class Protein():
    def __init__(self, protein_ss):
        self.protein_ss = protein_ss.copy()
        self.protein_len = len(protein_ss)

In [6]:
### data processing ###
### Make dataset for machine learning model ###
### execution time: 1m 57s ###

load_path = './protein_secondary'
save_path = './protein_dataset'

for name in protein_list:
    load_name = f"{load_path}/{version}_secondary_{name}.csv"
    save_name = f"{save_path}/{version}_dataset_{name}.csv"
    if not exists(save_name) or update:
        machine_dataset = pd.read_csv(load_name, index_col=0)
        
        # determine side chain
        side_window = [-1, 5]
        for side in range(side_window[0], side_window[1]+1):
            if side < 0:
                machine_dataset[f"side_{side}"] = pd.concat([pd.Series(np.zeros(abs(side))), machine_dataset.SEQ.iloc[:side]]).apply(side_chain).reset_index(drop=True)
            elif side > 0:
                machine_dataset[f"side_{side}"] = pd.concat([machine_dataset.SEQ.iloc[side:], pd.Series(np.zeros(abs(side)))]).apply(side_chain).reset_index(drop=True)
            else:
                pass
            
        # count non-polar aliphatic amino residues
        machine_dataset['nAli'] = machine_dataset['#'].apply(lambda x: nonpolar_aliphatic(machine_dataset, x-1, start=-3, end=-1))
        
        # count positively charged residues
        machine_dataset['nPos'] = machine_dataset['#'].apply(lambda x: positively_charged(machine_dataset, x-1, start=-7, end=-5))
        
        # count serine or threonine
        machine_dataset['nS/nT'] = machine_dataset['#'].apply(lambda x: S_and_T(machine_dataset, x-1, start=-10, end=10))
        
        # determine presence of proline after +1 position
        machine_dataset['Proline'] = machine_dataset['#'].apply(lambda x: is_proline_after(machine_dataset, x-1, after=1))
        
        # determine secondary structure using Phi-Psi
        machine_dataset['phi_psi'] = machine_dataset['#'].apply(lambda x: phi_psi(machine_dataset, x-1))
        
        # positivity
        machine_dataset['positivity'] = 0
        sites = pd.read_csv(f"{version}_positive_sites.csv")
        sites = sites[sites['protein']==name].positive_sites.iloc[0]
        sites_index = [int(x)-1 for x in sites[1:-1].split(', ')]
        machine_dataset.loc[sites_index, "positivity"] = 1
        
        # save file
        machine_dataset.to_csv(save_name, index=True)
        
    else:
        machine_dataset = pd.read_csv(save_name, index_col=0)
        
display(machine_dataset)

Unnamed: 0_level_0,#,SEQ,SS,ASA,Phi,Psi,Theta(i-1=>i+1),Tau(i-2=>i+2),HSE_alpha_up,HSE_alpha_down,...,side_2,side_3,side_4,side_5,nAli,nPos,nS/nT,Proline,phi_psi,positivity
Q9Y520,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,M,C,155.0,-95.8,133.1,115.4,165.4,3.1,3.3,...,normal,long,small,gly,0,0,4,0,alpha,0
1,2,S,C,103.7,-97.1,140.5,116.6,-157.5,1.4,3.8,...,long,small,gly,normal,0,0,4,0,alpha,0
2,3,E,C,151.2,-89.7,125.5,111.2,-135.5,2.9,7.7,...,small,gly,normal,small,0,0,4,0,alpha,0
3,4,K,C,151.6,-89.7,115.7,109.8,-160.5,3.5,6.9,...,gly,normal,small,small,0,0,4,0,alpha,0
4,5,S,C,92.5,-87.4,115.3,109.1,-128.5,2.4,7.0,...,normal,small,small,long,0,0,4,0,alpha,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2891,2892,E,C,110.1,-95.9,129.5,114.1,-151.5,8.1,12.4,...,small,long,small,,1,1,3,0,alpha,0
2892,2893,E,C,129.9,-89.6,124.2,111.0,-140.1,5.4,11.0,...,long,small,,,0,0,3,0,alpha,0
2893,2894,T,C,106.7,-91.4,100.3,109.1,-163.0,3.1,9.1,...,small,,,,0,0,3,0,alpha,0
2894,2895,K,C,160.4,-89.3,66.8,107.1,136.5,2.7,7.7,...,,,,,0,1,3,0,other,0


In [7]:
load_path = './protein_dataset'
save_name = f"{version}_data_all_sites.csv"

if not exists(save_name) or update:
    for i, name in enumerate(protein_list):
        load_name = f"{load_path}/{version}_dataset_{name}.csv"
        temp = pd.read_csv(load_name, index_col=0)
        temp['protein'] = temp.index.name

        if i==0:
            dataset = temp
        else:
            dataset = pd.concat([dataset, temp], axis=0)

    dataset = dataset.reset_index(drop=True)
    dataset.to_csv(save_name, index=False)
else:
    dataset = pd.read_csv(save_name)
    
display(dataset)

Unnamed: 0,#,SEQ,SS,ASA,Phi,Psi,Theta(i-1=>i+1),Tau(i-2=>i+2),HSE_alpha_up,HSE_alpha_down,...,side_3,side_4,side_5,nAli,nPos,nS/nT,Proline,phi_psi,positivity,protein
0,1,M,C,112.7,-100.9,139.3,119.5,165.0,8.5,9.1,...,pro,cycle,small,0,0,3,0,alpha,0,A2ABU4
1,2,T,C,103.3,-102.0,132.1,117.6,-150.0,3.8,13.9,...,cycle,small,pro,0,0,3,0,alpha,0,A2ABU4
2,3,L,C,50.9,-97.8,134.7,118.5,-149.2,16.4,11.6,...,small,pro,gly,0,0,3,1,alpha,0,A2ABU4
3,4,P,C,77.2,-69.2,144.0,111.0,-105.3,7.5,16.7,...,pro,gly,small,1,0,3,0,alpha,0,A2ABU4
4,5,H,C,80.3,-95.4,141.5,118.6,-135.6,13.3,13.3,...,gly,small,very_small,2,0,3,0,alpha,0,A2ABU4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
258648,2892,E,C,110.1,-95.9,129.5,114.1,-151.5,8.1,12.4,...,long,small,,1,1,3,0,alpha,0,Q9Y520
258649,2893,E,C,129.9,-89.6,124.2,111.0,-140.1,5.4,11.0,...,small,,,0,0,3,0,alpha,0,Q9Y520
258650,2894,T,C,106.7,-91.4,100.3,109.1,-163.0,3.1,9.1,...,,,,0,0,3,0,alpha,0,Q9Y520
258651,2895,K,C,160.4,-89.3,66.8,107.1,136.5,2.7,7.7,...,,,,0,1,3,0,other,0,Q9Y520


In [8]:
save_name = f"{version}_data_ST_sites.csv"
if not exists(save_name) or update:
    data_for_ml = dataset[(dataset['SEQ']=='S') | (dataset['SEQ']=='T')].reset_index(drop=True)
    data_for_ml.to_csv(save_name, index=False)
else:
    data_for_ml = pd.read_csv(save_name)

display(data_for_ml)

Unnamed: 0,#,SEQ,SS,ASA,Phi,Psi,Theta(i-1=>i+1),Tau(i-2=>i+2),HSE_alpha_up,HSE_alpha_down,...,side_3,side_4,side_5,nAli,nPos,nS/nT,Proline,phi_psi,positivity,protein
0,2,T,C,103.3,-102.0,132.1,117.6,-150.0,3.8,13.9,...,cycle,small,pro,0,0,3,0,alpha,0,A2ABU4
1,6,S,C,60.0,-87.4,138.5,115.2,-125.9,7.8,16.7,...,small,very_small,gly,2,0,4,1,alpha,0,A2ABU4
2,9,S,C,56.1,-89.9,142.4,116.8,121.2,8.2,13.9,...,normal,pro,normal,1,0,5,0,alpha,0,A2ABU4
3,16,S,C,75.5,-82.7,22.5,104.9,-107.4,5.9,14.2,...,very_small,normal,very_small,2,0,4,0,other,0,A2ABU4
4,18,T,C,78.2,-96.3,112.1,112.0,84.6,5.8,13.7,...,very_small,cycle,long,1,0,3,0,alpha,0,A2ABU4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41427,2876,T,C,76.2,-95.6,138.6,117.8,-135.9,8.8,13.9,...,very_small,long,small,2,0,3,0,alpha,0,Q9Y520
41428,2881,T,C,58.2,-99.5,90.7,111.0,-161.6,11.6,16.2,...,small,long,pro,2,0,4,0,other,0,Q9Y520
41429,2891,T,C,80.3,-102.2,131.1,116.5,-164.4,7.0,14.3,...,small,long,small,2,1,4,0,alpha,0,Q9Y520
41430,2894,T,C,106.7,-91.4,100.3,109.1,-163.0,3.1,9.1,...,,,,0,0,3,0,alpha,0,Q9Y520


In [9]:
match = [exists(f"./protein_charge/{x}") for x in protein_list]
dict_match = dict(zip(protein_list, match))
display(dict_match)

{'A2ABU4': True,
 'A2AGT5': False,
 'A2AHJ4': True,
 'A2AKB9': True,
 'E9Q1P8': True,
 'E9Q5G3': True,
 'F6T0L5': False,
 'F6TYF8': False,
 'O35161': False,
 'O70263': True,
 'O70400': True,
 'P0CG49': False,
 'P14602': True,
 'P20152': True,
 'P23927': True,
 'P25446': True,
 'P26039': True,
 'P31001': True,
 'P31230': True,
 'P35279': False,
 'P42128': True,
 'P47746': True,
 'P48614_2': False,
 'P48678': True,
 'P53668': True,
 'P59759': True,
 'P60710': False,
 'P63017': True,
 'P63248': False,
 'P68033': True,
 'P70402': True,
 'P70670': True,
 'P83741': True,
 'P97306': True,
 'P97326': True,
 'P97379': True,
 'P97855': True,
 'Q01705_2': False,
 'Q03173_4': False,
 'Q04690_4': False,
 'Q3TN34': True,
 'Q3UCQ1': True,
 'Q3USJ8': True,
 'Q4U2R1_2': False,
 'Q505D9': True,
 'Q571K4': True,
 'Q5BJ29': True,
 'Q5DTJ9': True,
 'Q5DU31': True,
 'Q62381_2': False,
 'Q62417_5': False,
 'Q62418': True,
 'Q62419': True,
 'Q62523': True,
 'Q66L42': True,
 'Q69ZI1_3': False,
 'Q6GQX2': True,

In [10]:
import os
charge_folders = os.listdir('./protein_charge')
protein_match = [name for name in protein_list if exists(f"./protein_charge/{name}")]
for name in protein_match:
    charge_folders.remove(name)
    
print("These proteins do not match with their names")
display(charge_folders)

These proteins do not match with their names


['A0A024RAY2',
 'features-all-names.csv',
 'P0CG62',
 'P63249',
 'P68406',
 'Q4R561_P60710',
 'Q62381',
 'Q69ZI1',
 'Q80TI1',
 'Q80TR8',
 'Q80YE7',
 'Q8BXL9',
 'Q91YE8',
 'Q9WVB1']

In [13]:
load_folder = f"./protein_charge"
charge_columns = list(pd.read_csv(f'./protein_charge/features-all-names.csv', header=None).loc[0].values)

save_name = f"{version}_augmented_dataset_all.pkl"
if not exists(save_name):
    for i, name in enumerate(protein_match):
        load_name = f"{load_folder}/{name}/all_AA/combined-output-all-residues-features.csv"
        charge_data = pd.read_csv(load_name, sep=' ', header=None)
        charge_data.columns = ["#"]+charge_columns[1:-2]
        charge_data = charge_data.sort_values(by='#').reset_index(drop=True)

        data = dataset[dataset['protein']==name].reset_index(drop=True)
        aug_data = pd.concat([data, charge_data], axis=1)

        if i==0:
            aug_dataset = aug_data
        else:
            aug_dataset = pd.concat([aug_dataset, aug_data], axis=0)
            
    aug_dataset = aug_dataset.reset_index(drop=True)
    aug_dataset.to_pickle(save_name)
    
else:
    aug_dataset = pd.read_pickle(save_name)
    
display(aug_dataset)

Unnamed: 0,#,SEQ,SS,ASA,Phi,Psi,Theta(i-1=>i+1),Tau(i-2=>i+2),HSE_alpha_up,HSE_alpha_down,...,exposed_positive_charge_all_with_around_target_ser_thr,exposed_positive_charge_all_backbone_around_target_ser_thr,exposed_positive_charge_all_sidechain_around_target_ser_thr,exposed_positive_charge_all_backbone_with_around_target_ser_thr,exposed_positive_charge_all_sidechain_with_around_target_ser_thr,exposed_negative_charge_all_around_target_ser_thr,exposed_negative_charge_all_with_around_target_ser_thr,exposed_negative_charge_all_backbone_around_target_ser_thr,exposed_negative_charge_all_sidechain_around_target_ser_thr,exposed_negative_charge_all_backbone_with_around_target_ser_thr
0,1,M,C,112.7,-100.9,139.3,119.5,165.0,8.5,9.1,...,1.04,1.96,1.55,2.59,-4.08,-5.38,-2.25,-1.83,-3.06,-2.32
1,2,T,C,103.3,-102.0,132.1,117.6,-150.0,3.8,13.9,...,1.62,3.43,1.62,4.22,-5.52,-7.43,-3.06,-2.46,-4.04,-3.39
2,3,L,C,50.9,-97.8,134.7,118.5,-149.2,16.4,11.6,...,1.62,4.42,2.13,5.23,-7.33,-8.85,-4.04,-3.29,-5.02,-3.83
3,4,P,C,77.2,-69.2,144.0,111.0,-105.3,7.5,16.7,...,2.13,4.78,2.66,5.23,-8.28,-9.44,-4.72,-3.56,-5.52,-3.92
4,5,H,C,80.3,-95.4,141.5,118.6,-135.6,13.3,13.3,...,2.08,2.73,2.66,4.80,-6.48,-8.53,-4.56,-1.92,-5.54,-2.99
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
66679,729,A,H,29.7,-73.2,-30.4,98.8,56.7,15.8,16.9,...,4.42,8.10,4.93,8.37,-14.84,-16.09,-8.30,-6.54,-9.28,-6.81
66680,730,K,H,102.8,-72.5,-24.6,97.3,56.5,9.2,17.4,...,3.91,6.35,4.42,8.19,-16.23,-17.40,-9.32,-6.91,-9.83,-7.57
66681,731,Q,C,91.3,-85.0,-2.2,100.0,55.6,9.1,16.1,...,3.40,5.34,3.91,6.78,-10.20,-11.88,-6.81,-3.39,-7.32,-4.56
66682,732,G,C,50.1,79.5,4.9,101.0,-84.7,4.8,17.1,...,2.89,5.26,3.40,5.35,-9.63,-10.63,-5.34,-4.29,-6.34,-4.29


In [None]:
for i, name in enumerate(protein_match):
    load_name = f"{load_folder}/{name}/all_AA/combined-output-all-residues-features.csv"
    charge_data = pd.read_csv(load_name, sep=' ', header=None)
    charge_data.columns = ["#"]+charge_columns[1:-2]
    charge_data = charge_data.sort_values(by='#')
    
    data = dataset[dataset['protein']==name].reset_index(drop=True)
    
    print(name)
    print("augmented:", charge_data.isnull().sum().sum())
    print("original :", data.isnull().sum().sum())
    print()

In [None]:
name = 'A2ABU4'
load_name = f"{load_folder}/{name}/all_AA/combined-output-all-residues-features.csv"
charge_data = pd.read_csv(load_name, sep=' ', header=None)
charge_data.columns = ["#"]+charge_columns[1:-2]
charge_data = charge_data.sort_values(by='#').reset_index(drop=True)
    
charge_data[-200:]