In [4]:
import os

data_identity30 = "data/identity30.txt"

if not os.path.exists(data_identity30):
    
    !wget -O $data_identity30 http://dunbrack.fccc.edu/Guoli/users_html/cullpdb_pc25_res3.0_R0.3_d181017_chains13327.20909

In [5]:
!head --lines 10 $data_identity30

IDs         length   Exptl.  resolution  R-factor FreeRvalue
7ODCA       424      XRAY        1.600    0.20    0.23
3SWHA       341      XRAY        2.650    0.25    0.30
2AXPA       173      XRAY        2.500    0.26    0.29
4EIUA       249      XRAY        1.900    0.18    0.21
5LWAA       159      XRAY        1.650    0.17    0.20
5OF3B       307      XRAY        2.910    0.23    0.27
3T47A        81      XRAY        1.300    0.16    0.19
2NUTC       196      XRAY        2.300    0.22    0.27
3RC3A       677      XRAY        2.080    0.16    0.20


In [22]:
def filter_files(path_file):
    """Build the list of chain CSV names referenced by a culled chain list.

    The file is expected to start with one header line, followed by one entry
    per line whose first whitespace-separated column is a five character ID:
    a four character PDB code immediately followed by a one character chain
    (e.g. ``7ODCA``).

    Parameters
    ----------
    path_file : str
        Path of the chain list to read.

    Returns
    -------
    list of str
        One ``"<PDB>_dihedrals_chain_<CHAIN>.csv"`` name per accepted entry,
        in file order.
    """
    names_filter = []

    with open(path_file, "r") as freader:

        # The first line is the column header, not an entry.
        next(freader, None)

        for line in freader:

            data = line.split()

            # Blank lines (e.g. a trailing newline) carry no ID.
            if not data:
                continue

            name = data[0]

            # Only "<4 char PDB code><1 char chain>" IDs can be mapped to a
            # file name; shorter IDs would reuse a character of the PDB code
            # as chain and longer ones have a multi character chain.
            if len(name) != 5:
                continue

            names_filter.append(
                name[:4].upper() + "_dihedrals_chain_" + name[-1].upper() + ".csv"
            )

    return names_filter

In [23]:
# File names of every chain listed in the downloaded chain list.
names_filter = filter_files(data_identity30)

In [26]:
# Length of a reference name; ChainLoader compares the last 26 characters of
# each CSV path (name[-26:]) against these names, so this should be 26.
len(names_filter[0])

26

In [13]:
def encoding_angle(angle, split = 1):
    """Discretise an angle in degrees into an integer in ``[0, 360)``.

    The angle is divided by ``split``, truncated toward zero with ``int`` and
    then wrapped modulo 360, so e.g. ``-70.7`` becomes ``-70`` and then
    ``290``.

    Parameters
    ----------
    angle : float
        Angle in degrees; any finite value is accepted.
    split : int or float, optional
        Divisor applied before truncation (bin width in degrees).

    Returns
    -------
    int
        Value in ``0 .. 359``.
    """
    # NOTE(review): the wrap-around is modulo 360 even when split != 1, while
    # create_dictionary_phi builds 360 / split entries - confirm the intended
    # token range before using a split other than 1.

    # `%` with a positive modulus is never negative in Python, so this also
    # wraps values below -360, which a single `+ 360` would leave negative.
    return int(1.0 * angle / split) % 360


def encoding_omega(omega):
    """Collapse an omega angle in degrees into one of two classes.

    The angle is truncated toward zero with ``int`` and wrapped into
    ``[0, 360)``. Values in ``[90, 270)`` map to ``180``; everything else maps
    to ``0``.

    Parameters
    ----------
    omega : float
        Angle in degrees; any finite value is accepted.

    Returns
    -------
    int
        ``180`` or ``0``.
    """
    # `%` with a positive modulus is never negative in Python, so this also
    # wraps values below -360, which a single `+ 360` would leave negative
    # (and therefore always classified as 0).
    wrapped = int(omega) % 360

    return 180 if 90 <= wrapped < 270 else 0


def create_dictionary_phi(split = 1):
    """Create a ``Dictionary`` holding one word per discretised angle value.

    ``int(360 / split)`` words are added, named ``"0"``, ``"1"``, ... so that,
    for the default ``split``, they are exactly the ``0 .. 359`` values that
    ``encoding_angle`` produces.

    Parameters
    ----------
    split : int or float, optional
        Bin width in degrees; determines how many words are added.

    Returns
    -------
    Dictionary
        The populated dictionary.
    """
    dictionary_phi = Dictionary()

    n_ranges = int(1.0 * 360 / split)

    for idx in range(n_ranges):

        # The word is the bin index itself. Multiplying it by 360 (as this
        # function used to do) yields "0", "360", "720", ... which
        # encoding_angle never emits.
        # NOTE(review): if the words were meant to be bin start angles this
        # should be `idx * split` instead; both agree for the default split.
        dictionary_phi.add_word(str(idx))

    return dictionary_phi


def create_dictionary_ome():
    """Create a ``Dictionary`` with the two omega words, "180" then "0"."""
    vocabulary = Dictionary()

    # Insertion order is kept: "180" is added before "0".
    for token in ("180", "0"):
        vocabulary.add_word(token)

    return vocabulary

In [27]:
import os
from glob import glob

class ChainLoader:
    """Index-based reader for per-chain dihedral CSV files.

    Every ``*.csv`` file directly inside ``root_dir`` is one chain. Files are
    kept in sorted path order. Each file has a header line followed by one row
    per residue with the columns ``RESNAME, RES_N, PHI, PSI, OMEGA``.

    Parameters
    ----------
    root_dir : str
        Directory that contains the chain CSV files.
    sep : str, optional
        Column separator used inside the CSV files.
    fixed : int or None, optional
        Upper bound for ``len(loader)``; ``None`` exposes every file.
    filter_by_names : iterable of str or None, optional
        When given, only files whose path ends with one of these names
        (compared on the last 26 characters) are kept.
    """

    # Length of "<PDB>_dihedrals_chain_<C>.csv", the part of the file name
    # that is compared against `filter_by_names`.
    _REFERENCE_LENGTH = 26

    def __init__(self, root_dir, sep=";", fixed=None, filter_by_names = None):

        self.root_dir = root_dir

        self.sep = sep

        self.fixed = fixed

        candidates = sorted(glob(os.path.join(self.root_dir, '*.csv')))

        if filter_by_names is not None:

            # A set makes each membership test O(1); the filter list can hold
            # thousands of names and is probed once per file.
            allowed = set(filter_by_names)

            self.names = [
                path for path in candidates
                if path[-self._REFERENCE_LENGTH:] in allowed
            ]

        else:

            self.names = candidates

    def __len__(self):
        """Number of chains that can be indexed, capped by ``fixed``."""
        available = len(self.names)

        if self.fixed is None:

            return available

        # Never report more items than there are files (or a negative count),
        # otherwise iterating over range(len(loader)) would fail.
        return max(0, min(self.fixed, available))

    def __getitem__(self, idx):
        """Load the chain stored in the ``idx``-th file (sorted order)."""
        return self.get_with_name(self.names[idx])

    def get_with_name(self, name):
        """Parse one chain CSV file.

        Parameters
        ----------
        name : str
            Path of the CSV file to read.

        Returns
        -------
        tuple
            ``(pri_seq, phi_seq, psi_seq, ome_seq, file_name)``: the upper
            cased residue names, the phi, psi and omega angles as floats (one
            entry per residue each) and the file name without directories.
        """
        pri_seq = []

        phi_seq = []

        psi_seq = []

        ome_seq = []

        with open(name, "r") as pdb:

            # The first line is the column header.
            next(pdb, None)

            for line in pdb:

                # Tolerate blank lines such as a trailing newline.
                if not line.strip():

                    continue

                # CSV STRUCTURE
                # RESNAME, RES_N, PHI, PSI, OMEGA

                residue_values = line.split(self.sep)

                pri_seq.append(residue_values[0].strip().upper())

                phi_seq.append(float(residue_values[2].strip()))

                psi_seq.append(float(residue_values[3].strip()))

                ome_seq.append(float(residue_values[4].strip()))

        return pri_seq, phi_seq, psi_seq, ome_seq, os.path.basename(name)


In [30]:
# Smoke test: build a loader restricted to the filtered chain names and show
# the first chain it yields.
cl = ChainLoader(
    root_dir="/media/panceta_disk/DATA/CHAINS_SPLIT/train/",
    sep=";",
    fixed=10,
    filter_by_names=names_filter,
)

cl[0]

(['P',
  'S',
  'P',
  'R',
  'E',
  'Q',
  'L',
  'M',
  'E',
  'S',
  'I',
  'R',
  'K',
  'G',
  'K',
  'E',
  'L',
  'K',
  'Q',
  'A'],
 [0.0,
  -70.7,
  -54.18,
  -82.22,
  -59.78,
  -80.68,
  -73.9,
  -68.2,
  -71.36,
  -76.03,
  -65.35,
  -69.29,
  -70.41,
  79.52,
  -166.33,
  -76.97,
  -121.54,
  -70.98,
  -136.92,
  -148.58],
 [122.98,
  161.52,
  -27.38,
  -36.56,
  -36.94,
  -19.72,
  -44.44,
  -31.08,
  -30.43,
  -35.11,
  -42.86,
  -29.06,
  -31.72,
  -177.65,
  158.06,
  144.25,
  151.77,
  -172.1,
  160.11,
  0.0],
 [-179.17,
  177.4,
  177.59,
  176.64,
  -179.38,
  173.79,
  179.32,
  177.53,
  177.55,
  176.57,
  177.7,
  175.6,
  -177.32,
  176.54,
  177.89,
  -178.78,
  -168.57,
  -168.13,
  174.06,
  180.0],
 '0020_3MN7_dihedrals_chain_S.csv')

In [31]:
def create_dataset(root_dir, sep=";", fixed=None, output="data", filter_file="data/identity30.txt"):
    """Write the parallel angle / residue text files for one data split.

    Chains are read from ``root_dir`` through ``ChainLoader``, restricted to
    the names returned by ``filter_files(filter_file)``. For every chain one
    line is appended to each output file:

    * ``output + ".ang"``: for each residue the three space separated tokens
      ``encoding_angle(phi) encoding_angle(psi) encoding_omega(omega)``;
    * ``output + ".res"``: the space separated residue names.

    Parameters
    ----------
    root_dir : str
        Directory containing the chain CSV files.
    sep : str, optional
        Column separator of the CSV files.
    fixed : int or None, optional
        Forwarded to ``ChainLoader`` to cap the number of chains.
    output : str, optional
        Path prefix of the two files to write (existing files are overwritten).
    filter_file : str, optional
        Chain list used to select the chains.

    Returns
    -------
    None
    """
    names_filter = filter_files(path_file=filter_file)

    cl = ChainLoader(root_dir=root_dir, sep=sep, fixed=fixed, filter_by_names=names_filter)

    # `with` closes both files even when a chain fails to parse midway.
    with open(output + ".ang", "w") as src_file, open(output + ".res", "w") as tgt_file:

        for idx in range(len(cl)):

            pri_seq, phi_seq, psi_seq, ome_seq, _ = cl[idx]

            src_seq = []

            # Three tokens per residue, in phi, psi, omega order.
            for phi, psi, ome in zip(phi_seq, psi_seq, ome_seq):

                src_seq.append(str(encoding_angle(phi)))

                src_seq.append(str(encoding_angle(psi)))

                src_seq.append(str(encoding_omega(ome)))

            tgt_seq = [str(residue) for residue in pri_seq]

            src_file.write(" ".join(src_seq) + "\n")

            tgt_file.write(" ".join(tgt_seq) + "\n")

    return

In [32]:
# List the working directory (the data folder and preprocess.py are used below).
!ls

Preprocessing.ipynb  checkpoints  download.sh  preprocess.py  tmp
README.md	     data	  generate.py  score.py       train.py
__pycache__	     data-bin	  images       seq2seq


In [33]:
# Training split: writes data/train.ang and data/train.res.
create_dataset(
    root_dir="/media/panceta_disk/DATA/CHAINS_SPLIT/train/",
    sep=";",
    fixed=None,
    output="data/train",
)

In [34]:
# Validation split: writes data/eval.ang and data/eval.res.
create_dataset(
    root_dir="/media/panceta_disk/DATA/CHAINS_SPLIT/dev/",
    sep=";",
    fixed=None,
    output="data/eval",
)

In [35]:
# Test split: writes data/test.ang and data/test.res.
create_dataset(
    root_dir="/media/panceta_disk/DATA/CHAINS_SPLIT/test/",
    sep=";",
    fixed=None,
    output="data/test",
)

In [36]:
# Folder holding the train/eval/test .ang and .res files written above.
# NOTE: the value already ends with "/", and the command below adds another
# one, so the prefixes expand to "data//train" etc. (visible in the log).
DATA_PATH = "data/"

# Run the project's preprocess.py on the three splits, with "ang" as source
# and "res" as target language; results go to data-bin/tokenized.ang-res.
!python preprocess.py --source-lang ang --target-lang res --train-prefix $DATA_PATH/train --valid-prefix $DATA_PATH/eval --test-prefix $DATA_PATH/test --dest-dir data-bin/tokenized.ang-res

[2018-10-18 12:34:34] COMMAND: preprocess.py --source-lang ang --target-lang res --train-prefix data//train --valid-prefix data//eval --test-prefix data//test --dest-dir data-bin/tokenized.ang-res
[2018-10-18 12:34:34] Arguments: {'source_lang': 'ang', 'target_lang': 'res', 'train_prefix': 'data//train', 'valid_prefix': 'data//eval', 'test_prefix': 'data//test', 'dest_dir': 'data-bin/tokenized.ang-res', 'threshold_src': 0, 'num_words_src': -1, 'threshold_tgt': 0, 'num_words_tgt': -1}
[2018-10-18 12:34:34] COMMAND: preprocess.py --source-lang ang --target-lang res --train-prefix data//train --valid-prefix data//eval --test-prefix data//test --dest-dir data-bin/tokenized.ang-res
[2018-10-18 12:34:34] Arguments: {'source_lang': 'ang', 'target_lang': 'res', 'train_prefix': 'data//train', 'valid_prefix': 'data//eval', 'test_prefix': 'data//test', 'dest_dir': 'data-bin/tokenized.ang-res', 'threshold_src': 0, 'num_words_src': -1, 'threshold_tgt': 0, 'num_words_tgt': -1}
[2018-10-18 12:34:39] 