In [1]:
# Fetch the ProteinClassify repository; later cells read the SCOP text files
# (scop-cla-latest.txt, scop-represented-structures-latest.txt) from its directory.
# NOTE(review): those cells use /content/ProteinClassify, so this presumably runs with /content as the working directory - confirm.
!git clone https://github.com/radhikasethi2011/ProteinClassify.git

Cloning into 'ProteinClassify'...
remote: Enumerating objects: 27, done.[K
remote: Counting objects: 100% (27/27), done.[K
remote: Compressing objects: 100% (24/24), done.[K
remote: Total 27 (delta 6), reused 0 (delta 0), pack-reused 0[K
Unpacking objects: 100% (27/27), done.


In [2]:
import numpy as np
import pandas as pd
import os
from statistics import mode
import glob as glob

In [3]:
# Read the SCOP classification file: the first 7 lines are skipped and every
# remaining line is split on single spaces into positionally numbered columns.
scop_cla = pd.read_csv('/content/ProteinClassify/scop-cla-latest.txt',
                       header=None, skiprows=7, sep=' ')

# Positions of the columns that are thrown away; the columns that survive are
# renamed (by position) to residue / chain / label.
cols = [0,3,4,5,6,7,8,9]
scop_cla = scop_cla.drop(columns=scop_cla.columns[cols])
scop_cla = scop_cla.rename(columns={
    scop_cla.columns[0]: "residue",
    scop_cla.columns[1]: "chain",
    scop_cla.columns[2]: "label",
})

# chain   -> text before the first ':'
# label   -> value after '=' in the second comma-separated field
# residue -> original id joined to the (already shortened) chain with '_'
scop_cla = scop_cla.assign(
    chain=lambda frame: frame['chain'].str.split(':').str[0],
    label=lambda frame: frame['label'].str.split(',').str[1].str.split('=').str[1],
    residue=lambda frame: frame['residue'] + '_' + frame['chain'],
)
scop_cla

Unnamed: 0,residue,chain,label
0,6J56_A,A,1000003
1,3FKQ_A,A,1000002
2,1XHF_A,A,1000002
3,1Y7P_B,B,1000002
4,1JBE_A,A,1000002
...,...,...,...
36846,1SCJ_B,B,1000003
36847,1T1E_A,A,1000003
36848,1JQG_A,A,1000003
36849,1KN6_A,A,1000003


In [4]:
# Read the represented-structures file (first 6 lines skipped, space separated)
# with explicit column names, then turn pdbid into "<pdbid>_<pdbchain>" so it
# has the same form as scop_cla['residue'].
cols = ['domid','pdbid', 'pdbchain']
scop_struct = pd.read_csv(
    '/content/ProteinClassify/scop-represented-structures-latest.txt',
    header=None, skiprows=6, names=cols, sep=' ',
)
scop_struct = scop_struct.assign(
    pdbid=lambda frame: frame['pdbid'] + '_' + frame['pdbchain']
)
scop_struct

Unnamed: 0,domid,pdbid,pdbchain
0,8000061,2DT5_B,B
1,8000376,2FR1_A,A
2,8000376,6W7S_A,A
3,8000376,6WH9_A,A
4,8000376,6WH9_D,D
...,...,...,...
858211,8107712,7T7E_A,A
858212,8107712,7T7F_A,A
858213,8107712,7T7G_A,A
858214,8107715,7R1K_A,A


In [6]:
# Spot check: label recorded for the id '5FLV_E'.
mask = scop_cla['residue'].eq('5FLV_E')
scop_cla.loc[mask, 'label']

31019    1000001
Name: label, dtype: object

In [7]:
# Ids ("<pdbid>_<chain>") that occur in both tables.
ans = set(scop_cla['residue']) & set(scop_struct['pdbid'])
len(ans)

220

In [8]:
# Join the two tables on the "<pdbid>_<chain>" id, keep the first row per id,
# discard the structure-side id columns and reduce 'residue' to the part before
# the '_' (the bare PDB id).
merged = (
    scop_struct
    .merge(scop_cla, left_on='pdbid', right_on='residue')
    .drop_duplicates(subset=['pdbid'], keep='first')
    .drop(columns=['pdbid', 'pdbchain'])
    .assign(residue=lambda frame: frame['residue'].str.split('_').str[0])
    .reset_index(drop=True)
)

# The CSV is written while 'residue' is still an ordinary column ...
merged.to_csv('pdb_chain_class.csv', sep=',')
# ... and only afterwards does 'residue' become the index used by later cells.
merged = merged.set_index(['residue'])

merged

Unnamed: 0_level_0,domid,chain,label
residue,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2O61,8003180,A,1000003
2J9U,8003499,B,1000004
4KSS,8018133,F,1000001
3GD7,8018340,A,1000002
4JKV,8018754,B,1000000
...,...,...,...
2JSS,8103852,A,1000000
6WVB,8106213,A,1000003
7PAB,8106311,A,1000001
7PAB,8106313,C,1000003


In [9]:
# Spot check: labels of every row indexed by the PDB id '5FLV'.
mask = merged.index == '5FLV'
merged.loc[mask, 'label']

residue
5FLV    1000000
5FLV    1000001
Name: label, dtype: object

In [10]:
# Chains of the rows selected by the mask built in the previous cell.
merged.loc[mask, 'chain']

residue
5FLV    M
5FLV    E
Name: chain, dtype: object

In [11]:
# Create the working directories used by the cells below (downloaded PDB files,
# per-structure CA CSV files, TFRecord output). exist_ok=True makes the cell
# safe to re-run, unlike a plain mkdir which errors once the directory exists.
for work_dir in ('/content/pdbs', '/content/ca_csv', '/content/records'):
  os.makedirs(work_dir, exist_ok=True)

In [12]:
def is_atom_record(record):
  """Return True when the line starts with the record name 'ATOM'."""
  return record.startswith('ATOM')

def is_intended_chain(record, chain):
  """Return True when the character at index 21 of the line equals `chain`.

  Raises IndexError for lines shorter than 22 characters.
  """
  return record[21] == chain

def is_chain_ter_record(record, chain):
  """Return True when the line is a 'TER' record for the given chain.

  A line qualifies when it starts with 'TER' and the character at index 21
  equals `chain`.

  The record name is tested first and the length is checked before indexing,
  so short lines (for example 'END' or a blank line) simply yield False
  instead of raising IndexError. parse_pdb calls this on every line of the
  file, not only on ATOM/TER lines, so such lines do reach this function.
  """
  return record.startswith("TER") and len(record) > 21 and record[21] == chain

def is_alt_record(record):
  """Return True when the character at index 16 of the line is not a space.

  Raises IndexError for lines shorter than 17 characters.
  """
  return record[16] != " "
  
def is_ca_atom(record):
  """Return True when the atom-name field of the line is exactly 'CA'.

  The atom name is read from record[12:16], the same four-character slice
  that parse_atom_records uses. Reading only record[12:15] would cut off the
  last character of the field, so a four-character name such as ' CAA' would
  be mistaken for 'CA'.

  NOTE(review): surrounding blanks are stripped, so a name written as 'CA  '
  also matches; presumably such atoms never reach this check because only
  ATOM lines are tested - confirm against the caller.
  """
  return record[12:16].strip() == "CA"

def parse_atom_records(record):
  """Cut the fixed-width fields out of one line and return them as strings.

  Returns the tuple (atom, residue, chain, seq_pos, x, y, z), taken from the
  index ranges 12:16, 17:20, 21, 22:26, 30:38, 38:46 and 46:54 respectively,
  each with surrounding whitespace removed. No numeric conversion is done.
  """
  def field(start, stop):
    return record[start:stop].strip()

  return (
      field(12, 16),       # atom
      field(17, 20),       # residue
      record[21].strip(),  # chain (single character; IndexError on short lines)
      field(22, 26),       # seq_pos
      field(30, 38),       # x
      field(38, 46),       # y
      field(46, 54),       # z
  )

def parse_pdb(contents, chainl): #chain: list
  """Collect the CA atom fields of the requested chains.

  contents: list of lines of a PDB file.
  chainl:   list of chain ids; the whole file is scanned once per id, in order.

  For each chain the scan stops at the first line starting with 'ENDMDL' or at
  that chain's TER record. Only ATOM lines of the chain that are not flagged
  by is_alt_record and whose atom is CA are kept.

  Returns a dict mapping a running index (0, 1, 2, ... across all chains) to
  the tuple produced by parse_atom_records.
  """
  ca_records = {}
  for wanted_chain in chainl:
    for line in contents:
      if line.startswith("ENDMDL") or is_chain_ter_record(line, wanted_chain):
        break

      wanted = (is_atom_record(line)
                and is_intended_chain(line, wanted_chain)
                and not is_alt_record(line)
                and is_ca_atom(line))
      if wanted:
        # Keys are consecutive, so the next free key is the current size.
        ca_records[len(ca_records)] = parse_atom_records(line)
  return ca_records

def dump_records_to_csv_file(file_name, record_dict):
  """Write the records as comma-separated lines to `file_name`.

  file_name:   path of the file to create (overwritten if it exists).
  record_dict: dict whose values are 7-tuples
               (atom, residue, chain, pos, x, y, z); one line is written per
               value, in the dict's iteration order, with no header and no
               quoting.

  The file is opened in a `with` block so it is flushed and closed before the
  function returns; callers read the file back right afterwards.
  """
  with open(file_name, 'w') as file:
    for (atom, residue, chain, pos, x, y, z) in record_dict.values():
      file.write("%s,%s,%s,%s,%s,%s,%s\n"%(atom, residue, chain, pos, x, y, z))


In [13]:
import tensorflow as tf 

# The following functions can be used to convert a value to a type compatible
# with tf.train.Example.

def _bytes_feature(value):
  """Returns a bytes_list from a string / byte."""
  eager_tensor_type = type(tf.constant(0))
  # BytesList won't unpack a string from an EagerTensor, so pull the raw value out first.
  raw_value = value.numpy() if isinstance(value, eager_tensor_type) else value
  bytes_list = tf.train.BytesList(value=[raw_value])
  return tf.train.Feature(bytes_list=bytes_list)

def _float_feature(value):
  """Returns a float_list from a float / double."""
  float_list = tf.train.FloatList(value=[value])
  return tf.train.Feature(float_list=float_list)

def _int64_feature(value):
  """Returns an int64_list from a bool / enum / int / uint."""
  int64_list = tf.train.Int64List(value=[value])
  return tf.train.Feature(int64_list=int64_list)


In [14]:
def serialize_example(feature0, feature1, feature3,
                      feature4, feature5, feature6):
  """
  Build a tf.train.Example from the six values and return it serialized.

  feature0 -> 'label'   (bytes feature)
  feature1 -> 'residue' (bytes feature)
  feature3 -> 'pos'     (int64 feature)
  feature4 -> 'x', feature5 -> 'y', feature6 -> 'z' (float features)

  Returns the serialized message produced by SerializeToString().
  """
  # Feature name -> tf.train.Example-compatible wrapper, filled in one by one.
  feature = {}
  feature['label'] = _bytes_feature(feature0)
  feature['residue'] = _bytes_feature(feature1)
  feature['pos'] = _int64_feature(feature3)
  feature['x'] = _float_feature(feature4)
  feature['y'] = _float_feature(feature5)
  feature['z'] = _float_feature(feature6)

  features_message = tf.train.Features(feature=feature)
  example_proto = tf.train.Example(features=features_message)
  return example_proto.SerializeToString()




In [15]:
#filename = 'test.tfrecord'
def write_to_testrecord(filename, df, label):
  """Write one serialized tf.train.Example per row of `df` to a TFRecord file.

  filename: path of the TFRecord file to create.
  df:       DataFrame providing the columns 'atom', 'pos', 'x', 'y', 'z'.
            The 'atom' column is stored under the feature name 'residue'
            (that is how serialize_example names its second argument).
  label:    str, UTF-8 encoded and stored as the 'label' feature of every row.

  Rows are iterated by position, so the frame does not need a 0..n-1 index.
  Each example is written as serialized; it is not parsed back first, since
  the parsed message was never used.
  """
  label_bytes = bytes(label, 'utf-8')
  with tf.io.TFRecordWriter(filename) as writer:
    rows = zip(df['atom'], df['pos'], df['x'], df['y'], df['z'])
    for atom_value, pos, x, y, z in rows:
      serialized_example = serialize_example(label_bytes,
                                             bytes(atom_value, 'utf-8'),
                                             pos, x, y, z)
      writer.write(serialized_example)

In [16]:
def edit_pos(csv_name, csv_dir='/content/ca_csv'):
  """Load a CA CSV file and shift its coordinates so the first row is the origin.

  csv_name: file name without the '.csv' extension.
  csv_dir:  directory containing the file (defaults to the notebook's
            /content/ca_csv working directory).

  Returns a DataFrame with the columns ca, atom, residue, pos, x, y, z in which
  x, y and z have the first row's x, y and z subtracted. A file without any
  rows yields an empty frame with those columns instead of raising.

  Column naming: dump_records_to_csv_file writes the fields in the order
  atom, residue, chain, pos, x, y, z, so here 'ca' holds the atom name, 'atom'
  the residue name and 'residue' the chain id. The names are kept unchanged
  because write_to_testrecord reads df['atom'].
  """
  col_names = ['ca','atom','residue','pos','x','y','z']
  csv_path = f'{csv_dir}/{csv_name}.csv'
  try:
    df = pd.read_csv(csv_path, names=col_names, sep=',')
  except pd.errors.EmptyDataError:
    # Zero-byte file: nothing was extracted for this structure.
    return pd.DataFrame(columns=col_names)
  if df.empty:
    # No first row to use as origin.
    return df

  for axis in ('x', 'y', 'z'):
    df[axis] = df[axis] - df[axis].iloc[0]
  return df




In [18]:
pd_list = list(merged.index)
for i in range(len(pd_list)):
  pd_id = pd_list[i]
  #print(i, pd_id)
  !wget -q https://files.rcsb.org/download/{pd_id}.pdb -O /content/pdbs/{pd_id}.pdb
  size = os.path.getsize(f"/content/pdbs/{pd_id}.pdb")
  if size>0:
    chainl = list(merged['chain'][pd_id])
    print("i: ", i, pd_id, chainl)
    with open(f'/content/pdbs/{pd_id}.pdb') as f:
      contents = f.readlines() 
    ca_records = parse_pdb(contents, chainl) 
    dump_records_to_csv_file(f'/content/ca_csv/{pd_id}.csv', ca_records)
    df = edit_pos(pd_id)
    label = merged['label'][pd_id]
    if(type(label) is str):
      filename = '/content/records/' + f'{pd_id}' + '_' + f'{label}' + '.tfrecord'
      write_to_testrecord(filename, df, label)
    else:
      for l in label: 
        filename = '/content/records/' + f'{pd_id}' + '_' + f'{l}' + '.tfrecord'
        write_to_testrecord(filename, df, l)
  else: print("! ! ! ! file not found at rcsb ! ! ! !")


  


i:  0 2O61 ['A']
i:  1 2J9U ['B']
i:  2 4KSS ['F']
i:  3 3GD7 ['A']
i:  4 4JKV ['B']
i:  5 4N6H ['A']
i:  6 4NTJ ['A']
i:  7 4OR2 ['A']
i:  8 4YAY ['A']
i:  9 4Z34 ['A']
i:  10 5NDD ['A']
i:  11 5UEN ['A']
i:  12 5UNF ['B']
i:  13 5WIV ['A']
i:  14 6A93 ['A']
i:  15 6BQG ['A']
i:  16 6D32 ['A']
i:  17 6DRZ ['A']
i:  18 6IIU ['A']
i:  19 6IQL ['A']
i:  20 6KO5 ['A']
i:  21 6LW5 ['A']
i:  22 6ME6 ['A']
i:  23 6RZ4 ['A']
i:  24 6RZ6 ['A']
i:  25 2B5U ['A']
i:  26 1DF0 ['A']
i:  27 1QXP ['A']
i:  28 1IVS ['A']
i:  29 1QPZ ['A']
i:  30 1DD3 ['A']
i:  31 2LFR ['A']
i:  32 3PBL ['A']
i:  33 3RZE ['A']
i:  34 3V2Y ['A']
i:  35 3VW7 ['A']
i:  36 4DJH ['A']
i:  37 4EJ4 ['A']
i:  38 4N9N ['A']
i:  39 4RWS ['A']
i:  40 4U14 ['A']
i:  41 5B2G ['A']
i:  42 5CXV ['A']
i:  43 5DSG ['A']
i:  44 5T04 ['A']
i:  45 5TZR ['A']
i:  46 5W0P ['A']
i:  47 5X93 ['A']
i:  48 5XSZ ['A']
i:  49 5YQZ ['R']
i:  50 5ZBQ ['A']
i:  51 6CM4 ['A']
i:  52 6D26 ['A']
i:  53 6D9M ['A']
i:  54 1H6W ['A']
i:  55 1X9Y ['A']
i:

In [None]:
# Print the first 10 examples of one TFRecord file as a sanity check.
# `filename` is whatever the conversion loop assigned last.
filenames = [filename]
raw_dataset = tf.data.TFRecordDataset(filenames)

for raw_record in raw_dataset.take(10):
  example = tf.train.Example.FromString(raw_record.numpy())
  print(example)


In [19]:
# Pack the generated /content/records directory into a gzip-compressed archive
# (records.tar.gz in the current working directory), listing each file as it is added.
!tar -zcvf records.tar.gz /content/records 

tar: Removing leading `/' from member names
/content/records/
/content/records/1S16_1000003.tfrecord
/content/records/5DSG_1000000.tfrecord
/content/records/6SLO_1000002.tfrecord
/content/records/2LFR_1000000.tfrecord
/content/records/5UFU_1000003.tfrecord
/content/records/5ZRY_1000000.tfrecord
/content/records/3UIT_1000000.tfrecord
/content/records/6WVB_1000003.tfrecord
/content/records/4P0K_1000001.tfrecord
/content/records/1PZQ_1000000.tfrecord
/content/records/1SOX_1000003.tfrecord
/content/records/1SJ8_1000000.tfrecord
/content/records/1G1S_1000003.tfrecord
/content/records/2H8W_1000000.tfrecord
/content/records/6D32_1000000.tfrecord
/content/records/5EHM_1000002.tfrecord
/content/records/7PAB_1000001.tfrecord
/content/records/1TPG_1000004.tfrecord
/content/records/2GJ6_1000001.tfrecord
/content/records/4BWC_1000003.tfrecord
/content/records/1UDZ_1000001.tfrecord
/content/records/4PKG_1000004.tfrecord
/content/records/4PKI_1000002.tfrecord
/content/records/6ME6_1000000.tfrecord
/c