In [None]:
import requests
import json
import numpy as np
import pandas as pd
import os
from collections import Counter
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

# Fixed-width column layouts for the PDB record types handled in this notebook.
# For every record type:
#   'spacing' - exclusive end offset of each field within the line; field i is
#               sliced as line[spacing[i - 1]:spacing[i]] (field 0 starts at 0),
#               which is how parse_pdb_data cuts up a record.
#   'label'   - name under which the field at the same position is stored.
# The two lists must be the same length; parse_pdb_data raises otherwise.
format_spacing = {
    'ATOM' : {
        'spacing': [6, 11, 16, 17, 20, 22, 26, 27, 38, 46, 54, 60, 66, 78, 80],
        'label': ['record_name', 'serial_number',
                  'atom_name', 'alt_loc', 'res_name',
                  'chain_id', 'res_seq', 'iCode',
                  'x', 'y', 'z', 'occupancy', 'tempFactor',
                  'element', 'charge']
    },
    'HELIX' : {
        'spacing': [6, 10, 14, 18, 20, 25, 26, 30, 32, 37, 38, 40, 70, 76],
        'label': ['record_name', 'serial_number', 'helix_id', 'init_res_name',
                  'init_chain_id', 'init_seq_num', 'init_iCode', 'end_res_name',
                  'end_chain_id', 'end_seq_num', 'end_iCode', 'helix_class',
                  'comment', 'length']
    },
    'SHEET' : {
        'spacing': [6, 10, 14, 16, 20, 22, 26, 27, 31, 33, 37, 38, 40, 45, 48, 50, 54, 55, 60, 63, 65, 69, 70],
        'label': ['record_name', 'strand', 'sheet_id', 'num_strands', 'init_res_name',
                  'init_chain_id', 'init_seq_num', 'init_iCode', 'end_res_name',
                  'end_chain_id', 'end_seq_num', 'end_iCode', 'sense', 'cur_atom', 'cur_res_name',
                  'cur_chain_id', 'cur_res_seq', 'cur_iCode', 'prev_atom', 'prev_res_name',
                  # NOTE(review): 'rev_iCode' looks like a typo for 'prev_iCode' (its
                  # neighbours are all 'prev_*') - confirm nothing reads this key before renaming.
                  'prev_chain_id', 'prev_res_seq', 'rev_iCode']
    }
}

# Atom names N, CA, C, O - presumably the protein backbone atoms (TODO confirm);
# no code visible in this section reads this list.
atom_backchain = ['N', 'CA', 'C', 'O']


def parse_list_pdb(file_name: str):
    """Load the space-delimited PDB chain list into a DataFrame.

    The file is opened here as plain text and pandas is given the open handle
    rather than the path. Runs of spaces after a delimiter are skipped, so
    column-aligned files parse cleanly. The table is echoed to stdout before
    it is returned.

    Args:
        file_name: path of the chain-list file to read.

    Returns:
        The parsed table as a pandas DataFrame.
    """
    with open(file_name, 'r') as handle:
        chain_table = pd.read_csv(handle, sep=' ', skipinitialspace=True)
    print(chain_table)
    return chain_table

def parse_pdb_data(str_pdb: str, keyword: str, protein_name: str):
    """Slice one fixed-width PDB line into a dict of named, stripped fields.

    The line is only parsed when `keyword` has a layout in `format_spacing`
    and the text before the line's first space equals `keyword`.

    Args:
        str_pdb: one raw line from a PDB file.
        keyword: record type to look for (a key of `format_spacing`).
        protein_name: value stored under the extra 'protein_name' key.

    Returns:
        A dict mapping each layout label to its stripped field text, plus
        'protein_name'; None when the line is not a `keyword` record or the
        keyword has no layout.

    Raises:
        Exception: if the layout's label and spacing lists differ in length.
    """
    if keyword not in format_spacing:
        return None
    if str_pdb.partition(' ')[0] != keyword:
        return None

    layout = format_spacing[keyword]
    field_names = layout['label']
    field_ends = layout['spacing']
    if len(field_names) != len(field_ends):
        raise Exception('length of label and spacing for {} not matching'.format(keyword))

    # Each field starts where the previous one ended; the first starts at 0.
    field_starts = [0] + list(field_ends[:-1])
    record = {
        name: str_pdb[start:end].strip()
        for name, start, end in zip(field_names, field_starts, field_ends)
    }
    record['protein_name'] = protein_name
    return record

def _download_pdb(protein_name, pdb_path, timeout=30):
    """Fetch one PDB file from RCSB and store it at `pdb_path`.

    The response is written to a temporary '.part' file and renamed only after
    the write completes, so an interrupted or failed download never leaves a
    file that a later run would mistake for a valid cached copy.

    Returns:
        True when the file was saved; False when the server answered with an
        error status (nothing is written in that case). Connection errors and
        timeouts propagate as requests exceptions.
    """
    url = 'https://files.rcsb.org/view/{}.pdb'.format(protein_name)
    adapter = HTTPAdapter(max_retries=Retry(connect=3, backoff_factor=0.5))
    with requests.Session() as session:
        session.mount('http://', adapter)
        session.mount('https://', adapter)
        response = session.get(url, timeout=timeout)

    if not response.ok:
        print('Skipping {}: download failed with HTTP status {}'.format(
            protein_name, response.status_code))
        return False

    os.makedirs(os.path.dirname(pdb_path) or '.', exist_ok=True)
    partial_path = pdb_path + '.part'
    with open(partial_path, 'wb') as f:
        f.write(response.content)
    os.replace(partial_path, pdb_path)
    return True


def process_pdb(pdb_name, len_atom, df_helix,
                export_path=r'C:\Users\Juanb\Documents\bioinformatics_group_project_1\df_export.csv'):
    """Append the HELIX records of one chain to `df_helix` and export the table.

    The PDB file is read from `pdb_dir` (a module-level name), downloading it
    first when it is not cached there. Only HELIX records whose initial or end
    chain id equals the requested chain are kept.

    Args:
        pdb_name: PDB id immediately followed by a one-character chain id;
            the last character is taken as the chain, the rest as the id.
        len_atom: unused; kept so existing callers keep working.
        df_helix: DataFrame of helix rows collected so far.
        export_path: CSV file the accumulated table is written to after every
            call; pass None to skip the export.
            NOTE(review): the default is a machine-specific absolute path kept
            only for backward compatibility - callers should pass their own.

    Returns:
        A DataFrame holding the previous rows followed by this chain's rows
        (`df_helix` itself when nothing was added).
    """
    protein_name = pdb_name[:-1]
    protein_chain = pdb_name[-1]
    pdb_path = os.path.join(pdb_dir, '{}.pdb'.format(protein_name))

    have_file = os.path.exists(pdb_path)
    if not have_file:
        print('Beginning {} pdb file download with requests'.format(pdb_name))
        have_file = _download_pdb(protein_name, pdb_path)

    helix_rows = []
    if have_file:
        with open(pdb_path, 'r') as f:
            # Only HELIX records are used, so ATOM/SHEET parsing is skipped.
            for line in f:
                parsed_helix = parse_pdb_data(line, 'HELIX', protein_name)
                if not parsed_helix:
                    continue
                chain_ids = (parsed_helix['init_chain_id'], parsed_helix['end_chain_id'])
                if protein_chain in chain_ids:
                    helix_rows.append(parsed_helix)

    if helix_rows:
        # Build the new rows once and concatenate once: DataFrame.append copied
        # the whole frame per row and no longer exists in pandas >= 2.0.
        new_rows = pd.DataFrame(helix_rows)
        # Skipping empty frames avoids pandas' dtype warning for empty inputs.
        frames = [frame for frame in (df_helix, new_rows) if not frame.empty]
        df_helix = pd.concat(frames, ignore_index=True)

    if export_path is not None:
        df_helix.to_csv(export_path, index=None, header=True)

    return df_helix


def pdb_parser(index_to_break, df_helix):
    """Run process_pdb over the chain list and print start/end residue tallies.

    Rows come from the module-level `df_pdb_list`; each row's 'IDs' and
    'length' values are passed to process_pdb.

    Args:
        index_to_break: row label at which iteration stops (that row is not
            processed). With -1 the "parsing all" message is printed and the
            loop runs to the end as long as no row is labelled -1 - assumes
            the list keeps its default 0..n-1 index.
        df_helix: DataFrame of helix rows to extend.

    Returns:
        The DataFrame returned by the last process_pdb call, or `df_helix`
        unchanged when no row was processed.
    """
    if index_to_break == -1:
        print('parsing all pdb in {}'.format(filename_list_pdb))
    else:
        print('parsing first {} proteins in {}'.format(index_to_break, filename_list_pdb))

    for index, pdb in df_pdb_list.iterrows():
        if index == index_to_break:
            break
        df_helix = process_pdb(pdb['IDs'], pdb['length'], df_helix)

    # .get() falls back to an empty list so an empty result (no helix rows
    # collected, hence no such columns) prints empty counters instead of
    # raising AttributeError.
    print("\nInitial:", Counter(df_helix.get('init_res_name', [])))
    print("\nEnd", Counter(df_helix.get('end_res_name', [])))
    return df_helix

# Run configuration: the chain-list file and the directory used for PDB files.
pdb_dir = './pdb_data'
filename_list_pdb = 'cullpdb_pc30_res3.0_R1.0_d191017_chains18877.gz'

# One accumulator per record type; only df_helix is filled by this cell.
df_atom, df_sheet, df_helix = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

df_pdb_list = parse_list_pdb(filename_list_pdb)

# -1 selects the "parse everything" mode of pdb_parser.
df_helix = pdb_parser(index_to_break=-1, df_helix=df_helix)
print(df_helix)

In [19]:
# DataFrame.info() writes its summary to stdout itself and returns None, so
# wrapping it in print() only adds a stray "None" line after the summary.
df_helix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 178260 entries, 0 to 178259
Data columns (total 15 columns):
comment          40 non-null object
end_chain_id     178260 non-null object
end_iCode        14 non-null object
end_res_name     178260 non-null object
end_seq_num      178260 non-null int64
helix_class      178260 non-null int64
helix_id         178243 non-null object
init_chain_id    178260 non-null object
init_iCode       18 non-null object
init_res_name    178260 non-null object
init_seq_num     178260 non-null int64
length           178260 non-null int64
protein_name     178260 non-null object
record_name      178260 non-null object
serial_number    178260 non-null int64
dtypes: int64(5), object(10)
memory usage: 20.4+ MB
None


1. Find the popularity of different kinds of helices: read the data set and analyze it to find the
popularity of each type of helix.

In [21]:
# Tally how many helix records carry each helix_class code.
helix_class_counts = Counter(df_helix.helix_class)
print("Helix Classes:", helix_class_counts)

Helix Classes: Counter({1: 131830, 5: 46408, 0: 17, 10: 4, 7: 1})


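Printed above are the helix classes, each shown with the number of times it appears in our helices dataframe. Here, we see that right-handed alpha helices (class 1) are by far the most popular with 131,830 records, followed by right-handed 3-10 helices (class 5) with 46,408. The remaining classes are rare: 17 records carry class 0, which is not one of the classes defined by the PDB format, 4 are Polyproline helices (class 10), and lastly, 1 is a left-handed Omega helix (class 7).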

2. Analyze the amino acids that prefer to form/be part of helices. Does an AA prefer to
form/be part of a specific type of helix?

In [24]:
# Reload the exported helix table, then tally the first and the last residue
# name of every helix record.
df_helix = pd.read_csv("df_export.csv", low_memory=False)
for heading, column in (("Initial:", "init_res_name"), ("\nEnd", "end_res_name")):
    print(heading, Counter(df_helix[column]))

Initial: Counter({'ASP': 24552, 'SER': 23523, 'THR': 16535, 'ASN': 15336, 'GLY': 14993, 'PRO': 14450, 'LEU': 8328, 'GLU': 8144, 'ALA': 7717, 'LYS': 6938, 'ARG': 6333, 'HIS': 4894, 'GLN': 4598, 'VAL': 4282, 'PHE': 4150, 'TYR': 4105, 'ILE': 3534, 'CYS': 2082, 'MET': 1668, 'TRP': 1563, 'MSE': 466, 'MLY': 12, 'OCS': 7, 'UNK': 7, 'FME': 5, 'SEP': 4, 'LLP': 4, 'CSO': 4, 'CME': 3, 'CYG': 2, 'SEB': 2, 'CSX': 1, 'PTR': 1, 'MIR': 1, 'SEC': 1, 'AIB': 1, 'GM8': 1, 'CAS': 1, '2ZC': 1, 'DDZ': 1, 'SMC': 1, 'ALO': 1, 'OAS': 1, 'HYP': 1, 'KCX': 1, 'OSE': 1, 'LYR': 1, 'CSD': 1, 'CXM': 1, 'PCA': 1})

End Counter({'GLY': 23286, 'LEU': 17926, 'ALA': 15554, 'SER': 12323, 'ASN': 11344, 'LYS': 11096, 'GLU': 10074, 'ARG': 9940, 'ASP': 8678, 'GLN': 7981, 'PHE': 7904, 'VAL': 7246, 'THR': 7017, 'ILE': 6740, 'TYR': 6374, 'HIS': 5503, 'MET': 3380, 'CYS': 2707, 'TRP': 1922, 'MSE': 965, 'PRO': 239, 'MLY': 25, 'UNK': 9, 'CSO': 5, 'MLZ': 4, 'MHS': 3, 'CGU': 2, 'SEC': 1, 'DBZ': 1, 'CSX': 1, 'XPC': 1, 'CSD': 1, 'OCS': 1,

Here, we see the most popular residues at the two ends of the helices in the data. Note that these counters only tally the residue that starts each helix (Initial) and the residue that ends it (End), not every residue inside the helix. With a little research, we see that this counter data (printed above) agrees fairly well with what is reported in the research literature, especially the End counter. Residues like Leucine, Alanine, Lysine, Glutamate, and Asparagine are found in the top half of the most popular residues in the helices dataframe.

3. Analyze the length of helices (in terms of number of AAs). Is there any relation between
the type of a helix and its length?

In [30]:
# Keep one row (the last one seen) for every distinct (helix_class, length) pair.
unique_class_length = df_helix.drop_duplicates(subset=["helix_class", "length"], keep="last")
unique_class_length

Unnamed: 0,comment,end_chain_id,end_iCode,end_res_name,end_seq_num,helix_class,helix_id,init_chain_id,init_iCode,init_res_name,init_seq_num,length,protein_name,record_name,serial_number
16,,A,,ARG,197,5,6,A,,ARG,143,55,16VP,HELIX,6
397,,A,,THR,477,5,3,A,,ASP,454,24,1AZS,HELIX,3
852,,A,,PHE,63,7,3,A,,LYS,57,7,1BUO,HELIX,3
1200,POLYPROLINE II,A,,SER,705,10,H1,A,,SER,697,9,1CFB,HELIX,1
1238,,A,,VAL,253,10,L2,A,,GLY,250,4,1CHD,HELIX,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
178253,,A,,GLY,129,1,H04,A,,ALA,109,21,8ABP,HELIX,4
178255,,A,,GLN,192,1,H06,A,,ASN,177,16,8ABP,HELIX,6
178256,,A,,GLY,218,1,H07,A,,ASP,206,13,8ABP,HELIX,7
178258,,A,,LYS,273,1,H09,A,,ASP,257,17,8ABP,HELIX,9


Here, we see samples from all types of helices in this dataset: 1, 5, 7, and 10. Because duplicates are dropped on the (helix class, length) pair, each row shows a length that occurs for a class, not how often that length occurs. We see a very slight correlation here. Alpha helices (class 1) usually seem to have lengths between 10 and 30, a fairly narrow range, while 3-10 helices (class 5) seem to overshoot that range with their own range of lengths, 15 to 60, which is a little more widespread. As for the Polyproline (class 10) and Omega (class 7) helices, their lengths seem very small compared to the previous two types. We see lengths of less than 10 residues here, but these types of helices can range from the lengths seen here up to around 15.