In [1]:
import os, sys, subprocess, re, requests, json, warnings
import numpy as np
import pandas as pd
from urllib.request import urlopen
from time import sleep

from Bio import SeqIO
from Bio.PDB.DSSP import DSSP
from Bio.PDB.DSSP import PDBParser

warnings.filterwarnings('ignore')

In [10]:
import biotite.application.dssp as dssp

In [6]:
def get_url(url, **kwargs):
    '''
    Obtain a response from a given url.

    Parameters
    ----------
    url : str
        Address passed to ``requests.get``.
    **kwargs
        Forwarded unchanged to ``requests.get``.

    Returns
    -------
    requests.Response
        The response; it is only returned when ``response.ok`` is true.

    Raises
    ------
    requests.exceptions.HTTPError
        Raised by ``raise_for_status()`` when the response is not ok.
        The response body is printed before the exception propagates.
    '''
    response = requests.get(url, **kwargs)

    if not response.ok:
        # show the server's message, then let requests raise the HTTP error;
        # nothing placed after raise_for_status() in this branch can run
        print(response.text)
        response.raise_for_status()

    return response

In [7]:
# UniProt REST API base URL (uniprotkb endpoint); request paths are appended to it
WEBSITE_API = "https://rest.uniprot.org/uniprotkb/"

In [20]:
def find_SS_location(res_seq, ss_seq, ss_symbol):
    '''
    Collect every contiguous run of `ss_symbol` in `ss_seq`.

    Returns a dict that maps the residues of each run (the matching slice
    of `res_seq`) to its location written as 'start-end', using 1-based,
    inclusive residue numbers.

    A '*' sentinel is appended to both strings. Position 0 therefore looks
    back at the sentinel (index -1) and the last real position is followed
    by it, so runs touching either end of the sequence are still opened and
    closed correctly.

    Caveat: the dict is keyed by the residue string, so two runs with the
    same residues share one key and the later location replaces the earlier.
    '''
    padded_res = res_seq + '*'
    padded_ss = ss_seq + '*'

    segments = {}

    for pos, current in enumerate(padded_ss):
        was_inside = padded_ss[pos - 1] == ss_symbol
        is_inside = current == ss_symbol

        if is_inside and not was_inside:
            # a run opens at this position
            run_start = pos
        elif was_inside and not is_inside:
            # the run closed just before this position; Python's exclusive
            # slice end `pos` equals the 1-based number of the last residue
            segments[padded_res[run_start:pos]] = f'{run_start + 1}-{pos}'

    return segments

# 1. FASTA sequence

In [3]:
# protein list import
# The first row of the sheet as read supplies the column labels, and the
# first column is dropped; the remaining rows (from row 1 on) are the data.
df = pd.read_excel('./SourceData/AH_protein_list.xlsx')
columns = df.iloc[0, 1:]
df = df.iloc[1:, 1:]
df.columns = columns

In [4]:
# # Protein entry list
# entry_list = df.Entry.unique().tolist()

# # AH location start position list
# AH_start_list = df.AH_location_start.unique

In [8]:
# Obtain FASTA seq from Uniprot, make AH position data
# One request per entry; each response body is written to
# ./TrainingData/Original_fasta/<entry>.fas, with a 1 s pause between requests.
for entry in df.Entry:
    print(entry)
    try:
        r = get_url(f'{WEBSITE_API}search?query=accession:{entry}&format=fasta&compressed=false')
    except requests.exceptions.ConnectionError as err:
        # `r` is unbound here on the first iteration and holds the previous
        # entry's response afterwards, so it must not be touched; report the
        # failure and stop downloading
        print(f'{entry}: connection failed ({err}); stopping download')
        break

    # export fasta (the with-block closes the file)
    fasta = r.text
    with open('./TrainingData/Original_fasta/%s.fas' % entry, 'w') as f:
        f.write(fasta)

    sleep(1)

Q9UH99
Q8N6T3
Q15643
P20606
Q14534


# s4pred for ss prediction

In [21]:
model = '../s4pred/run_model.py'

for entry in df.Entry[:1]:
    fasta_path = './TrainingData/Original_fasta/%s.fas' % entry
    ss_path = './TrainingData/s4pred_output/%s_ss.fas' % entry

    # run s4pred as an argument list (no shell), so the entry name is never
    # interpreted by a shell; stdout is redirected into the output file.
    # check=True raises CalledProcessError if s4pred exits with an error
    # instead of continuing with an empty or partial output file.
    command = ['python', model, '--outfmt', 'fas', fasta_path]
    with open(ss_path, 'w') as ss_out:
        subprocess.run(command, stdout=ss_out, check=True)
    print(entry, ' done')

    # the last two lines of the output are read as the amino-acid sequence
    # and the secondary-structure string; the with-block closes the file
    with open(ss_path, 'r') as file:
        lines = file.readlines()
    aa_seq = lines[-2].strip()
    sse = lines[-1].strip()

    l = find_SS_location(aa_seq, sse, 'H')

    print(l)

Q9UH99  done
{'LRSAVSRAGSLLWMVA': '155-170', 'PGRLFRLLYWWAGTTWYRLTTAASLLDVFVL': '173-203', 'LKTFLWFLLPLLLLTCLTYGAWYF': '210-233', 'PY': '235-236', 'L': '238-238', 'PALVSWWA': '243-250', 'PHFQAEQRVMSRVHSLERRLEALAAEFSSNWQKEAMRLERLE': '268-309', 'HEDTLALLEGLVSRREAALKEDFRRETAARIQEELSALRAEHQQDSEDLFKKIVRASQESEARIQQLKSEWQSMTQESFQESSVKELRRLEDQLAGLQQELAALALKQSSVAEEV': '325-439', 'PQQIQAVRDDVESQ': '443-456', 'PAWISQFLA': '458-466', 'LLQREEMQAQLRELESKILTHVAEMQ': '475-500', 'AREAAASLSLTLQK': '504-517', 'EEQVHHIVKQALQRYSE': '525-541', 'I': '544-544', 'LADYALE': '546-552', 'TKTALLSLF': '569-577'}


In [18]:
# iterator = SeqIO.read(file, 'fasta')
# records = SeqIO.to_dict(iterator)
# print(iterator.seq)
# # print(list)
# #     print(seq_rec.seq)
# file.close()
# print(sse)

# 2. Helix sequence extracted from AlphaFold predicted structure

## DSSP - primarily used

In [None]:
# PDB file passed to get_aaSeq_and_ss (relative path, three directories up)
file_path = '../../../AF-Q9WU40-F1-model_v3.pdb'

In [31]:
# shared parser instance; get_aaSeq_and_ss reads this notebook-level name `p`
p = PDBParser()

In [70]:
def get_aaSeq_and_ss(pdb_filepath):
    '''
    Run DSSP on a PDB file and return its residue and secondary-structure
    strings.

    Parameters
    ----------
    pdb_filepath : str
        Path of the PDB file; it is parsed with the notebook-level parser
        `p` and also handed to DSSP.

    Returns
    -------
    tuple of (str, str)
        `res_seq`, the residue symbols, and `ss_seq`, the secondary-structure
        symbols, both in the order of the DSSP keys (one character per key).
    '''
    # get dssp from the PDB file; only the first model of the structure is used
    structure = p.get_structure('name', pdb_filepath)
    dssp_result = DSSP(structure[0], pdb_filepath)

    residue_symbols = []
    ss_symbols = []

    # walk the keys once and fetch each record once, instead of rebuilding
    # the key list and re-reading the record on every iteration
    for residue_key in dssp_result.keys():
        record = dssp_result[residue_key]
        residue_symbols.append(record[1])  # residue symbol
        ss_symbols.append(record[2])       # secondary-structure symbol

    return ''.join(residue_symbols), ''.join(ss_symbols)

In [71]:
# residue string and secondary-structure string for the PDB file at file_path
res_seq, ss_seq = get_aaSeq_and_ss(file_path)

In [85]:
# 'H' segments as {residue string: 'start-end'} with 1-based residue numbers
find_SS_location(res_seq, ss_seq, ss_symbol='H')

{'DEELFSQLRR': '15-24',
 'RPVYLKKLKKLREEEQQQQQQQQQQQHRA': '37-65',
 'EEELLQQFKRE': '461-471',
 'SFSAHYLSMFLLTAACLFFLILGLTYLGMR': '478-507',
 'ESEKNLLMSTLYKLHDRLAQIAGDHEC': '531-557',
 'VQEAAAYLKNL': '567-577',
 'EDVFNTSLLWIFKN': '582-595',
 'FWCRFRRAFITVTHRLLLLCLGVVLVCVALRYMRYRWTKEEEETRQMYDMVVKIIDVLRSHNEACQE': '630-696',
 'LPHVRDSL': '706-713',
 'KKVWDRAVDFLAAN': '723-736',
 'WHLAIQEAILEK': '812-823',
 'PEYAGKAFKAL': '851-861',
 'LDRYHHR': '878-884',
 'SHLR': '906-909'}

## Using biotite - but not used

In [8]:
# from tempfile import gettempdir, NamedTemporaryFile
# import biotite.structure as struc
# import biotite.structure.io as strucio
# import biotite.structure.io.pdb as pdb
# import biotite.database.rcsb as rcsb

# file_path = rcsb.fetch("1l2y", "pdb", gettempdir())
# pdb_file = pdb.PDBFile.read(file_path)
# tc5b = pdb_file.get_structure()
# print(type(tc5b).__name__)
# # print(tc5b.stack_depth())
# print(tc5b.array_length())
# print(tc5b.shape)

# array = strucio.load_structure(file_path)
# sse = struc.annotate_sse(array, chain_id='A')

# sse = ''.join(sse)

In [None]:
# NOTE(review): at notebook level `model` is assigned the s4pred script path
# (a str) in the s4pred cell; inside get_aaSeq_and_ss it is only a local name.
# Confirm which structure model is meant to be passed here.
# NOTE(review): this assignment rebinds `dssp`, which the import cell bound to
# the biotite.application.dssp module.
dssp = DSSP(model, '/local-pdb/1mot.pdb', dssp='mkdssp')

# DeepTMHMM

In [1]:
import biolib

In [3]:
# load the DeepTMHMM application by its BioLib identifier
deeptmhmm = biolib.load('DTU/DeepTMHMM')

# Run DeepTMHMM
# the input FASTA file is passed through the command-line style `args` string
deeptmhmm_res = deeptmhmm.cli(args='--fasta ./TrainingData/Original_fasta/Q15643.fas')

# # Save the results
# deeptmhmm_res.save_files("./biolib_results/")

# deeptmhmm_res.ipython_markdown()

2022-08-06 11:38:23,160 | INFO : Loaded project DTU/DeepTMHMM:1.0.12
2022-08-06 11:38:23,973 | INFO : Job "8bfee793-340f-4b6f-b252-65373fce959f" is starting...
2022-08-06 11:38:46,409 | INFO : Cloud: Server capacity is being allocated. Please wait...
2022-08-06 11:38:56,626 | INFO : Cloud: Server capacity is being allocated. Please wait...
2022-08-06 11:39:06,982 | INFO : Cloud: Server capacity is being allocated. Please wait...
2022-08-06 11:39:17,199 | INFO : Cloud: Server capacity is being allocated. Please wait...
2022-08-06 11:39:27,434 | INFO : Cloud: Server capacity is being allocated. Please wait...
2022-08-06 11:39:37,667 | INFO : Cloud: Server capacity is being allocated. Please wait...
2022-08-06 11:39:47,884 | INFO : Cloud: Server capacity is being allocated. Please wait...
2022-08-06 11:39:58,106 | INFO : Cloud: Server capacity is being allocated. Please wait...
2022-08-06 11:40:08,329 | INFO : Cloud: Server capacity is being allocated. Please wait...
2022-08-06 11:40:18,5

In [7]:
# NOTE(review): incomplete expression; the attribute or method after the dot
# is missing, so this cell is a syntax error as written. TODO restore the call.
deeptmhmm_res.

bytearray(b'')