## 'Finding a Protein Motif'

**Connections**: `PROT`, `SUBS`

---


**Given**: At most 15 UniProt Protein Database access IDs.

**Return**: For each protein possessing the N-glycosylation motif, output its given access ID followed by a list of locations in the protein string where the motif can be found.



*Notes*:

A protein motif is written in a shorthand notation as follows:
- [XY] means "either X or Y" and {X} means "any amino acid except X."
    - Example: the N-glycosylation motif is written as N{P}[ST]{P}.


In [52]:
# Libraries to load:

import os
import urllib.request, shutil, tempfile
from Bio import SeqIO


In [None]:
# Previous functions generated


In [3]:
# Residue alphabets, one single-character string per symbol.
dna_nt_list = list('ACGT')
rna_nt_list = list('ACGU')
# The 20 standard amino acids (one-letter codes), in the order used by this notebook.
aa_residue_list = list('ARNDC' 'QEGHI' 'LKMFP' 'STWYV')


In [77]:
def _find_motif_start_positions(record_seq, motif_targets, label):
    '''
    Input: a protein string, a list of concrete motif strings, a label used in the printout
    Output: sorted list of the distinct 1-based positions where any motif starts
    '''
    hits = set()
    for target in motif_targets:
        if not target:
            # An empty motif would "match" at every position; ignore it.
            continue
        start = record_seq.find(target)
        while start != -1:
            print('HIT: ', label, target)
            hits.add(start + 1)
            # Resume one residue further on so overlapping occurrences are found,
            # including one that ends exactly at the end of the sequence.
            start = record_seq.find(target, start + 1)
    return sorted(hits)


def protein_motif_locator_uniprot(uniprot_id_list, protein_motif_targets_list, outfile_filename):
    '''
    Input: list of UniProt Database IDs, list of all possible protein motifs,
           output file name (without the '.txt' extension; resolved against the
           current working directory unless it is an absolute path)
    -!- Dependencies: OS, URLLIB
    Output: an output file with the Uniprot ID's listed and the subsequent motif locations

    For every ID the FASTA record is downloaded from UniProt; only the part of the
    ID before the first '_' is used in the URL. Each protein with at least one hit
    contributes two lines to the file: the ID as given (surrounding whitespace
    removed) and the sorted, space-separated 1-based motif start positions. The
    file has no trailing newline and is empty when nothing matched. A 'HIT:' line
    is printed for every match. Blank IDs are skipped.

    Errors raised by urllib.request.urlopen (e.g. unknown ID, no network) are not
    caught here; in that case the output file is not written.
    '''
    mainpath = 'http://www.uniprot.org/uniprot/'
    output_lines = []
    for raw_id in uniprot_id_list:
        access_id = raw_id.strip()
        if not access_id:
            # e.g. a trailing empty line in the ID file
            continue
        uid = access_id.split('_')[0]
        with urllib.request.urlopen(mainpath + uid + '.fasta') as request:
            body = request.read().decode('utf-8')
        # The first line is the FASTA header; the remaining lines are the sequence.
        record_seq = ''.join(body.splitlines()[1:])

        target_hits = _find_motif_start_positions(record_seq,
                                                  protein_motif_targets_list,
                                                  access_id)
        if target_hits:
            output_lines.append(access_id)
            output_lines.append(' '.join(str(position) for position in target_hits))

    # Joining with '\n' leaves no trailing newline, so nothing has to be truncated.
    outfile_path = os.path.join(os.getcwd(), outfile_filename + '.txt')
    with open(outfile_path, 'w') as output:
        output.write('\n'.join(output_lines))
    return



In [71]:
# Load the sample UniProt access IDs, one per line.
with open(os.getcwd()+'/datasets/rosalind_sample_dataset.txt') as infile:
    uniprot_id_list = [line.strip('\n') for line in infile]


# Motif Target: N{P}[ST]{P}
# Expand the shorthand into every concrete 4-mer: all N?S? strings first, then all N?T?.
target_motifs = ['N'+second+third+fourth
                 for third in 'ST'
                 for second in aa_residue_list if second != 'P'
                 for fourth in aa_residue_list if fourth != 'P']

protein_motif_locator_uniprot(
    uniprot_id_list,
    target_motifs,
    'answer_submissions/rosalind_sample_dataset_answer',
)

# Drop the per-run globals so the next cell starts clean.
del uniprot_id_list, target_motifs


HIT:  B5ZC00 NISA
HIT:  B5ZC00 NLSK
HIT:  B5ZC00 NFSD
HIT:  B5ZC00 NSSN
HIT:  B5ZC00 NWTE
HIT:  P07204_TRBM_HUMAN NASQ
HIT:  P07204_TRBM_HUMAN NTSY
HIT:  P07204_TRBM_HUMAN NNTS
HIT:  P07204_TRBM_HUMAN NQTA
HIT:  P07204_TRBM_HUMAN NQTS
HIT:  P20840_SAG1_YEAST NFSD
HIT:  P20840_SAG1_YEAST NSSQ
HIT:  P20840_SAG1_YEAST NTSA
HIT:  P20840_SAG1_YEAST NATR
HIT:  P20840_SAG1_YEAST NRTT
HIT:  P20840_SAG1_YEAST NDTN
HIT:  P20840_SAG1_YEAST NITN
HIT:  P20840_SAG1_YEAST NITV
HIT:  P20840_SAG1_YEAST NFTS
HIT:  P20840_SAG1_YEAST NTTF
HIT:  P20840_SAG1_YEAST NTTY


In [78]:
# Load the UniProt access IDs for the first attempt, one per line.
with open(os.getcwd()+'/datasets/rosalind_mprt_attempt01.txt') as infile:
    uniprot_id_list = [line.strip('\n') for line in infile]

# Show the IDs, followed by one empty line.
print(uniprot_id_list, end='\n\n')

# Motif Target: N{P}[ST]{P}
# Expand the shorthand into every concrete 4-mer: all N?S? strings first, then all N?T?.
target_motifs = ['N'+second+third+fourth
                 for third in 'ST'
                 for second in aa_residue_list if second != 'P'
                 for fourth in aa_residue_list if fourth != 'P']

protein_motif_locator_uniprot(
    uniprot_id_list,
    target_motifs,
    'answer_submissions/rosalind_mprt_submission_attempt01',
)

#del uniprot_id_list, target_motifs


['Q60960', 'Q55AB5', 'Q3Z2Z2', 'P01045_KNH2_BOVIN', 'P00740_FA9_HUMAN', 'P01047_KNL2_BOVIN', 'A9N9G8', 'Q5FTZ8', 'Q5FMJ3', 'Q924A4', 'A8F2D7', 'P02749_APOH_HUMAN', 'O08537_ESR2_MOUSE']

HIT:  Q60960 NCSA
HIT:  Q60960 NGSG
HIT:  Q60960 NKSL
HIT:  Q60960 NATS
HIT:  Q60960 NCTL
HIT:  Q60960 NITA
HIT:  Q55AB5 NRTK
HIT:  Q3Z2Z2 NVTL
HIT:  P01045_KNH2_BOVIN NCSK
HIT:  P01045_KNH2_BOVIN NHSI
HIT:  P01045_KNH2_BOVIN NKSG
HIT:  P01045_KNH2_BOVIN NTSH
HIT:  P01045_KNH2_BOVIN NYSI
HIT:  P01045_KNH2_BOVIN NNTS
HIT:  P01045_KNH2_BOVIN NKTW
HIT:  P00740_FA9_HUMAN NITQ
HIT:  P00740_FA9_HUMAN NSTE
HIT:  P01047_KNL2_BOVIN NCSK
HIT:  P01047_KNL2_BOVIN NHSI
HIT:  P01047_KNL2_BOVIN NKSG
HIT:  P01047_KNL2_BOVIN NTSH
HIT:  P01047_KNL2_BOVIN NYSI
HIT:  P01047_KNL2_BOVIN NNTS
HIT:  P01047_KNL2_BOVIN NKTW
HIT:  Q5FTZ8 NVSV
HIT:  Q5FTZ8 NWTM
HIT:  Q5FMJ3 NISD
HIT:  Q5FMJ3 NLSN
HIT:  Q5FMJ3 NLSE
HIT:  Q924A4 NQTH
HIT:  P02749_APOH_HUMAN NNSL
HIT:  P02749_APOH_HUMAN NWSA
HIT:  P02749_APOH_HUMAN NDTI
HIT:  P02749_

---

### Problem Attempt:

In [79]:
# Load the UniProt access IDs of the actual problem dataset, one per line.
with open(os.getcwd()+'/datasets/rosalind_mprt.txt') as infile:
    uniprot_id_list = [line.strip('\n') for line in infile]


# Motif Target: N{P}[ST]{P}
# Expand the shorthand into every concrete 4-mer: all N?S? strings first, then all N?T?.
target_motifs = ['N'+second+third+fourth
                 for third in 'ST'
                 for second in aa_residue_list if second != 'P'
                 for fourth in aa_residue_list if fourth != 'P']

protein_motif_locator_uniprot(
    uniprot_id_list,
    target_motifs,
    'answer_submissions/rosalind_mprt_submission',
)

# Drop the per-run globals once the submission file has been written.
del uniprot_id_list, target_motifs


HIT:  A5F5B4 NGSV
HIT:  P01046_KNL1_BOVIN NCSK
HIT:  P01046_KNL1_BOVIN NKSG
HIT:  P01046_KNL1_BOVIN NTSH
HIT:  P01046_KNL1_BOVIN NYSI
HIT:  P01046_KNL1_BOVIN NNTS
HIT:  P01046_KNL1_BOVIN NKTW
HIT:  Q00001_RHGA_ASPAC NWSG
HIT:  Q00001_RHGA_ASPAC NITV
HIT:  Q00001_RHGA_ASPAC NSTD
HIT:  P72173 NESE
HIT:  P72173 NKSN
HIT:  P72173 NYSN
HIT:  P10153_RNKD_HUMAN NISN
HIT:  P10153_RNKD_HUMAN NLTT
HIT:  P10153_RNKD_HUMAN NKTR
HIT:  P10153_RNKD_HUMAN NMTC
HIT:  P10153_RNKD_HUMAN NMTS
HIT:  Q16775 NLTV
HIT:  P13473_LMP2_HUMAN NGSI
HIT:  P13473_LMP2_HUMAN NGSV
HIT:  P13473_LMP2_HUMAN NISM
HIT:  P13473_LMP2_HUMAN NLSY
HIT:  P13473_LMP2_HUMAN NSST
HIT:  P13473_LMP2_HUMAN NATC
HIT:  P13473_LMP2_HUMAN NDTC
HIT:  P13473_LMP2_HUMAN NGTV
HIT:  P13473_LMP2_HUMAN NITQ
HIT:  P13473_LMP2_HUMAN NLTD
HIT:  P13473_LMP2_HUMAN NKTY
HIT:  P13473_LMP2_HUMAN NFTK
HIT:  P13473_LMP2_HUMAN NFTV
HIT:  P13473_LMP2_HUMAN NTTH
HIT:  P13473_LMP2_HUMAN NTTF
HIT:  P13473_LMP2_HUMAN NVTQ
HIT:  P01189_COLI_HUMAN NSSS
HIT:  P1049