## Step 1: cleaning data & removing redundant data

In [2]:
import re   
import os
import glob 

In [1]:
from __future__ import print_function  

In [3]:
# Folder holding the raw scrape files, addressed relative to this notebook.
# os.path.join supplies the platform's own separator, which avoids the
# Windows/Mac path-compatibility problem.
scrape_dir = os.path.join('..', 'data-scrapes')

print(scrape_dir)

../data-scrapes


In [5]:
import datetime
import time

# Tag the output file with its creation time (local clock) so we know when
# each file was produced.
ts = time.time()
created_at = datetime.datetime.fromtimestamp(ts)
st = created_at.strftime('%Y-%m-%d-%H%M%S')

print("Converting sequences ... ")

# Destination of the converted sequences: ../data/protein-seqs-<timestamp>.txt
seqs_name = 'protein-seqs-' + st + '.txt'
out_file = os.path.join('..', 'data', seqs_name)

print("Writing to: %s" % out_file)

Converting sequences ... 
Writing to: ../data/protein-seqs-2019-04-21-121716.txt


In [6]:
# Running count of protein records seen so far. The conversion loop below
# increments it and stops once it exceeds its experimentation limit.
num_proteins_done = 0   # TODO: Remove (here to reduce complexity)

# Collect every *.fasta file available in scrape_dir.
# glob.glob returns matches in arbitrary (filesystem-dependent) order; sorting
# makes the processing order, and therefore which records fall under the
# experimentation limit, the same on every run and machine.
fasta_files = sorted(glob.glob(scrape_dir + "/*.fasta"))
print(fasta_files)

['../data-scrapes/all-human-0001.fasta']


In [7]:
# helper function 

def dump_to_file(protein_id, sequence, path=None):
    """Append one ``protein_id,sequence`` line to a text file.

    Args:
        protein_id: identifier string written before the comma.
        sequence: sequence string written after the comma.
        path: file to append to. When omitted, the notebook-level
            ``out_file`` is used (looked up at call time), which is the
            original behaviour.
    """
    target = out_file if path is None else path
    # Append mode: every call adds one line and keeps what is already there.
    with open(target, "a") as f:
        f.write(protein_id + "," + sequence + "\n")

In [10]:
# Convert each FASTA file into "id,sequence" lines appended via dump_to_file.
for fname in fasta_files:
    print("Converting: %s: " % fname)

    # NOTE(review): this dict is never populated in this cell; it is kept only
    # in case a later cell references the name.
    proteins = {}

    with open(fname, 'r') as f:
        protein_seq = ''
        protein_id = ''

        for line in f:
            # Header lines look like:  >[two lowercase chars]|[A-Z0-9 id]|...
            # NOTE(review): a '>' line whose id contains other characters
            # would not match and would be appended to the current sequence;
            # confirm this cannot occur in the input files.
            match = re.search(r'^>([a-z]{2})\|([A-Z0-9]*)\|', line)
            if match:
                # A header either starts the very first record or closes the
                # previous one; in the latter case write the finished record.
                if protein_id != '':
                    dump_to_file(protein_id, protein_seq)

                # Process only a few records during experimentation.
                num_proteins_done += 1
                if num_proteins_done > 10:   # TODO: Remove
                    # The previous record has just been written above. Clear
                    # the id so the flush after the loop does not write the
                    # same record a second time.
                    protein_id = ''
                    break

                # Start accumulating a new record.
                protein_id = match.group(2)
                protein_seq = ''
            else:
                # Not a header: this is one line of the current sequence.
                protein_seq += line.strip()

        # The last record of the file has no following header to trigger its
        # write, so flush it here.
        if protein_id != '':
            dump_to_file(protein_id, protein_seq)

Converting: ../data-scrapes/all-human-0001.fasta: 


## Step 2: selecting only ATP-binding proteins

In [25]:
# Step 2 setup: where the selected protein ids are written, and which
# functions we select on.
print("Converting functions ...")

functions_name = 'protein-functions-' + st + '.txt'
out_file_fns = os.path.join('..', 'data', functions_name)
print(out_file_fns)

# Function ids to keep; for now only ATP-binding proteins.
target_functions = ['0005524']

Converting functions ...
../data/protein-functions-2019-04-21-121716.txt


In [26]:
# Collect every annotation file in scrape_dir.
# glob.glob returns matches in arbitrary (filesystem-dependent) order; sorting
# keeps the order of the collected protein ids reproducible across runs.
annot_files = sorted(glob.glob(scrape_dir + "/*annotations.txt"))
print(annot_files)

['../data-scrapes/all-human-0001-annotations.txt']


In [30]:
import json

has_function = []  # protein ids annotated with one of our target functions

for fname in annot_files:
    with open(fname, 'r') as f:
        for line in f:
            # The pattern has two capture groups (the parts in parentheses):
            #   group 1 - the protein id
            #   group 2 - the text after "GO:" up to the "; F:" part
            match = re.search(r'([A-Z0-9]*)\sGO:(.*);\sF:.*;', line)
            if not match:
                continue

            protein_id = match.group(1)
            function = match.group(2)

            # Keep only proteins whose function is one we are targeting.
            if function in target_functions:
                has_function.append(protein_id)

# Write the result once, after ALL annotation files have been read. Previously
# this sat inside the per-file loop, so the file was rewritten for every input
# and never created when there were no annotation files.
with open(out_file_fns, 'w') as fp:
    json.dump(has_function, fp)

# Take a peek
print(has_function[:10])

['P27361', 'P53779', 'Q9UHC1', 'Q9NYL2', 'O15440', 'P33527', 'Q92887', 'O15438', 'O15439', 'Q5T3U5']
