In [2]:
import requests
from Bio import PDB
from elasticsearch import Elasticsearch, helpers
import os
import json

In [34]:
class AlphaFoldModel:
    """
    Class to represent a valid AlphaFold model.

    By default, should be initialized with an AlphaFold model filename/filepath, but can be
    initialized with a Uniprot ID using the '.fromUniprotID()' method. Also can be done with
    a model filename using '.fromModelFile()' (but not necessary to call that method explicitly).

    Code help from: https://stackoverflow.com/a/3135079/5420857
    """
    def __init__(self, alphafold_model_filepath, auto_download=True):
        """
        Initialize a new AlphaFold Model instance from a model's filename/path.

        By default, download the associated mmCIF model file. If you already have the mmCIF file locally,
        initialize with 'auto_download=False'.
        """

        self.alphafold_model_filepath = alphafold_model_filepath
        self.alphafold_model_filename = self.alphafold_model_filepath.split('/')[-1]

        if (self.alphafold_model_filename.startswith('AF-') and self.alphafold_model_filename.endswith('.cif')):
            self.uniprot_id = self.alphafold_model_filename.split('/')[-1].split('-')[1].upper()
        else:
            print("Provided AlphaFold file does not adhere to standard naming scheme--i.e., does not start with 'AF-' or end with '.cif'.")
            return None

        if not os.path.exists(self.alphafold_model_filepath):
            print("Provided AlphaFold filename and/or path does not exist.")

            if auto_download:
                print("Downloading model file automatically...")
                self.download_alphafold_model(self.alphafold_model_filename)
                self.mmcif_metadata = self.extract_alphafold_model_metadata_mmcif(self.alphafold_model_filename)
            else:
                print("Will NOT download model file automatically. Returning None.")
                return None

        else:
            self.mmcif_metadata = self.extract_alphafold_model_metadata_mmcif(self.alphafold_model_filepath)


    @classmethod
    def fromUniprotID(cls, uniprot_id):
        """
        Create an AlphaFoldModel instance from a given Uniprot ID.

        :params uniprot_id:         A UniProt ID as a string.

        :return AlphaFoldModel:     Instantiation of an AlphaFoldModel object.
        """
        uniprot_id_upper = str(uniprot_id).upper()

        ## Check if valid Uniprot ID
        url = "https://www.uniprot.org/uniprot/"+uniprot_id_upper
        response = requests.get(url)
        if response.status_code >= 400:
            print("Provided Uniprot ID '"+uniprot_id_upper+"' does not exist.")
            return None

        ## Check if corresponding AlphaFold entry exists for Uniprot ID
        url = "https://alphafold.ebi.ac.uk/entry/"+uniprot_id_upper
        response = requests.get(url)
        if response.status_code >= 400:
            print("Requested Uniprot ID entry '"+uniprot_id_upper+"' does not have an associated AlphaFold model.")
            return None

        ## If the Uniprot ID passes both above checks, proceed with initializing an instance with the filename
        alphafold_model_filename_mmcif = "AF-"+uniprot_id_upper+"-F1-model_v1.cif"

        return cls(alphafold_model_filename_mmcif)

    @classmethod
    def fromModelFile(cls, alphafold_model_filepath):
        """
        Create an AlphaFoldModel instance from a given mmCIF model file.

        Note that this class method doesn't actually need to be called explicitly, as the default behavior of instantiating a new
        AlphaFoldModel object is to accept the filepath/name in the base class--e.g., just call, AlphaFoldModel('modelfile.cif').

        :params alphafold_model_filepath:   Path to AlphaFold mmCIF model file.

        :return AlphaFoldModel:             Instantiation of an AlphaFoldModel object.
        """
        return cls(alphafold_model_filepath)


    def download_alphafold_model(self, filename):
        """
        Download an AlphaFold model file locally, from: https://alphafold.ebi.ac.uk/

        :params filename:       Filename of the AlphaFold model file to download.
        """
        url = "https://alphafold.ebi.ac.uk/files/"+filename
        # url = "https://alphafold.ebi.ac.uk/files/AF-P0ACR9-F1-model_v1.cif"

        ## Check if corresponding AlphaFold entry exists for Uniprot ID
        response = requests.get(url)
        if response.status_code >= 400:
            print("Requested AlphaFold model file '"+filename+"' does not exist.")
            return None

        with requests.get(url, stream=True) as r:
            r.raise_for_status()
            with open(filename, 'wb') as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)

        return

    def extract_alphafold_model_metadata_mmcif(self, filepath):
        """
        Return the metadata (or 'doc_body') from an AlphaFold mmCIF model file, which may be appended to a bulk index request.

        :params filepath:     File path of the AlphaFold mmCIF model file to extract metadata from.

        :return metadata:     Dictionary of metadata to index.
        """
        metadata={}

        parser = PDB.MMCIFParser()
        structure = parser.get_structure("", filepath)
        parsed_info = parser._mmcif_dict

        for token in parsed_info:
            if not (token.startswith('_atom_site') or token == 'data_'):   # Exclude all "_atom_site" tokens as well as the first "data_" header that the parser thinks is a token  (e.g., "data_AF-P0ACR9-F1") and ends up splitting up the string into a list of individual characters
                # All token values are by default loaded by the parser as a list of strings, regardless of the actual data type
                # So first check if value can be represented as an integer
                if all(i.isdigit() for i in parsed_info[token]):
                    metadata.update({token: [int(value) for value in parsed_info[token]]})
                else:
                    # Next try to represent it as a float
                    try:
                        metadata.update({token: [float(value) for value in parsed_info[token]]})
                    # Last, it must just be a string
                    except ValueError:
                        metadata.update({token: [value.replace("\n","") for value in parsed_info[token]]})

        metadata.update({"Uniprot_ID": self.uniprot_id})

        return metadata


### Example of parsing local mmCIF AlphaFold models 

In [35]:
alpha_fold_model_directory = './example_model_files'

In [36]:
os.listdir(alpha_fold_model_directory)

['AF-P0ACR9-F1-model_v1.pdb',
 'AF-P0ACR9-F1-model_v1.cif',
 'AF-A5A605-F1-model_v1.cif',
 'AF-Q7DFV4-F1-model_v1.pdb',
 'AF-P0CF41-F1-model_v1.cif',
 'AF-P0CF41-F1-model_v1.pdb',
 'AF-Q7DFV4-F1-model_v1.cif']

In [37]:
for file in os.listdir(alpha_fold_model_directory):
    if (file.startswith('AF-') and file.endswith('.cif')):
        filepath = os.path.join(alpha_fold_model_directory, file)
        model_instance = AlphaFoldModel(filepath)
        
        print(model_instance.mmcif_metadata, '\n\n')

{'_entry.id': ['AF-P0ACR9-F1'], '_af_target_ref_db_details.gene': ['mprA'], '_af_target_ref_db_details.seq_db_sequence_checksum': ['BF4D3E34DAD0718A'], '_af_target_ref_db_details.seq_db_sequence_version_date': ['2005-11-22'], '_atom_type.symbol': ['C', 'N', 'O', 'S'], '_audit_author.name': ['Jumper, John', 'Evans, Richard', 'Pritzel, Alexander', 'Green, Tim', 'Figurnov, Michael', 'Ronneberger, Olaf', 'Tunyasuvunakool, Kathryn', 'Bates, Russ', 'Zidek, Augustin', 'Potapenko, Anna', 'Bridgland, Alex', 'Meyer, Clemens', 'Kohl, Simon A. A.', 'Ballard, Andrew J.', 'Cowie, Andrew', 'Romera-Paredes, Bernardino', 'Nikolov, Stanislav', 'Jain, Rishub', 'Adler, Jonas', 'Back, Trevor', 'Petersen, Stig', 'Reiman, David', 'Clancy, Ellen', 'Zielinski, Michal', 'Steinegger, Martin', 'Pacholska, Michalina', 'Berghammer, Tamas', 'Silver, David', 'Vinyals, Oriol', 'Senior, Andrew W.', 'Kavukcuoglu, Koray', 'Kohli, Pushmeet', 'Hassabis, Demis'], '_audit_author.pdbx_ordinal': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10,

### Example of retrieving AlphaFold mmCIF model file provided a UniProt ID

In [38]:
os.listdir('.')

['Untitled.ipynb',
 'example_model_files',
 'README.md',
 'alphafold_api.py',
 '.ipynb_checkpoints',
 'alphafold.ipynb']

In [39]:
my_new_AF_model = AlphaFoldModel.fromUniprotID("P08709")

Provided AlphaFold filename and/or path does not exist.
Downloading model file automatically...


In [40]:
os.listdir('.')

['Untitled.ipynb',
 'example_model_files',
 'README.md',
 'alphafold_api.py',
 '.ipynb_checkpoints',
 'AF-P08709-F1-model_v1.cif',
 'alphafold.ipynb']

In [41]:
my_new_AF_model.mmcif_metadata

{'_entry.id': ['AF-P08709-F1'],
 '_af_target_ref_db_details.gene': ['F7'],
 '_af_target_ref_db_details.seq_db_sequence_checksum': ['9B5D501669D67B06'],
 '_af_target_ref_db_details.seq_db_sequence_version_date': ['1988-01-01'],
 '_atom_type.symbol': ['C', 'N', 'O', 'S'],
 '_audit_author.name': ['Jumper, John',
  'Evans, Richard',
  'Pritzel, Alexander',
  'Green, Tim',
  'Figurnov, Michael',
  'Ronneberger, Olaf',
  'Tunyasuvunakool, Kathryn',
  'Bates, Russ',
  'Zidek, Augustin',
  'Potapenko, Anna',
  'Bridgland, Alex',
  'Meyer, Clemens',
  'Kohl, Simon A. A.',
  'Ballard, Andrew J.',
  'Cowie, Andrew',
  'Romera-Paredes, Bernardino',
  'Nikolov, Stanislav',
  'Jain, Rishub',
  'Adler, Jonas',
  'Back, Trevor',
  'Petersen, Stig',
  'Reiman, David',
  'Clancy, Ellen',
  'Zielinski, Michal',
  'Steinegger, Martin',
  'Pacholska, Michalina',
  'Berghammer, Tamas',
  'Silver, David',
  'Vinyals, Oriol',
  'Senior, Andrew W.',
  'Kavukcuoglu, Koray',
  'Kohli, Pushmeet',
  'Hassabis, Dem

In [42]:
# Print each of the available 1-letter sequence codes
for one_letter_seq_code in ['_entity_poly.pdbx_seq_one_letter_code', '_entity_poly.pdbx_seq_one_letter_code_can', '_struct_ref.pdbx_seq_one_letter_code']:
    print(one_letter_seq_code)
    print(my_new_AF_model.mmcif_metadata[one_letter_seq_code], '\n')

_entity_poly.pdbx_seq_one_letter_code
['MVSQALRLLCLLLGLQGCLAAGGVAKASGGETRDMPWKPGPHRVFVTQEEAHGVLHRRRRANAFLEELRPGSLERECKEEQCSFEEAREIFKDAERTKLFWISYSDGDQCASSPCQNGGSCKDQLQSYICFCLPAFEGRNCETHKDDQLICVNENGGCEQYCSDHTGTKRSCRCHEGYSLLADGVSCTPTVEYPCGKIPILEKRNASKPQGRIVGGKVCPKGECPWQVLLLVNGAQLCGGTLINTIWVVSAAHCFDKIKNWRNLIAVLGEHDLSEHDGDEQSRRVAQVIIPSTYVPGTTNHDIALLRLHQPVVLTDHVVPLCLPERTFSERTLAFVRFSLVSGWGQLLDRGATALELMVLNVPRLMTQDCLQQSRKVGDSPNITEYMFCAGYSDGSKDSCKGDSGGPHATHYRGTWYLTGIVSWGQGCATVGHFGVYTRVSQYIEWLQKLMRSEPRPGVLLRAPFP'] 

_entity_poly.pdbx_seq_one_letter_code_can
['MVSQALRLLCLLLGLQGCLAAGGVAKASGGETRDMPWKPGPHRVFVTQEEAHGVLHRRRRANAFLEELRPGSLERECKEEQCSFEEAREIFKDAERTKLFWISYSDGDQCASSPCQNGGSCKDQLQSYICFCLPAFEGRNCETHKDDQLICVNENGGCEQYCSDHTGTKRSCRCHEGYSLLADGVSCTPTVEYPCGKIPILEKRNASKPQGRIVGGKVCPKGECPWQVLLLVNGAQLCGGTLINTIWVVSAAHCFDKIKNWRNLIAVLGEHDLSEHDGDEQSRRVAQVIIPSTYVPGTTNHDIALLRLHQPVVLTDHVVPLCLPERTFSERTLAFVRFSLVSGWGQLLDRGATALELMVLNVPRLMTQDCLQQSRKVGDSPNITEYMFCAGYSDGSKDSCKGDSGGPHATHYRGTWYLTGIVSWGQGCATVGHFGVYTRVSQYIE

In [43]:
# Print AlphaFold residue-specific quality metrics ("plDDT", predicted local distance difference test) 
my_new_AF_model.mmcif_metadata['_ma_qa_metric_local.metric_value']

[45.48,
 50.42,
 55.96,
 56.26,
 60.11,
 56.64,
 56.09,
 64.27,
 62.05,
 57.2,
 57.33,
 62.34,
 54.48,
 39.67,
 46.72,
 39.47,
 33.45,
 30.78,
 34.3,
 36.38,
 29.66,
 26.73,
 29.92,
 27.62,
 31.44,
 25.74,
 34.29,
 32.02,
 28.58,
 26.78,
 31.57,
 31.8,
 36.57,
 34.22,
 37.74,
 42.67,
 37.61,
 41.38,
 43.04,
 50.37,
 53.79,
 52.11,
 54.5,
 69.87,
 64.48,
 78.12,
 78.03,
 73.39,
 72.9,
 76.88,
 75.68,
 71.52,
 71.56,
 74.78,
 63.44,
 62.9,
 56.35,
 59.79,
 58.82,
 58.73,
 58.56,
 62.63,
 69.04,
 77.75,
 75.73,
 72.65,
 76.41,
 81.4,
 78.66,
 81.92,
 78.61,
 86.26,
 84.85,
 84.0,
 84.65,
 84.82,
 86.77,
 82.13,
 82.56,
 82.31,
 79.61,
 86.97,
 87.91,
 87.19,
 88.76,
 89.44,
 88.37,
 90.71,
 89.84,
 87.48,
 88.96,
 86.81,
 90.3,
 87.77,
 88.47,
 87.35,
 87.3,
 85.5,
 83.28,
 84.56,
 84.49,
 81.42,
 80.55,
 82.51,
 80.72,
 83.15,
 83.18,
 85.34,
 86.67,
 92.25,
 90.32,
 91.49,
 93.28,
 95.25,
 96.33,
 95.09,
 93.44,
 93.61,
 95.52,
 96.15,
 95.51,
 92.26,
 87.85,
 85.28,
 83.58,
 85.09,
 90

### Template for indexing metadata to an Elasticsearch instance

In [44]:
def append_doc_body_to_bulk_index_request(doc_body, bulk_index_request, target_index):
    """
    Append a metadata dictionary (or any "document body") to a growing bulk index request string.

    :params doc_body:             Metadta document body to append to bulk index request, as a dictionary.
    :params bulk_index_request:   Growing bulk index request, as a string of JSONs sepearated by newline characters.
    :params target_index:         ElasticSearch index to post documents to.

    :return bulk_index_request:   The extended bulk index request string.
    """

    doc_target={ "index" : { "_index" : target_index } }

    bulk_index_request += json.dumps(doc_target)+'\n'+json.dumps(doc_body)+'\n'

    return bulk_index_request

In [45]:
def submit_bulk_index_request(es, bulk_index_request):
    """
    Submit a bulk index request string to an Elasticsearch instance.

    :params es:                   ElasticSearch instance/object to connect to.
    :params bulk_index_request:   Bulk index request to submit.
    """

    es.bulk( body=bulk_index_request )

    return

In [46]:
es = Elasticsearch(['localhost:9200'], timeout=1000)

In [47]:
## Index data using a bulk index request, but posting only a few hundred models at a time

alpha_fold_model_directory = './example_model_files'
target_es_index = 'AlphaFold_Models'

bulk_index = ""
bulk_index_counter = 0

for file in os.listdir(alpha_fold_model_directory):
    if (file.startswith('AF-') and file.endswith('.cif')):
        try:
            filepath = os.path.join(alpha_fold_model_directory, file)
            model_instance = AlphaFoldModel(filepath)
            bulk_index = append_doc_body_to_bulk_index_request(model_instance.mmcif_metadata, bulk_index, target_es_index)
            bulk_index_counter += 1
            if bulk_index_counter == 200:
#                 submit_bulk_index_request(es, bulk_index)
                bulk_index_counter = 0
                bulk_index = ""
        except Exception as err:
            print("!!! Exception:", err)

## Submit final bulk index register 
# submit_bulk_index_request(es, bulk_index)

print(bulk_index)

{"index": {"_index": "AlphaFold_Models"}}
{"_entry.id": ["AF-P0ACR9-F1"], "_af_target_ref_db_details.gene": ["mprA"], "_af_target_ref_db_details.seq_db_sequence_checksum": ["BF4D3E34DAD0718A"], "_af_target_ref_db_details.seq_db_sequence_version_date": ["2005-11-22"], "_atom_type.symbol": ["C", "N", "O", "S"], "_audit_author.name": ["Jumper, John", "Evans, Richard", "Pritzel, Alexander", "Green, Tim", "Figurnov, Michael", "Ronneberger, Olaf", "Tunyasuvunakool, Kathryn", "Bates, Russ", "Zidek, Augustin", "Potapenko, Anna", "Bridgland, Alex", "Meyer, Clemens", "Kohl, Simon A. A.", "Ballard, Andrew J.", "Cowie, Andrew", "Romera-Paredes, Bernardino", "Nikolov, Stanislav", "Jain, Rishub", "Adler, Jonas", "Back, Trevor", "Petersen, Stig", "Reiman, David", "Clancy, Ellen", "Zielinski, Michal", "Steinegger, Martin", "Pacholska, Michalina", "Berghammer, Tamas", "Silver, David", "Vinyals, Oriol", "Senior, Andrew W.", "Kavukcuoglu, Koray", "Kohli, Pushmeet", "Hassabis, Demis"], "_audit_author.pdbx