In [None]:
#  Code ref: https://ritscm.regeneron.com/projects/DSEAIM/repos/secureaccess/browse/securekeyapp.py

In [1]:
!pip install --trusted-host pypi.python.org --trusted-host pypi.org --trusted-host files.pythonhosted.org --upgrade pip

Collecting pip
  Obtaining dependency information for pip from https://files.pythonhosted.org/packages/15/aa/3f4c7bcee2057a76562a5b33ecbd199be08cdb4443a02e26bd2c3cf6fc39/pip-23.3.2-py3-none-any.whl.metadata
  Downloading pip-23.3.2-py3-none-any.whl.metadata (3.5 kB)
Downloading pip-23.3.2-py3-none-any.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 23.2.1
    Uninstalling pip-23.2.1:
      Successfully uninstalled pip-23.2.1
Successfully installed pip-23.3.2
[0m

In [None]:
!pip install  --trusted-host pypi.python.org --trusted-host pypi.org --trusted-host files.pythonhosted.org \
msal \
retry \
simplejson \
openai \
grpcio==1.58.0 \
pymilvus==2.3.5 \
protobuf \
grpcio-tools==1.58.0 \
pymongo \
tiktoken

In [4]:
import os
import openai
import requests
import simplejson as json
from retry import retry
from msal import PublicClientApplication, ConfidentialClientApplication, ClientApplication
import time

In [5]:
@retry (tries=3, delay=2)
def getapikey():
    # These are the AZURE parameters needed by the client application
    # In this scheme, the user does not have to worry about the api key, all of that is handled
    # at the AZURE Back End.   The API key is rotated hourly.
    # We can handle these as Kuberbetes secrets, or as environment variables.
    client_id = os.getenv("AZURE_CLIENT_ID")
    tenant_id = os.getenv("AZURE_TENANT_ID")
    endpoint = os.getenv("AZURE_ENDPOINT")

    scopes = [os.getenv("AZURE_APPLICATION_SCOPE")]

    app = ClientApplication(
        client_id=client_id,
        authority="https://login.microsoftonline.com/" + tenant_id
    )


    acquire_tokens_result = app.acquire_token_by_username_password(username=os.getenv("SVC_ACCOUNT"),
                                                                   password=os.getenv("SVC_PASSWORD"),
                                                                   scopes=scopes)
    if 'error' in acquire_tokens_result:
        print("Error: " + acquire_tokens_result['error'])
        print("Description: " + acquire_tokens_result['error_description'])
        return 2
    else:
        header_token = {"Authorization": "Bearer {}".format(acquire_tokens_result['access_token'])}
        rt = requests.post(url=endpoint, headers=header_token, data=b'{"key":"openaikey2"}')
        return rt.json()

In [6]:
openai.api_type = os.getenv("OPENAI_API_TYPE")
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_version = os.getenv("OPENAI_API_VERSION")
# We are dynamically getting the key from AZURE>   Access is based on the service account/ad group combination
openai.api_key = getapikey()
#print(openai.api_key)

b0e249afda164b639e0227bfe73b61e5


In [7]:
import os
from openai import AzureOpenAI

ai_client = AzureOpenAI(
  api_key = getapikey(),  
  api_version = os.getenv("OPENAI_API_VERSION"),
  azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
)


## Create a function to load multiple proteins from the bioregistry mongo database

In [8]:
from pymongo import MongoClient
username = os.getenv("BIOREGISTRY_USER")
password = os.getenv("BIOREGISTRY_PASSWORD")
host1 = os.getenv("MONGOID_DATABASE_HOST1")
host2 = os.getenv("MONGOID_DATABASE_HOST2")
host3 = os.getenv("MONGOID_DATABASE_HOST3")
db = os.getenv("MONGOID_DATABASE")
mongo_client = MongoClient(f'mongodb://{username}:{password}@{host1}/{db}')
mongo_collection = mongo_client.get_database(db).get_collection('proteins')

In [9]:
from bson.json_util import dumps
# { "$and" : [ { "descriptive_name" : { "$ne" : "null" } }, { "descriptive_name" : { "$exists" : "true" } } ] }
proteins = mongo_collection.find({ "$and" : [ { "descriptive_name" : { "$ne" : None } }, { "descriptive_name" : { "$exists" : "true" } } ] }).limit(100)
for protein in proteins:
	print(dumps(protein))

{"_id": {"$oid": "581764631823767f32002797"}, "_type": "PesProtein", "regn_name": "REGN1487", "descriptive_name": "10xHis(M1-H11).OprF(K12-K360)", "eln_ref": "3184", "created_by": "michael.podgorski", "created_at": {"$date": "2011-04-08T00:00:00Z"}, "targets": [], "linked_pes_prgns": [], "updated_at": {"$date": "2021-04-14T08:20:43.907Z"}, "regn_number": 1487, "synonyms": [{"_id": {"$oid": "581764631823767f32002798"}, "type": "common_name", "value": "10xHis OprF"}], "protein_type": {"_id": {"$oid": "581764631823767f32002799"}, "name": "Custom Structure", "chains": 0, "unique_chains": 0, "svg": "\n                <svg xmlns=\"http://www.w3.org/2000/svg\" width=\"300\" height=\"300\">\n                <rect width=\"300\" height=\"300\" fill=\"#eee\"/>\n                <path transform=\"translate(-150,-50)\" d=\"m305.85 246.3h-31.922c-0.2229-0.74117-1.25352-3.44135-3.09194-8.1006-1.83846-4.65908-2.75768-8.73586-2.75764-12.2303-0.00004-6.67098 0.97488-13.0773 2.92479-19.219 1.94979-6.14154

## Connect to Milvus Vector Database
### Set up schema

In [10]:
from pymilvus import connections, utility, db
conn = connections.connect(host="standalone", port=19530, db_name="default")
db.list_database()

['default', 'bioregistry']

In [11]:
found = False
for database_name in db.list_database():
    if database_name == "bioregistry":
        found = True
if not found:
    db.create_database("bioregistry")

In [12]:
db.list_database()

['default', 'bioregistry']

In [13]:
db.using_database("bioregistry")

In [14]:
utility.list_collections()

['proteins']

In [16]:
new_collection_name = "proteins"

In [17]:
for collection_name in utility.list_collections():
    if collection_name == new_collection_name:
        utility.drop_collection(new_collection_name)

In [18]:
from pymilvus import CollectionSchema, FieldSchema, DataType

protein_id = FieldSchema(
  name="protein_id",
  dtype=DataType.INT64,
  is_primary=True,
)

name = FieldSchema(
  name="name",
  dtype=DataType.VARCHAR,
  max_length=32,
  # The default value will be used if this field is left empty during data inserts or upserts.
  # The data type of `default_value` must be the same as that specified in `dtype`.
  default_value="Unknown"
)

char_count = FieldSchema(
  name="char_count",
  dtype=DataType.INT64,
  # The default value will be used if this field is left empty during data inserts or upserts.
  # The data type of `default_value` must be the same as that specified in `dtype`.
  default_value=9999
)

protein_json = FieldSchema(
  name="protein_json",
  dtype=DataType.JSON,
  max_length=16000,
  # The default value will be used if this field is left empty during data inserts or upserts.
  # The data type of `default_value` must be the same as that specified in `dtype`.
  default_value="Unknown"
)

protein_vector = FieldSchema(
  name="protein_vector",
  dtype=DataType.FLOAT_VECTOR,
  dim=1536
)

schema = CollectionSchema(
  fields=[protein_id, name, char_count, protein_json, protein_vector],
  description="Protein search",
  enable_dynamic_field=True
)

In [19]:
from pymilvus import Collection
collection = Collection(
    name=new_collection_name,
    schema=schema,
    using='default',
    shards_num=2
    )

In [20]:
collection.create_partition("Disc1")
collection.has_partition("Disc1")

True

In [21]:
collection.create_partition("Disc2")
collection.has_partition("Disc2")

True

In [22]:
from pymilvus import Partition
disc2 = Partition(collection, name="Disc2")
disc2.release()
collection.drop_partition("Disc2")
collection.has_partition("Disc2")

False

In [23]:
utility.list_collections()

['proteins']

In [24]:
import tiktoken
def truncate_tokens(string: str, encoding_name: str, max_length: int = 8191) -> str:
    """Truncates a text string based on max number of tokens."""
    encoding = tiktoken.encoding_for_model(encoding_name)
    encoded_string = encoding.encode(string)
    num_tokens = len(encoded_string)

    if num_tokens > max_length:
        string = encoding.decode(encoded_string[:max_length])

    return string

## Insert proteins as vetcors
### { protein_id, name, char_count, protein_json, protein_vector 

In [25]:
from bson.json_util import dumps
from bson import json_util
import json
proteins = mongo_collection.find({"created_by": "kristin.hudson"})
i = 1
for protein in proteins:
	if 'descriptive_name'in protein:
		protein_json = json.dumps(protein,default=json_util.default)
		protein_descriptive_name= f"{protein['descriptive_name']}"
		#tokenized = truncate_tokens(string=protein_string, encoding_name="gpt-3.5-turbo", max_length=8191)
		print(protein_descriptive_name)
		response = ai_client.embeddings.create(
			input = protein_descriptive_name, #tokenized,
			model = "RegnADA002"
		)
		protein_id_column = [i]
		name_column = [protein['regn_name']]
		char_count_column = [len(protein_descriptive_name)]
		protein_json_column = [json.loads(protein_json)]
		protein_vector_column = [response.data[0].embedding]
		data = [protein_id_column, name_column, char_count_column, protein_json_column, protein_vector_column]
		collection.upsert(data)
		i += 1

mROR1(M1-A29).hBZLF1(S40-L48).GCGGS+G4Sx2.hB2m(I21-M119).G4Sx4.hHLA-E(G22-P301;Y105C).PADRE(A1-A13).mycmychis6
mROR1(M1-A29).6His.GCN4(R249-R281).G4Sx3.hGITR_ligand(E74-S199)
mROR1(M1-A29).B-gal(D96-V103).GCGGS+G4sx2.mB2m(I21-M119).G4sx4.mH2D(b)ecto(G25-V309;Y108C).PADRE(A1-A13).mycmychis6
mROR1(M1-A29).GFP(D118-L126).GCGGS+G4Sx2.mB2m(I21-M119).G4Sx4.mH2D(b)_ecto(G25-V309;Y108C).PADRE(A1-A13).mycmychis6
mROR1(M1-A29).hAPOC3(S21-A99).mycmychis6
mROR1(M1-A29).mAPOC3(E21-S99).mycmychis6
mROR1(M1-A29).MfAPOC3mature(S21-A99).mymychis
mROR1(M1-A29).hBTN3A1(Q30-G254).mycmychis6
mROR(M1-A29).hBTNL8_ecto(Q18-K238).mycmychis6
mROR1(M1-A29).mTIM-3ecto(R20-A193).hIgG1_Fc
mROR1(M1-A29).mTIM-3ecto(BALB_C)(R20-A193).hIgG1_Fc
mROR1(M1-A29).hUSP(I50-Y58).GCGGS+G4Sx2.hB2m(I21-M119).G4Sx4.hHLA-B*15_01-01-01(G25-P300;Y108C).mycmychis6
hgH_signal_sequence(M1-S23).hgH(R24-R717).mycmychis6.Furin-recognition-sequence(R1-R4).GSG.T2A(E1-P18).hgL_signal_sequence(M1-S30).hgL(A31-R278)
mROR1(M1-A29).VαhCD3E_HAAW5B

In [26]:

collection.compact()

In [27]:
index_params = {
  "metric_type":"COSINE",
  "index_type":"IVF_SQ8",
  "params":{"nlist":1024},
  
}
collection.create_index(
  field_name="protein_vector", 
  index_name="protein_vector_idx",
  index_params=index_params
)

Status(code=0, message=)

In [28]:
# index on a scalar field
collection.create_index(
  field_name="name", 
  index_name="protein_name_idx"
)

Status(code=0, message=)

## Search , Query and Hybrid

In [37]:
search_string = 'mROR1(M1-A29).hCD8'
response = ai_client.embeddings.create(
    input = search_string,
    model= "RegnADA002"
)
query_embedding = response.data[0].embedding
query_embedding

[-0.03057379461824894,
 -0.013408981263637543,
 -0.021111732348799706,
 -0.015128756873309612,
 -0.006417861208319664,
 0.020189248025417328,
 -0.018238849937915802,
 0.016288453713059425,
 -0.019820252433419228,
 0.0010303830495104194,
 -0.0026274356059730053,
 0.019754361361265182,
 -0.00043818046106025577,
 0.027938123792409897,
 -0.028860608115792274,
 0.016670625656843185,
 0.013250840827822685,
 0.004701379686594009,
 0.006585885304957628,
 -0.00945547316223383,
 -0.014153558760881424,
 0.03231333941221237,
 -0.008671361021697521,
 -0.032155197113752365,
 0.010068266652524471,
 0.022034218534827232,
 0.006065340247005224,
 -0.03384202718734741,
 -0.005785299930721521,
 -0.01248649600893259,
 0.015168292447924614,
 -0.00971245113760233,
 -0.021256694570183754,
 -0.02005746401846409,
 -0.01377138588577509,
 -0.007313989568501711,
 0.017553575336933136,
 -0.015168292447924614,
 0.01991250179708004,
 0.009705862030386925,
 0.024538105353713036,
 -0.01008803490549326,
 -0.001303834025

In [38]:
search_params = {
    "metric_type": "COSINE", 
    "params": {"search_k": 64}
}

In [40]:
# Search
collection = Collection(new_collection_name)
utility.list_collections()
collection.load()
results =  collection.search(
    data=[query_embedding],
    anns_field="protein_vector",
    param=search_params,
    limit=10,
    expr=None,
    output_fields=['protein_json']
)
for result in results[0]:
    print(result)

id: 30, distance: 0.9395024180412292, entity: {'protein_json': {'_id': {'$oid': '581764721823767f3200445e'}, '_type': 'PesProtein', 'regn_name': 'REGN3912', 'descriptive_name': 'mROR1(M1-A29).hCD8b(L22-P170).mIgG2a_Fc', 'eln_ref': '201508250254', 'created_by': 'kristin.hudson', 'created_at': {'$date': '2015-08-27T00:00:00Z'}, 'targets': [], 'linked_pes_prgns': ['pRGN9057'], 'updated_at': {'$date': '2021-04-14T08:20:33.683Z'}, 'regn_number': 3912, 'synonyms': [{'_id': {'$oid': '581764721823767f3200445f'}, 'type': 'common_name', 'value': 'hCD8b ecto-mFc'}], 'protein_type': {'_id': {'$oid': '581764721823767f32004460'}, 'name': 'Custom Structure', 'chains': 0, 'unique_chains': 0, 'svg': '\n                <svg xmlns="http://www.w3.org/2000/svg" width="300" height="300">\n                <rect width="300" height="300" fill="#eee"/>\n                <path transform="translate(-150,-50)" d="m305.85 246.3h-31.922c-0.2229-0.74117-1.25352-3.44135-3.09194-8.1006-1.83846-4.65908-2.75768-8.73586-2.

In [41]:
query_expr = "protein_json['regn_name'] like 'REGN3853'" # Querying the json field

In [42]:
# Query
collection = Collection(new_collection_name)
utility.list_collections()
collection.load()
results = collection.query(
    expr = query_expr,
    offset = 0,
    limit = 10, 
    output_fields = ["name", "protein_json"]
)
results[0]

{'name': 'REGN3853',
 'protein_json': {'_id': {'$oid': '581764711823767f320043b4'},
  '_type': 'PesProtein',
  'regn_name': 'REGN3853',
  'descriptive_name': 'mROR1(M1-A29).VαhCD3E_HAAW5B8-2_(2712N)_VH(E1-S123).G4Sx3.VαhCD3E_HAAW5B8-2_(2712N)_VK(E1-K107).G4Sx3.hIgG1Fc*(D104-K390;H318R;Y319F)',
  'eln_ref': '1',
  'created_by': 'kristin.hudson',
  'created_at': {'$date': '2015-07-18T00:00:00Z'},
  'targets': [{'_id': {'$oid': '62839836820710025eed741c'},
    'target_id': 'C-TGT-99',
    'target_name': 'CD3 (CD3E)',
    'synonyms': None,
    'source_type': 'TargetsDb',
    'source_id': 'C-TGT-99'}],
  'linked_pes_prgns': ['pRGN8966'],
  'updated_at': {'$date': '2021-04-14T08:20:33.113Z'},
  'regn_number': 3853,
  'synonyms': [{'_id': {'$oid': '581764711823767f320043b5'},
    'type': 'common_name',
    'value': ' α-CD3E H1H2712N ScFv-hIgG1Fc'}],
  'protein_type': {'_id': {'$oid': '5d07a077700f3e489862aae5'},
   'name': 'Standard mAb',
   'chains': 4,
   'unique_chains': 2,
   'svg': '\n  

In [43]:
# Hybrid : sama as search with expr provided
results =  collection.search(
    data=[query_embedding],
    anns_field="protein_vector",
    param=search_params,
    limit=5,
    expr=query_expr,
    output_fields=['protein_json']
)
for result in results[0]:
    print(result)

id: 14, distance: 0.9035544395446777, entity: {'protein_json': {'_id': {'$oid': '581764711823767f320043b4'}, '_type': 'PesProtein', 'regn_name': 'REGN3853', 'descriptive_name': 'mROR1(M1-A29).VαhCD3E_HAAW5B8-2_(2712N)_VH(E1-S123).G4Sx3.VαhCD3E_HAAW5B8-2_(2712N)_VK(E1-K107).G4Sx3.hIgG1Fc*(D104-K390;H318R;Y319F)', 'eln_ref': '1', 'created_by': 'kristin.hudson', 'created_at': {'$date': '2015-07-18T00:00:00Z'}, 'targets': [{'_id': {'$oid': '62839836820710025eed741c'}, 'target_id': 'C-TGT-99', 'target_name': 'CD3 (CD3E)', 'synonyms': None, 'source_type': 'TargetsDb', 'source_id': 'C-TGT-99'}], 'linked_pes_prgns': ['pRGN8966'], 'updated_at': {'$date': '2021-04-14T08:20:33.113Z'}, 'regn_number': 3853, 'synonyms': [{'_id': {'$oid': '581764711823767f320043b5'}, 'type': 'common_name', 'value': ' α-CD3E H1H2712N ScFv-hIgG1Fc'}], 'protein_type': {'_id': {'$oid': '5d07a077700f3e489862aae5'}, 'name': 'Standard mAb', 'chains': 4, 'unique_chains': 2, 'svg': '\n                <svg xmlns="http://www.w3