In [None]:
#  Code ref: https://ritscm.regeneron.com/projects/DSEAIM/repos/secureaccess/browse/securekeyapp.py

In [None]:
!pip install --trusted-host pypi.python.org --trusted-host pypi.org --trusted-host files.pythonhosted.org --upgrade pip

In [None]:
!pip install  --trusted-host pypi.python.org --trusted-host pypi.org --trusted-host files.pythonhosted.org \
msal \
retry \
simplejson \
openai \
grpcio==1.58.0 \
pymilvus==2.3.5 \
protobuf \
grpcio-tools==1.58.0 \
pymongo \
tiktoken

In [None]:
import os
import openai
import requests
import simplejson as json
from retry import retry
from msal import PublicClientApplication, ConfidentialClientApplication, ClientApplication
import time

In [None]:
@retry(tries=3, delay=2)
def getapikey():
    """Fetch the OpenAI API key from the secure-access endpoint.

    Acquires an Azure AD access token for the service account using the
    username/password flow, then POSTs to ``AZURE_ENDPOINT`` with that token as
    a bearer credential, requesting the key named ``openaikey2``.

    The user does not have to manage the API key; that is handled at the Azure
    back end, where the key is rotated hourly. All parameters come from
    environment variables (these can be supplied as Kubernetes secrets):
    ``AZURE_CLIENT_ID``, ``AZURE_TENANT_ID``, ``AZURE_ENDPOINT``,
    ``AZURE_APPLICATION_SCOPE``, ``SVC_ACCOUNT`` and ``SVC_PASSWORD``.

    Returns:
        The decoded JSON body of the endpoint's response.

    Raises:
        RuntimeError: if a required environment variable is unset or the token
            request reports an error.
        requests.HTTPError: if the endpoint answers with a non-success status.

    Failures are raised (not returned) so that the ``@retry`` decorator can
    actually retry them and so that an error code is never used as an API key.
    """
    settings = {
        name: os.getenv(name)
        for name in (
            "AZURE_CLIENT_ID",
            "AZURE_TENANT_ID",
            "AZURE_ENDPOINT",
            "AZURE_APPLICATION_SCOPE",
            "SVC_ACCOUNT",
            "SVC_PASSWORD",
        )
    }
    missing = sorted(name for name, value in settings.items() if not value)
    if missing:
        raise RuntimeError(
            "Missing required environment variables: " + ", ".join(missing)
        )

    app = ClientApplication(
        client_id=settings["AZURE_CLIENT_ID"],
        authority="https://login.microsoftonline.com/" + settings["AZURE_TENANT_ID"]
    )

    acquire_tokens_result = app.acquire_token_by_username_password(
        username=settings["SVC_ACCOUNT"],
        password=settings["SVC_PASSWORD"],
        scopes=[settings["AZURE_APPLICATION_SCOPE"]],
    )
    if 'error' in acquire_tokens_result:
        # 'error_description' is read defensively: only 'error' is known to be present.
        raise RuntimeError(
            "Token acquisition failed: {} - {}".format(
                acquire_tokens_result['error'],
                acquire_tokens_result.get('error_description', ''),
            )
        )

    header_token = {"Authorization": "Bearer {}".format(acquire_tokens_result['access_token'])}
    # A timeout keeps a stalled endpoint from hanging the notebook indefinitely.
    rt = requests.post(
        url=settings["AZURE_ENDPOINT"],
        headers=header_token,
        data=b'{"key":"openaikey2"}',
        timeout=30,
    )
    # Surface HTTP errors instead of returning an error payload as if it were the key.
    rt.raise_for_status()
    return rt.json()

In [None]:
# Configure the module-level openai settings from environment variables.
openai.api_type = os.getenv("OPENAI_API_TYPE")
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_version = os.getenv("OPENAI_API_VERSION")
# The key is fetched dynamically from Azure. Access is based on the service account / AD group combination.
openai.api_key = getapikey()
# print(openai.api_key)

In [None]:
import os
from openai import AzureOpenAI

# Client used for every embeddings request below. The endpoint and API version
# come from the environment; the key is fetched through getapikey().
ai_client = AzureOpenAI(
    azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    api_version=os.getenv("OPENAI_API_VERSION"),
    api_key=getapikey(),
)


## Create a function to load multiple proteins from the bioregistry mongo database

In [None]:
from urllib.parse import quote_plus

from pymongo import MongoClient

# Connection settings for the bioregistry MongoDB, read from the environment.
username = os.getenv("BIOREGISTRY_USER")
password = os.getenv("BIOREGISTRY_PASSWORD")
host1 = os.getenv("MONGOID_DATABASE_HOST1")
# NOTE(review): host2/host3 are read but not used in the connection string below —
# confirm whether they should be added as additional seed hosts.
host2 = os.getenv("MONGOID_DATABASE_HOST2")
host3 = os.getenv("MONGOID_DATABASE_HOST3")
# Named `mongo_db_name` rather than `db`: a later cell imports pymilvus' `db`
# module under that name, and re-running this cell afterwards would break.
mongo_db_name = os.getenv("MONGOID_DATABASE")

_required = {
    "BIOREGISTRY_USER": username,
    "BIOREGISTRY_PASSWORD": password,
    "MONGOID_DATABASE_HOST1": host1,
    "MONGOID_DATABASE": mongo_db_name,
}
_missing = sorted(key for key, value in _required.items() if not value)
if _missing:
    # Fail early instead of connecting to a URI containing the string "None".
    raise RuntimeError("Missing required environment variables: " + ", ".join(_missing))

# Credentials must be percent-escaped, otherwise characters such as '@', ':' or
# '/' in the username or password corrupt the URI.
mongo_client = MongoClient(
    f'mongodb://{quote_plus(username)}:{quote_plus(password)}@{host1}/{mongo_db_name}'
)
mongo_collection = mongo_client.get_database(mongo_db_name).get_collection('proteins')

In [None]:
from bson.json_util import dumps

# Preview up to 100 proteins whose `descriptive_name` exists and is not null.
# `$exists` takes a boolean; the previous string "true" only behaved as intended
# because a non-empty string is truthy (the string "false" would have matched too).
proteins = mongo_collection.find(
    {"descriptive_name": {"$exists": True, "$ne": None}}
).limit(100)
for protein in proteins:
    print(dumps(protein))

## Connect to Milvus Vector Database
### Set up schema

In [None]:
from pymilvus import connections, utility, db
# Connect to the Milvus server at host "standalone", port 19530, starting in the "default" database.
# From here on, `db` is the pymilvus database-management module used by the cells below.
conn = connections.connect(host="standalone", port=19530, db_name="default")
# Display the databases that currently exist.
db.list_database()

In [None]:
# Create the "bioregistry" database only when the server does not already have it.
found = "bioregistry" in db.list_database()
if not found:
    db.create_database("bioregistry")

In [None]:
# Display the database list again (after the create-if-missing cell above).
db.list_database()

In [None]:
# Switch the connection to the "bioregistry" database for the cells that follow.
db.using_database("bioregistry")

In [None]:
# Display the collections in the currently selected database.
utility.list_collections()

In [None]:
# Name of the Milvus collection that this notebook drops, recreates, fills and searches.
new_collection_name = "proteins"

In [None]:
# Start from a clean slate: remove the collection if a previous run left it behind.
if new_collection_name in utility.list_collections():
    utility.drop_collection(new_collection_name)

In [None]:
from pymilvus import CollectionSchema, FieldSchema, DataType

# Primary key; values are supplied explicitly by the insert cell.
protein_id = FieldSchema(
  name="protein_id",
  dtype=DataType.INT64,
  is_primary=True,
)

# NOTE(review): max_length=32 caps the stored name — confirm it is long enough
# for every value written to this field.
name = FieldSchema(
  name="name",
  dtype=DataType.VARCHAR,
  max_length=32,
  # The default value will be used if this field is left empty during data inserts or upserts.
  # The data type of `default_value` must be the same as that specified in `dtype`.
  default_value="Unknown"
)

char_count = FieldSchema(
  name="char_count",
  dtype=DataType.INT64,
  # The default value will be used if this field is left empty during data inserts or upserts.
  # The data type of `default_value` must be the same as that specified in `dtype`.
  default_value=9999
)

# JSON field holding the protein document.
# No `default_value` here: the previous string default "Unknown" did not match
# the JSON dtype, violating the rule stated on the fields above. `max_length`
# was dropped as well because it is a VARCHAR setting.
protein_json = FieldSchema(
  name="protein_json",
  dtype=DataType.JSON,
)

# Embedding vector; dim=1536 must match the length of the embeddings inserted.
protein_vector = FieldSchema(
  name="protein_vector",
  dtype=DataType.FLOAT_VECTOR,
  dim=1536
)

schema = CollectionSchema(
  fields=[protein_id, name, char_count, protein_json, protein_vector],
  description="Protein search",
  enable_dynamic_field=True
)

In [None]:
from pymilvus import Collection
# Create the collection from the schema defined above, on the 'default'
# connection alias, with two shards.
collection = Collection(
    name=new_collection_name,
    schema=schema,
    using='default',
    shards_num=2
    )

In [None]:
# Create partition "Disc1" and display whether the collection now reports it.
collection.create_partition("Disc1")
collection.has_partition("Disc1")

In [None]:
# Create partition "Disc2" and display whether the collection now reports it.
collection.create_partition("Disc2")
collection.has_partition("Disc2")

In [None]:
from pymilvus import Partition
# Partition removal demo: release "Disc2", drop it, then display whether it still exists.
disc2 = Partition(collection, name="Disc2")
disc2.release()
collection.drop_partition("Disc2")
collection.has_partition("Disc2")

In [None]:
# Display the collections again now that the new collection has been created.
utility.list_collections()

In [None]:
import tiktoken


def truncate_tokens(string: str, encoding_name: str, max_length: int = 8191) -> str:
    """Cut ``string`` down to at most ``max_length`` tokens.

    Args:
        string: Text to truncate.
        encoding_name: Passed to ``tiktoken.encoding_for_model`` to select the
            tokenizer (so, despite the parameter name, a model name is what
            that lookup receives).
        max_length: Maximum number of tokens to keep.

    Returns:
        ``string`` unchanged when it has no more than ``max_length`` tokens,
        otherwise the text decoded from its first ``max_length`` tokens.
    """
    tokenizer = tiktoken.encoding_for_model(encoding_name)
    token_ids = tokenizer.encode(string)

    # Short enough already: hand back the original text untouched.
    if len(token_ids) <= max_length:
        return string

    return tokenizer.decode(token_ids[:max_length])

## Insert proteins as vectors
### { protein_id, name, char_count, protein_json, protein_vector }

In [None]:
from bson.json_util import dumps
from bson import json_util
import json

# Embed each protein's descriptive name and upsert one row per protein:
# [protein_id, name, char_count, protein_json, protein_vector].
proteins = mongo_collection.find({"created_by": "kristin.hudson"})
next_protein_id = 1  # primary keys are assigned sequentially, starting at 1
for protein in proteins:
    descriptive_name = protein.get('descriptive_name')
    # Skip missing, null or empty names: embedding the literal text "None"
    # would store a meaningless vector.
    if not descriptive_name:
        continue
    protein_descriptive_name = f"{descriptive_name}"
    print(protein_descriptive_name)
    response = ai_client.embeddings.create(
        input=protein_descriptive_name,
        model="RegnADA002"
    )
    # Round-trip through JSON so BSON-specific values become plain JSON values.
    # Held in its own local name so the `protein_json` FieldSchema defined in
    # the schema cell is not overwritten.
    protein_document = json.loads(json.dumps(protein, default=json_util.default))
    data = [
        [next_protein_id],
        # Fall back to the schema's default name rather than failing on a
        # document that has no `regn_name`.
        [protein.get('regn_name') or "Unknown"],
        [len(protein_descriptive_name)],
        [protein_document],
        [response.data[0].embedding],
    ]
    collection.upsert(data)
    next_protein_id += 1

In [None]:

# Request a compaction of the collection after the inserts above.
collection.compact()

In [None]:
# Build an IVF_SQ8 index (cosine metric, 1024 clusters) on the embedding field.
index_params = dict(
    metric_type="COSINE",
    index_type="IVF_SQ8",
    params=dict(nlist=1024),
)
collection.create_index(
    index_params=index_params,
    index_name="protein_vector_idx",
    field_name="protein_vector",
)

In [None]:
# index on a scalar field
collection.create_index(
  field_name="name", 
  index_name="protein_name_idx"
)

## Search, Query, and Hybrid

In [None]:
# Embed the search text with the same model used when inserting the proteins.
search_string = 'mROR1(M1-A29).hCD8'
response = ai_client.embeddings.create(
    input = search_string,
    model= "RegnADA002"
)
# Keep only the vector; it is the query input for the search cells below.
query_embedding = response.data[0].embedding
# query_embedding
# query_embedding

In [None]:
# Search-time parameters for the IVF_SQ8 index built on `protein_vector`.
# IVF indexes are tuned with `nprobe` (number of clusters to scan, at most
# `nlist`); the previous `search_k` key is an ANNOY parameter and does not
# apply to this index type.
search_params = {
    "metric_type": "COSINE",
    "params": {"nprobe": 64}
}

In [None]:
# Search
collection = Collection(new_collection_name)
utility.list_collections()
collection.load()
results =  collection.search(
    data=[query_embedding],
    anns_field="protein_vector",
    param=search_params,
    limit=10,
    expr=None,
    output_fields=['protein_json']
)
for result in results[0]:
    print(result)

In [None]:
# Filter expression on the 'regn_name' key inside the `protein_json` field; used by the query and hybrid cells below.
query_expr = "protein_json['regn_name'] like 'REGN3853'" # Querying the json field

In [None]:
# Query
collection = Collection(new_collection_name)
utility.list_collections()
collection.load()
results = collection.query(
    expr = query_expr,
    offset = 0,
    limit = 10, 
    output_fields = ["name", "protein_json"]
)
results[0]

In [None]:
# Hybrid : sama as search with expr provided
results =  collection.search(
    data=[query_embedding],
    anns_field="protein_vector",
    param=search_params,
    limit=5,
    expr=query_expr,
    output_fields=['protein_json']
)
for result in results[0]:
    print(result)