# [DNA Sequence Classification based on Milvus](https://medium.com/@xiaofan.luan/dna-sequence-classification-based-on-milvus-f87e87bc5ba9)
##  Code ref: https://github.com/milvus-io/bootcamp/tree/v2.0.2/solutions/dna_sequence_classification

In [217]:
!pip install --trusted-host pypi.python.org --trusted-host pypi.org --trusted-host files.pythonhosted.org --upgrade pip

[0m

In [218]:
!pip install  --trusted-host pypi.python.org --trusted-host pypi.org --trusted-host files.pythonhosted.org \
msal \
retry \
simplejson \
openai \
grpcio==1.58.0 \
pymilvus==2.3.5 \
protobuf \
grpcio-tools==1.58.0 \
pymongo \
tiktoken

[0m

In [219]:
import os
import openai
import requests
import simplejson as json
from retry import retry
from msal import PublicClientApplication, ConfidentialClientApplication, ClientApplication
import time

In [220]:
@retry(tries=3, delay=2)
def getapikey():
    """Fetch the OpenAI API key from the Azure-hosted key endpoint.

    A service account signs in to Azure AD through MSAL and the resulting
    bearer token is presented to ``AZURE_ENDPOINT``, which answers with the
    key. The user never handles the key directly (per the original notes the
    key is rotated hourly on the Azure side).

    Configuration is read from environment variables, which can be supplied
    as Kubernetes secrets: ``AZURE_CLIENT_ID``, ``AZURE_TENANT_ID``,
    ``AZURE_ENDPOINT``, ``AZURE_APPLICATION_SCOPE``, ``SVC_ACCOUNT`` and
    ``SVC_PASSWORD``.

    Returns:
        The decoded JSON body of the endpoint's response.

    Raises:
        RuntimeError: if the token request comes back with an ``error`` entry.
        requests.HTTPError: if the key endpoint answers with an error status.

    Failures are raised rather than returned as a sentinel value so that the
    ``@retry`` decorator (3 tries, 2 s apart) actually re-attempts the call and
    a bogus value is never used as an API key.
    """
    # These are the Azure parameters needed by the client application.
    client_id = os.getenv("AZURE_CLIENT_ID")
    tenant_id = os.getenv("AZURE_TENANT_ID")
    endpoint = os.getenv("AZURE_ENDPOINT")
    scopes = [os.getenv("AZURE_APPLICATION_SCOPE")]

    app = ClientApplication(
        client_id=client_id,
        authority="https://login.microsoftonline.com/" + tenant_id,
    )

    acquire_tokens_result = app.acquire_token_by_username_password(
        username=os.getenv("SVC_ACCOUNT"),
        password=os.getenv("SVC_PASSWORD"),
        scopes=scopes,
    )
    if 'error' in acquire_tokens_result:
        raise RuntimeError(
            "Error: {} Description: {}".format(
                acquire_tokens_result['error'],
                acquire_tokens_result.get('error_description', ''),
            )
        )

    header_token = {"Authorization": "Bearer {}".format(acquire_tokens_result['access_token'])}
    # A timeout keeps a stalled endpoint from hanging the notebook forever;
    # the resulting exception is retried like any other failure.
    rt = requests.post(
        url=endpoint,
        headers=header_token,
        data=b'{"key":"openaikey2"}',
        timeout=30,
    )
    rt.raise_for_status()
    return rt.json()

In [221]:
# Configure the openai module from environment variables.
# NOTE(review): the AzureOpenAI client created in the next cell receives its own
# key, version and endpoint — confirm these module-level settings are still needed.
openai.api_type = os.getenv("OPENAI_API_TYPE")
openai.api_base = os.getenv("OPENAI_API_BASE")
openai.api_version = os.getenv("OPENAI_API_VERSION")
# The key is fetched dynamically from Azure. Access is based on the
# service account / AD group combination.
openai.api_key = getapikey()
#print(openai.api_key)

In [222]:
import os
from openai import AzureOpenAI

# Azure OpenAI client used for every embedding request in this notebook.
# The key comes from getapikey(); version and endpoint come from the environment.
ai_client = AzureOpenAI(
    api_key=getapikey(),
    api_version=os.environ.get("OPENAI_API_VERSION"),
    azure_endpoint=os.environ.get("AZURE_OPENAI_ENDPOINT"),
)


## Connect to the bioregistry MongoDB database and load multiple nucleic acids

In [223]:
from pymongo import MongoClient
# Connect to the bioregistry MongoDB and grab the `nucleic_acids` collection.
from urllib.parse import quote_plus

username = os.getenv("BIOREGISTRY_USER")
password = os.getenv("BIOREGISTRY_PASSWORD")
host1 = os.getenv("MONGOID_DATABASE_HOST1")
# NOTE(review): host2/host3 are read but never used; only host1 is in the URI.
# Confirm whether they were meant to be additional seed hosts.
host2 = os.getenv("MONGOID_DATABASE_HOST2")
host3 = os.getenv("MONGOID_DATABASE_HOST3")
db = os.getenv("MONGOID_DATABASE")

# Fail early with a readable message instead of building a URI full of "None".
_required_settings = {
    "BIOREGISTRY_USER": username,
    "BIOREGISTRY_PASSWORD": password,
    "MONGOID_DATABASE_HOST1": host1,
    "MONGOID_DATABASE": db,
}
_missing_settings = [key for key, value in _required_settings.items() if not value]
if _missing_settings:
    raise RuntimeError("Missing environment variables: " + ", ".join(_missing_settings))

# Credentials must be percent-escaped: characters such as '@', ':' or '/' in a
# user name or password would otherwise corrupt the connection string.
mongo_uri = f'mongodb://{quote_plus(username)}:{quote_plus(password)}@{host1}/{db}'
mongo_client = MongoClient(mongo_uri)
mongo_collection = mongo_client.get_database(db).get_collection('nucleic_acids')

In [224]:
from bson.json_util import dumps
# { "$and" : [ { "descriptive_name" : { "$ne" : "null" } }, { "descriptive_name" : { "$exists" : "true" } } ] }
# Preview up to ten documents that carry a non-null `sequence` field.
# `$exists` takes a boolean, so pass True rather than the string "true".
nucleic_acids = mongo_collection.find(
    {"$and": [{"sequence": {"$ne": None}}, {"sequence": {"$exists": True}}]}
).limit(10)
for nucleic_acid in nucleic_acids:
    print(dumps(nucleic_acid))

{"_id": {"$oid": "5817630e77ab417e60000026"}, "_type": "Plasmid", "created_at": {"$date": "2016-10-31T15:28:14.846Z"}, "created_by": "ron.test", "eln_ref": "", "nucleic_acid_category": "template", "reference_link": "", "regn_name": "pRGN11004", "aws_file_key": "", "pes_created_at": null, "pes_created_by": "", "sequence": "gaggccctttcgtctcgcgcgtttcggtgatgacggtgaaaacctctgacacatgcagctcccggagacggtcacagcttgtctgtaagcggatgccgggagcagacaagcccgtcagggcgcgtcagcgggtgttggcgggtgtcggggctggcttaactatgcggcatcagagcagattgtactgagagtgcaccatatgcggtgtgaaataccgcacagatgcgtaaggagaaaataccgcatcaggcgccattcgccattcaggctgcgcaactgttgggaagggcgatcggtgcgggcctcttcgctattacgccagctggcgaaagggggatgtgctgcaaggcgattaagttgggtaacgccagggttttcccagtcacgacgttgtaaaacgacggccagtgaattcgagctcggtacccctgcaggcagctgcgcgctcgctcgctcactgaggccgcccgggcaaagcccgggcgtcgggcgacctttggtcgcccggcctcagtgagcgagcgagcgcgcagagagggagtggccaactccatcactaggggttcctacgcgtggctccggtgcccgtcagtgggcagagcgcacatcgcccacagtccccgagaagttggggggaggggtcggcaattgaaccggtgcctagagaaggtggcgc

## Connect to Milvus Vector Database
### Set up schema

In [225]:
from pymilvus import connections, utility, db
# Connect to the Milvus server at standalone:19530 using its "default" database,
# then list the databases the server knows about.
# NOTE(review): the pymilvus `db` module imported on the line above rebinds the
# name `db`, which previously held the MongoDB database name — confirm nothing
# later relies on the old value.
conn = connections.connect(host="standalone", port=19530, db_name="default")
db.list_database()

['default', 'bioregistry']

In [226]:
# Create the "bioregistry" database only when the server does not list it yet.
if "bioregistry" not in db.list_database():
    db.create_database("bioregistry")

In [227]:
# Show the databases again to confirm "bioregistry" is present.
db.list_database()

['bioregistry', 'default']

In [228]:
# Switch the connection to the "bioregistry" database for the cells that follow.
db.using_database("bioregistry")

In [229]:
# Show the collections that currently exist in the selected database.
utility.list_collections()

['nucleic_acids']

In [230]:
# Name of the Milvus collection that this notebook drops, recreates and searches.
new_collection_name = "nucleic_acids"

In [231]:
# Start from a clean slate: remove the collection if an earlier run left it behind.
if new_collection_name in utility.list_collections():
    utility.drop_collection(new_collection_name)

In [232]:
from pymilvus import CollectionSchema, FieldSchema, DataType

# Schema of the nucleic-acid collection.
# NOTE: the `protein_*` variable names are historical; every field describes a
# nucleic acid, as the field names show.

# Integer primary key, supplied explicitly on insert.
protein_id = FieldSchema(
    name="nucleic_acid_id",
    dtype=DataType.INT64,
    is_primary=True,
)

# Registry name of the nucleic acid (at most 32 characters).
name = FieldSchema(
    name="regn_name",
    dtype=DataType.VARCHAR,
    max_length=32,
    # The default value will be used if this field is left empty during data inserts or upserts.
    # The data type of `default_value` must be the same as that specified in `dtype`.
    default_value="Unknown",
)

# Length of the sequence in characters.
char_count = FieldSchema(
    name="char_count",
    dtype=DataType.INT64,
    # The default value will be used if this field is left empty during data inserts or upserts.
    # The data type of `default_value` must be the same as that specified in `dtype`.
    default_value=9999,
)

# Full source document as JSON. `max_length` and the string `default_value`
# were dropped: a str default does not match the JSON dtype (see the rule
# quoted above), and the field is always supplied on insert.
protein_json = FieldSchema(
    name="nucleic_acid_json",
    dtype=DataType.JSON,
)

# Embedding of the sequence; 1536 dimensions.
protein_vector = FieldSchema(
    name="nucleic_acid_vector",
    dtype=DataType.FLOAT_VECTOR,
    dim=1536,
)

schema = CollectionSchema(
    fields=[protein_id, name, char_count, protein_json, protein_vector],
    description="Nucleic Acid search",
    enable_dynamic_field=True,
)

In [233]:
from pymilvus import Collection
# Create the collection from the schema on the "default" connection, with two shards.
collection = Collection(
    new_collection_name,
    schema,
    using="default",
    shards_num=2,
)

In [234]:
# Create partition "Disc1" and check that the collection now reports it.
collection.create_partition("Disc1")
collection.has_partition("Disc1")

True

In [235]:
# Create partition "Disc2" and check that the collection now reports it.
collection.create_partition("Disc2")
collection.has_partition("Disc2")

True

In [236]:
from pymilvus import Partition
# Release partition "Disc2", drop it, and check that it is no longer reported.
disc2 = Partition(collection, name="Disc2")
disc2.release()
collection.drop_partition("Disc2")
collection.has_partition("Disc2")

False

In [237]:
# Confirm the recreated collection is listed.
utility.list_collections()

['nucleic_acids']

In [238]:
import tiktoken
def truncate_tokens(string: str, encoding_name: str, max_length: int = 8191) -> str:
    """Truncate *string* so that it encodes to at most *max_length* tokens.

    Args:
        string: Text to truncate.
        encoding_name: A model name known to ``tiktoken.encoding_for_model``
            (e.g. ``"gpt-3.5-turbo"``) or, failing that, a tiktoken encoding
            name (e.g. ``"cl100k_base"``).
        max_length: Maximum number of tokens to keep; must not be negative.

    Returns:
        *string* unchanged when it already fits, otherwise the text decoded
        from its first *max_length* tokens.

    Raises:
        ValueError: if *max_length* is negative, or *encoding_name* is neither
            a known model nor a known encoding.
    """
    if max_length < 0:
        # A negative bound would make the slice below count from the end.
        raise ValueError(f"max_length must be >= 0, got {max_length}")

    try:
        encoding = tiktoken.encoding_for_model(encoding_name)
    except KeyError:
        # Not a model name: honour the parameter's name and try it as an encoding.
        encoding = tiktoken.get_encoding(encoding_name)

    tokens = encoding.encode(string)
    if len(tokens) <= max_length:
        return string
    return encoding.decode(tokens[:max_length])

## Insert Nucleic Acid as vectors
### { nucleic_acid_id, regn_name, char_count, nucleic_acid_json, nucleic_acid_vector }

In [239]:
from bson.json_util import dumps
from bson import json_util
import json
# Embed each nucleic-acid sequence and upsert it into Milvus, one row per document.
# `$exists` takes a boolean, so pass True rather than the string "true".
nucleic_acids = mongo_collection.find(
    {"$and": [{"sequence": {"$ne": None}}, {"sequence": {"$exists": True}}]}
).limit(10)
i = 1  # running primary key; advanced only for rows actually written
for nucleic_acid in nucleic_acids:
    if 'sequence' not in nucleic_acid:
        continue
    # json_util.default turns BSON-only values (ObjectId, datetime, ...) into
    # JSON-compatible structures.
    nucleic_acid_json = json.dumps(nucleic_acid, default=json_util.default)
    sequence = f"{nucleic_acid['sequence']}"
    # NOTE(review): the sequence is sent untruncated (the truncate_tokens call
    # was disabled in the original) — confirm long sequences fit the
    # embedding model's input limit.
    print(sequence)
    response = ai_client.embeddings.create(
        input=sequence,
        model="RegnADA002",
    )
    embedding = response.data[0].embedding
    print(embedding)
    # Fall back to the schema's own default instead of failing on a document
    # that has no registry name.
    regn_name = nucleic_acid.get('regn_name')
    if regn_name is None:
        regn_name = "Unknown"
    # One single-element column per schema field, in schema order.
    nucleic_acid_id_column = [i]
    regn_name_column = [regn_name]
    char_count_column = [len(sequence)]
    nucleic_acid_json_column = [json.loads(nucleic_acid_json)]
    nucleic_acid_vector_column = [embedding]
    data = [
        nucleic_acid_id_column,
        regn_name_column,
        char_count_column,
        nucleic_acid_json_column,
        nucleic_acid_vector_column,
    ]
    collection.upsert(data)
    i += 1

gaggccctttcgtctcgcgcgtttcggtgatgacggtgaaaacctctgacacatgcagctcccggagacggtcacagcttgtctgtaagcggatgccgggagcagacaagcccgtcagggcgcgtcagcgggtgttggcgggtgtcggggctggcttaactatgcggcatcagagcagattgtactgagagtgcaccatatgcggtgtgaaataccgcacagatgcgtaaggagaaaataccgcatcaggcgccattcgccattcaggctgcgcaactgttgggaagggcgatcggtgcgggcctcttcgctattacgccagctggcgaaagggggatgtgctgcaaggcgattaagttgggtaacgccagggttttcccagtcacgacgttgtaaaacgacggccagtgaattcgagctcggtacccctgcaggcagctgcgcgctcgctcgctcactgaggccgcccgggcaaagcccgggcgtcgggcgacctttggtcgcccggcctcagtgagcgagcgagcgcgcagagagggagtggccaactccatcactaggggttcctacgcgtggctccggtgcccgtcagtgggcagagcgcacatcgcccacagtccccgagaagttggggggaggggtcggcaattgaaccggtgcctagagaaggtggcgcggggtaaactgggaaagtgatgtcgtgtactggctccgcctttttcccgagggtgggggagaaccgtatataagtgcagtagtcgccgtgaacgttctttttcgcaacgggtttgccgccagaacacaggtaagtgccgtgtgtggttcccgcgggcctggcctctttacgggttatggcccttgcgtgccttgaattacttccacctggctgcagtacgtgattcttgatcccgagcttcgggttggaagtgggtgggagagttcgaggccttgcgcttaaggagccccttcgcctcgtgcttgagttgaggcctggcctggg

[-0.03502824530005455, -0.0034351760987192392, -0.0005814508767798543, -0.0241424310952425, -0.005132593680173159, 0.0024902699515223503, -0.017712723463773727, 0.005753221921622753, 0.0011109241750091314, -0.006162836216390133, 0.011264398694038391, 0.018656078726053238, 0.022603273391723633, -0.013256615027785301, 0.013045601546764374, 0.01571430079638958, 0.040713198482990265, 0.008155052550137043, -0.023931417614221573, -0.03443244472146034, -0.0001813397539081052, 0.0006167491083033383, -0.029243992641568184, -0.013194551691412926, 0.018631253391504288, 0.004819176625460386, 0.01846989057958126, -0.023546626791357994, 0.005598064977675676, -0.013591754250228405, -0.019611844792962074, 0.0068331146612763405, -0.019884921610355377, -0.029243992641568184, -0.0013281439896672964, -0.013703466393053532, -0.009421133436262608, -0.00882533099502325, 0.01312007661908865, -0.005650818347930908, 0.013355915434658527, 0.025259559974074364, 0.005321885459125042, -0.014597170986235142, -0.0080

In [240]:

# Ask Milvus to compact the collection's stored data.
# NOTE(review): presumably only flushed (sealed) segments are affected —
# confirm the rows upserted above are flushed before this runs.
collection.compact()

In [241]:
# Vector index: IVF_SQ8 over cosine similarity with 1024 clusters.
index_params = dict(
    metric_type="COSINE",
    index_type="IVF_SQ8",
    params=dict(nlist=1024),
)
collection.create_index(
    field_name="nucleic_acid_vector",
    index_name="nucleic_acid_vector_idx",
    index_params=index_params,
)

Status(code=0, message=)

In [242]:
# Index on a scalar field: `regn_name`, under the name "nucleic_acid_regn_name_idx".
# No index parameters are passed, so the library/server defaults apply.
collection.create_index(
  field_name="regn_name", 
  index_name="nucleic_acid_regn_name_idx"
)

Status(code=0, message=)

## Search, Query, and Hybrid

In [243]:
# Embed a short sequence fragment with the same deployment ("RegnADA002") used
# for the stored rows, so the query vector is comparable with them.
search_string = 'gaggccctttcgtctcgcgcgtttcggt'
response = ai_client.embeddings.create(
    input = search_string,
    model= "RegnADA002"
)
query_embedding = response.data[0].embedding
# Bare expression: displays the vector as the cell output.
query_embedding

[-0.022648727521300316,
 -0.016185231506824493,
 0.004893979523330927,
 -0.023959970101714134,
 -0.011867403984069824,
 0.001465213717892766,
 -0.009721734561026096,
 -0.025469884276390076,
 -0.01133760903030634,
 0.0013898835750296712,
 0.008860818110406399,
 0.03210556507110596,
 0.021867280825972557,
 0.0010902183130383492,
 0.018847450613975525,
 0.011827669106423855,
 0.039151836186647415,
 0.010582651011645794,
 0.004317827522754669,
 -0.034277722239494324,
 -0.0075826882384717464,
 7.12946493877098e-05,
 -0.017973288893699646,
 -0.009509816765785217,
 0.021734831854701042,
 0.0014569356571882963,
 0.008562808856368065,
 -0.035973068326711655,
 0.019814325496554375,
 -0.00803301390260458,
 -0.005251591093838215,
 0.0077813612297177315,
 -0.028900306671857834,
 -0.02762879803776741,
 -0.010927017778158188,
 -0.017271310091018677,
 -0.011708465404808521,
 -0.013893868774175644,
 0.0312843844294548,
 -0.011390588246285915,
 0.016370659694075584,
 0.007430372294038534,
 0.00279632327

In [244]:
# Search-time parameters for the IVF_SQ8 / COSINE index built above.
# IVF indexes are tuned with `nprobe` (number of clusters to scan, at most
# `nlist`); `search_k` belongs to a different index type and has no effect here.
search_params = {
    "metric_type": "COSINE",
    "params": {"nprobe": 64},
}

In [245]:
# Search: find the ten stored sequences closest to the query embedding and
# print each hit with its source JSON document.
collection = Collection(new_collection_name)
# The collection must be loaded before it can be searched.
collection.load()
results = collection.search(
    data=[query_embedding],
    anns_field="nucleic_acid_vector",
    param=search_params,
    limit=10,
    expr=None,
    output_fields=['nucleic_acid_json'],
)
# One query vector was sent, so its hits are in results[0].
for result in results[0]:
    print(result)

id: 1, distance: 0.9201741218566895, entity: {'nucleic_acid_json': {'_id': {'$oid': '5817630e77ab417e60000026'}, '_type': 'Plasmid', 'created_at': {'$date': '2016-10-31T15:28:14.846Z'}, 'created_by': 'ron.test', 'eln_ref': '', 'nucleic_acid_category': 'template', 'reference_link': '', 'regn_name': 'pRGN11004', 'aws_file_key': '', 'pes_created_at': None, 'pes_created_by': '', 'sequence': 'gaggccctttcgtctcgcgcgtttcggtgatgacggtgaaaacctctgacacatgcagctcccggagacggtcacagcttgtctgtaagcggatgccgggagcagacaagcccgtcagggcgcgtcagcgggtgttggcgggtgtcggggctggcttaactatgcggcatcagagcagattgtactgagagtgcaccatatgcggtgtgaaataccgcacagatgcgtaaggagaaaataccgcatcaggcgccattcgccattcaggctgcgcaactgttgggaagggcgatcggtgcgggcctcttcgctattacgccagctggcgaaagggggatgtgctgcaaggcgattaagttgggtaacgccagggttttcccagtcacgacgttgtaaaacgacggccagtgaattcgagctcggtacccctgcaggcagctgcgcgctcgctcgctcactgaggccgcccgggcaaagcccgggcgtcgggcgacctttggtcgcccggcctcagtgagcgagcgagcgcgcagagagggagtggccaactccatcactaggggttcctacgcgtggctccggtgcccgtcagtgggcagagcgcacatc

In [246]:
# Boolean filter on a key inside the JSON field (no placeholders, so a plain string).
query_expr = "nucleic_acid_json['regn_name'] like 'pRGN11004'"

In [247]:
# Query: fetch up to ten rows matching `query_expr` (no vector involved) and
# show the first one.
collection = Collection(new_collection_name)
# The collection must be loaded before it can be queried.
collection.load()
results = collection.query(
    expr=query_expr,
    offset=0,
    limit=10,
    output_fields=["regn_name", "nucleic_acid_json"],
)
# Show the first match; an empty result shows nothing instead of raising IndexError.
results[0] if results else None

{'regn_name': 'pRGN11004',
 'nucleic_acid_json': {'_id': {'$oid': '5817630e77ab417e60000026'},
  '_type': 'Plasmid',
  'created_at': {'$date': '2016-10-31T15:28:14.846Z'},
  'created_by': 'ron.test',
  'eln_ref': '',
  'nucleic_acid_category': 'template',
  'reference_link': '',
  'regn_name': 'pRGN11004',
  'aws_file_key': '',
  'pes_created_at': None,
  'pes_created_by': '',
  'sequence': 'gaggccctttcgtctcgcgcgtttcggtgatgacggtgaaaacctctgacacatgcagctcccggagacggtcacagcttgtctgtaagcggatgccgggagcagacaagcccgtcagggcgcgtcagcgggtgttggcgggtgtcggggctggcttaactatgcggcatcagagcagattgtactgagagtgcaccatatgcggtgtgaaataccgcacagatgcgtaaggagaaaataccgcatcaggcgccattcgccattcaggctgcgcaactgttgggaagggcgatcggtgcgggcctcttcgctattacgccagctggcgaaagggggatgtgctgcaaggcgattaagttgggtaacgccagggttttcccagtcacgacgttgtaaaacgacggccagtgaattcgagctcggtacccctgcaggcagctgcgcgctcgctcgctcactgaggccgcccgggcaaagcccgggcgtcgggcgacctttggtcgcccggcctcagtgagcgagcgagcgcgcagagagggagtggccaactccatcactaggggttcctacgcgtggctccggtgcccgtcagtgggcagagcgca

In [248]:
# Hybrid: same as the vector search, but with a filter expression provided, so
# only rows matching `query_expr` are candidates. Returns at most five hits.
results =  collection.search(
    data=[query_embedding],
    anns_field="nucleic_acid_vector",
    param=search_params,
    limit=5,
    expr=query_expr,
    output_fields=['nucleic_acid_json']
)
# One query vector was sent, so its hits are in results[0].
for result in results[0]:
    print(result)

id: 1, distance: 0.9201741218566895, entity: {'nucleic_acid_json': {'_id': {'$oid': '5817630e77ab417e60000026'}, '_type': 'Plasmid', 'created_at': {'$date': '2016-10-31T15:28:14.846Z'}, 'created_by': 'ron.test', 'eln_ref': '', 'nucleic_acid_category': 'template', 'reference_link': '', 'regn_name': 'pRGN11004', 'aws_file_key': '', 'pes_created_at': None, 'pes_created_by': '', 'sequence': 'gaggccctttcgtctcgcgcgtttcggtgatgacggtgaaaacctctgacacatgcagctcccggagacggtcacagcttgtctgtaagcggatgccgggagcagacaagcccgtcagggcgcgtcagcgggtgttggcgggtgtcggggctggcttaactatgcggcatcagagcagattgtactgagagtgcaccatatgcggtgtgaaataccgcacagatgcgtaaggagaaaataccgcatcaggcgccattcgccattcaggctgcgcaactgttgggaagggcgatcggtgcgggcctcttcgctattacgccagctggcgaaagggggatgtgctgcaaggcgattaagttgggtaacgccagggttttcccagtcacgacgttgtaaaacgacggccagtgaattcgagctcggtacccctgcaggcagctgcgcgctcgctcgctcactgaggccgcccgggcaaagcccgggcgtcgggcgacctttggtcgcccggcctcagtgagcgagcgagcgcgcagagagggagtggccaactccatcactaggggttcctacgcgtggctccggtgcccgtcagtgggcagagcgcacatc