In [1]:
import os
from dotenv import load_dotenv
# Load variables from a local .env file (if one is found) into the process environment.
load_dotenv()

True

# Milvus to manage vector datasets
Milvus works as a vector store to enable quick document queries. We compute document embeddings using a small BERT model for semantic search.

### Download the dataset
Put the "wikihow.csv" dataset in the "file" directory. Please download it from this URL: `https://ibm.box.com/s/8nvanf974t35d89cmibk75e3gc6d1pbo`

In [17]:
# Path of the CSV file that is read with pandas.read_csv in a later cell.
WH_PATH = "test.csv"
WH_PATH

'test.csv'

### Load and check the data
Observe that some of the data could be cleaner:
- One of the titles seems to be mistakenly registered as a sectionLabel
- Some odd codepoint choices, for example for apostrophes
- Some titles end in spurious numbers

In [23]:
import pandas
# Read the whole CSV at WH_PATH into a DataFrame.
doc = pandas.read_csv(WH_PATH)

In [24]:
# Preview the first rows to check which columns were loaded.
doc.head()

Unnamed: 0,id,revid,url,title,text
0,5,595831,https://ko.wikipedia.org/wiki?curid=5,지미 카터,"제임스 얼 카터 주니어(, 1924년 10월 1일 ~ )는 민주당 출신 미국 39대..."
1,9,221042,https://ko.wikipedia.org/wiki?curid=9,수학,"수학(數學, , 줄여서 math)은 수, 양, 구조, 공간, 변화 등의 개념을 다루..."
2,10,123884,https://ko.wikipedia.org/wiki?curid=10,수학 상수,"수학에서 상수란 그 값이 변하지 않는 불변량으로, 변수의 반대말이다. 물리 상수와는..."
3,19,22169,https://ko.wikipedia.org/wiki?curid=19,문학,"문학(文學, )은 언어를 예술적 표현의 제재로 삼아 새로운 의미를 창출하여, 인간과..."
4,20,529523,https://ko.wikipedia.org/wiki?curid=20,나라 목록,이 목록에 실린 국가 기준은 1933년 몬테비데오 협약 1장을 참고로 하였다. 협정...


'지미 카터'

In [25]:
# Remove the metadata columns; the insertion code further down only reads
# the "title" and "text" columns.
doc_indexed = doc.drop(["id", "revid", "url"], axis="columns")
doc_indexed.head()

Unnamed: 0,title,text
0,지미 카터,"제임스 얼 카터 주니어(, 1924년 10월 1일 ~ )는 민주당 출신 미국 39대..."
1,수학,"수학(數學, , 줄여서 math)은 수, 양, 구조, 공간, 변화 등의 개념을 다루..."
2,수학 상수,"수학에서 상수란 그 값이 변하지 않는 불변량으로, 변수의 반대말이다. 물리 상수와는..."
3,문학,"문학(文學, )은 언어를 예술적 표현의 제재로 삼아 새로운 의미를 창출하여, 인간과..."
4,나라 목록,이 목록에 실린 국가 기준은 1933년 몬테비데오 협약 1장을 참고로 하였다. 협정...


### Create Milvus connection
We will interact with our Milvus instance using the official pymilvus library. Alternatively, it is possible to use LangChain's Milvus vectorstore class to add documents to the instance. In that case, a simple `from_documents` or `from_texts` (or similar) will generate the collection using the correct settings expected by LangChain.

In [128]:
# Milvus requires a connection for all operations. Remember to disconnect at the end.
#
# Host and port are taken from the MILVUS_HOST / MILVUS_PORT environment
# variables (which may be provided through the .env file loaded in the first
# cell). When they are unset, the previously hard-coded local instance
# 127.0.0.1:19530 is used, so the default behaviour is unchanged.

from pymilvus import connections
connections.connect(
  alias="default",
  host=os.getenv("MILVUS_HOST", "127.0.0.1"),
  port=os.getenv("MILVUS_PORT", "19530")
)

### Create schema for the milvus store
If a collection with the same name but a different schema exists, Milvus may throw a SchemaNotReady exception.
Also the text fields' max length is in bytes not characters. Even though it's possible to get the byte size of the string and trim it to fit the byte limits in the schema, there are finicky bits and it's better to simply set limits to the max allowable (65535).

Fields in the collection must follow some special rules:
- The primary key must be called pk
- The vector must be called vector
- The text entry must be called text

In [129]:
from pymilvus import CollectionSchema, FieldSchema, DataType, Collection, utility

In [130]:
# Milvus also supports schemaless operations if `enable_dynamic_field=True`.

# Truncation limits used by batch_process for titles and texts.
MAX_TITLE = 512
MAX_TEXT = 2048
# Dimension of the stored embedding vectors.
MAX_VEC = 384

NAME = "kowiki"

# Drop any previous collection with the same name so that the schema below
# is the one actually in effect.
if NAME in utility.list_collections():
    whcollection = Collection(NAME)
    whcollection.drop()

whschema = CollectionSchema(
    fields=[
        FieldSchema(name="pk", dtype=DataType.INT64, is_primary=True, auto_id=True),
        FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=65535, default_value=""),
        FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=65535, default_value=""),
        FieldSchema(name="vector", dtype=DataType.FLOAT_VECTOR, dim=MAX_VEC)
    ],
    # The keyword pymilvus reads is `enable_dynamic_field` (singular). The
    # plural spelling is absorbed by **kwargs and silently ignored.
    enable_dynamic_field=False,
    description="kowiki collection"
)


In [131]:
# Bind `whcollection` to the collection called NAME, using the schema defined in the previous cell.
whcollection = Collection(
    name=NAME,
    schema=whschema,
    consistency_level="Session" # Make sure we read our own writes, otherwise allowed to be a bit out of date.
)

### Batch-wise insertion into milvus
Use a small BERT model to compute embeddings for our documents to place in the Milvus store.

The choice of batch size in this example is arbitrary, and a double-batch system may be preferable to accommodate both the embedding model and Milvus.

When the embedding model runs on GPU, the batch size should be selected so as to optimize the transfer-to-memory vs runtime overheads (too small and a major amount of time will be wasted on memory transfers instead of embedding proper, too large and it won't fit on the device).
If the model is accessed over the network, the batch size should be selected with the same concerns in mind, although further overhead may be incurred depending on how the model is scheduled or how the API is designed.

With regard to milvus, the idea is the same: a batch size that's too small means incurring milvus' operational overhead along with communication overhead. The other tradeoff of note regards any temporary processing or data streaming that may occur: a higher batch size also implies loading more data into memory and possibly generating longer-lasting temporary artifacts before submitting the data to milvus, after which it can all be discarded.



### Load embeddings
Use HuggingFaceEmbeddings with the MiniLM BERT model.

In [132]:
import langchain
from langchain.embeddings import HuggingFaceEmbeddings

In [133]:
# Embedding model used by batch_process to vectorise titles.
# NOTE(review): the schema's vector dimension (MAX_VEC = 384) has to equal this
# model's output size — confirm if the model name is ever changed.
embeddings = HuggingFaceEmbeddings(model_name='all-MiniLM-L6-v2')

In [91]:
# NOTE: disabled row-by-row draft, kept for reference only. The `batch_process`
# cell below is what actually populates the collection.
# entries = [[] for _ in range(3)]
# for index,data_row in doc_indexed.iterrows():
#     row_title = data_row['title']
#     row_text =data_row['text']
#     if type(row_title) is not str or type(row_text) is not str:
#         continue
#     entries[0].append(row_title)
#     entries[1].append(row_text)
#     entries[2].append(embeddings.embed_documents(data_row['title'])[0])
#     whcollection.insert(entries)

0


In [134]:
def batch_process(num_index,num_batch,table):
    """Embed one slice of ``table`` and insert it into ``whcollection``.

    The rows at positions ``num_index`` .. ``num_index + num_batch - 1`` are
    read from ``table``. A row is skipped when its ``title`` or ``text`` is not
    a ``str`` (e.g. NaN for an empty CSV cell). Titles and texts are truncated
    to ``MAX_TITLE`` / ``MAX_TEXT`` characters, and the vector stored for a row
    is the embedding of its (truncated) title.

    Args:
        num_index: position of the first row of the batch.
        num_batch: number of rows in the batch (positive integer).
        table: DataFrame with ``title`` and ``text`` columns.

    Returns:
        None. Progress is printed and the rows are written to Milvus.

    Relies on the notebook globals ``embeddings``, ``whcollection``,
    ``MAX_TITLE`` and ``MAX_TEXT``.
    """
    print("from %d to %d" % (num_index,num_index+num_batch))
    # Report progress about four times per batch; keep the step >= 1 so that
    # batches smaller than 4 rows do not divide by zero.
    progress_step = max(1, num_batch // 4)
    # Positional slicing clamps at the end of the table, so a batch reaching
    # past the last row is simply shorter (or empty) instead of raising KeyError.
    batch = table.iloc[num_index:num_index+num_batch]
    titles = []
    texts = []
    for offset, (title, text) in enumerate(zip(batch['title'], batch['text'])):
        if type(title) is not str or type(text) is not str:
            continue
        # Plain character truncation. Even at 4 UTF-8 bytes per character the
        # limits (512 / 2048 characters) stay far below the 65535-byte
        # max_length declared for the VARCHAR fields in the schema cell.
        titles.append(title[:MAX_TITLE])
        texts.append(text[:MAX_TEXT])
        if offset % progress_step == 0:
            print("current index: %d" % (num_index + offset))
    if titles:
        # embed_documents expects a *list* of strings; a bare string would be
        # iterated character by character. One call for the whole batch also
        # lets the model do its own internal batching.
        vectors = embeddings.embed_documents(titles)
        whcollection.insert([titles, texts, vectors])
    print("current batch input length: %d" % len(titles))
BATCH = 10000
# Upper bound on the number of rows to ingest (previously a bare literal in range()).
MAX_ROWS = 200000
# Never address rows past the end of the table: stop at the table length and
# shorten the final batch accordingly.
total_rows = min(len(doc_indexed), MAX_ROWS)
# process given input
for i in range(0,total_rows,BATCH):
    batch_process(i,min(BATCH,total_rows-i),doc_indexed)

from 0 to 10000
current index: 0
current index: 2500
current index: 5000
current index: 7500
current batch input length: 7287
from 10000 to 20000
current index: 10000
current index: 12500
current index: 15000
current batch input length: 6694
from 20000 to 30000
current index: 22500
current index: 27500
current batch input length: 6252
from 30000 to 40000
current index: 35000
current index: 37500
current batch input length: 6082
from 40000 to 50000
current index: 45000
current index: 47500
current batch input length: 5957
from 50000 to 60000
current index: 50000
current batch input length: 5233
from 60000 to 70000
current index: 62500
current index: 67500
current batch input length: 4360
from 70000 to 80000
current index: 72500
current index: 75000
current batch input length: 5393
from 80000 to 90000
current index: 80000
current index: 82500
current index: 85000
current index: 87500
current batch input length: 5400
from 90000 to 100000
current index: 97500
current batch input length: 48

In [135]:
# Milvus will not seal segments that are too small, a flush is necessary to force it.
# Run this once, after every batch has been inserted.
whcollection.flush()

## Create index and connect
Search can be accelerated significantly by creating an index on the vector. Here we use L2 (Euclidean) distance with an inverted-file flat index (`IVF_FLAT`).

If using the langchain milvus store interface, now is a good time to disconnect as well. Otherwise, now is the time to load the collection.

In [136]:
# Build an IVF_FLAT index over the "vector" field using the L2 metric.
# Index-specific settings go under "params" and nlist is an integer,
# following the documented index_params layout.
whcollection.create_index(
    field_name="vector",
    index_params={
        "metric_type": "L2",
        "index_type": "IVF_FLAT",
        "params": {"nlist": 1024},
    },
)

Status(code=0, message=)

In [137]:
# Load the collection; this has to happen before any queries are run against it.
whcollection.load()
# Queries against the collection would go here, while it is loaded.
# Once done with queries, use `whcollection.release()` (next cell) to stop using resources.

## Disconnect
Unload the collection to stop using up resources, then close the connection. We're done!

In [138]:
# Release the collection first, then close the "default" connection opened earlier in the notebook.
whcollection.release()
connections.disconnect("default")