## Library Installation

In [1]:
# !pip install pymilvus==2.4.1
# !pip install langchain==0.1.6
# !pip install sentence-transformers
# !pip install PyPDF2==1.23

In [2]:
# For watson studio
# !pip install --force-reinstall typing-extensions
# !pip install --force-reinstall packaging
# !pip install grpcio==1.58.0

In [33]:
import logging
from pymilvus import CollectionSchema, FieldSchema, DataType, Collection
from langchain.text_splitter import CharacterTextSplitter
from sentence_transformers import SentenceTransformer
from langchain_community.document_loaders import PyPDFLoader

## Credentials

In [3]:
### Milvus Credential
import getpass
import os

# PEM certificate handed to pymilvus (as `server_pem_path`) for the TLS connection.
cert="""-----BEGIN CERTIFICATE-----
MIIDzzCCAregAwIBAgIUHvv3+lYxL4ZyBS6R9qD0D9pqfaIwDQYJKoZIhvcNAQEL
BQAwdzELMAkGA1UEBhMCR0IxDzANBgNVBAgMBkxvbmRvbjESMBAGA1UEBwwJWW9y
ayBSb2FkMRswGQYDVQQKDBJDbGllbnQgRW5naW5lZXJpbmcxEjAQBgNVBAsMCUFT
IGFuZCBQVzESMBAGA1UEAwwJbG9jYWxob3N0MB4XDTIzMDgxMDE1MDIwMVoXDTMz
MDgwNzE1MDIwMVowdzELMAkGA1UEBhMCR0IxDzANBgNVBAgMBkxvbmRvbjESMBAG
A1UEBwwJWW9yayBSb2FkMRswGQYDVQQKDBJDbGllbnQgRW5naW5lZXJpbmcxEjAQ
BgNVBAsMCUFTIGFuZCBQVzESMBAGA1UEAwwJbG9jYWxob3N0MIIBIjANBgkqhkiG
9w0BAQEFAAOCAQ8AMIIBCgKCAQEAw5GJ9I0yB+FD53ro7tQnPWlMfMfO9jOojztA
EIyVFgFhoZ+CZJH+3y1e2GZm7a3wIbQS6f0Y1rZGMktAq+8UPASMSarVraiWYsrL
4znFboNFRJ2wInnPlYJis6lbCffahHzE+ye3Mx6zeSQAijImCRtaCCwZzD93kVFB
MDFHQGAEwga5plAgHhkfpXrqrzVRq1idNiojj0PRSofhb0ywWbGyjTlbC7u6odcH
as78+S6SbXHM5AqAqfTMRPZKrRmphEDYYGNG+VBfUuVI6vqd3fS7xA0AImZ7j/CW
QbFdh9TLXl+D5dVToykIgFdjtkez93ORG1HYskrZnVlGObgOnwIDAQABo1MwUTAd
BgNVHQ4EFgQUZEW3JzZOej4jLgbKcmn/t9IYQ58wHwYDVR0jBBgwFoAUZEW3JzZO
ej4jLgbKcmn/t9IYQ58wDwYDVR0TAQH/BAUwAwEB/zANBgkqhkiG9w0BAQsFAAOC
AQEAZIwIRF7Wdx/QuseV13ALfZRjHWkFHaYLgUjXW+rIyCUEr1Iu421PF/CEJKMb
kQ3T+DGDBPjrWlTxQFAJoVpvsbVeaM6qRsqHe1z9xJ4tHYYUKwdeAJl5lnrGD027
HPP0qAUvm+D2NepMOJyomktB8J8TOS+2KWpot0HZtteFP4S53Lo7+tOu9374fF/2
eg/QZkZzKaiQhcnFir7etyQBBFvo4gXXHgo884hYA8DltGpA3zlIIkTKeftafjQ+
jyAnDpq+rQ3hfKMhjeC1ATausae6td6VRP55ZfOrM4t+DEbrmh/WGM3NzzMjf91M
b8a2Cp2o7BLIPi8LwWfkQM+4dQ==
-----END CERTIFICATE-----"""

# pymilvus expects a file path, so the certificate is written next to the notebook.
with open("cert-milvus.pem", "w") as file:
    file.write(cert)

# Endpoint: the values below are defaults and can be overridden via the environment.
milvus_host = os.environ.get("MILVUS_HOST", "161.156.199.204")
milvus_port = os.environ.get("MILVUS_PORT", "8080")

# The password must never live in the notebook: read it from the environment,
# or prompt for it interactively when the variable is not set.
# NOTE(review): a password was previously committed in this cell and should be rotated.
milvus_password = os.environ.get("MILVUS_PASSWORD") or getpass.getpass("Milvus password: ")

## Connect to Milvus

In [7]:
import numpy as np
from pymilvus import (
    connections,
    utility,
    FieldSchema, CollectionSchema, DataType,
    Collection,
)

# Register the "default" connection alias over TLS, authenticating as root.
# The certificate file is the one written by the credential cell.
print("=== start connecting to Milvus ===")
connections.connect(
    alias="default",
    host=milvus_host,
    port=milvus_port,
    secure=True,
    server_pem_path="cert-milvus.pem",
    server_name="localhost",
    user="root",
    password=milvus_password,
)

=== start connecting to Milvus ===


In [42]:
"""Module providing utility functions for Milvus"""

import logging
from pymilvus import CollectionSchema, FieldSchema, DataType, Collection
from sentence_transformers import SentenceTransformer

def create_collection(
    milvus_connection_alias: str = "default",
    collection_name: str = "ggf_collection"
    ) -> None:
    """Create a Milvus collection that uses the default embedding schema.

    The schema consists of an auto-generated INT64 primary key ``id``, a
    384-dimensional float vector ``embedding_vector`` and three VARCHAR
    fields (``embedding_raw``, ``document_id``, ``metadata_json``).
    Dynamic fields are enabled.

    Args:
        milvus_connection_alias: Alias of the pymilvus connection to use.
        collection_name: Name given to the collection.
    """
    schema_fields = [
        FieldSchema(
            name="id",
            description="Embedding ID",
            dtype=DataType.INT64,
            is_primary=True,
            auto_id=True,
        ),
        FieldSchema(
            name="embedding_vector",
            description="Embedding vector",
            dtype=DataType.FLOAT_VECTOR,
            dim=384,
        ),
        FieldSchema(
            name="embedding_raw",
            description="Embedding raw value",
            dtype=DataType.VARCHAR,
            max_length=65535,
        ),
        FieldSchema(
            name="document_id",
            description="Document ID",
            dtype=DataType.VARCHAR,
            max_length=256,
        ),
        FieldSchema(
            name="metadata_json",
            description="Metadata in JSON format",
            dtype=DataType.VARCHAR,
            max_length=65535,
        ),
    ]
    default_schema = CollectionSchema(
        fields=schema_fields,
        description="Default collection schema",
        enable_dynamic_field=True,
    )
    logging.debug("Collection schema defined.")

    # Instantiating Collection with a schema is what creates it on the server;
    # the returned handle is not needed here.
    Collection(
        name=collection_name,
        schema=default_schema,
        using=milvus_connection_alias,
        shards_num=2,
    )
    
def embed_pdf_text(
    document_id: str,
    file_path: str,
    milvus_connection_alias: str = "default",
    collection_name: str = "collection",
    hf_model_id: str = 'sentence-transformers/all-MiniLM-L6-v2'
    ):
    """Split a PDF into text chunks, embed them and insert them into Milvus.

    The PDF is loaded with ``PyPDFLoader`` and split with a
    ``CharacterTextSplitter`` (``chunk_size=1000``, ``chunk_overlap=100``).
    Every chunk is stored with its embedding, its raw text, the given
    ``document_id`` and an empty JSON object as metadata.

    Args:
        document_id: Identifier stored in the ``document_id`` field of every chunk.
        file_path: Path of the PDF file to ingest.
        milvus_connection_alias: Alias of the pymilvus connection to use.
        collection_name: Name of the existing collection to insert into.
        hf_model_id: Sentence-transformers model used to embed the chunks.

    Returns:
        None. When the PDF yields no chunks nothing is inserted.
    """
    # Loading text from pdf document
    loader = PyPDFLoader(file_path)
    text_splitter = CharacterTextSplitter(
        chunk_size = 1000,
        chunk_overlap = 100
    )
    docs = loader.load_and_split(
        text_splitter = text_splitter
    )

    chunks = [doc.page_content for doc in docs]
    if not chunks:
        # Nothing to embed: skip loading the model and the insert round-trip.
        logging.warning("No text chunks extracted from PDF %s; nothing inserted.", file_path)
        return

    # Embed all chunks in a single batched call instead of one call per chunk.
    model = SentenceTransformer(hf_model_id)
    vectors = model.encode(chunks)

    # Column-ordered payload; columns follow the schema order of the
    # non-auto-generated fields: vector, raw text, document id, metadata.
    data = [
        list(vectors),
        chunks,
        [document_id] * len(chunks),
        ["{}"] * len(chunks),
    ]
    logging.debug("Text was successfully embedded from PDF %s.", file_path)

    # store the data in Milvus collection
    collection = Collection(
        name = collection_name,
        using = milvus_connection_alias
    )
    collection.insert(data)

def build_vector_index(
    milvus_connection_alias: str = "default",
    collection_name: str = "collection"
):
    """Create an IVF_FLAT / L2 index on the ``embedding_vector`` field.

    Defined once only: a second definition with the same name would silently
    replace this one. The defaults match the connection alias registered by
    this notebook and the default collection used by ``embed_pdf_text``.

    Args:
        milvus_connection_alias: Alias of the pymilvus connection to use.
        collection_name: Name of the existing collection to index.
    """
    # Parameters of the index being created
    index_params = {
        "metric_type": "L2",
        "index_type": "IVF_FLAT",
        "params": {"nlist": 1024},
    }

    collection = Collection(
        name = collection_name,
        using = milvus_connection_alias
    )

    # Create an index from the embeddings vectors
    collection.create_index(
        field_name = "embedding_vector",
        index_params = index_params
    )

def similarity_search(
    user_question: str,
    limit=3,
    milvus_connection_alias: str = "default",
    collection_name: str = "workhsop_collection",
    hf_model_id: str = 'sentence-transformers/all-MiniLM-L6-v2'
    ) -> list:
    """Return the stored text chunks closest to ``user_question``.

    The question is embedded with the given sentence-transformers model, the
    ``embedding_vector`` field is searched with the L2 metric, and the
    matching rows are then fetched by primary key.

    Args:
        user_question: Natural-language question to search for.
        limit: Maximum number of chunks to return.
        milvus_connection_alias: Alias of the pymilvus connection to use.
        collection_name: Name of the collection to search.
        hf_model_id: Sentence-transformers model used to embed the question.

    Returns:
        A list of dicts with the keys ``id``, ``embedding_raw``,
        ``document_id`` and ``metadata_json``, ordered like the search hits.
        Empty when the search returns no hits.
    """
    # NOTE(review): the default collection name looks like a typo of
    # "workshop_collection"; it is kept because it is part of the signature.

    # Search parameters
    search_params = {
        "metric_type": "L2",
        "offset": 0,
        "ignore_growing": False,
        "params": {"nprobe": 10}
    }

    collection = Collection(
        name = collection_name,
        using = milvus_connection_alias
    )
    collection.load()
    logging.debug("Collection loaded.")

    try:
        # Embedding model
        model = SentenceTransformer(hf_model_id)
        logging.debug("Embedding model loaded.")

        # Search the index for the `limit` closest vectors. Only the ids are
        # needed here; the row contents are fetched by the query below.
        results = collection.search(
            data = [model.encode(user_question)],
            anns_field = "embedding_vector",
            param = search_params,
            limit = limit,
            expr = None,
            consistency_level = "Strong"
        )
        hit_ids = list(results[0].ids)
        if not hit_ids:
            return []

        # Retrieving the text associated with the results ids
        rows = collection.query(
            expr = "id in {}".format(hit_ids),
            output_fields = ["id", "embedding_raw", "document_id", "metadata_json"],
            consistency_level="Strong"
        )
    finally:
        # Release the collection even when the search or the query fails.
        collection.release()
    logging.debug("Text chunks succesfully retrieved.")

    # The query does not preserve the similarity ranking, so restore the
    # order of the search hits before returning.
    rows_by_id = {row["id"]: row for row in rows}
    return [rows_by_id[hit_id] for hit_id in hit_ids if hit_id in rows_by_id]


In [29]:
# Create the working collection on first run; otherwise show what already exists.
collection_name = "collection"
has = utility.has_collection(collection_name)

if not has:
    print("The collection is not available")
    create_collection(milvus_connection_alias="default", collection_name=collection_name)

else:
    print("The collection is available")
    print(utility.list_collections())

    collection = Collection(name=collection_name)
    print(collection)

The collection is available
['collection']
<Collection>:
-------------
<name>: collection
<description>: Default collection schema
<schema>: {'auto_id': True, 'description': 'Default collection schema', 'fields': [{'name': 'id', 'description': 'Embedding ID', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'embedding_vector', 'description': 'Embedding vector', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 384}}, {'name': 'embedding_raw', 'description': 'Embedding raw value', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 65535}}, {'name': 'document_id', 'description': 'Document ID', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 256}}, {'name': 'metadata_json', 'description': 'Metadata in JSON format', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 65535}}], 'enable_dynamic_field': True}



In [40]:
# Chunk, embed and insert the PDF under the document id "policy".
embed_pdf_text(
    document_id="policy",
    file_path="data/Peraturan_Perusahaan.pdf",
    milvus_connection_alias="default",
    collection_name=collection_name,
)

In [43]:
# Index the embedding vectors of the working collection.
build_vector_index(
    milvus_connection_alias="default",
    collection_name=collection_name,
)

In [46]:
# Ask (in Indonesian) how many days of leave can be taken per year and show
# the single closest chunk; the bare expression keeps the result displayed.
similarity_search(
    "Berapa cuti yang dapat saya ambil dalam setahun?",
    milvus_connection_alias="default",
    collection_name=collection_name,
    limit=1,
)

[{'metadata_json': '{}',
  'id': 449503242723684633,
  'embedding_raw': '4. Pakaian Kerja Karyawan: a. Karyawan diwajibkan menggunakan pakaian seragam setiap hari Senin dan Selasa. yang akan diberikan oleh perusahaan sesuai dengan setiap departemen perusahaan. b. Pada hari Rabu sampai Jumat diwajibkan menggunakan pakaian kerja berupa kemeja kasual dan celana bahan. Karyawan tidak boleh menggunakan baju kaos dan celana jeans.  5. Tunjangan Hari Raya Karyawan: a. Karyawan yang telah bekerja minimal 12 bulan di perusahaan berhak mendapatkan Tunjangan Hari Raya (THR) minimal satu bulan gaji.  b. Tunjangan Hari Raya (THR) akan diberikan kepada karyawan selambat-lambatnya dua minggu sebelum hari Raya Idul Fitri. c. Untuk karyawan yang berhenti bekerja paling lama 30 hari sebelum hari raya Idul Fitri dan telah bekerja minimal tiga tahun juga bisa mendapatkan Tunjangan Hari Raya (THR).   6. Cuti karyawan (leave of absence): a. Cuti karyawan (leave of absence) adalah hak karyawan dalam melakuka