# Initialization

In [2]:
import os
import logging
# Set up logging first: INFO-level records are enabled on the root logger and
# this notebook gets its own named logger.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Publish the PDF location through the environment; the loading cell reads it
# back with os.getenv('SOURCE_FILE').
os.environ.update({'SOURCE_FILE': 'C:/workspace/experiments/GenAI/Projects/Test Data/aws.pdf'})

# Load data from source

In [3]:
from langchain_community.document_loaders import PyMuPDFLoader
# Resolve the PDF location from the SOURCE_FILE environment variable.
f = os.getenv('SOURCE_FILE')
if not f:
    # os.getenv returns None when the variable is missing; fail here with an
    # explicit message instead of handing None to the loader.
    raise RuntimeError(
        "SOURCE_FILE environment variable is not set; run the initialization cell first"
    )

# Lazy %-style arguments: the message is only formatted if the record is emitted.
logger.info("loading file: %s", f)

# Read the PDF into a list of documents consumed by the splitting cell.
loader = PyMuPDFLoader(f)
docs = loader.load()

INFO:__main__:loading file: C:/workspace/experiments/GenAI/Projects/Test Data/aws.pdf


# Split data

In [4]:
from langchain.text_splitter import CharacterTextSplitter
# Splitter configuration used for the whole notebook: chunk_size=256 with
# chunk_overlap=100.
text_splitter = CharacterTextSplitter(
    chunk_overlap=100,
    chunk_size=256,
)

# Apply the splitter to every document returned by the loader.
transformed_docs = text_splitter.split_documents(docs)

# Extract terms from chunks and save as metadata

In [5]:
from langchain.schema.document import Document
import nltk
from rake_nltk import Rake

# Make sure the NLTK resources requested by this notebook are available locally.
for nltk_resource in ('stopwords', 'punkt'):
    nltk.download(nltk_resource)

r = Rake()


def _with_terms(chunk):
    """Return a new Document for ``chunk`` with ``file_name`` and ``terms`` added to its metadata.

    ``terms`` holds the ranked phrases RAKE extracts from the chunk text and
    ``file_name`` is the base name of the chunk's ``file_path`` metadata entry.
    """
    # The Rake instance is stateful: extract first, then read the ranking back.
    r.extract_keywords_from_text(chunk.page_content)
    ranked_phrases = r.get_ranked_phrases()

    enriched_metadata = dict(chunk.metadata)
    enriched_metadata['file_name'] = os.path.basename(chunk.metadata['file_path'])
    enriched_metadata['terms'] = ranked_phrases
    return Document(page_content=chunk.page_content, metadata=enriched_metadata)


transformed_docs_with_terms = [_with_terms(chunk) for chunk in transformed_docs]

logger.info(f"total chunks - {len(transformed_docs_with_terms)}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\samriddha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\samriddha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
INFO:__main__:total chunks - 42


# Initialize embedding function

In [6]:
from langchain_openai import OpenAIEmbeddings
# Embedding client for the "text-embedding-ada-002" model. The API key is taken
# from the OPENAI_API_KEY environment variable rather than being written here.
embeddings = OpenAIEmbeddings(
    model="text-embedding-ada-002",
    openai_api_key=os.getenv('OPENAI_API_KEY'),
)

# Create OpenSearch Index

In [7]:
def _hnsw_knn_field(dimension, engine, parameters):
    """Build a ``knn_vector`` field definition that uses HNSW with the ``l2`` space type.

    Args:
        dimension: Value stored under the field's ``dimension`` key.
        engine: Value stored under ``method.engine``.
        parameters: Dict stored under ``method.parameters``.

    Returns:
        The field definition as a plain dict.
    """
    return {
        "type": "knn_vector",
        "dimension": dimension,
        "method": {
            "engine": engine,
            "space_type": "l2",
            "name": "hnsw",
            "parameters": parameters,
        },
    }


# Field definitions for the index, keyed by field name.
mappings = {
    "properties": {
        "id": {"type": "text"},
        "file_name": {"type": "keyword"},
        "vector_field": _hnsw_knn_field(1536, "nmslib", {"ef_construction": 512, "m": 16}),
        "bert_embeddings": _hnsw_knn_field(768, "lucene", {}),
        "oss_sparse_embeddings": {"type": "rank_features"},
        "text": {"type": "text"},
        "terms": {"type": "text"},
    }
}


In [9]:
from opensearchpy import OpenSearch
from uuid import uuid4
import time

# Connection settings for the OpenSearch node used by this notebook.
host = 'localhost'
port = 9200
# Credentials are read from the environment so that no password is stored in
# the notebook. Set OPENSEARCH_USER / OPENSEARCH_PASSWORD before starting the
# kernel when the cluster requires authentication.
auth = (os.getenv('OPENSEARCH_USER', 'admin'), os.getenv('OPENSEARCH_PASSWORD', ''))

index_name = 'aws-idx'
index_body = {
  "settings": {
    "index.knn": True,
    # NOTE(review): this pipeline is not created in the visible cells;
    # presumably it must already exist on the cluster - confirm.
    "default_pipeline": "proof_of_concept_pipeline"
  },
  "mappings": mappings
}

client = OpenSearch(
    hosts = [{'host': host, 'port': port}],
    # Only send credentials when a password was configured; otherwise connect
    # unauthenticated, which is how this client connected before.
    http_auth = auth if auth[1] else None,
    http_compress = True, # enables gzip compression for request bodies
    use_ssl = False,
    verify_certs = False,
    ssl_assert_hostname = False,
    ssl_show_warn = False
)

# Creating an index that already exists is an error, so check first. This keeps
# the cell safe to re-run (for example after "Restart & Run All").
response = None
if client.indices.exists(index=index_name):
    logger.info("index %s already exists; skipping creation", index_name)
else:
    response = client.indices.create(index=index_name, body=index_body)
    logger.info(response)

INFO:opensearch:PUT http://localhost:9200/aws-idx [status:200 request:1.191s]
INFO:__main__:{'acknowledged': True, 'shards_acknowledged': True, 'index': 'aws-idx'}


# Load data to OpenSearch Vector Store

In [10]:
from langchain_community.vectorstores import OpenSearchVectorSearch
# Embed every chunk and bulk-index it into OpenSearch. The target index, URL
# and credentials reuse index_name / host / port / auth from the index-creation
# cell, so both cells always address the same index with the same settings.
OpenSearchVectorSearch.from_documents(
    transformed_docs_with_terms,
    embeddings,
    index_name=index_name,
    opensearch_url=f"http://{host}:{port}",
    http_auth=auth,
    use_ssl = False,
    verify_certs = False,
    # Generous timeout: the bulk requests recorded for this cell took up to ~60s.
    timeout=300,
    ssl_assert_hostname = False,
    ssl_show_warn = False,
)

INFO:httpx:HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"
INFO:opensearch:GET http://localhost:9200/aws-idx [status:200 request:0.028s]
INFO:opensearch:POST http://localhost:9200/_bulk [status:200 request:60.571s]
INFO:opensearch:POST http://localhost:9200/_bulk [status:200 request:22.195s]
INFO:opensearch:POST http://localhost:9200/aws-idx/_refresh [status:200 request:0.327s]


<langchain_community.vectorstores.opensearch_vector_search.OpenSearchVectorSearch at 0x21aeef3f140>