In [4]:
import os
from langchain_core.tools import Tool
from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper
from langchain_community.vectorstores import FAISS
from langchain_openai import OpenAIEmbeddings
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langgraph.graph import END
from langgraph.graph import StateGraph
from typing import Annotated, TypedDict, Sequence
from langchain_core.messages import BaseMessage, HumanMessage, AIMessage
from langgraph.graph.message import add_messages
import boto3
import uuid
from datetime import datetime
from sentence_transformers import SentenceTransformer
from langchain_community.document_loaders import PyMuPDFLoader
from sklearn.metrics.pairwise import cosine_similarity
from langchain_aws import BedrockEmbeddings
from langchain_core.documents import Document
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore




In [5]:
def index_creation(location: str, model_id: str, index_name: str, namespace: str, dimension: int, metric: str, chunk_size: int, chunk_overlap: int):
    """Ingest one PDF into a namespace of a Pinecone index and return the vector store.

    The Pinecone index is created when it does not exist yet. The PDF is
    loaded, chunked, embedded through Bedrock and uploaded only when the
    target namespace is not already present, so re-running the cell does not
    re-embed or duplicate a document that has already been ingested.

    Args:
        location: Path of the PDF file to ingest.
        model_id: Bedrock embedding model id passed to ``BedrockEmbeddings``.
        index_name: Name of the Pinecone index to use (created if missing).
        namespace: Namespace inside the index that receives the chunks.
        dimension: Vector dimension used when the index has to be created.
        metric: Similarity metric used when the index has to be created.
        chunk_size: Maximum chunk size given to the text splitter.
        chunk_overlap: Overlap between consecutive chunks.

    Returns:
        A ``PineconeVectorStore`` bound to ``index_name`` / ``namespace``:
        the freshly populated store, or a handle on the existing namespace
        when nothing had to be uploaded.
    """
    # 1) Bedrock client and embedding model (needed both to upload and to
    #    attach to an already populated namespace).
    bedrock = boto3.client("bedrock-runtime", region_name="us-east-1")
    embeddings = BedrockEmbeddings(client=bedrock, model_id=model_id)

    # 2) Create the index when it does not exist yet.
    pc = Pinecone()
    index_is_new = index_name not in [idx["name"] for idx in pc.list_indexes()]
    if index_is_new:
        pc.create_index(
            name=index_name,
            dimension=dimension,
            metric=metric,
            spec=ServerlessSpec(cloud='aws', region='us-east-1')
        )

    # 3) Decide whether the namespace still has to be populated. A brand-new
    #    index is empty by construction, so its stats are not queried at all;
    #    querying stats right after an upload could miss the new namespace
    #    (stats may lag behind writes) and trigger a second, duplicate upload.
    if index_is_new:
        namespace_exists = False
    else:
        stats = pc.Index(index_name).describe_index_stats()
        namespace_exists = namespace in stats["namespaces"]

    if namespace_exists:
        # Nothing to upload: skip PDF parsing and embedding entirely.
        return PineconeVectorStore.from_existing_index(
            index_name=index_name,
            embedding=embeddings,
            namespace=namespace
        )

    # 4) Recursive chunking, done only when an upload is actually required.
    docs = PyMuPDFLoader(location).load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", "!", "?", " ", ""]
    )
    chunks = splitter.split_documents(docs)

    # 5) Embed and upload the chunks exactly once.
    return PineconeVectorStore.from_documents(
        documents=chunks,
        embedding=embeddings,
        index_name=index_name,
        namespace=namespace
    )

In [6]:
# Settings shared by every country: same embedding model, same Pinecone index,
# same index geometry and same chunking parameters.
shared_index_settings = dict(
    model_id='amazon.titan-embed-text-v2:0',
    index_name='medical-compliance',
    dimension=1024,
    metric='cosine',
    chunk_size=500,
    chunk_overlap=50,
)

# One namespace per country inside the shared index. Note that the 'US'
# namespace is upper-case while the others are lower-case, and the Canada file
# name is spelled 'complaince' (presumably matching the file on disk); both are
# kept exactly as they were.
us = index_creation(
    location='/Users/rohan/Desktop/Work/RAG/US_pharma_compliance.pdf',
    namespace='US',
    **shared_index_settings,
)
india = index_creation(
    location='/Users/rohan/Desktop/Work/RAG/India_pharma_compliance.pdf',
    namespace='india',
    **shared_index_settings,
)
japan = index_creation(
    location='/Users/rohan/Desktop/Work/RAG/Japan_pharma_compliance.pdf',
    namespace='japan',
    **shared_index_settings,
)
canada = index_creation(
    location='/Users/rohan/Desktop/Work/RAG/Canada_pharma_complaince.pdf',
    namespace='canada',
    **shared_index_settings,
)
russia = index_creation(
    location='/Users/rohan/Desktop/Work/RAG/Russia_pharma_compliance.pdf',
    namespace='russia',
    **shared_index_settings,
)
