In [1]:
import pandas as pd
import numpy as np
import os
from io import StringIO 
import json
from collections import Counter


from enum import Enum

import sys  
sys.path.append( '/Users/roshansk/Documents/GitHub/langchain/libs/langchain')
sys.path.append( '/Users/roshansk/Documents/GitHub/langchain/libs/experimental/')
sys.path.append( '/Users/roshansk/Documents/GitHub/langchain/libs/partners/openai')
sys.path.append( '/Users/roshansk/Documents/GitHub/langchain/libs/community/langchain_community/')


from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_core.output_parsers import JsonOutputParser
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS, Chroma, Milvus
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_community.vectorstores.faiss import DistanceStrategy
from langchain_pinecone import PineconeVectorStore
import langchain_community
from langchain_core.vectorstores import VectorStore
from langchain_core.documents import Document


from typing import (
    Any,
    Callable,
    Dict,
    Iterable,
    List,
    Optional,
    Sized,
    Tuple,
    Union,
)

%load_ext autoreload
%autoreload 2

## Read Data

In [2]:
file = '/Users/roshansk/Documents/AI/AdobeTest/test_data/pdfs/DR--110685614.pdf'


loader = PyPDFLoader(file)
doc = loader.load()

splitter = RecursiveCharacterTextSplitter(chunk_size=400, add_start_index = True, add_chunk_id = True)

splits = splitter.split_documents(doc)

## Setup Database

In [6]:
from pinecone import Pinecone, ServerlessSpec, PodSpec
pinecone = Pinecone(api_key=os.environ.get("PINECONE_API_KEY"))

In [15]:
pinecone.list_indexes()

{'indexes': [{'dimension': 1536,
              'host': 'test2-ch8c41c.svc.aped-4627-b74a.pinecone.io',
              'metric': 'cosine',
              'name': 'test2',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [22]:
# # delete index
# pinecone.delete_index('test2')

In [23]:
# create new index 

pinecone.create_index(
    name='test2',
    dimension=1536,
    metric="cosine",
    spec=ServerlessSpec(
        cloud='aws', 
        region='us-east-1'
    ) 
) 

In [25]:
idList = [str(x) for x in list(range(len(splits)))]

store = PineconeVectorStore.from_documents(splits, OpenAIEmbeddings(), index_name='test2', ids = idList)

In [26]:
store._index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

## Querying Data

In [31]:
output = store._index.fetch(ids = ['5','22','3'])

In [61]:
def get_documents_by_ids(store: VectorStore, ids : int | str | List[int|str]) -> List[Document]:

    if isinstance(ids, list):
        results = store._index.fetch(ids = [str(x) for x in ids])
    else:
        results = store._index.fetch(ids = [str(ids)])

    output_docs = []

    if len(results.vectors)>0:
    
        for id_value in results.vectors.keys():
    
            metadata = results.vectors[id_value].get('metadata')
            page_content = metadata.get('text')
            _ = metadata.pop('text')
    
            metadata['id'] = id_value 
    
            output_docs.append(Document(page_content = page_content, metadata = metadata))

    return output_docs
    
            

    
    

In [62]:
get_documents_by_ids(store, 3)

[Document(page_content='D:  Use Digital Technology  \n Task Group(s):  \nA1:  Read continuous text  \nA2:  Interpret documents  \nB3:  Complete and create documents  \nC2:  Manage time  \nC4:  Manage Data  \nD2:  Use Digital Technology  \nLevel Indicators:  \nA1.1:  Read brief texts to locate  specific details  \nA2.2:   Interpret simple documents to locate and connect information', metadata={'chunk_id': 3.0, 'page': 0.0, 'source': '/Users/roshansk/Documents/AI/AdobeTest/test_data/pdfs/DR--110685614.pdf', 'start_index': 641.0, 'id': '3'})]

In [63]:
get_documents_by_ids(store, '3')

[Document(page_content='D:  Use Digital Technology  \n Task Group(s):  \nA1:  Read continuous text  \nA2:  Interpret documents  \nB3:  Complete and create documents  \nC2:  Manage time  \nC4:  Manage Data  \nD2:  Use Digital Technology  \nLevel Indicators:  \nA1.1:  Read brief texts to locate  specific details  \nA2.2:   Interpret simple documents to locate and connect information', metadata={'chunk_id': 3.0, 'page': 0.0, 'source': '/Users/roshansk/Documents/AI/AdobeTest/test_data/pdfs/DR--110685614.pdf', 'start_index': 641.0, 'id': '3'})]

In [64]:
get_documents_by_ids(store, ['3','22','123212','2'])

[Document(page_content='Task Description:  \nComplete an agenda for a training schedule and traveling times.   \nCompetency:  \nA:  Find and Use Information  \nB:  Communicate Ideas and Information  \nC:  Understand and  Use Numbers  \nD:  Use Digital Technology  \n Task Group(s):  \nA1:  Read continuous text  \nA2:  Interpret documents  \nB3:  Complete and create documents  \nC2:  Manage time  \nC4:  Manage Data', metadata={'chunk_id': 2.0, 'page': 0.0, 'source': '/Users/roshansk/Documents/AI/AdobeTest/test_data/pdfs/DR--110685614.pdf', 'start_index': 435.0, 'id': '2'}),
 Document(page_content='4:40 pm  5:20 pm  6:00 pm  \n5:20 pm 6:00 pm  6:40 pm  \n6:00 pm  6:40 pm  7:20 pm  \n6:40 pm  7:20 pm  8:00 pm  \n7:20 pm  8:00 pm  8:40 pm  \n8:00 pm  8:40 pm  9:20 pm  \n8:40 pm  9:20 pm  10:00 pm  \n9:20 pm  10:00 pm  10:40 pm  \n10:00 pm  10:40 pm  11:20 pm  \n10:40 pm  11:20 pm  12:00 am', metadata={'chunk_id': 22.0, 'page': 4.0, 'source': '/Users/roshansk/Documents/AI/AdobeTest/test_data

In [57]:
a

{'chunk_id': 3.0,
 'page': 0.0,
 'source': '/Users/roshansk/Documents/AI/AdobeTest/test_data/pdfs/DR--110685614.pdf',
 'start_index': 641.0}

## Testing with Sentence Window Retriever