In [1]:
import pandas as pd
import numpy as np
import os
from io import StringIO 
import json
from collections import Counter


from enum import Enum

import sys  
sys.path.append( '/Users/roshansk/Documents/GitHub/langchain/libs/langchain')
sys.path.append( '/Users/roshansk/Documents/GitHub/langchain/libs/experimental/')
sys.path.append( '/Users/roshansk/Documents/GitHub/langchain/libs/partners/openai')
sys.path.append( '/Users/roshansk/Documents/GitHub/langchain/libs/community/langchain_community/')


from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain_core.output_parsers import JsonOutputParser
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS, Chroma, Milvus
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.runnables import RunnablePassthrough
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_community.vectorstores.faiss import DistanceStrategy
from langchain_pinecone import PineconeVectorStore
import langchain_community
from langchain_core.vectorstores import VectorStore
from langchain_core.documents import Document


from typing import (
    Any,
    Callable,
    Dict,
    Iterable,
    List,
    Optional,
    Sized,
    Tuple,
    Union,
)

%load_ext autoreload
%autoreload 2

## Read Data

In [2]:
# Absolute path of the single PDF this notebook works on.
file = '/Users/roshansk/Documents/AI/AdobeTest/test_data/pdfs/DR--110685614.pdf'


# Load the PDF into LangChain documents.
loader = PyPDFLoader(file)
doc = loader.load()

# Split the loaded documents into chunks with chunk_size=400.
# NOTE(review): `add_chunk_id` is presumably provided by the local langchain
# checkout added to sys.path in the first cell - confirm it exists in the
# installed version before running elsewhere.
splitter = RecursiveCharacterTextSplitter(chunk_size=400, add_start_index = True, add_chunk_id = True)

# `splits` is the list of chunks that gets embedded and stored further down.
splits = splitter.split_documents(doc)

## Setup Database

In [3]:
from milvus import default_server
from pymilvus import connections, utility

In [4]:
# Start the local Milvus server object imported from the `milvus` package above.
default_server.start()

In [5]:
# Sanity check: should print True once the server started in the previous cell is up.
print(default_server.running)

True


In [24]:
# Ids for the chunks: their positions "0", "1", ... rendered as strings.
idList = list(map(str, range(len(splits))))

# Embed every chunk with OpenAIEmbeddings and load the result into a
# Milvus-backed vector store, passing the ids built above.
store = Milvus.from_documents(splits, OpenAIEmbeddings(), ids=idList)

In [25]:
# Name of the collection the store was created with.
store.collection_name

'LangChainCollection'

In [26]:
# Inspect the store's collection_properties attribute (no output was recorded for this cell).
store.collection_properties

In [28]:
# # To delete the collection
# store.col.drop()

## Querying Database

In [29]:
# Embed a sample string with the store's own embedding model.
# Note: despite its name, `query` holds the embedding, not the text.
query = store.embeddings.embed_query("Hey there")

In [30]:
# Vector search issued directly on the underlying collection object (store.col):
#   data       - must be a list of lists, hence the single embedding is wrapped as [query]
#   param      - must be a dict; left empty here
#   anns_field - the vector field to search, available as store._vector_field
#   limit      - how many hits to return

store.col.search(data=[query], param={}, limit=5, anns_field=store._vector_field)

["['id: 15, distance: 0.42410343885421753, entity: {}', 'id: 33, distance: 0.506050705909729, entity: {}', 'id: 12, distance: 0.5248297452926636, entity: {}', 'id: 34, distance: 0.5332403182983398, entity: {}', 'id: 28, distance: 0.5413569211959839, entity: {}']"]

In [37]:
# Same search as above, but `output_fields` asks for extra stored fields to be
# returned with every hit (they show up in each hit's `entity` dict).
output = store.col.search(data=[query], param={}, limit=5, anns_field=store._vector_field, output_fields=['chunk_id','source','pk'])

output

['["id: 15, distance: 0.42410343885421753, entity: {\'pk\': \'15\', \'chunk_id\': 15, \'source\': \'/Users/roshansk/Documents/AI/AdobeTest/test_data/pdfs/DR--110685614.pdf\'}", "id: 33, distance: 0.506050705909729, entity: {\'pk\': \'33\', \'chunk_id\': 33, \'source\': \'/Users/roshansk/Documents/AI/AdobeTest/test_data/pdfs/DR--110685614.pdf\'}", "id: 12, distance: 0.5248297452926636, entity: {\'pk\': \'12\', \'chunk_id\': 12, \'source\': \'/Users/roshansk/Documents/AI/AdobeTest/test_data/pdfs/DR--110685614.pdf\'}", "id: 34, distance: 0.5332403182983398, entity: {\'pk\': \'34\', \'chunk_id\': 34, \'source\': \'/Users/roshansk/Documents/AI/AdobeTest/test_data/pdfs/DR--110685614.pdf\'}", "id: 28, distance: 0.5413569211959839, entity: {\'pk\': \'28\', \'chunk_id\': 28, \'source\': \'/Users/roshansk/Documents/AI/AdobeTest/test_data/pdfs/DR--110685614.pdf\'}"]']

In [34]:
# Full list of field names the store knows about (valid values for output_fields).
store.fields

['source', 'page', 'start_index', 'chunk_id', 'text', 'pk', 'vector']

### Querying based on ID

In [36]:
# Name of the primary-key field; this is the field to filter on when looking rows up by id.
store._primary_field

'pk'

In [42]:
# Scalar (non-vector) lookup: fetch the rows whose pk is one of the listed values.
# The pk values are strings here, so each one is quoted inside the expression.
store.col.query(expr = """pk in ['1','2','3']""", output_fields=['source','text','pk','page'])

[{'source': '/Users/roshansk/Documents/AI/AdobeTest/test_data/pdfs/DR--110685614.pdf',
  'text': '1 \n OALCF Task Cover Sheet  \nTask Title:   Training Schedule  \nLearner Name:  \n \n \nDate Started:       Date Completed:  \n \n \nSuccessful Completion:    Yes__ _  No__ _ \nGoal Path:    Employment  \uf0fc    Apprenticeship  \uf0fc   Secondary School_ __  Post Secondary  \uf0fc Independence ___ \nTask Description:  \nComplete an agenda for a training schedule and traveling times.   \nCompetency:',
  'pk': '1',
  'page': 0},
 {'source': '/Users/roshansk/Documents/AI/AdobeTest/test_data/pdfs/DR--110685614.pdf',
  'text': 'Task Description:  \nComplete an agenda for a training schedule and traveling times.   \nCompetency:  \nA:  Find and Use Information  \nB:  Communicate Ideas and Information  \nC:  Understand and  Use Numbers  \nD:  Use Digital Technology  \n Task Group(s):  \nA1:  Read continuous text  \nA2:  Interpret documents  \nB3:  Complete and create documents  \nC2:  Manage tim

In [41]:
# Field names available on the store; get_documents_by_ids below only requests fields found in this list.
store.fields

['source', 'page', 'start_index', 'chunk_id', 'text', 'pk', 'vector']

In [71]:
# Scratch list of three integers kept in the notebook's global namespace.
a = [101,20,31]

In [80]:
def get_documents_by_ids(store: VectorStore, ids: Union[int, str, List[Union[int, str]]]) -> List[Document]:
    """Fetch stored chunks by primary key and return them as ``Document`` objects.

    Builds a ``pk in [...]`` filter expression, runs it through
    ``store.col.query`` and wraps every returned row in a ``Document``.

    Args:
        store: Vector store exposing ``fields`` (a list of field names) and
            ``col`` (a collection object with a
            ``query(expr=..., output_fields=...)`` method), such as the Milvus
            store built in this notebook.
        ids: A single id, or a list/tuple/set of ids. ints and strs may be
            mixed; every id is converted with ``str()`` and matched as a quoted
            string against the ``pk`` field.

    Returns:
        One ``Document`` per row returned by the query. ``page_content`` is the
        row's ``text``; ``metadata`` always contains ``pk`` plus whichever of
        ``source``, ``page`` and ``chunk_id`` the store defines. Ids without a
        matching row produce nothing, and rows are returned in the order the
        query yields them, which need not follow the order of ``ids``. An empty
        ``ids`` collection returns ``[]`` without querying the store.
    """
    # Normalise to a list; anything that is not a list/tuple/set is one single id.
    if isinstance(ids, (list, tuple, set, frozenset)):
        id_values = list(ids)
    else:
        id_values = [ids]

    # Nothing to look up: skip the round trip instead of sending "pk in []".
    if not id_values:
        return []

    # pk values are strings in this store, so every id becomes a single-quoted
    # literal. Backslashes and quotes are escaped so an id cannot terminate the
    # literal early and corrupt the filter expression.
    quoted_ids = [
        "'" + str(value).replace("\\", "\\\\").replace("'", "\\'") + "'"
        for value in id_values
    ]
    expr = "pk in [" + ",".join(quoted_ids) + "]"

    # Only request fields the store actually has (order kept deterministic).
    wanted_fields = ['source', 'text', 'pk', 'page', 'chunk_id']
    output_fields = [name for name in wanted_fields if name in store.fields]

    results = store.col.query(expr=expr, output_fields=output_fields)

    # Metadata keys that are optional, i.e. copied only when the store has them.
    optional_metadata = [name for name in ('source', 'page', 'chunk_id') if name in output_fields]

    output_docs = []
    for row in results:
        metadata = {'pk': row['pk']}
        for name in optional_metadata:
            metadata[name] = row[name]
        output_docs.append(Document(page_content=row['text'], metadata=metadata))

    return output_docs
            

In [84]:
# Look up one chunk, passing the id as an int inside a list.
get_documents_by_ids(store, [3])

[Document(page_content='D:  Use Digital Technology  \n Task Group(s):  \nA1:  Read continuous text  \nA2:  Interpret documents  \nB3:  Complete and create documents  \nC2:  Manage time  \nC4:  Manage Data  \nD2:  Use Digital Technology  \nLevel Indicators:  \nA1.1:  Read brief texts to locate  specific details  \nA2.2:   Interpret simple documents to locate and connect information', metadata={'pk': '3', 'source': '/Users/roshansk/Documents/AI/AdobeTest/test_data/pdfs/DR--110685614.pdf', 'page': 0, 'chunk_id': 3})]

In [85]:
# str and int ids can be mixed in the same list.
get_documents_by_ids(store, ['3','23',45])

[Document(page_content='Transition Task: Prepared for the Project,  Teaching to Fish (Build Tasks) Integrating OALCF Task \nDevelopment within Ontario’s Literacy Programs  (2014)  \n \n6 \n Task Title:  Training Schedule  \nAnswer Key \nTask 1:   Complete the weekly agenda using the class schedule .  Include the class code.  \n \nWeekly Agenda  \nMonday  Tuesday   Wednesday  Thursday  Friday  \n7  7   7   7   7', metadata={'pk': '23', 'source': '/Users/roshansk/Documents/AI/AdobeTest/test_data/pdfs/DR--110685614.pdf', 'page': 5, 'chunk_id': 23}),
 Document(page_content='D:  Use Digital Technology  \n Task Group(s):  \nA1:  Read continuous text  \nA2:  Interpret documents  \nB3:  Complete and create documents  \nC2:  Manage time  \nC4:  Manage Data  \nD2:  Use Digital Technology  \nLevel Indicators:  \nA1.1:  Read brief texts to locate  specific details  \nA2.2:   Interpret simple documents to locate and connect information', metadata={'pk': '3', 'source': '/Users/roshansk/Documents/AI/

In [83]:
# 'r' matches no primary key: it is ignored and only the chunks for '3' and '22' come back.
get_documents_by_ids(store, ['3','r','22'])

[Document(page_content='4:40 pm  5:20 pm  6:00 pm  \n5:20 pm 6:00 pm  6:40 pm  \n6:00 pm  6:40 pm  7:20 pm  \n6:40 pm  7:20 pm  8:00 pm  \n7:20 pm  8:00 pm  8:40 pm  \n8:00 pm  8:40 pm  9:20 pm  \n8:40 pm  9:20 pm  10:00 pm  \n9:20 pm  10:00 pm  10:40 pm  \n10:00 pm  10:40 pm  11:20 pm  \n10:40 pm  11:20 pm  12:00 am', metadata={'pk': '22', 'source': '/Users/roshansk/Documents/AI/AdobeTest/test_data/pdfs/DR--110685614.pdf', 'page': 4, 'chunk_id': 22}),
 Document(page_content='D:  Use Digital Technology  \n Task Group(s):  \nA1:  Read continuous text  \nA2:  Interpret documents  \nB3:  Complete and create documents  \nC2:  Manage time  \nC4:  Manage Data  \nD2:  Use Digital Technology  \nLevel Indicators:  \nA1.1:  Read brief texts to locate  specific details  \nA2.2:   Interpret simple documents to locate and connect information', metadata={'pk': '3', 'source': '/Users/roshansk/Documents/AI/AdobeTest/test_data/pdfs/DR--110685614.pdf', 'page': 0, 'chunk_id': 3})]