### Split Documents into Chunks

In [1]:
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load the files found under external_data/ as LangChain documents.
loader = DirectoryLoader(path='external_data/')
documents = loader.load()

# Cut the loaded documents into chunks of at most 2000 length units, with a
# 200-unit overlap between chunks (units are characters by default for this
# splitter, per the LangChain docs -- confirm if a custom length function is added).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=200)
document_split = text_splitter.split_documents(documents=documents)

  from .autonotebook import tqdm as notebook_tqdm


### Store the Chunks in a Pinecone Index

In [2]:
import os
from dotenv import load_dotenv

from pinecone import Pinecone
from langchain.vectorstores import Pinecone as vec_storer
from langchain.embeddings import SentenceTransformerEmbeddings

# Load variables from a .env file (if one is found) into the process environment.
load_dotenv()

api_key = os.getenv('PINECONE_API_KEY')
index_name = os.getenv('PINECONE_INDEX_NAME')

# Fail fast with a readable message: os.getenv returns None for an unset
# variable, which would otherwise only surface as an error somewhere inside
# the Pinecone / LangChain calls below.
missing_settings = [
    setting
    for setting, value in (
        ('PINECONE_API_KEY', api_key),
        ('PINECONE_INDEX_NAME', index_name),
    )
    if not value
]
if missing_settings:
    raise RuntimeError(
        'Missing required environment variable(s): ' + ', '.join(missing_settings)
    )

# NOTE(review): this client is constructed and immediately discarded, exactly
# as in the original cell. The vector store call below is not handed a client,
# so it presumably resolves credentials from the environment itself -- confirm
# whether this line is still needed.
Pinecone(api_key=api_key)

# Embedding model used to vectorise every chunk before it is stored.
# NOTE(review): the target index's dimension must match this model's output
# size -- verify against the index configuration.
embedding_model = SentenceTransformerEmbeddings(model_name='all-MiniLM-L6-v2')

# Embed the chunks and write them to the index named by PINECONE_INDEX_NAME.
# Left as the cell's last expression so the resulting vector store is displayed.
# NOTE(review): re-running this cell presumably uploads the same chunks again
# under new ids -- confirm before executing it more than once.
vec_storer.from_documents(
    documents=document_split,
    embedding=embedding_model,
    index_name=index_name
)

<langchain_community.vectorstores.pinecone.Pinecone at 0x29ce9d79640>

### Retrieve Relevant Documents for a Given Query

In [3]:
from sentence_transformers import SentenceTransformer

# Load the sentence-transformers model by name; this is the same model name
# that is passed to the embedding wrapper when the chunks are stored.
sentence_transformer = SentenceTransformer(model_name_or_path='all-MiniLM-L6-v2')

# Encode the question, then convert the encoding to a plain Python list so it
# can be passed as the query vector.
question_embedding = sentence_transformer.encode(
    sentences="Who are presidential and vice-presidential candidates for Indonesia's general election?"
)
input_embedded = question_embedding.tolist()

# Open a handle to the index identified by index_name.
pinecode_index = Pinecone(api_key=api_key).Index(name=index_name)

# Ask the index for up to 10 matches and include each match's stored metadata.
results = pinecode_index.query(vector=input_embedded, top_k=10, include_metadata=True)

# Last expression of the cell: display the list of matches.
results['matches']

[{'id': 'e042bc35-3a19-41ae-9ccc-95a1bae7efc4',
  'metadata': {'source': 'external_data\\List of presidential and '
                         'vice-presidential candidate v4.pdf',
               'text': 'Registration stages for presidential and '
                       'vice-presidential candidates for 2024 Indonesia’s '
                       'general election ended on October 25th, 2023. The '
                       'general election will be held simultaneously throughout '
                       'Indonesia on February 14th, 2024. There are three pairs '
                       'of presidential and vice-presidential candidates having '
                       'registered with Komisi Pemilihan Umum (KPU). Here’s the '
                       'list of candidate pairs:\n'
                       '\n'
                       'Candidate pair number 1 are Anies Rasyid Baswedan as '
                       'presidential candidate and Abdul Muhaimin Iskandar as '
                       'vice presid