In [18]:
from langchain_community.document_loaders import PyPDFLoader
 
def extract_text_from_pdf(pdf_path):
    """Load a PDF through ``PyPDFLoader`` and return its text as one string.

    Args:
        pdf_path: Path to the PDF file, handed straight to ``PyPDFLoader``.

    Returns:
        The ``page_content`` of every document returned by ``load()``,
        concatenated in order. Each piece is followed by a single space, so
        a non-empty result ends with a trailing space; an empty load yields
        ``""``.
    """
    loaded_pages = PyPDFLoader(pdf_path).load()
    # Append the separator to each piece (rather than joining *with* it) so
    # the trailing space after the final page is kept.
    return "".join(page.page_content + " " for page in loaded_pages)

# Extract the paper's text. Despite its name, `docs` is a single string
# (extract_text_from_pdf concatenates every page), not a list of documents.
docs = extract_text_from_pdf("./data/Uploads/Mutalib_2019_IOP_Conf._Ser.__Mater._Sci._Eng._469_012092.pdf")
# NOTE(review): this prints the entire extracted text into the cell output;
# a slice such as docs[:500] would keep the notebook readable.
print(docs)

IOP Conference Series:
Materials Science and
Engineering
      
PAPER • OPEN ACCESS
Magnetorheological finishing on metal surface: A
review
To cite this article: N A Mutalib et al 2019 IOP Conf. Ser.: Mater. Sci. Eng. 469 012092
 
View the article online for updates and enhancements.
You may also like
Synthesis of Magnetorheological Fluid
using Electrolytic Carbonyl Iron Powder for
Enhanced Stability
Rakesh Erappa Jinaga and T Jagadeesha
-
Development of Vibration Isolator
Magnetorheological Elastomer Based
G Priyandoko, P Suwandono, NR Ismail et
al.
-
Magnetorheological torque transmission
devices with permanent magnets
H Böse, T Gerlach and J Ehrlich
-
 
This content was downloaded from IP address 65.1.45.255 on 25/02/2025 at 08:39 Content from this work may be used under the terms of the Creative Commons Attribution 3.0 licence. Any further distribution
of this work must maintain attribution to the author(s) and the title of the work, journal citation and DOI.
Published under licenc

In [25]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

def split_docs(documents, chunk_size=200, chunk_overlap=50):
    """Split text into overlapping chunks with ``RecursiveCharacterTextSplitter``.

    Args:
        documents: The text to split. It is passed unchanged to
            ``split_text``; in this notebook it is the single string
            produced by ``extract_text_from_pdf``.
        chunk_size: Forwarded to the splitter as ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The value returned by the splitter's ``split_text`` call.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    return splitter.split_text(documents)

# Chunk the extracted text with split_docs' defaults (chunk_size=200, chunk_overlap=50).
chunks = split_docs(docs)
print(len(chunks))
# Preview the first two chunks so the overlap between neighbours can be inspected.
# NOTE(review): chunks[1] assumes at least two chunks were produced; a very
# short input would raise IndexError here.
print('First Chunk\n', chunks[0])
print('\n\n\nsecond chunk\n', chunks[1])

231
First Chunk
 IOP Conference Series:
Materials Science and
Engineering
      
PAPER • OPEN ACCESS
Magnetorheological finishing on metal surface: A
review



second chunk
 review
To cite this article: N A Mutalib et al 2019 IOP Conf. Ser.: Mater. Sci. Eng. 469 012092
 
View the article online for updates and enhancements.
You may also like


In [42]:
from langchain.embeddings import HuggingFaceEmbeddings
import faiss
from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_community.vectorstores import FAISS
from langchain_core.documents import Document
from uuid import uuid4

# Embedding model; it is also handed to the vector store below as its embedding_function.
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Size the FAISS index from the length of one sample embedding (the first
# chunk) instead of hard-coding the model's output dimension.
index = faiss.IndexFlatL2(len(embedding_model.embed_query(chunks[0])))

# Build the store around the fresh index, a new in-memory docstore and an
# empty index-to-docstore-id mapping.
vector_store = FAISS(
    embedding_function=embedding_model,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)
# Wrap every chunk in a Document, recording its position in `chunks` as metadata.
documents = [Document(page_content=chunk, metadata={'chunk_id':i}) for i, chunk in enumerate(chunks)]
# One random UUID string per document, passed as the ids below.
uuids = [str(uuid4()) for _ in range(len(documents))]

# NOTE(review): as the cell's last expression, the return value of this call
# is echoed as cell output; assign it or end the line with ';' to suppress it.
vector_store.add_documents(documents=documents, ids=uuids)


['6646f05b-3ac5-4311-b017-e15cb348f426',
 '677b1154-44c9-4b45-a8b1-38ee783299cd',
 '724a0bc6-b203-4ce7-881e-11f7258b0b3a',
 'c281edc5-26ef-4492-86c9-076db61e72d5',
 'f0fdc3ca-d69b-4f66-b6bb-4c97d58bce35',
 'b651d37c-4924-46ca-8b97-74cd0744f7d8',
 '25f7ace7-7ee9-49f0-a499-7fc22ade06f9',
 '64792981-7974-4132-b862-cb2036592e97',
 'c3184475-a8e0-4c6e-b7e4-cd9cc7eaea4e',
 'b4a8a8fc-a43b-4656-a913-d3ba69596268',
 '38d186d6-d204-4e4d-b676-aaa3c1df4a72',
 'b2cd0d88-5131-4db8-a104-ed0d37d9f33b',
 '268a3ccd-5680-4c73-a33d-aec5991626e1',
 '450ae97c-6e4f-484a-8372-cdda74953084',
 '796d0691-9102-4ad7-9351-b56fa0778b66',
 'de578152-dbc2-4147-a90b-0e300635fd64',
 '923d697a-d975-488f-b172-4c300cfd3741',
 '0a5d38ef-2377-48a6-8bf9-c04257ef5446',
 '53a7fc6d-900c-4414-9aff-f0d206fd67c5',
 '64cf840c-6c94-4a9b-b332-9cc246d1ab3d',
 'c989a9b3-29c5-40fe-91d2-3585aaf9acf4',
 '5b07365b-c66e-4f35-a96c-afb9cb0836fa',
 '72037cde-790d-4ddc-9f10-8631f86ca6b0',
 '6b8f52e3-5016-48d6-a7e4-9fcb3c277ffc',
 'bb194694-9768-

In [45]:
# Read the vector stored at position 0 back out of the FAISS index as a sanity check
# (presumably the embedding of the first chunk added -- TODO confirm).
embedding_vector = index.reconstruct(0)
print("Embedding vector shape:", embedding_vector.shape)
# NOTE(review): this prints every component of the vector; a slice such as
# embedding_vector[:8] would keep the cell output short.
print("Embedding vector:", embedding_vector)

Embedding vector shape: (384,)
Embedding vector: [-4.27920334e-02  4.62989882e-03  3.73352803e-02  1.81439817e-02
  4.31342423e-02 -2.59420443e-02 -2.76514404e-02 -3.69591452e-02
 -9.31640118e-02 -6.06667809e-02 -1.08544990e-01  5.49387336e-02
  6.07107207e-03  1.05600934e-02 -6.31525964e-02  2.01729257e-02
  2.44991301e-04 -6.89911768e-02  6.06481992e-02  4.08921801e-02
  3.88053134e-02  1.89966783e-02  4.16894369e-02 -7.15586618e-02
 -3.78946513e-02  5.10967113e-02  1.02987923e-02 -3.17859426e-02
  1.42898019e-02 -2.15978306e-02  5.46289235e-02 -8.45311489e-03
 -1.31047377e-02 -8.11325610e-02  4.52518724e-02 -2.03848276e-02
 -9.53565463e-02  4.70087714e-02 -2.27881614e-02 -2.56955959e-02
 -3.49350534e-02 -6.47842437e-02 -4.00958629e-03 -3.07174381e-02
  1.03715152e-01  2.29896009e-02  1.04387552e-01 -8.56211483e-02
  6.20598299e-03  2.46601701e-02 -3.22496556e-02 -1.71742644e-02
  4.14946750e-02 -9.19486508e-02  2.58763656e-02  7.56571963e-02
 -1.60024092e-02 -1.10764749e-01 -1.01333