In [49]:
import sys
import os
from pathlib import Path

from dotenv import load_dotenv

load_dotenv()

current_dir = Path.cwd().parent

sys.path.append(str(current_dir))

from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from backend.brain.document_processing import load_pdf, chunk_docs

from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings


pdf_path = (
        current_dir / "backend" / "docs" / "Bitcoin - A Peer-to-Peer Electronic Cash System.pdf"
    )


from langchain_community.document_loaders import PyMuPDFLoader

loader_py = PyMuPDFLoader(pdf_path)
pages = loader_py.load()



character_text_splitter = CharacterTextSplitter(
        chunk_size=1000, chunk_overlap=150, separator="\n"
    )

recursive_text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500, separators=[".", "?", "!"], chunk_overlap=200
)

semantic_text_splitter = SemanticChunker(
    embeddings=OpenAIEmbeddings(model="text-embedding-3-small"),
)

TEXT_SPLITTERS_REGISTRY = {
    "character": character_text_splitter,
    "recursive": recursive_text_splitter,
    "semantic": semantic_text_splitter,
}

text_splitter = TEXT_SPLITTERS_REGISTRY["recursive"]

docs = text_splitter.split_documents(pages)


print(docs)

[Document(metadata={'source': '/Users/stephen/Documents/Github/pdf-chat-hub/backend/docs/Bitcoin - A Peer-to-Peer Electronic Cash System.pdf', 'file_path': '/Users/stephen/Documents/Github/pdf-chat-hub/backend/docs/Bitcoin - A Peer-to-Peer Electronic Cash System.pdf', 'page': 0, 'total_pages': 9, 'format': 'PDF 1.4', 'title': '', 'author': '', 'subject': '', 'keywords': '', 'creator': 'Writer', 'producer': 'OpenOffice.org 2.4', 'creationDate': "D:20090324113315-06'00'", 'modDate': '', 'trapped': ''}, page_content='Bitcoin: A Peer-to-Peer Electronic Cash System\nSatoshi Nakamoto\nsatoshin@gmx.com\nwww.bitcoin.org\nAbstract.  A purely peer-to-peer version of electronic cash would allow online \npayments to be sent directly from one party to another without going through a \nfinancial institution.  Digital signatures provide part of the solution, but the main \nbenefits are lost if a trusted third party is still required to prevent double-spending'), Document(metadata={'source': '/Users/s

In [50]:
print(docs[0].page_content)
print("-----", "/n")
print(docs[1].page_content)
print("-----", "/n")
print(docs[2].page_content)
print("-----", "/n")
print(docs[3].page_content)
print("-----", "/n")

Bitcoin: A Peer-to-Peer Electronic Cash System
Satoshi Nakamoto
satoshin@gmx.com
www.bitcoin.org
Abstract.  A purely peer-to-peer version of electronic cash would allow online 
payments to be sent directly from one party to another without going through a 
financial institution.  Digital signatures provide part of the solution, but the main 
benefits are lost if a trusted third party is still required to prevent double-spending
----- /n
. 
We propose a solution to the double-spending problem using a peer-to-peer network. 
The network timestamps transactions by hashing them into an ongoing chain of 
hash-based proof-of-work, forming a record that cannot be changed without redoing 
the proof-of-work.  The longest chain not only serves as proof of the sequence of 
events witnessed, but proof that it came from the largest pool of CPU power
----- /n
.  The longest chain not only serves as proof of the sequence of 
events witnessed, but proof that it came from the largest pool of CPU power. 

In [32]:

print(docs[3].page_content)


2. Transactions
We define an electronic coin as a chain of digital signatures. Each owner transfers the coin to the 
next by digitally signing a hash of the previous transaction and the public key of the next owner 
and adding these to the end of the coin. A payee can verify the signatures to verify the chain of 
ownership. The problem of course is the payee can't verify that one of the owners did not double-spend 
the coin. A common solution is to introduce a trusted central authority, or mint, that checks every 
transaction for double spending. After each transaction, the coin must be returned to the mint to 
issue a new coin, and only coins issued directly from the mint are trusted not to be double-spent. The problem with this solution is that the fate of the entire money system depends on the 
company running the mint, with every transaction having to go through them, just like a bank. We need a way for the payee to know that the previous owners did not sign any earlier 
transactio