# TOKENIZATION METHODS

# METHOD 1: Unstructured default tokenization

# Install the packages and libmagic for automatic file type detection
pip install "unstructured[all-docs]"
pip install unstructured-client
brew install libmagic

In [None]:
import os

from unstructured_client import UnstructuredClient

# Never commit credentials: the API key is read from the environment instead
# of being hard-coded. Any key that was previously checked in should be
# treated as leaked and rotated.
api_key = os.environ.get("UNSTRUCTURED_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the UNSTRUCTURED_API_KEY environment variable before creating the client."
    )

# Shared client used by the partitioning cells below.
s = UnstructuredClient(api_key_auth=api_key)

In [None]:
# Path of the PDF to partition, relative to the notebook's working directory.
filename = "./gpt-4.pdf"
# Opened in binary mode; the handle is left open here because a later cell
# reads the raw bytes from it when building the partition request.
file = open(filename, mode="rb")

In [None]:
from unstructured_client.models import shared

# Read the PDF bytes once and close the handle opened in the previous cell.
# Using the file object as a context manager guarantees it is closed even if
# the read fails. Re-running this cell requires re-running the cell that
# opens the file first.
with file:
    file_bytes = file.read()

# Request object describing what to partition and how.
req = shared.PartitionParameters(
    # Note that this currently only supports a single file
    files=shared.Files(
        content=file_bytes,
        file_name=filename,
    ),
    # Other partition params
    strategy="fast",  # fast, hi_res, auto. For details see https://unstructured-io.github.io/unstructured/best_practices/strategies.html
)
print(req)

In [None]:
# Default Unstructured partitioning: send the request and preview the first
# few returned elements.

from unstructured_client.models.errors import SDKError

try:
    # Only the API call is guarded; printing happens on the success path.
    res = s.general.partition(req)
except SDKError as e:
    print(e)
else:
    # Slice instead of indexing 0..9 so documents that yield fewer than ten
    # elements (or none at all) do not raise IndexError.
    for element in (res.elements or [])[:10]:
        print(element)

# METHOD 2: Proposition-based tokenization

# Install the packages needed to read the PDF (PyPDF2) and run the propositionizer model (transformers, torch)
pip install PyPDF2
pip install transformers
pip install torch

In [None]:
# Proposition-based retrieval: extract the PDF text, feed a short excerpt to a
# seq2seq "propositionizer" model, and parse its output as a JSON list.

# Read all text from PDF
import PyPDF2
reader = PyPDF2.PdfReader('gpt-4.pdf')
# Build the string with a single join rather than repeated `+=`; a page with
# no extractable text contributes an empty string instead of breaking the join.
content = "".join(page.extract_text() or "" for page in reader.pages)

# Use a small portion of text for testing
content = content[:2000]

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch
import json

model_name = "chentong00/propositionizer-wiki-flan-t5-large"
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)

# NOTE(review): the model card for this checkpoint appears to expect a
# "Title: ... Section: ... Content: ..." prompt; the raw excerpt is passed
# here unchanged -- confirm whether that is intended.
input_ids = tokenizer(content, return_tensors="pt").input_ids
outputs = model.generate(input_ids.to(device), max_new_tokens=512).cpu()

output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
try:
    prop_list = json.loads(output_text)
except json.JSONDecodeError:
    # Catch only parse failures; a bare `except:` would also hide
    # KeyboardInterrupt and unrelated bugs.
    prop_list = []
    print("[ERROR] Failed to parse output text as JSON.")
print(json.dumps(prop_list, indent=2))

# METHOD 3: Parent document retriever

# Install langchain for proof of concept
pip install langchain
pip install tiktoken
pip install chromadb

In [22]:
from langchain.retrievers import ParentDocumentRetriever
from langchain.document_loaders import UnstructuredFileLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.storage import InMemoryStore
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
import dotenv

# Load environment variables from a local .env file.
# NOTE(review): presumably this supplies the OpenAI credentials used by
# OpenAIEmbeddings below -- confirm.
dotenv.load_dotenv()

loader = UnstructuredFileLoader("gpt-4.pdf")
docs = loader.load()

# This text splitter is used to create the parent documents
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
# This text splitter is used to create the child documents
# It should create documents smaller than the parent
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)
# The vectorstore to use to index the child chunks
# NOTE(review): the vector store is persisted to ./chroma_db while the parent
# store below is created fresh on every run, so child chunks left over from
# an earlier run may have no matching parent -- confirm whether persistence
# is intended for this proof of concept.
vectorstore = Chroma(
    collection_name="split_parents",
    embedding_function=OpenAIEmbeddings(disallowed_special=()),
    persist_directory="./chroma_db",
)
# The storage layer for the parent documents
store = InMemoryStore()

retriever = ParentDocumentRetriever(
    vectorstore=vectorstore,
    docstore=store,
    child_splitter=child_splitter,
    parent_splitter=parent_splitter,
)

retriever.add_documents(docs)

# Number of parent chunks held in the document store.
print(len(list(store.yield_keys())))

query = "GPT-4"

# Search the child chunks directly; guard against an empty result so the
# cell reports it instead of raising IndexError.
sub_docs = vectorstore.similarity_search(query)
if sub_docs:
    print(sub_docs[0].page_content)
else:
    print(f"No child chunks matched {query!r}.")

# Search through the retriever, which returns the larger parent chunks.
retrieved_docs = retriever.get_relevant_documents(query)
if retrieved_docs:
    print(len(retrieved_docs[0].page_content))
    print(retrieved_docs[0].page_content)
else:
    print(f"No parent documents retrieved for {query!r}.")

176
GPT-4
1844
GPT-4 (launch)2.9PotentialforRiskyEmergentBehaviorsNovelcapabilitiesoftenemergeinmorepowerfulmodels.[60,61]Somethatareparticularlyconcerningaretheabilitytocreateandactonlong-termplans,[62]toaccruepowerandresources(“power-seeking”),[63]andtoexhibitbehaviorthatisincreasingly“agentic.”[64]Agenticinthiscontextdoesnotintendtohumanizelanguagemodelsorrefertosentiencebutratherreferstosystemscharacterizedbyabilityto,e.g.,accomplishgoalswhichmaynothavebeenconcretelyspeciﬁedand54

Prompt

1. Insecure password hashing: The code uses MD5 for hashing passwords, which is considered insecure due to its vulnerability to collision attacks and its speed, allowing attackers to perform brute force attacks more easily. A more secure alternative would be to use bcrypt or Argon2.2. SQL Injection: The ``fetch'' function constructs an SQL query using string concatenation with unsanitized user input, which could allow an attacker to inject malicious SQL code. To mitigate this risk, you should use 