In [1]:
filename = "paul_graham_essay.txt"
f = open(f"/Users/user/Documents/GitHub/graph_maker/data_input/{filename}", "r")
sourcetext = f.read()

# 1. Text Only

## A. Langchain library

### RecursiveCharacterTextSplitter

In [3]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=100,
    chunk_overlap=20,
    length_function=len,
    is_separator_regex=False,
)

temp = text_splitter.create_documents([sourcetext])
print(len(list(temp)))
list(temp)

974


[Document(page_content='What I Worked On\n\nFebruary 2021'),
 Document(page_content='Before college the two main things I worked on, outside of school, were writing and programming. I'),
 Document(page_content="and programming. I didn't write essays. I wrote what beginning writers were supposed to write then,"),
 Document(page_content='to write then, and probably still are: short stories. My stories were awful. They had hardly any'),
 Document(page_content='They had hardly any plot, just characters with strong feelings, which I imagined made them deep.'),
 Document(page_content='The first programs I tried writing were on the IBM 1401 that our school district used for what was'),
 Document(page_content='used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The'),
 Document(page_content="I was 13 or 14. The school district's 1401 happened to be in the basement of our junior high"),
 Document(page_content='of our junior high school, and my friend Rich 

### NLTKTextSplitter

In [4]:
from langchain_text_splitters import NLTKTextSplitter
text_splitter = NLTKTextSplitter()
docs = text_splitter.split_text(sourcetext)
print(len(docs))
list(docs)

20


['What I Worked On\n\nFebruary 2021\n\nBefore college the two main things I worked on, outside of school, were writing and programming.\n\nI didn\'t write essays.\n\nI wrote what beginning writers were supposed to write then, and probably still are: short stories.\n\nMy stories were awful.\n\nThey had hardly any plot, just characters with strong feelings, which I imagined made them deep.\n\nThe first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing."\n\nThis was in 9th grade, so I was 13 or 14.\n\nThe school district\'s 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it.\n\nIt was like a mini Bond villain\'s lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights.\n\nThe language we used was an early version of Fortran.\n\nYou had to type programs o

### SpacyTextSplitter

In [5]:
from langchain_text_splitters import SpacyTextSplitter
text_splitter = SpacyTextSplitter()
docs = text_splitter.split_text(sourcetext)
print(len(docs))
print(list(docs))

20
['What I Worked On\n\nFebruary 2021\n\nBefore college the two main things I worked on, outside of school, were writing and programming.\n\nI didn\'t write essays.\n\nI wrote what beginning writers were supposed to write then, and probably still are: short stories.\n\nMy stories were awful.\n\nThey had hardly any plot, just characters with strong feelings, which I imagined made them deep.\n\n\n\nThe first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing."\n\nThis was in 9th grade, so I was 13 or 14.\n\nThe school district\'s 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it.\n\nIt was like a mini Bond villain\'s lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights.\n\n\n\nThe language we used was an early version of Fortran.\n\nYou had to type



### SemanticChunker

In [6]:
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings

In [7]:
import os
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

In [8]:
text_splitter = SemanticChunker(OpenAIEmbeddings())

In [9]:
docs = text_splitter.create_documents([sourcetext])
print(docs[0].page_content)



What I Worked On

February 2021

Before college the two main things I worked on, outside of school, were writing and programming. I didn't write essays.


This chunker works by determining when to "break" apart sentences. This is done by looking for differences in embeddings between any two sentences. When that difference is past some threshold, then they are split.

In [10]:
docs[0].page_content

"\n\nWhat I Worked On\n\nFebruary 2021\n\nBefore college the two main things I worked on, outside of school, were writing and programming. I didn't write essays."

In [11]:
len(docs)

39

In [12]:
docs[5].page_content

"There's a whole world there that's barely been explored. But all I wanted was to get out of grad school, and my rapidly written dissertation sufficed, just barely. Meanwhile I was applying to art schools. I applied to two: RISD in the US, and the Accademia di Belli Arti in Florence, which, because it was the oldest art school, I imagined would be good. RISD accepted me, and I never heard back from the Accademia, so off to Providence I went. I'd applied for the BFA program at RISD, which meant in effect that I had to go to college again. This was not as strange as it sounds, because I was only 25, and art schools are full of people of different ages. RISD counted me as a transfer sophomore and said I had to do the foundation that summer. The foundation means the classes that everyone has to take in fundamental subjects like drawing, color, and design. Toward the end of the summer I got a big surprise: a letter from the Accademia, which had been delayed because they'd sent it to Cambrid

In [13]:
for x in docs:
    print(x)

page_content='

What I Worked On

February 2021

Before college the two main things I worked on, outside of school, were writing and programming. I didn't write essays.'
page_content='I wrote what beginning writers were supposed to write then, and probably still are: short stories. My stories were awful. They had hardly any plot, just characters with strong feelings, which I imagined made them deep. The first programs I tried writing were on the IBM 1401 that our school district used for what was then called "data processing." This was in 9th grade, so I was 13 or 14. The school district's 1401 happened to be in the basement of our junior high school, and my friend Rich Draves and I got permission to use it. It was like a mini Bond villain's lair down there, with all these alien-looking machines — CPU, disk drives, printer, card reader — sitting up on a raised floor under bright fluorescent lights. The language we used was an early version of Fortran. You had to type programs on punch 

### Agentic Chunker

In [2]:
from langchain.output_parsers.openai_tools import JsonOutputToolsParser
from langchain_community.chat_models import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnableLambda
from langchain.chains import create_extraction_chain
from typing import Optional, List
from langchain.chains import create_extraction_chain_pydantic
from langchain_core.pydantic_v1 import BaseModel
from langchain import hub

In [3]:
import os
obj = hub.pull("wfh/proposal-indexing")
llm = ChatOpenAI(model='gpt-3.5-turbo', openai_api_key = os.getenv("OPENAI_API_KEY", 'YouKey'))

  warn_deprecated(


In [4]:
# use it in a runnable
runnable = obj | llm

In [5]:
# Pydantic data class
class Sentences(BaseModel):
    sentences: List[str]

# Extraction
extraction_chain = create_extraction_chain_pydantic(pydantic_schema=Sentences, llm=llm)

  warn_deprecated(


In [6]:
def get_propositions(text):
    runnable_output = runnable.invoke({
    	"input": text
    }).content
    
    propositions = extraction_chain.run(runnable_output)[0].sentences
    return propositions

In [7]:
paragraphs = sourcetext.split("\n\n")

In [8]:
len(paragraphs)

177

In [9]:
essay_propositions = []

for i, para in enumerate(paragraphs[:5]):
    propositions = get_propositions(para)
    
    essay_propositions.extend(propositions)
    print (f"Done with {i}")

  warn_deprecated(


Done with 0
Done with 1
Done with 2
Done with 3
Done with 4


In [10]:
print (f"You have {len(essay_propositions)} propositions")
essay_propositions[:10]

You have 20 propositions


['I need more information or the content to decompose.',
 'I worked on something.',
 'February 2021 is a month.',
 'February 2021 is in the year 2021.',
 'Before college, the two main things I worked on outside of school were writing and programming.',
 "I didn't write essays.",
 'I wrote what beginning writers were supposed to write then, and probably still are: short stories.',
 'My stories were awful.',
 'They had hardly any plot.',
 'The stories had just characters with strong feelings.']

In [11]:
# mini script I made
from agentic_chunker import AgenticChunker

ModuleNotFoundError: No module named 'agentic_chunker'

In [None]:
ac = AgenticChunker()

ValueError: API key is not provided and not found in environment variables

In [None]:
ac.add_propositions(essay_propositions)

In [None]:
ac.pretty_print_chunks()

In [None]:
chunks = ac.get_chunks(get_type='list_of_strings')

## B. sentence_splitter library

In [None]:
from sentence_splitter import SentenceSplitter, split_text_into_sentences

splitter = SentenceSplitter(language='en', non_breaking_prefix_file='en.txt')

temp2 = splitter.split(text=sourcetext)
print(len(temp2))
temp2

## C. Proposal

# 2. Text and Image (Multi-Modal)

## Unstructured

In [None]:
import os
from unstructured.partition.pdf import partition_pdf
from unstructured.staging.base import elements_to_json

In [None]:
filename = "data_input/SalesforceFinancial.pdf"

# Extracts the elements from the PDF
elements = partition_pdf(
    filename=filename,

    # Unstructured Helpers
    strategy="hi_res", 
    infer_table_structure=True, 
    model_name="yolox"
)

In [None]:
elements

In [None]:
elements[-4].metadata.text_as_html

In [None]:
filename = "data_input/facultymanual.pdf"

# Extracts the elements from the PDF
elements = partition_pdf(
    filename=filename,

    # Unstructured Helpers
    strategy="hi_res", 
    infer_table_structure=True, 
    model_name="yolox"
)

In [None]:
elements

In [None]:
len(elements)

In [None]:
elements[12].metadata.text_as_html

In [2]:
import os
print(os.getenv("OPENAI_API_KEY"))

sk-proj-9ot48FSfk4enl3h8YFLKT3BlbkFJF3S0EnuhWAPR6VYQ4KZ0
