In [1]:
!pip3 install --upgrade --quiet langchain langchain-community langchain-openai chromadb 
!pip3 install --upgrade --quiet pypdf pandas streamlit python-dotenv

In [7]:
# LangChain community + core imports
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import Chroma
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_openai import OpenAIEmbeddings, ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from pydantic import BaseModel, Field


# Other Python modules
import os
import tempfile
import streamlit as st
import pandas as pd
from dotenv import load_dotenv


In [12]:
load_dotenv()

True

In [13]:
# OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

## Define our LLM

In [14]:
llm = ChatOpenAI(model="gpt-4o-mini", api_key=OPENAI_API_KEY)
llm.invoke("Tell me a joke about cats")

AIMessage(content='Why was the cat sitting on the computer?\n\nBecause it wanted to keep an eye on the mouse!', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 20, 'prompt_tokens': 13, 'total_tokens': 33, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_560af6e559', 'id': 'chatcmpl-CbGHn7tbvfliXOOdCpzscuSKffBrx', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='lc_run--cea14857-29f3-4594-8701-1f89ed0fea97-0', usage_metadata={'input_tokens': 13, 'output_tokens': 20, 'total_tokens': 33, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

## Process PDF document

### Load PDF document

In [15]:
loader = PyPDFLoader("data/Oppenheimer-2006-Applied_Cognitive_Psychology.pdf")
pages = loader.load()
pages

Ignoring wrong pointing object 18 0 (offset 0)


[Document(metadata={'producer': 'macOS Version 14.4.1 (Build 23E224) Quartz PDFContext, AppendMode 1.1', 'creator': 'Preview', 'creationdate': "D:20240909152042Z00'00'", 'author': 'Thu Vu', 'moddate': "D:20240910141854Z00'00'", 'title': 'Oppenheimer-2006-Applied_Cognitive_Psychology', 'source': 'data/Oppenheimer-2006-Applied_Cognitive_Psychology.pdf', 'total_pages': 3, 'page': 0, 'page_label': '1'}, page_content='APPLIED COGNITIVE PSYCHOLOGY\nAppl. Cognit. Psychol. 20: 139–156 (2006)\nPublished online 31 October 2005 in Wiley InterScience\n(www.interscience.wiley.com) DOI: 10.1002/acp.1178\nConsequences of Erudite Vernacular Utilized Irrespective\nof Necessity: Problems with Using Long Words Needlessly\nDANIEL M. OPPENHEIMER*\nPrinceton University, USA\nSUMMARY\nMost texts on writing style encourage authors to avoid overly-complex words. However, a majority\nof undergraduates admit to deliberately increasing the complexity of their vocabulary so as to give\nthe impression of intelligen

### Split document

In [16]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1500,
                                            chunk_overlap=200,
                                            length_function=len,
                                            separators=["\n\n", "\n", " "])
chunks = text_splitter.split_documents(pages)

### Create embeddings

In [20]:
def get_embedding_function():
    embeddings = OpenAIEmbeddings(
        model="text-embedding-ada-002", openai_api_key=OPENAI_API_KEY
    )
    return embeddings

embedding_function = get_embedding_function()
test_vector = embedding_function.embed_query("cat")

In [22]:
# !pip install -U langchain-experimental


Collecting langchain-experimental
  Downloading langchain_experimental-0.4.0-py3-none-any.whl.metadata (1.3 kB)
Downloading langchain_experimental-0.4.0-py3-none-any.whl (209 kB)
Installing collected packages: langchain-experimental
Successfully installed langchain-experimental-0.4.0


In [28]:
import langchain_experimental
dir(langchain_experimental)

['__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__']

In [30]:
# from langchain_experimental.evaluation import load_evaluator

# evaluator = load_evaluator(evaluator="embedding_distance", 
#                             embeddings=embedding_function)

# evaluator.evaluate_strings(prediction="Amsterdam", reference="coffeeshop")
!pip install -U langchain-openai scipy

from langchain_openai import OpenAIEmbeddings
from scipy.spatial.distance import cosine

emb = OpenAIEmbeddings()

def embedding_similarity(prediction: str, reference: str) -> float:
    """Return cosine similarity between two text embeddings (0–1, higher = more similar)."""
    pred_vec = emb.embed_query(prediction)
    ref_vec = emb.embed_query(reference)
    return 1 - cosine(pred_vec, ref_vec)

score = embedding_similarity("Paris", "coffeeshop")
print({"score": score, "explanation": "Cosine similarity between embeddings"})




Collecting scipy
  Downloading scipy-1.16.3-cp311-cp311-macosx_14_0_arm64.whl.metadata (62 kB)
Downloading scipy-1.16.3-cp311-cp311-macosx_14_0_arm64.whl (20.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.9/20.9 MB[0m [31m26.7 MB/s[0m  [33m0:00:00[0m eta [36m0:00:01[0m
[?25hInstalling collected packages: scipy
Successfully installed scipy-1.16.3
{'score': np.float64(0.775740350604033), 'explanation': 'Cosine similarity between embeddings'}


In [31]:
# evaluator.evaluate_strings(prediction="Paris", reference="coffeeshop")

### Create vector database

In [47]:
import uuid

def create_vectorstore(chunks, embedding_function, vectorstore_path):

    # Create a list of unique ids for each document based on the content
    ids = [str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.page_content)) for doc in chunks]
    
    # Ensure that only unique docs with unique ids are kept
    unique_ids = set()
    unique_chunks = []
    
    unique_chunks = [] 
    for chunk, id in zip(chunks, ids):     
        if id not in unique_ids:       
            unique_ids.add(id)
            unique_chunks.append(chunk) 

    # Create a new Chroma database from the documents
    vectorstore = Chroma.from_documents(documents=unique_chunks, 
                                        ids=list(unique_ids),
                                        embedding=embedding_function, 
                                        persist_directory = vectorstore_path)

    vectorstore.persist()
    
    return vectorstore

In [72]:
# # Create vectorstore
# vectorstore = create_vectorstore(chunks=chunks, 
#                                  embedding_function=embedding_function, 
#                                  vectorstore_path="vectorstore_test")
# print(vectorstore)

In [49]:
import uuid
from langchain_community.vectorstores import Chroma

def create_vectorstore(chunks, embedding_function, vectorstore_path):
    # Generate deterministic UUIDs from the content
    ids = [str(uuid.uuid5(uuid.NAMESPACE_DNS, doc.page_content)) for doc in chunks]

    # Keep only unique chunks
    unique_ids = set()
    unique_chunks = []
    for chunk, id_ in zip(chunks, ids):
        if id_ not in unique_ids:
            unique_ids.add(id_)
            unique_chunks.append(chunk)

    # Create a Chroma vectorstore (auto-persisting)
    vectorstore = Chroma.from_documents(
        documents=unique_chunks,
        ids=list(unique_ids),
        embedding=embedding_function,
        persist_directory=vectorstore_path
    )

    # ❌ No need for vectorstore.persist() — automatic in new versions
    # ✅ Optional manual persist (if you want explicit flush):
    # vectorstore._client.persist()

    return vectorstore


In [62]:
from langchain_community.vectorstores import Chroma

vectorstore = Chroma(
    persist_directory="vectorstore_test",
    embedding_function=embedding_function
)
retriever = vectorstore.as_retriever()
print(retriever)

tags=['Chroma', 'OpenAIEmbeddings'] vectorstore=<langchain_community.vectorstores.chroma.Chroma object at 0x11f075490> search_kwargs={}


In [63]:
# Example: use as retriever
retriever = vectorstore.as_retriever(search_kwargs={"k": 3})
docs = retriever.invoke("What is this document about?")
for d in docs:
    print(d.page_content[:200], "...\n")

was unnecessary and thus surprising readers with the relative disﬂuency of the text.
Both the experts and prevailing wisdom present plausible views, but which (if either) is
correct? The present paper ...

be inﬂuenced by the irrelevant source of ﬂuency, they over-compensate and are biased in the opposite
direction. Implications and applications are discussed. Copyright # 2005 John Wiley & Sons, Ltd.
Wh ...

complicated language?’ 86.4% of the sample admitted to having done so. Nearly two-
thirds answered yes to the question, ‘When you write an essay, do you turn to the thesaurus
to choose words that are  ...



## 2. Query for relevant data

In [44]:
# # Load vectorstore
# # !pip install -U langchain-chroma
# from langchain_chroma import Chroma
# vectorstore = Chroma(persist_directory="vectorstore_chroma", embedding_function=embedding_function)

Collecting langchain-chroma
  Downloading langchain_chroma-1.0.0-py3-none-any.whl.metadata (1.9 kB)
Downloading langchain_chroma-1.0.0-py3-none-any.whl (12 kB)
Installing collected packages: langchain-chroma
Successfully installed langchain-chroma-1.0.0


In [64]:
# Create retriever and get relevant chunks
retriever = vectorstore.as_retriever(search_type="similarity")
relevant_chunks = retriever.invoke("What is the title of the paper?")
relevant_chunks
print(relevant_chunks)

[Document(metadata={'producer': 'macOS Version 14.4.1 (Build 23E224) Quartz PDFContext, AppendMode 1.1', 'source': 'data/Oppenheimer-2006-Applied_Cognitive_Psychology.pdf', 'page': 1, 'creationdate': "D:20240909152042Z00'00'", 'author': 'Thu Vu', 'moddate': "D:20240910141854Z00'00'", 'title': 'Oppenheimer-2006-Applied_Cognitive_Psychology', 'page_label': '2', 'creator': 'Preview', 'total_pages': 3}, page_content='was unnecessary and thus surprising readers with the relative disﬂuency of the text.\nBoth the experts and prevailing wisdom present plausible views, but which (if either) is\ncorrect? The present paper provides an empirical investigation of the strategy of complex-\nity, and ﬁnds such a strategy to be unsuccessful. Five studies demonstrate that the loss of\nﬂuency due to needless complexity in a text negatively impacts raters’ assessments of the\ntext’s authors.\nEXPERIMENT 1\nExperiment 1 aimed to answer several simple questions. First, does increasing the\ncomplexity of tex

In [65]:
# Prompt template
PROMPT_TEMPLATE = """
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

{context}

---

Answer the question based on the above context: {question}
"""

## 3. Generate responses

In [66]:
# Concatenate context text
context_text = "\n\n---\n\n".join([doc.page_content for doc in relevant_chunks])

# Create prompt
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, 
                                question="What is the title of the paper?")
print(prompt)

Human: 
You are an assistant for question-answering tasks.
Use the following pieces of retrieved context to answer
the question. If you don't know the answer, say that you
don't know. DON'T MAKE UP ANYTHING.

was unnecessary and thus surprising readers with the relative disﬂuency of the text.
Both the experts and prevailing wisdom present plausible views, but which (if either) is
correct? The present paper provides an empirical investigation of the strategy of complex-
ity, and ﬁnds such a strategy to be unsuccessful. Five studies demonstrate that the loss of
ﬂuency due to needless complexity in a text negatively impacts raters’ assessments of the
text’s authors.
EXPERIMENT 1
Experiment 1 aimed to answer several simple questions. First, does increasing the
complexity of text succeed in making the author appear more intelligent? Second, to
what extent does the success of this strategy depend on the quality of the original, simpler
writing? Finally, if the strategy is unsuccessful, is th

In [67]:
llm.invoke(prompt)

AIMessage(content='The title of the paper is "Consequences of Erudite Vernacular Utilized Irrespective of Necessity: Problems with Using Long Words Needlessly."', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 32, 'prompt_tokens': 1117, 'total_tokens': 1149, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_560af6e559', 'id': 'chatcmpl-CbIvq0uUpxnQkBRpGQVfZ0Ikle416', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='lc_run--7f2c85d2-4353-4937-9c42-5e4a88d1785b-0', usage_metadata={'input_tokens': 1117, 'output_tokens': 32, 'total_tokens': 1149, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

### Using Langchain Expression Language

In [68]:
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt_template
            | llm
        )
rag_chain.invoke("What's the title of this paper?")

AIMessage(content='The title of the paper is "Consequences of Erudite Vernacular Utilized Irrespective of Necessity: Problems with Using Long Words Needlessly."', additional_kwargs={'refusal': None}, response_metadata={'token_usage': {'completion_tokens': 32, 'prompt_tokens': 1111, 'total_tokens': 1143, 'completion_tokens_details': {'accepted_prediction_tokens': 0, 'audio_tokens': 0, 'reasoning_tokens': 0, 'rejected_prediction_tokens': 0}, 'prompt_tokens_details': {'audio_tokens': 0, 'cached_tokens': 0}}, 'model_provider': 'openai', 'model_name': 'gpt-4o-mini-2024-07-18', 'system_fingerprint': 'fp_560af6e559', 'id': 'chatcmpl-CbIwaKf3wc2Gy3z98pN4c0I50xXTv', 'service_tier': 'default', 'finish_reason': 'stop', 'logprobs': None}, id='lc_run--251c4812-0cdd-4830-b668-8b824e7a8692-0', usage_metadata={'input_tokens': 1111, 'output_tokens': 32, 'total_tokens': 1143, 'input_token_details': {'audio': 0, 'cache_read': 0}, 'output_token_details': {'audio': 0, 'reasoning': 0}})

### Generate structured responses

In [69]:
class AnswerWithSources(BaseModel):
    """An answer to the question, with sources and reasoning."""
    answer: str = Field(description="Answer to question")
    sources: str = Field(description="Full direct text chunk from the context used to answer the question")
    reasoning: str = Field(description="Explain the reasoning of the answer based on the sources")
    
class ExtractedInfo(BaseModel):
    """Extracted information about the research article"""
    paper_title: AnswerWithSources
    paper_summary: AnswerWithSources
    publication_year: AnswerWithSources
    paper_authors: AnswerWithSources

In [70]:
rag_chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt_template
            | llm.with_structured_output(ExtractedInfo, strict=True)
        )

rag_chain.invoke("Give me the title, summary, publication date, authors of the research paper.")

ExtractedInfo(paper_title=AnswerWithSources(answer='Consequences of Erudite Vernacular Utilized Irrespective of Necessity: Problems with Using Long Words Needlessly', sources='SUMMARY\nMost texts on writing style encourage authors to avoid overly-complex words. However, a majority of undergraduates admit to deliberately increasing the complexity of their vocabulary so as to give the impression of intelligence. This paper explores the extent to which this strategy is effective.', reasoning='The title is referenced directly in the text where it discusses the issue of using complex language unnecessarily.'), paper_summary=AnswerWithSources(answer='The paper investigates the tendency of individuals to use unnecessarily complex vocabulary to appear more intelligent and explores the negative relationship between text complexity and perceived intelligence. Experiments demonstrate that more complex texts are judged to be written by less intelligent authors, and this is mediated by processing f

### Transform response into a dataframe

In [71]:
structured_response = rag_chain.invoke("Give me the title, summary, publication date, authors of the research paper.")
df = pd.DataFrame([structured_response.dict()])

# Transforming into a table with two rows: 'answer' and 'source'
answer_row = []
source_row = []
reasoning_row = []

for col in df.columns:
    answer_row.append(df[col][0]['answer'])
    source_row.append(df[col][0]['sources'])
    reasoning_row.append(df[col][0]['reasoning'])

# Create new dataframe with two rows: 'answer' and 'source'
structured_response_df = pd.DataFrame([answer_row, source_row, reasoning_row], columns=df.columns, index=['answer', 'source', 'reasoning'])
structured_response_df

/var/folders/f8/qx812j190h17vz08lqv3c0r00000gn/T/ipykernel_55691/1998849655.py:2: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.12/migration/
  df = pd.DataFrame([structured_response.dict()])


Unnamed: 0,paper_title,paper_summary,publication_year,paper_authors
answer,Consequences of Erudite Vernacular Utilized Ir...,The paper explores how the deliberate use of o...,2005,Daniel M. Oppenheimer
source,"the paper title is ""Consequences of Erudite Ve...",Most texts on writing style encourage authors ...,Published online 31 October 2005 in Wiley Inte...,"DANIEL M. OPPENHEIMER* Princeton University, USA"
reasoning,The title is explicitly stated in the context.,The summary is derived from the overarching th...,The publication date is stated in the context.,The authorship is clearly indicated within the...
