In [1]:
from langchain.llms import OpenAI
from langchain.document_loaders import UnstructuredPDFLoader
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter

from dotenv import load_dotenv
_ = load_dotenv()

In [2]:
from pathlib import Path
from typing import Sequence
from langchain.schema import Document

article_path = "data/On-algorithmic-fairness-in-medical-practice-2.pdf"
loader = UnstructuredPDFLoader(article_path)
data: Sequence[Document] = loader.load()
assert len(data) == 1

In [3]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts: Sequence[Document] = text_splitter.split_documents(data)

In [4]:
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores.pgvector import PGVector
from typing import Optional, Sequence
from datetime import datetime
from pgvector.sqlalchemy import Vector
from sqlalchemy import Column

# persist_directory = 'db'
embedding = OpenAIEmbeddings()

# from app.models.document import Document, VectorEmbedding
# from app.models.user import User
from sqlmodel import SQLModel, Session, Field, create_engine

conn_string = "postgresql://postgres:postgres@localhost:5432/postgres"
engine = create_engine(conn_string, echo=True)

class PunkVectorEmbedding(SQLModel, table=True):
    id: Optional[int] = Field(default=None, primary_key=True)
    # document_id: int = Field(..., foreign_key="document.id")
    document_name: str = Field(..., max_length=100)
    embedding: Sequence[float] = Field(..., sa_column=Column(Vector(1536)))
    content: str = Field(...)
    
    created_at: datetime = Field(default_factory=datetime.now)

try:
    SQLModel.metadata.create_all(engine)
except Exception as e:
    print(e)

2023-07-09 23:19:23,413 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2023-07-09 23:19:23,421 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-07-09 23:19:23,429 INFO sqlalchemy.engine.Engine select current_schema()
2023-07-09 23:19:23,438 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-07-09 23:19:23,456 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2023-07-09 23:19:23,456 INFO sqlalchemy.engine.Engine [raw sql] {}
2023-07-09 23:19:23,478 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2023-07-09 23:19:23,486 INFO sqlalchemy.engine.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where pg_catalog.pg_table_is_visible(c.oid) and relname=%(name)s
2023-07-09 23:19:23,492 INFO sqlalchemy.engine.Engine [generated in 0.00667s] {'name': 'vectorembedding'}
2023-07-09 23:19:23,511 INFO sqlalchemy.engine.Engine 
CREATE TABLE vectorembedding (
	embedding VECTOR(1536), 
	id SERIAL NOT NULL, 
	document_name VARCHAR(100) NOT NULL, 
	content

In [5]:
with Session(engine) as session:
    chunks_to_embed = [text.page_content for text in texts]
    vectors = embedding.embed_documents(chunks_to_embed)
    db_vectors = [
        PunkVectorEmbedding(
            document_name=Path(article_path).name, embedding=vector, content=chunk
        )
        for vector, chunk in zip(vectors, chunks_to_embed)
    ]
    for db_vector in db_vectors:
        session.add(db_vector)
    session.commit()

2023-07-09 23:19:27,621 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2023-07-09 23:19:27,846 INFO sqlalchemy.engine.Engine INSERT INTO vectorembedding (embedding, document_name, content, created_at) VALUES (%(embedding)s, %(document_name)s, %(content)s, %(created_at)s) RETURNING vectorembedding.id
2023-07-09 23:19:27,846 INFO sqlalchemy.engine.Engine [generated in 0.21987s] ({'embedding': '[0.0065591825491602114,-0.004126159552925896,0.02907238956092918,-0.04080045673558116,0.01379974864657808,0.040552680222525575,-0.0187896828955745,0. ... (32561 characters truncated) ... 0.03380766575423498,-0.010874615258195313,-0.030614106568278544,-0.02108849439000587,-0.014866562843656967,0.009071355858552507,-0.02289175285832608]', 'document_name': 'On-algorithmic-fairness-in-medical-practice-2.pdf', 'content': 'Cambridge Quarterly of Healthcare Ethics (2022), 31: 1, 83–94 doi:10.1017/S0963180121000839\n\nB I O E T H I C S A N D I N F O R M A T I O N T E C H  ... (503 characters truncated) ...

In [6]:
from langchain.chains import VectorDBQA, RetrievalQA, ConversationalRetrievalChain
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.memory import ConversationBufferMemory

# qa = VectorDBQA.from_chain_type(
#     llm=OpenAI(model="gpt-3.5-turbo"),
#     chain_type="stuff",
#     vectorstore=PGVector(
#         connection_string=conn_string,
#         embedding_function=embedding,
#         collection_name="public",
#     ),
# )
# qa = RetrievalQA(
#     llm=OpenAI(model="gpt-3.5-turbo"),
# )
# vectorstore = PGVector(
#     connection_string=conn_string,
#     embedding_function=embedding,
#     collection_name="public",
# )
# memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
# qa = ConversationalRetrievalChain.from_llm(
#     ChatOpenAI(temperature=0, model="gpt-4"), 
#     vectorstore.as_retriever(),
#     condense_question_llm=ChatOpenAI(temperature=0, model='gpt-3.5-turbo'),
#     return_source_documents=True,
# )

In [7]:
# vs_retriever = vectorstore.as_retriever()
# vs_retriever.get_relevant_documents("What is the meaning of life?")

In [8]:
# qa.run("Can you explain the problem with fairness?")

In [9]:
# result = qa(
#     {
#         "question": "Please provide a two-paragraph summary of the entire paper.",
#         "chat_history": [],
#     }
# )
# result

In [10]:
# TODO:
# 1. Add a function that makes the question into an embedding
# 2. Extract the relevant documents via direct SQL query (compare on vector distance, but get the actual text)
# 3. Combine the document(s) and the question into a single GPT-4 query

In [11]:
from langchain.embeddings.base import Embeddings
from sqlmodel import select


def get_embedding_from_query(query: str, embedder: Embeddings) -> Sequence[float]:
    embeddings = embedder.embed_documents([query])
    return embeddings[0]


def get_k_similar_chunks(
    query_embedding: Sequence[float], session: Session, k: int = 3
) -> Sequence[str]:
    # query = session.query(
    #     f"SELECT content FROM vectorembeddings ORDER BY embedding <=> %s LIMIT {k}",
    #     query_embedding,
    # )
    query = (
        select(PunkVectorEmbedding.content)
        .order_by(PunkVectorEmbedding.embedding.l2_distance(query_embedding))
        .limit(k)
    )
    results = session.exec(query)

    return results


TEMPLATE = """
You are a good chatbot that answers questions regarding published journal papers. You are academic and precise.

The relevant chunks of the paper are:

- {chunks}.

Please respond to the following question or request: {query}
"""


def create_query(
    query: str, session: Session, embedder: Embeddings, template: str = TEMPLATE
) -> str:
    query_embedding = get_embedding_from_query(query, embedder=embedding)
    relevant_chunks = get_k_similar_chunks(query_embedding, session)

    connected_chunks = "\n- ".join(relevant_chunks)

    final_query = template.format(chunks=connected_chunks, query=query)

    return final_query

In [12]:
# query_embedding = get_embedding_from_query(
#     "Please provide a two-paragraph summary of the entire paper", embedder=embedding
# )
# chunks = get_k_similar_chunks(query_embedding, session)
initial_query = "Please provide a two-paragraph summary of the entire paper"
with Session(engine) as session:
    final_query = create_query(
        query=initial_query,
        session=session,
        embedder=embedding
    )

2023-07-09 23:19:28,636 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2023-07-09 23:19:28,654 INFO sqlalchemy.engine.Engine SELECT vectorembedding.content 
FROM vectorembedding ORDER BY vectorembedding.embedding <-> %(embedding_1)s 
 LIMIT %(param_1)s
2023-07-09 23:19:28,660 INFO sqlalchemy.engine.Engine [generated in 0.01091s] {'embedding_1': '[-0.025741690304702535,0.018027016567325336,0.016134245029648445,-0.001245802595123262,-0.0037365918303938436,0.013060122492383468,-0.028561267485762 ... (32628 characters truncated) ... ,-0.03305170625336563,0.002491605190246524,-0.044303904418377,0.006761762657037608,0.0007599640383914042,-0.011950567148775315,-0.010390662400445384]', 'param_1': 3}
2023-07-09 23:19:28,719 INFO sqlalchemy.engine.Engine ROLLBACK


In [15]:
print(final_query)


You are a good chatbot that answers questions regarding published journal papers. You are academic and precise.

The relevant chunks of the paper are:

- In addition, one upshot of the paper is that it ties together certain themes in the emerging literature on “fairness in machine learning” with the philosophical debate on justice in healthcare. The remainder of the paper will be structured as follows: In the section “Mechanisms of Algorithmic Bias in Medical Practice,” we give an outline of the different mechanisms of algorithmic bias in medical practice, whereby we distinguish between formal, substantive, and normative notions of algorithmic bias. Thereafter, the section “From Algorithmic Bias to Fairness” will discuss to what extent different standards of fairness help to counteract the threat of algorithmic discrimination. Here, we argue that fairness cannot be restored merely by mitigating the differences in the algorithm’s predictive accuracy for different demographics. Instead,

In [20]:
# TODO: Pass the final query to GPT-4 via either Langchain or direct OpenAI API
from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)


chat = ChatOpenAI(model="gpt-4", temperature=0)
# TODO: Split up query into system message and human message?
response = chat([
    HumanMessage(content=final_query)
])

In [22]:
print(response.content)

This paper is an in-depth exploration of algorithmic bias in medical practice, linking themes from the burgeoning field of fairness in machine learning to philosophical discussions on justice in healthcare. It aims to outline the different mechanisms of algorithmic bias, distinguishing between formal, substantive, and normative notions. The paper also scrutinizes the extent to which different fairness standards can combat the threat of algorithmic discrimination, arguing that fairness restoration requires more than just mitigating predictive accuracy differences across various demographics. The constraints of formal accounts of algorithmic fairness are also discussed, suggesting a need for consideration of a wider array of normative criteria.

The paper sets three objectives: to provide an understanding of how algorithmic discrimination manifests in medical practice, to examine the underlying mechanisms of algorithmic bias, and to identify the suitable normative standards for fair algo