# Summarization of Clinical Notes with GPT-3.5 Turbo + DSM-5 Retrieval-Augmented Generation (RAG)

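This notebook builds a retrieval-augmented generation (RAG) pipeline: DSM-5 text is split into chunks and indexed in a Chroma vector store, and GPT-3.5 Turbo uses the retrieved passages as context to summarize symptoms of autism spectrum disorder (ASD) found in de-identified clinical notes.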

In [None]:

import os
import openai
from dotenv import load_dotenv

# Location of the .env file that defines OPENAI_API_KEY. The DOTENV_PATH
# environment variable overrides the default, so the notebook can run on another
# machine without editing this cell.
dotenv_path = os.getenv('DOTENV_PATH', '/home/skbae/Documents/skbae/f.env')

# Load the variables defined in the .env file into the process environment
load_dotenv(dotenv_path)

# Set the OpenAI API key. os.getenv returns None when the variable is absent, so
# stop here with a clear message rather than continuing with api_key = None.
openai.api_key = os.getenv('OPENAI_API_KEY')
if not openai.api_key:
    raise RuntimeError(
        f"OPENAI_API_KEY is not set; checked the environment and {dotenv_path!r}"
    )

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain_community.vectorstores import Chroma
from transformers import AutoTokenizer

# Chat model used by the RAG chain; the model name is kept in one variable so it
# is easy to swap.
llm_model_name = "gpt-3.5-turbo"
llm = ChatOpenAI(model=llm_model_name)


## Step 1: Load and Split DSM-5 Content

In [None]:

# DSM-5 source documents; add more PyPDFLoader entries here to index extra PDFs
loaders = [PyPDFLoader("/home/skbae/Documents/skbae/ASD/DSM5.pdf")]

# Flatten whatever each loader returns into a single list, in loader order
pages = [page for pdf_loader in loaders for page in pdf_loader.load()]

# Hugging Face tokenizer handed to the splitter below
chunk_tokenizer = AutoTokenizer.from_pretrained(
    "sentence-transformers/all-MiniLM-L12-v2"
)

# Recursive splitter built from that tokenizer
# (chunk_size=256, chunk_overlap=32, surrounding whitespace stripped)
text_splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
    tokenizer=chunk_tokenizer,
    chunk_size=256,
    chunk_overlap=32,
    strip_whitespace=True,
)

# Break the loaded DSM-5 pages into chunks ready for indexing
splits = text_splitter.split_documents(pages)


## Step 2: Create a Vector Store for Retrieval

In [None]:

# Embed the DSM-5 chunks with OpenAI embeddings and index them in Chroma
embedding_model = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(documents=splits, embedding=embedding_model)

# Similarity-search retriever configured with k=10 results per query
retriever_search_kwargs = {"k": 10}
retriever = vectorstore.as_retriever(
    search_type="similarity",
    search_kwargs=retriever_search_kwargs,
)


## Step 3: Define the RAG Chain

In [None]:

from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

# Prompt text for the RAG chain. {context} receives the retrieved DSM-5 passages
# and {question} receives the request sent to the chain. Adjacent string
# literals are concatenated, so every line break is spelled out as "\n".
template = (
    "Use the following pieces of context to answer the question at the end.\n"
    "If you don't know the answer, just say that you don't know, don't try to make up an answer.\n"
    "Use three sentences maximum and keep the answer as concise as possible.\n"
    "\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
    "\n"
    "Helpful Answer:"
)

# Build the LangChain prompt object from the raw template text
custom_rag_prompt = PromptTemplate.from_template(template=template)

# Helper that turns retrieved documents into the prompt's context text
def format_docs(docs):
    """Concatenate the text of several documents into one string.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string attribute.

    Returns:
        The ``page_content`` values in their original order, separated by a
        blank line. An empty iterable yields an empty string.
    """
    separator = "\n\n"
    contents = [document.page_content for document in docs]
    return separator.join(contents)

# Assemble the RAG pipeline. The chain input is used twice: it goes to the
# retriever, whose results format_docs joins into "context", and it is passed
# through unchanged as "question". The filled prompt is sent to the LLM and the
# reply is parsed into a plain string.
rag_inputs = {"context": retriever | format_docs, "question": RunnablePassthrough()}
to_text = StrOutputParser()
rag_chain = rag_inputs | custom_rag_prompt | llm | to_text


## Step 4: Summarize Clinical Notes

In [None]:

import pandas as pd

# Read the clinical notes table that will be summarized
notes_csv_path = './df_merged_summary_Apr03.csv'
df_merged = pd.read_csv(notes_csv_path)

# Define a function to process and summarize clinical notes
def process_text2(text):
    """Summarize the ASD symptoms described in one clinical note.

    Args:
        text: The note text. Non-string values (for example NaN from an empty
            CSV cell) and blank strings are treated as missing.

    Returns:
        The string returned by ``rag_chain.invoke`` for the note, or an empty
        string when the note is missing, in which case the chain is not invoked.
    """
    # Skip missing notes: formatting NaN into the prompt would send the literal
    # text "nan" to the model.
    if not isinstance(text, str) or not text.strip():
        return ""

    # The line break must be written as the escape "\n"; a raw line break inside
    # a single-quoted f-string is a SyntaxError.
    prompt = f"Please check the below and summarize clear symptoms of ASD on it:\n{text}"
    return rag_chain.invoke(prompt)

# Run the summarizer over every note and store the results in a new column
note_summaries = df_merged['deidentified_text'].apply(process_text2)
df_merged['summarized_text_llm2F'] = note_summaries

# Write the table, including the new summary column, to CSV without the index
summary_csv_path = './df_merged_summary2_LLM_F_Apr012F.csv'
df_merged.to_csv(summary_csv_path, index=False)


## Step 5: View Summarized Output

In [None]:

# Preview the first rows: each note next to its generated summary
preview_columns = ['deidentified_text', 'summarized_text_llm2F']
df_merged[preview_columns].head()
