In [1]:
import os
import json
from langchain.document_loaders import PyPDFLoader

def get_pdf_files(dir):
    pdf_files = []
    for root, dirs, files in os.walk(dir):
        for file in files:
            if file.lower().endswith('.pdf'):
                pdf_files.append(os.path.join(root, file))
    return pdf_files


def load_papers(directory):
    pdf_file_paths = get_pdf_files(directory)
    pdf_docs = []

    for pdf_file in pdf_file_paths:
        document_list = load_pdf(pdf_file)
        pdf_docs.append(document_list)
        
    return pdf_docs

def load_pdf(filename):
    loader = PyPDFLoader(filename)
    document_list = loader.load_and_split()
    return document_list

def transform_papers(papers):
    docs = []
    for paper in papers:
        doc = {}
        doc["source"] = paper[0].metadata["source"]
        doc["content"] = paper
        docs.append(doc)
    return docs

def write_dict_to_file(dictionary, file_path):
    with open(file_path, 'w') as file:
        json.dump(dictionary, file)

In [2]:
from langchain.llms import Ollama
from langchain.chains.summarize import load_summarize_chain

def ollama(model="llama2"):
    return Ollama(model=model)

llm = ollama(model="mistral")
chain = load_summarize_chain(llm, chain_type="stuff")

def summarize(paper):
    print("# Starting summary for " + paper["source"])
    content = paper["content"]
    summary = chain.run(content)
    paper["summary"] = summary
    del paper["content"]
    print("# Finished summary for " + paper["source"])
    return paper

In [3]:
directory = './papers'
summary_path = './summaries.json'

pdf_docs = load_papers(directory)
pdf_docs = transform_papers(pdf_docs)

summaries = [summarize(paper) for paper in pdf_docs]

write_dict_to_file(summaries, summary_path)

Multiple definitions in dictionary at byte 0x15fe1 for key /MediaBox
Multiple definitions in dictionary at byte 0x161fd for key /MediaBox
Multiple definitions in dictionary at byte 0x16407 for key /MediaBox
Multiple definitions in dictionary at byte 0x165d9 for key /MediaBox
Multiple definitions in dictionary at byte 0x16873 for key /MediaBox
Multiple definitions in dictionary at byte 0x16a6d for key /MediaBox


# Starting summary for ./papers/Large language models in health care Development, applications, and challenges.pdf
# Finished summary for ./papers/Large language models in health care Development, applications, and challenges.pdf
# Starting summary for ./papers/Leveraging Generative AI and Large Language Models A Comprehensive Roadmap for Healthcare Integration.pdf
# Finished summary for ./papers/Leveraging Generative AI and Large Language Models A Comprehensive Roadmap for Healthcare Integration.pdf
# Starting summary for ./papers/CAN LARGE LANGUAGE MODELS REASON ABOUT MEDICAL QUESTIONS.pdf
# Finished summary for ./papers/CAN LARGE LANGUAGE MODELS REASON ABOUT MEDICAL QUESTIONS.pdf
# Starting summary for ./papers/The practical implementation of artificial intelligence technologies in medicine.pdf
# Finished summary for ./papers/The practical implementation of artificial intelligence technologies in medicine.pdf
# Starting summary for ./papers/Domain-specific language models and lexico