<a href="https://colab.research.google.com/github/preetamjumech/LLM/blob/main/Summarize_HUGE_Documents_Locally!_(Langchain_%2B_Ollama_%2B_KmeansClusteringFilter)_08_11_2024.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the packages this notebook imports (IPython shell escapes, one per line).
# langchain: load_summarize_chain (langchain.chains.summarize)
!pip install langchain
# langchain-community: PyPDFLoader, EmbeddingsClusteringFilter, HuggingFaceBgeEmbeddings
!pip install langchain-community
# langchain-ollama: ChatOllama
!pip install langchain-ollama
# Not imported directly; presumably the backend for HuggingFaceBgeEmbeddings -- TODO confirm
!pip install sentence-transformers
# langchain-text-splitters: RecursiveCharacterTextSplitter
!pip install langchain-text-splitters
# Not imported directly; presumably the PDF parser used by PyPDFLoader -- TODO confirm
!pip install pypdf

In [None]:
from langchain_ollama import ChatOllama
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain.chains.summarize import load_summarize_chain
from langchain_community.document_transformers import EmbeddingsClusteringFilter
from langchain_community.embeddings import HuggingFaceBgeEmbeddings


def extract(file_path, chunk_size=2000, chunk_overlap=0):
    """Load a PDF and split its pages into chunks.

    Args:
        file_path: Path of the PDF file, handed to ``PyPDFLoader``.
        chunk_size: Chunk size forwarded to
            ``RecursiveCharacterTextSplitter``. Defaults to 2000, the value
            this function previously hard-coded.
        chunk_overlap: Overlap between consecutive chunks, forwarded to the
            splitter. Defaults to 0 (no overlap), as before.

    Returns:
        The chunked documents returned by ``split_documents``.
    """
    pages = PyPDFLoader(file_path).load()
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(pages)


def summarize_document_with_kmeans_clustering(file, llm, embeddings, num_clusters=10):
    """Summarize a PDF by clustering its chunks and summarizing the picks.

    The document is split into chunks with ``extract``, the chunks are passed
    through ``EmbeddingsClusteringFilter`` to reduce them to a small set, and
    that set is summarized in a single "stuff" chain call.

    Args:
        file: Path of the PDF file to summarize.
        llm: Chat model handed to ``load_summarize_chain``.
        embeddings: Embeddings object handed to ``EmbeddingsClusteringFilter``.
        num_clusters: Upper bound on the number of clusters. Defaults to 10,
            the value previously hard-coded. It is lowered to the number of
            chunks for short documents.

    Returns:
        The summary text. If clustering or summarization raises, the
        exception message is returned as a string instead (existing
        behavior, kept so callers that print the result keep working).
        If no chunks could be extracted, an explanatory message is returned.
    """
    texts = extract(file)
    if not texts:
        # Nothing to cluster (e.g. a PDF with no extractable text); clustering
        # zero samples would only surface as an opaque library error.
        return "No text could be extracted from the document."

    # Clustering cannot produce more clusters than there are chunks, so clamp
    # the requested count for short documents.
    clustering_filter = EmbeddingsClusteringFilter(
        embeddings=embeddings,
        num_clusters=min(num_clusters, len(texts)),
    )

    try:
        selected_docs = clustering_filter.transform_documents(documents=texts)
        summarize_chain = load_summarize_chain(llm, chain_type="stuff")
        # ``Chain.run`` is deprecated; ``invoke`` takes the documents under
        # "input_documents" and returns the summary under "output_text".
        output = summarize_chain.invoke({"input_documents": selected_docs})
        return output["output_text"]
    except Exception as e:
        return str(e)

# Embedding model configuration used to build the clustering embeddings.
model_name = "BAAI/bge-base-en-v1.5"
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}  # set True to compute cosine similarity

model = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs,
)

# Chat model served by Ollama; temperature 0 is passed through as configured.
llm = ChatOllama(
    model="llama3.1",
    temperature=0,
)

# Raw string: the Windows path contains "\P", "\p" and "\I", which are invalid
# escape sequences in a normal literal (SyntaxWarning on Python 3.12+). The
# resulting path value is unchanged.
print(summarize_document_with_kmeans_clustering(r"E:\Preetam\pdf summarization\IBM_Annual_Report_2020.pdf", llm, model))