In [71]:
import os
import dotenv
from dotenv import load_dotenv
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_openai import AzureOpenAIEmbeddings, AzureChatOpenAI
from langchain_experimental.text_splitter import SemanticChunker
from langchain.vectorstores import Chroma
from langchain_cohere import CohereRerank
from langchain_core.vectorstores import VectorStoreRetriever
from langchain.chains import RetrievalQA
from langchain.agents import Tool, initialize_agent
import warnings
from ragatouille import RAGPretrainedModel
from langchain.tools import tool
from langchain.retrievers import ContextualCompressionRetriever
import warnings

warnings.simplefilter("ignore", FutureWarning)
warnings.simplefilter("ignore", UserWarning)
warnings.filterwarnings("ignore")



# Initialize the GPT-4 model with Azure configuration 


In [72]:

gpt_4 = AzureChatOpenAI(
    azure_deployment= "MagicPlatform-GPT4-O-GS-Test",
    api_version= "2023-12-01-preview",
    temperature= 0.1,
    model="gpt-4o",
    azure_endpoint="https://magicplatform-gpt4-o-gs-test-canadaeast.openai.azure.com/",
    api_key="***************"
)

# Configuration dictionary for OpenAI embeddings model


In [73]:


OPENAI_MODEL_ARGS = {
"azure_deployment": "MagicPlatform-TextEmbedding-ADA-Test",
"api_version": "2024-03-01-preview",
"model": "text-embedding-ada-002",
"azure_endpoint": "https://magicplatform-textembedding-ada-test-australiaeast.openai.azure.com/",
"api_key": "***************"
}

embedding_model = AzureOpenAIEmbeddings(**OPENAI_MODEL_ARGS)


# Function to load text documents from a specified directory


In [74]:
def load_documents():
    
    loader = DirectoryLoader('data1/', glob='*.txt', loader_cls=TextLoader)
    docs = loader.load()
    print(f"Loaded {len(docs)} documents.")
    return docs

# Function to split documents into chunks for better processing


In [75]:
def split_into_chunks(docs, embedding_model):
    text_splitter = SemanticChunker(embedding_model)
    docs = text_splitter.create_documents([doc.page_content for doc in docs])
    return docs

# Function to create a Chroma database from document chunks


In [76]:
def db(docs, embedding_model):
    db_chroma = Chroma.from_documents(docs, embedding_model)
    return db_chroma
    

# Function to create a retriever using Cohere's reranking


In [77]:
def cohere_ranker(db_chroma):
    reranker = CohereRerank(cohere_api_key="***************", model="rerank-english-v3.0")
    retriver = VectorStoreRetriever(vectorstore=db_chroma, reranker=reranker)
    
    return retriver

# Load documents, split them into chunks, and create a Chroma database


In [78]:
docs = load_documents()
chunks = split_into_chunks(docs, embedding_model)
db_chroma = db(chunks, embedding_model)
db_chroma.add_documents(chunks)

Loaded 6 documents.


['a1c19cf6-4773-4213-b635-19cc14023839',
 '239ee455-bf18-4100-bf99-4e0a39d885f6',
 'e2a8ccaf-1499-41f9-9e79-d664942a31be',
 '5d6b68cd-12b6-44c1-94ce-23d29005ed3d',
 '299659e6-854e-4250-9125-47d6a552e40c',
 '48d94824-1455-47e9-8999-5615cca7fe98',
 '3b511e2b-51de-4a5c-9639-b01e1520e2f0',
 '25692cb6-de7b-4bc5-80af-fd1ff8adb96f',
 'ab3b5e83-948e-41bc-a997-9f8bdd5a135c',
 '92067f4c-5da6-4d2b-a789-14ac4759b1a1',
 'c80792a8-e9d1-459e-b34c-554a7776f9ef',
 'b258f9a5-8cc3-46af-aa5f-6600dfe51b0a',
 'c2b62c5c-d074-4793-8e7a-adb120fa3e43',
 '4e3b66af-c24b-49ae-b89e-221d4b2e2bf5',
 '8d5a5619-f73c-4dde-a7d9-897dd7666700',
 '85737638-683f-430d-9125-220602a21824']

# Tool for classifying the content of a given file


In [79]:
@tool
def classify(file_name):

    """
    Classify the content of a file as either an email or an invoice.

    Args:
        file_name (str): The name of the file to classify.

    Returns:
        str: Classification result ("email" or "invoice").
    """

    prompt = "extract the file named with " + file_name
    retriever = cohere_ranker(db_chroma)
    retrieved_docs = retriever.invoke(prompt)
    
    top_doc = retrieved_docs[0]
    

    gpt_prompt = (
        prompt + top_doc.page_content + 
        " classify the above content is from an invoice (email can contain the following: date, from, to, general greetings, general mail body) "
        "or invoice (invoice can contain the following: total, amount, amt, tax, item names, and general invoice information). "
        "give answer in one word (email or invoice)"
    )

    response = gpt_4.invoke(gpt_prompt)
    return response.content


# Load the RAG pretrained model for document compression


In [80]:
RAG = RAGPretrainedModel.from_pretrained('colbert-ir/colbertv2.0')

# Function to create a contextual retriever based on ColBERT


In [81]:
def get_colbert_retriever(db):
    base_retriever = db.as_retriever(
        search_type="mmr",
        search_kwargs={"k":3}
    )
    compression_retriever = ContextualCompressionRetriever(
        base_compressor=RAG.as_langchain_document_compressor(k=3),
        base_retriever=base_retriever
    )
    return compression_retriever

# Tool for extracting specific entities from a classified document


In [82]:
@tool
def extract_ent(file_name, entity_query):
    """
    Extract entities from the specified email file based on the entity query.

    Args:
        file_name (str): The name of the email file.
        entity_query (str): The query specifying which entities to extract.

    Returns:
        str: A string with required entities.
    """


    retriever = get_colbert_retriever(db_chroma)
    
    qa_chain = RetrievalQA.from_chain_type(
        llm=gpt_4,
        chain_type="stuff",
        retriever=retriever,
        return_source_documents=True
    )
    
    enhanced_query = f"Extract the following entities: {entity_query} from the document: {file_name}. Provide a clear and structured output."

    
    result = qa_chain({'query':enhanced_query})
    return result['result']

    

# Initialize a conversational agent with specified tools and model


In [84]:
from langchain.agents import Tool
from langchain.tools import BaseTool
from langchain.agents import initialize_agent



agent_prompt = "given the file_name and entity_query, use classify tool to classify if the file is email or invoice. if the file is invoice use extract_ent tool to get entities asked in entity_query"


tools = [classify, extract_ent]

conversational_agent = initialize_agent(
    agent='structured-chat-zero-shot-react-description',
    tools=tools,
    llm=gpt_4,
    verbose=True,
    max_iterations=4,
    early_stopping_method='generate',
)

# Execute the agent with a specific file name and query


In [85]:
conversational_agent("file name is Myntra_invoice.txt classify it and if the file is invoice then extract the required entity. Give Invoice Details")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mAction:
```
{
  "action": "classify",
  "action_input": {
    "file_name": "Myntra_invoice.txt"
  }
}
```[0m
Observation: [36;1m[1;3mInvoice[0m
Thought:[32;1m[1;3mAction:
```
{
  "action": "extract_ent",
  "action_input": {
    "file_name": "Myntra_invoice.txt",
    "entity_query": "Invoice Details"
  }
}
```[0m

100%|██████████| 1/1 [00:00<00:00,  3.12it/s]



Observation: [33;1m[1;3mSure, here are the extracted invoice details from the document Myntra_invoice.txt:

**Invoice Details:**

- **Invoice Number:** MNT-987654
- **Date:** 2024-11-01
- **Customer:** Alice Johnson
- **Address:** 456 Style Ave, Fashion City

**Items:**

1. **Item:** T-Shirt
   - **Quantity:** 2
   - **Unit Price:** $20.00
   - **Total:** $40.00

2. **Item:** Jeans
   - **Quantity:** 1
   - **Unit Price:** $60.00
   - **Total:** $60.00

3. **Item:** Sneakers
   - **Quantity:** 1
   - **Unit Price:** $80.00
   - **Total:** $80.00

**Total Amount:** $180.00

**Notes:** Thank you for shopping at Myntra! Returns are accepted within 15 days.[0m
Thought:[32;1m[1;3mAction:
```
{
  "action": "Final Answer",
  "action_input": "Sure, here are the extracted invoice details from the document Myntra_invoice.txt:\n\n**Invoice Details:**\n\n- **Invoice Number:** MNT-987654\n- **Date:** 2024-11-01\n- **Customer:** Alice Johnson\n- **Address:** 456 Style Ave, Fashion City\n\n**It

{'input': 'file name is Myntra_invoice.txt classify it and if the file is invoice then extract the required entity. Give Invoice Details',
 'output': 'Sure, here are the extracted invoice details from the document Myntra_invoice.txt:\n\n**Invoice Details:**\n\n- **Invoice Number:** MNT-987654\n- **Date:** 2024-11-01\n- **Customer:** Alice Johnson\n- **Address:** 456 Style Ave, Fashion City\n\n**Items:**\n\n1. **Item:** T-Shirt\n   - **Quantity:** 2\n   - **Unit Price:** $20.00\n   - **Total:** $40.00\n\n2. **Item:** Jeans\n   - **Quantity:** 1\n   - **Unit Price:** $60.00\n   - **Total:** $60.00\n\n3. **Item:** Sneakers\n   - **Quantity:** 1\n   - **Unit Price:** $80.00\n   - **Total:** $80.00\n\n**Total Amount:** $180.00\n\n**Notes:** Thank you for shopping at Myntra! Returns are accepted within 15 days.'}