In [1]:
import openai, langchain, os
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter 
from langchain_community.embeddings import SentenceTransformerEmbeddings
from pinecone import Pinecone, ServerlessSpec
from langchain_pinecone import PineconeVectorStore
from langchain_community.llms import OpenAI 
from dotenv import load_dotenv 
from sentence_transformers import SentenceTransformer  

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
model = SentenceTransformer('paraphrase-MiniLM-L3-v2')
embeddings = SentenceTransformerEmbeddings(model_name='paraphrase-MiniLM-L3-v2')

load_dotenv()


  embeddings = SentenceTransformerEmbeddings(model_name='paraphrase-MiniLM-L3-v2')


False

In [3]:
from langchain.document_loaders import DirectoryLoader, TextLoader, CSVLoader, UnstructuredFileLoader
from langchain_community.document_loaders import PyPDFLoader
from pathlib import Path

def load_docs_from_directory(path):
    def custom_loader(file_path_str):
        file_path = Path(file_path_str)
        ext = file_path.suffix.lower()

        loader_map = {
            ".pdf": PyPDFLoader,
            ".txt": TextLoader,
            ".csv": CSVLoader,
        }

        loader_cls = loader_map.get(ext, UnstructuredFileLoader)
        return loader_cls(str(file_path))

    loader = DirectoryLoader(
        path,
        glob="**/*",
        loader_cls=custom_loader
    )
    documents = loader.load()
    print(documents)
    return documents



In [4]:
def chunk_data(docs, chunk_size=800, chunk_overlap=50):
    text_spliter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    doc = text_spliter.split_documents(docs)
    print(type(doc), type(doc[0]), len(doc))
    
    return doc


In [5]:
data = load_docs_from_directory('./static')
doc_list = chunk_data(docs=data)

vector = embeddings.embed_query("any text")
dimension = len(vector)
print(dimension)

[Document(metadata={'source': 'static\\13.txt'}, page_content='An alien called Jasper landed on Planet Mars in the year 1945.\n\nJasper had 3 heads and 6 legs making it almost impossible for his enemy to chase it. \n\n'), Document(metadata={'source': 'static\\20.txt'}, page_content='During friendly encounters and bonding, tigers rub against each other\'s bodies.[97] Facial expressions include the "defence threat", which involves a wrinkled face, bared teeth, pulled-back ears and widened pupils.[98][47] Both males and females show a flehmen response, a characteristic curled-lip grimace, when smelling urine markings. Males also use the flehmen to detect the markings made by tigresses in oestrus.[47] Tigers will move their ears around to display the white spots, particularly during aggressive encounters and between mothers and cubs.[99] They also use their tails to signal their mood. To show cordiality, the tail sticks up and sways slowly, while an apprehensive tiger lowers its tail or wa

In [22]:

api_key=os.getenv("PINECONE_API_KEY")

In [13]:
index_name = "lcvector"

vectorstore_from_docs = PineconeVectorStore.from_documents(
        doc_list,
        index_name=index_name,
        embedding=embeddings
)


In [14]:
def retrieve_query(query, k=2):
    matching_results = vectorstore_from_docs.similarity_search(query, k=k
                                                               )
    
    print(matching_results)
    return matching_results


In [34]:
from langchain.chains.question_answering import load_qa_chain
from langchain_community.llms import Ollama

# Initialize the local LLM via Ollama
llm = Ollama(model="mistral", temperature=0.1)

# Load QA chain
chain = load_qa_chain(llm, chain_type="stuff")

def retreive_answers(query):
    doc_search=retrieve_query(query)
    response= chain.run(input_documents=doc_search, question=query) 
    return response 



In [35]:
our_query= "Who is Jasper from Planet Mars? How many legs it had?"
answer=retreive_answers(our_query)
print(answer)

[Document(id='2005647d-0ed4-4bac-b288-da602b955b47', metadata={'source': 'static\\13.txt'}, page_content='An alien called Jasper landed on Planet Mars in the year 1945.\n\nJasper had 3 heads and 6 legs making it almost impossible for his enemy to chase it.'), Document(id='92363b22-9fe2-4483-8af0-b393ab2715a7', metadata={'source': 'static\\13.txt'}, page_content='An alien called Jasper landed on Planet Mars in the year 1945.\n\nJasper had 3 heads and 6 legs making it almost impossible for his enemy to chase it.')]
 Jasper is an alien who landed on Planet Mars in the year 1945. He has 6 legs.


In [None]:
from langchain.agents import Tool, initialize_agent, AgentType
import random
from datetime import datetime as dt

def juggler(a, b): 
    return [random.randint(a, b) for _ in range(5)]

def daysGetter(date: str) -> int:
    
    if "'" in date:
        date = date.replace("'", "")
    start_date = dt.strptime(date, "%d-%m-%Y")
    current_date = dt.now()
    days_diff = (current_date - start_date).days
    return days_diff
    

juggler_tool = Tool(name= "Juggler", \
                    func=lambda x: juggler( *map( int, x.split(" "))), 
                    description="Pass a single string arguments containing two integers which mean the range (a, b), pass like a b ; where a and b are integers"
                    )


days_getter_tool = Tool(name= "getdays", \
                        func= daysGetter, 
                        description="Pass a single string called date and gets the time difference in days from current date"
                        )


In [None]:
import pandas as pd
from langchain_experimental.agents import create_pandas_dataframe_agent
# from langchain.llms import OpenAI

# Step 1: Read multiple CSVs into a list of DataFrames
csv_files = ["./static/customers1.csv"]
dfs = [pd.read_csv(file) for file in csv_files]

# Step 2: Concatenate all into a single DataFrame (can also merge if needed)
combined_df = pd.concat(dfs, ignore_index=True)

# Step 3: Create a Pandas agent
pd_agent = create_pandas_dataframe_agent(llm, combined_df, verbose=True, allow_dangerous_code=True)



In [96]:
import re

def run_agent_safely(agent, query):
    try:
        return agent.run(query)
    except Exception as e:
        print("Caught Exception:", str(e))

        # Try to extract "Final Answer" from the exception string
        match = re.search(r"Final Answer\s*:\s*(.*)", str(e), re.IGNORECASE | re.DOTALL)
        if match:
            extracted_answer = match.group(1).strip().split('\n')[0]  # Grab only the first line if multiline
            print("Extracted Final Answer:", extracted_answer)
            return extracted_answer
        else:
            print("Could not extract Final Answer from the error.")
            return "An error occurred and no final answer could be recovered."


In [160]:
pd_tool = Tool(
    name="PandasData", 
    func=pd_agent.run, 
    description= ("Use this tool to query customer CSV data loaded into a dataframe. "
        "This includes retrieving or analyzing columns such as 'Subscription Date', 'Customer ID', etc. "
        "Always use this tool to interact with the customer CSV, instead of using raw Python."
    )
)

In [161]:
agent= initialize_agent(
    tools= [pd_tool, juggler_tool, days_getter_tool], 
    llm = llm, 
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION, 
    verbose= True, 
    handle_parsing_errors=True
)

In [171]:
# Step 4: Ask your questions
response = run_agent_safely(agent=agent, query = "Now calculate the days from start date using daysgetter tool for date as 20-03-1976")
print(response)




[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m I need to find the number of days elapsed since a specific date. To do this, I will use the `getdays` function provided.

Action: getdays
Action Input: '20-03-1976'[0mCaught Exception: time data "'20-03-1976'" does not match format '%d-%m-%Y'
Could not extract Final Answer from the error.
An error occurred and no final answer could be recovered.


Deletion from DB code

In [172]:
# idx = pc.Index(index_name)

# # Delete all vectors from the index
# idx.delete(delete_all=True)

# print("All vectors have been deleted from the index.")

In [173]:
# idx = pc.Index(index_name)
# pdf_path = "static\\RahatBhambriResume_alter.pdf"

# idx.delete(
#     filter={
#         # "source": pdf_path
#     }
# )
# print(f"Vectors with source '{pdf_path}' have been deleted.")
