In [90]:
import warnings
warnings.filterwarnings('ignore')

In [91]:
!pip install python-dotenv faiss-cpu pdfplumber pytesseract pdf2image
!pip install groq
!pip install pytesseract
!sudo apt-get update
!sudo apt-get install -y tesseract-ocr
!tesseract --version
!sudo apt-get install -y tesseract-ocr
!pip install pdfminer.six
!pip install -U langchain-community
!pip install pinecone-client
!pip install tavily-python

Get:1 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Hit:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:3 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:5 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Hit:6 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [2,751 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,515 kB]
Fetched 4,527 kB in 2s (2,343 kB/s)
Reading package lists... Done
W: Skipping acquire o

In [106]:
from dotenv import load_dotenv
import os

# Load environment variables from .env file
load_dotenv()

# Access variables
UNSTRUCTURED_API_KEY = os.getenv("UNSTRUCTURED_API_KEY")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY")

# Debug Print
# print(f"Loaded API Keys:\nUNSTRUCTURED_API_KEY: {UNSTRUCTURED_API_KEY}\nPINECONE_API_KEY: {PINECONE_API_KEY}\nGROQ_API_KEY: {GROQ_API_KEY}\nTAVILY_API_KEY: {TAVILY_API_KEY}")


In [93]:
import pinecone
from pinecone import Pinecone, ServerlessSpec, Index
from sentence_transformers import SentenceTransformer

# Initialize Pinecone
pinecone_client = Pinecone(
    api_key=PINECONE_API_KEY,
    environment="us-east1-gcp"
)

# List available indexes
indexes = pinecone_client.list_indexes().names()
print("Indexes:", indexes)

# Create or access the index
if indexes:
    index_name = indexes[0]
    index_description = pinecone_client.describe_index(index_name)
    print(f"Details of index '{index_name}':", index_description)
else:
    pinecone_client.create_index(
        name='finance',
        dimension=384,  # Based on the embedding model
        metric='cosine',
        spec=ServerlessSpec(cloud='aws', region='us-east-1')
    )
    print("Created 'finance' index.")

# Access the index
index = pinecone_client.Index(index_name)

# Load Sentence-BERT model and generate embeddings
embedding_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
file_path = "/content/netsol_financial_statement.json"

# Load JSON document
import json
with open(file_path, "r") as f:
    data = json.load(f)

texts = [item.get('text', '') for item in data if item.get('text')]  # Extract valid text fields
embeddings = embedding_model.encode(texts)

# Upsert documents into Pinecone
for i, embedding in enumerate(embeddings):
    index.upsert(vectors=[(f"doc_{i}", embedding, {"content": texts[i]})])
print("Upsertion completed.")


Indexes: ['finance']
Details of index 'finance': {'deletion_protection': 'disabled',
 'dimension': 384,
 'host': 'finance-zbpd4tt.svc.aped-4627-b74a.pinecone.io',
 'metric': 'cosine',
 'name': 'finance',
 'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
 'status': {'ready': True, 'state': 'Ready'}}
Upsertion completed.


In [95]:
import os
import numpy as np
from pinecone import Index
from groq import Groq
from tavily import TavilyClient
from sklearn.metrics.pairwise import cosine_similarity

# Initialize Pinecone index, Tavily, and Groq API
pinecone_index = Index(
    name='finance',
    host='finance-zbpd4tt.svc.aped-4627-b74a.pinecone.io',
    api_key=PINECONE_API_KEY
)

tavily_client = TavilyClient(api_key=TAVILY_API_KEY)

os.environ["GROQ_API_KEY"] = GROQ_API_KEY
groq_client = Groq(api_key=os.getenv("GROQ_API_KEY"))

# --- Query Router ---
def query_router(query, model, category_embeddings, threshold=0.7):
    query_embedding = model.encode([query])[0]
    similarities = cosine_similarity([query_embedding], category_embeddings)[0]
    best_match_idx = np.argmax(similarities)
    best_match_score = similarities[best_match_idx]
    return "retriever" if best_match_idx == 0 else ("search" if best_match_score >= threshold else "llm")

# --- Search Functions ---
def search_current_events_tavily(user_query):
    try:
        answer = tavily_client.qna_search(query=user_query)
        return answer if answer else "No relevant current events found."
    except Exception as e:
        return f"Error searching current events with Tavily: {e}"

def retrieve_context(query, index, model, threshold=0.7):
    query_embedding = model.encode([query])[0]
    results = index.query(vector=query_embedding.tolist(), top_k=5, include_metadata=True)
    filtered_results = [
        match['metadata']['content']
        for match in results['matches']
        if match['score'] >= threshold
    ]
    return "\n".join(filtered_results) if filtered_results else "No relevant information found."

def get_response_from_lama(prompt: str, model: str = "llama3-8b-8192"):
    try:
        chat_completion = groq_client.chat.completions.create(
            messages=[{"role": "user", "content": prompt}],
            model=model,
            temperature=0.7,
        )
        return chat_completion.choices[0].message.content.strip()
    except Exception as e:
        raise ValueError(f"Groq API call failed: {e}")

# --- Query Processing ---
def process_query(user_query):
    print(f"Processing Query: {user_query}")
    route = query_router(user_query, embedding_model, embeddings)
    print(f"Query routed to: {route}")

    if route == "retriever":
        context = retrieve_context(user_query, pinecone_index, embedding_model)
        if "No relevant information" in context:
            return "No relevant information found in the financial report."

        prompt = f"Context:\n{context}\n\nUser Query: {user_query}\nAnswer directly:"
        return get_response_from_lama(prompt)

    elif route == "search":
        return search_current_events_tavily(user_query)

    else:
        prompt = f"User Query: {user_query}\nAnswer:"
        return get_response_from_lama(prompt)

# --- Run the Chatbot ---
test_queries = [
    "Who is the Chairman of NetSol?",
    "What awards has NETSOL Technologies won?",
    "What are the core products offered by NETSOL?",
    "Who is the President of the USA?",
]

for query in test_queries:
    response = process_query(query)
    print(f"Query: {query}\nResponse: {response}\n{'-'*50}")


Processing Query: Who is the Chairman of NetSol?
Query routed to: search
Query: Who is the Chairman of NetSol?
Response: The Chairman of NetSol Technologies is Najeeb Ghauri.
--------------------------------------------------
Processing Query: What awards has NETSOL Technologies won?
Query routed to: search
Query: What awards has NETSOL Technologies won?
Response: NETSOL Technologies has won several awards, including two Teradata National IT Excellence Awards, with Mr. Sajjad Kirmani recognized as "CIO of the Year" for his contributions to the company. Additionally, in China, Amanda Li Linjie, President of NETSOL China, received the Automotive Finance New Productivity Innovation Pioneer Award and the China Financial Leasing Excellence Service Award. Furthermore, NETSOL was awarded the "First Rate and Best Selling Finance and Leasing Solution Provider" for four consecutive years in China.
--------------------------------------------------
Processing Query: What are the core products off

In [89]:
!pip install gradio
import gradio as gr
def chatbot_response(user_query):
    return process_query(user_query)

iface = gr.Interface(fn=chatbot_response, inputs="text", outputs="text")
iface.launch(share=True)

Collecting gradio
  Downloading gradio-5.8.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.6-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.5.1 (from gradio)
  Downloading gradio_client-1.5.1-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.19-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.8.2-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metad

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://9e9ff175e009f7087a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [99]:
!pip freeze > /content/requirements.txt