In [1]:
# Marker cell: prints a fixed label for this revision of the notebook.
print('Final Version')

Final Version


In [2]:
!pip install -qU rapidocr-onnxruntime langchain_community pypdf langchain_text_splitters faiss-cpu nse-live-stocks langchain_huggingface langchain_groq langchain_community pypdf2 requests beautifulsoup4 langchain_text_splitters


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.0.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
import os
import re
import requests
from langchain_groq import ChatGroq
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import FakeEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader, PDFMinerLoader
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
import datetime

USER_AGENT environment variable not set, consider setting it to identify your requests.


In [4]:
# Folder (relative to the working directory) that downloaded PDFs are written to.
DOWNLOAD_DIR = "pdf_downloads"
# Create the folder up front; exist_ok=True makes re-running this cell harmless.
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

In [5]:
def scrape_screener_pdfs(company_url, timeout=30):
    """Collect the PDF links listed in the ``.documents`` section of a page.

    Args:
        company_url: URL of the company page to scrape.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a stalled connection.

    Returns:
        A list of ``(href, filename)`` tuples. ``filename`` is the link text with
        newlines and spaces replaced by underscores plus a ``.pdf`` suffix, or
        ``"Document.pdf"`` when the link has no text.

    Raises:
        requests.RequestException: If the page cannot be fetched or the server
            answers with an HTTP error status (an error page has no usable links).
    """
    print(f"Scraping: {company_url}")
    headers = {"User-Agent": "Mozilla/5.0"}

    response = requests.get(company_url, headers=headers, timeout=timeout)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    pdf_links = []

    for a in soup.select(".documents a"):
        href = (a.get("href") or "").strip()
        # Case-insensitive so links ending in ".PDF" are not dropped.
        if not href.lower().endswith(".pdf"):
            continue

        text = a.text.strip().replace("\n", "_").replace(" ", "_")
        if not text:
            text = "Document"
        pdf_links.append((href, text + ".pdf"))

    print(f"Found {len(pdf_links)} PDF links.\n")
    return pdf_links

In [6]:
def classify_transcript_or_ppt(pdf_path):
    """Guess from the first page whether a PDF is a transcript or a presentation.

    Only the first 800 characters of the first page's extracted text are
    inspected, case-insensitively.

    Args:
        pdf_path: Path of the PDF file to inspect.

    Returns:
        ``"transcript"`` if the text mentions "transcript" or "earnings call",
        ``"presentation"`` if it mentions "presentation", otherwise ``None``.
        ``None`` is also returned when the file cannot be opened or read.
    """
    try:
        reader = PdfReader(pdf_path)
        # "or ''" guards against a page for which no text is returned.
        text = (reader.pages[0].extract_text() or "")[:800].lower()
    except Exception:
        # Best-effort classification: an unreadable file is simply "unknown".
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        return None

    if "transcript" in text or "earnings call" in text:
        return "transcript"
    # "investor presentation" contains "presentation", so one test covers both.
    if "presentation" in text:
        return "presentation"

    return None


In [7]:
def maybe_rename_transcript_or_ppt(saved_path):
    """Rename a downloaded transcript/presentation to a numbered filename.

    Files that ``classify_transcript_or_ppt`` does not recognise are left alone.
    Recognised files are renamed to ``<name>_<n><ext>`` where ``n`` is the first
    positive integer not already used in the same folder. The counter is added
    even when there is no clash, which frees the original name for a later
    download (presumably because these links share generic names — confirm).

    Args:
        saved_path: Path of the file that was just downloaded.

    Returns:
        The new path when the file was renamed, otherwise ``saved_path``.
    """
    doc_type = classify_transcript_or_ppt(saved_path)

    if not doc_type:
        print("   ‚Üí Not transcript/presentation. Keeping original filename.\n")
        return saved_path

    # Split the original filename into folder, stem and extension.
    folder = os.path.dirname(saved_path)
    base = os.path.basename(saved_path)
    name, ext = os.path.splitext(base)

    # Find the first numbered filename that does not exist yet.
    counter = 1
    while True:
        new_name = f"{name}_{counter}{ext}"
        new_path = os.path.join(folder, new_name)
        if not os.path.exists(new_path):
            break
        counter += 1

    try:
        os.rename(saved_path, new_path)
    except OSError:
        # Only filesystem errors are expected here; anything else (including
        # KeyboardInterrupt) should not be swallowed.
        print("   ‚ö† Rename failed. Keeping original.\n")
        return saved_path

    print(f"   ‚úî Transcript/PPT detected ‚Üí renamed to {new_name}\n")
    return new_path


In [8]:
def download_bse_annpdf(url, download_dir, filename, timeout=30):
    """Download a PDF served through a BSE ``AnnPdfOpen.aspx`` URL.

    Args:
        url: URL to request.
        download_dir: Folder the file is written to.
        filename: Name of the file inside ``download_dir``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The path of the saved file, or ``None`` when the request fails or the
        server answers with an HTTP error status (nothing is written then, so
        an error page is never stored as if it were a PDF).
    """
    print(f"URL: {url}")
    print(f"[BSE-ANNPDF] Requesting: {url}")

    try:
        r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException as exc:
        # One failed document should not abort a whole batch of downloads.
        print(f"[BSE-ANNPDF] Download failed: {exc}")
        return None

    savepath = os.path.join(download_dir, filename)

    with open(savepath, "wb") as f:
        f.write(r.content)

    print(f"‚úî Saved BSE AnnPdf: {savepath}")
    return savepath

In [9]:
def download_bse_iframe_pdf(url, download_dir, filename, timeout=30):
    """Download the PDF embedded in the first ``<iframe>`` of a BSE page.

    The page at ``url`` is fetched, the first ``<iframe>`` is located and the
    document its ``src`` points at is saved.

    Args:
        url: URL of the page that embeds the PDF.
        download_dir: Folder the file is written to.
        filename: Name of the file inside ``download_dir``.
        timeout: Seconds to wait for each request before giving up.

    Returns:
        The path of the saved file, or ``None`` when the page has no usable
        iframe or either request fails.
    """
    # Local import keeps this block self-contained (urllib is standard library).
    from urllib.parse import urljoin

    print(f"URL: {url}")
    print("   [BSE-IFRAME] Requesting main page‚Ä¶")

    request_headers = {"User-Agent": "Mozilla/5.0"}

    try:
        page = requests.get(url, headers=request_headers, timeout=timeout)
        page.raise_for_status()
    except requests.RequestException as exc:
        print(f"   [BSE-IFRAME] Page request failed: {exc}")
        return None

    soup = BeautifulSoup(page.text, "html.parser")
    iframe = soup.find("iframe")

    if not iframe:
        print("   ‚ùå No iframe found. Cannot download.")
        return None

    # An <iframe> without a src attribute previously crashed on None.startswith.
    src = (iframe.get("src") or "").strip()
    if not src:
        print("   [BSE-IFRAME] iframe has no src attribute. Cannot download.")
        return None

    # Resolve relative sources against the page URL. For a root-relative src on
    # a www.bseindia.com page this gives the same URL as a hard-coded host
    # prefix, and it also handles "path/file.pdf" and "//host/file.pdf" forms.
    real_pdf = urljoin(url, src)

    print(f"   ‚Üí PDF Source: {real_pdf}")

    try:
        r = requests.get(real_pdf, headers=request_headers, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException as exc:
        print(f"   [BSE-IFRAME] PDF request failed: {exc}")
        return None

    savepath = os.path.join(download_dir, filename)

    with open(savepath, "wb") as f:
        f.write(r.content)

    print(f"   ‚úî Saved BSE iframe PDF: {savepath}")
    return savepath


In [10]:
def download_direct_pdf(url, download_dir, filename, timeout=30):
    """Download a PDF from a URL that serves the file directly.

    Args:
        url: URL of the PDF.
        download_dir: Folder the file is written to.
        filename: Name of the file inside ``download_dir``.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The path of the saved file, or ``None`` when the request fails or the
        server answers with an HTTP error status (nothing is written then).
    """
    print(f"[DIRECT] Downloading: {url}")

    try:
        r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}, timeout=timeout)
        r.raise_for_status()
    except requests.RequestException as exc:
        # One failed document should not abort a whole batch of downloads.
        print(f"[DIRECT] Download failed: {exc}")
        return None

    savepath = os.path.join(download_dir, filename)

    with open(savepath, "wb") as f:
        f.write(r.content)

    print(f"‚úî Saved direct PDF: {savepath}")
    return savepath

In [11]:
def clean_filename(name, max_length=180):
    """Turn arbitrary text into a filename that is safe to create on disk.

    Characters that are path separators or otherwise reserved
    (``\\ / : * ? " < > | ; ,``) become underscores, runs of underscores are
    collapsed, and leading/trailing spaces and dots are stripped.

    Args:
        name: Proposed filename, e.g. link text plus ``".pdf"``.
        max_length: Maximum length of the result. When the name has to be
            shortened, a short trailing extension such as ``.pdf`` is kept so
            the file is still recognised by its suffix.

    Returns:
        The sanitised filename, at most ``max_length`` characters long.
    """
    # The backslash is listed explicitly: "\/" in a character class only
    # matches "/", so backslashes used to pass through unchanged.
    name = re.sub(r'[\\/:*?"<>|;,]', '_', name)
    name = re.sub(r"_+", "_", name)
    name = name.strip(" .")

    if len(name) > max_length:
        # Shorten the stem, not the extension: a plain slice would cut ".pdf" off.
        ext_match = re.search(r"\.[A-Za-z0-9]{1,8}$", name)
        ext = ext_match.group(0) if ext_match else ""
        keep = max(max_length - len(ext), 0)
        name = (name[:keep] + ext)[:max_length] if keep else name[:max_length]

    return name

In [12]:
def download_pdf(url, filename):
    """Download one PDF with the strategy that matches its URL.

    The filename is sanitised first. URLs containing ``AnnPdfOpen.aspx`` or
    ``xml-data/corpfiling`` go to the matching BSE downloader; everything else
    is fetched directly. A successful download is then passed to
    ``maybe_rename_transcript_or_ppt``.

    Args:
        url: URL of the document.
        filename: Proposed filename (sanitised with ``clean_filename``).

    Returns:
        The final path of the file, or ``None`` when the downloader reported
        failure by returning ``None``.
    """
    filename = clean_filename(filename)

    # Pick the downloader once instead of repeating the download/rename tail.
    if "AnnPdfOpen.aspx" in url:
        downloader = download_bse_annpdf
    elif "xml-data/corpfiling" in url:
        downloader = download_bse_iframe_pdf
    else:
        downloader = download_direct_pdf

    saved = downloader(url, DOWNLOAD_DIR, filename)

    # A downloader returns None when nothing was saved; do not try to classify
    # or rename a file that does not exist.
    if saved is None:
        return None

    return maybe_rename_transcript_or_ppt(saved)


In [13]:
def run(company_url):
    """Scrape a company page and download every PDF it links to.

    Args:
        company_url: URL passed to ``scrape_screener_pdfs``.
    """
    for pdf_url, pdf_name in scrape_screener_pdfs(company_url):
        # Visual separator between the log output of consecutive downloads.
        print("-----------------------------------------")
        print(f"Downloading: {pdf_name}")
        download_pdf(pdf_url, pdf_name)


In [14]:
# Company page whose linked documents are downloaded.
url = 'https://www.screener.in/company/JIOFIN/consolidated/'

# Scrape the page and download each linked PDF.
run(url)

Scraping: https://www.screener.in/company/JIOFIN/consolidated/
Found 26 PDF links.

-----------------------------------------
Downloading: Announcement_under_Regulation_30_(LODR)-Acquisition_________________________1d_-_JLSL_subscribed_4,50,00,000_8.1%_OCPS_for_Rs45_crore_in_RILIPL;_aggregate_investment_Rs166.55_crore..pdf
URL: https://www.bseindia.com/stockinfo/AnnPdfOpen.aspx?Pname=39c74549-8916-4e1b-a22e-f34f27fb9a2f.pdf
[BSE-ANNPDF] Requesting: https://www.bseindia.com/stockinfo/AnnPdfOpen.aspx?Pname=39c74549-8916-4e1b-a22e-f34f27fb9a2f.pdf
‚úî Saved BSE AnnPdf: pdf_downloads/Announcement_under_Regulation_30_(LODR)-Acquisition_1d_-_JLSL_subscribed_4_50_00_000_8.1%_OCPS_for_Rs45_crore_in_RILIPL_aggregate_investment_Rs166.55_crore..pdf
   ‚Üí Not transcript/presentation. Keeping original filename.

-----------------------------------------
Downloading: Announcement_under_Regulation_30_(LODR)-Analyst_/_Investor_Meet_-_Intimation_________________________11_Nov_-_The_Company_executives_

unknown widths : 
[3, 3, 226, 4, 4, 578, 17, 17, 543, 18, 18, 533, 28, 28, 488, 38, 38, 459, 44, 44, 623, 47, 47, 251, 58, 58, 318, 60, 60, 519, 62, 62, 420, 68, 68, 854, 69, 69, 645, 75, 75, 662, 87, 87, 516, 90, 90, 542, 94, 94, 459, 116, 116, 889, 258, 258, 479, 271, 271, 525, 272, 272, 422, 282, 282, 525, 286, 286, 497, 296, 296, 305, 336, 336, 470, 346, 346, 525, 349, 349, 229, 361, 361, 239, 364, 364, 454, 367, 367, 229, 373, 373, 798, 374, 374, 525, 381, 381, 527, 393, 393, 525, 396, 396, 348, 400, 400, 391, 410, 410, 334, 437, 437, 525, 448, 448, 451, 449, 449, 714, 454, 454, 433, 455, 455, 452, 853, 853, 249, 855, 855, 267, 856, 856, 252, 882, 882, 306, IndirectObject(113, 0, 128550415060656), IndirectObject(113, 0, 128550415060656), 303, 895, 895, 303, 923, 923, IndirectObject(113, 0, 128550415060656), 1004, 1004, 506, 1005, 1005, 506, 1006, 1006, 506, 1007, 1007, 506, 1008, 1008, 506, 1009, 1009, 506, 1010, 1010, 506, 1012, 1012, 506, 1013, 1013, 506, 1085, 1085, 498]


‚úî Saved BSE AnnPdf: pdf_downloads/Transcript.pdf
   ‚úî Transcript/PPT detected ‚Üí renamed to Transcript_14.pdf

-----------------------------------------
Downloading: PPT.pdf
URL: https://www.bseindia.com/stockinfo/AnnPdfOpen.aspx?Pname=eaaf1f2b-9a61-4931-8d3f-9dec05e92c88.pdf
[BSE-ANNPDF] Requesting: https://www.bseindia.com/stockinfo/AnnPdfOpen.aspx?Pname=eaaf1f2b-9a61-4931-8d3f-9dec05e92c88.pdf


unknown widths : 
[3, 3, 226, 4, 4, 578, 17, 17, 543, 18, 18, 533, 28, 28, 488, 38, 38, 459, 44, 44, 623, 47, 47, 251, 58, 58, 318, 60, 60, 519, 62, 62, 420, 68, 68, 854, 69, 69, 645, 75, 75, 662, 87, 87, 516, 90, 90, 542, 94, 94, 459, 116, 116, 889, 258, 258, 479, 271, 271, 525, 272, 272, 422, 282, 282, 525, 286, 286, 497, 296, 296, 305, 336, 336, 470, 346, 346, 525, 349, 349, 229, 361, 361, 239, 364, 364, 454, 367, 367, 229, 373, 373, 798, 374, 374, 525, 381, 381, 527, 393, 393, 525, 396, 396, 348, 400, 400, 391, 410, 410, 334, 437, 437, 525, 448, 448, 451, 449, 449, 714, 454, 454, 433, 455, 455, 452, 853, 853, 249, 855, 855, 267, 856, 856, 252, 882, 882, 306, IndirectObject(342, 0, 128550415058208), IndirectObject(342, 0, 128550415058208), 303, 895, 895, 303, 923, 923, IndirectObject(342, 0, 128550415058208), 1004, 1004, 506, 1005, 1005, 506, 1006, 1006, 506, 1007, 1007, 506, 1008, 1008, 506, 1009, 1009, 506, 1010, 1010, 506, 1012, 1012, 506, 1013, 1013, 506, 1085, 1085, 498]


‚úî Saved BSE AnnPdf: pdf_downloads/PPT.pdf
   ‚úî Transcript/PPT detected ‚Üí renamed to PPT_13.pdf



In [30]:
def delete_old_pdfs(folder="pdf_downloads/", max_age_days=365):
    """Delete PDFs whose embedded metadata date is older than ``max_age_days``.

    The age of a file is taken from its PDF metadata (``/CreationDate``, with
    ``/ModDate`` as fallback), not from the filesystem. Files without a usable
    date, and files that cannot be read, are kept.

    Args:
        folder: Folder to scan. Only names ending in ``.pdf`` are considered.
        max_age_days: Files strictly older than this many days are removed.

    Returns:
        A summary string of the form ``'Deleted N old PDF(s).'``.
    """

    def parse_pdf_date(date_str):
        """Return a naive ``datetime`` for a PDF metadata date, or ``None``."""
        date_str = str(date_str)

        # Format type 1: D:20181012055359+05'30'
        # Only the 14 date/time digits are parsed; the UTC offset is ignored,
        # which is well within the day-level precision used below.
        if date_str.startswith("D:"):
            try:
                return datetime.datetime.strptime(date_str[2:16], "%Y%m%d%H%M%S")
            except ValueError:
                pass

        # Format type 2: Fri 12 Oct 2018 05:53:59 PM +05:30
        try:
            dt = datetime.datetime.strptime(date_str, "%a %d %b %Y %I:%M:%S %p %z")
            return dt.replace(tzinfo=None)
        except ValueError:
            return None

    now = datetime.datetime.now()
    deleted_files = 0

    for filename in os.listdir(folder):
        if not filename.lower().endswith(".pdf"):
            continue

        file_path = os.path.join(folder, filename)

        try:
            reader = PdfReader(file_path)
            # A PDF may carry no metadata at all; treat that like "no date".
            metadata = reader.metadata or {}

            raw_dates = [
                metadata[key]
                for key in ("/CreationDate", "/ModDate")
                if key in metadata
            ]
            if not raw_dates:
                print(f"‚ö† No metadata date found for {filename}, skipping.")
                continue

            # Use the first date that parses, so an unreadable /CreationDate
            # still falls back to /ModDate.
            pdf_date = None
            for raw_date in raw_dates:
                pdf_date = parse_pdf_date(raw_date)
                if pdf_date:
                    break

            if not pdf_date:
                print(f"‚ö† Could not parse date for {filename}, skipping.")
                continue

            age_days = (now - pdf_date).days

            if age_days > max_age_days:
                print(f"üóë Deleting: {filename} (Age: {age_days} days)")
                os.remove(file_path)
                deleted_files += 1
            else:
                print(f"‚úî Keeping: {filename} (Age: {age_days} days)")

        except Exception as e:
            # Best effort: a file that cannot be read is reported and kept.
            print(f"‚ùå Error reading {filename}: {e}")

    print(f"\n‚úÖ Done. Deleted {deleted_files} old PDF(s).")

    return f'Deleted {deleted_files} old PDF(s).'

In [32]:
# Splitter shared by the PDF and URL loaders (chunk_size=500, chunk_overlap=50).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
# NOTE(review): FakeEmbeddings is a placeholder embedding class; its vectors are
# presumably not derived from the text's meaning, in which case similarity search
# over this index cannot rank chunks by relevance. Confirm against the LangChain
# docs and swap in a real embedding model before relying on retrieval quality.
embeddings = FakeEmbeddings(size=1352)


# Module-level handle to the FAISS index; None until an index has been built.
vector_store = None
# Counter that create_vector_store increments once per file added to the index.
count = 0

def create_vector_store(folder="pdf_downloads/"):
    """Build the FAISS index from every PDF in ``folder`` and save it to disk.

    Each PDF is loaded (PyPDFLoader first, PDFMinerLoader as fallback), split
    with the module-level ``text_splitter`` and added to the index. The result
    is stored in the module-level ``vector_store`` and written to
    ``"faiss_index"``; ``count`` holds the number of files that were added.

    Args:
        folder: Folder containing the PDFs. Other file types are ignored.

    Returns:
        The FAISS vector store, or ``None`` when no PDF produced any chunks.
    """
    # Both names are assigned below. Without this declaration Python treats
    # them as locals and the first read raises UnboundLocalError.
    global vector_store, count

    # Start from scratch so re-running the cell does not add every chunk twice.
    vector_store = None
    count = 0

    for filename in os.listdir(folder):
        if not filename.lower().endswith(".pdf"):
            continue

        file_path = os.path.join(folder, filename)
        print(f"\nüìÑ Processing: {filename}")

        # Robust PDF loading: try the primary loader, then the fallback.
        try:
            try:
                loader = PyPDFLoader(file_path)
                docs = loader.load()
            except Exception:
                print("‚ö† PyPDF failed, trying PDFMiner...")
                loader = PDFMinerLoader(file_path)
                docs = loader.load()
        except Exception as e:
            print(f"‚ùå Skipping {filename}: cannot load PDF -> {e}")
            continue

        # Ensure text exists
        splits = text_splitter.split_documents(docs)
        if not splits:
            print(f"‚ö† No extractable text in {filename}, skipping.")
            continue

        # Build the index from the first file, extend it with the rest.
        if vector_store is None:
            vector_store = FAISS.from_documents(splits, embedding=embeddings)
        else:
            vector_store.add_documents(splits)

        count += 1
        print(f"‚úî Added {len(splits)} chunks from {filename}")

    # Save final index
    if vector_store is not None:
        vector_store.save_local("faiss_index")
        print("\nüéâ FAISS index saved successfully.")
    else:
        print("\n‚ùå No valid PDFs processed; FAISS index not created.")

    return vector_store
 

In [31]:
def create_url_vector_store(url):
    """Add the content of a web page to the FAISS index and persist it.

    The page is loaded with ``WebBaseLoader``, split with the module-level
    ``text_splitter`` and added to the module-level ``vector_store`` (which is
    created here if it does not exist yet). The index is then saved to
    ``"faiss_index"`` and reloaded from disk.

    Args:
        url: URL of the page to index.

    Returns:
        The updated vector store, or the unchanged current value of
        ``vector_store`` when the page yields no text.
    """
    # vector_store is re-assigned below. Without this declaration it would be a
    # local name and the first use would raise UnboundLocalError.
    global vector_store

    url_docs = WebBaseLoader(url).load()
    url_split_documents = text_splitter.split_documents(url_docs)

    if not url_split_documents:
        print(f"No extractable text at {url}; index left unchanged.")
        return vector_store

    if vector_store is None:
        # No PDF index exists yet, so the page content starts a new one.
        vector_store = FAISS.from_documents(url_split_documents, embedding=embeddings)
    else:
        vector_store.add_documents(url_split_documents)

    vector_store.save_local("faiss_index")

    # NOTE: load_local unpickles data, hence the explicit opt-in flag. That is
    # acceptable here only because the file was written by the line above;
    # never point this at an index obtained from an untrusted source.
    vector_store = FAISS.load_local(
        "faiss_index",
        embeddings=embeddings,
        allow_dangerous_deserialization=True,
    )

    return vector_store

In [None]:
# Chat model used for every answer in this notebook.
# The API key is read from the GROQ_API_KEY environment variable so that no
# credential is stored in (or accidentally committed with) the notebook.
llm = ChatGroq(
    model="openai/gpt-oss-120b",
    temperature=0,
    max_tokens=None,
    reasoning_format="parsed",
    timeout=None,
    max_retries=2,
    api_key=os.environ.get("GROQ_API_KEY"),
)

In [34]:
# Prompt template text for the analysis chain. {context} and {question} are the
# template variables (each appears more than once); the doubled braces around
# the example JSON keep it literal when the template is formatted.
base_prompt = '''
You are an intelligent financial analysis agent specialized in reviewing company quarterly reports and earnings call transcripts.
Your primary function is to extract accurate financial metrics, analyze qualitative management commentary, and generate clear, structured insights.

During execution, you will receive:

{context}: Extracted text chunks from financial reports and transcripts

{question}: A specific analytical task (e.g., trends, risks, sentiment, outlook)

Guidelines for responding to {question} using {context}:

Accurate & Grounded: Use only the information found in the provided context‚Äîno guessing or fabricating data.

Financially Insightful: Provide concise explanations about revenue trends, margin movements, demand commentary, risks, and opportunities.

Forecast-Oriented: Highlight management sentiment and forward-looking statements relevant to future performance.

Structured JSON: Always respond in a predictable JSON format suitable for downstream processing.

No Investment Advice: Do not provide stock buy/sell/hold recommendations or personal financial advice.

Example JSON Output:
{{
  "reply": "Revenue grew 5% YoY driven by cloud and BFSI demand, while margins remained stable. Management highlighted healthy deal wins and improving client budgets.",
  "guidance_caution": "This summary is based solely on the provided financial context and does not constitute investment advice.",
  "follow_up_prompt": "Would you like insights on risks, opportunities, or the outlook for the next quarter?"
}}

'''

In [None]:
# Sample question used to exercise the retrieval + LLM chain end to end.
user_question_2 = 'What were the key financial metrics (Revenue, Net Profit, Operating Margin, and Segment Performance) reported for this quarter?'

# `prompt` is also read by user_query_answer, so it must stay a module-level name.
prompt = PromptTemplate(template=base_prompt, input_variables=["context", "question"])
# Requires vector_store to hold a built index; it is initialised to None above.
vectorstore_retreiver = vector_store.as_retriever(search_kwargs={"k": 3})
# Retrieve the top 3 chunks for the question.
extracted_chunks = vectorstore_retreiver.invoke(user_question_2)
# Prompt -> model -> JSON parser.
chain = prompt | llm | JsonOutputParser()
# The retrieved Document objects are passed as-is as the prompt's context value.
response = chain.invoke({"context": extracted_chunks, "question": user_question_2})

In [36]:
def user_query_answer(query):
    """Answer a question from the indexed documents.

    Retrieves the 3 best-matching chunks from the module-level ``vector_store``
    and feeds them, together with the question, through
    ``prompt | llm | JsonOutputParser()``.

    Args:
        query: The question to answer.

    Returns:
        A ``(response, extracted_chunks)`` tuple: the parsed output of the
        chain and the retrieved documents that were used as context.
    """
    retriever = vector_store.as_retriever(search_kwargs={"k": 3})
    context_docs = retriever.invoke(query)

    qa_chain = prompt | llm | JsonOutputParser()
    answer = qa_chain.invoke({"context": context_docs, "question": query})

    return answer, context_docs

In [27]:
# Display the module-level `extracted_chunks` (the retrieved documents).
extracted_chunks

[Document(id='bc21a2cf-cfd0-4776-8e0c-14d6e713c1f3', metadata={'producer': 'SAMBox 3.0.18', 'creator': 'PDFsam Basic v5.2.9', 'creationdate': '2025-08-05T13:25:15+05:30', 'moddate': '2025-08-05T13:25:15+05:30', 'source': 'pdf_downloads/Financial_Year_2025_from_bse.pdf', 'total_pages': 139, 'page': 58, 'page_label': '59'}, page_content='employees have a right to report violations to the Chairman of the Audit Committee, and there was no instance of denial of access to the \nAudit Committee. \nThe Vigil Mechanism and Whistle-Blower Policy is available on the website of the Company and can be accessed at https://www.jfs.in/docs/\ncms/assets/jfs/investor-relations/policy-documents/vigil-mechanism-and-whistle-blower-policy.pdf\nAnti-Bribery and Anti-Corruption Policy'),
 Document(id='42b5908b-bbd6-43a4-9b27-0a7e1acf707f', metadata={'producer': 'SAMBox 3.0.18', 'creator': 'PDFsam Basic v5.2.9', 'creationdate': '2025-10-16T18:50:52+05:30', 'moddate': '2025-10-16T18:50:52+05:30', 'source': 'pdf

In [28]:
# Show only the "reply" field of the module-level `response`.
print(response['reply'])

The supplied documents do not contain the quarterly financial metrics you asked for. The only quantitative figures present are: 
- From the Screener summary: Revenue of‚ÄØ‚Çπ2,525‚ÄØcrore and Profit of‚ÄØ‚Çπ1,631‚ÄØcrore (these appear to be consolidated figures, not explicitly tied to the current quarter). 
- From the PPT slide: equity share capital, other equity, other liabilities, and total liabilities, which do not provide revenue, net profit, operating margin, or segment performance.
No specific quarterly revenue, net profit, operating margin, or segment‚Äëwise performance numbers are disclosed in the provided excerpts, so a precise answer cannot be generated from the available data.


In [37]:
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
from nse_live_stocks import Nse

# Client object used for the price lookup below.
stock = Nse()

# Look up the current price for the symbol 'TCS'.
result = stock.get_current_price('TCS')

print(result)

{'error': False, 'nse_symbol': 'TCS', 'current_value': '3112', 'date': '14-Nov-2025 16:00:00'}
