In [None]:
import os
import re
import requests
from langchain_groq import ChatGroq
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader
from langchain_community.document_loaders import WebBaseLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import FakeEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_community.document_loaders import PyPDFLoader, PDFMinerLoader
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
import datetime
from nse_live_stocks import Nse



USER_AGENT environment variable not set, consider setting it to identify your requests.


In [9]:
DOWNLOAD_DIR = "pdf_downloads"
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

In [2]:
url = 'https://www.screener.in/company/JIOFIN/consolidated/'

In [37]:
base_prompt='''
You are a Financial Forecasting Analyst Agent designed to analyze quarterly financial reports and earnings call transcripts for Tata Consultancy Services (TCS).  
Your role is to extract factual financial metrics, identify qualitative themes, and produce a coherent forward-looking business forecast.

You will receive:
- {context}: A set of extracted chunks from financial reports and earnings call transcripts.
- {question}: A specific analytical task related to forecasting, trend analysis, risk assessment, or qualitative synthesis.

-----------------------------
## CORE BEHAVIOR & CONSTRAINTS
-----------------------------

### 1. Grounded Analysis Only
- Use ONLY the information provided in {context}.  
- Do NOT guess, hallucinate, or infer any financial figures not explicitly present.
- If the context lacks information, explicitly say so in the JSON output.

### 2. Financial Reasoning Requirements
When analyzing, prioritize:
- Revenue trends, margin behavior, and profitability drivers.
- Segment or geography commentary if available.
- Management tone, sentiment, and guidance.
- Recurring themes from past 1‚Äì2 quarters.
- Stated risks, opportunities, demand outlook, and deal trends.
- Any forward-looking commentary relevant to forecasting.

### 3. Output Format
Your response MUST always be a **machine-readable JSON object** in the structure below.  
Do NOT include anything outside the JSON (no prose, greetings, explanations, etc.).

### 4. No Investment Advice
- Never give buy/sell/hold recommendations.
- Never speculate on stock price movement.
- Only provide analytical and qualitative forecasting insights.

-----------------------------
## REQUIRED JSON STRUCTURE
-----------------------------
Return ONLY this JSON structure:

{{
  "analysis_summary": "High-level overview of trends and themes from the provided context.",
  "quantitative_insights": {{
    "revenue_trend": "...",
    "profitability_trend": "...",
    "margin_notes": "...",
    "other_key_metrics": "..."
  }},
  "qualitative_insights": {{
    "management_sentiment": "...",
    "recurring_themes": "...",
    "forward_looking_commentary": "..."
  }},
  "forecast": {{
    "expected_direction": "growth | stable | decline | uncertain",
    "drivers": "...",
    "risks": "...",
    "opportunities": "..."
  }},
  "limitations": "Explicitly state missing data or gaps in the context.",
  "guidance_caution": "This analysis is based solely on the provided documents and is NOT investment advice.",
  "follow_up_prompt": "Suggest a logical next query the user may ask."
}}

-----------------------------
## RESPONSE EXPECTATIONS
-----------------------------
- Maintain concise, executive-level clarity.
- Cite only what is present in the provided context.
- If multiple chunks provide conflicting info, note it in 'limitations'.
- If context is large or repetitive, synthesize themes rather than paraphrasing.

-----------------------------
## END OF PROMPT
-----------------------------


'''

In [38]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
embeddings = FakeEmbeddings(size=1352)
vector_store = None
count = 0
llm = ChatGroq(model="openai/gpt-oss-120b",api_key=os.getenv("GROQ_API_KEY"),temperature=0)
prompt = PromptTemplate(template=base_prompt, input_variables=["context", "question"])


In [39]:
def download_pdfs(company_url):

    def scrape_screener_pdfs(company_url):
        print(f"Scraping: {company_url}")
        headers = {"User-Agent": "Mozilla/5.0"}

        html = requests.get(company_url, headers=headers).text
        soup = BeautifulSoup(html, "html.parser")

        links = soup.select(".documents a")

        pdf_links = []

        for a in links:
            href = a.get("href", "")
            if href.endswith(".pdf"):
                text = a.text.strip().replace("\n", "_").replace(" ", "_")
                if not text:
                    text = "Document"
                pdf_links.append((href, text + ".pdf"))

        print(f"Found {len(pdf_links)} PDF links.\n")
        return pdf_links


    def classify_transcript_or_ppt(pdf_path):
        try:
            reader = PdfReader(pdf_path)
            text = reader.pages[0].extract_text()[:800].lower()
        except:
            return None

        if "transcript" in text or "earnings call" in text:
            return "transcript"
        if "presentation" in text or "investor presentation" in text:
            return "presentation"

        return None


    def maybe_rename_transcript_or_ppt(saved_path):
        doc_type = classify_transcript_or_ppt(saved_path)

        if not doc_type:
            print("   ‚Üí Not transcript/presentation. Keeping original filename.\n")
            return saved_path

        # Original filename
        folder = os.path.dirname(saved_path)
        base = os.path.basename(saved_path)
        name, ext = os.path.splitext(base)

        # Make new filename with incremental number
        counter = 1
        new_name = f"{name}_{counter}{ext}"
        new_path = os.path.join(folder, new_name)

        while os.path.exists(new_path):
            counter += 1
            new_name = f"{name}_{counter}{ext}"
            new_path = os.path.join(folder, new_name)

        try:
            os.rename(saved_path, new_path)
            print(f"   ‚úî Transcript/PPT detected ‚Üí renamed to {new_name}\n")
            return new_path
        except:
            print("   ‚ö† Rename failed. Keeping original.\n")
            return saved_path

    def download_bse_annpdf(url, download_dir, filename):
        print(f"URL: {url}")
        print(f"[BSE-ANNPDF] Requesting: {url}")

        r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
        savepath = os.path.join(download_dir, filename)

        with open(savepath, "wb") as f:
            f.write(r.content)

        print(f"‚úî Saved BSE AnnPdf: {savepath}")
        return savepath

    def download_bse_iframe_pdf(url, download_dir, filename):
        print(f"URL: {url}")
        print("   [BSE-IFRAME] Requesting main page‚Ä¶")

        html = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}).text
        soup = BeautifulSoup(html, "html.parser")
        iframe = soup.find("iframe")

        if not iframe:
            print("   ‚ùå No iframe found. Cannot download.")
            return None

        real_pdf = iframe.get("src")
        if not real_pdf.startswith("http"):
            real_pdf = "https://www.bseindia.com" + real_pdf

        print(f"   ‚Üí PDF Source: {real_pdf}")

        r = requests.get(real_pdf, headers={"User-Agent": "Mozilla/5.0"})
        savepath = os.path.join(download_dir, filename)

        with open(savepath, "wb") as f:
            f.write(r.content)

        print(f"   ‚úî Saved BSE iframe PDF: {savepath}")
        return savepath


    def download_direct_pdf(url, download_dir, filename):
        print(f"[DIRECT] Downloading: {url}")
        r = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
        savepath = os.path.join(download_dir, filename)

        with open(savepath, "wb") as f:
            f.write(r.content)

        print(f"‚úî Saved direct PDF: {savepath}")
        return savepath


    def clean_filename(name):
        name = re.sub(r'[\/:*?"<>|;,]', '_', name)
        name = re.sub(r"_+", "_", name)
        name = name.strip(" .")
        if len(name) > 180:
            name = name[:180]
        return name


    def download_pdf(url, filename):

        filename = clean_filename(filename)

        if "AnnPdfOpen.aspx" in url:
            saved = download_bse_annpdf(url, DOWNLOAD_DIR, filename)
            return maybe_rename_transcript_or_ppt(saved)

        if "xml-data/corpfiling" in url:
            saved = download_bse_iframe_pdf(url, DOWNLOAD_DIR, filename)
            return maybe_rename_transcript_or_ppt(saved)

        saved = download_direct_pdf(url, DOWNLOAD_DIR, filename)
        return maybe_rename_transcript_or_ppt(saved)


    def delete_old_pdfs():

        FOLDER = "pdf_downloads/"
        ONE_YEAR_DAYS = 365

        def parse_pdf_date(date_str):
            # Format type 1: D:20181012055359+05'30'
            if date_str.startswith("D:"):
                try:
                    dt = datetime.datetime.strptime(date_str[2:16], "%Y%m%d%H%M%S")
                    return dt
                except:
                    pass

            # Format type 2: Fri 12 Oct 2018 05:53:59 PM +05:30
            try:
                dt = datetime.datetime.strptime(date_str, "%a %d %b %Y %I:%M:%S %p %z")
                return dt.replace(tzinfo=None)
            except:
                return None


        now = datetime.datetime.now()
        deleted_files = 0

        for filename in os.listdir(FOLDER):
            if not filename.lower().endswith(".pdf"):
                continue
            
            file_path = os.path.join(FOLDER, filename)

            try:
                reader = PdfReader(file_path)
                metadata = reader.metadata
                
                if "/CreationDate" in metadata:
                    pdf_date = parse_pdf_date(metadata["/CreationDate"])
                elif "/ModDate" in metadata:
                    pdf_date = parse_pdf_date(metadata["/ModDate"])
                else:
                    print(f"‚ö† No metadata date found for {filename}, skipping.")
                    continue

                if not pdf_date:
                    print(f"‚ö† Could not parse date for {filename}, skipping.")
                    continue

                age_days = (now - pdf_date).days

                if age_days > ONE_YEAR_DAYS:
                    print(f"üóë Deleting: {filename} (Age: {age_days} days)")
                    os.remove(file_path)
                    deleted_files += 1
                else:
                    print(f"‚úî Keeping: {filename} (Age: {age_days} days)")

            except Exception as e:
                print(f"‚ùå Error reading {filename}: {e}")

        print(f"\n‚úÖ Done. Deleted {deleted_files} old PDF(s).")

        return f'Deleted {deleted_files} old PDF(s).'

        
        
    def run(company_url):
        pdfs = scrape_screener_pdfs(company_url)

        for url, filename in pdfs:
            print("-----------------------------------------")
            print(f"Downloading: {filename}")
            download_pdf(url, filename)
            delete_old_pdfs()
    
    run(company_url)

    return "Finished downloading PDFs."

In [11]:
download_pdfs(url)

Scraping: https://www.screener.in/company/JIOFIN/consolidated/
Found 26 PDF links.

-----------------------------------------
Downloading: Announcement_under_Regulation_30_(LODR)-Acquisition_________________________2d_-_JLSL_subscribed_4,50,00,000_8.1%_OCPS_for_Rs45_crore_in_RILIPL;_aggregate_investment_Rs166.55_crore..pdf
URL: https://www.bseindia.com/stockinfo/AnnPdfOpen.aspx?Pname=39c74549-8916-4e1b-a22e-f34f27fb9a2f.pdf
[BSE-ANNPDF] Requesting: https://www.bseindia.com/stockinfo/AnnPdfOpen.aspx?Pname=39c74549-8916-4e1b-a22e-f34f27fb9a2f.pdf
‚úî Saved BSE AnnPdf: pdf_downloads/Announcement_under_Regulation_30_(LODR)-Acquisition_2d_-_JLSL_subscribed_4_50_00_000_8.1%_OCPS_for_Rs45_crore_in_RILIPL_aggregate_investment_Rs166.55_crore..pdf
   ‚Üí Not transcript/presentation. Keeping original filename.

‚úî Keeping: Announcement_under_Regulation_30_(LODR)-Acquisition_2d_-_JLSL_subscribed_4_50_00_000_8.1%_OCPS_for_Rs45_crore_in_RILIPL_aggregate_investment_Rs166.55_crore..pdf (Age: 2 days)


unknown widths : 
[3, 3, 226, 4, 4, 578, 17, 17, 543, 18, 18, 533, 28, 28, 488, 38, 38, 459, 44, 44, 623, 47, 47, 251, 58, 58, 318, 60, 60, 519, 62, 62, 420, 68, 68, 854, 69, 69, 645, 75, 75, 662, 87, 87, 516, 90, 90, 542, 94, 94, 459, 116, 116, 889, 258, 258, 479, 271, 271, 525, 272, 272, 422, 282, 282, 525, 286, 286, 497, 296, 296, 305, 336, 336, 470, 346, 346, 525, 349, 349, 229, 361, 361, 239, 364, 364, 454, 367, 367, 229, 373, 373, 798, 374, 374, 525, 381, 381, 527, 393, 393, 525, 396, 396, 348, 400, 400, 391, 410, 410, 334, 437, 437, 525, 448, 448, 451, 449, 449, 714, 454, 454, 433, 455, 455, 452, 853, 853, 249, 855, 855, 267, 856, 856, 252, 882, 882, 306, IndirectObject(113, 0, 136944496548480), IndirectObject(113, 0, 136944496548480), 303, 895, 895, 303, 923, 923, IndirectObject(113, 0, 136944496548480), 1004, 1004, 506, 1005, 1005, 506, 1006, 1006, 506, 1007, 1007, 506, 1008, 1008, 506, 1009, 1009, 506, 1010, 1010, 506, 1012, 1012, 506, 1013, 1013, 506, 1085, 1085, 498]


‚úî Saved BSE AnnPdf: pdf_downloads/Transcript.pdf
   ‚úî Transcript/PPT detected ‚Üí renamed to Transcript_5.pdf

‚úî Keeping: PPT_3.pdf (Age: 213 days)
‚úî Keeping: Financial_Year_2025_from_bse.pdf (Age: 103 days)
‚úî Keeping: PPT_4.pdf (Age: 303 days)
‚úî Keeping: PPT_1.pdf (Age: 31 days)
‚úî Keeping: Transcript_4.pdf (Age: 299 days)
‚úî Keeping: Announcement_under_Regulation_30_(LODR)-Acquisition_2d_-_JLSL_subscribed_4_50_00_000_8.1%_OCPS_for_Rs45_crore_in_RILIPL_aggregate_investment_Rs166.55_crore..pdf (Age: 2 days)
‚úî Keeping: Transcript_3.pdf (Age: 212 days)
‚úî Keeping: Transcript_1.pdf (Age: 24 days)
üóë Deleting: Transcript_5.pdf (Age: 759 days)
‚úî Keeping: Transcript_2.pdf (Age: 116 days)
‚úî Keeping: PPT_2.pdf (Age: 122 days)

‚úÖ Done. Deleted 1 old PDF(s).
-----------------------------------------
Downloading: PPT.pdf
URL: https://www.bseindia.com/stockinfo/AnnPdfOpen.aspx?Pname=eaaf1f2b-9a61-4931-8d3f-9dec05e92c88.pdf
[BSE-ANNPDF] Requesting: https://www.bseindia.com/

unknown widths : 
[3, 3, 226, 4, 4, 578, 17, 17, 543, 18, 18, 533, 28, 28, 488, 38, 38, 459, 44, 44, 623, 47, 47, 251, 58, 58, 318, 60, 60, 519, 62, 62, 420, 68, 68, 854, 69, 69, 645, 75, 75, 662, 87, 87, 516, 90, 90, 542, 94, 94, 459, 116, 116, 889, 258, 258, 479, 271, 271, 525, 272, 272, 422, 282, 282, 525, 286, 286, 497, 296, 296, 305, 336, 336, 470, 346, 346, 525, 349, 349, 229, 361, 361, 239, 364, 364, 454, 367, 367, 229, 373, 373, 798, 374, 374, 525, 381, 381, 527, 393, 393, 525, 396, 396, 348, 400, 400, 391, 410, 410, 334, 437, 437, 525, 448, 448, 451, 449, 449, 714, 454, 454, 433, 455, 455, 452, 853, 853, 249, 855, 855, 267, 856, 856, 252, 882, 882, 306, IndirectObject(342, 0, 136944502820576), IndirectObject(342, 0, 136944502820576), 303, 895, 895, 303, 923, 923, IndirectObject(342, 0, 136944502820576), 1004, 1004, 506, 1005, 1005, 506, 1006, 1006, 506, 1007, 1007, 506, 1008, 1008, 506, 1009, 1009, 506, 1010, 1010, 506, 1012, 1012, 506, 1013, 1013, 506, 1085, 1085, 498]


‚úî Saved BSE AnnPdf: pdf_downloads/PPT.pdf
   ‚úî Transcript/PPT detected ‚Üí renamed to PPT_5.pdf

‚úî Keeping: PPT_3.pdf (Age: 213 days)
‚úî Keeping: Financial_Year_2025_from_bse.pdf (Age: 103 days)
‚úî Keeping: PPT_4.pdf (Age: 303 days)
‚úî Keeping: PPT_1.pdf (Age: 31 days)
‚úî Keeping: Transcript_4.pdf (Age: 299 days)
‚úî Keeping: Announcement_under_Regulation_30_(LODR)-Acquisition_2d_-_JLSL_subscribed_4_50_00_000_8.1%_OCPS_for_Rs45_crore_in_RILIPL_aggregate_investment_Rs166.55_crore..pdf (Age: 2 days)
‚úî Keeping: Transcript_3.pdf (Age: 212 days)
‚úî Keeping: Transcript_1.pdf (Age: 24 days)
‚úî Keeping: Transcript_2.pdf (Age: 116 days)
‚úî Keeping: PPT_2.pdf (Age: 122 days)
üóë Deleting: PPT_5.pdf (Age: 762 days)

‚úÖ Done. Deleted 1 old PDF(s).


'Finished downloading PDFs.'

In [40]:
def create_vector_store(url):
    vector_store = None
    count = 0
    for filename in os.listdir("pdf_downloads/"):
        file_path = f"pdf_downloads/{filename}"
        print(f"\nüìÑ Processing: {filename}")

        # Robust PDF loading
        try:
            try:
                loader = PyPDFLoader(file_path)
                docs = loader.load()
            except:
                print("‚ö† PyPDF failed, trying PDFMiner...")
                loader = PDFMinerLoader(file_path)
                docs = loader.load()
        except Exception as e:
            print(f"‚ùå Skipping {filename}: cannot load PDF -> {e}")
            continue

        # Ensure text exists
        splits = text_splitter.split_documents(docs)
        if not splits:
            print(f"‚ö† No extractable text in {filename}, skipping.")
            continue

        # Build or update FAISS index
        if vector_store is None:
            vector_store = FAISS.from_documents(splits, embedding=embeddings)
        else:
            vector_store.add_documents(splits)

        count += 1
        print(f"‚úî Added {len(splits)} chunks from {filename}")

    # Save final index
    if vector_store:
        vector_store.save_local("faiss_index")
        print("\nüéâ FAISS index saved successfully.")
    else:
        print("\n‚ùå No valid PDFs processed; FAISS index not created.")
        
    try:

        url_loader = WebBaseLoader(url)

        url_docs = url_loader.load()

        url_split_documents = text_splitter.split_documents(url_docs)

        vector_store.add_documents(url_split_documents)
        
        print(f"‚úî Added {len(url_split_documents)} chunks from URL content")

        vector_store.save_local("faiss_index")

        vector_store = vector_store.load_local("faiss_index", embeddings=embeddings,allow_dangerous_deserialization=True)
    
    except Exception as e:
        print(f"‚ùå Error processing URL content: {e}")
        
        
    return vector_store

In [45]:
vector_store = create_vector_store(url)


üìÑ Processing: PPT_3.pdf
‚úî Added 45 chunks from PPT_3.pdf

üìÑ Processing: Financial_Year_2025_from_bse.pdf
‚úî Added 1791 chunks from Financial_Year_2025_from_bse.pdf

üìÑ Processing: Announcement_under_Regulation_30_(LODR)-Earnings_Call_Transcript_23_Oct_-_Transcript_of_Presentation_on_Unaudited_Financial_Results_(Consolidated_and_Standalone)_for_the_quarter_an_1
‚úî Added 113 chunks from Announcement_under_Regulation_30_(LODR)-Earnings_Call_Transcript_23_Oct_-_Transcript_of_Presentation_on_Unaudited_Financial_Results_(Consolidated_and_Standalone)_for_the_quarter_an_1

üìÑ Processing: PPT_4.pdf
‚úî Added 33 chunks from PPT_4.pdf

üìÑ Processing: PPT_1.pdf
‚úî Added 54 chunks from PPT_1.pdf

üìÑ Processing: Transcript_4.pdf
‚úî Added 54 chunks from Transcript_4.pdf

üìÑ Processing: Announcement_under_Regulation_30_(LODR)-Monitoring_Agency_Report_17_Oct_-_CRISIL_report_Rs3_956.25_crore_received_(25%_of_Rs15_825_crore_PI)_Rs823.80_crore_utilized_by_Sept_30_2025
‚úî Added 53 c

In [46]:
def user_query_answer(query):
    extracted_chunks = vector_store.similarity_search(query, k=5)
    chain = prompt | llm | JsonOutputParser()
    response = chain.invoke({"context": extracted_chunks, "question": query})
    return response, extracted_chunks

In [47]:
response, chunks = user_query_answer("Analyze the financial reports and earnings call transcripts for TCS from the last two quarters and provide a qualitative business forecast for the upcoming quarter. Identify trends in revenue, margins, demand environment, key risks, opportunities, and management forward-looking guidance.")

print(f"Response:{response}")
print('------------\n')
print(len(chunks))
chunks

Response:{'analysis_summary': 'The supplied document excerpts pertain to Jio Financial Services, a BlackRock joint venture, and generic corporate governance language. No financial metrics, earnings commentary, or management discussion specific to Tata Consultancy Services (TCS) for the last two quarters are present.', 'quantitative_insights': {'revenue_trend': 'No revenue data for TCS provided.', 'profitability_trend': 'No profitability or margin data for TCS provided.', 'margin_notes': 'No margin information for TCS provided.', 'other_key_metrics': 'No key performance indicators (e.g., order intake, headcount, utilization) for TCS provided.'}, 'qualitative_insights': {'management_sentiment': 'Management sentiment for TCS cannot be assessed from the given excerpts.', 'recurring_themes': 'No recurring themes related to TCS are identifiable.', 'forward_looking_commentary': 'No forward‚Äëlooking statements about TCS are present.'}, 'forecast': {'expected_direction': 'uncertain', 'drivers'

[Document(id='a0e7dbfe-303e-4bda-967c-c753af16ddd2', metadata={'producer': 'SAMBox 3.0.18', 'creator': 'PDFsam Basic v5.2.9', 'creationdate': '2025-08-05T13:25:15+05:30', 'moddate': '2025-08-05T13:25:15+05:30', 'source': 'pdf_downloads/Financial_Year_2025_from_bse.pdf', 'total_pages': 139, 'page': 39, 'page_label': '40'}, page_content='Limited ‚Äî respectively. By combining BlackRock‚Äôs global investment capabilities with \nJFSL ‚Äôs digital distribution strength and deep understanding of the Indian customer, \nthis strategic alliance seeks to make world-class investment products accessible to all \nIndians ‚Äî digitally, affordably and at scale.\nl Global Expertise, Local Reach: By combining \nBlackRock‚Äôs decades of investment management \nexperience and proprietary technologies with \nJFSL ‚Äôs digital reach, the JV brings best-in-class'),
 Document(id='eacc0e2f-ab63-4a02-bbfa-8fe34d6bfc80', metadata={'producer': 'SAMBox 3.0.18', 'creator': 'PDFsam Basic v5.2.9', 'creationdate': '

In [51]:
def current_market_price(url):
    import re
    stock_symbol = re.search(r'/company/([A-Z]+)', url).group(1)
    stock = Nse()
    result = stock.get_current_price(stock_symbol)
    return result['current_value']

In [52]:
current_market_price(url)

'315'