In [21]:
from langchain import hub
from langchain.agents import AgentExecutor, create_structured_chat_agent
from langchain.memory import ConversationBufferMemory
from langchain_core.messages import AIMessage, HumanMessage, SystemMessage
from langchain_core.tools import Tool
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain.chains import create_history_aware_retriever, create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_community.vectorstores import Chroma
from langchain_core.prompts import ChatMessagePromptTemplate, MessagesPlaceholder
from langchain_community.llms import Ollama
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import yfinance as yf
import pandas as pd

In [22]:
import time
import random


In [23]:
## model 
model=Ollama(model="llama3.2")

Part 1- Portfolio Developer

In [24]:
##Input Prompt template
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate

system_template = """
You are an intelligent investment assistant.

Your job is to extract structured data from the user's natural language prompt.

**You must return a valid Python dictionary string, and nothing else.**

The dictionary should contain the following keys:

- "sector"
- "investment_type"
- "investment_goal"
- "risk_appetite"
- "investment_horizon" (as an integer number of years)
- "principal_amount" (in numeric format, no currency symbols)
- "final_amount" (in numeric format)
- "number_of_years"
- "growth_rate" (as a percentage, no % sign — leave blank if not specified)

**Constraints:**
- Only use these sectors: ["IT", "Banking & Finance", "Pharma", "Energy", "FMCG"]
- If a field is missing in the prompt, set its value to an empty string or `None` (but keep the key).
- You must return only the dictionary (as a single-line valid Python dictionary string). Ensure all keys are present, braces are properly closed, and the syntax is correct.

Example:
{{
    "sector": "Pharma",
    "investment_type": "lumpsum",
    "investment_goal": "wealth creation",
    "risk_appetite": "medium",
    "investment_horizon": 5,
    "principal_amount": 100000,
    "final_amount": 200000,
    "number_of_years": 5,
    "growth_rate": ""
}}
"""



In [25]:
##ticker names
sector_tickers = {
    "IT": [
        "TCS", "INFY", "WIPRO", "HCLTECH", "TECHM", "LTIM", "PERSISTENT", "COFORGE", 
        "MPHASIS", "BSOFT", "SONATSOFTW", "CYIENT", "ZENSARTECH", "NIITLTD", 
        "KELLTONTEC", "TATAELXSI", "ECLERX", "NEWGEN", "INTELLECT", "HAPPSTMNDS"
    ],
    "Banking & Finance": [
        "HDFCBANK", "ICICIBANK", "SBIN", "KOTAKBANK", "AXISBANK", "INDUSINDBK", "YESBANK", 
        "FEDERALBNK", "IDFCFIRSTB", "BANDHANBNK", "RBLBANK", "PNB", "CANBK", "BANKBARODA", 
        "UNIONBANK", "AUBANK", "IDBI", "UJJIVANSFB", "CENTRALBK", "SOUTHBANK"
    ],
    "Pharma": [
        "SUNPHARMA", "DIVISLAB", "DRREDDY", "CIPLA", "LUPIN", "BIOCON", "TORNTPHARM", 
        "AUROPHARMA", "ZYDUSLIFE", "ALKEM", "GLAND", "IPCALAB", "PFIZER", 
        "ABBOTINDIA", "SANOFI", "NATCOPHARM", "GRANULES", "AJANTPHARM", 
        "JUBLPHARMA", "INDOCO"
    ],
    "Energy": [
        "RELIANCE", "ONGC", "NTPC", "POWERGRID", "TATAPOWER", "ADANIGREEN", 
        "ADANITRANS", "NHPC", "GAIL", "OIL", "IOC", "BPCL", "HPCL", 
        "JSWENERGY", "SJVN", "TORNTPOWER", "CESC", "NLCINDIA", "BHEL", "COALINDIA"
    ],
    "FMCG": [
        "HINDUNILVR", "ITC", "NESTLEIND", "BRITANNIA", "DABUR", "MARICO", 
        "GODREJCP", "COLPAL", "EMAMILTD", "VBL", "TATACONSUM", "UBL", 
        "RADICO", "ZYDUSWELL", "HATSUN", "KRBL", "MANAPPURAM", 
        "HERITGFOOD", "EVEREADY", "JYOTHYLAB"
    ]
}

In [26]:
nifty_50_comp=[
    "HDFCBANK",
    "ICICIBANK",
    "RELIANCE",
    "INFY",
    "BHARTIARTL",
    "ITC",
    "LT",
    "TCS",
    "AXISBANK",
    "KOTAKBANK",
    "SBIN",
    "M&M",
    "BAJFINANCE",
    "HINDUNILVR",
    "SUNPHARMA",
    "ULTRACEMCO",
    "WIPRO",
    "NTPC",
    "ASIANPAINT",
    "HCLTECH"
    ]

In [27]:

def beta_risk_level(risk):
    if risk=="low":
        return (0.0,0.8)
    if risk=="moderate":
        return (0.8,1.2)
    if risk=="high":
        return (1.2,2.5)
    if risk=="":
        return(0.8,1.2)

In [28]:
def calc_weights(dict):
    total_cagr = sum(data["5yr_CAGR"] for data in dict.values())
    weights_calculated = {
    ticker: data["5yr_CAGR"] / total_cagr
    for ticker, data in dict.items()
    }
    return weights_calculated

In [29]:
explanation_human_template = """
Original Input:
{input}

Selected Stocks:
{results}

Portfolio Weights:
{weights}
"""
explanation_system_template = """
You are a financial advisor assistant that explains an investment portfolio to the user in clear, professional, and engaging language.

You will be given:
- The user's original investment intent (as natural language)
- A dictionary of selected stocks with their financial metrics
- A dictionary of weights (allocation percentages)

Your job is to:
1. Summarize the user's investment preferences (sector, risk, growth, time).
2. List each selected stock along with its rationale for selection:
    - 5yr CAGR
    - ROE, ROCE
    - PE ratio
    - Beta (estimated)
    - Debt-to-equity
3. Explain why each stock fits the user's risk and return profile.
4. Clearly state the portfolio weights and how much of the principal should be invested in each.
5. End with a short paragraph about the portfolio's strengths and its alignment with the user's goals.

Be crisp, confident, and use easy-to-understand finance language. Keep it factual and structured, but not robotic.

Do NOT add disclaimers or legal text unless asked.
"""


In [42]:
def estimate_beta(pe=None, roce=None, roe=None, debt_to_equity=None, market_cap=None):
    # Normalize inputs to risk scale (0 to 1), based on common Indian market ranges
    risk = 0

    if pe is not None:
        risk += min(pe / 40, 1) * 0.25  # High P/E → growth → higher risk
    if roce is not None:
        risk += max(0, (20 - roce) / 20) * 0.25  # Lower ROCE → higher risk
    if roe is not None:
        risk += max(0, (20 - roe) / 20) * 0.15  # Lower ROE → more risk
    if debt_to_equity is not None:
        risk += min(debt_to_equity / 2, 1) * 0.25  # Higher leverage → more risk
    if market_cap is not None:
        if market_cap < 5000:       # Small cap
            risk += 0.1
        elif market_cap < 20000:    # Mid cap
            risk += 0.05

    # Map risk to beta range (0.6 to 2.0)
    beta = 0.6 + (1.4 * min(risk, 1.0))
    return round(beta, 2)


In [31]:

import pandas as pd

def save_portfolio_with_pandas(results, weights, filename="portfolio.csv"):
    data = []
    for ticker, metrics in results.items():
        entry = metrics.copy()  # Don't mutate original data
        entry['ticker'] = ticker
        entry['weight'] = round(weights.get(ticker, 0), 4)  # Safe access + rounding
        data.append(entry)
    
    df = pd.DataFrame(data)
    df.to_csv(filename, index=False)  # <- Save the CSV file
    print(f"✅ Portfolio saved to {filename}")


In [38]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from webdriver_manager.chrome import ChromeDriverManager
import re

def get_finology_ratios(ticker):
    url = f"https://ticker.finology.in/company/{ticker}"
    
    options = Options()
    options.add_argument("--headless")  # Comment this line to see browser
    driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
    
    try:
        driver.get(url)
        wait = WebDriverWait(driver, 10)

        # Wait for a known element to make sure the page is fully loaded
        wait.until(EC.presence_of_element_located((By.XPATH, "//div[contains(@class,'compess')][small[contains(text(),'P/E')]]")))

        # Get the current CAGR text to compare after click
        old_cagr_text = driver.find_element(By.ID, "pricereturn").text.strip()

        # Click the 5Yr CAGR button
        cagr_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, "a[data-duration='5Yr']")))
        cagr_button.click()

        # Wait for CAGR value to change
        wait.until(lambda d: d.find_element(By.ID, "pricereturn").text.strip() != old_cagr_text)

        # Now fetch updated CAGR value
        cagr_text = driver.find_element(By.ID, "pricereturn").text.strip()

        # Helper functions
        def get_text(xpath):
            try:
                return driver.find_element(By.XPATH, xpath).text.strip()
            except:
                return None

        def parse_number(text):
            if not text:
                return None
            text = text.replace(',', '').replace('%', '').strip()
            try:
                return float(text)
            except:
                return None

        ratios = {}
        ratios['PE'] = parse_number(get_text("//div[contains(@class,'compess')][small[contains(text(),'P/E')]]/p"))
        ratios['ROCE'] = parse_number(get_text("//div[contains(@class,'compess')][small[contains(text(),'ROCE')]]//span[@class='Number']"))
        ratios['ROE'] = parse_number(get_text("//div[contains(@class,'compess')][small[contains(text(),'ROE')]]//span[@class='Number']"))
        ratios['Debt_Equity'] = parse_number(get_text("//div[@id='mainContent_divDebtEquity']//span[@class='Number']"))
        ratios['Market_Cap_Cr'] = parse_number(get_text("//p/span[@class='Number']"))

        match = re.search(r"([\d.]+)", cagr_text)
        ratios['5yr_CAGR'] = float(match.group(1)) if match else None

        return ratios

    except Exception as e:
        print(f"[!] Error scraping {ticker}: {e}")
        return None

    finally:
        driver.quit()


In [39]:
print(get_finology_ratios("HDFCBANK"))

{'PE': 22.91, 'ROCE': 15.26, 'ROE': 16.97, 'Debt_Equity': None, 'Market_Cap_Cr': 2016.0, '5yr_CAGR': 12.5}


In [53]:
import yfinance as yf

def is_nifty_moving(threshold=1.2):
    """
    Checks if Nifty 50 has moved more than the given threshold (%) in the last trading day.

    Args:
        threshold (float): Percentage move to consider significant (default: 1.2%)

    Returns:
        bool: True if Nifty moved more than threshold, else False
    """
    nifty = yf.Ticker("^NSEI")  # Nifty 50 index
    hist = nifty.history(period="2d")

    if len(hist) < 2:
        print("⚠️ Not enough historical data.")
        return False

    latest = hist['Close'].iloc[-1]
    prev = hist['Close'].iloc[-2]
    change_pct = abs((latest - prev) / prev) * 100

    print(f"📊 Nifty moved {change_pct:.2f}% in the last 24h.")
    return change_pct >= threshold

In [54]:
##result_dict contains the principal, years in a structured format
# Human/user prompt — this is where {input} goes
def portfolio_developer(input):
    # Check Nifty movement
    if not is_nifty_moving():
        print("✅ Nifty is stable. Using cached beta data with smart filtering.")

        # Parse user inputs first
        human_template = "{input}"
        prompt = ChatPromptTemplate.from_messages([
            SystemMessagePromptTemplate.from_template(system_template),
            HumanMessagePromptTemplate.from_template(human_template),
        ])
        chain = prompt | model | StrOutputParser()
        result_dict_string = chain.invoke({"input": input})
        print(result_dict_string)

        import ast
        result_dict = ast.literal_eval(result_dict_string)

        principal = int(result_dict["principal_amount"])
        years = int(result_dict["investment_horizon"])
        risk = result_dict["risk_appetite"]

        if result_dict["growth_rate"] == "":
            final_amount = int(result_dict["final_amount"])
            growth_rate = (final_amount / principal) ** (1 / years) - 1
        else:
            growth_rate = float(result_dict["growth_rate"])

        beta_low, beta_high = beta_risk_level(risk)

        # Load and filter cached CSV
        df = pd.read_csv("merged_beta_cache.csv")

        def is_valid_row(row):
            try:
                beta = float(row["Estimated_Beta"])
                cagr = float(row["5yr_CAGR"])
                return (beta <= beta_high + 0.2) and (growth_rate - 6 <= cagr <= growth_rate + 6)
            except:
                return False

        filtered_df = df[df.apply(is_valid_row, axis=1)].head(4)

        if filtered_df.empty:
            print("⚠️ No stocks matched the user criteria in cache.")
            return "No valid portfolio could be created from cached data."

        results = {
            row["Ticker"]: {
                "PE": row["PE"],
                "ROCE": row["ROCE"],
                "ROE": row["ROE"],
                "Debt_Equity": row["Debt_Equity"],
                "Market_Cap_Cr": row["Market_Cap_Cr"],
                "5yr_CAGR": row["5yr_CAGR"]
            }
            for _, row in filtered_df.iterrows()
        }

        weights = calc_weights(results)

        prompt2 = ChatPromptTemplate.from_messages([
            SystemMessagePromptTemplate.from_template(explanation_system_template),
            HumanMessagePromptTemplate.from_template(explanation_human_template),
        ])
        chain2 = prompt2 | model | StrOutputParser()

        output = chain2.invoke({
            "input": input,
            "results": results,
            "weights": weights
        })

        save_portfolio_with_pandas(results, weights, filename="portfolio.csv")
        return output

    # 🚨 If Nifty has moved → run original full scraping pipeline (unchanged)
    print("🚨 Nifty movement detected. Running full scraping pipeline...")
    human_template ="{input}"

    # Build the full ChatPromptTemplate
    prompt = ChatPromptTemplate.from_messages([
        SystemMessagePromptTemplate.from_template(system_template),
        HumanMessagePromptTemplate.from_template(human_template),
    ])
    chain=prompt| model | StrOutputParser()
    result_dict_string = chain.invoke({"input": input})
    import ast
    print(result_dict_string)  # inspect this

    result_dict = ast.literal_eval(result_dict_string)


    principal=int(result_dict["principal_amount"])
    years=int(result_dict["investment_horizon"])
    risk=result_dict["risk_appetite"]
    sector=result_dict["sector"]
    if result_dict["growth_rate"]=="":
        final_amount=int(result_dict["final_amount"])
        growth_rate = (final_amount / principal) ** (1 / years) - 1
    else:
        growth_rate=float(result_dict["growth_rate"])
    if growth_rate < 10:
        print("⚠️ Warning: Growth rate too low for equity investing. Consider debt instruments instead.")
    
        print("Low growth rate detected. This may take longer as fewer companies match the criteria.")

    beta_low,beta_high=beta_risk_level(risk)
    if sector=="":
        companies=nifty_50_comp
    else:
        companies=sector_tickers[sector]
    results = {}
    c=0
    i=0


    while c < 4 and i < len(companies):
        slug = companies[i]
        data = get_finology_ratios(slug)
        try:
            PE = float(data["PE"])
            ROCE = float(data["ROCE"])
            ROE = float(data["ROE"])
            Market_cap = float(data["Market_Cap_Cr"])
            debt_equity = float(data["Debt_Equity"])
            cagr = float(data.get("5yr_CAGR", 0))
        except (ValueError, TypeError):
            i += 1
            continue

        beta_data = estimate_beta(pe=PE, roce=ROCE, roe=ROE, debt_to_equity=debt_equity, market_cap=Market_cap)
        print(f"\nCompany: {slug}")
        print(f"  Estimated Beta: {beta_data:.2f} | Max Acceptable beta :  {beta_high})")
        print(f"  Company 5yr CAGR: {data['5yr_CAGR']} | Target Growth Rate: {growth_rate:.2f} ± 6")
        if ( beta_data <= beta_high + 0.2) and (growth_rate - 6<= cagr <= growth_rate + 6):
            print(data)
            results[slug] = data
            c += 1
    
        i += 1
        time.sleep(random.uniform(2.5, 5.0))  # Respect Screener's server
    portfolio_weights=calc_weights(results)

    prompt2 = ChatPromptTemplate.from_messages([
        SystemMessagePromptTemplate.from_template(explanation_system_template),
        HumanMessagePromptTemplate.from_template(explanation_human_template),
    ])
    chain2=prompt2|model|StrOutputParser()
    output = chain2.invoke({
        "input": input,
        "results": results,
        "weights": portfolio_weights
    })
    print(output)
    save_portfolio_with_pandas(results, portfolio_weights, filename="portfolio.csv")

In [55]:
input="I'm investing ₹1,00,000 in the IT sector over 5 years. My goal is aggressive growth — I'm okay with high risk and want an annual return of around 18%. Suggest the best stocks for this."
portfolio_developer(input)

📊 Nifty moved 0.19% in the last 24h.
✅ Nifty is stable. Using cached beta data with smart filtering.
{"sector": "IT", "investment_type": " lumpsum", "investment_goal": "aggressive growth", "risk_appetite": "high", "growth_rate": "18", "investment_horizon": 5, "principal_amount": 100000, "final_amount": None, "number_of_years": 5}
⚠️ No stocks matched the user criteria in cache.


'No valid portfolio could be created from cached data.'

In [44]:
import csv
import time

# Your cache universe
beta_cache = [
    "HDFCBANK", "ICICIBANK", "SBIN", "KOTAKBANK", "AXISBANK",
    "INFY", "TCS", "HCLTECH", "WIPRO",
    "SUNPHARMA",
    "RELIANCE", "NTPC",
    "HINDUNILVR", "ITC", "NESTLEIND",
    "ASIANPAINT", "ULTRACEMCO",
    "BAJFINANCE", "LT"
]
tier2_reactive = [  # Conditional scrape
    "MARUTI", "M&M", "HDFCLIFE", "ICICIPRULI", "LTI", "COFORGE", "DABUR",
    "PIDILITIND", "EICHERMOT", "TATAMOTORS", "POWERGRID", "HAVELLS", "GODREJCP",
    "DIVISLAB", "DRREDDY", "LICHSGFIN", "JSWSTEEL", "ADANIPORTS", "BHARTIARTL",
    "TATACHEM", "ABB"
]
# Assuming these exist



# Output list
data = []

for ticker in tier2_reactive:
    try:
        ratios = get_finology_ratios(ticker)

        if not ratios:
            print(f"⚠️ Skipped {ticker}: Missing ratio data")
            continue

        beta = estimate_beta(
            pe=ratios.get("PE"),
            roce=ratios.get("ROCE"),
            roe=ratios.get("ROE"),
            debt_to_equity=ratios.get("Debt_Equity"),
            market_cap=ratios.get("Market_Cap_Cr")
        )

        entry = {
            "Ticker": ticker,
            "CAGR": ratios.get("5yr_CAGR"),
            "Estimated_Beta": beta
        }

        data.append(entry)
        print(f"✅ Processed {ticker}")

        time.sleep(1)  # Optional: Be polite if scraping

    except Exception as e:
        print(f"❌ Error with {ticker}: {e}")
        continue

# Save to CSV
csv_file = "beta_cache_data2.csv"
with open(csv_file, mode="w", newline="") as file:
    writer = csv.DictWriter(file, fieldnames=["Ticker", "CAGR", "Estimated_Beta"])
    writer.writeheader()
    writer.writerows(data)

print(f"\n📁 Done. Saved {len(data)} entries to {csv_file}")


✅ Processed MARUTI
[!] Error scraping M&M: Message: 
Stacktrace:
	GetHandleVerifier [0x0x9a3b03+62899]
	GetHandleVerifier [0x0x9a3b44+62964]
	(No symbol) [0x0x7d10f3]
	(No symbol) [0x0x81980e]
	(No symbol) [0x0x819bab]
	(No symbol) [0x0x8625c2]
	(No symbol) [0x0x83e554]
	(No symbol) [0x0x85fd81]
	(No symbol) [0x0x83e306]
	(No symbol) [0x0x80d670]
	(No symbol) [0x0x80e4e4]
	GetHandleVerifier [0x0xc04793+2556483]
	GetHandleVerifier [0x0xbffd02+2537394]
	GetHandleVerifier [0x0x9ca2fa+220586]
	GetHandleVerifier [0x0x9baae8+157080]
	GetHandleVerifier [0x0x9c141d+184013]
	GetHandleVerifier [0x0x9aba68+95512]
	GetHandleVerifier [0x0x9abc10+95936]
	GetHandleVerifier [0x0x996b5a+9738]
	BaseThreadInitThunk [0x0x76a95d49+25]
	RtlInitializeExceptionChain [0x0x7712d09b+107]
	RtlGetAppContainerNamedObjectPath [0x0x7712d021+561]

⚠️ Skipped M&M: Missing ratio data
✅ Processed HDFCLIFE
✅ Processed ICICIPRULI
✅ Processed LTI
✅ Processed COFORGE
✅ Processed DABUR
✅ Processed PIDILITIND
✅ Processed EICHE

In [46]:
beta_cache_data=pd.read_csv("beta_cache_data.csv")
beta_cache_data2=pd.read_csv("beta_cache_data2.csv")

In [None]:
merged_beta_cache=pd.concat([beta_cache_data.csv,beta_cache_data2.csv]

In [52]:
is_nifty_moving(1.2)

📊 Nifty moved 0.19% in the last 24h.


False

In [50]:
merged_beta_cache.to_csv("merged_beta_cache.csv", index=False)


NameError: name 'system_template' is not defined

PART 2- PORTFOLIO MANAGER

In [35]:
import feedparser

def get_google_news_rss(company, num_articles=5):
    query = company.replace(" ", "+")
    feed_url = f"https://news.google.com/rss/search?q={query}"
    feed = feedparser.parse(feed_url)
    
    return [entry.title for entry in feed.entries[:num_articles]]

In [25]:
sentiment_system_prompt = """
You are a financial sentiment analyst.

Your job is to analyze news headlines about a company and return one word only along with the company name:
- "good" if the overall sentiment is positive for the company
- "bad" if the headlines indicate issues, scandals, losses, etc.
- "neutral" if the news is mixed or non-impactful

Only return one of: "good", "bad", or "neutral"
"""


In [36]:
get_google_news_rss("HDFCBANK")

['HDFC Bank share price hits record high. Is HDB Financial IPO the reason? - Mint',
 'HDFC Bank Dividend: Record date announced for shareholder eligibility; buy before deadline to get paid - Moneycontrol',
 'Another HC judges recuses from hearing HDFC Bank CEO’s plea against bribery charge - The Indian Express',
 'PhonePe, HDFC Bank launch co-branded RuPay credit card with UPI integration - CNBC TV18',
 'Second FD rate cut in June: HDFC Bank revises interest rate on this tenure - The Economic Times']

In [None]:
import pandas as pd
import time
import random
def sentiment_anal(portfolio):
    df = pd.read_csv(portfolio)
    tickers = df['ticker'].tolist() 

    sentiments = {}

    for ticker in tickers:
        headlines = get_google_news_rss(ticker)  # fetch for this ticker only
        if not headlines:
            print(f"⚠️ No headlines found for {ticker}. Skipping.")
            continue

        headline_string = "\n".join(headlines)
    
        prompt3 = ChatPromptTemplate.from_messages([
            SystemMessagePromptTemplate.from_template(sentiment_system_prompt),
            HumanMessagePromptTemplate.from_template("{input}")
        ])

        chain = prompt3 | model | StrOutputParser()
        sentiment = chain.invoke({"input": f"Analyze the sentiment of the following headlines:\n\n{headline_string}"})
    
        sentiments[ticker] = sentiment.strip().lower()
        time.sleep(random.uniform(3, 6))  # polite scraping

    # Flag stocks for review
    print("\n===  Sentiment Analysis Summary ===\n")
    for ticker in tickers:
        sentiment = sentiments.get(ticker, "unknown")
        print(f"{ticker}: {sentiment.capitalize()}")

        if sentiment in ["bad", "negative", "bearish"]:
            print("⚠️ Action: Review this company\n")
        elif sentiment == "unknown":
            print("❓ Action: Sentiment not available\n")
        else:
            print("✅ All good\n")



===  Sentiment Analysis Summary ===

LTIM: Ltim - good
✅ All good



PART 3- ChatBot

In [7]:
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores  import Chroma
from langchain_community.llms import ollama
import requests
from bs4 import BeautifulSoup
import os
import re
from langchain.embeddings import HuggingFaceEmbeddings

In [8]:
import requests
from bs4 import BeautifulSoup
import os
import re
from datetime import datetime

def download_latest_research_report(company_ticker, save_dir="research_reports"):
    headers = {"User-Agent": "Mozilla/5.0"}
    url = f"https://ticker.finology.in/company/{company_ticker.upper()}"

    try:
        html = requests.get(url, headers=headers).text
    except Exception:
        return None

    soup = BeautifulSoup(html, "html.parser")

    try:
        title_text = soup.title.text
        company_name = title_text.split("|")[0].strip()
    except:
        company_name = company_ticker.upper()

    os.makedirs(save_dir, exist_ok=True)

    report_list = soup.select("ul.reportsli li")
    latest_date = None
    latest_link = None
    latest_source = None

    for item in report_list:
        badge = item.select_one(".badge-research")
        if badge:
            anchor = item.find("a", href=True)
            date_text = item.select_one("small.text-grey")
            if not anchor or not date_text:
                continue

            try:
                date = datetime.strptime(date_text.text.strip(), "%d %b %Y")
            except:
                continue

            if (latest_date is None) or (date > latest_date):
                latest_date = date
                latest_link = anchor['href']
                latest_source = anchor.text.strip().replace("Report By:", "").replace("Report by:", "").strip()

    if not latest_link:
        return None

    safe_name = re.sub(r'\W+', '_', company_name)
    source_tag = re.sub(r'\W+', '_', latest_source)
    filename = f"{safe_name}_Research_{latest_date.strftime('%Y')}_{source_tag}.pdf"
    filepath = os.path.join(save_dir, filename)

    try:
        response = requests.get(latest_link, timeout=15)
        if "application/pdf" in response.headers.get("Content-Type", "").lower():
            with open(filepath, "wb") as f:
                f.write(response.content)
            return filename
        else:
            return None
    except:
        return None


In [59]:
filename=download_latest_annual_report("HDFCBANK", save_dir="reports")

In [60]:
print(filename)

HDFC_Bank_Ltd_Share_Price_Today_Market_Cap_Price_Chart_Balance_Sheet_Annual_Report_2024.pdf


In [9]:
import os
import re
import pdfplumber

def extract_text_from_pdf(pdf_path, save_txt=False, txt_output_dir="txt_outputs"):
    if not os.path.exists(pdf_path):
        print(f"[!] File not found: {pdf_path}")
        return None

    try:
        text = ""
        with pdfplumber.open(pdf_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                if page_text:
                    text += page_text + "\n"

        text = text.strip()

        if save_txt:
            os.makedirs(txt_output_dir, exist_ok=True)
            raw_name = os.path.basename(pdf_path).replace(".pdf", "")
            safe_name = re.sub(r"\W+", "_", raw_name)
            txt_filename = f"{safe_name}.txt"
            txt_path = os.path.join(txt_output_dir, txt_filename)
            with open(txt_path, "w", encoding="utf-8") as out:
                out.write(text)
            print(f"✅ Saved extracted text to: {txt_path}")
            return txt_filename

        return text

    except Exception as e:
        print(f"[!] Failed to extract from {pdf_path}: {e}")
        return None


In [63]:
pdf_path = os.path.join("reports", filename)
txt_file = extract_text_from_pdf(pdf_path, save_txt=True)

✅ Saved extracted text to: txt_outputs\HDFC_Bank_Ltd_Share_Price_Today_Market_Cap_Price_Chart_Balance_Sheet_Annual_Report_2024.txt


In [10]:
from langchain.prompts import (
    ChatPromptTemplate,
    SystemMessagePromptTemplate,
    HumanMessagePromptTemplate
)
from langchain.schema.output_parser import StrOutputParser


In [12]:
from langchain_community.embeddings import OllamaEmbeddings




In [11]:
import os
import re
import time
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain_community.llms import Ollama
from langchain_core.output_parsers import StrOutputParser

from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_community.embeddings import OllamaEmbeddings



In [13]:
import os
import re
import time
from langchain.prompts import ChatPromptTemplate, SystemMessagePromptTemplate, HumanMessagePromptTemplate
from langchain_community.llms import Ollama
from langchain_community.document_loaders import TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_community.embeddings import OllamaEmbeddings
from langchain_core.output_parsers import StrOutputParser


def RAG(query, force=False):
    start = time.time()
    print("⏱ RAG pipeline started...")

    # 1. Extract company name
    system_prompt = """
    You are an assistant that extracts company names or stock tickers from user questions.
    Return only the ticker/company name, don't explain anything.
    """
    prompt = ChatPromptTemplate.from_messages([
        SystemMessagePromptTemplate.from_template(system_prompt),
        HumanMessagePromptTemplate.from_template("{query}")
    ])

    llm = Ollama(model="llama3.2")
    chain = prompt | llm | StrOutputParser()
    company_name = chain.invoke({"query": query}).strip()
    company_name = re.sub(r"\.(NS|BO|NSE|BSE)$", "", company_name.strip(), flags=re.IGNORECASE)
    company_ticker = re.sub(r'\W+', '', company_name).upper()
    print(f"🔍 Extracted company: {company_name} → Ticker: {company_ticker} [{time.time() - start:.2f}s]")

    # 2. Vector DB path
    db_dir = os.path.abspath(f"vector_db/{company_ticker}")
    embeddings = OllamaEmbeddings(model="nomic-embed-text")

    if os.path.exists(db_dir) and len(os.listdir(db_dir)) > 0 and not force:
        print(f"📦 Using cached vector DB at {db_dir}")
        db = Chroma(persist_directory=db_dir, embedding_function=embeddings)

    else:
        print(f"📄 Rebuilding vector DB for {company_name}...")

        # Download and extract
        filename = download_latest_research_report(company_ticker, save_dir="reports")
        if not filename:
            return f"[!] Couldn't get report for '{company_name}'"

        pdf_path = os.path.join("reports", filename)
        txt_file = extract_text_from_pdf(pdf_path, save_txt=True)

        if not txt_file:
            return f"[!] PDF extraction failed: {filename}"

        txt_file = os.path.basename(txt_file).strip()
        if not txt_file.endswith(".txt"):
            txt_file += ".txt"

        txt_path = os.path.abspath(os.path.join("txt_outputs", txt_file))
        if not os.path.isfile(txt_path):
            return f"[!] Text file missing at {txt_path}"

        # Chunking
        docs = TextLoader(txt_path, encoding="utf-8").load()
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        chunks = splitter.split_documents(docs)

        # Embedding and DB creation
        db = Chroma.from_documents(chunks, embeddings, persist_directory=db_dir)
        db.persist()
        print(f"✅ Vector DB saved at {db_dir}")

    # 3. Retrieval + DEBUG: show top 3 chunks
    retriever = db.as_retriever(search_kwargs={"k": 3})
    top_docs = retriever.get_relevant_documents(query)
    print("\n🔎 Top Retrieved Chunks:")
    for i, doc in enumerate(top_docs):
        print(f"\n--- Chunk {i+1} ---\n{doc.page_content[:800]}\n")

    # 4. QA
    qa_chain = RetrievalQA.from_chain_type(llm=llm, retriever=retriever)
    response = qa_chain.run(query)

    print(f"✅ Done in {time.time() - start:.2f}s")
    return response


In [15]:
query="What is the PAT ratio for HDFCBANK this year?"
print(RAG(query))

⏱ RAG pipeline started...


  db = Chroma(persist_directory=db_dir, embedding_function=embeddings)
Failed to send telemetry event ClientStartEvent: capture() takes 1 positional argument but 3 were given
Failed to send telemetry event ClientCreateCollectionEvent: capture() takes 1 positional argument but 3 were given


🔍 Extracted company: HDFCBANK → Ticker: HDFCBANK [4.70s]
📦 Using cached vector DB at c:\Users\Sagnik giri\Downloads\projects\LangChainproj\vector_db\HDFCBANK

🔎 Top Retrieved Chunks:

--- Chunk 1 ---
RESULT REVIEW
BUY
TP: Rs 2,213 |  16%
HDFC BANK | Banking | 21 April 2025
Asset quality remains resilient; eyes set on growth
Niraj Jalan | Vijiya Rao
▪ CD ratio continued to moderate with slowdown in credit growth, while
research@bobcaps.in
deposit growth higher than system growth
▪ Asset quality remains pristine supported by lower slippages; credit
cost stays stable at 40–50bps
▪ Maintain BUY. Raise SOTP-based TP to Rs 2,213 (from Rs 2,008), set at
2.5x FY27E ABV
CD ratio continues to improve: CD ratio moderated to 96.5% in Q4FY25 (98.2% in Key changes
Q3FY25) vs a high of 110.5% in Q3FY24. The moderation was driven by slowdown Target Rating
in credit growth to 5.4% YoY vs deposit growth of 14.1% YoY higher than system  
growth (~11%) in Q4FY25. HDFCB plans to reduce its CD ratio t

In [15]:
import sys
print(sys.executable)


C:\Users\Sagnik giri\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe


In [3]:
import os

In [4]:
import shutil

def delete_vector_db(company_name):
    db_path = os.path.abspath(f"vector_db/{company_name.replace(' ', '_')}")
    if os.path.exists(db_path):
        shutil.rmtree(db_path)
        print(f"🧹 Deleted vector DB at: {db_path}")
    else:
        print(f"[!] No vector DB found at: {db_path}")


In [6]:
delete_vector_db("HDFC_BANK")

🧹 Deleted vector DB at: c:\Users\Sagnik giri\Downloads\projects\LangChainproj\vector_db\HDFC_BANK
