<a href="https://colab.research.google.com/github/rkuo2000/AI-stocks/blob/main/ch07_annual_reporter_gemini.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CH-07 年報問答機器人

## 7-2 取得年報 (需要開T4 GPU)

### 1️⃣  匯入套件

In [1]:
import requests
from bs4 import BeautifulSoup

### 2️⃣ 建立函式-取得年報

In [2]:
def annual_report(id, y, timeout=30):
  """Download a company's annual report PDF from doc.twse.com.tw.

  Three HTTP requests are made in sequence: a step-1 POST that looks up the
  report file name, a step-9 POST that returns the PDF link, and a GET that
  saves the PDF as ``<y>_<id>.pdf`` in the current working directory. The
  file name and the link are printed as they are found, and 'OK' is printed
  once the file has been written.

  Args:
    id: Company id, sent as the ``co_id`` form field (e.g. '2330').
    y: Year, sent as the ``year`` form field (e.g. '113'); it is also the
      prefix of the output file name.
    timeout: Seconds to wait for each HTTP request before giving up.

  Returns:
    None. Any failure is reported with print() and aborts the remaining
    steps, because each step needs the result of the previous one.
  """
  url = 'https://doc.twse.com.tw/server-java/t57sb01'
  # Form for the first POST request (step 1: look up the report file name)
  data = {
      "id":"",
      "key":"",
      "step":"1",
      "co_id":id,
      "year":y,
      "seamon":"",
      "mtype":'F',
      "dtype":'F04'
  }
  try:
    response = requests.post(url, data=data, timeout=timeout)
    response.raise_for_status()
    # The file name is the text of the first anchor in the response
    anchor = BeautifulSoup(response.text, 'html.parser').find('a')
    if anchor is None:
      raise ValueError('no report link found in the step-1 response')
    link1 = anchor.text
    print(link1)
  except Exception as e:
    print(f"發生{e}錯誤")
    return  # without a file name the second request cannot be built

  # Form for the second POST request (step 9: resolve the PDF link)
  data2 = {
      'step':'9',
      'kind':'F',
      'co_id':id,
      'filename':link1 # file name obtained in step 1
  }
  try:
    response = requests.post(url, data=data2, timeout=timeout)
    response.raise_for_status()
    anchor = BeautifulSoup(response.text, 'html.parser').find('a')
    if anchor is None:
      raise ValueError('no PDF link found in the step-9 response')
    # The PDF path is the href of the first anchor
    link2 = anchor.get('href')
    if not link2:
      raise ValueError('PDF anchor has no href')
    print(link2)
  except Exception as e:
    print(f"發生{e}錯誤")
    return  # without a link there is nothing to download

  # GET the PDF itself
  try:
    response = requests.get('https://doc.twse.com.tw' + link2, timeout=timeout)
    # Do not save an HTTP error page under a .pdf name
    response.raise_for_status()
    # f-string so that non-string id / y values also work
    with open(f"{y}_{id}.pdf", 'wb') as file:
        file.write(response.content)
    print('OK')
  except Exception as e:
    print(f"發生{e}錯誤")

### 3️⃣ 呼叫函式

In [3]:
# Download the annual report for company id '2330' with year argument '113';
# the function saves it as 113_2330.pdf in the current working directory.
annual_report('2330','113')

2023_2330_20240604F04.pdf
/pdf/2023_2330_20240604F04_20251019_061028.pdf
OK


## 7-3 年報問答

langchain 已更新至新版寫法，舊版可參考：
https://colab.research.google.com/drive/16x0mUitJjH0PZx7kk2Kew2MSlkB9G_lA

### 4️⃣ 安裝相關套件

In [4]:
!pip install langchain langchain-text-splitters langchain-community langchain-chroma

Collecting langchain-core<1.0.0,>=0.3.72 (from langchain)
  Using cached langchain_core-0.3.79-py3-none-any.whl.metadata (3.2 kB)
INFO: pip is looking at multiple versions of langchain-chroma to determine which version is compatible with other requirements. This could take a while.
Collecting langchain-chroma
  Downloading langchain_chroma-0.2.6-py3-none-any.whl.metadata (1.1 kB)
Downloading langchain_chroma-0.2.6-py3-none-any.whl (12 kB)
Using cached langchain_core-0.3.79-py3-none-any.whl (449 kB)
Installing collected packages: langchain-core, langchain-chroma
  Attempting uninstall: langchain-core
    Found existing installation: langchain-core 1.0.0
    Uninstalling langchain-core-1.0.0:
      Successfully uninstalled langchain-core-1.0.0
  Attempting uninstall: langchain-chroma
    Found existing installation: langchain-chroma 1.0.0
    Uninstalling langchain-chroma-1.0.0:
      Successfully uninstalled langchain-chroma-1.0.0
[31mERROR: pip's dependency resolver does not currently

In [5]:
# Upgrade langchain (with its "openai" extra) to the latest release.
!pip install -U "langchain[openai]"

Collecting langchain[openai]
  Downloading langchain-1.0.0-py3-none-any.whl.metadata (4.6 kB)
Collecting langchain-core<2.0.0,>=1.0.0 (from langchain[openai])
  Using cached langchain_core-1.0.0-py3-none-any.whl.metadata (3.4 kB)
Collecting langgraph<1.1.0,>=1.0.0 (from langchain[openai])
  Downloading langgraph-1.0.0-py3-none-any.whl.metadata (7.4 kB)
Collecting langchain-openai (from langchain[openai])
  Downloading langchain_openai-1.0.0-py3-none-any.whl.metadata (1.8 kB)
Collecting langgraph-checkpoint<3.0.0,>=2.1.0 (from langgraph<1.1.0,>=1.0.0->langchain[openai])
  Downloading langgraph_checkpoint-2.1.2-py3-none-any.whl.metadata (4.2 kB)
Collecting langgraph-prebuilt<1.1.0,>=1.0.0 (from langgraph<1.1.0,>=1.0.0->langchain[openai])
  Downloading langgraph_prebuilt-1.0.0-py3-none-any.whl.metadata (5.0 kB)
Collecting langgraph-sdk<0.3.0,>=0.2.2 (from langgraph<1.1.0,>=1.0.0->langchain[openai])
  Downloading langgraph_sdk-0.2.9-py3-none-any.whl.metadata (1.5 kB)
Collecting ormsgpack>=

In [6]:
# Upgrade langchain-core to the latest release.
!pip install -U "langchain-core"



In [7]:
# Install langchain-huggingface; in this notebook it is only referenced by the
# optional HuggingFace embeddings alternative, which is commented out.
!pip install langchain-huggingface



###  5️⃣ 匯入相關套件

In [8]:
# Read the Gemini API key from the Colab secret named 'GEMINI_API_KEY'
# so that the key never appears in the notebook itself.
from google.colab import userdata
api_key = userdata.get('GEMINI_API_KEY')

### 6️⃣ 設定環境變數和建立 Google Gemini 模型

In [10]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Chat model shared by every chain in this notebook (Q&A, summary,
# keyword extraction and analysis). The API key is passed explicitly.
llm_model = ChatGoogleGenerativeAI(
    api_key = api_key,
    model="gemini-2.5-flash",
    temperature=0,
    max_tokens=None,
    timeout=None,
    max_retries=2,
    # other params...
)

### 7️⃣ 建立函式-建立向量資料庫

In [11]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Pass the API key explicitly, as the other Gemini clients in this notebook do;
# no environment variable holding the key is set anywhere in the notebook.
embeddings = GoogleGenerativeAIEmbeddings(google_api_key=api_key, model="gemini-embedding-001")

In [None]:
#from langchain_huggingface.embeddings import HuggingFaceEmbeddings

#model_id = "BAAI/bge-base-en-v1.5"
#embeddings = HuggingFaceEmbeddings(model_name=model_id,encode_kwargs={"normalize_embeddings": True})

#model_id = "sentence-transformers/all-MiniLM-l6-v2"
#embeddings = HuggingFaceEmbeddings(model_name=model_id)

### Load PDF

In [12]:
# Install pypdf (presumably the PDF parser behind PyPDFLoader below — confirm).
!pip install pypdf



In [13]:
from langchain_community.document_loaders import PyPDFLoader

# Open the PDF written by annual_report('2330','113').
# NOTE(review): assumes the notebook's working directory is /content — confirm.
loader = PyPDFLoader("/content/113_2330.pdf")

# Load the PDF and split it into a list of Document objects
pages = loader.load_and_split()


In [15]:
# Number of Document objects produced by the loader
print(len(pages))

# Show the first Document (metadata and page_content) as the cell output
pages[0]

350


Document(metadata={'producer': 'Adobe Acrobat Pro DC (32-bit) 21.1.20138', 'creator': 'Adobe Acrobat Pro DC (32-bit) 21.1.20138', 'creationdate': '2024-04-18T20:10:41+08:00', 'moddate': '2024-05-15T16:42:52+08:00', 'title': '', 'trapped': '/False', 'source': '/content/113_2330.pdf', 'total_pages': 345, 'page': 0, 'page_label': '1'}, page_content='Fab 18\nAP2\nFab 23\nFab 21\nFab 5\nFab 16\nFab 3\nFab 8\nFab 12\nFab 2\nFab 15\nAP6\nGlobal R&D Center\nFab 11\n300-096 新竹科學園區力行六路8號  | 電話：+886-3-5636688  | 傳真：+886-3-5637000  | https://www.tsmc.com  \n本報告書採用再生紙印製\n董事長 劉 德 音台灣積體電路製造股份有限公司\n台灣證券交易所公開資訊觀測站網址：https://mops.twse.com.tw\n台積公司年報網址：https://investor.tsmc.com/chinese/annual-reports\n台\n灣\n積\n體\n電\n路\n製\n造\n股\n份\n有\n限\n公\n司\n                                                                                        \n民\n國\n一\n百\n一\n十\n二\n年\n度\n年\n報\n︵\n一\n︶\n台灣積體電路製造股份有限公司 \n民國一百一十二年度年報 （一)\n刊印時間  民國一百一十三年三月十二日 股票代號：2330 | NYSE: TSM\n54.$@ϋజ\x13\x11\x13\x14@.&5"--*$@$)*/&4&\x0fJOEE\x01\x01\

In [17]:
# Split the document into chunks

from langchain_text_splitters import CharacterTextSplitter

# Splitter that breaks text on newlines, configured with chunk_size=500 and
# chunk_overlap=100 so that neighbouring chunks share some context.
text_splitter = CharacterTextSplitter(
    separator="\n",
    chunk_size=500,
    chunk_overlap=100
)

# Re-split the loaded pages into smaller Document chunks for embedding
chunks = text_splitter.split_documents(pages)

# Sanity checks: how many chunks were produced ...
print(len(chunks))

# ... and what type each chunk is
print(type(chunks[0]))

1545
<class 'langchain_core.documents.base.Document'>


### Indexing

In [19]:
# Creating Chunks Embedding

from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Use the gemini-embedding-001 model (the same one as the `embeddings` object
# above). The recorded run with "models/embedding-001" failed in the next cell
# with a 429 error reporting a free-tier quota of "limit: 0" for that model.
embedding_model = GoogleGenerativeAIEmbeddings(google_api_key=api_key, model="models/gemini-embedding-001")

### Vector DB

In [20]:
# Store the chunks in a persistent Chroma vector store
from langchain_community.vectorstores import Chroma

# Deterministic ids keep this cell idempotent: re-running it overwrites the
# same entries instead of appending duplicate copies of every chunk to the
# collection stored in ./chroma_db_.
chunk_ids = [f"chunk-{i}" for i in range(len(chunks))]

# Embed each chunk and load it into the vector store.
# No explicit db.persist() call: with Chroma 0.4+ the data is written to
# persist_directory automatically and the manual method is deprecated.
db = Chroma.from_documents(
    chunks,
    embedding_model,
    ids=chunk_ids,
    persist_directory="./chroma_db_",
)

GoogleGenerativeAIError: Error embedding content: 429 You exceeded your current quota, please check your plan and billing details. For more information on this error, head to: https://ai.google.dev/gemini-api/docs/rate-limits.
* Quota exceeded for metric: generativelanguage.googleapis.com/embed_content_free_tier_requests, limit: 0
* Quota exceeded for metric: generativelanguage.googleapis.com/embed_content_free_tier_requests, limit: 0
* Quota exceeded for metric: generativelanguage.googleapis.com/embed_content_free_tier_requests, limit: 0
* Quota exceeded for metric: generativelanguage.googleapis.com/embed_content_free_tier_requests, limit: 0 [violations {
  quota_metric: "generativelanguage.googleapis.com/embed_content_free_tier_requests"
  quota_id: "EmbedContentRequestsPerMinutePerProjectPerModel-FreeTier"
}
violations {
  quota_metric: "generativelanguage.googleapis.com/embed_content_free_tier_requests"
  quota_id: "EmbedContentRequestsPerMinutePerUserPerProjectPerModel-FreeTier"
}
violations {
  quota_metric: "generativelanguage.googleapis.com/embed_content_free_tier_requests"
  quota_id: "EmbedContentRequestsPerDayPerUserPerProjectPerModel-FreeTier"
}
violations {
  quota_metric: "generativelanguage.googleapis.com/embed_content_free_tier_requests"
  quota_id: "EmbedContentRequestsPerDayPerProjectPerModel-FreeTier"
}
, links {
  description: "Learn more about Gemini API quotas"
  url: "https://ai.google.dev/gemini-api/docs/rate-limits"
}
]

### 9️⃣ 查詢相關資料

In [None]:
from langchain_core.messages import SystemMessage
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate

### 🔟  匯入問答相關套件

### 1️⃣1️⃣  建立函式-問答程式

In [None]:
# Prompt template for question answering. It has its own name because a later
# cell rebinds `prompt` to the summary template; using `prompt` inside the
# function would make Q&A break when this notebook's cells are re-run.
qa_prompt = ChatPromptTemplate.from_messages([
    ("system",
     "你是一個根據年報資料與上下文作回答的助手,"
     "如果有明確數據或技術(產品)名稱可以用數據或名稱回答,"
     "回答以繁體中文為主。"
     "{context}"),
    ("human","{question}")])
# Backward-compatible alias for code that still refers to `prompt` here.
prompt = qa_prompt


def question_and_answer(question):
    """Answer a question from the annual-report chunks stored in `db`.

    The 5 most relevant chunks are retrieved, their text is joined into the
    prompt's {context} slot, and the chat model produces the answer. Only
    langchain_core building blocks are used, so no legacy chain class
    (the original referenced `RetrievalQA` without importing it) is needed.

    Args:
        question: The user's question as a string.

    Returns:
        dict with keys 'query' (the question), 'result' (the answer text)
        and 'source_documents' (the retrieved Document list).
    """
    # Imported locally so this cell does not depend on a later import cell.
    from langchain_core.output_parsers import StrOutputParser

    retriever = db.as_retriever(search_kwargs={'k':5})
    source_documents = retriever.invoke(question)

    # Feed only the chunk text to the model, not the PDF metadata
    context = "\n\n".join(doc.page_content for doc in source_documents)

    qa_chain = qa_prompt | llm_model | StrOutputParser()
    answer_text = qa_chain.invoke({"context": context, "question": question})

    return {
        "query": question,
        "result": answer_text,
        "source_documents": source_documents,
    }

### 1️⃣2️⃣ 建立迴圈進行問答

In [None]:
# Keep asking until the user submits an empty (or whitespace-only) question
while (question := input("輸入問題:")).strip():
    result = question_and_answer(question)
    print(result['result'])
    print('_________')
    # print(result["source_documents"])  # uncomment to inspect the retrieved chunks

## 7-4 年報總結與分析

### 1️⃣3️⃣ 回答結果及原始資料

In [None]:
from langchain.chains.summarize import load_summarize_chain

### 1️⃣4️⃣ 總結原始資料

In [None]:
# Topics to look up in the annual report
key_word = [
    '正在開發的產品及銷售狀況',
    '市場策略的調整或變化',
    '公司預期未來展望',
    '總營收、稅前淨利的成長或變動分析',
    '國際競爭以及海外市場銷售情形',
]

# Flatten the top-3 similarity matches of every topic into one Document list
data_list = [
    document
    for topic in key_word
    for document in db.similarity_search(topic, k=3)
]

# Single system message for the summary; {text} receives the report content
summary_message = (
    "system",
    "你的任務是對年報資訊進行摘要總結。"
    "以下為提供的年報資訊：{text},"
    "請給我重點數據, 如銷售增長情形、營收變化、開發項目等,"
    "最後請使用繁體中文輸出報告",
)
prompt_template = [summary_message]
prompt = ChatPromptTemplate.from_messages(messages=prompt_template)

### 1️⃣5️⃣  呼叫函式

In [None]:
# Imported here so the cell does not rely on a later import cell.
from langchain_core.output_parsers import StrOutputParser

# "Stuff"-style summarisation built from langchain_core pieces only: the text
# of all retrieved documents is joined and placed in the prompt's {text} slot.
# This avoids the legacy load_summarize_chain helper, which is not shipped in
# the langchain 1.x release installed earlier in this notebook.
# (The name `refine_chain` is kept for compatibility; the strategy is "stuff".)
refine_chain = prompt | llm_model | StrOutputParser()
summary_text = refine_chain.invoke(
    {"text": "\n\n".join(doc.page_content for doc in data_list)})

# Same result shape as before: the input documents plus the 'output_text'
summary_refine = {"input_documents": data_list, "output_text": summary_text}
print(summary_refine['output_text'])

### 1️⃣6️⃣  提取關鍵字
使用 MMR 搜尋方法

In [None]:
from langchain_core.output_parsers import CommaSeparatedListOutputParser, StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnableSequence

# Template asking the model for 4 comma-separated keywords related to {input}
word_template = (
    "從{input}聯想出4個與年報分析有關的重要關鍵字,"
    "請確保回答具有具有關聯性、多樣性和變化性。 \n "
    "僅回覆關鍵字, 並以半形逗號與空格來分隔。不要加入其他內容"
)
word_prompt = PromptTemplate.from_template(word_template)

# prompt -> chat model -> parse the comma-separated reply into a Python list
word_chain = word_prompt | llm_model | CommaSeparatedListOutputParser()

# Quick check of the chain with a sample question
sample_keywords = word_chain.invoke({"input": "公司的營運狀況如何?"})
print(sample_keywords)

### 1️⃣7️⃣ 設定 AI 角色讓其分析報告

In [None]:
# Template that casts the model as a stock analyst; it receives the keywords
# to focus on ({key_words}) and the report excerpts ({data_content}).
analyst_template = (
    "你現在是一位專業的股票分析師,"
    "會以詳細、嚴謹的角度針對 {key_words} 進行年報分析,"
    "請提及關於營收、是否成長以及利潤等重要數字,"
    "最後生成一份專業的趨勢分析報告。"
    "以下為年報資料：{data_content}"
)
data_prompt = PromptTemplate.from_template(analyst_template)

# prompt -> chat model -> plain string report
data_chain = data_prompt | llm_model | StrOutputParser()

### 1️⃣8️⃣ 整合函式

In [None]:
def analyze_chain(input):
    """Generate an analysis report for a question about the annual report.

    Steps: retrieve chunks for the question with MMR search, ask `word_chain`
    for related keywords, retrieve chunks for each keyword, then ask
    `data_chain` to write the report from the keywords and the chunk text.

    Args:
        input: The question to analyse (string). The parameter name is kept
            for compatibility even though it shadows the builtin locally.

    Returns:
        The report text produced by `data_chain`.
    """
    # Chunks relevant to the question itself
    documents = db.max_marginal_relevance_search(input, fetch_k=5, k=2)

    # First chain: build the keyword list
    word_list = word_chain.invoke({"input": input})

    # Chunks relevant to each keyword
    for word in word_list:
        documents += db.max_marginal_relevance_search(word, fetch_k=5, k=2)

    # Keep only the chunk text (not the PDF metadata) and drop chunks that
    # were retrieved more than once, preserving the retrieval order.
    unique_texts = list(dict.fromkeys(doc.page_content for doc in documents))

    # The question is analysed alongside the keywords; build a new list
    # instead of mutating the one returned by the chain.
    key_words = list(word_list) + [input]

    # Second chain: generate the analysis report from plain-text inputs
    result = data_chain.invoke({
        'key_words': ', '.join(key_words),
        'data_content': '\n\n'.join(unique_texts),
    })

    return result

### 1️⃣9️⃣ 呼叫函式

In [None]:
# Named `analysis_question` rather than `input`: rebinding the global `input`
# would shadow the builtin and break the input() call in the Q&A loop if that
# cell is run again afterwards.
analysis_question = '公司的營收狀況如何？'
analyze_report = analyze_chain(analysis_question)
print(analyze_report)