## 爬取 Yahoo 新聞


In [None]:
import requests
from bs4 import BeautifulSoup


def fetch_yahoo_news(query):
    """Search Yahoo Taiwan News for ``query`` and collect the result headlines.

    Args:
        query: Search keywords; sent as the ``p`` query-string parameter.

    Returns:
        A list of ``{'title': ..., 'link': ...}`` dicts, one per result item
        that has a title anchor with an ``href``, in page order. The list is
        empty when nothing on the page matches the selectors.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    # Let requests build the query string so that characters such as '&',
    # '#' or spaces in the keywords are percent-encoded instead of
    # corrupting the URL.
    response = requests.get(
        "https://tw.news.yahoo.com/search",
        params={"p": query},
        timeout=10,  # never hang forever on an unresponsive server
    )
    # Fail loudly on an error page instead of parsing it as "no results".
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    articles = []

    # Each search hit is a list item whose class contains "StreamMegaItem";
    # the headline anchor sits inside its <h3>.
    for item in soup.select('li[class*="StreamMegaItem"]'):
        title_element = item.select_one('h3[class*="Mb"] a[class*="Fw"]')
        if title_element is None:
            continue
        link = title_element.get('href')
        if link is None:
            # An anchor without href cannot be linked to; skip it rather
            # than raising KeyError.
            continue
        articles.append({'title': title_element.get_text(), 'link': link})

    return articles


# Fetch the headlines for the search phrase and print one title per line.
news_articles = fetch_yahoo_news("中華職棒中信兄弟")
for headline in (entry['title'] for entry in news_articles):
    print(headline)

## 完整代碼


In [39]:
import requests
from bs4 import BeautifulSoup
import os
import certifi
from pymongo import MongoClient
import streamlit as st
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from langchain.prompts import PromptTemplate
import pprint
from langchain.docstore.document import Document

# Read the OpenAI API key and the MongoDB connection string from Streamlit
# secrets; the API key is exported through the process environment.
os.environ["OPENAI_API_KEY"] = st.secrets["OPENAI_API_KEY"]
ATLAS_CONNECTION_STRING = st.secrets["MONGODB_URL"]

# Names of the database, the collection and the vector search index.
db_name = "MyDatabase2024"
collection_name = "MyCollection2024"
vector_search_index = "vector_index"

# MongoDB client; certifi's CA bundle path is passed as the TLS CA file.
client = MongoClient(ATLAS_CONNECTION_STRING, tlsCAFile=certifi.where())
# Handle to the collection used for both storage and vector search.
database = client[db_name]
atlas_collection = database[collection_name]

# Scrape Yahoo News
def fetch_yahoo_news(query):
    """Search Yahoo Taiwan News for ``query`` and collect the result headlines.

    Args:
        query: Search keywords; sent as the ``p`` query-string parameter.

    Returns:
        A list of ``{'title': ..., 'link': ...}`` dicts, one per result item
        that has a title anchor with an ``href``, in page order. The list is
        empty when nothing on the page matches the selectors.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            non-success HTTP status.
    """
    # Let requests build the query string so that characters such as '&',
    # '#' or spaces in the keywords are percent-encoded instead of
    # corrupting the URL.
    response = requests.get(
        "https://tw.news.yahoo.com/search",
        params={"p": query},
        timeout=10,  # never hang forever on an unresponsive server
    )
    # Fail loudly on an error page instead of parsing it as "no results".
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    articles = []

    # Each search hit is a list item whose class contains "StreamMegaItem";
    # the headline anchor sits inside its <h3>.
    for item in soup.select('li[class*="StreamMegaItem"]'):
        title_element = item.select_one('h3[class*="Mb"] a[class*="Fw"]')
        if title_element is None:
            continue
        link = title_element.get('href')
        if link is None:
            # An anchor without href cannot be linked to; skip it rather
            # than raising KeyError.
            continue
        articles.append({'title': title_element.get_text(), 'link': link})

    return articles

# Scrape news headlines, embed them and store them in MongoDB
def process_news_and_store_embeddings(query):
    """Fetch Yahoo news headlines for ``query`` and insert them into MongoDB.

    Every headline is run through ``OpenAIEmbeddings``; a headline for which
    no (or an empty) vector comes back is reported and skipped. The
    remaining headlines are inserted into ``atlas_collection``.

    NOTE: the inserted records contain only ``page_content`` and
    ``metadata``; the vectors computed here are not written to the
    collection.

    Args:
        query: Search keywords forwarded to ``fetch_yahoo_news``.

    Returns:
        None. Progress is reported with ``print``.
    """
    news_articles = fetch_yahoo_news(query)
    titles = [article["title"] for article in news_articles]

    embedding_model = OpenAIEmbeddings(disallowed_special=())
    # Embed all headlines in a single call instead of one request per
    # headline; skip the call entirely when nothing was scraped.
    vectors = list(embedding_model.embed_documents(titles)) if titles else []
    # Pad a short response so that every headline lacking a vector is still
    # reported below rather than silently dropped by zip().
    vectors.extend([None] * (len(titles) - len(vectors)))

    news_documents = []
    for title, vector in zip(titles, vectors):
        if vector:
            news_documents.append(
                Document(page_content=title, metadata={"title": title})
            )
        else:
            print(f"對文章生成 Embedding 時出錯：{title}")

    # Store to MongoDB
    if news_documents:
        atlas_collection.insert_many(
            [{"page_content": doc.page_content, "metadata": doc.metadata} for doc in news_documents]
        )
        print("嵌入向量已儲存到 MongoDB Atlas。")
    else:
        print("未能生成嵌入向量。")

# Run the scrape-and-store step for the search phrase.
query = "中信兄弟"
process_news_and_store_embeddings(query)

# Reload every stored record that has a "page_content" field as a LangChain
# Document with empty metadata.
documents = [
    Document(page_content=record["page_content"], metadata={})
    for record in atlas_collection.find()
    if "page_content" in record
]

# Build the Atlas vector store from the reloaded documents.
# NOTE(review): from_documents presumably embeds `documents` and inserts them
# into the same collection they were just read from - confirm that repeated
# runs do not accumulate duplicates.
embedding_function = OpenAIEmbeddings(disallowed_special=())
vector_search = MongoDBAtlasVectorSearch.from_documents(
    documents=documents,
    embedding=embedding_function,
    collection=atlas_collection,
    index_name=vector_search_index,
)

# Similarity query: print the query text followed by the matched documents.
query_text = "中信兄弟近日戰績？"
results = vector_search.similarity_search(query_text)
print(f'\n相似度查詢：{query_text}')
pprint.pprint(results)

# Building blocks of the RAG (retrieval-augmented generation) chain.

# Retriever over the vector store.
# NOTE(review): "score_threshold" is passed together with
# search_type="similarity" - verify that the retriever honours the threshold
# for this search type.
retriever_options = {"k": 10, "score_threshold": 0.75}
retriever = vector_search.as_retriever(
    search_type="similarity",
    search_kwargs=retriever_options,
)

# Prompt: answer from the retrieved context only and admit when the answer is
# unknown. {context} and {question} are filled in by the chain.
template = """
使用以下內容來回答最後的問題。
如果你不知道答案，就說你不知道，不要試圖編造答案。
{context}
問題：{question}
"""
custom_rag_prompt = PromptTemplate.from_template(template)

# Chat model with default settings.
llm = ChatOpenAI()

def format_docs(docs):
    """Return the ``page_content`` of every document, separated by blank lines."""
    contents = [entry.page_content for entry in docs]
    return "\n\n".join(contents)

# RAG chain: the retriever output is flattened by format_docs into the prompt's
# {context}, the raw question is passed through as {question}, and the model
# reply is parsed into a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_rag_prompt
    | llm
    | StrOutputParser()
)

# Ask the question
question = "中信兄弟近日戰績？"
print(f'\n提問：{question}')
# Fetch the matching documents up front so they can be inspected while
# debugging (the chain performs its own retrieval). `invoke` replaces the
# deprecated `get_relevant_documents`.
documents = retriever.invoke(question)
# pprint.pprint(documents)

# The former isinstance() guard was dropped: "\n\n".join(...) always returns a
# str and `question` is a str literal, so its else-branch was unreachable.
try:
    answer = rag_chain.invoke(question)
except Exception as e:
    # Top-level boundary of the script: report the failure instead of
    # aborting with a traceback.
    print(f"進行 RAG chain invoke 時發生錯誤：{e}")
else:
    print("\n回答：" + answer)


嵌入向量已儲存到 MongoDB Atlas。

相似度查詢：中信兄弟近日戰績？
[Document(page_content='猛登7局失1分優質先發 助兄弟擊敗獅 (圖)', metadata={'_id': {'$oid': '66545c3104d5609d59da77d1'}}),
 Document(page_content='中職／兄弟慘敗…雄鷹18安16分猛攻 寫3大隊史紀錄', metadata={'_id': {'$oid': '66545c3104d5609d59da77d5'}}),
 Document(page_content='中職／單局遭「完全打擊」、潘威倫3.2局失7分敗 張志豪雙響率兄弟奪勝', metadata={'_id': {'$oid': '66545c3104d5609d59da77cb'}}),
 Document(page_content='中職》首位突破10萬票！ 中信兄弟王威晨暫居人氣王寶座', metadata={'_id': {'$oid': '66545c3104d5609d59da77d7'}})]

提問：中信兄弟近日戰績？

回答：我不知道中信兄弟近日的戰績。


## 拆解

In [38]:
import os
import pprint

import certifi
import streamlit as st
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_mongodb import MongoDBAtlasVectorSearch
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from pymongo import MongoClient

# Read the OpenAI API key and the MongoDB connection string from Streamlit
# secrets; the API key is exported through the process environment.
os.environ["OPENAI_API_KEY"] = st.secrets["OPENAI_API_KEY"]
ATLAS_CONNECTION_STRING = st.secrets["MONGODB_URL"]

# Names of the database, the collection and the vector search index.
db_name = "MyDatabase2024"
collection_name = "MyCollection2024"
vector_search_index = "vector_index"

# MongoDB client; certifi's CA bundle path is passed as the TLS CA file.
client = MongoClient(ATLAS_CONNECTION_STRING, tlsCAFile=certifi.where())
# Handle to the selected database and collection.
database = client[db_name]
atlas_collection = database[collection_name]

# Scrape news headlines, embed them and store them in MongoDB
def process_news_and_store_embeddings(query):
    """Fetch Yahoo news headlines for ``query`` and insert them into MongoDB.

    Every headline is run through ``OpenAIEmbeddings``; a headline for which
    no (or an empty) vector comes back is reported and skipped. The
    remaining headlines are inserted into ``atlas_collection``.

    NOTE: the inserted records contain only ``page_content`` and
    ``metadata``; the vectors computed here are not written to the
    collection.

    Args:
        query: Search keywords forwarded to ``fetch_yahoo_news``.

    Returns:
        None. Progress is reported with ``print``.
    """
    news_articles = fetch_yahoo_news(query)
    titles = [article["title"] for article in news_articles]

    embedding_model = OpenAIEmbeddings(disallowed_special=())
    # Embed all headlines in a single call instead of one request per
    # headline; skip the call entirely when nothing was scraped.
    vectors = list(embedding_model.embed_documents(titles)) if titles else []
    # Pad a short response so that every headline lacking a vector is still
    # reported below rather than silently dropped by zip().
    vectors.extend([None] * (len(titles) - len(vectors)))

    news_documents = []
    for title, vector in zip(titles, vectors):
        if vector:
            news_documents.append(
                Document(page_content=title, metadata={"title": title})
            )
        else:
            print(f"對文章生成 Embedding 時出錯：{title}")

    if news_documents:
        atlas_collection.insert_many(
            [{"page_content": doc.page_content, "metadata": doc.metadata} for doc in news_documents]
        )
        print("嵌入向量已儲存到 MongoDB Atlas。")
    else:
        print("未能生成嵌入向量。")


# Run the scrape-and-store step for the search phrase.
query = "中信兄弟"
process_news_and_store_embeddings(query)

# Reload every stored record that has a "page_content" field as a LangChain
# Document with empty metadata.
documents = [
    Document(page_content=record["page_content"], metadata={})
    for record in atlas_collection.find()
    if "page_content" in record
]

# Build the Atlas vector store from the reloaded documents.
# NOTE(review): from_documents presumably embeds `documents` and inserts them
# into the same collection they were just read from - confirm that repeated
# runs do not accumulate duplicates.
embedding_function = OpenAIEmbeddings(disallowed_special=())
vector_search = MongoDBAtlasVectorSearch.from_documents(
    documents=documents,
    embedding=embedding_function,
    collection=atlas_collection,
    index_name=vector_search_index,
)

# Similarity query: print the query text followed by the matched documents.
query_text = "中信兄弟最近的比賽表現如何？"
results = vector_search.similarity_search(query_text)
print(f'\n相似度查詢：{query_text}')
pprint.pprint(results)

# Retriever over the vector store.
# NOTE(review): "score_threshold" is passed together with
# search_type="similarity" - verify that the retriever honours the threshold
# for this search type.
retriever_options = {"k": 10, "score_threshold": 0.75}
retriever = vector_search.as_retriever(
    search_type="similarity",
    search_kwargs=retriever_options,
)

# Prompt: answer from the retrieved context only and admit when the answer is
# unknown. {context} and {question} are filled in by the chain.
template = """
使用以下內容來回答最後的問題。
如果你不知道答案，就說你不知道，不要試圖編造答案。
{context}
問題：{question}
"""
custom_rag_prompt = PromptTemplate.from_template(template)

# Chat model with default settings.
llm = ChatOpenAI()

def format_docs(docs):
    """Concatenate each document's ``page_content`` with a blank line between them."""
    separator = "\n\n"
    return separator.join([item.page_content for item in docs])

# RAG chain: the retriever output is flattened by format_docs into the prompt's
# {context}, the raw question is passed through as {question}, and the model
# reply is parsed into a plain string.
rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_rag_prompt
    | llm
    | StrOutputParser()
)

# Ask the question
question = "中信兄弟最近比賽的表現如何？"
print(f'\n提問：{question}')
# Fetch the matching documents up front so they can be inspected while
# debugging (the chain performs its own retrieval). `invoke` replaces the
# deprecated `get_relevant_documents`.
documents = retriever.invoke(question)
# pprint.pprint(documents)

# The former isinstance() guard was dropped: "\n\n".join(...) always returns a
# str and `question` is a str literal, so its else-branch was unreachable.
try:
    answer = rag_chain.invoke(question)
except Exception as e:
    # Top-level boundary of the script: report the failure instead of
    # aborting with a traceback.
    print(f"進行 RAG chain invoke 時發生錯誤：{e}")
else:
    print("\n回答：" + answer)

嵌入向量已儲存到 MongoDB Atlas。

相似度查詢：
[]

提問：中信兄弟最近比賽的表現如何？

回答：中信兄弟最近的比賽表現並不理想，他們在一些比賽中遭遇了慘敗，被對手猛攻並且失分較多。在一些比賽中，他們的投手表現也不盡如人意。然而，在一些比賽中，兄弟隊的火力攻勢表現出色，取得了勝利。总体来说，中信兄弟的表现有起有伏。
