# RAG

## Part 1: Environment

In [None]:
import os
import requests
import pandas as pd
from typing import Literal

In [None]:
import warnings

# LangSmith tracing configuration (non-secret settings).
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'

# SECURITY: API keys must never be committed to source control. The keys that
# used to be hard-coded in this cell should be treated as leaked and revoked.
# Both secrets are now expected to be present in the process environment
# (e.g. exported in the shell or loaded from an untracked .env file).
if not os.environ.get('LANGCHAIN_API_KEY'):
    warnings.warn("LANGCHAIN_API_KEY is not set; LangSmith tracing cannot authenticate.")

# Kept as module-level names because later cells may refer to them.
API_SECRET_KEY = os.environ.get("OPENAI_API_KEY", "")
# The endpoint is not a secret, so the previous value remains the default.
BASE_URL = os.environ.get("OPENAI_API_BASE", "https://api.gpts.vin/v1")

if not API_SECRET_KEY:
    warnings.warn("OPENAI_API_KEY is not set; calls made through ChatOpenAI will fail.")

os.environ["OPENAI_API_BASE"] = BASE_URL

## Part 2: Indexing

In [None]:
from langchain_community.vectorstores import Chroma
from langchain_community.embeddings.huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_community.document_loaders import DataFrameLoader

`Document Preparation`

In [None]:
# One loader per source PDF: Go history, Go masters, and classic matches.
loader1, loader2, loader3 = (
    PyPDFLoader(pdf_path)
    for pdf_path in ("围棋史话.pdf", "围棋高手.pdf", "围棋历史对决.pdf")
)

# Load each PDF into its list of documents, in the same order as the loaders.
go_history_books, go_masters_players, go_classic_matches = (
    pdf_loader.load() for pdf_loader in (loader1, loader2, loader3)
)

In [None]:
# Web crawler to get real-time match information
def get_game_data(page, timeout=10):
    """Fetch one page of game metadata for the ``0086-golaxy_public`` account.

    Args:
        page: Value sent as the ``page`` query parameter.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        The list found under ``data.gameMetaList`` in the JSON response, or an
        empty list when the request fails, the status is not 200, or the
        payload does not have the expected shape.
    """
    url = "https://www.19x19.com/api/engine/games/0086-golaxy_public"
    params = {
        "page": page,
        "size": 7,
        "game_type": 2,
        "username": "0086-golaxy_public",
    }

    try:
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException:
        # Connection errors and timeouts are reported like any other failure.
        print("获取数据失败。")
        return []

    if response.status_code != 200:
        print("获取数据失败。")
        return []

    try:
        data = response.json()
    except ValueError:
        # The body was not valid JSON.
        print("获取数据失败。")
        return []

    if not isinstance(data, dict):
        return []
    # ``or`` guards against explicit nulls, which ``.get(key, default)`` passes through.
    payload = data.get('data') or {}
    return payload.get('gameMetaList') or []


def _format_game_date(time_info):
    """Render a ``{"date": {"year", "month", "day"}}`` mapping as ``year-month-day``."""
    # ``or {}`` tolerates both a missing key and an explicit null value.
    date = (time_info or {}).get("date") or {}
    return f"{date.get('year', '')}-{date.get('month', '')}-{date.get('day', '')}"


def _describe_game_result(raw_result):
    """Translate a raw result such as ``W+R`` or ``B+3.5`` into a Chinese label."""
    raw_result = raw_result or ""
    # Match on the winner prefix so wins by points or time are recognised,
    # not only wins by resignation ("+R").
    if "W+" in raw_result:
        return "白方胜利"
    if "B+" in raw_result:
        return "黑方胜利"
    return "未知"


def extract_game_info(game):
    """Flatten one raw game record into a dict with Chinese column names.

    Args:
        game: Mapping for a single game as returned by the games API. Missing
            keys fall back to an empty string or 0.

    Returns:
        A dict holding the game name, players, ranks, move counts, board
        size, rule, komi, handicap, result label, analysis fields and the
        three dates formatted as ``year-month-day`` (``--`` when absent).
    """
    game_info = {
        "对局名称": game.get("gamename", ""),
        "黑方选手": game.get("pb", ""),
        "白方选手": game.get("pw", ""),
        "黑方段位": game.get("pbLevel", ""),
        "白方段位": game.get("pwLevel", ""),
        "起始手数": game.get("startMoveNum", 0),
        "总手数": game.get("moveNum", 0),
        "棋盘大小": game.get("boardSize", 0),
        "规则": game.get("rule", ""),
        "贴目": game.get("komi", 0),
        "让子数": game.get("handicap", 0),
        "对局结果": _describe_game_result(game.get("gameResult", "")),
        "分析状态": game.get("analyzeStatus", ""),
        "分析点数": game.get("analyzePo", 0),
        "创建时间": _format_game_date(game.get("createTime")),
        "对局时间": _format_game_date(game.get("playTime")),
        "分析时间": _format_game_date(game.get("analyzeTime")),
    }
    return game_info


def get_current_go_match(start_page, end_page):
    """Collect game records from a range of result pages into a DataFrame.

    Callers number pages from 1; the API is queried with ``start_page - 1``
    through ``end_page - 1``. Progress is printed for every page.

    Args:
        start_page: First page to fetch (1-based, inclusive).
        end_page: Last page to fetch (1-based, inclusive).

    Returns:
        A ``pandas.DataFrame`` with one row per game, or ``None`` when no page
        yielded any game.
    """
    collected_rows = []
    api_pages = range(start_page - 1, end_page)
    for display_page, api_page in enumerate(api_pages, start=start_page):
        page_games = get_game_data(api_page)
        if not page_games:
            print(f"第{display_page}页未找到游戏数据。\n")
            continue
        page_rows = list(map(extract_game_info, page_games))
        collected_rows += page_rows
        print(f"第{display_page}页,数据：{page_rows}。")

    if not collected_rows:
        print("没有找到任何围棋对决数据。\n")
        return None

    # To persist the result instead of only returning it:
    #   df.to_csv(f"围棋对决数据{start_page}_{end_page}页.csv", index=False)
    return pd.DataFrame(collected_rows)

In [None]:
# Crawl pages 1-10 of current matches; `info` is None if nothing was found.
info = get_current_go_match(start_page=1, end_page=10)
# The game name becomes each document's page content; the remaining columns
# are carried by the loader alongside it.
current_match_loader = DataFrameLoader(info, page_content_column="对局名称")
go_current_matches = current_match_loader.load()

`Save In VectorStores` (this step also performs the `Embedding`)

In [None]:
# embedding_function = HuggingFaceEmbeddings(model_name="aspire/acge_text_embedding")
# # sentences = ["数据1", "数据2"]

# # embeddings_1 = model.encode(sentences, normalize_embeddings=True)
# # embeddings_2 = model.encode(sentences, normalize_embeddings=True)
# # similarity = embeddings_1 @ embeddings_2.T
# # print(similarity)
# vectorstore = Chroma.from_documents(documents=go_history_books, embedding=embedding_function)

In [None]:
# Embedding model used to vectorise documents before they are stored.
EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
embedding_function = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)

In [None]:
# Each store gets its own collection name. Without one, every call falls back
# to the same default collection, and in-memory Chroma clients created in one
# process can end up sharing it -- which would mix the three corpora together
# and make routing between them meaningless.
vectorstore_for_classic_match = Chroma.from_documents(
    documents=go_classic_matches,
    embedding=embedding_function,
    collection_name="go_classic_matches",
)
vectorstore_for_players = Chroma.from_documents(
    documents=go_masters_players,
    embedding=embedding_function,
    collection_name="go_masters_players",
)
vectorstore_for_current_match = Chroma.from_documents(
    documents=go_current_matches,
    embedding=embedding_function,
    collection_name="go_current_matches",
)

## Part 3: Retrieval

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain.load import dumps, loads
from langchain_community.retrievers import BM25Retriever

判断两步棋之间的间隔时间是否过长，如果过长则添加历史书籍中的知识

In [None]:
# Flag read by route_select(): when truthy, the history-knowledge path is taken
# (currently an unimplemented placeholder there). NOTE(review): the name
# misspells "knowledge"; rename it together with its use in route_select().
add_history_knowladge = False

`Routing` to choose which retrieval method to use

In [None]:
# Names of the data sources the router may choose between.
DataSourceName = Literal[
    "no_need_for_knowledge",
    "current_go_match_info",
    "go_classic_match",
    "go_masters",
]


# Structured-output schema for the routing LLM: a single field naming the
# selected data source. (Documented with comments rather than a docstring so
# the schema handed to the model stays exactly as before.)
class RouterQuery(BaseModel):
    datasource: DataSourceName = Field(
        description="Given a description of the Go board question choose which datasource would be most relevant for answering their question"
    )

# Deterministic LLM constrained to emit a RouterQuery.
llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
structured_llm = llm.with_structured_output(RouterQuery)

# Prompt for routing. The earlier wording told the model to route by
# "programming language", which has nothing to do with the Go data sources
# declared in RouterQuery; it now describes each of them explicitly.
system = """You are an expert at routing a description of a Go (weiqi) game to the appropriate data source.
Based on what the description talks about or asks for, route it to the most relevant data source:
- current_go_match_info: information about matches that are being played right now
- go_classic_match: famous classic matches from Go history
- go_masters: background on well-known Go players
- no_need_for_knowledge: the description can be handled without any extra knowledge"""

prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system),
        ("human", "{description}")
    ]
)

# Define router: fill the prompt, then let the structured LLM pick a datasource.
router = prompt | structured_llm

In [None]:
# Description of the current board position that drives the whole pipeline.
description = (
    "从整体来看，黑棋在右上角和右下角的地盘较为稳固，形势较好;"
    "而白棋在左上角和中腹的形势较为复杂，尚不明朗。"
    "整体上，黑棋在地盘上略占优势。"
    "我们可以来关注一下现在其他场次的比赛情况\n"
)

# Ask the router which data source fits this description.
router_input = {"description": description}
routing_result = router.invoke(router_input)
routing_result

Alternative without `Save In VectorStores` or `Embedding`: retrieve directly on the raw text with a keyword-based retriever

In [None]:
# retriever = BM25Retriever.from_documents(documents=go_classic_matches, k=5)

`RAG-Fusion`(Query Translation)

In [None]:
# RAG-Fusion: ask the LLM for several related queries to improve retrieval.
template = """Receive a description of the current Go board game, 
you need to judge whether you need Go-related questions based on your knowledge of Go, 
if not then you don't need to generate multiple query, 
if you do then you need to generate multiple query, the content of these query can be related to the three directions: the current situation of the Go game, the history of the classic Go game, and the history of the Go masters. \n
Judge whether generating multiple search queries related to: {description} \n
Output Example:\n
Q: 
A:
"""

prompt_query_translation = ChatPromptTemplate.from_template(template=template)


def _split_queries(llm_output):
    """Split the LLM output into one query per non-blank line.

    Blank lines are dropped so that empty strings are never sent to the
    retriever as queries.
    """
    return [line.strip() for line in llm_output.split("\n") if line.strip()]


# prompt -> LLM -> plain string -> list of query strings
generate_multi_query = (
    prompt_query_translation
    | ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
    | StrOutputParser()
    | _split_queries
)

In [None]:
# Merge the per-query retrieval results: score every document across all
# result lists, collapse duplicates, and order by the combined score.
def reciprocal_rank_fusion(results: list[list], k=60):
    """Fuse several ranked document lists into one list ordered by score.

    Every occurrence of a document adds ``1 / (rank + k)`` to its score, where
    ``rank`` is the document's zero-based position in that list. Documents are
    identified by their serialized form, so a document appearing in several
    lists accumulates score and is returned only once.

    Args:
        results: One ranked list of documents per query.
        k: Constant added to the rank in the scoring formula.

    Returns:
        A list of ``(document, score)`` tuples, highest score first.
    """
    score_by_doc = {}
    for ranked_docs in results:
        for position, document in enumerate(ranked_docs):
            # The serialized document is a hashable key that identifies it.
            serialized = dumps(document)
            score_by_doc[serialized] = score_by_doc.get(serialized, 0) + 1 / (position + k)

    ordered = sorted(score_by_doc.items(), key=lambda item: item[1], reverse=True)
    # Turn the keys back into document objects for the caller.
    return [(loads(serialized), score) for serialized, score in ordered]

`Create Retriever`

In [None]:
# Run the RAG-Fusion retrieval pipeline on top of a given retriever
def get_docs(retriever):
    """Retrieve and fuse documents for the module-level ``description``.

    The chain generates multiple queries from the description, runs the
    retriever once per query, and merges the results with
    ``reciprocal_rank_fusion``.

    Args:
        retriever: Retriever to apply to each generated query.

    Returns:
        Whatever ``reciprocal_rank_fusion`` returns for the retrieved lists.
    """
    fusion_chain = generate_multi_query | retriever.map() | reciprocal_rank_fusion
    return fusion_chain.invoke({"description": description})

## Part 4: Generation

In [None]:
def route_select(routing_result):
    """Retrieve supporting documents from the data source the router chose.

    Args:
        routing_result: Router output exposing a ``datasource`` attribute.

    Returns:
        The result of ``get_docs`` for the matching vector store, or an empty
        list when the data source needs no retrieval or is not recognised.
        The history-knowledge path is not implemented yet and also returns an
        empty list, so every path yields a list rather than ``None``.
    """
    if add_history_knowladge:
        # TODO: retrieve from the Go history books once this path exists.
        return []

    datasource = routing_result.datasource
    if datasource == 'go_masters':
        vectorstore = vectorstore_for_players
    elif datasource == 'go_classic_match':
        vectorstore = vectorstore_for_classic_match
    elif datasource == 'current_go_match_info':
        vectorstore = vectorstore_for_current_match
    else:
        # Includes 'no_need_for_knowledge': nothing to retrieve.
        return []

    # Same retriever settings for every store: top 5 documents per query.
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    return get_docs(retriever=retriever)

In [None]:
# Retrieve the supporting documents for the routing decision made earlier.
docs = route_select(routing_result=routing_result)

Final `Prompt`

In [None]:
# Combine the retrieved context (RAG) and the board description into the final
# prompt template. Note that this rebinds the module-level names `template` and
# `prompt` that earlier cells also assign. The template's placeholders are
# `{descriptions}` and `{context}`.
template = """You will act as a Go commentator.
I will provide you with the current state of the Go board, as well as some additional commentary, and you will need to reorganise the order of the words to produce an organised, clear commentary.
Description of the current Go board situation: {descriptions}
Some additional commentary: {context}
All the replies are in Chinese, you just need to reply, not keep asking me questions:
解说稿:
"""

prompt = ChatPromptTemplate.from_template(template)

In [None]:
# Deterministic chat model used to write the commentary.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Final chain: fill the commentary prompt, then call the model.
chain = prompt | llm

# `descriptions` and `context` are the two placeholders of the final template.
prompt_inputs = {"descriptions": description, "context": docs}
ans = chain.invoke(prompt_inputs)
ans.content