# 目的
生成したデータに対する、bgeでのretrieveファイルの作成

- MAP@25: 0.4374
- recall (top 25 candidates): 0.8410

Please let me know if there are any mistakes.

# Setting

In [1]:
# Experiment identifier; reused below for the Kaggle dataset name and the output directory.
EXP_NAME = "e013-ret-bge-prepare"
# Root directory of the input CSV files.
DATA_PATH = "../../data"

# Name of the Kaggle dataset created in the upload step.
DATASET_NAME = EXP_NAME
# Directory that receives the saved embeddings and the retrieved-candidates CSV.
OUTPUT_PATH = f"../../output/retriever/{EXP_NAME}"

# Number of misconception candidates kept per query.
RETRIEVE_NUM = 25  # TODO: try a larger value

# Install

In [2]:
# %pip install sentence-transformers==3.1.0

# Import

In [3]:
import os

import polars as pl
import numpy as np

from sklearn.metrics.pairwise import cosine_similarity

import torch
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer

In [4]:
# import transformers
# import torch
# import sentence_transformers

# assert transformers.__version__ == "4.44.2"
# assert torch.__version__ == "2.3.1"
# assert sentence_transformers.__version__ == "3.1.0"

# Data Load

In [5]:
# train = pl.read_csv(f"{DATA_PATH}/train.csv")
# Lookup table read from misconception_mapping.csv; the cells below use its
# MisconceptionId and MisconceptionName columns.
misconception_mapping = pl.read_csv(f"{DATA_PATH}/misconception_mapping.csv")

# Preprocess

In [6]:
# common_col = [
#     "QuestionId",
#     "ConstructName",
#     "SubjectName",
#     "QuestionText",
#     "CorrectAnswer",
# ]

# train_long = (
#     pl.read_csv(f"{DATA_PATH}/train.csv")
#     .select(
#         pl.col(common_col + [f"Answer{alpha}Text" for alpha in ["A", "B", "C", "D"]])
#     )
#     .unpivot(
#         index=common_col,
#         variable_name="AnswerType",
#         value_name="AnswerText",
#     )
#     .with_columns(
#         pl.concat_str(
#             [
#                 pl.col("ConstructName"),
#                 pl.col("SubjectName"),
#                 pl.col("QuestionText"),
#                 pl.col("AnswerText"),
#             ],
#             separator=" ",
#         ).alias("AllText"),
#         pl.col("AnswerType").str.extract(r"Answer([A-D])Text$").alias("AnswerAlphabet"),
#     )
#     .with_columns(
#         pl.concat_str(
#             [pl.col("QuestionId"), pl.col("AnswerAlphabet")], separator="_"
#         ).alias("QuestionId_Answer"),
#     )
#     .sort("QuestionId_Answer")
# )
# train_long.head()

In [7]:
# train_misconception_long = (
#     train.select(
#         pl.col(
#             common_col + [f"Misconception{alpha}Id" for alpha in ["A", "B", "C", "D"]]
#         )
#     )
#     .unpivot(
#         index=common_col,
#         variable_name="MisconceptionType",
#         value_name="MisconceptionId",
#     )
#     .with_columns(
#         pl.col("MisconceptionType")
#         .str.extract(r"Misconception([A-D])Id$")
#         .alias("AnswerAlphabet"),
#     )
#     .with_columns(
#         pl.concat_str(
#             [pl.col("QuestionId"), pl.col("AnswerAlphabet")], separator="_"
#         ).alias("QuestionId_Answer"),
#     )
#     .sort("QuestionId_Answer")
#     .select(pl.col(["QuestionId_Answer", "MisconceptionId"]))
#     .with_columns(pl.col("MisconceptionId").cast(pl.Int64))
# )
# train_misconception_long.head()

In [8]:
# # join MisconceptionId
# train_long = train_long.join(train_misconception_long, on="QuestionId_Answer")

In [10]:
# Generated question/answer rows (recorded run: 2570 rows, 6 columns).
# The path is built from DATA_PATH, like the mapping file, so that relocating the
# data directory only requires editing the Setting cell.
train_ex_long = pl.read_csv(f"{DATA_PATH}/e001-generate.csv")

In [12]:
train_ex_long.shape

(2570, 6)

In [13]:
# Lookup from misconception text to its numeric id, built pairwise from the mapping table.
misconception_name_to_id = {
    name: misconception_id
    for name, misconception_id in zip(
        misconception_mapping["MisconceptionName"],
        misconception_mapping["MisconceptionId"],
    )
}

In [14]:
train_ex_long.columns

['QuestionText',
 'ConstructName',
 'SubjectName',
 'CorrectAnswerText',
 'IncorrectAnswerText',
 'MisconceptionName']

In [15]:
# Columns whose text is joined (in this order, space-separated) to form the retrieval query.
query_text_columns = [
    "ConstructName",
    "SubjectName",
    "QuestionText",
    "IncorrectAnswerText",
]

all_text_expr = pl.concat_str(
    [pl.col(column_name) for column_name in query_text_columns],
    separator=" ",
).alias("AllText")

# default=None: names absent from misconception_name_to_id become null instead of raising;
# the evaluation cells later drop those rows with is_not_null().
misconception_id_expr = (
    pl.col("MisconceptionName")
    .replace_strict(misconception_name_to_id, default=None)
    .alias("MisconceptionId")
)

train_ex_long = train_ex_long.with_columns(all_text_expr, misconception_id_expr)

# BGE

In [16]:
# Embedding models tried so far; only one line is active at a time.
# model = SentenceTransformer("BAAI/bge-large-zh-v1.5")  # CV: around 0.06
model = SentenceTransformer("BAAI/bge-large-en-v1.5")  # CV: 0.1841439184198388
# model = SentenceTransformer("all-MiniLM-L6-v2")  # CV: 0.17806659878200218
# model = SentenceTransformer("nvidia/NV-Embed-v2", trust_remote_code=True)  # CV: not measured yet (out of memory)
# model.max_seq_length = 32768
# model.tokenizer.padding_side = "right"
# model = SentenceTransformer("dunzhang/stella_en_1.5B_v5", trust_remote_code=True)

# Texts to embed: one query string per generated row, one name per misconception.
query_texts = train_ex_long["AllText"].to_list()
misconception_names = misconception_mapping["MisconceptionName"].to_list()

train_long_vec = model.encode(query_texts, normalize_embeddings=True)
misconception_mapping_vec = model.encode(misconception_names, normalize_embeddings=True)

# Recorded run: (2570, 1024) for the queries and (2587, 1024) for the misconceptions.
for embedding_matrix in (train_long_vec, misconception_mapping_vec):
    print(embedding_matrix.shape)

(2570, 1024)
(2587, 1024)


In [17]:
# Save misconception_mapping_vec under OUTPUT_PATH, creating the directory if needed.
os.makedirs(OUTPUT_PATH, exist_ok=True)
misconception_vec_path = f"{OUTPUT_PATH}/misconception_mapping_vec.npy"
np.save(misconception_vec_path, misconception_mapping_vec)

In [18]:
# Similarity of every query embedding against every misconception embedding:
# one row per query, one column per row of misconception_mapping.
train_cos_sim_arr = cosine_similarity(train_long_vec, misconception_mapping_vec)
# Negating before argsort orders each row from most to least similar. The stored values are
# column positions (row numbers of misconception_mapping), not similarity scores.
train_sorted_indices = np.argsort(-train_cos_sim_arr, axis=1)

In [19]:
# example
def print_example(df: pl.DataFrame, sorted_indices: np.ndarray, idx: int) -> None:
    """Print one query text followed by its two best-ranked misconception names.

    Args:
        df: Frame holding the query text in an ``AllText`` column.
        sorted_indices: Per-query ranking of row positions into the module-level
            ``misconception_mapping`` frame, best match first.
        idx: Row of ``df`` and ``sorted_indices`` to show.
    """
    print(f"Query idx{idx}")
    print(df["AllText"][idx])
    # Column 0 of the ranking is the best match, column 1 the runner-up.
    for column, heading in enumerate(("\nCos Sim No.1", "\nCos Sim No.2")):
        print(heading)
        mapping_row = int(sorted_indices[idx, column])
        print(misconception_mapping["MisconceptionName"][mapping_row])

In [20]:
print_example(train_ex_long, train_sorted_indices, 0)

Query idx0
Use the angle sum property of triangles to find a missing angle Geometry If you have a triangle with two angles measuring \(60^\circ\) and \(80^\circ\), what is the measure of the third angle in the triangle? \(50^\circ\)

Cos Sim No.1
Forgets to subtract sum of angles we know from 180 to find missing angle in a triangle

Cos Sim No.2
Finds the sum of the missing angles but not the angle being asked


In [21]:
print_example(train_ex_long, train_sorted_indices, 1)

Query idx1
Multiply fractions by applying the correct multiplication rule Fractions What is the product of \( \frac{1}{3} \) and \( \frac{4}{5} \)? \( \frac{5}{12} \)

Cos Sim No.1
When multiplying fractions, multiplies the denominator

Cos Sim No.2
When multiplying fractions, multiplies the numerator and adds the denominator


# Evaluate

In [22]:
# train_sorted_indices holds row positions of misconception_mapping, not ids.
# Translate positions to MisconceptionId explicitly so the predictions stay correct
# even if the mapping table is not ordered as 0..N-1 by id.
misconception_ids_by_row = misconception_mapping["MisconceptionId"].to_numpy()
train_ex_long = train_ex_long.with_columns(
    pl.Series(
        # Keep only the RETRIEVE_NUM best candidates per query.
        misconception_ids_by_row[train_sorted_indices[:, :RETRIEVE_NUM]].tolist()
    ).alias("PredictMisconceptionId")
)

In [23]:
# https://www.kaggle.com/code/cdeotte/how-to-train-open-book-model-part-1#MAP@3-Metric
def map_at_25(predictions, labels, k=25):
    """Mean Average Precision at ``k`` for queries that have exactly one correct label.

    Each query scores ``1 / rank`` of the first position (1-based, within the top ``k``)
    at which its label appears in the ranked candidates, or 0 if it does not appear.
    Only the first hit counts, so a label repeated in a ranking is not scored twice.

    Args:
        predictions: Sequence of ranked candidate sequences, best candidate first.
        labels: Sequence with the correct label of each query, aligned with ``predictions``.
        k: Number of leading candidates considered per query. Defaults to 25.

    Returns:
        The mean of the per-query scores as a float.

    Raises:
        ZeroDivisionError: If ``predictions`` is empty.
    """
    map_sum = 0.0
    for ranked_candidates, label in zip(predictions, labels):
        # zip with range(1, k + 1) both numbers the ranks and truncates to the top k.
        for rank, candidate in zip(range(1, k + 1), ranked_candidates):
            if candidate == label:
                map_sum += 1 / rank
                break  # later repeats of the same label must not add to the score
    return map_sum / len(predictions)

In [24]:
# Score only the rows whose MisconceptionId is not null.
rows_with_label = train_ex_long.filter(pl.col("MisconceptionId").is_not_null())
map_at_25_score = map_at_25(
    rows_with_label["PredictMisconceptionId"],
    rows_with_label["MisconceptionId"],
)
map_at_25_score

0.4374102480029539

In [25]:
def recall(predictions, labels):
    """Return the share of queries whose label is among their retrieved candidates.

    Args:
        predictions: Sequence of candidate collections, one per query.
        labels: Sequence with the correct label of each query, aligned with ``predictions``.

    Returns:
        Number of queries whose label is found, divided by ``len(predictions)``.
    """
    # One entry per query whose label appears in its candidates.
    hit_flags = [
        1 for candidates, label in zip(predictions, labels) if label in candidates
    ]
    return np.sum(hit_flags) / len(predictions)


# Same row selection as for MAP@25: rows whose MisconceptionId is not null.
rows_with_known_id = train_ex_long.filter(pl.col("MisconceptionId").is_not_null())
recall_score = recall(
    rows_with_known_id["PredictMisconceptionId"],
    rows_with_known_id["MisconceptionId"],
)
recall_score

0.8410147991543341

# Make Retrieved Train File

In [26]:
# Build the training table for the next stage: one row per (query, retrieved candidate)
# pair with a binary target marking the correct candidate.
train_retrieved = (
    train_ex_long.filter(
        pl.col(
            "MisconceptionId"
        ).is_not_null()  # TODO: Consider ways to utilize data where MisconceptionId is NaN.
    )
    # One row per candidate: the list column becomes a scalar PredictMisconceptionId.
    .explode("PredictMisconceptionId")
    .with_columns(
        # target is 1 when the candidate equals the labelled misconception, else 0.
        (pl.col("MisconceptionId") == pl.col("PredictMisconceptionId"))
        .cast(pl.Int64)
        .alias("target")
    )
    # NOTE(review): train_ex_long already has a MisconceptionName column, so this join
    # presumably adds a suffixed duplicate (the recorded 12 columns are consistent with
    # that) — confirm the extra column is wanted in the exported CSV.
    .join(
        misconception_mapping,
        on="MisconceptionId",
    )
    # Attach the candidate's text: the mapping columns are renamed with a "Predict" prefix
    # so the join key becomes PredictMisconceptionId.
    .join(
        misconception_mapping.rename(lambda x: "Predict" + x),
        on="PredictMisconceptionId",
    )
)
# Recorded run: (59125, 12).
train_retrieved.shape

(59125, 12)

In [27]:
train_ex_long

QuestionText,ConstructName,SubjectName,CorrectAnswerText,IncorrectAnswerText,MisconceptionName,AllText,MisconceptionId,PredictMisconceptionId
str,str,str,str,str,str,str,i64,list[i64]
"""If you have a triangle with tw…","""Use the angle sum property of …","""Geometry""","""\(40^\circ\)""","""\(50^\circ\)""","""Does not know that angles in a…","""Use the angle sum property of …",0,"[793, 356, … 1527]"
"""What is the product of \( \fra…","""Multiply fractions by applying…","""Fractions""","""\( \frac{4}{15} \)""","""\( \frac{5}{12} \)""","""Uses dividing fractions method…","""Multiply fractions by applying…",1,"[1374, 1280, … 825]"
"""Sarah is designing a pinwheel …","""Understand and use angle measu…","""Geometry""","""72 degrees""","""20 degrees""","""Believes there are 100 degrees…","""Understand and use angle measu…",2,"[1067, 640, … 1964]"
"""Consider the quadratic equatio…","""Factorizing quadratic equation…","""Algebra""","""Yes, it can be factorized as \…","""No, it cannot be factorized be…","""Thinks a quadratic without a n…","""Factorizing quadratic equation…",3,"[3, 2142, … 1735]"
"""Solve for \( x \) in the equat…","""Solve algebraic equations invo…","""Algebra""","""\( x = 2 \) or \( x = -1 \)""","""\( x = 2 \)""","""Believes addition of terms and…","""Solve algebraic equations invo…",4,"[1134, 2085, … 2070]"
…,…,…,…,…,…,…,…,…
"""What is the result of \( x^3 \…","""Multiplying powers with the sa…","""Exponents and Powers""","""\( x^7 \)""","""\( x^{12} \)""","""When multiplying numbers with …","""Multiplying powers with the sa…",2582,"[2582, 1792, … 2512]"
"""Which of the following numbers…","""Identify and understand cube n…","""Properties of Numbers""","""\( 8 \)""","""\( 10 \)""","""Does not know what a cube numb…","""Identify and understand cube n…",2583,"[2583, 2489, … 2086]"
"""Which is greater, 2% of 1000 o…","""Compare percentages of differe…","""Percentage Comparison""","""10% of 150""","""2% of 1000""","""Believes that any percentage o…","""Compare percentages of differe…",2584,"[914, 2361, … 2123]"
"""Which of the following is a cu…","""Identify the structure of poly…","""Algebraic Expressions""","""D) \( x^3 - 5x + 2 \)""","""A) \( x^3 + x^2 + x \)""","""Believes a cubic expression sh…","""Identify the structure of poly…",2585,"[2585, 1239, … 1936]"


In [28]:
train_retrieved.shape

(59125, 12)

In [29]:
# The file name records the retrieval depth and both scores (4 decimals each).
retrieved_csv_path = f"{OUTPUT_PATH}/{EXP_NAME}-ret{RETRIEVE_NUM}-map{map_at_25_score:.4f}-recall{recall_score:.4f}.csv"
train_retrieved.write_csv(retrieved_csv_path)

In [30]:
# Echo the location of the CSV written in the previous cell.
written_csv_path = f"{OUTPUT_PATH}/{EXP_NAME}-ret{RETRIEVE_NUM}-map{map_at_25_score:.4f}-recall{recall_score:.4f}.csv"
print(written_csv_path)

../../output/retriever/e013-ret-bge-prepare/e013-ret-bge-prepare-ret25-map0.4374-recall0.8410.csv


# Kaggle Upload

In [31]:
import os
import json

from kaggle.api.kaggle_api_extended import KaggleApi


def dataset_create_new(dataset_name: str, upload_dir: str):
    """Write ``dataset-metadata.json`` into ``upload_dir`` and create a Kaggle dataset from it.

    Args:
        dataset_name: Used as the dataset title and, prefixed with ``sinchir0/``, as its id.
        upload_dir: Folder passed to the Kaggle API; the metadata file is written here first.
    """
    # Disabled check: names containing "_" used to be rejected with a ValueError.
    # if "_" in dataset_name:
    #     raise ValueError("'_' must not be used in a dataset name")
    dataset_metadata = {
        "id": f"sinchir0/{dataset_name}",
        "licenses": [{"name": "CC0-1.0"}],
        "title": dataset_name,
    }
    metadata_path = os.path.join(upload_dir, "dataset-metadata.json")
    with open(metadata_path, "w") as metadata_file:
        json.dump(dataset_metadata, metadata_file, indent=4)

    # Authenticate, then hand the whole folder to the Kaggle API.
    client = KaggleApi()
    client.authenticate()
    client.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode="tar")


# Announce the target, then upload OUTPUT_PATH as a new Kaggle dataset named DATASET_NAME.
# Running this cell calls the Kaggle API.
print(f"Create Dataset name:{DATASET_NAME}, output_dir:{OUTPUT_PATH}")
dataset_create_new(dataset_name=DATASET_NAME, upload_dir=OUTPUT_PATH)

Create Dataset name:e013-ret-bge-prepare, output_dir:../../output/retriever/e013-ret-bge-prepare
Starting upload for file misconception_mapping_vec.npy


100%|██████████| 10.1M/10.1M [00:01<00:00, 7.07MB/s]


Upload successful: misconception_mapping_vec.npy (10MB)
Starting upload for file e013-ret-bge-prepare-ret25-map0.4374-recall0.8410.csv


100%|██████████| 36.3M/36.3M [00:04<00:00, 9.02MB/s]

Upload successful: e013-ret-bge-prepare-ret25-map0.4374-recall0.8410.csv (36MB)



