# Overview
In this notebook, the concatenation of `ConstructName` + `SubjectName` + `QuestionText` + `Answer[A-D]Text` and every `MisconceptionName` are vectorized using TF-IDF, and for each answer the misconceptions with the highest cosine similarity are used as the inference results.

- MAP@25: 0.1378
- recall@25: 0.4531

Please let me know if there are any mistakes.

# Setting

In [117]:
# Experiment identifier; DATASET_NAME below is set to the same value.
EXP_NAME = "e001-ret-tfidf"
# Directory the input CSV files are read from.
DATA_PATH = "../../data"

# Directory the retrieved-candidates CSV is written to.
DATA_OUTPUT_PATH = "../../retrieved_data"

DATASET_NAME = EXP_NAME
# Directory that receives the pickled vectorizer and the misconception vectors,
# and that is passed as the upload directory when the Kaggle dataset is created.
MODEL_OUTPUT_PATH = (
    "../../eedi-mining-misconceptions-in-mathematics/trained_models/retriever/e001"
)

# Number of top-ranked misconceptions kept per answer.
RETRIEVE_NUM = 25  # TODO: try a larger value

# Import

In [118]:
import polars as pl
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Data Load

In [119]:
# Load the four input tables from DATA_PATH, one CSV per table.
train, test, misconception_mapping, sample_submission = (
    pl.read_csv(f"{DATA_PATH}/{stem}.csv")
    for stem in ("train", "test", "misconception_mapping", "sample_submission")
)

# Preprocess

In [121]:
# Columns shared by every answer option of a question; they are kept as
# identifier columns when the frame is reshaped from wide to long.
common_col = [
    "QuestionId",
    "ConstructName",
    "SubjectName",
    "QuestionText",
    "CorrectAnswer",
]
answer_text_cols = [f"Answer{alpha}Text" for alpha in ["A", "B", "C", "D"]]

# One row per (question, answer option). `train` was already read from
# train.csv in the "Data Load" cell, so it is reused instead of reading the
# file a second time.
train_long = (
    train.select(pl.col(common_col + answer_text_cols))
    .unpivot(
        index=common_col,
        variable_name="AnswerType",
        value_name="AnswerText",
    )
    .with_columns(
        # Text that is vectorized: construct + subject + question + answer.
        pl.concat_str(
            [
                pl.col("ConstructName"),
                pl.col("SubjectName"),
                pl.col("QuestionText"),
                pl.col("AnswerText"),
            ],
            separator=" ",
        ).alias("AllText"),
        # "AnswerAText" -> "A"
        pl.col("AnswerType").str.extract(r"Answer([A-D])Text$").alias("AnswerAlphabet"),
    )
    .with_columns(
        # Key such as "0_A", used to join the misconception labels.
        pl.concat_str(
            [pl.col("QuestionId"), pl.col("AnswerAlphabet")], separator="_"
        ).alias("QuestionId_Answer"),
    )
    .sort("QuestionId_Answer")
)
train_long.head()

QuestionId,ConstructName,SubjectName,QuestionText,CorrectAnswer,AnswerType,AnswerText,AllText,AnswerAlphabet,QuestionId_Answer
i64,str,str,str,str,str,str,str,str,str
0,"""Use the order of operations to…","""BIDMAS""","""\[ 3 \times 2+4-5 \] Where do …","""A""","""AnswerAText""","""\( 3 \times(2+4)-5 \)""","""Use the order of operations to…","""A""","""0_A"""
0,"""Use the order of operations to…","""BIDMAS""","""\[ 3 \times 2+4-5 \] Where do …","""A""","""AnswerBText""","""\( 3 \times 2+(4-5) \)""","""Use the order of operations to…","""B""","""0_B"""
0,"""Use the order of operations to…","""BIDMAS""","""\[ 3 \times 2+4-5 \] Where do …","""A""","""AnswerCText""","""\( 3 \times(2+4-5) \)""","""Use the order of operations to…","""C""","""0_C"""
0,"""Use the order of operations to…","""BIDMAS""","""\[ 3 \times 2+4-5 \] Where do …","""A""","""AnswerDText""","""Does not need brackets""","""Use the order of operations to…","""D""","""0_D"""
1000,"""Simplify an algebraic fraction…","""Simplifying Algebraic Fraction…","""Simplify the following, if pos…","""B""","""AnswerAText""","""\( t \)""","""Simplify an algebraic fraction…","""A""","""1000_A"""


In [122]:
misconception_id_cols = [f"Misconception{alpha}Id" for alpha in ["A", "B", "C", "D"]]

# One row per (question, answer option) holding the labelled MisconceptionId
# (null when the answer has no label). `train` was already read from train.csv
# in the "Data Load" cell, so it is reused instead of reading the file again.
train_misconception_long = (
    train.select(pl.col(common_col + misconception_id_cols))
    .unpivot(
        index=common_col,
        variable_name="MisconceptionType",
        value_name="MisconceptionId",
    )
    .with_columns(
        # "MisconceptionAId" -> "A"
        pl.col("MisconceptionType")
        .str.extract(r"Misconception([A-D])Id$")
        .alias("AnswerAlphabet"),
    )
    .with_columns(
        # Same "<QuestionId>_<letter>" key that train_long uses.
        pl.concat_str(
            [pl.col("QuestionId"), pl.col("AnswerAlphabet")], separator="_"
        ).alias("QuestionId_Answer"),
    )
    .sort("QuestionId_Answer")
    .select(pl.col(["QuestionId_Answer", "MisconceptionId"]))
    .with_columns(pl.col("MisconceptionId").cast(pl.Int64))
)
train_misconception_long.head()

QuestionId_Answer,MisconceptionId
str,i64
"""0_A""",
"""0_B""",
"""0_C""",
"""0_D""",1672.0
"""1000_A""",891.0


In [123]:
# join MisconceptionId
# Any MisconceptionId column left by a previous run of this cell is dropped
# first, so re-running the cell does not append a `MisconceptionId_right`
# column (which would later clash with the column of the same name produced
# when the misconception mapping is joined).
train_long = train_long.select(pl.exclude("MisconceptionId")).join(
    train_misconception_long, on="QuestionId_Answer"
)

# Train tfidf

In [128]:
# Fit a single TF-IDF vocabulary on the training texts stacked on top of the
# misconception names, so both parts live in the same feature space.
tfidf_corpus = pl.concat(
    [train_long["AllText"], misconception_mapping["MisconceptionName"]],
    how="vertical",
)
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(tfidf_corpus)

In [129]:
# Persist the fitted vectorizer under MODEL_OUTPUT_PATH.
import os
import pickle

vectorizer_path = f"{MODEL_OUTPUT_PATH}/vectorizer.pkl"

# The output directory may not exist yet.
os.makedirs(MODEL_OUTPUT_PATH, exist_ok=True)
with open(vectorizer_path, "wb") as fh:
    pickle.dump(vectorizer, fh)

In [130]:
# Split the stacked TF-IDF matrix back into its training part and its
# misconception part. The sparse matrix is sliced first and each part is
# densified once, instead of densifying the whole matrix twice.
n_train_rows = len(train_long)
train_long_vec = tfidf_matrix[:n_train_rows].toarray()
misconception_mapping_vec = tfidf_matrix[n_train_rows:].toarray()

print(train_long_vec.shape)
print(misconception_mapping_vec.shape)

(7476, 3564)
(2587, 3564)


In [131]:
# Save misconception_mapping_vec next to the pickled vectorizer.
misconception_vec_path = f"{MODEL_OUTPUT_PATH}/misconception_mapping_vec.npy"
np.save(misconception_vec_path, misconception_mapping_vec)

In [132]:
# Similarity of every training text (rows) to every misconception name
# (columns), followed by the column positions ordered from most to least
# similar within each row.
train_cos_sim_arr = cosine_similarity(X=train_long_vec, Y=misconception_mapping_vec)
train_sorted_indices = np.argsort(np.negative(train_cos_sim_arr), axis=1)

In [133]:
# example
def print_example(
    df: pl.DataFrame, sorted_indices: np.ndarray, idx: int, top_k: int = 2
) -> None:
    """Print one query text and the names of its best-ranked misconceptions.

    Args:
        df: Frame with an "AllText" column; row ``idx`` is printed as the query.
        sorted_indices: 2-D array whose row ``idx`` holds row positions into
            ``misconception_mapping``, best match first.
        idx: Row of ``df`` / ``sorted_indices`` to show.
        top_k: How many ranked misconceptions to print (default 2).
    """
    print(f"Query idx{idx}")
    print(df["AllText"][idx])
    names = misconception_mapping["MisconceptionName"]
    for rank in range(top_k):
        print(f"\nCos Sim No.{rank + 1}")
        # int() turns the numpy integer into a plain Python index.
        print(names[int(sorted_indices[idx, rank])])

In [134]:
# Show the query text and the two most similar misconception names for row 0.
print_example(train_long, train_sorted_indices, 0)

Query idx0
Use the order of operations to carry out calculations involving powers BIDMAS \[
3 \times 2+4-5
\]
Where do the brackets need to go to make the answer equal \( 13 \) ? \( 3 \times(2+4)-5 \)

Cos Sim No.1
Answers order of operations questions with brackets as if the brackets are not there

Cos Sim No.2
Believes order of operations does not affect the answer to a calculation


In [135]:
# Same check for row 1.
print_example(train_long, train_sorted_indices, 1)

Query idx1
Use the order of operations to carry out calculations involving powers BIDMAS \[
3 \times 2+4-5
\]
Where do the brackets need to go to make the answer equal \( 13 \) ? \( 3 \times 2+(4-5) \)

Cos Sim No.1
Answers order of operations questions with brackets as if the brackets are not there

Cos Sim No.2
Believes order of operations does not affect the answer to a calculation


# Evaluate

In [136]:
# The argsort result holds row positions of `misconception_mapping`. They are
# translated to the MisconceptionId stored in those rows, rather than relying
# on the ids being identical to the row positions.
misconception_ids = misconception_mapping["MisconceptionId"].to_numpy()
train_long = train_long.with_columns(
    pl.Series(
        misconception_ids[train_sorted_indices[:, :RETRIEVE_NUM]].tolist()
    ).alias("PredictMisconceptionId")
)

In [137]:
# https://www.kaggle.com/code/cdeotte/how-to-train-open-book-model-part-1#MAP@3-Metric
def map_at_25(predictions, labels, k=25):
    """Mean average precision at ``k`` with one correct label per query.

    Args:
        predictions: Sequence of ranked candidate lists, best candidate first.
        labels: Sequence with the single correct label of each query.
        k: Number of leading candidates that are scored (default 25).

    Returns:
        Mean over all queries of ``1 / rank`` of the correct label within the
        first ``k`` candidates (0 for a query whose label is not among them).
        Returns 0.0 when ``predictions`` is empty.
    """
    n_queries = len(predictions)
    if n_queries == 0:
        # Avoid dividing by zero when there is nothing to score.
        return 0.0

    map_sum = 0.0
    for ranked, label in zip(predictions, labels):
        for rank, candidate in zip(range(1, k + 1), ranked):
            if candidate == label:
                map_sum += 1 / rank
                # Only the first hit counts, so a repeated candidate cannot
                # push a query's score above 1.
                break
    return map_sum / n_queries

In [138]:
# Score only the answers that have a labelled misconception.
labelled_for_map = train_long.filter(pl.col("MisconceptionId").is_not_null())
map_at_25_score = map_at_25(
    labelled_for_map["PredictMisconceptionId"],
    labelled_for_map["MisconceptionId"],
)
map_at_25_score

0.13781884858544355

In [139]:
def recall(predictions, labels):
    """Fraction of queries whose correct label appears among their candidates.

    Args:
        predictions: Sequence of candidate collections, one per query.
        labels: Sequence with the single correct label of each query.

    Returns:
        Number of queries whose label is contained in its candidates, divided
        by the number of queries. Returns 0.0 when ``predictions`` is empty.
    """
    n_queries = len(predictions)
    if n_queries == 0:
        # Avoid dividing by zero when there is nothing to score.
        return 0.0
    hits = sum(1 for ranked, label in zip(predictions, labels) if label in ranked)
    return hits / n_queries


# Score only the answers that have a labelled misconception.
labelled_for_recall = train_long.filter(pl.col("MisconceptionId").is_not_null())
recall_score = recall(
    labelled_for_recall["PredictMisconceptionId"],
    labelled_for_recall["MisconceptionId"],
)
recall_score

0.45308924485125857

# Make Retrieved Train File

In [143]:
# Keep only the answers that have a labelled misconception.
# TODO: Consider ways to utilize data where MisconceptionId is NaN.
labelled_answers = train_long.filter(pl.col("MisconceptionId").is_not_null())

# One row per (answer, retrieved candidate); target is 1 when the candidate is
# the labelled misconception and 0 otherwise.
is_hit = pl.col("MisconceptionId") == pl.col("PredictMisconceptionId")
candidate_rows = labelled_answers.explode("PredictMisconceptionId").with_columns(
    is_hit.cast(pl.Int64).alias("target")
)

# Mapping table with "Predict"-prefixed copies of its columns added.
# NOTE(review): with_columns keeps the unprefixed columns too, so the joined
# frame also carries `MisconceptionId_right` and `MisconceptionName`, both of
# which describe the *predicted* misconception - confirm whether `select` was
# intended before relying on those two columns downstream.
predicted_lookup = misconception_mapping.with_columns(pl.all().name.prefix("Predict"))

train_retrieved = candidate_rows.join(predicted_lookup, on="PredictMisconceptionId")
train_retrieved.head()

QuestionId,ConstructName,SubjectName,QuestionText,CorrectAnswer,AnswerType,AnswerText,AllText,AnswerAlphabet,QuestionId_Answer,MisconceptionId,PredictMisconceptionId,target,MisconceptionId_right,MisconceptionName,PredictMisconceptionName
i64,str,str,str,str,str,str,str,str,str,i64,i64,i64,i64,str,str
0,"""Use the order of operations to…","""BIDMAS""","""\[ 3 \times 2+4-5 \] Where do …","""A""","""AnswerDText""","""Does not need brackets""","""Use the order of operations to…","""D""","""0_D""",1672,2488,0,2488,"""Answers order of operations qu…","""Answers order of operations qu…"
0,"""Use the order of operations to…","""BIDMAS""","""\[ 3 \times 2+4-5 \] Where do …","""A""","""AnswerDText""","""Does not need brackets""","""Use the order of operations to…","""D""","""0_D""",1672,2551,0,2551,"""Believes range does not need u…","""Believes range does not need u…"
0,"""Use the order of operations to…","""BIDMAS""","""\[ 3 \times 2+4-5 \] Where do …","""A""","""AnswerDText""","""Does not need brackets""","""Use the order of operations to…","""D""","""0_D""",1672,2532,0,2532,"""Believes order of operations d…","""Believes order of operations d…"
0,"""Use the order of operations to…","""BIDMAS""","""\[ 3 \times 2+4-5 \] Where do …","""A""","""AnswerDText""","""Does not need brackets""","""Use the order of operations to…","""D""","""0_D""",1672,2039,0,2039,"""Thinks you need to just add a …","""Thinks you need to just add a …"
0,"""Use the order of operations to…","""BIDMAS""","""\[ 3 \times 2+4-5 \] Where do …","""A""","""AnswerDText""","""Does not need brackets""","""Use the order of operations to…","""D""","""0_D""",1672,1872,0,1872,"""Believes they only need to mul…","""Believes they only need to mul…"


In [144]:
# The output directory is not created anywhere else in the notebook, so make
# sure it exists before writing the CSV into it.
os.makedirs(DATA_OUTPUT_PATH, exist_ok=True)
train_retrieved.write_csv(
    f"{DATA_OUTPUT_PATH}/e001-train_ret{RETRIEVE_NUM}_map{map_at_25_score:.4f}_recall{recall_score:.4f}.csv",
)

In [145]:
# Echo the path of the CSV written above (scores are embedded in the name).
retrieved_csv_path = f"{DATA_OUTPUT_PATH}/e001-train_ret{RETRIEVE_NUM}_map{map_at_25_score:.4f}_recall{recall_score:.4f}.csv"
print(retrieved_csv_path)

../../retrieved_data/e001-train_ret25_map0.1378_recall0.4531.csv


# Kaggle Upload

In [79]:
import os
import json

from kaggle.api.kaggle_api_extended import KaggleApi


def dataset_create_new(dataset_name: str, upload_dir: str, owner: str = "sinchir0"):
    """Write dataset metadata into ``upload_dir`` and create a Kaggle dataset from it.

    Args:
        dataset_name: Used both as the dataset slug (after ``owner/``) and as
            its title.
        upload_dir: Directory whose contents are uploaded; a
            ``dataset-metadata.json`` file is written into it first.
        owner: Kaggle account the dataset is created under. Defaults to the
            account this notebook was originally written for.
    """
    # NOTE: a guard that rejected "_" in dataset_name was disabled here.
    dataset_metadata = {
        "id": f"{owner}/{dataset_name}",
        "licenses": [{"name": "CC0-1.0"}],
        "title": dataset_name,
    }
    with open(os.path.join(upload_dir, "dataset-metadata.json"), "w") as f:
        json.dump(dataset_metadata, f, indent=4)
    api = KaggleApi()
    api.authenticate()
    api.dataset_create_new(folder=upload_dir, convert_to_csv=False, dir_mode="tar")


# Log the dataset name and the directory being uploaded, then create the
# Kaggle dataset from the files in MODEL_OUTPUT_PATH.
print(f"Create Dataset name:{DATASET_NAME}, output_dir:{MODEL_OUTPUT_PATH}")
dataset_create_new(dataset_name=DATASET_NAME, upload_dir=MODEL_OUTPUT_PATH)

Create Dataset name:e001-ret-tfidf, output_dir:../../eedi-mining-misconceptions-in-mathematics/trained_models/retriever/e001


Starting upload for file vectorizer.pkl


100%|██████████| 69.5k/69.5k [00:00<00:00, 114kB/s]


Upload successful: vectorizer.pkl (70KB)
Starting upload for file misconception_mapping_vec.npy


100%|██████████| 70.3M/70.3M [00:03<00:00, 23.3MB/s]

Upload successful: misconception_mapping_vec.npy (70MB)



