# Overview
In this notebook, the concatenation of `ConstructName`, `SubjectName`, `QuestionText`, and each `Answer[A-D]Text` is vectorized with TF-IDF, and the misconceptions whose names have the highest cosine similarity to that text are used as the inference results.

- MAP@25 (train rows that have a labelled misconception): 0.1378
- recall within the retrieved candidates (same rows): 0.4530

Please let me know if there are any mistakes.

# Setting

In [33]:
# Directory the input CSVs (train, test, misconception mapping, sample submission) are read from.
DATA_PATH = "../../data"
# Directory the retrieved-candidates CSV is written to at the end of the notebook.
OUTPUT_PATH = "../../retrieved_data"
# Number of top-ranked misconception candidates kept per (question, answer) row.
# NOTE(review): the recorded cell outputs below list 25 candidates per row, so
# they appear to come from a run with a different value than 10 -- confirm.
RETRIEVE_NUM = 10  # TODO: try a larger value

# Import

In [34]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Data Load

In [35]:
# Question tables; both are reshaped to one row per answer option further down.
train = pd.read_csv(f"{DATA_PATH}/train.csv")
test = pd.read_csv(f"{DATA_PATH}/test.csv")

# Misconception lookup table; its "MisconceptionName" column is the retrieval target text.
misconception_mapping = pd.read_csv(f"{DATA_PATH}/misconception_mapping.csv")
# Loaded for reference; not used by any later cell in this notebook.
sample_submission = pd.read_csv(f"{DATA_PATH}/sample_submission.csv")

# Preprocess

In [36]:
# def make_all_question_text(df: pd.DataFrame) -> pd.DataFrame:
#     df["AllQuestionText"] = (
#         df["ConstructName"] + " " + df["SubjectName"] + " " + df["QuestionText"]
#     )
#     return df


# train = make_all_question_text(train)
# test = make_all_question_text(test)

In [37]:
from typing import Literal


def wide_to_long(
    df: pd.DataFrame, col: Literal["AnswerText", "MisconceptionId"]
) -> pd.DataFrame:
    """Melt the four per-option columns (A-D) into one row per question and option.

    Args:
        df: Wide frame containing ``QuestionId``, ``ConstructName``,
            ``SubjectName``, ``QuestionText``, ``CorrectAnswer`` and the four
            option columns selected by ``col``.
        col: ``"AnswerText"`` melts ``Answer[A-D]Text``; ``"MisconceptionId"``
            melts ``Misconception[A-D]Id``. It is also the name of the value
            column in the result.

    Returns:
        Long frame with the five id columns, a variable column
        (``AnswerType`` or ``MisconceptionType``) holding the name of the
        source column, and the value column ``col``.

    Raises:
        ValueError: If ``col`` is not one of the two supported values.
    """
    # Columns repeated on every melted row; defined once so the column
    # selection and ``id_vars`` cannot drift apart.
    id_cols = [
        "QuestionId",
        "ConstructName",
        "SubjectName",
        "QuestionText",
        "CorrectAnswer",
    ]
    options = ["A", "B", "C", "D"]

    if col == "AnswerText":
        add_col = [f"Answer{alpha}Text" for alpha in options]
        var_name = "AnswerType"
    elif col == "MisconceptionId":
        add_col = [f"Misconception{alpha}Id" for alpha in options]
        var_name = "MisconceptionType"
    else:
        # ValueError is still an Exception, so existing broad handlers keep
        # working, but the failure now says what went wrong.
        raise ValueError(
            f"col must be 'AnswerText' or 'MisconceptionId', got {col!r}"
        )

    return pd.melt(
        df[id_cols + add_col],
        id_vars=id_cols,
        var_name=var_name,
        value_name=col,
    )


# One row per (question, answer option) for both splits.
train_long = wide_to_long(train, col="AnswerText")
test_long = wide_to_long(test, col="AnswerText")

# Same reshape for the per-option misconception labels; only applied to train here.
train_long_mis = wide_to_long(train, col="MisconceptionId")

In [38]:
# Pull the option letter out of the melted column names so that answers and
# misconception labels can be joined on (QuestionId, letter).
train_long["AnswerAlphabet"] = train_long["AnswerType"].str.extract(
    r"Answer([A-Z])Text$", expand=False
)
test_long["AnswerAlphabet"] = test_long["AnswerType"].str.extract(
    r"Answer([A-Z])Text$", expand=False
)

train_long_mis["MisconceptionAlphabet"] = train_long_mis["MisconceptionType"].str.extract(
    r"Misconception([A-Z])Id$", expand=False
)

In [39]:
# Attach each answer option's misconception label by matching the question id
# and the option letter (default inner join).
train_long = train_long.merge(
    right=train_long_mis[["QuestionId", "MisconceptionId", "MisconceptionAlphabet"]],
    left_on=["QuestionId", "AnswerAlphabet"],
    right_on=["QuestionId", "MisconceptionAlphabet"],
)

In [40]:
def make_all_text(df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``AllText`` column joining the four text fields with single spaces.

    The fields are concatenated in the order ``ConstructName``,
    ``SubjectName``, ``QuestionText``, ``AnswerText``. The column is written
    onto ``df`` itself and the same frame is returned.
    """
    first, *rest = ["ConstructName", "SubjectName", "QuestionText", "AnswerText"]
    combined = df[first]
    for field in rest:
        combined = combined + " " + df[field]
    df["AllText"] = combined
    return df


# Build the query text ("AllText") for every (question, answer option) row in both splits.
train_long = make_all_text(train_long)
test_long = make_all_text(test_long)

In [41]:
# Inspect the merged long-format training frame.
train_long

Unnamed: 0,QuestionId,ConstructName,SubjectName,QuestionText,CorrectAnswer,AnswerType,AnswerText,AnswerAlphabet,MisconceptionId,MisconceptionAlphabet,AllText
0,0,Use the order of operations to carry out calcu...,BIDMAS,\[\n3 \times 2+4-5\n\]\nWhere do the brackets ...,A,AnswerAText,\( 3 \times(2+4)-5 \),A,,A,Use the order of operations to carry out calcu...
1,1,Simplify an algebraic fraction by factorising ...,Simplifying Algebraic Fractions,"Simplify the following, if possible: \( \frac{...",D,AnswerAText,\( m+1 \),A,2142.0,A,Simplify an algebraic fraction by factorising ...
2,2,Calculate the range from a list of data,Range and Interquartile Range from a List of Data,Tom and Katie are discussing the \( 5 \) plant...,B,AnswerAText,Only\nTom,A,1287.0,A,Calculate the range from a list of data Range ...
3,3,Recall and use the intersecting diagonals prop...,Properties of Quadrilaterals,The angles highlighted on this rectangle with ...,C,AnswerAText,acute,A,1180.0,A,Recall and use the intersecting diagonals prop...
4,4,Substitute positive integer values into formul...,Substitution into Formula,The equation \( f=3 r^{2}+3 \) is used to find...,A,AnswerAText,\( 30 \),A,,A,Substitute positive integer values into formul...
...,...,...,...,...,...,...,...,...,...,...,...
7471,1864,Calculate the range from a list of data,Range and Interquartile Range from a List of Data,What is the range of the following numbers?\n\...,C,AnswerDText,\( 16 \),D,1349.0,D,Calculate the range from a list of data Range ...
7472,1865,"Describe an enlargement, with no centre of enl...",Length Scale Factors in Similar Shapes,Shape \( Q \) is an enlargement of shape \( P ...,B,AnswerDText,\( 11-3 \),D,1258.0,D,"Describe an enlargement, with no centre of enl..."
7473,1866,Use the order of operations to carry out calcu...,BIDMAS,What does the following equal?\n\[\n8-7+10 \ti...,B,AnswerDText,\( 33 \),D,1507.0,D,Use the order of operations to carry out calcu...
7474,1867,Distinguish between congruency and similarity,Congruency in Other Shapes,Tom and Katie are discussing congruence and si...,B,AnswerDText,Neither is correct,D,2312.0,D,Distinguish between congruency and similarity ...


In [42]:
# Order rows by question and then by answer option, and renumber the index so
# that row position and index label agree.
train_long = (
    train_long
    .sort_values(by=["QuestionId", "AnswerType"])
    .reset_index(drop=True)
)
test_long = (
    test_long
    .sort_values(by=["QuestionId", "AnswerType"])
    .reset_index(drop=True)
)

# Train tfidf

In [43]:
# Fit a single vectorizer on the training query texts followed by the
# misconception names, so both sides share one vocabulary. The row order of
# the result is: all train_long rows first, then all misconception_mapping rows.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(
    pd.concat([train_long["AllText"], misconception_mapping["MisconceptionName"]])
)

In [44]:
# Densify the TF-IDF matrix once (the original called ``toarray()`` twice,
# materialising the full dense matrix two times) and split it at the boundary
# between the train_long rows and the misconception_mapping rows.
train_long_vec, misconception_mapping_vec = np.split(
    tfidf_matrix.toarray(), [len(train_long)], axis=0
)

print(train_long_vec.shape)
print(misconception_mapping_vec.shape)

(7476, 3564)
(2587, 3564)


In [45]:
# Similarity of every training query row against every misconception name.
train_cos_sim_arr = cosine_similarity(X=train_long_vec, Y=misconception_mapping_vec)
# Negate before sorting so that each row lists column positions from most to
# least similar.
train_sorted_indices = (-train_cos_sim_arr).argsort(axis=1)

In [46]:
# Peek at the first RETRIEVE_NUM ranked positions for each training row.
train_sorted_indices[:, :RETRIEVE_NUM]

array([[2488, 2532, 2039, ..., 1672, 1941,   15],
       [2488, 2532, 2039, ..., 1672, 1941,   15],
       [2488, 2532, 2039, ..., 1672, 1941,   15],
       ...,
       [1640,  801,  805, ..., 2090,  311, 1498],
       [1640,  801,  805, ...,  141, 2090,  311],
       [1640,  801,  805, ..., 2090, 1498, 1605]])

In [47]:
# example
def print_example(df: pd.DataFrame, sorted_indices: np.ndarray, idx: int) -> None:
    """Print one query text and the names of its two best-ranked misconceptions.

    Args:
        df: Frame with an ``AllText`` column; ``idx`` is looked up as an index label.
        sorted_indices: 2-D array of ranked positions, best first, one row per query.
        idx: Which query row to show.

    Reads the module-level ``misconception_mapping`` frame for the names.
    """
    print(f"Query idx{idx}")
    print(df["AllText"][idx])
    names = misconception_mapping["MisconceptionName"]
    for position, header in enumerate(("\nCos Sim No.1", "\nCos Sim No.2")):
        print(header)
        print(names[sorted_indices[idx, position]])


# Show the top-2 matches for the first training row.
print_example(train_long, train_sorted_indices, 0)

Query idx0
Use the order of operations to carry out calculations involving powers BIDMAS \[
3 \times 2+4-5
\]
Where do the brackets need to go to make the answer equal \( 13 \) ? \( 3 \times(2+4)-5 \)

Cos Sim No.1
Answers order of operations questions with brackets as if the brackets are not there

Cos Sim No.2
Believes order of operations does not affect the answer to a calculation


In [48]:
# Show the top-2 matches for the second training row.
print_example(train_long, train_sorted_indices, 1)

Query idx1
Use the order of operations to carry out calculations involving powers BIDMAS \[
3 \times 2+4-5
\]
Where do the brackets need to go to make the answer equal \( 13 \) ? \( 3 \times 2+(4-5) \)

Cos Sim No.1
Answers order of operations questions with brackets as if the brackets are not there

Cos Sim No.2
Believes order of operations does not affect the answer to a calculation


# Evaluate

In [49]:
# Store the first RETRIEVE_NUM ranked positions per row as a Python list.
# NOTE(review): these values are row positions in misconception_mapping, and
# they are later compared directly with MisconceptionId -- this assumes that
# MisconceptionId equals the row position in misconception_mapping; confirm.
train_long["PredictMisconceptionId"] = train_sorted_indices[:, :RETRIEVE_NUM].tolist()

In [50]:
# https://www.kaggle.com/code/cdeotte/how-to-train-open-book-model-part-1#MAP@3-Metric
def map_at_25(predictions, labels):
    """Mean over all queries of the reciprocal-rank score within the top 25.

    Args:
        predictions: Iterable of ranked candidate lists, best candidate first.
        labels: Iterable of the single correct id for each query, aligned
            with ``predictions``.

    Returns:
        Sum of ``1 / rank`` for every position (ranks 1-25) whose candidate
        equals the label, divided by ``len(predictions)``.
    """
    total = 0
    for ranked, truth in zip(predictions, labels):
        gains = []
        # zip stops at 25 ranks or at the end of the candidate list,
        # whichever comes first.
        for rank, candidate in zip(range(1, 26), ranked):
            gains.append(1 / rank if truth == candidate else 0)
        total += np.sum(gains)
    return total / len(predictions)

In [51]:
# Score only the rows that actually have a labelled misconception.
map_at_25_score = map_at_25(
    train_long.loc[train_long["MisconceptionId"].notna(), "PredictMisconceptionId"],
    train_long.loc[train_long["MisconceptionId"].notna(), "MisconceptionId"],
)
map_at_25_score

0.13781884858544377

In [52]:
def recall(predictions, labels):
    """Fraction of queries whose label appears anywhere in its candidate list.

    Args:
        predictions: Iterable of candidate lists, one per query.
        labels: Iterable of the correct id for each query, aligned with
            ``predictions``.

    Returns:
        Number of queries whose label is in its candidates, divided by
        ``len(predictions)``.
    """
    hits = [1 for candidates, truth in zip(predictions, labels) if truth in candidates]
    return np.sum(hits) / len(predictions)


# Same labelled-rows subset as used for the MAP@25 evaluation.
recall_score = recall(
    train_long.loc[train_long["MisconceptionId"].notna(), "PredictMisconceptionId"],
    train_long.loc[train_long["MisconceptionId"].notna(), "MisconceptionId"],
)
recall_score

0.45308924485125857

# Predict

In [53]:
# Vectorize the test queries with the vectorizer fitted earlier; only
# transform() is called here, so nothing is refit on the test text.
test_long_vec = vectorizer.transform(test_long["AllText"])

In [54]:
# Similarity of every test query row against every misconception name.
test_cos_sim_arr = cosine_similarity(X=test_long_vec, Y=misconception_mapping_vec)
# Negate before sorting so that each row lists column positions from most to
# least similar.
test_sorted_indices = (-test_cos_sim_arr).argsort(axis=1)

In [55]:
# Peek at the first RETRIEVE_NUM ranked positions for each test row.
test_sorted_indices[:, :RETRIEVE_NUM]

array([[2488, 2532, 2039,  294, 1965, 1005, 2475, 1728, 1507,  706,  656,
        1872, 2586,  521,  256, 1119, 1999,  537, 2479, 2104, 1026,  987,
        1672, 1941,   15],
       [2488, 2532, 2039,  294, 1965, 1005, 2475, 1728, 1507,  706,  656,
        1872, 2586,  521,  256, 1119, 1999,  537, 2479, 2104, 1026,  987,
        1672, 1941,   15],
       [2488, 2532, 2039,  294, 1965, 1005, 2475, 1728, 1507,  706,  656,
        1872, 2586,  521,  256, 1119, 1999,  537, 2479, 2104, 1026,  987,
        1672, 1941,   15],
       [2488, 2551, 2532, 2039, 1872,  294, 1965,  537, 1392, 1728, 2479,
         256, 2475, 1999, 2131, 2104, 1005, 1119, 1026, 1432, 1643,  656,
         802,  259, 1756],
       [1540,  979,  363, 2398, 1825, 1593,   29,  606,   59,  848,  885,
        2307,  609, 1309,   80, 2463, 2020, 1621, 2199,   78,  381, 1976,
        1542, 1008,  419],
       [1540,  979,  363, 2398, 1825, 1593,   29,  606,   59,  848,  885,
        2307,  609, 1309,   80, 2463, 2020, 1621, 2

In [56]:
# Show the top-2 matches for the first test row.
print_example(test_long, test_sorted_indices, 0)

Query idx0
Use the order of operations to carry out calculations involving powers BIDMAS \[
3 \times 2+4-5
\]
Where do the brackets need to go to make the answer equal \( 13 \) ? \( 3 \times(2+4)-5 \)

Cos Sim No.1
Answers order of operations questions with brackets as if the brackets are not there

Cos Sim No.2
Believes order of operations does not affect the answer to a calculation


In [57]:
# Show the top-2 matches for the second test row.
print_example(test_long, test_sorted_indices, 1)

Query idx1
Use the order of operations to carry out calculations involving powers BIDMAS \[
3 \times 2+4-5
\]
Where do the brackets need to go to make the answer equal \( 13 \) ? \( 3 \times 2+(4-5) \)

Cos Sim No.1
Answers order of operations questions with brackets as if the brackets are not there

Cos Sim No.2
Believes order of operations does not affect the answer to a calculation


# Make Retrieved Train Test File

In [58]:
# TODO: reconsider whether it is really OK to drop the rows whose MisconceptionId is NaN
# Keep only labelled rows, renumber the index, and store the label as an integer.
train_long = (
    train_long
    .dropna(subset=["MisconceptionId"])
    .reset_index(drop=True)
    .astype({"MisconceptionId": int})
)


In [59]:
# Candidate list of the first labelled row, before the correct id is added.
train_long["PredictMisconceptionId"][0]

[2488,
 2551,
 2532,
 2039,
 1872,
 294,
 1965,
 537,
 1392,
 1728,
 2479,
 256,
 2475,
 1999,
 2131,
 2104,
 1005,
 1119,
 1026,
 1432,
 1643,
 656,
 802,
 259,
 1756]

In [60]:
# If the correct misconception is not among the retrieved candidates, add it.
def check_answer_predict_misconception_id(
    actual: int, predict: list[int]
) -> list[int]:
    """Return the candidate list, guaranteed to contain the correct id.

    Args:
        actual: The correct misconception id for the row.
        predict: Retrieved candidate ids, best first.

    Returns:
        A new list equal to ``predict`` when ``actual`` is already present,
        otherwise ``predict`` with ``actual`` appended at the end. The input
        list is never modified, so calling this twice on the same data gives
        the same result and no other holder of ``predict`` is affected.
    """
    if actual in predict:
        return list(predict)
    return [*predict, actual]


# Row by row, make sure the correct id is present in each candidate list.
train_long["PredictMisconceptionId"] = list(
    map(
        check_answer_predict_misconception_id,
        train_long["MisconceptionId"],
        train_long["PredictMisconceptionId"],
    )
)

In [61]:
train_long["PredictMisconceptionId"][0]  # 1672 was appended

[2488,
 2551,
 2532,
 2039,
 1872,
 294,
 1965,
 537,
 1392,
 1728,
 2479,
 256,
 2475,
 1999,
 2131,
 2104,
 1005,
 1119,
 1026,
 1432,
 1643,
 656,
 802,
 259,
 1756,
 1672]

In [62]:
# Expand each candidate list into one row per (question, answer, candidate).
train_long = train_long.explode(column="PredictMisconceptionId").reset_index(drop=True)
# Add the target column: 1 when the candidate is the correct misconception, else 0.
train_long["target"] = (
    train_long["MisconceptionId"].eq(train_long["PredictMisconceptionId"]).astype(int)
)

In [63]:
# Check that the number of rows with target == 1 equals the number of non-NaN
# misconception labels in the original training data.
assert int(train_long["target"].eq(1).sum()) == int(
    train[
        ["MisconceptionAId", "MisconceptionBId", "MisconceptionCId", "MisconceptionDId"]
    ]
    .notna()
    .to_numpy()
    .sum()
)

In [64]:
# Persist the exploded candidate table. The file name records RETRIEVE_NUM and
# the MAP@25 / recall scores computed above. OUTPUT_PATH is not created here,
# so it is assumed to exist already.
train_long.to_csv(
    f"{OUTPUT_PATH}/e001-train_ret{RETRIEVE_NUM}_map{map_at_25_score:.4f}_recall{recall_score:.4f}.csv"
)