# Overview
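In this notebook, the concatenation of `ConstructName` + `SubjectName` + `QuestionText` + `Answer[A-D]Text` and each `MisconceptionName` are vectorized using TF-IDF; for every question–answer pair, the 25 misconceptions with the highest cosine similarity are submitted as the inference result.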

- MAP@25 (train): 0.1378
- Recall@25 (train): 0.4531

Please let me know if there are any mistakes.

# Setting

In [3]:
# Directory holding the CSV files read in the "Data Load" section.
DATA_PATH = "../../data"
# Output directory; not referenced by any cell visible in this notebook.
OUTPUT_PATH = "../../retrieved_data"

# Import

In [4]:
import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Data Load

In [5]:
# Question tables for the two splits, read from DATA_PATH.
train = pd.read_csv(f"{DATA_PATH}/train.csv")
test = pd.read_csv(f"{DATA_PATH}/test.csv")

# Lookup table supplying the MisconceptionName texts, plus the sample submission file.
misconception_mapping = pd.read_csv(f"{DATA_PATH}/misconception_mapping.csv")
sample_submission = pd.read_csv(f"{DATA_PATH}/sample_submission.csv")

# Preprocess

In [6]:
def make_all_question_text(df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``AllQuestionText`` column built from three text columns.

    ``ConstructName``, ``SubjectName`` and ``QuestionText`` are joined in that
    order with single spaces. The column is written onto ``df`` itself and the
    same frame is returned.
    """
    pieces = [df["ConstructName"], df["SubjectName"], df["QuestionText"]]
    joined = pieces[0]
    for piece in pieces[1:]:
        joined = joined + " " + piece
    df["AllQuestionText"] = joined
    return df


# Add the combined question text column to both splits.
train = make_all_question_text(train)
test = make_all_question_text(test)

In [7]:
from typing import Literal


def wide_to_long(
    df: pd.DataFrame, col: Literal["AnswerText", "MisconceptionId"]
) -> pd.DataFrame:
    """Melt the per-option A-D columns of ``df`` into one row per question and option.

    Args:
        df: Wide frame containing ``QuestionId``, ``AllQuestionText``,
            ``CorrectAnswer`` and the four option columns selected by ``col``.
        col: ``"AnswerText"`` melts ``Answer[A-D]Text``; ``"MisconceptionId"``
            melts ``Misconception[A-D]Id``. It is also used as the name of the
            value column in the result.

    Returns:
        Long frame with the three id columns, a variable column
        (``AnswerType`` or ``MisconceptionType``) holding the source column
        name, and a value column named ``col``.

    Raises:
        ValueError: If ``col`` is not one of the two supported values.
    """
    id_cols = ["QuestionId", "AllQuestionText", "CorrectAnswer"]
    option_letters = ["A", "B", "C", "D"]

    if col == "AnswerText":
        add_col = [f"Answer{alpha}Text" for alpha in option_letters]
        var_name = "AnswerType"
    elif col == "MisconceptionId":
        add_col = [f"Misconception{alpha}Id" for alpha in option_letters]
        var_name = "MisconceptionType"
    else:
        # ValueError is still an Exception, so existing handlers keep working,
        # but the message now says what went wrong.
        raise ValueError(
            f"col must be 'AnswerText' or 'MisconceptionId', got {col!r}"
        )

    return pd.melt(
        df[id_cols + add_col],
        id_vars=id_cols,
        var_name=var_name,
        value_name=col,
    )


# Long-format answer texts for both splits (one row per question and option).
train_long = wide_to_long(train, col="AnswerText")
test_long = wide_to_long(test, col="AnswerText")

# Long-format misconception ids, built from the training split only.
train_long_mis = wide_to_long(train, col="MisconceptionId")

In [8]:
# Pull the option letter out of the melted source-column names.
# expand=False makes str.extract return a Series for the single capture group.
answer_letter_pattern = r"Answer([A-Z])Text$"
train_long["AnswerAlphabet"] = train_long["AnswerType"].str.extract(
    answer_letter_pattern, expand=False
)
test_long["AnswerAlphabet"] = test_long["AnswerType"].str.extract(
    answer_letter_pattern, expand=False
)

misconception_type = train_long_mis["MisconceptionType"]
train_long_mis["MisconceptionAlphabet"] = misconception_type.str.extract(
    r"Misconception([A-Z])Id$", expand=False
)

In [9]:
# Attach each answer option's misconception id by matching on question id and
# option letter. how="inner" is pd.merge's default, spelled out here.
misconception_by_option = train_long_mis[
    ["QuestionId", "MisconceptionId", "MisconceptionAlphabet"]
]
train_long = train_long.merge(
    misconception_by_option,
    how="inner",
    left_on=["QuestionId", "AnswerAlphabet"],
    right_on=["QuestionId", "MisconceptionAlphabet"],
)

In [10]:
def make_all_text(df: pd.DataFrame) -> pd.DataFrame:
    """Add an ``AllText`` column: ``AllQuestionText``, a space, then ``AnswerText``.

    The column is written onto ``df`` itself and the same frame is returned.
    """
    question_part = df["AllQuestionText"]
    answer_part = df["AnswerText"]
    df["AllText"] = question_part + " " + answer_part
    return df


# Build the full query text (question plus one answer option) for both splits.
train_long = make_all_text(train_long)
test_long = make_all_text(test_long)

In [11]:
# Order rows by question, then by option column name, and renumber the index
# from 0 so that index labels match row positions.
sort_keys = ["QuestionId", "AnswerType"]
train_long = train_long.sort_values(by=sort_keys).reset_index(drop=True)
test_long = test_long.sort_values(by=sort_keys).reset_index(drop=True)

# Train tfidf

In [12]:
# Fit a single TF-IDF vocabulary over the training query texts followed by the
# misconception names, so both share one feature space.
vectorizer = TfidfVectorizer()
fit_corpus = pd.concat(
    [train_long["AllText"], misconception_mapping["MisconceptionName"]]
)
tfidf_matrix = vectorizer.fit_transform(fit_corpus)

In [13]:
# Densify the TF-IDF matrix once (the original converted it twice), then split
# it back into its two parts: the first len(train_long) rows are the training
# queries, the remaining rows are the misconception names.
n_train_rows = len(train_long)
tfidf_dense = tfidf_matrix.toarray()
train_long_vec = tfidf_dense[:n_train_rows]
misconception_mapping_vec = tfidf_dense[n_train_rows:]

print(train_long_vec.shape)
print(misconception_mapping_vec.shape)

(7476, 3564)
(2587, 3564)


In [14]:
# Similarity of every training query to every misconception name. Negating
# before the sort orders each row's column indices from most to least similar.
train_cos_sim_arr = cosine_similarity(train_long_vec, misconception_mapping_vec)
train_sorted_indices = (-train_cos_sim_arr).argsort(axis=1)

In [15]:
# Peek at the 25 top-ranked column indices for each training row.
train_sorted_indices[:, :25]

array([[2488, 2532, 2039, ..., 1672, 1941,   15],
       [2488, 2532, 2039, ..., 1672, 1941,   15],
       [2488, 2532, 2039, ..., 1672, 1941,   15],
       ...,
       [1640,  801,  805, ..., 2090,  311, 1498],
       [1640,  801,  805, ...,  141, 2090,  311],
       [1640,  801,  805, ..., 2090, 1498, 1605]])

In [16]:
# example
def print_example(
    df: pd.DataFrame, sorted_indices: np.ndarray, idx: int, top_n: int = 2
) -> None:
    """Print one query text and the names of its best-matching misconceptions.

    Args:
        df: Frame with an ``AllText`` column; row ``idx`` is printed.
        sorted_indices: Array whose row ``idx`` lists row positions of
            ``misconception_mapping`` in ranked order.
        idx: Row position of the query, used for both ``df`` and
            ``sorted_indices``.
        top_n: How many ranked misconceptions to print (default 2).
    """
    print(f"Query idx{idx}")
    # sorted_indices is positional, so look rows up by position as well; a
    # label lookup would pick the wrong row if the index were not 0..n-1.
    print(df["AllText"].iloc[idx])
    misconception_names = misconception_mapping["MisconceptionName"]
    for rank in range(top_n):
        print(f"\nCos Sim No.{rank + 1}")
        print(misconception_names.iloc[sorted_indices[idx, rank]])


# Show the top matches for the first training row.
print_example(train_long, train_sorted_indices, 0)

Query idx0
Use the order of operations to carry out calculations involving powers BIDMAS \[
3 \times 2+4-5
\]
Where do the brackets need to go to make the answer equal \( 13 \) ? \( 3 \times(2+4)-5 \)

Cos Sim No.1
Answers order of operations questions with brackets as if the brackets are not there

Cos Sim No.2
Believes order of operations does not affect the answer to a calculation


In [17]:
# Show the top matches for the second training row.
print_example(train_long, train_sorted_indices, 1)

Query idx1
Use the order of operations to carry out calculations involving powers BIDMAS \[
3 \times 2+4-5
\]
Where do the brackets need to go to make the answer equal \( 13 \) ? \( 3 \times 2+(4-5) \)

Cos Sim No.1
Answers order of operations questions with brackets as if the brackets are not there

Cos Sim No.2
Believes order of operations does not affect the answer to a calculation


# Evaluate

In [18]:
# Keep the 25 top-ranked misconception_mapping row positions for each training row.
# NOTE(review): the evaluation compares these positions against MisconceptionId,
# which presumably relies on row position i holding MisconceptionId i — confirm.
train_long["PredictMisconceptionId"] = train_sorted_indices[:, :25].tolist()

In [19]:
# https://www.kaggle.com/code/cdeotte/how-to-train-open-book-model-part-1#MAP@3-Metric
def map_at_25(predictions, labels, k: int = 25):
    """Mean average precision at ``k`` for one true label per query.

    Args:
        predictions: Sized iterable of ranked candidate lists, one per query.
        labels: Iterable of true labels, aligned with ``predictions``.
        k: Number of top-ranked candidates to score (default 25).

    Returns:
        The mean over queries of ``1 / rank`` of the first candidate equal to
        the label within the top ``k`` (0 for a query with no hit). Returns
        ``0.0`` when ``predictions`` is empty instead of dividing by zero.
    """
    n_queries = len(predictions)
    if n_queries == 0:
        return 0.0

    total = 0.0
    for ranked, label in zip(predictions, labels):
        for rank, candidate in zip(range(1, k + 1), ranked):
            if candidate == label:
                total += 1.0 / rank
                # Only the first hit counts, so a repeated candidate cannot
                # inflate the score.
                break
    return total / n_queries

In [20]:
# Score only the rows that have a ground-truth misconception label.
labelled_rows = train_long["MisconceptionId"].notnull()
map_at_25(
    train_long.loc[labelled_rows, "PredictMisconceptionId"],
    train_long.loc[labelled_rows, "MisconceptionId"],
)

0.13781884858544377

In [21]:
def recall(predictions, labels):
    """Fraction of queries whose true label appears among their predictions.

    Args:
        predictions: Sized iterable of candidate collections, one per query.
        labels: Iterable of true labels, aligned with ``predictions``.

    Returns:
        Number of queries whose label is contained in its candidates, divided
        by the number of queries. Returns ``0.0`` when ``predictions`` is
        empty instead of producing ``nan``.
    """
    n_queries = len(predictions)
    if n_queries == 0:
        return 0.0
    hits = sum(1 for ranked, label in zip(predictions, labels) if label in ranked)
    return hits / n_queries


# Recall over the rows that have a ground-truth misconception label.
rows_with_label = train_long["MisconceptionId"].notnull()
recall(
    train_long.loc[rows_with_label, "PredictMisconceptionId"],
    train_long.loc[rows_with_label, "MisconceptionId"],
)

0.45308924485125857

# Predict

In [22]:
# Vectorize the test query texts with the already-fitted vectorizer (transform only, no refit).
test_long_vec = vectorizer.transform(test_long["AllText"])

In [23]:
# Similarity of every test query to every misconception name. Negating before
# the sort orders each row's column indices from most to least similar.
test_cos_sim_arr = cosine_similarity(test_long_vec, misconception_mapping_vec)
test_sorted_indices = (-test_cos_sim_arr).argsort(axis=1)

In [24]:
# Peek at the 25 top-ranked column indices for each test row.
test_sorted_indices[:, :25]

array([[2488, 2532, 2039,  294, 1965, 1005, 2475, 1728, 1507,  706,  656,
        1872, 2586,  521,  256, 1119, 1999,  537, 2479, 2104, 1026,  987,
        1672, 1941,   15],
       [2488, 2532, 2039,  294, 1965, 1005, 2475, 1728, 1507,  706,  656,
        1872, 2586,  521,  256, 1119, 1999,  537, 2479, 2104, 1026,  987,
        1672, 1941,   15],
       [2488, 2532, 2039,  294, 1965, 1005, 2475, 1728, 1507,  706,  656,
        1872, 2586,  521,  256, 1119, 1999,  537, 2479, 2104, 1026,  987,
        1672, 1941,   15],
       [2488, 2551, 2532, 2039, 1872,  294, 1965,  537, 1392, 1728, 2479,
         256, 2475, 1999, 2131, 2104, 1005, 1119, 1026, 1432, 1643,  656,
         802,  259, 1756],
       [1540,  979,  363, 2398, 1825, 1593,   29,  606,   59,  848,  885,
        2307,  609, 1309,   80, 2463, 2020, 1621, 2199,   78,  381, 1976,
        1542, 1008,  419],
       [1540,  979,  363, 2398, 1825, 1593,   29,  606,   59,  848,  885,
        2307,  609, 1309,   80, 2463, 2020, 1621, 2

In [25]:
# Show the top matches for the first test row.
print_example(test_long, test_sorted_indices, 0)

Query idx0
Use the order of operations to carry out calculations involving powers BIDMAS \[
3 \times 2+4-5
\]
Where do the brackets need to go to make the answer equal \( 13 \) ? \( 3 \times(2+4)-5 \)

Cos Sim No.1
Answers order of operations questions with brackets as if the brackets are not there

Cos Sim No.2
Believes order of operations does not affect the answer to a calculation


In [26]:
# Show the top matches for the second test row.
print_example(test_long, test_sorted_indices, 1)

Query idx1
Use the order of operations to carry out calculations involving powers BIDMAS \[
3 \times 2+4-5
\]
Where do the brackets need to go to make the answer equal \( 13 \) ? \( 3 \times 2+(4-5) \)

Cos Sim No.1
Answers order of operations questions with brackets as if the brackets are not there

Cos Sim No.2
Believes order of operations does not affect the answer to a calculation


# Make Submit File

In [27]:
# Build the submission frame. test_long is no longer overwritten with its
# filtered subset: the column assignments below need every row of test_long to
# line up with test_sorted_indices, so overwriting made a second run of this
# cell fail with a length mismatch. test_long therefore keeps all its rows.
test_long["QuestionId_Answer"] = (
    test_long["QuestionId"].astype("str") + "_" + test_long["AnswerAlphabet"]
)
# Space-separated string of the 25 top-ranked indices for each row.
# NOTE(review): these are row positions in misconception_mapping; presumably
# row position i holds MisconceptionId i — confirm.
test_long["MisconceptionId"] = [
    " ".join(map(str, top_ranked))
    for top_ranked in test_sorted_indices[:, :25].tolist()
]
# Keep only the incorrect options; the correct answer's row is dropped.
is_wrong_option = test_long["CorrectAnswer"] != test_long["AnswerAlphabet"]
submission = test_long.loc[
    is_wrong_option, ["QuestionId_Answer", "MisconceptionId"]
].reset_index(drop=True)

In [28]:
# Preview the first rows of the generated submission.
submission.head(10)

Unnamed: 0,QuestionId_Answer,MisconceptionId
0,1869_B,2488 2532 2039 294 1965 1005 2475 1728 1507 70...
1,1869_C,2488 2532 2039 294 1965 1005 2475 1728 1507 70...
2,1869_D,2488 2551 2532 2039 1872 294 1965 537 1392 172...
3,1870_A,1540 979 363 2398 1825 1593 29 606 59 848 885 ...
4,1870_B,1540 979 363 2398 1825 1593 29 606 59 848 885 ...
5,1870_C,1540 979 363 2398 1825 1593 29 606 59 848 885 ...
6,1871_A,632 1073 1287 2064 2243 1873 1098 1306 1797 21...
7,1871_C,632 1073 1287 2064 2243 1873 1098 1306 1797 21...
8,1871_D,632 1073 1287 2064 1873 2243 1098 1306 1797 21...


In [29]:
# Preview the sample submission for a column-format comparison.
sample_submission.head(10)

Unnamed: 0,QuestionId_Answer,MisconceptionId
0,1869_A,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,1869_B,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
2,1869_C,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
3,1870_B,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
4,1870_C,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
5,1870_D,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
6,1871_A,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
7,1871_C,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
8,1871_D,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...


In [30]:
# Write the submission to submission.csv in the working directory, without the index column.
submission.to_csv("submission.csv", index=False)