In [2]:
import polars as pl

In [3]:
# Directory holding the input CSV files, relative to this notebook.
DATA_PATH = "../data"

# Lookup table that is later left-joined onto the training rows by MisconceptionId.
misconception_mapping = pl.read_csv(f"{DATA_PATH}/misconception_mapping.csv")
# Training questions in wide format (separate columns per answer option A-D).
train = pl.read_csv(f"{DATA_PATH}/train.csv")

In [12]:
# Show the raw QuestionText of the first row whose QuestionId equals 2.
train.filter(pl.col("QuestionId") == 2).get_column("QuestionText")[0]

"Tom and Katie are discussing the \\( 5 \\) plants with these heights:\n\\( 24 \\mathrm{~cm}, 17 \\mathrm{~cm}, 42 \\mathrm{~cm}, 26 \\mathrm{~cm}, 13 \\mathrm{~cm} \\)\nTom says if all the plants were cut in half, the range wouldn't change.\nKatie says if all the plants grew by \\( 3 \\mathrm{~cm} \\) each, the range wouldn't change.\nWho do you agree with?"

In [27]:
# Add "CorrectAnswerText": the text of the answer option named by
# "CorrectAnswer". Done with a vectorised Polars expression rather than a
# Python-level loop over rows.
answer_letters = ["A", "B", "C", "D"]

# Fail fast on any label outside A-D (a null label included): the when/then
# expressions below would otherwise silently yield null for such rows.
unexpected_labels = set(train["CorrectAnswer"].unique().to_list()) - set(
    answer_letters
)
if unexpected_labels:
    raise ValueError(
        f"CorrectAnswer must be one of {answer_letters}, found: {unexpected_labels}"
    )

train = train.with_columns(
    # Each when/then without an otherwise is null unless its letter matches,
    # so coalesce picks the text of the one matching option per row.
    pl.coalesce(
        [
            pl.when(pl.col("CorrectAnswer") == letter).then(
                pl.col(f"Answer{letter}Text")
            )
            for letter in answer_letters
        ]
    ).alias("CorrectAnswerText")
)

In [31]:
# Question-level columns that are carried unchanged onto every per-answer row.
common_col = [
    "QuestionId",
    "ConstructName",
    "SubjectName",
    "QuestionText",
    "CorrectAnswer",
    "CorrectAnswerText",
]

# Wide answer-text columns: AnswerAText .. AnswerDText.
answer_text_cols = [f"Answer{alpha}Text" for alpha in "ABCD"]

# Wide -> long: one row per (question, answer option).
answers_unpivoted = train.select(pl.col(common_col + answer_text_cols)).unpivot(
    index=common_col,
    variable_name="AnswerType",
    value_name="AnswerText",
)

# Space-separated concatenation of the question context and this answer's text.
all_text = pl.concat_str(
    [
        pl.col(name)
        for name in ("ConstructName", "SubjectName", "QuestionText", "AnswerText")
    ],
    separator=" ",
).alias("AllText")

# Option letter pulled out of the original wide column name.
answer_alphabet = (
    pl.col("AnswerType").str.extract(r"Answer([A-D])Text$").alias("AnswerAlphabet")
)

# Row key of the form "<QuestionId>_<letter>"; it needs AnswerAlphabet, so it is
# added in a second with_columns step.
question_answer_key = pl.concat_str(
    [pl.col("QuestionId"), pl.col("AnswerAlphabet")], separator="_"
).alias("QuestionId_Answer")

# The sort key is a string column, so the ordering is lexicographic.
train_long = (
    answers_unpivoted.with_columns(all_text, answer_alphabet)
    .with_columns(question_answer_key)
    .sort("QuestionId_Answer")
)
# NOTE(review): more statements follow in this cell, so this preview is
# presumably not displayed by the notebook - confirm.
train_long.head()

# Wide misconception-id columns: MisconceptionAId .. MisconceptionDId.
misconception_id_cols = [f"Misconception{alpha}Id" for alpha in "ABCD"]

# Wide -> long: one row per (question, answer option) with that option's id.
misconceptions_unpivoted = train.select(
    pl.col(common_col + misconception_id_cols)
).unpivot(
    index=common_col,
    variable_name="MisconceptionType",
    value_name="MisconceptionId",
)

train_misconception_long = (
    misconceptions_unpivoted.with_columns(
        # Option letter pulled out of the original wide column name.
        pl.col("MisconceptionType")
        .str.extract(r"Misconception([A-D])Id$")
        .alias("AnswerAlphabet"),
    )
    .with_columns(
        # Same "<QuestionId>_<letter>" key format that train_long uses.
        pl.concat_str(
            [pl.col("QuestionId"), pl.col("AnswerAlphabet")], separator="_"
        ).alias("QuestionId_Answer"),
    )
    .sort("QuestionId_Answer")
    # Keep only the key and the id, with the id cast to Int64.
    .select(
        pl.col("QuestionId_Answer"),
        pl.col("MisconceptionId").cast(pl.Int64),
    )
)
# NOTE(review): more statements follow in this cell, so this preview is
# presumably not displayed by the notebook - confirm.
train_misconception_long.head()

# Attach each answer's MisconceptionId via the "QuestionId_Answer" key, then
# left-join the columns of misconception_mapping so rows whose MisconceptionId
# has no match in the mapping are kept.
train_long = (
    train_long.join(train_misconception_long, on="QuestionId_Answer", how="inner")
    .join(misconception_mapping, on="MisconceptionId", how="left")
    # The joins are not asked to preserve row order, so restore the
    # "QuestionId_Answer" ordering explicitly instead of relying on it
    # surviving from the sorts done before the joins.
    .sort("QuestionId_Answer")
)

In [34]:
# Save the long-format table as CSV. The path is built from DATA_PATH so the
# data directory is configured in a single place.
train_long.write_csv(f"{DATA_PATH}/train_long.csv")