In [187]:
import polars as pl
import pandas as pd

In [188]:
# Load the prediction CSV.
data = pd.read_csv("../valid_df/exp024.csv")

# Keep only the four *_pred columns, drop every row that has a missing value in
# any of them, cast the two id columns to int and strip the "_pred" suffix.
# Everything is done in one chain that returns a new frame: assigning columns on
# the result of `data[[...]].dropna()` is the chained-assignment pattern that can
# trigger pandas' SettingWithCopyWarning.
pred_data = (
    data[["document_pred", "token_pred", "label_pred", "token_str_pred"]]
    .dropna()
    .astype({"document_pred": int, "token_pred": int})
    .rename(
        columns={
            "document_pred": "document",
            "token_pred": "token",
            "label_pred": "label",
            "token_str_pred": "token_str",
        }
    )
)

In [189]:
# Convert the pandas frame into a polars DataFrame; the name `pred_data` is
# rebound, so from here on it refers to the polars object.
pred_data = pl.DataFrame(pred_data)

In [190]:
# Display the polars frame (columns: document, token, label, token_str).
pred_data

document,token,label,token_str
i64,i64,str,str
11092,10,"""B-NAME_STUDENT…","""Bilal"""
11092,11,"""I-NAME_STUDENT…","""Khan"""
6133,20,"""B-NAME_STUDENT…","""Mahammad"""
6133,21,"""I-NAME_STUDENT…","""Khan"""
7932,24,"""B-NAME_STUDENT…","""Ahmed"""
7932,25,"""I-NAME_STUDENT…","""Carlos"""
7932,528,"""B-NAME_STUDENT…","""Ahmed"""
7932,529,"""I-NAME_STUDENT…","""Carlos"""
7932,750,"""B-NAME_STUDENT…","""Ahmed"""
7932,751,"""I-NAME_STUDENT…","""Alaa"""


In [191]:
# Modified function to extract up to three distinct B-NAME_STUDENT and their corresponding I-NAME_STUDENT(s)
# TODO: verify that this logic is correct.
import numpy as np


def _read_student_name(doc_rows, start):
    """Read one name that begins at the B-NAME_STUDENT row labelled ``start``.

    ``doc_rows`` holds the rows of a single document and must be indexed by the
    integer row positions of the full frame, so that ``start + 1`` is the row
    immediately following ``start``.

    Returns:
        A ``(b_token, i_tokens)`` tuple. ``i_tokens`` is the space-joined run
        of I-NAME_STUDENT tokens found on the rows directly after ``start``
        within the same document, or ``""`` when there is none.
    """
    i_tokens = []
    following = start + 1
    # Walk forward while the next row exists in this document and continues the name.
    while (
        following in doc_rows.index
        and doc_rows.at[following, "label"] == "I-NAME_STUDENT"
    ):
        i_tokens.append(doc_rows.at[following, "token_str"])
        following += 1
    return doc_rows.at[start, "token_str"], " ".join(i_tokens)


def extract_three_distinct_students(df):
    """Collect up to three distinct student names per document.

    Args:
        df: pandas DataFrame with at least the columns ``document``, ``label``
            and ``token_str``. Row order matters: an I-NAME_STUDENT row is
            attached to a B-NAME_STUDENT row only when it directly follows it
            in ``df`` and belongs to the same document. The index of ``df`` is
            ignored.

    Returns:
        pandas DataFrame with one row per document that contains at least one
        B-NAME_STUDENT row, and always the columns ``document``,
        ``{1st,2nd,3rd}_B-NAME_STUDENT`` and ``{1st,2nd,3rd}_I-NAME_STUDENT``.
        Names are taken in order of appearance; missing names and empty
        I-NAME_STUDENT runs are NaN. Two names count as the same student when
        their B-NAME_STUDENT token is equal, whatever the I tokens are.
    """
    # NOTE(review): adjacency is decided by row order only; the `token` column
    # is not consulted, so an I row is attached even if its token index is not
    # B's token index + 1 -- confirm this is intended.
    ordinals = ("1st", "2nd", "3rd")
    columns = [
        "document",
        "1st_B-NAME_STUDENT",
        "1st_I-NAME_STUDENT",
        "2nd_B-NAME_STUDENT",
        "2nd_I-NAME_STUDENT",
        "3rd_B-NAME_STUDENT",
        "3rd_I-NAME_STUDENT",
    ]

    # The "next row" lookup is `label + 1`, which is only meaningful when the
    # index is 0..n-1. Without this, a frame that kept its index after
    # dropna()/filtering would silently lose every I-NAME_STUDENT token.
    df = df.reset_index(drop=True)

    results = []
    for doc_id, doc_rows in df.groupby("document"):
        b_rows = doc_rows[doc_rows["label"] == "B-NAME_STUDENT"].index.tolist()
        if not b_rows:
            continue  # no name starts in this document

        result = {"document": doc_id}
        seen_b_tokens = []
        for row in b_rows:
            b_token, i_tokens = _read_student_name(doc_rows, row)
            if b_token in seen_b_tokens:
                continue  # repeated mention of an already recorded student
            seen_b_tokens.append(b_token)
            ordinal = ordinals[len(seen_b_tokens) - 1]
            result[f"{ordinal}_B-NAME_STUDENT"] = b_token
            result[f"{ordinal}_I-NAME_STUDENT"] = i_tokens
            if len(seen_b_tokens) == len(ordinals):
                break
        results.append(result)

    # Passing `columns` guarantees the full schema even when `results` is empty
    # or no document has a second/third name (absent values become NaN).
    # fillna(np.nan) turns None into NaN; replace() turns empty I-NAME_STUDENT
    # strings into NaN as well.
    return (
        pd.DataFrame(results, columns=columns)
        .fillna(np.nan)
        .replace("", np.nan)
    )


# Build the per-document table of up to three distinct student names.  The
# extraction works on pandas, so convert on the way in and wrap the result back
# into a polars DataFrame on the way out.
three_distinct_students = pl.DataFrame(
    extract_three_distinct_students(pred_data.to_pandas())
)

In [192]:
# Display the per-document name table (one row per document).
three_distinct_students

document,1st_B-NAME_STUDENT,1st_I-NAME_STUDENT,2nd_B-NAME_STUDENT,2nd_I-NAME_STUDENT,3rd_B-NAME_STUDENT,3rd_I-NAME_STUDENT
i64,str,str,str,str,str,str
10,"""Diego""","""Estrada""",,,,
93,"""Silvia""","""Villalobos""",,,,
104,"""Sakir""","""Ahmad""",,,,
112,"""Francisco""","""Ferreira""",,,,
123,"""Stefano""","""Lovato""",,,,
136,"""Al""",,,,,
166,"""Pepa""","""Medrano""",,,,
204,"""Deiby""",,,,,
214,"""Fareed""","""Ponce""",,,,
308,"""Maud""","""Dias""",,,,


In [193]:
def _unmatched_tokens_expr(document, label: str, keep_tokens: list[str]) -> pl.Expr:
    """Select rows of ``document`` carrying ``label`` whose token_str is not in ``keep_tokens``."""
    in_scope = (pl.col("document") == document) & (pl.col("label") == label)
    if not keep_tokens:
        # Nothing to keep: every row of this document/label is unmatched.
        # Handled here so is_in() never receives an empty or null-only list.
        return in_scope
    return in_scope & ~pl.col("token_str").is_in(keep_tokens)


def filter_pred(
    three_distinct_students: pl.DataFrame,
    valid_pred_df: pl.DataFrame,
    max_students: int = 1,
) -> pl.DataFrame:
    """Drop predicted name tokens that do not belong to a document's main student(s).

    For every document listed in ``three_distinct_students``, rows of
    ``valid_pred_df`` labelled B-NAME_STUDENT / I-NAME_STUDENT are kept only if
    their ``token_str`` is one of the B / I tokens of the first
    ``max_students`` names recorded for that document. Rows with other labels,
    and rows of documents not listed, are left untouched. The shape before and
    after filtering is printed.

    Args:
        three_distinct_students: one row per document with the columns
            ``document``, ``1st_B-NAME_STUDENT`` and ``1st_I-NAME_STUDENT``;
            the ``2nd_*`` / ``3rd_*`` columns are used when present and
            requested. ``*_I-NAME_STUDENT`` values are expected to be the
            space-joined I tokens of the name, or null.
        valid_pred_df: predictions with the columns ``document``, ``label``
            and ``token_str``.
        max_students: how many of the recorded names (1 to 3) to keep per
            document. The default of 1 keeps only the first name.

    Returns:
        A new, filtered DataFrame; ``valid_pred_df`` itself is not modified.

    Raises:
        ValueError: if ``max_students`` is not between 1 and 3.
    """
    ordinals = ("1st", "2nd", "3rd")
    if not 1 <= max_students <= len(ordinals):
        raise ValueError(f"max_students must be between 1 and 3, got {max_students}")

    name_cols = [
        (f"{ordinal}_B-NAME_STUDENT", f"{ordinal}_I-NAME_STUDENT")
        for ordinal in ordinals[:max_students]
    ]
    # The 1st_* columns are mandatory (get_column raises if one is absent);
    # 2nd_*/3rd_* are optional and skipped when the table does not have them.
    available = set(three_distinct_students.columns)
    name_cols = name_cols[:1] + [
        pair for pair in name_cols[1:] if set(pair) <= available
    ]

    print(f"Before: valid_pred_df.shape = {valid_pred_df.shape}")
    for document in three_distinct_students["document"].unique().to_list():
        # Names to keep for this document (first matching row of the table).
        student_only_document = three_distinct_students.filter(
            pl.col("document") == document
        )

        keep_b_tokens: list[str] = []
        keep_i_tokens: list[str] = []
        for b_name_col, i_name_col in name_cols:
            b_name = student_only_document.get_column(b_name_col)[0]
            # Null / NaN entries mean "no such name"; only real strings are kept.
            if isinstance(b_name, str) and b_name:
                keep_b_tokens.append(b_name)

            i_name = student_only_document.get_column(i_name_col)[0]
            if isinstance(i_name, str) and i_name:
                # The stored value is the space-joined run of I tokens, while
                # token_str holds single tokens: split it back so that every
                # token of a multi-token continuation matches. The joined
                # string itself is kept too, in case a token contains a space.
                keep_i_tokens.append(i_name)
                keep_i_tokens.extend(tok for tok in i_name.split(" ") if tok)

        # Remove this document's name tokens that match none of the kept names.
        valid_pred_df = valid_pred_df.filter(
            ~(
                _unmatched_tokens_expr(document, "B-NAME_STUDENT", keep_b_tokens)
                | _unmatched_tokens_expr(document, "I-NAME_STUDENT", keep_i_tokens)
            )
        )

    print(f"After: valid_pred_df.shape = {valid_pred_df.shape}")

    return valid_pred_df


# The return value is not assigned: the filtered frame is only shown as the
# cell output, and `pred_data` keeps all of its rows.
filter_pred(three_distinct_students, pred_data)

Before: valid_pred_df.shape = (1438, 4)
After: valid_pred_df.shape = (1245, 4)


document,token,label,token_str
i64,i64,str,str
11092,10,"""B-NAME_STUDENT…","""Bilal"""
11092,11,"""I-NAME_STUDENT…","""Khan"""
6133,20,"""B-NAME_STUDENT…","""Mahammad"""
6133,21,"""I-NAME_STUDENT…","""Khan"""
7932,24,"""B-NAME_STUDENT…","""Ahmed"""
7932,25,"""I-NAME_STUDENT…","""Carlos"""
7932,528,"""B-NAME_STUDENT…","""Ahmed"""
7932,529,"""I-NAME_STUDENT…","""Carlos"""
7932,750,"""B-NAME_STUDENT…","""Ahmed"""
7932,757,"""B-NAME_STUDENT…","""Ahmed"""


In [194]:
# Display pred_data again; the filter_pred call above did not assign its
# result, so this is still the unfiltered frame.
pred_data

document,token,label,token_str
i64,i64,str,str
11092,10,"""B-NAME_STUDENT…","""Bilal"""
11092,11,"""I-NAME_STUDENT…","""Khan"""
6133,20,"""B-NAME_STUDENT…","""Mahammad"""
6133,21,"""I-NAME_STUDENT…","""Khan"""
7932,24,"""B-NAME_STUDENT…","""Ahmed"""
7932,25,"""I-NAME_STUDENT…","""Carlos"""
7932,528,"""B-NAME_STUDENT…","""Ahmed"""
7932,529,"""I-NAME_STUDENT…","""Carlos"""
7932,750,"""B-NAME_STUDENT…","""Ahmed"""
7932,751,"""I-NAME_STUDENT…","""Alaa"""
