In [18]:
import pandas as pd
import numpy as np
import torch
from torch.nn.functional import cosine_similarity
from pathlib import Path
from collections import Counter
import sys


base_in = Path.cwd().parent / "data-merged" / "data-merged" / "air-exercise-2" / "Part-1"
base_in_prev = Path.cwd().parent / "data-merged" / "data-merged"  # output of previous script
base_out = Path.cwd().parent / "output"

docs = pd.read_csv(base_in_prev / "fira-22.documents.embeddings.tsv", sep="\t")
queries = pd.read_csv(base_in_prev / "fira-22.queries.embeddings.tsv", sep="\t")
judgements: pd.DataFrame = pd.read_csv(base_in / "fira-22.judgements-anonymized.tsv", sep="\t")


def preprocess_docs(docs: pd.DataFrame) -> pd.DataFrame:
    docs = docs[docs["doc_id"].isin(judgements["documentId"].unique())]
    len_j = len(judgements["documentId"].unique())
    len_d = len(docs["doc_id"].unique())
    assert len_j == len_d
    return docs


def preprocess_queries(queries: pd.DataFrame) -> pd.DataFrame:
    queries = queries[queries["query_id"].isin(judgements["queryId"].unique())]
    len_j = len(judgements["queryId"].unique())
    len_q = len(queries["query_id"].unique())
    assert len_j == len_q
    return queries


def preprocess_judgements(judgements: pd.DataFrame) -> pd.DataFrame:
    prev_len = len(judgements)
    judgements = judgements.dropna().drop_duplicates()
    assert len(judgements) == prev_len
    judgements = judgements[["relevanceLevel", "queryId", "documentId"]]
    judgements["relevanceLevel"] = judgements["relevanceLevel"].map({"0_NOT_RELEVANT": 0, "1_TOPIC_RELEVANT_DOES_NOT_ANSWER": 1, "2_GOOD_ANSWER": 2, "3_PERFECT_ANSWER": 3})
    return judgements


def get_cos_similarity(q_id: str, d_id: str) -> float:
    q_embedding: torch.tensor = torch.tensor([float(i) for i in queries[queries["query_id"] == q_id]["query_embedding"].values[0].strip("[]").split(", ")]).unsqueeze(0)  # type: ignore
    d_embedding: torch.tensor = torch.tensor([float(i) for i in docs[docs["doc_id"] == d_id]["doc_embedding"].values[0].strip("[]").split(", ")]).unsqueeze(0)  # type: ignore
    sim: float = cosine_similarity(q_embedding, d_embedding).item()
    assert 0 <= sim <= 1
    return sim


docs = preprocess_docs(docs)  # "doc_id", "doc_embedding"
queries = preprocess_queries(queries)  # "query_id", "query_embedding"
judgements = preprocess_judgements(judgements)  # "relevanceLevel", "queryId", "documentId"

if __name__ == "__main__":
    SAMPLE_SIZE = 5
    
    # randomize query order
    queries = queries.sample(frac=1).reset_index(drop=True)

    for _, q in queries.iterrows():
        q_id = q["query_id"]
        d_ids = judgements[judgements["queryId"] == q_id]["documentId"].unique()

        # randomize doc order
        d_ids = np.random.permutation(d_ids)

        for doc_id in d_ids:
            votes = judgements[judgements["documentId"] == doc_id]["relevanceLevel"].values
            sim = get_cos_similarity(q_id, doc_id)

            doc_content = docs[docs["doc_id"] == doc_id]["doc_text"].values[0]
            query_content = queries[queries["query_id"] == q_id]["query_text"].values[0]
            print(f"- query id: {q_id}, doc id: {doc_id}")

            sim_vote = 3 if sim >= 0.75 else 2 if sim >= 0.5 else 1 if sim >= 0.25 else 0
            print(f"- expert votes: {votes} (mean: {np.mean(votes):.2f}, median: {int(np.median(votes))}, mode: {Counter(votes).most_common(1)[0][0]})")
            print("- similarity vote:", sim_vote)
            votes = np.append(votes, sim_vote)

            agg_vote = np.round(np.mean(votes)).astype(int)
            assert agg_vote in [0, 1, 2, 3]

            print(f"- aggregated vote: {agg_vote}\n\n")

            # go to next query
            break

        # limit to sample size
        if SAMPLE_SIZE == 0:
            sys.exit(0)
        SAMPLE_SIZE -= 1


- query id: trip_1337, doc id: trip_4728579
- expert votes: [2 2 2] (mean: 2.00, median: 2, mode: 2)
- similarity vote: 2
- aggregated vote: 2


- query id: trip_443528, doc id: trip_9943688
- expert votes: [0 0 0] (mean: 0.00, median: 0, mode: 0)
- similarity vote: 2
- aggregated vote: 0


- query id: rob_q_FT933-11533, doc id: rob_FT924-4715
- expert votes: [0 2 1] (mean: 1.00, median: 1, mode: 0)
- similarity vote: 1
- aggregated vote: 1


- query id: trip_57861, doc id: trip_5571694
- expert votes: [3 1 2] (mean: 2.00, median: 2, mode: 3)
- similarity vote: 2
- aggregated vote: 2


- query id: rob_qq_FR940811-1-00004, doc id: rob_FR940811-1-00004
- expert votes: [0 0 0] (mean: 0.00, median: 0, mode: 0)
- similarity vote: 2
- aggregated vote: 0


- query id: rob_qq_FR940318-0-00056, doc id: rob_FR940106-0-00031
- expert votes: [1 0 0] (mean: 0.33, median: 0, mode: 0)
- similarity vote: 3
- aggregated vote: 1




SystemExit: 0

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
