In [11]:
import pandas as pd
import numpy as np
import torch
from torch.nn.functional import cosine_similarity
from pathlib import Path
from collections import Counter


# --- Locations -------------------------------------------------------------
# All paths are resolved relative to the parent of the current working
# directory, so the notebook must be started from a direct child folder.
base_in_prev = Path.cwd().parent / "data-merged" / "data-merged"  # output of previous script
base_in = base_in_prev / "air-exercise-2" / "Part-1"
base_out = Path.cwd().parent / "output"

# --- Raw inputs (tab-separated) --------------------------------------------
# Embedding tables written by the previous script.
docs = pd.read_csv(
    base_in_prev / "fira-22.documents.embeddings.tsv",
    sep="\t",
)
queries = pd.read_csv(
    base_in_prev / "fira-22.queries.embeddings.tsv",
    sep="\t",
)
# One row per relevance judgement; the code below reads its
# "relevanceLevel", "queryId" and "documentId" columns.
judgements: pd.DataFrame = pd.read_csv(
    base_in / "fira-22.judgements-anonymized.tsv",
    sep="\t",
)


def preprocess_docs(docs: pd.DataFrame) -> pd.DataFrame:
    """Keep only the documents that occur in the judgements.

    Rows of *docs* whose ``doc_id`` does not appear in the ``documentId``
    column of the module-level ``judgements`` frame are dropped.

    Args:
        docs: Document table with a ``doc_id`` column.

    Returns:
        The filtered document table.

    Raises:
        AssertionError: If the number of distinct judged document ids differs
            from the number of distinct ids left after filtering.
    """
    judged_ids = judgements["documentId"].unique()
    kept = docs[docs["doc_id"].isin(judged_ids)]
    # Sanity check: the distinct id counts on both sides must agree.
    assert len(judged_ids) == len(kept["doc_id"].unique())
    return kept


def preprocess_queries(queries: pd.DataFrame) -> pd.DataFrame:
    """Keep only the queries that occur in the judgements.

    Rows of *queries* whose ``query_id`` does not appear in the ``queryId``
    column of the module-level ``judgements`` frame are dropped.

    Args:
        queries: Query table with a ``query_id`` column.

    Returns:
        The filtered query table.

    Raises:
        AssertionError: If the number of distinct judged query ids differs
            from the number of distinct ids left after filtering.
    """
    judged_ids = judgements["queryId"].unique()
    kept = queries[queries["query_id"].isin(judged_ids)]
    # Sanity check: the distinct id counts on both sides must agree.
    assert len(judged_ids) == len(kept["query_id"].unique())
    return kept


def preprocess_judgements(judgements: pd.DataFrame) -> pd.DataFrame:
    """Reduce the judgements to three columns and encode the relevance label.

    The input frame is not modified; a new frame is returned.

    Args:
        judgements: Raw judgement table containing at least the columns
            ``relevanceLevel``, ``queryId`` and ``documentId``.

    Returns:
        A frame with exactly the columns ``relevanceLevel``, ``queryId`` and
        ``documentId`` where ``relevanceLevel`` is mapped to the integers
        0..3. Labels that are not one of the four known strings become NaN
        (behaviour of ``Series.map`` with a dict).

    Raises:
        AssertionError: If the input contains rows with missing values or
            fully duplicated rows (dropping them would change the row count).
    """
    prev_len = len(judgements)
    judgements = judgements.dropna().drop_duplicates()
    # The data is expected to be clean already; this only verifies it.
    assert len(judgements) == prev_len

    level_codes = {
        "0_NOT_RELEVANT": 0,
        "1_TOPIC_RELEVANT_DOES_NOT_ANSWER": 1,
        "2_GOOD_ANSWER": 2,
        "3_PERFECT_ANSWER": 3,
    }
    # Explicit copy: assigning a column on the result of ``df[[...]]`` would
    # otherwise trigger pandas' SettingWithCopyWarning (chained assignment).
    judgements = judgements[["relevanceLevel", "queryId", "documentId"]].copy()
    judgements["relevanceLevel"] = judgements["relevanceLevel"].map(level_codes)
    return judgements


def _parse_embedding(raw: str) -> torch.Tensor:
    """Turn a stringified float list such as ``"[0.1, 0.2]"`` into a (1, dim) tensor."""
    # Splitting on "," alone is enough: float() ignores surrounding whitespace,
    # so both "0.1, 0.2" and "0.1,0.2" are accepted.
    values = [float(token) for token in raw.strip("[]").split(",")]
    return torch.tensor(values).unsqueeze(0)


def get_cos_similarity(q_id: str, d_id: str) -> float:
    """Return the cosine similarity between a query and a document embedding.

    The embeddings are looked up in the module-level ``queries`` and ``docs``
    frames (columns ``query_embedding`` / ``doc_embedding``), where they are
    stored as stringified lists of floats. If an id occurs more than once,
    the first matching row is used.

    Args:
        q_id: Value to match against ``queries["query_id"]``.
        d_id: Value to match against ``docs["doc_id"]``.

    Returns:
        The cosine similarity as a Python float in ``[0, 1]``.

    Raises:
        IndexError: If ``q_id`` or ``d_id`` is not present in its frame.
        ValueError: If an embedding string cannot be parsed as floats.
        AssertionError: If the similarity lies outside ``[0, 1]`` by more
            than rounding error.
    """
    q_raw = queries[queries["query_id"] == q_id]["query_embedding"].values[0]
    d_raw = docs[docs["doc_id"] == d_id]["doc_embedding"].values[0]

    sim: float = cosine_similarity(_parse_embedding(q_raw), _parse_embedding(d_raw)).item()

    # float32 arithmetic can land a hair outside the bounds (e.g. 1.0000001
    # for identical vectors); snap such values so the check below does not
    # fail spuriously.
    tolerance = 1e-6
    if 1.0 < sim <= 1.0 + tolerance:
        sim = 1.0
    elif -tolerance <= sim < 0.0:
        sim = 0.0

    # NOTE(review): cosine similarity is in [-1, 1] in general; this check
    # encodes the assumption that these embeddings never yield a negative
    # value - confirm against the embedding model.
    assert 0 <= sim <= 1
    return sim


# Apply the preprocessing steps to the frames loaded above.
# Order matters: preprocess_docs and preprocess_queries read the module-level
# `judgements`, so they filter against the raw judgement table here; only the
# last line rebinds `judgements` to its reduced, integer-encoded form.
docs = preprocess_docs(docs)  # "doc_id", "doc_embedding"
queries = preprocess_queries(queries)  # "query_id", "query_embedding"
judgements = preprocess_judgements(judgements)  # "relevanceLevel", "queryId", "documentId"


if __name__ == "__main__":
    # Print a small random sample of query-document pairs together with the
    # expert votes, a similarity-derived vote and the aggregated vote.
    SAMPLE_SIZE = 5  # maximum number of query-document pairs to print

    # randomize query order
    queries = queries.sample(frac=1).reset_index(drop=True)

    shown = 0  # pairs printed so far
    for _, q in queries.iterrows():
        # Stop the outer loop too; breaking only the inner loop would keep
        # printing pairs for every remaining query.
        if shown >= SAMPLE_SIZE:
            break

        q_id = q["query_id"]
        query_content = q["query_text"]
        # All judgements for this query; votes are taken from this subset so
        # that judgements of the same document under other queries are not
        # mixed in.
        q_judgements = judgements[judgements["queryId"] == q_id]

        for doc_id in q_judgements["documentId"].unique():
            if shown >= SAMPLE_SIZE:
                break

            votes = q_judgements[q_judgements["documentId"] == doc_id]["relevanceLevel"].values
            sim = get_cos_similarity(q_id, doc_id)

            doc_content = docs[docs["doc_id"] == doc_id]["doc_text"].values[0]
            print(f"QUERY: {q_id} - {query_content}")
            print(f"DOC: {doc_id} - {doc_content}")

            # Bucket the similarity into the same 0..3 scale as the expert labels.
            sim_vote = 3 if sim >= 0.75 else 2 if sim >= 0.5 else 1 if sim >= 0.25 else 0
            print(f"EXPERT VOTES: {votes}")
            print("SIM VOTE:", sim_vote)
            votes = np.append(votes, sim_vote)

            # The similarity vote counts as one additional voter; int()
            # truncates a x.5 median (even number of votes) downwards.
            agg_vote = int(np.median(votes))
            assert agg_vote in [0, 1, 2, 3]

            print(f"AGG VOTE: {agg_vote}\n")

            shown += 1

QUERY: trip_44401 - antibiotic stewardship
DOC: trip_9374367 - effect of antibiotic stewardship on the incidence of infection and colonisation with antibiotic - resistant bacteria and clostridium difficile infection : a systematic review and meta - analysis . background : antibiotic stewardship programmes have been shown to reduce antibiotic use and hospital costs . we aimed to evaluate evidence of the effect of antibiotic stewardship on the incidence of infections and colonisation with antibiotic - resistant bacteria . methods : for this systematic review and meta - analysis , we searched pubmed , the cochrane database of systematic reviews , the cochrane central register of controlled trials , and web of science for studies published from jan 1 , 1960 , to may 31 , 2016 , that analysed the effect of antibiotic stewardship programmes on the incidence of infection and colonisation with antibiotic - resistant bacteria and clostridium difficile infections in hospital inpatients . two aut

KeyboardInterrupt: 