#### Configuration

Configuration for training the linear/logistic regression reranker. The following options are available:
* `ds_type`: selects the seq2seq model, either T5-large-ssm or T5-xl-ssm (given as the size suffix, e.g. `"xl"`). Both the graph features dataset and the seq2seq outputs dataset depend on the model specified by this argument.
* `classification`: whether to use `LogisticRegression` (`classification=True`) or `LinearRegression` (`classification=False`)
* `text_only`: whether to use the textual feature (question+answer)
* `graph_numerical_only`: whether to use the numerical graph/network features (number of nodes, edges, etc.)
* `sequence_type`: either None (no g2t sequence used) or the desired sequence (T5, Determ. or GAP)

In [42]:
# Model size suffix; interpolated into both dataset names below.
ds_type = "xl"
# Local cache directory handed to every `load_dataset` call.
cache_path = "/workspace/storage/misc/huggingface"
# Hub name of the graph-features dataset.
graph_feats_path = f"hle2000/KGQA_T5-{ds_type}-ssm"
# Hub name of the seq2seq outputs dataset (used for reranking).
res_csv_path = f"hle2000/Mintaka_T5_{ds_type}_ssm_outputs"
classification = True  # True -> LogisticRegression, False -> LinearRegression
text_only = False  # passed as `with_text`: True adds the question+answer embedding
graph_numerical_only = False  # True adds the numeric graph columns to the kept columns
sequence_type = "gap"  # falsy -> no g2t embedding; otherwise t5, gap or determ

#### Reading and Processing Data

In [43]:
import pandas as pd
from datasets import load_dataset

# Load the graph-features dataset (using the local cache directory) and
# materialise each of its three splits as a pandas DataFrame.
graph_features_ds = load_dataset(graph_feats_path, cache_dir=cache_path)
train_df, val_df, test_df = (
    graph_features_ds[split_name].to_pandas()
    for split_name in ("train", "validation", "test")
)

Found cached dataset parquet (/workspace/storage/misc/huggingface/hle2000___parquet/hle2000--KGQA_T5-xl-ssm-1744d8040d58562f/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/3 [00:00<?, ?it/s]

In [44]:
# Columns kept by every configuration; feature columns are appended to this
# list later depending on the options chosen.
core_cols = [
    "id",
    "question",
    "question_answer",
    "answerEntity",
    "graph",
    "correct",
]
# Raw text columns: always dropped for training, needed again for reranking.
text_features = ["question", "question_answer"]
# Identifier columns: dropped for training, used to identify rows when reranking.
id_cols = ["graph", "id", "answerEntity"]

In [45]:
import numpy as np
from sklearn import preprocessing


def get_numeric_cols(df):
    """Return the names of the int64/float64 columns of ``df``, label excluded.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        list: column names, in column order, whose dtype is ``int64`` or
        ``float64``. The ``"correct"`` column is never returned.
    """
    numeric_dtypes = (np.dtype("int64"), np.dtype("float64"))
    # Compare dtypes with `==`: an identity check (`is`) only holds as long as
    # both sides happen to be the very same dtype object.
    return [
        col
        for col, dtype in df.dtypes.items()
        if col != "correct" and any(dtype == wanted for wanted in numeric_dtypes)
    ]


def apply_col_scale(df, scaler, col, split_type):
    """Scale the ``col`` columns of ``df`` in place and return ``df``.

    For ``split_type == "train"`` the scaler is fitted and applied with
    ``fit_transform``; for any other split only ``transform`` is called.
    When ``col`` is empty the frame is returned untouched.
    """
    if col:
        # only the training split is allowed to fit the scaler
        scale = scaler.fit_transform if split_type == "train" else scaler.transform
        df[col] = scale(df[col])
    return df

In [46]:
min_max_scaler = preprocessing.MinMaxScaler()

# the numeric columns are determined from the training split only and the same
# list is reused for the validation and test splits
numeric_cols = get_numeric_cols(train_df)

if graph_numerical_only:  # keep the numeric features in the final frame if requested
    core_cols += numeric_cols

# "train" fits the scaler and transforms; "val"/"test" only transform with the
# already fitted scaler
train_df = apply_col_scale(train_df, min_max_scaler, numeric_cols, "train")
val_df = apply_col_scale(val_df, min_max_scaler, numeric_cols, "val")
test_df = apply_col_scale(test_df, min_max_scaler, numeric_cols, "test")

In [47]:
def get_embedding_features(with_text, seq_type):
    """Return the embedding columns selected by the configuration.

    Embedding features can include the question+answer embedding and/or a
    g2t sequence embedding (both through MPNET embeddings).

    Args:
        with_text: truthy to include ``"question_answer_embedding"``.
        seq_type: falsy for no g2t sequence; otherwise the sequence name,
            which selects the ``"<seq_type>_sequence_embedding"`` column.

    Returns:
        tuple: ``(added_core_cols, emb_feats)``, two independent lists with
        the same column names. The first is meant to be appended to the core
        column list, the second lists the embeddings to process later.
    """
    emb_feats = []

    if with_text:
        emb_feats.append("question_answer_embedding")

    # add G2T sequence if needed; built from the argument, not from the
    # module-level `sequence_type`, so the function honours what it is passed
    if seq_type:
        emb_feats.append(f"{seq_type}_sequence_embedding")

    # hand back two separate list objects so that extending one (e.g. via
    # `+=`) can never alter the other
    return list(emb_feats), emb_feats

In [48]:
# NOTE: `text_only` is the value passed as `with_text`, so True here adds the
# question+answer embedding column.
added_cols, embedding_features = get_embedding_features(text_only, sequence_type)

# filter dataframe based on column only keep the specified columns
# NOTE: `+=` extends core_cols in place, so re-running this cell appends the
# same columns a second time.
core_cols += added_cols
train_df = train_df[core_cols]
val_df = val_df[core_cols]
test_df = test_df[core_cols]

In [49]:
def process_emb(df, em_type):
    """Split an embedding column into one column per dimension.

    Args:
        df: DataFrame holding the embedding column.
        em_type: name of the column; each cell is a sequence of values.

    Returns:
        DataFrame with columns ``f"{em_type}_0"``, ``f"{em_type}_1"``, ...
        and the same index as ``df``, so it can be concatenated column-wise
        with ``df`` without misaligning rows.

    Raises:
        ValueError: if the embeddings do not all have the same length.
    """
    embeddings = df[em_type].tolist()

    if len({len(emb) for emb in embeddings}) > 1:
        raise ValueError(f"embeddings in '{em_type}' have differing lengths")

    # transpose rows -> per-dimension columns; every column stays a plain list
    # so pandas infers its dtype exactly as it would for a dict of lists
    emb_dict = {
        f"{em_type}_{i}": list(dim_values)
        for i, dim_values in enumerate(zip(*embeddings))
    }

    # reuse df's index: a fresh RangeIndex would misalign on concat(axis=1)
    # whenever df does not carry the default 0..n-1 index
    return pd.DataFrame(emb_dict, index=df.index)

In [50]:
def add_processed_emb(df):
    """Return ``df`` with every configured embedding expanded into columns.

    Each column listed in the module-level ``embedding_features`` is run
    through ``process_emb`` and the resulting frames are concatenated
    column-wise to the right of ``df``.
    """
    expanded = [process_emb(df, feature) for feature in embedding_features]
    return pd.concat([df, *expanded], axis=1)

In [51]:
# Expand each embedding array into individual columns on all three splits.
# The original array columns stay in the frames and are dropped when the
# training matrices are built.
if embedding_features:
    train_df, val_df, test_df = map(add_processed_emb, (train_df, val_df, test_df))

#### Training Log/Linear Regression

In [52]:
# dropping all text, embedding features (after processing), and id cols (used for reranking);
# what remains are the model inputs, and the "correct" column becomes the target
X_train = train_df.drop(
    text_features + embedding_features + id_cols + ["correct"], axis=1
)
y_train = train_df["correct"].astype(float).tolist()
# NOTE: despite their names, X_test / y_test are built from the validation split (val_df)
X_test = val_df.drop(text_features + embedding_features + id_cols + ["correct"], axis=1)
y_test = val_df["correct"].astype(float).tolist()

In [62]:
# Overrides the value assigned in the configuration cell: everything below
# this point runs with classification disabled (LinearRegression).
classification = False

In [63]:
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

# Choose the estimator from the `classification` flag, then fit it on the
# training features and labels.
model = (
    linear_model.LogisticRegression(class_weight="balanced", max_iter=10000)
    if classification
    else linear_model.LinearRegression()
)
model.fit(X_train, y_train)

#### Reranking

In [54]:
from datasets import load_dataset

# seq2seq outputs for the configured model; loaded with verification checks disabled
seq2seq_outputs = load_dataset(
    res_csv_path, verification_mode="no_checks", cache_dir=cache_path
)
# the Mintaka dataset itself, used below to look up question ids
mintaka_ds = load_dataset("AmazonScience/mintaka", cache_dir=cache_path)

Found cached dataset parquet (/workspace/storage/misc/huggingface/hle2000___parquet/hle2000--Mintaka_T5_xl_ssm_outputs-059583fb07b6dc35/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)


  0%|          | 0/3 [00:00<?, ?it/s]

No config specified, defaulting to: mintaka/en
Found cached dataset mintaka (/workspace/storage/misc/huggingface/AmazonScience___mintaka/en/1.0.0/bb35d95f07aed78fa590601245009c5f585efe909dbd4a8f2a4025ccf65bb11d)


  0%|          | 0/3 [00:00<?, ?it/s]

In [55]:
# seq2seq outputs of every split as pandas DataFrames
train_seq2seq = seq2seq_outputs["train"].to_pandas()
test_seq2seq = seq2seq_outputs["test"].to_pandas()
val_seq2seq = seq2seq_outputs["validation"].to_pandas()

# only the validation and test splits of Mintaka are converted here
mintaka_val = mintaka_ds["validation"].to_pandas()
mintaka_test = mintaka_ds["test"].to_pandas()

In [56]:
def find_label(graph, wd_id):
    """Look up the label of a wikidata id among the nodes of ``graph``.

    Returns the ``"label"`` of the first node whose ``"name_"`` equals
    ``wd_id``; when no node matches, a "cannot find label" message string
    is returned instead.
    """
    nodes = graph.nodes
    for attrs in (nodes[key] for key in nodes):
        if attrs["name_"] != wd_id:
            continue
        return attrs["label"]
    return f"cannot find label for {wd_id}"

In [57]:
from tqdm import tqdm


def get_reranked_answers(df, seq2seq_res, mintaka_df):
    """Build the reranked answer list for every question in ``seq2seq_res``.

    Each question will have the following format
    {
        "QuestionID": "ID1",
        "RankedAnswers": [
            {
                "AnswerEntityID": "ID",
                "AnswerString": "String of prediction",
                "Score": null
            },
            {
                "AnswerEntityID": "ID",
                "AnswerString": "String of prediction",
                "Score": null
            }
        ]
        }

    Questions without any row in ``df`` keep their seq2seq answers in the
    original order, with ``AnswerEntityID`` and ``Score`` set to ``None``.
    Otherwise the candidates are scored by the module-level ``model`` and
    sorted from highest to lowest score. With ``classification`` enabled only
    the candidates predicted as class 1 are kept and their score is the
    predicted probability of that class.

    A question is left out of the result when its ``target`` is not among
    its beams, or (classification only) when no candidate is predicted as 1.

    Besides ``model`` and ``classification`` this reads the module-level
    ``embedding_features``, ``text_features`` and ``id_cols``.

    Args:
        df: graph-features frame of one split (one row per answer candidate).
        seq2seq_res: seq2seq outputs frame of the same split (one row per
            question).
        mintaka_df: Mintaka frame of the same split, used to look up the id
            of questions that have no row in ``df``.

    Returns:
        list: one dict per kept question, in ``seq2seq_res`` order.

    Raises:
        IndexError: if a question without graph features is also missing
            from ``mintaka_df``.
    """
    if "tfidf_vector" in df:
        df = df.drop(["tfidf_vector"], axis=1)
    # everything that is not a model input (same set dropped for X_train)
    non_feature_cols = embedding_features + text_features + id_cols + ["correct"]
    dict_list = []

    for _, group in tqdm(seq2seq_res.iterrows()):
        # get the corresponding subgraphs/graph features for this question
        curr_question_df = df[df["question"] == group["question"]]
        group_dict = group.to_dict()  # dict of the current entry from seq2seq outputs
        # building the dict entry for this question, in the format described above
        curr_ques_dict = {}

        # we don't have subgraph/graph features for this question, take answer from seq2seq
        if len(curr_question_df) == 0:
            # find the ques id from mintaka
            curr_id = mintaka_df[mintaka_df["question"] == group_dict["question"]][
                "id"
            ].values[0]
            curr_ques_dict["QuestionID"] = curr_id
            curr_ques_dict["RankedAnswers"] = [
                {"AnswerEntityID": None, "AnswerString": v, "Score": None}
                for k, v in group_dict.items()
                if "answer" in k
            ]
        else:  # we have subgraph for this question
            ans_ents = curr_question_df["answerEntity"].tolist()
            ans_strs = [
                q_a.split(";")[-1].strip()
                for q_a in curr_question_df["question_answer"].tolist()
            ]
            # look up table for ent and its label
            ans_ent_str_dict = dict(zip(ans_ents, ans_strs))

            curr_ques_dict["QuestionID"] = curr_question_df["id"].tolist()[0]
            # dropping features for prediction (similar to X_train and X_test)
            features = curr_question_df.drop(non_feature_cols, axis=1)

            # presumably the first two and the last column of a seq2seq row
            # are not beams; verify against the seq2seq outputs schema
            all_beams = set(group.tolist()[2:-1])
            if group["target"] not in all_beams:  # no correct answer in beam
                continue
            preds = model.predict(features)

            if classification:  # get the model's confidence for classification
                pos_idxs = np.where(preds == 1.0)[0]  # idxs of ans predicted as 1
                if len(pos_idxs) == 0:  # model predict all ans to be 0
                    continue
                # probabilities come from THIS question's rows, and each one
                # is paired with the entity at the same position
                probs = model.predict_proba(features)[:, 1]
                ans_preds = {ans_ents[i]: probs[i] for i in pos_idxs}
            else:
                ans_preds = dict(zip(ans_ents, preds))

            # sorting from highest model's confidence to lowest
            ranked = sorted(ans_preds.items(), key=lambda item: item[1], reverse=True)
            curr_ques_dict["RankedAnswers"] = [
                {
                    "AnswerEntityID": ent,
                    "AnswerString": ans_ent_str_dict[ent],
                    # plain float keeps the entry JSON-serialisable whatever
                    # numpy dtype the model returns
                    "Score": float(score),
                }
                for ent, score in ranked
            ]
        dict_list.append(curr_ques_dict)
    return dict_list

In [58]:
import json


def write_jsonl(file_path, data):
    """Write ``data`` to ``file_path`` as JSON Lines.

    Every item of ``data`` is serialised with ``json`` onto its own line;
    an existing file is overwritten.
    """
    with open(file_path, "w+") as out:
        out.writelines(json.dumps(record) + "\n" for record in data)

In [64]:
# tag used in the output file names, derived from the active `classification` flag
model_type = "logreg" if classification else "linreg"
# rerank the validation split and write one JSON object per question
val_result = get_reranked_answers(val_df, val_seq2seq, mintaka_val)
# NOTE(review): "Gap" is hard-coded in the file name rather than derived from
# `sequence_type`; confirm it matches the active configuration
path = f"/workspace/storage/misc/features_reranking/lin_log/{model_type}_Gap_T5-{ds_type}_val_predictions.jsonl"
write_jsonl(path, val_result)

2000it [00:11, 168.29it/s]


In [66]:
# rerank the test split; `model_type` is reused from the validation cell
test_result = get_reranked_answers(test_df, test_seq2seq, mintaka_test)
# NOTE(review): "Gap" is hard-coded in the file name rather than derived from
# `sequence_type`; confirm it matches the active configuration
path = f"/workspace/storage/misc/features_reranking/lin_log/{model_type}_Gap_T5-{ds_type}_test_predictions.jsonl"
write_jsonl(path, test_result)

4000it [00:14, 270.96it/s]
