<a href="https://colab.research.google.com/github/rocabrera/language-uncertainty/blob/master/create_squadshifts_aggregated.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --quiet pandas

In [None]:
import ast
from collections import Counter
from typing import List, Tuple

import numpy as np
import pandas as pd

In [None]:
# Load the scored paraphrase rows and discard any row with a missing value.
df = pd.read_csv(
    '/content/drive/MyDrive/UNICAMP/scored_squadshifts_paraphrased.csv'
).dropna()

In [None]:
def compute_f1(predict_text: str, label_text:str):
    """Token-level F1 between a predicted answer and one reference answer.

    Both strings are split on whitespace (no lower-casing or punctuation
    stripping is applied). The overlap is a multiset intersection, so a token
    repeated in both strings is counted as many times as it is shared; this
    guarantees that two identical strings always score 1.0.

    Args:
        predict_text: The predicted answer text.
        label_text: One reference (gold) answer text.

    Returns:
        ``1`` or ``0`` (int) when either side has no tokens, depending on
        whether both are empty; ``0`` when no token is shared; otherwise the
        harmonic mean of token precision and recall as a float.
    """
    pred_tokens = predict_text.split()
    truth_tokens = label_text.split()

    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if not pred_tokens or not truth_tokens:
        return int(pred_tokens == truth_tokens)

    # Counter & Counter keeps, per token, the smaller of the two counts, so
    # duplicated tokens are matched one-for-one instead of being collapsed.
    shared = Counter(pred_tokens) & Counter(truth_tokens)
    num_shared = sum(shared.values())

    # if there are no common tokens then f1 = 0
    if num_shared == 0:
        return 0

    prec = num_shared / len(pred_tokens)
    rec = num_shared / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec)


def custom_f1s(x):
  """Score one row's predicted answer against every reference answer.

  Args:
    x: Row-like mapping with a "predicted_answer" string and an "answers"
      string holding the Python-literal repr of a dict with a "text" list.

  Returns:
    A pair ``(f1s, best)``: the F1 against each reference answer, and the
    highest of those scores.

  Raises:
    ValueError / SyntaxError: if "answers" is not a valid Python literal.
  """
  predicted_answer = x["predicted_answer"]
  # literal_eval only accepts Python literals, so text coming from the CSV
  # cannot execute arbitrary code the way eval() would.
  answers: dict = ast.literal_eval(x["answers"])
  # With no reference answers, score against the empty answer so that
  # max() is never called on an empty list.
  reference_texts = list(answers["text"]) or [""]
  f1s = [compute_f1(predicted_answer, answer) for answer in reference_texts]
  return f1s, max(f1s)

# Transpose the per-row results of custom_f1s into two column-shaped
# sequences, then store each one as its own column.
per_answer_f1s, best_f1s = zip(*df.apply(custom_f1s, axis=1))
df["f1s"] = per_answer_f1s
df["max_f1"] = best_f1s

In [None]:
# Average the best F1 over all scored rows that share a question id.
id_mean = (
    df.groupby("id", as_index=False)["max_f1"]
      .mean()
      .rename(columns={"max_f1":"mean_f1"})
)

In [None]:
# Load the original questions, drop incomplete rows, and attach the mean F1
# of each question id (ids without a mean F1 are dropped by the inner join).
original_df = pd.merge(
    pd.read_csv('/content/drive/MyDrive/UNICAMP/squadshifts_original.csv').dropna(),
    id_mean,
    on="id",
)

In [None]:
# Uncertainty is the complement of the mean F1, kept to two decimals.
original_df["uncertainty"] = (1 - original_df["mean_f1"]).round(2)

In [None]:
def create_not_bucket_uncertainty_label(x):
  """Append the numeric uncertainty to every reference answer of a row.

  Args:
    x: Row-like mapping with an "uncertainty" value and an "answers" string
      holding the Python-literal repr of a dict with a "text" list.

  Returns:
    ``{"text": [...]}`` where each entry is
    ``"<answer> Uncertainty: <uncertainty>"``.

  Raises:
    ValueError / SyntaxError: if "answers" is not a valid Python literal.
  """
  uncertainty = x["uncertainty"]
  # literal_eval only accepts Python literals, so text coming from the CSV
  # cannot execute arbitrary code the way eval() would.
  answers: dict = ast.literal_eval(x["answers"])
  true_labels = [f"{answer} Uncertainty: {uncertainty}" for answer in answers["text"]]
  return {"text": true_labels}


# Build the numeric-uncertainty labels row by row, then store them.
not_bucket_labels = original_df.apply(create_not_bucket_uncertainty_label, axis=1)
original_df["answers_not_bucket_uncertainty"] = not_bucket_labels

In [None]:
# Spot-check two rows (positions 0 and 194) of the numeric-uncertainty labels.
not_bucket_column = original_df["answers_not_bucket_uncertainty"]
not_bucket_column.iloc[0], not_bucket_column.iloc[194]


({'text': ['Each brotherhood elects two delegates who take part in the National Ecclesiastical Assembly Uncertainty: 0.61',
   'two delegates Uncertainty: 0.61',
   'two delegates Uncertainty: 0.61',
   'two delegates Uncertainty: 0.61']},
 {'text': ['initial letters Uncertainty: 0.0',
   'an abbreviation Uncertainty: 0.0',
   'any abbreviation formed from initial letters Uncertainty: 0.0']})

---

In [None]:
# Three closed intervals for low / medium / high uncertainty. The intervals
# leave gaps (e.g. between 0.32 and 0.33), so this relies on `uncertainty`
# holding two-decimal values; anything falling in a gap would become NaN.
bins = pd.IntervalIndex.from_tuples([(0.0, 0.32), (0.33, 0.65), (0.66, 1.)], closed="both")
bucket_uncertainty = pd.cut(original_df["uncertainty"], bins=bins)
print(bucket_uncertainty.cat.categories)
# Assigning to `.cat.categories` was deprecated in pandas 1.5 and removed in
# 2.0; rename_categories returns a new Series with the interval categories
# relabelled in order.
bucket_uncertainty = bucket_uncertainty.cat.rename_categories(["low", "medium", "high"])
print(bucket_uncertainty.cat.categories)
original_df["bucket_uncertainty"] = bucket_uncertainty

IntervalIndex([[0.0, 0.32], [0.33, 0.65], [0.66, 1.0]], dtype='interval[float64, both]')
Index(['low', 'medium', 'high'], dtype='object')


In [None]:
def create_bucket_uncertainty_label(x):
  """Append the bucketed uncertainty label to every reference answer of a row.

  Args:
    x: Row-like mapping with a "bucket_uncertainty" value and an "answers"
      string holding the Python-literal repr of a dict with a "text" list.

  Returns:
    ``{"text": [...]}`` where each entry is
    ``"<answer> Uncertainty: <bucket_uncertainty>"``.

  Raises:
    ValueError / SyntaxError: if "answers" is not a valid Python literal.
  """
  uncertainty = x["bucket_uncertainty"]
  # literal_eval only accepts Python literals, so text coming from the CSV
  # cannot execute arbitrary code the way eval() would.
  answers: dict = ast.literal_eval(x["answers"])
  true_labels = [f"{answer} Uncertainty: {uncertainty}" for answer in answers["text"]]
  return {"text": true_labels}

# Build the bucketed-uncertainty labels row by row, then store them.
bucket_labels = original_df.apply(create_bucket_uncertainty_label, axis=1)
original_df["answers_bucket_uncertainty"] = bucket_labels

In [None]:

# Spot-check two rows (positions 0 and 194) of the bucketed-uncertainty labels.
bucket_column = original_df["answers_bucket_uncertainty"]
bucket_column.iloc[0], bucket_column.iloc[194]

({'text': ['Each brotherhood elects two delegates who take part in the National Ecclesiastical Assembly Uncertainty: medium',
   'two delegates Uncertainty: medium',
   'two delegates Uncertainty: medium',
   'two delegates Uncertainty: medium']},
 {'text': ['initial letters Uncertainty: low',
   'an abbreviation Uncertainty: low',
   'any abbreviation formed from initial letters Uncertainty: low']})

---

In [None]:
def split_dataset(df:pd.DataFrame, approximated_train_pct:float, approximated_eval_pct:float,
                  random_state=None) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
  """Split rows into train/eval/test so that no context appears in two splits.

  Rows are shuffled and every distinct "context" gets an integer code in
  order of first appearance in the shuffled frame, so which contexts land in
  which split is random. Whole contexts are then assigned by code range, so
  the requested percentages apply to contexts, not rows, and the row
  proportions are only approximate.

  Args:
    df: Frame with a "context" column. It is not modified.
    approximated_train_pct: Approximate fraction of contexts for training.
    approximated_eval_pct: Approximate fraction of the remaining contexts
      that goes to evaluation; the rest goes to test.
    random_state: Optional seed forwarded to ``DataFrame.sample`` to make
      the split reproducible. ``None`` gives a different split on each call.

  Returns:
    ``(train_df, eval_df, test_df)``, each a copy carrying an extra
    "context_codes" column.
  """
  shuffled = df.sample(frac=1, random_state=random_state)
  # factorize numbers values by first appearance; category codes would follow
  # the sorted order of the contexts and make the shuffle irrelevant.
  codes, _ = pd.factorize(shuffled["context"])
  shuffled["context_codes"] = codes

  # Nothing to partition: max() of an empty code array would raise.
  if shuffled.empty:
    return shuffled.copy(), shuffled.copy(), shuffled.copy()

  dataset_max = int(codes.max())
  max_train_idx = int(np.ceil(dataset_max*approximated_train_pct))
  max_eval_index = int(np.ceil(max_train_idx + (dataset_max - max_train_idx)*approximated_eval_pct))

  context_codes = shuffled["context_codes"]
  held_out = context_codes > max_train_idx
  within_eval = context_codes <= max_eval_index

  train_df = shuffled[~held_out].copy()
  eval_df = shuffled[held_out & within_eval].copy()
  test_df = shuffled[held_out & ~within_eval].copy()

  return train_df, eval_df, test_df

# Approximate fraction of contexts used for training, and the approximate
# share of the remaining contexts that goes to evaluation (the rest is test).
approximated_train_pct = 0.85
approximated_eval_pct = 0.5
train_df, eval_df, test_df = split_dataset(
    df=original_df,
    approximated_train_pct=approximated_train_pct,
    approximated_eval_pct=approximated_eval_pct,
)

In [None]:
# Report the requested split ratios next to the row-level ratios obtained.
split_report = [
    f"Temos aproximadamente {round(approximated_train_pct,3)} do dataset para treino",
    f"Train Percentage: {round(len(train_df)/len(original_df),3)}",
    f"Temos aproximadamente {round(1-approximated_train_pct,3)} do dataset para separar entre test e validação com porcentagem {approximated_eval_pct} para validacao.",
    f"Eval Percentage: {round(len(eval_df)/len(original_df),3)}",
    f"Test Percentage: {round(len(test_df)/len(original_df),3)}",
]
print("\n".join(split_report))

Temos aproximadamente 0.85 do dataset para treino
Train Percentage: 0.846
Temos aproximadamente 0.15 do dataset para separar entre test e validação com porcentagem 0.5 para validacao.
Eval Percentage: 0.081
Test Percentage: 0.074


In [None]:
# Print the context codes shared by each pair of splits; every printed set
# is expected to be empty.
print("Mostrando que não tem intersecção de contexto nos datasets")
split_pairs = ((train_df, test_df), (train_df, eval_df), (eval_df, test_df))
for left_split, right_split in split_pairs:
    left_codes = set(left_split.context_codes.unique())
    print(left_codes.intersection(right_split.context_codes.unique()))

Mostrando que não tem intersecção de contexto nos datasets
set()
set()
set()


In [None]:
# Write each split to its own CSV file, without the index column.
split_outputs = (
    (train_df, "/content/squadshifts_aggregated_train.csv"),
    (eval_df, "/content/squadshifts_aggregated_eval.csv"),
    (test_df, "/content/squadshifts_aggregated_test.csv"),
)
for split_frame, csv_path in split_outputs:
    split_frame.to_csv(csv_path, index=False)