Distribution of question types among the questions generated by the squad_model and the balanced_model

In [14]:
from collections import Counter

from datasets import load_dataset

# Read the comparison CSV as a single `train` split and show its schema / row count.
squad_balanced = load_dataset(
    "csv",
    data_files="../../data/compare_baseline_balanced_4b.csv",
    split="train",
)
print(squad_balanced)

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['source', 'target', 'dataset', 'category', 'squad_question4b', 'balanced_question4b'],
    num_rows: 1346
})


In [15]:
# Ordered (category, trigger phrases) rules. The first rule containing a phrase
# that occurs in the question wins, so the order here is part of the behaviour:
# e.g. a question containing both "what" and "why" is labelled "description".
_CATEGORY_RULES = (
    ("description", ("what",)),
    (
        "method",
        (
            "how did",
            "how does",
            "how do",
            "compute",
            "calculate",
            "how can",
            "how should",
            "how would",
            "how will",
            "how to",
        ),
    ),
    # A bare "how" only reaches this rule when none of the "method" phrases matched.
    ("recall", ("where", "when", "who", "how", "which")),
    ("explanation", ("why",)),
)


def categorise_dataset(data):
    """Label one row with a question category derived from its ``target`` text.

    The lower-cased ``target`` is checked against ``_CATEGORY_RULES`` in order
    and ``data["category"]`` is set to the first category with a matching
    phrase, or to ``"NA"`` when nothing matches.

    NOTE: matching is plain substring containment, not whole-word matching, so
    a phrase also matches inside a longer word (``"how"`` is found in
    ``"show"``).

    Args:
        data: mapping with a ``"target"`` entry holding the question text.

    Returns:
        The same ``data`` object, with ``data["category"]`` (re)assigned.

    Raises:
        KeyError: if ``data`` has no ``"target"`` entry.
    """
    target = data["target"]
    # A non-string target (e.g. None for an empty cell) cannot be lower-cased;
    # treat it as text with no trigger phrase so the row is labelled "NA"
    # instead of raising AttributeError.
    text = target.lower() if isinstance(target, str) else ""

    data["category"] = next(
        (
            category
            for category, phrases in _CATEGORY_RULES
            if any(phrase in text for phrase in phrases)
        ),
        "NA",
    )
    return data


def print_distribution(dataset):
    """Print the share and count of each question category in ``dataset``.

    One line is printed per category, always in the order method, description,
    explanation, recall, NA. Percentages are relative to ``len(dataset)``, so
    rows whose category is not one of those five still count towards the
    total. An empty dataset prints ``0.0%`` / ``count = 0`` for every category.

    Args:
        dataset: object supporting ``len()`` and ``dataset["category"]``
            yielding an iterable of hashable category labels.
    """
    categories = ["method", "description", "explanation", "recall", "NA"]

    total = len(dataset)
    # Tally every label in one pass rather than one filter() pass per category.
    counts = Counter(dataset["category"]) if total else Counter()

    for category in categories:
        count = counts[category]
        # Guard the division: an empty dataset would otherwise raise ZeroDivisionError.
        share = count / total * 100 if total else 0.0
        print(f"{category} distribution = {share}%, count = {count}")

In [17]:
def _categorised_questions(dataset, question_column):
    """Build a ``target``/``category`` dataset from one question column.

    Keeps only ``question_column``, renames it to ``target`` (the field that
    ``categorise_dataset`` reads) and maps ``categorise_dataset`` over the rows.

    Args:
        dataset: source dataset containing ``question_column``.
        question_column: name of the column holding the generated questions.

    Returns:
        A new dataset with the columns ``target`` and ``category``.
    """
    return (
        dataset
        .select_columns([question_column])
        .rename_column(question_column, "target")
        # Seed every row with the "NA" placeholder; categorise_dataset overwrites it per row.
        .add_column("category", ["NA"] * len(dataset))
        .map(categorise_dataset)
    )


# Same pipeline for both models' generated questions, so the two distributions are comparable.
squad_dataset = _categorised_questions(squad_balanced, "squad_question4b")
balanced_dataset = _categorised_questions(squad_balanced, "balanced_question4b")

In [18]:
# Category breakdown of the questions taken from the `squad_question4b` column.
print_distribution(squad_dataset)

method distribution = 0.8915304606240713%, count = 12
description distribution = 50.81723625557206%, count = 684
explanation distribution = 0.9658246656760773%, count = 13
recall distribution = 46.95393759286775%, count = 632
NA distribution = 0.3714710252600297%, count = 5


In [19]:
# Category breakdown of the questions taken from the `balanced_question4b` column.
print_distribution(balanced_dataset)

method distribution = 30.163447251114412%, count = 406
description distribution = 26.300148588410106%, count = 354
explanation distribution = 27.414561664190195%, count = 369
recall distribution = 9.806835066864785%, count = 132
NA distribution = 6.315007429420505%, count = 85


Distribution of question types in the balanced and balanced-resolved datasets

In [10]:
# Load the `train` split of both configurations ("default", then "resolved")
# of the same hub dataset, in that order.
balanced_ds, balanced_resolved_ds = (
    load_dataset("alinet/balanced_qg", config_name, split="train")
    for config_name in ("default", "resolved")
)

In [12]:
# (Re)assign each row's `category` from its `target` text, then print the
# per-category share of the "default" configuration.
balanced_ds = balanced_ds.map(categorise_dataset)
print_distribution(balanced_ds)

Map:   0%|          | 0/16244 [00:00<?, ? examples/s]

Filter:   0%|          | 0/16244 [00:00<?, ? examples/s]

Filter:   0%|          | 0/16244 [00:00<?, ? examples/s]

Filter:   0%|          | 0/16244 [00:00<?, ? examples/s]

Filter:   0%|          | 0/16244 [00:00<?, ? examples/s]

Filter:   0%|          | 0/16244 [00:00<?, ? examples/s]

method distribution = 25.0%, count = 4061
description distribution = 25.0%, count = 4061
explanation distribution = 25.0%, count = 4061
recall distribution = 25.0%, count = 4061
NA distribution = 0.0%, count = 0


In [13]:
# (Re)assign each row's `category` from its `target` text, then print the
# per-category share of the "resolved" configuration.
balanced_resolved_ds = balanced_resolved_ds.map(categorise_dataset)
print_distribution(balanced_resolved_ds)

Map:   0%|          | 0/16244 [00:00<?, ? examples/s]

Filter:   0%|          | 0/16244 [00:00<?, ? examples/s]

Filter:   0%|          | 0/16244 [00:00<?, ? examples/s]

Filter:   0%|          | 0/16244 [00:00<?, ? examples/s]

Filter:   0%|          | 0/16244 [00:00<?, ? examples/s]

Filter:   0%|          | 0/16244 [00:00<?, ? examples/s]

method distribution = 24.495198227037676%, count = 3979
description distribution = 24.328983009111056%, count = 3952
explanation distribution = 24.655257325781825%, count = 4005
recall distribution = 26.415907411967492%, count = 4291
NA distribution = 0.10465402610194534%, count = 17
