In [5]:
from types import SimpleNamespace

# Settings that control how the test set is sampled further down the notebook.
args = SimpleNamespace(
    # Upper bound on the number of subcategory constraints attached to one sample
    max_subcats=3,
    # How many dialog snippets to draw
    num_dialogs=100,
    # Dialog datasets the snippets are drawn from
    test_datasets=[
        "CMUDoG",
        "ToC",
    ],
    # Pool of subcategories the constraints are sampled from
    subcats=[
        "MODALITY->would",
        "NEGATION->negation",
        "ADJECTIVES->superlatives",
    ],
)

In [11]:
# script
# Imports and one-time setup. Statement order matters in this cell:
# load_dotenv() runs before the os.getenv lookup at the bottom, and the
# sys.path tweak runs before the project-local imports.
import pandas as pd
import random
import os
from dotenv import load_dotenv
# Presumably populates os.environ from a local .env file (python-dotenv);
# TODO confirm that file defines RANDOM_SEED.
load_dotenv()
import sys
# Extend the import search path so the `data` and `helpers` imports below can
# resolve (presumably both modules live in ../source, relative to the
# notebook's working directory).
sys.path.append(f'../source')
import data
import helpers

# Seed Python's `random` module from the RANDOM_SEED environment variable.
# NOTE(review): os.getenv returns None when the variable is unset, in which
# case random.seed(None) picks a non-deterministic seed and the sampling below
# is not reproducible. When the variable is set, the seed is passed as a
# string rather than an int.
random.seed(os.getenv("RANDOM_SEED"))


In [24]:
# Display the sets of SubCategory values, one set per SuperCategory group.
# NOTE: `list_of_sets` is assigned in a later cell (the one that defines
# check_overlap), so this cell only works after that cell has been executed.
list_of_sets

[{'combining', 'comparatives', 'modifying', 'position', 'superlatives'},
 {'adverb phrases - form',
  'adverbs and adverb phrases: types and meanings',
  'adverbs as modifiers',
  'position'},
 {'comparatives',
  'conditional',
  'coordinated',
  'declarative',
  'imperatives',
  'interrogatives',
  'phrases/exclamations',
  'relative',
  'subordinated'},
 {'coordinating', 'subordinating'},
 {'articles', 'demonstratives', 'possessives', 'quantity'},
 {'discourse markers in writing'},
 {'focus'},
 {'future continuous',
  'future expressions with be',
  'future in the past',
  'future perfect continuous',
  'future perfect simple',
  'future with be going to',
  'future with will and shall',
  'present continuous for future use',
  'present simple for future use'},
 {'adjectives',
  'adverbs',
  'can',
  'could',
  'dare',
  'expressions with be',
  'have (got) to',
  'may',
  'might',
  'must',
  'need',
  'ought',
  'shall',
  'should',
  'used to',
  'will',
  'would'},
 {'negation'},

In [21]:
# Display the second SubCategory set (index 1).
# NOTE: `list_of_sets` is assigned in a later cell (the one that defines
# check_overlap), so this cell only works after that cell has been executed.
list_of_sets[1]

{'adverb phrases - form',
 'adverbs and adverb phrases: types and meanings',
 'adverbs as modifiers',
 'position'}

In [23]:
def check_overlap(sets):
    """Count the pairs of sets that have at least one element in common.

    Every unordered pair of positions (i, j) with i < j is compared. For each
    pair that overlaps, the two positions are printed as a side effect.

    Args:
        sets: Indexable sequence of sets.

    Returns:
        The number of overlapping pairs; 0 when all sets are pairwise disjoint.
    """
    overlapping_pairs = 0
    for first_idx, first in enumerate(sets):
        for second_idx in range(first_idx + 1, len(sets)):
            if not first.isdisjoint(sets[second_idx]):
                print(first_idx, second_idx)
                overlapping_pairs += 1
    return overlapping_pairs

# Collect the distinct SubCategory values of every SuperCategory group into a
# set, then count how many pairs of groups share a SubCategory name.
# NOTE: relies on `egp`, which is loaded in a later cell of this notebook.
list_of_sets = list(
    egp.groupby("SuperCategory")["SubCategory"].agg(set)
)
check_overlap(list_of_sets)

0 1
0 2
4 15
10 18


4

In [13]:

# Load the EGP table and the dialog corpora selected in `args.test_datasets`.
egp = data.get_egp()
dialog_data = data.get_dialog_data(args.test_datasets)

# Iteration axes: constraint counts 1..max_subcats (inclusive) and the
# distinct values found in the EGP "Level" column.
num_subcats_list = list(range(1, args.max_subcats + 1))
levels = list(egp["Level"].unique())

# helpers
def sample_subcat_constraints(n_subcats):
    """Draw random (subcategory, level) constraint pairs.

    Reads the module-level ``args.subcats`` and ``levels``.

    Args:
        n_subcats: Number of distinct subcategories to draw from
            ``args.subcats``.

    Returns:
        A list of ``(subcat, level)`` tuples. Subcategories are unique within
        one call; levels are drawn independently and may repeat. A list is
        returned instead of a bare ``zip`` object because a ``zip`` iterator
        is exhausted after one pass, which silently yields nothing to any
        caller that iterates (or prints) the result a second time.

    Raises:
        ValueError: Propagated from ``random.sample`` when ``n_subcats`` is
            negative or larger than ``len(args.subcats)``.
    """
    subcats = random.sample(args.subcats, n_subcats)  # without replacement
    subcat_levels = random.choices(levels, k=len(subcats))  # with replacement
    return list(zip(subcats, subcat_levels))

# Sample dialog snippets, attach 1..max_subcats random constraints to each,
# and save the resulting rows as the task-2 test set.
testdata = []
for _ in range(args.num_dialogs):
    # `dialog_id` instead of `id` so the builtin id() is not shadowed.
    context, response, source, dialog_id = helpers.sample_dialog_snippet(dialog_data)
    for num_subcats in num_subcats_list:
        # Materialize the pairs exactly once. The sampler may hand back a
        # one-shot iterator (zip); printing it via list() before the join
        # exhausted it, so the saved 'constraints' field was always empty.
        constraints = list(sample_subcat_constraints(num_subcats))
        print(constraints)
        testdata.append({
            'context': context,
            'response': response,
            'source': source,
            'id': dialog_id,
            'constraints': ";".join(f"{subcat}->{level}" for subcat, level in constraints),
            'n_subcats': num_subcats
        })
testset = pd.DataFrame(testdata)
testset.to_json('../data/task2_test.json')

[('would', 'C2')]
[('would', 'A1'), ('superlatives', 'C1')]
[('negation', 'C2'), ('would', 'C1'), ('superlatives', 'A1')]
[('superlatives', 'A2')]
[('would', 'B2'), ('superlatives', 'B1')]
[('negation', 'C1'), ('would', 'C2'), ('superlatives', 'B2')]
[('negation', 'C2')]
[('would', 'C1'), ('negation', 'A1')]
[('would', 'C1'), ('negation', 'C2'), ('superlatives', 'C1')]
[('would', 'C2')]
[('superlatives', 'B1'), ('negation', 'C1')]
[('negation', 'C2'), ('would', 'A1'), ('superlatives', 'B1')]
[('negation', 'B2')]
[('superlatives', 'A2'), ('would', 'B1')]
[('superlatives', 'B1'), ('negation', 'A1'), ('would', 'C1')]
[('superlatives', 'C1')]
[('negation', 'B2'), ('superlatives', 'B1')]
[('negation', 'C2'), ('superlatives', 'A2'), ('would', 'A2')]
[('superlatives', 'A1')]
[('superlatives', 'C2'), ('negation', 'A1')]
[('superlatives', 'B1'), ('would', 'C1'), ('negation', 'A1')]
[('would', 'A2')]
[('negation', 'C2'), ('would', 'B1')]
[('negation', 'A1'), ('superlatives', 'C1'), ('would', 'B1