# Exp 015: Generate test set
This experiment creates evaluation sets for the different subtasks. As a first step, it checks whether subcategory names are unique across supercategories.

In [3]:
from types import SimpleNamespace

# Experiment configuration, read below as attributes (e.g. `args.num_dialogs`).
args = SimpleNamespace(**{
    # Maximum number of subcategories combined in one test item
    "max_subcats": 3,
    # Number of dialogs to sample
    "num_dialogs": 400,
    # Datasets to include
    "test_datasets": ["CMUDoG", "ToC"],
    # Subcategories to consider
    "subcats": [
        "MODALITY->would",
        "NEGATION->negation",
        "ADJECTIVES->superlatives",
    ],
})

In [2]:
# script
# Imports and one-time setup for the notebook. The statement order matters:
# the `sys.path` tweak has to run before the project modules are imported.
import pandas as pd
import random
import os
from dotenv import load_dotenv
# Read a .env file into the process environment (presumably where RANDOM_SEED
# is defined — TODO confirm).
load_dotenv()
import sys
# Put ../source on the import path; `data` and `helpers` below are presumably
# project modules living there, so this must come first.
sys.path.append(f'../source')
import data
import helpers

# Seed Python's `random` module from the RANDOM_SEED environment variable.
# `os.getenv` returns None when the variable is unset, and `random.seed(None)`
# falls back to a non-reproducible seed, so sampling is only repeatable when
# RANDOM_SEED is set.
random.seed(os.getenv("RANDOM_SEED"))


[nltk_data] Downloading package punkt to
[nltk_data]     /cluster/scratch/dglandorf/cache...
[nltk_data]   Package punkt is already up-to-date!


In [24]:
# Show the set of subcategory names of every supercategory.
# NOTE: `list_of_sets` is assigned further down, in the cell that defines
# `check_overlap`; that cell has to be executed before this one.
list_of_sets

[{'combining', 'comparatives', 'modifying', 'position', 'superlatives'},
 {'adverb phrases - form',
  'adverbs and adverb phrases: types and meanings',
  'adverbs as modifiers',
  'position'},
 {'comparatives',
  'conditional',
  'coordinated',
  'declarative',
  'imperatives',
  'interrogatives',
  'phrases/exclamations',
  'relative',
  'subordinated'},
 {'coordinating', 'subordinating'},
 {'articles', 'demonstratives', 'possessives', 'quantity'},
 {'discourse markers in writing'},
 {'focus'},
 {'future continuous',
  'future expressions with be',
  'future in the past',
  'future perfect continuous',
  'future perfect simple',
  'future with be going to',
  'future with will and shall',
  'present continuous for future use',
  'present simple for future use'},
 {'adjectives',
  'adverbs',
  'can',
  'could',
  'dare',
  'expressions with be',
  'have (got) to',
  'may',
  'might',
  'must',
  'need',
  'ought',
  'shall',
  'should',
  'used to',
  'will',
  'would'},
 {'negation'},

In [21]:
# Inspect the subcategory set at index 1 (one of the groups that `check_overlap`
# reports as overlapping with group 0).
# NOTE: `list_of_sets` is assigned in the cell that defines `check_overlap`.
list_of_sets[1]

{'adverb phrases - form',
 'adverbs and adverb phrases: types and meanings',
 'adverbs as modifiers',
 'position'}

In [23]:
def check_overlap(sets):
    """Count the pairs of sets that have at least one element in common.

    Each unordered pair of positions (i, j) with i < j is compared exactly
    once. The positions of every overlapping pair are printed as soon as the
    overlap is detected.

    Args:
        sets: Indexable sequence of sets.

    Returns:
        The number of overlapping pairs; 0 means all sets are pairwise disjoint.
    """
    n_overlapping_pairs = 0
    for left_pos, left in enumerate(sets):
        for right_pos in range(left_pos + 1, len(sets)):
            if not left.intersection(sets[right_pos]):
                continue  # disjoint pair, nothing to report
            print(left_pos, right_pos)
            n_overlapping_pairs += 1
    return n_overlapping_pairs

# One set of subcategory names per supercategory, then count how many pairs of
# supercategories share a subcategory name.
# NOTE: `egp` is loaded by `data.get_egp()` in a cell further down; run that cell first.
subcats_per_supercat = egp.groupby("SuperCategory")["SubCategory"].agg(lambda names: set(names))
list_of_sets = list(subcats_per_supercat)
check_overlap(list_of_sets)

0 1
0 2
4 15
10 18


4

In [6]:
# Identifiers returned by the project helper for 'corpus_training'; they are
# matched against the '#' column of `egp` below, so presumably these are the
# construct numbers that already have a trained classifier — TODO confirm.
nrs =helpers.get_existing_classifiers('corpus_training')

In [8]:

# load data
# `egp` is the table the rest of the notebook queries through its '#',
# 'SuperCategory', 'SubCategory' and 'Level' columns.
egp = data.get_egp()
# Dialog data requested for the datasets named in `args.test_datasets`; it is
# later handed to `helpers.sample_dialog_snippet`.
dialog_data = data.get_dialog_data(args.test_datasets)

In [21]:
# Count, per subcategory and level, the rows of `egp` whose '#' is listed in `nrs`.
in_nrs = egp['#'].isin(nrs)
egp[in_nrs].groupby(['SubCategory', 'Level'])['#'].agg("count")

SubCategory   Level
conditional   A2        4
              B1        3
negation      A1        3
              A2        3
              B1        3
              B2        3
              C1        3
              C2        3
superlatives  A1        1
              A2        3
              B1        3
              B2        2
              C1        1
              C2        1
would         A1        3
              A2        6
              B1       11
              B2        1
              C1        1
              C2        1
Name: #, dtype: int64

In [13]:
# prepare iterations
# Every constraint count from 1 up to and including `args.max_subcats`.
num_subcats_list = [count for count in range(1, args.max_subcats + 1)]
# Distinct values of the 'Level' column, in order of first appearance.
levels = [level for level in egp['Level'].unique()]

# helpers
def sample_subcat_constraints(n_subcats):
    """Draw `n_subcats` random (subcategory, level) constraint pairs.

    Subcategories come from `args.subcats` and are sampled without replacement,
    so each one appears at most once. Levels come from the module-level
    `levels` list and are sampled with replacement, so they may repeat.

    Args:
        n_subcats: Number of pairs to draw.

    Returns:
        A list of `(subcategory, level)` tuples. A list is returned instead of
        a bare `zip` object so that the result can be iterated more than once
        (e.g. printed and then joined); a `zip` iterator would be empty on the
        second pass.

    Raises:
        ValueError: From `random.sample` if `n_subcats` is negative or larger
            than `len(args.subcats)`.
    """
    subcats = random.sample(args.subcats, n_subcats)  # without replacement
    subcat_levels = random.choices(levels, k=len(subcats))  # with replacement
    return list(zip(subcats, subcat_levels))

# sample and save dataframe
# For every sampled dialog snippet, create one test item per constraint count
# in `num_subcats_list`, then write all items to ../data/task2_test.json.
testdata = []
for _ in range(args.num_dialogs):
    # `dialog_id` rather than `id`, so the builtin `id` is not shadowed.
    context, response, source, dialog_id = helpers.sample_dialog_snippet(dialog_data)
    for num_subcats in num_subcats_list:
        # Materialise the pairs once. If the helper hands back a one-shot
        # iterator (such as a `zip`), printing it would exhaust it and the
        # join below would silently produce an empty constraint string.
        constraints = list(sample_subcat_constraints(num_subcats))
        print(constraints)
        testdata.append({
            'context': context,
            'response': response,
            'source': source,
            'id': dialog_id,
            # Encoded as "<subcat>-><level>" entries separated by ";".
            'constraints': ";".join(f"{subcat}->{level}" for subcat, level in constraints),
            'n_subcats': num_subcats
        })
testset = pd.DataFrame(testdata)
testset.to_json('../data/task2_test.json')

[('would', 'C2')]
[('would', 'A1'), ('superlatives', 'C1')]
[('negation', 'C2'), ('would', 'C1'), ('superlatives', 'A1')]
[('superlatives', 'A2')]
[('would', 'B2'), ('superlatives', 'B1')]
[('negation', 'C1'), ('would', 'C2'), ('superlatives', 'B2')]
[('negation', 'C2')]
[('would', 'C1'), ('negation', 'A1')]
[('would', 'C1'), ('negation', 'C2'), ('superlatives', 'C1')]
[('would', 'C2')]
[('superlatives', 'B1'), ('negation', 'C1')]
[('negation', 'C2'), ('would', 'A1'), ('superlatives', 'B1')]
[('negation', 'B2')]
[('superlatives', 'A2'), ('would', 'B1')]
[('superlatives', 'B1'), ('negation', 'A1'), ('would', 'C1')]
[('superlatives', 'C1')]
[('negation', 'B2'), ('superlatives', 'B1')]
[('negation', 'C2'), ('superlatives', 'A2'), ('would', 'A2')]
[('superlatives', 'A1')]
[('superlatives', 'C2'), ('negation', 'A1')]
[('superlatives', 'B1'), ('would', 'C1'), ('negation', 'A1')]
[('would', 'A2')]
[('negation', 'C2'), ('would', 'B1')]
[('negation', 'A1'), ('superlatives', 'C1'), ('would', 'B1

In [3]:
# for each single constraint: 32 cases where true answer contained response
# for each single constraint: 32 cases where true answer did not contain response


In [39]:
import pickle

# NOTE(review): `pickle.load` can execute arbitrary code while unpickling;
# only read this file if it comes from a trusted source.
input_file = '../data/corpus_classification_all.pkl'
with open(input_file, 'rb') as f:
    # The file holds three objects pickled back-to-back; they are read in the
    # same order in which they appear in the file.
    all_hit_indices, all_hit_sentences, extracts = (pickle.load(f) for _ in range(3))

# Upper bound on the number of cases sampled per classifier and per polarity.
n = 20
classifiers_nrs = helpers.get_existing_classifiers('corpus_training')
# Every valid position in `extracts`.
all_indices = set(range(len(extracts)))
# NOTE(review): from here on `data` is a list and hides the `data` module
# imported at the top of the notebook.
data = []
def append(cases, hits):
    """Add one test record per case to the module-level `data` list.

    The constraint of every record is the current value of the module-level
    variable `nr`, so this is meant to be called while a loop over `nr` is
    running.

    Args:
        cases: Iterable of indexable items; positions 0, 1 and 2 are stored as
            context, response and source respectively.
        hits: Value stored under 'response_hit' for every record.
    """
    data.extend(
        {
            'context': case[0],
            'response': case[1],
            'source': case[2],
            'id': None,
            'constraints': [nr],
            'n_subcats': 1,
            'response_hit': hits,
        }
        for case in cases
    )
# For every classifier, sample up to `n` extracts it hit (positives) and the
# same number of extracts it did not hit (negatives), and record both.
# The loop variable has to stay named `nr`: `append` reads it as a global.
for nr in classifiers_nrs:
    hit_indices = set(all_hit_indices[nr])
    # Both samples use this size, which is capped by the number of hits.
    sample_size = min(n, len(hit_indices))
    pos_picks = random.sample(list(hit_indices), sample_size)
    pos_cases = [extracts[idx] for idx in pos_picks]
    other_indices = all_indices.difference(hit_indices)
    neg_picks = random.sample(list(other_indices), sample_size)
    neg_cases = [extracts[idx] for idx in neg_picks]
    append(pos_cases, True)
    append(neg_cases, False)

In [42]:
# Load the stored task-1 test set; this rebinds `testset`, which an earlier
# cell used for the task-2 frame.
testset = pd.read_json('../data/task1_test.json')

In [43]:
# Display the loaded test set (the rendered table is truncated in the middle).
testset

Unnamed: 0,context,response,source,id,constraints,n_subcats,response_hit
0,"[Taste buds are on the tongue, buddy, not the ...","Will do, my scaly friend. Bon Voyage and best ...",ToC,,[616],1,1.0
1,[Yes it has. A person has to search google an...,Yes they should. They need to report facts on...,ToC,,[616],1,1.0
2,"[( At counter 1. ) Do you take parcels here?, ...",I'd like to send it by regular mail.,DailyDialog,,[616],1,1.0
3,[I am taking a trip to Europe in September for...,Alright well I would like to thank you for cha...,ToC,,[616],1,1.0
4,[I would like to talk to you about this year '...,"I know history is about to be made, and I woul...",DialogSum,,[616],1,1.0
...,...,...,...,...,...,...,...
3433,"[hello, how are you? Ernie and bert have a uni...",i guess for LGBTQIA it is representation that ...,ToC,2001.0,"[1175, 623]",2,
3434,"[hello, how are you? Ernie and bert have a uni...",i guess for LGBTQIA it is representation that ...,ToC,2001.0,"[1197, 59, 630]",3,
3435,"[hello, how are you? Ernie and bert have a uni...",i guess for LGBTQIA it is representation that ...,ToC,2001.0,"[1198, 1186]",1,
3436,"[hello, how are you? Ernie and bert have a uni...",i guess for LGBTQIA it is representation that ...,ToC,2001.0,"[1175, 1197, 58, 60]",2,
