In [2]:
import os
import json

from datasets import load_from_disk


# The project root is taken to be the grandparent of the current working directory.
working_dir = os.getcwd()
PROJECT_DIR = os.path.dirname(os.path.dirname(working_dir))
SPLITS_DIR = os.path.join(PROJECT_DIR, "classification/split_datasets")

# On-disk locations of the two saved datasets.
coqa_path = os.path.join(SPLITS_DIR, "coqa")
coqa_force_aug_path = os.path.join(SPLITS_DIR, "coqa_force_aug")

base_dataset = load_from_disk(coqa_path)
force_aug_dataset = load_from_disk(coqa_force_aug_path)
# Separate container so that re-assigning its splits later leaves force_aug_dataset untouched.
force_aug_noerrors_dataset = force_aug_dataset.copy()

In [3]:
# Load maps with errors ids
MAPS_DIR = os.path.join(PROJECT_DIR, "maps")

# Read the map inside a context manager so the file handle is closed right away
# instead of being left open until garbage collection.
with open(os.path.join(MAPS_DIR, 'errors_idx_uuid_map.json')) as errors_map_file:
    errors_map = json.load(errors_map_file)
errors_idxs = [int(idx) for idx in errors_map.keys()]
# Set view of the same ids: the filter predicate runs once per row, so use O(1) lookups.
errors_idx_set = set(errors_idxs)

# Drop every row whose 'pandas_idx' appears among the error ids, one split at a time.
for split_name in ('train', 'validation', 'test'):
    force_aug_noerrors_dataset[split_name] = force_aug_noerrors_dataset[split_name].filter(
        lambda x: x['pandas_idx'] not in errors_idx_set
    )

In [10]:
# Relation between instance length and class in force_aug_dataset:
# gather the length of every 'text' field, grouped by label, over all three splits.
labelled_texts = [
    (row['label'] == True, row['text'])
    for split_name in ('train', 'validation', 'test')
    for row in force_aug_dataset[split_name]
]

pos_lengths = [len(text) for is_positive, text in labelled_texts if is_positive]
neg_lengths = [len(text) for is_positive, text in labelled_texts if not is_positive]

print("Positive instances mean length: ", sum(pos_lengths) / len(pos_lengths))
print("Negative instances mean length: ", sum(neg_lengths) / len(neg_lengths))

Positive instances mean length:  539.1500855188142
Negative instances mean length:  637.0677719604421


In [9]:
# Mean 'text' length per class, this time on force_aug_noerrors_dataset.
pos_lengths, neg_lengths = [], []

for split_name in ('train', 'validation', 'test'):
    for example in force_aug_noerrors_dataset[split_name]:
        # Positive rows go to pos_lengths, everything else to neg_lengths.
        target = pos_lengths if example['label'] == True else neg_lengths
        target.append(len(example['text']))

print("Positive instances mean length: ", sum(pos_lengths) / len(pos_lengths))
print("Negative instances mean length: ", sum(neg_lengths) / len(neg_lengths))

Positive instances mean length:  539.1500855188142
Negative instances mean length:  598.9387667097888


In [11]:
# Collect force_aug_dataset texts that contain the (case-sensitive) substring "none",
# separated into positive and negative instances.
none_matches = [
    (row['text'], row['label'] == True)
    for split_name in ('train', 'validation', 'test')
    for row in force_aug_dataset[split_name]
    if "none" in row['text']
]

pos_none = [text for text, is_positive in none_matches if is_positive]
neg_none = [text for text, is_positive in none_matches if not is_positive]

print("Positive instances containing none: ", len(pos_none))
print("Negative instances containing none: ", len(neg_none))

Positive instances containing none:  15
Negative instances containing none:  111


In [12]:
# Texts containing the (case-sensitive) substring "none" in force_aug_noerrors_dataset,
# split by class.
pos_none, neg_none = [], []

for split_name in ('train', 'validation', 'test'):
    for example in force_aug_noerrors_dataset[split_name]:
        text = example['text']
        if "none" not in text:
            continue  # not a match, nothing to record
        if example['label'] == True:
            pos_none.append(text)
        else:
            neg_none.append(text)

print("Positive instances containing none: ", len(pos_none))
print("Negative instances containing none: ", len(neg_none))

Positive instances containing none:  15
Negative instances containing none:  16


In [17]:
# Collect force_aug_dataset texts whose lowercased form starts with "if" or "when",
# separately for positive and negative instances.
pos_if, neg_if, pos_when, neg_when = [], [], [], []

for split_name in ('train', 'validation', 'test'):
    for example in force_aug_dataset[split_name]:
        text = example['text']
        lowered = text.lower()
        # "if" takes precedence; "when" is only considered when the "if" prefix is absent.
        if lowered.startswith("if"):
            bucket = pos_if if example['label'] == True else neg_if
        elif lowered.startswith("when"):
            bucket = pos_when if example['label'] == True else neg_when
        else:
            continue
        bucket.append(text)

print("Positive instances starting with if: ", len(pos_if))
print("Negative instances starting with if: ", len(neg_if))
print("Positive instances starting with when: ", len(pos_when))
print("Negative instances starting with when: ", len(neg_when))

Positive instances starting with if:  355
Negative instances starting with if:  194
Positive instances starting with when:  206
Negative instances starting with when:  108


In [16]:
# "if" / "when" prefix check on force_aug_noerrors_dataset.
labelled_texts = [
    (row['label'] == True, row['text'])
    for split_name in ('train', 'validation', 'test')
    for row in force_aug_noerrors_dataset[split_name]
]

pos_if = [t for is_positive, t in labelled_texts if is_positive and t.lower().startswith("if")]
neg_if = [t for is_positive, t in labelled_texts if not is_positive and t.lower().startswith("if")]
# A text is counted under "when" only if it does not already start with "if";
# the two prefixes cannot both match, so a plain prefix test is sufficient here.
pos_when = [t for is_positive, t in labelled_texts if is_positive and t.lower().startswith("when")]
neg_when = [t for is_positive, t in labelled_texts if not is_positive and t.lower().startswith("when")]

print("Positive instances starting with if: ", len(pos_if))
print("Negative instances starting with if: ", len(neg_if))
print("Positive instances starting with when: ", len(pos_when))
print("Negative instances starting with when: ", len(neg_when))


Positive instances starting with if:  355
Negative instances starting with if:  139
Positive instances starting with when:  206
Negative instances starting with when:  83


In [25]:
# Texts in force_aug_dataset whose lowercased form starts with "if you", split by class.
ifyou_matches = [
    (row['text'], row['label'] == True)
    for split_name in ('train', 'validation', 'test')
    for row in force_aug_dataset[split_name]
    if row['text'].lower().startswith("if you")
]

pos_ifyou = [text for text, is_positive in ifyou_matches if is_positive]
neg_ifyou = [text for text, is_positive in ifyou_matches if not is_positive]

print("Positive instances starting with if you: ", len(pos_ifyou))
print("Negative instances starting with if you: ", len(neg_ifyou))



Positive instances starting with if you:  143
Negative instances starting with if you:  83


In [31]:
# Texts in force_aug_noerrors_dataset whose lowercased form starts with "if you",
# split by class. Every negative match is printed for manual inspection.
pos_ifyou = []
neg_ifyou = []

for split_name in ('train', 'validation', 'test'):
    for example in force_aug_noerrors_dataset[split_name]:
        text = example['text']
        if not text.lower().startswith("if you"):
            continue
        if example['label'] == True:
            pos_ifyou.append(text)
        else:
            neg_ifyou.append(text)
            print(text)

# The test above is a prefix match, so the summary says "starting with"
# (the previous wording, "containing", described a different check).
print("Positive instances starting with if you: ", len(pos_ifyou))
print("Negative instances starting with if you: ", len(neg_ifyou))

If you catch your girlfriend lying about seeing another guy, the first thing you'll most likely experience is mistrust. You may also feel hurt and betrayed, which could lead to a broken heart. Depending on the severity of the situation, you may also end up getting dumped. However, being fired or getting caught are not relevant in this scenario. Therefore, the best answer is B. mistrust.
If you are fishing but do not catch any fish, you will not be able to get food or eat fish. You could choose to go home or continue fishing in hopes of catching something. Getting drunk is not a likely outcome of not catching fish. It is possible that you may feel frustrated or disappointed, but anger is not the only possible emotion. Therefore, the most appropriate answer is C. go home.
If you read a book about something you have little knowledge in, you can definitely learn things. Therefore, option C is correct.
If you are loving another and he or she loves you back, it would lead to a mutual feeling

In [19]:
# Texts in force_aug_dataset containing the substring "eliminate" (case-insensitive),
# split by class.
pos_eliminate, neg_eliminate = [], []

for split_name in ('train', 'validation', 'test'):
    for example in force_aug_dataset[split_name]:
        text = example['text']
        if "eliminate" not in text.lower():
            continue
        destination = pos_eliminate if example['label'] == True else neg_eliminate
        destination.append(text)

print("Positive instances containing eliminate: ", len(pos_eliminate))
print("Negative instances containing eliminate: ", len(neg_eliminate))

Positive instances containing eliminate:  1235
Negative instances containing eliminate:  662


In [20]:
# Texts in force_aug_dataset containing the stem "possib" (case-insensitive), split by class.
possib_matches = [
    (row['text'], row['label'] == True)
    for split_name in ('train', 'validation', 'test')
    for row in force_aug_dataset[split_name]
    if "possib" in row['text'].lower()
]

pos_possib = [text for text, is_positive in possib_matches if is_positive]
neg_possib = [text for text, is_positive in possib_matches if not is_positive]

print("Positive instances containing possib: ", len(pos_possib))
print("Negative instances containing possib: ", len(neg_possib))


Positive instances containing possib:  1668
Negative instances containing possib:  1322


In [21]:
# Texts containing the stem "possib" (case-insensitive) in force_aug_noerrors_dataset,
# split by class.
pos_possib, neg_possib = [], []

for split_name in ('train', 'validation', 'test'):
    for example in force_aug_noerrors_dataset[split_name]:
        text = example['text']
        if "possib" in text.lower():
            (pos_possib if example['label'] == True else neg_possib).append(text)

print("Positive instances containing possib: ", len(pos_possib))
print("Negative instances containing possib: ", len(neg_possib))


Positive instances containing possib:  1668
Negative instances containing possib:  811


In [22]:
# Texts in force_aug_dataset containing the phrase "based on" (case-insensitive),
# split by class.
pos_basedon, neg_basedon = [], []

for split_name in ('train', 'validation', 'test'):
    for example in force_aug_dataset[split_name]:
        text = example['text']
        if "based on" not in text.lower():
            continue
        if example['label'] == True:
            pos_basedon.append(text)
        else:
            neg_basedon.append(text)

print("Positive instances containing based on: ", len(pos_basedon))
print("Negative instances containing based on: ", len(neg_basedon))

Positive instances containing based on:  408
Negative instances containing based on:  474


In [23]:
# Texts containing the phrase "based on" (case-insensitive) in force_aug_noerrors_dataset,
# split by class.
basedon_matches = [
    (row['text'], row['label'] == True)
    for split_name in ('train', 'validation', 'test')
    for row in force_aug_noerrors_dataset[split_name]
    if "based on" in row['text'].lower()
]

pos_basedon = [text for text, is_positive in basedon_matches if is_positive]
neg_basedon = [text for text, is_positive in basedon_matches if not is_positive]

print("Positive instances containing based on: ", len(pos_basedon))
print("Negative instances containing based on: ", len(neg_basedon))

Positive instances containing based on:  408
Negative instances containing based on:  289


In [34]:
# Texts in force_aug_dataset containing the phrase "as they" (case-insensitive),
# split by class.
asthey_matches = [
    (row['text'], row['label'] == True)
    for split_name in ('train', 'validation', 'test')
    for row in force_aug_dataset[split_name]
    if "as they" in row['text'].lower()
]

pos_asthey = [text for text, is_positive in asthey_matches if is_positive]
neg_asthey = [text for text, is_positive in asthey_matches if not is_positive]

print("Positive instances containing as they: ", len(pos_asthey))
print("Negative instances containing as they: ", len(neg_asthey))


Positive instances containing as they:  834
Negative instances containing as they:  444


In [35]:
# Texts containing the phrase "as they" (case-insensitive) in force_aug_noerrors_dataset,
# split by class.
pos_asthey, neg_asthey = [], []

for split_name in ('train', 'validation', 'test'):
    for example in force_aug_noerrors_dataset[split_name]:
        text = example['text']
        if "as they" not in text.lower():
            continue
        destination = pos_asthey if example['label'] == True else neg_asthey
        destination.append(text)

print("Positive instances containing as they: ", len(pos_asthey))
print("Negative instances containing as they: ", len(neg_asthey))

Positive instances containing as they:  834
Negative instances containing as they:  328


In [32]:
# Texts in force_aug_dataset that start with "if you" AND contain "as they"
# (both tested on the lowercased text), split by class.
pos_ifyou_asthey, neg_ifyou_asthey = [], []

for split_name in ('train', 'validation', 'test'):
    for example in force_aug_dataset[split_name]:
        text = example['text']
        lowered = text.lower()
        if not lowered.startswith("if you") or "as they" not in lowered:
            continue
        if example['label'] == True:
            pos_ifyou_asthey.append(text)
        else:
            neg_ifyou_asthey.append(text)

print("Positive instances starting with if you and containing as they: ", len(pos_ifyou_asthey))
print("Negative instances starting with if you and containing as they: ", len(neg_ifyou_asthey))


Positive instances starting with if you and containing as they:  4
Negative instances starting with if you and containing as they:  2


In [33]:
# Texts in force_aug_noerrors_dataset that start with "if you" AND contain "as they"
# (both tested on the lowercased text), split by class.
ifyou_asthey_matches = [
    (row['text'], row['label'] == True)
    for split_name in ('train', 'validation', 'test')
    for row in force_aug_noerrors_dataset[split_name]
    if row['text'].lower().startswith("if you") and "as they" in row['text'].lower()
]

pos_ifyou_asthey = [text for text, is_positive in ifyou_asthey_matches if is_positive]
neg_ifyou_asthey = [text for text, is_positive in ifyou_asthey_matches if not is_positive]

print("Positive instances starting with if you and containing as they: ", len(pos_ifyou_asthey))
print("Negative instances starting with if you and containing as they: ", len(neg_ifyou_asthey))

Positive instances starting with if you and containing as they:  4
Negative instances starting with if you and containing as they:  1


In [39]:
# Texts in force_aug_dataset whose lowercased form contains all of
# "if you", "as they" and "possib" (anywhere in the text), split by class.
pos_ifyou_asthey_possib = []
neg_ifyou_asthey_possib = []

for split_name in ('train', 'validation', 'test'):
    for example in force_aug_dataset[split_name]:
        text = example['text']
        lowered = text.lower()  # lowercase once instead of once per phrase
        if "if you" in lowered and "as they" in lowered and "possib" in lowered:
            if example['label'] == True:
                pos_ifyou_asthey_possib.append(text)
            else:
                neg_ifyou_asthey_possib.append(text)

# All three phrases are matched anywhere in the text, so the summary must not
# claim a "starting with" prefix match.
print("Positive instances with if you, as they and possib: ", len(pos_ifyou_asthey_possib))
print("Negative instances with if you, as they and possib: ", len(neg_ifyou_asthey_possib))


Positive instances starting with if you and containing as they and possib:  9
Negative instances starting with if you and containing as they and possib:  1


In [41]:
# Texts in force_aug_noerrors_dataset whose lowercased form contains all of
# "if you", "as they" and "possib" (anywhere in the text), split by class.
required_phrases = ("if you", "as they", "possib")

three_phrase_matches = [
    (row['text'], row['label'] == True)
    for split_name in ('train', 'validation', 'test')
    for row in force_aug_noerrors_dataset[split_name]
    if all(phrase in row['text'].lower() for phrase in required_phrases)
]

pos_ifyou_asthey_possib = [text for text, is_positive in three_phrase_matches if is_positive]
neg_ifyou_asthey_possib = [text for text, is_positive in three_phrase_matches if not is_positive]

print("Positive instances with if you, as they and possib: ", len(pos_ifyou_asthey_possib))
print("Negative instances with if you, as they and possib: ", len(neg_ifyou_asthey_possib))

Positive instances with if you, as they and possib:  9
Negative instances with if you, as they and possib:  0


In [43]:
# Texts in force_aug_dataset whose lowercased form contains all of
# "if you", "as they" and "based on" (anywhere in the text), split by class.
# Only force_aug_dataset is scanned in this cell.
pos_ifyou_asthey_basedon, neg_ifyou_asthey_basedon = [], []

for split_name in ('train', 'validation', 'test'):
    for example in force_aug_dataset[split_name]:
        text = example['text']
        lowered = text.lower()
        if not ("if you" in lowered and "as they" in lowered and "based on" in lowered):
            continue
        destination = pos_ifyou_asthey_basedon if example['label'] == True else neg_ifyou_asthey_basedon
        destination.append(text)

print("Positive instances with if you, as they and based on: ", len(pos_ifyou_asthey_basedon))
print("Negative instances with if you, as they and based on: ", len(neg_ifyou_asthey_basedon))

Positive instances with if you, as they and based on:  1
Negative instances with if you, as they and based on:  0


In [49]:
# Count instances containing both "if you" and "based on" (anywhere in the
# lowercased text), first in force_aug_dataset and then in force_aug_noerrors_dataset.

def _collect_ifyou_basedon(dataset):
    """Return ``(pos_texts, neg_texts)`` for rows containing "if you" and "based on".

    Scans the 'train', 'validation' and 'test' entries of ``dataset``. A row
    matches when its lowercased 'text' contains both phrases; matching texts
    go to ``pos_texts`` when the row's 'label' equals True and to
    ``neg_texts`` otherwise.
    """
    pos_texts, neg_texts = [], []
    for split_name in ('train', 'validation', 'test'):
        for example in dataset[split_name]:
            text = example['text']
            lowered = text.lower()
            # To restrict the match to a prefix, test lowered.startswith("if you") instead.
            # When allowing several prefixes joined with `or`, parenthesise them
            # before `and`-ing with the "based on" test: `and` binds tighter than `or`.
            if "if you" in lowered and "based on" in lowered:
                (pos_texts if example['label'] == True else neg_texts).append(text)
    return pos_texts, neg_texts


# Same report for both datasets; after the loop the two lists hold the
# force_aug_noerrors_dataset matches (the last dataset processed).
for current_dataset in (force_aug_dataset, force_aug_noerrors_dataset):
    pos_ifyou_basedon, neg_ifyou_basedon = _collect_ifyou_basedon(current_dataset)
    print("Positive instances with if you and based on: ", len(pos_ifyou_basedon))
    print("Negative instances with if you and based on: ", len(neg_ifyou_basedon))

Positive instances with if you and based on:  10
Negative instances with if you and based on:  19
Positive instances with if you and based on:  10
Negative instances with if you and based on:  11


In [50]:
# Placeholders for the per-dataset "if you" / "based on" counts; each is meant to
# hold a tuple of (neg, pos). All three start out as the empty tuple.
ifyou_basedon_base = ifyou_basedon_force = ifyou_basedon_force_noerrors = ()

def get_ifyou_basedon_nums(dataset):
    """Count rows whose lowercased text contains both "if you" and "based on".

    Walks the 'train', 'validation' and 'test' entries of ``dataset``. The
    phrases may occur anywhere in the text (not only at the start).

    Returns:
        A ``(neg, pos)`` tuple: ``pos`` is the number of matching rows whose
        'label' equals True, ``neg`` the number of all other matching rows.
    """
    tally = {"pos": 0, "neg": 0}
    for split_name in ("train", "validation", "test"):
        for row in dataset[split_name]:
            lowered = row['text'].lower()
            if "if you" not in lowered or "based on" not in lowered:
                continue  # at least one phrase is missing
            tally["pos" if row['label'] == True else "neg"] += 1
    return (tally["neg"], tally["pos"])

# Print the (neg, pos) counts for each dataset variant, one line per dataset.
for report_label, report_dataset in (
    ("Force_aug_noerrors: ", force_aug_noerrors_dataset),
    ("Force_aug: ", force_aug_dataset),
    ("Base: ", base_dataset),
):
    print(report_label, get_ifyou_basedon_nums(report_dataset))

Force_aug_noerrors:  (11, 10)
Force_aug:  (19, 10)
Base:  (11, 8)
