In [2]:
import os
import json

from datasets import load_from_disk


PROJECT_DIR = os.path.dirname(os.path.dirname(os.getcwd()))
SPLITS_DIR = os.path.join(PROJECT_DIR, "classification/split_datasets")

base_dataset = load_from_disk(os.path.join(SPLITS_DIR, "coqa"))
force_aug_dataset = load_from_disk(os.path.join(SPLITS_DIR, "coqa_force_aug"))
force_aug_noerrors_dataset = force_aug_dataset.copy()

In [3]:
# Load maps with errors ids
MAPS_DIR = os.path.join(PROJECT_DIR, "maps")

errors_map = json.load(open(os.path.join(MAPS_DIR, 'errors_idx_uuid_map.json')))
errors_idxs = [int(idx) for idx in errors_map.keys()]

force_aug_noerrors_dataset['train'] = force_aug_noerrors_dataset['train'].filter(
    lambda x: x['pandas_idx'] not in errors_idxs
)
force_aug_noerrors_dataset['validation'] = force_aug_noerrors_dataset['validation'].filter(
    lambda x: x['pandas_idx'] not in errors_idxs
)
force_aug_noerrors_dataset['test'] = force_aug_noerrors_dataset['test'].filter(
    lambda x: x['pandas_idx'] not in errors_idxs
)

In [4]:
# for each dataset, get counts of pos and neg instances

def get_pos_neg_counts(dataset):
    pos, neg = 0, 0
    for split in [dataset['train'], dataset['validation'], dataset['test']]:
        for i in split:
            if i['label'] == True:
                pos += 1
            else:
                neg += 1
    return (neg, pos)

base_counts = get_pos_neg_counts(base_dataset)
force_aug_counts = get_pos_neg_counts(force_aug_dataset)
force_aug_noerrors_counts = get_pos_neg_counts(force_aug_noerrors_dataset)

In [5]:
# check relation between instance length and class

pos_lengths = []
neg_lengths = []

for split in [force_aug_dataset['train'], force_aug_dataset['validation'], force_aug_dataset['test']]:
    for i in split:
        if i['label'] == True:
            pos_lengths.append(len(i['text']))
        else:
            neg_lengths.append(len(i['text']))

print("Positive instances mean length: ", sum(pos_lengths)/len(pos_lengths))
print("Negative instances mean length: ", sum(neg_lengths)/len(neg_lengths))

Positive instances mean length:  539.1500855188142
Negative instances mean length:  637.0677719604421


In [6]:
# same for force_aug_noerrors_dataset
pos_lengths = []
neg_lengths = []

for split in [force_aug_noerrors_dataset['train'], force_aug_noerrors_dataset['validation'], force_aug_noerrors_dataset['test']]:
    for i in split:
        if i['label'] == True:
            pos_lengths.append(len(i['text']))
        else:
            neg_lengths.append(len(i['text']))

print("Positive instances mean length: ", sum(pos_lengths)/len(pos_lengths))
print("Negative instances mean length: ", sum(neg_lengths)/len(neg_lengths))

Positive instances mean length:  539.1500855188142
Negative instances mean length:  598.9387667097888


In [7]:
# check for pos and neg instances containing none
pos_none = []
neg_none = []

for split in [force_aug_dataset['train'], force_aug_dataset['validation'], force_aug_dataset['test']]:
    for i in split:
        if i['label'] == True:
            if "none" in i['text']:
                pos_none.append(i['text'])
        else:
            if "none" in i['text']:
                neg_none.append(i['text'])

print("Positive instances containing none: ", len(pos_none))
print("Negative instances containing none: ", len(neg_none))

Positive instances containing none:  15
Negative instances containing none:  111


In [8]:
# same for force_aug_noerrors_dataset
pos_none = []
neg_none = []

for split in [force_aug_noerrors_dataset['train'], force_aug_noerrors_dataset['validation'], force_aug_noerrors_dataset['test']]:
    for i in split:
        if i['label'] == True:
            if "none" in i['text']:
                pos_none.append(i['text'])
        else:
            if "none" in i['text']:
                neg_none.append(i['text'])
                
print("Positive instances containing none: ", len(pos_none))
print("Negative instances containing none: ", len(neg_none))

Positive instances containing none:  15
Negative instances containing none:  16


In [9]:
# PERCENTAGES of pos and neg instances containing none (over total pos/neg instances*100) (round to 2 dec)
print("Force:")
print("% pos instances containing none: ", round(len(pos_none)/force_aug_counts[1]*100, 2))
print("% neg instances containing none: ", round(len(neg_none)/force_aug_counts[0]*100, 2))

print("Force no errors:")
print("% pos instances containing none: ", round(len(pos_none)/force_aug_noerrors_counts[1]*100, 2))
print("% neg instances containing none: ", round(len(neg_none)/force_aug_noerrors_counts[0]*100, 2))

Force:
% pos instances containing none:  0.21
% neg instances containing none:  0.47
Force no errors:
% pos instances containing none:  0.21
% neg instances containing none:  0.69


In [10]:
# check for pos and neg instances starting with if or when
pos_if = []
neg_if = []
pos_when = []
neg_when = []

for split in [force_aug_dataset['train'], force_aug_dataset['validation'], force_aug_dataset['test']]:
    for i in split:
        if i['label'] == True:
            if i['text'].lower().startswith("if"):
                pos_if.append(i['text'])
            elif i['text'].lower().startswith("when"):
                pos_when.append(i['text'])
        else:
            if i['text'].lower().startswith("if"):
                neg_if.append(i['text'])
            elif i['text'].lower().startswith("when"):
                neg_when.append(i['text'])

print("Positive instances starting with if: ", len(pos_if))
print("Negative instances starting with if: ", len(neg_if))
print("Positive instances starting with when: ", len(pos_when))
print("Negative instances starting with when: ", len(neg_when))

Positive instances starting with if:  355
Negative instances starting with if:  194
Positive instances starting with when:  206
Negative instances starting with when:  108


In [11]:
# same for force_aug_noerrors_dataset
pos_if = []
neg_if = []
pos_when = []
neg_when = []

for split in [force_aug_noerrors_dataset['train'], force_aug_noerrors_dataset['validation'], force_aug_noerrors_dataset['test']]:
    for i in split:
        if i['label'] == True:
            if i['text'].lower().startswith("if"):
                pos_if.append(i['text'])
            elif i['text'].lower().startswith("when"):
                pos_when.append(i['text'])
        else:
            if i['text'].lower().startswith("if"):
                neg_if.append(i['text'])
            elif i['text'].lower().startswith("when"):
                neg_when.append(i['text'])

print("Positive instances starting with if: ", len(pos_if))
print("Negative instances starting with if: ", len(neg_if))
print("Positive instances starting with when: ", len(pos_when))
print("Negative instances starting with when: ", len(neg_when))


Positive instances starting with if:  355
Negative instances starting with if:  139
Positive instances starting with when:  206
Negative instances starting with when:  83


In [12]:
# PERCENTAGES of pos and neg instances starting with if or when (over total pos/neg instances*100) (round to 2 dec)
print("Force:")
print("% pos instances starting with if: ", round(len(pos_if)/force_aug_counts[1]*100, 2))
print("% neg instances starting with if: ", round(len(neg_if)/force_aug_counts[0]*100, 2))

print("% pos instances starting with when: ", round(len(pos_when)/force_aug_counts[1]*100, 2))
print("% neg instances starting with when: ", round(len(neg_when)/force_aug_counts[0]*100, 2))

print("Force no errors:")
print("% pos instances starting with if: ", round(len(pos_if)/force_aug_noerrors_counts[1]*100, 2))
print("% neg instances starting with if: ", round(len(neg_if)/force_aug_noerrors_counts[0]*100, 2))

print("% pos instances starting with when: ", round(len(pos_when)/force_aug_noerrors_counts[1]*100, 2))
print("% neg instances starting with when: ", round(len(neg_when)/force_aug_noerrors_counts[0]*100, 2))

Force:
% pos instances starting with if:  5.06
% neg instances starting with if:  4.04
% pos instances starting with when:  2.94
% neg instances starting with when:  2.41
Force no errors:
% pos instances starting with if:  5.06
% neg instances starting with if:  5.99
% pos instances starting with when:  2.94
% neg instances starting with when:  3.58


In [13]:
# check for pos and neg instances starting with 'if you' in the force dataset
pos_ifyou = []
neg_ifyou = []

for split in [force_aug_dataset['train'], force_aug_dataset['validation'], force_aug_dataset['test']]:
    for i in split:
        if i['label'] == True:
            if i['text'].lower().startswith("if you"):
                pos_ifyou.append(i['text'])
        else:
            if i['text'].lower().startswith("if you"):
                neg_ifyou.append(i['text'])

print("Positive instances starting with if you: ", len(pos_ifyou))
print("Negative instances starting with if you: ", len(neg_ifyou))

Positive instances starting with if you:  143
Negative instances starting with if you:  83


In [14]:
# check for pos and neg instances starting with 'if you' in the noerrors dataset

pos_ifyou = []
neg_ifyou = []

for split in [force_aug_noerrors_dataset['train'], force_aug_noerrors_dataset['validation'], force_aug_noerrors_dataset['test']]:
    for i in split:
        if i['label'] == True:
            if i['text'].lower().startswith("if you"):
                pos_ifyou.append(i['text'])
        else:
            if i['text'].lower().startswith("if you"):
                neg_ifyou.append(i['text'])

print("Positive instances containing if you: ", len(pos_ifyou))
print("Negative instances containing if you: ", len(neg_ifyou))

Positive instances containing if you:  143
Negative instances containing if you:  60


In [15]:
# PERCENTAGES of pos and neg instances starting with if you (over total pos/neg instances*100) (round to 2 dec)
print("Force:")
print("% pos instances starting with if you: ", round(len(pos_ifyou)/force_aug_counts[1]*100, 2))
print("% neg instances starting with if you: ", round(len(neg_ifyou)/force_aug_counts[0]*100, 2))

print("Force no errors:")
print("% pos instances starting with if you: ", round(len(pos_ifyou)/force_aug_noerrors_counts[1]*100, 2))
print("% neg instances starting with if you: ", round(len(neg_ifyou)/force_aug_noerrors_counts[0]*100, 2))

Force:
% pos instances starting with if you:  2.04
% neg instances starting with if you:  1.75
Force no errors:
% pos instances starting with if you:  2.04
% neg instances starting with if you:  2.59


In [16]:
# check for word 'elminate' in pos and neg instances 

pos_eliminate = []
neg_eliminate = []

for split in [force_aug_dataset['train'], force_aug_dataset['validation'], force_aug_dataset['test']]:
    for i in split:
        if i['label'] == True:
            if "eliminate" in i['text'].lower():
                pos_eliminate.append(i['text'])
        else:
            if "eliminate" in i['text'].lower():
                neg_eliminate.append(i['text'])

print("Positive instances containing eliminate: ", len(pos_eliminate))
print("Negative instances containing eliminate: ", len(neg_eliminate))

Positive instances containing eliminate:  1235
Negative instances containing eliminate:  662


In [37]:
# check for word 'elminate' in pos and neg instances in noerrors dataset

pos_eliminate = []
neg_eliminate = []

for split in [force_aug_noerrors_dataset['train'], force_aug_noerrors_dataset['validation'], force_aug_noerrors_dataset['test']]:
    for i in split:
        if i['label'] == True:
            if "eliminate" in i['text'].lower():
                pos_eliminate.append(i['text'])
        else:
            if "eliminate" in i['text'].lower():
                neg_eliminate.append(i['text'])

print("Positive instances containing eliminate: ", len(pos_eliminate))
print("Negative instances containing eliminate: ", len(neg_eliminate))

Positive instances containing eliminate:  1235
Negative instances containing eliminate:  499


In [38]:
# PERCENTAGES of pos and neg instances containing eliminate (over total pos/neg instances*100) (round to 2 dec)

print("Force:")
print("% pos instances containing eliminate: ", round(len(pos_eliminate)/force_aug_counts[1]*100, 2))
print("% neg instances containing eliminate: ", round(len(neg_eliminate)/force_aug_counts[0]*100, 2))

print("Force no errors:")
print("% pos instances containing eliminate: ", round(len(pos_eliminate)/force_aug_noerrors_counts[1]*100, 2))
print("% neg instances containing eliminate: ", round(len(neg_eliminate)/force_aug_noerrors_counts[0]*100, 2))

Force:
% pos instances containing eliminate:  17.6
% neg instances containing eliminate:  14.51
Force no errors:
% pos instances containing eliminate:  17.6
% neg instances containing eliminate:  21.52


In [18]:
# check for word 'possib' in pos and neg instances

pos_possib = []
neg_possib = []

for split in [force_aug_dataset['train'], force_aug_dataset['validation'], force_aug_dataset['test']]:
    for i in split:
        if i['label'] == True:
            if "possib" in i['text'].lower():
                pos_possib.append(i['text'])
        else:
            if "possib" in i['text'].lower():
                neg_possib.append(i['text'])
            
print("Positive instances containing possib: ", len(pos_possib))
print("Negative instances containing possib: ", len(neg_possib))


Positive instances containing possib:  1668
Negative instances containing possib:  1322


In [19]:
# same for force_aug_noerrors_dataset
pos_possib = []
neg_possib = []

for split in [force_aug_noerrors_dataset['train'], force_aug_noerrors_dataset['validation'], force_aug_noerrors_dataset['test']]:
    for i in split:
        if i['label'] == True:
            if "possib" in i['text'].lower():
                pos_possib.append(i['text'])
        else:
            if "possib" in i['text'].lower():
                neg_possib.append(i['text'])

print("Positive instances containing possib: ", len(pos_possib))
print("Negative instances containing possib: ", len(neg_possib))


Positive instances containing possib:  1668
Negative instances containing possib:  811


In [20]:
# PERCENTAGES of pos and neg instances containing possib (over total pos/neg instances*100) (round to 2 dec)   
print("Force:")
print("% pos instances containing possib: ", round(len(pos_possib)/force_aug_counts[1]*100, 2))
print("% neg instances containing possib: ", round(len(neg_possib)/force_aug_counts[0]*100, 2))

print("Force no errors:")
print("% pos instances containing possib: ", round(len(pos_possib)/force_aug_noerrors_counts[1]*100, 2))
print("% neg instances containing possib: ", round(len(neg_possib)/force_aug_noerrors_counts[0]*100, 2))

Force:
% pos instances containing possib:  23.77
% neg instances containing possib:  23.59
Force no errors:
% pos instances containing possib:  23.77
% neg instances containing possib:  34.97


In [21]:
# check for word 'based on' in pos and neg instances

pos_basedon = []
neg_basedon = []

for split in [force_aug_dataset['train'], force_aug_dataset['validation'], force_aug_dataset['test']]:
    for i in split:
        if i['label'] == True:
            if "based on" in i['text'].lower():
                pos_basedon.append(i['text'])
        else:
            if "based on" in i['text'].lower():
                neg_basedon.append(i['text'])

print("Positive instances containing based on: ", len(pos_basedon))
print("Negative instances containing based on: ", len(neg_basedon))

Positive instances containing based on:  408
Negative instances containing based on:  474


In [22]:
# same for force_aug_noerrors_dataset
pos_basedon = []
neg_basedon = []

for split in [force_aug_noerrors_dataset['train'], force_aug_noerrors_dataset['validation'], force_aug_noerrors_dataset['test']]:
    for i in split:
        if i['label'] == True:
            if "based on" in i['text'].lower():
                pos_basedon.append(i['text'])
        else:
            if "based on" in i['text'].lower():
                neg_basedon.append(i['text'])

print("Positive instances containing based on: ", len(pos_basedon))
print("Negative instances containing based on: ", len(neg_basedon))

Positive instances containing based on:  408
Negative instances containing based on:  289


In [23]:
# PERCENTAGES of pos and neg instances containing based on (over total pos/neg instances*100) (round to 2 dec)
print("Force:")
print("% pos instances containing based on: ", round(len(pos_basedon)/force_aug_counts[1]*100, 2))
print("% neg instances containing based on: ", round(len(neg_basedon)/force_aug_counts[0]*100, 2))

print("Force no errors:")
print("% pos instances containing based on: ", round(len(pos_basedon)/force_aug_noerrors_counts[1]*100, 2))
print("% neg instances containing based on: ", round(len(neg_basedon)/force_aug_noerrors_counts[0]*100, 2))

Force:
% pos instances containing based on:  5.82
% neg instances containing based on:  8.41
Force no errors:
% pos instances containing based on:  5.82
% neg instances containing based on:  12.46


In [24]:
# check for 'as they' in pos and neg instances

pos_asthey = []
neg_asthey = []

for split in [force_aug_dataset['train'], force_aug_dataset['validation'], force_aug_dataset['test']]:
    for i in split:
        if i['label'] == True:
            if "as they" in i['text'].lower():
                pos_asthey.append(i['text'])
        else:
            if "as they" in i['text'].lower():
                neg_asthey.append(i['text'])

print("Positive instances containing as they: ", len(pos_asthey))
print("Negative instances containing as they: ", len(neg_asthey))


Positive instances containing as they:  834
Negative instances containing as they:  444


In [25]:
# same for force_aug_noerrors_dataset
pos_asthey = []
neg_asthey = []

for split in [force_aug_noerrors_dataset['train'], force_aug_noerrors_dataset['validation'], force_aug_noerrors_dataset['test']]:
    for i in split:
        if i['label'] == True:
            if "as they" in i['text'].lower():
                pos_asthey.append(i['text'])
        else:
            if "as they" in i['text'].lower():
                neg_asthey.append(i['text'])

print("Positive instances containing as they: ", len(pos_asthey))
print("Negative instances containing as they: ", len(neg_asthey))

Positive instances containing as they:  834
Negative instances containing as they:  328


In [26]:
# PERCENTAGES of pos and neg instances containing as they (over total pos/neg instances*100) (round to 2 dec)
print("Force:")
print("% pos instances containing as they: ", round(len(pos_asthey)/force_aug_counts[1]*100, 2))
print("% neg instances containing as they: ", round(len(neg_asthey)/force_aug_counts[0]*100, 2))

print("Force no errors:")
print("% pos instances containing as they: ", round(len(pos_asthey)/force_aug_noerrors_counts[1]*100, 2))
print("% neg instances containing as they: ", round(len(neg_asthey)/force_aug_noerrors_counts[0]*100, 2))

Force:
% pos instances containing as they:  11.89
% neg instances containing as they:  9.54
Force no errors:
% pos instances containing as they:  11.89
% neg instances containing as they:  14.14


In [27]:
# check for instances starting with 'if you' and containing 'as they'

pos_ifyou_asthey = []
neg_ifyou_asthey = []

for split in [force_aug_dataset['train'], force_aug_dataset['validation'], force_aug_dataset['test']]:
    for i in split:
        if i['label'] == True:
            if i['text'].lower().startswith("if you") and "as they" in i['text'].lower():
                pos_ifyou_asthey.append(i['text'])
        else:
            if i['text'].lower().startswith("if you") and "as they" in i['text'].lower():
                neg_ifyou_asthey.append(i['text'])

print("Positive instances starting with if you and containing as they: ", len(pos_ifyou_asthey))
print("Negative instances starting with if you and containing as they: ", len(neg_ifyou_asthey))


Positive instances starting with if you and containing as they:  4
Negative instances starting with if you and containing as they:  2


In [28]:
# same for force_aug_noerrors_dataset

pos_ifyou_asthey = []
neg_ifyou_asthey = []

for split in [force_aug_noerrors_dataset['train'], force_aug_noerrors_dataset['validation'], force_aug_noerrors_dataset['test']]:
    for i in split:
        if i['label'] == True:
            if i['text'].lower().startswith("if you") and "as they" in i['text'].lower():
                pos_ifyou_asthey.append(i['text'])
        else:
            if i['text'].lower().startswith("if you") and "as they" in i['text'].lower():
                neg_ifyou_asthey.append(i['text'])


print("Positive instances starting with if you and containing as they: ", len(pos_ifyou_asthey))
print("Negative instances starting with if you and containing as they: ", len(neg_ifyou_asthey))

Positive instances starting with if you and containing as they:  4
Negative instances starting with if you and containing as they:  1


In [29]:
# PERCENTAGES of pos and neg instances starting with if you and containing as they (over total pos/neg instances*100) (round to 2 dec)
print("Force:")
print("% pos instances starting with if you and containing as they: ", round(len(pos_ifyou_asthey)/force_aug_counts[1]*100, 2))
print("% neg instances starting with if you and containing as they: ", round(len(neg_ifyou_asthey)/force_aug_counts[0]*100, 2))

print("Force no errors:")
print("% pos instances starting with if you and containing as they: ", round(len(pos_ifyou_asthey)/force_aug_noerrors_counts[1]*100, 2))
print("% neg instances starting with if you and containing as they: ", round(len(neg_ifyou_asthey)/force_aug_noerrors_counts[0]*100, 2))

Force:
% pos instances starting with if you and containing as they:  0.06
% neg instances starting with if you and containing as they:  0.03
Force no errors:
% pos instances starting with if you and containing as they:  0.06
% neg instances starting with if you and containing as they:  0.04


In [30]:
# check for instances containing 'as they', 'if you' and 'possib'

pos_ifyou_asthey_possib = []
neg_ifyou_asthey_possib = []

for split in [force_aug_dataset['train'], force_aug_dataset['validation'], force_aug_dataset['test']]:
    for i in split:
        if i['label'] == True:
            if "if you" in i['text'].lower() and "as they" in i['text'].lower() and "possib" in i['text'].lower():
                pos_ifyou_asthey_possib.append(i['text'])
        else:
            if "if you" in i['text'].lower() and "as they" in i['text'].lower() and "possib" in i['text'].lower():
                neg_ifyou_asthey_possib.append(i['text'])

print("Positive instances starting with if you and containing as they and possib: ", len(pos_ifyou_asthey_possib))
print("Negative instances starting with if you and containing as they and possib: ", len(neg_ifyou_asthey_possib))


Positive instances starting with if you and containing as they and possib:  9
Negative instances starting with if you and containing as they and possib:  1


In [31]:
# same for force_aug_noerrors_dataset

pos_ifyou_asthey_possib = []
neg_ifyou_asthey_possib = []

for split in [force_aug_noerrors_dataset['train'], force_aug_noerrors_dataset['validation'], force_aug_noerrors_dataset['test']]:
    for i in split:
        if i['label'] == True:
            if "if you" in i['text'].lower() and "as they" in i['text'].lower() and "possib" in i['text'].lower():
                pos_ifyou_asthey_possib.append(i['text'])
        else:
            if "if you" in i['text'].lower() and "as they" in i['text'].lower() and "possib" in i['text'].lower():
                neg_ifyou_asthey_possib.append(i['text'])

print("Positive instances with if you, as they and possib: ", len(pos_ifyou_asthey_possib))
print("Negative instances with if you, as they and possib: ", len(neg_ifyou_asthey_possib))

Positive instances with if you, as they and possib:  9
Negative instances with if you, as they and possib:  0


In [32]:
# check for instances containing 'as they', 'if you' and 'based on' in both datasets

pos_ifyou_asthey_basedon = []
neg_ifyou_asthey_basedon = []

for split in [force_aug_dataset['train'], force_aug_dataset['validation'], force_aug_dataset['test']]:
    for i in split:
        if i['label'] == True:
            if "if you" in i['text'].lower() and "as they" in i['text'].lower() and "based on" in i['text'].lower():
                pos_ifyou_asthey_basedon.append(i['text'])
        else:
            if "if you" in i['text'].lower() and "as they" in i['text'].lower() and "based on" in i['text'].lower():
                neg_ifyou_asthey_basedon.append(i['text'])

print("Positive instances with if you, as they and based on: ", len(pos_ifyou_asthey_basedon))
print("Negative instances with if you, as they and based on: ", len(neg_ifyou_asthey_basedon))

Positive instances with if you, as they and based on:  1
Negative instances with if you, as they and based on:  0


In [33]:
# check for instances containing 'if you' and 'based on' in both datasets

pos_ifyou_basedon = []
neg_ifyou_basedon = []

for split in [force_aug_dataset['train'], force_aug_dataset['validation'], force_aug_dataset['test']]:
    for i in split:
        if i['label'] == True:
            if "if you" in i['text'].lower() and "based on" in i['text'].lower():
                pos_ifyou_basedon.append(i['text'])
        else:
            if "if you" in i['text'].lower() and "based on" in i['text'].lower():
                neg_ifyou_basedon.append(i['text'])

print("Positive instances with if you and based on: ", len(pos_ifyou_basedon))
print("Negative instances with if you and based on: ", len(neg_ifyou_basedon))

pos_ifyou_basedon = []
neg_ifyou_basedon = []

for split in [force_aug_noerrors_dataset['train'], force_aug_noerrors_dataset['validation'], force_aug_noerrors_dataset['test']]:
    for i in split:
        if i['label'] == True:
            # if you at the start
            #if i['text'].lower().startswith("if you") and "based on" in i['text'].lower():
            # if you or when you at the start
            #if i['text'].lower().startswith("if you") or i['text'].lower().startswith("when you") and "based on" in i['text'].lower():
            # if you anywhere
            if "if you" in i['text'].lower() and "based on" in i['text'].lower():
                pos_ifyou_basedon.append(i['text'])
        else:
            # if you at the start
            #if i['text'].lower().startswith("if you") and "based on" in i['text'].lower():
            # if you or when you at the start
            #if i['text'].lower().startswith("if you") or i['text'].lower().startswith("when you") and "based on" in i['text'].lower():
            # if you anywhere
            if "if you" in i['text'].lower() and "based on" in i['text'].lower():
                neg_ifyou_basedon.append(i['text'])

print("Positive instances with if you and based on: ", len(pos_ifyou_basedon))
print("Negative instances with if you and based on: ", len(neg_ifyou_basedon))

Positive instances with if you and based on:  10
Negative instances with if you and based on:  19
Positive instances with if you and based on:  10
Negative instances with if you and based on:  11


In [34]:
# check for instances starting with 'if you' and containing 'based on'

#ifyou_basedon_base = () # tuple of (neg, pos)
#ifyou_basedon_force = ()
#ifyou_basedon_force_noerrors = ()

def get_ifyou_basedon_nums(dataset):
    pos, neg = 0, 0
    for split in [dataset['train'], dataset['validation'], dataset['test']]:
        for i in split:
            #if you at the start
            #if i['text'].lower().startswith("if you") and "based on" in i['text'].lower():
            # if you anywhere
            if "if you" in i['text'].lower() and "based on" in i['text'].lower():
                if i['label'] == True:
                    pos += 1
                else:
                    neg += 1
    return (neg, pos)

print("Force_aug_noerrors: ", get_ifyou_basedon_nums(force_aug_noerrors_dataset))
print("Force_aug: ", get_ifyou_basedon_nums(force_aug_dataset))
print("Base: ", get_ifyou_basedon_nums(base_dataset))

Force_aug_noerrors:  (11, 10)
Force_aug:  (19, 10)
Base:  (11, 8)


In [35]:
# for each dataset, get PERCENTAGES of neg and pos instances containing 'if you' and 'based on'

ifyou_basedon_base = get_ifyou_basedon_nums(base_dataset)
ifyou_basedon_base_percentages = (
    round(
        ifyou_basedon_base[0]/base_counts[0]*100, 2
    ), 
    round(
        ifyou_basedon_base[1]/base_counts[1]*100, 2
    )
)

ifyou_basedon_force = get_ifyou_basedon_nums(force_aug_dataset)
ifyou_basedon_force_percentages = (
    round(
        ifyou_basedon_force[0]/force_aug_counts[0]*100, 2
    ), 
    round(
        ifyou_basedon_force[1]/force_aug_counts[1]*100, 2
    )
)

ifyou_basedon_force_noerrors = get_ifyou_basedon_nums(force_aug_noerrors_dataset)
ifyou_basedon_force_noerrors_percentages = (
    round(
        ifyou_basedon_force_noerrors[0]/force_aug_noerrors_counts[0]*100, 2
    ), 
    round(
        ifyou_basedon_force_noerrors[1]/force_aug_noerrors_counts[1]*100, 2
    )
)

print("Base: ", ifyou_basedon_base_percentages)
print("Force: ", ifyou_basedon_force_percentages)
print("Force_noerrors: ", ifyou_basedon_force_noerrors_percentages)

Base:  (0.29, 0.13)
Force:  (0.55, 0.14)
Force_noerrors:  (0.47, 0.14)
