In [400]:
from pathlib import Path
import json
from enum import Enum
from faker import Faker
import sys
import random

In [410]:
working_directory = str(Path.cwd().parent)
if working_directory not in sys.path:
    sys.path.append(str(working_directory))
with open('../data/raw/train.json') as f:
    data = json.load(f)
num_of_tokens = sum([len(doc['tokens']) for doc in data])
possible_labels = ['O', 'B-NAME_STUDENT', 'I-NAME_STUDENT', 'B-URL_PERSONAL', 'B-ID_NUM',
                   'B-EMAIL', 'I-STREET_ADDRESS', 'I-PHONE_NUM', 'B-USERNAME', 'B-PHONE_NUM',
                   'B-STREET_ADDRESS', 'I-URL_PERSONAL', 'I-ID_NUM']
class Label(Enum):
    O = 'O'
    B_NAME_STUDENT = 'B-NAME_STUDENT'
    I_NAME_STUDENT = 'I-NAME_STUDENT'
    B_URL_PERSONAL = 'B-URL_PERSONAL'
    B_ID_NUM = 'B-ID_NUM'
    B_EMAIL = 'B-EMAIL'
    I_STREET_ADDRESS = 'I-STREET_ADDRESS'
    I_PHONE_NUM = 'I-PHONE_NUM'
    B_USERNAME = 'B-USERNAME'
    B_PHONE_NUM = 'B-PHONE_NUM'
    B_STREET_ADDRESS = 'B-STREET_ADDRESS'
    I_URL_PERSONAL = 'I-URL_PERSONAL'
    I_ID_NUM = 'I-ID_NUM'

def generate_fake_label(label, faker):
    if 'NAME_STUDENT' in label:
        return faker.name().split(' ')
    elif 'EMAIL' in label:
        return [faker.email()]
    elif 'USERNAME' in label:
        return [faker.user_name()]
    elif 'ID_NUM' in label:
        return [f"{faker.random_int(min=100000000000, max=999999999999):012d}"]
    elif 'URL_PERSONAL' in label:
        full_name = faker.name().split(' ')
        return [random.choice([f"{faker.url()}{full_name[0]}_{full_name[1]}", f"{faker.url()}{faker.user_name()}", f"http://{faker.user_name()}.com"])]
    elif 'PHONE_NUM' in label:
        return [faker.phone_number()]
    elif 'ADDRESS' in label:
        return [faker.address().split(' ')]

In [357]:
len(data)

6807

In [358]:
data[1].keys()

dict_keys(['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels'])

In [359]:
essays_with_labels_dict = {}
for essay in data:
    if not all(list(map(lambda label: label=='O', essay['labels']))):
        essays_with_labels_dict[essay['document']] = {'labels': essay['labels'], 'tokens': essay['tokens']}

In [360]:
essays_with_labels_dict[7].keys()

dict_keys(['labels', 'tokens'])

In [361]:
indices_of_labels_per_essay = {}
tokens_per_doc_id = {}
labels_per_doc_id = {}
for doc_id in essays_with_labels_dict.keys():
    tokens_per_doc_id[doc_id] = essays_with_labels_dict[doc_id]['tokens']
    labels_per_doc_id[doc_id] = essays_with_labels_dict[doc_id]['labels']
    indices_of_labels = []
    essay = essays_with_labels_dict[doc_id]
    for i in range(len(essay['labels'])):
        if essay['labels'][i] != 'O':
            indices_of_labels.append(i)
    indices_of_labels_per_essay[doc_id] = indices_of_labels

In [362]:
for doc_id in indices_of_labels_per_essay.keys():
    print(f'{doc_id}: {indices_of_labels_per_essay[doc_id]}')

7: [9, 10, 482, 483, 741, 742]
10: [0, 1, 464, 465]
16: [4, 5]
20: [5, 6]
56: [12, 13]
86: [6, 7]
93: [0, 1]
104: [8, 9]
112: [5, 6]
123: [32, 33]
136: [30]
166: [0, 1]
204: [4]
214: [4, 5]
269: [783, 784]
288: [0, 1]
308: [0, 1, 591, 592, 1206, 1207, 1502, 1503]
317: [611]
324: [13]
330: [18, 19]
333: [20, 21]
344: [7, 8]
356: [701, 702]
375: [5, 6]
379: [15, 16, 28, 31, 760, 761]
470: [0]
472: [4, 5, 922]
591: [3, 4]
607: [478, 479]
609: [19, 20, 26, 32, 33, 39, 50, 53, 54, 56, 62, 65, 66, 68, 71, 72, 74, 77, 78, 80, 83, 84, 89, 92, 93, 125]
616: [0, 1]
651: [0, 1, 611, 612]
659: [0, 1]
671: [13]
714: [3, 4]
730: [3, 4, 6, 7, 12, 13]
736: [17, 18]
760: [8, 9]
828: [9, 10]
1105: [5, 6, 56, 93, 155, 194, 226, 239, 580]
1134: [11, 12]
1175: [0, 1]
1185: [6, 7]
1210: [12, 13]
1221: [0, 1]
1239: [9, 10]
1277: [0, 1, 503, 504]
1290: [556, 557, 601, 602]
1295: [5, 6]
1309: [497]
1325: [14, 15, 16, 17]
1353: [9, 10, 438, 439]
1437: [6, 7]
1444: [12, 13]
1447: [746, 747]
1472: [826, 827]
1477

In [363]:
def group_close_numbers(original_list, threshold=100):
    result = []
    current_group = []

    for num in original_list:
        if not current_group or num - current_group[-1] <= threshold:
            current_group.append(num)
        else:
            result.append(current_group)
            current_group = [num]

    if current_group:
        result.append(current_group)

    return result

In [364]:
indices_of_labels_per_essay_groups = {}
for doc_id in indices_of_labels_per_essay:
    indices_of_labels_per_essay_groups[doc_id] = group_close_numbers(indices_of_labels_per_essay[doc_id])

In [365]:
indices_of_labels_per_essay_groups

{7: [[9, 10], [482, 483], [741, 742]],
 10: [[0, 1], [464, 465]],
 16: [[4, 5]],
 20: [[5, 6]],
 56: [[12, 13]],
 86: [[6, 7]],
 93: [[0, 1]],
 104: [[8, 9]],
 112: [[5, 6]],
 123: [[32, 33]],
 136: [[30]],
 166: [[0, 1]],
 204: [[4]],
 214: [[4, 5]],
 269: [[783, 784]],
 288: [[0, 1]],
 308: [[0, 1], [591, 592], [1206, 1207], [1502, 1503]],
 317: [[611]],
 324: [[13]],
 330: [[18, 19]],
 333: [[20, 21]],
 344: [[7, 8]],
 356: [[701, 702]],
 375: [[5, 6]],
 379: [[15, 16, 28, 31], [760, 761]],
 470: [[0]],
 472: [[4, 5], [922]],
 591: [[3, 4]],
 607: [[478, 479]],
 609: [[19,
   20,
   26,
   32,
   33,
   39,
   50,
   53,
   54,
   56,
   62,
   65,
   66,
   68,
   71,
   72,
   74,
   77,
   78,
   80,
   83,
   84,
   89,
   92,
   93,
   125]],
 616: [[0, 1]],
 651: [[0, 1], [611, 612]],
 659: [[0, 1]],
 671: [[13]],
 714: [[3, 4]],
 730: [[3, 4, 6, 7, 12, 13]],
 736: [[17, 18]],
 760: [[8, 9]],
 828: [[9, 10]],
 1105: [[5, 6, 56, 93, 155, 194, 226, 239], [580]],
 1134: [[11, 12]

In [366]:
def get_window(group_of_indices, tokens):
    stopping_tokens = ['\n\n', '\n']
    left = max(0, group_of_indices[0] - 100)
    right = min(len(tokens) - 1, group_of_indices[-1])
    previous_progres = 0
    for progress, token in enumerate(tokens[left:group_of_indices[0]]):
        if token in stopping_tokens:
            left += (progress - previous_progres)
            previous_progres = progress
    
    for _ in range(len(tokens) - 1 - right):
        if tokens[right] in ['.']:
            break
        else:
            right += 1
    
    return [left, right]

In [367]:
windows_per_doc_id = {}
for doc_id in indices_of_labels_per_essay_groups.keys():
    indices_of_labels_per_essay_windows = []
    for group in indices_of_labels_per_essay_groups[doc_id]:
        indices_of_labels_per_essay_windows.append(get_window(group, tokens_per_doc_id[doc_id]))
    windows_per_doc_id[doc_id] = indices_of_labels_per_essay_windows

In [368]:
windows_per_doc_id

{7: [[0, 38], [472, 515], [731, 752]],
 10: [[0, 30], [463, 475]],
 16: [[2, 28]],
 20: [[4, 61]],
 56: [[0, 106]],
 86: [[0, 35]],
 93: [[0, 21]],
 104: [[6, 44]],
 112: [[4, 21]],
 123: [[31, 64]],
 136: [[29, 63]],
 166: [[0, 31]],
 204: [[0, 23]],
 214: [[3, 48]],
 269: [[782, 785]],
 288: [[0, 47]],
 308: [[0, 42], [590, 604], [1205, 1254], [1501, 1698]],
 317: [[582, 623]],
 324: [[9, 21]],
 330: [[17, 79]],
 333: [[16, 46]],
 344: [[0, 69]],
 356: [[700, 711]],
 375: [[0, 53]],
 379: [[0, 64], [660, 765]],
 470: [[0, 49]],
 472: [[3, 16], [917, 943]],
 591: [[0, 40]],
 607: [[477, 480]],
 609: [[8, 134]],
 616: [[0, 4]],
 651: [[0, 42], [610, 626]],
 659: [[0, 9]],
 671: [[10, 75]],
 714: [[0, 94]],
 730: [[0, 39]],
 736: [[16, 24]],
 760: [[7, 45]],
 828: [[8, 60]],
 1105: [[0, 267], [505, 604]],
 1134: [[0, 23]],
 1175: [[0, 24]],
 1185: [[3, 44]],
 1210: [[0, 25]],
 1221: [[0, 47]],
 1239: [[8, 11]],
 1277: [[0, 38], [502, 526]],
 1290: [[555, 677]],
 1295: [[0, 7]],
 1309: [

In [411]:
new_doc_id = 0
augmented_data = {}
for doc_id in windows_per_doc_id.keys():
    for window in windows_per_doc_id[doc_id]:
        augmented_data[new_doc_id] = {'tokens': tokens_per_doc_id[doc_id][window[0]:window[1]], 'labels': labels_per_doc_id[doc_id][window[0]:window[1]]}
        new_doc_id += 1

In [412]:
faker = Faker()
for doc_id in augmented_data:
    essay = augmented_data[doc_id]
    index = 0
    while index < len(essay['labels']):
        # print(index)
        if essay['labels'][index] != 'O':
            new_token = generate_fake_label(essay['labels'][index], faker)
            print(new_token)
            for i, new_token_part in enumerate(new_token):
                index += i
                if index >= len(essay['labels']):
                    break
                if essay['labels'][index] == 'O':
                    essay['tokens'].insert(index, new_token_part)
                    essay['labels'].insert(index, essay['labels'][index])
                else:
                    essay['tokens'][index] = new_token_part
        index += 1

['Donald', 'Anderson']
['Ryan', 'King']
['Anthony', 'Barnes']
['Ricardo', 'Edwards']
['Justin', 'Hernandez']
['Joshua', 'Dawson']
['Amber', 'Franklin']
['Cynthia', 'Walters']
['Ian', 'White']
['Stephanie', 'Baker']
['Stephanie', 'Nichols']
['Joseph', 'Fritz']
['Jamie', 'Neal']
['Kristina', 'Hickman']
['Adrian', 'Gordon']
['Stephen', 'Wu']
['Erik', 'Davis']
['Kathy', 'King']
['Aaron', 'Conner']
['Cathy', 'Haynes']
['Shelia', 'Cole']
['Alexandria', 'Ortiz']
['Michael', 'Martin']
['https://strickland.biz/Troy_Woodward']
['Joseph', 'Washington']
['Steve', 'Wilson']
['Monica', 'Hudson']
['Timothy', 'Solis']
['Mark', 'Myers']
['Robert', 'Bowman']
['Jeffrey', 'Price']
['xsmith@example.com']
['http://www.stephenson-jensen.com/Kathleen_Gibson']
['Robert', 'Allen']
['John', 'Meadows']
['Jasmine', 'Taylor']
['http://stephen75.com']
['Elizabeth', 'Young']
['Jimmy', 'Jackson']
['Lisa', 'Hopkins']
['499415510555']
['Rebecca', 'Robertson']
['242802134753']
['291383014817']
['Erin', 'Garza']
['7183335

In [418]:
augmented_data[160]['tokens']

['\n\n',
 'William',
 'Price',
 'Newtown',
 ',',
 'CT',
 'USA',
 '\n\n',
 'Coursera',
 ':',
 'Design',
 'Thinking',
 'for',
 'Innovation',
 'Jan.',
 '2',
 ',',
 '2017',
 '\n\n',
 'Challenge',
 'and',
 'Selection',
 '\n\n',
 'Coming',
 'from',
 'a',
 'family',
 'who',
 'were',
 'Joseph',
 'Campbell',
 'gurus',
 ',',
 'members',
 'of',
 'AA',
 'and',
 ' ',
 'writers',
 ',',
 'Storytelling',
 'is',
 'a',
 'revealing',
 'tool',
 'of',
 'connection',
 ',',
 'information',
 ',',
 'and',
 'shared',
 ' ',
 'insight']

In [419]:
len(augmented_data)

1203

In [330]:
for doc_id in augmented_data.keys():
    for i, label in enumerate(augmented_data[doc_id]['labels']):
        if 'ID_NUM' in label:
            print(augmented_data[doc_id]['tokens'][i])

860632713425
530670102508
530670102508
875673967537
860632713425
557349702179
784372734211
054176622314
674915248960
932353568953
982645662261
409046248321
163133980712
186941941714
159531167997
159531167997
046922558887
943063077874
792389774673
167695383458
Iz.:999893751750
Kl.:838901042770
06EYD876
143860010348
Ei:556799175487
143860010348
Un:705491035775
143860010348
Kh:360595695159
143860010348
Kh:217952887271
35615904922
696135165639
Vw.:403489591437
141774671173
747051878431
779875708882
800306846075
955487471144
276795361801
276795361801
034626995785
VZ:775Y6A5764
DM:705244534902
132305666219
789323889085
762035863358
188408534931
nMFtUVxSUI|33529258
nMFtUVxSUI|33529258
51,00,23,0
342998677810
522233062166
843756944804
493950392533
183169291463
172801513686
172801513686
208798413907
347376430553
943995368223
027693
ras21
723847538279
534516353860
871483046449
836172426340
369615882777
014674070485
264945858442
320622779078
Weyhacy_7000693584
047378465
047378465
IV-8322
IV-8322
