In [2]:
import sys
import scrubadub
from pathlib import Path
from pii_detection import LLM
import json
from statistics import mean
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
from sklearn.metrics import fbeta_score

In [3]:
llm = LLM.LlmModel()

Downloading pytorch_model.bin:   0%|          | 0.00/433M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


In [23]:
email_pattern = r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
def is_email(input_token):
    if re.fullmatch(email_pattern, input_token):
        return True
    return False

In [24]:
# This is currently detecting all the urls. receiving recall of 0.5. We need to think how to say that a url is personal!
url_pattern = r"\b(?:(?:https?:?//(?:www\.)?|www\.)[A-Za-z\d][a-zA-Z\d\-\.]{0,255}(?:\.[a-zA-Z-]{2,6})?(?:/[\w\-\.?&%=]{1,500}){0,10})\b"
def is_url(input_token):
    if re.fullmatch(url_pattern, input_token):
        return True
    return False

In [45]:
def is_name(input_token, was_first_name=False):
    if input_token[0].isupper() and input_token not in ['It', 'This', 'I']:
        if llm.query_ner(input_token):
            if was_first_name:
                return 'I-NAME_STUDENT'
            return 'B-NAME_STUDENT'

In [26]:
working_directory = str(Path.cwd().parent)
if working_directory not in sys.path:
    sys.path.append(str(working_directory))

In [27]:
with open('../data/raw/train.json') as f:
    data = json.load(f)

In [44]:
def predict(input_token, was_first_name=False):
    # if is_email(input_token):
    #     return 'B-EMAIL'
    # if is_url(input_token):
    #     return 'B-URL_PERSONAL'
    naming = is_name(input_token, was_first_name) 
    if naming is not None:
        return naming
    return 'O'

In [27]:
def print_labels_examples(input_label, start=0, finish=10):
    count = 0
    for doc in data:
        for token, label in zip(doc['tokens'], doc['labels']):
            if label == input_label:
                if start <= count <= finish:
                    print(doc['full_text'])
                    print(token)
                elif count > finish:
                    return
                count += 1
print_labels_examples('I-STREET_ADDRESS', 0, 1000)
# print_labels_examples('I-URL_PERSONAL')

Waseem Mabunda  591 Smith Centers Apt. 656
Joshuamouth, RI 95963 ( The Netherlands)  410.526.1667  vpi@mn.nl

Mind Mapping,      Challenge:     For several years I have been working for an Asset manager in the Netherlands. During this period I have been involved in many  projects. Certainly in the world of asset management, much has changed in recent years in the area of Law and Regulations.  What I mainly experience in these projects is that all departments have a different interest in starting a new project. This  certainly does not benefit the project. How do you get everyone to complete a project in the common interest and how do you  motivate everyone who participate in the project?    Selection:    An improvement project can be approached in different ways. The most common way is the scrum approach. We work in  multidisciplinary teams that work in short sprints, with a fixed length of 1 to 4 weeks. Cooperation is very important and  everyone must be able to respond quickly to cha

In [29]:
num_of_tokens = sum([len(doc['tokens']) for doc in data])

In [30]:
possible_labels = ['O', 'B-NAME_STUDENT', 'I-NAME_STUDENT', 'B-URL_PERSONAL', 'B-ID_NUM',
                        'B-EMAIL', 'I-STREET_ADDRESS', 'I-PHONE_NUM', 'B-USERNAME', 'B-PHONE_NUM',
                        'B-STREET_ADDRESS', 'I-URL_PERSONAL', 'I-ID_NUM']

In [46]:
TP = {}
FP = {}
FN = {}
y_true = []
y_pred = []
index = 1
detected_urls_enumerate = []
detected_urls = []
for classification_type in possible_labels:
    TP[classification_type] = 0
    FP[classification_type] = 0
    FN[classification_type] = 0
prediction = {}
was_first_name = False
for doc in tqdm(data):
    prediction[doc['document']] = []
    for label, token in zip(doc['labels'], doc['tokens']):
        curr_prediction = predict(token, was_first_name)
        if curr_prediction == 'B-NAME_STUDENT':
            was_first_name = True
        else:
            was_first_name = False
        y_true.append(label)
        y_pred.append(curr_prediction)
        prediction[doc['document']].append(curr_prediction)
        if curr_prediction == label:
            TP[curr_prediction] += 1
        else:
            FP[curr_prediction] += 1
            FN[label] += 1

  2%|▏         | 148/6807 [03:37<2:43:13,  1.47s/it]


KeyboardInterrupt: 

In [14]:
# urls_prompt = LLM.LlmModel.get_prompt_to_personal_url(detected_urls)
# response = llm.query_llm(urls_prompt)
# response.split('\n')

In [32]:
recalls = []
precisions = []
for label in possible_labels:
    precision = 0 if TP[label] == 0 else TP[label] / (TP[label] + FP[label])
    recall = TP[label] / (TP[label] + FN[label])
    print("For label {}: Recall: {}, Precision: {}".format(label, recall, precision))
    recalls.append(recall)
    precisions.append(precision)

For label O: Recall: 0.9999785562289746, Precision: 0.9994727901001519
For label B-NAME_STUDENT: Recall: 0.0, Precision: 0
For label I-NAME_STUDENT: Recall: 0.0, Precision: 0
For label B-URL_PERSONAL: Recall: 0.9727272727272728, Precision: 0.5
For label B-ID_NUM: Recall: 0.0, Precision: 0
For label B-EMAIL: Recall: 0.0, Precision: 0
For label I-STREET_ADDRESS: Recall: 0.0, Precision: 0
For label I-PHONE_NUM: Recall: 0.0, Precision: 0
For label B-USERNAME: Recall: 0.0, Precision: 0
For label B-PHONE_NUM: Recall: 0.0, Precision: 0
For label B-STREET_ADDRESS: Recall: 0.0, Precision: 0
For label I-URL_PERSONAL: Recall: 0.0, Precision: 0
For label I-ID_NUM: Recall: 0.0, Precision: 0
final score:  0.9994513806919252


In [ ]:
print("final score: ", fbeta_score(y_true, y_pred, average='micro', beta=5))

In [18]:
print(data[6]['full_text'])

Silvia Villalobos

Challenge:

There is a company which provides financial advisory to customers either in person or virtual.  Lately organisation climate has been seen decayed as result of arguments, hassles and the  lack of fraternity and cooperation among the campaign workers. The aim is improving  climate organisation to transmit unity and trust to our customer.

Selection:

Storytelling is the first tool selected, because this tool allows to connect with the audience,  to make understandable the message, and to transmit emotions. Campaign workers would  feel identified and understand the importance of tolerance and empathy.

Application and insight:

Many stories were told in various sections. Stories from previous experiences, fictional  stories, and stories told by participants created a different atmosphere due to participants  started to produce deeper relationships among them and recognize the importance of being  empathic. An important result was that meeting with clients we

In [15]:
for token in data[5]['tokens']:
    print(llm.query_ner(token))    

[]
[{'entity': 'B-ORG', 'score': 0.8830654, 'index': 1, 'word': 'Start', 'start': 0, 'end': 5}]
[]
[]
[]
[]
[{'entity': 'B-ORG', 'score': 0.88072574, 'index': 1, 'word': 'El', 'start': 0, 'end': 2}]
[{'entity': 'B-PER', 'score': 0.46941844, 'index': 1, 'word': 'Am', 'start': 0, 'end': 2}]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[{'entity': 'B-ORG', 'score': 0.9995167, 'index': 1, 'word': 'Barcelona', 'start': 0, 'end': 9}]
[]
[{'entity': 'B-LOC', 'score': 0.99982965, 'index': 1, 'word': 'Spain', 'start': 0, 'end': 5}]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[{'entity': 'B-LOC', 'score': 0.929586, 'index': 1, 'word': 'P', 'start': 0, 'end': 1}, {'entity': 'B-LOC', 'score': 0.84071195, 'index': 2, 'word': '##yre', 'start': 1, 'end': 4}, {'entity': 'I-LOC', 'score': 0.7833575, 'index': 3, 'word': '##nees', 'start': 4, 'end': 8}]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]
[]

In [19]:
llm.query_ner(data[6]['full_text'])

[{'entity': 'B-PER',
  'score': 0.48807332,
  'index': 1,
  'word': 'Si',
  'start': 0,
  'end': 2},
 {'entity': 'I-PER',
  'score': 0.94796073,
  'index': 4,
  'word': 'Villa',
  'start': 7,
  'end': 12},
 {'entity': 'I-ORG',
  'score': 0.70209837,
  'index': 5,
  'word': '##lo',
  'start': 12,
  'end': 14},
 {'entity': 'I-ORG',
  'score': 0.7002141,
  'index': 6,
  'word': '##bos',
  'start': 14,
  'end': 17}]

In [22]:
for doc in data:
    for token in doc['tokens']:
        if token[0].isupper():
            if llm.query_ner(token):
                print(token)

Avril
Nathalie
Sylla
Buzan
T.
Buzan
B.
Dessine
Paris
Les
Éditions
Avril
Nathalie
Sylla
Avril
Nathalie
Sylla
Mind
Diego
Estrada


KeyboardInterrupt: 

In [5]:
from pii_detection.data_split_utils import shuffle_and_split
train, val, test = shuffle_and_split(data, save_dir="../data")

# read data/val.json
for name in ["train", "val", "test"]:
    with open(f"../data/{name}_shard.json", "r") as f:
        dataset = json.load(f)
    assert dataset == locals()[name]

In [6]:
print(type(train))
print(len(train))
print(train[1].keys())

<class 'list'>
4764
dict_keys(['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels'])


In [7]:
print(train[1]['full_text'])

Reflection – Learning Launch

Paola Garcia

Challenge

In the financial services industry, the strategies developed to manage lending portfolios are developed  through data analysis of past performance.  In the collections environment, analytics and traditional  methods to contact customers (phone, letters and email) are the norm.  As the number of missed  payments on an account increases, the ability to reach customers decreases.   The result is higher  delinquency rates, increase expense and customer complaints.

A project was launched to determine how these measures could be improved.  A small team was formed  which included people from various departments including Analytics, Operations, Finance, Marketing to  name a few.   A recommendation was made to utilize design thinking.  Working with our Marketing  Research team several tools were utilized to understand our customer needs and develop some ideas to  test.  To address the needs of a specific persona, Customers who felt uncomfo

In [8]:
for i in range(len(train)):
    if 'B-NAME_STUDENT' in train[i]['labels']:
        print(i)

1
6
8
13
15
18
24
30
32
41
50
51
53
54
69
83
95
110
125
129
142
143
145
154
155
158
166
206
220
221
229
236
240
241
251
253
286
298
315
321
330
333
336
354
356
362
368
374
377
390
404
405
409
411
421
429
431
454
460
465
473
480
488
495
501
517
519
526
528
529
530
533
541
555
562
566
570
571
579
582
583
599
601
606
610
628
632
635
636
643
647
658
659
666
667
669
680
681
688
691
707
709
712
713
717
724
726
735
747
752
760
764
767
775
800
805
807
814
817
821
846
857
861
863
864
870
888
893
896
901
919
921
926
931
933
939
958
962
965
966
970
978
992
1010
1011
1016
1019
1029
1040
1043
1045
1047
1056
1057
1058
1061
1070
1082
1092
1093
1104
1108
1118
1121
1122
1123
1124
1125
1132
1134
1136
1144
1155
1170
1172
1173
1183
1184
1193
1213
1215
1248
1251
1252
1256
1258
1272
1274
1278
1292
1298
1299
1315
1316
1326
1334
1342
1346
1358
1359
1364
1374
1381
1384
1388
1405
1410
1411
1414
1418
1419
1429
1438
1452
1460
1465
1466
1468
1470
1473
1497
1515
1516
1535
1538
1558
1560
1561
1573
1593
1600
1609
161

In [9]:
for i in range(len(train)):
    if 'B-NAME_STUDENT' in train[i]['labels'] and 'I-NAME_STUDENT' not in train[i]['labels']:
        print(i)

143
236
241
501
632
658
681
817
1056
1252
1256
1274
1315
1411
1452
1516
1560
1699
1743
1744
1750
1761
1778
2049
2089
2146
2152
2190
2672
2700
2703
2704
2739
2770
2788
2950
2958
3050
3293
3332
3566
3586
3606
3620
3715
3749
3762
3783
4054
4069
4322
4448
4503
4626


In [13]:
doc_index = 3783
print(train[doc_index]['full_text'])

Amritpal’s Reflection – Mind Mapping

Challenge  Being a member of my company’s innovation department, I was called along with my team for an urgent  meeting. Based on our latest monthly report, a big number of our newly employed engineers have  complained about an unpleasant and boring working environment; an unfortunate fact for a company  that relies on young and fresh minds to keep aiming forward. My challenge is to create an environment  in which our employees can perform and endure their daily challenges agreeably and more pleasantly.   The project scope includes our team’s department and the new employees of other departments, with a  limited period of thirty business days. We will focus on trying to uncover and pin point the effective  solutions with regard to a pleasant environment.

Selection  “When I want to do something analytical, I make a list. When I’m trying to do something creative, I make  a mind-map.” David Kelley-Founder, IDEO.  With the problem already in hand, I s

In [14]:
print(train[doc_index]['labels'])

['B-NAME_STUDENT', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', '