In [None]:
import sys
import scrubadub
from pathlib import Path
from pii_detection import LLM
import json
from statistics import mean
import pandas as pd
import numpy as np
from tqdm import tqdm
import re
from sklearn.metrics import fbeta_score
from enum import Enum
import copy

In [None]:
# Make the parent of the current working directory importable from this notebook.
working_directory = str(Path.cwd().parent)
if working_directory not in sys.path:
    sys.path.append(working_directory)

# State the encoding explicitly so that loading the JSON file does not depend
# on the platform's default locale encoding.
with open('../data/raw/train.json', encoding='utf-8') as f:
    data = json.load(f)

# Total number of tokens summed over every loaded document.
num_of_tokens = sum(len(doc['tokens']) for doc in data)

# Tag strings for which the evaluation cell keeps TP / FP / FN counters.
possible_labels = ['O', 'B-NAME_STUDENT', 'I-NAME_STUDENT', 'B-URL_PERSONAL', 'B-ID_NUM',
                   'B-EMAIL', 'I-STREET_ADDRESS', 'I-PHONE_NUM', 'B-USERNAME', 'B-PHONE_NUM',
                   'B-STREET_ADDRESS', 'I-URL_PERSONAL', 'I-ID_NUM']
class Label(Enum):
    """Enumeration of the token-level tag strings used in this notebook.

    Each member's value is the tag string itself (member names replace the
    hyphen with an underscore). The values are the same strings that appear
    in ``possible_labels``.
    """
    # Tag for tokens that carry none of the other labels.
    O = 'O'
    B_NAME_STUDENT = 'B-NAME_STUDENT'
    I_NAME_STUDENT = 'I-NAME_STUDENT'
    B_URL_PERSONAL = 'B-URL_PERSONAL'
    B_ID_NUM = 'B-ID_NUM'
    B_EMAIL = 'B-EMAIL'
    I_STREET_ADDRESS = 'I-STREET_ADDRESS'
    I_PHONE_NUM = 'I-PHONE_NUM'
    B_USERNAME = 'B-USERNAME'
    B_PHONE_NUM = 'B-PHONE_NUM'
    B_STREET_ADDRESS = 'B-STREET_ADDRESS'
    I_URL_PERSONAL = 'I-URL_PERSONAL'
    I_ID_NUM = 'I-ID_NUM'

In [None]:
# Independent deep copy of the loaded documents; the evaluation cell attaches a
# "pred_labels" list to each entry of this copy rather than to `data`.
dict_with_prediction = copy.deepcopy(data)

In [76]:
def print_predicted_labels(all_docs, given_label):
    """Print every token whose predicted label equals ``given_label``.

    Args:
        all_docs: iterable of document dicts, each with parallel ``'tokens'``
            and ``'pred_labels'`` sequences.
        given_label: the predicted label to look for.
    """
    for document in all_docs:
        # Collect the matching positions first, then print the tokens found there.
        matching_positions = [
            position
            for position, predicted in enumerate(document['pred_labels'])
            if predicted == given_label
        ]
        for position in matching_positions:
            print(document['tokens'][position])

In [82]:
def print_fn_labels(all_docs, given_label):
    """Print every false negative for ``given_label``.

    A false negative is a position whose true label is ``given_label`` while
    the predicted label is something else. For each one, three lines are
    printed: the wrong prediction, the preceding token and the missed token.

    Args:
        all_docs: iterable of document dicts, each with parallel ``'tokens'``,
            ``'labels'`` and ``'pred_labels'`` sequences.
        given_label: the true label whose misses should be listed.
    """
    for doc in all_docs:
        tokens = doc['tokens']
        label_pairs = zip(doc['labels'], doc['pred_labels'])
        for position, (true_label, pred_label) in enumerate(label_pairs):
            if true_label != given_label or pred_label == given_label:
                continue
            # Index -1 would silently wrap around to the document's LAST token,
            # so the first position gets an explicit placeholder instead.
            if position > 0:
                prev_token = tokens[position - 1]
            else:
                prev_token = '<start of document>'
            print("The prediction was: ", pred_label)
            print("Prev token was: ", prev_token)
            print(tokens[position])

In [None]:
# Email address pattern. Its character classes are written in lowercase only,
# so it has to be applied case-insensitively to accept uppercase letters.
email_pattern = r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
def is_email(input_token):
    """Return True when the whole of ``input_token`` is an email address.

    Matching is case-insensitive, so ``John.Doe@Example.com`` is accepted just
    like its lowercase form.

    Args:
        input_token: the token string to test.

    Returns:
        bool: True if the entire token matches ``email_pattern``, else False.
    """
    return re.fullmatch(email_pattern, input_token, flags=re.IGNORECASE) is not None

In [None]:
# This currently detects every URL, which yields a recall of 0.5. We still need a way to decide whether a URL is personal!
url_pattern = r"\b(?:(?:https?:?//(?:www\.)?|www\.)[A-Za-z\d][a-zA-Z\d\-\.]{0,255}(?:\.[a-zA-Z-]{2,6})?(?:/[\w\-\.?&%=]{1,500}){0,10})\b"
def is_url(input_token):
    """Tell whether the entire ``input_token`` matches ``url_pattern``.

    Args:
        input_token: the token string to test.

    Returns:
        bool: True on a full match, False otherwise.
    """
    whole_match = re.fullmatch(url_pattern, input_token)
    return whole_match is not None

In [None]:
# Capitalised tokens that must never be treated as a name.
not_names_list = ['It', 'This', 'I']
def is_name(input_token, first_name_existed_before=False):
    """Heuristically tag a capitalised token as part of a student name.

    A token qualifies when its first character is uppercase and it is not
    listed in ``not_names_list``.

    Args:
        input_token: the token string to test; may be empty.
        first_name_existed_before: True when the caller considers the previous
            token to have started a name.

    Returns:
        ``'I-NAME_STUDENT'`` for a qualifying token when
        ``first_name_existed_before`` is true, ``'B-NAME_STUDENT'`` for a
        qualifying token otherwise, and ``None`` for any non-qualifying token.
    """
    # Slicing rather than indexing: an empty token yields '' (whose isupper()
    # is False) instead of raising IndexError.
    if not input_token[:1].isupper() or input_token in not_names_list:
        return None
    if first_name_existed_before:
        return 'I-NAME_STUDENT'
    return 'B-NAME_STUDENT'

In [None]:
def predict(input_token, first_name_existed_before=False):
    """Predict the label of a single token.

    Only the name heuristic is active; anything it does not recognise is
    labelled ``'O'``.

    Args:
        input_token: the token string to classify.
        first_name_existed_before: forwarded unchanged to ``is_name``.

    Returns:
        The label returned by ``is_name`` when it is not None, else ``'O'``.
    """
    # Email and URL detection are switched off for now:
    # if is_email(input_token):
    #     return 'B-EMAIL'
    # if is_url(input_token):
    #     return 'B-URL_PERSONAL'
    name_label = is_name(input_token, first_name_existed_before)
    return 'O' if name_label is None else name_label

In [92]:
def print_labels_examples(input_label, start=0, finish=10):
    """Print tokens from the module-level ``data`` that carry ``input_label``.

    Matching tokens are numbered from 0 in iteration order. Those numbered
    ``start`` through ``finish`` (both inclusive) are printed, and the scan
    stops at the first match numbered above ``finish``.

    Args:
        input_label: the true label to look for.
        start: number of the first match to print.
        finish: number of the last match to print.
    """
    matches_seen = 0
    for document in data:
        for tok, tag in zip(document['tokens'], document['labels']):
            if tag != input_label:
                continue
            if matches_seen > finish:
                # Past the requested window: nothing further can be printed.
                return
            if matches_seen >= start:
                # print(document['full_text'])
                print(tok)
            matches_seen += 1
# Print the tokens labelled B-PHONE_NUM, for match counts 0 through 1000 inclusive.
print_labels_examples(Label.B_PHONE_NUM.value, 0, 1000)
# print_labels_examples('I-URL_PERSONAL')

(
(
(
(
(
410.526.1667


In [None]:
# Show which keys the document at index 1 has.
data[1].keys()

In [None]:
# Per-label counters of true positives, false positives and false negatives.
TP = {}
FP = {}
FN = {}
# Flat true / predicted label sequences covering every token of every document.
y_true = []
y_pred = []
index = 1
detected_urls_enumerate = []
detected_urls = []
for classification_type in possible_labels:
    TP[classification_type] = 0
    FP[classification_type] = 0
    FN[classification_type] = 0
prediction = {}
was_first_name = False
for i, doc in tqdm(enumerate(data), desc="Processing items", total=len(data)):
    dict_with_prediction[i]["pred_labels"] = []
    prediction[doc['document']] = []
    # Reset the name state at every document boundary; otherwise a name
    # predicted on the last token of one document would turn the first token
    # of the next document into a continuation.
    was_first_name = False
    for label, token in zip(doc['labels'], doc['tokens']):
        curr_prediction = predict(token, was_first_name)
        dict_with_prediction[i]["pred_labels"].append(curr_prediction)
        # Only a freshly started name makes the next token a continuation candidate.
        was_first_name = curr_prediction == 'B-NAME_STUDENT'
        y_true.append(label)
        y_pred.append(curr_prediction)
        if curr_prediction == label:
            TP[curr_prediction] += 1
        else:
            # A mismatch is a false positive for the predicted label and a
            # false negative for the true one.
            FP[curr_prediction] += 1
            FN[label] += 1

In [84]:
# List the tokens whose true label is I-NAME_STUDENT but whose prediction was something else.
print_fn_labels(dict_with_prediction, Label.I_NAME_STUDENT.value)

The prediction was:  B-NAME_STUDENT
Prev token was:  Sakir
Ahmad
The prediction was:  B-NAME_STUDENT
Prev token was:  Van
Der
The prediction was:  B-NAME_STUDENT
Prev token was:  Loredana
Abidin
The prediction was:  B-NAME_STUDENT
Prev token was:  Santosh
Kumar
The prediction was:  B-NAME_STUDENT
Prev token was:  Melvin
Lu
The prediction was:  B-NAME_STUDENT
Prev token was:  Joe
Ferrara
The prediction was:  B-NAME_STUDENT
Prev token was:  Shivam
Giri
The prediction was:  B-NAME_STUDENT
Prev token was:  Garcia
Lopez
The prediction was:  B-NAME_STUDENT
Prev token was:  Khan
Pandey
The prediction was:  B-NAME_STUDENT
Prev token was:  Uwe
Wegener
The prediction was:  B-NAME_STUDENT
Prev token was:  Tara
Limbu
The prediction was:  B-NAME_STUDENT
Prev token was:  Tara
Limbu
The prediction was:  B-NAME_STUDENT
Prev token was:  Tara
Limbu
The prediction was:  B-NAME_STUDENT
Prev token was:  Vero
Reyes
The prediction was:  B-NAME_STUDENT
Prev token was:  Kumar
Aakash
The prediction was:  B-NAME

In [None]:
# Per-label precision and recall, computed from the TP / FP / FN counters.
recalls = []
precisions = []
for label in possible_labels:
    precision = 0 if TP[label] == 0 else TP[label] / (TP[label] + FP[label])
    # A label with no true occurrences has TP + FN == 0; report a recall of 0
    # for it instead of raising ZeroDivisionError.
    actual_count = TP[label] + FN[label]
    recall = TP[label] / actual_count if actual_count else 0
    print("For label {}: Recall: {}, Precision: {}".format(label, recall, precision))
    recalls.append(recall)
    precisions.append(precision)

In [ ]:
# Micro-averaged F-beta over the flat label sequences; beta=5 weights recall
# more heavily than precision.
print("final score: ", fbeta_score(y_true, y_pred, average='micro', beta=5))

In [None]:
# Show the 'full_text' field of the document at index 6.
print(data[6]['full_text'])

In [None]:
# Query the LLM once per token of the document at index 5 and print each answer.
# NOTE(review): `llm` is not assigned in any cell visible here - presumably an
# instance of the imported LLM class; confirm it is created before this cell runs.
for token in data[5]['tokens']:
    print(llm.query_ner(token))    

In [None]:
# Run the same query on the whole text of the document at index 6.
# NOTE(review): relies on `llm` already existing in the kernel - confirm where it is created.
llm.query_ner(data[6]['full_text'])

In [None]:
# Print every token that starts with an uppercase character and for which the
# LLM query returns a truthy value. The query only runs for capitalised tokens.
for document in data:
    for candidate in document['tokens']:
        if candidate[0].isupper() and llm.query_ner(candidate):
            print(candidate)

In [None]:
from pii_detection.data_split_utils import shuffle_and_split
train, val, test = shuffle_and_split(data, save_dir="../data")

# Read back each "<name>_shard.json" file under ../data (presumably written by
# shuffle_and_split - confirm) and check that it equals the in-memory split.
in_memory_splits = {"train": train, "val": val, "test": test}
for name, expected_split in in_memory_splits.items():
    with open(f"../data/{name}_shard.json", "r") as f:
        dataset = json.load(f)
    assert dataset == expected_split

In [None]:
# Quick look at the training split: container type, number of entries, and the
# keys of the entry at index 1.
print(type(train))
print(len(train))
print(train[1].keys())

In [None]:
# Show the 'full_text' field of the training entry at index 1.
print(train[1]['full_text'])

In [None]:
# Print the index of every training entry whose labels include 'B-NAME_STUDENT'.
for i, sample in enumerate(train):
    if 'B-NAME_STUDENT' in sample['labels']:
        print(i)

In [None]:
# Print the index of every training entry that has a 'B-NAME_STUDENT' label but
# no 'I-NAME_STUDENT' label.
for i, sample in enumerate(train):
    sample_labels = sample['labels']
    if 'B-NAME_STUDENT' in sample_labels and 'I-NAME_STUDENT' not in sample_labels:
        print(i)

In [None]:
# Pick one training entry by index and show its full text.
doc_index = 3783
print(train[doc_index]['full_text'])

In [None]:
# Show the label sequence of the entry selected by doc_index.
print(train[doc_index]['labels'])