Authors - Shubhankar and Achintya

In [None]:
import pandas as pd
import numpy as np
import transformers
import torch
import os
from tqdm import tqdm
from torch.utils.data import DataLoader
from NERDA.models import NERDA
import helper
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Limit how much of a DataFrame is rendered in cell outputs.
display_limits = {'display.max_rows': 50, 'display.max_columns': 10}
for option_name, limit in display_limits.items():
    pd.set_option(option_name, limit)

In [None]:
# Output locations: the NER working folder and, inside it, the folder for saved models.
path_ner = './ner'
path_ner_model = './ner/ner_model_saved'

# Create both folders up front; exist_ok=True keeps the cell safe to re-run.
for output_dir in (path_ner, path_ner_model):
    os.makedirs(output_dir, exist_ok=True)

In [None]:
# Show the kernel's working directory; the relative paths used in this notebook resolve against it.
os.getcwd()

'c:\\Users\\yashb\\OneDrive\\Desktop\\UChicago Courses\\Quarter-2\\Data Mining - Utku\\PII-DETECTION\\code'

#### Optional: load the data from df_labels.csv (skip this section in the final version)

In [None]:
# Define a function to convert string representation of lists to actual lists
def convert_to_list(string):
    """Parse the string form of a Python literal (e.g. ``"['a', 'b']"``) into an object.

    Used as a ``pandas.read_csv`` converter for columns that were written to
    CSV as stringified lists.

    Args:
        string: Cell value to parse, normally a ``str``.

    Returns:
        The evaluated literal, or ``string`` unchanged when it cannot be
        evaluated as a literal.
    """
    import ast
    try:
        return ast.literal_eval(string)
    except (SyntaxError, ValueError, TypeError, MemoryError, RecursionError):
        # literal_eval may raise any of these on malformed input (for example
        # TypeError for an unhashable dict key); fall back to the raw value.
        return string

In [None]:
# Columns to run through convert_to_list while reading the CSV.
# NOTE(review): 'full_text' is presumably free text, for which the converter just falls back
# to the raw string — confirm whether it needs to be listed here at all.
cols_to_convert = ['full_text', 'tokens', 'trailing_whitespace','labels', 'unique_labels']
df = pd.read_csv(
    '../archive/df_labels.csv',
    converters=dict.fromkeys(cols_to_convert, convert_to_list),
)

#### Continue from here

In [None]:
# Load the dataset through the project-local helper module.
# NOTE(review): judging by the names, the extra return values are the label list and the
# label<->id mappings — confirm against helper.load_data.
df, all_labels, label2id, id2label = helper.load_data()

kaggle train data = 6807
gemma data =  1390
df_mpware data = 2355
all_labels = ['B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM', 'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-EMAIL', 'I-ID_NUM', 'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL', 'I-USERNAME', 'O']


In [None]:
# (rows, columns) of the loaded frame.
df.shape

(12552, 5)

In [None]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,0,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,1,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,2,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,3,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,4,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."


In [None]:
from collections import Counter

# Tally every token-level label across all documents; explode() flattens the per-document lists.
all_token_labels = df['labels'].explode().to_list()
c = Counter(all_token_labels)
c

Counter({'O': 8980724,
         'I-STREET_ADDRESS': 26344,
         'I-NAME_STUDENT': 21765,
         'I-PHONE_NUM': 15365,
         'B-NAME_STUDENT': 10462,
         'B-USERNAME': 4256,
         'B-URL_PERSONAL': 4148,
         'B-PHONE_NUM': 4020,
         'B-EMAIL': 3983,
         'B-STREET_ADDRESS': 3570,
         'B-ID_NUM': 3334,
         'I-ID_NUM': 1117,
         'I-USERNAME': 847,
         'I-EMAIL': 69,
         'I-URL_PERSONAL': 31})

In [None]:
# Column names of the loaded frame.
df.columns

Index(['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels'], dtype='object')

In [None]:
## training on gpu
# Report whether CUDA is usable and, if so, print the properties of the CUDA device.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    print(torch.cuda.get_device_properties("cuda"))
else:
    # get_device_properties raises when no CUDA device is present, so skip it here.
    print("No CUDA device detected")

True
_CudaDeviceProperties(name='NVIDIA GeForce RTX 4050 Laptop GPU', major=8, minor=9, total_memory=6140MB, multi_processor_count=20)


In [None]:
import gc
def clear_memory():
    """Run Python garbage collection, then release PyTorch's cached GPU memory.

    Garbage is collected first so that tensors kept alive only by reference
    cycles are freed before the CUDA caching allocator is asked to release
    its unused blocks.
    """
    gc.collect()
    torch.cuda.empty_cache()

clear_memory()

In [None]:
# Every token (including whitespace tokens such as "\n\n") must have a matching entry in
# the labels column. Check this for all documents, not only the first one.
bool((df['tokens'].map(len) == df['labels'].map(len)).all())

True

In [None]:
# Split into train / validation via the project helper (it receives a copy of df),
# then give both parts a fresh 0..n-1 index.
train_df, valid_df = helper.downsample_df(df.copy())
train_df = train_df.reset_index(drop=True)
valid_df = valid_df.reset_index(drop=True)

print(f"Number of train_df = {len(train_df)}")
print(f"Number of valid_df = {len(valid_df)}")

clear_memory()

Number of true_labels = 4690
Number of false_labels = 5862
true_samples = 4540 true_others = 150
false_samples = 5712 false_others = 150
Number of train_df = 10252
Number of valid_df = 300


In [None]:
# Shapes of the two splits as (rows, columns).
train_df.shape, valid_df.shape

((12252, 6), (300, 6))

In [None]:
# Token count per document, followed by the largest count in the corpus.
lengths = df['tokens'].map(len)
max(lengths)

3298

In [None]:
# Datasets in the shape handed to NERDA: parallel lists of token sequences and tag sequences.
train_dict = {
    'sentences': train_df['tokens'].to_list(),
    'tags': train_df['labels'].to_list(),
}

validate_dict = {
    'sentences': valid_df['tokens'].to_list(),
    'tags': valid_df['labels'].to_list(),
}

In [None]:
# Distinct labels seen in the training split, minus the outside tag 'O'
# (which is handed to NERDA separately as tag_outside).
tag_scheme = list(train_df['labels'].explode().unique())
tag_scheme.remove('O')

In [None]:
# Pretrained checkpoint to fine-tune. Alternative checkpoints (swap the assignment to use one):
#   'studio-ousia/luke-base', 'studio-ousia/luke-large', 'bert-base-uncased',
#   'google/bigbird-roberta-base', 'distilroberta-base', 'roberta-base'
transformer = 'distilbert-base-uncased'


In [None]:
# hyperparameters for training
# hyperparameters for training
training_hyperparameters = dict(
    epochs=8,
    warmup_steps=400,
    train_batch_size=8,  # very sensitive to batch size; running 8 by default
    learning_rate=0.0001,
)
# Extra keyword arguments for the tokenizer.
tokenizer_params = dict(padding=True)

In [None]:
# Build the NERDA model around the chosen transformer.
# NOTE(review): the corpus contains documents of up to 3298 tokens while max_len is 512 —
# presumably NERDA truncates longer inputs; confirm how much labelled PII that drops.
ner_model = NERDA(
    dataset_training=train_dict,
    dataset_validation=validate_dict,
    tag_scheme=tag_scheme,
    tag_outside='O',
    max_len=512,
    transformer=transformer,
    dropout=0.1,
    hyperparameters=training_hyperparameters,
    # Prefer the GPU, but fall back to CPU so the cell also runs on machines without CUDA.
    device='cuda' if torch.cuda.is_available() else 'cpu',
    num_workers=8,
    tokenizer_parameters=tokenizer_params,
)

Device set to: cuda


In [None]:
# Inspect the tokenizer attached to the NERDA model.
ner_model.transformer_tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [None]:
%%time
# Train the model; the datasets and hyperparameters were supplied to the NERDA constructor.
ner_model.train()




 Epoch 1 / 8


  0%|          | 0/1532 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.
100%|██████████| 1532/1532 [11:39<00:00,  2.19it/s]
100%|██████████| 38/38 [00:32<00:00,  1.17it/s]


Train Loss = 0.08099929709809106 Valid Loss = 0.1458051706519511

 Epoch 2 / 8


100%|██████████| 1532/1532 [13:14<00:00,  1.93it/s]  
100%|██████████| 38/38 [00:21<00:00,  1.80it/s]


Train Loss = 0.008920432473420012 Valid Loss = 0.07075342964938165

 Epoch 3 / 8


100%|██████████| 1532/1532 [08:22<00:00,  3.05it/s]
100%|██████████| 38/38 [00:21<00:00,  1.79it/s]


Train Loss = 0.006136141479006213 Valid Loss = 0.03425687145026084

 Epoch 4 / 8


100%|██████████| 1532/1532 [08:27<00:00,  3.02it/s]
100%|██████████| 38/38 [00:21<00:00,  1.79it/s]


Train Loss = 0.004807169571538826 Valid Loss = 0.016978710569769885

 Epoch 5 / 8


100%|██████████| 1532/1532 [08:29<00:00,  3.01it/s]
100%|██████████| 38/38 [00:21<00:00,  1.79it/s]


Train Loss = 0.003389501349394953 Valid Loss = 0.015185661573588348

 Epoch 6 / 8


100%|██████████| 1532/1532 [08:27<00:00,  3.02it/s]
100%|██████████| 38/38 [00:21<00:00,  1.76it/s]


Train Loss = 0.0025931950973748395 Valid Loss = 0.012943141032002657

 Epoch 7 / 8


100%|██████████| 1532/1532 [08:27<00:00,  3.02it/s]
100%|██████████| 38/38 [00:21<00:00,  1.78it/s]


Train Loss = 0.0019269366479018064 Valid Loss = 0.008050331133242588

 Epoch 8 / 8


100%|██████████| 1532/1532 [08:25<00:00,  3.03it/s]
100%|██████████| 38/38 [00:21<00:00,  1.77it/s]

Train Loss = 0.0014030131743304234 Valid Loss = 0.005494640699979706
CPU times: total: 8min 43s
Wall time: 1h 18min 36s





'Model trained successfully'

In [None]:
# Persist the whole NERDA object with torch.save.
model_path = path_ner_model + '/' + transformer + '_ner_model_data_augmented_v2.pt'
# Model ids such as 'google/bigbird-roberta-base' contain a '/', which makes the target a
# sub-directory of path_ner_model; create it so the save does not fail on a missing folder.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
torch.save(ner_model, model_path)

In [None]:
# Reload the checkpoint written by the save cell above (the "_v2" file). The previous
# hard-coded "_v1" path loaded a different model than the one that was just trained.
# weights_only=False is required because the file is a pickled NERDA object rather than a
# plain state dict. Pickle can execute arbitrary code, so only load files you created.
ner_model = torch.load(
    path_ner_model + '/' + transformer + '_ner_model_data_augmented_v2.pt',
    weights_only=False,
)

In [None]:
# Per-tag F1 / precision / recall on the validation data.
ner_model.evaluate_performance(validate_dict)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


Unnamed: 0,Level,F1-Score,Precision,Recall
0,B-NAME_STUDENT,0.801303,0.848276,0.759259
1,I-NAME_STUDENT,0.40226,0.410138,0.394678
2,B-ID_NUM,0.489796,0.489796,0.489796
3,B-USERNAME,0.331492,0.333333,0.32967
4,B-EMAIL,0.313953,0.303371,0.325301
5,B-URL_PERSONAL,0.247619,0.245283,0.25
6,B-PHONE_NUM,0.258824,0.258824,0.258824
7,I-PHONE_NUM,0.624454,0.621739,0.627193
8,I-USERNAME,0.0,0.0,0.0
9,B-STREET_ADDRESS,0.4,0.402299,0.397727


In [None]:
# Empty the CUDA cache and collect garbage before running predictions.
clear_memory()

In [None]:
# Quick sanity check on a short lowercase sentence; predict_text returns (tokens, tags).
text = '''what movies star cameron diaz'''
ner_model.predict_text(text)

([['what', 'movies', 'star', 'cameron', 'diaz']],
 [['O', 'O', 'O', 'B-NAME_STUDENT', 'I-NAME_STUDENT']])

In [None]:
def single_sent_test(text, model=None):
    """Run NER on ``text`` and return the result as a token/tag table.

    Args:
        text: Raw text to tag.
        model: Object exposing ``predict_text(text)`` that returns a pair of
            per-sentence token lists and per-sentence tag lists. Defaults to
            the notebook-level ``ner_model``.

    Returns:
        pandas.DataFrame with one row per token and the columns ``words`` and
        ``tags``, indexed 0..n-1 across all sentences.
    """
    if model is None:
        # Backward-compatible default: use the model trained/loaded in this notebook.
        model = ner_model

    prediction = model.predict_text(text)
    sentences, sentence_tags = prediction[0], prediction[1]

    # One row per sentence first, then explode so that every token / tag gets its own row.
    words_df = pd.DataFrame({'words': sentences}).explode('words', ignore_index=True)
    tags_df = pd.DataFrame({'tags': sentence_tags}).explode('tags', ignore_index=True)

    # Both frames share the fresh 0..n-1 index, so the join lines tokens up with their tags.
    return words_df.join(tags_df, how='left')

In [None]:
# Try the helper on a sentence containing a name, an e-mail address and a phone number.
# NOTE(review): these look like real personal details — consider replacing them with
# synthetic values before sharing or committing the notebook.
text = '''My name is Yash Bhardwaj, my email id is yashb@uchicago.edu. My phone number is (224) 706-4831'''
single_sent_test(text)

Unnamed: 0,words,tags
0,My,O
1,name,O
2,is,O
3,Yash,B-NAME_STUDENT
4,Bhardwaj,I-NAME_STUDENT
5,",",O
6,my,O
7,email,O
8,id,O
9,is,O


In [None]:
# Show the tag list passed to NERDA (the outside tag 'O' was removed from it earlier).
tag_scheme

['B-NAME_STUDENT',
 'I-NAME_STUDENT',
 'B-URL_PERSONAL',
 'B-EMAIL',
 'B-ID_NUM',
 'I-URL_PERSONAL',
 'B-USERNAME',
 'B-PHONE_NUM',
 'I-PHONE_NUM',
 'B-STREET_ADDRESS',
 'I-STREET_ADDRESS']

In [None]:
# Empty the CUDA cache and collect garbage.
clear_memory()