In [1]:
import os
import gc
from tqdm.auto import tqdm
import json

import numpy as np 
import pandas as pd 
from itertools import chain

from text_unidecode import unidecode
from typing import Dict, List, Tuple
import codecs

from datasets import concatenate_datasets,load_dataset,load_from_disk
from sklearn.metrics import log_loss
from transformers import AutoModel, AutoTokenizer, AdamW, DataCollatorWithPadding

import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import pytorch_lightning as pl
from pytorch_lightning import Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

ModuleNotFoundError: No module named 'text_unidecode'

In [3]:
from datasets import Dataset, load_from_disk
import pickle
import re
from transformers import TrainingArguments, AutoConfig, AutoModelForTokenClassification, DataCollatorForTokenClassification
from transformers import get_polynomial_decay_schedule_with_warmup,get_cosine_schedule_with_warmup,get_linear_schedule_with_warmup
from transformers.tokenization_utils_base import BatchEncoding, PreTrainedTokenizerBase
from transformers import DataCollatorWithPadding,DataCollatorForTokenClassification

from copy import deepcopy

from transformers.models.llama.modeling_llama import *
from transformers.modeling_outputs import TokenClassifierOutput
import sys
!cp -r /kaggle/input/peft-main /kaggle/working



In [4]:
sys.path.append('/kaggle/working/peft-main/src')
from peft import get_peft_model, LoraConfig, TaskType

# ⚙️ Config

This notebook was run on my local instance; you will have to change the paths for Kaggle accordingly.

In [5]:
class config:
    """Static settings for this inference notebook.

    The class is used as a plain namespace (it is never instantiated here);
    later cells also attach extra attributes to it at runtime
    (e.g. ``data_length``, ``len_token``, ``valid_stride``).
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    seed = 69
    # dataset paths
    train_dataset_path = "/kaggle/input/pii-detection-removal-from-educational-data/train.json"
    test_dataset_path = "/kaggle/input/pii-detection-removal-from-educational-data/test.json"
    # Previously a developer-local path under /home/...; it now points at the
    # competition input directory like the two dataset paths above.
    sample_submission_path = "/kaggle/input/pii-detection-removal-from-educational-data/sample_submission.csv"

    # Directory where the tokenized dataset is written (note the trailing slash:
    # other cells build paths by plain string concatenation).
    save_dir="/kaggle/working/"
    # tokenizer params
    downsample = 0.5        # not read by the code visible in this notebook
    truncation = True       # forwarded to the tokenizer call in tokenize_row
    padding = False #'max_length'
    max_length = 2536       # forwarded to the tokenizer call in tokenize_row
    doc_stride = 512        # not read by the code visible in this notebook
    freeze_layers = 6       # not read by the code visible in this notebook
    # model params
    model_name = "/kaggle/input/h2o-danube-1-8b-base"

    # Label set; its length sizes the classifier output layer in PIIModel.
    target_cols = ['B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM', 
    'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM', 
    'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL','O']

    # When None, the test set is tokenized from scratch in the tokenization cell.
    load_from_disk = None
    # training params
    learning_rate = 1e-5
    batch_size = 1
    epochs = 4
    NFOLDS = 4
    trn_fold = 0            # the only fold processed by the fold loop below
    # Checkpoint whose 'state_dict' is loaded into PIIModel before prediction.
    ckpt_path = '/kaggle/input/h2o-danube-18b-extv1-1400-cv955/last.ckpt'
    exp_path = '/kaggle/input/h2o-danube-18b-extv1-1400-cv955'
    
# Seed the random number generators (pytorch_lightning helper) with the configured seed.
seed_everything(config.seed)

69

In [6]:
# Create the output directory if it is missing; exist_ok avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(config.save_dir, exist_ok=True)

# 📊 Preprocessing

In [7]:
# Read the competition JSON files. The context managers close the file
# handles deterministically instead of leaving them to the garbage collector.
with open(config.train_dataset_path) as train_file:
    data = json.load(train_file)
with open(config.test_dataset_path) as test_file:
    test_data = json.load(test_file)

print('num_samples:', len(data))
print(data[0].keys())


num_samples: 6807
dict_keys(['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels'])


In [8]:
# Distinct label strings found in the training documents, in sorted order;
# the position in this list is the class id.
all_labels = sorted({label for record in data for label in record["labels"]})
label2id = {label: class_id for class_id, label in enumerate(all_labels)}
id2label = dict(enumerate(all_labels))

print(id2label)

{0: 'B-EMAIL', 1: 'B-ID_NUM', 2: 'B-NAME_STUDENT', 3: 'B-PHONE_NUM', 4: 'B-STREET_ADDRESS', 5: 'B-URL_PERSONAL', 6: 'B-USERNAME', 7: 'I-ID_NUM', 8: 'I-NAME_STUDENT', 9: 'I-PHONE_NUM', 10: 'I-STREET_ADDRESS', 11: 'I-URL_PERSONAL', 12: 'O'}


In [9]:
# Load the tokenizer stored alongside the base model.
tokenizer = AutoTokenizer.from_pretrained(config.model_name)
# Reuse EOS as the padding token so batches can be padded by the collator
# (presumably this tokenizer ships without a pad token — TODO confirm).
tokenizer.pad_token = tokenizer.eos_token

In [10]:
# Put the training records into a DataFrame and preview the first rows.
df_train = pd.DataFrame(data)
df_train.head(5)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O..."
3,20,Design Thinking for Innovation\n\nSindy Samaca...,"[Design, Thinking, for, Innovation, \n\n, Sind...","[True, True, True, False, False, True, False, ...","[O, O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT..."
4,56,Assignment: Visualization Reflection Submitt...,"[Assignment, :, , Visualization, , Reflecti...","[False, False, False, False, False, False, Fal...","[O, O, O, O, O, O, O, O, O, O, O, O, B-NAME_ST..."


In [11]:
# Assign each document to a fold by its id modulo the configured fold count
# (this used to be a literal 4 that had to be kept in sync with config.NFOLDS).
df_train['fold'] = df_train['document'] % config.NFOLDS
df_train.head(3)

Unnamed: 0,document,full_text,tokens,trailing_whitespace,labels,fold
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F...","[O, O, O, O, O, O, O, O, O, B-NAME_STUDENT, I-...",3
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,...","[B-NAME_STUDENT, I-NAME_STUDENT, O, O, O, O, O...",2
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,...","[O, O, O, O, B-NAME_STUDENT, I-NAME_STUDENT, O...",0


In [12]:
# Build the inference frame from the competition test file.
df_test = pd.DataFrame(test_data)
# Alternative kept for local validation: run on the held-out training fold instead.
# df_test = df_train[df_train['fold']==config.trn_fold].reset_index(drop=True)
df_test.head(3)

Unnamed: 0,document,full_text,tokens,trailing_whitespace
0,7,Design Thinking for innovation reflexion-Avril...,"[Design, Thinking, for, innovation, reflexion,...","[True, True, True, True, False, False, True, F..."
1,10,Diego Estrada\n\nDesign Thinking Assignment\n\...,"[Diego, Estrada, \n\n, Design, Thinking, Assig...","[True, False, False, True, True, False, False,..."
2,16,Reporting process\n\nby Gilberto Gamboa\n\nCha...,"[Reporting, process, \n\n, by, Gilberto, Gambo...","[True, False, False, True, True, False, False,..."


In [13]:
def tokenize_row(example):
    """Rebuild a document's text from its word tokens and tokenize it.

    Args:
        example: mapping with "tokens" (list of word strings) and
            "trailing_whitespace" (parallel list of booleans).

    Returns:
        dict with the tokenizer's "input_ids", "attention_mask" and
        "offset_mapping", plus "token_map": one entry per character of the
        rebuilt text giving the index of the word token that character
        belongs to, or -1 for a separator space inserted after a token.
    """
    pieces = []
    token_map = []

    word_stream = zip(example["tokens"], example["trailing_whitespace"])
    for word_index, (word, has_trailing_space) in enumerate(word_stream):
        pieces.append(word)
        # every character of this word points back at the word's index
        token_map += [word_index] * len(word)
        if has_trailing_space:
            pieces.append(" ")
            token_map.append(-1)

    encoding = tokenizer(
        "".join(pieces),
        return_offsets_mapping=True,
        truncation=config.truncation,
        max_length=config.max_length,
    )

    return {
        "input_ids": encoding.input_ids,
        "attention_mask": encoding.attention_mask,
        "offset_mapping": encoding.offset_mapping,
        "token_map": token_map,
    }

In [14]:
# Summary statistics of the numeric columns of the test frame.
df_test.describe()

Unnamed: 0,document
count,10.0
mean,62.7
std,46.142412
min,7.0
25%,17.0
50%,71.0
75%,101.25
max,123.0


In [15]:
%%time
if config.load_from_disk is None:

    config.valid_stride = True
    print(len(df_test))

    # Tokenize one document per call, spread over two worker processes.
    ds = Dataset.from_pandas(df_test)
    ds = ds.map(
        tokenize_row,
        batched=False,
        num_proc=2,
        desc="Tokenizing",
    )

    ds.save_to_disk(f"{config.save_dir}test.dataset")
    print("Saving dataset to disk:", config.save_dir)
else:
    # Previously `ds` stayed undefined on this path, so any later use of it
    # raised NameError; load the pre-tokenized dataset the setting points at.
    ds = load_from_disk(config.load_from_disk)

      
        

10
    

Tokenizing #0:   0%|          | 0/5 [00:00<?, ?ex/s]

Tokenizing #1:   0%|          | 0/5 [00:00<?, ?ex/s]

Saving dataset to disk: /kaggle/working/
CPU times: user 85.6 ms, sys: 59.4 ms, total: 145 ms
Wall time: 465 ms


In [16]:
# Show which columns a tokenized example carries.
ds[0].keys()

dict_keys(['document', 'full_text', 'tokens', 'trailing_whitespace', 'input_ids', 'attention_mask', 'offset_mapping', 'token_map'])

# 🔝 Competition Metrics

In [17]:
import pandas as pd

def predictions_to_df(preds, ds, id2label=id2label):
    """Map per-subword class ids back to word tokens and tabulate the PII hits.

    Args:
        preds: iterable of 1-D tensors of class ids, one tensor per document,
            in the same order as ``ds``.
        ds: dataset exposing the "token_map", "offset_mapping", "tokens" and
            "document" columns (as produced by ``tokenize_row``).
        id2label: mapping from class id to label string.

    Returns:
        pandas.DataFrame with columns ``document``, ``token``, ``label``,
        ``token_str`` and ``row_id``. Only non-"O" predictions are kept, and
        for each (document, token) pair only the first prediction encountered
        is recorded.
    """
    pairs = set()
    document, token, label, token_str = [], [], [], []
    for p, token_map, offsets, tokens, doc in zip(preds, ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"]):
        p = p.cpu().detach().numpy()

        # zip stops at the shorter sequence, so predictions beyond the last
        # offset entry are never visited.
        for token_pred, (start_idx, end_idx) in zip(p, offsets):
            label_pred = id2label[(token_pred)]

            # (0, 0) offsets carry no text (presumably special tokens) — skip.
            if start_idx + end_idx == 0: continue

            # A subword that begins on a separator space really starts one
            # character later.
            if token_map[start_idx] == -1:
                start_idx += 1

            # Skip characters belonging to whitespace-only word tokens such as "\n\n".
            # The -1 separator sentinel is tested explicitly: it used to be fed
            # into tokens[...] as index -1, so the scan silently depended on
            # whether the document's *last* token happened to be whitespace.
            while (
                start_idx < len(token_map)
                and token_map[start_idx] != -1
                and tokens[token_map[start_idx]].isspace()
            ):
                start_idx += 1

            if start_idx >= len(token_map): break

            token_id = token_map[start_idx]

            # ignore "O" predictions and predictions landing on a separator space
            if label_pred == "O" or token_id == -1:
                continue

            pair = (doc, token_id)

            # keep only the first prediction for each word token
            if pair in pairs:
                continue

            document.append(doc)
            token.append(token_id)
            label.append(label_pred)
            token_str.append(tokens[token_id])
            pairs.add(pair)

    df = pd.DataFrame({
        "document": document,
        "token": token,
        "label": label,
        "token_str": token_str
    })
    df["row_id"] = list(range(len(df)))

    return df


# 🧠 Model

In [18]:

def process_predictions(flattened_preds, threshold=0.875, o_index=12):
    """Turn per-token logits into class ids, biased against the "O" class.

    For every token the "O" probability (softmax over the last dimension) is
    compared with ``threshold``: if it is below the threshold the most likely
    non-"O" class is chosen, otherwise the plain argmax over all classes.

    Args:
        flattened_preds: iterable of 2-D tensors shaped (tokens, classes).
        threshold: minimum "O" probability required to keep the plain argmax.
        o_index: column holding the "O" class. The default of 12 matches the
            13-label layout used in this notebook (where "O" is last), for
            which the result is identical to the earlier hard-coded version.

    Returns:
        list of 1-D integer tensors of class ids, one per input tensor.

    Raises:
        IndexError: if ``o_index`` is not a valid column for a prediction tensor.
    """
    preds_final = []
    for predictions in flattened_preds:
        num_classes = predictions.shape[-1]
        if not -num_classes <= o_index < num_classes:
            raise IndexError(
                f"o_index {o_index} is out of range for {num_classes} classes"
            )
        o_col = o_index if o_index >= 0 else o_index + num_classes

        probabilities = torch.softmax(predictions, dim=-1)
        best_overall = predictions.argmax(-1)

        # Argmax over every column except "O". Columns to the right of "O"
        # shift left by one in the concatenation, so shift their index back.
        non_o_probabilities = torch.cat(
            [probabilities[:, :o_col], probabilities[:, o_col + 1:]], dim=-1
        )
        best_non_o = non_o_probabilities.argmax(-1)
        best_non_o = best_non_o + (best_non_o >= o_col).long()

        o_probability = probabilities[:, o_col]
        preds_final.append(
            torch.where(o_probability < threshold, best_non_o, best_overall)
        )

    return preds_final



In [19]:
import random

from peft import get_peft_model, LoraConfig, TaskType
from transformers.models.llama.modeling_llama import *
from transformers.modeling_outputs import TokenClassifierOutput

class LlamaForTokenClassification(LlamaPreTrainedModel):
    """Llama backbone wrapper that returns per-token hidden states.

    Despite its name, this class has no classification layer: ``num_labels``
    is stored but not used here, and ``forward`` returns the first element of
    the backbone output. The token classifier is applied by the caller.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        # Attribute name `model` is kept as-is; pretrained weights are matched by name.
        self.model = LlamaModel(config)

        self.post_init()

    def get_input_embeddings(self):
        """Return the backbone's token embedding module."""
        return self.model.embed_tokens

    def set_input_embeddings(self, value):
        """Replace the backbone's token embedding module."""
        self.model.embed_tokens = value

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> torch.Tensor:
        r"""
        Run the Llama backbone and return its first output.

        All arguments except `labels` are forwarded unchanged to the wrapped
        `LlamaModel`. `labels` is accepted for signature compatibility only:
        it is never read and no loss is computed here.

        Returns:
            `outputs[0]` of the backbone call (the last hidden state for
            `LlamaModel`), not a `ModelOutput` object.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        # Index 0 works for both the tuple and the ModelOutput return forms.
        sequence_output = outputs[0]


        return sequence_output


class LSTMHead(nn.Module):
    """Bidirectional multi-layer LSTM run over a batch-first sequence."""

    def __init__(self, in_features, hidden_dim, n_layers):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=in_features,
            hidden_size=hidden_dim,
            num_layers=n_layers,
            batch_first=True,
            bidirectional=True,
            dropout=0.,
        )
        # NOTE(review): the LSTM is bidirectional, so forward() emits
        # 2 * hidden_dim features per step while this attribute stores
        # hidden_dim. Left unchanged to preserve the existing attribute value.
        self.out_features = hidden_dim

    def forward(self, x):
        """Return the LSTM output for every time step; final states are discarded."""
        self.lstm.flatten_parameters()
        per_step_states, _ = self.lstm(x)
        return per_step_states


    
    
class PIIModel(pl.LightningModule):
    """LoRA-adapted Llama backbone, BiLSTM head and linear token classifier.

    The submodule attribute names (``transformers_model``, ``head``,
    ``output``) are deliberately unchanged: a module's state_dict is keyed by
    them, and this notebook loads a saved state_dict into the model.
    """

    def __init__(self,config):
        """Build the model from a config namespace.

        Reads ``config.model_name`` and ``config.target_cols``; also relies on
        the notebook-level ``id2label`` / ``label2id`` mappings.
        """
        super().__init__()
        self.cfg = config
        self.model_config = AutoConfig.from_pretrained(
            config.model_name,
        )

        hidden_dropout_prob: float = 0.1
        layer_norm_eps: float = 1e-7
        # The backbone below is loaded separately via from_pretrained, so within
        # this class model_config is only read for its hidden_size.
        self.model_config.update(
            {
                "output_hidden_states": True,
                "hidden_dropout_prob": hidden_dropout_prob,
                "layer_norm_eps": layer_norm_eps,
                "add_pooling_layer": False,
            }
        )

        self.transformers_model = LlamaForTokenClassification.from_pretrained(
        config.model_name, num_labels=len(self.cfg.target_cols), id2label=id2label, label2id=label2id, 
        )
        peft_config = LoraConfig(task_type=TaskType.TOKEN_CLS, inference_mode=False, r=8, lora_alpha=32, lora_dropout=0.0)
        self.transformers_model = get_peft_model(self.transformers_model, peft_config)
        self.transformers_model.print_trainable_parameters()
        # hidden_dim is half the backbone width so that the bidirectional LSTM
        # output matches the input width expected by self.output.
        self.head = LSTMHead(in_features=self.model_config.hidden_size, hidden_dim=self.model_config.hidden_size//2, n_layers=2)
        self.output = nn.Linear(self.model_config.hidden_size, len(self.cfg.target_cols))

        self.loss_function = nn.CrossEntropyLoss(reduction='mean',ignore_index=-100) 
        self.validation_step_outputs = []

    def forward(self, input_ids, attention_mask,train):
        """Return ``(logits, None)`` for a batch.

        ``train`` is accepted for call compatibility and is not used.
        """
        transformer_out = self.transformers_model(input_ids,attention_mask = attention_mask)
        sequence_output = self.head(transformer_out)
        logits = self.output(sequence_output)
        # The second slot used to be the bare name `_`, which is not defined in
        # this scope and only resolves when an interactive session happens to
        # provide it. Callers in this notebook read element [0] only.
        return (logits, None)

    def training_step(self,batch,batch_idx):
        """Compute and log the cross-entropy loss for one batch."""
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        target = batch['labels'] 

        outputs = self(input_ids,attention_mask,train=True)
        output = outputs[0]

        # Flatten to (batch * tokens, classes) vs (batch * tokens,); targets
        # equal to -100 are ignored by the loss.
        loss = self.loss_function(output.view(-1,len(self.cfg.target_cols)), target.view(-1))

        self.log('train_loss', loss , prog_bar=True)
        return {'loss': loss}
    
    def train_epoch_end(self,outputs):
        """Average the per-step losses in ``outputs`` and print them.

        NOTE(review): this method name does not look like a Lightning hook —
        confirm whether anything actually calls it.
        """
        avg_loss = torch.stack([x['loss'] for x in outputs]).mean()
        # Use the module's own epoch counter instead of a global `trainer`
        # variable, which is not defined anywhere in this notebook.
        print(f'epoch {self.current_epoch} training loss {avg_loss}')
        return {'train_loss': avg_loss} 
    
    def train_dataloader(self):
        """Return the loader stored on ``self._train_dataloader`` (set externally)."""
        return self._train_dataloader 
    
    def validation_dataloader(self):
        """Return the loader stored on ``self._validation_dataloader`` (set externally)."""
        return self._validation_dataloader

    def get_optimizer_params(self, encoder_lr, decoder_lr, weight_decay=0.0):
        """Build optimizer parameter groups with separate learning rates.

        Groups: backbone weights with weight decay, backbone bias/LayerNorm
        terms without decay, and every parameter outside ``transformers_model``
        at ``decoder_lr``. Not used by ``configure_optimizers`` as written.
        """
        # (Removed an unused local that read the notebook-global `model`.)
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in self.transformers_model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in self.transformers_model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in self.named_parameters() if "transformers_model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    def configure_optimizers(self):
        """Create AdamW with a warmup + polynomial-decay schedule stepped per batch.

        Reads ``learning_rate``, ``data_length``, ``batch_size`` and ``epochs``
        from ``self.cfg`` (``data_length`` is attached to the config at runtime).
        """
        # Read the learning rate from the instance's config rather than the
        # notebook-global `config`.
        optimizer = AdamW(self.parameters(), lr = self.cfg.learning_rate)

        epoch_steps = self.cfg.data_length
        batch_size = self.cfg.batch_size

        # 10% of one epoch's steps; cast because the float product would
        # otherwise hand the scheduler a float step count.
        warmup_steps = int(0.1 * epoch_steps // batch_size)
        training_steps = self.cfg.epochs * epoch_steps // batch_size
        # scheduler = get_linear_schedule_with_warmup(optimizer,warmup_steps,training_steps,-1)
        scheduler = get_polynomial_decay_schedule_with_warmup(optimizer, warmup_steps, training_steps, lr_end=1e-6, power=3.0)

        lr_scheduler_config = {
                'scheduler': scheduler,
                'interval': 'step',
                'frequency': 1,
            }

        return {'optimizer': optimizer, 'lr_scheduler': lr_scheduler_config}
    

In [20]:
# Batch collator: pads each batch with the tokenizer, to a length that is a multiple of 16.
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

In [21]:
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm

def predict(data_loader, model, config, ds_test):
    """Run the model over a loader and return the decoded predictions.

    Args:
        data_loader: iterable of batches (dicts of tensors) containing at
            least 'input_ids' and 'attention_mask'.
        model: module called as ``model(input_ids=..., attention_mask=...,
            train=False)``; element [0] of its result is used as the logits.
        config: object whose ``device`` attribute selects where inference runs.
        ds_test: dataset passed through to ``predictions_to_df`` for decoding.

    Returns:
        The DataFrame produced by ``predictions_to_df``.
    """
    model.eval()
    model.to(config.device)

    model_input_keys = ('input_ids', 'attention_mask')
    batch_logits = []

    # No gradients are needed anywhere in the inference loop.
    with torch.no_grad():
        for batch in tqdm(data_loader):
            inputs = {
                key: val.reshape(val.shape[0], -1).to(config.device)
                for key, val in batch.items()
                if key in model_input_keys
            }
            outputs = model(
                input_ids=inputs['input_ids'],
                attention_mask=inputs['attention_mask'],
                train=False,
            )[0]
            batch_logits.append(outputs)  # raw logits, still batched

    # Un-batch: iterating a tensor yields its slices along the first dimension,
    # i.e. one logits matrix per document.
    per_document_logits = []
    for logits in batch_logits:
        per_document_logits.extend(logits)

    processed_predictions = process_predictions(per_document_logits)
    return predictions_to_df(processed_predictions, ds_test)

In [22]:
# Inference driver: only the fold selected by config.trn_fold is processed.
for fold in range(config.NFOLDS):
    if fold != config.trn_fold:
        continue

    print(f"====== FOLD RUNNING {fold}======")

    keep_cols = {"input_ids", "attention_mask"}

    # Load the tokenized test set once: the full copy (test_ds2) is needed to
    # decode predictions, the column-stripped copy (test_ds) feeds the loader.
    test_ds2 = load_from_disk(f'{config.save_dir}test.dataset')
    test_ds = test_ds2.remove_columns([c for c in test_ds2.column_names if c not in keep_cols])
    # Runtime attributes attached to the shared config namespace.
    config.data_length = len(test_ds)
    config.len_token = len(tokenizer)

    print('Dataset Loaded....')
    print((test_ds[0].keys()))
    print("Generating Test DataLoader")
    test_dataloader = DataLoader(test_ds, batch_size = config.batch_size, shuffle = False, num_workers= 4, pin_memory=False,collate_fn = collator)


    model = PIIModel(config)
    if config.ckpt_path is not None:
        # Deserialize onto the CPU first; predict() moves the model to the device.
        # NOTE: torch.load unpickles the file — only use checkpoints from a trusted source.
        model.load_state_dict(torch.load(config.ckpt_path, map_location='cpu')['state_dict'])

    print("prediction on test data")

    sub_df = predict(test_dataloader, model, config, test_ds2)

    # Release the model and loader before the remaining cells run.
    del model,test_dataloader,test_ds
    gc.collect()
    torch.cuda.empty_cache()


You are using a model of type mistral to instantiate a model of type llama. This is not supported for all configurations of models and can yield errors.


Dataset Loaded....
dict_keys(['input_ids', 'attention_mask'])
Generating Test DataLoader
trainable params: 1,597,440 || all params: 1,750,878,720 || trainable%: 0.09123647353484313
prediction on test data


100%|██████████| 10/10 [00:10<00:00,  1.03s/it]


In [23]:
# sub_df2 = pd.read_csv('suba_949_public.csv')
# sub_df2.head(2)

In [24]:
# Remove intermediate files from the working directory (presumably so that only
# the submission written afterwards remains — TODO confirm intent).
# NOTE(review): the recorded output shows rm reporting "Device or resource busy"
# for the directory itself.
!rm -rf /kaggle/working/


rm: cannot remove '/kaggle/working/': Device or resource busy


In [25]:
# Restrict the prediction frame to the sample submission's columns, in its order.
sample_sub = pd.read_csv('/kaggle/input/pii-detection-removal-from-educational-data/sample_submission.csv')
submission_columns = sample_sub.columns
sub_df = sub_df[submission_columns]

# sub_fin = pd.concat([sub_df, sub_df2],axis=0).reset_index(drop=True)
# sub_fin['row_id'] = [i for i in range(len(sub_fin))]

# Write the final file without the DataFrame index.
sub_df.to_csv('submission.csv',index=False)

In [26]:
# sub_fin.head(2)