## Import packages

In [None]:
import numpy as np
import pandas as pd
import json
from tqdm import tqdm
from transformers import pipeline
import matplotlib.pyplot as plt
import plotly.express as px
from flashtext import KeywordProcessor
from tqdm._tqdm_notebook import tqdm_notebook
tqdm_notebook.pandas()
from nltk import tokenize
from transformers import AdamW
from collections import Counter
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import AutoModelForTokenClassification
from transformers import AutoTokenizer, AutoConfig, TrainingArguments, Trainer, EvalPrediction
import sys
import time
import gc
import pickle
import re

In [None]:
# !pip install GPUtil
# import torch
# from GPUtil import showUtilization as gpu_usage
# from numba import cuda

# def free_gpu_cache():
#     print("Initial GPU Usage")
#     gpu_usage()                             

#     torch.cuda.empty_cache()

#     cuda.select_device(0)
#     cuda.close()
#     cuda.select_device(0)

#     print("GPU Usage after emptying the cache")
#     gpu_usage()

In [None]:
!nvidia-smi 

## Load input files

In [None]:
train_data = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/train.csv')
submission_file = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')

## EDA

### Train data samples

In [None]:
def jaccard(str1, str2):
    """Return the Jaccard similarity between the word sets of two strings.

    Both strings are lower-cased and split on whitespace; the score is the
    size of the intersection of the two word sets divided by the size of
    their union.

    Returns:
        A float in [0, 1]. When neither string contains a word the ratio is
        0/0; 0.0 is returned in that case instead of raising
        ZeroDivisionError.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    union_size = len(a | b)
    if union_size == 0:
        # Both inputs are empty or whitespace-only: nothing can overlap.
        return 0.0
    return len(a & b) / union_size

In [None]:
def clean_text(txt):
    """Lower-case ``txt`` and replace each non-alphanumeric run with one space.

    The argument goes through ``str()`` first, so non-string values are
    cleaned via their string form. Leading/trailing spaces produced by the
    substitution are kept.
    """
    lowered = str(txt).lower()
    non_alnum_run = '[^A-Za-z0-9]+'
    return re.sub(non_alnum_run, ' ', lowered)

In [None]:
print('Train data shape:', train_data.shape)
train_data.head()

In [None]:
file_path = '../input/coleridgeinitiative-show-us-the-data/train/0008656f-0ba2-4632-8602-3017b44c2e90.json'
file_id = file_path.split('/')[-1].split('.json')[0]
with open(file_path) as json_file:
    data = json.load(json_file)

In [None]:
print('Number of elements:', len(data))
combined_text = ' '.join([x['text'] for x in data])

In [None]:
len(combined_text), combined_text[0:1000]

In [None]:
# Look the label up once instead of re-filtering train_data for every use.
dataset_label = train_data.loc[train_data['Id'] == file_id, 'dataset_label'].iloc[0]
print('Dataset label:', dataset_label, '\n')

start_index = combined_text.find(dataset_label)
if start_index == -1:
    # str.find returns -1 on a miss; slicing around -1 would show unrelated text.
    print('Dataset label not found in the document text')
else:
    # Clamp the left edge: a negative start would wrap around to the end of the
    # string and yield an empty/misleading excerpt for labels near the start.
    print('Text for the dataset label\n', combined_text[max(start_index - 100, 0):start_index + 100])

In [None]:
train_data[train_data['Id']==file_id]['pub_title'].iloc[0],train_data[train_data['Id']==file_id]['dataset_title'].iloc[0],train_data[train_data['Id']==file_id]['dataset_label'].iloc[0],train_data[train_data['Id']==file_id]['cleaned_label'].iloc[0]

In [None]:
print('Train data shape:', train_data.shape)
print('Number of unique IDs in train data:', train_data['Id'].nunique())
print('Number of unique pub_titles:', train_data['pub_title'].nunique())
print('Number of unique dataset titles:', train_data['dataset_title'].nunique())
print('Number of unique dataset labels:', train_data['dataset_label'].nunique())
print('Number of unique cleaned labels:', train_data['cleaned_label'].nunique())

In [None]:
print(submission_file.shape,submission_file['Id'].nunique())
submission_file

### Identifying problem as NER

In [None]:
# Check if problem is NER (named entity recognition) - dataset label should be somewhere in text for each row

result_columns = ['Train Id','Number of elements','Total document length (words)','Total document length (char)','start_index in text for dataset label','start_index in section_title for dataset label']
all_train_ids = train_data['Id'].unique().tolist()

# First dataset_label listed for every Id, resolved once up front instead of
# scanning the whole train_data frame for each document.
first_label_by_id = train_data.drop_duplicates(subset='Id').set_index('Id')['dataset_label'].to_dict()

# Rows are collected in a list and turned into a DataFrame once at the end;
# appending with .loc[count] re-allocates the frame on every iteration.
rows = []
for i in tqdm(all_train_ids):
    file_path = '../input/coleridgeinitiative-show-us-the-data/train//' + i + '.json'
    file_id = file_path.split('/')[-1].split('.json')[0]
    with open(file_path) as json_file:
        data = json.load(json_file)

    combined_section_title = ' '.join([x['section_title'].strip().lower() for x in data])
    combined_text = ' '.join([x['text'].strip().lower() for x in data])

    try:
        label = first_label_by_id[file_id].lower().strip()
    except AttributeError:
        # The label is not a string (e.g. NaN), so there is nothing to search for.
        start_index_text = np.nan
        start_index_section_title = np.nan
    else:
        # str.find returns -1 when the label does not occur.
        start_index_text = combined_text.find(label)
        start_index_section_title = combined_section_title.find(label)

    rows.append((i, len(data), len(combined_text.strip().split(' ')), len(combined_text.strip()), start_index_text, start_index_section_title))

number_of_elements_in_text = pd.DataFrame(rows, columns=result_columns)

In [None]:
len(all_train_ids)

In [None]:
print(number_of_elements_in_text.shape)
number_of_elements_in_text.head()

In [None]:
print('Percentage of documents with dataset label in text:', number_of_elements_in_text[(number_of_elements_in_text['start_index in text for dataset label']!=-1)].shape[0]/number_of_elements_in_text.shape[0] * 100,'%')

In [None]:
number_of_elements_in_text[(number_of_elements_in_text['start_index in text for dataset label']==-1)].shape

In [None]:
# All dataset labels occur in the text
number_of_elements_in_text[(number_of_elements_in_text['start_index in text for dataset label']==-1) & (number_of_elements_in_text['start_index in section_title for dataset label']!=-1)].shape

In [None]:
# Very few dataset labels occur in section_title
number_of_elements_in_text[(number_of_elements_in_text['start_index in section_title for dataset label']!=-1)].shape

In [None]:
number_of_elements_in_text[number_of_elements_in_text['start_index in text for dataset label']==-1].head()

In [None]:
file_path = '../input/coleridgeinitiative-show-us-the-data/train/c9050bc3-2551-4f41-9f40-2851fc705c3c.json'
file_id = file_path.split('/')[-1].split('.json')[0]
with open(file_path) as json_file:
    data = json.load(json_file)

In [None]:
# Full-form (unabbreviated) and rare dataset names occur neither in text nor in section_title
train_data[train_data['Id']==file_id]

In [None]:
combined_section_title = ' '.join([x['section_title'].strip().lower() for x in data])
combined_text = ' '.join([x['text'].strip().lower() for x in data])

In [None]:
(train_data[train_data['Id']==file_id]['dataset_label'].iloc[0]).strip().lower() in combined_section_title,(train_data[train_data['Id']==file_id]['dataset_label'].iloc[0]).strip().lower() in combined_text

In [None]:
train_data[train_data['Id'].isin(number_of_elements_in_text[number_of_elements_in_text['start_index in text for dataset label']==-1]['Train Id'].unique().tolist())]['dataset_label'].unique()

### Number of elements in each document

In [None]:
df = number_of_elements_in_text[['Train Id','Number of elements']].drop_duplicates()
print(df.shape)
fig = px.histogram(df, x='Number of elements',nbins = 200)
fig.show()

### Length of documents

In [None]:
df = number_of_elements_in_text[['Train Id','Total document length (words)']].drop_duplicates()
print(df.shape)
print('Maximum length:',df['Total document length (words)'].max())
fig = px.histogram(df, x='Total document length (words)',nbins = 1000)
fig.show()

### Number of documents of cleaned labels

In [None]:
train_data_cl_label_summary = train_data.groupby(['cleaned_label']).agg({'Id':'nunique'}).reset_index().rename(columns = {'Id':'Number of documents'}).sort_values(by = 'Number of documents', ascending = False)
train_data_cl_label_summary.head(10)

In [None]:
# Top 20 cleaned labels by document count, sorted ascending so the largest bar
# ends up at the top of the horizontal bar chart.
top_labels = train_data_cl_label_summary.head(20)
data_for_plot = top_labels.sort_values(by = 'Number of documents')
plt.barh(data_for_plot['cleaned_label'], data_for_plot['Number of documents'])

### Length of cleaned labels

In [None]:
train_data['cleaned_label_length'] = train_data['cleaned_label'].apply(lambda x: len(x.strip().split(' ')))

In [None]:
data = train_data[['cleaned_label']].drop_duplicates().reset_index(drop = True)
data['cleaned_label_length'] = data['cleaned_label'].apply(lambda x: len(x.strip().split(' ')))
train_data_cl_label_len_summary = data.groupby(['cleaned_label_length']).agg({'cleaned_label':'nunique'}).reset_index().rename(columns = {'cleaned_label':'Number of cleaned labels'}).sort_values(by = 'Number of cleaned labels', ascending = False)
train_data_cl_label_len_summary.head(10)

In [None]:
data_for_plot = train_data_cl_label_len_summary.sort_values(by = 'Number of cleaned labels')
plt.barh(data_for_plot['cleaned_label_length'],data_for_plot['Number of cleaned labels'])

In [None]:
print('Minimum length:', train_data_cl_label_len_summary['cleaned_label_length'].min())
print('Maximum length:', train_data_cl_label_len_summary['cleaned_label_length'].max())

## NER Model

In [None]:
train_data.head()

In [None]:
labels_for_model = train_data['dataset_label'].str.strip().str.lower().unique().tolist()

In [None]:
keywordprocessor = KeywordProcessor()
keywordprocessor.add_keywords_from_list(keyword_list=labels_for_model)

In [None]:
def tokenize_sentence(x):
    """Lower-case a string and split it on single spaces, dropping empty tokens.

    Only the space character acts as a separator; tabs and newlines that are
    not at the very start or end of the string stay inside their token.
    """
    pieces = x.strip().lower().split(" ")
    return list(filter(None, pieces))

In [None]:
def get_tags(sent, ep):
    '''
    Tag every token of ``sent`` with B/I/O against the entity strings in ``ep``.

    Input:
        sent: sentence as a list of tokens (compared case-insensitively).
        ep:   list of entity strings (not tokenized). Anything that is not a
              list (e.g. NaN) means "no entities".
    Output:
        list of tags, one per token of ``sent``: 'B' for the first token of a
        matched entity, 'I' for its remaining tokens and 'O' everywhere else.

    Every occurrence of every entity is tagged, independent of the order of
    ``ep``. When several entities match at the same position the longest one
    wins; matches never overlap.
    '''
    sent = [x.lower() for x in sent]
    tag = ['O'] * len(sent)
    if not isinstance(ep, list):
        return tag

    # Tokenize entities the same way sentences are tokenized; drop duplicates
    # and entities that tokenize to nothing (e.g. the empty string).
    entities = {tuple(tokenize_sentence(x.lower())) for x in ep}
    entities.discard(())

    # Index candidates by their first token, longest first, so each position
    # only tries entities that can start there and prefers the longest match.
    by_first_token = {}
    for entity in sorted(entities, key=len, reverse=True):
        by_first_token.setdefault(entity[0], []).append(entity)

    i = 0
    while i < len(sent):
        for entity in by_first_token.get(sent[i], ()):
            if tuple(sent[i:i + len(entity)]) == entity:
                tag[i:i + len(entity)] = ['B'] + ['I'] * (len(entity) - 1)
                i = i + len(entity)
                break
        else:
            # No entity starts at this token.
            i = i + 1

    return tag

In [None]:
def vocab_sent_tokenize_label(sent_tokenized, token_tag):
    """Expand whitespace tokens and their tags down to tokenizer pieces.

    Each token is split with the module-level ``tokenizer``; every resulting
    piece inherits the tag of the token it came from, and tags are mapped to
    ids through the module-level ``tag2idx``.

    Returns:
        ``(pieces, piece_ids, piece_tags, piece_tag_ids)`` - four flat lists
        of equal length. If anything fails the error is printed and ``None``
        is returned implicitly.
    """
    try:
        pieces, piece_ids, piece_tags, piece_tag_ids = [], [], [], []
        for word, word_tag in zip(sent_tokenized, token_tag):
            word_pieces = tokenizer.tokenize(word)
            word_piece_ids = [tokenizer.convert_tokens_to_ids(p) for p in word_pieces]
            # Every piece of a word carries that word's tag.
            word_piece_tags = [word_tag for _ in word_pieces]
            word_piece_tag_ids = [tag2idx[t] for t in word_piece_tags]

            pieces += word_pieces
            piece_ids += word_piece_ids
            piece_tags += word_piece_tags
            piece_tag_ids += word_piece_tag_ids
        return pieces, piece_ids, piece_tags, piece_tag_ids
    except Exception as e:
        print(f"Error in line no: {sys.exc_info()[2].tb_lineno}")
        print(e)

In [None]:
def sent_tag_tokenization(data):
    """Add word-level and piece-level tokens/tags to a copy of ``data``.

    Reads the ``sentence`` and ``dataset_label`` columns and adds
    ``sent_tokenized``, ``token_tag``, ``vocab_sent_tokenized``,
    ``sent_input_ids``, ``vocab_token_tag`` and ``token_tag_ids``.
    The input frame is not modified. If anything fails the error is printed
    and ``None`` is returned implicitly.
    """
    try:
        frame = data.copy(deep=True)

        # Step 1: whitespace tokens per sentence.
        frame['sent_tokenized'] = frame['sentence'].progress_apply(tokenize_sentence)

        # Step 2: one B/I/O tag per whitespace token.
        def _tag_row(row):
            if isinstance(row['sent_tokenized'], list):
                return get_tags(sent=row['sent_tokenized'], ep=row['dataset_label'])
            return np.nan

        frame['token_tag'] = frame.progress_apply(_tag_row, axis=1)

        # Step 3: expand tokens and tags to tokenizer pieces and ids.
        def _encode_row(row):
            if isinstance(row['token_tag'], list):
                return vocab_sent_tokenize_label(
                    sent_tokenized=row['sent_tokenized'],
                    token_tag=row['token_tag'])
            return np.nan

        encoded_rows = frame.progress_apply(_encode_row, axis=1)
        pieces, piece_ids, piece_tags, piece_tag_ids = zip(*encoded_rows)

        frame['vocab_sent_tokenized'] = pieces
        frame['sent_input_ids'] = piece_ids
        frame['vocab_token_tag'] = piece_tags
        frame['token_tag_ids'] = piece_tag_ids
        return frame
    except Exception as e:
        print(f"Error in line no: {sys.exc_info()[2].tb_lineno}")
        print(e)

In [None]:
def pad_data(input_ids, token_ids, max_token_length=512):
    """Truncate and right-pad token-id / tag-id sequences into batch tensors.

    Args:
        input_ids: list of token-id lists, one per sentence.
        token_ids: list of tag-id lists, one per sentence (same row count).
        max_token_length: sequences longer than this are cut off
            (default 512).

    Returns:
        ``(padded_input_ids, padded_attention_mask, padded_tags)``. Ids and
        tags are int64 tensors padded with 0; the mask holds 1.0 on real
        positions and 0.0 on padding. For empty input three empty ``(0, 0)``
        tensors are returned. On any error the problem is printed and
        ``None`` is returned implicitly, like the other helpers in this file.
    """
    try:
        if len(input_ids) != len(token_ids):
            raise ValueError(
                f"input_ids has {len(input_ids)} rows but token_ids has {len(token_ids)}"
            )

        if len(input_ids) == 0:
            # pad_sequence rejects an empty list, so handle "no rows" directly.
            return (
                torch.zeros((0, 0), dtype=torch.long),
                torch.zeros((0, 0)),
                torch.zeros((0, 0), dtype=torch.long),
            )

        # Explicit dtype keeps empty rows from silently becoming float tensors.
        id_rows = [
            torch.tensor(input_[:max_token_length], dtype=torch.long)
            for input_ in input_ids
        ]
        tag_rows = [
            torch.tensor(tag_[:max_token_length], dtype=torch.long)
            for tag_ in token_ids
        ]
        # One 1.0 per kept token; padding added below becomes 0.0.
        mask_rows = [torch.ones(len(row)) for row in id_rows]

        pad_sequence = torch.nn.utils.rnn.pad_sequence
        padded_attention_mask = pad_sequence(mask_rows, batch_first=True, padding_value=0.0)
        padded_input_ids = pad_sequence(id_rows, batch_first=True, padding_value=0)
        padded_tags = pad_sequence(tag_rows, batch_first=True, padding_value=0)
        return padded_input_ids, padded_attention_mask, padded_tags
    except Exception as e:
        print(f"Error in line no: {sys.exc_info()[2].tb_lineno}")
        print(e)

In [None]:
def create_dataloader(token_ids, masks, tags, batch_size=16, val=False):
    """Wrap three aligned tensors in a ``DataLoader``.

    Args:
        token_ids, masks, tags: tensors with the same first dimension.
        batch_size: rows per batch.
        val: when truthy the rows are served in order (``SequentialSampler``);
            otherwise they are drawn with a ``RandomSampler``.

    Returns:
        The ``DataLoader``. If anything fails the error is printed and
        ``None`` is returned implicitly.
    """
    try:
        dataset = TensorDataset(token_ids, masks, tags)
        # Validation keeps the original order; training shuffles.
        sampler_cls = SequentialSampler if val else RandomSampler
        return DataLoader(dataset,
                          sampler=sampler_cls(dataset),
                          batch_size=batch_size)
    except Exception as e:
        print(f"Error in line no: {sys.exc_info()[2].tb_lineno}")
        print(e)

In [None]:
# tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
tokenizer = AutoTokenizer.from_pretrained('../input/coleridge-model-data')

# Tag -> class id used as the token-classification target.
tag2idx = dict(zip(('O', 'B', 'I'), range(3)))

# Reverse map; keys are the class ids as strings.
idx2tag = {str(class_id): tag_name for tag_name, class_id in tag2idx.items()}

In [None]:
# Collect all dataset labels of a document into one list per Id. Aggregating
# straight to a list avoids the '|'.join / split('|') round trip, which would
# split any label that itself contains a '|' character.
training_data_v1 = train_data.groupby('Id')['dataset_label'].agg(list).reset_index()

In [None]:
pd.set_option('display.max_colwidth',500)
training_data_v1.head()

In [None]:
# Read every training document and store its lower-cased, space-joined text.
# The texts are gathered in a list and assigned as one column: writing through
# training_data_v1['sentence'].iloc[i] is chained assignment, which pandas
# warns about and which does not update the frame under Copy-on-Write.
sentences = []
for text in tqdm(training_data_v1['Id']):
    file_path = '../input/coleridgeinitiative-show-us-the-data/train//' + text + '.json'
    file_id = file_path.split('/')[-1].split('.json')[0]
    with open(file_path) as json_file:
        data = json.load(json_file)

    sentences.append(' '.join([x['text'].strip().lower() for x in data]))

training_data_v1['sentence'] = sentences

In [None]:
pd.set_option('display.max_colwidth',500)
training_data_v1.head()

In [None]:
# Max token length for BERT is 512. Hence, we need to break down the sentences into smaller groups

create_train_data = False # Make this true to create data

if create_train_data:
    # Split every document into windows of 512 whitespace tokens that start
    # every 500 tokens (12 tokens of overlap). Rows are gathered in a list and
    # turned into a DataFrame once; appending with .loc[count] re-allocates
    # the frame for every chunk.
    chunk_rows = []
    for doc_id, doc_text in tqdm(zip(training_data_v1['Id'], training_data_v1['sentence']), total=len(training_data_v1)):
        text_from_doc = tokenize_sentence(doc_text)
        chunk_rows.extend(
            (doc_id, ' '.join(text_from_doc[i:i+512]))
            for i in range(0, len(text_from_doc), 500)
        )
    training_data_for_model = pd.DataFrame(chunk_rows, columns = ['Id','sentence'])

    pd.set_option('display.max_colwidth',500)
    print(training_data_for_model.shape)

    # Attach the document's label list to each of its chunks.
    training_data_for_model = training_data_for_model.merge(training_data_v1[['Id','dataset_label']], how = 'left', on = 'Id')
    print(training_data_for_model.shape)

    # flag: one 0/1 entry per label, 1 when the label occurs in the chunk text.
    training_data_for_model['flag'] = training_data_for_model[['dataset_label','sentence']].apply(lambda x: ([1 if k.strip().lower() in x['sentence'].lower().strip() else 0 for k in x['dataset_label']]), axis = 1)

    # flag_sum: 1 when the chunk contains at least one label, else 0.
    training_data_for_model['flag_sum'] = np.where(training_data_for_model['flag'].apply(sum) > 0, 1, 0)

    print('Sentences without labels:', training_data_for_model[training_data_for_model['flag_sum']==0].shape[0]/training_data_for_model.shape[0] * 100,'%')
    print('Sentences with labels:', training_data_for_model[training_data_for_model['flag_sum']==1].shape[0]/training_data_for_model.shape[0] * 100,'%')

    # Only chunks that mention a label are kept for training.
    training_data_for_model_v1 = training_data_for_model[training_data_for_model['flag_sum']==1].reset_index(drop = True)
    print(training_data_for_model_v1.shape)

    with open('training_data_for_model_v1.pkl', 'wb') as file:
        pickle.dump(training_data_for_model_v1, file)

In [None]:
# Load the pre-built training chunks instead of recreating them.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load this file from a dataset you trust.
with open('../input/coleridgetrainingdata/training_data_for_model_v1.pkl', 'rb') as file:
    training_data_for_model_v1 = pickle.load(file)

In [None]:
# For train
batch_size=10

# This reuses the same flag name as the chunk-creation cell; when enabled the
# block below rebuilds the training DataLoader and pickles it to disk.
create_train_data = False # Put true for training

if create_train_data:
    # Tokenize sentences and derive per-piece tag ids.
    processed_sentence_tag_full_data = sent_tag_tokenization(data=training_data_for_model_v1)
    processed_sentence_tag = processed_sentence_tag_full_data[['sent_input_ids', 'token_tag_ids']]

    input_ids = processed_sentence_tag['sent_input_ids'].tolist()
    token_ids = processed_sentence_tag['token_tag_ids'].tolist()

    # Truncate/pad to a rectangular batch and build the attention mask.
    padded_input_ids, padded_attention_mask, padded_tags = pad_data(
        input_ids=input_ids, 
        token_ids=token_ids
    )

    # val=False -> rows are drawn with a RandomSampler.
    train_dataloader = create_dataloader(
        token_ids=padded_input_ids, 
        masks=padded_attention_mask, 
        tags=padded_tags, 
        batch_size=batch_size, 
        val=False
    )

    with open('train_dataloader.pkl', 'wb') as file:
        pickle.dump(train_dataloader, file)

In [None]:
# Load the pre-built training DataLoader.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load this file from a dataset you trust.
with open('../input/coleridgetrainingdata/train_dataloader.pkl', 'rb') as file:
    train_dataloader = pickle.load(file)

In [None]:
# del train_data, train_data_cl_label_len_summary, train_data_cl_label_summary, training_data_for_model, training_data_for_model_v1, training_data_v1
gc.collect()
torch.cuda.empty_cache()

In [None]:
# config = AutoConfig.from_pretrained(
#     'bert-base-uncased', 
#     num_labels=len(tag2idx),
#     id2label=idx2tag,
#     label2id=tag2idx
# )
# model = AutoModelForTokenClassification.from_pretrained('bert-base-uncased', config=config)

In [None]:
# Run on the GPU when one is visible, otherwise fall back to the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(device)

# model = model.to(device)

# optimizer = AdamW(model.parameters(),
#                   lr = 1e-5) # learning rate

# Loss used by the training and evaluation loops.
criterion = nn.CrossEntropyLoss()

In [None]:
def do_train(model, optimizer, loss_criteria, train_dataloader):
    """Run one pass over ``train_dataloader``, updating ``model`` in place.

    Args:
        model: called as ``model(ids, mask)``; its output must expose ``.logits``.
        optimizer: stepped once per batch.
        loss_criteria: callable applied to the permuted logits and the labels.
        train_dataloader: yields ``(sent_id, mask, labels)`` batches.

    Returns:
        ``(avg_loss, total_logits)``: the summed batch losses divided by the
        number of batches, and the logits of all batches concatenated along
        axis 0 as a numpy array. If anything raises, the error is printed and
        ``None`` is returned implicitly.

    Relies on the module-level ``device``.
    """
    try:
        # switch the model to training mode
        model.train()

        total_loss = 0
        total_logits = []

        # iterate over batches
        for step, batch in enumerate(train_dataloader):

            # progress update after every 50 batches.
            if step % 50 == 0 and not step == 0:
                print('  Batch {:>5,}  of  {:>5,}.'.format(step, len(train_dataloader)))

            # push the batch to the selected device
            batch = [r.to(device) for r in batch]

            sent_id, mask, labels = batch
            
            # free Python garbage and cached CUDA memory before every step
            gc.collect()
            torch.cuda.empty_cache()
            # clear previously calculated gradients
            model.zero_grad()

            # get model predictions for the current batch
            logits = model(sent_id.to(device), mask.to(device))

            # compute the loss between actual and predicted values
            # NOTE(review): the permute presumably moves the class dimension to
            # dim 1, as CrossEntropyLoss expects for sequence input - confirm
            loss = loss_criteria(logits.logits.permute(0, 2, 1), labels)

            # add on to the total loss
            total_loss = total_loss + loss.item()

            # backward pass to calculate the gradients
            loss.backward()

            # clip the gradient norm to 1.0. It helps in preventing the exploding gradient problem
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # update parameters
            optimizer.step()

            # detach the predictions and copy them to the CPU as a numpy array
            logits = logits.logits.detach().cpu().numpy()

            # append the model predictions
            total_logits.append(logits)

        # compute the training loss of the epoch
        avg_loss = total_loss / len(train_dataloader)

        # stack the per-batch logits into one array
        total_logits = np.concatenate(total_logits, axis=0)


        return avg_loss, total_logits
    except Exception as e:
        print(f"Error during training the model on line: {sys.exc_info()[2].tb_lineno}")
        print(e)


# function for evaluating the model
def do_evaluate(model, val_dataloader, loss_criteria):
    """Evaluate ``model`` on ``val_dataloader`` without updating it.

    Args:
        model: called as ``model(ids, mask)``; its output must expose ``.logits``.
        val_dataloader: yields ``(sent_id, mask, labels)`` batches.
        loss_criteria: callable applied to the permuted logits and the labels.

    Returns:
        ``(avg_loss, total_logits)``: the summed batch losses divided by the
        number of batches, and the logits of all batches concatenated along
        axis 0 as a numpy array.

    Relies on the module-level ``device``.
    """
    print("\nEvaluating...")

    # evaluation mode: deactivates dropout layers
    model.eval()

    running_loss = 0
    collected_logits = []

    for batch_no, raw_batch in enumerate(val_dataloader):

        # report progress every 50 batches (but not for the very first one)
        if batch_no != 0 and batch_no % 50 == 0:
            print('  Batch {:>5,}  of  {:>5,}.'.format(batch_no, len(val_dataloader)))

        # move the whole batch to the selected device, then unpack it
        moved_batch = [t.to(device) for t in raw_batch]
        sent_id, mask, labels = moved_batch

        # no gradients are needed for evaluation
        with torch.no_grad():
            output = model(sent_id.to(device), mask.to(device))

            batch_loss = loss_criteria(output.logits.permute(0, 2, 1), labels)
            running_loss = running_loss + batch_loss.item()

            collected_logits.append(output.logits.detach().cpu().numpy())

    # mean loss per batch
    avg_loss = running_loss / len(val_dataloader)

    # stack the per-batch logits into one array
    total_logits = np.concatenate(collected_logits, axis=0)

    return avg_loss, total_logits

In [None]:
train_flag = False

if train_flag:
    # NOTE: `model` and `optimizer` are only built by the commented-out setup
    # cells in this notebook; re-enable them before switching this flag on.
    # The former `%%time` cell magic was dropped: a cell magic is only valid as
    # the first line of a cell, and the elapsed time is printed per epoch below.
    epochs=1 ## Need to increase this and see better performance

    # training loss of each epoch
    train_losses=[]

    for epoch in range(epochs):
        start = time.time()
        print('\n Epoch {:} / {:}'.format(epoch + 1, epochs))

        # train model for one epoch; the returned logits are not needed here
        train_loss, _ = do_train(
            model = model,
            optimizer = optimizer,
            loss_criteria = criterion,
            train_dataloader = train_dataloader
        )

        # checkpoint after every epoch
        model.save_pretrained('model_file')
        tokenizer.save_pretrained('tokenizer_file')

        # append training loss
        train_losses.append(train_loss)

        print(f"Time taken: {time.time() - start}")
        print(f'\nTraining Loss: {train_loss:.3f}')

In [None]:
# Load fine-tuned model
model = AutoModelForTokenClassification.from_pretrained('../input/coleridge-model-data')
model = model.to(device)

In [None]:
def get_prediction_from_logits(logits):
    """Convert logits into predicted class ids.

    A softmax is taken over dim 2 and the index of the largest probability
    along that dim is returned as a numpy array. If anything fails the error
    is printed and ``None`` is returned implicitly.
    """
    try:
        probabilities = torch.softmax(logits, dim=2)
        best_class = probabilities.argmax(dim=2)
        return best_class.detach().cpu().numpy()
    except Exception as e:
        print(f"Error in line: {sys.exc_info()[2].tb_lineno}")
        print(e)
        
def classification_result(tag2idx, c_tag_id):
    """Map predicted class ids back to tag names and flatten them.

    Args:
        tag2idx: mapping from tag name to class id.
        c_tag_id: iterable of rows of predicted class ids.

    Returns:
        A 1-D numpy array with the tag name of every id, rows concatenated in
        order. If anything fails (unknown id, no rows) the error is printed
        and ``None`` is returned implicitly.
    """
    try:
        # Build the id -> tag lookup once instead of rebuilding key/value lists
        # for every single token. setdefault keeps the first tag per id, which
        # is what list.index() returned before.
        idx_to_tag = {}
        for tag_name, tag_id in tag2idx.items():
            idx_to_tag.setdefault(tag_id, tag_name)

        prediction_result = [
            [idx_to_tag[x] for x in sent_] for sent_ in c_tag_id
        ]

        tagged_entity = np.concatenate(prediction_result, axis=0)
        return tagged_entity
    except Exception as e:
        print(f"Error in line: {sys.exc_info()[2].tb_lineno}")
        print(e)

In [None]:
submission_file

In [None]:
# For test data
# .copy() makes test_data_v1 an independent frame rather than a slice of
# submission_file, so adding a column below is well defined.
test_data_v1 = submission_file[['Id']].copy()

# Gather the texts in a list and assign the column once: writing through
# test_data_v1['sentence'].iloc[i] is chained assignment, which pandas warns
# about and which does not update the frame under Copy-on-Write.
sentences = []
for text in tqdm(submission_file['Id']):
    file_path = '../input/coleridgeinitiative-show-us-the-data/test//' + text + '.json'
    file_id = file_path.split('/')[-1].split('.json')[0]
    with open(file_path) as json_file:
        data = json.load(json_file)

    sentences.append(' '.join([x['text'].strip().lower() for x in data]))
test_data_v1['sentence'] = sentences

# Max token length for BERT is 512. Hence, we need to break down the sentences into smaller groups
# NOTE(review): the windows below are 512 *whitespace* tokens wide, while
# pad_data truncates to 512 tokenizer ids; the tail of a window that expands
# beyond 512 ids is cut off - confirm this is acceptable.
test_rows = []
for doc_id, doc_text in tqdm(zip(test_data_v1['Id'], test_data_v1['sentence']), total=len(test_data_v1)):
    text_from_doc = tokenize_sentence(doc_text)
    test_rows.extend(
        (doc_id, ' '.join(text_from_doc[i:i+512]))
        for i in range(0, len(text_from_doc), 500)
    )
# Build the frame once; appending with .loc[count] re-allocates it per chunk.
test_data_for_model = pd.DataFrame(test_rows, columns = ['Id','sentence'])

In [None]:
test_data_for_model['dataset_label'] = [[''] for x in range(0,len(test_data_for_model))]
print(test_data_for_model.shape)
test_data_for_model.head()

In [None]:
# For test data - tokenization + attention masks
processed_sentence_tag_full_data = sent_tag_tokenization(data=test_data_for_model)
processed_sentence_tag = processed_sentence_tag_full_data[['sent_input_ids', 'token_tag_ids']]

input_ids = processed_sentence_tag['sent_input_ids'].tolist()
token_ids = processed_sentence_tag['token_tag_ids'].tolist()

test_padded_input_ids, test_padded_attention_mask, test_padded_tags = pad_data(
    input_ids=input_ids, 
    token_ids=token_ids
)

In [None]:
# get predictions for test data
# Sanity check on at most the first 10 chunks: prints the sum of the predicted
# class ids per chunk (0 means every token was predicted as class 0).
# The bound is clamped so fewer than 10 chunks does not index an empty batch.
for i in range(min(10, len(test_padded_input_ids))):
    with torch.no_grad():
        logits = model(test_padded_input_ids[i:i+1].to(device), test_padded_attention_mask[i:i+1].to(device))
        preds = get_prediction_from_logits(logits=logits['logits'])
    print(sum(preds[0]))

In [None]:
def _join_wordpieces(pieces):
    """Join tokenizer pieces into text, gluing '##' continuation pieces on."""
    text = ''
    for piece in pieces:
        if piece.startswith('##'):
            # Drop exactly the two-character prefix; lstrip('##') would also
            # eat any further leading '#' characters of the piece itself.
            text = text + piece[2:]
        else:
            text = text + ' ' + piece
    return text.strip()


def _merge_tagged_tokens(tokens, tags, mask):
    """Collect each run of consecutive B/I-tagged, non-padding tokens as one string."""
    entities = []
    current = []
    for token, tag, is_real in zip(tokens, tags, mask):
        if is_real and tag in ('B', 'I'):
            current.append(token)
        elif current:
            entities.append(_join_wordpieces(current))
            current = []
    if current:
        # A run that reaches the end of the sequence must be flushed as well.
        entities.append(_join_wordpieces(current))
    return entities


def get_predicted_labels(sent_padded_input_ids, sent_padded_attention_mask):
    """Return the entity strings the model predicts for the given input.

    Args:
        sent_padded_input_ids: tensor of padded token ids.
        sent_padded_attention_mask: matching mask; positions where it is 0
            are treated as padding and never contribute to an entity.

    Returns:
        List of predicted entity strings, in order of appearance. Consecutive
        tokens tagged 'B' or 'I' form one entity. If anything fails the error
        is printed and ``None`` is returned implicitly.

    Relies on the module-level ``model``, ``device``, ``tokenizer`` and
    ``tag2idx``.
    """
    try:
        with torch.no_grad():
            logits = model(sent_padded_input_ids.to(device), sent_padded_attention_mask.to(device))
            c_tag_id = get_prediction_from_logits(logits=logits['logits'])

        # Flatten ids and mask the same way classification_result flattens tags.
        test_ids = sent_padded_input_ids.reshape(-1).detach().cpu().numpy()
        test_mask = sent_padded_attention_mask.reshape(-1).detach().cpu().numpy()
        preds = classification_result(
            tag2idx = tag2idx,
            c_tag_id = c_tag_id
        )
        test_tokens = [tokenizer.convert_ids_to_tokens(int(x)) for x in test_ids]

        return _merge_tagged_tokens(test_tokens, preds, test_mask)
    except Exception as e:
        print(f"Error in line: {sys.exc_info()[2].tb_lineno}")
        print(e)

In [None]:
# One list of predicted entity strings per test chunk, fed to the model one
# row at a time.
predicted_labels = [
    get_predicted_labels(
        test_padded_input_ids[row:row + 1].to(device),
        test_padded_attention_mask[row:row + 1].to(device),
    )
    for row in tqdm(range(test_data_for_model.shape[0]))
]

In [None]:
predicted_labels

In [None]:
test_data_for_model['predicted_label'] = predicted_labels

In [None]:
# A chunk can yield several entities. Keep them as separate ' | '-delimited
# predictions instead of gluing them into one space-joined string, and clean
# each entity on its own so the separator survives clean_text.
# A chunk whose prediction failed (None) is treated as "no entities".
test_data_for_model['PredictionString'] = test_data_for_model['predicted_label'].apply(lambda x: ' | '.join(x or []))
test_data_for_model['PredictionString_clean'] = test_data_for_model['predicted_label'].apply(
    lambda x: ' | '.join(filter(None, (clean_text(p).strip() for p in (x or []))))
)

In [None]:
final_test_results_ner = test_data_for_model[test_data_for_model['PredictionString_clean']!=''].groupby('Id').agg({'PredictionString_clean':' | '.join}).reset_index().rename(columns = {'PredictionString_clean':'PredictionString'})

In [None]:
final_test_results_ner

## Lookup approach

In [None]:
test_data_for_model['Predicted_labels - Lookup approach'] = test_data_for_model['sentence'].apply(lambda x: keywordprocessor.extract_keywords(x))

In [None]:
test_data_for_model[test_data_for_model['Predicted_labels - Lookup approach'].str.len() > 0].head()

In [None]:
# Clean every matched keyword on its own and only then join with ' | '.
# Cleaning the joined string would turn the ' | ' separators into spaces
# (clean_text replaces every non-alphanumeric run) and merge distinct labels.
test_data_for_model['Predicted_labels - Lookup approach'] = test_data_for_model['Predicted_labels - Lookup approach'].apply(
    lambda x: ' | '.join(filter(None, (clean_text(k).strip() for k in x)))
)
# One row per document: the chunk-level strings joined with ' | '.
final_test_results_lookup = test_data_for_model[test_data_for_model['Predicted_labels - Lookup approach'].str.len() > 0].groupby('Id').agg({'Predicted_labels - Lookup approach':' | '.join}).reset_index().rename(columns = {'Predicted_labels - Lookup approach':'PredictionString'})

In [None]:
final_test_results_lookup

In [None]:
# Left-join both prediction sources onto the submission Ids; because both
# frames carry a 'PredictionString' column, the merge suffixes them with
# _x (lookup approach) and _y (NER model).
final_test_results = submission_file[['Id']].merge(final_test_results_lookup, how = 'left', on = 'Id')
final_test_results = final_test_results.merge(final_test_results_ner, how = 'left', on = 'Id')
# Concatenate the two sources with ' | '; a missing side becomes '' and leaves
# an empty entry next to the separator.
final_test_results['PredictionString'] = final_test_results['PredictionString_x'].fillna('') + ' | ' + final_test_results['PredictionString_y'].fillna('')

In [None]:
final_test_results

In [None]:
manual_noise_list_identified = ['international of']

In [None]:
def remove_noise(text, noise_list=None):
    """De-duplicate and filter a ' | '-separated prediction string.

    Args:
        text: predictions separated by ' | '.
        noise_list: predictions to discard; defaults to the module-level
            ``manual_noise_list_identified``.

    Returns:
        The surviving predictions joined with ' | ', in their order of first
        appearance. A prediction is dropped when, after stripping surrounding
        whitespace, it is shorter than 5 characters, is in ``noise_list`` or
        has already been seen.
    """
    if noise_list is None:
        noise_list = manual_noise_list_identified

    seen = set()
    kept = []
    for token in text.split(' | '):
        # Strip so that 'label' and 'label ' count as the same prediction.
        token = token.strip()
        if len(token) < 5 or token in noise_list or token in seen:
            continue
        seen.add(token)
        # A list keeps the output order deterministic, unlike list(set(...)).
        kept.append(token)
    return ' | '.join(kept)

In [None]:
# Remove noise
final_test_results['PredictionString'] = final_test_results['PredictionString'].apply(lambda x: remove_noise(x))

In [None]:
final_test_results = submission_file[['Id']].merge(final_test_results, on = 'Id', how = 'left')[['Id','PredictionString']]

In [None]:
final_test_results

In [None]:
# Write the final predictions without the index column.
submission_path = 'submission.csv'
final_test_results.to_csv(submission_path, index=False)