## Install packages

# Import

In [None]:
import os
import re
import json
import time
import datetime
import random
import glob
import importlib

import numpy as np
import pandas as pd

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

# Seed Python's and NumPy's global random generators so that later random.*
# and np.random.* calls start from a fixed state.
random.seed(123)
np.random.seed(456)

# Hyper-parameters

In [None]:
MAX_LENGTH = 64 # maximum number of words per sentence; longer sentences are split into chunks
OVERLAP = 20 # number of words that consecutive chunks share when a sentence longer than MAX_LENGTH is split

MAX_SAMPLE = None # set a small number for experimentation; None keeps every row (production)

# MAX_SAMPLE=5000

# Load data

In [None]:
# Label table (CSV) and the folder holding one '<Id>.json' file per publication.
train_path = '../input/coleridgeinitiative-show-us-the-data/train.csv'
paper_train_folder = '../input/coleridgeinitiative-show-us-the-data/train'
# Raw label table; it is grouped by 'Id' in a later cell.
train = pd.read_csv(train_path)

In [None]:
# Display the (rows, columns) shape of the raw label table.
train.shape

In [None]:
# Keep only the first MAX_SAMPLE raw rows; MAX_SAMPLE = None keeps them all.
train = train.iloc[:MAX_SAMPLE]
print(f'No. raw training rows: {len(train)}')

In [None]:
# Gather every distinct lower-cased label string that appears in any of the
# three label columns of the raw table.
label_columns = ['dataset_title', 'dataset_label', 'cleaned_label']
all_labels = {
    str(value).lower()
    for row in train[label_columns].itertuples(index=False)
    for value in row
}

print(f'No. different labels: {len(all_labels)}')

In [None]:
# Preview the first rows of the raw label table.
train.head()

Group the rows by publication so that the training labels have the same form as the expected output.

In [None]:
# Collapse the table to one row per publication ('Id'): keep the first
# publication title and pipe-join every value of each label column.
aggregations = {
    'pub_title': 'first',
    'dataset_title': '|'.join,
    'dataset_label': '|'.join,
    'cleaned_label': '|'.join,
}
grouped = train.groupby('Id').agg(aggregations)
train = grouped.reset_index()  # turn the 'Id' group key back into a column

print(f'No. grouped training rows: {len(train)}')

In [None]:
# Load the parsed JSON of every publication referenced in `train`, keyed by Id.
# The files are opened explicitly as UTF-8 so that decoding does not depend on
# the platform's default locale encoding.
papers = {}
for paper_id in train['Id'].unique():
    with open(f'{paper_train_folder}/{paper_id}.json', 'r', encoding='utf-8') as f:
        paper = json.load(f)
        papers[paper_id] = paper

# Transform data to NER format

In [None]:
def clean_training_text(txt):
    """
    Normalise text for training while keeping its original letter case.

    The input is converted with str(), every run of characters other than
    ASCII letters and digits becomes a single space, and leading/trailing
    whitespace is removed.
    """
    as_text = str(txt)
    spaced = re.sub('[^A-Za-z0-9]+', ' ', as_text)
    return spaced.strip()

def shorten_sentences(sentences, max_length=None, overlap=None):
    """
    Split over-long sentences into overlapping word windows.

    Sentences with at most `max_length` whitespace-separated words are kept
    unchanged. Longer ones are cut into windows of `max_length` words whose
    starts are `max_length - overlap` words apart. Windowing stops as soon as
    a window reaches the end of the sentence, so no chunk is emitted that is
    entirely contained in the previous one.

    Args:
        sentences: iterable of sentence strings.
        max_length: maximum words per chunk; defaults to the module-level
            MAX_LENGTH.
        overlap: words shared by consecutive chunks; defaults to the
            module-level OVERLAP.

    Returns:
        A list of sentence strings, in input order.

    Raises:
        ValueError: if `overlap` is not smaller than `max_length` (the window
            would never advance).
    """
    max_length = MAX_LENGTH if max_length is None else max_length
    overlap = OVERLAP if overlap is None else overlap

    stride = max_length - overlap
    if stride <= 0:
        raise ValueError(
            f'overlap ({overlap}) must be smaller than max_length ({max_length})'
        )

    short_sentences = []
    for sentence in sentences:
        words = sentence.split()
        if len(words) <= max_length:
            short_sentences.append(sentence)
            continue

        for start in range(0, len(words), stride):
            short_sentences.append(' '.join(words[start:start + max_length]))
            # This window already covers the tail; any further window would
            # only repeat words that were just emitted.
            if start + max_length >= len(words):
                break
    return short_sentences

def find_sublist(big_list, small_list):
    """
    Return every start index at which small_list occurs contiguously in big_list.

    Overlapping occurrences are all reported. An empty small_list yields no
    positions: treating it as matching everywhere would also report the index
    len(big_list), which is not a valid element position for callers that
    index into big_list with the result.
    """
    size = len(small_list)
    if size == 0:
        return []

    return [
        start
        for start in range(len(big_list) - size + 1)
        if small_list == big_list[start:start + size]
    ]

def tag_sentence(sentence, labels): # requirement: both sentence and labels are already cleaned
    """
    Tag the words of a sentence with 'B'/'I'/'O' according to the given labels.

    Args:
        sentence: cleaned sentence string; it is split on whitespace.
        labels: iterable of cleaned label strings, or None.

    Returns:
        (is_positive, [sentence_words, nes]). is_positive is True when at
        least one non-empty label occurs in the sentence as a whole-word
        match. nes has one tag per word: 'B' for the first word of a label
        occurrence, 'I' for its following words, 'O' elsewhere.
    """
    sentence_words = sentence.split()
    nes = ['O'] * len(sentence_words)

    # Labels without any word are ignored: interpolated into the pattern they
    # would give r'\b\b', which matches at every word boundary and would mark
    # every sentence as positive.
    usable_labels = [label for label in (labels or []) if label.split()]

    is_positive = any(
        re.search(rf'\b{re.escape(label)}\b', sentence) for label in usable_labels
    )
    if not is_positive: # negative sample
        return False, [sentence_words, nes]

    # positive sample
    # Tag shorter labels first so that a longer label containing a shorter one
    # is written last and keeps a single 'B' followed by 'I's.
    for label in sorted(usable_labels, key=lambda text: len(text.split())):
        label_words = label.split()
        for pos in find_sublist(sentence_words, label_words):
            nes[pos] = 'B'
            for i in range(pos + 1, pos + len(label_words)):
                nes[i] = 'I'

    return True, [sentence_words, nes]

In [None]:
# Build the NER training set: one [words, tags] pair per kept sentence.
cnt_pos, cnt_neg = 0, 0 # number of sentences that contain/not contain labels
ner_data = []

# The context manager closes the progress bar once the loop is finished.
with tqdm(total=len(train)) as pbar:
    for paper_id, dataset_label in train[['Id', 'dataset_label']].itertuples(index=False):
        # paper
        paper = papers[paper_id]

        # labels
        labels = [clean_training_text(label) for label in dataset_label.split('|')]

        # sentences: de-duplicate while keeping first-seen order. A set would
        # iterate in a hash-dependent order that can change between runs, so
        # the generated data would not be reproducible despite the fixed seeds.
        sentences = dict.fromkeys(
            clean_training_text(sentence)
            for section in paper
            for sentence in section['text'].split('.')
        )
        sentences = shorten_sentences(sentences) # make sentences short
        sentences = [sentence for sentence in sentences if len(sentence) > 10] # only accept sentences with length > 10 chars

        for sentence in sentences:
            is_positive, tags = tag_sentence(sentence, labels)
            if is_positive:
                # positive sample
                cnt_pos += 1
                ner_data.append(tags)
            elif any(word in sentence.lower() for word in ['data', 'study']):
                # negative sample: kept only when it mentions 'data' or 'study'
                ner_data.append(tags)
                cnt_neg += 1

        # process bar
        pbar.update(1)
        pbar.set_description(f"Training data size: {cnt_pos} positives + {cnt_neg} negatives")

# shuffling
random.shuffle(ner_data)

In [None]:
# One row per kept sentence: 'sentence' holds the list of words and 'tag' the
# parallel list of 'B'/'I'/'O' tags produced by tag_sentence.
ner_dataset=pd.DataFrame(ner_data,columns=['sentence','tag'])

In [None]:
# Preview the first rows of the NER dataset.
ner_dataset.head()

In [None]:
# Write the dataset to ner_dataset.csv without the index column.
# NOTE(review): both columns hold Python lists; presumably they are written as
# their string form and must be parsed back when the file is read - confirm.
ner_dataset.to_csv("ner_dataset.csv",index=False)

In [None]:
# with open('train_ner.json', 'w') as f:
#     for row in ner_data:
#         words, nes = list(zip(*row))
#         row_json = {'tokens' : words, 'tags' : nes}
#         json.dump(row_json, f)
#         f.write('\n')

In [None]:
# import joblib
# import torch
# import torch.nn as nn
# import transformers

# import numpy as np
# import pandas as pd

# from sklearn import preprocessing
# from sklearn import model_selection

# from tqdm import tqdm
# from transformers import AdamW
# from transformers import get_linear_schedule_with_warmup

In [None]:
# class config:
#     MAX_LEN = 128
#     TRAIN_BATCH_SIZE = 32
#     VALID_BATCH_SIZE = 8
#     EPOCHS = 3
#     BASE_MODEL_PATH = "../input/bert-base-uncased/"
#     MODEL_PATH = "model.bin"
#     TRAINING_FILE = "../input/entity-annotated-corpus/ner_dataset.csv"
#     TOKENIZER = transformers.BertTokenizer.from_pretrained(
#         'bert-base-uncased',
#         do_lower_case=True
#     )

In [None]:
# class EntityDataset:
#     def __init__(self, texts, pos, tags):
#         self.texts = texts
#         self.pos = pos
#         self.tags = tags
    
#     def __len__(self):
#         return len(self.texts)
    
#     def __getitem__(self, item):
#         text = self.texts[item]
#         pos = self.pos[item]
#         tags = self.tags[item]

#         ids = []
#         target_pos = []
#         target_tag =[]

#         for i, s in enumerate(text):
#             inputs = config.TOKENIZER.encode(
#                 s,
#                 add_special_tokens=False
#             )
#             # abhishek: ab ##hi ##sh ##ek
#             input_len = len(inputs)
#             ids.extend(inputs)
#             target_pos.extend([pos[i]] * input_len)
#             target_tag.extend([tags[i]] * input_len)

#         ids = ids[:config.MAX_LEN - 2]
#         target_pos = target_pos[:config.MAX_LEN - 2]
#         target_tag = target_tag[:config.MAX_LEN - 2]

#         ids = [101] + ids + [102]
#         target_pos = [0] + target_pos + [0]
#         target_tag = [0] + target_tag + [0]

#         mask = [1] * len(ids)
#         token_type_ids = [0] * len(ids)

#         padding_len = config.MAX_LEN - len(ids)

#         ids = ids + ([0] * padding_len)
#         mask = mask + ([0] * padding_len)
#         token_type_ids = token_type_ids + ([0] * padding_len)
#         target_pos = target_pos + ([0] * padding_len)
#         target_tag = target_tag + ([0] * padding_len)

#         return {
#             "ids": torch.tensor(ids, dtype=torch.long),
#             "mask": torch.tensor(mask, dtype=torch.long),
#             "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
#             "target_pos": torch.tensor(target_pos, dtype=torch.long),
#             "target_tag": torch.tensor(target_tag, dtype=torch.long),
#         }