# Where's Waldo - Dataset Edition
This notebook contains our approach to the Coleridge challenge. It uses a SciBERT model to find the datasets in text. We sadly had to limit our predictions to the first 2048 sentences of each publication (note that the prediction cell in this version of the notebook is set to the first 1024), since our full model took too long to predict on the full hidden test set.

# Getting things ready

## Install packages offline
- Install `segtok` for splitting text into sentences.
- Install the right version of `fsspec` (required by `simpletransformers`)
- Install the right version of `seqeval` (required by `simpletransformers`)
- Install `simpletransformers`

In [None]:
!pip install segtok --no-index --find-links=file:///kaggle/input/coleridgepackages/packages/
!pip install fsspec --no-index --find-links=file:///kaggle/input/coleridgepackages/packages/
!pip install seqeval --no-index --find-links=file:///kaggle/input/coleridgepackages/packages/
!pip install simpletransformers --no-index --find-links=file:///kaggle/input/simpletransformers/simpletransformers-0.51.0/

## Import libraries

In [None]:
import csv
import logging
import os, json, re
import warnings

# Silence library warnings before the heavy third-party imports below
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import torch
from segtok.segmenter import split_single
from simpletransformers.ner import NERModel, NERArgs
from sklearn.model_selection import train_test_split
from tqdm import tqdm

## Define functions

Define the function that is given to us on the evaluation page to clean text.

In [None]:
def clean_text(txt):
    """Lower-case ``txt`` and replace each run of non-alphanumeric characters by one space.

    The argument is passed through ``str()`` first, so non-string values are
    accepted as well.
    """
    lowered = str(txt).lower()
    return re.sub('[^A-Za-z0-9]+', ' ', lowered)

The following functions are mainly used to get some data from a given publication.

In [None]:
# Count the sentences that split_single finds in a given text
def get_sentence_count(text):
    """Return how many sentences ``split_single`` produces for ``text``."""
    sentences = split_single(text)
    return len(sentences)

# Look up the dataset labels recorded for one publication
def extract_datasets(doc_id):
    """Return the ``dataset_label`` values of every row in the global ``train``
    frame whose ``Id`` equals ``doc_id``."""
    belongs_to_doc = train['Id'] == doc_id
    return train.loc[belongs_to_doc].dataset_label.values

# Read one publication file and join the text of all its sections
def _read_publication_text(file_loc):
    """Return the ``'text'`` entries of the JSON section list at ``file_loc``,
    joined with single spaces.

    Raises whatever ``open`` / ``json.load`` raise for a missing or malformed
    file, and ``KeyError`` if a section has no ``'text'`` entry.
    """
    # Read as UTF-8 explicitly instead of relying on the platform default
    with open(file_loc, encoding="utf-8") as f:
        sections = json.load(f)

    return " ".join(section['text'] for section in sections)


# Get text for training publication
def extract_train_text(filename, folder=None):
    """Return the full text of training publication ``filename``.

    ``folder`` is the data root that contains the ``train/`` directory; when it
    is omitted the module-level ``data_folder`` is used.
    """
    root = data_folder if folder is None else folder
    return _read_publication_text(f"{root}/train/{filename}.json")


# Get text for test publication
def extract_test_text(filename, folder=None):
    """Return the full text of test publication ``filename``.

    ``folder`` is the data root that contains the ``test/`` directory; when it
    is omitted the module-level ``data_folder`` is used.
    """
    root = data_folder if folder is None else folder
    return _read_publication_text(f"{root}/test/{filename}.json")

The following function tags the words that belong to one of the `datasets` in a `text` and appends the result to the file `outfile`. It only considers sentences that actually have a dataset in them.

In [None]:
def _longest_dataset_in(sent, datasets):
    """Return the longest entry of ``datasets`` that occurs in ``sent`` ('' if none)."""
    longest = ""
    for dataset in datasets:
        # Strict comparison: on equal length the first dataset seen is kept
        if dataset in sent and len(dataset) > len(longest):
            longest = dataset
    return longest


def _find_dataset_spans(sent_words, dataset):
    """Return one list of word indices per occurrence of ``dataset`` in ``sent_words``."""
    ds_words = dataset.split(' ')

    if len(ds_words) == 1:
        # Substring test, so a word with attached punctuation still matches
        return [[i] for i, word in enumerate(sent_words) if dataset in word]

    # Multi-word names must match a run of whole words exactly
    width = len(ds_words)
    return [
        list(range(start, start + width))
        for start in range(len(sent_words))
        if sent_words[start:start + width] == ds_words
    ]


def _tag_words(word_count, spans):
    """Return ``word_count`` tags: 'B-DATASET' at the start of each span,
    'I-DATASET' for the rest of it and 'O' everywhere else."""
    tags = ['O'] * word_count
    # Later spans overwrite earlier ones where they overlap
    for span in spans:
        tags[span[0]] = 'B-DATASET'
        for i in span[1:]:
            tags[i] = 'I-DATASET'
    return tags


def preprocess_text_part(text, datasets, outfile):
    """Tag the dataset mentions in ``text`` and append them to ``outfile``.

    The text is split into sentences with ``split_single``. Every sentence that
    contains at least one entry of ``datasets`` is split on single spaces, and
    the occurrences of the longest matching dataset are tagged. One
    tab-separated row ``sentence id, word, tag`` is appended per word.

    Args:
        text: Full text of one publication.
        datasets: Iterable of dataset label strings; empty or non-string
            entries are ignored.
        outfile: Path of the tab-separated file to append to. It is not
            touched when no sentence contains a dataset.

    Side effects:
        Reads the module-level counter ``sent_id`` and increases it by one for
        every sentence written.
    """
    # Keep track of the sentence we are currently adding to the output file
    global sent_id

    # An empty label would be "contained" in every word and tag all of them
    labels = [d for d in datasets if isinstance(d, str) and d]

    # Keep the sentences that mention a dataset, in document order
    sentences = split_single(text)
    with_dataset = [
        sent for sent in sentences if any(label in sent for label in labels)
    ]
    if not with_dataset:
        return

    # Open the file once per call; newline='' is what the csv module expects
    with open(outfile, 'a', encoding='utf-8', newline='') as f:
        word_writer = csv.writer(f, delimiter='\t')

        for sent in with_dataset:
            sent_words = sent.split(' ')
            spans = _find_dataset_spans(
                sent_words, _longest_dataset_in(sent, labels)
            )
            tags = _tag_words(len(sent_words), spans)

            word_writer.writerows(
                [sent_id, word, tag] for word, tag in zip(sent_words, tags)
            )
            sent_id += 1

# Preparing the data

## Loading the data

 We get a `train.csv` that has information about the publications and the datasets that they contain.

In [None]:
# Root folder of the competition data
data_folder = "../input/coleridgeinitiative-show-us-the-data"
# Table with the publication ids (`Id`) and their dataset labels (`dataset_label`)
train_csv = os.path.join(data_folder, "train.csv")
train = pd.read_csv(train_csv)
# Show five randomly sampled rows as the cell output
train.sample(5)

The actual text in the publication can be found in the `train/` folder, and has a `.json` file for each publication. Each of these files has section titles and their corresponding texts.

In [None]:
# Peek at one training publication: its first section titles and the start of its text
train_sample_json = os.path.join(data_folder, f"train/{train.Id.values[13891]}.json")
with open(train_sample_json) as f:
    sample_file = json.load(f)

# Titles of the first (at most) five sections
title_list = [section['section_title'] for section in sample_file]
print(f"Sample section titles of text:")
for t in title_list[:5]:
    print('    ' + t)

# First 250 characters of the section texts joined with spaces
text_list = [section['text'] for section in sample_file]
text = " ".join(text_list)
print(f"Sample of text:\n '{text[:250]}'\n")

## Preprocessing the data

We showcase an old and a new/current method of preprocessing the input data. The old method was discarded when we discovered that our new method greatly improved precision and recall.

### Old method

In the old method of preprocessing the data, we first preprocess the data and then split it into training and validation data. 

In [None]:
# Process all ids and save it to a csv
sent_id = 0
outfile = 'train_modified_part.csv'

# preprocess_text_part appends to the file, so start from a clean one;
# otherwise re-running this cell would duplicate every row
if os.path.exists(outfile):
    os.remove(outfile)

# An Id can occur on several rows of `train` (one per dataset label), so
# iterate over the unique ids to process every publication exactly once
for i in tqdm(train.Id.unique()):
    text = extract_train_text(i)
    datasets = extract_datasets(i)
    preprocess_text_part(text, datasets, outfile)

This split simply takes the first 80% of the (processed) sentences and uses that as the training data, and uses the rest as validation data.

In [None]:
# Split data in training and eval data
# NOTE(review): `data` is loaded by the "Load old preprocessed data" cell,
# which appears further down in the notebook and has to be run first.
n = max(data['sentence_id'].values)
split = int(0.8 * n)

# Sentences with an id up to `split` go to training, the rest to evaluation
train_data = data.loc[data['sentence_id'] <= split]
eval_data = data.loc[data['sentence_id'] > split]

Show where the split takes place

In [None]:
# First rows of the training part, then of the evaluation part
print(train_data.head(), eval_data.head(), sep='\n')

### Current/new method

> The output of this step was generated beforehand and is loaded from `coleridgepackages/data/train_modified_part.csv` and `coleridgepackages/data/val_modified_part.csv`.

Go through all ids, extract the text and tag the words in the sentences that contain datasets, then save to `.csv` file.

In [None]:
# Split data into training and validation data.
# An Id can occur on several rows of `train` (one per dataset label). Split the
# unique ids, so a publication cannot end up in both sets or be processed
# more than once. The fixed seed makes the split reproducible.
train_ids, val_ids = train_test_split(
    train.Id.unique(), test_size=0.2, random_state=42
)

# Process all the training ids and save it to a csv
sent_id = 0
outfile = 'train_modified_part.csv'

# preprocess_text_part appends to the file, so start from a clean one;
# otherwise re-running this cell would duplicate every row
if os.path.exists(outfile):
    os.remove(outfile)

for i in tqdm(train_ids):
    text = extract_train_text(i)
    datasets = extract_datasets(i)
    preprocess_text_part(text, datasets, outfile)

Do the same thing for our validation data.

In [None]:
# Process all the validation ids and save it to a csv
sent_id = 0
outfile = 'val_modified_part.csv'

# preprocess_text_part appends to the file, so start from a clean one;
# otherwise re-running this cell would duplicate every row
if os.path.exists(outfile):
    os.remove(outfile)

# pd.unique keeps the order and guards against ids that occur more than once
for i in tqdm(pd.unique(val_ids)):
    text = extract_train_text(i)
    datasets = extract_datasets(i)
    preprocess_text_part(text, datasets, outfile)

# Creating our model

Load old preprocessed data

In [None]:
# Old method: read the tab-separated (sentence id, word, label) rows from the
# working directory and drop rows with missing values
data = pd.read_csv(
    'train_modified_part.csv',
    delimiter='\t',
    encoding='utf8',
    names=["sentence_id", "words", "labels"],
)
data = data.dropna()

Load preprocessed data when recently processed

In [None]:
# Freshly generated training rows (tab-separated, rows with missing values dropped)
train_data_r = pd.read_csv(
    'train_modified_part.csv',
    delimiter='\t',
    encoding='utf8',
    names=["sentence_id", "words", "labels"],
)
train_data_r = train_data_r.dropna()

# Freshly generated validation rows, read the same way
val_data_r = pd.read_csv(
    'val_modified_part.csv',
    delimiter='\t',
    encoding='utf8',
    names=["sentence_id", "words", "labels"],
)
val_data_r = val_data_r.dropna()

Load preprocessed data from a Kaggle dataset (to reduce running time)

In [None]:
# Load data offline: the preprocessed rows stored in the attached Kaggle dataset
train_data = pd.read_csv(
    '../input/coleridgepackages/data/train_modified_part.csv',
    delimiter='\t',
    encoding='utf8',
    names=["sentence_id", "words", "labels"],
)
train_data = train_data.dropna()

eval_data = pd.read_csv(
    '../input/coleridgepackages/data/val_modified_part.csv',
    delimiter='\t',
    encoding='utf8',
    names=["sentence_id", "words", "labels"],
)
eval_data = eval_data.dropna()

## Setting up the SimpleTransformers model

Configure a logger to reduce clutter in the log messages.

In [None]:
# Logging config: only warnings and above from the "transformers" logger,
# INFO and above for the root logger
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)
logging.basicConfig(level=logging.INFO)

Initialize the model using various parameters.

In [None]:
# Use the GPU only when torch reports that CUDA is available
cuda_available = torch.cuda.is_available()

# Folder with the Sci-BERT weights
weights_loc = "../input/coleridgepackages/sci-bert"

# Arguments for our model
model_args = NERArgs()

# Tag set and batch size
model_args.labels_list = ["O", "B-DATASET", "I-DATASET"]
model_args.train_batch_size = 32

# Evaluate while training, verbosely, reusing cached evaluation features
model_args.evaluate_during_training = True
model_args.evaluate_during_training_verbose = True
model_args.use_cached_eval_features = True

# Early stopping on the evaluation loss (lower is better):
# patience 2, delta 0.01
model_args.use_early_stopping = True
model_args.early_stopping_metric = 'eval_loss'
model_args.early_stopping_metric_minimize = True
model_args.early_stopping_delta = 0.01
model_args.early_stopping_patience = 2

# Overwrite output if already exists
model_args.overwrite_output_dir = True

# Build the BERT-type NER model from the Sci-BERT weights
model = NERModel(
    "bert",
    weights_loc,
    args=model_args,
    use_cuda=cuda_available,
)


Train the model.

In [None]:
# Train the model on train_data; eval_data is passed along for the
# evaluation during training that model_args switches on
model.train_model(train_data, eval_data=eval_data)

Evaluate the model on our validation data

In [None]:
# Evaluate the model on eval_data and keep the three values it returns
# (unpacked here as result, model outputs and list of predictions)
result, model_outputs, preds_list = model.eval_model(eval_data)

# Predictions

Load all the test files.

In [None]:
# List the test publications. The path is built from `data_folder` instead of a
# second hard-coded copy, and the names are sorted because os.listdir returns
# them in arbitrary order.
test_files = sorted(os.listdir(os.path.join(data_folder, 'test')))
test_files

Go through all ids, extract text, turn into sentences, feed these to the model, extract the words that have a dataset tag, and add these to the prediction string. 

In [None]:
submission_df = pd.read_csv('../input/coleridgeinitiative-show-us-the-data/sample_submission.csv')
ids = submission_df['Id']

# Only the first MAX_SENTENCES sentences of a publication are given to the model
MAX_SENTENCES = 1024


def _decode_dataset_mentions(tagged_sentence):
    """Return the cleaned dataset mentions found in one tagged sentence.

    ``tagged_sentence`` is a list of single-entry ``{word: tag}`` dicts. A
    'B-DATASET' tag opens a new mention, following 'I-DATASET' tags extend it
    and any other tag closes it. An 'I-DATASET' without an open mention is
    ignored. Every word is passed through ``clean_text`` and stripped; words
    that clean to an empty string are dropped, as are mentions left without
    any word.
    """
    spans = []
    in_mention = False

    for word_label in tagged_sentence:
        word, tag = next(iter(word_label.items()))

        if tag == 'B-DATASET':
            # Open a new mention so neighbouring mentions are never merged
            spans.append([])
            in_mention = True
        elif tag != 'I-DATASET':
            in_mention = False
            continue
        elif not in_mention:
            continue

        token = clean_text(word).strip()
        if token:
            spans[-1].append(token)

    return [' '.join(tokens) for tokens in spans if tokens]


labels = []

for index in ids:
    # Keep track of all the predictions for this publication
    prediction_list = []

    try:
        # Retrieve the text and split it into sentences
        text = extract_test_text(index)
        sentences_list = split_single(text)

        # Take the first MAX_SENTENCES sentences
        inp = sentences_list[:MAX_SENTENCES]

        # Predict on these sentences (nothing to predict for an empty text)
        preds = []
        if inp:
            preds, _ = model.predict(inp)

        # Collect every distinct mention, in order of first appearance
        for sent in preds:
            for mention in _decode_dataset_mentions(sent):
                if mention not in prediction_list:
                    prediction_list.append(mention)
    except Exception as exc:
        # Best effort: a failing publication still gets a row in the
        # submission, holding whatever was collected before the failure,
        # and the cause is reported
        print(f'an error occurred for publication {index}: {exc!r}')

    # Add the predictions for the current publication to the complete predictions
    labels.append('|'.join(prediction_list))

## Create submission file

Add the ids and the prediction strings to the submission dataframe. Save the dataframe as `.csv` and submit.

In [None]:
# One row per publication: its id and its '|'-joined prediction string
submission = pd.DataFrame({'Id': ids, 'PredictionString': labels})


submission

In [None]:
# Write the submission to submission.csv without the index column
submission.to_csv('submission.csv', index=False)