In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import json
import glob
import tqdm
import spacy

from transformers import RobertaTokenizer
import tokenizers

# Define stop words
# Load spaCy's small English pipeline; in the cells shown here it is only
# used as the source of the default stop-word collection.
sp = spacy.load('en_core_web_sm')
# Collection of stop-word strings; used below for `word not in all_stopwords`
# membership tests on lower-cased, whitespace-split tokens.
all_stopwords = sp.Defaults.stop_words

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# Gathering Data From Large Texts

The goal of this competition is to build a model that can automatically extract the name of the dataset associated with each publication in `train.csv`. One of the first things we should look at is the text itself. The size of the texts will make this competition challenging, since models like BERT only allow a maximum of 512 tokens. We will have to get creative, or we can check out models like the Longformer (https://huggingface.co/transformers/model_doc/longformer.html). Here is the official paper on the Longformer (https://arxiv.org/pdf/2004.05150.pdf).

In [None]:
# Read the competition's training labels; the `cleaned_label` column holds
# the cleaned dataset names used throughout this notebook
train_csv_path = '/kaggle/input/coleridgeinitiative-show-us-the-data/train.csv'
train_df = pd.read_csv(train_csv_path)

In [None]:
# For every training publication:
#   1. count its words, with and without stop words, across all sections, and
#   2. record the FIRST section in which the publication's dataset name appears
#      (the name is the first `cleaned_label` listed for that Id in train.csv).
#
# Each tuple appended to `text_list` is
#   (section index, match start offset, match end offset, total words,
#    total non-stop words, matched dataset name, section text without stop words)
# Publications whose dataset name is not found in any section contribute no tuple.

# Build the Id -> first cleaned label lookup once instead of filtering
# train_df for every file inside the loop.
# For some files there is more than one cleaned dataset name; only the first
# one (in train.csv row order) is used.
first_label_by_id = (
    train_df
    .drop_duplicates(subset='Id', keep='first')
    .set_index('Id')['cleaned_label']
    .to_dict()
)

text_list = []

train_paths = glob.glob('/kaggle/input/coleridgeinitiative-show-us-the-data/train/*')

for json_path in tqdm.tqdm(train_paths):
    # The file name without its extension is the publication Id
    filename = os.path.basename(json_path).split('.')[0]

    # Raises KeyError if the file has no row in train.csv
    dataset_name = first_label_by_id[filename]

    # Open the text data: a list of {'section_title': ..., 'text': ...} objects
    with open(json_path, 'rb') as f:
        sections = json.load(f)

    # Count the words over the whole publication
    number_of_words = 0
    number_of_words_no_stop = 0
    for section in sections:
        words = section['text'].lower().split()
        number_of_words += len(words)
        number_of_words_no_stop += sum(word not in all_stopwords for word in words)

    # Extract the first section where the dataset name is mentioned.
    # NOTE(review): the search is a plain substring match on the lower-cased
    # text; if `cleaned_label` was normalised beyond lower-casing, some
    # mentions may be missed -- confirm against how train.csv was built.
    for idx, section in enumerate(sections):
        text = section['text'].lower()

        start_index = text.find(dataset_name)
        if start_index == -1:
            continue

        end_index = start_index + len(dataset_name)
        matched_name = text[start_index:end_index]

        # Only build the stop-word-free text for the section we actually keep
        text_no_stop = ' '.join(
            word for word in text.split() if word not in all_stopwords
        )

        text_list.append(
            (
                idx,
                start_index,
                end_index,
                number_of_words,
                number_of_words_no_stop,
                matched_name,
                text_no_stop,
            )
        )

        # Stop at the first mention. The original trailing `continue` was a
        # no-op, so every matching section was being recorded.
        break

In [None]:
# Turn the collected tuples into a DataFrame; the names follow the tuple order
extracted_columns = [
    'section',
    'start_index',
    'end_index',
    'number_of_words',
    'number_of_words_no_stop',
    'dataset_name',
    'text',
]
extracted_data = pd.DataFrame(data=text_list, columns=extracted_columns)

In [None]:
# Preview the first rows of the extracted matches
extracted_data.head()

# Sections

Each text is a list of JSON objects, and each object has two keys: `section_title` & `text`. For a quick visualization of which sections the dataset name shows up in, we can look at a bar plot of the value counts.

In [None]:
# Number of extracted rows per section index (the 'section' column holds the
# section's position within its file, as produced by `enumerate`)
section_counts = extracted_data['section'].value_counts()

In [None]:
# Bar chart restricted to the section indices that occur more than 100 times
frequent_sections = section_counts[section_counts > 100]
ax = frequent_sections.plot.bar(figsize=(10, 7))

ax.set(
    title='Common Sections That Mention Dataset Name',
    xlabel='Section Number',
    ylabel='Counts',
)

ax.grid()

There are some texts with quite a large number of sections, as identified by the JSON. We see that there is a wide range of sections in which the dataset name is defined or mentioned for the first time. The idea here was to see if we could truncate the texts to be shorter. It looks like it could be possible, but it will take some experimentation and creativity. For example, we could possibly build a two-stage model: one model that goes through each section and predicts whether the dataset will be mentioned based on the text (binary classification), and then another model that predicts the start and end index of the dataset name.

From the plot above we see that the dataset name of interest is most commonly found in section 1.

In [None]:
# Overlay two histograms of the log word count per text:
# all words (default colour) and words with stop words omitted (orange).
# Each series gets a label so the legend tells the two distributions apart.
ax = np.log(extracted_data['number_of_words']).hist(
    figsize=(10, 7), alpha=0.6, bins=100, label='All words'
)

# Draw the stop-word-free distribution on the same axes
np.log(extracted_data['number_of_words_no_stop']).hist(
    alpha=0.6, bins=100, color='orange', label='Stop words omitted', ax=ax
)

ax.set_title('Histogram For Number of Words in Text (Log)')
ax.set_xlabel('Log Number of Words')
ax.set_ylabel('Counts')
ax.legend();

In [None]:
# Look at the head of the train.csv
train_df.head()

Since we already have it defined, we can also take a quick look at some of the stats of the publication titles and the cleaned labels from the `train.csv`.

In [None]:
def _word_count(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())


# Length in words of the publication title and of the cleaned label
train_df['pub_title_length_by_word'] = train_df['pub_title'].map(_word_count)
train_df['cleaned_label_length_by_word'] = train_df['cleaned_label'].map(_word_count)

# Length in characters of the same two columns
train_df['pub_title_length'] = train_df['pub_title'].map(len)
train_df['cleaned_label_length'] = train_df['cleaned_label'].map(len)

In [None]:
# Distribution of publication title lengths, measured in words
ax = train_df['pub_title_length_by_word'].hist(figsize=(10, 7), bins=10)
ax.set_title('Histogram of Publication Title Lengths by Word')
ax.set_xlabel('Words per Title')
ax.set_ylabel('Counts');

In [None]:
# Distribution of publication title lengths, measured in characters
ax = train_df['pub_title_length'].hist(figsize=(10, 7), bins=10)
ax.set_title('Histogram of Publication Title Lengths')
ax.set_xlabel('Characters per Title')
ax.set_ylabel('Counts');

In [None]:
# Distribution of cleaned label lengths, measured in words
ax = train_df['cleaned_label_length_by_word'].hist(figsize=(10, 7), bins=10)
ax.set_title('Histogram of Cleaned Label Lengths by Word')
ax.set_xlabel('Words per Label')
ax.set_ylabel('Counts');

In [None]:
# Distribution of cleaned label lengths, measured in characters
ax = train_df['cleaned_label_length'].hist(figsize=(10, 7), bins=10)
ax.set_title('Histogram of Cleaned Label Lengths')
ax.set_xlabel('Characters per Label')
ax.set_ylabel('Counts');

If we want to train a model, the next thing we will have to think about is tokenization of the text. I will be importing `tokenizers`, which can be found here (https://pypi.org/project/tokenizers/). We will take a look at a simple example of what the `ByteLevelBPETokenizer` looks like and how we can use it on our text data. For a better understanding of BPE (Byte Pair Encoding), this article may be useful (https://towardsdatascience.com/byte-pair-encoding-the-dark-horse-of-modern-nlp-eb36c7df4f10).

First let's look at a simple example:

In [None]:
# Load in the data for the tokenizer
# NOTE(review): assumes a `roberta-base` dataset providing vocab.json and
# merges.txt is attached under ../input -- confirm in the notebook's inputs.
PATH = '../input/roberta-base'
vocab_file = os.path.join(PATH, 'vocab.json')
merges_file = os.path.join(PATH, 'merges.txt')

# Build a byte-level BPE tokenizer from the vocabulary and merges files.
# NOTE(review): passing file paths via `vocab=` / `merges=` depends on the
# installed `tokenizers` version -- confirm against the pinned version.
tokenizer = tokenizers.ByteLevelBPETokenizer(
    vocab=vocab_file, 
    merges=merges_file, 
    lowercase=True,
    add_prefix_space=True
)

# One thing we want to do before feeding our sequences into a model like RoBERTa is adding special tokens to the tokenizer
tokenizer.add_special_tokens(['<s>', '</s>', '<pad>', '<mask>'])

In [None]:
# Encode a toy sentence containing three of the special tokens registered on
# the tokenizer ('<s>', '</s>', '<pad>') so we can inspect the result
encoding = tokenizer.encode('<s> Hello World!</s><pad>')

In [None]:
# Look at the encoded tokens
encoding.tokens

In [None]:
# Look at the offsets provided by the tokenizer
encoding.offsets

In [None]:
# Look at the attention mask provided by the tokenizer
encoding.attention_mask

In [None]:
# Look at the ids provided by the tokenizer for the toy sentence
encoding.ids

In [None]:
# Encode the stop-word-free section text of the row labelled 0 in
# `extracted_data` (its 'text' column holds the text with stop words removed)
encoding = tokenizer.encode(extracted_data['text'][0])

In [None]:
# Look at the tokens
encoding.tokens[:10]

In [None]:
# Look at the ids
encoding.ids[:10]

There is certainly a lot more work to be done here. We should probably remove punctuation & try to clean up the text a bit more before we tokenize and input into a model.