Hello Fellow Kagglers,

This notebook demonstrates the preprocessing of the data for the Feedback Prize competition by tokenizing the texts and oversampling the minority classes.

This oversampling process reduces the class imbalance and should make the model less biased towards the majority class.

[Training Notebook](https://www.kaggle.com/markwijkhuizen/training-longformer-gradient-accumulation)

Inference Notebook Coming Soon

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from transformers import LongformerTokenizer
from tqdm.notebook import tqdm
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.model_selection import train_test_split

import glob
import re

tqdm.pandas()

In [None]:
# In debug mode a subset of the training dataset is used
# (only the first rows of train.csv are kept when the file is loaded below)
DEBUG = False

# Read Train DataFrame

In [None]:
# Column Data Types, fixed at load time so every column gets a compact, explicit dtype
dtype = {
    'id': 'string',
    'discourse_id': np.uint64,
    'discourse_start': np.uint16,
    'discourse_end': np.uint16,
    'discourse_text': 'string',
    'discourse_type': 'category',
    'discourse_type_num': 'category',
    'predictionstring': 'string',
}

# Read the annotations once; in debug mode keep only the first 10,000 rows
train = pd.read_csv('/kaggle/input/feedback-prize-2021/train.csv', dtype=dtype)
if DEBUG:
    train = train.head(int(10e3))

display(train.head())

# DataFrame.info() prints its summary itself and returns None, so it is called
# directly: wrapping it in display() would add a stray "None" to the cell output
train.info()

# Word and Sentence Count

In [None]:
# Number of Annotated Words, Ignoring Double Annotated Words
def sample_ann_word_count(predictionstrings):
    """Count the distinct annotated word indices across all rows of one text.

    predictionstrings: Series of whitespace-separated word indices, one entry
    per discourse element of the same text.

    Returns a list with the distinct-index count repeated once per input row,
    which is the shape groupby(...).transform expects.
    """
    unique_word_idxs = {
        int(word_idx)
        for word_idxs in predictionstrings.str.split()
        for word_idx in word_idxs
    }

    return [len(unique_word_idxs)] * len(predictionstrings)

# Word count of each discourse element (NLTK word tokenization)
train['word_count'] = (
    train['discourse_text']
    .apply(lambda text: len(word_tokenize(text)))
    .astype(np.uint16)
)

# Annotated word count of each discourse element (number of indices in its predictionstring)
train['ann_word_count'] = (
    train['predictionstring']
    .apply(lambda pred: len(pred.split(' ')))
    .astype(np.uint16)
)

# Distinct annotated words per text, broadcast to every row of that text
train['sample_ann_word_count'] = (
    train.groupby('id')['predictionstring']
    .transform(sample_ann_word_count)
    .astype(np.uint16)
)

# Sentence count of each discourse element (NLTK sentence tokenization)
train['sentence_count'] = (
    train['discourse_text']
    .apply(lambda text: len(sent_tokenize(text)))
    .astype(np.uint16)
)

# Last word index listed in each predictionstring
train['max_word_index'] = (
    train['predictionstring']
    .apply(lambda pred: int(pred.rsplit(' ', 1)[-1]))
    .astype(np.uint16)
)

# Largest of those word indices within each text
train['sample_max_word_index'] = train.groupby('id')['max_word_index'].transform('max')

In [None]:
# Text ID to Word Count
def id2sample_word_count(text_ids):
    """Return the whitespace-delimited word count of one text, repeated per row.

    text_ids: Series of text ids for one group; only the first value is used
    to locate the text file.

    Returns a list holding the word count once for every entry of text_ids.
    """
    path = f'/kaggle/input/feedback-prize-2021/train/{text_ids.values[0]}.txt'
    with open(path, 'r') as f:
        n_words = len(f.read().split())

    return [n_words for _ in range(len(text_ids))]

In [None]:
# Total whitespace-delimited words of each text, broadcast to all of its rows
train['sample_word_count'] = (
    train.groupby('id')['id']
    .transform(id2sample_word_count)
    .astype(np.uint16)
)

# Share of a text's words that carry at least one annotation
train['ann_ratio'] = (
    train['sample_ann_word_count']
    .div(train['sample_word_count'])
    .astype(np.float32)
)

In [None]:
# Preview the first rows, now including the added count and ratio columns
display(train.head(10))

# Discourse Class Distribution

The discourse types are unbalanced, with occurrences ranging from 2.7% to 33.2%.

In [None]:
# Discourse Type Distribution: pie chart of the number of rows (discourse elements) per discourse type
plt.figure(figsize=(10,10))
train.groupby('discourse_type')['discourse_type'].count().plot(kind='pie', autopct='%1.1f%%', textprops={'fontsize': 16}, startangle=0)
plt.title('Discourse Type Distribution', size=24)
# Blank out the y-axis label
plt.ylabel('')
# End the cell on a statement so no return value is echoed below the figure
pass

When looking at the number of annotated words per discourse type, the imbalance becomes even more severe, with occurrences ranging from 1.7% to 57.5%.

In [None]:
# Discourse Type Annotated Words Distribution: pie chart of the summed annotated word count per discourse type
plt.figure(figsize=(10,10))
train.groupby('discourse_type')['ann_word_count'].sum().plot(kind='pie', autopct='%1.1f%%', textprops={'fontsize': 16}, startangle=0)
plt.title('Discourse Type Annotated Words Distribution', size=24)
# Blank out the y-axis label
plt.ylabel('')
# End the cell on a statement so no return value is echoed below the figure
pass

In [None]:
# The annotated word count imbalance can be explained by the difference in discourse type size:
# summary statistics of the annotated word count per discourse element, grouped by discourse type
display(train.groupby(['discourse_type'])['ann_word_count'].describe())

# Annotation Count

In [None]:
# Word Count Distribution: histogram of the per-text sum of discourse-element word counts
plt.figure(figsize=(15, 8))
train.groupby('id')['word_count'].sum().plot(kind='hist', bins=32)
plt.title('Word Count Distribution', size=24)
plt.xlabel('Word Count', size=18)
plt.ylabel('Frequency', size=18)
plt.xticks(size=16)
plt.yticks(size=16)
plt.grid()
# End the cell on a statement so no return value is echoed below the figure
pass

In [None]:
# Text Annotated Word Count Distribution: histogram of the per-text sum of annotated word counts
plt.figure(figsize=(15, 8))
train.groupby('id')['ann_word_count'].sum().plot(kind='hist', bins=32)
plt.title('Text Annotated Word Count Distribution', size=24)
plt.xlabel('Text Annotated Word Count', size=18)
plt.ylabel('Frequency', size=18)
plt.xticks(size=16)
plt.yticks(size=16)
plt.grid()
# End the cell on a statement so no return value is echoed below the figure
pass

In [None]:
# Sample Word Count Distribution: histogram of the whole-text word count (one value per text id)
plt.figure(figsize=(15, 8))
train.groupby('id')['sample_word_count'].first().plot(kind='hist', bins=32)
plt.title('Sample Word Count Distribution', size=24)
plt.xlabel('Sample Word Count', size=18)
plt.ylabel('Frequency', size=18)
plt.xticks(size=16)
plt.yticks(size=16)
plt.grid()
# End the cell on a statement so no return value is echoed below the figure
pass

In [None]:
# Sentence Count Distribution: histogram of the per-text sum of discourse-element sentence counts
plt.figure(figsize=(15, 8))
train.groupby('id')['sentence_count'].sum().plot(kind='hist', bins=32)
plt.title('Sentence Count Distribution', size=24)
plt.xlabel('Sentence Count', size=18)
plt.ylabel('Frequency', size=18)
plt.xticks(size=16)
plt.yticks(size=16)
plt.grid()
# End the cell on a statement so no return value is echoed below the figure
pass

# Annotation Ratio

Most texts have an annotation ratio (the fraction of words in a text that are annotated) close to 100%. A handful of texts, however, have fewer than 70% of their words annotated; these will be filtered out.

In [None]:
# Annotation Ratio Distribution: histogram of the annotated-word fraction, one value per text id
plt.figure(figsize=(20, 8))
train.groupby('id')['ann_ratio'].first().plot(kind='hist', bins=32)
plt.title('Annotation Ratio', size=24)
# Axis label typo fixed ("Ration" -> "Ratio")
plt.xlabel('Annotation Ratio', size=18)
plt.ylabel('Frequency', size=18)
# Ticks every 5 percentage points from 0.0 to 1.0
plt.xticks([i * 0.05 for i in range(21)], size=14)
plt.yticks(size=16)
plt.grid()
# End the cell on a statement so no return value is echoed below the figure
pass

# Longformer Tokenizer

In [None]:
# Input Sequence Length: default length every text is padded/truncated to by tokenize(),
# and the width of the saved token and label arrays
SEQ_LENGTH= 4096

In [None]:
# Load the Tokenizer (pretrained Longformer tokenizer, looked up by model name)
tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-large-4096')

# Save Tokenizer: write a copy of the tokenizer files to ./tokenizer/
tokenizer.save_pretrained('./tokenizer/')

In [None]:
# Encode a text into input ids with the module-level transformers tokenizer
def tokenize(excerpt, padding='max_length', max_length=SEQ_LENGTH):
    """Tokenize `excerpt` and return its input ids as an int32 NumPy array.

    excerpt: text handed to the tokenizer.
    padding: padding strategy forwarded to the tokenizer ('max_length' by default).
    max_length: length limit forwarded to the tokenizer; truncation is always on.
    """
    input_ids = tokenizer.encode_plus(
        excerpt,
        padding=padding,
        truncation=True,
        max_length=max_length,
    )['input_ids']

    return np.array(input_ids, dtype=np.int32)

# Token Length

In [None]:
def get_token_lengths(text_ids):
    """Return the unpadded, untruncated token length of one text, repeated per row.

    text_ids: Series of text ids for one group; only the first value is used
    to locate the text file.

    Returns a list holding the token length once for every entry of text_ids.

    The raw text string is tokenized, exactly as is done when the token arrays
    are built. Passing a list of whitespace-split words instead makes the
    tokenizer treat every list element as one ready-made token, so the result
    would be a chunk count (plus special tokens), not the subword token length.
    """
    text_id = text_ids.values[0]
    # Read Text File
    with open(f'/kaggle/input/feedback-prize-2021/train/{text_id}.txt', 'r') as f:
        text = f.read()

    # Tokenize Text without padding; an infinite max_length keeps truncation from
    # capping the measured length (np.inf replaces np.PINF, removed in NumPy 2.0)
    text_encoded = tokenize(text, padding='do_not_pad', max_length=np.inf)

    return [len(text_encoded)] * len(text_ids)

# Get Token Length
train['token_len'] = train.groupby('id')['id'].progress_transform(get_token_lengths).astype(np.uint16)

In [None]:
# Token Length Distribution: histogram of the token length, one value per text id
plt.figure(figsize=(15, 8))
train.groupby('id')['token_len'].first().plot(kind='hist', bins=64)
plt.title('Token Length Distribution', size=24)
plt.xlabel('Token Length', size=18)
plt.ylabel('Frequency', size=18)
plt.xticks(size=16)
plt.yticks(size=16)
plt.grid()
# End the cell on a statement so no return value is echoed below the figure
pass

# Filter Samples

In [None]:
# Lowest annotation ratio a text may have and still be kept
ANN_RATIO_THRESHOLD = 0.70

# Number of distinct text ids before filtering
ALL_TEXT_IDS = train['id'].nunique()

# Rows to discard: meeting any one of the three conditions is enough
drop_idxs = train.loc[
    train['token_len'].gt(SEQ_LENGTH)               # token length exceeds the sequence length
    | train['ann_ratio'].lt(ANN_RATIO_THRESHOLD)    # annotation ratio under the threshold
    | train['max_word_index'].ge(SEQ_LENGTH)        # last word index does not fit in the sequence
].index
train = train.drop(index=drop_idxs).reset_index(drop=True)

# Text ids that survived the filter
VALID_TEXT_IDS = list(train['id'].unique())
N_VALID_TEXT_IDS = len(VALID_TEXT_IDS)
print(f'{N_VALID_TEXT_IDS} valid text ids out of {ALL_TEXT_IDS} text ids')

# Train/Validation Split

In [None]:
# We will be using 1024 Validation Samples
# The split is made over unique text ids, so all rows of a text end up on the same side;
# random_state is fixed so the same split is produced on every run
N_VAL = 1024
X_train_ids, X_val_ids = train_test_split(train['id'].unique(), test_size=N_VAL, random_state=42)
print(f'X_train_ids shape: {X_train_ids.shape}, X_val_ids shape: {X_val_ids.shape}')

In [None]:
# Turn the split results into plain Python lists (the training list is appended to during oversampling)
X_train_ids = list(X_train_ids)
X_val_ids = list(X_val_ids)

# Discourse types present in the filtered data
DISCOURSE_TYPES = train['discourse_type'].unique().tolist()

# Text id -> the unique discourse types annotated in that text
id2discourse_types_dict = train.groupby('id')['discourse_type'].unique().to_dict()

# Oversampling

In [None]:
# Discourse Type Count
def train_ids2discourse_type_counts():
    """Count, per discourse type, how many entries of X_train_ids contain that type.

    Ids that occur several times in X_train_ids are counted once per occurrence.
    Returns a Series indexed by discourse type, sorted by count in ascending order.
    """
    counts = dict.fromkeys(DISCOURSE_TYPES, 0)

    for text_id in X_train_ids:
        for discourse_type in id2discourse_types_dict[text_id]:
            counts[discourse_type] += 1

    return pd.Series(counts).sort_values()

# Counts before oversampling, kept as the baseline for the before/after statistics
discourse_type_counts0 = train_ids2discourse_type_counts()

# Discourse Type Count
display(discourse_type_counts0.sort_index())

In [None]:
# Fill All Classes to the Majority Class
# DISCOURSE_TYPES is re-ordered here from least to most frequent (the counts Series is sorted ascending)
DISCOURSE_TYPES = discourse_type_counts0.index.tolist()
# Target count for every discourse type: the count of the most frequent type
FILL_TO = max(discourse_type_counts0)

print(f'DISCOURSE_TYPES: {DISCOURSE_TYPES}, FILL_TO: {FILL_TO}')

In [None]:
# Oversample to Maximum Sample Count
# Seeded generator so the oversampled training set is the same on every run
rng = np.random.default_rng(42)

for dt in tqdm(DISCOURSE_TYPES):
    # Get current Discourse Type Count (includes duplicates added for earlier types)
    discourse_type_counts = train_ids2discourse_type_counts()
    samples_discourse_type = discourse_type_counts[dt]
    if samples_discourse_type >= FILL_TO:
        continue

    # Entries of X_train_ids (with multiplicity) that contain this discourse type.
    # Drawing uniformly from this list matches drawing from X_train_ids and
    # rejecting ids without the type, but without the wasted draws.
    candidate_ids = [text_id for text_id in X_train_ids if dt in id2discourse_types_dict[text_id]]
    if not candidate_ids:
        # Nothing to duplicate; rejection sampling would never terminate here
        print(f'No training sample contains discourse type {dt}, skipping oversampling')
        continue

    while samples_discourse_type < FILL_TO:
        # Take Random ID
        random_id = candidate_ids[rng.integers(len(candidate_ids))]
        X_train_ids.append(random_id)
        # Keep the candidate pool in step with X_train_ids so duplicates can be drawn again
        candidate_ids.append(random_id)
        samples_discourse_type += 1

# Counts after all oversampling, used by the before/after statistics
discourse_type_counts = train_ids2discourse_type_counts()

# Oversample Statistics

In [None]:
# Counts per discourse type before and after oversampling, then the relative growth
print('=== BEFORE ===')
display(discourse_type_counts0.sort_index().to_frame())
print('=== AFTER ===')
display(discourse_type_counts.sort_index().to_frame())
print('=== DIFFERENCE PERCENTAGE ===')
# Growth relative to the original count, truncated to a whole percentage and shown as text
percentual_increase = (
    (discourse_type_counts - discourse_type_counts0)
    .div(discourse_type_counts0)
    .mul(100)
    .map(lambda pct: f'{int(pct)}%')
    .to_frame(name='Percentage Increase')
)
display(percentual_increase.sort_index())

# Create Tokens

In [None]:
# Sample counts that size the token and label arrays
# (the training list includes the ids duplicated by oversampling)
N_TRAIN_SAMPLES = len(X_train_ids)
N_VAL_SAMPLES = len(X_val_ids)

print(f'N_TRAIN_SAMPLES: {N_TRAIN_SAMPLES}, N_VAL_SAMPLES: {N_VAL_SAMPLES}')

In [None]:
# Create Train Directory
!rm -rf train val
!mkdir train val

# Train Tokens

In [None]:
# One row of SEQ_LENGTH token ids per training sample; every row is filled by the loop below
train_tokens = np.empty(shape=(N_TRAIN_SAMPLES, SEQ_LENGTH), dtype=np.uint16)

# === TRAIN ===
for idx, text_id in enumerate(tqdm(X_train_ids)):
    # Read the text and store its padded/truncated token ids in row idx
    with open(f'/kaggle/input/feedback-prize-2021/train/{text_id}.txt', 'r') as f:
        train_tokens[idx] = tokenize(f.read())

# Persist the token matrix as a NumPy file
np.save('train/train_tokens.npy', train_tokens)

# Validation Tokens

In [None]:
# One row of SEQ_LENGTH token ids per validation sample; every row is filled by the loop below
val_tokens = np.empty(shape=(N_VAL_SAMPLES, SEQ_LENGTH), dtype=np.uint16)

# === VALIDATION ===
for idx, text_id in enumerate(tqdm(X_val_ids)):
    # Read the text and store its padded/truncated token ids in row idx
    with open(f'/kaggle/input/feedback-prize-2021/train/{text_id}.txt', 'r') as f:
        val_tokens[idx] = tokenize(f.read())

# Persist the token matrix as a NumPy file
np.save('val/val_tokens.npy', val_tokens)

# Labels

In [None]:
# Lookup table: text id -> token length (every row of a text carries the same value)
ID2TOKEN_LEN = train.set_index('id')['token_len'].to_dict()

In [None]:
# Get all labels sorted for reproducibility
# A discourse type's position in this list is the integer class id written to the label arrays
LABELS = train['discourse_type'].unique().sort_values().tolist()
# Add extra non-annotated and padding label
N_LABELS = len(LABELS) + 2
# Not Annotated Class: id directly after the last discourse type
NA_CLASS = len(LABELS)
# Padding Class: last id, used for positions at or beyond the text's token length
PAD_CLASS = len(LABELS) + 1

print(f'N_LABELS: {N_LABELS}, NA_CLASS: {NA_CLASS}, PAD_CLASS: {PAD_CLASS}')
print(f'LABELS: {LABELS}')

In [None]:
# Text Id to Label
def id2label(text_id):
    """Build the per-position class-id vector of length SEQ_LENGTH for one text.

    Every position starts as NA_CLASS. For each annotation row of the text, the
    positions listed in its predictionstring receive that row's discourse-type
    id (its index in LABELS); positions already claimed by an earlier row keep
    their first label. Finally, every position from the text's token length
    onwards is set to PAD_CLASS.

    Returns an int8 array of shape (SEQ_LENGTH,).
    """
    annotations = train[train['id'] == text_id]

    labels = np.full(fill_value=NA_CLASS, shape=SEQ_LENGTH, dtype=np.int8)
    # True for positions that already received a discourse label
    claimed = np.zeros(SEQ_LENGTH, dtype=bool)

    for discourse_type, predictionstring in zip(annotations['discourse_type'], annotations['predictionstring']):
        word_idxs = np.array(predictionstring.split(' '), dtype=np.int16)
        # Only positions not labelled by an earlier row are written (first label wins)
        fresh_idxs = word_idxs[~claimed[word_idxs]]
        labels[fresh_idxs] = LABELS.index(discourse_type)
        claimed[fresh_idxs] = True

    # Everything at or beyond the token length is padding
    labels[ID2TOKEN_LEN[text_id]:] = PAD_CLASS

    return labels

In [None]:
# === TRAIN ===
# One row of SEQ_LENGTH class ids per training sample (oversampled ids get one row per occurrence)
train_labels = np.zeros(shape=(N_TRAIN_SAMPLES, SEQ_LENGTH), dtype=np.int8)
print(f'train_labels shape: {train_labels.shape}')

# Generate Labels: rows follow the order of X_train_ids
for idx, text_id in enumerate(tqdm(X_train_ids)):
    train_labels[idx] = id2label(text_id)
    
# Save Train Labels as Numpy Array
np.save('train/train_labels.npy', train_labels)

In [None]:
# === VALIDATION ===
# One row of SEQ_LENGTH class ids per validation sample
val_labels = np.zeros(shape=(N_VAL_SAMPLES, SEQ_LENGTH), dtype=np.int8)
print(f'val_labels shape: {val_labels.shape}')

# Generate Labels: rows follow the order of X_val_ids
for idx, text_id in enumerate(tqdm(X_val_ids)):
    val_labels[idx] = id2label(text_id)
    
# Save Val Labels as Numpy Array
np.save('val/val_labels.npy', val_labels)

# Attention Mask 

In [None]:
# === TRAIN ===
# Attention mask derived from the labels: 1 where the label is not the padding class, 0 where it is
train_attention_masks = (train_labels != PAD_CLASS).astype(np.int8)
print(f'train_attention_masks shape: {train_attention_masks.shape}')
    
# Save as Numpy Array
np.save('train/train_attention_masks.npy', train_attention_masks)

In [None]:
# === VALIDATION ===
# Attention mask derived from the labels: 1 where the label is not the padding class, 0 where it is
val_attention_masks = (val_labels != PAD_CLASS).astype(np.int8)
print(f'val_attention_masks shape: {val_attention_masks.shape}')

# Save as Numpy Array
np.save('val/val_attention_masks.npy', val_attention_masks)