In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
import csv

In [2]:
train_tokens_df = pd.read_csv("../input/tagged-ner-tokens-feedback-prize/text.txt", sep="\n", header= None, names=["Tokens"], quoting=csv.QUOTE_NONE)
train_labels_df = pd.read_csv("../input/tagged-ner-tokens-feedback-prize/labels.txt", sep="\n", header= None, names=["Labels"], quoting = csv.QUOTE_NONE)

In [3]:
train_df = pd.concat([train_tokens_df, train_labels_df], axis=1)
train_df.Tokens = train_df.Tokens.str.split()
train_df.Labels = train_df.Labels.str.split()
train_df

Unnamed: 0,Tokens,Labels
0,"[I, think, we, should, be, able, to, play, in,...","[B-Position, I-Position, I-Position, I-Positio..."
1,"[Some, schools, require, summer, projects, for...","[B-Position, I-Position, I-Position, I-Positio..."
2,"[Driverless, cars, have, been, argued, and, ta...","[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
3,"[The, author, of, ""The, Challenge, of, Explori...","[B-Position, I-Position, I-Position, I-Positio..."
4,"[Wow,, from, the, mar, really, look, like, hum...","[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
...,...,...
15589,"[Electoral, college, is, a, unfar, way, to, vo...","[B-Position, I-Position, I-Position, I-Positio..."
15590,"[Driverless, cars, are, coming., Are, we, read...","[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
15591,"[People, are, surrending, their, cars., these,...","[B-Position, I-Position, I-Position, I-Positio..."
15592,"[The, use, of, the, Facial, Action, Coding, Sy...","[B-Position, I-Position, I-Position, I-Positio..."


In [4]:
from datasets import Dataset
train_dataset = Dataset.from_pandas(train_df)
train_dataset = train_dataset.train_test_split(test_size=0.1)

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("allenai/longformer-base-4096", add_prefix_space=True)

Downloading:   0%|          | 0.00/694 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

In [6]:
discourse_types = ["O", 
    "B-Position",
    "I-Position",
    "B-Evidence",
    "I-Evidence",
    "B-Counterclaim", 
    "I-Counterclaim", 
    "B-Rebuttal", 
    "I-Rebuttal", 
    "B-Claim", 
    "I-Claim", 
    "B-ConcludingStatement", 
    "I-ConcludingStatement",
    "B-Lead",
    "I-Lead",   
]

In [7]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["Tokens"], truncation=True, is_split_into_words=True, max_length=1024)

    labels = []
    for i, label in enumerate(examples["Labels"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:  # Set the special tokens to -100.
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                label_ids.append(discourse_types.index(label[word_idx]))
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [8]:
tokenized_train = train_dataset.map(tokenize_and_align_labels, batched=True)

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

In [9]:
tokenized_train

DatasetDict({
    train: Dataset({
        features: ['Tokens', 'Labels', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14034
    })
    test: Dataset({
        features: ['Tokens', 'Labels', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 1560
    })
})

In [10]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [11]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

model = AutoModelForTokenClassification.from_pretrained("allenai/longformer-base-4096", num_labels=len(discourse_types))

Downloading:   0%|          | 0.00/570M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerForTokenClassification: ['lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias']
- This IS expected if you are initializing LongformerForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of LongformerForTokenClassification were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN

In [12]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [13]:
BS = 4
GRAD_ACC = 8
LR = 5e-5
WD = 0.01
WARMUP = 0.1
N_EPOCHS = 5
OUTPUT_DIR = "./results"

In [14]:
training_args = TrainingArguments(
    evaluation_strategy = "epoch",
    logging_strategy = "epoch",
    save_strategy = "epoch",
    output_dir = OUTPUT_DIR,          
    learning_rate=LR,
    per_device_train_batch_size=BS,
    per_device_eval_batch_size=BS,
    num_train_epochs=N_EPOCHS,
    weight_decay=WD,
    report_to=None, 
    gradient_accumulation_steps=GRAD_ACC,
    warmup_ratio=WARMUP
)

Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [15]:
model = model.to("cuda:0")

In [16]:
!--report_to none

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
/bin/bash: --: invalid option
Usage:	/bin/bash [GNU long option] [option] ...
	/bin/bash [GNU long option] [option] script-file ...
GNU long options:
	--debug
	--debugger
	--dump-po-strings
	--dump-strings
	--help
	--init-file
	--login
	--noediting
	--noprofile
	--norc
	--posix
	--pretty-print
	--rcfile
	--restricted
	--verbose
	--version
Shell options:
	-ilrsD or -c command or -O shopt_option		(invocation only)
	-abefhkmnptuvxBCHP or -o option


In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train["train"],
    eval_dataset = tokenized_train["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [18]:
model.to("cuda")

LongformerForTokenClassification(
  (longformer): LongformerModel(
    (embeddings): LongformerEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(4098, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): LongformerEncoder(
      (layer): ModuleList(
        (0): LongformerLayer(
          (attention): LongformerAttention(
            (self): LongformerSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
              (value_

In [19]:
import tensorflow as tf

print("training")
with tf.device('/GPU:0'):
    trainer.train()

2022-02-11 21:48:08.081705: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
The following columns in the training set  don't have a corresponding argument in `LongformerForTokenClassification.forward` and have been ignored: Labels, Tokens.
2022-02-11 21:48:08.082869: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-11 21:48:08.083504: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:937] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-11 21:48:08.086062: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the

training


Input ids are automatically padded from 577 to 1024 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 919 to 1024 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 799 to 1024 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 835 to 1024 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 433 to 512 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 581 to 1024 to be a multiple of `config.attention_window`: 512


Epoch,Training Loss,Validation Loss
0,0.9049,0.534584
1,0.4683,0.45769
2,0.3717,0.445279
3,0.3019,0.464764
4,0.2484,0.484189


Input ids are automatically padded from 401 to 512 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 858 to 1024 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 745 to 1024 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 821 to 1024 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 598 to 1024 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 664 to 1024 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 823 to 1024 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 498 to 512 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 517 to 1024 to be a multiple of `config.attention_window`: 512
Input ids are automatically padded from 688 to 1024 to be a multiple of `co

In [20]:
trainer.save_model('token_classification_model')

Saving model checkpoint to token_classification_model
Configuration saved in token_classification_model/config.json
Model weights saved in token_classification_model/pytorch_model.bin
tokenizer config file saved in token_classification_model/tokenizer_config.json
Special tokens file saved in token_classification_model/special_tokens_map.json
