In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
from tqdm import tqdm
import nltk
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [2]:
discourse_types = ["O", 
    "B-Position",
    "I-Position",
    "B-Evidence",
    "I-Evidence",
    "B-Counterclaim", 
    "I-Counterclaim", 
    "B-Rebuttal", 
    "I-Rebuttal", 
    "B-Claim", 
    "I-Claim", 
    "B-ConcludingStatement", 
    "I-ConcludingStatement",
    "B-Lead",
    "I-Lead",   
]

In [3]:
# Config
batch_size = 1
min_tokens = 5
tok_checkpoint = '../input/feedback-prize-huggingface-baseline-training/longformer-base-4096-4'
model_checkpoint = '../input/feedback-prize-huggingface-baseline-training/longformer-base-4096-4/pytorch_model.bin'

In [4]:
from transformers import AutoTokenizer
    
tokenizer = AutoTokenizer.from_pretrained(tok_checkpoint, add_prefix_space=True)

In [5]:
# Load model
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
import torch

model = AutoModelForTokenClassification.from_pretrained(tok_checkpoint, num_labels=len(discourse_types))

model.load_state_dict(torch.load(model_checkpoint))
model.eval()

LongformerForTokenClassification(
  (longformer): LongformerModel(
    (embeddings): LongformerEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(4098, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): LongformerEncoder(
      (layer): ModuleList(
        (0): LongformerLayer(
          (attention): LongformerAttention(
            (self): LongformerSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (query_global): Linear(in_features=768, out_features=768, bias=True)
              (key_global): Linear(in_features=768, out_features=768, bias=True)
              (value_

In [6]:
from transformers import DataCollatorForTokenClassification

data_collator = DataCollatorForTokenClassification(tokenizer)

In [7]:
trainer = Trainer(
    model,
    data_collator=data_collator,
    tokenizer=tokenizer
)

In [8]:
TEST_DIR = f"../input/feedback-prize-2021/test"

In [9]:
def create_tokens():
    tokenized_text_file = open("text.txt", "w")

    test_dir = f"../input/feedback-prize-2021/test"

    for filename in tqdm(os.listdir(test_dir)): #Loop through all the files in the training directory
        file_path = os.path.join(test_dir, filename) #Get file path

        # checking if it is a txt file
        if os.path.isfile(file_path) and os.path.splitext(file_path)[1] == ".txt":
            
            with open (file_path) as f:
                file_text = f.read()

            token_list = []
            
            text_tokens = file_text.split() #Split into word tokens

            for token in text_tokens:
                token_list.append(token)

            tokenized_text_file.write(" ".join(token_list) + "\n") #Separating each token with a space and each essay with a newline

    tokenized_text_file.close()

In [10]:
create_tokens()

100%|██████████| 5/5 [00:00<00:00, 303.75it/s]


In [11]:
import csv

test_tokens_df = pd.read_csv("text.txt", sep="\n", header= None, names=["Tokens"], quoting= csv.QUOTE_NONE)
test_tokens_df.Tokens = test_tokens_df.Tokens.str.split()

In [12]:
from datasets import Dataset
test_dataset = Dataset.from_pandas(test_tokens_df)

In [13]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["Tokens"], truncation=True, is_split_into_words=True)

    return tokenized_inputs

In [14]:
tokenized_test = test_dataset.map(tokenize_and_align_labels, batched=True)
tokenized_test

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['Tokens', 'input_ids', 'attention_mask'],
    num_rows: 5
})

In [15]:
predictions, _, _ = trainer.predict(tokenized_test)

The following columns in the test set  don't have a corresponding argument in `LongformerForTokenClassification.forward` and have been ignored: Tokens.
***** Running Prediction *****
  Num examples = 5
  Batch size = 8
Input ids are automatically padded from 1292 to 1536 to be a multiple of `config.attention_window`: 512


In [16]:
preds = np.argmax(predictions, axis=-1)

In [17]:
def get_discourse_type(idx):
    discourse_type = discourse_types[int(idx)]
    if discourse_type != "O":
        discourse_type = discourse_type[2:] #Slice to remove IOB TAG
        if discourse_type == "ConcludingStatement":
            discourse_type = "Concluding Statement"
    
    return discourse_type


In [18]:
predictionstring_list = []
discourse_type_list = []
text_id_list = []

previous_discourse_type = ""
file_list = os.listdir(TEST_DIR)

for text_idx, text_preds in enumerate(preds):
    current_discourse_idx_list = []
    for i, discourse_pred in enumerate(text_preds):
        discourse_type = get_discourse_type(discourse_pred)
        
        if discourse_type != previous_discourse_type and i > 0:
            if len(current_discourse_idx_list) > 5:
                text_id_list.append(os.path.splitext(file_list[text_idx])[0])
                predictionstring_list.append(" ".join(current_discourse_idx_list))
                discourse_type_list.append(previous_discourse_type)
            
            current_discourse_idx_list = []
        
        current_discourse_idx_list.append(str(i))
        
        previous_discourse_type = discourse_type

In [19]:
fixed_id_list = []
fixed_discourse_type_list = []
fixed_predictionstring_list = []

for i, x in enumerate(discourse_type_list):
    
    if (text_id_list[i] == text_id_list[i-1]) and (discourse_type_list[i] == discourse_type_list[i-1]):
        fixed_predictionstring_list[-1] = fixed_predictionstring_list[-1] + predictionstring_list[i]
    else:
        fixed_id_list.append(text_id_list[i])
        fixed_discourse_type_list.append(discourse_type_list[i])
        fixed_predictionstring_list.append(predictionstring_list[i])
        

In [20]:
submission_df = pd.DataFrame({
    "id" : fixed_id_list,
    "class" : fixed_discourse_type_list,
    "predictionstring" : fixed_predictionstring_list
})

In [21]:
submission_df.head()

Unnamed: 0,id,class,predictionstring
0,0FB0700DAF44,Rebuttal,2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 ...
1,0FB0700DAF44,Claim,57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 7...
2,0FB0700DAF44,Lead,125 126 127 128 129 130 131 132 133 134 135 13...
3,0FB0700DAF44,Claim,140 141 142 143 144 145 146 147 148 149 150 15...
4,0FB0700DAF44,Concluding Statement,644 645 646 647 648 649 650 651 652 653 654 65...


In [22]:
submission_df.to_csv("submission.csv", index=False)