In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Full path of every file found anywhere below the Kaggle input directory.
paths = [
    os.path.join(folder, file_name)
    for folder, _subfolders, file_names in os.walk('/kaggle/input')
    for file_name in file_names
]

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import os
# Show what is currently stored in the notebook's working (output) directory.
os.listdir('/kaggle/working')

In [None]:
# Peek at the first ten input files that were discovered.
paths[:10]

In [None]:
# Directories with the competition's train and test text files.
TRAIN_PATH = '/kaggle/input/feedback-prize-2021/train/'
TEST_PATH = '/kaggle/input/feedback-prize-2021/test/'

In [None]:
# os.listdir returns entries in arbitrary order; sort them so the essay order
# (and everything derived from it, such as dataset row order) is the same on every run.
train_paths = [os.path.join(TRAIN_PATH, file_name) for file_name in sorted(os.listdir(TRAIN_PATH))]
train_paths[:4]

In [None]:
# Sorted for a deterministic order (os.listdir makes no ordering guarantee).
test_paths = [os.path.join(TEST_PATH, file_name) for file_name in sorted(os.listdir(TEST_PATH))]
# Preview only a few entries instead of dumping the whole list into the output.
test_paths[:4]

In [None]:
# Let pandas open the file itself with an explicit encoding, so decoding does not
# depend on the locale default that a bare open() would use.
train_csv = pd.read_csv('/kaggle/input/feedback-prize-2021/train.csv', encoding='utf-8')
train_csv.head(5)

Load the essays from the .txt files, split each one into whitespace-separated tokens, and use the file name (without its extension) as the essay id.


In [None]:
def load_texts(paths):
    """Read text files and split each one into whitespace-separated tokens.

    Args:
        paths: iterable of paths to text files.

    Returns:
        dict with two parallel lists: ``id`` (the file name up to its first
        dot) and ``tokens`` (the result of ``str.split()`` on the file's
        contents). Both lists are empty when ``paths`` is empty.
    """
    ids, tokens = [], []
    for file_name in paths:
        # Explicit UTF-8 so decoding does not depend on the machine's locale.
        with open(file_name, 'r', encoding='utf-8') as f:
            text = f.read()
        # basename copes with the platform's path separator; the id is
        # everything before the first dot of the file name.
        ids.append(os.path.basename(file_name).split('.')[0])
        tokens.append(text.split())

    return dict(
        id = ids,
        tokens = tokens
    )
    

In [None]:
# Read and tokenize every train and test essay; each result is a dict of
# parallel "id" / "tokens" lists.
train_texts = load_texts(train_paths)
test_texts = load_texts(test_paths)

In [None]:
# Reuse the texts that were already loaded instead of reading every file from
# disk a second time; the result has the same train-then-test order.
all_texts = {key: train_texts[key] + test_texts[key] for key in ('id', 'tokens')}
# Lookup table from essay id to its token list.
id_to_tokens = dict(zip(all_texts['id'], all_texts['tokens']))

In [None]:
# One row per essay, with "id" and "tokens" columns.
train_text_df = pd.DataFrame(train_texts)
test_text_df = pd.DataFrame(test_texts)

In [None]:
# Quick look at the first few tokenized essays.
train_text_df.head()

Create the list of labels: `0` means no label assigned, and every discourse type gets a start (`B-`) class and a continue (`I-`) class. The separate start class is needed to tell apart, for example, several claims in a row.

In [None]:
# Discourse types in order of first appearance in the annotations.
classes = train_csv.discourse_type.unique().tolist()
# "0" (no label) comes first, then a B-/I- pair per discourse type, so each
# I- id is exactly one above its B- id.
label_list = ["0"] + [
    f'{prefix}-{discourse_type}'
    for discourse_type in classes
    for prefix in ('B', 'I')
]
# Reverse lookup: tag string -> integer class id.
tags_to_classes = {tag: class_id for class_id, tag in enumerate(label_list)}
label_list

In [None]:
# Number of whitespace tokens in the essay each annotation row belongs to.
train_csv['len'] = train_csv['id'].map(lambda essay_id: len(id_to_tokens[essay_id]))

In [None]:
def assign_tokens_to_classes(x, tag_ids=None):
    """Build the per-word class-id sequence for one essay.

    Args:
        x: DataFrame holding the annotation rows of a single essay, with the
            columns (in this order) essay length in words, discourse type and
            prediction string (space-separated word indices).
        tag_ids: optional mapping from tag string (``'B-<type>'`` /
            ``'I-<type>'``) to integer class id. Defaults to the notebook's
            ``tags_to_classes``.

    Returns:
        A list with exactly one class id per word of the essay: 0 for words
        not covered by any annotation, the ``B-`` id on the first word of
        each annotated span and the ``I-`` id on its remaining words.
    """
    if tag_ids is None:
        tag_ids = tags_to_classes

    rows = x.values
    n_words = int(rows[0][0])
    res = [0] * n_words
    for _, tag, prediction_string in rows:
        begin_id = tag_ids[f'B-{tag}']
        inside_id = tag_ids[f'I-{tag}']
        # Label the word indices actually listed in the prediction string
        # instead of assuming they form a contiguous run.
        for position, token in enumerate(prediction_string.split()):
            word_idx = int(token)
            # Ignore indices past the end of the essay; writing them would
            # make the label list longer than the token list.
            if 0 <= word_idx < n_words:
                res[word_idx] = begin_id if position == 0 else inside_id

    return res
# Collapse the annotation rows of each essay into one list of word-level class ids.
train_csv2 = (
    train_csv
    .groupby('id')[['len','discourse_type','predictionstring']]
    .apply(assign_tokens_to_classes)
    .reset_index(name='result')
)

In [None]:
# Inspect the per-essay label sequences ("result" column).
train_csv2.head()

We have one dataframe with tokens, another one with classes, so we need to merge them by ids now

In [None]:
# Keep only essays that have both tokens and labels, matched on the essay id.
train_df = pd.merge(train_text_df, train_csv2, on="id", how="inner")

In [None]:
# One record per training essay: id, word tokens and the word-level class ids.
train_df_values = [
    dict(id=essay_id, tokens=words, ner_tags=tags)
    for essay_id, words, tags in train_df.values
]
# Test essays carry no labels, only id and tokens.
test_df_values = [
    dict(id=essay_id, tokens=words)
    for essay_id, words in test_text_df.values
]

In [None]:
from transformers import BigBirdModel
from transformers import AutoTokenizer

# Tokenizer of the same checkpoint that is loaded for fine-tuning further down.
tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base")


In [None]:
def tokenize_and_align_labels(examples,tokenized_inputs=None):
    """Attach sub-token level labels to a tokenized batch.

    Args:
        examples: mapping with ``"ner_tags"`` (one list of word-level class
            ids per example) and, when ``tokenized_inputs`` is not given,
            ``"tokens"`` (one list of words per example).
        tokenized_inputs: optional pre-computed encoding exposing
            ``word_ids(batch_index=...)``; the batch is tokenized with the
            notebook's ``tokenizer`` when omitted.

    Returns:
        The encoding object itself, with a ``"labels"`` entry added: the
        word's class id on the first sub-token of each word and -100 on
        special tokens and on the remaining sub-tokens of a word.
    """
    if tokenized_inputs is None:
        tokenized_inputs = tokenizer(examples["tokens"], truncation=False, is_split_into_words=True)

    aligned = []
    for batch_idx, word_labels in enumerate(examples["ner_tags"]):
        row = []
        last_word = None
        for current_word in tokenized_inputs.word_ids(batch_index=batch_idx):
            # Only the first sub-token of a real word carries the word's label.
            starts_new_word = current_word is not None and current_word != last_word
            row.append(word_labels[current_word] if starts_new_word else -100)
            last_word = current_word
        aligned.append(row)

    tokenized_inputs["labels"] = aligned
    return tokenized_inputs

In [None]:
# Column-oriented ("dict of lists") views of the per-essay records.
all_train_info = {
    field: [record[field] for record in train_df_values]
    for field in ("id", "tokens", "ner_tags")
}
all_test_info = {
    field: [record[field] for record in test_df_values]
    for field in ("id", "tokens")
}

In [None]:
# Tokenize all training essays at once; the inputs are already split into words
# and no truncation is applied.
tokenized_inputs = tokenizer(all_train_info["tokens"], truncation=False, is_split_into_words=True)


In [None]:
# Add the aligned "labels" entry; the function writes into the encoding it is
# given and returns that same object.
all_train_info_tokenized = tokenize_and_align_labels(all_train_info,tokenized_inputs)

In [None]:
from transformers import DataCollatorForTokenClassification

# Collator that builds the batches handed to the Trainer below.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [None]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer


In [None]:
# Token-classification model with one output class per entry of label_list.
model = AutoModelForTokenClassification.from_pretrained("google/bigbird-roberta-base",num_labels=len(label_list))


In [None]:
%pip install datasets

Create a Hugging Face dataset from the previously created dictionary.

In [None]:
from datasets import Dataset
# Full dataset with every tokenized training essay.
dataset = Dataset.from_dict(all_train_info_tokenized)
# Reduced copy built from the first 100 entries of every field (presumably for quick experiments).
small_dataset = Dataset.from_dict({k:v[:100] for k,v in all_train_info_tokenized.items()})

In [None]:
# Hold out 10% for evaluation. The fixed seed makes the shuffle, and therefore
# the train/test membership, reproducible across runs.
splited_dataset = dataset.train_test_split(test_size=0.1, seed=42)
small_splited_dataset = small_dataset.train_test_split(test_size=0.1, seed=42)

In [None]:
import datasets


In [None]:
%pip install seqeval

In [None]:
# NOTE(review): `datasets.load_metric` is deprecated in recent `datasets` releases in
# favour of the separate `evaluate` package - confirm the installed version still provides it.
metric = datasets.load_metric("seqeval")
def compute_metrics(p):
    """Compute token-level accuracy for an evaluation pass.

    Args:
        p: pair ``(predictions, labels)``. ``predictions`` holds the class
            scores along axis 2; ``labels`` holds integer class ids, with
            -100 marking positions to ignore.

    Returns:
        dict with a single ``"accuracy"`` entry taken from the seqeval
        result's ``overall_accuracy``.
    """
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # seqeval works on string tags, not integer ids, so translate the ids back
    # through label_list. The "no label" class (id 0) is reported as seqeval's
    # conventional outside tag "O".
    id_to_tag = ["O"] + list(label_list[1:])

    # Remove ignored index (special tokens and non-first sub-tokens)
    true_predictions = [
        [id_to_tag[pred_id] for pred_id, label_id in zip(prediction, label) if label_id != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id_to_tag[label_id] for label_id in label if label_id != -100]
        for label in labels
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
#         "precision": results["overall_precision"],
#         "recall": results["overall_recall"],
#         "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
# Training configuration: evaluate and checkpoint once per epoch, then reload the best checkpoint.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    prediction_loss_only=False,
    learning_rate=1e-5,
    per_device_train_batch_size=2, # optimise gpu memory usage
    per_device_eval_batch_size=2,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,  # optimise gpu memory usage
    gradient_accumulation_steps=4, # optimise gpu memory usage
    gradient_checkpointing=True, # optimise gpu memory usage
    load_best_model_at_end=True, #look out for memory! all checkpoints are saved
    # Turn off logging integrations (W&B etc.) here, so this does not rely on an
    # environment variable being set before the arguments are created.
    report_to="none",
    run_name="bigbird_big_three"  # name of the W&B run (optional)
)

trainer = Trainer(
    model=model,
    args=training_args,
    # Train on the training split only. Using the full dataset would include the
    # evaluation examples and make the reported metrics meaningless.
    train_dataset=splited_dataset["train"],
    eval_dataset=splited_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)



In [None]:
#wandb fails on authentication when run in the background

In [None]:
# Request that Weights & Biases logging stays off, then start fine-tuning.
# NOTE(review): this variable is set after the Trainer was constructed - confirm it still takes effect.
os.environ["WANDB_DISABLED"] = "true"
trainer.train()

In [None]:
# Run a final evaluation pass on the Trainer's eval dataset.
trainer.evaluate()

In [None]:
# Use the absolute path (the same directory the model is saved to below);
# exist_ok keeps this cell re-runnable when the directory already exists.
os.makedirs('/kaggle/working/results/bert_base_res', exist_ok=True)

In [None]:
# Write the trained model into the notebook's working (output) directory.
trainer.save_model('/kaggle/working/results/bert_base_res')