In [1]:
"""
Changing working directory to repository path 
in order to make simpler references to files/folder.

Also, adding src folder in the repository to import
any code that has been moved to py files for reusability
"""

import os
import sys

# Root of the repository checkout. The location is machine specific, so it can
# be overridden with the ESW_REPOSITORY_PATH environment variable; when the
# variable is not set the original hard-coded location is used.
REPOSITORY_PATH = os.environ.get(
    'ESW_REPOSITORY_PATH',
    '/mnt/batch/tasks/shared/LS_root/mounts/clusters/mlgpu2/code/Users/santiago.a.diez/evaluating-student-writing-kaggle-challenge',
)
os.chdir(REPOSITORY_PATH)

# 'src' is resolved relative to the working directory set just above.
# Guarded so that re-running this cell does not keep growing sys.path.
if 'src' not in sys.path:
    sys.path.insert(0, 'src')

from eswkg.config import Config

In [2]:
import gc
import json
import re
import warnings
from ast import literal_eval
from glob import glob

warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import torch
import wandb
from sklearn.metrics import accuracy_score
from torch import cuda
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm.notebook import tqdm
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoConfig
#from transformers import *

In [3]:
def read_essay(essay_id, folder_path = Config.get_file_path("train_folder")):
    """Return the full text of one essay.

    Args:
        essay_id: File name of the essay without the ``.txt`` extension.
        folder_path: Folder containing the essay file. The default is looked
            up through ``Config`` once, when the function is defined.

    Returns:
        The content of ``<folder_path>/<essay_id>.txt`` as a single string.
    """
    essay_path = folder_path + f"/{essay_id}.txt"
    with open(essay_path) as essay_file:
        return essay_file.read()


def read_essays(train_txt):
    """Load a collection of essay files into a DataFrame.

    Args:
        train_txt: Iterable of paths to essay text files. The essay id is the
            file name up to its last ``.``.

    Returns:
        DataFrame with one row per path and the columns ``id`` and ``text``,
        in the same order as ``train_txt``.
    """
    essay_ids = []
    essay_texts = []
    for txt_path in train_txt:
        # os.path.split yields (dirname, basename) in one call
        folder, file_name = os.path.split(txt_path)
        essay_id = file_name.rsplit(".", 1)[0]

        essay_ids.append(essay_id)
        essay_texts.append(read_essay(essay_id, folder))

    return pd.DataFrame({"id": essay_ids, "text": essay_texts})


def get_essay_entities(essay_text, essay_metadata):
    """Build one BIO tag per whitespace-separated word of an essay.

    Args:
        essay_text: Raw essay text; words are obtained with ``str.split()``.
        essay_metadata: Mapping-like object (dict or DataFrame) whose
            ``"discourse_type"`` and ``"predictionstring"`` entries are
            parallel iterables. Each prediction string is a space-separated
            list of word indices into ``essay_text.split()``.

    Returns:
        List with one tag per word: ``"B-<type>"`` on the first word of a
        discourse element, ``"I-<type>"`` on its remaining words and ``"O"``
        on words not covered by any element. When elements overlap, the one
        listed later wins.

    Raises:
        IndexError: If a prediction string refers to a word index outside
            the essay.
    """
    essay_entities = ["O"] * len(essay_text.split())

    annotated_spans = zip(essay_metadata["discourse_type"], essay_metadata["predictionstring"])
    for discourse_type, predictionstring in annotated_spans:
        word_indices = [int(token) for token in predictionstring.split()]
        if not word_indices:
            # An empty prediction string covers no word: nothing to tag
            # (previously this raised IndexError on the [0] lookup).
            continue

        first_index, *following_indices = word_indices
        essay_entities[first_index] = f"B-{discourse_type}"
        for word_index in following_indices:
            essay_entities[word_index] = f"I-{discourse_type}"

    return essay_entities


def tag_essays(essays, essays_metadata):
    """Attach word-level BIO tags to every essay.

    Args:
        essays: DataFrame with the columns ``id`` and ``text``.
        essays_metadata: DataFrame of discourse annotations with at least the
            columns ``id``, ``discourse_type`` and ``predictionstring``.

    Returns:
        DataFrame with the columns ``id``, ``text`` and ``entities``, one row
        per essay in the order of ``essays``. ``entities`` holds the tag list
        produced by ``get_essay_entities``; an essay without annotations is
        tagged ``"O"`` throughout.
    """
    # Group the annotations once instead of scanning the whole metadata table
    # for every essay. Row order inside each group is preserved.
    metadata_by_essay = {
        essay_id: essay_rows for essay_id, essay_rows in essays_metadata.groupby("id")
    }
    # Empty frame with the right columns, used for essays without annotations
    no_annotations = essays_metadata.iloc[0:0]

    tagged_essays_list = []
    for essay_id, essay_text in zip(essays["id"], essays["text"]):
        essay_metadata = metadata_by_essay.get(essay_id, no_annotations)
        tagged_essays_list.append(
            {
                "id": essay_id,
                "text": essay_text,
                "entities": get_essay_entities(essay_text, essay_metadata),
            }
        )

    # Explicit columns keep the schema stable even when `essays` is empty
    return pd.DataFrame(tagged_essays_list, columns=["id", "text", "entities"])


def generate_file(generation_func, file_path, generate_file=False, *args):
    """Load a cached CSV, optionally (re)generating it first.

    Args:
        generation_func: Callable returning an object with ``to_csv``; it is
            only called when ``generate_file`` is true.
        file_path: Location of the CSV file.
        generate_file: When true, call ``generation_func(*args)`` and write
            the result to ``file_path`` (without the index) before reading.
            Inside this function the name shadows the function itself.
        *args: Positional arguments forwarded to ``generation_func``.

    Returns:
        The DataFrame read back from ``file_path``, or ``None`` when a
        ``FileNotFoundError`` occurred (the error is only printed).

    Raises:
        Exception: Any other error is printed and re-raised unchanged.
    """
    try:
        if generate_file:
            generated = generation_func(*args)
            generated.to_csv(file_path, index=False)
        loaded = pd.read_csv(file_path)
    except FileNotFoundError as missing:
        print(f"{missing}, {type(missing)}")
        return None
    except Exception as unexpected:
        print(f"Unexpected {unexpected}, {type(unexpected)}")
        raise
    else:
        return loaded

def generate_labels_file(essays_metadata, file_path=Config.get_file_path("model_input")):
    """Write the BIO label vocabulary and its id mappings to disk.

    The label list starts with ``'O'`` followed by ``'B-<type>'`` and
    ``'I-<type>'`` for every distinct ``discourse_type``, in order of first
    appearance. Three files are written into ``file_path``:

    * ``label_list.txt``     - ``str()`` of the label list
    * ``labels_to_ids.json`` - label -> integer id
    * ``ids_to_labels.json`` - integer id -> label (JSON stores the keys as
      strings)

    Args:
        essays_metadata: Object exposing a ``discourse_type`` attribute with a
            ``unique()`` method (e.g. the annotations DataFrame).
        file_path: Existing output folder. The default is looked up through
            ``Config`` once, when the function is defined.
    """
    label_list = ['O']
    for discourse_type in essays_metadata.discourse_type.unique():
        label_list.extend((f'B-{discourse_type}', f'I-{discourse_type}'))

    labels_to_ids = {label: label_id for label_id, label in enumerate(label_list)}
    ids_to_labels = dict(enumerate(label_list))

    with open(file_path+"/label_list.txt", "w") as output:
        output.write(str(label_list))

    # Context managers guarantee the JSON files are flushed and closed;
    # the handles were previously left open.
    with open(file_path+"/labels_to_ids.json", 'w') as output:
        json.dump(labels_to_ids, output)
    with open(file_path+"/ids_to_labels.json", 'w') as output:
        json.dump(ids_to_labels, output)


Retrieving file paths for different folders and files in the project

In [4]:
# Project locations, indexed by string keys in the cells below
# (e.g. "train", "train_folder", "test_folder", "intermediate", "model_input", "models").
file_paths = Config.get_all_file_paths()

## Loading data

In [5]:
# Discourse annotations; the id/offset columns are cast to int after loading
essays_metadata = pd.read_csv(file_paths["train"])
essays_metadata[['discourse_id', 'discourse_start', 'discourse_end']] = essays_metadata[['discourse_id', 'discourse_start', 'discourse_end']].astype(int)

sample_submission = pd.read_csv(file_paths["sample_submission"])

# Paths of every .txt file in the train and test folders.
# NOTE: glob does not guarantee any particular ordering of the returned paths.
train_txt = glob(file_paths["train_folder"] + "/*.txt") 
test_txt = glob(file_paths["test_folder"] + "/*.txt")

In [6]:
# Set to True to rebuild test_text.csv from the raw .txt files; with False the
# cached CSV is read as-is.
create_test_essays_file = False
essays_file_path = file_paths["intermediate"]+"/test_text.csv"

# generate_file returns None when the CSV is missing, in which case the
# .shape access below fails.
test_essays = generate_file(read_essays, essays_file_path, create_test_essays_file, test_txt)

print(test_essays.shape)
test_essays.head()

(5, 2)


Unnamed: 0,id,text
0,0FB0700DAF44,"During a group project, have you ever asked a ..."
1,18409261F5C2,80% of Americans believe seeking multiple opin...
2,D46BCB48440A,"When people ask for advice,they sometimes talk..."
3,D72CB1C11673,Making choices in life can be very difficult. ...
4,DF920E0A7337,Have you ever asked more than one person for h...


In [7]:
# Set to True to rebuild train_text.csv from the raw .txt files; with False the
# cached CSV is read as-is.
create_essays_file = False
# NOTE: rebinds essays_file_path, which the test-essays cell also assigns
essays_file_path = file_paths["intermediate"]+"/train_text.csv"

essays = generate_file(read_essays, essays_file_path, create_essays_file, train_txt)

print(essays.shape)
essays.head(5)

(15594, 2)


Unnamed: 0,id,text
0,0000D23A521A,"Some people belive that the so called ""face"" o..."
1,00066EA9880D,Driverless cars are exaclty what you would exp...
2,000E6DE9E817,Dear: Principal\n\nI am arguing against the po...
3,001552828BD0,Would you be able to give your car up? Having ...
4,0016926B079C,I think that students would benefit from learn...


In [8]:
# Set to True to recompute the word-level tags; with False the cached CSV is used
create_essay_entities_file = False
essay_entities_file_path = file_paths["model_input"]+"/essays_NER.csv"

essays_entities = generate_file(
    tag_essays,
    essay_entities_file_path,
    create_essay_entities_file,
    essays,
    essays_metadata,
)
# The CSV round trip stores each tag list as its string repr: parse it back
essays_entities["entities"] = essays_entities["entities"].apply(literal_eval)

print(essays_entities.shape)
essays_entities.head(5)

(15594, 3)


Unnamed: 0,id,text,entities
0,0000D23A521A,"Some people belive that the so called ""face"" o...","[B-Position, I-Position, I-Position, I-Positio..."
1,00066EA9880D,Driverless cars are exaclty what you would exp...,"[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
2,000E6DE9E817,Dear: Principal\n\nI am arguing against the po...,"[O, O, B-Position, I-Position, I-Position, I-P..."
3,001552828BD0,Would you be able to give your car up? Having ...,"[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
4,0016926B079C,I think that students would benefit from learn...,"[B-Position, I-Position, I-Position, I-Positio..."


In [9]:
# Toggle for (re)writing label_list.txt, labels_to_ids.json and ids_to_labels.json.
# The trailing underscore keeps the flag from shadowing the function of the same name.
generate_labels_file_ = True
if generate_labels_file_:
    generate_labels_file(essays_metadata)

## PyTorch Dataset definition

In [10]:
class dataset(Dataset):
    """Token-classification dataset over whole essays.

    Each item is the tokenizer encoding of one essay, padded/truncated to
    ``max_len`` tokens, plus:

    * ``labels`` - one label id per token, ``-100`` on tokens that must be
      ignored (empty when ``get_wids`` is true);
    * ``wids``   - only when ``get_wids`` is true: for every token the index
      of the ``str.split()`` word it belongs to, ``-1`` when there is none.

    Args:
        tokenizer: Callable returning an encoding that exposes ``word_ids()``
            and an ``'offset_mapping'`` entry (presumably a Hugging Face fast
            tokenizer - confirm against the caller).
        sentences: Essay texts, indexable by the integer item index.
        labels: Per-essay lists of word-level tags, indexable like
            ``sentences``. Not read when ``get_wids`` is true.
        max_len: Number of tokens every encoding is padded/truncated to.
        get_wids: Inference mode: produce ``wids`` instead of labels.
        labels_file_path: Folder holding ``ids_to_labels.json`` and
            ``labels_to_ids.json``. ``None`` (default) resolves to
            ``file_paths["model_input"]`` when the dataset is created.
    """

    def __init__(self, tokenizer, sentences, labels, max_len, get_wids=False, labels_file_path=None):
        # Resolved lazily so that defining the class does not require the
        # notebook-level `file_paths` mapping to exist yet.
        if labels_file_path is None:
            labels_file_path = file_paths["model_input"]

        self.len = len(sentences)
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.get_wids = get_wids

        # JSON object keys are always strings: restore the integer ids
        with open(labels_file_path+"/ids_to_labels.json") as f:
            self.ids_to_labels = {int(k): v for k, v in json.load(f).items()}
        with open(labels_file_path+"/labels_to_ids.json") as f:
            self.labels_to_ids = {k: int(v) for k, v in json.load(f).items()}

    def __getitem__(self, index):
        text = self.sentences[index]
        word_labels = self.labels[index] if not self.get_wids else None

        encoding = self.tokenizer(
            text,
            return_offsets_mapping=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_len
        )

        encoding['labels'], split_word_ids = self._get_label_ids(text, word_labels, encoding)

        # CONVERT TO TORCH TENSORS
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        if self.get_wids:
            item['wids'] = torch.as_tensor(split_word_ids)

        return item

    def __len__(self):
        return self.len

    def _get_label_ids(self, text, word_labels, encoding):
        """Align word-level tags with tokens.

        Returns:
            ``(label_ids, split_word_ids)``: ``label_ids`` is a list with one
            id per token (empty when ``get_wids`` is true) and
            ``split_word_ids`` an int array giving, per token, the index of
            its ``split()`` word or ``-1``.
        """
        word_ids = encoding.word_ids()
        split_word_ids = np.full(len(word_ids), -1)
        offset_to_wordidx = self._split_mapping(text)
        offsets = encoding['offset_mapping']

        # CREATE TARGETS AND MAPPING OF TOKENS TO SPLIT() WORDS
        label_ids = []
        # Iterate in reverse to label whitespace tokens until a Begin token is encountered
        for token_idx, word_idx in reversed(list(enumerate(word_ids))):
            if word_idx is None or not (offsets[token_idx] != (0, 0)):
                # Tokens without a word id or with a (0, 0) span carry no label
                if not self.get_wids:
                    label_ids.append(-100)
                continue

            start, end = offsets[token_idx]
            split_index = self._dominant_word_index(offset_to_wordidx[start:end])

            if split_index != -1:
                if not self.get_wids:
                    label_ids.append(self.labels_to_ids[word_labels[split_index]])
                split_word_ids[token_idx] = split_index
            elif label_ids and label_ids[-1] != -100 and self.ids_to_labels[label_ids[-1]][0] == 'I':
                # Even if we don't find a word, continue labeling 'I' tokens until a 'B' token is found.
                # `label_ids` is checked for emptiness BEFORE reading its last
                # element; it is always empty in get_wids mode.
                split_word_ids[token_idx] = split_word_ids[token_idx + 1]
                label_ids.append(label_ids[-1])
            elif not self.get_wids:
                label_ids.append(-100)

        return list(reversed(label_ids)), split_word_ids

    @staticmethod
    def _dominant_word_index(split_idxs):
        """Return the split-word index sharing the most characters with a token.

        ``split_idxs`` holds, for every character of the token, the index of
        the word it belongs to (``-1`` for whitespace). Ties resolve to the
        smallest word index; ``-1`` is returned when no character belongs to
        a word or the span is empty.
        """
        if len(split_idxs) == 0:
            return -1
        if len(np.unique(split_idxs)) > 1:
            # At least one entry is a real (non-negative) word index here.
            # bincount(...).argmax() is the mode with smallest-value tie-breaking.
            return int(np.bincount(split_idxs[split_idxs != -1]).argmax())
        return split_idxs[0]

    def _split_mapping(self, unsplit):
        # Return an array that maps character index to index of word in list of split() words
        # Code adapted from https://www.kaggle.com/chasembowers/pytorch-bigbird-whitespace-cv-0-6284/notebook
        no_token_value = -1
        offset_to_wordidx = np.full(len(unsplit), no_token_value)
        txt_ptr = 0
        for split_index, full_word in enumerate(unsplit.split()):
            # Words come from unsplit.split(), so each one is found at or after txt_ptr
            word_start = unsplit.index(full_word, txt_ptr)
            txt_ptr = word_start + len(full_word)
            offset_to_wordidx[word_start:txt_ptr] = split_index
        return offset_to_wordidx

In [11]:
def get_train_valid_split(sentences, labels, tokenizer, split_size=0.8):
    """Wrap the essays in a ``dataset`` and split it randomly in two.

    Reads the notebook globals ``config["max_length"]`` and ``random_seed``;
    the latter seeds torch's global RNG right before the split.

    Args:
        sentences: Essay texts.
        labels: Word-level tag lists, aligned with ``sentences``.
        tokenizer: Tokenizer handed to ``dataset``.
        split_size: Fraction of the items assigned to the training subset
            (rounded down); the rest goes to validation.

    Returns:
        ``(train_set, valid_set)`` as produced by ``random_split``.
    """
    full_set = dataset(
        tokenizer=tokenizer,
        sentences=sentences,
        labels=labels,
        max_len=config["max_length"],
    )

    torch.manual_seed(random_seed)

    # Size before splitting
    total_size = len(full_set)
    print('Train data set:', total_size)

    n_train = int(total_size * split_size)
    n_valid = total_size - n_train
    train_subset, valid_subset = random_split(full_set, [n_train, n_valid])

    # Sizes after splitting
    print('='*30)
    print("{:.0%} train split".format(split_size))
    print('Train data set:', len(train_subset))
    print('Valid data set:', len(valid_subset))

    return train_subset, valid_subset

# https://www.kaggle.com/raghavendrakotala/fine-tunned-on-roberta-base-as-ner-problem-0-533
def train(model, optimizer, train_loader, device_config="cpu", grad_norm=10):
    """Run one training epoch and print its mean loss and token accuracy.

    Args:
        model: Token-classification model. Called with ``input_ids``,
            ``attention_mask``, ``labels`` and ``return_dict=False`` it must
            return ``(loss, logits)``; it must expose ``num_labels``.
        optimizer: Optimizer over ``model.parameters()``.
        train_loader: Iterable of batches, each a dict with the tensors
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'``.
        device_config: Device the batch tensors are moved to.
        grad_norm: Maximum total gradient norm applied before every
            optimizer step.

    Returns:
        None. The running loss is printed every 200 steps (and logged to
        Weights & Biases when a run is active); epoch loss and accuracy are
        printed at the end.
    """
    tr_loss, tr_accuracy = 0, 0
    nb_tr_steps = 0

    # put model in training mode
    model.train()

    for idx, batch in enumerate(train_loader):
        ids = batch['input_ids'].to(device_config, dtype=torch.long)
        mask = batch['attention_mask'].to(device_config, dtype=torch.long)
        labels = batch['labels'].to(device_config, dtype=torch.long)

        loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=labels,
                                return_dict=False)
        tr_loss += loss.item()
        nb_tr_steps += 1

        if idx % 200 == 0:
            loss_step = tr_loss / nb_tr_steps
            print(f"Training loss after {idx:04d} training steps: {loss_step}")
            # One log call keeps both values on the same W&B history row;
            # skipped when no run was initialised (wandb.run is None).
            if wandb.run is not None:
                wandb.log({"step": idx, "loss_step": loss_step})

        # compute training accuracy
        flattened_targets = labels.view(-1)  # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels)  # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, dim=1)  # shape (batch_size * seq_len,)

        # only compute accuracy at active labels (-100 marks ignored tokens)
        active_mask = flattened_targets != -100  # shape (batch_size * seq_len,)
        active_labels = torch.masked_select(flattened_targets, active_mask)
        active_predictions = torch.masked_select(flattened_predictions, active_mask)

        tr_accuracy += accuracy_score(active_labels.cpu().numpy(), active_predictions.cpu().numpy())

        # backward pass
        optimizer.zero_grad()
        loss.backward()

        # gradient clipping: must run after backward() (gradients exist) and
        # before step() (so the clipped gradients are the ones applied)
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=grad_norm
        )

        optimizer.step()

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

In [13]:
# Weights & Biases experiment tracking: a run is only started when the flag is on
wandb_enabled = True

# Project and account (entity) the run is recorded under
wand_project = "evaluating-student-writing-kaggle-challenge"
wand_entity = "sdsantiagodiez"
if wandb_enabled:
    wandb.init(project=wand_project, entity=wand_entity)

[34m[1mwandb[0m: Currently logged in as: [33msdsantiagodiez[0m (use `wandb login --relogin` to force relogin)


In [14]:
model_name = 'distilbert-base-uncased'
# File-system friendly variant of the model name: every run of characters
# other than letters and digits becomes a single underscore.
# Relies on `re` from the imports cell.
model_name_alphanumeric = re.sub("[^0-9a-zA-Z]+", "_", model_name)

train_set_size_proportion = 0.8
random_seed = 42

# 'learning_rates' holds one value per epoch (indexed by epoch in the training cell)
config = {'model_name': model_name,
         'max_length': 512,
         'train_batch_size':4,
         'valid_batch_size':4,
         'epochs':5,
         'learning_rates': [2.5e-5, 2.5e-5, 2.5e-6, 2.5e-6, 2.5e-7],
         'max_grad_norm':10,
         'device': 'cuda' if cuda.is_available() else 'cpu'}

tokenizer = AutoTokenizer.from_pretrained(config["model_name"])

#just renaming and reassigning for readability purposes
train_essays_sentences = essays_entities.text
train_essays_labels = essays_entities.entities
# Only the text column, like the training sentences above (this was bound to
# the whole DataFrame, which cannot be indexed by row position)
test_essays_sentenes = test_essays.text

In [16]:
# Random train/validation split of the tagged essays; the split is seeded
# inside get_train_valid_split through the notebook-level random_seed.
train_set, valid_set = get_train_valid_split(
        sentences = train_essays_sentences, 
        labels = train_essays_labels, 
        tokenizer = tokenizer,
        split_size = train_set_size_proportion)

Train data set: 15594
80% train split
Train data set: 12475
Valid data set: 3119


In [17]:
# TRAIN DATASET AND VALID DATASET
train_params = {'batch_size': config['train_batch_size'],
                'shuffle': True,
                'num_workers': 2,
                'pin_memory':True
                }

# Also used for the test loader: evaluation order must stay fixed
test_params = {'batch_size': config['valid_batch_size'],
                'shuffle': False,
                'num_workers': 2,
                'pin_memory':True
                }

train_loader = DataLoader(train_set, **train_params)
valid_loader = DataLoader(valid_set, **test_params)

# TEST DATASET
# Keyword arguments on purpose: the signature starts with `tokenizer`, so the
# previous positional call bound every argument to the wrong parameter.
# There are no labels at inference time; get_wids=True asks for word ids instead.
test_set = dataset(
    tokenizer=tokenizer,
    sentences=test_essays["text"],
    labels=None,
    max_len=config['max_length'],
    get_wids=True,
)
# Wrap the dataset, not the raw DataFrame
test_loader = DataLoader(test_set, **test_params)

In [18]:
# CREATE MODEL
config_model = AutoConfig.from_pretrained(config["model_name"]) 

# NOTE(review): 15 presumably is 'O' plus a B-/I- pair for each of 7 discourse
# types; it has to equal the number of entries written by generate_labels_file - confirm.
config_model.num_labels = 15

model = AutoModelForTokenClassification.from_pretrained(config["model_name"], config=config_model)
model.to(config['device'])
# Starts at the first scheduled learning rate; the training cell overrides it per epoch
optimizer = torch.optim.Adam(params=model.parameters(), lr=config['learning_rates'][0])

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN t

In [21]:
model_version = 1
# Independent switches: restore previously saved weights and/or (re)train
load_saved_model = False
train_model = False

model_file_name = f'{model_name_alphanumeric}_v{model_version}.pt'
model_file_path = f'{file_paths["models"]}/{model_file_name}'

if load_saved_model:
    # map_location lets a checkpoint saved on GPU be restored on a CPU-only
    # machine (and the other way round)
    model.load_state_dict(torch.load(model_file_path, map_location=config['device']))
    print(f"Model {model_file_name} loaded.")

if train_model:
    for epoch in range(config['epochs']):
        print(f"### Training epoch: {epoch + 1}")
        # Manual schedule: one learning rate per epoch from the config
        for g in optimizer.param_groups:
            g['lr'] = config['learning_rates'][epoch]
        lr = optimizer.param_groups[0]['lr']
        print(f'### LR = {lr}\n')

        train(model, optimizer, train_loader, device_config=config['device'], grad_norm=config['max_grad_norm'])
        # Release cached GPU memory and Python garbage between epochs
        torch.cuda.empty_cache()
        gc.collect()

    # Saved once, after the last epoch
    torch.save(model.state_dict(), model_file_path)
    print(f"Training complete and model {model_file_name} saved.")
    