In [4]:
"""
Change the working directory to the repository root so that
files and folders can be referenced with short relative paths.

Also add the repository's src folder to the import path so that
code moved into .py files for reusability can be imported.
"""

import os
import sys

# NOTE(review): machine-specific absolute path; the notebook only runs where
# this mount exists. Consider moving it to configuration.
REPOSITORY_PATH = '/mnt/batch/tasks/shared/LS_root/mounts/clusters/mlgpu2/code/Users/santiago.a.diez/evaluating-student-writing-kaggle-challenge'
os.chdir(REPOSITORY_PATH)

# Register the src folder by absolute path so imports keep working even if the
# working directory changes later, and only once so that re-running this cell
# does not pile up duplicate sys.path entries.
SRC_PATH = os.path.join(REPOSITORY_PATH, 'src')
if SRC_PATH not in sys.path:
    sys.path.insert(0, SRC_PATH)

In [73]:
import numpy as np
import pandas as pd
from glob import glob
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')
from ast import literal_eval

from torch.utils.data import Dataset, DataLoader
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

from eswkg.config import Config

In [25]:
def read_essay(essay_id, train_folder = Config.get_all_file_paths()["train_folder"]):
    """Return the full text of the essay stored as ``<train_folder>/<essay_id>.txt``.

    Parameters
    ----------
    essay_id : identifier used as the file name (without the ``.txt`` suffix).
    train_folder : folder containing the essay files. The default is looked up
        from ``Config`` once, when this function is defined.

    Returns
    -------
    str
        The content of the file, read in text mode.
    """
    essay_path = train_folder + f"/{essay_id}.txt"
    with open(essay_path) as essay_file:
        return essay_file.read()


def read_essays(train_txt):
    """Load a collection of essay files into a DataFrame.

    Each file is read from the exact path that was passed in, so the function
    works for files in any folder (previously the folder part of the path was
    discarded and the file was looked up again in the default train folder).

    Parameters
    ----------
    train_txt : iterable of str
        Paths to ``.txt`` essay files.

    Returns
    -------
    pandas.DataFrame
        One row per file with columns ``id`` (file name without its last
        extension) and ``text`` (file content), in input order.
    """
    essay_ids, essay_texts = [], []
    for essay_path in train_txt:
        # The essay id is the file name with its final extension stripped.
        essay_ids.append(os.path.basename(essay_path).rsplit(".", 1)[0])
        with open(essay_path) as essay_file:
            essay_texts.append(essay_file.read())
    return pd.DataFrame({"id": essay_ids, "text": essay_texts})


def get_essay_entities(essay_text, essay_metadata):
    """Build one BIO tag per whitespace-separated word of an essay.

    Parameters
    ----------
    essay_text : str
        The essay; words are obtained with ``str.split()``.
    essay_metadata : mapping / DataFrame
        Must provide parallel ``discourse_type`` and ``predictionstring``
        sequences. Each ``predictionstring`` is a whitespace-separated list of
        integer word indices into ``essay_text.split()``.

    Returns
    -------
    list of str
        ``"O"`` for untagged words, ``"B-<type>"`` for the first index of an
        annotation and ``"I-<type>"`` for its remaining indices. When
        annotations overlap, the later one overwrites the earlier one.

    Raises
    ------
    IndexError
        If an index lies outside the essay's word range.
    """
    essay_entities = ["O"] * len(essay_text.split())
    annotations = zip(essay_metadata["discourse_type"], essay_metadata["predictionstring"])

    for discourse_type, predictionstring in annotations:
        word_indices = [int(token) for token in predictionstring.split()]
        if not word_indices:
            # An annotation without any word index has nothing to tag;
            # skipping it avoids an IndexError on the "B-" assignment.
            continue

        first_index, *following_indices = word_indices
        essay_entities[first_index] = f"B-{discourse_type}"
        for word_index in following_indices:
            essay_entities[word_index] = f"I-{discourse_type}"

    return essay_entities


def tag_essays(essays, essays_metadata):
    """Attach word-level BIO tags to every essay.

    Parameters
    ----------
    essays : pandas.DataFrame
        Needs the columns ``id`` and ``text``.
    essays_metadata : pandas.DataFrame
        Annotation rows with an ``id`` column plus the columns consumed by
        ``get_essay_entities``.

    Returns
    -------
    pandas.DataFrame
        One row per essay with columns ``id``, ``text`` and ``entities``
        (the tag list produced by ``get_essay_entities``).
    """
    # Split the annotations by essay once instead of scanning the whole
    # metadata frame for every essay.
    metadata_by_essay = {
        essay_id: essay_rows
        for essay_id, essay_rows in essays_metadata.groupby("id", sort=False)
    }
    # Essays without any annotation get an empty frame with the same columns,
    # which yields an all-"O" tag list (same outcome as an empty query result).
    no_annotations = essays_metadata.iloc[0:0]

    tagged_essays_list = []
    for essay_id, essay_text in zip(essays["id"], essays["text"]):
        essay_metadata = metadata_by_essay.get(essay_id, no_annotations)
        tagged_essays_list.append(
            {
                "id": essay_id,
                "text": essay_text,
                "entities": get_essay_entities(essay_text, essay_metadata),
            }
        )
    return pd.DataFrame.from_dict(tagged_essays_list)


def generate_file(generation_func, file_path, generate_file=False, *args):
    """Load a cached CSV, optionally (re)building it first.

    Parameters
    ----------
    generation_func : callable
        Called as ``generation_func(*args)`` when ``generate_file`` is truthy;
        its result is written to ``file_path`` with ``to_csv(index=False)``.
    file_path : str
        Location of the CSV file.
    generate_file : bool
        When truthy the file is regenerated before it is read back.
    *args
        Positional arguments forwarded to ``generation_func``.

    Returns
    -------
    pandas.DataFrame or None
        The content of ``file_path`` as read by ``pd.read_csv``. ``None`` when
        a ``FileNotFoundError`` occurred (the error is printed, not raised).

    Raises
    ------
    Exception
        Any other error is printed and re-raised unchanged.
    """
    try:
        if generate_file:
            generated = generation_func(*args)
            generated.to_csv(file_path, index=False)
        loaded = pd.read_csv(file_path)
    except FileNotFoundError as missing:
        print(f"{missing}, {type(missing)}")
        return None
    except Exception as failure:
        print(f"Unexpected {failure}, {type(failure)}")
        raise
    return loaded


Retrieving file paths for different folders and files in the project

In [7]:
# Path lookup returned by the project's Config helper; the cells below index it
# by key (e.g. "train", "train_folder", "intermediate").
file_paths = Config.get_all_file_paths()

## Loading data

In [9]:
# Training annotations; the discourse id and start/end columns are cast to int.
essays_metadata = pd.read_csv(file_paths["train"])
essays_metadata[['discourse_id', 'discourse_start', 'discourse_end']] = essays_metadata[['discourse_id', 'discourse_start', 'discourse_end']].astype(int)

# Sample submission file, loaded unchanged.
sample_submission = pd.read_csv(file_paths["sample_submission"])

# glob returns every path matching a Unix-shell-style pattern; here, all .txt
# files directly inside the train and test folders.
train_txt = glob(file_paths["train_folder"] + "/*.txt") 
test_txt = glob(file_paths["test_folder"] + "/*.txt")

In [23]:
# Set to True to rebuild the CSV from the raw .txt files; with False the
# previously written CSV at essays_file_path is read back as-is.
create_essays_file = False
essays_file_path = file_paths["intermediate"]+"/train_text.csv"

# generate_file forwards train_txt to read_essays only when the flag is True.
essays = generate_file(read_essays, essays_file_path, create_essays_file, train_txt)

print(essays.shape)
essays.head(5)

(15594, 2)


Unnamed: 0,id,text
0,0000D23A521A,"Some people belive that the so called ""face"" o..."
1,00066EA9880D,Driverless cars are exaclty what you would exp...
2,000E6DE9E817,Dear: Principal\n\nI am arguing against the po...
3,001552828BD0,Would you be able to give your car up? Having ...
4,0016926B079C,I think that students would benefit from learn...


In [24]:
# Flip to True to recompute the word-level tags instead of loading the cached CSV.
create_essay_entities_file = False
essay_entities_file_path = file_paths["model_input"]+"/essays_NER.csv"

essays_entities = generate_file(
    tag_essays,
    essay_entities_file_path,
    create_essay_entities_file,
    essays,
    essays_metadata,
)
# The CSV round trip stores each tag list as its string repr; parse it back
# into a real Python list.
essays_entities["entities"] = essays_entities.entities.apply(literal_eval)

print(essays_entities.shape)
essays_entities.head(5)

(15594, 3)


Unnamed: 0,id,text,entities
0,0000D23A521A,"Some people belive that the so called ""face"" o...","[B-Position, I-Position, I-Position, I-Positio..."
1,00066EA9880D,Driverless cars are exaclty what you would exp...,"[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
2,000E6DE9E817,Dear: Principal\n\nI am arguing against the po...,"[O, O, B-Position, I-Position, I-Position, I-P..."
3,001552828BD0,Would you be able to give your car up? Having ...,"[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
4,0016926B079C,I think that students would benefit from learn...,"[B-Position, I-Position, I-Position, I-Positio..."


In [50]:
# Label vocabulary: "O" first, then a B-/I- pair for each discourse type in the
# order the types first appear in the annotations.
label_list = ['O']
for discourse_type in essays_metadata.discourse_type.unique():
    label_list.extend((f'B-{discourse_type}', f'I-{discourse_type}'))

# Lookup tables in both directions between label strings and integer ids.
labels_to_ids = {label: label_id for label_id, label in enumerate(label_list)}
ids_to_labels = dict(enumerate(label_list))

labels_to_ids

{'O': 0,
 'B-Lead': 1,
 'I-Lead': 2,
 'B-Position': 3,
 'I-Position': 4,
 'B-Evidence': 5,
 'I-Evidence': 6,
 'B-Claim': 7,
 'I-Claim': 8,
 'B-Concluding Statement': 9,
 'I-Concluding Statement': 10,
 'B-Counterclaim': 11,
 'I-Counterclaim': 12,
 'B-Rebuttal': 13,
 'I-Rebuttal': 14}

## PyTorch Dataset definition

In [54]:
def split_mapping(unsplit):
    """Map every character position of ``unsplit`` to its ``split()`` word index.

    Returns an integer array with one entry per character: the index of the
    whitespace-separated word covering that character, or -1 for characters
    that belong to no word (whitespace).
    """
    # Code copied from https://www.kaggle.com/chasembowers/pytorch-bigbird-whitespace-cv-0-6284/notebook
    char_to_word = np.full(len(unsplit), -1)
    cursor = 0
    for word_number, word in enumerate(unsplit.split()):
        # Words come out of split() in text order, so the next occurrence at or
        # after the cursor is the position of this word.
        word_start = unsplit.find(word, cursor)
        cursor = word_start + len(word)
        char_to_word[word_start:cursor] = word_number
    return char_to_word

In [71]:
class dataset(Dataset):
    """Dataset pairing tokenized essays with a fixed-length tag vector.

    Parameters
    ----------
    tokenizer : callable
        Called as ``tokenizer(text, padding='max_length', truncation=True,
        max_length=max_len)``; the result must expose ``'input_ids'`` and
        ``'attention_mask'``.
    sentences : indexable of str
        Essay texts.
    labels : indexable of sequences
        Per-essay tags, either integer ids or label strings. Strings are
        translated through the notebook-level ``labels_to_ids`` mapping.
    max_len : int
        Length the tokenizer pads / truncates to.

    Each item is a dict of ``torch.long`` tensors: ``ids``, ``mask`` and
    ``tags``.

    NOTE(review): ``tags`` is padded with id 4 and cut to 200 entries, as in
    the original code. These are word-level tags, so they are neither aligned
    with the tokens nor the same length as ``ids`` -- confirm the intended
    target layout before training with this class.
    """

    def __init__(self, tokenizer, sentences, labels, max_len):
        self.len = len(sentences)
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, index):
        text = self.sentences[index]
        # Call the tokenizer itself: ``encode`` only returns the list of input
        # ids, which has no attention mask to read.
        encoding = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_len
        )
        ids = encoding['input_ids']
        mask = encoding['attention_mask']

        # Build a fresh list so the stored labels are never mutated, and turn
        # label strings into integer ids so they can be put in a tensor.
        label = [
            labels_to_ids[tag] if isinstance(tag, str) else tag
            for tag in self.labels[index]
        ]
        tag_pad_id, tag_len = 4, 200  # values kept from the original code
        label.extend([tag_pad_id] * tag_len)
        label = label[:tag_len]

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'tags': torch.tensor(label, dtype=torch.long)
        }

    def __len__(self):
        return self.len

In [74]:
# Preview the first rows of the tagged essays before building the dataset.
essays_entities.head()

Unnamed: 0,id,text,entities
0,0000D23A521A,"Some people belive that the so called ""face"" o...","[B-Position, I-Position, I-Position, I-Positio..."
1,00066EA9880D,Driverless cars are exaclty what you would exp...,"[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
2,000E6DE9E817,Dear: Principal\n\nI am arguing against the po...,"[O, O, B-Position, I-Position, I-Position, I-P..."
3,001552828BD0,Would you be able to give your car up? Having ...,"[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
4,0016926B079C,I think that students would benefit from learn...,"[B-Position, I-Position, I-Position, I-Positio..."


In [78]:
MODEL_NAME = 'google/bigbird-roberta-base'
sentences = essays_entities.text
labels = essays_entities.entities
max_len = 1024


# Load the pretrained tokenizer for MODEL_NAME and wrap the essays in the
# dataset class defined above.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) 
training_set = dataset(
    tokenizer=tokenizer, 
    sentences=sentences, 
    labels=labels, 
    max_len=max_len)  # was the undefined name `nax_len`, which raised NameError

ImportError: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html

In [77]:
from ipywidgets import IntProgress

In [None]:
class dataset(Dataset):
    """Token-classification dataset that aligns word-level tags to tokens.

    NOTE(review): this class reuses the name ``dataset`` and therefore replaces
    the earlier definition (which has a different constructor signature) once
    this cell runs.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs a ``text`` column and, unless ``get_wids`` is truthy, an
        ``entities`` column with one label string per ``text.split()`` word.
    tokenizer : callable
        Must accept ``return_offsets_mapping=True`` and return an encoding
        that provides ``word_ids()`` and ``'offset_mapping'``.
    max_len : int
        Length the tokenizer pads / truncates to.
    get_wids : bool
        When truthy no labels are produced and each item additionally carries
        ``wids``, the ``split()`` word index of every token (-1 where none was
        assigned). Presumably used for validation / inference -- confirm.

    Relies on the notebook-level ``split_mapping``, ``labels_to_ids`` and
    ``ids_to_labels``.
    """

    def __init__(self, dataframe, tokenizer, max_len, get_wids):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.get_wids = get_wids

    def __getitem__(self, index):
        # GET TEXT AND WORD LABELS
        text = self.data.text[index]
        word_labels = self.data.entities[index] if not self.get_wids else None

        # TOKENIZE TEXT
        encoding = self.tokenizer(text,
                                  return_offsets_mapping=True,
                                  padding='max_length',
                                  truncation=True,
                                  max_length=self.max_len)

        word_ids = encoding.word_ids()
        split_word_ids = np.full(len(word_ids), -1)
        offset_to_wordidx = split_mapping(text)
        offsets = encoding['offset_mapping']

        # CREATE TARGETS AND MAPPING OF TOKENS TO SPLIT() WORDS
        label_ids = []
        # Iterate in reverse to label whitespace tokens until a Begin token is encountered
        for token_idx, word_idx in reversed(list(enumerate(word_ids))):

            if word_idx is None:
                # Token without a word id (as reported by the tokenizer): ignored label.
                if not self.get_wids: label_ids.append(-100)
            else:
                if offsets[token_idx] != (0, 0):
                    # Choose the split word that shares the most characters with the token, if any.
                    split_idxs = offset_to_wordidx[offsets[token_idx][0]:offsets[token_idx][1]]
                    covered = split_idxs[split_idxs != -1]
                    if covered.size:
                        # Most frequent word index; np.unique sorts its values,
                        # so ties resolve to the smallest index.
                        candidates, counts = np.unique(covered, return_counts=True)
                        split_index = candidates[np.argmax(counts)]
                    else:
                        # Token spans only whitespace, or has an empty character span.
                        split_index = -1

                    if split_index != -1:
                        if not self.get_wids: label_ids.append(labels_to_ids[word_labels[split_index]])
                        split_word_ids[token_idx] = split_index
                    else:
                        # Even if we don't find a word, continue labeling 'I' tokens until a 'B' token is found
                        if label_ids and label_ids[-1] != -100 and ids_to_labels[label_ids[-1]][0] == 'I':
                            split_word_ids[token_idx] = split_word_ids[token_idx + 1]
                            if not self.get_wids: label_ids.append(label_ids[-1])
                        else:
                            if not self.get_wids: label_ids.append(-100)
                else:
                    if not self.get_wids: label_ids.append(-100)

        # Labels were collected back to front; restore token order.
        encoding['labels'] = list(reversed(label_ids))

        # CONVERT TO TORCH TENSORS
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        if self.get_wids:
            item['wids'] = torch.as_tensor(split_word_ids)

        return item

    def __len__(self):
        return self.len

In [31]:
# List the distinct discourse types found in the annotations, one per line.
for type_ in essays_metadata.discourse_type.unique():
    print(type_)

Lead
Position
Evidence
Claim
Concluding Statement
Counterclaim
Rebuttal


In [49]:
# Show the name of CUDA device 0.
# NOTE(review): presumably fails on a machine without a visible GPU -- confirm before running on CPU-only hosts.
torch.cuda.get_device_name(0)


'Tesla K80'