# PyTorch DistilBERT NER Baseline




## Folder structure initialization
Let's create some folders to organize our data following the [Data Engineering](https://kedro.readthedocs.io/en/stable/12_faq/01_faq.html#what-is-data-engineering-convention) convention described in the [Kedro documentation](https://github.com/quantumblacklabs/kedro). Let's also store their paths in a dictionary so they are easier to read and use in our code.

In [3]:
!mkdir "/kaggle/working/data"
!mkdir "/kaggle/working/data/01_raw"
!mkdir "/kaggle/working/data/02_intermediate"
!mkdir "/kaggle/working/data/03_primary"
!mkdir "/kaggle/working/data/04_feature"
!mkdir "/kaggle/working/data/05_model_input"
!mkdir "/kaggle/working/data/06_models"
!mkdir "/kaggle/working/data/07_model_output"
!mkdir "/kaggle/working/data/08_reporting"

In [4]:
# Names of the Kaggle datasets that hold the offline copies of the models
kaggle_dataset_hf_base_model = "pydistilbertner"
kaggle_dataset_model = "pydestilbertnerv1"

# Every path used by the notebook, addressed by a short key
file_paths = dict(
    # Kaggle's input data
    train="/kaggle/input/feedback-prize-2021/train.csv",
    sample_submission="/kaggle/input/feedback-prize-2021/sample_submission.csv",
    train_folder="/kaggle/input/feedback-prize-2021/train",
    test_folder="/kaggle/input/feedback-prize-2021/test",
    # Model paths for offline use during submission
    hf_base=f"/kaggle/input/{kaggle_dataset_hf_base_model}",
    offline_model=f"/kaggle/input/{kaggle_dataset_model}",
    # Newly created data folders
    raw="/kaggle/working/data/01_raw",
    intermediate="/kaggle/working/data/02_intermediate",
    primary="/kaggle/working/data/03_primary",
    feature="/kaggle/working/data/04_feature",
    model_input="/kaggle/working/data/05_model_input",
    models="/kaggle/working/data/06_models",
    model_output="/kaggle/working/data/07_model_output",
    reporting="/kaggle/working/data/08_reporting",
)

## Internet connection check
Let's create a function that reports whether an internet connection is available. This will be useful later in the code.

In [5]:
import requests

def is_internet_connection_enabled():
    """Return True if an HTTP GET to kaggle.com succeeds within 5 seconds.

    Any ``requests`` failure (connection error, timeout, too many redirects,
    ...) is treated as "no internet" and yields False instead of propagating.
    """
    url = "http://www.kaggle.com"
    timeout = 5
    try:
        requests.get(url, timeout=timeout)
    except requests.RequestException:
        # RequestException is the base class of ConnectionError and Timeout
        return False
    return True
    

## Weights & Biases

In [6]:
# Imported here because this cell runs before the main import cell below
import wandb

# Set to False to skip experiment tracking altogether
wandb_enabled = True
wand_project = "evaluating-student-writing-kaggle-challenge"
wand_entity = "sdsantiagodiez"

# A run is only started when tracking is enabled and the network is reachable
if wandb_enabled and is_internet_connection_enabled():
    wandb.init(project=wand_project, entity=wand_entity)

## Load data and libraries

In [7]:
import os
import sys  
import numpy as np
import gc
import pandas as pd
from glob import glob
from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')
from ast import literal_eval
import json
from sklearn.metrics import accuracy_score
from scipy import stats
import wandb
import re

from torch.utils.data import Dataset, DataLoader, random_split
from torch import cuda
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, AutoConfig

## Loading data

In [8]:
def read_essay(essay_id, folder_path = file_paths["train_folder"]):
    """Return the full text of the essay stored at `<folder_path>/<essay_id>.txt`."""
    essay_path = folder_path + f"/{essay_id}.txt"
    with open(essay_path) as essay_file:
        return essay_file.read()


def read_essays(train_txt):
    """Read every essay file in `train_txt` into a DataFrame.

    `train_txt` is an iterable of paths to `.txt` files. The returned frame
    has one row per file with the columns `id` (file name without its
    extension) and `text` (the file's content).
    """
    essay_ids, essay_texts = [], []
    for txt_path in train_txt:
        # Split the path into its folder and file name, then drop the extension
        folder, file_name = os.path.split(txt_path)
        essay_id = file_name.rsplit(".", 1)[0]

        essay_ids.append(essay_id)
        essay_texts.append(read_essay(essay_id, folder))

    return pd.DataFrame({"id": essay_ids, "text": essay_texts})


def get_essay_entities(essay_text, essay_metadata):
    """Build one BIO tag per whitespace-separated word of `essay_text`.

    `essay_metadata` must expose a `discourse_type` and a `predictionstring`
    sequence of equal length; each prediction string is a space-separated list
    of word indices. The first index of a span is tagged `B-<type>`, the
    remaining ones `I-<type>`, and every word outside a span stays `O`.
    """
    tags = ["O" for _ in essay_text.split()]

    spans = zip(essay_metadata["discourse_type"], essay_metadata["predictionstring"])
    for label, span in spans:
        word_positions = [int(position) for position in span.split()]

        tags[word_positions[0]] = f"B-{label}"
        for position in word_positions[1:]:
            tags[position] = f"I-{label}"

    return tags


def tag_essays(essays, essays_metadata):
    """Attach a list of BIO tags to every essay.

    Parameters:
        essays: DataFrame with the columns `id` and `text`.
        essays_metadata: DataFrame with at least the columns `id`,
            `discourse_type` and `predictionstring`.

    Returns:
        DataFrame with the columns `id`, `text` and `entities`, one row per
        essay, where `entities` is the tag list produced by
        `get_essay_entities`.
    """
    # Group the annotations once instead of scanning the whole metadata table
    # for every essay. Row order inside each group is preserved.
    metadata_by_essay = {
        essay_id: group
        for essay_id, group in essays_metadata.groupby("id", sort=False)
    }
    # Essays without annotations get an empty frame, i.e. all "O" tags
    no_metadata = essays_metadata.iloc[0:0]

    tagged_essays_list = []
    for _, essay in essays.iterrows():
        essay_id = essay["id"]
        essay_text = essay["text"]
        essay_metadata = metadata_by_essay.get(essay_id, no_metadata)

        tagged_essays_list.append(
            {
                "id": essay_id,
                "text": essay_text,
                "entities": get_essay_entities(essay_text, essay_metadata),
            }
        )
    return pd.DataFrame.from_dict(tagged_essays_list)


def generate_file(generation_func, file_path, generate_file=False, *args):
    """Load the CSV at `file_path`, optionally (re)creating it first.

    When `generate_file` is true, `generation_func(*args)` is called and its
    result is written to `file_path` (without the index) before reading.
    Returns the DataFrame read back from disk, or None (after printing the
    error) when the file does not exist. Any other exception is printed and
    re-raised.
    """
    try:
        if generate_file:
            generated = generation_func(*args)
            generated.to_csv(file_path, index=False)
        loaded = pd.read_csv(file_path)
    except FileNotFoundError as err:
        print(f"{err}, {type(err)}")
        return None
    except Exception as err:
        print(f"Unexpected {err}, {type(err)}")
        raise
    return loaded

def generate_labels_file(essays_metadata, file_path=file_paths["model_input"]):
    """Write the label vocabulary derived from `essays_metadata` to disk.

    The label list is `O` followed by a `B-<type>` / `I-<type>` pair for each
    unique `discourse_type`, in order of first appearance. Three files are
    written into the `file_path` folder:

    * `label_list.txt`     - the `str()` of the label list
    * `labels_to_ids.json` - label -> integer id
    * `ids_to_labels.json` - integer id -> label (JSON turns the keys into strings)
    """
    label_list = ['O']
    for discourse_type in essays_metadata.discourse_type.unique():
        label_list.append(f'B-{discourse_type}')
        label_list.append(f'I-{discourse_type}')

    labels_to_ids = {label: label_id for label_id, label in enumerate(label_list)}
    ids_to_labels = dict(enumerate(label_list))

    with open(file_path+"/label_list.txt", "w") as output:
        output.write(str(label_list))

    # Use context managers so the JSON files are flushed and closed
    # deterministically before anything tries to read them back.
    with open(file_path+"/labels_to_ids.json", 'w') as output:
        json.dump(labels_to_ids, output)
    with open(file_path+"/ids_to_labels.json", 'w') as output:
        json.dump(ids_to_labels, output)


In [9]:
# Discourse annotations of the training essays; the id and offset columns are cast to integers
essays_metadata = pd.read_csv(file_paths["train"]).astype(
    {column: int for column in ('discourse_id', 'discourse_start', 'discourse_end')}
)

sample_submission = pd.read_csv(file_paths["sample_submission"])

# glob returns every path that matches the pattern, following Unix shell rules
train_txt = glob(f'{file_paths["train_folder"]}/*.txt')
test_txt = glob(f'{file_paths["test_folder"]}/*.txt')

In [10]:
# When True the CSV is rebuilt from the test .txt files; otherwise it is only read
create_test_essays_file = True
essays_file_path = file_paths["intermediate"]+"/test_text.csv"

# One row per test essay with the columns `id` and `text`
test_essays = generate_file(read_essays, essays_file_path, create_test_essays_file, test_txt)

print(test_essays.shape)
test_essays.head()

In [11]:
# When True the CSV is rebuilt from the train .txt files; otherwise it is only read
create_essays_file = True
essays_file_path = file_paths["intermediate"]+"/train_text.csv"

# One row per training essay with the columns `id` and `text`
essays = generate_file(read_essays, essays_file_path, create_essays_file, train_txt)

print(essays.shape)
essays.head(5)

In [12]:
# When True the tagged essays are recomputed and written; otherwise the CSV is only read
create_essay_entities_file = True
essay_entities_file_path = file_paths["model_input"]+"/essays_NER.csv"

essays_entities = generate_file(
    tag_essays,
    essay_entities_file_path,
    create_essay_entities_file,
    essays,
    essays_metadata,
)
# The frame is read back from CSV, so each tag list arrives as a string; parse it into a list
essays_entities["entities"] = essays_entities["entities"].apply(literal_eval)

print(essays_entities.shape)
essays_entities.head(5)

In [13]:
# Toggle for (re)writing the label list and the label <-> id JSON files
generate_labels_file_ = True
if generate_labels_file_:
    generate_labels_file(essays_metadata)

## Pytorch & Hugging Face

### Classes and functions

#### Dataset & Dataloader definition

In [14]:
class dataset(Dataset):
    """Dataset that aligns whitespace-split words with tokenizer tokens.

    Every item is the tokenizer encoding of one text converted to tensors.
    With ``get_wids=False`` the ``labels`` entry holds one label id per token
    (``-100`` marks tokens that carry no label). With ``get_wids=True`` the
    ``labels`` entry is empty and the item additionally carries ``wids``: for
    each token, the index of the ``text.split()`` word it belongs to, or ``-1``.
    """

    def __init__(self, tokenizer, sentences, labels, max_len, get_wids=False, labels_file_path = file_paths["model_input"]):
        """Store the inputs and load the label <-> id mappings.

        Parameters:
            tokenizer: callable tokenizer; it must support
                ``return_offsets_mapping`` and its result must offer ``word_ids()``.
            sentences: texts, indexable by integer position.
            labels: per-text lists of word labels (unused when ``get_wids`` is true).
            max_len: length every encoding is padded / truncated to.
            get_wids: return word ids instead of label ids.
            labels_file_path: folder holding ``ids_to_labels.json`` and
                ``labels_to_ids.json``.
        """
        self.len = len(sentences)
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.get_wids = get_wids

        # JSON object keys are always strings, so the ids are converted back to int
        with open(labels_file_path+"/ids_to_labels.json") as f:
            self.ids_to_labels = {int(k):v for k,v in json.load(f).items() }
        with open(labels_file_path+"/labels_to_ids.json") as f:
            self.labels_to_ids = {k:int(v) for k,v in json.load(f).items() }

    def __getitem__(self, index):
        """Return the encoded text at `index` as a dict of tensors."""
        text = self.sentences[index]
        word_labels = self.labels[index] if not self.get_wids else None

        encoding = self.tokenizer(
            text,
            return_offsets_mapping=True,
            padding='max_length',
            truncation=True,
            max_length=self.max_len
        )

        encoding['labels'], split_word_ids = self._get_label_ids(text, word_labels, encoding)

        # Convert every entry of the encoding to a torch tensor
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        if self.get_wids:
            item['wids'] = torch.as_tensor(split_word_ids)

        return item

    def __len__(self):
        """Return the number of texts."""
        return self.len

    def _get_label_ids(self, text, word_labels, encoding):
        """Map every token to a label id and to the index of its split() word.

        Returns:
            (label_ids, split_word_ids): `label_ids` is a list with one id per
            token (empty when `get_wids` is true); `split_word_ids` is an
            integer array with one word index per token, `-1` where the token
            belongs to no word.
        """
        word_ids = encoding.word_ids()
        split_word_ids = np.full(len(word_ids), -1)
        offset_to_wordidx = self._split_mapping(text)
        offsets = encoding['offset_mapping']

        label_ids = []
        # Iterate in reverse so that a token without a word can inherit the
        # label of the token that follows it in the text.
        for token_idx, word_idx in reversed(list(enumerate(word_ids))):
            start, end = offsets[token_idx]
            if word_idx is None or (start, end) == (0, 0):
                # Special / padding token
                if not self.get_wids:
                    label_ids.append(-100)
                continue

            split_index = self._majority_word_index(offset_to_wordidx[start:end])
            if split_index != -1:
                if not self.get_wids:
                    label_ids.append(self.labels_to_ids[word_labels[split_index]])
                split_word_ids[token_idx] = split_index
                continue

            # The token covers no word. Keep labeling it like the following
            # token while that one is an 'I' token. `label_ids` is empty in
            # get_wids mode (and possibly at the very first processed token),
            # so it must be checked before it is indexed.
            last_label_id = label_ids[-1] if label_ids else -100
            if last_label_id != -100 and self.ids_to_labels[last_label_id][0] == 'I':
                split_word_ids[token_idx] = split_word_ids[token_idx + 1]
                if not self.get_wids:
                    label_ids.append(last_label_id)
            elif not self.get_wids:
                label_ids.append(-100)

        return list(reversed(label_ids)), split_word_ids

    @staticmethod
    def _majority_word_index(split_idxs):
        """Return the word index shared by most characters of a token.

        `split_idxs` holds the word index of every character the token spans
        (`-1` for characters outside any word). Returns `-1` when the token
        spans no characters or only characters outside words. Ties are
        resolved in favour of the smallest word index.
        """
        if len(split_idxs) == 0:
            return -1
        if len(np.unique(split_idxs)) <= 1:
            return split_idxs[0]
        # More than one distinct value, so at least one is a real word index.
        # np.unique returns sorted values, hence argmax picks the smallest
        # index among equally frequent ones.
        values, counts = np.unique(split_idxs[split_idxs != -1], return_counts=True)
        return values[np.argmax(counts)]

    def _split_mapping(self, unsplit):
        """Map every character index of `unsplit` to the index of its split() word.

        Characters that are not part of any word keep the value `-1`.
        """
        # Code copied from https://www.kaggle.com/chasembowers/pytorch-bigbird-whitespace-cv-0-6284/notebook
        splt = unsplit.split()
        no_token_value = -1
        offset_to_wordidx = np.full(len(unsplit), no_token_value)
        txt_ptr = 0
        for split_index, full_word in enumerate(splt):
            # Advance to the next occurrence of the word in the original text
            while unsplit[txt_ptr:txt_ptr + len(full_word)] != full_word:
                txt_ptr += 1
            offset_to_wordidx[txt_ptr:txt_ptr + len(full_word)] = split_index
            txt_ptr += len(full_word)
        return offset_to_wordidx

#### Get and split data

In [15]:
def get_train_valid_split(data=essays_entities, split_size=0.8):
    """Randomly split `data` by essay id into a train and a validation frame.

    Parameters:
        data: DataFrame with the columns `id`, `text` and `entities`.
        split_size: fraction of the unique ids assigned to the train split.

    Returns:
        (train_data, valid_data): `train_data` keeps only the `text` and
        `entities` columns, `valid_data` keeps every column. Both have a
        fresh 0..n-1 index.

    The split is reproducible: it seeds NumPy's global generator with the
    module-level `random_seed`.
    """
    ids = data.id.unique()
    print(f'There are {len(ids)} train texts.')
    print("The splits will be {:.0%} train and {:.0%} validation.".format(split_size, 1-split_size))

    np.random.seed(random_seed)
    train_idx = np.random.choice(np.arange(len(ids)),int(split_size*len(ids)),replace=False)
    valid_idx = np.setdiff1d(np.arange(len(ids)),train_idx)

    # Create the train and validation subsets from the frame that was passed
    # in (the argument must not be replaced by the global `essays_entities`).
    train_data = data.loc[data['id'].isin(ids[train_idx]),['text', 'entities']].reset_index(drop=True)
    valid_data = data.loc[data['id'].isin(ids[valid_idx])].reset_index(drop=True)

    print(f"Full data: {data.shape}")
    print(f"Tran data: {train_data.shape}")
    print(f"Valid data: {valid_data.shape}")

    return train_data, valid_data
    
def get_data_loader(tokenizer, sentences, labels, max_len, get_wids, params):
    """Wrap the texts in a `dataset` and return a DataLoader over it.

    Returns None when `sentences` is None. `params` is forwarded to the
    DataLoader as keyword arguments.
    """
    if sentences is None:
        return None

    text_dataset = dataset(
        tokenizer=tokenizer,
        sentences=sentences,
        labels=labels,
        max_len=max_len,
        get_wids=get_wids,
    )
    return DataLoader(text_dataset, **params)

def get_data_loaders(train_data, valid_data, test_data, config):
    """Create the train, validation and test DataLoaders.

    The train loader is shuffled and yields label ids; the validation and test
    loaders keep the input order and yield word ids instead (`get_wids=True`).
    Batch sizes and the maximum sequence length are read from `config`.

    Returns:
        (train_loader, valid_loader, test_loader)
    """
    worker_params = {'num_workers': 2, 'pin_memory': True}
    train_params = {
        'batch_size': config['train_batch_size'],
        'shuffle': True,
        **worker_params,
    }
    valid_params = {
        'batch_size': config['valid_batch_size'],
        'shuffle': False,
        **worker_params,
    }

    tokenizer = get_tokenizer(config)
    max_len = config['max_length']

    train_loader = get_data_loader(
        tokenizer=tokenizer,
        sentences=train_data.text,
        labels=train_data.entities,
        max_len=max_len,
        get_wids=False,
        params=train_params,
    )
    valid_loader = get_data_loader(
        tokenizer=tokenizer,
        sentences=valid_data.text,
        labels=valid_data.entities,
        max_len=max_len,
        get_wids=True,
        params=valid_params,
    )
    # The test set has no labels; it reuses the validation loader settings
    test_loader = get_data_loader(
        tokenizer=tokenizer,
        sentences=test_data.text,
        labels=None,
        max_len=max_len,
        get_wids=True,
        params=valid_params,
    )

    return train_loader, valid_loader, test_loader

#### Initializing model and tokenizer

In [16]:
def get_tokenizer(config):
    """Return the tokenizer for `config["model_name"]`.

    Online, the tokenizer is loaded by name and a copy is saved to the
    `models` folder. Offline, it is loaded from the `hf_base` folder.
    """
    if not is_internet_connection_enabled():
        return AutoTokenizer.from_pretrained(file_paths["hf_base"])

    tokenizer = AutoTokenizer.from_pretrained(config["model_name"])
    tokenizer.save_pretrained(file_paths["models"])
    return tokenizer

def get_model_config(config):
    """Return the model configuration for `config["model_name"]`.

    Online, the configuration is loaded by name and a copy is saved to the
    `models` folder. Offline, it is loaded from the `hf_base` folder.

    The number of labels is taken from `config["num_labels"]` when that key is
    present and defaults to 15 otherwise. It is applied in both branches so
    that the classification head has the same size online and offline.
    """
    num_labels = config.get("num_labels", 15)

    if is_internet_connection_enabled():
        config_model = AutoConfig.from_pretrained(config["model_name"])
        config_model.num_labels = num_labels
        config_model.save_pretrained(file_paths["models"])
    else:
        config_model = AutoConfig.from_pretrained(file_paths["hf_base"])
        config_model.num_labels = num_labels

    return config_model

def get_model(config):
    """Return the token-classification model, moved to `config['device']`.

    Online, the weights are loaded by `config["model_name"]` and a copy of the
    model is saved to the `models` folder. Offline, the weights are loaded from
    the `hf_base` folder.
    """
    config_model = get_model_config(config)

    online = is_internet_connection_enabled()
    weights_source = config["model_name"] if online else file_paths["hf_base"]

    model = AutoModelForTokenClassification.from_pretrained(weights_source, config=config_model)
    if online:
        model.save_pretrained(file_paths["models"])

    model.to(config['device'])

    return model

#### Training

In [17]:
# https://www.kaggle.com/raghavendrakotala/fine-tunned-on-roberta-base-as-ner-problem-0-533
def train(model, optimizer, train_loader, device_config="cpu", grad_norm=10):
    """Train `model` for one epoch and print the mean loss and accuracy.

    Parameters:
        model: token-classification model; called with `return_dict=False` it
            must return `(loss, logits)` and it must expose `num_labels`.
        optimizer: optimizer over the model's parameters.
        train_loader: iterable of batches with the keys `input_ids`,
            `attention_mask` and `labels`.
        device_config: device the batch tensors are moved to.
        grad_norm: maximum gradient norm used for clipping.

    Every 200 steps the running loss is printed and, if a Weights & Biases run
    is active, logged. Accuracy only counts tokens whose label is not -100.
    """
    tr_loss, tr_accuracy = 0, 0
    nb_tr_steps = 0

    # put model in training mode
    model.train()

    for idx, batch in enumerate(train_loader):

        ids = batch['input_ids'].to(device_config, dtype = torch.long)
        mask = batch['attention_mask'].to(device_config, dtype = torch.long)
        labels = batch['labels'].to(device_config, dtype = torch.long)

        loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=labels,
                               return_dict=False)
        tr_loss += loss.item()
        nb_tr_steps += 1

        if idx % 200==0:
            loss_step = tr_loss/nb_tr_steps
            print(f"Training loss after {idx:04d} training steps: {loss_step}")
            # wandb.log raises when no run was started (e.g. offline), so only
            # log when a run is active. Both values go into the same record.
            if wandb.run is not None:
                wandb.log({"step": idx, "loss_step": loss_step})

        # compute training accuracy
        flattened_targets = labels.view(-1) # shape (batch_size * seq_len,)
        active_logits = tr_logits.view(-1, model.num_labels) # shape (batch_size * seq_len, num_labels)
        flattened_predictions = torch.argmax(active_logits, axis=1) # shape (batch_size * seq_len,)

        # only compute accuracy at active labels
        active_accuracy = flattened_targets != -100 # shape (batch_size * seq_len,)
        active_targets = torch.masked_select(flattened_targets, active_accuracy)
        active_predictions = torch.masked_select(flattened_predictions, active_accuracy)

        tr_accuracy += accuracy_score(active_targets.cpu().numpy(), active_predictions.cpu().numpy())

        # backward pass: clear old gradients, compute the new ones, clip them
        # and only then update the weights. Clipping must happen after
        # backward(), otherwise it acts on stale gradients that zero_grad()
        # discards anyway.
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(
            parameters=model.parameters(), max_norm=grad_norm
        )
        optimizer.step()

    if nb_tr_steps == 0:
        # Nothing to average over; avoid dividing by zero
        print("No training batches were provided.")
        return

    epoch_loss = tr_loss / nb_tr_steps
    tr_accuracy = tr_accuracy / nb_tr_steps
    print(f"Training loss epoch: {epoch_loss}")
    print(f"Training accuracy epoch: {tr_accuracy}")

#### Inference and Validation Code

During inference the model makes a prediction for each subword token. Since a single word consists of one or more subword tokens, we need to decide which prediction to keep when the model produces multiple labels for a single word.

The code below uses the prediction of a word's first subword token as the label for the entire word. We could try other approaches, such as averaging all subword predictions or preferring B labels over I labels.

In [18]:
def inference(batch, device_config="cpu", labels_file_path=file_paths["model_input"]):
    """Predict one label per word for every text in `batch`.

    Uses the module-level `model`. The inputs are moved to the device the
    model lives on; `device_config` is accepted for backward compatibility
    and is not needed for that.

    Parameters:
        batch: dict with `input_ids`, `attention_mask` and `wids` (for every
            token the index of its word, or -1).
        device_config: kept for interface compatibility.
        labels_file_path: folder holding `ids_to_labels.json`.

    Returns:
        A list with one entry per text; each entry is the list of predicted
        labels, one per word, taken from the first token of that word.
    """
    # Move the batch to the model's device and predict without tracking
    # gradients, which are not needed at inference time
    device = next(model.parameters()).device
    ids = batch["input_ids"].to(device)
    mask = batch["attention_mask"].to(device)
    with torch.no_grad():
        outputs = model(ids, attention_mask=mask, return_dict=False)
    all_preds = torch.argmax(outputs[0], axis=-1).cpu().numpy()

    # JSON object keys are strings, so the ids are converted back to int
    with open(labels_file_path+"/ids_to_labels.json") as f:
        ids_to_labels = {int(k): v for k, v in json.load(f).items()}

    # Iterate through each text and collect its word-level predictions
    predictions = []
    for text_preds, text_wids in zip(all_preds, batch['wids']):
        token_preds = [ids_to_labels[i] for i in text_preds]

        prediction = []
        previous_word_idx = -1
        for idx, word_idx in enumerate(text_wids.numpy()):
            # Skip tokens without a word and every token after the first of a word
            if word_idx != -1 and word_idx != previous_word_idx:
                prediction.append(token_preds[idx])
                previous_word_idx = word_idx
        predictions.append(prediction)

    return predictions

# https://www.kaggle.com/zzy990106/pytorch-ner-infer
# code has been modified from original
def get_predictions(df, loader, device_config="cpu", span_length_threshold=7):
    """Predict discourse spans for every text in `df`.

    Uses the module-level `model`, which is put in eval mode.

    Parameters:
        df: DataFrame with an `id` column, in the same order as the texts
            served by `loader`.
        loader: DataLoader whose batches are accepted by `inference`.
        device_config: forwarded to `inference`.
        span_length_threshold: only spans with more words than this are kept.

    Returns:
        DataFrame with the columns `id`, `class` and `predictionstring` (the
        space-separated word indices of the span). It is empty, but still has
        these columns, when no span qualifies.
    """
    # put model in eval mode
    model.eval()

    # Get the word-level label predictions
    y_pred = []
    with torch.no_grad():
        for batch in loader:
            y_pred.extend(inference(batch, device_config))

    final_preds2 = []
    for essay_id, pred in zip(df.id.values, y_pred):
        j = 0
        while j < len(pred):
            cls = pred[j]
            # A span starts with B and continues with I of the same class, so
            # compare the following words against the I form of the label
            if cls.startswith('B-'):
                cls = 'I-' + cls[2:]
            end = j + 1
            while end < len(pred) and pred[end] == cls:
                end += 1

            if cls not in ('O', '') and end - j > span_length_threshold:
                final_preds2.append((essay_id, cls.replace('I-',''),
                                     ' '.join(map(str, range(j, end)))))

            j = end

    # Passing the column names to the constructor also works when there are
    # no predictions at all; assigning `.columns` afterwards would raise.
    return pd.DataFrame(final_preds2, columns=['id','class','predictionstring'])

#### Score feedback competition calc

In [19]:
# from Rob Mulla @robikscube
# https://www.kaggle.com/robikscube/student-writing-competition-twitch
def calc_overlap(row):
    """
    Return the two overlap ratios between a prediction and a ground truth.

    `row` must expose `predictionstring_pred` and `predictionstring_gt`, each a
    string of word indices separated by single spaces. The result is
    `[shared / len(ground truth), shared / len(prediction)]`, where `shared` is
    the number of indices present in both.
    """
    pred_words = set(row.predictionstring_pred.split(' '))
    gt_words = set(row.predictionstring_gt.split(' '))
    shared = len(gt_words & pred_words)
    return [shared / len(gt_words), shared / len(pred_words)]


def score_feedback_comp(pred_df, gt_df):
    """
    A function that scores for the kaggle
        Student Writing Competition

    Uses the steps in the evaluation page here:
        https://www.kaggle.com/c/feedback-prize-2021/overview/evaluation

    Parameters:
        pred_df: predictions with the columns `id`, `class`, `predictionstring`.
        gt_df: ground truth with the columns `id`, `discourse_type`,
            `predictionstring`.

    Returns:
        The F1 score `TP / (TP + 0.5 * (FP + FN))`, or 0.0 when there is
        nothing to score.
    """
    gt_df = gt_df[['id','discourse_type','predictionstring']] \
        .reset_index(drop=True).copy()
    pred_df = pred_df[['id','class','predictionstring']] \
        .reset_index(drop=True).copy()
    pred_df['pred_id'] = pred_df.index
    gt_df['gt_id'] = gt_df.index
    # Step 1. all ground truths and predictions for a given class are compared.
    joined = pred_df.merge(gt_df,
                           left_on=['id','class'],
                           right_on=['id','discourse_type'],
                           how='outer',
                           suffixes=('_pred','_gt')
                          )
    if joined.empty:
        # Neither predictions nor ground truths
        return 0.0

    joined['predictionstring_gt'] = joined['predictionstring_gt'].fillna(' ')
    joined['predictionstring_pred'] = joined['predictionstring_pred'].fillna(' ')

    joined['overlaps'] = joined.apply(calc_overlap, axis=1)

    # 2. If the overlap between the ground truth and prediction is >= 0.5,
    # and the overlap between the prediction and the ground truth >= 0.5,
    # the prediction is a match and considered a true positive.
    # If multiple matches exist, the match with the highest pair of overlaps is taken.
    # `overlaps` already holds two-element lists, so they are indexed directly.
    joined['overlap1'] = joined['overlaps'].apply(lambda x: x[0])
    joined['overlap2'] = joined['overlaps'].apply(lambda x: x[1])

    joined['potential_TP'] = (joined['overlap1'] >= 0.5) & (joined['overlap2'] >= 0.5)
    joined['max_overlap'] = joined[['overlap1','overlap2']].max(axis=1)
    tp_pred_ids = joined.query('potential_TP') \
        .sort_values('max_overlap', ascending=False) \
        .groupby(['id','predictionstring_gt']).first()['pred_id'].values

    # 3. Any unmatched ground truths are false negatives
    # and any unmatched predictions are false positives.
    # The outer join leaves NaN ids on rows that have only a prediction or
    # only a ground truth; NaN is not a real id and must not be counted.
    tp_pred_id_set = set(tp_pred_ids)
    fp_pred_ids = [p for p in joined['pred_id'].dropna().unique()
                   if p not in tp_pred_id_set]

    matched_gt_ids = set(joined.query('potential_TP')['gt_id'].unique())
    unmatched_gt_ids = [c for c in joined['gt_id'].dropna().unique()
                        if c not in matched_gt_ids]

    # Get numbers of each type
    TP = len(tp_pred_ids)
    FP = len(fp_pred_ids)
    FN = len(unmatched_gt_ids)
    # calc microf1
    denominator = TP + 0.5*(FP+FN)
    if denominator == 0:
        return 0.0
    my_f1_score = TP / denominator
    return my_f1_score

### Model training and prediction

In [20]:
# Fraction of the essays used for training; the rest is used for validation
train_set_size_proportion = 0.90
random_seed = 42

model_name = 'distilbert-base-uncased'
# Model name with every run of non-alphanumeric characters replaced by "_"
model_name_alphanumeric = re.sub("[^0-9a-zA-Z]+", "_", model_name)

# Hyper-parameters and runtime settings shared by the cells below
config = dict(
    model_name=model_name,
    max_length=512,
    train_batch_size=4,
    valid_batch_size=4,
    epochs=10,
    learning_rates=[2.5e-5, 2.5e-5, 2.5e-6, 2.5e-6, 2.5e-7],
    max_grad_norm=10,
    device='cuda' if cuda.is_available() else 'cpu',
)

In [21]:
# Split the tagged essays by essay id into a train and a validation frame
train_data, valid_data = get_train_valid_split(data=essays_entities, split_size=train_set_size_proportion)
# The test essays are used as they were loaded (columns `id` and `text`)
test_data = test_essays

In [22]:
# Build the three DataLoaders; batch sizes and max length come from `config`
train_loader, valid_loader, test_loader = get_data_loaders(train_data, valid_data, test_data, config)

In [23]:
# Model on config['device'] and an Adam optimizer starting at the first configured learning rate
model = get_model(config)
optimizer = torch.optim.Adam(params=model.parameters(), lr=config['learning_rates'][0])

In [24]:
model_version = 1
# Load previously trained weights from the attached offline-model dataset
load_saved_model = True
# Fine-tune the model in this session
train_model = False

model_file_name = f'{model_name_alphanumeric}_v{model_version}.pt'
model_file_path = f'{file_paths["offline_model"] }/{model_file_name}'
# Kaggle mounts /kaggle/input read-only, so newly trained weights are written
# to the working `models` folder instead of the offline-model dataset folder.
trained_model_file_path = f'{file_paths["models"]}/{model_file_name}'

if load_saved_model:
    # map_location lets weights saved on a GPU be loaded on a CPU-only machine
    model.load_state_dict(torch.load(model_file_path, map_location=config['device']))
    print(f"Model {model_file_name} loaded.")

if train_model:
    learning_rates = config['learning_rates']
    for epoch in range(config['epochs']):
        print(f"### Training epoch: {epoch + 1}")
        # Only log when a Weights & Biases run was started
        if wandb.run is not None:
            wandb.log({"epoch": epoch + 1})

        # One learning rate per epoch; the last one is reused once the list is exhausted
        lr = learning_rates[epoch] if epoch < len(learning_rates) else learning_rates[-1]
        for g in optimizer.param_groups:
            g['lr'] = lr
        print(f'### LR = {lr}\n')

        train(model, optimizer, train_loader, device_config=config['device'], grad_norm=config['max_grad_norm'])
        torch.cuda.empty_cache()
        gc.collect()

        # Checkpoint after every epoch
        torch.save(model.state_dict(), trained_model_file_path)
    print(f"Training complete and model {model_file_name} saved.")
    

In [25]:
compute_val_score = True
if compute_val_score:
    # Ground-truth annotations of the validation essays
    valid_data_id_list = valid_data.id.tolist()
    valid_data_metadata = essays_metadata.query("id == @valid_data_id_list")

    # Out-of-fold predictions
    oof = get_predictions(valid_data, valid_loader, device_config=config["device"])

    # Compute the F1 score per class and average it (macro F1). The classes
    # come from both the ground truth and the predictions, so a class that was
    # never predicted contributes an F1 of 0 instead of being left out.
    f1s = []
    classes = pd.concat([valid_data_metadata['discourse_type'], oof['class']]).unique()
    print()
    for class_ in classes:
        pred_df = oof.loc[oof['class']==class_].copy()
        gt_df = valid_data_metadata.query("discourse_type == @class_").copy()
        f1 = score_feedback_comp(pred_df, gt_df)
        print(class_,f1)
        f1s.append(f1)
    print()
    print('Overall',np.mean(f1s))
    print()

In [26]:
# Predict the discourse spans of the test essays, passing the configured
# device just like the validation cell does
sub = get_predictions(test_data, test_loader, device_config=config["device"])
sub.head()

In [None]:
# Write the submission to the model-output folder and to the working directory
submission_paths = (file_paths["model_output"]+"/submission.csv", "submission.csv")
for submission_path in submission_paths:
    sub.to_csv(submission_path, index=False)