<a href="https://colab.research.google.com/github/sayanbanerjee32/feedback-prize-effectiveness/blob/main/all_text_concat_DBERTA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
try: import fastkaggle
except ModuleNotFoundError:
    !pip install -Uq fastkaggle

!pip install -Uq datasets
!pip install -Uq transformers
!pip install -Uq sentencepiece 
!pip install -Uq pynvml
!pip install -Uq evaluate

In [2]:
from fastkaggle import *
import os
from pathlib import Path
import pandas as pd

In [3]:
# config depending on whether this is running on kaggle or collab
# is_colab = True
is_colab = not os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
comp = 'feedback-prize-effectiveness'
if is_colab:
    model_save_path = Path('/content/'+comp+'_out/models')
else:
    model_save_path = Path('/kaggle/working/'+comp) #+'/models')

In [4]:
if is_colab:
    from google.colab import output
    output.enable_custom_widget_manager()

In [5]:
# import colab libraries
if is_colab:
    from google.colab import drive
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# The Kaggle API client expects this file to be in ~/.kaggle,
# so lets move it there.
if is_colab:
    !mkdir ~/.kaggle
    !cp /content/drive/MyDrive/Kaggle_api_auth/kaggle.json ~/.kaggle/

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [7]:
# This permissions change avoids a warning on Kaggle tool startup.
if is_colab:
    !chmod 600 ~/.kaggle/kaggle.json

In [8]:
path = setup_comp(comp)
path

Path('feedback-prize-effectiveness')

## Text classification

In [9]:
def file_read(file_path):
    with open(file_path, 'r') as _f: 
        all_content = _f.read()
    return all_content

In [10]:
# pre-process text - add all columns 
df = pd.read_csv(path/'train.csv')
df['essay_text'] = df['essay_id'].apply(lambda x: file_read(path / 'train' / f'{x}.txt'))
df.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,essay_text
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,"Hi, i'm Isaac, i'm going to be writing about h..."
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,"Hi, i'm Isaac, i'm going to be writing about h..."
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,"Hi, i'm Isaac, i'm going to be writing about h..."
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,"Hi, i'm Isaac, i'm going to be writing about h..."
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,"Hi, i'm Isaac, i'm going to be writing about h..."


In [11]:
# See sequence length
df['seq_length_essay'] = [len(txt.split()) for txt in df['essay_text'].tolist()]
df['seq_length_dis'] = [len(txt.split()) for txt in df['discourse_text'].tolist()]
df['seq_length_essay'].describe(), df['seq_length_dis'].describe()

(count    36765.000000
 mean       458.588522
 std        220.423420
 min        144.000000
 25%        288.000000
 50%        408.000000
 75%        579.000000
 max       1367.000000
 Name: seq_length_essay, dtype: float64, count    36765.000000
 mean        44.654073
 std         46.669682
 min          1.000000
 25%         16.000000
 50%         28.000000
 75%         57.000000
 max        836.000000
 Name: seq_length_dis, dtype: float64)

In [12]:
# replace discorse text within the context text as __MASKED__
df['masked_ess_txt'] = df[['essay_text','discourse_text']].apply(lambda row: row.essay_text.strip().replace(row.discourse_text.strip(),
                                                                                                         '__MASKED__'),
                                                              axis = 1)
df['seq_length_mask_ess'] = [len(txt.split()) for txt in df['masked_ess_txt'].tolist()]
df['masked_ess_txt'].head(), df['seq_length_mask_ess'].describe()

(0    __MASKED__ On my perspective, I think that the...
 1    Hi, i'm Isaac, i'm going to be writing about h...
 2    Hi, i'm Isaac, i'm going to be writing about h...
 3    Hi, i'm Isaac, i'm going to be writing about h...
 4    Hi, i'm Isaac, i'm going to be writing about h...
 Name: masked_ess_txt, dtype: object, count    36765.000000
 mean       414.791024
 std        213.788307
 min          1.000000
 25%        254.000000
 50%        367.000000
 75%        531.000000
 max       1345.000000
 Name: seq_length_mask_ess, dtype: float64)

In [13]:
# function to truncate discourse text and context text
# this is still in progress
# def trunc_text(text, num_words, unique_centre_tok = None):
#     w_l = text.split()
#     if unique_centre_tok is None:
#         if len(w_l) > num_words: w_l = w_l[:num_words]
#     else:
#         if len(w_l) > num_words:
#             try: pos_tok = w_l.index(unique_centre_tok) + 1
#             except: # in case there is an issue with the replacement
#                 print(text)
#                 pos_tok = round(num_words / 2)
#             if pos_tok > round(len(w_l) / 2):
#                 if len(w_l) > pos_tok + round((num_words - 1) / 2):
#                     start_pos = pos_tok - round((num_words - 1) / 2) - 1
#                 else:
#                     start_pos = len(w_l) - num_words
#                 w_l = w_l[start_pos:(start_pos + num_words)]
#             else:
#                 if pos_tok > round((num_words -1) / 2):
#                     start_pos = pos_tok - round((num_words -1) / 2) - 1
#                     w_l = w_l[start_pos:(start_pos + num_words)]
#                 else:
#                     w_l = w_l[:num_words]
            
#     return ' '.join(w_l)

# [trunc_text("let's see where __MASKED__ we are going.", i, unique_centre_tok = "__MASKED__") for i in range(2,8)]

In [14]:
# combine all text columns for classification 
# concat all
# df['all_text'] = 'CONTEXT: ' + df.essay_text + '; DISCOURSE: ' + df.discourse_text + '; TYPE: ' + df.discourse_type

# concat after truncation
# df['essay_text_trunc'] = df.masked_ess_txt.apply(lambda t: trunc_text(t,512, "__MASKED__"))
# df['discourse_text_trunc'] = df.discourse_text.apply(lambda t: trunc_text(t,64))
# df['all_text'] = 'CONTEXT: ' + df.essay_text_trunc + '; TYPE: ' + df.discourse_type + '; DISCOURSE: ' + df.discourse_text_trunc 
df['input'] = 'CONTEXT: ' + df.masked_ess_txt + '; TYPE: ' + df.discourse_type + '; DISCOURSE: ' + df.discourse_text
df['seq_length_input'] = [len(txt.split()) for txt in df['input'].tolist()]
df['seq_length_input'].describe()

count    36765.000000
mean       463.536244
std        220.382475
min        149.000000
25%        293.000000
50%        413.000000
75%        584.000000
max       1373.000000
Name: seq_length_input, dtype: float64

In [15]:
# random sampling to test in collab
if is_colab: df = df.sample(frac=0.10)
df.shape[0]

3676

In [16]:
# create data datasets
from datasets import Dataset,DatasetDict
ds = Dataset.from_pandas(df)
ds

Dataset({
    features: ['discourse_id', 'essay_id', 'discourse_text', 'discourse_type', 'discourse_effectiveness', 'essay_text', 'seq_length_essay', 'seq_length_dis', 'masked_ess_txt', 'seq_length_mask_ess', 'input', 'seq_length_input', '__index_level_0__'],
    num_rows: 3676
})

In [17]:
# label encoding
from datasets import ClassLabel
labels = ClassLabel(names=df.discourse_effectiveness.unique().tolist())

In [18]:
model_nm = 'microsoft/deberta-v3-small'

In [19]:
from transformers import AutoModelForSequenceClassification,AutoTokenizer, DataCollatorWithPadding
tokz = AutoTokenizer.from_pretrained(model_nm)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [20]:
tokz.model_max_length, tokz.is_fast

(1000000000000000019884624838656, True)

In [33]:
def tok_func(batch, is_test = False): 
    tokens = tokz(batch["input"], padding="longest") #, truncation=True)
    if not is_test:
        tokens['labels'] = labels.str2int(batch['discourse_effectiveness'])
        # [float(l) for l in labels.str2int(batch['discourse_effectiveness'])]
    return tokens
tok_ds = ds.map(tok_func, batched=True)

  0%|          | 0/4 [00:00<?, ?ba/s]

In [35]:
labels.num_classes, labels.names

(3, ['Effective', 'Adequate', 'Ineffective'])

In [36]:
# tok_ds = tok_ds.rename_columns({'discourse_effectiveness':'labels'})

In [37]:
# row = tok_ds[0]
# row['input'], row['input_ids'], row['labels']

In [38]:
dds = tok_ds.train_test_split(0.25, seed=42)
dds

DatasetDict({
    train: Dataset({
        features: ['discourse_id', 'essay_id', 'discourse_text', 'discourse_type', 'discourse_effectiveness', 'essay_text', 'seq_length_essay', 'seq_length_dis', 'masked_ess_txt', 'seq_length_mask_ess', 'input', 'seq_length_input', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 2757
    })
    test: Dataset({
        features: ['discourse_id', 'essay_id', 'discourse_text', 'discourse_type', 'discourse_effectiveness', 'essay_text', 'seq_length_essay', 'seq_length_dis', 'masked_ess_txt', 'seq_length_mask_ess', 'input', 'seq_length_input', '__index_level_0__', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 919
    })
})

In [56]:
from transformers import TrainingArguments,Trainer, IntervalStrategy
import evaluate
import numpy as np

In [57]:
data_collator = DataCollatorWithPadding(tokenizer=tokz)

In [82]:
bs = 4
grad_acc = 16
epochs = 4
lr = 5e-5
metric_name = "f1"

args = TrainingArguments('outputs',
                         learning_rate=lr,
                         warmup_ratio=0.1,
                         lr_scheduler_type='cosine',
                         fp16=True,
                        evaluation_strategy="epoch",
                         logging_strategy = "epoch",
                        #  logging_steps = 10,
                        #  eval_steps = 10,
                         per_device_train_batch_size=bs,
                          per_device_eval_batch_size=bs*2,
                        num_train_epochs=epochs,
                          weight_decay=0.01,
                         gradient_accumulation_steps=grad_acc,
                        #   load_best_model_at_end=True,
                            metric_for_best_model=metric_name,
                        #  label_names = labels.names, # possibly this is messing up with metrics
                          report_to='none')

PyTorch: setting up devices


In [83]:
import gc
import torch
def report_gpu():
    print(torch.cuda.list_gpu_processes())
    gc.collect()
    torch.cuda.empty_cache()

In [84]:
report_gpu()

GPU:0
process      61761 uses    15413.000 MB GPU memory


In [85]:
model = AutoModelForSequenceClassification.from_pretrained(model_nm, 
                                                           num_labels=labels.num_classes)
def acc_metrics(eval_preds):
    metric = evaluate.load("f1")
    # metric = evaluate.load("accuracy")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions,
                          references=labels,
                          average = 'weighted')
    # return {"accuracy": (predictions == labels).astype(np.float32).mean().item()}

trainer = Trainer(model, args, 
                  train_dataset=dds['train'], 
                  eval_dataset=dds['test'],
                  data_collator=data_collator,
                  tokenizer=tokz,
                  compute_metrics=acc_metrics)

loading configuration file https://huggingface.co/microsoft/deberta-v3-small/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/8e0c12a7672d1d36f647c86e5fc3a911f189d8704e2bc94dde4a1ffe38f648fa.9df96bac06c2c492bc77ad040068f903c93beec14607428f25bf9081644ad0da
Model config DebertaV2Config {
  "_name_or_path": "microsoft/deberta-v3-small",
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 

In [86]:
trainer.train();

The following columns in the training set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: masked_ess_txt, discourse_type, discourse_text, seq_length_input, essay_text, discourse_effectiveness, __index_level_0__, seq_length_essay, seq_length_mask_ess, input, essay_id, discourse_id, seq_length_dis. If masked_ess_txt, discourse_type, discourse_text, seq_length_input, essay_text, discourse_effectiveness, __index_level_0__, seq_length_essay, seq_length_mask_ess, input, essay_id, discourse_id, seq_length_dis are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2757
  Num Epochs = 2
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 16
  Total optimization steps = 86


Epoch,Training Loss,Validation Loss,F1
0,0.9482,0.779412,0.593373
1,0.7566,0.736639,0.603832


The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: masked_ess_txt, discourse_type, discourse_text, seq_length_input, essay_text, discourse_effectiveness, __index_level_0__, seq_length_essay, seq_length_mask_ess, input, essay_id, discourse_id, seq_length_dis. If masked_ess_txt, discourse_type, discourse_text, seq_length_input, essay_text, discourse_effectiveness, __index_level_0__, seq_length_essay, seq_length_mask_ess, input, essay_id, discourse_id, seq_length_dis are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 919
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: masked_ess_txt, discourse_type, discourse_text, seq_length_input, essay_text, discourse_effectiveness, __ind

In [87]:
trainer.save_model(model_save_path)

Saving model checkpoint to /content/feedback-prize-effectiveness_out/models
Configuration saved in /content/feedback-prize-effectiveness_out/models/config.json
Model weights saved in /content/feedback-prize-effectiveness_out/models/pytorch_model.bin
tokenizer config file saved in /content/feedback-prize-effectiveness_out/models/tokenizer_config.json
Special tokens file saved in /content/feedback-prize-effectiveness_out/models/special_tokens_map.json


### Test prediction

In [88]:
eval_df = pd.read_csv(path/'test.csv')
# pre-process test df texts
eval_df['essay_text'] = eval_df['essay_id'].apply(lambda x: file_read(path / 'test' / f'{x}.txt'))
eval_df['masked_ess_txt'] = eval_df[['essay_text','discourse_text']].apply(lambda row: row.essay_text.strip().replace(row.discourse_text.strip(),
                                                                                                         '__MASKED__'),
                                                              axis = 1)
eval_df['input'] = 'CONTEXT: ' + eval_df.masked_ess_txt + '; TYPE: ' + eval_df.discourse_type + '; DISCOURSE: ' + eval_df.discourse_text

In [89]:
from functools import partial
tst_tok_func = partial(tok_func, is_test = True)
eval_ds = Dataset.from_pandas(eval_df).map(tst_tok_func, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [90]:
preds = trainer.predict(eval_ds).predictions
preds.astype(float)

The following columns in the test set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: masked_ess_txt, discourse_type, discourse_text, essay_text, input, essay_id, discourse_id. If masked_ess_txt, discourse_type, discourse_text, essay_text, input, essay_id, discourse_id are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 10
  Batch size = 8


array([[ 0.48876953,  0.78320312, -1.29199219],
       [ 0.49682617,  0.77978516, -1.296875  ],
       [ 0.50097656,  0.77929688, -1.30078125],
       [ 0.47509766,  0.78466797, -1.28320312],
       [ 0.46020508,  0.78613281, -1.27246094],
       [ 0.48852539,  0.78125   , -1.29296875],
       [ 0.50488281,  0.77832031, -1.3046875 ],
       [ 0.49511719,  0.77978516, -1.296875  ],
       [ 0.4921875 ,  0.78027344, -1.29492188],
       [ 0.49121094,  0.78125   , -1.29394531]])

In [91]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(model_save_path)

loading configuration file /content/feedback-prize-effectiveness_out/models/config.json
Model config DebertaV2Config {
  "_name_or_path": "/content/feedback-prize-effectiveness_out/models",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 

In [93]:
raw_inputs = eval_df['input'].tolist()
inputs = tokz(raw_inputs, padding="longest",return_tensors="pt")
inputs

{'input_ids': tensor([[     1,  20967, 104917,  ...,    260,      2,      0],
        [     1,  20967, 104917,  ...,   1141,      2,      0],
        [     1,  20967, 104917,  ...,   1262,      2,      0],
        ...,
        [     1,  20967, 104917,  ...,    260,      2,      0],
        [     1,  20967, 104917,  ...,    260,      2,      0],
        [     1,  20967, 104917,  ...,    955,    260,      2]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 1, 1, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 1, 1, 1]])}

In [94]:
outputs = model(**inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.4895,  0.7833, -1.2925],
        [ 0.4966,  0.7800, -1.2969],
        [ 0.5023,  0.7791, -1.3011],
        [ 0.4747,  0.7846, -1.2829],
        [ 0.4605,  0.7860, -1.2724],
        [ 0.4894,  0.7813, -1.2937],
        [ 0.5049,  0.7781, -1.3043],
        [ 0.4948,  0.7800, -1.2965],
        [ 0.4925,  0.7803, -1.2950],
        [ 0.4908,  0.7811, -1.2939]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [95]:
sm_preds = torch.nn.functional.softmax(outputs.logits, dim=-1)
sm_preds

tensor([[0.3985, 0.5345, 0.0671],
        [0.4010, 0.5323, 0.0667],
        [0.4026, 0.5310, 0.0663],
        [0.3944, 0.5376, 0.0680],
        [0.3904, 0.5406, 0.0690],
        [0.3989, 0.5341, 0.0671],
        [0.4036, 0.5303, 0.0661],
        [0.4005, 0.5327, 0.0668],
        [0.3999, 0.5332, 0.0669],
        [0.3993, 0.5337, 0.0670]], grad_fn=<SoftmaxBackward0>)

In [97]:
if not iskaggle:
    push_notebook('saan', comp,
                  title='Feedback effeciveness: debertav3 - metrics fixed',
                  file='/content/drive/MyDrive/Colab Notebooks/all_text_concat_DBERTA.ipynb',
                  competition=comp, private=False, gpu=True)

Your kernel title does not resolve to the specified id. This may result in surprising behavior. We suggest making your title something that resolves to the specified id. See https://en.wikipedia.org/wiki/Clean_URL#Slug for more information on how slugs are determined.
Kernel version 1 successfully pushed.  Please check progress at https://www.kaggle.com/code/saansd2003/feedback-effeciveness-debertav3-metrics-fixed
