<a href="https://colab.research.google.com/github/sayanbanerjee32/feedback-prize-effectiveness/blob/main/all_text_concat_DBERTA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture
try: import fastkaggle
except ModuleNotFoundError:
    !pip install -Uq fastkaggle

!pip install -q datasets
!pip install transformers
!pip install sentencepiece 
!pip install pynvml
!pip install evaluate
from fastkaggle import *

In [2]:
import os
from pathlib import Path
import pandas as pd

In [3]:
# config depending on whether this is running on kaggle or collab
# is_colab = True
is_colab = not os.environ.get('KAGGLE_KERNEL_RUN_TYPE', '')
comp = 'feedback-prize-effectiveness'
if is_colab:
    model_save_path = Path('/content/'+comp+'_out/models')
else:
    model_save_path = Path('/kaggle/working/'+comp) #+'/models')

In [4]:
if is_colab:
    from google.colab import output
    output.enable_custom_widget_manager()

In [5]:
# import colab libraries
if is_colab:
    from google.colab import drive
    drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
# The Kaggle API client expects this file to be in ~/.kaggle,
# so lets move it there.
if is_colab:
    !mkdir ~/.kaggle
    !cp /content/drive/MyDrive/Kaggle_api_auth/kaggle.json ~/.kaggle/

mkdir: cannot create directory ‘/root/.kaggle’: File exists


In [7]:
# This permissions change avoids a warning on Kaggle tool startup.
if is_colab:
    !chmod 600 ~/.kaggle/kaggle.json

In [8]:
path = setup_comp(comp)
path

Path('feedback-prize-effectiveness')

## Text classification

In [9]:
def file_read(file_path):
    with open(file_path, 'r') as _f: 
        all_content = _f.read()
    return all_content

In [10]:
# pre-process text - add all columns 
df = pd.read_csv(path/'train.csv')
df['essay_text'] = df['essay_id'].apply(lambda x: file_read(path / 'train' / f'{x}.txt'))
df.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,essay_text
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,"Hi, i'm Isaac, i'm going to be writing about h..."
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,"Hi, i'm Isaac, i'm going to be writing about h..."
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,"Hi, i'm Isaac, i'm going to be writing about h..."
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,"Hi, i'm Isaac, i'm going to be writing about h..."
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,"Hi, i'm Isaac, i'm going to be writing about h..."


In [11]:
# See sequence length
df['seq_length_essay'] = [len(txt.split()) for txt in df['essay_text'].tolist()]
df['seq_length_dis'] = [len(txt.split()) for txt in df['discourse_text'].tolist()]
df['seq_length_essay'].describe(), df['seq_length_dis'].describe()

(count    36765.000000
 mean       458.588522
 std        220.423420
 min        144.000000
 25%        288.000000
 50%        408.000000
 75%        579.000000
 max       1367.000000
 Name: seq_length_essay, dtype: float64, count    36765.000000
 mean        44.654073
 std         46.669682
 min          1.000000
 25%         16.000000
 50%         28.000000
 75%         57.000000
 max        836.000000
 Name: seq_length_dis, dtype: float64)

In [12]:
# replace discorse text within the context text as __MASKED__
df['masked_ess_txt'] = df[['essay_text','discourse_text']].apply(lambda row: row.essay_text.strip().replace(row.discourse_text.strip(),
                                                                                                         '__MASKED__'),
                                                              axis = 1)
df['seq_length_mask_ess'] = [len(txt.split()) for txt in df['masked_ess_txt'].tolist()]
df['masked_ess_txt'].head(), df['seq_length_mask_ess'].describe()

(0    __MASKED__ On my perspective, I think that the...
 1    Hi, i'm Isaac, i'm going to be writing about h...
 2    Hi, i'm Isaac, i'm going to be writing about h...
 3    Hi, i'm Isaac, i'm going to be writing about h...
 4    Hi, i'm Isaac, i'm going to be writing about h...
 Name: masked_ess_txt, dtype: object, count    36765.000000
 mean       414.791024
 std        213.788307
 min          1.000000
 25%        254.000000
 50%        367.000000
 75%        531.000000
 max       1345.000000
 Name: seq_length_mask_ess, dtype: float64)

In [13]:
# function to truncate discourse text and context text
# this is still in progress
# def trunc_text(text, num_words, unique_centre_tok = None):
#     w_l = text.split()
#     if unique_centre_tok is None:
#         if len(w_l) > num_words: w_l = w_l[:num_words]
#     else:
#         if len(w_l) > num_words:
#             try: pos_tok = w_l.index(unique_centre_tok) + 1
#             except: # in case there is an issue with the replacement
#                 print(text)
#                 pos_tok = round(num_words / 2)
#             if pos_tok > round(len(w_l) / 2):
#                 if len(w_l) > pos_tok + round((num_words - 1) / 2):
#                     start_pos = pos_tok - round((num_words - 1) / 2) - 1
#                 else:
#                     start_pos = len(w_l) - num_words
#                 w_l = w_l[start_pos:(start_pos + num_words)]
#             else:
#                 if pos_tok > round((num_words -1) / 2):
#                     start_pos = pos_tok - round((num_words -1) / 2) - 1
#                     w_l = w_l[start_pos:(start_pos + num_words)]
#                 else:
#                     w_l = w_l[:num_words]
            
#     return ' '.join(w_l)

# [trunc_text("let's see where __MASKED__ we are going.", i, unique_centre_tok = "__MASKED__") for i in range(2,8)]

In [14]:
# combine all text columns for classification 
# concat all
# df['all_text'] = 'CONTEXT: ' + df.essay_text + '; DISCOURSE: ' + df.discourse_text + '; TYPE: ' + df.discourse_type

# concat after truncation
# df['essay_text_trunc'] = df.masked_ess_txt.apply(lambda t: trunc_text(t,512, "__MASKED__"))
# df['discourse_text_trunc'] = df.discourse_text.apply(lambda t: trunc_text(t,64))
# df['all_text'] = 'CONTEXT: ' + df.essay_text_trunc + '; TYPE: ' + df.discourse_type + '; DISCOURSE: ' + df.discourse_text_trunc 
df['input'] = 'CONTEXT: ' + df.masked_ess_txt + '; TYPE: ' + df.discourse_type + '; DISCOURSE: ' + df.discourse_text
df['seq_length_input'] = [len(txt.split()) for txt in df['input'].tolist()]
df['seq_length_input'].describe()

count    36765.000000
mean       463.536244
std        220.382475
min        149.000000
25%        293.000000
50%        413.000000
75%        584.000000
max       1373.000000
Name: seq_length_input, dtype: float64

In [15]:
# random sampling to test in collab
# if is_colab: df = df.sample(frac=0.10)
df.shape[0]

36765

In [16]:
# create data datasets
from datasets import Dataset,DatasetDict
ds = Dataset.from_pandas(df)
ds

Dataset({
    features: ['discourse_id', 'essay_id', 'discourse_text', 'discourse_type', 'discourse_effectiveness', 'essay_text', 'seq_length_essay', 'seq_length_dis', 'masked_ess_txt', 'seq_length_mask_ess', 'input', 'seq_length_input'],
    num_rows: 36765
})

In [17]:
# label encoding
from datasets import ClassLabel
labels = ClassLabel(names=df.discourse_effectiveness.unique().tolist())

In [18]:
model_nm = 'microsoft/deberta-v3-small'

In [19]:
from transformers import AutoModelForSequenceClassification,AutoTokenizer, DataCollatorWithPadding
tokz = AutoTokenizer.from_pretrained(model_nm)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [20]:
tokz.model_max_length, tokz.is_fast

(1000000000000000019884624838656, True)

In [21]:
def tok_func(batch, is_test = False): 
    tokens = tokz(batch["input"], padding="longest") #, truncation=True)
    if not is_test:
        tokens['labels'] = [float(l) for l in labels.str2int(batch['discourse_effectiveness'])]
    return tokens
tok_ds = ds.map(tok_func, batched=True)

  0%|          | 0/37 [00:00<?, ?ba/s]

In [22]:
labels.num_classes, labels.names

(3, ['Adequate', 'Ineffective', 'Effective'])

In [23]:
# tok_ds = tok_ds.rename_columns({'discourse_effectiveness':'labels'})

In [24]:
# row = tok_ds[0]
# row['input'], row['input_ids'], row['labels']

In [25]:
dds = tok_ds.train_test_split(0.25, seed=42)
dds

DatasetDict({
    train: Dataset({
        features: ['discourse_id', 'essay_id', 'discourse_text', 'discourse_type', 'discourse_effectiveness', 'essay_text', 'seq_length_essay', 'seq_length_dis', 'masked_ess_txt', 'seq_length_mask_ess', 'input', 'seq_length_input', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 27573
    })
    test: Dataset({
        features: ['discourse_id', 'essay_id', 'discourse_text', 'discourse_type', 'discourse_effectiveness', 'essay_text', 'seq_length_essay', 'seq_length_dis', 'masked_ess_txt', 'seq_length_mask_ess', 'input', 'seq_length_input', 'input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 9192
    })
})

In [26]:
from transformers import TrainingArguments,Trainer
import evaluate
import numpy as np

In [27]:
data_collator = DataCollatorWithPadding(tokenizer=tokz)

In [28]:
bs = 4
grad_acc = 16
epochs = 2
lr = 2e-5
metric_name = "f1"

args = TrainingArguments('outputs', learning_rate=lr,
                         warmup_ratio=0.1,
                         lr_scheduler_type='cosine',
                         fp16=True,
                        evaluation_strategy="epoch",
                         logging_strategy = "epoch",
                         per_device_train_batch_size=bs,
                          per_device_eval_batch_size=bs*2,
                        num_train_epochs=epochs,
                          weight_decay=0.01,
                         gradient_accumulation_steps=grad_acc,
                        #   load_best_model_at_end=True,
                            metric_for_best_model=metric_name,
                         label_names = labels.names,
                          report_to='none')

In [29]:
import gc
import torch
def report_gpu():
    print(torch.cuda.list_gpu_processes())
    gc.collect()
    torch.cuda.empty_cache()

In [30]:
report_gpu()

GPU:0
no processes are running


In [31]:
model = AutoModelForSequenceClassification.from_pretrained(model_nm, 
                                                           num_labels=labels.num_classes)
def compute_metrics(eval_preds):
    metric = evaluate.load("f1")
    metric = evaluate.load("accuracy")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(model, args, 
                  train_dataset=dds['train'], 
                  eval_dataset=dds['test'],
                  data_collator=data_collator,
                  tokenizer=tokz,
                  compute_metrics=compute_metrics)

Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.bias', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from 

In [32]:
trainer.train();

The following columns in the training set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: discourse_type, essay_text, discourse_id, essay_id, discourse_text, seq_length_essay, masked_ess_txt, seq_length_input, discourse_effectiveness, seq_length_mask_ess, seq_length_dis, input. If discourse_type, essay_text, discourse_id, essay_id, discourse_text, seq_length_essay, masked_ess_txt, seq_length_input, discourse_effectiveness, seq_length_mask_ess, seq_length_dis, input are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 27573
  Num Epochs = 2
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 16
  Total optimization steps = 860


Epoch,Training Loss,Validation Loss
0,No log,No log


The following columns in the evaluation set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: discourse_type, essay_text, discourse_id, essay_id, discourse_text, seq_length_essay, masked_ess_txt, seq_length_input, discourse_effectiveness, seq_length_mask_ess, seq_length_dis, input. If discourse_type, essay_text, discourse_id, essay_id, discourse_text, seq_length_essay, masked_ess_txt, seq_length_input, discourse_effectiveness, seq_length_mask_ess, seq_length_dis, input are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9192
  Batch size = 8
Saving model checkpoint to outputs/checkpoint-500
Configuration saved in outputs/checkpoint-500/config.json
Model weights saved in outputs/checkpoint-500/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-500/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-

Epoch,Training Loss,Validation Loss
0,No log,No log
1,0.758300,No log




Training completed. Do not forget to share your model on huggingface.co/models =)




In [33]:
trainer.save_model(model_save_path)

Saving model checkpoint to /content/feedback-prize-effectiveness_out/models
Configuration saved in /content/feedback-prize-effectiveness_out/models/config.json
Model weights saved in /content/feedback-prize-effectiveness_out/models/pytorch_model.bin
tokenizer config file saved in /content/feedback-prize-effectiveness_out/models/tokenizer_config.json
Special tokens file saved in /content/feedback-prize-effectiveness_out/models/special_tokens_map.json


### Test prediction

In [34]:
eval_df = pd.read_csv(path/'test.csv')
# pre-process test df texts
eval_df['essay_text'] = eval_df['essay_id'].apply(lambda x: file_read(path / 'test' / f'{x}.txt'))
eval_df['masked_ess_txt'] = eval_df[['essay_text','discourse_text']].apply(lambda row: row.essay_text.strip().replace(row.discourse_text.strip(),
                                                                                                         '__MASKED__'),
                                                              axis = 1)
eval_df['input'] = 'CONTEXT: ' + eval_df.masked_ess_txt + '; TYPE: ' + eval_df.discourse_type + '; DISCOURSE: ' + eval_df.discourse_text

In [35]:
from functools import partial
tst_tok_func = partial(tok_func, is_test = True)
eval_ds = Dataset.from_pandas(eval_df).map(tst_tok_func, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [36]:
preds = trainer.predict(eval_ds).predictions
preds.astype(float)

The following columns in the test set don't have a corresponding argument in `DebertaV2ForSequenceClassification.forward` and have been ignored: discourse_type, essay_text, discourse_id, essay_id, discourse_text, masked_ess_txt, input. If discourse_type, essay_text, discourse_id, essay_id, discourse_text, masked_ess_txt, input are not expected by `DebertaV2ForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Prediction *****
  Num examples = 10
  Batch size = 8


array([[ 1.50683594, -0.71386719, -0.49707031],
       [ 1.41796875, -0.82470703, -0.36181641],
       [ 1.33886719, -0.97070312, -0.11932373],
       [ 1.37109375, -0.90087891, -0.23364258],
       [ 1.36035156, -0.94628906, -0.17431641],
       [ 1.75097656,  0.16772461, -1.60742188],
       [ 1.76953125,  0.16845703, -1.61816406],
       [ 1.37011719, -0.90332031, -0.22814941],
       [ 1.74023438,  0.1394043 , -1.57421875],
       [ 1.60351562, -0.24938965, -1.08300781]])

In [37]:
from transformers import AutoModelForSequenceClassification
model = AutoModelForSequenceClassification.from_pretrained(model_save_path)

loading configuration file /content/feedback-prize-effectiveness_out/models/config.json
Model config DebertaV2Config {
  "_name_or_path": "/content/feedback-prize-effectiveness_out/models",
  "architectures": [
    "DebertaV2ForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-07,
  "max_position_embeddings": 512,
  "max_relative_positions": -1,
  "model_type": "deberta-v2",
  "norm_rel_ebd": "layer_norm",
  "num_attention_heads": 12,
  "num_hidden_layers": 6,
  "pad_token_id": 0,
  "pooler_dropout": 0,
  "pooler_hidden_act": "gelu",
  "pooler_hidden_size": 768,
  "pos_att_type": [
    "p2c",
    "c2p"
  ],
  "position_biased_input": false,
  "position_buckets": 

In [38]:
raw_inputs = eval_df['input'].tolist()
inputs = tokz(raw_inputs, padding="longest",return_tensors="pt")
inputs

{'input_ids': tensor([[     1,  20967, 104917,  ...,    260,      2,      0],
        [     1,  20967, 104917,  ...,   1141,      2,      0],
        [     1,  20967, 104917,  ...,   1262,      2,      0],
        ...,
        [     1,  20967, 104917,  ...,    260,      2,      0],
        [     1,  20967, 104917,  ...,    260,      2,      0],
        [     1,  20967, 104917,  ...,    955,    260,      2]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 1, 1, 0],
        ...,
        [1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 1, 1, 1]])}

In [39]:
outputs = model(**inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 1.5059, -0.7145, -0.4957],
        [ 1.4183, -0.8250, -0.3606],
        [ 1.3389, -0.9720, -0.1175],
        [ 1.3704, -0.9007, -0.2334],
        [ 1.3602, -0.9467, -0.1738],
        [ 1.7512,  0.1669, -1.6067],
        [ 1.7691,  0.1683, -1.6178],
        [ 1.3703, -0.9031, -0.2276],
        [ 1.7407,  0.1385, -1.5733],
        [ 1.6038, -0.2496, -1.0825]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [42]:
sm_preds = torch.nn.functional.softmax(outputs.logits, dim=-1)
sm_preds

tensor([[0.8041, 0.0873, 0.1086],
        [0.7844, 0.0832, 0.1324],
        [0.7506, 0.0744, 0.1749],
        [0.7667, 0.0791, 0.1542],
        [0.7603, 0.0757, 0.1640],
        [0.8065, 0.1654, 0.0281],
        [0.8094, 0.1633, 0.0274],
        [0.7661, 0.0789, 0.1550],
        [0.8079, 0.1627, 0.0294],
        [0.8164, 0.1279, 0.0556]], grad_fn=<SoftmaxBackward0>)

In [41]:
if not iskaggle:
    push_notebook('saan', comp,
                  title='Feedback effeciveness: debertav3 small',
                  file='/content/drive/MyDrive/Colab Notebooks/all_text_concat_DBERTA.ipynb',
                  competition=comp, private=False, gpu=True)

Your kernel title does not resolve to the specified id. This may result in surprising behavior. We suggest making your title something that resolves to the specified id. See https://en.wikipedia.org/wiki/Clean_URL#Slug for more information on how slugs are determined.


ApiException: ignored