# 1. Kaggle Dataset configuretion

## 1.1 Install Kaggle & Download `Feedback Prize` dataset

In [1]:
# Kaggle Configuration Start
! pip install kaggle > /dev/null

In [2]:
# Integrate with Kaggle API
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/.
! chmod 600 ~/.kaggle/kaggle.json

In [3]:
! mkdir dataset
%cd dataset

/content/dataset


In [4]:
! kaggle competitions download -c feedback-prize-effectiveness

Downloading feedback-prize-effectiveness.zip to /content/dataset
  0% 0.00/8.13M [00:00<?, ?B/s]
100% 8.13M/8.13M [00:00<00:00, 133MB/s]


In [5]:
! unzip feedback-prize-effectiveness.zip > /dev/null

In [6]:
%cd ..

/content


## 1.2 Install Transformers related lib

In [7]:
# inatall transformers
!pip install transformers > /dev/null

In [8]:
# for fixing transformer issue '#9750'
# https://github.com/huggingface/transformers/issues/9750
!pip install sentencepiece >> /dev/null

In [9]:
!pip install datasets >> /dev/null

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m


# 2. Imports Lib and Training Config

## 2.1 Imports Lib

In [10]:
%matplotlib notebook

In [11]:
import os

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from torch.utils.data import DataLoader
import warnings,transformers,logging,torch
from transformers import TrainingArguments,Trainer
from transformers import AutoModelForSequenceClassification, AutoTokenizer

import datasets
from datasets import load_dataset, Dataset, DatasetDict

## 2.2 Training Config

In [12]:
class CFG:
    train_file_dir = "/content/dataset/train"
    test_file_dir = "/content/dataset/test"
    train_csv_path = "/content/dataset/train.csv"
    test_csv_path = "/content/dataset/test.csv"
    sample_csv_path = "/content/dataset/sample_submission.csv"

    model_nm = 'microsoft/deberta-v3-base'

# 3. Training Baseline

## 3.1 Load the dataset and model

In [13]:
train_df = pd.read_csv(CFG.train_csv_path)
train_df.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate


In [14]:
# test_df = pd.read_csv(CFG.test_csv_path)
# test_df

In [15]:
CFG.model_nm = "microsoft/deberta-v3-small"

In [16]:
print(f"The baseline model is: {CFG.model_nm}")

The baseline model is: microsoft/deberta-v3-small


In [17]:
tokz = AutoTokenizer.from_pretrained(CFG.model_nm)

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/578 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [18]:
sep = tokz.sep_token
sep

'[SEP]'

## 3.2 Feature Engineering(for demo baseline only)

Here, we'll put together a simple baseline that completely ignores the full essay and only uses the discourse text to make a classification.

Let's import the stuff necessary for training a model. We will be using HuggingFace Transformers to train our model (specifically it's Trainer API).


In [19]:
train_df['inputs'] = train_df.discourse_type + sep + train_df.discourse_text

In [20]:
train_df.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_effectiveness,inputs
0,0013cc385424,007ACE74B050,"Hi, i'm Isaac, i'm going to be writing about h...",Lead,Adequate,"Lead[SEP]Hi, i'm Isaac, i'm going to be writin..."
1,9704a709b505,007ACE74B050,"On my perspective, I think that the face is a ...",Position,Adequate,"Position[SEP]On my perspective, I think that t..."
2,c22adee811b6,007ACE74B050,I think that the face is a natural landform be...,Claim,Adequate,Claim[SEP]I think that the face is a natural l...
3,a10d361e54e4,007ACE74B050,"If life was on Mars, we would know by now. The...",Evidence,Adequate,"Evidence[SEP]If life was on Mars, we would kno..."
4,db3e453ec4e2,007ACE74B050,People thought that the face was formed by ali...,Counterclaim,Adequate,Counterclaim[SEP]People thought that the face ...


HuggingFace expects that the **target** is in a column to be called **`label`**, and also that the targets are numerical. We will categorize it and create a new column:

In [21]:
new_label = {"discourse_effectiveness": {"Ineffective": 0, "Adequate": 1, "Effective": 2}}
train_df = train_df.replace(new_label)
train_df = train_df.rename(columns = {"discourse_effectiveness": "label"})

Now let's create our Dataset object:

In [22]:
ds = Dataset.from_pandas(train_df)

To tokenize the data, let's create a function, since that's what Dataset.map will need:

In [23]:
def tok_func(x): return tokz(x["inputs"], truncation=True)

Let's see what one example looks like when tokenized:

In [24]:
 ds[0]

{'discourse_id': '0013cc385424',
 'discourse_text': "Hi, i'm Isaac, i'm going to be writing about how this face on Mars is a natural landform or if there is life on Mars that made it. The story is about how NASA took a picture of Mars and a face was seen on the planet. NASA doesn't know if the landform was created by life on Mars, or if it is just a natural landform. ",
 'discourse_type': 'Lead',
 'essay_id': '007ACE74B050',
 'inputs': "Lead[SEP]Hi, i'm Isaac, i'm going to be writing about how this face on Mars is a natural landform or if there is life on Mars that made it. The story is about how NASA took a picture of Mars and a face was seen on the planet. NASA doesn't know if the landform was created by life on Mars, or if it is just a natural landform. ",
 'label': 1}

In [25]:
tok_func(ds[0])

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'input_ids': [1, 8380, 2, 2684, 261, 584, 280, 358, 11759, 261, 584, 280, 358, 446, 264, 282, 898, 314, 361, 291, 812, 277, 7583, 269, 266, 1008, 1311, 9930, 289, 337, 343, 269, 432, 277, 7583, 272, 412, 278, 260, 279, 697, 269, 314, 361, 7658, 681, 266, 1355, 265, 7583, 263, 266, 812, 284, 757, 277, 262, 3568, 260, 7658, 702, 280, 297, 391, 337, 262, 1311, 9930, 284, 994, 293, 432, 277, 7583, 261, 289, 337, 278, 269, 348, 266, 1008, 1311, 9930, 260, 2], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

We can now tokenize the the input. We'll use **`Dataset.map`** to speed it up, and remove the columns we no longer need:

In [26]:
inps = "discourse_text","discourse_type"
tok_ds = ds.map(tok_func, batched=True, remove_columns=inps+('inputs','discourse_id','essay_id'))



  0%|          | 0/37 [00:00<?, ?ba/s]

In [27]:
# Let's see all the columns:
tok_ds[0].keys()

dict_keys(['label', 'input_ids', 'token_type_ids', 'attention_mask'])

## 3.3 Split training set into training and validation set

Next we need to split the dataset into a training set and a validation set. We will split based on essays:

In [28]:
essay_ids = train_df.essay_id.unique()
np.random.seed(42)
np.random.shuffle(essay_ids)
essay_ids[:5]

array(['B5C606F0A883', 'FA4FE7706A1A', '37A77BEAD718', '0ED28D8A5EC4',
       'F25BA634ADDD'], dtype=object)

We'll do a random 80%-20% split:

In [29]:
val_prop = 0.2
val_sz = int(len(essay_ids)*val_prop)
val_essay_ids = essay_ids[:val_sz]

In [30]:
is_val = np.isin(train_df.essay_id, val_essay_ids)
idxs = np.arange(len(train_df))
val_idxs = idxs[ is_val]
trn_idxs = idxs[~is_val]
len(val_idxs),len(trn_idxs)

(7181, 29584)

In [31]:
# We can use the select method of the Dataset object to create our splits:
dds = DatasetDict({"train":tok_ds.select(trn_idxs),
             "test": tok_ds.select(val_idxs)})

**Here I put all of this into a single function, along with some extra code to deal with the test set (no split necessary):**

In [32]:
def get_dds(df, train=True):
    ds = Dataset.from_pandas(df)
    to_remove = ['discourse_text','discourse_type','inputs','discourse_id','essay_id']
    tok_ds = ds.map(tok_func, batched=True, remove_columns=to_remove)
    if train:
        return DatasetDict({"train":tok_ds.select(trn_idxs), "test": tok_ds.select(val_idxs)})
    else: 
        return tok_ds

## 3.4 Start Training

We are now ready to train! Let's set some hyperparameters. We select a reasonable LR and a batch size that fits in the GPU RAM. I only train for 1 epoch here.

In [33]:
lr,bs = 8e-5,16
wd,epochs = 0.01,1

In [34]:
from sklearn.metrics import log_loss
import torch.nn.functional as F
def score(preds): return {'log loss': log_loss(preds.label_ids, F.softmax(torch.Tensor(preds.predictions)))}

Now we can create our model and trainer. HuggingFace uses the `TrainingArguments` class to set up arguments. We'll use a **cosine scheduler** with warmup. We'll use **fp16** since it's much faster on modern GPUs, and saves some memory. We evaluate using **double-sized batches**, since no gradients are stored so we can do twice as many rows at a time.

In [35]:
def get_trainer(dds):
    args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine', fp16=True,
        evaluation_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
        num_train_epochs=epochs, weight_decay=wd, report_to='none')
    model = AutoModelForSequenceClassification.from_pretrained(CFG.model_nm, num_labels=3)
    return Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],
                   tokenizer=tokz, compute_metrics=score)

**Let's train!**

In [36]:
trainer = get_trainer(dds)
trainer.train()

Downloading:   0%|          | 0.00/273M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2ForSequenceClassification: ['mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from 

Epoch,Training Loss,Validation Loss,Log loss
1,0.7284,0.735931,0.735931


Saving model checkpoint to outputs/checkpoint-500
Configuration saved in outputs/checkpoint-500/config.json
Model weights saved in outputs/checkpoint-500/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-500/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-500/special_tokens_map.json
Saving model checkpoint to outputs/checkpoint-1000
Configuration saved in outputs/checkpoint-1000/config.json
Model weights saved in outputs/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1000/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1000/special_tokens_map.json
Saving model checkpoint to outputs/checkpoint-1500
Configuration saved in outputs/checkpoint-1500/config.json
Model weights saved in outputs/checkpoint-1500/pytorch_model.bin
tokenizer config file saved in outputs/checkpoint-1500/tokenizer_config.json
Special tokens file saved in outputs/checkpoint-1500/special_tokens_map.json
***** Running Eval

TrainOutput(global_step=1849, training_loss=0.7682757710301599, metrics={'train_runtime': 495.7053, 'train_samples_per_second': 59.681, 'train_steps_per_second': 3.73, 'total_flos': 1432355471883072.0, 'train_loss': 0.7682757710301599, 'epoch': 1.0})

# Inference and Submission

Now we get our test CSV again:

In [37]:
test_df = pd.read_csv(CFG.test_csv_path)

...and process the same way we did with the training set:

In [38]:
test_df['inputs'] = test_df.discourse_type + sep + test_df.discourse_text

In [39]:
test_ds = get_dds(test_df,train=False)

  0%|          | 0/1 [00:00<?, ?ba/s]

Now we have our `Dataset` object with our test dataset. Then we can simply perform `Trainer.predict` on our dataset to get the predictions.

In [40]:
preds = F.softmax(torch.Tensor(trainer.predict(test_ds).predictions)).numpy().astype(float)
preds

***** Running Prediction *****
  Num examples = 10
  Batch size = 32


  """Entry point for launching an IPython kernel.


array([[0.01286736, 0.36411402, 0.62301862],
       [0.06979636, 0.7659291 , 0.1642745 ],
       [0.0303631 , 0.48907942, 0.48055744],
       [0.07275592, 0.73194396, 0.19530003],
       [0.03357669, 0.5314191 , 0.43500417],
       [0.00981039, 0.28198114, 0.7082085 ],
       [0.00185673, 0.08751815, 0.91062516],
       [0.05331918, 0.68006921, 0.26661167],
       [0.05208258, 0.57546258, 0.37245476],
       [0.00840186, 0.2914415 , 0.70015669]])

We put it in a CSV:

In [42]:
submission_df = pd.read_csv(CFG.sample_csv_path)
submission_df['Ineffective'] = preds[:,0]
submission_df['Adequate'] = preds[:,1]
submission_df['Effective'] = preds[:,2]
submission_df

Unnamed: 0,discourse_id,Ineffective,Adequate,Effective
0,a261b6e14276,0.012867,0.364114,0.623019
1,5a88900e7dc1,0.069796,0.765929,0.164274
2,9790d835736b,0.030363,0.489079,0.480557
3,75ce6d68b67b,0.072756,0.731944,0.1953
4,93578d946723,0.033577,0.531419,0.435004
5,2e214524dbe3,0.00981,0.281981,0.708209
6,84812fc2ab9f,0.001857,0.087518,0.910625
7,c668ff840720,0.053319,0.680069,0.266612
8,739a6d00f44a,0.052083,0.575463,0.372455
9,bcfae2c9a244,0.008402,0.291442,0.700157


In [43]:
submission_df.to_csv('submission.csv',index=False)

In [51]:
# submit the file to kaggle
!kaggle competitions submit -c feedback-prize-effectiveness -f submission.csv -m "Yeah! I submit my file through the Google Colab!"

100% 765/765 [00:02<00:00, 263B/s]
400 - Bad Request
