In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Lazily enumerate the full path of every file below the Kaggle input mount,
# then echo them one per line.
input_paths = (
    os.path.join(root, leaf)
    for root, _subdirs, leaves in os.walk('/kaggle/input')
    for leaf in leaves
)
for path in input_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Shell out to nvidia-smi and print the GPU topology matrix (`topo -m`) for this machine.
!nvidia-smi topo -m

### Install dependencies and check versions

As of now, support for multi-label classification is provided by the latest dev commit, so we have to install `transformers` directly from GitHub.


In [None]:
!pip install git+https://github.com/huggingface/transformers.git datasets -q

In [None]:
import torch 
import datasets
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments,
Trainer)
import transformers

In [None]:
# Display the installed transformers and datasets versions as the cell output.
transformers.__version__, datasets.__version__

### Load data and instantiate transformers Datasets object

In [None]:
# Zipped training CSV inside the Kaggle input mount; read it into a DataFrame.
train_csv_path = '/kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip'
train_df = pd.read_csv(train_csv_path)

In [None]:
# Wrap the DataFrame in a datasets.Dataset and hold out 10% of the rows for evaluation.
# A fixed seed makes the shuffle (and therefore the split) reproducible across kernel restarts.
data = datasets.Dataset.from_pandas(train_df).train_test_split(test_size=0.1, seed=42)

In [None]:
# Show the repr of the train/test split held in `data`.
data

### preprocess data

1. remove 'id' column
2. infer the `labels` column (gather every remaining column except `comment_text` into one list per row)

In [None]:
# The 'id' column is not used downstream, so drop it from the dataset.
data = data.remove_columns(['id'])

In [None]:
# Show the column names of `data` now that 'id' has been removed.
data.column_names

In [None]:
column_names = data['train'].column_names


def _gather_labels(example):
    """Return a 'labels' list holding every column value of `example` except 'comment_text'."""
    return {'labels': [example[col] for col in column_names if col != 'comment_text']}


# Add the combined 'labels' column to every example.
data = data.map(_gather_labels)

In [None]:
# Peek at the first training example to check the newly added 'labels' field.
data['train'][0]

### Tokenize and encode data

In [None]:
# Checkpoint name shared by the tokenizer here and the classification model loaded later.
model_name = 'distilbert-base-uncased'
# Load the tokenizer that matches the checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [None]:
# Every column except 'labels' is passed to remove_columns when tokenizing,
# so collect those names here.
names = list(data['train'].column_names)
del names[names.index('labels')]
print(names)

In [None]:
# Tokenize the comment text, truncating over-long inputs, and drop the raw columns listed in `names`.
# batched=True hands the tokenizer a list of texts per call instead of one row at a time, which is
# much faster; since no padding is requested, each row's encoding is the same as in the unbatched form.
data_enc = data.map(lambda batch: tokenizer(batch['comment_text'], truncation=True),
                    batched=True,
                    remove_columns=names)

In [None]:
# Switch the output format to 'torch' so that indexing yields tensors
# (the lambda below calls .to() on x['labels']).
data_enc.set_format('torch')
# Cast the label vector to float: write it to a temporary 'float_labels' column,
# drop the original 'labels' column, then rename the new column back to "labels".
data_enc = data_enc.map(lambda x: {'float_labels': x['labels'].to(torch.float)},
                       remove_columns=['labels']).rename_column('float_labels', "labels")

In [None]:
# Sanity check: shape of the input_ids tensor for one encoded training example (index 4).
data_enc['train'][4]['input_ids'].shape

### Training the model

* Here we override the `compute_loss` method of the `Trainer` class to accommodate multi-label classification

In [None]:
# Load the pretrained checkpoint with a 6-output classification head.
# NOTE(review): num_labels=6 presumably matches the number of label columns in the dataset — confirm.
# Prefer the GPU, but fall back to CPU so this cell does not crash on a machine without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6).to(device)

In [None]:
class MultiLabelTrainer(Trainer):
    """Trainer whose loss is binary cross-entropy with logits, applied per label.

    Each output logit is scored against its own float target, which is what
    multi-label classification needs.
    """

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Compute the multi-label BCE-with-logits loss for one batch.

        Args:
            model: The model to run the forward pass with.
            inputs: Mapping of model inputs; must contain a 'labels' entry, which
                is removed from the mapping before the forward pass.
            return_outputs: When true, return ``(loss, outputs)`` instead of only the loss.
            num_items_in_batch: Accepted because recent ``Trainer`` versions pass this
                keyword to ``compute_loss``; it is not used here.

        Returns:
            The scalar loss tensor, or a ``(loss, outputs)`` tuple when
            ``return_outputs`` is true.
        """
        # Take the labels out so that only the remaining entries are forwarded to the model.
        labels = inputs.pop('labels')
        outputs = model(**inputs)
        logits = outputs.logits

        num_labels = self.model.config.num_labels
        loss_fct = torch.nn.BCEWithLogitsLoss()
        # Flatten both tensors to (N, num_labels); BCE needs float targets.
        loss = loss_fct(logits.view(-1, num_labels),
                        labels.float().view(-1, num_labels))
        return (loss, outputs) if return_outputs else loss

In [None]:
# Training configuration: write outputs to the current directory, train for a single epoch
# with 32 examples per device for both training and evaluation, and disable reporting integrations.
args = TrainingArguments(
    output_dir='.',
    report_to="none",
    per_device_eval_batch_size=32,
    per_device_train_batch_size=32,
    num_train_epochs=1,
)

# Wire the model, the encoded splits and the tokenizer into the multi-label trainer.
# NOTE(review): recent transformers releases deprecate `tokenizer=` in favour of
# `processing_class=` — confirm against the installed version.
trainer = MultiLabelTrainer(
    args=args,
    model=model,
    tokenizer=tokenizer,
    eval_dataset=data_enc['test'],
    train_dataset=data_enc['train'],
)

In [None]:
# Display the TrainingArguments repr to inspect the effective settings.
args

In [None]:
# Launch training on the train split with the configured arguments.
trainer.train()

In [None]:
# Evaluate on the split passed as eval_dataset. No compute_metrics function was given to the
# trainer, so presumably only the loss and runtime statistics are reported — confirm.
trainer.evaluate()