In [1]:
import os
import sys
from pathlib import Path
from tqdm import tqdm

import pandas as pd
import numpy as np

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score, hamming_loss, jaccard_score

import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

from transformers import AutoTokenizer, pipeline, TrainingArguments, Trainer, AutoModelForSequenceClassification, DataCollatorWithPadding, EvalPrediction

In [2]:
# config variables and constants
dataset_path = Path("../datasets")
dataset_path.mkdir(exist_ok=True)

model_name = "distilbert-base-uncased"

MAX_LEN = 512
train_batch_size = 8
eval_batch_size = 4
learning_rate = 1e-05
epoch = 1
metric_name = "f1"

In [3]:
class StackOverflowDS(Dataset):
    def __init__(self, text, labels, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.text = text
        self.labels = labels
        self.max_len = max_len
        
    def __len__(self):
        return len(self.text)
        
    def __getitem__(self, index):
        
        cleaned_text = str(self.text[index])
        
        inputs = self.tokenizer.encode_plus(
            cleaned_text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            #padding='max_length',
            truncation=True,
            return_token_type_ids=True      
        )
        
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']
        
        return {
            'input_ids': torch.tensor(ids, dtype=torch.long),
            'attention_mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'labels': torch.tensor(self.labels[index], dtype=torch.float)            
        }

    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    # first, apply sigmoid on predictions which are of shape (batch_size, num_labels)
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions))
    # next, use threshold to turn them into integer predictions
    y_pred = np.zeros(probs.shape)
    y_pred[np.where(probs >= threshold)] = 1
    # finally, compute metrics
    y_true = labels
    # return as dictionary
    metrics = {'F1': f1_score(y_true, y_pred, average="weighted"),
               'ROC_AUC': roc_auc_score(y_true, y_pred, average = 'weighted'),
               'Hamming': hamming_loss(y_true, y_pred)*100,
               'Jaccard': jaccard_score(y_true, y_pred, average="weighted"),
               'Accuracy': accuracy_score(y_true, y_pred)}
    return metrics

def compute_metrics(p: EvalPrediction):
    preds = p.predictions[0] if isinstance(p.predictions, 
            tuple) else p.predictions
    result = multi_label_metrics(
        predictions=preds, 
        labels=p.label_ids)
    return result

In [4]:
df_full = pd.read_parquet(dataset_path/"cleaned_df.parquet")

In [5]:
tags = df_full["Tag"].apply(lambda x: (x.split(',')))
binarizer =  MultiLabelBinarizer()
labels = binarizer.fit_transform(tags)
df_full["labels"] = list(labels)
df_full

Unnamed: 0,Tag,BodyCleaned,TitleCleaned,labels
0,"sql,asp.net",Has anyone got experience creating SQL-based A...,ASP.NET Site Maps,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"c#,.net",I have a little game written in C#. It uses a ...,Adding scripting functionality to .NET applica...,"[1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ..."
2,c++,I am working on a collection of classes used f...,Should I use nested classes in this case?,"[0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ..."
3,.net,I've been writing a few web services for a .ne...,Homegrown consumption of web services,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,sql-server,I wonder how you guys manage deployment of a d...,Deploying SQL Server Databases from Test to Live,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...
830491,javascript,"I'm trying to detect the ""flash out of date"" e...","YouTube iFrame API: no ready call, no error call","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ..."
830492,python,I need to extend a shell script (bash). As I a...,How to execute multiline python code from a ba...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
830493,php,I am building a custom MVC project and I have ...,URL routing in PHP (MVC),"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
830494,android,Under minifyEnabled I changed from false to tr...,Obfuscating code in android studio,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [6]:
x_train_title, x_test_title, y_train, y_test = train_test_split(df_full["TitleCleaned"], df_full["labels"], test_size=0.1, random_state = 0)
x_train_body, x_test_body, y_train, y_test = train_test_split(df_full["BodyCleaned"], df_full["labels"], test_size=0.1, random_state = 0)
samples = x_test_body.sample(1000)
samples_y = y_test[samples.index]

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_name, problem_type="multi_label_classification")

## Using zero shot classification

In [15]:
# classifier = pipeline(model=model_name_small, task="zero-shot-classification", device=0)
# predictions = classifier(samples.to_list(), binarizer.classes_, multi_label=True)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_projector.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

## Finetune a LM for downstream task

In [8]:
num_labels = len(binarizer.classes_)
id2label = {idx:label for idx, label in enumerate(binarizer.classes_)}
label2id = {label:idx for idx, label in enumerate(binarizer.classes_)}
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels, 
                                                           problem_type="multi_label_classification",
                                                           id2label=id2label,
                                                           label2id=label2id).to('cuda')

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classi

In [9]:
dataset_train = StackOverflowDS(x_train_body.reset_index(drop=True), y_train.reset_index(drop=True), tokenizer, MAX_LEN)
dataset_test = StackOverflowDS(x_test_body.reset_index(drop=True), y_test.reset_index(drop=True), tokenizer, MAX_LEN)
dataset_sample = StackOverflowDS(samples.reset_index(drop=True), samples_y.reset_index(drop=True), tokenizer, MAX_LEN)

In [10]:
args = TrainingArguments(f"{model_name}", 
                         evaluation_strategy = "epoch",
                         save_strategy = "epoch",
                         learning_rate=learning_rate,
                         per_device_train_batch_size=train_batch_size,
                         per_device_eval_batch_size=eval_batch_size,
                         num_train_epochs=epoch,
                         weight_decay=0.01,
                         load_best_model_at_end=True,
                         metric_for_best_model=metric_name,
                         output_dir=model_path
                        )
trainer = Trainer(model=model, args=args, train_dataset=dataset_train, 
                  #eval_dataset=dataset_sample,
                  eval_dataset=dataset_test, 
                  tokenizer=tokenizer, data_collator=data_collator, compute_metrics=compute_metrics)

Get zero shot evaluation on our sample of test set

In [26]:
trainer.evaluate()

{'eval_loss': 0.7095041871070862,
 'eval_F1': 0.11609777163263978,
 'eval_ROC_AUC': 0.49808693274758437,
 'eval_Hamming': 60.211111111111116,
 'eval_Jaccard': 0.06330936731748303,
 'eval_Accuracy': 0.0,
 'eval_runtime': 7.9594,
 'eval_samples_per_second': 125.637,
 'eval_steps_per_second': 31.409}

zero shot performance is really bad

In [None]:
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
