# Generate a classification model and use the model to filter in good explanations to form a dataset

The script is inspired by [Fine-tuning with custom datasets](https://huggingface.co/transformers/v4.10.1/custom_datasets.html), and the [colab notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/master/transformers_doc/pytorch/custom_datasets.ipynb).

In [1]:
import os
import pandas as pd
import numpy as np

## Load labeled dataset

In [2]:
# Read the labeled explanations CSV that lives one directory above the
# current working directory.
cwd = os.getcwd()
parent_dir = os.path.split(cwd)[0]
labeled_csv_path = os.path.join(parent_dir, 'ExplanationsQualityMarked.valid.csv')
df = pd.read_csv(labeled_csv_path)
df.head()

Unnamed: 0.1,Unnamed: 0,description,reference_code,complexity,rouge-1-r,rouge-1-f,explanation quality (high-1/low-0)
0,199,"Multifactorial of n of order k, n(!!...!). \n ...","def factorialk(n, k, exact=True): \n if exac...",5,0.4,0.242424,1
1,49,Issues an HTTP redirect to the given relative ...,"def redirect(uri, permanent=False, abort=False...",10,0.309091,0.22973,1
2,201,Return a list of installed packages either glo...,"def freeze(bin_env=None, user=None, cwd=None, ...",5,0.303797,0.292683,1
3,126,Returns a RNG object. \n Parameters \n rng_or_...,"def make_rng(rng_or_seed=None, default_seed=No...",7,0.294118,0.175439,1
4,92,Turns a sequence iterator or list into a dicti...,"def to_dict(sequences, key_function=None): \n ...",4,0.266667,0.163265,1


In [3]:
# Show the column names of the labeled frame (the last one holds the quality label).
df.columns

Index(['Unnamed: 0', 'description', 'reference_code', 'complexity',
       'rouge-1-r', 'rouge-1-f', 'explanation quality (high-1/low-0)'],
      dtype='object')

## Train a DistilBERT classifier with the Trainer API

In [4]:
# Pull the description texts and their quality labels out of the labeled frame.
train_texts = df["description"].tolist()
train_labels = df["explanation quality (high-1/low-0)"].tolist()
print(f"num of all samples: {len(train_texts)}")

num of all samples: 269


In [5]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the labeled samples for validation.
# - random_state pins the shuffle so the split (and every metric computed on
#   it) is the same after a kernel restart.
# - stratify keeps the high/low quality ratio equal in both subsets, which
#   matters for a dataset this small.
# NOTE: this cell rebinds train_texts/train_labels, so re-running it splits the
# already-reduced training set again; the size assert below is a sanity check.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts,
    train_labels,
    test_size=.2,
    random_state=42,
    stratify=train_labels,
)
# use validation set as the test set
test_texts, test_labels = val_texts, val_labels
assert len(train_texts) > 200
print(f"num of training samples: {len(train_texts)}")
print(f"num of validation samples: {len(val_texts)}")

num of training samples: 215
num of validation samples: 54


In [6]:
from transformers import DistilBertTokenizerFast

# Load the fast tokenizer of the pretrained uncased DistilBERT checkpoint.
pretrained_name = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizerFast.from_pretrained(pretrained_name)

Now we can simply pass our texts to the tokenizer. We’ll pass truncation=True, which ensures that no sequence is longer than the model’s maximum input length. Padding is not applied at this stage (padding=False): the tokenizer is handed to the Trainer below, which pads the sequences when it assembles batches, so that batches of sequences can still be fed into the model at the same time.

In [7]:
# Only truncation is applied here; padding stays off. The tokenizer is passed
# to the trainer later on, and the inputs get padded there.
encode_options = dict(truncation=True, padding=False)
train_encodings = tokenizer(train_texts, **encode_options)
val_encodings = tokenizer(val_texts, **encode_options)
test_encodings = tokenizer(test_texts, **encode_options)

In [8]:
import torch

class ExplanationDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with labels.

    Args:
        encodings: mapping from field name to a per-example indexable
            sequence (anything exposing ``.items()``).
        labels: indexable sequence with one label per example; its length
            defines the length of the dataset.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under 'labels'."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in an ExplanationDataset.
train_dataset, val_dataset, test_dataset = (
    ExplanationDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

### Fine-tuning with Trainer
The steps above prepared the datasets in the format that the `Trainer` expects. Now all we need to do is create a model
to fine-tune, define the `TrainingArguments`/`TFTrainingArguments` and
instantiate a `Trainer`/`TFTrainer`.

In [9]:
# Define Trainer parameters
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
def compute_metrics(p):
    """Compute classification metrics for one evaluation pass.

    Args:
        p: a pair that unpacks into (predictions, labels). The predictions
            are reduced to class indices with an argmax over axis 1.

    Returns:
        dict with the keys "accuracy", "precision", "recall" and "f1".
    """
    scores, labels = p
    predicted = np.argmax(scores, axis=1)

    # Every metric shares the same (y_true, y_pred) call signature.
    scorers = {
        "accuracy": accuracy_score,
        "precision": precision_score,
        "recall": recall_score,
        "f1": f1_score,
    }
    return {
        metric_name: scorer(y_true=labels, y_pred=predicted)
        for metric_name, scorer in scorers.items()
    }

In [10]:
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
from transformers import EarlyStoppingCallback, set_seed

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    evaluation_strategy="steps",
    eval_steps=10,
    # Checkpoint on the same schedule as evaluation. load_best_model_at_end can
    # only restore a step that was actually written to disk, so the save
    # schedule is stated explicitly instead of relying on library defaults.
    save_strategy="steps",
    save_steps=10,
    num_train_epochs=10,              # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,   # batch size for evaluation
    # warmup_ratio=0.1,                 # ratio of learning rate for warmup
    # do not use warmup_steps if warmup_ratio is set
    # warmup_steps=500,                # number of warmup steps for learning rate scheduler
    learning_rate=1e-5,               # initial learning rate
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    load_best_model_at_end=True,     # restore the best checkpoint when training ends
    metric_for_best_model="f1",      # "best" = highest f1 reported by compute_metrics
)

# The classification head is freshly initialised by from_pretrained, so seed the
# RNGs before the model is created; otherwise each run starts from different
# head weights.
set_seed(training_args.seed)
model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=val_dataset,             # evaluation dataset
    tokenizer=tokenizer,                 # tokenizer (lets the trainer pad the unpadded inputs)
    compute_metrics=compute_metrics,
    # stop once f1 has failed to improve for 4 consecutive evaluations
    callbacks=[EarlyStoppingCallback(early_stopping_patience=4)],
)

trainer.train()

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_transform.weight', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifi

Step,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
10,0.7027,0.687836,0.5,0.5,1.0,0.666667
20,0.6861,0.676051,0.648148,0.590909,0.962963,0.732394
30,0.6693,0.665515,0.666667,0.6,1.0,0.75
40,0.6323,0.649178,0.685185,0.613636,1.0,0.760563
50,0.6089,0.627274,0.666667,0.609756,0.925926,0.735294
60,0.5666,0.589476,0.62963,0.6,0.777778,0.677419
70,0.5257,0.574779,0.685185,0.65625,0.777778,0.711864
80,0.4514,0.578691,0.703704,0.666667,0.814815,0.733333


***** Running Evaluation *****
  Num examples = 54
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-10
Configuration saved in ./results/checkpoint-10/config.json
Model weights saved in ./results/checkpoint-10/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-10/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-10/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 54
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-20
Configuration saved in ./results/checkpoint-20/config.json
Model weights saved in ./results/checkpoint-20/pytorch_model.bin
tokenizer config file saved in ./results/checkpoint-20/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-20/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 54
  Batch size = 64
Saving model checkpoint to ./results/checkpoint-30
Configuration saved in ./results/checkpoint-30/config.json
Model weights saved in ./

TrainOutput(global_step=80, training_loss=0.6054039359092712, metrics={'train_runtime': 88.8492, 'train_samples_per_second': 24.198, 'train_steps_per_second': 1.576, 'total_flos': 137585763006804.0, 'train_loss': 0.6054039359092712, 'epoch': 5.71})

In [11]:
# Run the fine-tuned model on the test split and unpack the result.
test_output = trainer.predict(test_dataset)
predictions, label_ids, metrics = test_output

***** Running Prediction *****
  Num examples = 54
  Batch size = 64


## Apply the model to the full corpus

In [13]:
# Read the full training corpus; the first CSV column is used as the index.
corpus_csv_path = os.path.join(parent_dir, 'long_code_desc.train.csv')
df_all = pd.read_csv(corpus_csv_path, index_col=0)
df_all.head()

Unnamed: 0,description,reference_code,complexity,rouge-1-r,rouge-1-f
0,Pulls all flashed messages from the session an...,def get_flashed_messages(with_categories=False...,6,0.16129,0.07874
1,Yield images of the laplacian pyramid formed b...,"def pyramid_laplacian(image, max_layer=(-1), d...",5,0.161765,0.101852
2,Import module by name \n :param name: \n Modul...,"def import_module(name, required=True): \n t...",4,0.15,0.133333
3,"Given a DataSource, generates a dictionary tha...","def mapping(data_source, geom_name='geom', lay...",6,0.16,0.153846
4,Store a temporary file. \n @param filedata: co...,"def store_temp_file(filedata, filename, path=N...",5,0.041667,0.058824


In [14]:
print(f"num of examples in full corpus: {len(df_all)}")
# Every corpus example gets the constant placeholder label 1; these are not
# real quality annotations.
corpus_texts = df_all["description"].tolist()
corpus_labels = [1] * len(df_all)

num of examples in full corpus: 13437


In [15]:
# Tokenize the corpus the same way as the train/val/test splits: truncate only.
# Padding up front would pad every example to the longest one in the whole
# corpus; with padding off, the trainer (which was given the tokenizer) pads
# the inputs itself at prediction time.
corpus_encodings = tokenizer(corpus_texts, truncation=True, padding=False)
corpus_dataset = ExplanationDataset(corpus_encodings, corpus_labels)

In [17]:
# Score every corpus example. The corpus labels are constant placeholders, so
# only `predictions` carries information here; label_ids/metrics do not.
corpus_output = trainer.predict(corpus_dataset)
predictions, label_ids, metrics = corpus_output

***** Running Prediction *****
  Num examples = 13437
  Batch size = 64


In [18]:
# Pick the higher-scoring class per example; class 1 counts as "qualified".
pred_quality = np.argmax(predictions, axis=1)
assert len(pred_quality) == len(corpus_texts)
num_qualified = sum(pred_quality)
print(f"num of qualified examples: {num_qualified}")
print(f"ratio of qualified examples: {num_qualified/len(pred_quality)}")

num of qualified examples: 10369
ratio of qualified examples: 0.7716752251246558


## Save qualified data

In [19]:
# Attach the predicted quality to the corpus frame under the same column name
# the labeled data uses, then keep only the rows predicted as class 1.
quality_column = "explanation quality (high-1/low-0)"
df_all[quality_column] = pred_quality
# filter in qualified examples
is_qualified = df_all[quality_column] == 1
df_qualified = df_all[is_qualified]


In [20]:
# Write the qualified subset into the same parent directory the input CSVs are
# read from. An existing file is deliberately never overwritten, but the skip is
# reported: in that case the file on disk may come from an earlier run and need
# not match the current df_qualified.
qualified_csv_path = os.path.join(parent_dir, 'QualifiedExplanations.train.csv')
if not os.path.exists(qualified_csv_path):
    df_qualified.to_csv(qualified_csv_path)
    print(f"saved {len(df_qualified)} qualified examples to {qualified_csv_path}")
else:
    print(f"{qualified_csv_path} already exists; not overwriting. Delete it to save the current results.")