In [None]:
!pip install datasets
!pip install evaluate



### Imports

In [None]:
import torch
import torch.nn as nn
import numpy as np
import evaluate
import pandas as pd

import os
import matplotlib.pyplot as plt

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments, DataCollatorWithPadding

from huggingface_hub import notebook_login

### Load Dataset and Preprocess

Building the dataset from scratch ran into problems, so we load a copy that another user has published on the Hugging Face Hub and split it 80%/20% into train and test sets.

In [None]:
# Download the published GitHub-issues dataset and carve a reproducible
# 80/20 train/test split (fixed seed) out of its 'train' split.
issues_dataset = (
    load_dataset('lewtun/github-issues')['train']
    .train_test_split(train_size=0.8, seed=42)
)
issues_dataset

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Repo card metadata block was not found. Setting CardData to empty.


DatasetDict({
    train: Dataset({
        features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
        num_rows: 2415
    })
    test: Dataset({
        features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
        num_rows: 604
    })
})

The dataset also records pull requests as issues, so we need to filter them out.

In [None]:
# Peek at three shuffled training rows to see whether pull requests are present.
samples = issues_dataset['train'].shuffle(seed=42).select(range(3))
samples[:]['is_pull_request']

[True, False, True]

In [None]:
# Keep only genuine issues: drop every row flagged as a pull request.
issues_dataset = issues_dataset.filter(
    lambda issue: not issue['is_pull_request']
)
issues_dataset

DatasetDict({
    train: Dataset({
        features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
        num_rows: 795
    })
    test: Dataset({
        features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
        num_rows: 189
    })
})

In [None]:
# Re-sample three training rows to confirm no pull requests remain after filtering.
samples = issues_dataset['train'].shuffle(seed=42).select(range(3))
samples[:]['is_pull_request']

[False, False, False]

### Average Time it takes to Close an Issue

Some timestamps give a negative time to close the issue (that is, `closed_at` is earlier than `created_at`). **TODO:** I have not yet worked out how to filter these rows out.

In [None]:
# Show the raw creation and closure timestamps of ten shuffled training issues.
samples = issues_dataset['train'].shuffle(seed=42).select(range(10))
print('Creation time stamps: ', samples[:]['created_at'])
print('Closure time stamps: ', samples[:]['closed_at'])

Creation time stamps:  [1610041695000, 1614588091000, 1621000708000, 1630614972000, 1617789496000, 1590709652000, 1623677124000, 1608076941000, 1589547689000, 1613500798000]
Closure time stamps:  [None, None, None, None, 1618913043000, 1590969875000, None, 1623944445000, 1589548240000, 1614793322000]


Some issues are still open, so we filter them out. These issues do not have a `closed_at` time stamp.

In [None]:
# Keep only issues that carry a closure timestamp (open issues have None).
issues_dataset = issues_dataset.filter(
    lambda issue: issue['closed_at'] is not None
)
issues_dataset

DatasetDict({
    train: Dataset({
        features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
        num_rows: 532
    })
    test: Dataset({
        features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
        num_rows: 127
    })
})

In [None]:
# Re-sample ten training issues to confirm every one now has a closure timestamp.
samples = issues_dataset['train'].shuffle(seed=42).select(range(10))
print('Creation time stamps: ', samples[:]['created_at'])
print('Closure time stamps: ', samples[:]['closed_at'])

Creation time stamps:  [1623437242000, 1600481711000, 1631714710000, 1601488203000, 1614149253000, 1609144534000, 1616146063000, 1596498396000, 1621581755000, 1621881533000]
Closure time stamps:  [1625481957000, 1600533991000, 1631726303000, 1601560874000, 1614337806000, 1609144687000, 1618563044000, 1599490393000, 1622828385000, 1623229645000]


In [None]:
# Switch the output format so that slicing a split returns a pandas DataFrame.
issues_dataset.set_format(type='pandas')

Average time to close in training set

In [None]:
# Materialise the training split (the dataset is in pandas format at this point).
issues_train_df = issues_dataset['train'][:]
# The raw `created_at` / `closed_at` values printed above are 13-digit epoch
# timestamps, i.e. milliseconds, so they must be parsed with unit='ms'
# (unit='s' would place the dates tens of thousands of years in the future).
# pd.to_datetime converts the whole column at once instead of row by row.
issues_train_df['created_at'] = pd.to_datetime(issues_train_df['created_at'], unit='ms', utc=True)
issues_train_df['closed_at'] = pd.to_datetime(issues_train_df['closed_at'], unit='ms', utc=True)
# Time each issue stayed open, as a Timedelta column.
issues_train_df['time_to_close'] = issues_train_df['closed_at'] - issues_train_df['created_at']
print(issues_train_df['time_to_close'].mean())

18 days 18:22:34.511278195


Average time to close in testing set

In [None]:
# Materialise the test split (the dataset is in pandas format at this point).
issues_test_df = issues_dataset['test'][:]
# The raw `created_at` / `closed_at` values are 13-digit epoch timestamps,
# i.e. milliseconds, so they must be parsed with unit='ms' (unit='s' would
# place the dates tens of thousands of years in the future).
# pd.to_datetime converts the whole column at once instead of row by row.
issues_test_df['created_at'] = pd.to_datetime(issues_test_df['created_at'], unit='ms', utc=True)
issues_test_df['closed_at'] = pd.to_datetime(issues_test_df['closed_at'], unit='ms', utc=True)
# Time each issue stayed open, as a Timedelta column.
issues_test_df['time_to_close'] = issues_test_df['closed_at'] - issues_test_df['created_at']
print(issues_test_df['time_to_close'].mean())

20 days 02:29:17.480314960


In [None]:
# Undo the pandas output format set earlier so the splits return rows in the
# default format again.
issues_dataset.reset_format()

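For the next set of tasks, it does not matter whether an issue is open or closed. Note, however, that `issues_dataset` was overwritten by the closed-issue filter above, so the cells below actually operate on closed issues only; re-create the dataset before that filter if open issues should be included as well.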

### Multi Label Classification

In [None]:
# Inspect the raw label records attached to five shuffled training issues.
samples = issues_dataset['train'].shuffle(seed=42).select(range(5))
samples[:]['labels']

[[{'id': 1935892871,
   'node_id': 'MDU6TGFiZWwxOTM1ODkyODcx',
   'url': 'https://api.github.com/repos/huggingface/datasets/labels/enhancement',
   'name': 'enhancement',
   'color': 'a2eeef',
   'default': True,
   'description': 'New feature or request'}],
 [{'id': 1935892857,
   'node_id': 'MDU6TGFiZWwxOTM1ODkyODU3',
   'url': 'https://api.github.com/repos/huggingface/datasets/labels/bug',
   'name': 'bug',
   'color': 'd73a4a',
   'default': True,
   'description': "Something isn't working"}],
 [],
 [],
 [{'id': 2107841032,
   'node_id': 'MDU6TGFiZWwyMTA3ODQxMDMy',
   'url': 'https://api.github.com/repos/huggingface/datasets/labels/nlp-viewer',
   'name': 'nlp-viewer',
   'color': '94203D',
   'default': False,
   'description': ''}]]

Some issues do not have any associated labels. So we filter them out. Likewise, some issues may have multiple labels, so we need a multi label classification.

We could, after finetuning the classifier, use it to predict the labels for those samples whose labels are missing.

In [None]:
# Drop issues whose label list is empty; only labelled issues can be used for training.
issues_dataset = issues_dataset.filter(
    lambda issue: len(issue['labels']) > 0
)
issues_dataset

DatasetDict({
    train: Dataset({
        features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
        num_rows: 228
    })
    test: Dataset({
        features: ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request'],
        num_rows: 49
    })
})

In [None]:
# Re-sample five training issues to confirm each one now carries at least one label.
samples = issues_dataset['train'].shuffle(seed=42).select(range(5))
samples[:]['labels']

[[{'id': 2067393914,
   'node_id': 'MDU6TGFiZWwyMDY3MzkzOTE0',
   'url': 'https://api.github.com/repos/huggingface/datasets/labels/metric%20bug',
   'name': 'metric bug',
   'color': '25b21e',
   'default': False,
   'description': 'A bug in a metric script'}],
 [{'id': 1935892857,
   'node_id': 'MDU6TGFiZWwxOTM1ODkyODU3',
   'url': 'https://api.github.com/repos/huggingface/datasets/labels/bug',
   'name': 'bug',
   'color': 'd73a4a',
   'default': True,
   'description': "Something isn't working"}],
 [{'id': 1935892857,
   'node_id': 'MDU6TGFiZWwxOTM1ODkyODU3',
   'url': 'https://api.github.com/repos/huggingface/datasets/labels/bug',
   'name': 'bug',
   'color': 'd73a4a',
   'default': True,
   'description': "Something isn't working"}],
 [{'id': 1935892857,
   'node_id': 'MDU6TGFiZWwxOTM1ODkyODU3',
   'url': 'https://api.github.com/repos/huggingface/datasets/labels/bug',
   'name': 'bug',
   'color': 'd73a4a',
   'default': True,
   'description': "Something isn't working"}],
 [{'id

accumulating all unique labels across the train and test splits.

In [None]:
# Collect every distinct label name, scanning the train split first and then
# the test split. dict.fromkeys de-duplicates while keeping first-seen order,
# so the resulting list matches an append-if-missing loop.
all_labels = list(dict.fromkeys(
    label['name']
    for split in ('train', 'test')
    for issue_labels in issues_dataset[split]['labels']
    for label in issue_labels
))

print('Total number of labels: ', len(all_labels))
print('Examples: ', all_labels[:5])

Total number of labels:  17
Examples:  ['bug', 'documentation', 'enhancement', 'good first issue', 'generic discussion']


creating the label to id and id to label mapping

In [None]:
# Forward mapping: label name -> position of that name in all_labels.
label2id = {name: index for index, name in enumerate(all_labels)}
# Reverse mapping: position -> label name.
id2label = {index: name for name, index in label2id.items()}

print(label2id)
print(id2label)

{'bug': 0, 'documentation': 1, 'enhancement': 2, 'good first issue': 3, 'generic discussion': 4, 'dataset request': 5, 'dataset bug': 6, 'question': 7, 'nlp-viewer': 8, 'duplicate': 9, 'speech': 10, 'Metric discussion': 11, 'metric bug': 12, 'wontfix': 13, 'Dataset discussion': 14, 'help wanted': 15, 'metric request': 16}
{0: 'bug', 1: 'documentation', 2: 'enhancement', 3: 'good first issue', 4: 'generic discussion', 5: 'dataset request', 6: 'dataset bug', 7: 'question', 8: 'nlp-viewer', 9: 'duplicate', 10: 'speech', 11: 'Metric discussion', 12: 'metric bug', 13: 'wontfix', 14: 'Dataset discussion', 15: 'help wanted', 16: 'metric request'}


renaming `labels` column in the datasets to `text_labels` so that it won't interfere in training. we will create another `labels` column where we will have 1s for the labels present and 0s for those absent.

In [None]:
# Free up the name 'labels' for the numeric target vector created later;
# the raw label records move to 'text_labels'.
issues_dataset = issues_dataset.rename_column(
    original_column_name='labels',
    new_column_name='text_labels',
)
print('Train columns: ', issues_dataset['train'].column_names)
print('Test columns: ', issues_dataset['test'].column_names)

Train columns:  ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'text_labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request']
Test columns:  ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'text_labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request']


In [None]:
# Look at the free-text body of three shuffled training issues.
samples = issues_dataset['train'].shuffle(seed=42).select(range(3))
samples[:]['body']

['When I load more than one metric and try to print doc string of a particular metric,. It shows the doc strings of all imported metric one after the other which looks quite confusing and clumsy.\r\nAttached [Colab](https://colab.research.google.com/drive/13H0ZgyQ2se0mqJ2yyew0bNEgJuHaJ8H3?usp=sharing) Notebook for problem clarification..',
 '[This colab notebook](https://colab.research.google.com/drive/151gKyo0YIwnlznrOHst23oYH_a3mAe3Z?usp=sharing) implements a token classification input pipeline extending the logic from [this hugging example](https://huggingface.co/transformers/custom_datasets.html#tok-ner).\r\n\r\nThe pipeline works fine with most instance in different languages, but unfortunately, [the Japanese Kana ligature (a form of abbreviation? I don\'t know Japanese well)](https://en.wikipedia.org/wiki/Kana_ligature) break the alignment of `return_offsets_mapping`:\r\n![image](https://user-images.githubusercontent.com/50871412/122904371-db192700-d382-11eb-8917-1775db76db69.png

In [None]:
# Titles of the same three issues sampled in the previous cell.
samples[:]['title']

['Problem while printing doc string when instantiating multiple metrics.',
 "Tokenizer's normalization preprocessor cause misalignment in return_offsets_mapping for tokenizer classification task",
 'timit_asr dataset only includes one text phrase']

mapping the dataset splits to create a `labels` column that will be used during training. we will use the label2id mapping here.

we can include the `body` field to make things better but it will need additional processing to remove special characters probably, or a specialized tokenizer. for now, we will include it and the titles. but first, we load the model and the tokenizer.

In [None]:
# Checkpoint shared by the tokenizer and the classifier.
model_path = 'distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Sequence-classification head with one output per label name; the
# multi-label problem type is requested because an issue can carry several labels.
model = AutoModelForSequenceClassification.from_pretrained(
    model_path,
    num_labels=len(all_labels),
    id2label=id2label,
    label2id=label2id,
    problem_type="multi_label_classification",
)

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Add numeric labels and label names (textual labels).

In [None]:
def add_label(example):
  """Attach a multi-hot target vector and the plain label names to one row.

  Reads example['text_labels'] (a list of label records, each with a 'name')
  and writes two new keys on the same dict:
    'labels'      - list of floats, one slot per entry of all_labels, 1.0
                    where the issue carries that label and 0.0 otherwise.
    'label names' - the label names in their original order.
  Returns the modified example.
  """
  names = [record['name'] for record in example['text_labels']]
  # Positions (per label2id) that must be switched on for this issue.
  active_ids = {label2id[name] for name in names}
  example['labels'] = [1. if index in active_ids else 0. for index in range(len(all_labels))]
  example['label names'] = names
  return example

# Run add_label over every row of the dataset, then list the resulting columns.
issues_dataset = issues_dataset.map(function=add_label)
print('New Columns: ', issues_dataset['train'].column_names)

New Columns:  ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'text_labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request', 'labels', 'label names']


In [None]:
# Check the newly added plain-text label names on five shuffled training issues.
samples = issues_dataset['train'].shuffle(seed=42).select(range(5))
samples[:]['label names']

[['metric bug'], ['bug'], ['bug'], ['bug'], ['bug']]

Tokenize the text (`title` concatenated with `body`).

In [None]:
def tokenize_text(example):
  """Tokenize one issue's title and body as a single text sequence.

  Args:
    example: a dataset row providing the keys 'title' and 'body'.

  Returns:
    Whatever the module-level `tokenizer` returns for the combined text
    (called with truncation enabled).
  """
  # A missing title/body (None) would otherwise be rendered by the f-string
  # as the literal word "None" and fed to the model as if it were issue text.
  title = example['title'] or ''
  body = example['body'] or ''
  text = f"{title} \n {body}"
  # No `padding` argument: this function receives one example at a time, where
  # padding to the longest sequence is a no-op; batches are padded at collation.
  return tokenizer(text, truncation=True)

# Tokenize every row of the dataset, then list the resulting columns.
issues_dataset = issues_dataset.map(function=tokenize_text)
print('New Columns: ', issues_dataset['train'].column_names)

Map:   0%|          | 0/228 [00:00<?, ? examples/s]

Map:   0%|          | 0/49 [00:00<?, ? examples/s]

New Columns:  ['url', 'repository_url', 'labels_url', 'comments_url', 'events_url', 'html_url', 'id', 'node_id', 'number', 'title', 'user', 'text_labels', 'state', 'locked', 'assignee', 'assignees', 'milestone', 'comments', 'created_at', 'updated_at', 'closed_at', 'author_association', 'active_lock_reason', 'pull_request', 'body', 'timeline_url', 'performed_via_github_app', 'is_pull_request', 'labels', 'label names', 'input_ids', 'attention_mask']


setting the data return format to pytorch tensors for relevant columns.

In [None]:
# Return only the tokenizer's model inputs plus the target vector, as torch tensors.
issues_dataset.set_format(
    type='torch',
    columns=[*tokenizer.model_input_names, 'labels'],
)

define the metrics functions that will be used for monitoring the training performance.

In [None]:
# Bundle the four classification metrics so a single compute() call reports them all.
clf_metrics = evaluate.combine([
    "accuracy",
    "f1",
    "precision",
    "recall",
])

def sigmoid(x):
   """Element-wise logistic function 1 / (1 + exp(-x)).

   Computed as 0.5 * (1 + tanh(x / 2)), which is algebraically the same but
   never exponentiates a large argument, so strongly negative inputs do not
   raise a NumPy overflow RuntimeWarning.

   Args:
      x: a scalar or array-like of real numbers.

   Returns:
      NumPy value(s) in [0, 1] with the same shape as `x`.
   """
   x = np.asarray(x)
   return 0.5 * (1.0 + np.tanh(0.5 * x))

def compute_metrics(eval_pred):
   """Score one evaluation pass.

   `eval_pred` unpacks into (predictions, labels). The predictions are passed
   through sigmoid and thresholded at 0.5; both arrays are then cast to float
   and flattened to 1-D before being handed to clf_metrics.compute, whose
   result is returned unchanged.
   """
   raw_scores, references = eval_pred
   probabilities = sigmoid(raw_scores)
   # One hard 0/1 decision per (example, label) slot, flattened.
   hard_decisions = (probabilities > 0.5).astype(float).reshape(-1)
   flat_references = references.astype(float).reshape(-1)
   return clf_metrics.compute(predictions=hard_decisions, references=flat_references)

Log in to the Hugging Face Hub, define the training arguments and the trainer (the model was already loaded above), then train the model and push it to the Hub.

In [None]:
# Authenticate with the Hugging Face Hub; the training arguments defined
# below set push_to_hub=True.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [None]:
batch_size = 2
# Number of whole training batches in the train split, used as the logging interval.
logging_steps = len(issues_dataset['train']) // batch_size
# Used as the output directory for the run.
model_name = f'srvmishra832/text-classification/github_issues-dataset-{model_path}'
training_args = TrainingArguments(
    output_dir=model_name,
    # optimisation
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # evaluate and checkpoint once per epoch, reloading the best checkpoint at the end
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # logging / progress output
    log_level='error',
    disable_tqdm=False,
    logging_steps=logging_steps,
    # publishing and column handling
    push_to_hub=True,
    remove_unused_columns=False,
)



In [None]:
# Wire the model, data splits, tokenizer and metric function into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=issues_dataset["train"],
    eval_dataset=issues_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
# Fine-tune, then upload the result to the Hub.
trainer.train()
trainer.push_to_hub()

  trainer = Trainer(model=model, args=training_args,
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33msrvmishra832[0m ([33msrvmishra832-indian-institute-of-science-bangalore[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.3962,0.251287,0.920768,0.34,0.354167,0.326923
2,0.2008,0.184716,0.943577,0.419753,0.586207,0.326923
3,0.1633,0.160755,0.954382,0.55814,0.705882,0.461538
4,0.1468,0.151905,0.957983,0.606742,0.72973,0.519231
5,0.1385,0.149537,0.957983,0.606742,0.72973,0.519231


events.out.tfevents.1742644264.15358c91f674.5034.0:   0%|          | 0.00/9.81k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/srvmishra832/github_issues-dataset-distilbert-base-uncased/commit/c8ba45a0438c713ab81a704b04340450aba2cdab', commit_message='End of training', commit_description='', oid='c8ba45a0438c713ab81a704b04340450aba2cdab', pr_url=None, repo_url=RepoUrl('https://huggingface.co/srvmishra832/github_issues-dataset-distilbert-base-uncased', endpoint='https://huggingface.co', repo_type='model', repo_id='srvmishra832/github_issues-dataset-distilbert-base-uncased'), pr_revision=None, pr_num=None)