<a href="https://colab.research.google.com/github/sreejithvn/zero-shot-classification-for-long-text/blob/main/1_1_Zero_shot_DistlBERT_MNLI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers

In [None]:
import pandas as pd
import numpy as np
import time

from sklearn.metrics import accuracy_score, f1_score

In [None]:
# Mount Google Drive at /content/gdrive (Colab-specific: relies on the google.colab package).
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Load the labelled articles; lines=True reads the file as JSON Lines (one record per line).
df = pd.read_json(
    '/content/gdrive/MyDrive/Colab Notebooks/MSC_Project/Jan2020Frontiers_20_labels.jsonl',
    lines=True,
)

In [None]:
# Preview the first rows to check the loaded columns.
df.head()

Unnamed: 0,id,text,journal,label,clean_text
0,465950,\n Sleep Characteristics and Influencing Facto...,Frontiers in Medicine,Medicine,sleep characteristics and influencing factors ...
1,483526,A Hybrid Approach for Modeling Type 2 Diabetes...,Frontiers in Genetics,Genetics,a hybrid approach for modeling type diabetes m...
2,437333,Environmental Health Research in Africa: Impor...,Frontiers in Genetics,Genetics,environmental health research in africa: impor...
3,486515,"\n 3,5-T2—A Janus-Faced Thyroid Hormone Metabo...",Frontiers in Endocrinology,Endocrinology,",-—a janus-faced thyroid hormone metabolite ex..."
4,488364,\n Differential Regulation of LPS-Mediated VE-...,Frontiers in Cell and Developmental Biology,Cell and Developmental Biology,differential regulation of lps-mediated ve-cad...


In [None]:
# Distinct class names, in order of first appearance; used as the zero-shot candidate labels.
candidate_labels = df['label'].unique().tolist()

In [None]:
# (number of samples, number of distinct labels)
len(df), len(candidate_labels)

(1101, 20)

# SPLITTING DATA into TRAIN, VALIDATION and TEST sets

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the data (later halved into validation and test), stratified on the label
# so every split keeps the class proportions; the fixed seed makes the split reproducible.
train_texts, temp_texts, train_labels, temp_labels = train_test_split(
    df['clean_text'],
    df['label'],
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=df['label'],
)

In [None]:
# Halve the held-out portion into validation and test sets, again stratified on the label.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    temp_texts,
    temp_labels,
    test_size=0.5,
    random_state=42,
    shuffle=True,
    stratify=temp_labels,
)

In [None]:
# Give every split a fresh 0..n-1 index so texts and labels line up by position.
# Plain reassignment replaces the previous tuple of inplace=True calls, which mutated
# the Series as a side effect and displayed a meaningless (None, None) as the cell output.
train_texts, train_labels = train_texts.reset_index(drop=True), train_labels.reset_index(drop=True)
val_texts, val_labels = val_texts.reset_index(drop=True), val_labels.reset_index(drop=True)
test_texts, test_labels = test_texts.reset_index(drop=True), test_labels.reset_index(drop=True)

(None, None)

In [None]:
# Inspect the held-out test texts (pandas abbreviates long Series in the display).
test_texts

0      respiratory morbidity and lung function analys...
1      flavor techniques for lfv processes: higgs dec...
2      corrigendum: human milk oligosaccharide compos...
3      obsessive–compulsive personality symptoms pred...
4      blood-brain barrier and delivery of protein an...
                             ...                        
106    synergies between division of labor and gut mi...
107    efficient and stable photocatalytic hydrogen e...
108    the δ-opioid receptor differentially regulates...
109    thalidomide in the treatment of sweet's syndro...
110    investigating gray and white matter structural...
Name: clean_text, Length: 111, dtype: object

In [None]:
# Sizes of the train / validation / test splits.
train_texts.shape, val_texts.shape, test_texts.shape

((880,), (110,), (111,))

In [None]:
# Number of distinct classes present in the train, validation and test splits (in that order).
tuple(len(split_labels.unique()) for split_labels in (train_labels, val_labels, test_labels))

(20, 20, 20)

### Each set is a stratified sample: it preserves the class proportions of the full dataset (the classes themselves are not equally sized)

In [None]:
# Per-class sample counts of each split, shown side by side
# (after the transpose: rows = classes, columns = Train / Val / Test).
pd.DataFrame([train_labels.value_counts(), val_labels.value_counts(), test_labels.value_counts()], 
             index=['Train', 'Val', 'Test']).T

Unnamed: 0,Train,Val,Test
Physiology,84,10,11
Genetics,79,10,10
Neuroscience,71,9,9
Psychiatry,69,8,9
Neurology,61,7,8
Chemistry,55,7,7
Marine Science,51,6,7
Bioengineering and Biotechnology,45,5,6
Endocrinology,42,5,6
Cell and Developmental Biology,38,5,4


In [None]:
from transformers import AutoTokenizer

# Tokenizer of the DistilBERT-MNLI checkpoint (downloaded from the Hugging Face hub on first use).
tokenizer = AutoTokenizer.from_pretrained('typeform/distilbert-base-uncased-mnli')

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


In [None]:
import torch
from transformers import pipeline

# Zero-shot classification pipeline backed by the DistilBERT-MNLI checkpoint.
# Device index 0 selects the first GPU; -1 runs on CPU. Falling back to CPU keeps the
# notebook runnable on a runtime without a GPU instead of failing at construction time.
pipeline_device = 0 if torch.cuda.is_available() else -1
classifier_gpu = pipeline("zero-shot-classification", device=pipeline_device, model='typeform/distilbert-base-uncased-mnli')

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


Downloading:   0%|          | 0.00/268M [00:00<?, ?B/s]

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


# Split long text into chunks of at most 256 tokens (chosen to stay well below the model's 512-token limit)

In [None]:
def split_sequence(sequence, chunk_size=256, tok=None):
  """Split a text into consecutive chunks of at most `chunk_size` tokens.

  The text is tokenized, the token list is cut into fixed-size windows, and each
  window is turned back into a string with the tokenizer's own
  `convert_tokens_to_string`, so sub-word pieces are merged back into whole words
  (joining with spaces and deleting '##' left words broken up, e.g. "mor bid ity").

  Args:
    sequence: the text to split.
    chunk_size: maximum number of tokens per chunk (must be positive).
    tok: tokenizer providing `tokenize` and `convert_tokens_to_string`;
      defaults to the notebook-level `tokenizer`.

  Returns:
    A list of chunk strings in original order; empty if the text has no tokens.

  Raises:
    ValueError: if `chunk_size` is not positive.
  """
  if chunk_size <= 0:
    raise ValueError('chunk_size must be a positive integer')
  if tok is None:
    tok = tokenizer

  tokens = tok.tokenize(sequence)
  texts = []
  for start in range(0, len(tokens), chunk_size):
    chunk = tokens[start:start + chunk_size]
    # A window can begin in the middle of a word; drop the continuation marker of
    # its first piece so the chunk text does not start with a literal '##'.
    if chunk[0].startswith('##'):
      chunk = [chunk[0][2:]] + chunk[1:]
    texts.append(tok.convert_tokens_to_string(chunk))
  return texts

In [None]:
# One list of text chunks per test sample.
test_data_split = test_texts.map(split_sequence)

In [None]:
# Preview the chunked test samples (each entry is a list of chunk strings).
test_data_split.head()

0    [respiratory mor bid ity and lung function ana...
1    [flavor techniques for l f v processes : hi gg...
2    [co rri gen du m : human milk ol igo sa cc har...
3    [ob ses sive – com pu ls ive personality sympt...
4    [blood - brain barrier and delivery of protein...
Name: clean_text, dtype: object

# Zero-shot classification with Hugging Face Pipeline

## Results when Classifier only considers FIRST 512 tokens (truncates sentence) of each TEST sample

In [None]:
# Baseline: hand each full, unsplit test text to the pipeline.
# test_sequences is a flat list of strings, one per test sample.
test_sequences = test_texts.tolist()    # NOT SPLIT TEST DATA

candidate_labels = df['label'].unique().tolist()

# One result per input text; over-long inputs are not chunked here
# (the section heading states the classifier only considers the first 512 tokens).
results = classifier_gpu(test_sequences, candidate_labels, batch_size=1)

# Keep only the ranked labels and their scores for each sample.
scores_df = pd.DataFrame(results, columns=['labels', 'scores'])

In [None]:
# The first entry of each sample's label ranking is taken as the predicted label.
pred_labels_512 = scores_df['labels'].map(lambda ranked_labels: ranked_labels[0])

### Accuracy score for Test Data, with default truncation to 512

In [None]:
# Accuracy and macro-averaged F1 of the unsplit (truncated) baseline on the test set.
accuracy_test_data_512, f1_score_test_data_512 = (
    accuracy_score(test_labels, pred_labels_512),
    f1_score(test_labels, pred_labels_512, average='macro'),
)
print(f'Test data with only first 512 tokens -> Accuracy: {accuracy_test_data_512*100:.2f}, F1_score: {f1_score_test_data_512*100:.2f}')

Test data with only first 512 tokens -> Accuracy: 7.21, F1_score: 4.24


## Results for entire long_text samples (split into chunks), classified with batch_size=8

In [None]:
# Classify every chunk of each test sample, then reduce the chunk-level results
# to one label per sample with two strategies:
#   * pred_labels_count_list - majority vote over each chunk's top-ranked label
#   * pred_labels_prob_list  - label with the largest score summed over all chunks
pred_labels_count_list = []
pred_labels_prob_list = []

start = time.perf_counter()

for ix, long_text in enumerate(test_data_split):

  # Positional lookup, so this does not depend on test_labels carrying a 0..n-1 index
  print(f'Sample {ix}: True Label: {test_labels.iloc[ix]}')

  result = classifier_gpu(long_text, candidate_labels, batch_size=8)
  # A single-chunk sample may come back as one dict rather than a list of dicts
  # (behaviour of older transformers releases - TODO confirm for the installed version),
  # so normalise to a list before iterating per chunk.
  if isinstance(result, dict):
    result = [result]

  # Running sum of the score each candidate label received across the sample's chunks
  prob_score_dict = dict.fromkeys(candidate_labels, 0)
  for chunk_result in result:
    for label, score in zip(chunk_result['labels'], chunk_result['scores']):
      prob_score_dict[label] += score

  max_prob_label = max(prob_score_dict, key=prob_score_dict.get)

  score_df = pd.DataFrame(result, columns=['labels', 'scores'])

  # Take the top-ranked label (position 0) of every chunk;
  # the label that wins most chunks is chosen for the whole sample
  most_count_label = score_df['labels'].apply(lambda x: x[0]).value_counts().index[0]

  pred_labels_count_list.append(most_count_label)
  pred_labels_prob_list.append(max_prob_label)

stop = time.perf_counter()
runtime = stop-start  # wall-clock seconds spent on the whole test set

# Accuracy and F1 Score results for TEST Data

In [None]:
# Baseline: unsplit test texts (only the first 512 tokens are considered)
accuracy_test_data_512, f1_score_test_data_512 = (
    accuracy_score(test_labels, pred_labels_512),
    f1_score(test_labels, pred_labels_512, average='macro'),
)
print(f'Test data only first 512 -> Accuracy: {accuracy_test_data_512*100:.2f}, F1_score: {f1_score_test_data_512*100:.2f}')

# Entire long text, majority vote over chunk-level top labels (pred_labels_count_list)
accuracy_count, f1_score_count = (
    accuracy_score(test_labels, pred_labels_count_list),
    f1_score(test_labels, pred_labels_count_list, average='macro'),
)
print(f'Test data long text (most first position count) -> Accuracy: {accuracy_count*100:.2f}, F1_score: {f1_score_count*100:.2f}')

# Entire long text, highest summed score across chunks (pred_labels_prob_list)
accuracy_prob, f1_score_prob = (
    accuracy_score(test_labels, pred_labels_prob_list),
    f1_score(test_labels, pred_labels_prob_list, average='macro'),
)
print(f'Test data long text (highest probability sum) -> Accuracy: {accuracy_prob*100:.2f}, F1_score: {f1_score_prob*100:.2f}')


Test data only first 512 -> Accuracy: 7.21, F1_score: 4.24
Test data long text (most first position count) -> Accuracy: 20.72, F1_score: 17.18
Test data long text (highest probability sum) -> Accuracy: 19.82, F1_score: 15.00


In [None]:
# Summary table: one row per prediction strategy, metrics as percentages rounded to 2 decimals.
metric_rows = {
    'Only first 512': (accuracy_test_data_512, f1_score_test_data_512),
    'Long text (most first position count)': (accuracy_count, f1_score_count),
    'Long text (highest probability sum)': (accuracy_prob, f1_score_prob),
}

zero_shot_metrics_table_df = pd.DataFrame(
    data=[[np.round(acc*100, 2), np.round(f1*100, 2)] for acc, f1 in metric_rows.values()],
    columns=['Accuracy', 'F1_score'],
    index=list(metric_rows),
)

zero_shot_metrics_table_df

Unnamed: 0,Accuracy,F1_score
Only first 512,7.21,4.24
Long text (most first position count),20.72,17.18
Long text (highest probability sum),19.82,15.0


In [None]:
# Per-sample comparison table: original text, its chunks, the true label and the three predictions.
# The rows are assembled first (one per field) and then transposed so each field becomes a column.
test_data_results_df = pd.DataFrame([test_texts, test_data_split, test_labels, 
                                     np.array(pred_labels_count_list), np.array(pred_labels_prob_list), pred_labels_512],
                                     index=['text', 'clean_split_text', 'true_label', 'pred_label_count', 'pred_label_prob', 'pred_label_512']).T
test_data_results_df.head()

Unnamed: 0,text,clean_split_text,true_label,pred_label_count,pred_label_prob,pred_label_512
0,respiratory morbidity and lung function analys...,[respiratory mor bid ity and lung function ana...,Pediatrics,Aging Neuroscience,Cellular and Infection Microbiology,Bioengineering and Biotechnology
1,flavor techniques for lfv processes: higgs dec...,[flavor techniques for l f v processes : hi gg...,Physics,Aging Neuroscience,Aging Neuroscience,Medicine
2,corrigendum: human milk oligosaccharide compos...,[co rri gen du m : human milk ol igo sa cc har...,Pediatrics,Aging Neuroscience,Aging Neuroscience,Bioengineering and Biotechnology
3,obsessive–compulsive personality symptoms pred...,[ob ses sive – com pu ls ive personality sympt...,Psychiatry,Psychiatry,Psychiatry,Ecology and Evolution
4,blood-brain barrier and delivery of protein an...,[blood - brain barrier and delivery of protein...,Aging Neuroscience,Cellular Neuroscience,Cellular Neuroscience,Genetics


# Zero-shot Classification without Hugging Face pipeline

In [None]:
import torch

# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
if torch.cuda.is_available():
  device = 'cuda'
else:
  device = 'cpu'
device

'cuda'

In [None]:
# pose sequence as a NLI premise and label as a hypothesis
from transformers import AutoModelForSequenceClassification, AutoTokenizer
# Load the sequence-classification model of the MNLI checkpoint and move it to `device`.
nli_model = AutoModelForSequenceClassification.from_pretrained("typeform/distilbert-base-uncased-mnli").to(device)

# Tokenizer of the same checkpoint (rebinds the notebook-level `tokenizer` name).
tokenizer = AutoTokenizer.from_pretrained("typeform/distilbert-base-uncased-mnli")

The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.


In [None]:
import torch

nli_model.eval()

# Look up which logit positions mean "entailment" and "contradiction" from the model
# config instead of hard-coding them: checkpoints do not all order the NLI classes the
# same way, and a wrong position silently scores the opposite class.
# NOTE(review): the previous hard-coded [0, 2] assumed contradiction=0 / entailment=2;
# those values are kept only as a fallback when the config has no recognisable names.
nli_label_ids = {str(name).lower(): idx for name, idx in nli_model.config.label2id.items()}
entail_id = next((idx for name, idx in nli_label_ids.items() if name.startswith('entail')), 2)
contra_id = next((idx for name, idx in nli_label_ids.items() if name.startswith('contra')), 0)

predicted_label_prob_list = []
predicted_label_count_list = []

for sample in test_data_split:

  # Per-sample accumulators: summed entailment probability and chunk-level win count per label
  pred_dict_prob_sample = dict.fromkeys(candidate_labels, 0)
  pred_dict_count_sample = dict.fromkeys(candidate_labels, 0)

  for text_chunk in sample:
    prob_score_chunk = dict.fromkeys(candidate_labels, 0)
    for label in candidate_labels:

      # Pose the chunk as the NLI premise and the label as the hypothesis
      premise = text_chunk
      hypothesis = f'This example is {label}.'

      # run through model pre-trained on MNLI
      x = tokenizer.encode(premise, hypothesis, return_tensors='pt',
                     truncation='only_first')

      # Inference only: skip autograd bookkeeping to save memory and time
      with torch.no_grad():
        logits = nli_model(x.to(device))[0]

      # we throw away "neutral" and take the probability of "entailment"
      # (relative to "contradiction") as the probability of the label being true
      entail_contradiction_logits = logits[:, [contra_id, entail_id]]
      probs = entail_contradiction_logits.softmax(dim=1)
      prob_label_is_true = probs[:, 1]

      # For each candidate label store its (entailment) probability score, for the text chunk
      prob_score_chunk[label] = prob_label_is_true.item()

      # Accumulate the probability scores for each label, for entire sample
      pred_dict_prob_sample[label] += prob_score_chunk[label]

    # Get the label with the highest probability score, and increase its occurrence count by 1
    pred_dict_count_sample[max(prob_score_chunk, key=prob_score_chunk.get)] += 1

  # Most predicted label for each sample is stored in a list
  predicted_label_count_list.append(max(pred_dict_count_sample, key=pred_dict_count_sample.get))

  # Label with highest accumulated probability score for each sample is stored in a list
  predicted_label_prob_list.append(max(pred_dict_prob_sample, key=pred_dict_prob_sample.get))

In [None]:
# Manual NLI loop, majority vote over chunk-level top labels (predicted_label_count_list).
# Note: these names are shared with the pipeline-based metrics, whose values they replace.
accuracy_count, f1_score_count = (
    accuracy_score(test_labels, predicted_label_count_list),
    f1_score(test_labels, predicted_label_count_list, average='macro'),
)
print(f'Test data long text (most first position count) -> Accuracy: {accuracy_count*100:.2f}, F1_score: {f1_score_count*100:.2f}')

# Manual NLI loop, highest summed entailment probability (predicted_label_prob_list)
accuracy_prob, f1_score_prob = (
    accuracy_score(test_labels, predicted_label_prob_list),
    f1_score(test_labels, predicted_label_prob_list, average='macro'),
)
print(f'Test data long text (highest probability sum) -> Accuracy: {accuracy_prob*100:.2f}, F1_score: {f1_score_prob*100:.2f}')