<a href="https://colab.research.google.com/github/praseedm/tracin_implemention_on_ledgar_dataset/blob/main/Mislabelled_detection_using_self_influence.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# clone git repo for data
! git clone https://github.com/praseedm/tracin_implemention_on_ledgar_dataset.git

Cloning into 'tracin_implemention_on_ledgar_dataset'...
remote: Enumerating objects: 60, done.[K
remote: Counting objects: 100% (60/60), done.[K
remote: Compressing objects: 100% (53/53), done.[K
remote: Total 60 (delta 26), reused 17 (delta 5), pack-reused 0[K
Unpacking objects: 100% (60/60), done.


In [2]:
# Mount GDrive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd tracin_implemention_on_ledgar_dataset

/content/tracin_implemention_on_ledgar_dataset


In [15]:
# A dependency of the preprocessing for BERT inputs
!pip install -q -U "tensorflow-text==2.8.*"

[K     |████████████████████████████████| 4.9 MB 5.2 MB/s 
[K     |████████████████████████████████| 462 kB 37.1 MB/s 
[?25h

In [4]:
# copying checkpoints 
!cp -r /content/drive/MyDrive/ARU/ML\ Project/train_outputs ./

In [16]:
import os

import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import tensorflow_text as text
import pandas as pd
import json
from tqdm import tqdm
import numpy as np



tf.get_logger().setLevel('ERROR')

In [6]:
tf.__version__

'2.8.0'

## Data preparation

In [8]:
BATCH_SIZE = 32
SEED = 10
AUTOTUNE = tf.data.AUTOTUNE

In [9]:
def generate_batch_dataset_from_csv(csv_path:str):
  df = pd.read_csv(csv_path)
  dataset = tf.data.Dataset.from_tensor_slices(
      (
          df['text'].values,
          df['label'].values
      )
  )
  print(f"{len(dataset)}\n{dataset.element_spec}")
  return dataset.batch(batch_size=BATCH_SIZE)

In [10]:
train_ds = generate_batch_dataset_from_csv(csv_path='./data/train_data.csv')
test_ds = generate_batch_dataset_from_csv(csv_path='./data/test_data.csv')

32906
(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))
5438
(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))


In [11]:
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [12]:
# Load labels 
with open('./data/label_names.json') as rb:
  label_names = json.load(rb)
print(f"Loaded {len(label_names)} labels")
print(f"Labels : {label_names}")

Loaded 25 labels
Labels : ['Amendments', 'Assignments', 'Compliance With Laws', 'Confidentiality', 'Counterparts', 'Entire Agreements', 'Expenses', 'Financial Statements', 'Further Assurances', 'General', 'Governing Laws', 'Indemnifications', 'Insurances', 'Litigations', 'No Conflicts', 'Notices', 'Payments', 'Severability', 'Survival', 'Taxes', 'Terminations', 'Terms', 'Use Of Proceeds', 'Waiver Of Jury Trials', 'Waivers']


## Load checkpoints

In [17]:
checkpoints_root_dir = './train_outputs'
def get_checkpoint_dir(epoch_number:int):
  checkpoint_path = os.path.join(checkpoints_root_dir, f"mymodel_{epoch_number}")
  if os.path.exists(checkpoint_path):
    return checkpoint_path
  raise ValueError(f"{checkpoint_path} doesn't exists")


def load_model(model_path:str):
  print(f"Loading model from {model_path}")
  return tf.keras.models.load_model(model_path)

In [18]:
#load models
checkpoint_models = []
epochs = [2,4,5,6]
for epoch in tqdm(epochs):
  model_path = get_checkpoint_dir(epoch)
  checkpoint_models.append(load_model(model_path=model_path))

  0%|          | 0/4 [00:00<?, ?it/s]

Loading model from ./train_outputs/mymodel_2


 25%|██▌       | 1/4 [00:11<00:35, 11.85s/it]

Loading model from ./train_outputs/mymodel_4


 50%|█████     | 2/4 [00:23<00:23, 11.60s/it]

Loading model from ./train_outputs/mymodel_5


 75%|███████▌  | 3/4 [00:35<00:12, 12.00s/it]

Loading model from ./train_outputs/mymodel_6


100%|██████████| 4/4 [00:46<00:00, 11.70s/it]


## Run self influence

In [50]:
def compute_self_influence(inputs, models):
  # Refered from the TracIn, https://arxiv.org/pdf/2002.08484.pdf
  texts , labels = inputs
  self_influences = []
  for checkpoint_model in models:
    # picking weights from the last  two fully connected layer.
    with tf.GradientTape(watch_accessed_variables=False) as tape:
      tape.watch(checkpoint_model.trainable_weights[-2:])
      logits = checkpoint_model(texts)
      probs = tf.nn.softmax(logits)
      loss = tf.keras.losses.sparse_categorical_crossentropy(labels, probs)
    
    grads = tape.jacobian(loss, checkpoint_model.trainable_weights[-2:])
    scores = tf.add_n([tf.math.reduce_sum(
        grad * grad, axis=tf.range(1, tf.rank(grad), 1)) 
        for grad in grads])
    self_influences.append(scores)  

    probs, predicted_labels = tf.math.top_k(probs, k=1)
    return texts,  tf.math.reduce_sum(tf.stack(self_influences, axis=-1), axis=-1), labels, probs, predicted_labels
  

def run(dataset):
  texts = []
  scores = []
  labels = []
  predict_probs = []
  predict_labels = []
  
  for batch in tqdm(dataset):
    b_texts , b_scores , b_labels, b_probs , b_predicted_labels = compute_self_influence(inputs=batch, models=checkpoint_models)
    texts.append(b_texts.numpy())
    scores.append(b_scores.numpy())
    labels.append(b_labels.numpy())
    predict_probs.append(b_probs.numpy())
    predict_labels.append(b_predicted_labels.numpy())
  
  return {
      'texts' : np.concatenate(texts),
      'self_influence_scores' : np.concatenate(scores),
      'labels' : np.concatenate(labels),
      'predicted_probs' : np.concatenate(predict_probs),
      'predicted_labels' : np.concatenate(predict_labels)
  }


In [42]:
res = run(train_ds)

100%|██████████| 1/1 [00:05<00:00,  5.12s/it]


In [48]:
def get_label_name_from_index(index):
  return label_names[index]

def save_result(self_incluence_features, topk=10):
  sorted_indices = np.argsort(-self_incluence_features['self_influence_scores'])
  result = []
  for index in sorted_indices[:topk]:
    result.append(
        {
            "text" : self_incluence_features['texts'][index],
            "actaul_label" : get_label_name_from_index(self_incluence_features['labels'][index]),
            "predicted_label" : get_label_name_from_index(self_incluence_features['predicted_labels'][index][0]),
            "confidence" : self_incluence_features['predicted_probs'][index][0],
            "self_influence_score" : self_incluence_features['self_influence_scores'][index],
        }
    )
  
  return result

In [49]:
save_result(res)

[{'actaul_label': 'Litigations',
  'confidence': 0.6382746,
  'predicted_label': 'Notices',
  'self_influence_score': 153.02077,
  'text': b'From the date hereof through the Closing, each party hereto shall promptly notify the representative of the other parties of any known Proceeding which after the date hereof are threatened or commenced against such party or any of its affiliates or any officer, director, employee, consultant, agent or Acquiree Company Record Owners thereof, in their capacities as such, which, if decided adversely, could reasonably be expected to have a Material Adverse Effect upon the condition (financial or otherwise), assets, liabilities, business, operations or prospects of such party or any of its Subsidiaries.'},
 {'actaul_label': 'Confidentiality',
  'confidence': 0.68292475,
  'predicted_label': 'Confidentiality',
  'self_influence_score': 17.870857,
  'text': b'You agree to hold the facts and circumstances surrounding the execution of this Agreement and th