<a href="https://colab.research.google.com/github/praseedm/tracin_implemention_on_ledgar_dataset/blob/main/Find_opponent_%26_proponents.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Clone the project repository; later cells read ./data/*.csv and ./data/label_names.json from it
! git clone https://github.com/praseedm/tracin_implemention_on_ledgar_dataset.git

Cloning into 'tracin_implemention_on_ledgar_dataset'...
remote: Enumerating objects: 54, done.[K
remote: Counting objects: 100% (54/54), done.[K
remote: Compressing objects: 100% (47/47), done.[K
remote: Total 54 (delta 22), reused 17 (delta 5), pack-reused 0[K
Unpacking objects: 100% (54/54), done.


In [2]:
# Mount Google Drive at /content/drive so the saved checkpoints can be copied from it
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Work from the repository root so the relative ./data and ./train_outputs paths resolve
%cd tracin_implemention_on_ledgar_dataset

/content/tracin_implemention_on_ledgar_dataset


In [4]:
# Copy the saved checkpoint directories from Drive into ./train_outputs (read by get_checkpoint_dir)
!cp -r /content/drive/MyDrive/ARU/ML\ Project/train_outputs ./

In [5]:
# A dependency of the preprocessing for BERT inputs
!pip install -q -U "tensorflow-text==2.8.*"

[K     |████████████████████████████████| 4.9 MB 5.0 MB/s 
[K     |████████████████████████████████| 462 kB 61.1 MB/s 
[?25h

In [18]:
import os

import tensorflow as tf
import tensorflow_hub as hub
import matplotlib.pyplot as plt
import tensorflow_text as text
import pandas as pd
import json
from tqdm import tqdm
import numpy as np



# Only surface TensorFlow log records at ERROR level or above
tf.get_logger().setLevel('ERROR')

In [7]:
# Show the GPU attached to this runtime, if any
!nvidia-smi

NVIDIA-SMI has failed because it couldn't communicate with the NVIDIA driver. Make sure that the latest NVIDIA driver is installed and running.



In [8]:
# Display the TensorFlow version in use
tf.__version__

'2.8.0'

## Data Preparation

In [9]:
# Number of examples per batch; read by generate_batch_dataset_from_csv
BATCH_SIZE = 32
# NOTE(review): SEED is not referenced by any cell visible in this part of the notebook
SEED = 10
# Passed as the prefetch buffer size so tf.data tunes it at runtime
AUTOTUNE = tf.data.AUTOTUNE

In [10]:
def generate_batch_dataset_from_csv(csv_path:str):
  """Read a CSV file and expose its rows as a batched `tf.data.Dataset`.

  The CSV must contain a `text` column and a `label` column; each dataset
  element is a `(text, label)` pair. Before batching, the number of examples
  and the element spec are printed.

  Args:
    csv_path: Path of the CSV file to read.

  Returns:
    The dataset batched with `BATCH_SIZE` examples per batch.
  """
  frame = pd.read_csv(csv_path)
  clause_texts = frame['text'].values
  clause_labels = frame['label'].values
  examples = tf.data.Dataset.from_tensor_slices((clause_texts, clause_labels))
  print(f"{len(examples)}\n{examples.element_spec}")
  batched = examples.batch(batch_size=BATCH_SIZE)
  return batched

In [11]:
# Build the batched train/test datasets from the CSV files under ./data
# (each call also prints the example count and element spec)
train_ds = generate_batch_dataset_from_csv(csv_path='./data/train_data.csv')
test_ds = generate_batch_dataset_from_csv(csv_path='./data/test_data.csv')

32906
(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))
5438
(TensorSpec(shape=(), dtype=tf.string, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))


In [12]:
# Cache the batches after the first pass and prefetch upcoming ones while the current one is consumed.
# NOTE(review): both names are rebound to a pipeline built from themselves, so re-running
# this cell stacks another cache/prefetch stage on top of the existing one.
train_ds = train_ds.cache().prefetch(buffer_size=AUTOTUNE)
test_ds = test_ds.cache().prefetch(buffer_size=AUTOTUNE)

In [13]:
# Load the class names; len(label_names) is later used as the one-hot depth in run_checkpoints.
# JSON text is UTF-8, so pass the encoding explicitly instead of relying on the platform default.
with open('./data/label_names.json', encoding='utf-8') as rb:
  label_names = json.load(rb)
print(f"Loaded {len(label_names)} labels")
print(f"Labels : {label_names}")

Loaded 25 labels
Labels : ['Amendments', 'Assignments', 'Compliance With Laws', 'Confidentiality', 'Counterparts', 'Entire Agreements', 'Expenses', 'Financial Statements', 'Further Assurances', 'General', 'Governing Laws', 'Indemnifications', 'Insurances', 'Litigations', 'No Conflicts', 'Notices', 'Payments', 'Severability', 'Survival', 'Taxes', 'Terminations', 'Terms', 'Use Of Proceeds', 'Waiver Of Jury Trials', 'Waivers']


## Load checkpoints

In [14]:
# Directory (relative to the working directory) that holds the mymodel_<n> checkpoints
checkpoints_root_dir = './train_outputs'
def get_checkpoint_dir(epoch_number:int):
  """Return the path of the checkpoint stored as `mymodel_<epoch_number>`.

  Args:
    epoch_number: Numeric suffix of the checkpoint to look up.

  Returns:
    The path `checkpoints_root_dir/mymodel_<epoch_number>`.

  Raises:
    ValueError: If nothing exists at that path.
  """
  checkpoint_path = os.path.join(checkpoints_root_dir, f"mymodel_{epoch_number}")
  # Fail early with the full path so a missing or mis-copied checkpoint is easy to spot
  if not os.path.exists(checkpoint_path):
    raise ValueError(f"{checkpoint_path} doesn't exist")
  return checkpoint_path

In [15]:
def load_model(model_path:str):
  """Load a saved Keras model, printing the source path first.

  Args:
    model_path: Location of the saved model.

  Returns:
    Whatever `tf.keras.models.load_model` returns for `model_path`.
  """
  print("Loading model from {}".format(model_path))
  restored_model = tf.keras.models.load_model(model_path)
  return restored_model

In [16]:
#load models
checkpoint_models = []
epochs = [2,4,5,6]
for epoch in tqdm(epochs):
  model_path = get_checkpoint_dir(epoch)
  checkpoint_models.append(load_model(model_path=model_path))

  0%|          | 0/4 [00:00<?, ?it/s]

Loading model from ./train_outputs/mymodel_2


 25%|██▌       | 1/4 [00:22<01:06, 22.20s/it]

Loading model from ./train_outputs/mymodel_4


 50%|█████     | 2/4 [00:36<00:35, 17.76s/it]

Loading model from ./train_outputs/mymodel_5


 75%|███████▌  | 3/4 [00:52<00:16, 16.97s/it]

Loading model from ./train_outputs/mymodel_6


100%|██████████| 4/4 [01:07<00:00, 17.00s/it]


## Find Proponents & Opponents

In [28]:
def run_checkpoints(inputs, models):
  """Compute per-checkpoint loss gradients and last-checkpoint predictions for one batch.

  Args:
    inputs: A `(texts, labels)` pair for one batch.
    models: Iterable of callable checkpoint models; each is called on `texts`.
      The last one yielded supplies the returned prediction.

  Returns:
    A tuple `(texts, loss_grads, labels, probs, predicted_labels)` where
    `loss_grads` stacks `one_hot(labels) - softmax(model(texts))` for every
    checkpoint along a new last axis, and `probs` / `predicted_labels` are the
    values / indices returned by `tf.math.top_k(..., k=1)` on the last
    checkpoint's softmax output.

  Raises:
    ValueError: If `models` yields no checkpoint.
  """
  texts, labels = inputs
  loss_grads = []
  last_probs = None
  for checkpoint_model in models:
    # NOTE(review): assumes the model returns unnormalised logits — confirm against training code
    logits = checkpoint_model(texts)
    last_probs = tf.nn.softmax(logits)
    # one_hot - probs is the negated gradient of softmax cross-entropy w.r.t. the logits
    loss_grads.append(tf.one_hot(labels, len(label_names)) - last_probs)

  # Without at least one checkpoint there is nothing to stack and no prediction to report
  if not loss_grads:
    raise ValueError("run_checkpoints needs at least one checkpoint model, got none")

  # prediction using last checkpoint
  top_probs, predicted_labels = tf.math.top_k(last_probs, k=1)

  return texts, tf.stack(loss_grads, axis=-1), labels, top_probs, predicted_labels

In [26]:
def generate_tracin_features(dataset, models=None):
  """Run every checkpoint over `dataset` and gather the per-example outputs.

  Args:
    dataset: Iterable of `(texts, labels)` batches, each passed to `run_checkpoints`.
    models: Checkpoint models to evaluate. Defaults to the notebook-level
      `checkpoint_models` list when omitted.

  Returns:
    A dict of NumPy arrays, concatenated over all batches along the first axis,
    with keys `texts`, `loss_grads`, `labels`, `predicted_probs` and
    `predicted_labels` (one per value returned by `run_checkpoints`, same order).

  Raises:
    ValueError: If `dataset` yields no batches.
  """
  # Materialise once so a one-shot iterable can be reused for every batch
  models = list(checkpoint_models if models is None else models)

  # Key order mirrors the order of the tuple returned by run_checkpoints
  feature_names = ('texts', 'loss_grads', 'labels', 'predicted_probs', 'predicted_labels')
  collected = {name: [] for name in feature_names}

  for batch in tqdm(dataset):
    batch_outputs = run_checkpoints(inputs=batch, models=models)
    for name, tensor in zip(feature_names, batch_outputs):
      collected[name].append(tensor.numpy())

  # np.concatenate rejects an empty list; report the actual cause instead
  if not collected['texts']:
    raise ValueError("dataset yielded no batches; there are no features to concatenate")

  return {name: np.concatenate(chunks) for name, chunks in collected.items()}


In [31]:
# Compute the TracIn features for the training set with the loaded checkpoints.
# NOTE(review): the recorded progress bar shows a single batch (1/1), while train_ds as built
# above holds 32906 examples in batches of 32; the saved output looks stale — re-run to confirm.
res = generate_tracin_features(train_ds)

100%|██████████| 1/1 [00:10<00:00, 10.24s/it]
