In [1]:
!pip install transformers
!pip install -U sentence-transformers
!pip install https://github.com/sadrasabouri/plda/tarball/master

Collecting https://github.com/sadrasabouri/plda/tarball/master
  Using cached https://github.com/sadrasabouri/plda/tarball/master
  Preparing metadata (setup.py) ... done


In [2]:
from sentence_transformers import SentenceTransformer

from transformers import AutoTokenizer, AutoModel, AutoConfig
from torch.nn.functional import normalize
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import numpy as np
import torch

import plda

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import requests

In [3]:
def get_raw_data(timeout=60):
    """Download the train and dev splits of the rhetorical-role benchmark.

    Args:
        timeout: Seconds to wait on each HTTP request before giving up, so a
            stalled connection cannot hang the notebook indefinitely.

    Returns:
        A ``(train_data_json, test_data_json)`` tuple holding the decoded JSON
        payloads. The "test" payload is the benchmark's ``dev.json`` file.

    Raises:
        requests.HTTPError: If either download answers with an error status.
        requests.Timeout: If either request exceeds ``timeout`` seconds.
    """
    train_data_url = "https://storage.googleapis.com/indianlegalbert/OPEN_SOURCED_FILES/Rhetorical_Role_Benchmark/Data/train.json"
    test_data_url = "https://storage.googleapis.com/indianlegalbert/OPEN_SOURCED_FILES/Rhetorical_Role_Benchmark/Data/dev.json"

    def _fetch_json(url):
        # Fail loudly on HTTP errors instead of trying to JSON-decode an error page.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.json()

    train_data_json = _fetch_json(train_data_url)
    test_data_json = _fetch_json(test_data_url)
    return train_data_json, test_data_json

# Rhetorical-role label names; used later to label the confusion-matrix axes.
CLASSES = [
    'PREAMBLE',
    'NONE',
    'FAC',
    'ARG_RESPONDENT',
    'RLC',
    'ARG_PETITIONER',
    'ANALYSIS',
    'PRE_RELIED',
    'RATIO',
    'RPC',
    'ISSUE',
    'STA',
    'PRE_NOT_RELIED',
]

# Fetch both splits once; every later cell works from these in-memory copies.
raw_splits = get_raw_data()
train_data_json, test_data_json = raw_splits

In [4]:
def _collect_sentences_and_labels(documents):
    """Flatten annotated documents into parallel lists of texts and labels.

    Walks every document -> 'annotations' -> 'result' entry and, for each
    result, keeps the whitespace-stripped ``value.text`` together with the
    first entry of ``value.labels``.

    Args:
        documents: Iterable of document dicts in the benchmark's JSON layout.

    Returns:
        A ``(sentences, labels)`` tuple of equally long lists.
    """
    sentences = []
    labels = []
    for document in documents:
        for annotation in document['annotations']:
            for result in annotation['result']:
                sentences.append(result['value']['text'].strip())
                labels.append(result['value']['labels'][0])
    return sentences, labels


# The same extraction is applied to both splits instead of duplicating the loops.
train_data, train_labels_all = _collect_sentences_and_labels(train_data_json)
test_data, test_labels_all = _collect_sentences_and_labels(test_data_json)

# Number of training sentences to embed; lower this for a quicker trial run.
TRAINING_SIZE = len(train_labels_all)

In [None]:
# Load the pretrained Sentence-T5 XXL checkpoint used to embed sentences.
SENTENCE_ENCODER_NAME = 'sentence-transformers/sentence-t5-xxl'
model = SentenceTransformer(SENTENCE_ENCODER_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
  return self.fget.__get__(instance, owner)()


In [1]:
def get_embeddings(sentence):
    """Encode ``sentence`` with the notebook-level ``model`` and return the result."""
    embedding = model.encode(sentence)
    return embedding

In [None]:
# Embed every training sentence and encode its label as an integer class index.
X = []
y = []
# Sort the label names so the label -> index mapping is identical on every run:
# iterating a set of strings directly depends on Python's per-process hash seed,
# so list(set(...)) can yield a different class numbering in each kernel session.
train_labels_unique = sorted(set(train_labels_all))
# Dict lookup replaces a linear list.index() scan for every sentence.
label_to_index = {label: index for index, label in enumerate(train_labels_unique)}
progress = 0
for sentence, label in zip(train_data[:TRAINING_SIZE], train_labels_all[:TRAINING_SIZE]):
    X.append(get_embeddings(sentence))
    y.append(label_to_index[label])
    progress += 1
    # Report progress every 500 sentences; embedding the full split is slow.
    if progress % 500 == 0:
        print(f'Progress: {100 * progress / TRAINING_SIZE}%')

In [None]:
# Convert the collected embeddings and class indices into NumPy arrays.
X, y = np.array(X), np.array(y)

In [None]:
# Train a PLDA classifier on the training embeddings (X) and their integer
# class indices (y).
PLDA_classifier = plda.Classifier()
# np.array() hands fit_model fresh copies of X and y.
PLDA_classifier.fit_model(np.array(X), np.array(y))

In [None]:
def predict_labels(query):
    """Return the label name predicted by the PLDA classifier for ``query``.

    The query is embedded with ``get_embeddings``; the classifier's predicted
    class index is then mapped back to its name via ``train_labels_unique``.
    The log-probabilities that the classifier also returns are discarded.
    """
    predicted_index, _ = PLDA_classifier.predict(get_embeddings(query))
    return train_labels_unique[predicted_index]

In [None]:
# Predict a label name for every sentence in the evaluation split.
preds = [predict_labels(query) for query in test_data]

In [None]:
# Score the predictions against the gold labels of the evaluation split.
# The pair counter is named `total`: the previous name `all` shadowed the
# builtin all() for every cell executed afterwards in the same kernel.
match_count = sum(1 for gold, predicted in zip(test_labels_all, preds) if gold == predicted)
total = min(len(test_labels_all), len(preds))  # zip() stops at the shorter list
f1score = f1_score(test_labels_all, preds, average="macro")
print("F1 Score:", f1score)
# This ratio is the fraction of exact matches, i.e. accuracy, which equals
# micro-averaged precision when each sentence receives exactly one label.
print("Precision:", match_count/total)

In [None]:
# Build the confusion matrix with an explicit label order. Without `labels=`,
# scikit-learn orders rows/columns by the sorted labels present in the data,
# which differs from the order of CLASSES and would mislabel the heatmap axes
# (and raise a shape mismatch if some class never occurs in the data).
cm = confusion_matrix(test_labels_all, preds, labels=CLASSES)
cm_df = pd.DataFrame(cm, index=CLASSES, columns=CLASSES)

plt.figure(figsize=(15,10))
# fmt='d' prints the integer counts in full instead of the default '.2g' rounding.
sns.heatmap(cm_df, annot=True, fmt='d')
plt.title('Confusion Matrix')
plt.ylabel('Actual Values')
plt.xlabel('Predicted Values')
plt.show()