In [None]:
# Install dependencies into the kernel's own environment (%pip, not !pip).
# sentence-transformers and faiss-gpu are pinned to the versions this notebook
# was last run with so a fresh runtime reproduces the same setup; transformers
# is left to the resolver because its version is not recorded here.
%pip install -q transformers
%pip install -q sentence-transformers==2.5.1
%pip install -q faiss-gpu==1.7.2

Collecting sentence-transformers
  Downloading sentence_transformers-2.5.1-py3-none-any.whl (156 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m156.5/156.5 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: sentence-transformers
Successfully installed sentence-transformers-2.5.1
Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m6.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [None]:
from sentence_transformers import SentenceTransformer

from transformers import AutoTokenizer, AutoModel, AutoConfig
from torch.nn.functional import normalize
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import numpy as np
import torch

import faiss

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
import requests

In [None]:
def get_raw_data(timeout=60):
    """Download the rhetorical-role benchmark train and dev splits.

    Args:
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        A ``(train_data_json, test_data_json)`` tuple with the decoded JSON of
        ``train.json`` and ``dev.json`` (the dev split serves as the test set).

    Raises:
        requests.HTTPError: If either download answers with an error status.
        requests.Timeout: If a request takes longer than ``timeout`` seconds.
    """
    base_url = "https://storage.googleapis.com/indianlegalbert/OPEN_SOURCED_FILES/Rhetorical_Role_Benchmark/Data/"

    def fetch_json(file_name):
        # Without a timeout a stalled connection would hang the cell forever.
        response = requests.get(base_url + file_name, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
        response.raise_for_status()
        return response.json()

    return fetch_json("train.json"), fetch_json("dev.json")

# Rhetorical-role label names; used to name the rows/columns of the confusion matrix.
CLASSES = [
    'PREAMBLE',
    'NONE',
    'FAC',
    'ARG_RESPONDENT',
    'RLC',
    'ARG_PETITIONER',
    'ANALYSIS',
    'PRE_RELIED',
    'RATIO',
    'RPC',
    'ISSUE',
    'STA',
    'PRE_NOT_RELIED',
]

# Download both splits once; the following cells read these two objects.
train_data_json, test_data_json = get_raw_data()

In [None]:
def extract_sentences_and_labels(documents):
    """Flatten annotated documents into parallel sentence and label lists.

    Args:
        documents: Iterable of dicts, each holding an ``'annotations'`` list
            whose entries carry a ``'result'`` list. Every result stores the
            sentence under ``['value']['text']`` and its labels under
            ``['value']['labels']``.

    Returns:
        ``(sentences, labels)``: the whitespace-stripped sentence texts and,
        for each of them, the first entry of its label list.
    """
    sentences = []
    labels = []
    for document in documents:
        for annotation in document['annotations']:
            for result in annotation['result']:
                value = result['value']
                sentences.append(value['text'].strip())
                # Only the first label of each sentence is kept.
                labels.append(value['labels'][0])
    return sentences, labels


# One shared helper for both splits so they are guaranteed to be parsed identically.
train_data, train_labels_all = extract_sentences_and_labels(train_data_json)
test_data, test_labels_all = extract_sentences_and_labels(test_data_json)

# Number of training sentences used; the embedding cell slices the training data to this size.
TRAINING_SIZE = len(train_labels_all)

In [None]:
# Hugging Face Hub id of the sentence-embedding checkpoint used by this notebook.
MODEL_NAME = 'sentence-transformers/sentence-t5-xxl'
# Instantiating the model downloads the checkpoint on first use (pytorch_model.bin is ~9.7 GB).
model = SentenceTransformer(MODEL_NAME)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/1.98k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/9.73G [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()


In [None]:
def get_embeddings(sentence):
    """Encode text with the notebook-level sentence-transformer ``model``.

    Args:
        sentence: Input forwarded unchanged to ``model.encode``.

    Returns:
        Whatever ``model.encode`` returns for ``sentence``.
    """
    embedding = model.encode(sentence)
    return embedding

In [None]:
# Sorted so the label -> integer-id mapping is the same on every run; iterating a
# set of strings has no stable order across kernel restarts.
train_labels_unique = sorted(set(train_labels_all))
label_to_id = {label: idx for idx, label in enumerate(train_labels_unique)}

train_sentences = train_data[:TRAINING_SIZE]
# Integer class id for every training sentence, aligned with train_sentences.
y = [label_to_id[label] for label in train_labels_all[:TRAINING_SIZE]]

# Encode in chunks rather than one sentence per call so the encoder can batch
# the forward passes; the chunk size also sets how often progress is printed.
ENCODE_CHUNK_SIZE = 500
X = []
progress = 0
for start in range(0, len(train_sentences), ENCODE_CHUNK_SIZE):
    chunk = train_sentences[start:start + ENCODE_CHUNK_SIZE]
    # Passing a list yields one embedding row per sentence; extend() appends each row.
    X.extend(get_embeddings(chunk))
    progress = start + len(chunk)
    print(f'Progress: {100 * progress / TRAINING_SIZE}%')


In [None]:
# faiss works on C-contiguous float32 matrices, so pin dtype and layout here
# instead of relying on whatever the encoder happened to return.
X = np.ascontiguousarray(X, dtype=np.float32)
# Integer label ids, aligned row-for-row with X.
y = np.asarray(y, dtype=np.int64)

In [None]:
# Build an inverted-file (IVF) index over the training embeddings using L2 distance.
embedding_dim = X.shape[1]
# The number of inverted lists is set to the number of distinct training labels.
n_lists = len(train_labels_unique)
coarse_quantizer = faiss.IndexFlatL2(embedding_dim)
index = faiss.IndexIVFFlat(coarse_quantizer, embedding_dim, n_lists, faiss.METRIC_L2)

# NOTE(review): nprobe is never set, so searches visit only faiss's default
# number of lists (documented as 1), i.e. the lookup is approximate — confirm
# this is intended for a nearest-neighbour classifier.
index.train(X)  # fit the coarse quantizer on the training vectors
index.add(X)    # then store those same vectors in the index

In [None]:
# Encode the whole test split in one call so the encoder batches the sentences
# internally instead of running one forward pass per sentence. The result is
# cast to float32, the dtype faiss expects for query vectors.
test_embeddings = np.asarray(get_embeddings(test_data), dtype=np.float32)

In [None]:
# Look up the single nearest training vector for every test embedding.
_, indices = index.search(test_embeddings, 1)
nearest = indices[:, 0]

# faiss pads missing results with -1. Used as a Python index that would silently
# select the last training label, so stop with a clear error instead.
missing = int((nearest < 0).sum())
if missing:
    raise RuntimeError(
        f"faiss returned no neighbour for {missing} test sentence(s); "
        "increase index.nprobe so more inverted lists are searched."
    )

# Predicted label = label of the nearest training sentence.
preds = [train_labels_unique[y[neighbor_id]] for neighbor_id in nearest]

In [None]:
# Pair every gold label with its prediction and count exact matches. The total
# is kept in `total_count` (not `all`) so the built-in all() is not shadowed
# for every cell that runs afterwards.
label_pairs = list(zip(test_labels_all, preds))
match_count = sum(1 for gold, predicted in label_pairs if gold == predicted)
total_count = len(label_pairs)

f1score = f1_score(test_labels_all, preds, average="macro")

print("F1 Score:", f1score)
# The share of exactly matching predictions is accuracy, so it is reported under that name.
print("Accuracy:", match_count / total_count)

In [None]:
# labels=CLASSES makes the matrix rows/columns follow the same order as the
# names attached below; without it scikit-learn orders the classes by sorted
# label value, which does not match CLASSES and would mislabel the heatmap.
cm = confusion_matrix(test_labels_all, preds, labels=CLASSES)
cm_df = pd.DataFrame(cm, index=CLASSES, columns=CLASSES)

fig, ax = plt.subplots(figsize=(15, 10))
# fmt='d' prints the integer counts in full instead of a shortened float format.
sns.heatmap(cm_df, annot=True, fmt='d', ax=ax)
ax.set_title('Confusion Matrix')
ax.set_ylabel('Actual Values')
ax.set_xlabel('Predicted Values')
plt.show()