In [1]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

--2024-02-25 02:27:57--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-02-25 02:27:57--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-02-25 02:27:57--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [2]:
!unzip glove.6B.zip

Archive:  glove.6B.zip
  inflating: glove.6B.50d.txt        
  inflating: glove.6B.100d.txt       
  inflating: glove.6B.200d.txt       
  inflating: glove.6B.300d.txt       


In [3]:
import pandas as pd
import json
from tqdm import tqdm
import tensorflow as tf
import numpy as np
import requests

In [4]:
def get_raw_data():
    train_data_url = "https://storage.googleapis.com/indianlegalbert/OPEN_SOURCED_FILES/Rhetorical_Role_Benchmark/Data/train.json"
    test_data_url = "https://storage.googleapis.com/indianlegalbert/OPEN_SOURCED_FILES/Rhetorical_Role_Benchmark/Data/dev.json"
    train_data_json = requests.get(train_data_url).json()
    test_data_json = requests.get(test_data_url).json()
    return train_data_json, test_data_json

CLASSES = ['PREAMBLE', 'NONE', 'FAC', 'ARG_RESPONDENT', 'RLC', 'ARG_PETITIONER', 'ANALYSIS', 'PRE_RELIED', 'RATIO', 'RPC', 'ISSUE', 'STA', 'PRE_NOT_RELIED']
train_data_json, test_data_json = get_raw_data()

In [5]:
df_train = pd.DataFrame(columns = ["text", "output"])
p = 0
for i in tqdm(range(len(train_data_json))):
  for j in range(len(train_data_json[i]["annotations"])):
     for k in range(len(train_data_json[i]["annotations"][j]["result"])):
       text = train_data_json[i]["annotations"][j]["result"][k]["value"]["text"]
       output = train_data_json[i]["annotations"][j]["result"][k]["value"]["labels"][0]
       df_train.loc[p] = [text, output]
       p+=1

100%|██████████| 247/247 [01:43<00:00,  2.38it/s]


In [6]:
df_train.head()

Unnamed: 0,text,output
0,"IN THE HIGH COURT OF KARNATAKA,\n ...",PREAMBLE
1,\n\n BEFORE\n\nTHE HON'BLE MR.JUSTICE ANA...,PREAMBLE
2,This Criminal Appeal is filed under Section 37...,PREAMBLE
3,\n\n This appeal coming on for hearing t...,PREAMBLE
4,\n Heard the learned Counsel for the app...,NONE


In [7]:
tokenizer = tf.keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(df_train["text"].tolist())
vocab_size = len(tokenizer.word_index) + 1
encoded_docs = tokenizer.texts_to_sequences(df_train["text"].tolist())

In [9]:
padded_docs = tf.keras.utils.pad_sequences(encoded_docs, maxlen=380, padding='post')

In [10]:
label = list(set(df_train["output"].tolist()))

In [11]:
index_map = {}
for i in range(len(label)):
  index_map[label[i]] = i

df_train["label"] = df_train["output"].map(index_map)

In [14]:
df_train.head()

Unnamed: 0,text,output,label
0,"IN THE HIGH COURT OF KARNATAKA,\n ...",PREAMBLE,4
1,\n\n BEFORE\n\nTHE HON'BLE MR.JUSTICE ANA...,PREAMBLE,4
2,This Criminal Appeal is filed under Section 37...,PREAMBLE,4
3,\n\n This appeal coming on for hearing t...,PREAMBLE,4
4,\n Heard the learned Counsel for the app...,NONE,12


In [15]:
final_label = df_train["label"].tolist()

In [16]:
embeddings_index = dict()
f = open('glove.6B.100d.txt')
for line in f:
	all_values = line.split()
	curr_word = all_values[0]
	coefficients = np.asarray(all_values[1:], dtype='float32')
	embeddings_index[curr_word] = coefficients
f.close()

In [17]:
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in tokenizer.word_index.items():
	embedding_vector = embeddings_index.get(word)
	if embedding_vector is not None:
		embedding_matrix[i] = embedding_vector

In [18]:
final_labels = np.array(final_label)

In [19]:
num_classes = len(index_map)
final_labels = tf.keras.utils.to_categorical(final_labels, num_classes=num_classes)

x_test, x_train = padded_docs[:3000], padded_docs[3000:]
y_test, y_train = final_labels[:3000], final_labels[3000:]

In [20]:
model = tf.keras.Sequential()
e = tf.keras.layers.Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=380, trainable=False)
model.add(e)
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))

model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 380, 100)          2192000   
                                                                 
 flatten (Flatten)           (None, 38000)             0         
                                                                 
 dense (Dense)               (None, 13)                494013    
                                                                 
Total params: 2686013 (10.25 MB)
Trainable params: 494013 (1.88 MB)
Non-trainable params: 2192000 (8.36 MB)
_________________________________________________________________
None


In [21]:
model.fit(x_train, y_train, epochs=10, verbose=2)

Epoch 1/10
813/813 - 11s - loss: 1.7142 - accuracy: 0.4866 - 11s/epoch - 13ms/step
Epoch 2/10
813/813 - 9s - loss: 1.2551 - accuracy: 0.5958 - 9s/epoch - 11ms/step
Epoch 3/10
813/813 - 13s - loss: 1.0964 - accuracy: 0.6396 - 13s/epoch - 16ms/step
Epoch 4/10
813/813 - 9s - loss: 1.0085 - accuracy: 0.6599 - 9s/epoch - 11ms/step
Epoch 5/10
813/813 - 8s - loss: 0.9471 - accuracy: 0.6841 - 8s/epoch - 10ms/step
Epoch 6/10
813/813 - 12s - loss: 0.8856 - accuracy: 0.6997 - 12s/epoch - 15ms/step
Epoch 7/10
813/813 - 9s - loss: 0.8492 - accuracy: 0.7111 - 9s/epoch - 12ms/step
Epoch 8/10
813/813 - 9s - loss: 0.8137 - accuracy: 0.7240 - 9s/epoch - 11ms/step
Epoch 9/10
813/813 - 13s - loss: 0.7837 - accuracy: 0.7335 - 13s/epoch - 15ms/step
Epoch 10/10
813/813 - 9s - loss: 0.7590 - accuracy: 0.7386 - 9s/epoch - 11ms/step


<keras.src.callbacks.History at 0x7933c7703f10>

In [22]:
y_pred = model.predict(x_test, verbose=2)

94/94 - 0s - 472ms/epoch - 5ms/step


In [23]:
Y_pred = np.argmax(y_pred,axis=1)
Y_test = np.argmax(y_test,axis=1)

In [32]:
from sklearn.metrics import f1_score
score = f1_score(Y_test, Y_pred, average='macro')
print("F1 Score:", score)

F1 Score: 0.29853078242894576
