## **Reading the Dataset as a DataFrame**

In [None]:
from google.colab import drive

# Mount Google Drive so that the dataset and output paths under /content/drive resolve.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

In [None]:
import io
import pandas as pd

# Workbook holding the judgements and their summaries; the first row supplies the
# column names and the first column becomes the index.
dataset_path = '/content/drive/MyDrive/Final Year Paper Work/Complete Summarized Dataset.xlsx'

# Rows with a missing value in any column are dropped straight after loading.
df = pd.read_excel(dataset_path, header=0, index_col=0).dropna()
df.head()

## **Splitting the data for complete, extractive and abstractive summarized texts**

In [None]:
from sklearn.model_selection import train_test_split

# Each experiment pairs one text column with the shared 'Judgement Status' label.
# Complete Data
com_sentences = df.loc[:, ['Judgement', 'Judgement Status']]
# Abstractive Data
abs_sentences = df.loc[:, ['Abstractive Summarized Judgements', 'Judgement Status']]
# Extractive Data
ext_sentences = df.loc[:, ['Extractive Summarized Judgements', 'Judgement Status']]

# One set of split settings (25% held out, fixed seed) is applied to all three
# variants, so they should be split on the same rows.
split_options = dict(test_size=0.25, random_state=42)

# Train and Test Split for Complete Data
df_train_com, df_test_com = train_test_split(com_sentences, **split_options)
# Train and Test Split for Abstractive Data
df_train_abs, df_test_abs = train_test_split(abs_sentences, **split_options)
# Train and Test Split for Extractive Data
df_train_ext, df_test_ext = train_test_split(ext_sentences, **split_options)

## **Converting the Judgement Status to Categorical Values**

In [None]:
from tensorflow.keras.utils import to_categorical

# One-hot width is fixed to the number of classes the models are built for
# (num_labels=4 / Dense(4)). Without num_classes, to_categorical infers the width
# from the largest label present in each split, so a split that happens to miss
# the highest class would yield a narrower matrix than the model output.
# Assumes 'Judgement Status' holds integer labels 0..3 — TODO confirm.
NUM_CLASSES = 4

# Complete Data
y_train_com = to_categorical(df_train_com['Judgement Status'], num_classes=NUM_CLASSES)
y_test_com = to_categorical(df_test_com['Judgement Status'], num_classes=NUM_CLASSES)

# Abstractive Data
y_train_abs = to_categorical(df_train_abs['Judgement Status'], num_classes=NUM_CLASSES)
y_test_abs = to_categorical(df_test_abs['Judgement Status'], num_classes=NUM_CLASSES)

# Extractive Data
y_train_ext = to_categorical(df_train_ext['Judgement Status'], num_classes=NUM_CLASSES)
y_test_ext = to_categorical(df_test_ext['Judgement Status'], num_classes=NUM_CLASSES)

## **Import Statements**

In [None]:
!pip install transformers

In [None]:
# Import Statements
import tensorflow as tf
from tensorflow import keras
import numpy as np

from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.initializers import TruncatedNormal
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import CategoricalAccuracy
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Dense

from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization, Dropout, Layer
from tensorflow.keras.layers import Embedding, Input, GlobalAveragePooling1D, Dense
from tensorflow.keras.models import Sequential, Model

In [None]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

# Tokenizer and TF sequence-classification model are both loaded from the same
# LegalBERT checkpoint; the classification head is sized for 4 labels.
LEGAL_BERT_CHECKPOINT = "nlpaueb/legal-bert-base-uncased"

lbert_tokenizer = AutoTokenizer.from_pretrained(LEGAL_BERT_CHECKPOINT)
lbert_model = TFAutoModelForSequenceClassification.from_pretrained(
    LEGAL_BERT_CHECKPOINT,
    num_labels=4,
)

## **1) Complete Data**

In [None]:
# Tokenize the input (takes some time) with the LegalBERT tokenizer.
# padding='max_length' pads every text to exactly max_length=100 tokens, so both
# splits always match the fixed-length (100,) model inputs. padding=True would pad
# only to the longest text in each call, which can be shorter than 100.
lbert_tokenizer_kwargs = dict(
    add_special_tokens=True,
    max_length=100,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True,
)

x_train_com_lbert = lbert_tokenizer(
    text=df_train_com['Judgement'].tolist(),
    **lbert_tokenizer_kwargs)
x_test_com_lbert = lbert_tokenizer(
    text=df_test_com['Judgement'].tolist(),
    **lbert_tokenizer_kwargs)

In [None]:
# Pull the two tensors out of the tokenizer output for the training split.
# Note: input_ids_com_lbert is rebound to a Keras Input in the model-building cell.
input_ids_com_lbert, attention_mask_com_lbert = (
    x_train_com_lbert['input_ids'],
    x_train_com_lbert['attention_mask'],
)

In [None]:
# Sequence length of the model inputs; must equal the tokenizer's max_length.
max_len = 100

input_ids_com_lbert = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask_com_lbert = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

# NOTE(review): lbert_model was loaded with a sequence-classification head, so
# element [0] of its output is presumably that head's 4-way logits rather than
# token embeddings — confirm this is the intended feature for the dense stack.
embeddings = lbert_model(input_ids_com_lbert, attention_mask = input_mask_com_lbert)[0]
out = Dense(128, activation='relu')(embeddings)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32,activation = 'relu')(out)
# Emit raw logits (no activation): the model is compiled with
# CategoricalCrossentropy(from_logits=True), which applies softmax itself.
# A sigmoid here would squash the scores into (0, 1) before that softmax.
y = Dense(4)(out)

model_com_lbert = tf.keras.Model(inputs=[input_ids_com_lbert, input_mask_com_lbert], outputs=y)
# layers[2] is expected to be the LegalBERT sub-model (after the two Input
# layers) — TODO confirm the index; it is marked trainable for fine-tuning.
model_com_lbert.layers[2].trainable = True

In [None]:
# Loss and metric. from_logits=True makes the loss treat the model outputs as
# unnormalised scores. The metric is plain categorical accuracy; it is only
# reported under the name 'balanced_accuracy'.
loss_com_lbert = CategoricalCrossentropy(from_logits=True)
metric_com_lbert = CategoricalAccuracy(name='balanced_accuracy')

# Legacy Adam; per the original author's note, the learning rate is the one
# listed for the LegalBERT model on the Hugging Face website.
optimizer_com_lbert = tf.keras.optimizers.legacy.Adam(
    learning_rate=5e-05,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0,
)

# Compile the model
model_com_lbert.compile(
    optimizer=optimizer_com_lbert,
    loss=loss_com_lbert,
    metrics=metric_com_lbert,
)

In [None]:
# Fine-tune on the complete-judgement training split; the held-out split is
# passed as validation data.
train_inputs_com_lbert = {
    'input_ids': x_train_com_lbert['input_ids'],
    'attention_mask': x_train_com_lbert['attention_mask'],
}
val_inputs_com_lbert = {
    'input_ids': x_test_com_lbert['input_ids'],
    'attention_mask': x_test_com_lbert['attention_mask'],
}

history_com_lbert = model_com_lbert.fit(
    x=train_inputs_com_lbert,
    y=y_train_com,
    validation_data=(val_inputs_com_lbert, y_test_com),
    epochs=10,
    batch_size=64,
)

In [None]:
# Raw model outputs for the held-out split; the first row is displayed as a sanity check.
predict_inputs_com_lbert = {
    key: x_test_com_lbert[key] for key in ('input_ids', 'attention_mask')
}
predicted_raw_com_lbert = model_com_lbert.predict(predict_inputs_com_lbert)
predicted_raw_com_lbert[0]

In [None]:
import numpy as np

# Ground-truth labels of the held-out split, and the predicted class taken as
# the index of the highest score in each output row.
y_true_com_lbert = df_test_com['Judgement Status']
y_predicted_com_lbert = predicted_raw_com_lbert.argmax(axis=1)

In [None]:
from sklearn.metrics import classification_report, f1_score
# Per-class precision/recall/F1 table, followed by the weighted F1 score.
report_com_lbert = classification_report(y_true_com_lbert, y_predicted_com_lbert)
print(report_com_lbert)

weighted_f1_com_lbert = f1_score(y_true_com_lbert, y_predicted_com_lbert, average='weighted')
print("F1 Score: ", weighted_f1_com_lbert)

In [None]:
# Keep only the two tensors the model consumes, then score the model on the training split.
x_train_com_lbert = {
    key: x_train_com_lbert[key] for key in ('input_ids', 'attention_mask')
}

train_scores_com_lbert = model_com_lbert.evaluate(x_train_com_lbert, y_train_com, verbose=False)
train_loss_com_lbert, train_accuracy_com_lbert = train_scores_com_lbert
print("Training Accuracy: {:.4f}".format(train_accuracy_com_lbert))

In [None]:
# Keep only the two tensors the model consumes, then score the model on the held-out split.
x_test_com_lbert = {
    key: x_test_com_lbert[key] for key in ('input_ids', 'attention_mask')
}

test_scores_com_lbert = model_com_lbert.evaluate(x_test_com_lbert, y_test_com, verbose=False)
test_loss_com_lbert, test_accuracy_com_lbert = test_scores_com_lbert
print("Testing Accuracy:  {:.4f}".format(test_accuracy_com_lbert))

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

def plot_graphs(history, string):
    """Plot a training metric per epoch, plus its validation series when recorded.

    Args:
        history: Object exposing a ``history`` dict that maps metric names to
            per-epoch value lists (e.g. the object returned by ``model.fit``).
        string: Metric key to plot, e.g. ``"loss"``. The validation series is
            looked up under ``"val_" + string``.

    Raises:
        KeyError: If ``string`` is not a key of ``history.history``.
    """
    metrics = history.history
    val_key = 'val_' + string

    plt.plot(metrics[string])
    legend_labels = [string]
    # The validation series is absent when training ran without validation
    # data; plot only the training curve in that case instead of failing.
    if val_key in metrics:
        plt.plot(metrics[val_key])
        legend_labels.append(val_key)

    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.title(string + " per epoch")
    plt.legend(legend_labels)
    plt.show()

# Training curves for the complete-data run: the accuracy metric first, then the loss.
for curve_name in ("balanced_accuracy", "loss"):
    plot_graphs(history_com_lbert, curve_name)

In [None]:
# Persist the fine-tuned complete-data model as an .h5 file on Drive.
com_lbert_h5_path = "/content/drive/MyDrive/Final Year Paper Work/H5 Files/COM-LEGALBERT.h5"
model_com_lbert.save(com_lbert_h5_path)

## **2) Abstractive Summarized Data**

In [None]:
# Tokenize the input (takes some time) with the LegalBERT tokenizer.
# padding='max_length' pads every text to exactly max_length=100 tokens, so both
# splits always match the fixed-length (100,) model inputs. padding=True would pad
# only to the longest text in each call, which can be shorter than 100 (summaries
# in particular may be short).
lbert_tokenizer_kwargs = dict(
    add_special_tokens=True,
    max_length=100,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True,
)

x_train_abs_lbert = lbert_tokenizer(
    text=df_train_abs['Abstractive Summarized Judgements'].tolist(),
    **lbert_tokenizer_kwargs)
x_test_abs_lbert = lbert_tokenizer(
    text=df_test_abs['Abstractive Summarized Judgements'].tolist(),
    **lbert_tokenizer_kwargs)

In [None]:
# Pull the two tensors out of the tokenizer output for the training split.
# Note: input_ids_abs_lbert is rebound to a Keras Input in the model-building cell.
input_ids_abs_lbert, attention_mask_abs_lbert = (
    x_train_abs_lbert['input_ids'],
    x_train_abs_lbert['attention_mask'],
)

In [None]:
# Sequence length of the model inputs; must equal the tokenizer's max_length.
max_len = 100

# Load a fresh pretrained LegalBERT for this experiment. The shared lbert_model
# instance is also wired into the Complete Data model and fine-tuned there, so
# reusing it here would start from already fine-tuned weights and the three
# runs would not be comparable.
lbert_model_abs = TFAutoModelForSequenceClassification.from_pretrained(
    "nlpaueb/legal-bert-base-uncased", num_labels=4)

input_ids_abs_lbert = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask_abs_lbert = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

# NOTE(review): the model carries a sequence-classification head, so element [0]
# of its output is presumably that head's 4-way logits rather than token
# embeddings — confirm this is the intended feature for the dense stack.
embeddings = lbert_model_abs(input_ids_abs_lbert, attention_mask = input_mask_abs_lbert)[0]
out = Dense(128, activation='relu')(embeddings)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32,activation = 'relu')(out)
# Emit raw logits (no activation): the model is compiled with
# CategoricalCrossentropy(from_logits=True), which applies softmax itself.
y = Dense(4)(out)

model_abs_lbert = tf.keras.Model(inputs=[input_ids_abs_lbert, input_mask_abs_lbert], outputs=y)
# layers[2] is expected to be the LegalBERT sub-model (after the two Input
# layers) — TODO confirm the index; it is marked trainable for fine-tuning.
model_abs_lbert.layers[2].trainable = True

In [None]:
# Loss and metric. from_logits=True makes the loss treat the model outputs as
# unnormalised scores. The metric is plain categorical accuracy; it is only
# reported under the name 'balanced_accuracy'.
loss_abs_lbert = CategoricalCrossentropy(from_logits=True)
metric_abs_lbert = CategoricalAccuracy(name='balanced_accuracy')

# Legacy Adam; per the original author's note, the learning rate is the one
# listed for the LegalBERT model on the Hugging Face website.
optimizer_abs_lbert = tf.keras.optimizers.legacy.Adam(
    learning_rate=5e-05,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0,
)

# Compile the model
model_abs_lbert.compile(
    optimizer=optimizer_abs_lbert,
    loss=loss_abs_lbert,
    metrics=metric_abs_lbert,
)

In [None]:
# Fine-tune on the abstractive-summary training split; the held-out split is
# passed as validation data.
train_inputs_abs_lbert = {
    'input_ids': x_train_abs_lbert['input_ids'],
    'attention_mask': x_train_abs_lbert['attention_mask'],
}
val_inputs_abs_lbert = {
    'input_ids': x_test_abs_lbert['input_ids'],
    'attention_mask': x_test_abs_lbert['attention_mask'],
}

history_abs_lbert = model_abs_lbert.fit(
    x=train_inputs_abs_lbert,
    y=y_train_abs,
    validation_data=(val_inputs_abs_lbert, y_test_abs),
    epochs=10,
    batch_size=64,
)

In [None]:
# Raw model outputs for the held-out split; the first row is displayed as a sanity check.
predict_inputs_abs_lbert = {
    key: x_test_abs_lbert[key] for key in ('input_ids', 'attention_mask')
}
predicted_raw_abs_lbert = model_abs_lbert.predict(predict_inputs_abs_lbert)
predicted_raw_abs_lbert[0]

In [None]:
import numpy as np

# Ground-truth labels of the held-out split, and the predicted class taken as
# the index of the highest score in each output row.
y_true_abs_lbert = df_test_abs['Judgement Status']
y_predicted_abs_lbert = predicted_raw_abs_lbert.argmax(axis=1)

In [None]:
from sklearn.metrics import classification_report, f1_score
# Per-class precision/recall/F1 table, followed by the weighted F1 score.
report_abs_lbert = classification_report(y_true_abs_lbert, y_predicted_abs_lbert)
print(report_abs_lbert)

weighted_f1_abs_lbert = f1_score(y_true_abs_lbert, y_predicted_abs_lbert, average='weighted')
print("F1 Score: ", weighted_f1_abs_lbert)

In [None]:
# Keep only the two tensors the model consumes, then score the model on the training split.
x_train_abs_lbert = {
    key: x_train_abs_lbert[key] for key in ('input_ids', 'attention_mask')
}

train_scores_abs_lbert = model_abs_lbert.evaluate(x_train_abs_lbert, y_train_abs, verbose=False)
train_loss_abs_lbert, train_accuracy_abs_lbert = train_scores_abs_lbert
print("Training Accuracy: {:.4f}".format(train_accuracy_abs_lbert))

In [None]:
# Keep only the two tensors the model consumes, then score the model on the held-out split.
x_test_abs_lbert = {
    key: x_test_abs_lbert[key] for key in ('input_ids', 'attention_mask')
}

test_scores_abs_lbert = model_abs_lbert.evaluate(x_test_abs_lbert, y_test_abs, verbose=False)
test_loss_abs_lbert, test_accuracy_abs_lbert = test_scores_abs_lbert
print("Testing Accuracy:  {:.4f}".format(test_accuracy_abs_lbert))

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# plot_graphs is already defined in the Complete Data section; re-declaring an
# identical copy here would only shadow that definition, so it is reused as is.
# Training curves for the abstractive-summary run: accuracy metric, then loss.
plot_graphs(history_abs_lbert, "balanced_accuracy")
plot_graphs(history_abs_lbert, "loss")

In [None]:
# Persist the fine-tuned abstractive-summary model as an .h5 file on Drive.
abs_lbert_h5_path = "/content/drive/MyDrive/Final Year Paper Work/H5 Files/ABS-LEGALBERT.h5"
model_abs_lbert.save(abs_lbert_h5_path)

## **3) Extractive Summarized Data**

In [None]:
# Tokenize the input (takes some time) with the LegalBERT tokenizer.
# padding='max_length' pads every text to exactly max_length=100 tokens, so both
# splits always match the fixed-length (100,) model inputs. padding=True would pad
# only to the longest text in each call, which can be shorter than 100 (summaries
# in particular may be short).
lbert_tokenizer_kwargs = dict(
    add_special_tokens=True,
    max_length=100,
    truncation=True,
    padding='max_length',
    return_tensors='tf',
    return_token_type_ids=False,
    return_attention_mask=True,
    verbose=True,
)

x_train_ext_lbert = lbert_tokenizer(
    text=df_train_ext['Extractive Summarized Judgements'].tolist(),
    **lbert_tokenizer_kwargs)
x_test_ext_lbert = lbert_tokenizer(
    text=df_test_ext['Extractive Summarized Judgements'].tolist(),
    **lbert_tokenizer_kwargs)

In [None]:
# Pull the two tensors out of the tokenizer output for the training split.
# Note: input_ids_ext_lbert is rebound to a Keras Input in the model-building cell.
input_ids_ext_lbert, attention_mask_ext_lbert = (
    x_train_ext_lbert['input_ids'],
    x_train_ext_lbert['attention_mask'],
)

In [None]:
# Sequence length of the model inputs; must equal the tokenizer's max_length.
max_len = 100

# Load a fresh pretrained LegalBERT for this experiment. The shared lbert_model
# instance is also wired into the earlier models and fine-tuned there, so
# reusing it here would start from already fine-tuned weights and the three
# runs would not be comparable.
lbert_model_ext = TFAutoModelForSequenceClassification.from_pretrained(
    "nlpaueb/legal-bert-base-uncased", num_labels=4)

input_ids_ext_lbert = Input(shape=(max_len,), dtype=tf.int32, name="input_ids")
input_mask_ext_lbert = Input(shape=(max_len,), dtype=tf.int32, name="attention_mask")

# NOTE(review): the model carries a sequence-classification head, so element [0]
# of its output is presumably that head's 4-way logits rather than token
# embeddings — confirm this is the intended feature for the dense stack.
embeddings = lbert_model_ext(input_ids_ext_lbert, attention_mask = input_mask_ext_lbert)[0]
out = Dense(128, activation='relu')(embeddings)
out = tf.keras.layers.Dropout(0.1)(out)
out = Dense(32,activation = 'relu')(out)
# Emit raw logits (no activation): the model is compiled with
# CategoricalCrossentropy(from_logits=True), which applies softmax itself.
y = Dense(4)(out)

model_ext_lbert = tf.keras.Model(inputs=[input_ids_ext_lbert, input_mask_ext_lbert], outputs=y)
# layers[2] is expected to be the LegalBERT sub-model (after the two Input
# layers) — TODO confirm the index; it is marked trainable for fine-tuning.
model_ext_lbert.layers[2].trainable = True

In [None]:
# Loss and metric. from_logits=True makes the loss treat the model outputs as
# unnormalised scores. The metric is plain categorical accuracy; it is only
# reported under the name 'balanced_accuracy'.
loss_ext_lbert = CategoricalCrossentropy(from_logits=True)
metric_ext_lbert = CategoricalAccuracy(name='balanced_accuracy')

# Legacy Adam; per the original author's note, the learning rate is the one
# listed for the LegalBERT model on the Hugging Face website.
optimizer_ext_lbert = tf.keras.optimizers.legacy.Adam(
    learning_rate=5e-05,
    epsilon=1e-08,
    decay=0.01,
    clipnorm=1.0,
)

# Compile the model
model_ext_lbert.compile(
    optimizer=optimizer_ext_lbert,
    loss=loss_ext_lbert,
    metrics=metric_ext_lbert,
)

In [None]:
# Fine-tune on the extractive-summary training split; the held-out split is
# passed as validation data.
train_inputs_ext_lbert = {
    'input_ids': x_train_ext_lbert['input_ids'],
    'attention_mask': x_train_ext_lbert['attention_mask'],
}
val_inputs_ext_lbert = {
    'input_ids': x_test_ext_lbert['input_ids'],
    'attention_mask': x_test_ext_lbert['attention_mask'],
}

history_ext_lbert = model_ext_lbert.fit(
    x=train_inputs_ext_lbert,
    y=y_train_ext,
    validation_data=(val_inputs_ext_lbert, y_test_ext),
    epochs=10,
    batch_size=64,
)

In [None]:
# Raw model outputs for the held-out split; the first row is displayed as a sanity check.
predict_inputs_ext_lbert = {
    key: x_test_ext_lbert[key] for key in ('input_ids', 'attention_mask')
}
predicted_raw_ext_lbert = model_ext_lbert.predict(predict_inputs_ext_lbert)
predicted_raw_ext_lbert[0]

In [None]:
import numpy as np

# Ground-truth labels of the held-out split, and the predicted class taken as
# the index of the highest score in each output row.
y_true_ext_lbert = df_test_ext['Judgement Status']
y_predicted_ext_lbert = predicted_raw_ext_lbert.argmax(axis=1)

In [None]:
from sklearn.metrics import classification_report, f1_score
# Per-class precision/recall/F1 table, followed by the weighted F1 score.
report_ext_lbert = classification_report(y_true_ext_lbert, y_predicted_ext_lbert)
print(report_ext_lbert)

weighted_f1_ext_lbert = f1_score(y_true_ext_lbert, y_predicted_ext_lbert, average='weighted')
print("F1 Score: ", weighted_f1_ext_lbert)

In [None]:
# Keep only the two tensors the model consumes, then score the model on the training split.
x_train_ext_lbert = {
    key: x_train_ext_lbert[key] for key in ('input_ids', 'attention_mask')
}

train_scores_ext_lbert = model_ext_lbert.evaluate(x_train_ext_lbert, y_train_ext, verbose=False)
train_loss_ext_lbert, train_accuracy_ext_lbert = train_scores_ext_lbert
print("Training Accuracy: {:.4f}".format(train_accuracy_ext_lbert))

In [None]:
# Keep only the two tensors the model consumes, then score the model on the held-out split.
x_test_ext_lbert = {
    key: x_test_ext_lbert[key] for key in ('input_ids', 'attention_mask')
}

test_scores_ext_lbert = model_ext_lbert.evaluate(x_test_ext_lbert, y_test_ext, verbose=False)
test_loss_ext_lbert, test_accuracy_ext_lbert = test_scores_ext_lbert
print("Testing Accuracy:  {:.4f}".format(test_accuracy_ext_lbert))

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# plot_graphs is already defined in the Complete Data section; re-declaring an
# identical copy here would only shadow that definition, so it is reused as is.
# Training curves for the extractive-summary run: accuracy metric, then loss.
plot_graphs(history_ext_lbert, "balanced_accuracy")
plot_graphs(history_ext_lbert, "loss")

In [None]:
# Persist the fine-tuned extractive-summary model as an .h5 file on Drive.
ext_lbert_h5_path = "/content/drive/MyDrive/Final Year Paper Work/H5 Files/EXT-LEGALBERT.h5"
model_ext_lbert.save(ext_lbert_h5_path)