## **Read the Data and Store it in a DataFrame**

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that later
# cells can read the dataset from, and save the model to, paths under it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import io
import pandas as pd

# Load the combined dataset from Drive. The first spreadsheet row supplies the
# column names and the first column becomes the index; any row that contains a
# missing value is discarded straight away.
df_base = pd.read_excel(
    '/content/drive/MyDrive/Final Year Paper Work/Combined Dataset.xlsx',
    header=0,
    index_col=0,
).dropna()

# Preview the first rows of the cleaned frame.
df_base.head()

In [None]:
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Keep only the judgement text and its status label.
sentences = df_base[['Judgement', 'Judgement Status']]

# Hold out 25% of the rows; the fixed seed makes the split repeatable.
df_train, df_test = train_test_split(sentences, test_size=0.25, random_state=42)

# One-hot encode the labels. The class count is derived once from the full
# label column and passed explicitly, so y_train and y_test always have the
# same number of columns. Without it, to_categorical infers the width from each
# array separately and the two can differ when the highest class id is missing
# from one of the splits.
n_label_classes = int(sentences['Judgement Status'].max()) + 1
y_train = to_categorical(df_train['Judgement Status'], num_classes=n_label_classes)
y_test = to_categorical(df_test['Judgement Status'], num_classes=n_label_classes)

In [None]:
# Switch to the generic column names ('text' / 'label') that the following
# cells read from.
df_base = df_base.rename(columns={'Judgement': 'text', 'Judgement Status': 'label'})
print('Available labels: ', df_base['label'].unique())

# Number of distinct label values present in the data.
num_classes_base = len(df_base['label'].unique())
df_base.head()

## **Import Statements**

In [None]:
# Install the `transformers` and `lime` packages imported by later cells.
# NOTE(review): no versions are pinned, so results may change between runs.
!pip install transformers lime

In [None]:
# Import Statements
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
import numpy as np
import re
import unicodedata
import nltk
from nltk.corpus import stopwords
from tensorflow import keras
from tensorflow.keras.layers import Dense,Dropout, Input
from tqdm import tqdm
import pickle
from sklearn.metrics import confusion_matrix,f1_score,classification_report
import matplotlib.pyplot as plt
import itertools
from sklearn.utils import shuffle
from tensorflow.keras import regularizers

In [None]:
from transformers import BertConfig, RobertaTokenizer, TFRobertaModel

# Load the pretrained 'roberta-base' tokenizer and the matching TensorFlow
# RoBERTa model. Both objects live at notebook level and are reused by the
# tokenization, model-building and explanation cells below.
roberta_tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
roberta_model = TFRobertaModel.from_pretrained('roberta-base')

In [None]:
# Token length every input sequence is padded/truncated to; also the length of
# the model's input layers.
max_len = 100

# Pull the text and label columns out as separate Series.
sentences_base, labels_base = df_base['text'], df_base['label']

# Sanity check: both should contain the same number of entries.
(len(sentences_base), len(labels_base))

## **Checking if the Tokenizer is working well with our data by giving one sentence as input**

In [None]:
# Smoke-test the tokenizer on a single sentence. The sentence is selected by
# position (.iloc) rather than by index label: the index comes from the
# spreadsheet and rows may have been removed by dropna(), so a label equal to 1
# is not guaranteed to exist.
sample_sentence = sentences_base.iloc[1]

# Show the sub-word tokens. Printed explicitly because a notebook cell only
# displays its last expression automatically.
print(roberta_tokenizer.tokenize(sample_sentence))

# Encode to token ids + attention mask, padded/truncated to exactly 20 tokens.
# padding='max_length' replaces the deprecated pad_to_max_length=True.
base_inp_com = roberta_tokenizer.encode_plus(
    sample_sentence,
    add_special_tokens=True,
    max_length=20,
    padding='max_length',
    truncation=True,
)
base_inp_com

In [None]:
# Convert the encoded sample to NumPy arrays (ids first, then attention mask).
id_inp_base, mask_inp_base = (
    np.asarray(base_inp_com[field]) for field in ('input_ids', 'attention_mask')
)

# Add a leading batch axis of size 1 to each array and run the bare RoBERTa
# model once to inspect what it returns.
out_base = roberta_model(
    [np.expand_dims(id_inp_base, axis=0), np.expand_dims(mask_inp_base, axis=0)]
)
(type(out_base), out_base)

In [None]:
# Map the encoded ids back to a string to eyeball the tokenizer round trip.
roberta_tokenizer.decode(base_inp_com['input_ids'])

## **Since tokenizer is working well, run the tokenizer with all sentences**

In [None]:
def create_model(num_classes=4):
    """Build the RoBERTa-based text classifier.

    Two int64 inputs of length ``max_len`` (token ids and attention mask) are
    fed to the notebook-level ``roberta_model``. The vector at sequence
    position 0 of its first output goes through a
    Dense(128, relu) -> Dropout(0.1) -> Dense(32, relu) head and a final
    softmax layer with L2 kernel regularisation.

    Args:
        num_classes: Width of the softmax output layer. Defaults to 4, the
            value that was previously hard-coded.

    Returns:
        An uncompiled ``tf.keras.Model`` that takes
        ``[input_ids, attention_mask]`` and outputs softmax probabilities with
        ``num_classes`` columns.
    """
    inps = Input(shape=(max_len,), dtype='int64')
    masks = Input(shape=(max_len,), dtype='int64')

    # Take the encoder's first output and keep only position 0 along the
    # sequence axis, giving one vector per example.
    roberta_layer = roberta_model(inps, attention_mask=masks)[0][:, 0, :]

    dense = Dense(128, activation='relu')(roberta_layer)
    dropout = Dropout(0.1)(dense)
    dense = Dense(32, activation='relu')(dropout)
    pred = Dense(
        num_classes,
        activation='softmax',
        kernel_regularizer=regularizers.l2(0.01),
    )(dense)

    model = tf.keras.Model(inputs=[inps, masks], outputs=pred)
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()
    return model


model_base = create_model()

In [None]:
# Tokenize every sentence into fixed-length id and attention-mask vectors.
input_ids_base = []
attention_masks_base = []

for sent in sentences_base:
    base_inps_com = roberta_tokenizer.encode_plus(
        sent,
        add_special_tokens=True,
        max_length=max_len,
        padding='max_length',  # replaces the deprecated pad_to_max_length=True
        return_attention_mask=True,
        truncation=True,
    )
    input_ids_base.append(base_inps_com['input_ids'])
    attention_masks_base.append(base_inps_com['attention_mask'])

# Stack the per-sentence lists into arrays and convert the labels as well, so
# that all three can be split together.
input_ids_base = np.asarray(input_ids_base)
attention_masks_base = np.array(attention_masks_base)
labels_base = np.array(labels_base)

# The three arrays must line up row for row before they are split.
assert len(input_ids_base) == len(attention_masks_base) == len(labels_base)
len(input_ids_base),len(attention_masks_base),len(labels_base)

In [None]:
# Split ids, labels and masks in a single call so their rows stay aligned.
# random_state is fixed so the validation set, and every metric computed on
# it, is reproducible across runs. 42 is the same seed the earlier
# DataFrame-level split uses.
train_inp_base, val_inp_base, train_label_base, val_label_base, train_mask_base, val_mask_base = train_test_split(
    input_ids_base,
    labels_base,
    attention_masks_base,
    test_size=0.25,
    random_state=42,
)

print('Train inp shape {} Val input shape {}\nTrain label shape {} Val label shape {}\nTrain attention mask shape {} Val attention mask shape {}'.format(train_inp_base.shape,val_inp_base.shape,train_label_base.shape,val_label_base.shape,train_mask_base.shape,val_mask_base.shape))

# The model's last layer already applies softmax, so the loss receives
# probabilities, not logits. from_logits must therefore be False; True would
# declare the inputs as raw scores and apply softmax a second time.
loss_base = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
metric_base = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer_base = tf.keras.optimizers.Adam(learning_rate=3e-5)

model_base.compile(loss=loss_base, optimizer=optimizer_base, metrics=[metric_base])

In [None]:
# Train for 10 epochs in mini-batches of 32; the validation split is passed as
# validation_data, and the returned history object is kept for plotting.
history_base = model_base.fit(
    x=[train_inp_base, train_mask_base],
    y=train_label_base,
    batch_size=32,
    epochs=10,
    validation_data=([val_inp_base, val_mask_base], val_label_base),
)

In [None]:
# Class scores for every validation example, computed in batches of 16.
preds_base = model_base.predict(
    x=[val_inp_base, val_mask_base],
    batch_size=16,
)

In [None]:
# Highest-scoring class index for each validation row.
pred_labels_base = np.argmax(preds_base, axis=1)

# Weighted-average F1 of the predictions against the validation labels.
f1_base = f1_score(
    y_true=val_label_base,
    y_pred=pred_labels_base,
    average='weighted',
)
print("F1 Score: ", f1_base)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

def plot_graphs(history, string):
    """Plot a recorded metric together with its validation counterpart.

    Args:
        history: Object whose ``history`` mapping holds the per-epoch values
            (here, the value returned by ``model_base.fit``).
        string: Key of the metric to draw; ``'val_' + string`` is drawn on the
            same axes.
    """
    val_key = 'val_' + string
    # Training curve first, validation curve second, matching the legend order.
    for series_key in (string, val_key):
        plt.plot(history.history[series_key])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()

# Draw the training/validation curves for accuracy, then for loss.
for curve_name in ("accuracy", "loss"):
    plot_graphs(history_base, curve_name)

In [None]:
# Persist the trained model to Drive as an .h5 file.
model_base.save("/content/drive/MyDrive/Final Year Paper Work/RoBERTa H5 Files/ASSORTED.h5")

In [None]:
# Import necessary libraries
from lime import lime_text

# Prediction function for LIME, backed by the fine-tuned RoBERTa model.
def predict_fn(x):
    """Return the model's class scores for a batch of raw texts.

    Args:
        x: The texts to classify, as accepted by ``roberta_tokenizer``
            (the LIME explainer calls this with its perturbed samples).

    Returns:
        The output of ``model_base.predict`` for the tokenized texts: one row
        of softmax scores per input text.
    """
    # Pad to exactly max_len. The model's input layers have a fixed length of
    # max_len, whereas padding=True only pads to the longest text in the batch
    # and yields a shorter, incompatible tensor when every text is short.
    inputs = roberta_tokenizer(
        x,
        return_tensors='tf',
        truncation=True,
        padding='max_length',
        max_length=max_len,
    )
    return model_base.predict([inputs['input_ids'], inputs['attention_mask']])

# Create a LIME explainer. It is seeded so that the randomly sampled
# perturbations, and therefore the explanation, are repeatable.
explainer = lime_text.LimeTextExplainer(random_state=42)

# Choose a sample from the test set for explanation.
# NOTE(review): df_test comes from the DataFrame-level split, not from the
# array split used to train model_base -- confirm that the two splits match
# before treating this sample as unseen by the model.
sample_index = 0  # You can change this index as needed
text_to_explain = df_test['Judgement'].iloc[sample_index]
# y_test is one-hot, so the position of the maximum is the true class id.
explanation_class = int(np.argmax(y_test[sample_index]))

# Generate the LIME explanation for every class column of y_test, so the true
# class is always among the explained labels.
exp = explainer.explain_instance(
    text_to_explain,
    predict_fn,
    num_features=10,
    num_samples=5000,
    labels=list(range(y_test.shape[1])),
)

# Print the word weights for the class that is announced. as_list() defaults
# to label=1, so the label has to be passed explicitly; otherwise the printed
# weights belong to class 1 regardless of explanation_class.
print('Explanation for class', explanation_class)
print(exp.as_list(label=explanation_class))

# Visualize the explanation
exp.show_in_notebook(text=text_to_explain)

In [None]:
# Show the true 'Judgement Status' of the explained sample, for comparison
# with the explanation produced in the previous cell.
print(df_test['Judgement Status'].iloc[sample_index])