## **Reading the Dataset as a DataFrame**

In [None]:
# Mount Google Drive so that the dataset and output paths under
# /content/drive used later in this notebook can be resolved.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

In [None]:
import io
import pandas as pd

# Load the spreadsheet (first row -> column names, first column -> row index)
# and drop every row that has a missing value in any column.
df = pd.read_excel(
    '/content/drive/MyDrive/Final Year Paper Work/Complete Summarized Dataset.xlsx',
    header=0,
    index_col=0,
).dropna()

df.head()

## **Splitting the data for complete, extractive and abstractive summarized texts**

In [None]:
from sklearn.model_selection import train_test_split

# Each frame pairs one text variant with the shared 'Judgement Status' target.
com_sentences = df.loc[:, ['Judgement', 'Judgement Status']]                          # complete text
abs_sentences = df.loc[:, ['Abstractive Summarized Judgements', 'Judgement Status']]  # abstractive summary
ext_sentences = df.loc[:, ['Extractive Summarized Judgements', 'Judgement Status']]   # extractive summary

# 75/25 train/test split of every variant, using the same fixed seed each time.
df_train_com, df_test_com = train_test_split(
    com_sentences, random_state=42, test_size=0.25
)
df_train_abs, df_test_abs = train_test_split(
    abs_sentences, random_state=42, test_size=0.25
)
df_train_ext, df_test_ext = train_test_split(
    ext_sentences, random_state=42, test_size=0.25
)

## **Converting the Judgement Status to Categorical Values**

In [None]:
from tensorflow.keras.utils import to_categorical

# One-hot encode the 'Judgement Status' column of every train/test partition.
# The loop variable lives inside the generator expression, so no extra names
# are left behind in the notebook namespace.

# Complete Data
y_train_com, y_test_com = (
    to_categorical(part['Judgement Status']) for part in (df_train_com, df_test_com)
)

# Abstractive Data
y_train_abs, y_test_abs = (
    to_categorical(part['Judgement Status']) for part in (df_train_abs, df_test_abs)
)

# Extractive Data
y_train_ext, y_test_ext = (
    to_categorical(part['Judgement Status']) for part in (df_train_ext, df_test_ext)
)

## **Import Statements**

In [None]:
!pip install transformers

In [None]:
# Import Statements
import tensorflow as tf
import tensorflow_hub as hub
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import re
import unicodedata
import nltk
from nltk.corpus import stopwords
from tensorflow import keras
from tensorflow.keras.layers import Dense,Dropout, Input
from tqdm import tqdm
import pickle
from sklearn.metrics import confusion_matrix,f1_score,classification_report
import matplotlib.pyplot as plt
import itertools
from sklearn.utils import shuffle
from tensorflow.keras import regularizers
from transformers import BertConfig

In [None]:
import transformers
from transformers import DistilBertTokenizer, TFDistilBertModel

# Load the matching tokenizer and TensorFlow encoder for the uncased
# DistilBERT base checkpoint.
dbert_tokenizer = DistilBertTokenizer.from_pretrained(
    pretrained_model_name_or_path='distilbert-base-uncased'
)
dbert_model = TFDistilBertModel.from_pretrained(
    pretrained_model_name_or_path='distilbert-base-uncased'
)

## **1) Complete Data**

In [None]:
# Expose the complete judgement text and its outcome under the generic
# 'text' / 'label' column names used by the cells of this section.
# rename() returns a new frame, so `df` itself is left untouched.
df_com_dbert = df.rename(columns={'Judgement': 'text', 'Judgement Status': 'label'})
print('Available labels: ', df_com_dbert['label'].unique())

# Number of distinct label values present in the data.
num_classes_com_dbert = len(df_com_dbert['label'].unique())
df_com_dbert.head()

In [None]:
# Token budget per document used when encoding and when sizing the model inputs.
max_len = 100

# Model inputs (texts) and targets (labels) as separate Series.
labels_com_dbert = df_com_dbert['label']
sentences_com_dbert = df_com_dbert['text']

# Both lengths should match.
len(sentences_com_dbert), len(labels_com_dbert)

### **Checking if the Tokenizer is working well with our data by giving one sentence as input**

In [None]:
# Sanity-check the tokenizer on one document before encoding the whole corpus.
# .iloc picks the row by position, so this works whatever the row labels are
# (label-based `series[1]` raises KeyError if that label was dropped by dropna()).
sample_text_com = sentences_com_dbert.iloc[1]

# Show only the first word pieces; a full judgement can be very long.
print(dbert_tokenizer.tokenize(sample_text_com)[:20])

dbert_inp_com = dbert_tokenizer.encode_plus(
    sample_text_com,
    add_special_tokens=True,
    max_length=20,
    padding='max_length',  # replaces the deprecated pad_to_max_length=True
    truncation=True,
)
dbert_inp_com

In [None]:
# Run the pretrained encoder once on the single encoded sample.
# The 1-D id / mask vectors get a leading batch axis of size 1 at call time.
id_inp_com_dbert = np.asarray(dbert_inp_com['input_ids'])
mask_inp_com_dbert = np.asarray(dbert_inp_com['attention_mask'])

out_com_dbert = dbert_model([
    id_inp_com_dbert[np.newaxis, :],    # token ids, shape (1, n_tokens)
    mask_inp_com_dbert[np.newaxis, :],  # attention mask, same shape
])

type(out_com_dbert), out_com_dbert

In [None]:
# Round trip: map the ids back to text to eyeball special tokens and padding.
dbert_tokenizer.decode(token_ids=dbert_inp_com['input_ids'])

### **Since tokenizer is working well, run the tokenizer with all sentences**

In [None]:
def create_model(num_classes=4, checkpoint='distilbert-base-uncased'):
    """Build a DistilBERT-based text classifier with its own pretrained encoder.

    A fresh encoder is loaded on every call. Re-using the module-level
    ``dbert_model`` would make every classifier built in this notebook share
    (and keep fine-tuning) the same encoder weights, so a model trained later
    would start from weights already tuned on another experiment's data.

    Args:
        num_classes: Width of the softmax output layer (default 4, the value
            that was previously hard-coded).
        checkpoint: Name of the pretrained DistilBERT checkpoint to load.

    Returns:
        An uncompiled ``tf.keras.Model`` taking ``[input_ids, attention_mask]``
        (each of length ``max_len``) and returning class probabilities.
    """
    encoder = TFDistilBertModel.from_pretrained(checkpoint)

    inps = Input(shape=(max_len,), dtype='int64')
    masks = Input(shape=(max_len,), dtype='int64')

    # First element of the encoder output, sliced to the first token position,
    # is used as the document representation.
    dbert_layer = encoder(inps, attention_mask=masks)[0][:, 0, :]

    dense = Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.01))(dbert_layer)
    dropout = Dropout(0.5)(dense)
    pred = Dense(num_classes, activation='softmax', kernel_regularizer=regularizers.l2(0.01))(dropout)

    model = tf.keras.Model(inputs=[inps, masks], outputs=pred)
    # summary() prints the table itself and returns None; wrapping it in
    # print() added a stray "None" line to the output.
    model.summary()
    return model

model_com_dbert = create_model()

In [None]:
# Encode the whole corpus in one batched tokenizer call. Every document is
# truncated / padded to exactly max_len tokens. `padding='max_length'`
# replaces the deprecated `pad_to_max_length=True`.
encoded_com_dbert = dbert_tokenizer(
    list(sentences_com_dbert),
    add_special_tokens=True,
    max_length=max_len,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
)

input_ids_com_dbert = np.asarray(encoded_com_dbert['input_ids'])
attention_masks_com_dbert = np.asarray(encoded_com_dbert['attention_mask'])
labels_com_dbert = np.asarray(labels_com_dbert)

# Every row must be exactly max_len wide and the mask must line up with the ids.
assert input_ids_com_dbert.shape == (len(labels_com_dbert), max_len)
assert attention_masks_com_dbert.shape == input_ids_com_dbert.shape

len(input_ids_com_dbert), len(attention_masks_com_dbert), len(labels_com_dbert)

In [None]:
# Hold out 25% of the encoded documents for validation. The three arrays are
# split in one call so ids, labels and masks stay row-aligned. The fixed seed
# makes the split, and therefore the reported scores, reproducible.
(train_inp_com_dbert, val_inp_com_dbert,
 train_label_com_dbert, val_label_com_dbert,
 train_mask_com_dbert, val_mask_com_dbert) = train_test_split(
    input_ids_com_dbert,
    labels_com_dbert,
    attention_masks_com_dbert,
    test_size=0.25,
    random_state=42,
)

print('Train inp shape {} Val input shape {}\nTrain label shape {} Val label shape {}\nTrain attention mask shape {} Val attention mask shape {}'.format(train_inp_com_dbert.shape,val_inp_com_dbert.shape,train_label_com_dbert.shape,val_label_com_dbert.shape,train_mask_com_dbert.shape,val_mask_com_dbert.shape))

# The model's last layer is a softmax, so it outputs probabilities rather than
# logits; from_logits must therefore be False.
loss_com_dbert = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
metric_com_dbert = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer_com_dbert = tf.keras.optimizers.Adam(learning_rate=3e-5)

model_com_dbert.compile(loss=loss_com_dbert, optimizer=optimizer_com_dbert, metrics=[metric_com_dbert])

In [None]:
# Fine-tune on the training split and evaluate on the held-out split after
# every epoch; the returned History object feeds the plots further down.
history_com_dbert = model_com_dbert.fit(
    x=[train_inp_com_dbert, train_mask_com_dbert],
    y=train_label_com_dbert,
    validation_data=([val_inp_com_dbert, val_mask_com_dbert], val_label_com_dbert),
    epochs=10,
    batch_size=32,
)

In [None]:
# Per-class scores for every validation document.
preds_com_dbert = model_com_dbert.predict(
    x=[val_inp_com_dbert, val_mask_com_dbert],
    batch_size=16,
)

In [None]:
# The predicted class is the column with the highest score in each row.
pred_labels_com_dbert = np.argmax(preds_com_dbert, axis=1)

# Weighted F1 over the validation split.
f1_com_dbert = f1_score(
    y_true=val_label_com_dbert,
    y_pred=pred_labels_com_dbert,
    average='weighted',
)
print("F1 Score: ",f1_com_dbert)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

def plot_graphs(history, string):
    """Plot a training metric together with its validation counterpart.

    Args:
        history: Object whose ``.history`` mapping holds one list of per-epoch
            values per metric key (e.g. what ``model.fit`` returns).
        string: Metric key such as ``"accuracy"`` or ``"loss"``; the
            validation curve is read from ``'val_' + string``.
    """
    val_key = 'val_' + string

    # Training curve first, then validation, matching the legend order below.
    for series_key in (string, val_key):
        plt.plot(history.history[series_key])

    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()

# Learning curves for the complete-text model: accuracy first, then loss.
plot_graphs(history=history_com_dbert, string="accuracy")
plot_graphs(history=history_com_dbert, string="loss")

In [None]:
# Persist the fine-tuned complete-text classifier to Drive.
model_com_dbert.save(
    filepath="/content/drive/MyDrive/Final Year Paper Work/H5 Files/COM-DISTILBERT.h5"
)

## **2) Abstractive Summarized Data**


In [None]:
# Expose the abstractive summaries and the outcome under the generic
# 'text' / 'label' column names used by the cells of this section.
# rename() returns a new frame, so `df` itself is left untouched.
df_abs_dbert = df.rename(columns={'Judgement Status': 'label', 'Abstractive Summarized Judgements': 'text'})
print('Available labels: ', df_abs_dbert['label'].unique())

# Number of distinct label values. The correctly spelled name follows the
# num_classes_<variant>_dbert pattern of the other sections; the original
# double-underscore spelling is kept as an alias in case other cells use it.
num_classes_abs_dbert = num_classes_abs__dbert = len(df_abs_dbert['label'].unique())
df_abs_dbert.head()

In [None]:
# Token budget per document used when encoding and when sizing the model inputs.
max_len = 100

# Model inputs (texts) and targets (labels) as separate Series.
labels_abs_dbert = df_abs_dbert['label']
sentences_abs_dbert = df_abs_dbert['text']

# Both lengths should match.
len(sentences_abs_dbert), len(labels_abs_dbert)

### **Checking if the Tokenizer is working well with our data by giving one sentence as input**

In [None]:
# Sanity-check the tokenizer on one document before encoding the whole corpus.
# .iloc picks the row by position, so this works whatever the row labels are
# (label-based `series[1]` raises KeyError if that label was dropped by dropna()).
sample_text_abs = sentences_abs_dbert.iloc[1]

# Show only the first word pieces to keep the cell output short.
print(dbert_tokenizer.tokenize(sample_text_abs)[:20])

dbert_inp_abs = dbert_tokenizer.encode_plus(
    sample_text_abs,
    add_special_tokens=True,
    max_length=20,
    padding='max_length',  # replaces the deprecated pad_to_max_length=True
    truncation=True,
)
dbert_inp_abs

In [None]:
# Run the encoder once on the single encoded sample.
# The 1-D id / mask vectors get a leading batch axis of size 1 at call time.
id_inp_abs_dbert = np.asarray(dbert_inp_abs['input_ids'])
mask_inp_abs_dbert = np.asarray(dbert_inp_abs['attention_mask'])

out_abs_dbert = dbert_model([
    id_inp_abs_dbert[np.newaxis, :],    # token ids, shape (1, n_tokens)
    mask_inp_abs_dbert[np.newaxis, :],  # attention mask, same shape
])

type(out_abs_dbert), out_abs_dbert

In [None]:
# Round trip: map the ids back to text to eyeball special tokens and padding.
dbert_tokenizer.decode(token_ids=dbert_inp_abs['input_ids'])

### **Since tokenizer is working well, run the tokenizer with all sentences**

In [None]:
def create_model(num_classes=4, checkpoint='distilbert-base-uncased'):
    """Build a DistilBERT-based text classifier with its own pretrained encoder.

    A fresh encoder is loaded on every call. The module-level ``dbert_model``
    has already been fine-tuned by the model trained earlier in this notebook,
    so re-using it here would start this experiment from weights tuned on
    another experiment's data instead of from the pretrained checkpoint.

    Args:
        num_classes: Width of the softmax output layer (default 4, the value
            that was previously hard-coded).
        checkpoint: Name of the pretrained DistilBERT checkpoint to load.

    Returns:
        An uncompiled ``tf.keras.Model`` taking ``[input_ids, attention_mask]``
        (each of length ``max_len``) and returning class probabilities.
    """
    encoder = TFDistilBertModel.from_pretrained(checkpoint)

    inps = Input(shape=(max_len,), dtype='int64')
    masks = Input(shape=(max_len,), dtype='int64')

    # First element of the encoder output, sliced to the first token position,
    # is used as the document representation.
    dbert_layer = encoder(inps, attention_mask=masks)[0][:, 0, :]

    dense = Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.01))(dbert_layer)
    dropout = Dropout(0.5)(dense)
    pred = Dense(num_classes, activation='softmax', kernel_regularizer=regularizers.l2(0.01))(dropout)

    model = tf.keras.Model(inputs=[inps, masks], outputs=pred)
    # summary() prints the table itself and returns None; wrapping it in
    # print() added a stray "None" line to the output.
    model.summary()
    return model

model_abs_dbert = create_model()

In [None]:
# Encode the whole corpus in one batched tokenizer call. Every document is
# truncated / padded to exactly max_len tokens. `padding='max_length'`
# replaces the deprecated `pad_to_max_length=True`.
encoded_abs_dbert = dbert_tokenizer(
    list(sentences_abs_dbert),
    add_special_tokens=True,
    max_length=max_len,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
)

input_ids_abs_dbert = np.asarray(encoded_abs_dbert['input_ids'])
attention_masks_abs_dbert = np.asarray(encoded_abs_dbert['attention_mask'])
labels_abs_dbert = np.asarray(labels_abs_dbert)

# Every row must be exactly max_len wide and the mask must line up with the ids.
assert input_ids_abs_dbert.shape == (len(labels_abs_dbert), max_len)
assert attention_masks_abs_dbert.shape == input_ids_abs_dbert.shape

len(input_ids_abs_dbert), len(attention_masks_abs_dbert), len(labels_abs_dbert)

In [None]:
# Hold out 25% of the encoded documents for validation. The three arrays are
# split in one call so ids, labels and masks stay row-aligned. The fixed seed
# makes the split, and therefore the reported scores, reproducible.
(train_inp_abs_dbert, val_inp_abs_dbert,
 train_label_abs_dbert, val_label_abs_dbert,
 train_mask_abs_dbert, val_mask_abs_dbert) = train_test_split(
    input_ids_abs_dbert,
    labels_abs_dbert,
    attention_masks_abs_dbert,
    test_size=0.25,
    random_state=42,
)

print('Train inp shape {} Val input shape {}\nTrain label shape {} Val label shape {}\nTrain attention mask shape {} Val attention mask shape {}'.format(train_inp_abs_dbert.shape,val_inp_abs_dbert.shape,train_label_abs_dbert.shape,val_label_abs_dbert.shape,train_mask_abs_dbert.shape,val_mask_abs_dbert.shape))

# The model's last layer is a softmax, so it outputs probabilities rather than
# logits; from_logits must therefore be False.
loss_abs_dbert = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
metric_abs_dbert = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer_abs_dbert = tf.keras.optimizers.Adam(learning_rate=3e-5)

model_abs_dbert.compile(loss=loss_abs_dbert, optimizer=optimizer_abs_dbert, metrics=[metric_abs_dbert])

In [None]:
# Fine-tune on the training split and evaluate on the held-out split after
# every epoch; the returned History object feeds the plots further down.
history_abs_dbert = model_abs_dbert.fit(
    x=[train_inp_abs_dbert, train_mask_abs_dbert],
    y=train_label_abs_dbert,
    validation_data=([val_inp_abs_dbert, val_mask_abs_dbert], val_label_abs_dbert),
    epochs=10,
    batch_size=32,
)

In [None]:
# Per-class scores for every validation document.
preds_abs_dbert = model_abs_dbert.predict(
    x=[val_inp_abs_dbert, val_mask_abs_dbert],
    batch_size=16,
)

In [None]:
# The predicted class is the column with the highest score in each row.
pred_labels_abs_dbert = np.argmax(preds_abs_dbert, axis=1)

# Weighted F1 over the validation split.
f1_abs_dbert = f1_score(
    y_true=val_label_abs_dbert,
    y_pred=pred_labels_abs_dbert,
    average='weighted',
)
print("F1 Score: ",f1_abs_dbert)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# NOTE(review): an identical plot_graphs is already defined in the
# "Complete Data" section; a single shared definition would be enough.
def plot_graphs(history, string):
    """Plot a training metric together with its validation counterpart.

    Args:
        history: Object whose ``.history`` mapping holds one list of per-epoch
            values per metric key (e.g. what ``model.fit`` returns).
        string: Metric key such as ``"accuracy"`` or ``"loss"``; the
            validation curve is read from ``'val_' + string``.
    """
    val_key = 'val_' + string

    # Training curve first, then validation, matching the legend order below.
    for series_key in (string, val_key):
        plt.plot(history.history[series_key])

    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()

# Learning curves for the abstractive-summary model: accuracy first, then loss.
plot_graphs(history=history_abs_dbert, string="accuracy")
plot_graphs(history=history_abs_dbert, string="loss")

In [None]:
# Persist the fine-tuned abstractive-summary classifier to Drive.
model_abs_dbert.save(
    filepath="/content/drive/MyDrive/Final Year Paper Work/H5 Files/ABS-DISTILBERT.h5"
)

## **3) Extractive Summarized Data**

In [None]:
# Expose the extractive summaries and the outcome under the generic
# 'text' / 'label' column names used by the cells of this section.
# rename() returns a new frame, so `df` itself is left untouched.
df_ext_dbert = df.rename(columns={'Extractive Summarized Judgements': 'text', 'Judgement Status': 'label'})
print('Available labels: ', df_ext_dbert['label'].unique())

# Number of distinct label values present in the data.
num_classes_ext_dbert = len(df_ext_dbert['label'].unique())
df_ext_dbert.head()

In [None]:
# Token budget per document used when encoding and when sizing the model inputs.
max_len = 100

# Model inputs (texts) and targets (labels) as separate Series.
labels_ext_dbert = df_ext_dbert['label']
sentences_ext_dbert = df_ext_dbert['text']

# Both lengths should match.
len(sentences_ext_dbert), len(labels_ext_dbert)

### **Checking if the Tokenizer is working well with our data by giving one sentence as input**

In [None]:
# Sanity-check the tokenizer on one document before encoding the whole corpus.
# .iloc picks the row by position, so this works whatever the row labels are
# (label-based `series[1]` raises KeyError if that label was dropped by dropna()).
sample_text_ext = sentences_ext_dbert.iloc[1]

# Show only the first word pieces to keep the cell output short.
print(dbert_tokenizer.tokenize(sample_text_ext)[:20])

dbert_inp_ext = dbert_tokenizer.encode_plus(
    sample_text_ext,
    add_special_tokens=True,
    max_length=20,
    padding='max_length',  # replaces the deprecated pad_to_max_length=True
    truncation=True,
)
dbert_inp_ext

In [None]:
# Run the encoder once on the single encoded sample.
# The 1-D id / mask vectors get a leading batch axis of size 1 at call time.
id_inp_ext_dbert = np.asarray(dbert_inp_ext['input_ids'])
mask_inp_ext_dbert = np.asarray(dbert_inp_ext['attention_mask'])

out_ext_dbert = dbert_model([
    id_inp_ext_dbert[np.newaxis, :],    # token ids, shape (1, n_tokens)
    mask_inp_ext_dbert[np.newaxis, :],  # attention mask, same shape
])

type(out_ext_dbert), out_ext_dbert

In [None]:
# Round trip: map the ids back to text to eyeball special tokens and padding.
dbert_tokenizer.decode(token_ids=dbert_inp_ext['input_ids'])

### **Since tokenizer is working well, run the tokenizer with all sentences**

In [None]:
def create_model(num_classes=4, checkpoint='distilbert-base-uncased'):
    """Build a DistilBERT-based text classifier with its own pretrained encoder.

    A fresh encoder is loaded on every call. The module-level ``dbert_model``
    has already been fine-tuned by the models trained earlier in this
    notebook, so re-using it here would start this experiment from weights
    tuned on other experiments' data instead of from the pretrained checkpoint.

    Args:
        num_classes: Width of the softmax output layer (default 4, the value
            that was previously hard-coded).
        checkpoint: Name of the pretrained DistilBERT checkpoint to load.

    Returns:
        An uncompiled ``tf.keras.Model`` taking ``[input_ids, attention_mask]``
        (each of length ``max_len``) and returning class probabilities.
    """
    encoder = TFDistilBertModel.from_pretrained(checkpoint)

    inps = Input(shape=(max_len,), dtype='int64')
    masks = Input(shape=(max_len,), dtype='int64')

    # First element of the encoder output, sliced to the first token position,
    # is used as the document representation.
    dbert_layer = encoder(inps, attention_mask=masks)[0][:, 0, :]

    dense = Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.01))(dbert_layer)
    dropout = Dropout(0.5)(dense)
    pred = Dense(num_classes, activation='softmax', kernel_regularizer=regularizers.l2(0.01))(dropout)

    model = tf.keras.Model(inputs=[inps, masks], outputs=pred)
    # summary() prints the table itself and returns None; wrapping it in
    # print() added a stray "None" line to the output.
    model.summary()
    return model

model_ext_dbert = create_model()

In [None]:
# Encode the whole corpus in one batched tokenizer call. Every document is
# truncated / padded to exactly max_len tokens. `padding='max_length'`
# replaces the deprecated `pad_to_max_length=True`.
encoded_ext_dbert = dbert_tokenizer(
    list(sentences_ext_dbert),
    add_special_tokens=True,
    max_length=max_len,
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
)

input_ids_ext_dbert = np.asarray(encoded_ext_dbert['input_ids'])
attention_masks_ext_dbert = np.asarray(encoded_ext_dbert['attention_mask'])
labels_ext_dbert = np.asarray(labels_ext_dbert)

# Every row must be exactly max_len wide and the mask must line up with the ids.
assert input_ids_ext_dbert.shape == (len(labels_ext_dbert), max_len)
assert attention_masks_ext_dbert.shape == input_ids_ext_dbert.shape

len(input_ids_ext_dbert), len(attention_masks_ext_dbert), len(labels_ext_dbert)

In [None]:
# Hold out 25% of the encoded documents for validation. The three arrays are
# split in one call so ids, labels and masks stay row-aligned. The fixed seed
# makes the split, and therefore the reported scores, reproducible.
(train_inp_ext_dbert, val_inp_ext_dbert,
 train_label_ext_dbert, val_label_ext_dbert,
 train_mask_ext_dbert, val_mask_ext_dbert) = train_test_split(
    input_ids_ext_dbert,
    labels_ext_dbert,
    attention_masks_ext_dbert,
    test_size=0.25,
    random_state=42,
)

print('Train inp shape {} Val input shape {}\nTrain label shape {} Val label shape {}\nTrain attention mask shape {} Val attention mask shape {}'.format(train_inp_ext_dbert.shape,val_inp_ext_dbert.shape,train_label_ext_dbert.shape,val_label_ext_dbert.shape,train_mask_ext_dbert.shape,val_mask_ext_dbert.shape))

# The model's last layer is a softmax, so it outputs probabilities rather than
# logits; from_logits must therefore be False.
loss_ext_dbert = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=False)
metric_ext_dbert = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
optimizer_ext_dbert = tf.keras.optimizers.Adam(learning_rate=3e-5)

model_ext_dbert.compile(loss=loss_ext_dbert, optimizer=optimizer_ext_dbert, metrics=[metric_ext_dbert])

In [None]:
# Fine-tune on the training split and evaluate on the held-out split after
# every epoch; the returned History object feeds the plots further down.
history_ext_dbert = model_ext_dbert.fit(
    x=[train_inp_ext_dbert, train_mask_ext_dbert],
    y=train_label_ext_dbert,
    validation_data=([val_inp_ext_dbert, val_mask_ext_dbert], val_label_ext_dbert),
    epochs=10,
    batch_size=32,
)

In [None]:
# Per-class scores for every validation document.
# The validation ids are `val_inp_ext_dbert` (as produced by the split cell);
# the previous `val_inp_dbert_ext_dbert` was never defined and raised NameError.
preds_ext_dbert = model_ext_dbert.predict([val_inp_ext_dbert,val_mask_ext_dbert],batch_size=16)

In [None]:
# The predicted class is the column with the highest score in each row.
pred_labels_ext_dbert = np.argmax(preds_ext_dbert, axis=1)

# Weighted F1 over the validation split.
f1_ext_dbert = f1_score(
    y_true=val_label_ext_dbert,
    y_pred=pred_labels_ext_dbert,
    average='weighted',
)
print("F1 Score: ",f1_ext_dbert)

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# NOTE(review): an identical plot_graphs is already defined in the earlier
# sections of this notebook; a single shared definition would be enough.
def plot_graphs(history, string):
    """Plot a training metric together with its validation counterpart.

    Args:
        history: Object whose ``.history`` mapping holds one list of per-epoch
            values per metric key (e.g. what ``model.fit`` returns).
        string: Metric key such as ``"accuracy"`` or ``"loss"``; the
            validation curve is read from ``'val_' + string``.
    """
    val_key = 'val_' + string

    # Training curve first, then validation, matching the legend order below.
    for series_key in (string, val_key):
        plt.plot(history.history[series_key])

    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, val_key])
    plt.show()

# Learning curves for the extractive-summary model: accuracy first, then loss.
plot_graphs(history=history_ext_dbert, string="accuracy")
plot_graphs(history=history_ext_dbert, string="loss")

In [None]:
# Persist the fine-tuned extractive-summary classifier to Drive.
model_ext_dbert.save(
    filepath="/content/drive/MyDrive/Final Year Paper Work/H5 Files/EXT-DISTILBERT.h5"
)