In [None]:
# Mount Google Drive in the Colab runtime; every data and output path used by
# later cells is built from FOLDER_NAME below.
from google.colab import drive

# force_remount=True asks for a fresh mount even if one already exists.
drive.mount('/content/drive', force_remount=True)
# Project root on Drive. Later cells append relative paths by plain string
# concatenation, so keep the trailing slash.
FOLDER_NAME = '/content/drive/My Drive/LegalSummarization/'

Mounted at /content/drive


Code provided in the repository

In [None]:
import spacy
from spacy.attrs import ORTH
import re
import string

def custom_sentencizer(doc):
    '''Pre-mark tokens that must NOT open a sentence, using periods as the only cue.

    For every token except the last two, the token that follows it is flagged
    with ``is_sent_start = False`` when either

    * the current token neither starts nor ends with a period, or
    * it does, the text is not entirely lowercase, and the follower either does
      not begin with an uppercase character or is itself followed by a token
      that begins with a period.

    All other tokens keep whatever ``is_sent_start`` value they already had.
    The same ``doc`` object is returned after being modified in place.
    '''
    # True only when every token's text passes str.islower().
    everything_lowercase = all(tok.text.islower() for tok in doc)

    # Stop two tokens early: the body looks ahead at idx + 1 and idx + 2.
    for idx in range(len(doc) - 2):
        current_text = doc[idx].text
        follower = doc[idx + 1]

        has_period = current_text[0] == "." or current_text[-1] == "."
        if not has_period:
            # No period here, so the next token cannot open a sentence.
            follower.is_sent_start = False
            continue

        if everything_lowercase:
            # Capitalisation carries no signal; leave the decision untouched.
            continue

        if not follower.text[0].isupper() or doc[idx + 2].text[0] == '.':
            follower.is_sent_start = False

    return doc




def custom_splitter(text = None):
    """Build the sentence-splitting spaCy pipeline and optionally apply it.

    Loads ``en_core_web_sm``, inserts ``custom_sentencizer`` before the parser
    and registers tokenizer special cases for legal abbreviations.

    Returns:
        The pipeline object alone when ``text`` is None; otherwise a tuple
        ``(sentences, nlp)`` where ``sentences`` is the list of sentence strings
        found in ``text`` after its newlines are replaced by spaces.
    """
    # NOTE(review): passing a callable to add_pipe looks like the spaCy v2 API;
    # confirm the installed spaCy version before upgrading.
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe(custom_sentencizer, before = "parser")

    # Abbreviation -> token text emitted by the tokenizer for that abbreviation.
    special_cases = {
        "Rs.": "rs.",
        "No.": "no.",
        "no.": "no.",
        "v.": "vs",
        "vs.": "vs",
        "i.e.": "i.e.",
        "viz.": "viz.",
        "M/s.": "m/s.",
        "Mohd.": "mohd.",
        "Ex.": "exhibit",
        "Art.": "article",
        "Arts.": "articles",
        "S.": "section",
        "s.": "section",
        "ss.": "sections",
        "u/s.": "section",
        "u/ss.": "sections",
        "art.": "article",
        "arts.": "articles",
        "u/arts.": "articles",
        "u/art.": "article",
        "hon'ble": "honourable",
    }
    for abbreviation, replacement in special_cases.items():
        nlp.tokenizer.add_special_case(abbreviation, [{ORTH: replacement}])

    if text is None:
        return nlp

    parsed = nlp(text.replace('\n', ' '))
    sentences = [span.text for span in parsed.sents]
    return sentences, nlp




class custom_tokenizer:
    """Word and sentence tokenizer built on the pipeline from ``custom_splitter``.

    Attributes set by ``__init__``:
        NLP:   the pipeline returned by ``custom_splitter()`` (called with no text).
        trans: a ``str.translate`` table that turns '.' and '-' into spaces and
               deletes every other character of ``string.punctuation``.
    """

    def __init__(self):
        self.NLP = custom_splitter()
        # Every punctuation character except '.' and '-' is deleted outright;
        # those two are mapped to spaces instead.
        puncts = string.punctuation.replace('.', '').replace('-', '')
        self.trans = str.maketrans('.-', '  ', puncts)

    def to_words(self, text):
        """Return the lowercase, punctuation-stripped tokens of ``text``.

        Newlines and whitespace runs are collapsed first. Tokens the pipeline
        marks as punctuation are dropped. Two-character tokens that start with
        an apostrophe (e.g. "'s") are kept as they are; every other token is
        passed through ``self.trans`` and stripped.
        """
        text = re.sub('\n', ' ', text.lower())
        # Raw string: '\s' in a normal literal is an invalid escape sequence.
        text = re.sub(r'\s+', ' ', text).strip()

        words = [
            s.text.lower() if s.text[0] == "'" and len(s.text) == 2
            else s.text.translate(self.trans).strip().lower()
            for s in self.NLP(text.strip())
            if not s.is_punct
        ]
        return words

    def to_sentences(self, text):
        """Split ``text`` into sentences, dropping those of 5 characters or fewer.

        Before splitting, runs of two or three dots (optionally separated by
        whitespace) become '. ', dashes become spaces, and whitespace is
        collapsed to single spaces.
        """
        # Collapse repeated dots ("...", ". .", "..") into one sentence break.
        text = re.sub(r'\.\s*\.\s*\.', '. ', text)
        text = re.sub(r'\.\s*\.', '. ', text)

        # Dashes become spaces so hyphenated parts turn into separate words.
        text = re.sub('-', ' ', text)

        # Normalise whitespace.
        text = re.sub('\n', ' ', text)
        text = re.sub(r'\s+', ' ', text).strip()

        sentences = [s.text for s in self.NLP(text).sents if len(s.text.strip()) > 5]
        return sentences

    def to_cleaned_sents(self, text):
        """Return one cleaned string per sentence of ``text``.

        Each sentence from ``to_sentences`` is tokenised with ``to_words``,
        re-joined with single spaces and terminated with '.'.
        """
        sents = self.to_sentences(text)
        words = [' '.join(self.to_words(s)) + '.' for s in sents]
        return words
        
       
        
       
        
class simple_tokenizer:
    """Dependency-free tokenizer that splits on whitespace and on '.'."""

    def to_words(self, s):
        """Trim whitespace and surrounding dots from ``s``, then split on whitespace."""
        trimmed = s.strip()
        trimmed = trimmed.strip('.')
        trimmed = trimmed.strip()
        return trimmed.split()

    def to_sentences(self, s):
        """Split ``s`` on '.', keep pieces longer than 5 characters, re-append '.'."""
        kept = []
        for piece in s.split('.'):
            core = piece.strip()
            if len(core) > 5:
                kept.append(core + '.')
        return kept

In [None]:
!pip install rouge

Collecting rouge
  Downloading rouge-1.0.1-py3-none-any.whl (13 kB)
Installing collected packages: rouge
Successfully installed rouge-1.0.1


In [None]:
import argparse
import os
from tqdm import tqdm
import json
import csv
from collections import defaultdict, Counter
from multiprocessing import Pool
from time import time
from rouge import Rouge
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize  
# Shared tokenizer instance used by remove_stopwords(); constructing it loads
# the spaCy pipeline via custom_splitter().
tokenizer = custom_tokenizer()
# Fetch the NLTK stopword corpus, then keep the English list as a set for
# membership tests.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))  

def remove_stopwords(sent):
    """Clean ``sent`` with the shared tokenizer and return it as one string.

    The text is split into cleaned sentences by
    ``tokenizer.to_cleaned_sents``; items that appear in ``stop_words`` are
    dropped and the rest are joined with single spaces. A lone ' ' element is
    appended before joining, so the result always ends with two spaces (and is
    ' ' for empty input).

    NOTE(review): the items compared against ``stop_words`` are whole cleaned
    sentences (each ending in '.'), not individual words, so in practice no
    stopword is removed. Behaviour is left unchanged because the ROUGE numbers
    produced by this notebook depend on it. Confirm the intended evaluation
    protocol before changing it.
    """
    cleaned_sents = tokenizer.to_cleaned_sents(sent)

    # Single pass; the original built this list twice and discarded the first.
    kept = [item for item in cleaned_sents if item not in stop_words]
    kept.append(' ')
    return " ".join(kept)

def getRouge(hyp, ref):
    """Return averaged ROUGE scores for hypothesis ``hyp`` against reference ``ref``.

    Returns an empty dict when either string is empty; otherwise whatever
    ``Rouge().get_scores(hyp, ref, avg=True)`` returns.
    """
    # Check the inputs first so no scorer is built when there is nothing to score.
    if len(hyp) == 0 or len(ref) == 0:
        return {}

    return Rouge().get_scores(hyp, ref, avg = True)

def dict_sum(cnt, d):
    """Merge every mapping in ``d`` into the entry of ``cnt`` under the same key.

    ``cnt[key].update(d[key])`` is called for each key of ``d``; ``cnt`` is
    modified in place and nothing is returned.
    """
    for key in d:
        cnt[key].update(d[key])
                
# ROUGE variants and statistics present in each per-file score dict.
_ROUGE_METRICS = ('rouge-1', 'rouge-2', 'rouge-l')
_ROUGE_STATS = ('p', 'r', 'f')


def _score_pair(fn, summdoc, golddoc):
    """Score one summary/reference pair; return the ROUGE dict or None.

    None is returned (after printing a message) when scoring raises or when
    getRouge reports nothing to score, so callers never reuse another file's
    scores or index into an empty dict.
    """
    try:
        scores = getRouge(summdoc, golddoc)
    except Exception as err:
        print("ERROR in", fn, "-", repr(err))
        return None
    if not scores:
        print("ERROR in", fn, "- empty summary or reference")
        return None
    return scores


def _mean_rouge(file_wise_rouge, n_files):
    """Average per-file ROUGE dicts over ``n_files`` and round to 4 decimals.

    Files missing from ``file_wise_rouge`` (failed scoring) still count in the
    denominator, i.e. they contribute zero. All means are 0.0 when ``n_files``
    is 0.
    """
    means = {metric: {stat: 0.0 for stat in _ROUGE_STATS} for metric in _ROUGE_METRICS}
    if n_files == 0:
        return means
    for metric in _ROUGE_METRICS:
        for stat in _ROUGE_STATS:
            total = sum(scores[metric][stat] for scores in file_wise_rouge.values())
            means[metric][stat] = round(total / n_files, 4)
    return means


def alt_rouge_scores(Summary, Data):
  """Average ROUGE of in-memory summaries against their references.

  Args:
    Summary: dict mapping file name -> list of generated summary lines.
    Data: dict mapping file name -> dict whose 'Summary' entry is the list of
      reference summary lines.

  Returns:
    Tuple ``(R2 recall, R2 F, RL recall, RL F)``, each averaged over every file
    in ``Summary`` and rounded to 4 decimals. Files that cannot be scored
    contribute zero. An empty ``Summary`` yields all zeros.
  """
  filelist = Summary.keys()
  file_wise_rouge = {}
  for fn in filelist:
    summdoc = remove_stopwords("".join(Summary[fn]).replace('\n', ' '))
    golddoc = remove_stopwords("".join(Data[fn]['Summary']).replace('\n', ' '))

    scores = _score_pair(fn, summdoc, golddoc)
    if scores is not None:
      file_wise_rouge[fn] = scores

  means = _mean_rouge(file_wise_rouge, len(filelist))
  return (means['rouge-2']['r'], means['rouge-2']['f'],
          means['rouge-l']['r'], means['rouge-l']['f'])


def rouge_scores(SUMMPATH,GOLDPATH,OUTFILE):
    """Score summary files on disk against reference files and write a report.

    For every file name in ``GOLDPATH`` the same-named file in ``SUMMPATH`` is
    read, both are passed through ``remove_stopwords`` and scored. ``OUTFILE``
    receives one JSON line per scored file followed by a tab-separated line
    with the averaged R2-R, R2-F, Rl-R and Rl-F. The same averages are printed.
    Files that cannot be scored are reported on stdout, get no JSON line, and
    contribute zero to the averages.
    """
    # Opened first so an unwritable OUTFILE fails before the slow scoring loop;
    # the context manager closes it even if scoring raises.
    with open(OUTFILE,"w") as fout:
        filelist = os.listdir(GOLDPATH)
        file_wise_rouge = {}
        for fn in tqdm(filelist):
            with open(os.path.join(SUMMPATH, fn),encoding="utf8") as fp:
                summdoc = remove_stopwords(fp.read().replace('\n', ' '))
            with open(os.path.join(GOLDPATH, fn),encoding="utf8") as fp:
                golddoc = remove_stopwords(fp.read().replace('\n', ' '))

            scores = _score_pair(fn, summdoc, golddoc)
            if scores is not None:
                file_wise_rouge[fn] = scores

        # Per-file records: the score dict is tagged with its file name.
        for fn, scores in file_wise_rouge.items():
            scores['file'] = fn
            print(json.dumps(scores), file = fout)

        means = _mean_rouge(file_wise_rouge, len(filelist))
        r2_r = means['rouge-2']['r']
        r2_f = means['rouge-2']['f']
        rl_r = means['rouge-l']['r']
        rl_f = means['rouge-l']['f']

        print("\n\n\n")
        print("ROUGE scores:")
        print("==========")
        print("R2-R: ", str(r2_r))
        print("R2-F: ", str(r2_f))
        print("Rl-R: ", str(rl_r))
        print("Rl-F: ", str(rl_f))

        fout.write("R2-R: "+ str(r2_r)+"\t")
        fout.write("R2-F: "+ str(r2_f)+"\t")
        fout.write("Rl-R: "+ str(rl_r)+"\t")
        fout.write("Rl-F: "+ str(rl_f))

    print("==========")
    print("Written to file "+OUTFILE)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
import os
import pandas as pd
# Loading Data
def LoadData(OrigTextFolder, SummaryFolder, StatsFile):
  """Load judgements, their reference summaries and target summary lengths.

  Args:
    OrigTextFolder: directory holding one judgement text file per document.
    SummaryFolder: directory holding reference summaries under the same names.
    StatsFile: CSV with a 'Column1' (file name) and a 'Column2' (length) column.

  Returns:
    ``(files, LengthSummaries)``. ``files`` maps file name ->
    ``{'Text': [lines], 'Summary': [lines]}`` and only contains documents that
    have both parts. ``LengthSummaries`` maps every judgement file name to the
    first matching 'Column2' value.

  Raises:
    IndexError: if a judgement file has no row in ``StatsFile``.
  """
  lenD = pd.read_csv(StatsFile)
  files = dict()
  LengthSummaries = dict()

  for file in os.listdir(OrigTextFolder):
    LengthSummaries[file] = list(lenD.loc[lenD['Column1']==file]['Column2'])[0]
    with open(os.path.join(OrigTextFolder, file), "r", encoding="utf8") as f:
      files[file] = {'Text': f.readlines()}

  for file in os.listdir(SummaryFolder):
    # A summary without a matching judgement cannot be used; skip it
    # explicitly rather than relying on a swallowed KeyError.
    if file not in files:
      continue
    with open(os.path.join(SummaryFolder, file), "r", encoding="utf8") as f:
      try:
        files[file]['Summary'] = f.readlines()
      except (OSError, UnicodeError) as err:
        # Best effort, as before: an unreadable summary drops the document.
        print("Skipping summary", file, "-", repr(err))
        continue

  # Build a new dict; deleting from `files` while iterating over it raises
  # "dictionary changed size during iteration".
  files = {name: doc for name, doc in files.items()
           if "Text" in doc and "Summary" in doc}
  return files, LengthSummaries

In [None]:
# Install sentence-transformers; a later cell imports SentenceTransformer from it.
!pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.1.0.tar.gz (78 kB)
[K     |████████████████████████████████| 78 kB 5.1 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 18.4 MB/s 
[?25hCollecting tokenizers>=0.10.3
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 44.2 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 55.2 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 8.9 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_

In [None]:
# Detect devices: `device` is used by the model-loading cell to place the
# sentence encoder on the GPU when one is available.
import torch
use_cuda = torch.cuda.is_available()                   # True when a CUDA GPU is visible
device = torch.device("cuda" if use_cuda else "cpu")   # fall back to CPU otherwise

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Sentence encoder used by Similarity(), moved to the device selected earlier.
model = SentenceTransformer('bert-base-nli-mean-tokens').to(device)

# Cache: sentence text -> embedding, filled lazily by Similarity() so each
# distinct sentence is encoded once.
Embeddings = dict()

# Similarity between 2 sentences
def Similarity(sentence1, sentence2):
  """Return the cosine similarity between the embeddings of two sentences.

  Embeddings come from ``model.encode`` and are memoised in the module-level
  ``Embeddings`` dict keyed by the sentence text, so each distinct sentence is
  encoded at most once.
  """
  vectors = []
  for sentence in (sentence1, sentence2):
    # Explicit cache check: a bare `except` around the lookup would also
    # swallow unrelated errors such as KeyboardInterrupt.
    if sentence not in Embeddings:
      Embeddings[sentence] = model.encode(sentence)
    vectors.append(Embeddings[sentence])

  # cosine_similarity takes two lists of vectors and returns a matrix; with one
  # vector on each side the single entry is at [0][0].
  return cosine_similarity([vectors[0]], [vectors[1]])[0][0]

Downloading:   0%|          | 0.00/391 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/399 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Asymmetric weight calculation
def WeightFact(i, j, k, lambdPositive, lambdNegative):
  """Pick the edge weight for the ordered sentence pair (i, j).

  The index distance is divided by ``k`` and truncated to select a bucket.
  The bucket is looked up in ``lambdNegative`` when ``i > j`` and in
  ``lambdPositive`` otherwise (including ``i == j``).
  """
  if i > j:
    return lambdNegative[int((i - j) / k)]
  return lambdPositive[int((j - i) / k)]

In [None]:
# Graph Construction
def GraphCons(data, k, lambdPositive, lambdNegative):
  """Build a weighted sentence graph for every document in ``data``.

  For each document, ``graph[key][i]`` is the list of ``(j, weight)`` pairs for
  all ``j != i``, where ``weight`` is ``WeightFact(i, j, m, ...)`` multiplied by
  ``Similarity`` of the two sentences and ``m`` is the number of sentences
  divided by ``k``, rounded up.
  """
  graph = {}
  for key in tqdm(data.keys()):
    sentences = data[key]['Text']
    count = len(sentences)
    # Bucket width: ceil(count / k).
    m = int((count + k - 1) / k)

    adjacency = {}
    for i, line in enumerate(sentences):
      edges = []
      for j in range(count):
        if j == i:
          continue
        weight = WeightFact(i, j, m, lambdPositive, lambdNegative) * Similarity(line, sentences[j])
        edges.append((j, weight))
      adjacency[i] = edges
    graph[key] = adjacency
  return graph

In [None]:
# Centrality scoring
def CentralityScores(graph):
  """Rank the nodes of every document graph by summed edge weight.

  Returns a dict mapping each document key to a list of ``(total, node)``
  tuples sorted in descending order.
  """
  scores = {}
  for key, adjacency in graph.items():
    ranked = []
    for node in adjacency:
      total = sum(edge[1] for edge in adjacency[node])
      ranked.append((total, node))
    scores[key] = sorted(ranked, reverse=True)
  return scores

In [None]:
def GenSummary(data, scores, LengthSumms):
  """Select top-ranked sentences for each document within its word budget.

  Sentences are taken in the order given by ``scores[key]``. Selection stops at
  the first sentence whose space-separated word count would push the running
  total above ``LengthSumms[key]``. Returns a dict mapping each document key to
  the list of selected sentences.
  """
  summaries = {}
  for key, value in data.items():
    summaries[key] = []
    sentences = value['Text']
    budget = LengthSumms[key]
    used = 0
    for entry in scores[key]:
      candidate = sentences[entry[1]]
      n_words = len(candidate.split(' '))
      if used + n_words > budget:
        break
      used += n_words
      summaries[key].append(candidate)
  return summaries

In [None]:
# Defining hyperparameters
# k: number of distance buckets; GraphCons derives the bucket width as
#    ceil(number of sentences / k).
# lambP: per-bucket weights WeightFact applies when i <= j.
# lambN: per-bucket weights WeightFact applies when i > j.
k = 3
lambP = [0.5, 1.5, 2.5]
lambN = [3.5, 4.5, 5.5]

In [None]:
# Data loading: paths for the sample run, all rooted at FOLDER_NAME.
IndiaDataFolder = FOLDER_NAME + "LEGSUMM/IN/test-data"
# Trailing slashes matter here: later cells build file paths by concatenation.
OrigTextFolder = IndiaDataFolder + "/judgement/"
SummaryFolder = IndiaDataFolder + "/summary/"
# NOTE(review): FOLDER_NAME already ends with '/', so this path contains a
# double slash; presumably harmless on the Colab filesystem, but confirm.
StatsFile = FOLDER_NAME + "/stats-IN-test.csv"
# Folder that receives the generated summaries of the sample run.
DestFolder = FOLDER_NAME + "GenSummsIn/"

Sample run with arbitrary hyperparameters on the test data

In [None]:
# Sample run: load documents, build the weighted sentence graph, rank the
# sentences and select a summary for each document. Graph construction is the
# slow step (see the progress bar below).
data, lenSumms = LoadData(OrigTextFolder, SummaryFolder, StatsFile)
graph = GraphCons(data, k, lambP, lambN)
scores = CentralityScores(graph)


summary = GenSummary(data, scores, lenSumms)

100%|██████████| 100/100 [59:57<00:00, 35.98s/it]


In [None]:
# Write each generated summary to DestFolder under its source file name.
# Lines are written back-to-back with no separator added.
for fileName, Summary in summary.items():
  with open(DestFolder+fileName, "w") as f:
    for line in Summary:
      f.write(line)

In [None]:
# Score the generated summaries in DestFolder against the references in
# SummaryFolder; per-file and averaged ROUGE are written to OUTFILE.
OUTFILE = FOLDER_NAME + "ScoresInTest1.txt"
rouge_scores(DestFolder,SummaryFolder,OUTFILE)

100%|██████████| 100/100 [17:49<00:00, 10.70s/it]





ROUGE scores:
R2-R:  0.7091
R2-F:  0.2914
Rl-R:  0.855
Rl-F:  0.4409
Written to file /content/drive/My Drive/LegalSummarization/ScoresInTest1.txt





Hyperparameter tuning using train data (note: the data-loading cell below currently points at the test-data folder)

In [None]:
from tqdm import tqdm
import random
# Hyper Parameter tuning

# Bucket width per document, written by AltGraph and read by AltCentralityScores.
lengths = dict()


def _rouge_objective(rs):
  """Scalar tuning objective: the sum of the averaged ROUGE values in ``rs``."""
  return sum(rs)


def optimize(rs, lambdPT, lambdNT, lamdbP, lambdN, currentMaximum):
  """Choose between the candidate weights and the best weights found so far.

  Args:
    rs: tuple of averaged ROUGE values for the candidate.
    lambdPT, lambdNT: candidate positive / negative weight lists.
    lamdbP, lambdN: best weight lists so far (the first name keeps its original
      spelling so the signature stays compatible).
    currentMaximum: objective value achieved by the best weights so far.

  Returns:
    ``(lambdPT, lambdNT)`` when the candidate's objective is strictly greater
    than ``currentMaximum``, otherwise ``(lamdbP, lambdN)`` unchanged.
  """
  if _rouge_objective(rs) > currentMaximum:
    return (lambdPT, lambdNT)
  # The previous best is returned as-is; the original referenced an unbound
  # local here and raised UnboundLocalError.
  return (lamdbP, lambdN)


def AltGraph(data, num_buckets=None):
  """Build an unweighted similarity graph for every document in ``data``.

  ``graph[key][i]`` is the list of ``(j, Similarity(sentence_i, sentence_j))``
  for all ``j != i``. The bucket width ``ceil(n_sentences / num_buckets)`` of
  each document is stored in the module-level ``lengths`` dict.

  Args:
    data: dict mapping document key -> dict with a 'Text' list of sentences.
    num_buckets: number of distance buckets; defaults to the notebook-level
      hyperparameter ``k`` when omitted.
  """
  if num_buckets is None:
    num_buckets = k
  graph = dict()
  for key in tqdm(data.keys()):
    text = data[key]['Text']
    lengths[key] = int((len(text) + num_buckets - 1) / num_buckets)
    graph[key] = {
      i: [(j, Similarity(line, text[j])) for j in range(len(text)) if i != j]
      for i, line in enumerate(text)
    }
  return graph


def AltCentralityScores(graph, k, lambP, lambN):
  """Rank sentences of each document, applying the asymmetric weights on the fly.

  Each edge ``(j, similarity)`` of node ``i`` contributes
  ``WeightFact(i, j, lengths[key], lambP, lambN) * similarity``. Returns a dict
  mapping document key -> list of ``(total, node)`` sorted in descending order.
  ``k`` is accepted for signature compatibility but not used; the bucket width
  comes from ``lengths``.
  """
  scores = dict()
  for key, value in graph.items():
    ranked = []
    for node, edges in value.items():
      total = 0
      for item in edges:
        total += WeightFact(node, item[0], lengths[key], lambP, lambN) * item[1]
      ranked.append((total, node))
    ranked.sort(reverse = True)
    scores[key] = ranked
  return scores


def Params(data, numIters, k, LengthSummaries, seed=None):
  """Random search over the asymmetric weights, maximising summed ROUGE.

  Each iteration draws ``k`` positive and ``k`` negative weights uniformly from
  [0, 1), sorts each list in ascending order, generates summaries and scores
  them with ``alt_rouge_scores``. The best-scoring pair is kept.

  Args:
    data: documents as returned by LoadData.
    numIters: number of random candidates to try.
    k: number of distance buckets (length of each weight list).
    LengthSummaries: dict mapping document key -> summary word budget.
    seed: optional seed for a private random generator, for reproducible runs.

  Returns:
    ``(lambdP, lambdN)`` for the best candidate, or ``(None, None)`` when no
    candidate achieved an objective above 0.
  """
  rng = random.Random(seed)
  currentMaximum = 0.0
  lambdP = None
  lambdN = None
  graph = AltGraph(data, k)
  for _ in tqdm(range(numIters)):
    lambdPT = sorted(rng.random() for _ in range(k))
    lambdNT = sorted(rng.random() for _ in range(k))
    scores = AltCentralityScores(graph, k, lambdPT, lambdNT)
    summary = GenSummary(data, scores, LengthSummaries)
    rs = alt_rouge_scores(summary,data)
    lambdP, lambdN = optimize(rs, lambdPT, lambdNT, lambdP, lambdN, currentMaximum)
    # Track the running best here: optimize() cannot update the caller's
    # value, so without this every candidate would replace the last one.
    currentMaximum = max(currentMaximum, _rouge_objective(rs))
  print(currentMaximum)
  return (lambdP, lambdN)

In [None]:
# Data loading: paths used by the tuning run and the final run below.
# NOTE(review): the section heading says tuning uses train data, but these
# paths point at the test-data folder and the test stats file. Confirm which
# split is intended; tuning on the evaluation split would inflate the scores.
IndiaDataFolder = FOLDER_NAME + "LEGSUMM/IN/test-data"
OrigTextFolder = IndiaDataFolder + "/judgement/"
SummaryFolder = IndiaDataFolder + "/summary/"
StatsFile = FOLDER_NAME + "/stats-IN-test.csv"
# Folder that receives the summaries generated with the tuned weights.
DestFolder = FOLDER_NAME + "GeneratedSummariesInTest/"

In [None]:
# Number of random weight candidates Params() evaluates.
numIters = 50


Data, lenSumms = LoadData(OrigTextFolder, SummaryFolder, StatsFile)

# Overwrites the hand-set lambP / lambN with the weights returned by the search.
lambP, lambN = Params(Data, numIters, k, lenSumms)

print(lambP)
print(lambN)

In [None]:
# Final run: same pipeline as the sample run, using the current lambP / lambN
# (set by the tuning cell when it has been executed).
data, lenSumms = LoadData(OrigTextFolder, SummaryFolder, StatsFile)
graph = GraphCons(data, k, lambP, lambN)
scores = CentralityScores(graph)


summary = GenSummary(data, scores, lenSumms)

In [None]:
# Write each generated summary to DestFolder under its source file name.
# Lines are written back-to-back with no separator added.
for fileName, Summary in summary.items():
  with open(DestFolder+fileName, "w") as f:
    for line in Summary:
      f.write(line)

In [None]:
# Score the tuned-weight summaries against the references and write per-file
# and averaged ROUGE to OUTFILE.
OUTFILE = FOLDER_NAME + "ScoresInTestTuned1.txt"
rouge_scores(DestFolder,SummaryFolder,OUTFILE)

In [None]:
# Show the weights that produced the scores above.
print(lambP)
print(lambN)