In [1]:
cd /content/drive/MyDrive/Research_fellow/linking/Linking_Judgements/

/content/drive/MyDrive/Research_fellow/linking/Linking_Judgements


# **Preprocessing and segmenting judgement and timestamps**

In [2]:
import cleanJudgement as cj
import segementJudgement as sg
import keywordExtraction as key

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
# Build the cleaner for 'judgement.txt' (path relative to the working directory
# set in the first cell) and run its preprocessing step.
# `clean_judge` is the input to both segmentation cells below.
judge= cj.cleanj('judgement.txt')
clean_judge = judge.preprocessDoc()
# print(clean_judge)

In [4]:
# Segment the cleaned judgement using a hard-coded list of section headings.
# NOTE(review): the last heading keeps a leading space (' The Outcome in this Case');
# presumably it has to match the cleaned text exactly - confirm before normalising it.
section_names = ['Summary','The background','The facts of this case','Preserving the status quo','Conclusions in principle',' The Outcome in this Case']
judge_all = sg.segmentJ(clean_judge, paragraphs=False)
secs = judge_all.sections(section_names)
# First returned section (the Summary).
# NOTE: `summary` is rebound to an embedding later, in the GPT-3 section.
summary = secs[0]
# Summary as a single string; this is what every similarity method below compares against.
text_summary = ''.join(secs[0])  # needed for the embeddings and the JSON files
text_summary

' Summary Hilary Term[2010] UKSC 12On appeal from: [2010] EWCA Civ 57 JUDGMENT W (Children)  before  Lord WalkerLady HaleLord BrownLord ManceLord Kerr JUDGMENT GIVEN ON 3 March 2010 Heard on 1st and 2nd March 2010AppellantCharles Geekie QCMichael Liebrecht(Instructed by Dutton Gregory LLP)RespondentLucinda DavisSarah Earley(Instructed by The County Council Legal Services)RespondentKate Branigan QCMaggie Jones(Instructed by Larcombes LLP) LADY HALE giving the judgment of the court 1. At issue in this case are the principles which should guide the exercise of the court\'s discretion in deciding whether to order a child to attend to give evidence in family proceedings. The current approach was stated by Smith LJ in LM v Medway Council, RM and YM [2007] EWCA Civ 9, [2007] 1 FLR 1698, at para 44: "The correct starting point . . . is that it is undesirable that a child should have to give evidence in care proceedings and that particular justification will be required before that course is ta

In [5]:
# Segment the judgement into paragraphs.
# NOTE: this rebinds `judge_all`, which previously held the section-level segmenter.

judge_all = sg.segmentJ(clean_judge, paragraphs=True)
paragraphs = judge_all.paras() # list of paragraphs, used for entailment
# Whitespace-delimited word count of each paragraph.
lengths_paragraphs = [len(t.split()) for t in paragraphs]

In [6]:
paragraphs[3]

'4. All the parties in care proceedings are entitled to a fair hearing in the determination of their civil rights and obligations the parents who stand to lose their children if allegations of abuse are made out, the children who stand to lose their parents if allegations of abuse are made out, but also stand to suffer abuse or further abuse if they are left at home because those allegations cannot be proved. And it is not only their article 6 rights which are in play. The civil rights in issue are also Convention rights in themselves the right to respect for the family lives of the parents and their children but also the right to respect for the private lives of the children, which include their rights to be protected from attacks upon their physical and psychological integrity: X and Y v The Netherlands (1986) 8 EHRR 235. '

In [7]:
# cd .././blackstone_entities_judge/

In [8]:
# Extract the quoted keywords from a judgement section and combine them with the
# Blackstone named entities read from that section's CSV file.
# The Summary section is used as the example here.
# NOTE(review): the displayed list contains repeated entries (e.g. 'starting point'
# appears twice, plus a double-quoted variant); consider de-duplicating if the
# timestamp filter below does not rely on the repeats - confirm.
extractor = key.extractkeywords()
summary_quotes = extractor.quotes_extract(secs[0])
full_list_summary = extractor.create_NE_lists('.././blackstone_entities_judge/summary.csv',summary_quotes)
full_list_summary

['LM v Medway Council, RM',
 'EWCA Civ 9',
 '1 FLR 1698',
 'R v B County Council, ex parte P',
 '1 WLR 221',
 'P (Witness Summons)',
 '2 FLR 447',
 'SW v Portsmouth City Council; W',
 'EWCA 644',
 '3 FCR 1',
 'EWCA Civ 57',
 'Family Division',
 'X v Netherlands',
 '(1986) 8 EHRR 235',
 '(Identification: Restrictions Publication)',
 'UKHL 47',
 '1 AC 593',
 'article 10:"First',
 'proportionality test',
 'Childen Act',
 'starting point',
 'particular justification',
 'intense focus',
 'starting point',
 'particular justification',
 'stranger',
 'an intense focus',
 '"starting point"',
 '"particular justification"',
 '"stranger"',
 '"an intense focus"']

In [9]:
# Clean the transcript and extract the timestamps related to the Summary section.

import timestampExraction as ts

transcript = ts.timestamp('transcripts_with_links.txt')
clean_transcript = transcript.segment()
# print(clean_transcript)
# Exclude very short timestamps.
long_timestamps = transcript.longTimestamps(clean_transcript)
# print(len(long_timestamps))
# Keep only the timestamps that contain the summary keywords and entities.
summary_timestamps = transcript.extractTimestamps(long_timestamps,full_list_summary )  # timestamps WITH times
print(len(summary_timestamps))
# Text-only version of the summary-related timestamps, without the time spans.
text_timestamp_summary = transcript.getText(summary_timestamps) # timestamps WITHOUT times
print(text_timestamp_summary[0])

40
I'm able to grieve this, Miss Branigan better than taking you to mind, falling falling lines of principle can be derived from the four principal Authorities. Managing the child's judicial discretion, the starting point is its undesirable. The child to give evidence particular justification will be required, although there are some cases where it's right. I'm


In [10]:
# Baseline corpus: the text (element 1) of every long timestamp, with no keyword
# filtering applied. Use it to compare against the keyword-filtered results.
all_timestamps = [entry[1] for entry in long_timestamps]

In [11]:
len(all_timestamps)

375

# **Document Similarity with tf-idf**







In [None]:
from featureExtraction.tfidf import tf_idf_similarity
import calculateSimilarity as cosine

In [None]:
# TF-IDF similarity between the summary text and the keyword-filtered timestamp texts.
results = tf_idf_similarity(text_summary,text_timestamp_summary)

Calculating similarity : 100%|██████████| 40/40 [00:00<00:00, 75166.74it/s]


In [None]:
# Create a JSON file named 'Summary_linking_tfidf' with the top 20 similarities.
# `summary_timestamps` (the timestamps that still carry their times) is passed
# alongside the summary text.

cosine.get_results(results,20,summary_timestamps,text_summary,'Summary_linking_tfidf' )

# **Document Similarity with Pooling (Mean, Min and Max Pooling of GloVe Embeddings)**

In [None]:
!pip install flair

In [None]:
import featureExtraction.doc_pool as docp
import evaluationNumpy as eval

In [None]:
# Build the embedding helper from the summary text and the summary timestamp texts.
# `corpus` is reused by the pooling cells below and by the RNN section.
corpus = docp.embeddDoc(text_summary,text_timestamp_summary)

In [None]:
# Mean pooling.
# NOTE: does not work with GPU.
summary_embedding,timestamps_embedding = corpus.get_embeddings_mean()

In [None]:
# Min pooling. Rebinds `summary_embedding` / `timestamps_embedding` from the previous cell.
summary_embedding,timestamps_embedding = corpus.get_embeddings_min()

In [None]:
# Max pooling. Rebinds the same two names again, so on a top-to-bottom run these are
# the embeddings the similarity cell below uses (its results are saved as 'Summary_pooling_max').
summary_embedding,timestamps_embedding = corpus.get_embeddings_max()

In [None]:
# Get the similarities sorted in descending order, each paired with its timestamp index.
# NOTE(review): `calculate_cosine_similarity` is not imported under this bare name in
# any earlier cell; a later cell calls it as `evalt.calculate_cosine_similarity`.
# Confirm which module should provide it here, otherwise this raises NameError on a
# fresh kernel.
similarity = calculate_cosine_similarity(timestamps_embedding,summary_embedding)

Calculating similarity : 100%|██████████| 40/40 [00:00<00:00, 199017.98it/s]


In [None]:
similarity

[(0.9928705, 6),
 (0.9910352, 14),
 (0.99085575, 18),
 (0.99007887, 9),
 (0.9898157, 11),
 (0.989671, 39),
 (0.9890424, 1),
 (0.98850167, 36),
 (0.9884672, 10),
 (0.9883786, 7),
 (0.9883616, 12),
 (0.9883134, 20),
 (0.9879296, 25),
 (0.98752314, 13),
 (0.98749685, 19),
 (0.9872876, 23),
 (0.98717964, 15),
 (0.98685026, 16),
 (0.98645985, 27),
 (0.9860436, 17),
 (0.9858604, 2),
 (0.9852438, 3),
 (0.9851656, 5),
 (0.9851461, 8),
 (0.984355, 4),
 (0.98432815, 29),
 (0.98431444, 28),
 (0.98426, 0),
 (0.9835709, 38),
 (0.98346174, 26),
 (0.98275447, 34),
 (0.9824039, 21),
 (0.982389, 37),
 (0.98179066, 22),
 (0.9804166, 24),
 (0.97969323, 31),
 (0.979298, 33),
 (0.9769695, 35),
 (0.9720728, 30),
 (0.97016203, 32)]

In [None]:
# Write the top 20 results to a JSON file named 'Summary_pooling_max'.
# NOTE: the output name is fixed to "max"; change it when exporting mean/min pooling results.
eval.get_results(similarity,20,summary_timestamps,text_summary,'Summary_pooling_max')

# **Document Similarity with RNN Embeddings (last hidden layer)**

In [None]:
import featureExtraction.doc_pool as docp

In [None]:
# RNN document embeddings. Reuses `corpus` built in the pooling section and rebinds
# `summary_embedding` / `timestamps_embedding`.
summary_embedding,timestamps_embedding = corpus.get_embeddings_rnn()

In [None]:
import evaluateTensors as evalt
cosine = evalt.calculate_cosine_similarity(timestamps_embedding,summary_embedding)

In [None]:
# create a json file with the top 20 similarities
import json
evalt.get_results(cosine,20,summary_timestamps,text_summary,'Summary_linking_rnn' )

# **Semantic Search using 'all-MiniLM-L6-v2' (Entailment Embeddings fine-tuned on Multi-NLI)**

In [None]:
!pip install -U sentence-transformers

In [13]:
from featureExtraction.sentenceTrasformer import getEmbeddings
from sentence_transformers import SentenceTransformer,util

In [14]:
# Rank the summary timestamps against the summary text with cosine similarity.
# NOTE(review): the result is described as a list of tuples (ind,score) - confirm the
# element order against getEmbeddings.
# similarity_method can be util.cos_sim or util.dot_score
# NOTE: rebinds `results` from the tf-idf section.

results = getEmbeddings('all-MiniLM-L6-v2',text_summary,text_timestamp_summary,util.cos_sim)

In [16]:
from evaluationNumpy import get_results

In [17]:
# Create a JSON file named 'summary_test' from the top 20 results.
get_results(results,20,summary_timestamps,text_summary,'summary_test')

# **Text Similarity using GPT-3 "text-search-ada-query-001" for sentence embeddings**

In [None]:
!pip install openai

In [None]:
import pandas as pd
import openai, numpy as np
from openai.embeddings_utils import get_embedding, cosine_similarity
from featureExtraction.GPT3Embeddings import GPT3_embeddings
import calculateSimilarity as cosine

In [None]:
import os
from getpass import getpass

# Never hard-code the API key in the notebook: read it from the OPENAI_API_KEY
# environment variable, or prompt for it (input is not echoed) when it is not set.
# SECURITY: the key that was previously committed in this cell must be treated as
# leaked and revoked.
api_key = os.environ.get('OPENAI_API_KEY') or getpass('OpenAI API key: ')
openai.api_key = api_key

In [None]:
# Get the embeddings of the summary and of each summary-related timestamp.
# NOTE: rebinds `summary` (previously the Summary section, secs[0]) to its embedding.
# NOTE(review): both sides are embedded with the *query* engine
# "text-search-ada-query-001"; presumably the timestamps are the documents being
# searched - confirm whether a different engine was intended for them.

summary = GPT3_embeddings(text_summary,engine="text-search-ada-query-001" )
timestamps_embeddings = []
 
# One embedding request per timestamp text.
for t in text_timestamp_summary:
  timestamps_embeddings.append(get_embedding(t,engine="text-search-ada-query-001"))


In [None]:
# Get the cosine similarity between the summary embedding and every timestamp embedding.
# `cosine` is the `calculateSimilarity` module imported at the top of this section.

results = cosine.get_cosine_simlarity(summary,timestamps_embeddings)



In [None]:
# Create a JSON file named 'Summary_linking2' with the top 20 similarities.

cosine.get_results(results,20,summary_timestamps,text_summary,'Summary_linking2' )

# **Asymmetric Similarity with SBERT Sentence Embeddings with dot product**

In [None]:
!pip install -U sentence-transformers

In [None]:
# Rank the summary timestamps against the summary text using the dot product
# (util.dot_score), not cosine similarity.
# NOTE(review): the result is described as a list of tuples (ind,score) - confirm the
# element order against getEmbeddings.
# similarity_method can be util.cos_sim or util.dot_score

results = getEmbeddings('msmarco-roberta-base-ance-firstp',text_summary,text_timestamp_summary,util.dot_score)

In [None]:
# Create a JSON file from the top 20 results.
# NOTE(review): 'xxxxxxxxxxxx' looks like a placeholder output name - replace it with a
# descriptive one before running.
get_results(results,20,summary_timestamps,text_summary,'xxxxxxxxxxxx')

In [None]:
# Model for asymmetric similarity (needs a dot product)

from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('msmarco-roberta-base-ance-firstp')

In [None]:
summary_emb = model.encode(paragraphs[3]) # taking only the paragraph

In [None]:
summary_emb = model.encode(text_summary) # taking the whole summary

In [None]:
trans_emb = model.encode(text_timestamp_summary)

In [None]:
trans_emb.shape

(40, 768)

In [None]:
# calculating the dot product between 4 paragraph in the summary and the timestamps of the summary
hits = util.semantic_search(summary_emb, trans_emb, score_function=util.dot_score,top_k=20)

In [None]:
hits

In [None]:
# calculating the dot product between the whole summary and the timestamps of the summary
hits = util.semantic_search(summary_emb, trans_emb, score_function=util.dot_score,top_k=20)

In [None]:
hits

In [None]:
# Take the hits of the first (and only) query and order them by descending score.
# NOTE: rebinds `results` from the earlier sections.
results = sorted(hits[0], key=lambda x: x['score'],reverse=True)

In [None]:
# Turn each hit into a (timestamp text, summary text, score) tuple, keeping the
# order of `results`.
final_result = []

for d in results:
    ind = d['corpus_id']  # index of the hit within text_timestamp_summary
    score = d['score']
    final_result.append((text_timestamp_summary[ind], text_summary,score))
  

In [None]:
import json

# Split the ranked (timestamp text, summary text, score) tuples into parallel lists.
texts = [entry[0] for entry in final_result]
scores = [entry[2] for entry in final_result]

# Build the payload once, after the lists are complete. The original rebuilt it on
# every loop iteration and left `dictionary` undefined when `final_result` was empty.
# The 'Trancription' key spelling is kept as-is so existing consumers of these JSON
# files keep working.
dictionary = {
    'Summary': text_summary,
    'Trancription': [{'Score': s, 'text': t} for s, t in zip(scores, texts)],
}

with open('./jsonfiles/' + 'summary_semantic_search_dot' +'.json', 'w') as fout:
    json.dump(dictionary, fout)

# **Similarity with MS MARCO Cross-Encoders**