In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
cd /content/drive/MyDrive/Research_fellow/linking/Linking_Judgements/

/content/drive/MyDrive/Research_fellow/linking/Linking_Judgements


# **Preprocessing and segmenting judgement and timestamps**

In [4]:
import cleanJudgement as cj
import segementJudgement as sg
import keywordExtraction as key

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [5]:
# Load 'judgement.txt' through the project's cleaner and keep the preprocessed result.
judge = cj.cleanj('judgement.txt')
clean_judge = judge.preprocessDoc()

In [None]:
# Split the cleaned judgement into sections using headings hard-coded for this judgement.
section_names = [
    'Summary',
    'The background',
    'The facts of this case',
    'Preserving the status quo',
    'Conclusions in principle',
    ' The Outcome in this Case',
]
judge_all = sg.segmentJ(clean_judge, paragraphs=False)
secs = judge_all.sections(section_names)

# The first returned section is treated as the summary.
summary = secs[0]
# Summary joined into a single string: needed for the embeddings and the JSON file.
text_summary = ''.join(summary)
text_summary

In [None]:
# Segment the cleaned judgement into paragraphs.
# NOTE(review): this rebinds `judge_all`, previously the section-based segmenter.
judge_all = sg.segmentJ(clean_judge, paragraphs=True)
paragraphs = judge_all.paras()  # list of paragraphs, used for entailment

# Whitespace-delimited word count of each paragraph.
lengths_paragraphs = [len(paragraph.split()) for paragraph in paragraphs]

In [None]:
# Display the paragraph at index 1 to inspect the segmentation output.
paragraphs[1]

In [None]:
# Extract both quoted keywords and Blackstone named entities from a judgement section.
# The Summary section (secs[0]) is used here as the example.
extractor = key.extractkeywords()
summary_quotes = extractor.quotes_extract(secs[0])
# NOTE(review): 'summary.csv' is presumably the file holding the named entities - confirm in keywordExtraction.
full_list_summary = extractor.create_NE_lists('summary.csv',summary_quotes)
full_list_summary

In [None]:
# Clean the transcript and extract the timestamps related to the summary section.
import timestampExraction as ts

transcript = ts.timestamp('transcripts.txt')
clean_transcript = transcript.segment()
long_timestamps = transcript.longTimestamps(clean_transcript)

# Timestamps WITH their times, selected using the summary keyword/entity list.
summary_timestamps = transcript.extractTimestamps(long_timestamps, full_list_summary)
print(len(summary_timestamps))

# The same timestamps as text only, WITHOUT the times.
text_timestamp_summary = transcript.getText(summary_timestamps)
print(text_timestamp_summary[0])

52
In a parallel process going on, at the same time, chronologically cases leading to the Children Act 1989 made clear that hearsay was not admissible in cases concerning children other than in wardship. And in the written documentation in the Bundles, we've produced the case of Re(H), Re(K), which was the decision of the Court of Appeal in relation to, statutory proceedings, private law proceedings as the children. And that judgment made clear that, as I say chronologically in the lead up to the Children Act 1989 the courts decided that hearsay was not admissible in children proceedings. A parallel and similar decision was made in the Bradford case, in relation to proceedings in the juvenile court, the venue for care proceedings prior to implementation of the Children Act. And My Lord, I don't at this stage proposed necessary to go to those two cases. There's nothing that I seek to derived from them in particular, other than just setting us in the historical context of, here say not b

In [None]:
# Element 1 of every long-timestamp entry (presumably its text); used to compare
# against the results obtained without keyword extraction.
all_timestamps = [entry[1] for entry in long_timestamps]

In [None]:
# Number of entries collected in all_timestamps.
len(all_timestamps)

375

# **Document Similarity with tf-idf**







In [None]:
from featureExtraction.tfidf import tf_idf_similarity
import calculateSimilarity as cosine

In [None]:
# tf-idf similarity between the summary text and the summary-related timestamp texts.
results = tf_idf_similarity(text_summary, text_timestamp_summary)

In [None]:
# Write the top 20 tf-idf similarities to a JSON file.
cosine.get_results(results, 20, summary_timestamps, text_summary, 'Summary_linking')

# **Document Similarity with Pooling (Mean, Min and Max Pooling of GloVe Embeddings)**

In [None]:
import featureExtraction.doc_pool as docp
import evaluation2 as eval

In [None]:
!pip install flair

In [None]:
# Build the document-embedding helper from the summary text and the timestamp texts.
corpus = docp.embeddDoc(text_summary, text_timestamp_summary)

In [None]:
# Mean-pooled embeddings.
# NOTE(review): the min- and max-pooling cells bind the same two names, so only
# the pooling cell executed last is seen by the similarity cells further down.
summary_embedding, timestamps_embedding = corpus.get_embeddings_mean()

In [None]:
# Min-pooled embeddings (rebinds summary_embedding / timestamps_embedding).
summary_embedding, timestamps_embedding = corpus.get_embeddings_min()

In [None]:
# Max-pooled embeddings (rebinds summary_embedding / timestamps_embedding).
summary_embedding, timestamps_embedding = corpus.get_embeddings_max()

In [None]:
# Cosine similarities (sorted, with indexes) between timestamp and summary embeddings.
similarity = eval.calculate_cosine_similarity(timestamps_embedding, summary_embedding)

In [None]:
# Write the top 10 results to a JSON file.
eval.get_results(similarity, 10, summary_timestamps, text_summary, 'test2')

# **Document Similarity with RNN Embeddings (last hidden layer)**

In [None]:
import featureExtraction.doc_pool as docp

In [None]:
# RNN document embeddings (rebinds summary_embedding / timestamps_embedding).
summary_embedding, timestamps_embedding = corpus.get_embeddings_rnn()

In [None]:
# Cosine similarity for the RNN embeddings; the call is the cell's last
# expression, so its return value is shown as the cell output.
import evaluateTensors as evalt

evalt.calculate_cosine_similarity(timestamps_embedding, summary_embedding)

# **Semantic Search using 'all-MiniLM-L6-v2' (Entailment Embeddings fine-tuned on Multi-NLI)**

In [None]:
!pip install -U sentence-transformers

In [None]:
from featureExtraction.sentenceTrasformer import getEmbeddings

In [None]:
# Compare the summary with the timestamp texts using the 'all-MiniLM-L6-v2' model.
# NOTE(review): presumably returns the top 20 most relevant links by cosine
# similarity - confirm in featureExtraction.sentenceTrasformer.
results = getEmbeddings('all-MiniLM-L6-v2', text_summary, text_timestamp_summary)

In [None]:
# Display the sentence-transformer results computed in the previous cell.
results

# **Text Similarity using GPT-3 "text-search-ada-query-001" for sentence embeddings**

In [None]:
!pip install openai

In [None]:
import pandas as pd
import openai, numpy as np
from openai.embeddings_utils import get_embedding, cosine_similarity
from featureExtraction.GPT3Embeddings import GPT3_embeddings
import calculateSimilarity as cosine

In [None]:
# Never store the OpenAI key in the notebook itself: read it from the
# OPENAI_API_KEY environment variable and, when that is unset or empty,
# fall back to an interactive prompt that does not echo the input.
import os
from getpass import getpass

api_key = os.environ.get('OPENAI_API_KEY') or getpass('OpenAI API key: ')
openai.api_key = api_key

In [None]:
# Embed the summary and every timestamp text with the same OpenAI engine.
# NOTE(review): this rebinds `summary`, which the section-segmentation cell set to secs[0].
# NOTE(review): the "query" engine is used for both sides; confirm whether the
# timestamp texts should use the matching "doc" engine instead.
summary = GPT3_embeddings(text_summary, engine="text-search-ada-query-001")
timestamps_embeddings = [
    get_embedding(t, engine="text-search-ada-query-001")
    for t in text_timestamp_summary
]


In [None]:
# Cosine similarity between the summary embedding and each timestamp embedding.
results = cosine.get_cosine_simlarity(summary, timestamps_embeddings)



In [None]:
# Write the top 20 GPT-3 similarities to a JSON file.
# NOTE(review): the tf-idf section passes the same 'Summary_linking' name to
# get_results; if that argument is the output file name, this call overwrites
# the tf-idf results - confirm in calculateSimilarity.
cosine.get_results(results, 20, summary_timestamps, text_summary, 'Summary_linking')