In [None]:
cd yourlocation/Linking_Judgements/

# **Preprocessing and segmenting judgement and timestamps**

In [None]:
import cleanJudgement as cj
import segementJudgement as sg
import keywordExtraction as key

In [None]:
# Clean the judgement text.
# Placeholder: replace the string with the judgement file (txt format with integer-numbered paragraphs).
# NOTE(review): assumes cj.cleanj accepts the file path as a string — confirm against cleanJudgement.
judgement_file = "file of judgement in txt format with integered paragraphs"
judge = cj.cleanj(judgement_file)
clean_judge = judge.preprocessDoc()
# print(clean_judge)

In [None]:
# Option 1: segment judgement with paragraphs

judge_all = sg.segmentJ(clean_judge, paragraphs=True)
paragraphs = judge_all.paras() #list of paragraphs use for entailement
# get the length of the paragraphs
lengths_paragraphs = [len(t.split()) for t in paragraphs]

In [None]:
# Option2: segment judgement with hard-coded sections
# section_names = ['Summary','The background','The facts of this case','Preserving the status quo','Conclusions in principle',' The Outcome in this Case']
# judge_all = sg.segmentJ(clean_judge, paragraphs=False)
# secs = judge_all.sections(section_names)
# summary = secs[0]
# # text only summary
# text_summary = ''.join(secs[0])  #NEEDED FOR EMBEDDING AND JSON FILE
# text_summary

In [None]:
# Drop the first paragraph when it only holds the date lines (inspect the first paragraph before running).
# Slicing from judge_all.paras() instead of from `paragraphs` itself keeps this cell idempotent:
# re-running it no longer removes one more paragraph each time.
# NOTE(review): assumes judge_all.paras() returns the same list on every call — confirm.
paragraphs = judge_all.paras()[1:]
paragraphs

**Creating windows of paragraphs**

In [None]:
# Whitespace-token length of every paragraph, used to check inputs for GPT 3 and other models.
from matplotlib import pyplot as plt

lengths_paragraphs = [len(t.split()) for t in paragraphs]
# A bare expression in the middle of a cell is not displayed, so print the maximum explicitly.
print(max(lengths_paragraphs))

# Histogram of the paragraph lengths (10 bins).
plt.hist(lengths_paragraphs, 10)

plt.show()
                          

In [None]:
# Get average length
def Average(lst):
    """Return the arithmetic mean of the numbers in ``lst``.

    :param lst: non-empty sequence of numbers
    :return: sum of the items divided by their count
    """
    total = sum(lst)
    count = len(lst)
    return total / count
Average(lengths_paragraphs)


In [None]:
# Group the paragraphs into consecutive, non-overlapping segments of 3
chunks = [paragraphs[start:start + 3] for start in range(0, len(paragraphs), 3)]
# Join the paragraphs of each segment with a new line followed by a space
window_paras_strings = list(map("\n ".join, chunks))
# Whitespace-token length of every segment
lengths_windows = [len(window.split()) for window in window_paras_strings]
# Report the longest, the shortest and the mean segment length, one per line
print(max(lengths_windows))
print(min(lengths_windows))
print(Average(lengths_windows))

**Clean timestamps**

In [None]:
#Clean and extract timestamps related to the summary section

import timestampExraction as ts

# Placeholder: replace the string with the txt file holding the collated csv files for the sessions.
# NOTE(review): assumes ts.timestamp accepts the file path as a string — confirm against timestampExraction.
transcript_file = "txt file for collated csv files for sessions"
transcript = ts.timestamp(transcript_file)
clean_transcript = transcript.segment()
# print(clean_transcript)
# exclude very short timestamps and get timestamps list with timestamps with +50 tokens
long_timestamps = transcript.longTimestamps(clean_transcript)  # with timestamp[s]
# # print(len(long_timestamps))

# # get the text only timestamps related to the summary section without the timespans
text_timestamp = transcript.getText(long_timestamps) #TIMESTAMPS WITHOUT TIMES
# print(text_timestamp[0])

In [None]:
# Related to option 2 above: Extract links that has the same entities as the judgement paragraphs/sections
#extract both quoted keywords and BLACKSTONE NEs from judgement sections
# The Summary list as an example
# extractor = key.extractkeywords()
# summary_quotes = extractor.quotes_extract(secs[0])
# full_list_summary = extractor.create_NE_lists(csv file with entities,summary_quotes)
# full_list_summary

 # get only the timetamps that has the summary keywords and entities
# summary_timestamps = transcript.extractTimestamps(long_timestamps,full_list_summary )  #TIMESTAMPS WITH TIME
# print(len(summary_timestamps))

# Get Embedding files for judgement paragraphs and timestamps

In [None]:
!pip install tiktoken

In [None]:
!pip install openai

In [None]:
# imports
import pandas as pd
import tiktoken
import openai

from openai.embeddings_utils import get_embedding

In [None]:
# We save the embeddigns of the paragraphs and the timestamps and then calculate the dot product

In [None]:
# embedding model parameters
embedding_model = "text-embedding-ada-002"  # text embedding
embedding_encoding = "cl100k_base"  # this the encoding for text-embedding-ada-002
max_tokens = 8000  # the maximum for text-embedding-ada-002 is 8191

In [None]:
# Read the OpenAI key from the OPENAI_API_KEY environment variable instead of hard-coding it,
# so that the secret is never saved inside the notebook; prompt for it only when the variable is unset.
import os
from getpass import getpass

api_key = os.environ.get("OPENAI_API_KEY") or getpass("OpenAI API key: ")
openai.api_key = api_key

**Get judgement paragraph embeddings**

In [None]:
df_judge = pd.DataFrame(window_paras_strings, columns =['text'])
df_judge.head()

In [None]:
# get num of tokens per the model tokenizer
encoding = tiktoken.get_encoding(embedding_encoding)
df_judge["n_tokens"] = df_judge.text.apply(lambda x: len(encoding.encode(x)))
print(df_judge['n_tokens'].max())

**Save GPT embeddings for future use to avoid paying every time you make an API request**

In [None]:
cd to an embedding forlder you create
!mkdir make a directory for each case

In [None]:
# Ensure you have your API key set in your environment per the README: https://github.com/openai/openai-python#usage
# Embeddings for paragraphs
# This may take a few minutes
df_judge["embedding"] = df_judge.text.apply(lambda x: get_embedding(x, engine=embedding_model))
df_judge.to_csv("./name of directory above/name of the case.csv")

In [None]:
#  it is better to save the embeddings in a pickle file to avoid problems of data value change
import pickle
df_judge.to_pickle("./name of directory above/name of the case.pkl")

**Get timestamps embeddings**

In [None]:
df_times = pd.DataFrame(text_timestamp, columns =['text'])

# get num of tokens per the model tokenizer
df_times["n_tokens"] = df_times.text.apply(lambda x: len(encoding.encode(x)))
print(df_times['n_tokens'].max())

In [None]:
# add timestamps in the data frame

times = [t[0] for t in long_timestamps]

df_times['timestamps'] = times 

In [None]:
# Embeddings for timestamps
# This may take a few minutes
df_times["embedding"] = df_times.text.apply(lambda x: get_embedding(x, engine=embedding_model))
#debug to see that all timestamps has been embedded
df_times['embedding'].isnull().sum()


In [None]:
#save embedding files for timestamps in csv and pkl
df_times.to_csv("./name of directory above/name of the case.csv")
df_times.to_pickle("./name of directory above/name of the case.pkl")

# **Calculate pairwise similarity between embeddings of paragraphs and timestamps**

In [None]:
import pandas as pd
import numpy as np

In [None]:
cd to the embedding folder where you have the embeddings stored

In [None]:
# Load the timestamp embeddings from the pickle file.
# pd.read_pickle only needs pandas, which this section imports; the bare `pickle` module is not
# imported in this section, so pickle.load fails when the notebook is resumed from here.
# Only load pickle files you created yourself: unpickling can execute arbitrary code.
df_times = pd.read_pickle('embeddings of timestamsps.pkl')
df_times.head()

In [None]:
#same as above cell but from CSV

# get the embeddings of timestamps
# datafile_path = 'embeddings of timestamsps.pkl'

# df_times = pd.read_csv(datafile_path)



In [None]:
# create a list of embeddings to get indices of timestamps
list_embeddings = df_times["embedding"].to_list()

In [None]:
# Load the judgement embeddings from the pickle file.
# pd.read_pickle only needs pandas, which this section imports; the bare `pickle` module is not
# imported in this section, so pickle.load fails when the notebook is resumed from here.
# Only load pickle files you created yourself: unpickling can execute arbitrary code.
df_judge = pd.read_pickle('file for judgement embeddings.pkl')
df_judge.head()

In [None]:
# same as above but with CSV
import ast

datafile_path = "file for judgement embeddings.csv"

df_judge = pd.read_csv(datafile_path)
# The CSV stores every embedding as the text of a Python list. ast.literal_eval parses such
# literals without executing arbitrary code from the file, and it keeps working after the name
# `eval` is rebound by the `import evaluationNumpy as eval` cell later in this notebook.
df_judge["embedding"] = df_judge.embedding.apply(ast.literal_eval).apply(np.array)

In [None]:
from openai.embeddings_utils import cosine_similarity

# caluculate the similarity of each judgement segment with each embedding for a timestamp in the list of timestamps and add index of timestamp


# CS[i] holds, for judgement segment i, the (similarity, timestamp index) pairs of all
# timestamps, sorted from the highest pair to the lowest.
CS = []
for i in range(len(df_judge['embedding'])):
    ranked = sorted(
        (
            (cosine_similarity(df_judge['embedding'][i], timestamp_embedding), n)
            for n, timestamp_embedding in enumerate(list_embeddings)
        ),
        reverse=True,
    )
    CS.append(ranked)



**Save cosine similarities for future use**

In [None]:
import csv

# Save the similarities in long format: one CSV row per (judgement segment, timestamp) pair,
# so that every row has exactly the columns named in the header. Passing a whole ranking to a
# single writerow call would instead put one stringified (score, index) tuple in each column.
with open('cosinesimialrity.csv','w',newline='') as out:
    csv_out=csv.writer(out)
    csv_out.writerow(['segment','cosine_similarity','index'])
    for segment, ranking in enumerate(CS):
        for score, timestamp_index in ranking:
            csv_out.writerow([segment, score, timestamp_index])


# **Create Json file without keyword tags**

In [None]:
# a function to take the cosine similarity (list of 20), the list of long_times, the text of the segment and the name of the segment and the id of the segment and returns a dictionary

# def get_dics(cosine_sim,timestamps_ls, section,file_name, id):
#     times = []
#     texts = []
#     for t in cosine_sim:
#         text = timestamps_ls[t[1]][1]
#         time = timestamps_ls[t[1]][0]
#         times.append(time)
#         texts.append(text)
#         dictionary = {file_name:section}
#         dictionary['Transcription'] = [{'time': times_sp, 'text': trans_te} for times_sp, trans_te in zip(times, texts)]

#     data_id= {'Segment_id': id}
#     data_id.update(dictionary)
#     return data_id

In [None]:
# the same as above but working with the dfs

def get_dics_withdf(cosine_sim,times_df, section,file_name, id):
    """Build the JSON-ready record that links one judgement segment to its timestamps.

    :param cosine_sim: iterable of (score, index) pairs; only the index (second item) is read,
        and the given order is kept in the output
    :param times_df: table with a 'text' and a 'timestamps' column, each looked up by that index
    :param section: text of the judgement segment
    :param file_name: key under which ``section`` is stored in the record
    :param id: value stored under 'Segment_id'
    :return: dict with the keys 'Segment_id', ``file_name`` and 'Transcription'; 'Transcription'
        is a list of {'time': ..., 'text': ...} dicts, empty when ``cosine_sim`` is empty
    """
    transcription = []
    for pair in cosine_sim:
        text = times_df['text'][pair[1]]
        time = times_df['timestamps'][pair[1]]
        transcription.append({'time': time, 'text': text})

    # The record is assembled once, after the loop, so an empty ranking still yields a valid
    # record instead of failing on a variable that was only assigned inside the loop.
    data_id = {'Segment_id': id}
    data_id.update({file_name: section, 'Transcription': transcription})
    return data_id

In [None]:
#create a dictionary of dictionaries for all the paragraph segments with the dfs

# CS2 is not assigned in any visible cell, so it is derived here from the full rankings in CS.
# NOTE(review): assumes CS2 is meant to hold the first 20 links of every segment, as the comments
# about "list of 20" / "first 20 links" suggest — confirm, and change TOP_LINKS if needed.
TOP_LINKS = 20
CS2 = [ranking[:TOP_LINKS] for ranking in CS]

# enumerate pairs every ranking with its own position; CS2.index(c) would return the position of
# the first equal ranking, which is wrong when two segments share a ranking and is quadratic.
dicts2 = []
for ind, c in enumerate(CS2):
    # segment ids start at 1
    dict_temp = get_dics_withdf(c, df_times, df_judge['text'][ind], 'Segment', ind + 1)
    dicts2.append(dict_temp)

# **Create Json file with keyword tags**

**Add keyword tag to entities in judgement and timestamps**



In [None]:
import pandas as pd
import re

# Placeholder: replace the string with the csv file of entities extracted from the judgement.
# The file is read without a header row; its two columns are named 'word' and 'label'.
entities_csv = "csv file with entities extracted from judgement"
entities = pd.read_csv(entities_csv, names=['word','label'])

In [None]:
entities_list = entities['word'].to_list()

**Clean the entities list. Use this function if the extractor module is not used in the option 2 above. You can uncomment or comment as many of the cleaning lines as needed**

In [None]:
import re
#clean list
def create_NE_lists(list_ents):
    """Filter and de-duplicate a list of entity strings.

    Drops the literal entry 'I agree' and every entry of 10 characters or fewer, then removes
    duplicates while keeping the order of first occurrence.

    :param list_ents: list of entity strings
    :return: the filtered list without duplicates
    """
    # Mandatory filters in one pass: not 'I agree', and longer than 10 characters (excludes short NEs).
    new_ents = [ent for ent in list_ents if ent != 'I agree' and len(ent) > 10]

    # Optional cleaning steps -- uncomment as many as needed:
    # printable = set(string.printable)  # clean non-printable characters
    # new_ents = ["".join(filter(lambda c: c in printable, ent)) for ent in new_ents]
    # new_ents = remove_stopwords(new_ents)  # clean stopwords
    # new_ents = [re.sub(r"\[\d+\]\s+", "", ent) for ent in new_ents]  # clean patterns like [2008]
    # new_ents = [re.sub(r"\(\d+\)\s*", "", ent) for ent in new_ents]  # clean patterns like (2008)
    # new_ents = [re.sub(r"\[", "", ent) for ent in new_ents]  # clean brackets
    # new_ents = [re.sub(r"\]", "", ent) for ent in new_ents]  # clean brackets
    # new_ents = [re.sub(r"\(", " ", ent) for ent in new_ents]  # clean brackets
    # new_ents = [re.sub(r"\)", " ", ent) for ent in new_ents]  # clean brackets
    # new_ents = [re.sub('"', '', ent) for ent in new_ents]  # clean quotations
    # new_ents = [re.sub(r"\s\s+", " ", ent) for ent in new_ents]  # clean extra spaces
    # new_ents = [t.strip() for t in new_ents]
    # new_ents = new_ents + quot

    # dict keys are unique and keep insertion order, so this deletes duplicates in order.
    return list(dict.fromkeys(new_ents))

**List without duplicates and short strings**

In [None]:
clean = create_NE_lists(entities_list)
clean 

In [None]:
#exclude partial match of longer entities (This is used only for creating a Json file for annotation.)

# Keep an entity only when exactly one entry of `clean` occurs inside it.
result = [item for item in clean if sum(substr in item for substr in clean) == 1]

**Escaping brackets for regex (again this is used to have the keywords in html format \<keywords> needed for the annotation tool)**

In [None]:
import re
cleanesc = [re.escape(T) for T in result]


**Function to add tags \<keyword\>**

In [None]:
#function to add keyword tags
import re

def add_keyword_tag(ents_list,text):
    """Wrap every match of each entity pattern in <KEYWORD>...</KEYWORD> tags.

    The patterns are applied one after the other, each one to the text as tagged so far.

    :param ents_list: list of entity patterns (str), each used as a regular expression
    :param text: text of the judgement or transcript (str)
    :return: the tagged text (str)
    """
    def _wrap(match):
        # A function replacement inserts the matched text verbatim (no backslash processing).
        return '<KEYWORD>' + match.group(0) + '</KEYWORD>'

    tagged = text
    for pattern in ents_list:
        tagged = re.sub(pattern, _wrap, tagged)
    return tagged

**Add keywords to text in judgement and timestamps df**

In [None]:
# Tag from an untagged copy of the text, kept in its own column, so that re-running this cell
# does not wrap keywords that are already inside <KEYWORD> tags a second time.
if 'text_untagged' not in df_times:
    df_times['text_untagged'] = df_times['text']
df_times['text'] = df_times['text_untagged'].apply(lambda x: add_keyword_tag(cleanesc,x))

In [None]:
# Tag from an untagged copy of the text, kept in its own column, so that re-running this cell
# does not wrap keywords that are already inside <KEYWORD> tags a second time.
if 'text_untagged' not in df_judge:
    df_judge['text_untagged'] = df_judge['text']
df_judge['text'] = df_judge['text_untagged'].apply(lambda x: add_keyword_tag(cleanesc,x))

**Create dict for json file with keywords**

In [None]:
def get_dics_withdf(cosine_sim,times_df, section,file_name, id):
    """Build the JSON-ready record that links one judgement segment to its timestamps.

    :param cosine_sim: iterable of (score, index) pairs; only the index (second item) is read,
        and the given order is kept in the output
    :param times_df: table with a 'text' and a 'timestamps' column, each looked up by that index
    :param section: text of the judgement segment
    :param file_name: key under which ``section`` is stored in the record
    :param id: value stored under 'Segment_id'
    :return: dict with the keys 'Segment_id', ``file_name`` and 'Transcription'; 'Transcription'
        is a list of {'time': ..., 'text': ...} dicts, empty when ``cosine_sim`` is empty
    """
    transcription = []
    for pair in cosine_sim:
        text = times_df['text'][pair[1]]
        time = times_df['timestamps'][pair[1]]
        transcription.append({'time': time, 'text': text})

    # The record is assembled once, after the loop, so an empty ranking still yields a valid
    # record instead of failing on a variable that was only assigned inside the loop.
    data_id = {'Segment_id': id}
    data_id.update({file_name: section, 'Transcription': transcription})
    return data_id

In [None]:
#create a dictionary of dictionaries for all the paragraph segments

# CS2 is not assigned in any visible cell, so it is derived here from the full rankings in CS.
# NOTE(review): assumes CS2 is meant to hold the first 20 links of every segment, as the comments
# about "list of 20" / "first 20 links" suggest — confirm, and change TOP_LINKS if needed.
TOP_LINKS = 20
CS2 = [ranking[:TOP_LINKS] for ranking in CS]

# enumerate pairs every ranking with its own position; CS2.index(c) would return the position of
# the first equal ranking, which is wrong when two segments share a ranking and is quadratic.
dicts2 = []
for ind, c in enumerate(CS2):
    # segment ids start at 1
    dict_temp = get_dics_withdf(c, df_times, df_judge['text'][ind], 'Segment', ind + 1)
    dicts2.append(dict_temp)

**Add urls ( You need to get these from the following archive: https://discovery.nationalarchives.gov.uk/ )**

In [None]:
#Example of a case 
# urls = ['https://ds-live-videos.s3.amazonaws.com/66/UKSC/1/cr2-09-11-23-session1_imx30_1.webm',
#         'https://ds-live-videos.s3.amazonaws.com/66/UKSC/1/cr2-09-11-23-session2_imx30_1.mp4',
#         'https://ds-live-videos.s3.amazonaws.com/66/UKSC/1/cr2-09-11-24-session1_imx30_1.mp4',
#         'https://ds-live-videos.s3.amazonaws.com/66/UKSC/1/cr2-09-11-24-session2_imx30_1.mp4']

In [None]:
# add urls
# `urls` must be defined before running this cell (see the commented example list in the cell above).

data_url = {'URLS':urls}
# Replace an existing URL entry instead of inserting a second one when this cell is re-run.
if dicts2 and 'URLS' in dicts2[0]:
    dicts2[0] = data_url
else:
    dicts2.insert(0, data_url)

# **Create Json file**

In [None]:
cd to your json file directory

In [None]:
# save the json file with the dictionaries
import json

with open("Nameofjson file", "w") as final:
    json.dump(dicts2, final)

# **Alternative Similarity Models**
**Note**
1. $\color{blue}{\text{The output is a JSON file with the paragraph and its first 20 links (you can change the number of links based on the use case).}}$
2. $\color{blue}{\text{The functions take a list of timestamp texts (the }}\color{green}{\text{text_timestamp}}\color{blue}{\text{ variable above) and one judgement section. To calculate with paragraphs, loop over the }}\color{green}{\text{window_paras_strings}}\color{blue}{\text{ variable above.}}$



# **Document Similarity with tf-idf**







In [None]:
from featureExtraction.tfidf import tf_idf_similarity
import calculateSimilarity as cosine

In [None]:
results = tf_idf_similarity(text_summary,text_timestamp_summary)

In [None]:
# create a json file with the top 20 similarities

cosine.get_results(results,20,text_timestamp,summary_TEXT,'Summary_linking_tfidf' )

# **Similarity with BM25**

In [None]:
!pip install -U sentence-transformers rank_bm25

In [None]:
from featureExtraction import BM25 as bm

In [None]:
test = bm.bm25(text_timestamp,text_summary)

In [None]:
from evaluationNumpy import get_results

In [None]:
#create a json file from results
get_results(test,20,text_timestamp,summary_TEXT,'summary_test')

# **Document Similarity with Pooling (MEAN, MIN and Max Pooling of Glove Embeddings)**

In [None]:
!pip install flair

In [None]:
import featureExtraction.doc_pool as docp
# NOTE(review): the alias `eval` shadows Python's built-in eval() for the rest of the kernel
# session; any cell that relies on the built-in breaks if it is run after this one.
import evaluationNumpy as eval

In [None]:
corpus = docp.embeddDoc(text_summary,text_timestamp_summary)

In [None]:
# doesnot work with GPU
summary_embedding,timestamps_embedding = corpus.get_embeddings_mean()

In [None]:
summary_embedding,timestamps_embedding = corpus.get_embeddings_min()

In [None]:
summary_embedding,timestamps_embedding = corpus.get_embeddings_max()

In [None]:
#get sorted similarities with indexes
# NOTE(review): calculate_cosine_similarity is not imported under this bare name in any visible
# cell; presumably it should be called through the evaluationNumpy module imported as `eval` — confirm.
similarity = calculate_cosine_similarity(timestamps_embedding,summary_embedding)

In [None]:
#write results in json file
eval.get_results(similarity,20,summary_timestamps,text_summary,'Summary_pooling_max')

# **Document Similarity with RNN Embeddings (last hidden layer)**

In [None]:
import featureExtraction.doc_pool as docp

In [None]:
summary_embedding,timestamps_embedding = corpus.get_embeddings_rnn()

In [None]:
import evaluateTensors as evalt
cosine = evalt.calculate_cosine_similarity(timestamps_embedding,summary_embedding)

In [None]:
# create a json file with the top 20 similarities
import json
evalt.get_results(cosine,20,summary_timestamps,text_summary,'Summary_linking_rnn' )

# **Semantic Search using 'all-MiniLM-L6-v2' (Entailment Embeddings fine-tuned on Multi-NLI)**

In [None]:
!pip install -U sentence-transformers

In [None]:
from featureExtraction.sentenceTrasformer import getEmbeddings
from sentence_transformers import SentenceTransformer,util

In [None]:
# calculate cosine similarity and results top 20 most relevant links
# a list of tuples (ind,score)
#similarity_method (util.cos_sim or util.dot_score)

results = getEmbeddings('all-MiniLM-L6-v2',text_summary,text_timestamp_summary,util.cos_sim)

In [None]:
from evaluationNumpy import get_results

In [None]:
#create a json file from results
get_results(results,20,summary_timestamps,text_summary,'summary_test')

# **Asymmetric Similarity with SBERT Sentence Embeddings with dot product**

In [None]:
!pip install -U sentence-transformers

In [None]:
# calculate cosine similarity and results top 20 most relevant links
# a list of tuples (ind,score)
#similarity_method (util.cos_sim or util.dot_score)

results = getEmbeddings('msmarco-roberta-base-ance-firstp',text_summary,text_timestamp_summary,util.dot_score)

In [None]:
#create a json file from results
get_results(results,20,summary_timestamps,text_summary,'xxxxxxxxxxxx')