# Import Packages

In [1]:
import infomap

In [1]:
import re
import os
import nltk
import pickle
import justext
import nltk.data
import pandas as pd 
from nltk.tokenize import TreebankWordTokenizer
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from infomap import Infomap

# English stop-word list from the NLTK stopwords corpus
stop_words = stopwords.words('english')
# Pre-trained English Punkt sentence tokenizer, loaded from the NLTK data files
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
# Treebank-style word tokenizer instance
treebank_tokenizer = TreebankWordTokenizer()

# Text Preprocessing
- Set working directory
- Create two new directories that store the cleaned text - 'data/cleanedcatch' and 'data/cleanedsent'
- Read and parse through case files using BeautifulSoup
- Create 2 dictionaries per case file - gold_catch{} and gold_sent{} (no combined gold_text{} dictionary is built; the two are stored separately)
- Extract and clean the Name, Catchphrases and Sentences of each case 
- In gold_catch{}, key - name of case, value - list of the extracted catchphrases
- In gold_sent{}, key - name of case, value - list of the extracted sentences
- Store each gold_catch in folder 'data/cleanedcatch' and each gold_sent in folder 'data/cleanedsent' as numbered pickle files

In [2]:
# set directory to case reports
base_dir = '/Users/12029/Desktop/Jupyter nbk/NLP_Proj/fulltext'

# Make directories to store the extracted catchphrases and sentences.
# exist_ok avoids the check-then-create race of exists() + makedirs().
catch_directory = os.path.dirname('data/cleanedcatch/')
os.makedirs(catch_directory, exist_ok=True)

sent_directory = os.path.dirname('data/cleanedsent/')
os.makedirs(sent_directory, exist_ok=True)


def _clean_sentence(raw):
    """Return one <sentence> text with numbering and markup remnants removed.

    Strips surrounding whitespace and periods, then any leading run of
    digits, dots, hyphens and spaces; removes a trailing " <digit>",
    everything up to the last '>' on each line, and newline characters.
    """
    text = raw.strip()
    text = text.strip('.')
    text = text.lstrip('0123456789.- ')
    text = re.sub(r'(( [\d])$)|(.*>)|(\n)', '', text)
    return text.strip()


# read in case files; output pickles are numbered from 1 in scan order
# NOTE: os.scandir yields entries in arbitrary order, so the number given to
# a particular case file is not guaranteed to be stable between runs.
for num, filename in enumerate(os.scandir(base_dir), start=1):
    # parse case file into beautiful soup; the tree is fully built by the
    # constructor, so the file can be closed before the tree is queried
    with open(filename, 'rb') as case_file:
        soup = BeautifulSoup(case_file, 'xml')

    # clean the extracted name; when a file has several <name> tags the last
    # one wins. Start from the file's own name so that a file without a
    # <name> tag never silently reuses the name of the previous case.
    case_name = filename.name
    for name in soup.find_all('name'):
        # drop parenthesised/bracketed parts, the court tag 'FCA' and digits
        case_name = re.sub(r'\([^)]*\)|(\[[^)]*\])|(FCA)|(\d+)', '', name.text)
        case_name = case_name.strip()

    # clean the extracted catchphrases (drop everything up to the last '>')
    gold_catchphrases = [
        re.sub(r'.*>', '', catchphrase.text)
        for catchphrase in soup.find_all('catchphrase')
    ]

    # clean the extracted sentences and remove any that end up empty
    gold_sentences = [
        cleaned
        for cleaned in (_clean_sentence(s.text) for s in soup.find_all('sentence'))
        if cleaned
    ]

    # two single-entry dictionaries: case name -> sentences / catchphrases
    gold_sent = {case_name: gold_sentences}
    gold_catch = {case_name: gold_catchphrases}

    out_name = str(num)

    # save catchphrases as pickle
    with open(os.path.join(catch_directory, out_name), "wb") as pickle_out:
        pickle.dump(gold_catch, pickle_out)

    # save sentences as pickle
    with open(os.path.join(sent_directory, out_name), "wb") as pickle_out:
        pickle.dump(gold_sent, pickle_out)

In [3]:
# Sanity check: load the cleaned-sentences pickle numbered 1 and print it
c = pd.read_pickle("data/cleanedsent/1")
print(c)

{'Sharman Networks Ltd v Universal Music Australia Pty Ltd': ["Background to the current application  1 The applicants Sharman Networks Ltd ('Sharman Networks'), Sharman License Holdings Ltd ('Sharman License') and Ms Nicola Anne Hemming ('Ms Hemming') are each the subject of asset preservation orders made by Wilcox J on 22 March 2005 ('the Mareva orders')", "When referring to the applicants generally, I will do so as 'the Sharman applicants'", "Each of the Sharman applicants was one of ten respondents to infringement of copyright proceedings brought by the present respondents ('the Music companies') in respect of the operation of what was described by the parties as the 'Kazaa system' ('the primary proceedings')", 'Wilcox J made orders ancillary to the Mareva orders on 22 March 2005 requiring each of the Sharman applicants to disclose on affidavit the description and value of all of their assets, wherever situated, and to specify whether those assets were held by each applicant either

# Citation Extraction and Cleaning

In [4]:
# set directory to citations class
base_dir = '/Users/12029/Desktop/Jupyter nbk/NLP_Proj/citations_class'

# Make directory to store extracted citations
# (exist_ok avoids the check-then-create race of exists() + makedirs())
citation_directory = os.path.dirname('data/citation_text/')
os.makedirs(citation_directory, exist_ok=True)


def _clean_citing_name(raw):
    """Return a <name> text without parenthesised/bracketed parts, 'FCA' and digits."""
    return re.sub(r'\([^)]*\)|(\[[^)]*\])|(FCA)|(\d+)', '', raw).strip()


def _clean_cited_name(raw):
    """Return a <tocase> text reduced to the bare case name.

    Removes parenthesised/bracketed parts, digits, 'FCA' and semicolons,
    collapses runs of spaces, and then strips a trailing run of two or more
    capital letters up to three times.
    """
    text = re.sub(r'\([^)]*\)|(\[[^)]*\])|(\d+)|(FCA)|(;)', '', raw)
    text = re.sub(r'[ ]{2,}', ' ', text)
    text = text.strip()
    # up to three trailing all-caps tokens are removed, one per pass
    for _ in range(3):
        text = re.sub(r'[A-Z]{2,}$', '', text)
        text = text.strip()
    return text


# read in citation files; output pickles are numbered from 1 in scan order
for num, filename in enumerate(os.scandir(base_dir), start=1):
    # parse citation file into beautiful soup; the tree is fully built by the
    # constructor, so the file can be closed before the tree is queried
    with open(filename, 'rb') as case_file:
        soup = BeautifulSoup(case_file, 'xml')

    # cleaned text of every <name> tag in the citations file
    gold_citations = [_clean_citing_name(tag.text) for tag in soup.find_all('name')]

    # cleaned text of every <tocase> tag (the cited cases)
    gold_cited = [_clean_cited_name(tag.text) for tag in soup.find_all('tocase')]

    # add them together: <name> entries first, followed by the cited cases
    gold_citations = gold_citations + gold_cited

    # save citations as pickle
    with open(os.path.join(citation_directory, str(num)), "wb") as pickle_out:
        pickle.dump(gold_citations, pickle_out)

In [5]:
# Checking sample citation_text file.
# Read through the same relative 'data/citation_text' location that the
# extraction cell writes to, instead of a machine-specific absolute path.
c = pd.read_pickle("data/citation_text/83")
print(c)

['Purchas, in the matter of Estore Pty Limited', 'Commonwealth of Australia v Rocklea Spinning Mills Pty Ltd', 'Dean-Willcocks v ACG Engineering Pty Ltd', 'Federal Commissioner of Taxation v All Suburbs Car Repairs Pty Ltd', 'Gidley Re Aliance Motor Body Pty Ltd', 'Lombe v Wagga Leagues Club Ltd', 'Mentha v GE Capital Limited']


# Clustering cases based on Citations

In [6]:
# citing case name -> list of the case names it cites
citation_dict = {}

# NOTE(review): the upper bound 2755 is hard-coded; presumably there are 2754
# pickles in data/citation_text -- confirm whenever the corpus changes.
for i in range(1, 2755):
    citation_text = pd.read_pickle("data/citation_text/" + str(i))
    if not citation_text:
        # nothing was extracted for this file; there is no citing case to index
        continue
    # the first entry is used as the citing case, the remainder as cited cases
    citation_dict.setdefault(citation_text[0], []).extend(citation_text[1:])

dict_keys = list(citation_dict.keys())
dict_values = [item for sublist in citation_dict.values() for item in sublist]
citation_list = list(set(dict_keys + dict_values))

# set of citing cases, so the membership tests below are O(1) instead of O(n)
citing_cases = set(dict_keys)

# dictionary to assign numbers to cases i.e numbers as keys and case names as values
num_dict = dict(enumerate(citation_list))
# dictionary to assign cases to numbers i.e case names as keys and numbers as values
dict_num = {case_name: number for number, case_name in num_dict.items()}


# dictionary with case numbers instead of case names (for infomap);
# only citing cases get an entry, inserted in increasing case-number order
citation_dict_num = {}
for number, case_name in num_dict.items():
    if case_name in citing_cases:
        citation_dict_num[number] = [
            dict_num[cited] for cited in citation_dict[case_name]
        ]


im = Infomap()

# Creating nodes and links between nodes for infomap
for source, targets in citation_dict_num.items():
    for target in targets:
        im.add_link(source, target)


# Run the Infomap search algorithm to find optimal modules
im.run()

modules_dict = im.get_modules()

# dictionary with clusters as keys and case numbers as values
# (only leaf nodes that are citing cases are kept)
cluster_dict = {}
for node in im.tree:
    if node.is_leaf and num_dict[node.node_id] in citing_cases:
        cluster_dict.setdefault(node.module_id, []).append(node.node_id)


# dictionary with clusters as keys and case numbers as values (clusters with >=10 cases)
top_clusters_dict = {
    module_id: members
    for module_id, members in cluster_dict.items()
    if len(members) >= 10
}


# dictionary with keys as cluster number and values as names of cases
top_clusters_dict_name = {
    module_id: [num_dict[member] for member in members]
    for module_id, members in top_clusters_dict.items()
}


# Saving the dictionary as pickle file
cluster_directory = os.path.dirname('data/clusterdata/')
os.makedirs(cluster_directory, exist_ok=True)

with open(os.path.join(cluster_directory, 'clusters'), "wb") as pickle_out:
    pickle.dump(top_clusters_dict_name, pickle_out)

In [7]:
# Checking clusters pickle file: load the saved cluster -> case-names
# dictionary and print it
c = pd.read_pickle("data/clusterdata/clusters")
print(c)

{1: ['VWFP and VWFQ v Minister for Immigration and Multicultural and Indigenous Affairs', 'VWBF v Minister for Immigration and Multicultural and Indigenous Affairs', 'MZXFQ v Minister for Immigration and Citizenship', 'SZCQA v Minister for Immigration and Citizenship', 'MZXKH v Minister for Immigration and Citizenship', 'NAWZ v Minister for Immigration & Multicultural Affairs', 'SZHIB v Minister for Immigration and Multicultural Affairs', 'SZIOZ v Minister for Immigration and Citizenship', 'SZHCN v Minister for Immigration and Multicultural Affairs', 'SZESF v Minister for Immigration and Multicultural Affairs', 'SZCJY v Minister for Immigration and Multicultural and Indigenous Affairs', 'SXSB v Minister for Immigration and Citizenship', 'MZXIH v Minister for Immigration and Citizenship', 'Narayan v Minister for Immigration and Citizenship', 'SZKNB v Minister for Immigration & Citizenship', 'SZJXM v Minister for Immigration and Citizenship', 'Lobo v Minister for Immigration & Multicultu

# Merging cases together in the same cluster

In [8]:
from collections import defaultdict

# cluster id -> catchphrase lists of the cases in that cluster
ref_summary = defaultdict(list)
# cluster id -> sentence lists of the cases in that cluster
to_summarize = defaultdict(list)

case_clusters = pd.read_pickle("data/clusterdata/clusters")

# build each cluster's membership set once so every per-case check is O(1)
cluster_members = {
    cluster: set(case_names) for cluster, case_names in case_clusters.items()
}

# NOTE(review): the upper bound 3890 is hard-coded; presumably there are 3889
# pickles in data/cleanedcatch and data/cleanedsent -- confirm whenever the
# corpus changes.
for i in range(1, 3890):
    reference = pd.read_pickle("data/cleanedcatch/" + str(i))
    case = pd.read_pickle("data/cleanedsent/" + str(i))

    # the keys of the sentences dictionary are the case name(s)
    names_in_case = list(case.keys())

    for cluster, members in cluster_members.items():
        # check if case is in cluster
        if all(name in members for name in names_in_case):

            # add the case's sentence list to the cluster
            to_summarize[cluster].extend(case.values())

            # add the case's catchphrase list to the cluster's reference summary
            ref_summary[cluster].extend(reference.values())

# Text Summarization and Keywords for each cluster

In [9]:
from gensim.summarization.summarizer import summarize
from gensim.summarization.summarizer import summarize_corpus
from gensim.summarization import keywords

In [10]:
# Inspect the sentence lists collected for cluster 10.
# to_summarize is a defaultdict(list), so this lookup yields (and stores) an
# empty list if cluster 10 has no entry.
to_summarize[10]

[['This is an application under s 1335 of the Corporations Act 2001 (Cth) (the Act) for orders that the applicant, Cosdean Investments Pty Ltd (Cosdean) provide security for costs to the respondent Football Federation Australia Limited (FFA) in the sum of $61,640 or such other amount as the Court deems appropriate',
  'Section 1335(1) reads:   a corporation is plaintiff in any action or other legal proceeding, the court having jurisdiction in the matter may, if it appears by credible testimony that there is reason to believe that the corporation will be unable to pay the costs of the defendant if successful in his, her or its defence, require sufficient security to be given for those costs and stay all proceedings until the security is given',
  "'   The principles upon which such an application must be determined are clear cut",
  'It is first necessary for FFA (the applicant on the motion) to establish that Cosdean will be unable to pay its costs if Cosdean is unsuccessful in the act

In [11]:
# Trial run of gensim's summarize_corpus on cluster 10 with ratio=0.03.
# NOTE(review): gensim documents summarize_corpus as taking a bag-of-words
# corpus, while to_summarize[10] holds lists of sentence strings -- confirm
# that this call produces the intended summary.
summarize_corpus(to_summarize[10], ratio=0.03)

[['The now Appellant was a party to proceedings in the Family Court of Australia in 2005',
  'In issue in that Court was a property dispute which proceeded to a contested hearing',
  'Prior to hearing, a series of offers and counter-offers were exchanged',
  'The Appellant made what was characterised by the Federal Magistrate whose decision is now under appeal as " a series of bizarre demands ..."',
  'The dispute, however, was settled on the third day of the hearing, 23 November 2005',
  'The Family Court made orders for the payment of costs against the now Appellant totalling $115,186.97',
  "Those costs were not paid and a creditor's petition was served in February 2007",
  'The judgment creditor, the present Respondent, was the solicitor retained by the Appellant for the purposes of the Family Court proceedings',
  'On 2 September 2008 a sequestration order was made against the Appellant, then using her married name: Murphy v Revis [2008] FMCA 1561',
  'The hearing on that occasion

In [12]:
# Per-cluster results, keyed by cluster id
cluster_keywords = {}
cluster_summaries = {}

for cluster_id in to_summarize:
    # summary of the sentence lists gathered for this cluster
    cluster_summaries[cluster_id] = summarize_corpus(
        to_summarize[cluster_id], ratio=0.1
    )

# keyword extraction is not run in this cell, so cluster_keywords stays empty


In [13]:
# Print the summary stored for cluster 10
print(cluster_summaries[10])

[['The now Appellant was a party to proceedings in the Family Court of Australia in 2005', 'In issue in that Court was a property dispute which proceeded to a contested hearing', 'Prior to hearing, a series of offers and counter-offers were exchanged', 'The Appellant made what was characterised by the Federal Magistrate whose decision is now under appeal as " a series of bizarre demands ..."', 'The dispute, however, was settled on the third day of the hearing, 23 November 2005', 'The Family Court made orders for the payment of costs against the now Appellant totalling $115,186.97', "Those costs were not paid and a creditor's petition was served in February 2007", 'The judgment creditor, the present Respondent, was the solicitor retained by the Appellant for the purposes of the Family Court proceedings', 'On 2 September 2008 a sequestration order was made against the Appellant, then using her married name: Murphy v Revis [2008] FMCA 1561', 'The hearing on that occasion took place in the

# Summarization Evaluation