# Import Packages

In [1]:
import re
import os
import nltk
import pickle
import justext
import nltk.data
import pandas as pd 
from nltk.tokenize import TreebankWordTokenizer
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

# Shared NLP resources, created once and reused by the cells below.

# Treebank-style word tokenizer instance.
treebank_tokenizer = TreebankWordTokenizer()

# Punkt sentence tokenizer loaded from NLTK's English pickle.
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# English stop-word list from the NLTK stopwords corpus.
stop_words = stopwords.words('english')

# Text Preprocessing
- Set working directory
- Create a new directory, 'data' (held in the variable `data_directory`), that stores the cleaned text
- Read and parse each case file using BeautifulSoup
- Create 3 dictionaries for multilevel storage
- Extract and clean the Name, Catchphrases and Sentences of each case
- In gold_catch{}, key - 'Catchphrases', value - the list of extracted catchphrases
- In gold_sent{}, key - 'Sentences', value - the list of extracted sentences
- In gold_text{}, key - cleaned name of the case, value - the list [gold_catch, gold_sent]
- Store each gold_text as a numbered pickle file in the folder 'data'

In [2]:
# Extract the name, catchphrases and sentences of every case report in
# `base_dir` and store them, one pickle file per case, in `data_directory`.

# set directory to case reports
base_dir = '/fulltext'

# Make directory to store extracted text (no error if it already exists)
data_directory = os.path.dirname('data/')
os.makedirs(data_directory, exist_ok=True)

# Patterns are compiled once, outside the per-file loop.
# Removes "(...)" groups, "[...]" groups, the court tag "FCA" and digit runs
# from a case name. The bracket alternative excludes ']' (the original
# excluded ')' there, which let one match swallow the text between two
# separate bracket groups).
name_noise = re.compile(r'\([^)]*\)|\[[^\]]*\]|FCA|\d+')
# Removes everything up to and including the last '>' on a line.
markup_prefix = re.compile(r'.*>')
# Removes a trailing " <digit>", any "...>" prefix and newlines from a sentence.
sentence_noise = re.compile(r'(( [\d])$)|(.*>)|(\n)')

# Collect regular files only: a sub-directory cannot be opened as a case
# file. The context manager closes the scandir iterator.
# NOTE: os.scandir yields entries in arbitrary order, so the number given to
# each case below follows that order.
with os.scandir(base_dir) as entries:
    case_entries = [entry for entry in entries if entry.is_file()]

# read in case files; output files are numbered from 1
for num, filename in enumerate(case_entries, start=1):
    # parse case file into beautiful soup (the markup is read on construction)
    with open(filename.path, 'rb') as case_file:
        soup = BeautifulSoup(case_file, 'xml')

    # Clean the case name. If the file holds several <name> elements the
    # last one wins; if it holds none, fall back to the file name without
    # its extension instead of reusing the previous case's name.
    na = os.path.splitext(filename.name)[0]
    for name in soup.find_all('name'):
        na = name_noise.sub('', name.text).strip()

    # clean and store the extracted catchphrases
    gold_catchphrases = [
        markup_prefix.sub('', catchphrase.text)
        for catchphrase in soup.find_all('catchphrase')
    ]

    # clean and store the extracted sentences, dropping any that end up empty
    gold_sentences = []
    for sentence in soup.find_all('sentence'):
        text = sentence.text.strip().strip('.')
        # drop leading numbering such as "12. " or "3 - "
        text = text.lstrip('0123456789.- ')
        text = sentence_noise.sub('', text).strip()
        if text:
            gold_sentences.append(text)

    # key 'Sentences' -> list of cleaned sentences
    gold_sent = {'Sentences': gold_sentences}

    # key 'Catchphrases' -> list of cleaned catchphrases
    gold_catch = {'Catchphrases': gold_catchphrases}

    # cleaned case name -> [catchphrase dictionary, sentence dictionary]
    gold_text = {na: [gold_catch, gold_sent]}

    # save the final dictionary for each case as a pickle file named by its
    # number; the context manager closes the file even if dumping fails
    with open(os.path.join(data_directory, str(num)), "wb") as pickle_out:
        pickle.dump(gold_text, pickle_out)

In [3]:
# # Checking sample sent_text files and catch_text files
# c = pd.read_pickle("data/2")
# print(c)

{'Lawrance v Human Rights and Equal Opportunity Commission': [{'Catchphrases': ['no point of principle', 'administrative law and human rights']}, {'Sentences': ['These are two applications for orders of review under the Administrative Decisions (Judicial Review) Act 1977 (Cth) ("the AD(JR) Act")', 'They concern correspondence sent to the Human Rights and Equal Opportunity Commission ("the Commission") by the applicant in late 2005', 'In a letter dated 26 September 2005, the applicant wrote to the Commission concerning allegations of unlawful discrimination', 'The Commission replied in a letter dated 7 October 2005, in which it indicated that it was not able to assist her', 'On 13 October 2005, the applicant again wrote to the Commission', 'That letter addressed alleged breaches of human rights', 'The applicant wrote to the Commission a third time on 7 November 2005, this time concerning allegations of sexual harassment', 'The Commission did not respond to the second and third letters i

# Citation Extraction and Cleaning

In [None]:
# Extract the cleaned citation names (<name>) and cited-case names (<tocase>)
# from every file in `base_dir` and store them, one pickle file per input
# file, in `citation_directory`.

# set directory to citations class
base_dir = 'data/citations_class'

# Make directory to store extracted citations (no error if it already exists)
citation_directory = os.path.dirname('data/citation_text/')
os.makedirs(citation_directory, exist_ok=True)

# Patterns are compiled once, outside the per-file loop.
# Both noise patterns remove "(...)" groups, "[...]" groups, digit runs and
# the court tag "FCA"; the cited-case variant also removes ';'. The bracket
# alternative excludes ']' (the original excluded ')' there, which let one
# match swallow the text between two separate bracket groups).
citation_noise = re.compile(r'\([^)]*\)|\[[^\]]*\]|FCA|\d+')
cited_noise = re.compile(r'\([^)]*\)|\[[^\]]*\]|\d+|FCA|;')
# Runs of two or more spaces left behind by the removals above.
space_run = re.compile(r'[ ]{2,}')
# A run of two or more capital letters at the very end of the text.
trailing_caps = re.compile(r'[A-Z]{2,}$')

# Collect regular files only: a sub-directory cannot be opened as a
# citation file. The context manager closes the scandir iterator.
# NOTE: os.scandir yields entries in arbitrary order, so the number given to
# each file below follows that order.
with os.scandir(base_dir) as entries:
    citation_entries = [entry for entry in entries if entry.is_file()]

# read in citation files; output files are numbered from 1
for num, filename in enumerate(citation_entries, start=1):
    # parse citation file into beautiful soup (the markup is read on construction)
    with open(filename.path, 'rb') as case_file:
        soup = BeautifulSoup(case_file, 'xml')

    # clean every <name> element by removing the years and courts mentioned
    gold_citations = [
        citation_noise.sub('', citation.text).strip()
        for citation in soup.find_all('name')
    ]

    # clean every <tocase> element the same way
    gold_cited = []
    for citation in soup.find_all('tocase'):
        text = cited_noise.sub('', citation.text)
        text = space_run.sub(' ', text).strip()
        # strip up to three trailing all-caps tokens, one per pass
        for _ in range(3):
            text = trailing_caps.sub('', text).strip()
        gold_cited.append(text)

    # add them together: citation names first, then cited cases
    gold_citations = gold_citations + gold_cited

    # save citations as a pickle file named by its number; the context
    # manager closes the file even if dumping fails
    with open(os.path.join(citation_directory, str(num)), "wb") as pickle_out:
        pickle.dump(gold_citations, pickle_out)

In [None]:
# Checking sample citation_text file
# c = pd.read_pickle("data/citation_text/11")
# print(c)