# Import Packages

In [None]:
import re
import os
import nltk
import pickle
import justext
import nltk.data
import pandas as pd 
from nltk.tokenize import TreebankWordTokenizer
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from infomap import Infomap

# Shared NLP resources, each built once at import/cell-execution time.

# Treebank-style word tokenizer instance.
treebank_tokenizer = TreebankWordTokenizer()

# Pre-trained Punkt sentence splitter for English, loaded from the NLTK data path.
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# English stop-word list from the NLTK corpus.
stop_words = stopwords.words('english')

# Text Preprocessing
- Set working directory
- Create a new directory, 'data/cleanedtext' (`cleaned_directory`), that stores the cleaned text
- Read and parse through case files using BeautifulSoup
- Create 3 dictionaries for multilevel storage
- Extract and clean the Name, Catchphrases and Sentences of each case 
- In gold_catch{}, key - 'Catchphrases', value - the extracted catchphrases
- In gold_sent{}, key - 'Sentences', value - the extracted sentences
- In gold_text{}, key - name of the case, value - the list [gold_catch, gold_sent]
- Store each gold_text as a pickle file in the folder 'data/cleanedtext'

In [None]:
# Extract the name, catchphrases and sentences of every case report under
# `base_dir` and store them, cleaned, as one pickle per case in
# `cleaned_directory` (files are named "1", "2", ...).

# directory holding the raw case report files
base_dir = 'data/fulltext'

# directory that receives the cleaned text (created on demand)
cleaned_directory = os.path.dirname('data/cleanedtext/')
os.makedirs(cleaned_directory, exist_ok=True)

# os.scandir yields entries in arbitrary order, so sort the paths to make the
# output numbering reproducible between runs. Sub-directories (for example
# Jupyter's .ipynb_checkpoints) are skipped because they cannot be opened as
# case files.
with os.scandir(base_dir) as entries:
    case_paths = sorted(entry.path for entry in entries if entry.is_file())

for num, case_path in enumerate(case_paths, start=1):
    # BeautifulSoup consumes the whole file while constructing the tree, so
    # the handle can be released before the extraction starts.
    with open(case_path, 'rb') as case_file:
        soup = BeautifulSoup(case_file, 'xml')

    names = soup.find_all('name')
    catchphrases = soup.find_all('catchphrase')
    sentences = soup.find_all('sentence')

    # Case name: the last <name> element wins. Parenthesised/bracketed parts,
    # the token "FCA" and all digits are removed.
    # NOTE(review): `[^)]` inside the bracket alternative looks like it was
    # meant to be `[^\]]` - confirm before changing the pattern.
    if names:
        na = re.sub(r'\([^)]*\)|(\[[^)]*\])|(FCA)|(\d+)', '', names[-1].text)
        na = na.strip()
    else:
        # Without a <name> element the previous file's name would be reused
        # (or a NameError raised on the first file); fall back to the file
        # name so every case keeps its own key.
        na = os.path.splitext(os.path.basename(case_path))[0]

    # Catchphrases: drop everything up to the last '>' on a line.
    gold_catchphrases = [
        re.sub(r'.*>', '', catchphrase.text) for catchphrase in catchphrases
    ]

    # Sentences: trim whitespace, surrounding dots and leading numbering, then
    # remove a trailing " <digit>", any text up to the last '>' on a line, and
    # newlines. Sentences that end up empty are discarded.
    gold_sentences = []
    for sentence in sentences:
        text = sentence.text.strip().strip('.').lstrip('0123456789.- ')
        text = re.sub(r'(( [\d])$)|(.*>)|(\n)', '', text).strip()
        if text:
            gold_sentences.append(text)

    # multilevel storage: {case name: [{'Catchphrases': [...]}, {'Sentences': [...]}]}
    gold_catch = {'Catchphrases': gold_catchphrases}
    gold_sent = {'Sentences': gold_sentences}
    gold_text = {na: [gold_catch, gold_sent]}

    # `with` closes the output file even if pickling fails
    with open(os.path.join(cleaned_directory, str(num)), "wb") as pickle_out:
        pickle.dump(gold_text, pickle_out)

In [None]:
# # Checking sample sent_text files and catch_text files
# c = pd.read_pickle("data/cleanedtext/9")
# print(c)

# Citation Extraction and Cleaning

In [None]:
# Extract the citing case and the cases it cites from every file under
# `base_dir` and store them, cleaned, as one pickled list per file in
# `citation_directory` (files are named "1", "2", ...).

# directory holding the citation class files
base_dir = 'data/citations_class'

# directory that receives the extracted citations (created on demand)
citation_directory = os.path.dirname('data/citation_text/')
os.makedirs(citation_directory, exist_ok=True)

# os.scandir yields entries in arbitrary order, so sort the paths to make the
# output numbering reproducible between runs. Sub-directories (for example
# Jupyter's .ipynb_checkpoints) are skipped because they cannot be opened as
# citation files.
with os.scandir(base_dir) as entries:
    citation_paths = sorted(entry.path for entry in entries if entry.is_file())

for num, citation_path in enumerate(citation_paths, start=1):
    # BeautifulSoup consumes the whole file while constructing the tree, so
    # the handle can be released before the extraction starts.
    with open(citation_path, 'rb') as case_file:
        soup = BeautifulSoup(case_file, 'xml')

    # <name> elements: remove parenthesised/bracketed parts (years, courts),
    # the token "FCA" and all digits.
    gold_citations = []
    for citation in soup.find_all('name'):
        text = re.sub(r'\([^)]*\)|(\[[^)]*\])|(FCA)|(\d+)', '', citation.text)
        gold_citations.append(text.strip())

    # <tocase> elements: same cleaning plus ';' removal and whitespace
    # collapsing.
    gold_cited = []
    for citation in soup.find_all('tocase'):
        text = re.sub(r'\([^)]*\)|(\[[^)]*\])|(\d+)|(FCA)|(;)', '', citation.text)
        text = re.sub(r'[ ]{2,}', ' ', text).strip()
        # Strip up to three trailing all-caps tokens, one per pass
        # (presumably law-report abbreviations left behind once the digits
        # are gone - confirm against the data).
        for _ in range(3):
            text = re.sub(r'[A-Z]{2,}$', '', text).strip()
        gold_cited.append(text)

    # <name> entries first, then the cited cases.
    # NOTE(review): the clustering step treats element 0 as the citing case,
    # so a file without a <name> element would be misread there.
    gold_citations = gold_citations + gold_cited

    # `with` closes the output file even if pickling fails
    with open(os.path.join(citation_directory, str(num)), "wb") as pickle_out:
        pickle.dump(gold_citations, pickle_out)

In [None]:
# Checking sample citation_text file
# c = pd.read_pickle("data/citation_text/83")
# print(c)

# Clustering cases based on Citations

In [None]:
# Build a citation network from the pickled citation lists, cluster it with
# Infomap and save the clusters that contain at least 10 citing cases.

# Each file in this directory is a pickled list: element 0 is the citing case
# and the remaining elements are the cases it cites.
citation_text_dir = 'data/citation_text'

# Discover the numbered files instead of hard-coding how many there are, and
# read them in numeric order (1, 2, 3, ...).
with os.scandir(citation_text_dir) as entries:
    citation_files = sorted(
        (
            entry.name
            for entry in entries
            if entry.is_file() and entry.name.isdecimal()
        ),
        key=int,
    )

# citing case name -> names of all cases it cites (merged across files)
citation_dict = {}
for file_name in citation_files:
    citation_text = pd.read_pickle(os.path.join(citation_text_dir, file_name))
    if not citation_text:
        # nothing was extracted for this file, so there is no citing case
        continue
    citation_dict.setdefault(citation_text[0], []).extend(citation_text[1:])

dict_keys = list(citation_dict.keys())
dict_values = [item for sublist in citation_dict.values() for item in sublist]

# Sorting gives every case a stable id; the iteration order of a plain set of
# strings can change from one interpreter run to the next.
citation_list = sorted(set(dict_keys + dict_values))

# dictionary to assign numbers to cases i.e numbers as keys and case names as values
num_dict = dict(enumerate(citation_list))
# dictionary to assign cases to numbers i.e case names as keys and numbers as values
dict_num = {name: number for number, name in num_dict.items()}

# dictionary with case numbers instead of case names (for infomap);
# only citing cases get an entry, in ascending id order
citation_dict_num = {
    number: [dict_num[cited] for cited in citation_dict[name]]
    for number, name in num_dict.items()
    if name in citation_dict
}

im = Infomap()

# Creating nodes and links between nodes for infomap. Nodes are only created
# through links here, so a case with an empty citation list adds nothing.
for source, targets in citation_dict_num.items():
    for target in targets:
        im.add_link(source, target)

# Run the Infomap search algorithm to find optimal modules
im.run()

modules_dict = im.get_modules()

# dictionary with clusters as keys and case numbers as values
# (citing cases only; cases that are merely cited are left out)
cluster_dict = {}
for node in im.tree:
    if node.is_leaf and num_dict[node.node_id] in citation_dict:
        cluster_dict.setdefault(node.module_id, []).append(node.node_id)

# dictionary with clusters as keys and case numbers as values (clusters with >=10 cases)
top_clusters_dict = {
    module_id: members
    for module_id, members in cluster_dict.items()
    if len(members) >= 10
}

# dictionary with keys as cluster number and values as names of cases
top_clusters_dict_name = {
    module_id: [num_dict[member] for member in members]
    for module_id, members in top_clusters_dict.items()
}

# Saving the dictionary as pickle file
cluster_directory = os.path.dirname('data/clusterdata/')
os.makedirs(cluster_directory, exist_ok=True)

# `with` closes the output file even if pickling fails
with open(os.path.join(cluster_directory, 'clusters'), "wb") as pickle_out:
    pickle.dump(top_clusters_dict_name, pickle_out)

In [None]:
# Checking clusters pickle file
# c = pd.read_pickle("data/clusterdata/clusters")
# print(c)