In [None]:
# read in files

# for each file, parse into soup

    # find all catchphrases
        # for each catchphrase, extract text and store in a list
    # save list as pickle to disk

    # find all sentences
        # for each sentence, extract text and store in a list
    # save list as pickle to disk

In [None]:
import re
import os
import nltk
import pickle
import justext
import nltk.data
import pandas as pd 
from nltk.tokenize import TreebankWordTokenizer
from bs4 import BeautifulSoup
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords

# Word-level tokenizer shared by the token extraction cells below
treebank_tokenizer = TreebankWordTokenizer()

# English stop-word list from the NLTK corpus
stop_words = stopwords.words('english')

# Pre-trained Punkt model for splitting English text into sentences
sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

In [None]:
# set directory to case reports
base_dir = 'data/fulltext'

# Make directory to store extracted catchphrases.
# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
catch_directory = os.path.dirname('data/catch_text/')
os.makedirs(catch_directory, exist_ok=True)

# Make directory to store extracted sentences
sent_directory = os.path.dirname('data/sent_text/')
os.makedirs(sent_directory, exist_ok=True)

# os.scandir yields entries in arbitrary order, so sort by name to keep the
# numbering of the output pickles reproducible between runs. Sub-directories
# are skipped because they cannot be opened as case files.
with os.scandir(base_dir) as dir_entries:
    case_entries = sorted(
        (entry for entry in dir_entries if entry.is_file()),
        key=lambda entry: entry.name,
    )

# read in case files; each output pickle is named after the 1-based position
# of its case file in the sorted listing
for index, filename in enumerate(case_entries, start=1):
    num = str(index)

    # parse case file into beautiful soup; the document is parsed inside the
    # constructor, so the handle can be released before the text is processed
    with open(filename, 'rb') as case_file:
        soup = BeautifulSoup(case_file, 'xml')

    # obtain all catchphrases in the case file
    catchphrases = soup.find_all('catchphrase')

    # obtain all sentences in the case file
    sentences = soup.find_all('sentence')

    # obtain all text in identified catchphrases and store them together
    gold_catchphrases = []
    for catchphrase in catchphrases:
        # drop everything up to and including the last '>' on a line
        text = re.sub(r'.*>','',catchphrase.text)
        gold_catchphrases.append(text)

    # obtain all text in identified sentences and store them together
    gold_sentences = []
    for sentence in sentences:
        text = sentence.text.strip().strip('.')
        # remove leading digits, dots, dashes and spaces
        text = text.lstrip('0123456789.- ')
        # remove a trailing " <digit>", anything up to the last '>' on a
        # line, and embedded newlines
        text = re.sub(r'(( [\d])$)|(.*>)|(\n)','',text)
        text = text.strip()
        # Removing empty sentences
        if text:
            gold_sentences.append(text)

    # save catchphrases as pickle (context manager closes the file on error too)
    with open(os.path.join(catch_directory, num), "wb") as pickle_out:
        pickle.dump(gold_catchphrases, pickle_out)

    # save sentences as pickle
    with open(os.path.join(sent_directory, num), "wb") as pickle_out:
        pickle.dump(gold_sentences, pickle_out)

In [None]:
# Checking sample sent_text files and catch_text files
# c = pd.read_pickle("data/sent_text/2")
# c = pd.read_pickle("data/catch_text/2")
# print(c)

In [None]:
#function for extracting tokens from catchphrases text and sentences text
def tokenextraction(catch_text, sent_text, tokenizer=None):
    """Tokenize every catchphrase and every sentence into two flat token lists.

    Parameters
    ----------
    catch_text : iterable of str
        Catchphrase strings to tokenize.
    sent_text : iterable of str
        Sentence strings to tokenize.
    tokenizer : object with a ``tokenize(str) -> list`` method, optional
        Tokenizer to use. Defaults to the notebook-level ``treebank_tokenizer``.

    Returns
    -------
    tuple of (list, list)
        ``(catch_tokens, sent_tokens)``: all catchphrase tokens followed by
        all sentence tokens, each in input order.
    """
    if tokenizer is None:
        tokenizer = treebank_tokenizer

    catch_tokens = []
    for phrase in catch_text:
        catch_tokens.extend(tokenizer.tokenize(phrase))

    sent_tokens = []
    for sentence in sent_text:
        # Tokenize the sentence itself; the previous version re-tokenized the
        # last catchphrase here, and failed when there were no catchphrases.
        sent_tokens.extend(tokenizer.tokenize(sentence))

    return catch_tokens, sent_tokens

In [None]:
#iterating through all the pickle files for extracting tokens

all_catch_tokens = []
all_sent_tokens = []
# NOTE(review): the upper bound is hard-coded and range() excludes it, so a
# file numbered 3890 would never be read. Confirm against the number of
# pickles actually written to data/catch_text and data/sent_text.
for i in range(1,3890):
    # Read the two pickles independently so that a truncated/empty file only
    # blanks its own list. Previously an EOFError left sent_text undefined
    # (first iteration) or holding the previous file's sentences.
    try:
        catch_text = pd.read_pickle("data/catch_text/"+str(i))
    except EOFError:
        catch_text = []
    try:
        sent_text = pd.read_pickle("data/sent_text/"+str(i))
    except EOFError:
        sent_text = []

    catch_tokens, sent_tokens = tokenextraction(catch_text, sent_text)
    all_catch_tokens.extend(catch_tokens)
    all_sent_tokens.extend(sent_tokens)


In [None]:
# print(all_catch_tokens)
# print(all_sent_tokens)

In [None]:
# Cleaning the tokens function
def cleanedtokens(tokens, excluded_words=None):
    """Keep only alphanumeric tokens that are not stop words.

    Parameters
    ----------
    tokens : iterable of str
        Tokens to filter.
    excluded_words : iterable of str, optional
        Words to drop. Defaults to the notebook-level ``stop_words`` list.

    Returns
    -------
    list of str
        Tokens for which ``str.isalnum()`` is true and that are not in the
        excluded words, in their original order.
    """
    if excluded_words is None:
        excluded_words = stop_words
    # A set gives constant-time membership tests instead of scanning the
    # stop-word list once per token.
    excluded = set(excluded_words)
    # NOTE(review): the comparison is case-sensitive, so a capitalised token
    # such as "The" is kept when the stop-word list only holds "the" -
    # confirm whether tokens should be lower-cased before filtering.
    return [w for w in tokens if w.isalnum() and w not in excluded]

In [None]:
# Filter the catchphrase tokens first, then the sentence tokens, through the
# same cleaning function
all_catch_cleaned_tokens, all_sent_cleaned_tokens = map(
    cleanedtokens, (all_catch_tokens, all_sent_tokens)
)

# Citation Extraction and Cleaning

In [None]:
# set directory to citations class
base_dir = 'data/citations_class'

# Make directory to store extracted citations.
# exist_ok avoids the check-then-create race and keeps the cell re-runnable.
citation_directory = os.path.dirname('data/citation_text/')
os.makedirs(citation_directory, exist_ok=True)

# os.scandir yields entries in arbitrary order, so sort by name to keep the
# numbering of the output pickles reproducible between runs. Sub-directories
# are skipped because they cannot be opened as citation files.
with os.scandir(base_dir) as dir_entries:
    citation_entries = sorted(
        (entry for entry in dir_entries if entry.is_file()),
        key=lambda entry: entry.name,
    )

# read in citation files; each output pickle is named after the 1-based
# position of its citation file in the sorted listing
for index, filename in enumerate(citation_entries, start=1):
    num = str(index)

    # parse citation file into beautiful soup; the document is parsed inside
    # the constructor, so the handle can be released before processing
    with open(filename, 'rb') as case_file:
        soup = BeautifulSoup(case_file, 'xml')

    # obtain all citations in the citations file
    citations = soup.find_all('name')

    # obtain all cited cases in the citations file
    cited = soup.find_all('tocase')

    # obtain all text in identified citations and store them together
    gold_citations = []
    for citation in citations:
        # Cleaning citation part by removing parenthesised and bracketed
        # groups, "FCA" and digits. The bracket pattern stops at the first
        # ']' so text between two bracketed groups is no longer swallowed.
        text = re.sub(r'\([^)]*\)|\[[^\]]*\]|FCA|\d+', '', citation.text)
        gold_citations.append(text.strip())

    # obtain all text in identified cited citations and store them together
    gold_cited = []
    for citation in cited:
        # Same cleaning as above, additionally dropping ';' separators
        text = re.sub(r'\([^)]*\)|\[[^\]]*\]|\d+|FCA|;', '', citation.text)
        text = re.sub(r'[ ]{2,}',' ',text)
        text = text.strip()
        # strip up to three trailing all-caps abbreviations left over
        # after the cleaning above
        for _ in range(3):
            text = re.sub(r'[A-Z]{2,}$','',text)
            text = text.strip()
        gold_cited.append(text)

    # add them together and drop entries that were cleaned down to nothing
    gold_citations = [x for x in gold_citations + gold_cited if x]

    # save citations as pickle (context manager closes the file on error too)
    with open(os.path.join(citation_directory, num), "wb") as pickle_out:
        pickle.dump(gold_citations, pickle_out)

In [None]:
# Spot-check one extracted citation file by loading it and printing its contents
sample_citation_path = "data/citation_text/13"
c = pd.read_pickle(sample_citation_path)
print(c)