In [8]:
import json
import re
import csv

In [2]:
#### This notebook takes criminal_cases_info and tax_cases_info, together with the actual text of the cases,
#### and combines it all into one JSON file.
#### That JSON file will later be used for TF-IDF and Naive Bayes analysis.

In [3]:
# Load the per-case metadata for each category; both .txt files are parsed as JSON.
with open("Data/criminal_cases_info.txt") as crim_info_file:
    crim_stats = json.load(crim_info_file)

with open("Data/tax_cases_info.txt") as tax_info_file:
    tax_stats = json.load(tax_info_file)

In [4]:
#creating a few functions to help clean the text data when it is read into the list
def concat_rows(txt_file):
    """Join every row of a case file into a single string.

    Args:
        txt_file: An iterable of text rows, e.g. an open file object.

    Returns:
        One string in which each row has been stripped of leading/trailing
        whitespace and is followed by exactly one space. A non-empty input
        therefore ends with a trailing space, and a blank row contributes a
        lone space.
    """
    # A single join avoids re-copying the growing string on every row, which
    # is what repeated `+=` concatenation can do on long case files.
    return "".join(row.strip() + " " for row in txt_file)

def remove_garbage(all_text):
    """Replace known junk characters in a case's text with a plain space.

    Args:
        all_text: The full text of a case (e.g. the output of concat_rows()).

    Returns:
        The text with every occurrence of each junk character replaced by " ".
    """
    # Swap in a space rather than deleting, so words joined only by a
    # non-breaking space (e.g. McLachlin\xa0C.J.) are not fused together.
    garbage = ["\xa0"]
    for item in garbage:
        # Literal replacement: unlike re.sub, this cannot misread an entry
        # such as "." or "(" as regex syntax if the list is extended later.
        all_text = all_text.replace(item, " ")
    return all_text


In [5]:
# Keep only the first 100 cases of each category.
crim_stats, tax_stats = crim_stats[:100], tax_stats[:100]

In [6]:
# Build one record per case -- case code, case name, URL, crim/tax category and
# cleaned full text -- interleaving the categories (criminal, tax, criminal, ...).
def _build_case_entry(case_stats, text_dir):
    """Return the trimmed record for one case, including its cleaned text.

    Args:
        case_stats: Metadata dict for a case; must provide the keys
            "case_code", "case_name", "url" and "crim_tax".
        text_dir: Folder prefix (with trailing slash) that holds the file
            "<case_code>.txt" with the case text.

    Returns:
        A dict with the keys "case_code", "case_name", "url", "crim_or_tax"
        and "case_text".

    Raises:
        KeyError: If an expected metadata key is missing.
        OSError: If the case's text file cannot be opened.
    """
    # NOTE(review): the text is read with the platform's default encoding, as
    # before; confirm the files' encoding before pinning one explicitly.
    with open(text_dir + case_stats["case_code"] + ".txt", 'r') as file_object:
        case_text = concat_rows(file_object)
    case_text = remove_garbage(case_text)

    return {
        "case_code": case_stats["case_code"],
        "case_name": case_stats["case_name"],
        "url": case_stats["url"],
        "crim_or_tax": case_stats["crim_tax"],
        "case_text": case_text,
    }


# Each category's metadata list paired with the folder holding its case texts.
case_sources = (
    (crim_stats, "Data/criminal_cases/"),
    (tax_stats, "Data/tax_cases/"),
)

# Read at most 100 cases per category, but never index past the end of a list
# that holds fewer than 100 cases (that used to raise IndexError).
rows_to_read = min(100, max(len(crim_stats), len(tax_stats)))

cases_clean = []
for i in range(rows_to_read):
    for stats, text_dir in case_sources:
        if i < len(stats):
            cases_clean.append(_build_case_entry(stats[i], text_dir))
    

    

In [7]:
# Sanity check: total number of records collected in cases_clean.
len(cases_clean)

200

In [9]:
# Spot-check the first record's fields and its cleaned text.
cases_clean[0]

{'case_code': 'criminal2019SCC47',
 'case_name': 'R. v. Poulin',
 'url': 'https://scc-csc.lexum.com/scc-csc/scc-csc/en/item/17964/index.do',
 'crim_or_tax': 'criminal',
 'case_text': 'SUPREME COURT OF CANADA  Citation: R. v. Poulin, 2019 SCC 47 Appeal Heard: March 25, 2019 Judgment Rendered: October 11, 2019 Docket: 37994  Between: Her Majesty The Queen Appellant  and  Rosaire Poulin Respondent  - and -  Attorney General of Ontario, Association québécoise des avocats et avocates de la défense and Criminal Lawyers’ Association Interveners   Official English Translation: Reasons of Karakatsanis J.  Coram: Wagner C.J. and Abella, Moldaver, Karakatsanis, Côté, Brown and Martin JJ.  Reasons for Judgment: (paras. 1 to 121)  Dissenting Reasons: (paras. 122 to 156) Martin J. (Wagner C.J. and Moldaver and Côté JJ. concurring)  Karakatsanis J. (Abella and Brown JJ. concurring)  Note: This document is subject to editorial revision before its reproduction in final form in the Canada Supreme Court 

In [None]:
# Serialise the master list to a text file as a single JSON document.
with open("Data/all_cases_with_text.txt", 'w') as json_out:
    json.dump(cases_clean, json_out)

In [14]:
# Write the master list to a CSV file, one row per case.
with open('all_cases_csv.csv', 'w', newline='') as csvfile:
    fieldnames = ['case_code', 'case_name', 'url', 'crim_or_tax', 'case_text']
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    # Export stays best-effort: a row that cannot be written is skipped, but
    # it is recorded and reported below instead of disappearing silently.
    skipped_rows = []
    for row_number, item in enumerate(cases_clean):
        try:
            writer.writerow({fieldnames[0]: item['case_code'],
                             fieldnames[1]: item['case_name'],
                             fieldnames[2]: item['url'],
                             fieldnames[3]: item['crim_or_tax'],
                             fieldnames[4]: str(item['case_text'])})
        except (KeyError, UnicodeEncodeError, csv.Error) as err:
            # Only the failures expected from malformed records or characters
            # the output encoding cannot represent are tolerated; anything
            # else (including KeyboardInterrupt) now propagates.
            skipped_rows.append((row_number, item.get('case_code'), repr(err)))

if skipped_rows:
    print("Skipped", len(skipped_rows), "of", len(cases_clean),
          "rows while writing all_cases_csv.csv:")
    for row_number, case_code, reason in skipped_rows:
        print("  row", row_number, "case_code", case_code, "->", reason)