In [46]:
import json
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
import csv
import pandas as pd

In [22]:
#SETUP
# Folder prefixes for the per-case text files (one "<case_code>.txt" per case)
crim_cases_folder = "Data/criminal_cases/"
tax_cases_folder = "Data/tax_cases/"

# Criminal cases: load the JSON metadata and pull out every case code
with open("Data/criminal_cases_info.txt") as json_file:
    crim_stats = json.load(json_file)
crim_case_codes = [entry["case_code"] for entry in crim_stats]

# Tax cases: same procedure on the tax metadata file
with open("Data/tax_cases_info.txt") as json_file:
    tax_stats = json.load(json_file)
tax_case_codes = [entry["case_code"] for entry in tax_stats]

# Word -> occurrence-count mappings, filled in further down:
#   crim_master_word_dict: every word that exists in the crim corpus
#   tax_master_word_dict:  every word that exists in the tax corpus
#   all_master_word_dict:  presumably both corpora combined; nothing in the
#                          cells shown here populates it -- TODO confirm
crim_master_word_dict = {}
tax_master_word_dict = {}
all_master_word_dict = {}

In [23]:
##SETUP
def concat_rows(txt_file):
    """Join every row of a case file into one string.

    Args:
        txt_file: An iterable of strings, e.g. an open text file object
            (which yields one line per iteration).

    Returns:
        A single string made of each row with its leading and trailing
        whitespace stripped, each followed by exactly one space. The result
        therefore ends with a space when there is at least one row, and is
        "" when there are none.
    """
    # str.join assembles the result in one pass; repeatedly doing
    # `text = text + row` re-copies the growing string on every row.
    return "".join(row.strip() + " " for row in txt_file)

def remove_garbage(all_text, garbage=("\xa0",)):
    """Replace unwanted character sequences in a text with a single space.

    Args:
        all_text: The full text to clean, e.g. the string returned by
            concat_rows().
        garbage: Iterable of literal substrings to remove. Defaults to just
            the non-breaking space (U+00A0).

    Returns:
        all_text with every occurrence of each garbage item replaced by " ".
    """
    # Substitute a space (not an empty string) so neighbouring words are not
    # glued together, e.g. the text sometimes contains "McLachlin\xa0C.J."
    for item in garbage:
        # str.replace matches the item literally, so entries containing
        # regex metacharacters (".", "(", "*", ...) are handled correctly.
        all_text = all_text.replace(item, " ")
    return all_text

# English stopwords from the NLTK corpus, held in a set so the per-token
# "not in stop_words" filter below is a fast membership test.
# NOTE: the NLTK "stopwords" corpus must already be available locally.
stop_words = set(stopwords.words('english')) 

In [60]:
def count_case_words(case_codes, cases_folder):
    """Count the filtered words across every case file of one corpus.

    Each case is read from "<cases_folder><case_code>.txt", flattened to one
    string with concat_rows(), cleaned with remove_garbage(), and tokenized
    with word_tokenize(). Tokens are lower-cased, then English stopwords and
    words of 3 characters or fewer are dropped.

    Args:
        case_codes: Iterable of case-code strings, one per file to read.
        cases_folder: Folder prefix (including the trailing slash) that the
            case files live in.

    Returns:
        A new dict mapping each remaining word to the number of times it
        occurs across all of the files.
    """
    word_counts = {}
    for case in case_codes:
        filepath = cases_folder + case + ".txt"

        # NOTE(review): the file is opened with the platform default encoding,
        # exactly as before -- confirm the case files match it.
        with open(filepath, 'r') as file_object:
            # turn it all into one string, then remove garbage like "\xa0"
            case_text = remove_garbage(concat_rows(file_object))

        for token in word_tokenize(case_text):
            word = token.lower()
            # skip stopwords; only keep words greater than 3 letters
            if word in stop_words or len(word) <= 3:
                continue
            # dict.get supplies 0 for unseen words, so no try/except is needed
            # and unrelated errors are no longer swallowed by a bare except.
            word_counts[word] = word_counts.get(word, 0) + 1
    return word_counts


# Rebuild both master dicts from scratch so that re-running this cell does
# not add a second round of counts on top of the previous run's totals.
crim_master_word_dict = count_case_words(crim_case_codes, crim_cases_folder)
tax_master_word_dict = count_case_words(tax_case_codes, tax_cases_folder)

# Export the counts as a long-format CSV: one row per (corpus, word) pair,
# all "crim" rows first, followed by all "tax" rows.
with open("Data/wordcloud_counts.csv", "w", newline="") as outfile:
    writer = csv.writer(outfile)
    writer.writerow(["crim_or_tax", "word", "count"])
    labelled_counts = (
        ("crim", crim_master_word_dict),
        ("tax", tax_master_word_dict),
    )
    for label, counts in labelled_counts:
        writer.writerows([label, word, total] for word, total in counts.items())
    