In [3]:
# Dependencies
from bs4 import BeautifulSoup
from bs4.element import Comment
import requests
import csv
import re
from time import sleep
from random import randint
import json

In [4]:
# URL helpers for individual case pages.
# The navigation pages (e.g. the page for 2016) list hyperlinks to the SCC cases of that year.
# Those hyperlinks are collected beforehand; the helpers below turn each one into the URLs used to scrape the actual case.
def buildurl(hyperlink):
    """Return the site root + `hyperlink` + the '?iframe=true' query string.

    `hyperlink` is converted with str(), so non-string values are accepted.
    """
    pieces = ('https://scc-csc.lexum.com', str(hyperlink), '?iframe=true')
    return "".join(pieces)

def builduserurl(hyperlink):
    """Return the site root + `hyperlink`, without any query string.

    `hyperlink` is converted with str(), so non-string values are accepted.
    """
    site_root = 'https://scc-csc.lexum.com'
    return site_root + str(hyperlink)

def print_to_txt(curr_case_code,soup_object):
    """Append the paragraph text of a case page to Data/criminal_cases/<curr_case_code>.txt.

    Args:
        curr_case_code: String used as the file stem, e.g. "criminal2014SCC22".
        soup_object: Parsed page whose ``body`` exposes ``find_all("p")``
            (a BeautifulSoup object in this notebook).

    The text of every ``<p>`` element is right-stripped and written followed by a
    single space. (Historical note carried over from the original code: the
    separating space was only added after "tcc145", so files written before that
    point lack it.)

    The file is opened in append mode, so calling this twice with the same case
    code writes the text twice.
    """
    import os  # local import: the notebook's import cell does not bring in os

    output_dir = "Data/criminal_cases/"
    filepath = output_dir + curr_case_code + ".txt"

    print("curr_case_code: "+curr_case_code)

    # Create the target folder on first use instead of failing with FileNotFoundError.
    os.makedirs(output_dir, exist_ok=True)

    # Explicit UTF-8: with the platform default codec, a paragraph containing a
    # character that codec cannot encode made write() raise, and the former bare
    # `except: pass` silently dropped that paragraph. errors="replace" keeps the
    # write best-effort for the rare text UTF-8 cannot encode (lone surrogates).
    with open(filepath, 'a', encoding="utf-8", errors="replace") as file_object:
        for paragraph in soup_object.body.find_all("p"):
            file_object.write(paragraph.text.rstrip())
            file_object.write(" ")

In [9]:
cases_folder_name = "Data/"
cases_file_name = "criminal_cases_info.txt"

# Resume support: load the metadata of the cases already scraped.
# The scraping loop appends one dict per case to this list and dumps it back to the same file.
try:
    with open(cases_folder_name + cases_file_name) as json_file:
        case_dict_list = json.load(json_file)
except (FileNotFoundError, json.JSONDecodeError):
    # No file yet (first run) or an empty/unparseable file: start from scratch.
    # Anything else (permission errors, KeyboardInterrupt, ...) is allowed to
    # surface instead of silently resetting the progress to zero.
    case_dict_list = []

# Number of cases already scraped; used as the starting offset into `hyperlinks`.
len_cases_scraped = len(case_dict_list)

# Load the hyperlinks of the cases to scrape.
with open("Data/hyperlinks/criminal_hyperlinks.txt") as json_file:
    hyperlinkdict = json.load(json_file)

hyperlinks = hyperlinkdict["hyperlinks_criminal"]

# Maximum number of cases fetched per run of this cell.
BATCH_SIZE = 51
# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 30

# Scrape the next batch of cases, starting right after the ones already recorded.
# Slicing (instead of indexing with len_cases_scraped+i) ends the loop cleanly when
# fewer than BATCH_SIZE hyperlinks remain, rather than raising IndexError.
for hyperlink in hyperlinks[len_cases_scraped:len_cases_scraped + BATCH_SIZE]:
    url = buildurl(hyperlink)
    url_user = builduserurl(hyperlink)
    print(url_user)

    # Without a timeout a stalled connection would hang the cell indefinitely.
    response = requests.get(url, timeout=REQUEST_TIMEOUT)

    print(response.status_code)
    if not response.ok:
        # Prints "429 Error" / "403 Error" exactly as before, and now also stops on
        # any other HTTP error instead of trying to parse an error page. Stopping
        # (not skipping) matters: resuming is offset-based, so every processed
        # hyperlink must add exactly one entry to case_dict_list.
        print(str(response.status_code) + " Error")
        break

    #make soup object
    soup_object = BeautifulSoup(response.text, 'html.parser')

    #collect table stats: first cell of each row is the key, second cell the value
    case_table = soup_object.body.find("table")
    info_sub_dict = {}
    for row in case_table.find_all("tr"):
        cells = [cell.text for cell in row.find_all("td")]
        if len(cells) < 2:
            # A row without two <td> cells has no key/value pair to record
            # (previously this raised IndexError).
            continue
        info_sub_dict[cells[0]] = cells[1].strip()

    #get case_code: prefer the neutral citation (year + 3-character court code + number),
    #then fall back to "Report", "Recueil" and finally "Case number".
    citation_match = re.search(r'(\d\d\d\d)(\s?)(\w\w\w)(\s?)(\d\d?\d?)',
                               info_sub_dict.get("Neutral citation", ""))
    if citation_match:
        case_code = "criminal" + citation_match[1] + citation_match[3] + citation_match[5]
    elif "Report" in info_sub_dict:
        case_code = "criminal" + info_sub_dict["Report"]
    elif "Recueil" in info_sub_dict:
        case_code = "criminal" + info_sub_dict["Recueil"] + " FRENCH"
    else:
        # As before, a page with none of these fields raises KeyError here.
        case_code = "criminal" + info_sub_dict["Case number"]

    info_sub_dict.update({"case_code":case_code})

    #get case_name
    case_name = soup_object.body.find("h3", class_="title").text
    info_sub_dict.update({"case_name":case_name})

    #record the user-facing url and the category
    info_sub_dict.update({"url":url_user})
    info_sub_dict.update({"crim_tax":"criminal"})

    #update info_dict_list
    case_dict_list.append(info_sub_dict)

    #print case contents to file
    print_to_txt(case_code,soup_object)

    print("~~")

    #checkpoint the metadata after every case (before sleeping) so an interrupted
    #run resumes from the last completed case
    with open(cases_folder_name + cases_file_name, 'w') as outfile:
        json.dump(case_dict_list, outfile)

    #short pause between requests
    sleep(0.5)

https://scc-csc.lexum.com/scc-csc/scc-csc/en/item/13561/index.do
200
curr_case_code: criminal2014SCC22
~~
https://scc-csc.lexum.com/scc-csc/scc-csc/en/item/13511/index.do
200
curr_case_code: criminal2014SCC19
~~
https://scc-csc.lexum.com/scc-csc/scc-csc/en/item/13487/index.do
200
curr_case_code: criminal2014SCC16
~~
https://scc-csc.lexum.com/scc-csc/scc-csc/en/item/13503/index.do
200
curr_case_code: criminal2014SCC18
~~
https://scc-csc.lexum.com/scc-csc/scc-csc/en/item/13504/index.do
200
curr_case_code: criminal2014SCC17
~~
https://scc-csc.lexum.com/scc-csc/scc-csc/en/item/13486/index.do
200
curr_case_code: criminal2014SCC15
~~
https://scc-csc.lexum.com/scc-csc/scc-csc/en/item/13485/index.do
200
curr_case_code: criminal2014SCC14
~~
https://scc-csc.lexum.com/scc-csc/scc-csc/en/item/13429/index.do
200
curr_case_code: criminal2014SCC10
~~
https://scc-csc.lexum.com/scc-csc/scc-csc/en/item/13426/index.do
200
curr_case_code: criminal2014SCC6
~~
https://scc-csc.lexum.com/scc-csc/scc-csc/en/it