#### 1. Data Collection from CanLII API and Web Scraping of Case Decisions

In [None]:
# Import necessary libraries
import requests
import pandas as pd
import os
import re
import time
from datetime import datetime
from bs4 import BeautifulSoup

In [None]:
# 1.1 Adjust Display Options for Pandas DataFrame
# Each of these display limits is set to None so that rows, columns and
# cell contents are not truncated when a frame is printed.
for _display_option in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(_display_option, None)

In [None]:
# 1.2 Fetch API Key and Define Initial Parameters
# Name of the environment variable that holds the CanLII API key. It is kept
# in one place so the lookup and the error message always name the same variable.
API_KEY_ENV_VAR = 'canl2_api_key'
api_key = os.getenv(API_KEY_ENV_VAR)
if not api_key:
    raise ValueError(f"API key not found. Please set the '{API_KEY_ENV_VAR}' environment variable.")

# Define parameters for API call
language = 'en'
database_id = 'onhrt'  # Database ID for Ontario Human Rights Tribunal
start_date = '2021-01-01'
end_date = '2024-12-31'
offset = 0  # Index of the first result to request
result_count = 6000  # Number of results requested per API call

In [None]:
# 1.3 Function to Fetch Case Decisions from the API
def fetch_case_decisions(database_id, start_date, end_date, offset, result_count):
    """
    Fetch case decisions from CanLII API within the specified date range.

    Pages through the ``caseBrowse`` endpoint, advancing ``offset`` by
    ``result_count`` after every full page, and stops when a page is shorter
    than ``result_count`` or the response has no ``cases`` key. Uses the
    module-level ``language`` and ``api_key`` variables.

    Parameters:
    - database_id: ID of the legal database (e.g., 'onhrt' for Ontario Human Rights Tribunal)
    - start_date: Start date for case collection
    - end_date: End date for case collection
    - offset: Starting point for results
    - result_count: Number of results to fetch per request (must be positive)

    Returns:
    - List of cases

    Raises:
    - ValueError: if result_count is not positive (the paging loop could
      otherwise never advance or terminate)
    - requests.exceptions.HTTPError: if the API answers with an error status
    - requests.exceptions.RequestException: on connection failure or timeout
    """
    if result_count <= 0:
        raise ValueError("result_count must be a positive integer")

    url = f'https://api.canlii.org/v1/caseBrowse/{language}/{database_id}/'
    cases = []
    while True:
        # Let requests build and encode the query string.
        params = {
            'offset': offset,
            'resultCount': result_count,
            'decisionDateAfter': start_date,
            'decisionDateBefore': end_date,
            'api_key': api_key,
        }
        # A timeout prevents the loop from hanging forever on a stalled connection.
        response = requests.get(url, params=params, timeout=30)
        # Fail loudly instead of returning a silently truncated list of cases.
        response.raise_for_status()
        data = response.json()
        if 'cases' not in data:
            break
        page = data['cases']
        cases.extend(page)
        if len(page) < result_count:
            # A short page means there is nothing left to fetch.
            break
        offset += result_count
    return cases

In [None]:
# 1.4 Fetching Case Decisions and Creating DataFrame
# Collect the decisions for the configured parameters, then tabulate the
# returned records.
cases = fetch_case_decisions(
    database_id=database_id,
    start_date=start_date,
    end_date=end_date,
    offset=offset,
    result_count=result_count,
)
df = pd.DataFrame(data=cases)

In [None]:
# 1.5 Data Cleaning: Extract 'caseId' and Remove '(CanLII)' from Citations
# Dict-valued caseId entries are reduced to their 'en' value; every other
# value is kept unchanged.
df['caseId'] = df['caseId'].apply(
    lambda raw_id: raw_id if not isinstance(raw_id, dict) else raw_id.get('en')
)

def remove_canlii(citation):
    """
    Remove the '(CanLII)' text from citations.

    Any whitespace directly in front of the marker is removed with it.
    Values that are not strings (for example None or NaN from a missing
    citation) are returned unchanged instead of raising TypeError.
    """
    if not isinstance(citation, str):
        return citation
    return re.sub(r'\s*\(CanLII\)', '', citation)

# Replace the citation column with the result of remove_canlii on each value.
df['citation'] = df['citation'].apply(remove_canlii)

In [None]:
# 1.6 Checking for Null and NaN Values
columns_to_check = ['databaseId', 'caseId', 'title', 'citation']


def _is_float_nan(value):
    """Return True only for a floating-point NaN (not None or other missing markers)."""
    # NaN is the only float value that is not equal to itself.
    return isinstance(value, float) and value != value


# Every missing entry per column (None and NaN alike).
null_counts = df[columns_to_check].isnull().sum()
# Only the entries that are float NaN. pd.isna and pd.isnull are aliases, so a
# test of the form "isna and not isnull" can never be true and always counts 0.
nan_counts = df[columns_to_check].apply(lambda column: column.map(_is_float_nan)).sum()

print("Null values count per column:\n", null_counts)
print("\nNaN values count per column:\n", nan_counts)

In [None]:
# 1.7 Handling Missing Case IDs
# Both names receive the same selection: the rows whose caseId is missing.
null_caseId_rows = df.loc[df['caseId'].isna()]
nan_caseId_rows = df.loc[df['caseId'].isna()]

def create_caseId(citation):
    """
    Build a case ID from a citation by lower-casing it and dropping every
    space character (other whitespace is left in place).
    """
    lowered = citation.lower()
    return ''.join(lowered.split(' '))

# Fill only the rows whose caseId is missing, using IDs derived from their
# citations. nan_caseId_rows was selected from df, so the assigned Series
# carries the same row labels as the rows being filled.
df.loc[df['caseId'].isnull(), 'caseId'] = nan_caseId_rows['citation'].apply(create_caseId)

In [None]:
# 1.8 API Calls for Case Details
# Reading caseId from existing file
# Note: this rebinds `df` to the contents of the CSV file.
df = pd.read_csv(
    filepath_or_buffer=r'E:\Vocational\Lighthouse Labs\Flex Course\Projects\P06_Final Project\case_ids.csv'
)
caseId_list = df['caseId'].to_list()

In [None]:
# 1.9 Function to Retrieve Case Details from API
def get_case_details(caseId):
    """
    Retrieve the details of a single case from the CanLII API.

    Uses the module-level ``language``, ``database_id`` and ``api_key``
    variables to build the request.

    Parameters:
    - caseId: ID of the case to look up

    Returns:
    - The decoded JSON response, or None if the request failed (the error
      is printed rather than raised).
    """
    # The module-level variable is spelled `database_id`; `database_Id` is undefined.
    url = f"https://api.canlii.org/v1/caseBrowse/{language}/{database_id}/{caseId}/?api_key={api_key}"
    try:
        # Without a timeout requests waits indefinitely and the Timeout
        # handler below could never run.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.json()
    except requests.exceptions.HTTPError as errh:
        print("Http Error:", errh)
    except requests.exceptions.ConnectionError as errc:
        print("Error Connecting:", errc)
    except requests.exceptions.Timeout as errt:
        print("Timeout Error:", errt)
    except requests.exceptions.RequestException as err:
        print("Oops: Something Else", err)
    return None

In [None]:
# 1.10 Making API Calls and Saving Results
# Shared list that make_api_calls() appends its results to.
cases_details = list()
# Split the IDs into the first 5000 and everything after them.
caseId_list_5000, caseId_list_383 = caseId_list[:5000], caseId_list[5000:]

def make_api_calls(case_ids, delay=1):
    """
    Make API calls to retrieve case details for the provided list of caseIds.

    Every successful response is appended to the module-level
    ``cases_details`` list; lookups that return None are skipped.

    Parameters:
    - case_ids: iterable of case IDs to look up
    - delay: seconds to pause after each request (default 1); adjust to the
      API rate limit policy
    """
    for caseId in case_ids:
        case_details = get_case_details(caseId)
        if case_details is not None:
            cases_details.append(case_details)
        # Pause after every request, successful or not, to throttle the calls.
        time.sleep(delay)

In [None]:
# First 5000 API calls
print("Making first 5000 API calls...")
make_api_calls(caseId_list_5000)

# Saving results of the first 5000
# The file name must be 'metadata_5000.csv': that is the file step 1.11 reads back.
cases_df_5000 = pd.DataFrame(cases_details)
cases_df_5000.to_csv(r'E:\Vocational\Lighthouse Labs\Flex Course\Projects\P06_Final Project\metadata_5000.csv', index=False)
print("First 5000 API calls completed and saved.")

In [None]:
# Final 383 API calls
print("Making final 383 API calls...")
# cases_details may still hold the results of the first batch. Remember where
# this batch starts so that only its own records are saved; otherwise the
# earlier records would be written again and duplicated when the files are combined.
batch_start = len(cases_details)
make_api_calls(caseId_list_383)
cases_df_383 = pd.DataFrame(cases_details[batch_start:])
cases_df_383.to_csv(r'E:\Vocational\Lighthouse Labs\Flex Course\Projects\P06_Final Project\metadata_383.csv', index=False)

In [None]:
# 1.11 Combining DataFrames and Final Output
# Reload both saved batches from disk, stack them in order and write the
# merged table without the index column.
cases_df_5000 = pd.read_csv(
    filepath_or_buffer=r'E:\Vocational\Lighthouse Labs\Flex Course\Projects\P06_Final Project\metadata_5000.csv'
)
cases_df_383 = pd.read_csv(
    filepath_or_buffer=r'E:\Vocational\Lighthouse Labs\Flex Course\Projects\P06_Final Project\metadata_383.csv'
)
cases_df_combined = pd.concat(objs=[cases_df_5000, cases_df_383])
cases_df_combined.to_csv(
    path_or_buf=r'E:\Vocational\Lighthouse Labs\Flex Course\Projects\P06_Final Project\metadata.csv',
    index=False,
)
print("All API calls completed and combined.")

In [None]:
# 1.12 Web Scraping Case Content
# Function to fetch case content from URL
def fetch_case_content(url):
    """
    Download a case page and return the text of its main document element.

    Parameters:
    - url: address of the case page

    Returns:
    - The text of the ``div`` with id ``originalDocument``, with text
      fragments joined by newlines
    - "Main content not found" if the page has no such element
    - "Rate limit hit" if the server answered with HTTP 429
    - None on any other request failure (the error is printed)
    """
    try:
        # A timeout keeps a stalled connection from blocking the whole run.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Read the status from the exception: `response` is unbound when
        # requests.get() itself fails (e.g. connection error or timeout).
        # Compare with None explicitly, since an error Response is falsy.
        failed_response = getattr(e, 'response', None)
        if failed_response is not None and failed_response.status_code == 429:
            return "Rate limit hit"
        print(f"Failed to fetch {url}: {e}")
        return None

    soup = BeautifulSoup(response.text, 'html.parser')
    original_document_div = soup.find('div', {'id': 'originalDocument', 'class': 'solexHlZone lbh-document'})
    if not original_document_div:
        return "Main content not found"
    return original_document_div.get_text(separator='\n')

In [None]:
# 1.13 Saving Case Content
def save_case_content(case_number, content, year=2022):
    """
    Write the text of one case to ``cases/<year>hrto<case_number>.txt``.

    Parameters:
    - case_number: number of the case within its year
    - content: text to write (UTF-8 encoded)
    - year: year used in the file name. Defaults to 2022 so the file name
      agrees with the 2022 URLs built by generate_case_urls() and the
      '2022hrto' prefix stripped in scrape_cases(); the previously
      hard-coded '2024' mislabelled those files.
    """
    # Create the output directory on demand so the function works on its own.
    os.makedirs('cases', exist_ok=True)
    with open(f"cases/{year}hrto{case_number}.txt", 'w', encoding='utf-8') as file:
        file.write(content)

In [None]:
# 1.14 Generating Case URLs and Scraping
def generate_case_urls(start_case=1, end_case=966, year=2022):
    """
    Build the CanLII page URLs for a consecutive range of HRTO case numbers.

    Parameters:
    - start_case: first case number (inclusive)
    - end_case: last case number (inclusive)
    - year: decision year used in the URL path and document name (default 2022)

    Returns:
    - List of URLs, one per case number; empty if start_case > end_case
    """
    base_url = f"https://www.canlii.org/en/on/onhrt/doc/{year}/"
    document_prefix = f"{year}hrto"
    return [
        f"{base_url}{document_prefix}{case_number}/{document_prefix}{case_number}.html"
        for case_number in range(start_case, end_case + 1)
    ]

def scrape_cases(urls):
    """
    Fetch every case page in ``urls`` and save its text under ``cases/``.

    Stops at the first "Rate limit hit" result. The number of the last case
    that was saved is written to ``last_successful_case.txt``, even when the
    run is interrupted, so a later run can resume after it.

    Parameters:
    - urls: iterable of case page URLs whose last path segment has the form
      ``<year>hrto<number>.html``
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs('cases', exist_ok=True)
    last_successful_case = None
    try:
        for url in urls:
            file_name = url.split('/')[-1]
            # Strip the '<year>hrto' prefix and the extension to get the bare number.
            case_number = re.sub(r'^\d{4}hrto', '', file_name).replace('.html', '')
            print(f"Fetching case {case_number}: {url}")
            case_content = fetch_case_content(url)
            if case_content == "Rate limit hit":
                print(f"Rate limit hit at case {case_number}. Stopping...")
                break
            elif case_content == "Main content not found":
                # This sentinel is a non-empty string; without this branch it
                # would be written to disk as if it were the case text.
                print(f"Main content not found for case {case_number}; nothing saved")
            elif case_content:
                save_case_content(case_number, case_content)
                print(f"Saved case {case_number}")
                last_successful_case = case_number
            else:
                print(f"Failed to save case {case_number}")
    finally:
        # Persist the checkpoint even if the loop is interrupted or raises.
        if last_successful_case:
            with open('last_successful_case.txt', 'w') as file:
                file.write(last_successful_case)

# Starting scraping process
# Resume after the last saved case when a checkpoint file exists.
start_case = 1
if os.path.exists('last_successful_case.txt'):
    with open('last_successful_case.txt', 'r') as file:
        checkpoint = file.read().strip()
    try:
        start_case = int(checkpoint) + 1
    except ValueError:
        # An empty or corrupt checkpoint should not crash the run.
        print(f"Ignoring unreadable checkpoint {checkpoint!r}; starting from case {start_case}")

urls = generate_case_urls(start_case=start_case)
print(urls)








