#### 2. Data Preprocessing for CanLII Case Decisions

In [None]:
# 2.1 Import Necessary Libraries
import pandas as pd
import chardet
import re
import os
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [None]:
# 2.2 Download NLTK Resources
# Corpora used later in the notebook: stopword list, WordNet (lemmatizer)
# and the Punkt tokenizer models used by nltk.word_tokenize.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
# Newer NLTK releases load the tokenizer data from 'punkt_tab' instead of
# 'punkt'; fetching both keeps word_tokenize working on old and new versions.
nltk.download('punkt_tab')

In [None]:
# 2.3 Adjust Display Options for DataFrame
# Set the row, column and column-width display limits to None (no limit).
for display_option in ('display.max_rows', 'display.max_columns', 'display.max_colwidth'):
    pd.set_option(display_option, None)

#### Cases

In [None]:
# 2.4 Define Folder Path for Case Files
# Directory scanned for the case '.txt' files by the loading loop.
# NOTE(review): absolute, machine-specific Windows path — adjust before running elsewhere.
folder_path = r'E:\Vocational\Lighthouse Labs\Flex Course\Projects\P06_Final Project\cases'

In [None]:
# 2.5 Initialize Empty List to Store Case Details
# Accumulates one {'caseId': ..., 'caseContent': ...} dict per loaded file.
# NOTE(review): re-running the loading cell without re-running this one
# appends every case a second time.
case_list = []

In [None]:
# 2.6 Function to Detect File Encoding
def detect_encoding(file_path):
    """
    Return the encoding chardet reports for the raw bytes of *file_path*.

    The whole file is read in binary mode and handed to ``chardet.detect``;
    the value stored under the result's ``'encoding'`` key is returned as-is.
    """
    with open(file_path, 'rb') as handle:
        detection = chardet.detect(handle.read())
    return detection['encoding']

In [None]:
# 2.7 Iterate Over Files in the Folder and Load Case Content
# os.listdir() returns names in arbitrary order; sorting keeps the row order
# of the resulting DataFrame reproducible between runs and machines.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.txt'):
        continue
    file_path = os.path.join(folder_path, filename)
    # Files may use different encodings, so detect each one before decoding.
    encoding = detect_encoding(file_path)
    with open(file_path, 'r', encoding=encoding) as file:
        content = file.read()
    case_id = os.path.splitext(filename)[0]  # Remove .txt extension
    # file.read() already returns str, so no extra conversion is needed.
    case_list.append({'caseId': case_id, 'caseContent': content})

In [None]:
# 2.8 Convert List to DataFrame
# Build the frame and cast 'caseContent' to string type in a single step.
cases = pd.DataFrame(case_list).astype({'caseContent': str})

In [None]:
# 2.9 Display Basic Information About DataFrame
# Column names, dtypes and non-null counts
cases.info()

# First row only
cases.iloc[:1]

In [None]:
# 2.10 Save Initial Case DataFrame to CSV
output_path = r'E:\Vocational\Lighthouse Labs\Flex Course\Projects\P06_Final Project\data\cases.csv'
cases.to_csv(output_path, index=False, encoding='utf-8')

# Reload the saved DataFrame for further processing.
# The path variable is reused so the save and the reload cannot drift apart.
# keep_default_na=False keeps an empty case text as '' instead of NaN, so the
# string-based cleaning step that follows receives str values only.
cases = pd.read_csv(output_path, keep_default_na=False)

#### Clean Summary

In [None]:
# 2.11 Function to Clean Text Content
def clean_text(text):
    """
    Normalise raw case text to a single line.

    Underscores are removed first, then every run of whitespace (including
    newlines) is collapsed to one space and the ends are trimmed. Removing
    underscores before collapsing avoids the double spaces that a ruled line
    such as ``"a ___ b"`` would otherwise leave behind.

    Non-string input (for example the NaN pandas produces for an empty CSV
    field) yields an empty string instead of raising.
    """
    if not isinstance(text, str):
        return ''
    without_rules = text.replace('_', '')
    return re.sub(r'\s+', ' ', without_rules).strip()

# Store the cleaned text of every case in a new 'cleanSummary' column
cases['cleanSummary'] = cases['caseContent'].map(clean_text)

# The raw text is no longer needed once the cleaned version exists
del cases['caseContent']

In [None]:
# 2.12 Save Cleaned DataFrame to CSV (Optional)
output_path = r'E:\Vocational\Lighthouse Labs\Flex Course\Projects\P06_Final Project\data\cleanSummary.csv'
cases.to_csv(output_path, index=False, encoding='utf-8')

# Reload the cleaned DataFrame for further processing.
# The path variable is reused so the save and the reload cannot drift apart.
# keep_default_na=False keeps an empty summary as '' instead of NaN, so the
# regex-based splitting step that follows receives str values only.
cases = pd.read_csv(output_path, keep_default_na=False)

#### Case Details and Reasoning

In [None]:
# 2.13 Function to Separate Case Details and Reasoning
def separate_case_details(text):
    """
    Split a case text into its header details and its numbered reasoning.

    The split happens at the first paragraph marker ``[1]`` (whitespace
    inside the brackets is tolerated). Everything before the marker is the
    case details; the marker and everything after it is the reasoning.

    Returns a ``(case_details, reasoning)`` tuple of stripped strings.
    ``reasoning`` is ``''`` when no marker is present. Non-string input
    (for example NaN from an empty CSV field) yields ``('', '')``.
    """
    if not isinstance(text, str):
        return '', ''

    parts = re.split(r'\[\s*1\s*\]', text, maxsplit=1)
    case_details = parts[0].strip()
    if len(parts) == 1:
        return case_details, ''

    # Re-attach a normalised marker, separated from the first word by a space;
    # the outer strip drops that space when nothing follows the marker.
    reasoning = ('[1] ' + parts[1].strip()).strip()
    return case_details, reasoning

# Split every cleaned summary, then unzip the (details, reasoning) pairs
# into one column each
split_pairs = cases['cleanSummary'].map(separate_case_details)
details_column, reasoning_column = zip(*split_pairs)
cases['caseDetails'] = details_column
cases['reasoning'] = reasoning_column

# The combined text is redundant once both parts are stored
del cases['cleanSummary']

# Column overview after the split
cases.info()

# First row only
cases.iloc[:1]

In [None]:
# 2.14 Save Separate Case Details and Reasoning to CSV
# Each target file receives 'caseId' plus exactly one of the two text columns.
export_targets = (
    (r'E:\Vocational\Lighthouse Labs\Flex Course\Projects\P06_Final Project\data\reasoning.csv', 'reasoning'),
    (r'E:\Vocational\Lighthouse Labs\Flex Course\Projects\P06_Final Project\data\caseDetails.csv', 'caseDetails'),
)
for output_path, export_column in export_targets:
    cases[['caseId', export_column]].to_csv(output_path, index=False, encoding='utf-8')

#### Applicant Data

In [None]:
# 2.15 Extract Applicant Names from Case Details
def extract_applicant(text):
    """
    Return the text between the 'BETWEEN' heading and the applicant label.

    The heading may be letter-spaced ("B E T W E E N") and may be followed
    by colons or whitespace. If the heading is missing, '' is returned. If
    no 'Applicant' label follows it, everything after the heading is
    returned. The result is stripped of surrounding whitespace.
    """
    heading = re.search(r'(B\s*E\s*T\s*W\s*E\s*E\s*N\s*[:\s]*)', text)
    if heading is None:
        return ''

    remainder = text[heading.end():]
    label = re.search(r'(Applicant|Applicants)', remainder)
    name = remainder if label is None else remainder[:label.start()]
    return name.strip()

# Derive the applicant column from the case header text
cases['applicant'] = cases['caseDetails'].map(extract_applicant)

# Keep only the id, the new column and its source text, in that order
applicant_columns = ['caseId', 'applicant', 'caseDetails']
cases = cases[applicant_columns]

In [None]:
# 2.16 Save Applicant Data to CSV
# Write the applicant table without the index column, encoded as UTF-8
output_path = r'E:\Vocational\Lighthouse Labs\Flex Course\Projects\P06_Final Project\data\applicant.csv'
cases.to_csv(path_or_buf=output_path, encoding='utf-8', index=False)

#### Respondent Data

In [None]:
# 2.17 Extract Respondent Names from Case Details
def extract_respondent(text):
    """
    Return the text between the "and" separator and the respondent label.

    The separator is a standalone "and" (any case, optionally letter-spaced
    or wrapped in hyphens, e.g. "-and-" or "A N D"). It must not be part of
    a longer word, so names such as "Sandra" or "Susan Doe" are not mistaken
    for it. When an applicant label is present the separator is looked for
    after that label first, so an "and" joining several applicants is
    skipped; if none is found there, the whole text is searched.

    Returns '' when no separator exists. When no 'Respondent' label follows
    the separator, everything after the separator is returned. The result is
    stripped of surrounding whitespace.
    """
    # The lookarounds reject matches that start or end inside a word.
    separator = re.compile(r'(?<!\w)a\s*n\s*d(?!\w)\s*-?\s*', re.IGNORECASE)

    applicant_label = re.search(r'Applicants?', text, re.IGNORECASE)
    search_from = applicant_label.end() if applicant_label else 0

    # Fall back to the whole text if the label sits after the separator.
    and_match = separator.search(text, search_from) or separator.search(text)
    if and_match is None:
        return ''

    remainder = text[and_match.end():]
    respondent_label = re.search(r'Respondents?', remainder, re.IGNORECASE)
    if respondent_label is not None:
        remainder = remainder[:respondent_label.start()]
    return remainder.strip()

# Derive the respondent column from the case header text
cases['respondent'] = cases['caseDetails'].map(extract_respondent)

# Keep only the id, the new column and its source text, in that order
respondent_columns = ['caseId', 'respondent', 'caseDetails']
cases = cases[respondent_columns]

In [None]:
# 2.18 Save Respondent Data to CSV
# Write the respondent table without the index column, encoded as UTF-8
output_path = r'E:\Vocational\Lighthouse Labs\Flex Course\Projects\P06_Final Project\data\respondent.csv'
cases.to_csv(path_or_buf=output_path, encoding='utf-8', index=False)

#### Adjudicator Data

In [None]:
# 2.19 Extract Adjudicator Names from Case Details
def extract_adjudicator(text):
    """
    Return the text found between 'Adjudicator:' and the next 'Date:'.

    The match is case-sensitive and stops at the first 'Date:' after the
    label. The captured name is stripped; '' is returned when the two
    labels are not both present in that order.
    """
    found = re.search(r'Adjudicator:\s*(.*?)\s*Date:', text)
    return found.group(1).strip() if found else ''

# Derive the adjudicator column from the case header text
cases['adjudicator'] = cases['caseDetails'].map(extract_adjudicator)

# Keep only the id, the new column and its source text, in that order
adjudicator_columns = ['caseId', 'adjudicator', 'caseDetails']
cases = cases[adjudicator_columns]

In [None]:
# 2.20 Save Adjudicator Data to CSV
# Write the adjudicator table without the index column, encoded as UTF-8
output_path = r'E:\Vocational\Lighthouse Labs\Flex Course\Projects\P06_Final Project\data\adjudicator.csv'
cases.to_csv(path_or_buf=output_path, encoding='utf-8', index=False)

#### Decision Type

In [None]:
# 2.21 Extract Decision Type from Case Details
def extract_decision_type(text):
    """
    Return the text between the respondent label and the adjudicator label.

    Both labels are matched case-insensitively and may be singular or
    plural; the adjudicator label must be followed by a colon. The captured
    text is stripped. 'UNKNOWN' is returned when the labels are not found
    in that order.
    """
    found = re.search(r'Respondent[s]?\s*(.*?)\s*Adjudicator[s]?:', text, flags=re.IGNORECASE)
    if found is None:
        return 'UNKNOWN'
    return found.group(1).strip()

# Derive the decision type column from the case header text
cases['decisionType'] = cases['caseDetails'].map(extract_decision_type)

In [None]:
# 2.22 Save Decision Type Data to CSV
# Write the current frame without the index column, encoded as UTF-8
output_path = r'E:\Vocational\Lighthouse Labs\Flex Course\Projects\P06_Final Project\data\decision.csv'
cases.to_csv(path_or_buf=output_path, encoding='utf-8', index=False)

#### Written Submission and Appearances

In [None]:
# 2.23 Search for Written Submissions and Appearances
# One boolean column per phrase group, matched case-insensitively against the case details
header_flags = (
    ('writtenSubmission', 'written submission|written submissions'),
    ('appearances', 'APPEARANCES|Appearances'),
)
for flag_column, search_pattern in header_flags:
    cases[flag_column] = cases['caseDetails'].str.contains(search_pattern, case=False, regex=True)

In [None]:
# 2.24 Save Written Submission and Appearances Data to CSV
# Write the current frame without the index column, encoded as UTF-8
output_path = r'E:\Vocational\Lighthouse Labs\Flex Course\Projects\P06_Final Project\data\written.csv'
cases.to_csv(path_or_buf=output_path, encoding='utf-8', index=False)

#### Protected Grounds & Areas

In [None]:
# 2.25 Feature Extraction from Reasoning Column (e.g., Age, Creed, Race)
cases = pd.read_csv(r'E:\Vocational\Lighthouse Labs\Flex Course\Projects\P06_Final Project\data\reasoning.csv')

# Protected grounds to flag; each becomes a boolean column named after the keyword
keywords = ['age', 'ancestry', 'citizenship', 'colour', 'creed', 'disability', 'ethnic origin', 'family status', 
            'gender identity', 'marital status', 'place of origin', 'public assistance', 'race', 'sexual orientation']

for keyword in keywords:
    # Match the keyword as a whole word/phrase only. A plain substring search
    # flags 'age' inside "damages" or "page" and 'race' inside "trace".
    whole_word = r'\b' + re.escape(keyword) + r'\b'
    # na=False: a case with no reasoning text gets False rather than NaN,
    # so every flag column stays strictly boolean.
    cases[keyword] = cases['reasoning'].str.contains(whole_word, case=False, regex=True, na=False)

In [None]:
# 2.26 Save Extracted Features to CSV
# Write the reasoning text plus keyword flags without the index column, encoded as UTF-8
output_path = r'E:\Vocational\Lighthouse Labs\Flex Course\Projects\P06_Final Project\data\protectedQA.csv'
cases.to_csv(path_or_buf=output_path, encoding='utf-8', index=False)

#### Reasoning

In [None]:
# 2.27 Text Preprocessing: Removing Punctuation, Numbers, and Lemmatizing
# Maps every ASCII punctuation character to a space. Deleting punctuation
# outright would fuse neighbouring words ("and/or" -> "andor").
_PUNCTUATION_TO_SPACE = str.maketrans(string.punctuation, ' ' * len(string.punctuation))


def preprocess_text(text):
    """
    Lowercase *text*, strip punctuation and digits, and normalise whitespace.

    Punctuation is replaced by spaces so that words joined by '/', '-' or an
    apostrophe stay separate tokens. Digits are removed. Runs of whitespace
    are then collapsed to single spaces and the ends trimmed.

    Non-string input (for example NaN from an empty CSV field) yields ''.
    """
    if not isinstance(text, str):
        return ''
    text = text.lower().translate(_PUNCTUATION_TO_SPACE)
    text = re.sub(r'\d+', '', text)
    # split()/join collapses the extra spaces left by the two steps above.
    return ' '.join(text.split())

# Normalise the raw reasoning text in place
cases['reasoning'] = cases['reasoning'].map(preprocess_text)

# Shared helpers for the lemmatization step: a WordNet lemmatizer and the
# English stopword list as a set
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def tokenize_and_lemmatize(text):
    """
    Return *text* with stopwords dropped and remaining tokens lemmatized.

    The text is tokenized with ``nltk.word_tokenize``; tokens found in the
    module-level ``stop_words`` set are skipped, the rest are passed through
    the module-level ``lemmatizer`` and joined back with single spaces.
    """
    kept_tokens = []
    for token in nltk.word_tokenize(text):
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_tokens)

# Replace each reasoning text with its stopword-free, lemmatized form
cases['reasoning'] = cases['reasoning'].map(tokenize_and_lemmatize)

In [None]:
# 2.28 Save Preprocessed Reasoning Data to CSV
# Write the current frame without the index column, encoded as UTF-8
output_path = r'E:\Vocational\Lighthouse Labs\Flex Course\Projects\P06_Final Project\data\reasoningQA.csv'
cases.to_csv(path_or_buf=output_path, encoding='utf-8', index=False)

#### Master Dataset

In [None]:
# 2.29 Merge All DataFrames to Create a Master DataFrame
# Load other dataframes for merging
adjudicator = pd.read_csv(r'E:\Vocational\Lighthouse Labs\Flex Course\Projects\P06_Final Project\data\adjudicatorQA.csv')
applicant = pd.read_csv(r'E:\Vocational\Lighthouse Labs\Flex Course\Projects\P06_Final Project\data\applicantQA.csv')
decisionType = pd.read_csv(r'E:\Vocational\Lighthouse Labs\Flex Course\Projects\P06_Final Project\data\decisionQA.csv')
metadata = pd.read_csv(r'E:\Vocational\Lighthouse Labs\Flex Course\Projects\P06_Final Project\data\metadataQA.csv')
protectedGrounds = pd.read_csv(r'E:\Vocational\Lighthouse Labs\Flex Course\Projects\P06_Final Project\data\protectedQA.csv')
respondent = pd.read_csv(r'E:\Vocational\Lighthouse Labs\Flex Course\Projects\P06_Final Project\data\respondentQA.csv')
representation = pd.read_csv(r'E:\Vocational\Lighthouse Labs\Flex Course\Projects\P06_Final Project\data\writtenQA.csv')
labels = pd.read_csv(r'E:\Vocational\Lighthouse Labs\Flex Course\Projects\P06_Final Project\data\labelsQA.csv')
reasoning = pd.read_csv(r'E:\Vocational\Lighthouse Labs\Flex Course\Projects\P06_Final Project\data\reasoningQA.csv')

# protectedQA.csv and reasoningQA.csv are both written from the same frame in
# this notebook, so each holds 'reasoning' plus every keyword flag column.
# Merging them unchanged would duplicate those columns with _x/_y suffixes,
# so keep the flags only in protectedGrounds and the text only in reasoning.
protectedGrounds = protectedGrounds.drop(columns=['reasoning'], errors='ignore')
reasoning = reasoning[['caseId', 'reasoning']]

# Merge all dataframes on 'caseId'.
# Inner joins keep only the cases that are present in every table.
master = labels.merge(applicant, on='caseId', how='inner')\
               .merge(decisionType, on='caseId', how='inner')\
               .merge(metadata, on='caseId', how='inner')\
               .merge(protectedGrounds, on='caseId', how='inner')\
               .merge(respondent, on='caseId', how='inner')\
               .merge(representation, on='caseId', how='inner')\
               .merge(adjudicator, on='caseId', how='inner')\
               .merge(reasoning, on='caseId', how='inner')

# Display summary of the master dataframe
master.info()
master.head()

In [None]:
# 2.30 Save Master DataFrame to CSV
# Write the merged master table without the index column, encoded as UTF-8
output_path = r'E:\Vocational\Lighthouse Labs\Flex Course\Projects\P06_Final Project\data\master.csv'
master.to_csv(path_or_buf=output_path, encoding='utf-8', index=False)