In [4]:
import pandas as pd
import numpy as np
import re
import glob
import os
from collections import Counter

# ============================================================================
# CONFIGURATION 
# ============================================================================

# Machine-specific absolute paths; edit both before running on another machine.
# Directory that holds the cleaned input files (matched later as case_*.txt).
CASE_TEXTS_DIR = '/Users/tszyan/Downloads/court_case_texts_cleaned/'  
# Directory the result CSV files are written to (created at startup if missing).
OUTPUT_DIR = '/Users/tszyan/Downloads/outputs/'  

# ============================================================================
# VERIFY SETUP
# ============================================================================

# Print the run banner, stop early when the input directory is missing, and
# make sure the output directory exists before any analysis starts.
print("\n" + "="*80)
print("ONTARIO COURT OF JUSTICE - COMPLEXITY ANALYSIS")
print("="*80)

# isdir (rather than exists) also rejects a regular file sitting at that path.
if not os.path.isdir(CASE_TEXTS_DIR):
    print("\n‚ùå ERROR: Case texts directory not found!")
    print(f"   Looking for: {CASE_TEXTS_DIR}")
    print("\n   Please extract court_case_texts_cleaned.zip and update CASE_TEXTS_DIR")
    # `exit` is a helper the `site` module adds for interactive sessions and is
    # not meant for programs; raising SystemExit is the portable equivalent.
    raise SystemExit(1)

os.makedirs(OUTPUT_DIR, exist_ok=True)

print(f"\n‚úì Case texts directory: {CASE_TEXTS_DIR}")
print(f"‚úì Output directory: {OUTPUT_DIR}")

# ============================================================================
# LEXICONS AND WORD LISTS
# ============================================================================

# Positive legal terms with weights.
# Maps a lowercase token to a positive integer weight (1 = weak, 4 = strong).
# calculate_sentiment() adds up the weight of every token found here.
POSITIVE_LEXICON = {
    'allowed': 3, 'granted': 3, 'admissible': 4, 'proper': 2, 'reasonable': 2,
    'appropriate': 2, 'valid': 3, 'legitimate': 3, 'complies': 2, 'satisfied': 2,
    'accepted': 2, 'approved': 3, 'successful': 3, 'favorable': 3, 'supported': 2,
    'credible': 3, 'reliable': 2, 'clear': 1, 'accurate': 2, 'adequate': 2,
    'justified': 3, 'lawful': 3, 'sound': 2, 'consistent': 2, 'persuasive': 2,
}

# Negative legal terms with weights.
# Maps a lowercase token to a negative integer weight (-2 = weak, -4 = strong).
# calculate_sentiment() adds up the weight of every token found here, so the
# resulting negative score is zero or below.
NEGATIVE_LEXICON = {
    'dismissed': -4, 'denied': -4, 'rejected': -4, 'inadmissible': -4,
    'prejudice': -3, 'improper': -3, 'violation': -4, 'breach': -4,
    'failed': -3, 'contravenes': -4, 'insufficient': -3, 'flawed': -3,
    'unreliable': -3, 'questionable': -2, 'doubtful': -2, 'inconsistent': -2,
    'unreasonable': -3, 'inadequate': -2, 'deficient': -2, 'unacceptable': -3,
}

# Conversion/transition words (argumentative complexity markers).
# Lowercase tokens; count_conversion_words() counts every token that is a
# member of this set. Each word is listed once ('whereas' used to appear twice,
# which had no effect on the set but hid the real size of the list).
CONVERSION_WORDS = {
    'however', 'whereas', 'notwithstanding', 'nevertheless', 'nonetheless',
    'although', 'though', 'despite', 'conversely', 'contrary',
    'moreover', 'furthermore', 'additionally', 'accordingly', 'consequently',
    'therefore', 'thus', 'hence', 'subsequently', 'alternatively',
    'similarly', 'likewise', 'whilst',
}

# Stopwords.
# Common English function words (lowercase). The main loop unions this set
# with LEGAL_STOPWORDS and drops matching tokens before computing word-based
# metrics.
ENGLISH_STOPWORDS = {
    'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for',
    'of', 'with', 'by', 'from', 'as', 'is', 'was', 'are', 'were', 'be',
    'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will',
    'would', 'could', 'should', 'may', 'might', 'can', 'this', 'that',
    'these', 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'what',
    'which', 'who', 'when', 'where', 'why', 'how', 'all', 'each', 'every',
}

# Boilerplate legal vocabulary that is dropped together with the English
# stopwords. 'may' is also present in ENGLISH_STOPWORDS; the overlap is
# harmless because the two sets are unioned.
LEGAL_STOPWORDS = {
    'court', 'section', 'subsection', 'shall', 'may', 'pursuant', 'case',
}

# ============================================================================
# ANALYSIS FUNCTIONS
# ============================================================================

def tokenize(text):
    """Return the lower-cased ASCII-letter words of ``text`` in document order.

    A run of letters only counts when it sits between word boundaries, so
    letters glued to digits or underscores (for example ``abc1``) are skipped.
    """
    lowered = text.lower()
    return [hit.group(0) for hit in re.finditer(r'\b[a-z]+\b', lowered)]

def calculate_sentiment(tokens):
    """Score a token list against the positive and negative lexicons.

    Returns a 4-tuple ``(total_score, pos_score, neg_score, sentiment_class)``:
    ``pos_score`` and ``neg_score`` are the summed lexicon weights,
    ``total_score`` is their sum divided by the number of tokens (0 for an
    empty list), and ``sentiment_class`` is 'Positive' above 0.005,
    'Negative' below -0.005 and 'Neutral' otherwise.
    """
    pos_score = 0
    neg_score = 0
    for word in tokens:
        pos_score += POSITIVE_LEXICON.get(word, 0)
        neg_score += NEGATIVE_LEXICON.get(word, 0)

    # Normalise by document length so long and short texts are comparable.
    if tokens:
        total_score = (pos_score + neg_score) / len(tokens)
    else:
        total_score = 0

    if total_score > 0.005:
        label = 'Positive'
    elif total_score < -0.005:
        label = 'Negative'
    else:
        label = 'Neutral'

    return total_score, pos_score, neg_score, label

def count_citations(text):
    """Return the number of citation-like pattern hits in ``text``.

    Every pattern is searched independently and case-insensitively, and the
    per-pattern hit counts are added up. A span that satisfies two patterns
    (such as ``section 12(3)``) therefore contributes to both.
    """
    citation_patterns = (
        r'\bR\.\s+v\.\s+[A-Z]',      # case name: R. v. Someone
        r'\bs\.\s*\d+',               # single section: s. 123
        r'\bss\.\s*\d+',              # section range: ss. 123
        r'\bsection\s+\d+',           # spelled out: section 123
        r'\d+\(\d+\)',                # subsection: 123(4)
        r'Criminal\s+Code',           # statute name
        r'\[\d{4}\]',                 # bracketed year: [2024]
        r'\d+\s+S\.C\.R\.',           # Supreme Court Reports volume
        r'para\.\s*\d+',              # paragraph pinpoint: para. 15
    )
    total = 0
    for pattern in citation_patterns:
        total += sum(1 for _ in re.finditer(pattern, text, re.IGNORECASE))
    return total

def count_conversion_words(tokens):
    """Return how many entries of ``tokens`` are members of CONVERSION_WORDS.

    Repeated words are counted every time they occur.
    """
    hits = [word for word in tokens if word in CONVERSION_WORDS]
    return len(hits)

def extract_case_title(text):
    """Return the first ``R. v. <name>`` style title found in ``text``.

    A correctly capitalised citation is preferred and its party name is kept
    as written. Otherwise a case-insensitive search is tried and the party
    name is upper-cased. Returns ``"Unknown"`` when neither search matches.
    """
    # Original formatting: capital "R." and a capitalised party name.
    cased = re.search(r'R\.\s+v\.\s+([A-Z][\w.-]+)', text)
    if cased is not None:
        return "R. v. " + cased.group(1)

    # Cleaned (lower-cased) files: accept any casing, normalise to upper case.
    uncased = re.search(r'r\.\s+v\.\s+([a-z][\w.-]+)', text, re.IGNORECASE)
    if uncased is None:
        return "Unknown"
    return "R. v. " + uncased.group(1).upper()

def extract_judge(text):
    """Return the first plausible judge name found in ``text``.

    Patterns are tried in a fixed order. A candidate is accepted only when it
    is longer than 3 and shorter than 30 characters; otherwise the next
    pattern is tried. Case-sensitive patterns return the name as written;
    the case-insensitive fallbacks return it upper-cased with whitespace
    collapsed to single spaces. Returns ``"Unknown"`` if nothing qualifies.
    """
    def _plausible(candidate):
        return 3 < len(candidate) < 30

    # Original formatting: the name starts with a capital letter.
    cased_patterns = (
        r'Justice\s+([A-Z][\w.\s]+?)(?:\n|Heard)',
        r'J\.\s+([A-Z][\w.\s]+?)(?:\n|Heard)',
        r'([A-Z][\w.\s]+?)\s+J\.',
    )
    for regex in cased_patterns:
        found = re.search(regex, text)
        if found is None:
            continue
        candidate = found.group(1).strip()
        if _plausible(candidate):
            return candidate

    # Cleaned (lower-cased) files: "justice <name>" ended by a newline or
    # "heard", then "before justice <name>".
    uncased_patterns = (
        r'justice\s+([a-z][a-z.\s]+?)(?:\n|heard)',
        r'before\s+justice\s+([a-z][a-z.\s]+)',
    )
    for regex in uncased_patterns:
        found = re.search(regex, text, re.IGNORECASE)
        if found is None:
            continue
        # split() drops surrounding whitespace and collapses inner runs.
        candidate = ' '.join(map(str.upper, found.group(1).split()))
        if _plausible(candidate):
            return candidate

    return "Unknown"

def is_valid_case(text):
    """Return True when ``text`` looks like a usable court decision.

    A document is rejected when it mentions 'captcha' or 'verification'
    (case-insensitive), when it is shorter than 200 characters, or when it
    contains none of the court-related keywords.
    """
    lowered = text.lower()

    # Pages that were served as a bot check instead of the decision.
    # NOTE(review): 'verification' can also occur in ordinary legal prose -
    # confirm this filter is not rejecting genuine decisions.
    for marker in ('captcha', 'verification'):
        if marker in lowered:
            return False

    # Minimum length for cleaned files.
    if len(text) < 200:
        return False

    court_keywords = ('court', 'judge', 'justice', 'decision', 'ruling', 'judgment', 'criminal code')
    return any(keyword in lowered for keyword in court_keywords)

def classify_complexity(score):
    """Map a complexity score to its band label.

    Bands (lower bound inclusive): 0.040 'Very High', 0.030 'High',
    0.020 'Medium', 0.010 'Low'; anything lower is 'Very Low'.
    """
    # Ordered from the highest floor down so the first hit is the right band.
    bands = (
        (0.040, 'Very High'),
        (0.030, 'High'),
        (0.020, 'Medium'),
        (0.010, 'Low'),
    )
    for floor, label in bands:
        if score >= floor:
            return label
    return 'Very Low'

# ============================================================================
# MAIN ANALYSIS
# ============================================================================

# STEP 1: read every case_*.txt file, compute per-case metrics and collect
# them into the DataFrame `df` (one row per valid case).
print("\n" + "="*80)
print("STEP 1: PROCESSING CASE FILES")
print("="*80)

print("\n[1.1] Finding case files...")
# os.path.join keeps the pattern correct whether or not CASE_TEXTS_DIR ends
# with a path separator.
case_files = sorted(glob.glob(os.path.join(CASE_TEXTS_DIR, 'case_*.txt')))

if not case_files:
    print(f"‚ùå ERROR: No case files found in {CASE_TEXTS_DIR}")
    # `exit` is an interactive-shell helper; SystemExit is the portable form.
    raise SystemExit(1)

print(f"  ‚úì Found {len(case_files)} case files")

print("\n[1.2] Analyzing cases...")
results = []
all_stopwords = ENGLISH_STOPWORDS.union(LEGAL_STOPWORDS)
skipped = 0    # files rejected by is_valid_case() plus files that raised
failures = []  # (case_id, exception) for files that raised, reported below

for i, filepath in enumerate(case_files):
    if (i + 1) % 100 == 0:
        print(f"    Processed {i+1}/{len(case_files)} files...")

    # File name without its extension, e.g. "case_0001".
    case_id = os.path.splitext(os.path.basename(filepath))[0]

    try:
        # Undecodable bytes are dropped on purpose (best-effort read).
        with open(filepath, 'r', encoding='utf-8', errors='ignore') as f:
            text = f.read()

        if not is_valid_case(text):
            skipped += 1
            continue

        # Extract metadata
        case_title = extract_case_title(text)
        judge = extract_judge(text)

        # Tokenize; the filtered list drops stopwords and tokens of 1-2 letters.
        all_tokens = tokenize(text)
        tokens_filtered = [t for t in all_tokens if t not in all_stopwords and len(t) > 2]

        # Calculate sentiment
        sentiment, pos_score, neg_score, sentiment_class = calculate_sentiment(tokens_filtered)

        # Calculate complexity metrics. Conversion words are counted on the
        # unfiltered tokens; the score is normalised by the filtered count.
        citations = count_citations(text)
        conversions = count_conversion_words(all_tokens)

        # Overall complexity score
        complexity = (citations + conversions) / len(tokens_filtered) if tokens_filtered else 0
        complexity_class = classify_complexity(complexity)

        # Lexical diversity (vocabulary richness): unique / total filtered tokens
        lexical_div = len(set(tokens_filtered)) / len(tokens_filtered) if tokens_filtered else 0

        # Sentence analysis: split on runs of . ! ? and discard fragments of
        # 10 characters or fewer.
        # NOTE(review): abbreviations such as "R. v." or "s. 5" also split
        # here, so sentence_count is probably inflated - confirm whether that
        # is acceptable.
        sentences = re.split(r'[.!?]+', text)
        sentences = [s.strip() for s in sentences if len(s.strip()) > 10]
        # Uses the filtered token count, i.e. content words per sentence.
        avg_sent_len = len(tokens_filtered) / len(sentences) if sentences else 0

        results.append({
            'case_id': case_id,
            'case_title': case_title,
            'judge': judge,

            # Document metrics
            'char_count': len(text),
            'word_count': len(tokens_filtered),
            'unique_words': len(set(tokens_filtered)),
            'sentence_count': len(sentences),
            'avg_sentence_length': avg_sent_len,

            # Sentiment metrics
            'sentiment_score': sentiment,
            'sentiment_class': sentiment_class,
            'pos_word_score': pos_score,
            'neg_word_score': neg_score,

            # Complexity metrics
            'citation_count': citations,
            'conversion_word_count': conversions,
            'complexity_score': complexity,
            'complexity_class': complexity_class,
            'lexical_diversity': lexical_div,
        })

    except Exception as e:
        # Best-effort batch run: one bad file must not abort the analysis, but
        # the error is recorded so it is not mistaken for an "invalid" file.
        skipped += 1
        failures.append((case_id, e))
        continue

df = pd.DataFrame(results)
print(f"\n  ‚úì Successfully analyzed {len(df)} cases")
print(f"  ‚úì Skipped {skipped} invalid files")

if failures:
    print(f"  ! {len(failures)} of the skipped files raised an error:")
    for failed_id, err in failures:
        print(f"      {failed_id}: {type(err).__name__}: {err}")

# Every later step indexes columns that only exist when at least one case was
# analyzed; stop with a clear message instead of a KeyError.
if df.empty:
    print("ERROR: no valid cases were analyzed; nothing to report.")
    raise SystemExit(1)

# ============================================================================
# STEP 2: DESCRIPTIVE STATISTICS
# ============================================================================

# STEP 2: print corpus-level summary figures. `sentiment_counts` and
# `complexity_counts` are kept at module level because later steps reuse them.
print("\n" + "="*80)
print("STEP 2: DESCRIPTIVE STATISTICS")
print("="*80)

n_cases = len(df)
word_counts = df['word_count']

print("\nüìä SAMPLE SIZE:")
print(f"  Total cases analyzed: {n_cases}")

print("\nüìù DOCUMENT STATISTICS:")
print(f"  Mean word count: {word_counts.mean():.0f}")
print(f"  Median word count: {word_counts.median():.0f}")
print(f"  Range: {word_counts.min():.0f} - {word_counts.max():.0f} words")
print(f"  Mean sentence count: {df['sentence_count'].mean():.1f}")
print(f"  Mean sentence length: {df['avg_sentence_length'].mean():.1f} words")

print("\nüí≠ SENTIMENT DISTRIBUTION:")
sentiment_counts = df['sentiment_class'].value_counts()
for label, n in sentiment_counts.items():
    share = (n / n_cases) * 100
    print(f"  {label:8s}: {n:3d} cases ({share:5.1f}%)")
print(f"  Mean sentiment score: {df['sentiment_score'].mean():.4f}")

print("\nüî¨ COMPLEXITY METRICS:")
citation_counts = df['citation_count']
print(f"  Mean citations: {citation_counts.mean():.1f}")
print(f"  Median citations: {citation_counts.median():.0f}")
print(f"  Mean conversion words: {df['conversion_word_count'].mean():.1f}")
print(f"  Mean complexity score: {df['complexity_score'].mean():.4f}")
print(f"  Mean lexical diversity: {df['lexical_diversity'].mean():.4f}")

print("\nüìä COMPLEXITY DISTRIBUTION:")
complexity_counts = df['complexity_class'].value_counts()
for label, n in complexity_counts.items():
    share = (n / n_cases) * 100
    print(f"  {label:10s}: {n:3d} cases ({share:5.1f}%)")

# ============================================================================
# STEP 3: JUDGE-LEVEL ANALYSIS
# ============================================================================

# STEP 3: aggregate the per-case metrics by judge and show the ten judges with
# the highest mean complexity. `judge_stats` is saved to CSV in a later step.
print("\n" + "="*80)
print("STEP 3: JUDGE-LEVEL ANALYSIS")
print("="*80)

# Named aggregation yields the final column names directly.
judge_stats = (
    df.groupby('judge')
    .agg(
        case_count=('case_id', 'count'),
        avg_complexity=('complexity_score', 'mean'),
        avg_citations=('citation_count', 'mean'),
        avg_conversions=('conversion_word_count', 'mean'),
        avg_sentiment=('sentiment_score', 'mean'),
        avg_words=('word_count', 'mean'),
    )
    .round(4)
    .sort_values('avg_complexity', ascending=False)
)

print("\n‚öñÔ∏è  TOP 10 JUDGES BY COMPLEXITY:")
print(f"  {'Judge':<25} Cases  Complexity  Citations  Conversions")
print(f"  {'-'*70}")
for judge_name, stats_row in judge_stats.head(10).iterrows():
    print(f"  {judge_name:<25} {stats_row['case_count']:4.0f}    {stats_row['avg_complexity']:.4f}     "
          f"{stats_row['avg_citations']:5.1f}      {stats_row['avg_conversions']:5.1f}")

# ============================================================================
# STEP 4: TOP/BOTTOM CASES
# ============================================================================

# STEP 4: list the ten cases with the highest and the ten with the lowest
# complexity score, using one shared table layout.
print("\n" + "="*80)
print("STEP 4: MOST & LEAST COMPLEX CASES")
print("="*80)

ranking_header = f"  {'Case ID':<12} {'Title':<40} {'Score':<8} Citations  Conversions"
ranking_rule = f"  {'-'*95}"
rankings = (
    ("\nüîù TOP 10 MOST COMPLEX CASES:", df.nlargest(10, 'complexity_score')),
    ("\nüîª TOP 10 LEAST COMPLEX CASES:", df.nsmallest(10, 'complexity_score')),
)

for heading, subset in rankings:
    print(heading)
    print(ranking_header)
    print(ranking_rule)
    for _, case in subset.iterrows():
        full_title = case['case_title']
        # Titles longer than the 40-character column are cut to 37 + '...'.
        if len(full_title) > 40:
            shown_title = full_title[:37] + '...'
        else:
            shown_title = full_title
        print(f"  {case['case_id']:<12} {shown_title:<40} {case['complexity_score']:.4f}   "
              f"{case['citation_count']:4.0f}      {case['conversion_word_count']:4.0f}")

# ============================================================================
# STEP 5: CORRELATION ANALYSIS (TEXT FEATURES ONLY)
# ============================================================================

# STEP 5: Pearson correlations between selected pairs of text features.
# `correlations` (one dict per pair) is written to CSV in a later step.
print("\n" + "="*80)
print("STEP 5: CORRELATION ANALYSIS (TEXT FEATURES)")
print("="*80)

from scipy import stats

# (label 1, label 2, column 1, column 2)
variables = [
    ('Citations', 'Word Count', 'citation_count', 'word_count'),
    ('Conversions', 'Word Count', 'conversion_word_count', 'word_count'),
    ('Complexity', 'Word Count', 'complexity_score', 'word_count'),
    ('Complexity', 'Sentiment', 'complexity_score', 'sentiment_score'),
    ('Citations', 'Conversions', 'citation_count', 'conversion_word_count'),
    ('Lexical Diversity', 'Complexity', 'lexical_diversity', 'complexity_score'),
]
correlations = []

print("\nüìà CORRELATIONS BETWEEN TEXT FEATURES:")
print(f"  {'Variable 1':<20} {'Variable 2':<20} {'r':>8}  {'p-value':>8}")
print(f"  {'-'*65}")

for label_a, label_b, column_a, column_b in variables:
    coefficient, p_value = stats.pearsonr(df[column_a], df[column_b])
    # Significance is judged at the 5% level; '**' flags it in the table.
    is_significant = p_value < 0.05
    correlations.append({
        'Variable_1': label_a,
        'Variable_2': label_b,
        'Correlation_r': coefficient,
        'P_value': p_value,
        'Significant': 'Yes' if is_significant else 'No'
    })
    marker = '**' if is_significant else ''
    print(f"  {label_a:<20} {label_b:<20} {coefficient:8.3f}  {p_value:8.4f} {marker}")

# ============================================================================
# STEP 6: SAVE RESULTS
# ============================================================================

# STEP 6: write the per-case table, the judge table, the correlations and a
# summary table to CSV files inside OUTPUT_DIR.
print("\n" + "="*80)
print("STEP 6: SAVING RESULTS")
print("="*80)

# Paths are built with os.path.join so a missing trailing separator on
# OUTPUT_DIR cannot produce a file next to (instead of inside) the directory.

# Main results
main_output = os.path.join(OUTPUT_DIR, 'complexity_analysis.csv')
df.to_csv(main_output, index=False)
print(f"  ‚úì Saved: {main_output}")

# Judge statistics (the index holds the judge name, so it is kept)
judge_output = os.path.join(OUTPUT_DIR, 'judge_complexity_stats.csv')
judge_stats.to_csv(judge_output)
print(f"  ‚úì Saved: {judge_output}")

# Correlations
corr_output = os.path.join(OUTPUT_DIR, 'complexity_correlations.csv')
pd.DataFrame(correlations).to_csv(corr_output, index=False)
print(f"  ‚úì Saved: {corr_output}")


def _class_share(counts, label):
    """Return the share of cases in class ``label`` as a percentage string."""
    return f"{(counts.get(label, 0) / len(df) * 100):.1f}"


# Summary statistics. Each row keeps its label and value together so the two
# columns cannot drift out of step; ('', '') rows are visual separators.
summary_rows = [
    ('Total Cases Analyzed', len(df)),
    ('', ''),
    ('Mean Word Count', f"{df['word_count'].mean():.0f}"),
    ('Mean Sentence Count', f"{df['sentence_count'].mean():.1f}"),
    ('Mean Sentence Length', f"{df['avg_sentence_length'].mean():.1f}"),
    ('', ''),
    ('Mean Citations', f"{df['citation_count'].mean():.1f}"),
    ('Mean Conversion Words', f"{df['conversion_word_count'].mean():.1f}"),
    ('Mean Complexity Score', f"{df['complexity_score'].mean():.4f}"),
    ('Mean Lexical Diversity', f"{df['lexical_diversity'].mean():.4f}"),
    ('', ''),
    ('Mean Sentiment Score', f"{df['sentiment_score'].mean():.4f}"),
    ('Positive Cases (%)', _class_share(sentiment_counts, 'Positive')),
    ('Neutral Cases (%)', _class_share(sentiment_counts, 'Neutral')),
    ('Negative Cases (%)', _class_share(sentiment_counts, 'Negative')),
    ('', ''),
    ('Very High Complexity (%)', _class_share(complexity_counts, 'Very High')),
    ('High Complexity (%)', _class_share(complexity_counts, 'High')),
    ('Medium Complexity (%)', _class_share(complexity_counts, 'Medium')),
    ('Low Complexity (%)', _class_share(complexity_counts, 'Low')),
    ('Very Low Complexity (%)', _class_share(complexity_counts, 'Very Low')),
]
summary_stats = pd.DataFrame(summary_rows, columns=['Metric', 'Value'])

summary_output = os.path.join(OUTPUT_DIR, 'complexity_summary.csv')
summary_stats.to_csv(summary_output, index=False)
print(f"  ‚úì Saved: {summary_output}")

# ============================================================================
# FINAL SUMMARY
# ============================================================================

# FINAL SUMMARY: recap the headline numbers and list the files written above.
print("\n" + "="*80)
print("‚úÖ ANALYSIS COMPLETE!")
print("="*80)

complexity_scores = df['complexity_score']
lowest_score = complexity_scores.min()
highest_score = complexity_scores.max()

print("\nüìä SUMMARY:")
print(f"  Cases analyzed: {len(df)}")
print(f"  Mean complexity: {complexity_scores.mean():.4f}")
print(f"  Most complex: {highest_score:.4f}")
print(f"  Least complex: {lowest_score:.4f}")

print("\nüìÅ OUTPUT FILES:")
written_files = (
    (main_output, f"All {len(df)} cases with complexity metrics"),
    (judge_output, "Judge-level statistics"),
    (corr_output, "Correlations between text features"),
    (summary_output, "Summary statistics table"),
)
for position, (path, description) in enumerate(written_files, start=1):
    print(f"  {position}. {path}")
    print(f"     ‚Üí {description}")

print("\nüéØ KEY FINDINGS:")
print(f"  ‚Ä¢ Complexity range: {lowest_score:.4f} to {highest_score:.4f}")
# idxmax() on the value counts gives the most frequent class label.
print(f"  ‚Ä¢ Most cases are: {complexity_counts.idxmax()} complexity")
print(f"  ‚Ä¢ Sentiment distribution: {sentiment_counts.idxmax()} cases dominate")

print("\n" + "="*80 + "\n")


ONTARIO COURT OF JUSTICE - COMPLEXITY ANALYSIS

‚úì Case texts directory: /Users/tszyan/Downloads/court_case_texts_cleaned/
‚úì Output directory: /Users/tszyan/Downloads/outputs/

STEP 1: PROCESSING CASE FILES

[1.1] Finding case files...
  ‚úì Found 510 case files

[1.2] Analyzing cases...
    Processed 100/510 files...
    Processed 200/510 files...
    Processed 300/510 files...
    Processed 400/510 files...
    Processed 500/510 files...

  ‚úì Successfully analyzed 386 cases
  ‚úì Skipped 124 invalid files

STEP 2: DESCRIPTIVE STATISTICS

üìä SAMPLE SIZE:
  Total cases analyzed: 386

üìù DOCUMENT STATISTICS:
  Mean word count: 3086
  Median word count: 2538
  Range: 239 - 38801 words
  Mean sentence count: 368.8
  Mean sentence length: 8.7 words

üí≠ SENTIMENT DISTRIBUTION:
  Positive: 253 cases ( 65.5%)
  Neutral :  94 cases ( 24.4%)
  Negative:  39 cases ( 10.1%)
  Mean sentiment score: 0.0090

üî¨ COMPLEXITY METRICS:
  Mean citations: 47.5
  Median citations: 34
  Mean co