# Allyanonimiser Reporting in Jupyter Notebooks

This notebook demonstrates the rich visualization capabilities of the Allyanonimiser reporting system when used in Jupyter notebooks.

## Setup and Imports

In [None]:
import os
import time

import matplotlib.pyplot as plt
import pandas as pd

from allyanonimiser import create_allyanonimiser, AnonymizationConfig

## Sample Data

Let's create a few sample texts containing different types of PII to demonstrate the reporting capabilities.

In [None]:
# Five sample documents used as input for the anonymization run below.
# Each entry is one multi-line string; the text inside the triple quotes
# (including its indentation) is passed to the anonymizer as-is.
sample_texts = [
    # Sample 1: customer / policy record
    """
    Customer Name: John Smith
    Policy Number: POL-12345678
    Date of Birth: 15/07/1982
    Email: john.smith@example.com
    Phone: 0412 345 678
    Address: 42 Main Street, Sydney NSW 2000
    TFN: 123 456 789
    """,
    
    # Sample 2: claim record
    """
    Claim #: CL-87654321
    Claimant: Jane Doe
    Medicare: 2345 67891 0
    Contact: jane.doe@company.org or 0423 456 789
    Incident occurred on 22/03/2023 at 123 Business Ave, Melbourne VIC 3000
    """,
    
    # Sample 3: patient / clinical note
    """
    Patient: Robert Johnson
    DOB: 10/11/1975
    Driver's License: NSW12345678
    Referred by Dr. Sarah Williams (Medical Registration: MED-98765)
    Clinical notes: Patient reports lower back pain following MVA on 05/02/2023.
    """,
    
    # Sample 4: email-style message
    """
    From: michael.brown@enterprise.com
    To: support@insurance.com
    Subject: Claim Update - Michael Brown (DOB: 03/09/1990)
    
    Hello,
    
    I'm writing to update my claim #CL-54321. My new address is 78 Park Avenue, Brisbane QLD 4000.
    Please update your records with my new phone number: 0487 654 321.
    
    Regards,
    Michael Brown
    Credit Card: 4111-2222-3333-4444 (please don't store this)
    ABN: 12 345 678 901
    """,
    
    # Sample 5: internal memo
    """
    Internal Memo - Confidential
    
    RE: Case Review for Elizabeth Wilson (ID: 987654321)
    
    Key details:
    - Customer since: 15/03/2015
    - Date of incident: 12/12/2022
    - Policy: HEALTH-9876543
    - Claims assessor: David Thompson (Employee ID: EMP-45678)
    
    Customer can be reached at elizabeth.wilson@personalmail.net or on her mobile 0432 567 890.
    Her Medicare number is 3456 78901 2 and her TFN is 987 654 321.
    """
]

## Create Allyanonimiser Instance and Configure Anonymization

In [None]:
# Instantiate the anonymizer through the library's factory function.
ally = create_allyanonimiser()

# Which operator to apply to each entity type. Kept as a separate mapping so
# the per-entity choices can be read (and tweaked) at a glance.
entity_operators = dict(
    PERSON="replace",
    EMAIL_ADDRESS="mask",
    PHONE_NUMBER="redact",
    AU_ADDRESS="replace",
    DATE_OF_BIRTH="age_bracket",
    AU_TFN="hash",
    AU_MEDICARE="mask",
    AU_ABN="mask",
    CREDIT_CARD="mask",
)

# Combine the operator mapping with the age-bracket size into one config
# object that is passed to every anonymize() call below.
config = AnonymizationConfig(operators=entity_operators, age_bracket_size=10)

## Start a New Report Session

In [None]:
# Begin a report session under a fixed id and echo back the id stored on the
# returned report object.
report = ally.start_new_report(session_id="notebook_example")
active_session_id = report.session_id
print(f"Started new report session: {active_session_id}")

## Process Sample Texts and Record Results

In [None]:
# Anonymize every sample in order; documents are numbered from 1 and that
# number is used both in the progress message and in the document id.
for sample_number, text in enumerate(sample_texts, start=1):
    print(f"Processing sample {sample_number}...")

    result = ally.anonymize(
        text=text,
        config=config,
        document_id=f"sample_{sample_number}",
    )

    # Show the anonymized output for the first sample only, as an example.
    if sample_number == 1:
        print("\nExample of anonymized text:")
        print(result["text"])

## Display Rich Report Visualizations

Now we'll display the report with rich visualizations using the built-in notebook display functionality.

In [None]:
# Ask the anonymizer to display its report using the library's notebook
# display helper; the call takes no arguments here.
ally.display_report_in_notebook()

## Access and Display Report Data Programmatically

You can also access the report data programmatically and create custom visualizations.

In [None]:
# Fetch the report object and its summary mapping; `summary` is reused by
# the custom visualization cells further down.
report = ally.get_report()
summary = report.get_summary()

# Headline figures, printed in the same order they are looked up.
print(f"Total documents processed: {summary['total_documents']}")
print(f"Total entities detected: {summary['total_entities']}")

entities_per_doc = summary['entities_per_document']
print(f"Entities per document: {entities_per_doc:.2f}")

# The rate is scaled by 100 for display as a percentage.
anonymization_pct = summary['anonymization_rate'] * 100
print(f"Anonymization rate: {anonymization_pct:.2f}%")

# The average time is scaled by 1000 for display in milliseconds.
avg_time_ms = summary['avg_processing_time'] * 1000
print(f"Average processing time: {avg_time_ms:.2f} ms")

## Custom Visualization: Entity Type Distribution

In [None]:
# Bar chart of detected entity types, most frequent first.
entity_counts = summary['entity_counts']

# Sort the (entity_type, count) pairs by count in descending order, then split
# them into the two parallel sequences the bar chart needs. (The earlier
# unsorted key/value lists were dead code: they were overwritten right here.)
entity_counts_sorted = sorted(entity_counts.items(), key=lambda x: x[1], reverse=True)
entity_types = [item[0] for item in entity_counts_sorted]
counts = [item[1] for item in entity_counts_sorted]

fig, ax = plt.subplots(figsize=(12, 6))
bars = ax.bar(entity_types, counts, color='skyblue')

# Add count labels on top of each bar (small vertical offset so the text
# does not touch the bar).
for bar in bars:
    height = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2., height + 0.1,
            f'{int(height)}',
            ha='center', va='bottom', fontweight='bold')

ax.set_title('Entity Types Detected', fontsize=15)
ax.set_xlabel('Entity Type', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
# Slant the category labels and right-align them so each label ends under
# its own bar.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
plt.show()

## Custom Visualization: Operator Usage

In [None]:
# Pie chart of how often each anonymization operator was used.
operator_counts = summary['operator_counts']

# Sort by count, largest first, and split into labels and values. (The
# previous unsorted lists and the manually computed percentages were never
# used: the pie's autopct formats each wedge's percentage itself.)
operator_counts_sorted = sorted(operator_counts.items(), key=lambda x: x[1], reverse=True)
operators = [item[0] for item in operator_counts_sorted]
op_counts = [item[1] for item in operator_counts_sorted]

# Create pie chart
fig, ax = plt.subplots(figsize=(10, 6))
ax.pie(op_counts, labels=operators, autopct='%1.1f%%', startangle=90, shadow=True)
ax.set_title('Anonymization Operators Used', fontsize=15)
ax.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle
fig.tight_layout()
plt.show()

## Export Reports to Different Formats

The report can be exported to different formats for sharing or further analysis.

In [None]:
# Export the report in three formats. The path variables are kept as plain
# strings so they can be reused or printed elsewhere.
html_path = "output/notebook_report.html"
json_path = "output/notebook_report.json"
csv_path = "output/notebook_stats.csv"

# (destination, format name passed to export_report, label for the message)
export_targets = [
    (html_path, "html", "HTML report"),        # rich visualization
    (json_path, "json", "JSON report"),        # full data
    (csv_path, "csv", "CSV statistics"),       # document statistics
]

for export_path, export_format, export_label in export_targets:
    # Create the destination folder up front so a fresh Restart & Run All
    # does not depend on "output/" already existing. Whether export_report
    # creates missing directories itself is not visible from here; doing it
    # again is harmless thanks to exist_ok=True.
    os.makedirs(os.path.dirname(export_path), exist_ok=True)
    report.export_report(export_path, export_format)
    print(f"{export_label} saved to {export_path}")

## Working with Document-Level Data

The reporting system also tracks document-level statistics, which can be analyzed separately.

In [None]:
# Build a DataFrame from the report's `document_stats` attribute and show it
# as the cell's output (bare last expression -> rich table display).
# Later cells read the columns 'document_id', 'processing_time',
# 'entity_count' and 'anonymization_ratio' from this frame.
document_stats = pd.DataFrame(report.document_stats)
document_stats

In [None]:
# Dual-axis chart of document-level statistics: processing time per document
# as bars (left axis) and entity count per document as a line (right axis).
fig, ax1 = plt.subplots(figsize=(12, 6))
ax2 = ax1.twinx()  # second y-axis sharing the same x-axis

# Plot processing time (bars). The values are scaled by 1000 and labelled as
# milliseconds -- presumably 'processing_time' is in seconds; confirm against
# the reporting API.
processing_ms = document_stats['processing_time'] * 1000
bars = ax1.bar(document_stats['document_id'], processing_ms,
               color='lightblue', alpha=0.7, label='Processing Time (ms)')
ax1.set_ylabel('Processing Time (ms)', color='blue')
ax1.tick_params(axis='y', labelcolor='blue')

# Plot entity count (line)
line = ax2.plot(document_stats['document_id'], document_stats['entity_count'],
                color='red', marker='o', linewidth=2, label='Entity Count')
ax2.set_ylabel('Entity Count', color='red')
ax2.tick_params(axis='y', labelcolor='red')

# Add value labels for both metrics. Categorical x values are placed at
# positions 0..n-1, so the loop index doubles as the x coordinate on ax2.
for i, bar in enumerate(bars):
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height + 1,
            f'{height:.1f}',
            ha='center', va='bottom', color='blue', fontsize=9)

    entity_count = document_stats['entity_count'].iloc[i]
    ax2.text(i, entity_count + 0.3,
            f'{entity_count}',
            ha='center', va='bottom', color='red', fontsize=9)

ax1.set_title('Document Processing Metrics', fontsize=15)
# Grid stays on the twin axes, matching what the pyplot-state call did
# (the twin was the current axes after twinx()).
ax2.grid(True, alpha=0.3)
# Rotate the x tick labels on ax1. After twinx() the pyplot "current axes" is
# the twin, whose x-axis is hidden, so the previous plt.xticks(rotation=45)
# rotated labels that are never drawn.
ax1.tick_params(axis='x', labelrotation=45)
fig.tight_layout()

# Add a single legend combining the handles from both axes.
lines, labels = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines + lines2, labels + labels2, loc='upper left')

plt.show()

## Anonymization Ratio by Document

Let's visualize how much of each document's text was anonymized.

In [None]:
# Stacked bar per document: anonymized share at the bottom, the remainder
# stacked on top, so every bar totals 100.
fig, ax = plt.subplots(figsize=(12, 6))

# Convert the ratio column to percentages and derive the complement.
anonymized_pct = document_stats['anonymization_ratio'] * 100
remaining_pct = 100 - anonymized_pct

bar_width = 0.6
indices = range(len(document_stats))

p1 = ax.bar(indices, anonymized_pct, bar_width,
            color='#ff9999', label='Anonymized (%)')
p2 = ax.bar(indices, remaining_pct, bar_width, bottom=anonymized_pct,
            color='#99ff99', label='Original (%)')

# Write each segment's percentage at its vertical midpoint, skipping
# segments of 5 or less where the text would not fit.
segment_label_style = dict(ha='center', va='center', color='black', fontweight='bold')
for x_pos, (anon_share, orig_share) in enumerate(zip(anonymized_pct, remaining_pct)):
    if anon_share > 5:
        ax.text(x_pos, anon_share/2, f'{anon_share:.1f}%', **segment_label_style)
    if orig_share > 5:
        ax.text(x_pos, anon_share + orig_share/2, f'{orig_share:.1f}%', **segment_label_style)

ax.set_ylabel('Percentage of Text')
ax.set_title('Anonymization Ratio by Document')
ax.set_xticks(indices)
ax.set_xticklabels(document_stats['document_id'], rotation=45)
ax.set_ylim(0, 100)
ax.legend(loc='upper right')
fig.tight_layout()
plt.show()

## Generate Report for File Processing

For real-world scenarios, you can also generate reports when processing files.

In [None]:
# Print (not execute) an example call showing how reporting is requested
# when processing files; the snippet is held as a plain string.
example_call = """result = ally.process_files(
    file_paths=["file1.txt", "file2.txt"],
    output_dir="output",
    operators={"PERSON": "replace", "EMAIL_ADDRESS": "mask"},
    report=True,
    report_output="output/batch_report.html",
    report_format="html"
)"""

print("When processing files, use:")
print(example_call)
print("\nThis will generate a comprehensive processing report automatically.")

## Conclusion

The Allyanonimiser reporting system provides rich insights into anonymization activities, helping you understand what PII is being detected in your data and how it's being anonymized. The notebook integration allows for interactive exploration of the reports with rich visualizations.