1. **Import Necessary Libraries**

    - Import various required libraries, including those for image processing, generating fake data, OCR, machine learning, etc.
    - Initialize the `Faker` library, setting it to the UK locale to generate personal information in UK formats.

In [None]:
# Import necessary libraries
import os
import cv2
import numpy as np
import random
import xml.etree.ElementTree as ET
from faker import Faker
import pandas as pd
import easyocr
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Conv1D, MaxPooling1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import joblib
from tensorflow.keras.models import load_model
import tkinter as tk
from tkinter import filedialog
from tkinter import ttk
from PIL import Image, ImageTk
import matplotlib.pyplot as plt
import numpy as np

# Initialize Faker with UK locale
# One shared Faker instance; the generate_uk_* helper functions in this
# notebook call it to produce their values.
fake = Faker('en_GB')  # Use the UK locale for Faker


2. **Create Dataset Directories**

   Create folders to store training and validation datasets.

In [None]:
# Create directories for datasets
# Each split gets an images/ and a labels/ folder; exist_ok=True keeps the
# cell safe to re-run when the folders already exist.
for split_name in ('train', 'val'):
    for content_kind in ('images', 'labels'):
        os.makedirs(f'data/{split_name}/{content_kind}', exist_ok=True)


3. **Define Functions to Generate Personal Information**


   Define functions to generate personal information (such as name, phone, email, etc.).

In [None]:
# Define types of PII
# NOTE: the order here must match `pii_funcs` below, because labels and
# generators are paired positionally when the images are annotated.
pii_types = ['Name', 'Phone', 'Email', 'Passport', 'DriverLicense', 'PostalCode']

# Define functions to generate UK-specific PII
def generate_uk_phone_number():
    """Return a fake phone number from the shared Faker instance."""
    return fake.phone_number()

def generate_uk_email():
    """Return a fake e-mail address from the shared Faker instance."""
    return fake.email()

def generate_uk_passport_number():
    """Return a fake UK passport number: nine random digits."""
    # UK passport numbers are nine digits long (the previous pattern had eight).
    return fake.bothify(text='#########')

def generate_uk_driver_license():
    """Return a fake 16-character string shaped like a UK driving licence number.

    Layout: five surname letters, six date-of-birth digits, two initials,
    one digit and two check letters, all upper case. The characters are
    random, so the embedded date is not guaranteed to be a real date.
    """
    return fake.bothify(
        text='?????######??#??',
        letters='ABCDEFGHIJKLMNOPQRSTUVWXYZ',
    )

def generate_uk_postcode():
    """Return a fake postcode from the shared Faker instance."""
    return fake.postcode()

def generate_uk_name():
    """Return a fake full name from the shared Faker instance."""
    return fake.name()

# Generators listed in the same order as `pii_types`.
pii_funcs = [
    generate_uk_name,
    generate_uk_phone_number,
    generate_uk_email,
    generate_uk_passport_number,
    generate_uk_driver_license,
    generate_uk_postcode
]


4. **Generate Images with Personal Information**

   - Generate images containing personal information and ensure that the information does not overlap within the image.
   - Save the generated images and their corresponding annotation files (in XML format).

In [None]:
def create_image_with_pii(image_path, label_path):
    """Render one synthetic image of PII strings and write its XML annotation.

    One string is generated per entry of `pii_funcs` and drawn in black at a
    random position on a white 800x500 canvas, retrying until its bounding
    box does not intersect a previously placed one. Strings that cannot be
    placed are reported with `print` and left out of both image and label.

    Args:
        image_path: Where the rendered image is written.
        label_path: Where the XML annotation (type, text and bounding box of
            every placed string) is written.

    Raises:
        OSError: If OpenCV reports that the image could not be written.
    """
    # Create a blank image with a white background
    image = np.ones((500, 800, 3), dtype=np.uint8) * 255
    height, width = image.shape[:2]

    annotations = []
    used_bboxes = []

    def check_overlap(new_bbox):
        """Return True if new_bbox intersects any box placed so far."""
        return any(
            new_bbox[0] < bbox[2] and new_bbox[2] > bbox[0] and
            new_bbox[1] < bbox[3] and new_bbox[3] > bbox[1]
            for bbox in used_bboxes
        )

    font = cv2.FONT_HERSHEY_SIMPLEX
    # Use black color for text to ensure visibility
    color = (0, 0, 0)
    max_attempts = 100  # Tries per string to find a non-overlapping position

    for pii_type, pii_func in zip(pii_types, pii_funcs):
        text = pii_func()
        font_scale = random.uniform(0.8, 1.2)  # Random font scale
        thickness = random.randint(1, 2)  # Random thickness

        text_width, text_height = cv2.getTextSize(text, font, font_scale, thickness)[0]

        # random.randint raises ValueError on an empty range, which happens
        # when the rendered string is larger than the canvas (e.g. a long
        # e-mail address at the largest font scale).
        if text_width > width or 2 * text_height > height:
            print(f"Text does not fit in image: {text}")
            continue

        for _ in range(max_attempts):
            text_x = random.randint(0, width - text_width)
            text_y = random.randint(text_height, height - text_height)
            # [xmin, ymin, xmax, ymax]; text_y is the text baseline
            new_bbox = [text_x, text_y - text_height, text_x + text_width, text_y]

            if not check_overlap(new_bbox):
                used_bboxes.append(new_bbox)
                break
        else:
            print(f"Could not find non-overlapping position: {text}")
            continue

        # Draw text
        cv2.putText(image, text, (text_x, text_y), font, font_scale, color, thickness)

        annotations.append((pii_type, text, new_bbox))

    # Save the image; imwrite signals failure through its return value
    if not cv2.imwrite(image_path, image):
        raise OSError(f"Failed to write image: {image_path}")

    # Save the annotation file
    annotation = ET.Element('annotation')
    ET.SubElement(annotation, 'folder').text = 'images'
    ET.SubElement(annotation, 'filename').text = os.path.basename(image_path)
    ET.SubElement(annotation, 'path').text = os.path.abspath(image_path)

    size = ET.SubElement(annotation, 'size')
    ET.SubElement(size, 'width').text = str(image.shape[1])
    ET.SubElement(size, 'height').text = str(image.shape[0])
    ET.SubElement(size, 'depth').text = str(image.shape[2])

    for (pii_type, text, bbox) in annotations:
        obj = ET.SubElement(annotation, 'object')
        ET.SubElement(obj, 'name').text = pii_type
        ET.SubElement(obj, 'text').text = text  # Add text content
        bndbox = ET.SubElement(obj, 'bndbox')
        ET.SubElement(bndbox, 'xmin').text = str(bbox[0])
        ET.SubElement(bndbox, 'ymin').text = str(bbox[1])
        ET.SubElement(bndbox, 'xmax').text = str(bbox[2])
        ET.SubElement(bndbox, 'ymax').text = str(bbox[3])

    tree = ET.ElementTree(annotation)
    tree.write(label_path)


5. **Generate Training and Validation Datasets**


   Generate 1000 training images and 200 validation images, saving each image together with its XML annotation file.

In [None]:
# Generate training and validation sets
num_train = 1000
num_val = 200

# Both splits use the same file layout, so a single loop covers them:
# data/<split>/images/img_<i>.jpg with data/<split>/labels/img_<i>.xml
for split_name, sample_count in (('train', num_train), ('val', num_val)):
    for i in range(sample_count):
        image_path = f'data/{split_name}/images/img_{i}.jpg'
        label_path = f'data/{split_name}/labels/img_{i}.xml'
        create_image_with_pii(image_path, label_path)


6. **Initialize EasyOCR and Define OCR and Data Extraction Functions**


   - Initialize EasyOCR and define functions to extract text information from images.
   - Read text and labels from annotation files and match them with the OCR results to generate datasets for training and validation.

In [None]:
# Initialize EasyOCR reader
# A single English-language reader, created once and reused by ocr_image.
reader = easyocr.Reader(['en'])

def ocr_image(image_path):
    """Run EasyOCR on the image at image_path and return its result.

    readtext is called with detail=0 and its return value is passed back
    unchanged.
    """
    return reader.readtext(image_path, detail=0)

def extract_labeled_texts(image_dir, label_dir):
    """Build a labelled text dataset by matching OCR output to annotations.

    For every ``.jpg`` file in image_dir, the image is passed to ocr_image
    and the XML file with the same base name is read from label_dir. Each
    OCR string that is exactly equal to an annotated text contributes one row.

    Args:
        image_dir: Directory containing the ``.jpg`` images.
        label_dir: Directory containing the matching ``.xml`` annotations.

    Returns:
        A pandas DataFrame with columns ``text`` and ``label``. The columns
        exist even when no row matched, so ``df['label']`` is always valid.
    """
    data = []

    # Sorted so the row order does not depend on the directory listing order.
    for filename in sorted(os.listdir(image_dir)):
        if not filename.endswith('.jpg'):
            continue

        image_path = os.path.join(image_dir, filename)
        # splitext swaps only the extension; str.replace would also rewrite
        # a '.jpg' that appears earlier in the file name.
        label_path = os.path.join(label_dir, os.path.splitext(filename)[0] + '.xml')

        if not os.path.isfile(label_path):
            print(f"Skipping {filename}: annotation file not found")
            continue

        # Use OCR tool to extract text
        ocr_result = ocr_image(image_path)
        print(f"OCR Result: {ocr_result}")  # Print OCR result for debugging

        # Read annotation file
        root = ET.parse(label_path).getroot()

        for obj in root.findall('object'):
            name = obj.findtext('name')
            text = obj.findtext('text')
            if not name or not text:
                # Incomplete annotation entry: nothing to match against.
                continue
            for ocr_text in ocr_result:
                if ocr_text == text:
                    data.append({'text': ocr_text, 'label': name})
                    print(f"Matched text: {ocr_text}, Label: {name}")  # Print matched text and label for debugging

    return pd.DataFrame(data, columns=['text', 'label'])


7. **Create Training and Validation Datasets**

   Use OCR and annotation files to generate DataFrames for the training and validation datasets.

In [None]:
# Create training and validation datasets
# Each call runs OCR over the .jpg files of one split and returns a DataFrame
# of the OCR strings that matched an annotated text, with their labels.
train_df = extract_labeled_texts('data/train/images', 'data/train/labels')
val_df = extract_labeled_texts('data/val/images', 'data/val/labels')


8. **Check if All Required Labels are Present in the Dataset**

   Check if the generated datasets contain all necessary labels and print any missing labels.

In [None]:
# Check if the data contains all required labels
# Same six names as `pii_types`, i.e. the values written to the annotation
# files as the object name.
required_labels = ['Name', 'Phone', 'Email', 'Passport', 'DriverLicense', 'PostalCode']

def check_labels(df, required_labels):
    """Print, for each required label, whether it occurs in df['label'].

    Args:
        df: DataFrame expected to have a ``label`` column. A DataFrame without
            that column (e.g. one built from zero rows) is treated as
            containing no labels instead of raising KeyError.
        required_labels: Iterable of label names to look for.
    """
    if 'label' in df.columns:
        labels_in_data = set(df['label'].unique())
    else:
        labels_in_data = set()

    for label in required_labels:
        if label in labels_in_data:
            print(f"Label {label} is present in the dataset")
        else:
            print(f"Warning: Missing label in dataset: {label}")

# Report label coverage for both splits. A missing label means that no OCR
# string matched an annotated text of that PII type in the split.
check_labels(train_df, required_labels)
check_labels(val_df, required_labels)


9. **Text Preprocessing and Model Training**

   - Preprocess the text by converting it to sequences and padding it.
   - Use `LabelEncoder` to encode the labels.
   - Build and train a neural network model with embedding, convolutional, and LSTM layers.
   - Save the trained model and preprocessors.


In [None]:
# Text preprocessing
# Vocabulary size and sequence length have to agree between the tokenizer,
# the padding step and the Embedding layer, so each value is named once.
VOCAB_SIZE = 10000
MAX_SEQUENCE_LENGTH = 100

tokenizer = Tokenizer(num_words=VOCAB_SIZE, oov_token='<OOV>')
tokenizer.fit_on_texts(train_df['text'].values)

def preprocess_text(texts):
    """Turn raw strings into integer sequences padded to MAX_SEQUENCE_LENGTH."""
    return pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_SEQUENCE_LENGTH)

X_train = preprocess_text(train_df['text'].values)
X_val = preprocess_text(val_df['text'].values)

# Label encoding: the encoder is fitted on the training labels only and then
# reused, unchanged, for the validation labels.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['label'].values)
y_val = label_encoder.transform(val_df['label'].values)

# Model training
# One softmax output unit per distinct training label.
num_classes = len(label_encoder.classes_)

model = Sequential()
model.add(Embedding(input_dim=VOCAB_SIZE, output_dim=128, input_length=MAX_SEQUENCE_LENGTH))
model.add(Conv1D(256, 5, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))
model.add(LSTM(128))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

# Labels are integer class ids, hence the sparse loss.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_val, y_val))

# Save the model and preprocessors
model.save('pii_model.h5')
joblib.dump(tokenizer, 'tokenizer.pkl')
joblib.dump(label_encoder, 'label_encoder.pkl')


10. **Define OCR and Prediction Functions**

This part of the process uses EasyOCR to detect and extract text from input images, and then predicts the type of Personally Identifiable Information (PII) for each piece of text using the deep learning model trained in the previous step:

 - OCR Execution: Utilizes EasyOCR to scan images for text, providing detailed results including the text content and its bounding box coordinates. This is essential for identifying textual elements within diverse and complex image backgrounds.

 - PII Classification: Each piece of extracted text is subsequently processed through a trained neural network model which categorizes it into specific PII types such as names, phone numbers, or email addresses. This classification is crucial for applications requiring data protection and compliance checks.

 - Visual Feedback: After classification, the original image is annotated with bounding boxes around each piece of text, color-coded by PII category. This visual augmentation helps users quickly understand the classification results and locate PII within the image.



In [None]:
# Load the model and preprocessors
# These are the three files written at the end of the training step.
# NOTE: joblib.load unpickles its input, so only load files you created yourself.
model = load_model('pii_model.h5')
tokenizer = joblib.load('tokenizer.pkl')
label_encoder = joblib.load('label_encoder.pkl')

# Initialize EasyOCR reader
reader = easyocr.Reader(['en'])

def predict_pii(texts):
    """Predict the PII type for a list of texts using the trained model.

    Args:
        texts: Iterable of strings to classify.

    Returns:
        An array of label names, one per input text. An empty input yields an
        empty array without calling the model.
    """
    texts = list(texts)
    if not texts:
        # An image without any detected text produces an empty batch, which
        # the model cannot be asked to predict on.
        return np.array([], dtype=object)

    seqs = tokenizer.texts_to_sequences(texts)
    # maxlen has to match the padding length used when the model was trained.
    padded_seqs = pad_sequences(seqs, maxlen=100)
    preds = model.predict(padded_seqs)
    # Highest-probability class index per row, mapped back to its label name.
    return label_encoder.inverse_transform(np.argmax(preds, axis=1))

def ocr_image(image_path):
    """Run EasyOCR with detail=1 and join neighbouring detections.

    Detections are visited in the order EasyOCR returns them. When a
    detection is adjacent (per is_adjacent) to the last kept entry, the two
    are combined: boxes are merged, texts are joined with a space and the
    lower confidence is kept. Otherwise the detection is kept as it is.

    Returns:
        A list of (bbox, text, confidence) entries.
    """
    merged = []

    for detection in reader.readtext(image_path, detail=1):
        box, fragment, score = detection

        if not merged or not is_adjacent(merged[-1][0], box):
            merged.append(detection)
            continue

        prev_box, prev_text, prev_score = merged[-1]
        merged[-1] = (
            merge_bboxes(prev_box, box),
            prev_text + " " + fragment,
            min(prev_score, score),
        )

    return merged

def is_adjacent(bbox1, bbox2, max_gap=10):
    """Return True when bbox2 continues bbox1 on the same text line.

    Boxes are four-point lists where index 0 is the top-left corner and
    index 2 the bottom-right corner, each given as [x, y].

    Args:
        bbox1: Box of the text fragment on the left.
        bbox2: Box of the candidate fragment on the right.
        max_gap: Exclusive upper bound, in pixels, on the horizontal distance
            between the right edge of bbox1 and the left edge of bbox2.
    """
    horizontal_gap = abs(bbox1[2][0] - bbox2[0][0])
    # Require the vertical extents to overlap; otherwise boxes on different
    # lines that happen to line up horizontally would be merged.
    vertical_overlap = min(bbox1[2][1], bbox2[2][1]) - max(bbox1[0][1], bbox2[0][1])
    return bool(horizontal_gap < max_gap and vertical_overlap > 0)

def merge_bboxes(bbox1, bbox2):
    """Return the axis-aligned rectangle that encloses both boxes.

    Every corner of both inputs is taken into account, so skewed quads are
    fully covered and not just their first and third points.

    Args:
        bbox1: Sequence of [x, y] points.
        bbox2: Sequence of [x, y] points.

    Returns:
        Four corners in the order top-left, top-right, bottom-right,
        bottom-left.
    """
    points = list(bbox1) + list(bbox2)
    xs = [point[0] for point in points]
    ys = [point[1] for point in points]
    x_min, x_max = min(xs), max(xs)
    y_min, y_max = min(ys), max(ys)
    return [[x_min, y_min], [x_max, y_min], [x_max, y_max], [x_min, y_max]]

def draw_boxes(image_path, ocr_results, labels):
    """Draw bounding boxes on the image based on OCR results and labels.

    Args:
        image_path: Path of the image to annotate.
        ocr_results: Entries whose first item is a four-point box (top-left at
            index 0, bottom-right at index 2).
        labels: One predicted label per OCR result, in the same order.

    Returns:
        The annotated image as loaded by cv2.imread (BGR channel order).

    Raises:
        OSError: If the image cannot be read.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None, not by raising.
        raise OSError(f"Could not read image: {image_path}")

    # Colours are BGR tuples.
    color_dict = {
        'Name': (0, 255, 255),  # Yellow
        'Phone': (255, 0, 0),   # Blue
        'Email': (0, 255, 255), # Yellow
        'Passport': (0, 0, 255),# Red
        'DriverLicense': (0, 0, 255), # Red
        'PostalCode': (255, 0, 0)  # Blue
    }
    font = cv2.FONT_HERSHEY_SIMPLEX

    for result, label in zip(ocr_results, labels):
        label = str(label)
        # OCR coordinates may be floats or numpy scalars; the cv2 drawing
        # calls need plain Python ints.
        top_left = tuple(int(round(value)) for value in result[0][0])
        bottom_right = tuple(int(round(value)) for value in result[0][2])
        color = color_dict.get(label, (0, 255, 0))  # Default green
        cv2.rectangle(image, top_left, bottom_right, color, 2)

        # Filled tag above the box, sized to the label text.
        text_size = cv2.getTextSize(label, font, 0.5, 2)[0]
        cv2.rectangle(image, (top_left[0], top_left[1] - 30), (top_left[0] + text_size[0], top_left[1]), color, -1)
        cv2.putText(image, label, (top_left[0], top_left[1] - 10), font, 0.5, (255,255,255), 2)
    return image


11. **Create Graphical User Interface (GUI)**

This step is about creating a user-friendly interface using Tkinter, allowing users to easily interact with the image processing and PII detection functionalities:

 - GUI Setup: Develops a Tkinter-based GUI that facilitates the entire operation from image loading to displaying the processed results. This setup aims to make the application accessible and practical for everyday use.

 - Interactive Elements: Incorporates interactive elements such as a button to load images and a panel to display the annotated results. These elements are designed to provide a seamless user experience, enabling users to load and process images without any technical expertise.

 - Results Visualization: Displays the processed images directly in the GUI, with PII text highlighted and labeled according to its category. This immediate feedback allows users to effectively review and analyze the detected PII, enhancing usability in tasks requiring quick data verification.

In [None]:
# Set up the main application window
root = tk.Tk()
root.title("PII Detection Tool")
root.geometry("1000x700")

# Function to open an image file, process it, and display the results
def open_image():
    """Let the user pick an image, run OCR and PII prediction, and show it.

    Nothing happens when the file dialog is cancelled. Any error raised while
    processing is shown in a message box, and the image currently on display
    is left untouched.
    """
    # Local import: messagebox is only needed by this callback.
    from tkinter import messagebox

    file_path = filedialog.askopenfilename(filetypes=[("Image files", "*.jpg *.jpeg *.png")])
    if not file_path:
        return

    try:
        ocr_results = ocr_image(file_path)
        texts = [result[1] for result in ocr_results]
        # Skip the model when no text was detected; there is nothing to classify.
        labels = predict_pii(texts) if texts else []
        processed_image = draw_boxes(file_path, ocr_results, labels)

        # Convert image for display in Tkinter
        processed_image = cv2.cvtColor(processed_image, cv2.COLOR_BGR2RGB)
        processed_image = Image.fromarray(processed_image)
        processed_image.thumbnail((800, 600))
        img = ImageTk.PhotoImage(processed_image)
    except Exception as exc:
        # GUI boundary: show the failure to the user; Tk would otherwise only
        # print it to stderr.
        messagebox.showerror("PII Detection Tool", f"Could not process image:\n{exc}")
        return

    # Display the image
    panel.config(image=img)
    # Keep a reference on the widget so the image is not garbage-collected.
    panel.image = img

# Create UI elements
panel = tk.Label(root)
panel.pack(padx=10, pady=10)

btn = ttk.Button(root, text="Load Image", command=open_image)
btn.pack(pady=20)

# Start the GUI event loop
root.mainloop()
