# In [None]:
# notebooks/data_exploration.ipynb
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Root folder of the dataset, plus the two print sub-folders inside it
dataset_path = r"E:\Projects\QR Code Authentication Model\dataset"
first_print_path, second_print_path = (
    os.path.join(dataset_path, print_dir)
    for print_dir in ("First_Print", "Second_Print")
)

def load_image_paths(folder):
    """Return the paths of the image files located directly in ``folder``.

    An entry counts as an image when its name ends with ``.png``, ``.jpg``
    or ``.jpeg`` (case-insensitive). Sub-directories are not searched.

    Args:
        folder: Directory to scan.

    Returns:
        A list of ``os.path.join(folder, name)`` paths, sorted by file name.

    Raises:
        OSError: Propagated from ``os.listdir`` when ``folder`` cannot be
            listed (for example ``FileNotFoundError``).
    """
    image_extensions = ('.png', '.jpg', '.jpeg')
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # "first N samples" picked downstream identical on every run/platform.
    return [
        os.path.join(folder, name)
        for name in sorted(os.listdir(folder))
        if name.lower().endswith(image_extensions)
    ]

# Gather the image paths of both print folders in one pass
first_images, second_images = (
    load_image_paths(folder)
    for folder in (first_print_path, second_print_path)
)

# Report how many images each folder contributed
print("Number of First Print images:", len(first_images))
print("Number of Second Print images:", len(second_images))

# Visualize some sample images
def show_samples(image_paths, title, num_samples=4):
    """Display the first ``num_samples`` images side by side in one row.

    Args:
        image_paths: Sequence of image file paths; only the first
            ``num_samples`` entries are shown.
        title: Text placed above every displayed image.
        num_samples: Number of columns in the figure (default 4).

    Images that OpenCV cannot read are reported with a warning and skipped;
    their subplot slot is left empty.
    """
    plt.figure(figsize=(10, 10))
    for i, img_path in enumerate(image_paths[:num_samples]):
        img = cv2.imread(img_path)
        # cv2.imread() signals failure by returning None instead of raising,
        # which would otherwise surface as an opaque error in cvtColor().
        if img is None:
            print("Warning: could not read image:", img_path)
            continue
        # OpenCV loads pixels as BGR; convert to RGB before handing the
        # array to matplotlib.
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        plt.subplot(1, num_samples, i+1)
        plt.imshow(img)
        plt.title(title)
        plt.axis('off')
    plt.show()

# Preview the leading images of each print folder
show_samples(image_paths=first_images, title="First Print")
show_samples(image_paths=second_images, title="Second Print")

# Plot image dimensions distribution
def get_image_dims(image_paths):
    """Return the ``(height, width)`` of every readable image.

    Args:
        image_paths: Iterable of image file paths.

    Returns:
        A list with the first two entries of each loaded image's ``shape``.
        Paths that OpenCV cannot read are reported with a warning and
        skipped, so the result may be shorter than ``image_paths``.
    """
    dims = []
    for path in image_paths:
        img = cv2.imread(path)
        # cv2.imread() returns None for missing/corrupt files; accessing
        # .shape on it would raise AttributeError.
        if img is None:
            print("Warning: could not read image:", path)
            continue
        dims.append(img.shape[:2])
    return dims

# Measure every image of both print folders
dims_first, dims_second = (
    get_image_dims(paths) for paths in (first_images, second_images)
)

# Transpose each list of dimension pairs into separate height and width tuples
(h_first, w_first), (h_second, w_second) = (
    zip(*dims) for dims in (dims_first, dims_second)
)

# Overlay the height distributions of both print folders.
# `fill=True` replaces the `shade=True` keyword, which seaborn has deprecated
# and scheduled for removal.
sns.kdeplot(h_first, label="First Print Height", fill=True)
sns.kdeplot(h_second, label="Second Print Height", fill=True)
plt.legend()
plt.title("Distribution of Image Heights")
plt.show()
