In [2]:
import os
from PIL import Image

import cv2
import numpy as np
# Import necessary libraries
import os
import cv2
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from skimage.feature import hog
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import EarlyStopping



# Section 1: Binarize the source scans.
# Every page data/1.jpg ... data/16.jpg is converted to 8-bit grayscale and then to a
# two-level black-and-white image: pixels with a value below 128 become black (0),
# all other pixels become white (255). Results are written to grayscale/<n>_bw.jpg.
#
# NOTE(review): the output is saved as JPEG, which is a lossy format, so the file on
# disk may contain intermediate gray values near edges. The file name is kept as-is
# because Section 2 reads `grayscale/<n>_bw.jpg` (and re-thresholds what it loads).
output_dir = "grayscale"
os.makedirs(output_dir, exist_ok=True)

for i in range(1, 17):
    img_path = f"data/{i}.jpg"
    # Open through a context manager so the underlying file handle is released
    # deterministically; convert() returns an independent in-memory image.
    with Image.open(img_path) as source_img:
        img = source_img.convert('L')  # 8-bit grayscale
    # Mode '1' is Pillow's 1-bit image mode; the lambda applies the fixed threshold.
    bw_img = img.point(lambda x: 0 if x < 128 else 255, '1')
    bw_img.save(os.path.join(output_dir, f"{i}_bw.jpg"))

print("Conversion completed and saved to grayscale directory.")



Conversion completed and saved to grayscale directory.


In [3]:
# Section 2: Extract table cells from the black-and-white images.
# Each page is passed to extract_cells, which isolates the horizontal and vertical ruling lines of the
# table, finds the contours of the resulting grid, and keeps the regions whose bounding-box area passes
# a size filter. Every extracted cell is saved directly in the 'all_cells' directory (no sub-folders).

# NOTE: this rebinds `output_dir`, which Section 1 used for the 'grayscale' directory.
output_dir = "all_cells"
os.makedirs(output_dir, exist_ok=True)

def extract_cells(image_path, output_dir, image_index,
                  min_area=300000, max_area=1350000, line_length=40):
    """Crop table cells out of one page image and save each as a JPEG file.

    The page is thresholded (inverted, so dark pixels become foreground), long
    horizontal and vertical strokes are isolated with morphological opening, and
    the contours of the combined line mask are taken as cell candidates. A
    candidate is kept when the area of its bounding box lies strictly between
    ``min_area`` and ``max_area``. Kept cells are cropped from the grayscale
    image and written to ``output_dir`` as ``image_<image_index>_cell_<n>.jpg``,
    with ``n`` starting at 1.

    Args:
        image_path: Path of the page image to read.
        output_dir: Existing directory that receives the cell images.
        image_index: Page identifier embedded in each output file name.
        min_area: Exclusive lower bound on bounding-box area (pixels).
        max_area: Exclusive upper bound on bounding-box area (pixels).
        line_length: Length in pixels of the 1-pixel-thick structuring elements
            used to pick out horizontal and vertical lines.

    Raises:
        OSError: If the image cannot be read or a cell image cannot be written.

    Note:
        Cells are numbered by ``y * image_width + x`` of the bounding box's
        top-left corner. Because x is always smaller than the image width, any
        difference in y outranks x, so cells of one visual row whose tops differ
        by even a pixel are ordered by y rather than left-to-right.
    """
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        # cv2.imread reports a missing/undecodable file by returning None rather
        # than raising; fail here with the offending path instead of deeper in cv2.
        raise OSError(f"Could not read image: {image_path}")

    # Inverted threshold: pixels at or below 128 become 255 so that dark table
    # lines are the foreground for the morphology below.
    _, binary = cv2.threshold(img, 128, 255, cv2.THRESH_BINARY_INV)

    # Opening with a long, 1-pixel-thick kernel keeps only strokes running in
    # that kernel's direction.
    horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (line_length, 1))
    detect_horizontal = cv2.morphologyEx(binary, cv2.MORPH_OPEN, horizontal_kernel, iterations=2)
    vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, line_length))
    detect_vertical = cv2.morphologyEx(binary, cv2.MORPH_OPEN, vertical_kernel, iterations=2)

    # Merge both line masks into the table grid and trace its contours.
    table_structure = cv2.add(detect_horizontal, detect_vertical)
    contours, _ = cv2.findContours(table_structure, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

    # Compute each bounding box once, then order them top-to-bottom / left-to-right.
    # list.sort is stable, so ties keep the order returned by findContours.
    image_width = img.shape[1]
    boxes = [cv2.boundingRect(contour) for contour in contours]
    boxes.sort(key=lambda box: box[1] * image_width + box[0])

    cell_number = 0
    for x, y, w, h in boxes:
        if not (min_area < w * h < max_area):
            continue  # too small (noise) or too large (e.g. the whole table)
        cell_number += 1
        cell_path = os.path.join(output_dir, f"image_{image_index}_cell_{cell_number}.jpg")
        # cv2.imwrite returns False on failure instead of raising.
        if not cv2.imwrite(cell_path, img[y:y+h, x:x+w]):
            raise OSError(f"Could not write cell image: {cell_path}")

# Feed each of the 16 black-and-white pages (grayscale/1_bw.jpg .. grayscale/16_bw.jpg)
# to extract_cells; the page number doubles as the index used in the output file names.
for page_number in range(1, 17):
    extract_cells(f"grayscale/{page_number}_bw.jpg", output_dir, page_number)

print("Cell extraction completed. All cells saved to 'all_cells' directory.")


Cell extraction completed. All cells saved to 'all_cells' directory.
