<a href="https://colab.research.google.com/github/saadkhi/GSoC-2025-Task/blob/main/Specific_Test_I.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Task 1 (Use 1 PDF for training model)**

In [10]:
from google.colab import auth, drive
import gdown
import os

# Authentication for Google Drive
auth.authenticate_user()

# Mount Google Drive
drive.mount('/content/drive')

# Folder ID from the shared link
folder_id = "1-91y1fQHanXfzx5WUy0qLroflfgVnk_L"

# Destination in your Google Drive
destination = "/content/drive/My Drive/dataset"

# Create the destination folder if it doesn't exist
os.makedirs(destination, exist_ok=True)

# Use gdown to download the folder
gdown.download_folder(f"https://drive.google.com/drive/folders/{folder_id}", output=destination, quiet=False, use_cookies=False)

print("Download complete! Files saved in:", destination)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Retrieving folder contents


Retrieving folder 1-ATumAsFodV9Yjb4sChmslZ45HKtW9pX images
Processing file 1-JaE5tNwyNHGDG60GHVihshifrVGgRKj Ezcaray - Vozes.pdf
Processing file 1-7pV-zFu6cVuty3wRhjMTN3XbpBhQzHC Mendo - Principe perfecto.pdf


Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1-JaE5tNwyNHGDG60GHVihshifrVGgRKj
To: /content/drive/My Drive/dataset/Ezcaray - Vozes.pdf
100%|██████████| 3.44M/3.44M [00:00<00:00, 46.4MB/s]
Downloading...
From: https://drive.google.com/uc?id=1-7pV-zFu6cVuty3wRhjMTN3XbpBhQzHC
To: /content/drive/My Drive/dataset/Mendo - Principe perfecto.pdf
100%|██████████| 2.37M/2.37M [00:00<00:00, 60.3MB/s]

Download complete! Files saved in: /content/drive/My Drive/dataset



Download completed


In [11]:
!pip install pdf2image
!apt-get install -y poppler-utils

import tensorflow as tf
print(tf.__version__)

import os
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from pdf2image import convert_from_path
from sklearn.model_selection import train_test_split
from sklearn.metrics import jaccard_score, f1_score, accuracy_score

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.6).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.
2.18.0


In [12]:
# Define paths
pdf_folder_path = "/content/drive/My Drive/dataset/"  # Folder containing PDFs
output_image_folder = "/content/pdf_images"

# Create directories if they don't exist
if not os.path.exists(output_image_folder):
    os.makedirs(output_image_folder)

# Function to convert PDF to images
def pdf_to_images(pdf_path, output_folder):
    images = convert_from_path(pdf_path)
    base_filename = os.path.splitext(os.path.basename(pdf_path))[0] # get filename without extension
    for i, image in enumerate(images):
        image.save(os.path.join(output_folder, f"{base_filename}_page_{i}.jpg"), "JPEG")

# Convert all PDFs to images
pdf_files = [f for f in os.listdir(pdf_folder_path) if f.endswith(".pdf")]
for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_folder_path, pdf_file)
    pdf_to_images(pdf_path, output_image_folder)

# Load images and masks
image_paths = [os.path.join(output_image_folder, f) for f in os.listdir(output_image_folder) if f.endswith(".jpg")]
images = [cv2.imread(path) for path in image_paths]

# Function to create simple masks (replace with your actual mask creation)
def create_masks(image_paths):
    masks = []
    for image_path in image_paths:
        img = cv2.imread(image_path)
        mask = np.zeros_like(img[:, :, 0], dtype=np.uint8)
        mask[100:img.shape[0]-100, 100:img.shape[1]-100] = 255
        masks.append(mask)
    return masks

masks = create_masks(image_paths)

# Normalize images and create numpy arrays
images = [img / 255.0 for img in images]
resized_images = [cv2.resize(img, (256, 256)) for img in images]
resized_masks = [cv2.resize(mask, (256, 256), interpolation=cv2.INTER_NEAREST) / 255.0 for mask in masks]
X = np.array(resized_images)
y = np.array(resized_masks)[:, :, :, np.newaxis]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ImageDataGenerator for augmentation
datagen = ImageDataGenerator(
    rotation_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest'
)

# U-Net model
def build_unet(input_shape=(256, 256, 3)):
    inputs = layers.Input(input_shape)
    # Encoder
    conv1 = layers.Conv2D(64, 3, activation='relu', padding='same')(inputs)
    conv1 = layers.Conv2D(64, 3, activation='relu', padding='same')(conv1)
    pool1 = layers.MaxPooling2D(pool_size=(2, 2))(conv1)
    # Bottleneck
    conv_mid = layers.Conv2D(256, 3, activation='relu', padding='same')(pool1)
    conv_mid = layers.Conv2D(256, 3, activation='relu', padding='same')(conv_mid)
    # Decoder
    up7 = layers.Conv2DTranspose(64, 2, strides=(2, 2), padding='same')(conv_mid)
    merge7 = layers.concatenate([up7, conv1], axis=3)
    conv7 = layers.Conv2D(64, 3, activation='relu', padding='same')(merge7) # Corrected line
    conv7 = layers.Conv2D(64, 3, activation='relu', padding='same')(conv7)
    outputs = layers.Conv2D(1, 1, activation='sigmoid')(conv7)
    model = models.Model(inputs=inputs, outputs=outputs)
    return model

# Compile the model
model = build_unet()
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model with ImageDataGenerator
model.fit(datagen.flow(X_train, y_train, batch_size=32),
          steps_per_epoch=int(len(X_train) / 32),
          epochs=10,
          validation_data=(X_test, y_test))

# Evaluate the model
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(np.uint8)

# Calculate IoU and F1-score
iou = jaccard_score(y_test.flatten(), y_pred_binary.flatten())
f1 = f1_score(y_test.flatten(), y_pred_binary.flatten())

print(f"Test IoU: {iou}, Test F1-Score: {f1}")

  self._warn_if_super_not_called()


Epoch 1/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 5s/step - accuracy: 0.6058 - loss: 0.6919 - val_accuracy: 0.8034 - val_loss: 0.6001
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 709ms/step - accuracy: 0.8038 - loss: 0.5936 - val_accuracy: 0.8034 - val_loss: 0.5280
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 916ms/step - accuracy: 0.8038 - loss: 0.5177 - val_accuracy: 0.8034 - val_loss: 0.6263
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.8038 - loss: 0.6089 - val_accuracy: 0.8034 - val_loss: 0.5206
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 908ms/step - accuracy: 0.8038 - loss: 0.5115 - val_accuracy: 0.8034 - val_loss: 0.5243
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.8038 - loss: 0.5211 - val_accuracy: 0.8034 - val_loss: 0.5317
Epoch 7/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━

In [13]:
accuracy = accuracy_score(y_test.flatten(), y_pred_binary.flatten())
print(f"Test Accuracy: {accuracy}")

Test Accuracy: 0.803375244140625


# **Task 1 (Use all PDF's for training model)**

In [14]:
from google.colab import auth, drive
import gdown
import os

# Authentication for Google Drive
auth.authenticate_user()

# Mount Google Drive
drive.mount('/content/drive')

# Folder ID from the shared link
folder_id = "1acoMZD4i2OTYFcoRHfH3_INITYPvN-UR"

# Destination in your Google Drive
destination = "/content/drive/My Drive/dataset_all"

# Create the destination folder if it doesn't exist
os.makedirs(destination, exist_ok=True)

# Use gdown to download the folder
gdown.download_folder(f"https://drive.google.com/drive/folders/{folder_id}", output=destination, quiet=False, use_cookies=False)

print("Download complete! Files saved in:", destination)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Retrieving folder contents


Processing file 1IuFzvkG8eKQQYV2BHQp-PRAgZNSbTTkB Buendia - Instruccion.pdf
Processing file 1hFA6Xl6qSTMWHdWofO8M5vXr4dlwFrx4 Constituciones sinodales Calahorra 1602.pdf
Processing file 1NIq9VdyuQQwmlptcdwcfVyHKtFfiRKxr ES-AHPHU - J-000312-0014 – 1579.pdf
Processing file 15acKc0qnG_OiGxLigIJVAknlOSQ5S4Hu Ezcaray - Vozes.pdf
Processing file 17MCzlffI2JavfKE4M4yqGVnP814p2SNH J&#x3a;0017&#x3a;03-J&#x3a;0085&#x3a;11 – 1799-1845.pdf
Processing file 1P4bXC8olGpZQ2Dp-azT-W3XG0HWuxV2I Mendo - Principe perfecto.pdf
Processing file 1tkeFJLLHOzaKxef7FAzyTmVVpfd394aJ Paredes - Reglas generales.pdf
Processing file 1yDoblfJxzM906V07RxzcNy5w303jyQHo PORCONES.228.35 – 1636.pdf


Retrieving folder contents completed
Building directory structure
Building directory structure completed
Downloading...
From: https://drive.google.com/uc?id=1IuFzvkG8eKQQYV2BHQp-PRAgZNSbTTkB
To: /content/drive/My Drive/dataset_all/Buendia - Instruccion.pdf
100%|██████████| 3.29M/3.29M [00:00<00:00, 161MB/s]
Downloading...
From: https://drive.google.com/uc?id=1hFA6Xl6qSTMWHdWofO8M5vXr4dlwFrx4
To: /content/drive/My Drive/dataset_all/Constituciones sinodales Calahorra 1602.pdf
100%|██████████| 1.81M/1.81M [00:00<00:00, 122MB/s]
Downloading...
From: https://drive.google.com/uc?id=1NIq9VdyuQQwmlptcdwcfVyHKtFfiRKxr
To: /content/drive/My Drive/dataset_all/ES-AHPHU - J-000312-0014 – 1579.pdf
100%|██████████| 8.44M/8.44M [00:00<00:00, 134MB/s]
Downloading...
From: https://drive.google.com/uc?id=15acKc0qnG_OiGxLigIJVAknlOSQ5S4Hu
To: /content/drive/My Drive/dataset_all/Ezcaray - Vozes.pdf
100%|██████████| 3.44M/3.44M [00:00<00:00, 171MB/s]
Downloading...
From: https://drive.google.com/uc?id=17MCz

Download complete! Files saved in: /content/drive/My Drive/dataset_all


Download completed


In [15]:
!pip install pdf2image
!apt-get install -y poppler-utils

import tensorflow as tf
print(tf.__version__)

import os
import cv2
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from pdf2image import convert_from_path
from sklearn.model_selection import train_test_split
from sklearn.metrics import jaccard_score, f1_score, accuracy_score

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.6).
0 upgraded, 0 newly installed, 0 to remove and 29 not upgraded.
2.18.0


In [None]:
# Define paths
pdf_folder_path = "/content/drive/My Drive/dataset_all/"  # Folder containing PDFs
output_image_folder = "/content/pdf_images"

# Create directories if they don't exist
if not os.path.exists(output_image_folder):
    os.makedirs(output_image_folder)

# Function to convert PDF to images
def pdf_to_images(pdf_path, output_folder):
    images = convert_from_path(pdf_path)
    base_filename = os.path.splitext(os.path.basename(pdf_path))[0] # get filename without extension
    for i, image in enumerate(images):
        image.save(os.path.join(output_folder, f"{base_filename}_page_{i}.jpg"), "JPEG")

# Convert all PDFs to images
pdf_files = [f for f in os.listdir(pdf_folder_path) if f.endswith(".pdf")]
for pdf_file in pdf_files:
    pdf_path = os.path.join(pdf_folder_path, pdf_file)
    pdf_to_images(pdf_path, output_image_folder)

# Load images and masks
image_paths = [os.path.join(output_image_folder, f) for f in os.listdir(output_image_folder) if f.endswith(".jpg")]
images = [cv2.imread(path) for path in image_paths]

# Function to create simple masks (replace with your actual mask creation)
def create_masks(image_paths):
    masks = []
    for image_path in image_paths:
        img = cv2.imread(image_path)
        mask = np.zeros_like(img[:, :, 0], dtype=np.uint8)
        mask[100:img.shape[0]-100, 100:img.shape[1]-100] = 255
        masks.append(mask)
    return masks

masks = create_masks(image_paths)

# Normalize images and create numpy arrays
images = [img / 255.0 for img in images]
resized_images = [cv2.resize(img, (256, 256)) for img in images]
resized_masks = [cv2.resize(mask, (256, 256), interpolation=cv2.INTER_NEAREST) / 255.0 for mask in masks]
X = np.array(resized_images)
y = np.array(resized_masks)[:, :, :, np.newaxis]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ImageDataGenerator for augmentation
datagen = ImageDataGenerator(
    rotation_range=10,
    width_shift_range=0.1,
    height_shift_range=0.1,
    shear_range=0.1,
    zoom_range=0.1,
    horizontal_flip=True,
    fill_mode='nearest'
)

# U-Net model
def build_unet(input_shape=(256, 256, 3)):
    inputs = layers.Input(input_shape)
    # Encoder
    conv1 = layers.Conv2D(64, 3, activation='relu', padding='same')(inputs)
    conv1 = layers.Conv2D(64, 3, activation='relu', padding='same')(conv1)
    pool1 = layers.MaxPooling2D(pool_size=(2, 2))(conv1)
    # Bottleneck
    conv_mid = layers.Conv2D(256, 3, activation='relu', padding='same')(pool1)
    conv_mid = layers.Conv2D(256, 3, activation='relu', padding='same')(conv_mid)
    # Decoder
    up7 = layers.Conv2DTranspose(64, 2, strides=(2, 2), padding='same')(conv_mid)
    merge7 = layers.concatenate([up7, conv1], axis=3)
    conv7 = layers.Conv2D(64, 3, activation='relu', padding='same')(merge7) # Corrected line
    conv7 = layers.Conv2D(64, 3, activation='relu', padding='same')(conv7)
    outputs = layers.Conv2D(1, 1, activation='sigmoid')(conv7)
    model = models.Model(inputs=inputs, outputs=outputs)
    return model

# Compile the model
model = build_unet()
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model with ImageDataGenerator
model.fit(datagen.flow(X_train, y_train, batch_size=32),
          steps_per_epoch=int(len(X_train) / 32),
          epochs=10,
          validation_data=(X_test, y_test))

# Evaluate the model
y_pred = model.predict(X_test)
y_pred_binary = (y_pred > 0.5).astype(np.uint8)

# Calculate IoU and F1-score
iou = jaccard_score(y_test.flatten(), y_pred_binary.flatten())
f1 = f1_score(y_test.flatten(), y_pred_binary.flatten())

print(f"Test IoU: {iou}, Test F1-Score: {f1}")