<a href="https://colab.research.google.com/github/probml/probml-notebooks/blob/main/notebooks/convert_image_formats.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install pdf2image

Collecting pdf2image
  Downloading pdf2image-1.16.0-py3-none-any.whl (10 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.16.0


In [2]:
import pdf2image

In [3]:
!sudo apt-get install poppler-utils

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 40 not upgraded.
Need to get 154 kB of archives.
After this operation, 613 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic-updates/main amd64 poppler-utils amd64 0.62.0-2ubuntu2.12 [154 kB]
Fetched 154 kB in 0s (361 kB/s)
debconf: unable to initialize frontend: Dialog
debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 1.)
debconf: falling back to frontend: Readline
debconf: unable to initialize frontend: Readline
debconf: (This frontend requires a controlling tty.)
debconf: falling back to frontend: Teletype
dpkg-preconfigure: unable to re-open stdin: 
Selecting previously unselected package poppler-utils.
(Reading database ... 160837 

In [4]:
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [32]:
# pip install pdf2image
# pip install --upgrade pillow

import os
import shutil
from pdf2image import convert_from_path
from PIL import Image
from PIL import ImageCms
import argparse
from glob import glob
from tqdm import tqdm
import functools
import multiprocessing
import concurrent.futures 


def split_file_name(input_path):
    base_name, dir_name = os.path.basename(input_path),os.path.dirname(input_path)
    file_name,ext = os.path.splitext(os.path.basename(base_name))
    return base_name, dir_name, file_name, ext

def convert(input_path,output_path,color_space="CMYK",input_profile_path=None,output_profile_path=None,quality=100,verbose=False,overwrite=False):
    """ converts an image or pdf into a color space of choice
        for CMYK the default output format is JPG
        Keyword arguments:
        input_path -- the input path of the file
        output_path -- the output path for the result to be written.
        color_space -- the color space to convert to , default value is CMYK
        input_profile_path -- the path to the input profile 
        output_profile_path -- the path to the output profile
    """
    try:
        if not overwrite and os.path.exists(output_path):
            return True
        
        if input_path.endswith(".pdf") or input_path.endswith(".PDF"):
            #_, dir_name, file_name, _ =split_file_name(output_path)
            _, dir_name, file_name, _ =split_file_name(input_path)
            temp_file_name="temp"+file_name
            temp_file_path=os.path.join(dir_name,temp_file_name)
            print("converting ", input_path, " to ", temp_file_path)
            convert_from_path(input_path,output_file=temp_file_path,fmt="png",use_pdftocairo=True,single_file=True,
                              use_cropbox=True)
            temp_file_path+=".png"
            print("converting ", temp_file_path, " to ", output_path)
            _convert_profiles(temp_file_path,output_path,color_space=color_space,
                              input_profile_path=input_profile_path,output_profile_path=output_profile_path,quality=quality)
            os.remove(temp_file_path)
            return True
        elif input_path.endswith(".png") or input_path.endswith(".PNG") or \
            input_path.endswith(".jpg") or input_path.endswith(".JPG") or \
            input_path.endswith(".jpeg") or input_path.endswith(".JPEG") :
            return _convert_profiles(input_path,output_path,color_space=color_space,input_profile_path=input_profile_path,output_profile_path=output_profile_path,quality=quality)
        else:
            print(f"{input_path} is not a valid image file, copying it instead to {output_path}.")
            shutil.copy(input_path,output_path)
            return False
    except Exception as e:
        if verbose:
            print(f"Error in file: {input_path}\n",e)
        return False





def _convert_profiles(input_path=None,output_path=None,color_space="CMYK",input_profile_path=None,output_profile_path=None,quality="100"):
    try:
        with Image.open(input_path) as im:
            img_cmyk = ImageCms.profileToProfile(im, input_profile_path, output_profile_path, renderingIntent=0,outputMode=color_space)
            quality=int(quality)
            img_cmyk.save(output_path, quality=quality)
            
            return True
    except Exception as e:
        print(e)
        print(f"cannot convert{input_path}, copying it instead.")
        shutil.copy(input_path,output_path)
        return False


# from https://pillow.readthedocs.io/en/stable/handbook/tutorial.html?highlight=cmyk#using-the-image-class
def check_image_properties(input_path):
    try:
        with Image.open(input_path) as im:
            print(input_path, im.format, f"{im.size}x{im.mode}")
    except OSError as e:
        print("error opening the image\n",e)


In [39]:
from glob import glob
files=glob("/content/drive/MyDrive/MLAPA/book-images-original/*.*")
p=[print(f) for f in files]

filenames = []
for f in files:
  parts = f.split("/")
  fname = parts[-1]
  base = fname.split(".")[:-1][0]
  #filenames.append(base)
  filenames.append(fname)

print(filenames)

/content/drive/MyDrive/MLAPA/book-images-original/ae_mnist_conv_20d_linear.pdf
/content/drive/MyDrive/MLAPA/book-images-original/ae_mnist_conv_20d_samples.pdf
/content/drive/MyDrive/MLAPA/book-images-original/separable-conv2d.pdf
/content/drive/MyDrive/MLAPA/book-images-original/vae_mnist_conv_20d_samples.pdf
/content/drive/MyDrive/MLAPA/book-images-original/vae_mnist_conv_20d_linear.pdf
/content/drive/MyDrive/MLAPA/book-images-original/largeMarginPrinciple2.pdf
/content/drive/MyDrive/MLAPA/book-images-original/svmCvSurf.pdf
['ae_mnist_conv_20d_linear.pdf', 'ae_mnist_conv_20d_samples.pdf', 'separable-conv2d.pdf', 'vae_mnist_conv_20d_samples.pdf', 'vae_mnist_conv_20d_linear.pdf', 'largeMarginPrinciple2.pdf', 'svmCvSurf.pdf']


In [40]:
in_folder = "/content/drive/MyDrive/MLAPA/book-images-original"
for use_rgb in [False, True]:
  if use_rgb:
    out_folder = "/content/drive/MyDrive/MLAPA/book-images-rgb-80"
    color_space = "RGB"
    quality = 80
  else:
    out_folder = "/content/drive/MyDrive/MLAPA/book-images-cmyk-100"
    color_space = "CMYK"
    quality = 100

  rgb_profile = 'sRGB Color Space Profile.icm'
  cmyk_profile = 'USWebCoatedSWOP.icc'
  profile_folder = '/content/drive/MyDrive/MLAPA'
  input_profile_path = f'{profile_folder}/{rgb_profile}'
  if color_space == "RGB":
    output_profile_path = f'{profile_folder}/{rgb_profile}'
  else:
    output_profile_path = f'{profile_folder}/{cmyk_profile}'

  for fname in filenames:
    base = fname.split(".")[:-1][0]
    in_name = f'{in_folder}/{fname}'
    #in_name = f'{in_folder}/{fname}.pdf'
    out_name = f'{out_folder}/{base}.jpg'
    print('!converting ', in_name, ' to ', out_name)
    convert(in_name,
            out_name, 
            color_space=color_space, 
            quality=quality,
            verbose=True,
            input_profile_path=input_profile_path,
            output_profile_path=output_profile_path)


!converting  /content/drive/MyDrive/MLAPA/book-images-original/ae_mnist_conv_20d_linear.pdf  to  /content/drive/MyDrive/MLAPA/book-images-cmyk-100/ae_mnist_conv_20d_linear.jpg
converting  /content/drive/MyDrive/MLAPA/book-images-original/ae_mnist_conv_20d_linear.pdf  to  /content/drive/MyDrive/MLAPA/book-images-original/tempae_mnist_conv_20d_linear
converting  /content/drive/MyDrive/MLAPA/book-images-original/tempae_mnist_conv_20d_linear.png  to  /content/drive/MyDrive/MLAPA/book-images-cmyk-100/ae_mnist_conv_20d_linear.jpg
!converting  /content/drive/MyDrive/MLAPA/book-images-original/ae_mnist_conv_20d_samples.pdf  to  /content/drive/MyDrive/MLAPA/book-images-cmyk-100/ae_mnist_conv_20d_samples.jpg
converting  /content/drive/MyDrive/MLAPA/book-images-original/ae_mnist_conv_20d_samples.pdf  to  /content/drive/MyDrive/MLAPA/book-images-original/tempae_mnist_conv_20d_samples
converting  /content/drive/MyDrive/MLAPA/book-images-original/tempae_mnist_conv_20d_samples.png  to  /content/drive

In [35]:
!ls /content/drive/MyDrive/MLAPA/book-images-original

d2l-conv1d.pdf		   d2l-densenet-v2.png
d2l-conv-1x1.pdf	   d2l-inception-full-rotated-v2.png
d2l-conv-multi-in.pdf	   d2l-inception.pdf
d2l-correlation.pdf	   d2l-lenet.pdf
d2l-correlation-v2.png	   d2l-pooling.pdf
d2l-densenet-block.pdf	   EigenFergusNoModel.pdf
d2l-densenet-block-v2.png  EigenFergus-v2.png
d2l-densenet.pdf


In [36]:
!ls /content/drive/MyDrive/MLAPA/book-images-rgb-80

d2l-conv1d.jpg		   d2l-densenet-v2.jpg
d2l-conv-1x1.jpg	   d2l-inception-full-rotated-v2.jpg
d2l-conv-multi-in.jpg	   d2l-inception.jpg
d2l-correlation.jpg	   d2l-lenet.jpg
d2l-correlation-v2.jpg	   d2l-pooling.jpg
d2l-densenet-block.jpg	   EigenFergusNoModel.jpg
d2l-densenet-block-v2.jpg  EigenFergus-v2.jpg
d2l-densenet.jpg


In [37]:
!ls /content/drive/MyDrive/MLAPA/book-images-cmyk-100

d2l-conv1d.jpg		   d2l-densenet-v2.jpg
d2l-conv-1x1.jpg	   d2l-inception-full-rotated-v2.jpg
d2l-conv-multi-in.jpg	   d2l-inception.jpg
d2l-correlation.jpg	   d2l-lenet.jpg
d2l-correlation-v2.jpg	   d2l-pooling.jpg
d2l-densenet-block.jpg	   EigenFergusNoModel.jpg
d2l-densenet-block-v2.jpg  EigenFergus-v2.jpg
d2l-densenet.jpg
