In [1]:
import os
import warnings

import cv2
import pandas as pd
import pytesseract
from IPython.core.interactiveshell import InteractiveShell

# Suppress every warning category for the rest of the kernel session.
warnings.filterwarnings('ignore')
# NOTE(review): this installs another blanket "ignore" filter and looks
# redundant with the call above — confirm whether both are needed.
warnings.simplefilter('ignore')
# allow displaying multiple outputs: with "all", every expression statement
# in a cell is displayed, not only the last one
InteractiveShell.ast_node_interactivity = "all"


# Print the library versions in use for this run.
print(f"Pytesseract version: {pytesseract.__version__}")
print(f"Open CV version: {cv2.__version__}")

Pytesseract version: 0.3.10
Open CV version: 4.7.0


### Arguments: path to the face-detection model

In [2]:
# Haar cascade XML file that _detect_face passes to cv2.CascadeClassifier.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
path_to_model = 'C:/Source/ML.Detection/ML models/haarcascade_frontalface_default.xml'

### Functions to detect a face and text in an image

In [3]:
def _detect_face(image_path):
    """Report whether the Haar cascade finds at least one face in an image.

    Parameters
    ----------
    image_path : str
        Path of the image file to inspect.

    Returns
    -------
    bool
        True when the cascade reports one or more detections, False otherwise.

    Raises
    ------
    ValueError
        If the cascade file at ``path_to_model`` could not be loaded, or if
        OpenCV could not read ``image_path`` as an image.
    """
    # A classifier built from a missing/invalid file stays empty; fail here
    # with a clear message rather than later inside detectMultiScale.
    face_cascade = cv2.CascadeClassifier(path_to_model)
    if face_cascade.empty():
        raise ValueError(f"Could not load cascade classifier from: {path_to_model}")

    # cv2.imread signals failure by returning None instead of raising, which
    # would otherwise surface as an opaque error inside cvtColor.
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Could not read image: {image_path}")

    # Detection is run on the grayscale version of the image.
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Same values the positional call used: 1.1 image-pyramid scale step and
    # at least 4 neighbouring detections required to keep a candidate.
    faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=4)

    return len(faces) > 0
    
def _detect_text(image_path):
    """Run Tesseract OCR on an image and return any text it finds.

    Parameters
    ----------
    image_path : str
        Path of the image file to inspect.

    Returns
    -------
    str or bool
        The unmodified OCR output when it contains at least one
        non-whitespace character, otherwise False.

    Raises
    ------
    ValueError
        If OpenCV could not read ``image_path`` as an image.
    """
    # cv2.imread signals failure by returning None instead of raising, which
    # would otherwise surface as an opaque error inside cvtColor.
    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Could not read image: {image_path}")

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Binarise with Otsu's method to remove noise and enhance text;
    # cv2.threshold returns (threshold_value, image), so [1] is the image.
    binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

    text = pytesseract.image_to_string(binary)

    # OCR output made only of whitespace (e.g. a trailing newline or
    # form-feed page separator) is truthy but is not text, so treat it as
    # "nothing found". The returned string itself is left unstripped.
    return text if text.strip() else False

In [4]:
def detector(image_path):
    """Classify an image as an identity document or a head shot.

    Progress is reported with ``print`` while the checks run.

    Parameters
    ----------
    image_path : str
        Path of the image file to classify.

    Returns
    -------
    str or None
        "Identity" when both a face and text are detected, "Headshot" when a
        face is detected but no text, and None when no face is detected.
    """
    print("> Starting to detect from the image provided ...")

    # Without a face the image is neither category; text is not checked at all.
    if not _detect_face(image_path):
        print("\t > Faces or text not detected in the image ...")
        return None

    print("\t > A face has been detected in the image ...")

    # OCR is the expensive step, so it runs exactly once per image.
    if _detect_text(image_path):
        print("\t > Text identified from the imgae..")
        print("> We conclude that the image is an official document ...")
        return "Identity"

    print("\t > No text extracted from this image ...")
    print("> We conclude that the image is just a head shot ...")
    return "Headshot"

In [5]:
# Try the detector on a single image taken from the Identity data folder.
detector(r'C:/Source/ML.Detection/Data/Identity/75c905df157de24e9cadd48b010f362e.jpg')

> Starting to detect from the image provided ...
	 > A face has been detected in the image ...
	 > Text identified from the imgae..
> We conclude that the image is an official document ...


'Identity'

In [6]:
# Run the detector over every file under the Headshot folder and tabulate
# what each image was classified as.
folder_path = r'C:\Source\ML.Detection\Data\Headshot'
file_paths = []
headshot_records = []

for root, directories, files in os.walk(folder_path):
    for filename in files:
        filepath = os.path.join(root, filename)
        file_paths.append(filepath)
        # detector returns "Identity", "Headshot", or None (no face found).
        output = detector(filepath)
        headshot_records.append(['Headshot', filename, output])

# Build the frame once from the collected rows instead of growing it
# row by row inside the loop.
output_headshot_df = pd.DataFrame(headshot_records, columns=['folder', 'image', 'Detectedas'])

output_headshot_df

> Starting to detect from the image provided ...
	 > A face has been detected in the image ...
	 > Text identified from the imgae..
> We conclude that the image is an official document ...
> Starting to detect from the image provided ...
	 > A face has been detected in the image ...
	 > No text extracted from this image ...
> We conclude that the image is just a head shot ...
> Starting to detect from the image provided ...
	 > A face has been detected in the image ...
	 > No text extracted from this image ...
> We conclude that the image is just a head shot ...
> Starting to detect from the image provided ...
	 > A face has been detected in the image ...
	 > No text extracted from this image ...
> We conclude that the image is just a head shot ...
> Starting to detect from the image provided ...
	 > A face has been detected in the image ...
	 > No text extracted from this image ...
> We conclude that the image is just a head shot ...
> Starting to detect from the image provided ...
	 

Unnamed: 0,folder,image,Detectedas
0,Headshot,0998488474d203e03366e7b787eec930.jpg,Identity
1,Headshot,1737-09-sized.webp,Headshot
2,Headshot,3107316-large.jfif,Headshot
3,Headshot,Agnieszka-Glica.jpg,Headshot
4,Headshot,background_fixed.jpg,Headshot
5,Headshot,Desislava-Miteva.jpg,Headshot
6,Headshot,emanuela-contora.1024x1024.jpg,Headshot
7,Headshot,fj_passport_35x45mm.jpg,Identity
8,Headshot,Mxolisi-400x400.jpg,Headshot
9,Headshot,OIP (1).jfif,Headshot


In [7]:
# Run the detector over every file under the Identity folder and tabulate
# what each image was classified as.
folder_path = r'C:\Source\ML.Detection\Data\Identity'
file_paths = []
identity_records = []

for root, directories, files in os.walk(folder_path):
    for filename in files:
        filepath = os.path.join(root, filename)
        file_paths.append(filepath)
        # detector returns "Identity", "Headshot", or None (no face found).
        output = detector(filepath)
        identity_records.append(['Identity', filename, output])

# Build the frame once from the collected rows instead of growing it
# row by row inside the loop.
output_identity_df = pd.DataFrame(identity_records, columns=['folder', 'image', 'Detectedas'])

output_identity_df

> Starting to detect from the image provided ...
	 > A face has been detected in the image ...
	 > Text identified from the imgae..
> We conclude that the image is an official document ...
> Starting to detect from the image provided ...
	 > A face has been detected in the image ...
	 > Text identified from the imgae..
> We conclude that the image is an official document ...
> Starting to detect from the image provided ...
	 > A face has been detected in the image ...
	 > Text identified from the imgae..
> We conclude that the image is an official document ...
> Starting to detect from the image provided ...
	 > A face has been detected in the image ...
	 > Text identified from the imgae..
> We conclude that the image is an official document ...
> Starting to detect from the image provided ...
	 > A face has been detected in the image ...
	 > Text identified from the imgae..
> We conclude that the image is an official document ...
> Starting to detect from the image provided ...
	 > A 

Unnamed: 0,folder,image,Detectedas
0,Identity,131.webp,Identity
1,Identity,149820.jfif,Identity
2,Identity,1599px-JTK_Stearne_Passport.jpg,Identity
3,Identity,75c905df157de24e9cadd48b010f362e.jpg,Identity
4,Identity,ab5ea5134d5d2c1986b5b44e0703cb4f.jpg,Identity
5,Identity,download (1).jfif,Identity
6,Identity,download (2).jfif,Identity
7,Identity,download (3).jfif,Identity
8,Identity,download (4).jfif,Identity
9,Identity,download (5).jfif,Headshot
