# Serial Extraction Agro-check Autorotation

**Installing the necessary libraries**

In [16]:
! pip install -q pytesseract Tesseract opencv-python

**Importing the necessary libraries**

In [17]:
# Importing the required libraries
import numpy as np
import matplotlib.pyplot as plt
import os
import glob
import cv2
import re
import pytesseract
import shutil
import csv
import random
import pandas as pd

from PIL import Image
from PIL import ImageFilter
from PIL import ImageEnhance

**Connect to google drive with below code (if necessary)**

In [18]:
# from google.colab import drive
# drive.mount('/content/drive')

**Tesseract Path if using MacOS**

In [19]:
# To get the path of tesseract on Mac

# ! which tesseract

# set the path of tesseract for Mac below

# pytesseract.pytesseract.tesseract_cmd = '/opt/homebrew/bin/tesseract'  # Update this path to your Tesseract installation path


**Tesseract Path if using Windows**

In [20]:
# set the path in windows

# pytesseract.pytesseract.tesseract_cmd = r'your_code_here' 

## Extracting serial numbers from all images

**Set the path pattern of the image folder below**

Uncomment and change folder name as per system requirements.

In [21]:
# Glob pattern selecting the image files to process; the extraction loop
# passes it to glob.glob(). Keep exactly one assignment uncommented.

# Path pattern for image files (example for MacOS)
# path_pattern = "1.rawimages/*.jpg"

# Path pattern for image files (example for Windows)
path_pattern = "1. Rawimages/*.jpg"

**Set RegEx pattern for serial number recognition below**

In [22]:
# Define the regex pattern for serial numbers:
#   \b        word boundary, so the serial is not part of a longer token
#   [A-C]{2}  exactly two letters, each one of A, B or C
#   \d{7}     exactly seven digits
#   \b        closing word boundary
serial_pattern = r'\b[A-C]{2}\d{7}\b'

**Set SKU number below**

In [23]:
# SKU value written into the first column of every output record.
sku_options = 'ZIMB91U'

In [24]:
# Initialize lists to store the extracted serial numbers.
# Rows in data_records always have four fields so that they line up with the
# four-column header / DataFrame used for the "all records" output.
data_records = []  # [sku, serial, authnum, filename] for every record
serial_records = []  # [sku, serial, authnum] for records with a serial number
na_records = []  # [sku, "NA", authnum, filename] for images without a serial
processed_serials = set()  # To prevent processing duplicates

# Tesseract configuration
custom_oem_psm_config = r'--oem 3 --psm 6'


def _crop_serial_region(image):
    """Return the fixed sub-region of ``image`` that is searched for the serial.

    The region spans 15%-27% of the image height and 74%-94% of its width.
    """
    height, width = image.shape[:2]
    return image[int(0.15 * height):int(0.27 * height), int(0.74 * width):int(0.94 * width)]


def _enhance_for_ocr(region):
    """Return a brightened, contrast-boosted, sharpened, Otsu-thresholded copy of ``region``."""
    pil_img = Image.fromarray(cv2.cvtColor(region, cv2.COLOR_BGR2RGB))
    pil_img = ImageEnhance.Brightness(pil_img).enhance(1.5)
    pil_img = ImageEnhance.Contrast(pil_img).enhance(2.0)
    pil_img = ImageEnhance.Sharpness(pil_img).enhance(2.0)
    enhanced_img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
    gray = cv2.cvtColor(enhanced_img, cv2.COLOR_BGR2GRAY)
    _, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return thresh


def _find_serials(image):
    """Run Tesseract on ``image`` and return all substrings matching ``serial_pattern``."""
    text = pytesseract.image_to_string(image, config=custom_oem_psm_config)
    return re.findall(serial_pattern, text)


def _record_serials(matches):
    """Store every serial in ``matches`` that has not been recorded before."""
    for match in matches:
        if match in processed_serials:
            continue
        # The filename column is left empty for successful reads; it is only
        # filled in for "NA" rows. The trailing "" keeps the row four fields wide.
        data_records.append([sku_options, match, "", ""])
        serial_records.append([sku_options, match, ""])
        processed_serials.add(match)


def _record_na(filename):
    """Store an "NA" row for ``filename`` in ``data_records`` and ``na_records``."""
    record = [sku_options, "NA", "", filename]
    data_records.append(record)
    na_records.append(record)


# Iterate over files that match the given pattern
for img_path in glob.glob(path_pattern):
    filename = os.path.basename(img_path)

    # The number after the last "_" decides whether the image is processed.
    try:
        file_number = int(filename.split('_')[-1].split('.')[0])
    except ValueError:
        # A stray file must not abort the whole batch.
        print(f'Skipping {filename}: file name does not end in "_<number>.<ext>"')
        continue

    # Only odd-numbered images are processed.
    if file_number % 2 == 0:
        continue

    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread returns None instead of raising when a file cannot be
        # decoded; report the file as "NA" so it shows up for manual review.
        _record_na(filename)
        continue

    cropped_img = _crop_serial_region(img)

    # First attempt without enhancements
    matches = _find_serials(cropped_img)

    # Apply enhancements if no matches found
    if not matches:
        matches = _find_serials(_enhance_for_ocr(cropped_img))

    # Rotate the image by 180 degrees and retry if no matches are found
    if not matches:
        img_rotated = cv2.rotate(img, cv2.ROTATE_180)
        matches = _find_serials(_crop_serial_region(img_rotated))

    if matches:
        _record_serials(matches)
    else:
        _record_na(filename)


**Writing the data to a csv file**

In [25]:
# Write the data to CSV files
csv_file_path = '2. Ready CSV File/output.csv'
serial_csv_file_path = '2. Ready CSV File/serial_output.csv'
na_csv_file_path = '2. Ready CSV File/na_output.csv'

# (destination path, header row, rows) for each file that is produced:
# all records, records with serial numbers, and records with "NA".
csv_outputs = [
    (csv_file_path, ['sku', 'serial', 'authnum', 'filename'], data_records),
    (serial_csv_file_path, ['sku', 'serial', 'authnum'], serial_records),
    (na_csv_file_path, ['sku', 'serial', 'authnum', 'filename'], na_records),
]

for out_path, out_header, out_rows in csv_outputs:
    # open(..., 'w') does not create missing folders, so make sure the
    # destination directory exists before writing.
    os.makedirs(os.path.dirname(out_path) or '.', exist_ok=True)
    with open(out_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(out_header)
        writer.writerows(out_rows)

print(f'Data has been written to {csv_file_path}, {serial_csv_file_path}, and {na_csv_file_path}')


Data has been written to 2. Ready CSV File/output.csv, 2. Ready CSV File/serial_output.csv, and 2. Ready CSV File/na_output.csv


## In order to display the records locally

In [26]:
# Wrap each record list in a DataFrame so it can be inspected in the notebook.
# The serial-only frame uses the first three column names; the other two use all four.
_record_columns = ['SKU', 'Serial Number', 'AuthNum', 'Filename']
df_dr = pd.DataFrame(data_records, columns=_record_columns)
df_sr = pd.DataFrame(serial_records, columns=_record_columns[:3])
df_nr = pd.DataFrame(na_records, columns=_record_columns)


**All Records**

In [27]:
# Display the frame built from data_records (serial rows and "NA" rows together).
df_dr

Unnamed: 0,SKU,Serial Number,AuthNum,Filename
0,ZIMB91U,AB5580659,,
1,ZIMB91U,AA5885429,,
2,ZIMB91U,AA2524371,,
3,ZIMB91U,AA0745362,,
4,ZIMB91U,AB4106623,,
...,...,...,...,...
95,ZIMB91U,AB1063554,,
96,ZIMB91U,AA7743820,,
97,ZIMB91U,AA2215807,,
98,ZIMB91U,AB2870055,,


**NA Records**

In [28]:
# Display the frame built from na_records (images for which no serial was found).
df_nr

Unnamed: 0,SKU,Serial Number,AuthNum,Filename
0,ZIMB91U,,,1_67.jpg
1,ZIMB91U,,,1_159.jpg
2,ZIMB91U,,,1_63.jpg
3,ZIMB91U,,,1_61.jpg
4,ZIMB91U,,,1_113.jpg
5,ZIMB91U,,,1_107.jpg
6,ZIMB91U,,,1_111.jpg
7,ZIMB91U,,,1_25.jpg
8,ZIMB91U,,,1_133.jpg
9,ZIMB91U,,,1_21.jpg


**Serial Records**

In [29]:
# Display the frame built from serial_records (rows with an extracted serial number).
df_sr

Unnamed: 0,SKU,Serial Number,AuthNum
0,ZIMB91U,AB5580659,
1,ZIMB91U,AA5885429,
2,ZIMB91U,AA2524371,
3,ZIMB91U,AA0745362,
4,ZIMB91U,AB4106623,
...,...,...,...
79,ZIMB91U,AB1063554,
80,ZIMB91U,AA7743820,
81,ZIMB91U,AA2215807,
82,ZIMB91U,AB2870055,
