# Team Aloo Parathaaa @ Snehasish_Notebook2
### OCR -> Custom Line Detection -> Basic Regex -> Write to csv file
### For entity_name: weight, wattage, voltage, volume

# Dependancies

In [31]:
# General
import re
import cv2
import time
import math
import torch
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt

# Data Extraction
import os
import sys
import zipfile
import requests
import pandas as pd
from io import BytesIO
from pathlib import Path

# Images
from IPython import display
from PIL import Image

# PaddleOCR dpendencies
!pip install -q paddlepaddle-gpu==2.6.1
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
if device == "cpu": 
    !pip install -q paddlepaddle
!pip install -q paddleocr

# Setting up OCR
import paddle
import logging
from paddleocr import PaddleOCR, draw_ocr
OCR = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=True)

# Set logging level to WARNING to suppress DEBUG logs
logging.getLogger('ppocr').setLevel(logging.WARNING)

[2024/12/01 20:11:36] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=True, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/root/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_length=25, rec_ch

In [32]:
# Checking if OCR on GPU
print("OCR GPU Compile Check: ", paddle.device.is_compiled_with_cuda())
print("OCR on GPU check: ", paddle.device.get_device())
print("Current device: ", device)

OCR GPU Compile Check:  True
OCR on GPU check:  gpu:0
Current device:  cuda:0


# FULL PROCESS

## Helper Functions

In [33]:
def distance_point_to_line(px, py, x1, y1, x2, y2):
    """ Calculate the perpendicular distance from point (px, py) to the line (x1, y1)-(x2, y2) """
    numerator = abs((y2 - y1) * px - (x2 - x1) * py + x2 * y1 - y2 * x1)
    denominator = math.sqrt((y2 - y1)**2 + (x2 - x1)**2)
    return numerator / denominator

def line_angle(x1, y1, x2, y2):
    """ Calculate the angle of the line (x1, y1) to (x2, y2) with respect to the horizontal axis """
    return abs(math.degrees(math.atan2(y2 - y1, x2 - x1)))%360

def is_approximately_vertical(angle, lower_bound=75, upper_bound=105):
    """ Check if the normalized angle is between lower_bound and upper_bound degrees """
    return lower_bound <= angle <= upper_bound

def is_approximately_horizontal(angle, lower_bound=0, upper_bound=60):
    """ Check if the normalized angle is between lower_bound and upper_bound degrees """
    return lower_bound <= angle <= upper_bound

def is_approximately_diagonal(angle, lower_bound=50, upper_bound=70):
    """ Check if the normalized angle is between lower_bound and upper_bound degrees """
    return lower_bound <= angle <= upper_bound

def calculate_centroid(box):
  sum_x, sum_y = 0, 0
  for point in box:
    sum_x += point[0]
    sum_y += point[1]
  return (sum_x // len(box), sum_y // len(box))

## Regex

In [34]:
# Dictionary to store all dimensions 
units_map = {
    'mm'        : 'millimetre',
    'millimetre': 'millimetre',
    'cm'        : 'centimetre',
    'centimetre': 'centimetre',
    'in'        : 'inch',
    '"'         : 'inch',
    'inch'      : 'inch',
    "'"         : 'foot',
    'ft'        : 'foot',
    'foot'      : 'foot',
    'yd'        : 'yard',
    'yard'      : 'yard',
    'metre'     : 'metre',
}

def extract_number(text):
    # Use regular expression to find integers and decimal numbers in the text
    match = re.search(r'\d+(\.\d+)?', text)
    # Return the found number as a float if a match is found
    return float(match.group()) if match else None

# Example usage
text = "H: 16.32mm"
number = extract_number(text)
print(number)

16.32


In [35]:
def compute_dimension(file_name = "box1", image_url = "", mode = "h", show0 = False, show = False, show2 = False, show_time = True, show_results = True):
    start = time.time()
    time_mode = 2
    
    
    ############# EXTRACT OCR ####################
    # Read image
    image_BRG = None
    image_path = f"/kaggle/input/the-dataset/images/train/{file_name}.jpg"
    if image_url != "":
        response = requests.get(image_url)
        image_bytes = BytesIO(response.content)
        image_array = np.array(bytearray(image_bytes.read()), dtype=np.uint8)
        img_BRG = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
    else: 
        img_BRG = cv2.imread(image_path)

    # Center of Image
    height, width, _ = img_BRG.shape
    center_x = width // 2
    center_y = height // 2
    cv2.circle(img_BRG, (center_x, center_y), radius=20, color=(225, 0, 255), thickness=-1)

    # Perform OCR on the image
    result = OCR.ocr(img_BRG, cls=True)
    
    # Early Stopping 
    if result[0] is None:
        return ("-1","")
    
    # Extract bounding boxes, texts, and confidence scores
    bbox   = [line[0] for line in result[0]]
    texts  = [line[1][0] for line in result[0]]
    scores = [line[1][1] for line in result[0]]
    
    ### TIME
    if time_mode == 1:
        end = time.time()
        print("Time to extract OCR: ", end-start)


    ############# SELECT RELEVANT OCR ####################
    # Only considering bounding box with dimensions
    dims = []
    unit_names = []
    dim_scores = []
    dim_bboxes = []
    for line in result:
        for word in line:
            bbox, dim, score = word[0], word[1][0], word[1][1]
            # Checking for valid unit
            for unit, full_name in units_map.items():
                if unit in dim.lower():
                    unit_names.append(full_name)
                    dim_scores.append(score)
                    dim_bboxes.append(bbox)
                    dims.append(dim)
       
    
    ############# RELEVANT OCR BBOX CENTER ####################
    # Calculating centres of bounding boxes
    object_centre      = (center_x, center_y)
    dim_bboxes_centre  = [calculate_centroid(box) for box in dim_bboxes]
    
    # Render centers of all bounding boxes
    if show:
        for point in dim_bboxes_centre:
            x, y = map(int, point)
            cv2.circle(img_BRG, (x,y), radius=8, color=(0, 0, 255), thickness=-1)


    ############# LINE DETECTION ####################
    # Convert to grayscale
    gray = cv2.cvtColor(img_BRG, cv2.COLOR_BGR2GRAY)

    # Edge detection
    edges = cv2.Canny(gray, 50, 150, apertureSize=3)
    # Detect lines using Hough Line Transform
    lines = cv2.HoughLinesP(edges, 1, np.pi / 180, 100, minLineLength=100, maxLineGap=10)

    # Early Stopping
    if lines is None:
        return ("-1","")
    
    # Draw the lines on the image
    if show2:
        for line in lines:
            x1, y1, x2, y2 = line[0]
            cv2.line(img_BRG, (x1, y1), (x2, y2), (0, 255, 0), 2)

    ############# FIND DIMENSION USING NEAREST LINE ANGLE ####################
    
    candidates = []
        
    for i, bbox_centre in enumerate(dim_bboxes_centre):
        min_distance = float('inf')
        closest_line = None

        for line in lines:
            x1, y1, x2, y2 = line[0]
            px, py = bbox_centre

            # Calculate the distance from the bounding box center to the line
            dist = distance_point_to_line(px, py, x1, y1, x2, y2)

            # Check if this line is closer than previously found
            if dist < min_distance:
                min_distance = dist
                closest_line = line[0]

        if closest_line is not None:
            x1, y1, x2, y2 = closest_line

            # Calculate the angle of the line
            angle = line_angle(x1, y1, x2, y2)
            if show: 
                print(f"{i}. Bbox: {bbox_centre}, Dimension: {dims[i]}")
                print(f"   Line: {closest_line}, Angle {line_angle(x1, y1, x2, y2)}")
                print(f"   {is_approximately_horizontal(angle)}")

            # Check if the line is approximately vertical
            if mode == "h" and is_approximately_vertical(angle):
                candidates.append((bbox_centre, dims[i], unit_names[i]))
            elif mode == "l" and is_approximately_horizontal(angle):
                candidates.append((bbox_centre, dims[i], unit_names[i]))
            elif mode == "w" and is_approximately_horizontal(angle):
                candidates.append((bbox_centre, dims[i], unit_names[i]))
            
            
    ############# GETTING NUMERIC VALUE ##############
    max_dim = -1
    max_dim_name = ""
    for cen_dim_name in candidates:
        cur_dim = extract_number(cen_dim_name[1])
        if cur_dim != None:
            if (cur_dim > max_dim): 
                max_dim = cur_dim
                max_dim_name = cen_dim_name[2]
    
    
    ############# RESULTS ####################
    if show_time:
        if time_mode == 2:
            end = time.time()
            print(f"TIME FOR FULL PROCESS: {end-start}")
            
    if show_results:
        # Candidates
        print(f"{mode.upper()} Candidates:")
        for idx, (center, dimension, name) in enumerate(candidates, start=1):
            print(f"{idx} | Center: {center} | Dimension: {dimension} {name}")
        print(f"\nFINAL {mode.upper()}: {max_dim} {max_dim_name}")

    # Displaying dimensions only 
    if show2:
        if show0:
            print(f"\nDimensions: {len(dims)}, {dims}")
            print(f"Scores: {len(dim_scores)}, {dim_scores}")
            print(f"Text Bboxes: {len(dim_bboxes)}, {dim_bboxes}\n\n")
        print(f"\nNumber of relevant bboxes: {len(dims)}")
        image_with_text_boxes = draw_ocr(img_BRG, dim_bboxes, dims, dim_scores, font_path = "/kaggle/input/times-new-roman-addon/Times New Roman.ttf")
        image_with_text_boxes_rgb = cv2.cvtColor(image_with_text_boxes, cv2.COLOR_BGR2RGB)
        # Show the image with bounding boxes
        plt.figure(figsize = (20,20))
        plt.imshow(image_with_text_boxes_rgb)
        plt.axis('off')
        plt.show()
    
    # Returning Answer
    return (f"{max_dim} {max_dim_name}", texts)

In [38]:
# Function to determine 'mode' based on entity_name
def determine_mode(entity_name):
    if entity_name in ['depth', 'height']:
        return 'h'
    elif entity_name == 'width':
        return 'w'
    else:
        return 'Invalid' 

In [39]:
# Read the CSV file
df  = pd.read_csv('/kaggle/input/amazon-ml-challenge-2024/test.csv')  
df2 = pd.read_csv('/kaggle/input/amazon-ml-challenge-2024/test.csv')  

# Latter Half
start_index = 65594
#df_latter_half = df.iloc[start_index:].reset_index(drop=True)
#df_latter_half2 = df_latter_half.copy(deep=True)
#df_latter_half.to_csv('/kaggle/working/test_latter_half_65594.csv', index=False)

# Counters
count_wrong = 0
count_correct = 0

# dfs in use
df_in_use = df
df_in_use_ocr = df2

# Version of saved files
ver = "_2"

# Iterate over each row and perform operations
for index, row in df_in_use.iterrows():
    
    # Step 0: Start Index
    if index < start_index: continue
        
    # Step 1: Store 'image_link' in 'current_url'
    current_url = row['image_link']
    
    # Step 2: Determine 'mode' based on 'entity_name'
    current_mode = determine_mode(row['entity_name'])
    
    # Step 3: Write a desired string into a new 'entity_value' column
    if current_mode != 'Invalid':
        (answer, ocr_out) = compute_dimension(file_name = "", image_url = current_url, mode = current_mode, show = False, show2 = False, show_time = False, show_results = False)
        # OCR Output
        if ocr_out != "": ocr_out_string = '[' + ', '.join(ocr_out) + ']'
        else: ocr_out_string = ""
        # Storing results
        if "-1" in answer: 
            df_in_use.at[index, 'entity_value'] = ""
            df_in_use_ocr.at[index, 'OCR Output']  = ocr_out_string
            count_wrong += 1
        else: 
            df_in_use.at[index, 'entity_value']  = answer
            df_in_use_ocr.at[index, 'OCR Output']   = ocr_out_string
            count_correct += 1
    else:
        df_in_use.at[index, 'entity_value']  = ""
        df_in_use_ocr.at[index, 'OCR Output']   = ""
    
    # Step 4: Every 1000 rows write .csv file to kaggle directory
    if index == start_index+15:
        df_in_use.to_csv(f'/kaggle/working/test_results{ver}.csv', index = False)
        df_in_use_ocr.to_csv(f'/kaggle/working/test_OCR{ver}.csv', index = False)

    if (index + 1) % 5000 == 0 or (index + 1) == len(df_in_use):
        df_in_use.to_csv(f'/kaggle/working/test_results{ver}.csv', index = False)
        df_in_use_ocr.to_csv(f'/kaggle/working/test_OCR{ver}.csv', index = False)
        print(f"Progress: {index+1}/{len(df_in_use)}")
    
    #if index == 65593:
    #    break
    
# Write the updated DataFrame with the new 'entity_value' column back to a new CSV file
df_in_use.to_csv(f'/kaggle/working/test_results_final{ver}.csv', index = False)
df_in_use_ocr.to_csv(f'/kaggle/working/test_OCR_final{ver}.csv', index = False)

print("\nFilled Responses:  ", count_correct)
print("Blank Responses:   ", count_wrong)
print("Total Responses:   ", count_correct+count_wrong)
print("% Filled: ", count_correct/(count_correct+count_wrong)*100)
print("\nCOMPLETED")

Progress: 70000/131187
Progress: 75000/131187
Progress: 80000/131187
Progress: 85000/131187
Progress: 90000/131187
Progress: 95000/131187
Progress: 100000/131187
Progress: 105000/131187
Progress: 110000/131187
Progress: 115000/131187
Progress: 120000/131187
Progress: 125000/131187
Progress: 130000/131187
Progress: 131187/131187

Filled Responses:   27190
Blank Responses:    11082
Total Responses:    38272
% Filled:  71.04410535117057

COMPLETED


##### COMMENT OUT BEFORE AUTOMATED RUNNING 

In [None]:
#!rm -rf /kaggle/working/*
#!ls /kaggle/working/