#### Downloading all images from the URLs listed in the datasets

In [None]:
import re
import constants
import os
import requests
import pandas as pd
import multiprocessing
import time
from time import time as timer
from tqdm import tqdm
import numpy as np
from pathlib import Path
from functools import partial
import requests
import urllib
from PIL import Image

In [None]:
def common_mistake(unit):
    """Return an allowed spelling of *unit* when a known misspelling is found.

    The unit is returned as-is when it is already in
    ``constants.allowed_units``. Otherwise the substitutions
    ``'ter' -> 'tre'`` and ``'feet' -> 'foot'`` are tried in that order, and
    the first result found in ``constants.allowed_units`` is returned. When
    nothing matches, the original *unit* is returned unchanged.
    """
    allowed = constants.allowed_units
    if unit in allowed:
        return unit

    # Each substitution is applied to the original unit, not cumulatively.
    for wrong, right in (('ter', 'tre'), ('feet', 'foot')):
        corrected = unit.replace(wrong, right)
        if corrected in allowed:
            return corrected

    return unit

def parse_string(s):
    """Parse a ``"<number> <unit>"`` string into a ``(number, unit)`` pair.

    Args:
        s: Text such as ``"2.5 gram"``. ``None``, any value whose ``str()``
            is ``'nan'``, and blank strings are treated as missing.

    Returns:
        ``(None, None)`` for missing input; otherwise ``(float, str)`` where
        the unit has been passed through ``common_mistake`` and is a member
        of ``constants.allowed_units``.

    Raises:
        ValueError: If the text is not a number followed by whitespace and
            an alphabetic unit, or if the unit is not an allowed unit.
    """
    # Identity comparison: `== None` would dispatch to the operand's own
    # __eq__, which is not guaranteed to return a plain bool.
    if s is None or str(s) == 'nan':
        return None, None

    s_stripped = s.strip()
    if not s_stripped:
        return None, None

    if not re.match(r'^-?\d+(\.\d+)?\s+[a-zA-Z\s]+$', s_stripped):
        raise ValueError("Invalid format in {}".format(s))

    # The pattern above guarantees exactly two whitespace-separated parts.
    number_text, unit_text = s_stripped.split(maxsplit=1)
    number = float(number_text)
    unit = common_mistake(unit_text)
    if unit not in constants.allowed_units:
        raise ValueError("Invalid unit [{}] found in {}. Allowed units: {}".format(
            unit, s, constants.allowed_units))
    return number, unit


def create_placeholder_image(image_save_path):
    """Write a 100x100 black RGB image to *image_save_path* (best effort).

    Any exception raised while creating or saving the image is suppressed,
    so the function always returns ``None``.
    """
    try:
        Image.new('RGB', (100, 100), color='black').save(image_save_path)
    except Exception:
        # Deliberately best effort: a failed placeholder must not abort the
        # surrounding download run.
        pass

def download_image(image_link, save_folder, retries=3, delay=3):
    """Download one image into *save_folder*, falling back to a placeholder.

    Args:
        image_link: URL of the image. Non-string values (e.g. NaN cells from
            a DataFrame column) are ignored.
        save_folder: Directory the file is written to; the file name is the
            last path component of *image_link*.
        retries: Maximum number of download attempts.
        delay: Seconds to wait between failed attempts.

    Returns:
        None. If the target file already exists nothing is downloaded. If
        every attempt fails, ``create_placeholder_image`` is called for the
        target path.
    """
    # Imported explicitly: a bare `import urllib` does not load the
    # `urllib.request` submodule on its own.
    import urllib.request

    if not isinstance(image_link, str):
        return

    filename = Path(image_link).name
    image_save_path = os.path.join(save_folder, filename)

    if os.path.exists(image_save_path):
        return

    for attempt in range(retries):
        try:
            urllib.request.urlretrieve(image_link, image_save_path)
            return
        except Exception:
            # `Exception` (not a bare except) keeps the download best effort
            # while letting KeyboardInterrupt/SystemExit stop the run.
            # Skip the wait after the last attempt; nothing follows it.
            if attempt < retries - 1:
                time.sleep(delay)

    # Create a black placeholder image for invalid links/images
    create_placeholder_image(image_save_path)

def download_images(image_links, download_folder, allow_multiprocessing=True):
    """Download every link in *image_links* into *download_folder*.

    The folder is created when it does not exist. With
    ``allow_multiprocessing`` true the links are handed to a pool of 64
    worker processes; otherwise they are downloaded one by one. Both paths
    call ``download_image`` with ``retries=3`` and ``delay=3`` and show a
    tqdm progress bar.
    """
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)

    if not allow_multiprocessing:
        # Sequential path.
        for link in tqdm(image_links, total=len(image_links)):
            download_image(link, save_folder=download_folder, retries=3, delay=3)
        return

    worker = partial(
        download_image, save_folder=download_folder, retries=3, delay=3)

    with multiprocessing.Pool(64) as pool:
        # Wrapping the lazy imap iterator in list() drives it to completion
        # while tqdm reports progress.
        list(tqdm(pool.imap(worker, image_links), total=len(image_links)))
        pool.close()
        pool.join()

#### Download and save images

In [None]:
import os
import pandas as pd

In [None]:
# Folder that holds the dataset CSV files (relative path).
DATASET_FOLDER = '../dataset/'
# Load each CSV into its own DataFrame; the download cells below read the
# 'image_link' column of train, test and sample_test.
train = pd.read_csv(os.path.join(DATASET_FOLDER, 'train.csv'))
test = pd.read_csv(os.path.join(DATASET_FOLDER, 'test.csv'))
sample_test = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test.csv'))
sample_test_out = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test_out.csv'))

In [None]:
# Same value as the DATASET_FOLDER assignment in the data-loading cell.
# NOTE(review): looks redundant - confirm before removing.
DATASET_FOLDER = '../dataset/'

### Run Sanity check using src/sanity.py

In [None]:
# Run sanity.py on the sample test file and its sample output file.
!python sanity.py --test_filename ../dataset/sample_test.csv --output_filename ../dataset/sample_test_out.csv

In [None]:
# Run sanity.py against sample_test_out_fail.csv.
# NOTE(review): the "_fail" suffix suggests this output is expected to be rejected - confirm.
!python sanity.py --test_filename ../dataset/sample_test.csv --output_filename ../dataset/sample_test_out_fail.csv

### Download images

In [None]:
# Download the images linked in sample_test['image_link'] into ../images/sample_test.
# NOTE(review): this uses download_images from the utils module - presumably
# the same helper as the one defined earlier in this file; confirm.
from utils import download_images
download_images(sample_test['image_link'], '../images/sample_test')

In [None]:
# Download the images linked in test['image_link'] into ../images/test.
from utils import download_images
download_images(test['image_link'], '../images/test')

In [None]:
# Download the images linked in train['image_link'] into ../images/train.
from utils import download_images
download_images(train['image_link'], '../images/train')

#### Extract Text from image

In [None]:
import os
import pandas as pd
import numpy as np
import shutil
import requests
import multiprocessing
import time
from time import time as timer
from tqdm import tqdm
from pathlib import Path
from functools import partial
import requests
import urllib
from PIL import Image
import csv

# installing the libraries

# !pip install pytesseract
import re
import pytesseract
import cv2


# Function for preprocessing and saving the images


def process_and_save_images(directory_path, csv_output_path,
                            gray_save_dir='/kaggle/working/gray_images/',
                            bw_save_dir='/kaggle/working/bw_images/',
                            processed_save_dir='/kaggle/working/processed_images/'):
    """Preprocess every ``*.jpg`` in a folder, OCR it, and log the text to CSV.

    For each image the function saves a grayscale version, a binary
    (threshold 130) version and a "noise removed" version, then runs
    ``pytesseract.image_to_string`` on the last one and writes one CSV row.

    Args:
        directory_path: Folder that is searched (non-recursively) for
            ``*.jpg`` files.
        csv_output_path: CSV file to create; it gets the header
            ``Image Filename, Extracted Text`` followed by one row per
            image that could be loaded.
        gray_save_dir: Folder for the grayscale images.
        bw_save_dir: Folder for the thresholded images.
        processed_save_dir: Folder for the images passed to OCR.

    Returns:
        None. Images that ``cv2.imread`` cannot load are reported with a
        printed message and skipped.
    """
    # Ensure the save directories exist
    for save_dir in (gray_save_dir, bw_save_dir, processed_save_dir):
        os.makedirs(save_dir, exist_ok=True)

    # Sorted so the numeric suffix given to each saved image is reproducible;
    # glob order itself is filesystem dependent.
    image_files = sorted(Path(directory_path).glob('*.jpg'))

    # Structuring element shared by all morphological operations below.
    # NOTE(review): with a 1x1 kernel and a blur size of 1 the "noise removal"
    # steps are likely identity transforms - confirm the intended sizes.
    kernel = np.ones((1, 1), np.uint8)

    with open(csv_output_path, 'w', newline='', encoding='utf-8') as csvfile:
        csv_writer = csv.writer(csvfile)
        csv_writer.writerow(['Image Filename', 'Extracted Text'])  # Header row

        for i, image_file in enumerate(image_files, start=1):
            img = cv2.imread(str(image_file))

            # cv2.imread signals an unreadable file by returning None
            if img is None:
                print(f"Error loading image: {image_file}")
                continue

            # Grayscale
            gray_image = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            cv2.imwrite(os.path.join(gray_save_dir, f'gray_image_{i}.jpg'), gray_image)

            # Binary thresholding
            _, bw_image = cv2.threshold(gray_image, 130, 255, cv2.THRESH_BINARY)
            cv2.imwrite(os.path.join(bw_save_dir, f'bw_image_{i}.jpg'), bw_image)

            # Noise removal on the black and white image
            noise_removed = cv2.dilate(bw_image, kernel, iterations=1)
            noise_removed = cv2.erode(noise_removed, kernel, iterations=1)
            noise_removed = cv2.morphologyEx(noise_removed, cv2.MORPH_CLOSE, kernel)
            noise_removed = cv2.medianBlur(noise_removed, 1)
            cv2.imwrite(
                os.path.join(processed_save_dir, f'processed_image_{i}.jpg'),
                noise_removed)

            # OCR, then collapse all whitespace runs (including newlines) to
            # single spaces and cap the text at 1000 characters per row.
            extracted_text = pytesseract.image_to_string(noise_removed)
            extracted_text = ' '.join(extracted_text.split())[:1000]

            csv_writer.writerow([image_file.name, extracted_text])

# Folder that contains the downloaded images to run OCR on
dataset = '../images/sample_test'

# File that receives the extracted text. process_and_save_images writes CSV
# data and the entity-extraction step reads ../final_output/Train_output.csv,
# so the file must carry the .csv name (it was previously written as .xlsx).
output = '../final_output/Train_output.csv'

# open() does not create missing parent folders
os.makedirs(os.path.dirname(output), exist_ok=True)

# Function call
process_and_save_images(dataset, output)

#### Extract entity_value and entity_name

In [None]:
import re
import pandas as pd

# Unit names searched for in the OCR text. Only these full spellings are
# listed; abbreviations such as "cm" or "kg" are not part of the list.
entity_units = ['centimetre', 'foot', 'gram', 'inch', 'kilogram', 'kilovolt', 'kilowatt', 'metre', 
                'microgram', 'millimetre', 'millivolt', 'pound', 'ton', 'volt', 'watt', 'yard']

# Load the OCR results into a DataFrame. The code below reads its
# 'Image Filename' and 'Extracted Text' columns.
df = pd.read_csv("../final_output/Train_output.csv")  # replace with your actual file path

# Function to extract entity_name and value based on matching units
def extract_entities(text, units):
    """Find ``<number><optional space><unit>`` mentions of *units* in *text*.

    Args:
        text: Text to search. Missing values (``None``/NaN) give an empty
            result; other non-string values are converted with ``str()``.
        units: Iterable of unit names, matched case-insensitively and
            literally (e.g. ``'gram'`` matches ``'20 Gram'`` and ``'20gram'``).

    Returns:
        A string of ``"value unit"`` items joined by ``", "``, with the unit
        lower-cased. Items are grouped in the order of *units*, not in the
        order they appear in *text*. Empty string when nothing matches.
    """
    if pd.isna(text):
        return ""

    # read_csv can infer a numeric dtype for the column (e.g. OCR output made
    # only of digits); the regex functions accept str only.
    if not isinstance(text, str):
        text = str(text)

    extracted_values = []
    for unit in units:
        # A number (int or float) followed by the unit, e.g. '50 gram',
        # '100.5 pound', '5KILOGRAM'.
        pattern = re.compile(
            r"(\d+\.?\d*)\s*(" + re.escape(unit) + r")", re.IGNORECASE)
        extracted_values.extend(
            f"{value} {matched_unit.lower()}"
            for value, matched_unit in pattern.findall(text)
        )

    return ", ".join(extracted_values)

# Run extract_entities on every value of the 'Extracted Text' column and store
# the resulting "value unit" strings in a new 'extracted_entities' column.
df['extracted_entities'] = df['Extracted Text'].apply(lambda x: extract_entities(x, entity_units))

# Show the file name, the OCR text and the extracted entities side by side
print(df[['Image Filename', 'Extracted Text', 'extracted_entities']])

# Save the full DataFrame, including the new column, to extracted_entities.csv
# in the current working directory (without the index column).
df.to_csv("extracted_entities.csv", index=False)