In [None]:
import os
import pandas as pd
from PIL import Image
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import img_to_array
from utils import download_images
from pathlib import Path
import pytesseract
import re
import logging

# Send log records of level INFO and above to a file in the working directory
logging.basicConfig(
    level=logging.INFO,
    filename='image_processing.log',
)

# OCR-based text extraction using pytesseract
def extract_text_from_image(image_path):
    """Run pytesseract OCR on an image file and return the recognised text.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The string returned by ``pytesseract.image_to_string``, or ``None``
        when opening the image or running OCR raised an exception (the error
        is written to the log).
    """
    try:
        # The context manager releases the image's file handle as soon as OCR
        # is done instead of leaving it to the garbage collector; this
        # function is called once per training image.
        with Image.open(image_path) as image:
            return pytesseract.image_to_string(image)
    except Exception as e:
        # Deliberately best-effort: one unreadable image must not abort the
        # whole run, so the failure is logged and signalled with None.
        logging.error(f"Error extracting text from image {image_path}: {e}")
        return None

# Number (optionally with a decimal part), an optional single whitespace
# character, then a weight unit. Compiled once instead of on every call.
# NOTE(review): there is no trailing word boundary, so "5 gallons" yields
# "5 g". Tighten this only after confirming which OCR spellings (e.g. "gm")
# still have to match.
_VALUE_UNIT_PATTERN = re.compile(
    r'(\d+(\.\d+)?\s?(gram|kilogram|kg|g|pound|lb|ounce|oz))',
    re.IGNORECASE,
)


# Function to extract unit and value from text
def extract_value_and_unit(text):
    """Return the first "<number><unit>" occurrence found in ``text``.

    Args:
        text: OCR output to search. ``None`` or an empty string is treated
            as "nothing found" rather than raising.

    Returns:
        The matched substring exactly as it appears in ``text`` (for example
        ``"500 g"`` or ``"2.5kg"``), or ``None`` when nothing matches; a
        warning is logged in that case.
    """
    # search() stops at the first hit; group(1) spans the whole match, which
    # is what the previous findall()[0][0] returned.
    match = _VALUE_UNIT_PATTERN.search(text) if text else None
    if match:
        return match.group(1)

    logging.warning(f"No value and unit found in text: {text}")
    return None

# Preprocess images
def preprocess_image(image_path):
    """Load an image file and convert it into a ResNet50 input batch of one.

    The image is converted to RGB, resized to 224x224, turned into an array,
    given a leading batch axis and passed through ResNet50's
    ``preprocess_input``.

    Args:
        image_path: Path of the image file to open.

    Returns:
        The preprocessed array with a leading batch axis of size 1, or
        ``None`` when any step raised an exception (the error is logged).
    """
    try:
        # The context manager closes the source file once the pixel data has
        # been copied into the array.
        with Image.open(image_path) as image:
            # Force three channels: grayscale, palette and RGBA files would
            # otherwise produce 1- or 4-channel arrays that do not fit the
            # model's (224, 224, 3) input and cannot be concatenated with
            # arrays from RGB images.
            rgb_image = image.convert('RGB').resize((224, 224))
            image_array = img_to_array(rgb_image)
        image_array = np.expand_dims(image_array, axis=0)
        image_array = tf.keras.applications.resnet50.preprocess_input(image_array)
        return image_array
    except Exception as e:
        # Best-effort: callers skip images for which None is returned.
        logging.error(f"Error processing image {image_path}: {e}")
        return None

# Frozen ResNet50 backbone with ImageNet weights and no classification head
base_model = ResNet50(
    include_top=False,
    weights='imagenet',
    input_shape=(224, 224, 3),
)
base_model.trainable = False

# Head: backbone features -> global average pooling -> softmax layer
inputs = keras.layers.Input(shape=(224, 224, 3))
backbone_features = base_model(inputs, training=False)
pooled_features = keras.layers.GlobalAveragePooling2D()(backbone_features)
# 1000 output units is a placeholder; set it to the real number of classes
outputs = keras.layers.Dense(1000, activation='softmax')(pooled_features)

model = keras.Model(inputs, outputs)
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Read the CSV files that list the image links
train_df = pd.read_csv('/home/rguktrkvalley/Desktop/train1.csv')
test_df = pd.read_csv('/home/rguktrkvalley/Desktop/sample_test.csv')

# Download every train and test image into a single folder
download_folder = '/home/rguktrkvalley/Desktop/images'
image_links = train_df['image_link'].tolist()
image_links.extend(test_df['image_link'].tolist())
download_images(image_links, download_folder)

# Local path of each training image: download folder + last component of the link
train_images = []
for link in train_df['image_link']:
    file_name = Path(link).name
    train_images.append(os.path.join(download_folder, file_name))
train_labels = train_df['entity_value'].tolist()

# Keep only the training images whose OCR text contains a value + unit
valid_labels = []
filtered_images = []

for image_file in train_images:
    # Step 1: OCR the image; skip it when no text comes back
    ocr_text = extract_text_from_image(image_file)
    if not ocr_text:
        logging.warning(f"No text extracted from {image_file}")
        continue

    # Step 2: look for a value + unit in the OCR text; skip when absent
    measurement = extract_value_and_unit(ocr_text)
    if not measurement:
        logging.warning(f"No valid value and unit found in {image_file}")
        continue

    # Record the image together with the measurement read from it
    logging.info(f"Extracted from {image_file}: {measurement}")
    valid_labels.append(measurement)
    filtered_images.append(image_file)

# Turn the retained images into model inputs, dropping any that fail to load
processed_images = []
valid_labels_filtered = []

for image_file, label in zip(filtered_images, valid_labels):
    image_batch = preprocess_image(image_file)
    if image_batch is None:
        continue
    # Keep image and label lists aligned position by position
    processed_images.append(image_batch)
    valid_labels_filtered.append(label)

# Fit the model, but only when at least one image/label pair survived filtering
if not (valid_labels_filtered and processed_images):
    logging.error("No valid labels or images for training.")
else:
    print(f"Training with {len(processed_images)} images and labels.")
    # Dummy targets: every sample is assigned class 0 until real labels are wired in
    dummy_class_ids = [0] * len(valid_labels_filtered)
    train_labels_categorical = tf.keras.utils.to_categorical(
        dummy_class_ids,
        num_classes=1000,  # keep in sync with the size of the model's output layer
    )
    model.fit(
        np.concatenate(processed_images),
        train_labels_categorical,
        epochs=10,
        batch_size=32,
        validation_split=0.2,
    )

# Generate predictions for test images
test_images = [os.path.join(download_folder, Path(link).name) for link in test_df['image_link']]
test_preds = []  # preprocessed input batches (name kept for compatibility)
processed_positions = []  # row position in test_df of each entry in test_preds

for position, path in enumerate(test_images):
    processed_image = preprocess_image(path)
    if processed_image is not None:
        test_preds.append(processed_image)
        processed_positions.append(position)

# Save predictions
if test_preds:
    test_preds_array = model.predict(np.concatenate(test_preds))

    # One output row per test row. Images that failed preprocessing keep an
    # empty prediction, so the column length always matches test_df; assigning
    # only the successful predictions raised a length-mismatch ValueError
    # whenever a test image could not be processed.
    # NOTE(review): empty string is assumed to be an acceptable "no
    # prediction" marker for the consumer of this CSV; confirm.
    predictions = [''] * len(test_images)
    for position, _ in zip(processed_positions, test_preds_array):
        predictions[position] = 'Predicted_Class'  # Placeholder

    # Format and save the prediction output
    output_df = pd.DataFrame({'index': test_df['index']})
    output_df['prediction'] = predictions
    output_df.to_csv('/home/rguktrkvalley/Desktop/pertest_out.csv', index=False)
else:
    logging.error("No valid test images processed. Predictions cannot be generated.")
