# In [26]:
import pandas as pd
import requests
from PIL import Image
import pytesseract
from io import BytesIO
import re

# Read the training and test CSV files into DataFrames
train_data = pd.read_csv('/home/rguktrkvalley/Desktop/train1.csv')
test_data = pd.read_csv('/home/rguktrkvalley/Desktop/sample_test.csv')

# Unit names a prediction may carry; extracted units are checked against
# this set before they are emitted
ALLOWED_UNITS = {
    'centimetre',
    'gram',
    'kilogram',
    'litre',
    'millilitre',
    'millimetre',
    'ounce',
}
# Function to download images
def download_image(url, timeout=10):
    """Download the image at ``url`` and open it with PIL.

    Args:
        url: Address of the image to fetch. Anything that is not a
            non-empty string is rejected without touching the network.
        timeout: Seconds passed to ``requests.get`` as its timeout. Without
            one, a single unresponsive server would stall the whole run.

    Returns:
        The object returned by ``Image.open`` on success, or ``None`` when
        the URL is unusable, the request fails, the server answers with an
        HTTP error status, or the payload cannot be opened as an image.
    """
    if not isinstance(url, str) or not url.strip():
        print(f"Error downloading image: invalid URL {url!r}")
        return None

    try:
        response = requests.get(url, timeout=timeout)
        # Surface 4xx/5xx answers as errors instead of handing an error
        # page to the image decoder.
        response.raise_for_status()
        return Image.open(BytesIO(response.content))
    except Exception as e:
        # Deliberately broad: this is a best-effort step and the caller
        # treats None as "no image", so one bad link must not end the run.
        print(f"Error downloading image: {e}")
        return None

# Run OCR on an image and hand back the recognised text
def extract_text_from_image(image):
    """Return the text that ``pytesseract.image_to_string`` reads from ``image``.

    Any exception raised while running OCR is reported on stdout and an
    empty string is returned instead, so the caller always gets a string.
    """
    try:
        return pytesseract.image_to_string(image)
    except Exception as err:
        print(f"Error extracting text: {err}")
    return ""

# Lookup tables and function to clean and process extracted text

# Every spelling recognised in OCR text, mapped to the canonical unit name.
# Both British and American spellings are listed so that e.g. "centimeter"
# is normalised instead of being matched and then discarded.
_UNIT_ALIASES = {
    'g': 'gram',
    'gm': 'gram',
    'gram': 'gram',
    'kg': 'kilogram',
    'kilogram': 'kilogram',
    'cm': 'centimetre',
    'centimetre': 'centimetre',
    'centimeter': 'centimetre',
    'mm': 'millimetre',
    'millimetre': 'millimetre',
    'millimeter': 'millimetre',
    'ml': 'millilitre',
    'millilitre': 'millilitre',
    'milliliter': 'millilitre',
    'litre': 'litre',
    'liter': 'litre',
    'ounce': 'ounce',
}

# A number, optional whitespace, a known unit spelling, an optional plural
# "s", then a word boundary. The boundary stops ordinary words that merely
# start with a unit letter ("2 games") from being read as quantities.
# Longer spellings come first so the alternation prefers them.
_QUANTITY_PATTERN = re.compile(
    r'(\d+\.?\d*)\s*('
    + '|'.join(sorted(_UNIT_ALIASES, key=len, reverse=True))
    + r')s?\b'
)

# Units that fit a given entity name. Entity names that are not listed
# here get no preference; extend this table to cover more of them.
_ENTITY_UNITS = {
    'item_weight': frozenset({'gram', 'kilogram', 'ounce'}),
    'item_volume': frozenset({'millilitre', 'litre'}),
}


def clean_extracted_text(text, entity_name):
    """Pick one "<number> <unit>" quantity out of OCR text.

    The text is lower-cased and scanned for numbers followed by a known
    unit spelling. Each unit is normalised to its canonical name and kept
    only if that name is in ``ALLOWED_UNITS``.

    Args:
        text: Raw text extracted from an image. Empty or ``None`` yields "".
        entity_name: Name of the attribute being predicted. For
            ``'item_weight'`` and ``'item_volume'`` the first quantity whose
            unit fits that attribute is preferred.

    Returns:
        A string such as ``"500 gram"``. The preferred match is returned
        when one exists; otherwise the first valid quantity found in the
        text; otherwise the empty string.
    """
    if not text:
        return ""

    candidates = []
    for number, spelling in _QUANTITY_PATTERN.findall(text.lower()):
        unit = _UNIT_ALIASES[spelling]
        if unit in ALLOWED_UNITS:
            candidates.append((number, unit))

    if not candidates:
        return ""

    # Prefer a quantity whose unit matches the requested entity.
    preferred_units = _ENTITY_UNITS.get(entity_name)
    if preferred_units:
        for number, unit in candidates:
            if unit in preferred_units:
                return f"{number} {unit}"

    # No entity-specific match: fall back to the first quantity in the text.
    number, unit = candidates[0]
    return f"{number} {unit}"

# Build one prediction per test row: download the image, OCR it, and keep
# the quantity picked out of the recognised text
predictions = []
for idx, row in test_data.iterrows():
    img = download_image(row['image_link'])

    # Placeholder logic: the cleaned OCR text is used directly as the
    # prediction; rows whose image could not be fetched get an empty one
    prediction = (
        clean_extracted_text(extract_text_from_image(img), row['entity_name'])
        if img
        else ""
    )
    predictions.append({"index": row['index'], "prediction": prediction})

# Collect the results in a DataFrame and write them out as CSV without the
# DataFrame's own index column
predictions_df = pd.DataFrame(predictions)
output_file = '/home/rguktrkvalley/Desktop/test_out1.csv'
predictions_df.to_csv(output_file, index=False)

print(f"Predictions saved to {output_file}")


# Output: Predictions saved to /home/rguktrkvalley/Desktop/test_out1.csv
