In [1]:
import pandas as pd
import pytesseract
from PIL import Image
import os
import cv2
from urllib.request import urlretrieve
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import torchvision.transforms as transforms
import torch
import torchvision
from torchvision import models
import re

# Location of the Tesseract executable.
# Set the TESSERACT_CMD environment variable to point at a different install;
# otherwise the standard Windows install location is tried.
_DEFAULT_TESSERACT_CMD = r"C:\Program Files\Tesseract-OCR\tesseract.exe"  # Windows path
_tesseract_cmd = os.environ.get("TESSERACT_CMD", _DEFAULT_TESSERACT_CMD)

# Only override pytesseract's setting when the executable is really there, so
# machines where Tesseract is installed elsewhere keep whatever pytesseract
# was already configured to use.
if os.path.isfile(_tesseract_cmd):
    pytesseract.pytesseract.tesseract_cmd = _tesseract_cmd

# Ensure temp and image directories exist
os.makedirs("temp_images", exist_ok=True)  # images fetched while building training data
os.makedirs("images", exist_ok=True)  # images fetched while generating predictions


In [2]:
# Unit mapping
# Unit families used by more than one entity are spelled out once; each entity
# receives its own copy so the sets stay independent of one another.
_LENGTH_UNITS = ("centimetre", "foot", "inch", "metre", "millimetre", "yard")
_WEIGHT_UNITS = (
    "gram",
    "kilogram",
    "microgram",
    "milligram",
    "ounce",
    "pound",
    "ton",
)
_VOLUME_UNITS = (
    "centilitre",
    "cubic foot",
    "cubic inch",
    "cup",
    "decilitre",
    "fluid ounce",
    "gallon",
    "imperial gallon",
    "litre",
    "microlitre",
    "millilitre",
    "pint",
    "quart",
)

# entity name -> set of canonical unit names accepted for that entity
entity_unit_map = {
    "width": set(_LENGTH_UNITS),
    "depth": set(_LENGTH_UNITS),
    "height": set(_LENGTH_UNITS),
    "item_weight": set(_WEIGHT_UNITS),
    "maximum_weight_recommendation": set(_WEIGHT_UNITS),
    "voltage": {"kilovolt", "millivolt", "volt"},
    "wattage": {"kilowatt", "watt"},
    "item_volume": set(_VOLUME_UNITS),
}

# Union of every canonical unit name that appears for any entity.
allowed_units = set().union(*entity_unit_map.values())

# Unit normalization map: lower-case abbreviation -> canonical unit name
unit_normalization = dict(
    gm="gram",
    kg="kilogram",
    cm="centimetre",
    mm="millimetre",
    m="metre",
    kv="kilovolt",
    mv="millivolt",
    v="volt",
    w="watt",
    kw="kilowatt",
    oz="ounce",
    lb="pound",
    g="gram",
    l="litre",
    ml="millilitre",
)


In [3]:


def download_image(image_url, filename):
    """Download ``image_url`` to ``filename`` unless a usable copy already exists.

    The data is first written to ``<filename>.part`` and only moved to
    ``filename`` once the transfer has finished, so an interrupted download
    can never be mistaken for a cached image on a later run.

    Args:
        image_url: URL of the image to fetch.
        filename: Destination path of the image on disk.

    Returns:
        None. Failures are reported on stdout and otherwise ignored
        (best-effort download); the destination file is simply absent afterwards.
    """
    # A zero-byte file is the residue of an aborted transfer, not a cached image.
    if os.path.exists(filename) and os.path.getsize(filename) > 0:
        print(f"Using existing image {filename}")
        return

    partial_path = f"{filename}.part"
    try:
        urlretrieve(image_url, partial_path)
        os.replace(partial_path, filename)  # publish only a complete file
        print(f"Image successfully downloaded and saved to {filename}")
    except Exception as e:
        print(f"Failed to download image {image_url}: {e}")
        # Do not leave a truncated temporary file behind.
        try:
            if os.path.exists(partial_path):
                os.remove(partial_path)
        except OSError:
            pass


# Function to preprocess image (Optional: Can be tuned for better OCR accuracy)
def preprocess_image(image_path):
    """Load the image at ``image_path`` and convert it to grayscale.

    Args:
        image_path: Path handed to ``cv2.imread``.

    Returns:
        The grayscale image produced by ``cv2.cvtColor``, or ``None`` when
        ``cv2.imread`` returned ``None`` for the path.
    """
    loaded = cv2.imread(image_path)
    # cv2.imread hands back None instead of raising, so pass that through.
    return None if loaded is None else cv2.cvtColor(loaded, cv2.COLOR_BGR2GRAY)

# Function to extract text from an image using Tesseract
def extract_text_from_image(image_path):
    """Run OCR on the image stored at ``image_path``.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The string returned by ``pytesseract.image_to_string`` for the
        preprocessed image, or ``""`` when ``preprocess_image`` returned ``None``.
    """
    grayscale = preprocess_image(image_path)
    if grayscale is not None:
        return pytesseract.image_to_string(grayscale)
    return ""

# Function to normalize extracted unit to the allowed units
def normalize_unit(text, units=None):
    """Find the first ``<number> <unit>`` pair in ``text`` whose unit is accepted.

    Every number-followed-by-letters pair in the text is examined in order, so
    an unrelated leading pair (for example the ``2 x`` in ``"2 x 500 ml"``)
    does not hide a valid measurement that comes later.

    Args:
        text: Text to search, typically OCR output. Anything that is not a
            ``str`` yields ``""``.
        units: Optional collection of canonical unit names to accept.
            Defaults to the module-level ``allowed_units``.

    Returns:
        ``"<value> <canonical unit>"`` for the first acceptable pair, or ``""``
        when no pair has an accepted unit.
    """
    if not isinstance(text, str):
        return ""

    accepted_units = allowed_units if units is None else units

    # Extract numbers and potential units from the text
    for match in re.finditer(r"(\d+(\.\d+)?)\s*([a-zA-Z]+)", text):
        value = match.group(1)
        unit = match.group(3).lower()
        # Expand an abbreviation ("kg") to its canonical name; other words pass through.
        unit_normalized = unit_normalization.get(unit, unit)
        if unit_normalized in accepted_units:
            return f"{value} {unit_normalized}"
    return ""


# Function to match extracted text with allowed units
def match_units(entity_name, extracted_text):
    """Extract a measurement for ``entity_name`` from ``extracted_text``.

    Only units listed for this entity in ``entity_unit_map`` are accepted, so
    a voltage lookup cannot return a volume or a weight.

    Args:
        entity_name: Key into ``entity_unit_map`` (e.g. ``"voltage"``).
        extracted_text: Text to search, typically OCR output.

    Returns:
        ``"<value> <canonical unit>"``, or ``""`` when the entity is unknown
        or the text holds no measurement in one of the entity's units.
    """
    entity_units = entity_unit_map.get(entity_name)
    if not entity_units:
        return ""
    return normalize_unit(extracted_text, units=entity_units)

In [4]:
def prepare_data(training_file, sample_size=None):
    """Build OCR texts and their labels from a training CSV.

    For every selected row the image behind ``image_link`` is downloaded to
    ``temp_images/<row index>.jpg`` (an existing file is reused) and OCR'd.

    Args:
        training_file: Path to a CSV with ``image_link`` and ``entity_value``
            columns.
        sample_size: Optional number of rows to sample (``random_state=42``).
            A value larger than the file is clamped to the number of rows
            rather than raising. ``None`` or ``0`` uses every row.

    Returns:
        ``(texts, labels)``: two parallel lists holding the OCR text and the
        ``entity_value`` of each processed row.
    """
    df = pd.read_csv(training_file)

    if sample_size:
        # DataFrame.sample raises when asked for more rows than exist.
        df = df.sample(n=min(sample_size, len(df)), random_state=42)

    texts = []
    labels = []

    for idx, row in df.iterrows():
        image_path = f"temp_images/{idx}.jpg"  # Temp storage for the image

        download_image(row["image_link"], image_path)
        texts.append(extract_text_from_image(image_path))
        labels.append(row["entity_value"])

    return texts, labels

In [5]:



# Function to train the model
def train_model(training_file, sample_size=None):
    """Fit a bag-of-words logistic-regression classifier on OCR text.

    The OCR texts and labels come from ``prepare_data``. Labels are integer
    encoded, 20% of the rows are held out (``random_state=42``), and the
    held-out accuracy is printed.

    Args:
        training_file: Path to the training CSV, forwarded to ``prepare_data``.
        sample_size: Optional row-sample size, forwarded to ``prepare_data``.

    Returns:
        ``(model, label_encoder)``: the fitted pipeline and the fitted
        ``LabelEncoder`` needed to turn predictions back into label values.
    """
    ocr_texts, raw_labels = prepare_data(training_file, sample_size)

    # Map every distinct label value to an integer class id.
    encoder = LabelEncoder()
    encoded_targets = encoder.fit_transform(raw_labels)

    # Hold out a fifth of the rows for the accuracy report.
    train_texts, test_texts, train_targets, test_targets = train_test_split(
        ocr_texts, encoded_targets, test_size=0.2, random_state=42
    )

    # Token counts feeding a logistic regression (extra iterations to converge).
    classifier = make_pipeline(CountVectorizer(), LogisticRegression(max_iter=1000))
    classifier.fit(train_texts, train_targets)

    accuracy_pct = classifier.score(test_texts, test_targets) * 100
    print(f"Model accuracy: {accuracy_pct:.2f}%")

    return classifier, encoder



In [9]:

# Function to generate predictions for all rows in the test.csv
def generate_predictions(test_file, output_file, model=None, label_encoder=None):
    """OCR every test image and write one prediction per row to a CSV.

    Each image is downloaded to ``images/<row position>.jpg`` (an existing
    file is reused), OCR'd, and the text is passed to ``match_units`` together
    with the row's ``entity_name``. Rows without a usable measurement get an
    empty prediction.

    Args:
        test_file: Path to a CSV with ``index``, ``image_link`` and
            ``entity_name`` columns. No ``entity_value`` column is needed.
        output_file: Path of the CSV to write, with columns ``index`` and
            ``prediction``.
        model: Accepted for backward compatibility and not consulted; the
            written prediction comes only from ``match_units``.
        label_encoder: Accepted for backward compatibility and not consulted.

    Returns:
        None. The predictions are written to ``output_file`` and echoed to stdout.
    """
    test_data = pd.read_csv(test_file)
    predictions = []

    for idx, row in test_data.iterrows():
        index = row["index"]

        image_path = f"images/{idx}.jpg"
        download_image(row["image_link"], image_path)

        extracted_text = extract_text_from_image(image_path)

        # Predict based on units and entity name ("" when nothing matches)
        prediction = match_units(row["entity_name"], extracted_text)

        predictions.append({"index": index, "prediction": prediction})

        print(f"Index: {index}, Prediction: {prediction}")

    # Save predictions to CSV; naming the columns keeps the header even when
    # the test file has no rows.
    output_df = pd.DataFrame(predictions, columns=["index", "prediction"])
    output_df.to_csv(output_file, index=False)
    print(f"Predictions saved to {output_file}")

In [10]:


# Folder holding train.csv / test.csv. Set the DATASET_DIR environment variable
# to run on another machine; the default is the folder this notebook was
# originally run from.
DATASET_DIR = os.environ.get(
    "DATASET_DIR",
    "C:\\Users\\DELL\\Harshini\\unstop-hackathon\\student_resource 3\\dataset",
)

# Number of training rows sampled for OCR and model fitting.
TRAIN_SAMPLE_SIZE = 1000

training_file = os.path.abspath(os.path.join(DATASET_DIR, "train.csv"))
test_file = os.path.abspath(os.path.join(DATASET_DIR, "test.csv"))
output_file = os.path.abspath(os.path.join(DATASET_DIR, "test_out5.csv"))

# Train the model with a subset of the data (Optional)
model, label_encoder = train_model(training_file, sample_size=TRAIN_SAMPLE_SIZE)


Using existing image temp_images/101138.jpg
Using existing image temp_images/165563.jpg
Using existing image temp_images/228235.jpg
Using existing image temp_images/197996.jpg
Using existing image temp_images/107574.jpg
Using existing image temp_images/243052.jpg
Using existing image temp_images/202803.jpg
Using existing image temp_images/59020.jpg
Using existing image temp_images/215157.jpg
Using existing image temp_images/225702.jpg
Using existing image temp_images/88217.jpg
Using existing image temp_images/260325.jpg
Using existing image temp_images/54742.jpg
Using existing image temp_images/227945.jpg
Using existing image temp_images/194615.jpg
Using existing image temp_images/24072.jpg
Using existing image temp_images/43569.jpg
Using existing image temp_images/256113.jpg
Using existing image temp_images/182355.jpg
Using existing image temp_images/118253.jpg
Using existing image temp_images/53375.jpg
Using existing image temp_images/42404.jpg
Using existing image temp_images/153129

In [None]:
    # Generate predictions
generate_predictions(
    test_file=test_file,
    output_file=output_file,
    model=model,
    label_encoder=label_encoder,
)

Image successfully downloaded and saved to images/125000.jpg
Index: 125096, Prediction: 
Image successfully downloaded and saved to images/125001.jpg
Index: 125097, Prediction: 
Image successfully downloaded and saved to images/125002.jpg
Index: 125098, Prediction: 
Image successfully downloaded and saved to images/125003.jpg
Index: 125099, Prediction: 
Image successfully downloaded and saved to images/125004.jpg
Index: 125100, Prediction: 
Image successfully downloaded and saved to images/125005.jpg
Index: 125101, Prediction: 
Image successfully downloaded and saved to images/125006.jpg
Index: 125102, Prediction: 5 volt
Image successfully downloaded and saved to images/125007.jpg
Index: 125103, Prediction: 5 volt
Image successfully downloaded and saved to images/125008.jpg
Index: 125104, Prediction: 
Image successfully downloaded and saved to images/125009.jpg
Index: 125105, Prediction: 60 watt
Image successfully downloaded and saved to images/125010.jpg
Index: 125106, Prediction: 60 