In [None]:
import numpy as np
import pandas as pd
import os

#utils

In [None]:
import re
import constants
import os
import requests
import pandas as pd
import multiprocessing
import time
from time import time as timer
from tqdm import tqdm
import numpy as np
from pathlib import Path
from functools import partial
import requests
import urllib
from PIL import Image

def common_mistake(unit):
    """Return `unit` with a frequent misspelling corrected, when that helps.

    A unit that is already allowed is returned untouched. Otherwise two
    substitutions are tried in order -- 'ter' -> 'tre' and then
    'feet' -> 'foot' -- and the first rewritten spelling found in
    `constants.allowed_units` is returned. If neither rewrite yields an
    allowed unit, the input is returned unchanged.
    """
    allowed = constants.allowed_units
    if unit in allowed:
        return unit
    for wrong, right in (('ter', 'tre'), ('feet', 'foot')):
        candidate = unit.replace(wrong, right)
        if candidate in allowed:
            return candidate
    return unit

def parse_string(s):
    """Parse a prediction of the form "<number> <unit>".

    Args:
        s: The raw prediction cell. `None`, NaN (anything whose `str()` is
            'nan') and blank strings all mean "no prediction".

    Returns:
        `(number, unit)` with `number` as a float and `unit` as an allowed
        unit name, or `(None, None)` when there is no prediction.

    Raises:
        ValueError: If the text is not "<number> <unit>" or the unit is not
            in `constants.allowed_units`.
    """
    # Identity check: `== None` invokes arbitrary __eq__ implementations
    # (e.g. pandas.NA), which can raise or return a non-boolean.
    if s is None or str(s) == 'nan':
        return None, None
    # A prediction column holding only numbers is read by pandas as floats,
    # which have no .strip(). Coerce to text so such cells fail with the
    # "Invalid format" ValueError below instead of an AttributeError.
    s_stripped = str(s).strip()
    if s_stripped == "":
        return None, None
    pattern = re.compile(r'^-?\d+(\.\d+)?\s+[a-zA-Z\s]+$')
    if not pattern.match(s_stripped):
        raise ValueError("Invalid format in {}".format(s))
    # Split once: everything after the number is the (possibly multi-word) unit.
    parts = s_stripped.split(maxsplit=1)
    number = float(parts[0])
    unit = common_mistake(parts[1])
    if unit not in constants.allowed_units:
        raise ValueError("Invalid unit [{}] found in {}. Allowed units: {}".format(
            unit, s, constants.allowed_units))
    return number, unit


def create_placeholder_image(image_save_path):
    """Write a 100x100 black RGB image to `image_save_path`.

    This is best-effort: any exception raised while creating or saving the
    image is swallowed and the function simply returns None.
    """
    try:
        Image.new('RGB', (100, 100), color='black').save(image_save_path)
    except Exception:
        return None
    return None

def download_image(image_link, save_folder, retries=3, delay=3):
    """Download one image into `save_folder`, retrying on failure.

    Args:
        image_link: URL of the image. Non-string values are ignored.
        save_folder: Directory the file is written to. The file name is the
            last path component of `image_link`.
        retries: Maximum number of download attempts.
        delay: Seconds to wait between two attempts.

    Returns:
        None. If the target file already exists nothing is downloaded. If
        every attempt fails, a placeholder image is written instead via
        `create_placeholder_image`.
    """
    # Import the submodule explicitly: a bare `import urllib` does not
    # guarantee that `urllib.request` has been loaded.
    import urllib.request

    if not isinstance(image_link, str):
        return

    filename = Path(image_link).name
    image_save_path = os.path.join(save_folder, filename)

    # Already present (from an earlier run or a duplicate link): skip.
    if os.path.exists(image_save_path):
        return

    for attempt in range(retries):
        try:
            urllib.request.urlretrieve(image_link, image_save_path)
            return
        except Exception:
            # `Exception` rather than a bare `except:` so that Ctrl-C and
            # SystemExit still stop the run instead of being retried.
            if attempt < retries - 1:
                # No point sleeping after the final failed attempt.
                time.sleep(delay)

    create_placeholder_image(image_save_path) #Create a black placeholder image for invalid links/images

def download_images(image_links, download_folder, allow_multiprocessing=True):
    """Download every link in `image_links` into `download_folder`.

    The folder is created when it does not exist. With
    `allow_multiprocessing` the work is spread over a pool of 64 worker
    processes; otherwise the links are fetched one after another. Both paths
    call `download_image` with 3 retries and a 3 second delay, and show a
    tqdm progress bar sized by `len(image_links)`.
    """
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)

    if not allow_multiprocessing:
        # Sequential path.
        for link in tqdm(image_links, total=len(image_links)):
            download_image(link, save_folder=download_folder, retries=3, delay=3)
        return

    # Parallel path: bind the fixed arguments so the pool only maps over links.
    worker = partial(download_image, save_folder=download_folder, retries=3, delay=3)

    with multiprocessing.Pool(64) as pool:
        pending = pool.imap(worker, image_links)
        # Draining the iterator drives both the downloads and the progress bar.
        list(tqdm(pending, total=len(image_links)))
        pool.close()
        pool.join()


#sanity

In [None]:
import pandas as pd
import argparse
import re
import os
import constants
from utils import parse_string

def check_file(filename):
    """Validate that `filename` names an existing CSV file.

    Raises:
        ValueError: If the name does not end in '.csv' (case-insensitive).
        FileNotFoundError: If no file exists at that path.
    """
    is_csv = filename.lower().endswith('.csv')
    if not is_csv:
        raise ValueError("Only CSV files are allowed.")
    if os.path.exists(filename):
        return
    raise FileNotFoundError("Filepath: {} invalid or not found.".format(filename))

def sanity_check(test_filename, output_filename):
    """Check that an output CSV is well-formed with respect to a test CSV.

    Both paths must pass `check_file`. The test file needs an 'index'
    column; the output file needs 'index' and 'prediction' columns. Index
    values present in only one of the two files are printed (not raised).
    Every prediction is then run through `parse_string`, which raises on the
    first malformed value; on success a confirmation line is printed.

    Raises:
        ValueError: For unreadable CSVs, missing columns, or (via
            `parse_string`) an invalid prediction.
        FileNotFoundError: Via `check_file`, when a path does not exist.
    """
    for path in (test_filename, output_filename):
        check_file(path)

    try:
        test_df = pd.read_csv(test_filename)
        output_df = pd.read_csv(output_filename)
    except Exception as e:
        raise ValueError(f"Error reading the CSV files: {e}")

    if 'index' not in test_df.columns:
        raise ValueError("Test CSV file must contain the 'index' column.")

    has_required_columns = 'index' in output_df.columns and 'prediction' in output_df.columns
    if not has_required_columns:
        raise ValueError("Output CSV file must contain 'index' and 'prediction' columns.")

    test_ids = set(test_df['index'])
    output_ids = set(output_df['index'])

    # Index values in the test file that the output file lacks.
    missing_index = test_ids - output_ids
    if missing_index:
        print("Missing index in test file: {}".format(missing_index))

    # Index values in the output file that the test file does not contain.
    extra_index = output_ids - test_ids
    if extra_index:
        print("Extra index in test file: {}".format(extra_index))

    # Only the raising side effect of parse_string matters; results are discarded.
    output_df.apply(lambda row: parse_string(row['prediction']), axis=1)
    print("Parsing successfull for file: {}".format(output_filename))

# Command-line entry point: read the two CSV paths from the arguments and run
# sanity_check on them.
if __name__ == "__main__":
    #Usage example: python sanity.py --test_filename sample_test.csv --output_filename sample_test_out.csv

    parser = argparse.ArgumentParser(description="Run sanity check on a CSV file.")
    parser.add_argument("--test_filename", type=str, required=True, help="The test CSV file name.")
    parser.add_argument("--output_filename", type=str, required=True, help="The output CSV file name to check.")
    args = parser.parse_args()
    # Any failure is reported as a single "Error: ..." line instead of a traceback.
    try:
        sanity_check(args.test_filename, args.output_filename)
    except Exception as e:
        print('Error:', e)

#README.md

# ML Challenge Problem Statement

## Feature Extraction from Images

In this hackathon, the goal is to create a machine learning model that extracts entity values from images. This capability is crucial in fields like healthcare, e-commerce, and content moderation, where precise product information is vital. As digital marketplaces expand, many products lack detailed textual descriptions, making it essential to obtain key details directly from images. These images provide important information such as weight, volume, voltage, wattage, dimensions, and many more, which are critical for digital stores.

### Data Description:

The dataset consists of the following columns:

1. **index:** A unique identifier (ID) for the data sample
2. **image_link**: Public URL where the product image is available for download. Example link - https://m.media-amazon.com/images/I/71XfHPR36-L.jpg
    To download images use `download_images` function from `src/utils.py`. See sample code in `src/test.ipynb`.
3. **group_id**: Category code of the product
4. **entity_name:** Product entity name. For eg: “item_weight”
5. **entity_value:** Product entity value. For eg: “34 gram”
    Note: For test.csv, you will not see the column `entity_value` as it is the target variable.

### Output Format:

The output file should be a csv with 2 columns:

1. **index:** The unique identifier (ID) of the data sample. Note the index should match the test record index.
2. **prediction:** A string which should have the following format: “x unit” where x is a float number in standard formatting and unit is one of the allowed units (allowed units are mentioned in the Appendix). The two values should be concatenated and have a space between them. For eg: “2 gram”, “12.5 centimetre”, “2.56 ounce” are valid. Few invalid cases: “2 gms”, “60 ounce/1.7 kilogram”, “2.2e2 kilogram” etc.
    Note: Make sure to output a prediction for all indices. If no value is found in the image for a test sample, return an empty string, i.e., `“”`. If your output file has fewer or more samples than test.csv, your output won’t be evaluated.

### File Descriptions:

*source files*

1. **src/sanity.py**: Sanity checker to ensure that the final output file passes all formatting checks. Note: the script will not check if less/more number of predictions are present compared to the test file. See sample code in `src/test.ipynb`
2. **src/utils.py**: Contains helper functions for downloading images from the image_link.
3. **src/constants.py:** Contains the allowed units for each entity type.
4. **sample_code.py:** We also provided a sample dummy code that can generate an output file in the given format. Usage of this file is optional.

*Dataset files*

1. **dataset/train.csv**: Training file with labels (`entity_value`).
2. **dataset/test.csv**: Test file without output labels (`entity_value`). Generate predictions using your model/solution on this file's data and format the output file to match sample_test_out.csv (Refer the above section "Output Format")
3. **dataset/sample_test.csv**: Sample test input file.
4. **dataset/sample_test_out.csv**: Sample outputs for sample_test.csv. The output for test.csv must be formatted in the exact same way. Note: The predictions in the file might not be correct

### Constraints

1. You will be provided with a sample output file and a sanity checker file. Format your output to match the sample output file exactly and pass it through the sanity checker to ensure its validity. Note: If the file does not pass through the sanity checker, it will not be evaluated. You should receive a message like `Parsing successfull for file: ...csv` if the output file is correctly formatted.

2. You are given the list of allowed units in constants.py and also in Appendix. Your outputs must be in these units. Predictions using any other units will be considered invalid during validation.

### Evaluation Criteria

Submissions will be evaluated based on the F1 score, which is a standard measure of prediction accuracy for classification and extraction problems.

Let GT = Ground truth value for a sample and OUT be output prediction from the model for a sample. Then we classify the predictions into one of the 4 classes with the following logic:

1. *True Positives* - If OUT != `""` and GT != `""` and OUT == GT
2. *False Positives* - If OUT != `""` and GT != `""` and OUT != GT
3. *False Positives* - If OUT != `""` and GT == `""`
4. *False Negatives* - If OUT == `""` and GT != `""`
5. *True Negatives* - If OUT == `""` and GT == `""`

Then, F1 score = 2*Precision*Recall/(Precision + Recall) where:

1. Precision = True Positives/(True Positives + False Positives)
2. Recall = True Positives/(True Positives + False Negatives)

### Submission File

Upload a test_out.csv file in the Portal with the exact same formatting as sample_test_out.csv

### Appendix

```
entity_unit_map = {
  "width": {
    "centimetre",
    "foot",
    "millimetre",
    "metre",
    "inch",
    "yard"
  },
  "depth": {
    "centimetre",
    "foot",
    "millimetre",
    "metre",
    "inch",
    "yard"
  },
  "height": {
    "centimetre",
    "foot",
    "millimetre",
    "metre",
    "inch",
    "yard"
  },
  "item_weight": {
    "milligram",
    "kilogram",
    "microgram",
    "gram",
    "ounce",
    "ton",
    "pound"
  },
  "maximum_weight_recommendation": {
    "milligram",
    "kilogram",
    "microgram",
    "gram",
    "ounce",
    "ton",
    "pound"
  },
  "voltage": {
    "millivolt",
    "kilovolt",
    "volt"
  },
  "wattage": {
    "kilowatt",
    "watt"
  },
  "item_volume": {
    "cubic foot",
    "microlitre",
    "cup",
    "fluid ounce",
    "centilitre",
    "imperial gallon",
    "pint",
    "decilitre",
    "litre",
    "millilitre",
    "quart",
    "cubic inch",
    "gallon"
  }
}
```


#sample code

In [None]:
import os
import random
import pandas as pd

def predictor(image_link, category_id, entity_name):
    """Placeholder predictor -- replace the body with a call to your model.

    The three arguments are currently ignored. Roughly half of the calls
    return the empty string (no prediction) and the rest return the fixed
    dummy value "10 inch".
    """
    #TODO
    if random.random() > 0.5:
        return ""
    return "10 inch"

# Script entry point: read test.csv, attach one prediction per row and write
# the submission file next to it.
if __name__ == "__main__":
    # Dataset directory, given relative to the current working directory.
    DATASET_FOLDER = '../dataset/'

    test = pd.read_csv(os.path.join(DATASET_FOLDER, 'test.csv'))

    # Call predictor once per row with that row's image_link, group_id and entity_name.
    test['prediction'] = test.apply(
        lambda row: predictor(row['image_link'], row['group_id'], row['entity_name']), axis=1)

    # Write only the 'index' and 'prediction' columns; index=False omits the
    # DataFrame's own row labels from the CSV.
    output_filename = os.path.join(DATASET_FOLDER, 'test_out.csv')
    test[['index', 'prediction']].to_csv(output_filename, index=False)

#constants

In [None]:
# Unit families shared by several entity types. Each entity gets its own
# fresh set built from these tuples, so the sets are never aliased.
_LENGTH_UNITS = ('centimetre', 'foot', 'inch', 'metre', 'millimetre', 'yard')
_WEIGHT_UNITS = ('gram', 'kilogram', 'microgram', 'milligram', 'ounce', 'pound', 'ton')
_VOLTAGE_UNITS = ('kilovolt', 'millivolt', 'volt')
_WATTAGE_UNITS = ('kilowatt', 'watt')
_VOLUME_UNITS = (
    'centilitre', 'cubic foot', 'cubic inch', 'cup', 'decilitre',
    'fluid ounce', 'gallon', 'imperial gallon', 'litre', 'microlitre',
    'millilitre', 'pint', 'quart',
)

# Entity name -> set of unit names accepted for that entity.
entity_unit_map = {
    'width': set(_LENGTH_UNITS),
    'depth': set(_LENGTH_UNITS),
    'height': set(_LENGTH_UNITS),
    'item_weight': set(_WEIGHT_UNITS),
    'maximum_weight_recommendation': set(_WEIGHT_UNITS),
    'voltage': set(_VOLTAGE_UNITS),
    'wattage': set(_WATTAGE_UNITS),
    'item_volume': set(_VOLUME_UNITS),
}

# Union of every unit accepted for any entity.
allowed_units = set().union(*entity_unit_map.values())

In [None]:
# Absolute path of the labelled training CSV (presumably a Colab Google Drive
# mount -- adjust when running elsewhere).
path_train = "/content/drive/MyDrive/Amazon ML Challenge/train.csv"


# Load the training data into a DataFrame.
train = pd.read_csv(path_train)


In [None]:
# Absolute path of the unlabelled test CSV (presumably a Colab Google Drive
# mount -- adjust when running elsewhere).
path_test = "/content/drive/MyDrive/Amazon ML Challenge/test.csv"


# Load the test data into a DataFrame.
test = pd.read_csv(path_test)


In [None]:
# Alias, not a copy: test_set and test refer to the same DataFrame object.
test_set = test

In [None]:
# path_sample_test = "/content/drive/MyDrive/Amazon ML Challenge/sample_test.csv"
# paht_sample_test_out = "/content/drive/MyDrive/Amazon ML Challenge/sample_test_out.csv"
# path_sample_test_out_fail = "/content/drive/MyDrive/Amazon ML Challenge/sample_test_out_fail.csv"

# sample_test = pd.read_csv(path_sample_test)
# sample_test_out = pd.read_csv(paht_sample_test_out)
# sample_test_out_fail = pd.read_csv(path_sample_test_out_fail)

In [None]:
# Alias, not a copy: train_set and train refer to the same DataFrame object.
train_set = train

In [None]:
# train_set.to_csv('train_set.csv', index=False)

In [None]:


import pandas as pd
import re
import os

# Entity name -> set of unit names accepted for that entity. Each entity gets
# its own fresh set built from the shared tuples below, so no sets are aliased.
_length_units = ("centimetre", "foot", "millimetre", "metre", "inch", "yard")
_weight_units = ("milligram", "kilogram", "microgram", "gram", "ounce", "ton", "pound")
_volume_units = (
    "cubic foot", "microlitre", "cup", "fluid ounce", "centilitre",
    "imperial gallon", "pint", "decilitre", "litre", "millilitre",
    "quart", "cubic inch", "gallon",
)

entity_unit_map = {
    "width": set(_length_units),
    "depth": set(_length_units),
    "height": set(_length_units),
    "item_weight": set(_weight_units),
    "maximum_weight_recommendation": set(_weight_units),
    "voltage": {"millivolt", "kilovolt", "volt"},
    "wattage": {"kilowatt", "watt"},
    "item_volume": set(_volume_units),
}

# Define a mapping for unit normalization
unit_normalization_map = {
    "lbs": "pound",  # Convert 'lbs' to 'pound'
    "cm": "centimetre",  # Add other abbreviations if needed
    "mm": "millimetre",
    "in": "inch",
    # Add other abbreviations as required
}

# Function to normalize units in the extracted text
def normalize_units(entity_value):
    """Expand unit abbreviations in `entity_value` to their full unit names.

    An abbreviation is only replaced where it stands alone, i.e. is not
    directly preceded or followed by another ASCII letter. A plain substring
    replace would rewrite the "in" inside "inch" or "pint" and turn already
    valid values such as "10 inch" into "10 inchch". Digits may still touch
    the abbreviation, so "30cm" becomes "30centimetre".

    Args:
        entity_value: Text such as "5 in" or "2 lbs".

    Returns:
        The text with every stand-alone abbreviation from
        `unit_normalization_map` replaced by its full unit name.
    """
    for abbrev, full_unit in unit_normalization_map.items():
        standalone = r'(?<![A-Za-z]){}(?![A-Za-z])'.format(re.escape(abbrev))
        # A callable replacement inserts the unit verbatim (no backslash
        # processing of the replacement text).
        entity_value = re.sub(standalone, lambda _match, unit=full_unit: unit, entity_value)
    return entity_value

# Update the is_valid_entity_value function to include unit normalization
def is_valid_entity_value(entity_value, entity_name):
    """Tell whether `entity_value` is "<number><optional space><unit>" for `entity_name`.

    Args:
        entity_value: Candidate label text, e.g. "500.0 gram". Non-string
            values (None, NaN, numbers) are never valid.
        entity_name: Key into `entity_unit_map` selecting the accepted units.

    Returns:
        True when, after `normalize_units`, the whole value is an unsigned
        number followed by one of the units allowed for `entity_name`;
        False otherwise, including when `entity_name` is unknown.
    """
    # Missing labels arrive as None/NaN rather than text and cannot be valid;
    # without this guard normalize_units would fail on them.
    if not isinstance(entity_value, str):
        return False

    # Normalize the entity_value to replace abbreviations with valid units
    entity_value = normalize_units(entity_value)

    # Get valid units for the given entity_name
    valid_units = entity_unit_map.get(entity_name, set())
    if not valid_units:
        # An empty alternation would match a bare number, so an unknown
        # entity name must be rejected explicitly.
        return False

    # Number followed by one of the valid units; unit names are escaped so
    # they are always matched literally.
    pattern = r'\d+\.?\d*\s*(?:{})'.format('|'.join(re.escape(unit) for unit in valid_units))

    # fullmatch anchors both ends (and, unlike `$`, does not tolerate a
    # trailing newline).
    return re.fullmatch(pattern, entity_value) is not None

# Function to clean the dataset by removing invalid rows
def clean_dataset(dataframe):
    """Return the rows of `dataframe` whose label is valid for its entity.

    Args:
        dataframe: DataFrame with 'entity_value' and 'entity_name' columns.

    Returns:
        A new DataFrame holding only the rows for which
        `is_valid_entity_value(entity_value, entity_name)` is true. The
        original row labels, column order and dtypes are kept, and the
        columns survive even when no row is valid.
    """
    # Walk the two columns in lockstep instead of materialising every row
    # with iterrows(), and remember the positions of the rows to keep.
    keep_positions = [
        position
        for position, (entity_value, entity_name) in enumerate(
            zip(dataframe['entity_value'], dataframe['entity_name']))
        if is_valid_entity_value(entity_value, entity_name)
    ]

    # Positional selection is unaffected by duplicate or non-default index
    # labels and returns a new frame.
    return dataframe.iloc[keep_positions]





In [None]:
# Keep only the training rows that clean_dataset accepts as validly labelled.
cleaned_train = clean_dataset(train_set)

text extraction

In [None]:
# # Step 1: Install required libraries
# !pip install easyocr pandas

In [None]:


# # Import necessary libraries
# import pandas as pd
# import easyocr
# from PIL import Image
# import requests
# from io import BytesIO

# # Step 3: Initialize EasyOCR Reader
# reader = easyocr.Reader(['en'])  # You can specify other languages if needed

# # Function to extract text from an image URL
# def extract_text_from_image(image_url):
#     try:
#         # Download image
#         response = requests.get(image_url)
#         img = Image.open(BytesIO(response.content))

#         # Perform text extraction
#         results = reader.readtext(img)

#         # Combine all extracted texts
#         text = ' '.join([result[1] for result in results])
#         return text
#     except Exception as e:
#         print(f"Error processing image {image_url}: {e}")
#         return None

# # Step 4: Apply text extraction to each image link and store in a new column
# cleaned_train['extracted_text'] = cleaned_train['image_link'].apply(extract_text_from_image)

# # Save the updated DataFrame to a new CSV file
# # cleaned_train.to_csv('updated_dataset.csv', index=False)

# # print("Text extraction completed and saved to 'updated_dataset.csv'")


The above function takes a long time to run;
we could speed it up by processing the images in batches.

In [None]:
# test_set = test

In [None]:
# cleaned_train.info()

In [None]:
# test_set.info()

In [None]:
# sub1 = test_set[['index' , 'group_id']]

In [None]:
# sub1['prediction'] = '2 gram'

In [None]:
# sub1.head()

In [None]:
# cleaned_train.head()

In [None]:
# cleaned_train = cleaned_train.drop_duplicates()

In [None]:
# train_set.info()

In [None]:
# group_to_entity = cleaned_train.set_index('group_id')['entity_value'].to_dict()

# # Step 3: Update `sub1` DataFrame
# def update_prediction(row):
#     return group_to_entity.get(row['group_id'], '2 gram')

# sub1['prediction'] = sub1.apply(update_prediction, axis=1)

# # Step 4: Prepare the final DataFrame
# final_df = sub1[['index', 'group_id', 'prediction']]

In [None]:
# final_df.info()

In [None]:
# final_df.head()

In [None]:
# final_df = final_df.drop(columns = ['group_id'])

In [None]:
# final_df.to_csv('submission1.csv', index=False)

In [None]:
# Preview the first rows of the cleaned training data.
cleaned_train.head()

Unnamed: 0,image_link,group_id,entity_name,entity_value
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram


In [None]:
# !pip install pytesseract

In [None]:
# !pip install requests pytesseract Pillow


In [None]:
# # Install Tesseract-OCR
# !apt-get update
# !apt-get install -y tesseract-ocr


In [None]:
# import pytesseract
# from PIL import Image
# import multiprocessing as mp
# import pandas as pd
# import requests
# from io import BytesIO

# # Function to download and process an image from a URL
# def process_image_from_url(image_url):
#     try:
#         # Download the image
#         response = requests.get(image_url)
#         response.raise_for_status()  # Raise an error for bad responses

#         # Open the image
#         img = Image.open(BytesIO(response.content))

#         # Perform OCR
#         text = pytesseract.image_to_string(img)
#         return text
#     except Exception as e:
#         print(f"Error processing image {image_url}: {e}")
#         return ""

# # Function to apply OCR in parallel
# def apply_ocr_in_parallel(image_urls, num_workers):
#     with mp.Pool(num_workers) as pool:
#         texts = pool.map(process_image_from_url, image_urls)
#     return texts

# # Load your dataset
# # df = pd.read_csv('your_dataset.csv')

# # Assuming image URLs are in the 'image_url' column
# image_urls = cleaned_train['image_link'].tolist()

# # Apply OCR to images
# num_workers = mp.cpu_count()  # or specify your number of workers
# texts = apply_ocr_in_parallel(image_urls, num_workers)

# # Add extracted text to the DataFrame
# df['extracted_text'] = texts

# # Save the updated dataset
# df.to_csv('updated_dataset.csv', index=False)


In [None]:
# import requests
# from io import BytesIO
# from PIL import Image
# import pytesseract
# import os

# def text_extract_from_url(image_url):
#     """
#     Extract text from an image URL, save the image temporarily, and then delete it after processing.

#     Args:
#     - image_url (str): URL of the image.

#     Returns:
#     - str: Extracted text from the image.
#     """
#     try:
#         # Fetch the image from the URL
#         response = requests.get(image_url)
#         response.raise_for_status()  # Ensure the request was successful

#         # Open the image from the response content
#         img = Image.open(BytesIO(response.content))

#         # Save the image temporarily
#         temp_image_path = "temp_image.jpg"
#         img.save(temp_image_path)

#         # Extract text using pytesseract
#         extracted_text = pytesseract.image_to_string(img)

#         # Delete the temporary image file
#         if os.path.exists(temp_image_path):
#             os.remove(temp_image_path)
#             print(f"Deleted temporary file: {temp_image_path}")

#         return extracted_text

#     except requests.RequestException as e:
#         print(f"Error fetching the image: {e}")
#         return ""
#     except Exception as e:
#         print(f"An error occurred while processing the image: {e}")
#         return ""



In [None]:
# # Example usage
# image_url = 'https://m.media-amazon.com/images/I/71gSRbyXmoL.jpg	'  # Replace with your image URL
# extracted_text = text_extract_from_url(image_url)
# print("Extracted Text:", extracted_text)


In [None]:
# !pip install opencv-python
# !pip install matplotlib
# !pip install numpy
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
# !pip install easyocr
# !pip install Pillow  # Required for handling image URLs

In [None]:
# import cv2
# import numpy as np
# import easyocr
# import matplotlib.pyplot as plt
# import requests
# from PIL import Image
# from io import BytesIO

In [None]:
# # Install Tesseract OCR
# !apt-get update
# !apt-get install -y tesseract-ocr

# # Install Python packages
# !pip install pytesseract transformers torch pandas Pillow requests


In [None]:
# import pandas as pd
# import requests
# from PIL import Image
# import pytesseract
# from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments
# from sklearn.model_selection import train_test_split
# import torch
# from torch.utils.data import Dataset

# # Ensure Tesseract is installed and in your PATH
# # pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'  # Adjust this path if necessary

# # Load and preprocess the data
# # def load_data(train_file, test_file):
# #     train_df = pd.read_csv(train_file)
# #     test_df = pd.read_csv(test_file)
# #     return train_df, test_df

# # OCR function to extract text from image
# def extract_text_from_image(image_url):
#     try:
#         image = Image.open(requests.get(image_url, stream=True).raw)
#         text = pytesseract.image_to_string(image)
#         return text.strip()
#     except Exception as e:
#         print(f"Error processing image {image_url}: {e}")
#         return ""

# # Apply OCR to all images in the dataset
# def preprocess_images(df):
#     df['extracted_text'] = df['image_link'].apply(extract_text_from_image)
#     return df

# # Define a custom Dataset class
# class CustomDataset(Dataset):
#     def __init__(self, texts, labels, tokenizer, max_length):
#         self.texts = texts
#         self.labels = labels
#         self.tokenizer = tokenizer
#         self.max_length = max_length

#     def __len__(self):
#         return len(self.texts)

#     def __getitem__(self, idx):
#         text = self.texts[idx]
#         label = self.labels[idx]
#         inputs = self.tokenizer.encode_plus(
#             text, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt'
#         )
#         labels = self.tokenizer.encode(label, max_length=self.max_length, padding='max_length', truncation=True, return_tensors='pt')
#         return {'input_ids': inputs['input_ids'].squeeze(), 'attention_mask': inputs['attention_mask'].squeeze(), 'labels': labels.squeeze()}

# # Fine-tune GPT-2
# def fine_tune_gpt2(train_df, tokenizer_name='gpt2', model_name='gpt2'):
#     tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
#     model = GPT2LMHeadModel.from_pretrained(model_name)

#     train_texts = train_df['extracted_text'].tolist()
#     train_labels = train_df['entity_value'].tolist()

#     train_dataset = CustomDataset(train_texts, train_labels, tokenizer, max_length=512)
#     training_args = TrainingArguments(
#         per_device_train_batch_size=4,
#         num_train_epochs=3,
#         logging_dir='./logs',
#         logging_steps=10,
#         output_dir='./results',
#         save_steps=10,
#         evaluation_strategy='epoch'
#     )

#     trainer = Trainer(
#         model=model,
#         args=training_args,
#         train_dataset=train_dataset
#     )

#     trainer.train()

#     model.save_pretrained('./fine_tuned_model')
#     tokenizer.save_pretrained('./fine_tuned_model')

# # Predict using the fine-tuned model
# def predict(test_df, tokenizer_name='gpt2', model_name='fine_tuned_model'):
#     tokenizer = GPT2Tokenizer.from_pretrained(tokenizer_name)
#     model = GPT2LMHeadModel.from_pretrained(model_name)
#     model.eval()

#     predictions = []
#     for text in test_df['extracted_text']:
#         inputs = tokenizer.encode(text, return_tensors='pt')
#         with torch.no_grad():
#             outputs = model.generate(inputs, max_length=50, num_return_sequences=1)
#         decoded_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
#         predictions.append(decoded_output.strip())

#     return predictions


In [None]:
# train_df = preprocess_images(cleaned_train)
# test_df = preprocess_images(test_set)

In [None]:
# Show row count, column dtypes and non-null counts of the cleaned training data.
cleaned_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 213778 entries, 0 to 263858
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   image_link    213778 non-null  object
 1   group_id      213778 non-null  int64 
 2   entity_name   213778 non-null  object
 3   entity_value  213778 non-null  object
dtypes: int64(1), object(3)
memory usage: 8.2+ MB


In [None]:
# Show row count, column dtypes and non-null counts of the test data.
test_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131187 entries, 0 to 131186
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   index        131187 non-null  int64 
 1   image_link   131187 non-null  object
 2   group_id     131187 non-null  int64 
 3   entity_name  131187 non-null  object
dtypes: int64(2), object(2)
memory usage: 4.0+ MB


In [None]:
# !pip install paddleocr

In [None]:
# !pip install paddlepaddle-gpu

In [None]:
# import pandas as pd
# import requests
# from io import BytesIO
# from paddleocr import PaddleOCR
# from PIL import Image

# # Initialize PaddleOCR with GPU support
# ocr = PaddleOCR(use_angle_cls=True, lang='en')  # Adjust the language and settings as needed

# def extract_text_from_image(image_url):
#     try:
#         # Fetch image from URL
#         response = requests.get(image_url)
#         image = Image.open(BytesIO(response.content))

#         # Perform OCR
#         results = ocr.ocr(image, cls=True)

#         # Extract text from results
#         extracted_text = ' '.join([line[1][0] for line in results[0]])
#         return extracted_text
#     except Exception as e:
#         print(f"Error processing image {image_url}: {e}")
#         return ""

# # Load your dataset
# # df = pd.read_csv('your_dataset.csv')

# # Extract text from each image
# cleaned_train['extracted_text'] = cleaned_train['image_link'].apply(extract_text_from_image)

# # Save the updated dataset to a new CSV
# # df.to_csv('your_dataset_with_text.csv', index=False)

# # print("Text extraction complete. Updated dataset saved to 'your_dataset_with_text.csv'.")


In [None]:
# Install PaddleOCR and PaddlePaddle with GPU support
# !pip install paddleocr paddlepaddle-gpu


In [None]:
# import os

# # Create directory for PaddleOCR model
# os.makedirs('/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/', exist_ok=True)


In [None]:
# import urllib.request

# # Define the URL and file path
# url = 'https://paddleocr.bj.bcebos.com/PP-OCRv4/english/en_PP-OCRv4_rec_infer.tar'
# file_path = '/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/en_PP-OCRv4_rec_infer.tar'

# # Remove existing file if it exists
# if os.path.exists(file_path):
#     os.remove(file_path)

# # Download the model file
# urllib.request.urlretrieve(url, file_path)


In [None]:
# import tarfile

# # Verify if the file exists
# if os.path.exists(file_path):
#     # Extract the tar file
#     with tarfile.open(file_path, 'r') as tar:
#         tar.extractall(path='/root/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer/')
#     print("Model file extracted successfully.")
# else:
#     print("Model file not found. Please check the download process.")


In [None]:
# test_set.to_csv('test_set.csv', index=False)

In [None]:
# import pandas as pd
# import requests
# from io import BytesIO
# from paddleocr import PaddleOCR
# from PIL import Image
# import os
# import tempfile

# # Initialize PaddleOCR with GPU support
# ocr = PaddleOCR(use_angle_cls=True, lang='en')  # Adjust the language and settings as needed

# def extract_text_from_image(image_url):
#     try:
#         # Fetch image from URL
#         response = requests.get(image_url)
#         image = Image.open(BytesIO(response.content))

#         # Create a temporary file to save the image
#         with tempfile.NamedTemporaryFile(delete=False, suffix='.jpg') as temp_file:
#             temp_file_path = temp_file.name
#             image.save(temp_file_path)

#             # Perform OCR
#             results = ocr.ocr(temp_file_path, cls=True)

#             # Extract text from results
#             extracted_text = ' '.join([line[1][0] for line in results[0]])

#         # Remove the temporary file
#         os.remove(temp_file_path)

#         return extracted_text
#     except Exception as e:
#         print(f"Error processing image {image_url}: {e}")
#         return ""

# def process_dataframe(df, output_csv):
#     # Open a file for appending processed data
#     with open(output_csv, 'w') as f:
#         # Write header (assuming your DataFrame has columns 'image_url' and 'extracted_text')
#         f.write('image_url,extracted_text\n')

#     # Initialize the counter
#     counter = 0

#     for index, row in df.iterrows():
#         image_url = row['image_link']
#         extracted_text = extract_text_from_image(image_url)

#         # Increment the counter
#         counter += 1
#         print(f"Processed {counter}/{len(df)} rows")

#         # Append the processed row to the CSV
#         with open(output_csv, 'a') as f:
#             f.write(f"{image_url},{extracted_text}\n")

#     print("Text extraction complete. Updated dataset saved.")

# # Example DataFrame loading
# # df = pd.read_csv('your_dataset.csv')

# # Specify output CSV file path
# output_csv = 'test_set.csv'

# # Process the DataFrame and save results
# process_dataframe(test_set, output_csv)


In [None]:
# cleaned_train['extracted_text'] = cleaned_train['image_link'].apply(extract_text_from_image)

In [None]:
# first_10_rows = cleaned_train.head(10)

# # You can now use 'first_10_rows' as needed
# # print(first_10_rows)
# first_10_rows = pd.DataFrame(first_10_rows)

In [None]:
# first_10_rows

In [None]:
# first_10_rows['extracted_text'] = first_10_rows['image_link'].apply(extract_text_from_image)

In [None]:
# first_10_rows

In [None]:
# first_1000_rows = cleaned_train.head(1000)

# # You can now use 'first_1000_rows' as needed
# # print(first_1000_rows)
# first_1000_rows = pd.DataFrame(first_1000_rows)

In [None]:
# first_1000_rows['extracted_text'] = first_1000_rows['image_link'].apply(extract_text_from_image)

In [None]:
# first_1000_rows.to_csv('first_1000_rows.csv', index=False)

In [None]:
# path_1 = "/content/drive/MyDrive/Amazon ML Challenge/40k_train.csv"
# path_2 = "/content/drive/MyDrive/Amazon ML Challenge/80k_train.csv"
# path_3 = "/content/drive/MyDrive/Amazon ML Challenge/rest_train.csv"

In [None]:
# part1 = pd.read_csv(path_1)
# part2 = pd.read_csv(path_2)
# part3 = pd.read_csv(path_3)

In [None]:
# part2['extracted_text'] = part2['image_link'].apply(extract_text_from_image)

In [None]:
# part3['extracted_text'] = part3['image_link'].apply(extract_text_from_image)

In [None]:
# part1['extracted_text'] = part1['image_link'].apply(extract_text_from_image)

In [None]:
# test_set.info()

In [None]:
# import numpy as np

# # Define the number of chunks
# num_chunks = 12

# # Split the dataframe into chunks
# chunks = np.array_split(test_set, num_chunks)


In [None]:
# chunks[0]['extracted_text'] = chunks[0]['image_link'].apply(extract_text_from_image)

In [None]:
# chunks[0]

In [None]:
# Summarize test_set: entry count, column names, non-null counts, dtypes and memory usage.
test_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131187 entries, 0 to 131186
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   index        131187 non-null  int64 
 1   image_link   131187 non-null  object
 2   group_id     131187 non-null  int64 
 3   entity_name  131187 non-null  object
dtypes: int64(2), object(2)
memory usage: 4.0+ MB


In [None]:
# Summarize train_set: entry count, column names, non-null counts, dtypes and memory usage.
train_set.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 263859 entries, 0 to 263858
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   image_link    263859 non-null  object
 1   group_id      263859 non-null  int64 
 2   entity_name   263859 non-null  object
 3   entity_value  263859 non-null  object
dtypes: int64(1), object(3)
memory usage: 8.1+ MB


In [None]:
# Distinct group_id values that occur in the training set.
common_ids = train_set['group_id'].unique()

# Filter the second dataset to remove rows with common group_id values.
# isin() has to receive the array of ids: handing it the whole train_set
# DataFrame makes pandas compare each group_id against the frame's column
# labels, so nothing ever matches and no row is removed.
test_filtered = test_set[~test_set['group_id'].isin(common_ids)]

In [None]:
# Rebuild test_filtered as a fresh DataFrame object.
# NOTE(review): presumably meant to detach the frame from the boolean-mask
# selection it was produced by; `.copy()` would state that intent explicitly - confirm.
test_filtered = pd.DataFrame(test_filtered)

In [None]:
# Inspect the filtered frame; its entry count shows how many test rows
# survived the group_id filter.
test_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 131187 entries, 0 to 131186
Data columns (total 4 columns):
 #   Column       Non-Null Count   Dtype 
---  ------       --------------   ----- 
 0   index        131187 non-null  int64 
 1   image_link   131187 non-null  object
 2   group_id     131187 non-null  int64 
 3   entity_name  131187 non-null  object
dtypes: int64(2), object(2)
memory usage: 4.0+ MB


In [None]:
# Preview the first rows of the training data.
train_set.head()

Unnamed: 0,image_link,group_id,entity_name,entity_value
0,https://m.media-amazon.com/images/I/61I9XdN6OF...,748919,item_weight,500.0 gram
1,https://m.media-amazon.com/images/I/71gSRbyXmo...,916768,item_volume,1.0 cup
2,https://m.media-amazon.com/images/I/61BZ4zrjZX...,459516,item_weight,0.709 gram
3,https://m.media-amazon.com/images/I/612mrlqiI4...,459516,item_weight,0.709 gram
4,https://m.media-amazon.com/images/I/617Tl40LOX...,731432,item_weight,1400 milligram


In [None]:
# Preview the first rows of the test data.
test_set.head()

Unnamed: 0,index,image_link,group_id,entity_name
0,0,https://m.media-amazon.com/images/I/110EibNycl...,156839,height
1,1,https://m.media-amazon.com/images/I/11TU2clswz...,792578,width
2,2,https://m.media-amazon.com/images/I/11TU2clswz...,792578,height
3,3,https://m.media-amazon.com/images/I/11TU2clswz...,792578,depth
4,4,https://m.media-amazon.com/images/I/11gHj8dhhr...,792578,depth


In [None]:
# Count how many distinct group_id values appear in the test set.
distinct_count = test_set['group_id'].nunique()

In [None]:
# Display the number of distinct test group_id values.
distinct_count

924

In [None]:
# One row per distinct image_link (the first occurrence wins), renumbered
# from 0 and wrapped in a fresh DataFrame object.
df_unique_test = pd.DataFrame(
    test_set
    .drop_duplicates(subset='image_link', keep='first')
    .reset_index(drop=True)
)


In [None]:
# Summarize the de-duplicated test frame (one row per image_link).
df_unique_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90666 entries, 0 to 90665
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   index        90666 non-null  int64 
 1   image_link   90666 non-null  object
 2   group_id     90666 non-null  int64 
 3   entity_name  90666 non-null  object
dtypes: int64(2), object(2)
memory usage: 2.8+ MB


In [None]:
# One row per distinct image_link (the first occurrence wins), renumbered
# from 0 and wrapped in a fresh DataFrame object.
df_unique_train = pd.DataFrame(
    train_set
    .drop_duplicates(subset='image_link', keep='first')
    .reset_index(drop=True)
)

In [None]:
# Summarize the de-duplicated training frame (one row per image_link).
df_unique_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 255906 entries, 0 to 255905
Data columns (total 4 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   image_link    255906 non-null  object
 1   group_id      255906 non-null  int64 
 2   entity_name   255906 non-null  object
 3   entity_value  255906 non-null  object
dtypes: int64(1), object(3)
memory usage: 7.8+ MB


In [None]:
# Select every test row of a single group; 792578 is the group_id that
# appears in the test_set.head() preview.
test_set_grp = test_set[test_set['group_id'] == 792578]

In [None]:
# Within the selected group, keep only the first row for each distinct image_link.
test_set_grp_uni = test_set_grp.drop_duplicates(subset='image_link', keep='first')

In [None]:
# Rebuild test_set_grp_uni as a fresh DataFrame object.
# NOTE(review): presumably meant to detach the frame from the selection it
# was derived from; `.copy()` would state that intent explicitly - confirm.
test_set_grp_uni = pd.DataFrame(test_set_grp_uni)

In [None]:
# Display the de-duplicated rows of the selected group.
test_set_grp_uni

Unnamed: 0,index,image_link,group_id,entity_name
1,1,https://m.media-amazon.com/images/I/11TU2clswz...,792578,width
4,4,https://m.media-amazon.com/images/I/11gHj8dhhr...,792578,depth
7765,7775,https://m.media-amazon.com/images/I/41LFZLoydR...,792578,width
12517,12529,https://m.media-amazon.com/images/I/41cDLX-Yit...,792578,width
16543,16560,https://m.media-amazon.com/images/I/41ra-cpagl...,792578,depth
21101,21128,https://m.media-amazon.com/images/I/510SI9z5nJ...,792578,height
21609,21639,https://m.media-amazon.com/images/I/510z6C7CkJ...,792578,depth
26595,26629,https://m.media-amazon.com/images/I/516SMtqT4I...,792578,height
34648,34689,https://m.media-amazon.com/images/I/51FCtskKZY...,792578,height
34893,34935,https://m.media-amazon.com/images/I/51FUGdqd1i...,792578,depth


In [None]:
import torch
import torchvision.transforms as transforms
from torchvision import models
from torch.utils.data import DataLoader, Dataset
from PIL import Image
import os

# Define your dataset class
class CustomDataset(Dataset):
    """Map-style dataset that pairs image files with labels.

    Args:
        image_links: indexable collection of image locations accepted by
            ``Image.open``.
        labels: indexable collection of labels, looked up with the same
            index as ``image_links``.
        transform: optional callable applied to the RGB image before it is
            returned.
    """

    def __init__(self, image_links, labels, transform=None):
        self.image_links, self.labels, self.transform = image_links, labels, transform

    def __len__(self):
        """Number of samples, taken from ``image_links``."""
        return len(self.image_links)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for position ``idx``."""
        rgb_image = Image.open(self.image_links[idx]).convert("RGB")
        target = self.labels[idx]
        # A falsy transform (e.g. None) leaves the RGB image untouched.
        sample = self.transform(rgb_image) if self.transform else rgb_image
        return sample, target

# Define transformations: resize to 224x224, convert to a tensor, then
# normalize each channel with the given mean / std.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Load your dataset
image_links = ["path/to/image1.jpg", "path/to/image2.jpg"]  # Update with your paths
labels = [0, 1]  # Update with your labels
dataset = CustomDataset(image_links=image_links, labels=labels, transform=transform)
dataloader = DataLoader(dataset, batch_size=4, shuffle=True)

# Load pre-trained model and replace its final layer with one output per
# distinct label.
# NOTE(review): newer torchvision releases prefer the `weights=` argument over
# `pretrained=`; confirm against the installed version before switching.
model = models.resnet18(pretrained=True)
num_ftrs = model.fc.in_features
model.fc = torch.nn.Linear(num_ftrs, len(set(labels)))  # Update num_classes

# Training setup
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    # Batch-specific names keep the notebook-level `labels` list intact;
    # reusing `labels` here would leave it bound to the last batch tensor
    # once the loop finishes.
    for batch_inputs, batch_labels in dataloader:
        batch_inputs, batch_labels = batch_inputs.to(device), batch_labels.to(device)
        optimizer.zero_grad()
        outputs = model(batch_inputs)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight it by the batch size so the
        # epoch figure below is a per-sample average.
        running_loss += loss.item() * batch_inputs.size(0)

    epoch_loss = running_loss / len(dataset)
    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {epoch_loss:.4f}")

print("Training complete!")


In [None]:
from PIL import Image
import numpy as np

def predict_image(image_path, model, transform, device):
    """Classify one image file and return the predicted class index.

    Switches ``model`` to eval mode, opens ``image_path`` as RGB, applies
    ``transform``, adds a leading batch dimension, moves the tensor to
    ``device`` and returns the position of the largest model output along
    dim 1 as a plain Python number.

    NOTE(review): a later cell defines another ``predict_image(image_path,
    model)``; whichever definition ran last is the one in effect.
    """
    model.eval()

    rgb_image = Image.open(image_path).convert("RGB")
    batch = transform(rgb_image).unsqueeze(0).to(device)

    # Gradients are not needed for inference.
    with torch.no_grad():
        logits = model(batch)
        best_index = torch.max(logits, dim=1).indices

    return best_index.item()

# Example usage
# The path below is a placeholder; point it at a real image file before running.
# Uses the model, transform and device set up in the training cell.
image_path = "path/to/test_image.jpg"
predicted_label = predict_image(image_path, model, transform, device)
print(f"Predicted Label: {predicted_label}")


In [None]:
from yolov5 import train  # Ensure you have YOLOv5 installed

# Start a YOLOv5 training run with the settings below.
train.run(
    # Path to data configuration file
    data="data.yaml",
    # Path to YOLOv5 configuration file
    cfg="yolov5s.yaml",
    # Path to pre-trained weights
    weights="yolov5s.pt",
    # Number of epochs
    epochs=10,
    # Image size
    imgsz=640,
)


In [None]:
import torch
from yolov5 import load  # Ensure you have YOLOv5 installed

# Load pre-trained model
# NOTE(review): this rebinds the notebook-level name `model`, which the ResNet
# training cell also assigns; cells that expect the ResNet will see the
# YOLOv5 model if they are run after this one.
model = load('yolov5s.pt')  # Path to pre-trained weights

def predict_image(image_path, model):
    """Run ``model`` on ``image_path``, then print and save what it returns.

    Nothing is returned; the results object is only used for its ``print()``
    and ``save()`` calls.

    NOTE(review): this redefines the four-argument ``predict_image`` from an
    earlier cell; whichever definition ran last is the one in effect.
    """
    detections = model(image_path)
    # Print results to the console
    detections.print()
    # Save results to 'runs/detect' directory
    detections.save()

# Example usage
# The path below is a placeholder; point it at a real image file before running.
image_path = "path/to/test_image.jpg"
predict_image(image_path, model)


In [None]:
import clip
import torch
from PIL import Image

# Load CLIP model and processor
# NOTE(review): `device` and `model` are also assigned by earlier cells (there
# `device` is a torch.device, here it is a plain string); running this cell
# replaces both for every cell executed afterwards.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load("ViT-B/32", device=device)

def predict(image_path, text_prompts):
    """Score one image against a list of text prompts with CLIP.

    Relies on the notebook-level ``model``, ``preprocess`` and ``device``
    created by the CLIP loading step.

    Args:
        image_path: location of the image, passed to ``Image.open``.
        text_prompts: prompts passed to ``clip.tokenize``.

    Returns:
        Tensor with one row for the image and one column per prompt; each
        row is a softmax over the prompts and therefore sums to 1.
    """
    image = preprocess(Image.open(image_path)).unsqueeze(0).to(device)
    text = clip.tokenize(text_prompts).to(device)

    with torch.no_grad():
        image_features = model.encode_image(image)
        text_features = model.encode_text(text)
        # Unit-normalize both embeddings so the matrix product below is a
        # cosine similarity; without this the 100.0 scale is applied to raw
        # dot products whose magnitude depends on the embedding norms.
        image_features = torch.nn.functional.normalize(image_features, dim=-1)
        text_features = torch.nn.functional.normalize(text_features, dim=-1)
        similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1)

    return similarity

# Example usage
# The path below is a placeholder; point it at a real image file before running.
image_path = "path/to/test_image.jpg"
# Candidate descriptions the image is scored against.
text_prompts = ["A cat", "A dog", "A person"]
similarities = predict(image_path, text_prompts)
print("Similarities:", similarities)
