### Basic library imports

In [1]:
import os
import pandas as pd

### Read Dataset

In [2]:
DATASET_FOLDER = '../dataset/'
train = pd.read_csv(os.path.join(DATASET_FOLDER, 'train.csv'))
test = pd.read_csv(os.path.join(DATASET_FOLDER, 'test.csv'))
sample_test = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test.csv'))
sample_test_out = pd.read_csv(os.path.join(DATASET_FOLDER, 'sample_test_out.csv'))

### Run Sanity check using src/sanity.py

In [3]:
!python sanity.py --test_filename ../dataset/sample_test.csv --output_filename ../dataset/sample_test_out.csv

Parsing successfull for file: ../dataset/sample_test_out.csv


In [4]:
!python sanity.py --test_filename ../dataset/sample_test.csv --output_filename ../dataset/sample_test_out_fail.csv

Error: Invalid unit [lbs] found in 6.75 lbs. Allowed units: {'cubic foot', 'cup', 'cubic inch', 'kilogram', 'foot', 'centimetre', 'kilowatt', 'centilitre', 'milligram', 'decilitre', 'gallon', 'ounce', 'yard', 'litre', 'metre', 'kilovolt', 'volt', 'pint', 'inch', 'microgram', 'microlitre', 'quart', 'imperial gallon', 'ton', 'millivolt', 'watt', 'millimetre', 'pound', 'millilitre', 'gram', 'fluid ounce'}


### Download images

In [14]:
from utils import download_images
download_images(sample_test['image_link'], '../images')
download_images(train['image_link'], '../images')

100%|██████████| 88/88 [00:10<00:00,  8.42it/s]
  1%|          | 1538/263859 [03:08<8:54:55,  8.17it/s] 


KeyboardInterrupt: 

In [7]:
assert len(os.listdir('../images')) > 0

In [34]:
rm -rf ../images

In [6]:
from pathlib import Path
train_df = pd.read_csv('../dataset/train.csv')
test_df = pd.read_csv('../dataset/test.csv')

In [8]:
import easyocr
import os
from pathlib import Path

reader = easyocr.Reader(['en'])

def extract_text_from_image(image_path):
    result = reader.readtext(image_path)
    text = " ".join([res[1] for res in result])
    return text

# Sample image link
link = train_df.head()['image_link'][1]
images_1 = Path(link).name

# Adjust the path to the directory where images are stored
sample_image_path = f"../images/41ADVPQgZOL.jpg"

# Debugging: Check if the file exists
if os.path.exists(sample_image_path):
    extracted_text = extract_text_from_image(sample_image_path)
    print(extracted_text)
else:
    print(f"File not found: {sample_image_path}")

Calabrian Powder  Ca-ean Chili


In [10]:
#Feature Extraction with CNN
# main.py
import torch
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image

# Load a pre-trained ResNet model
resnet = models.resnet50(pretrained=True)
resnet.eval()

# Image preprocessing function
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

def extract_image_features(image_path):
    image = Image.open(image_path)
    image = transform(image).unsqueeze(0)
    with torch.no_grad():
        features = resnet(image)
    return features.numpy().flatten()

# Example usage
image_features = extract_image_features(sample_image_path)
# print("\n")
# print(image_features)
# print("\n")
print(image_features.shape)

(1000,)


In [11]:
#Data Processing
import re

def extract_entity_value(extracted_text):
    # Simple regex pattern to extract "number unit" format
    pattern = r'(\d+(\.\d+)?)\s*(gram|kilogram|centimetre|inch|ounce)'
    match = re.search(pattern, extracted_text)
    if match:
        return f"{match.group(1)} {match.group(3)}"
    return ""

# Example usage
entity_value = extract_entity_value(extracted_text)
print(entity_value)




In [12]:
#Checking Columns
print(train_df.columns)
print(test_df.columns)

Index(['image_link', 'group_id', 'entity_name', 'entity_value'], dtype='object')
Index(['index', 'image_link', 'group_id', 'entity_name'], dtype='object')


In [13]:

# print(train_df.head())

link=test_df.head()['image_link'][0]
Path(link).name

'110EibNyclL.jpg'

In [13]:
# Model Training
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Prepare training data
# Extract image features using the DataFrame's own index for training data
X_image_features = [extract_image_features(f'../images/{idx}.jpg') for idx in train_df.index]

# Extract text features for training data
X_text_features = [extract_text_from_image(f'../images/{idx}.jpg') for idx in train_df.index]

# Assuming 'test_df' is your DataFrame for the test set
# Extract image features using the 'index' column for test data
X_test_image_features = [extract_image_features(f'../images/{idx}.jpg') for idx in test_df['index']]

# Extract text features for test data
X_test_text_features = [extract_text_from_image(f'../images/{idx}.jpg') for idx in test_df['index']]

# Encode the target variable
y = train_df['entity_value']
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split data
X_train_img, X_test_img, X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_image_features, X_text_features, y_encoded, test_size=0.2, random_state=42)

# Combine image and text features in a pipeline
model_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

# Train the model using image features only for simplicity
model_pipeline.fit(X_train_text, y_train)

# Evaluate the model
print(f"Training accuracy: {model_pipeline.score(X_test_text, y_test)}")

FileNotFoundError: [Errno 2] No such file or directory: '/Users/shubham/Downloads/7th Sem/Amazon ML/student_resource 3/images/0.jpg'