In [None]:
!pip install torch torchvision transformers pandas requests Pillow tqdm


In [25]:
from IPython.display import HTML, clear_output
from subprocess import getoutput
# Capture the textual report printed by the `nvidia-smi` command.
s = getoutput('nvidia-smi')

# Report the first recognised model name found in that text, checked in the
# order K80, T4, P100; anything else is flagged with the 'DONT PROCEED' banner.
gpu = next(
    (model_name for model_name in ('K80', 'T4', 'P100') if model_name in s),
    'DONT PROCEED',
)

# Render the result as a large heading in the cell output.
display(HTML(f"<h1>{gpu}</h1>"))



```
# This is formatted as code
```

## **Importing Necessary Libraries**

In [26]:
import hashlib
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from io import BytesIO

import pandas as pd
import requests
import torch
from PIL import Image
from tqdm import tqdm
from transformers import CLIPProcessor, CLIPModel
# Ask PyTorch to release GPU memory its caching allocator is holding but not
# currently using (e.g. left over from an earlier run in this same kernel).
torch.cuda.empty_cache()

# **Configuration Parameters**

In [27]:

# Path to the CSV file containing the image URLs to classify
CSV_FILE_PATH = '/content/image_url.csv'  # Replace with your actual CSV file path

# Path to the CSV file listing the candidate product categories
# (must contain a 'Product_category' column)
CATEGORY_CSV_PATH = '/content/product_category.csv'

# Batch size for processing images
BATCH_SIZE = 100

# Maximum number of threads for downloading images
MAX_WORKERS = 20

# Directory to cache downloaded images
CACHE_DIR = 'image_cache'

# Build the list of candidate labels from the category file
data = pd.read_csv(CATEGORY_CSV_PATH)
df = pd.DataFrame(data)

# Unique categories from the 'Product_category' column.
# Empty cells are read as NaN (a float); they are dropped here because the label
# list is later handed to the CLIP processor as text, and a single non-string
# entry would make every batch fail.
unique_categories = df['Product_category'].dropna().unique()

# Plain Python list of the unique category labels
unique_categories_list = unique_categories.tolist()



# **Step 1: Data Management**

In [28]:

# Load image URLs from the CSV file into a pandas DataFrame.
#
# On failure a short explanation is printed and the original exception is
# re-raised. Re-raising (instead of calling exit(1)) stops "Run All" at this
# cell with a traceback while keeping the kernel and its state alive; inside an
# IPython kernel, exit() requests a kernel shutdown instead.
try:
    data = pd.read_csv(CSV_FILE_PATH)
    df = pd.DataFrame(data)

    # Verify required columns exist
    required_columns = ['PRODUCT_MAIN_IMAGE_URL']
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"The CSV file must contain a column named '{col}'.")

    print(f"Successfully loaded {len(df)} image URLs from the CSV file.")
except FileNotFoundError:
    print(f"CSV file not found at path: {CSV_FILE_PATH}")
    raise
except pd.errors.EmptyDataError:
    print(f"CSV file at {CSV_FILE_PATH} is empty.")
    raise
except Exception as e:
    print(f"An error occurred while reading the CSV file: {e}")
    raise


Successfully loaded 994 image URLs from the CSV file.



# **Step 2: Image Loading**

In [29]:

def load_image_from_url(url, size=(224, 224), retries=3, backoff=5, cache_dir='image_cache', timeout=10):
    """
    Downloads an image from the given URL, resizes it, and returns a PIL Image.
    Implements retries with exponential backoff and caching to avoid re-downloading.

    The cache entry is named after a SHA-256 hash of the full URL, so two URLs
    that merely share the same file name (e.g. ``.../a/img.jpg`` and
    ``.../b/img.jpg``) never overwrite or read each other's cached image.
    The image is cached *after* conversion to RGB and resizing.

    Args:
        url (str): The URL of the image. Non-string or blank values (for
            example NaN from an empty CSV cell) are rejected and yield None.
        size (tuple): Desired size to resize the image.
        retries (int): Number of download attempts before giving up.
        backoff (int): Initial backoff time in seconds; doubled after each
            failed attempt. No wait is performed after the final attempt.
        cache_dir (str): Directory to cache downloaded images.
        timeout (int | float): Timeout in seconds passed to ``requests.get``.

    Returns:
        PIL.Image.Image or None: The downloaded and processed image, or None if failed.
    """
    # Missing CSV cells arrive as NaN (a float); there is nothing to download.
    if not isinstance(url, str) or not url.strip():
        print(f"Skipping invalid image URL: {url!r}")
        return None

    # exist_ok avoids a race when several download threads create the
    # directory at the same moment.
    os.makedirs(cache_dir, exist_ok=True)

    # Keep the URL's image extension when it has one (it selects the format
    # used by img.save); otherwise fall back to '.jpg'.
    extension = os.path.splitext(url.split("?")[0])[1].lower()
    if extension not in ('.png', '.jpg', '.jpeg', '.bmp', '.gif'):
        extension = '.jpg'

    # Hash the full URL to get a unique, filesystem-safe cache file name.
    digest = hashlib.sha256(url.encode("utf-8")).hexdigest()
    filename = os.path.join(cache_dir, digest + extension)

    # Check if the image is already cached
    if os.path.exists(filename):
        try:
            img = Image.open(filename).convert("RGB")
            img = img.resize(size)
            return img
        except Exception as e:
            print(f"Error loading cached image {filename}: {e}")
            # If cached image is corrupted, remove it and re-download
            try:
                os.remove(filename)
            except OSError:
                # Another thread may already have removed the same file.
                pass

    # Attempt to download the image with retries
    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=timeout)
            response.raise_for_status()  # Raise an exception for HTTP errors
            img = Image.open(BytesIO(response.content)).convert("RGB")
            img = img.resize(size)
        except Exception as e:
            print(f"Attempt {attempt + 1} failed for {url}: {e}")
            # Only wait when another attempt will actually follow.
            if attempt < retries - 1:
                time.sleep(backoff * (2 ** attempt))  # Exponential backoff
            continue

        # Caching is best-effort: a failed write must not discard (or
        # re-download) an image that was fetched successfully.
        try:
            img.save(filename)
        except Exception as e:
            print(f"Could not cache {url} at {filename}: {e}")
        return img

    print(f"All attempts failed for {url}.")
    return None

def download_images(urls, max_workers=20, size=(224, 224), cache_dir=None):
    """
    Downloads images in parallel using ThreadPoolExecutor.

    Args:
        urls (list): Image URLs to fetch.
        max_workers (int): Maximum number of download threads.
        size (tuple): Target size forwarded to ``load_image_from_url``.
        cache_dir (str or None): Cache directory forwarded to
            ``load_image_from_url``. When None, the notebook-level
            ``CACHE_DIR`` setting is used.

    Returns:
        list: One entry per input URL, in the same order as ``urls``. Each
        entry is whatever ``load_image_from_url`` returned for that URL, or
        None when the download failed or the worker raised unexpectedly.
    """
    urls = list(urls)
    target_dir = CACHE_DIR if cache_dir is None else cache_dir
    images = [None] * len(urls)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Remember which position each future belongs to, because
        # as_completed yields them in completion order, not submission order.
        future_to_idx = {
            executor.submit(load_image_from_url, url, size=size, cache_dir=target_dir): idx
            for idx, url in enumerate(urls)
        }
        for future in as_completed(future_to_idx):
            idx = future_to_idx[future]
            try:
                images[idx] = future.result()
            except Exception as e:
                # Surface unexpected worker errors instead of silently
                # dropping them; the slot stays None so callers see a failure.
                print(f"Unexpected error downloading {urls[idx]}: {e}")

    return images


# **Step 3: Batch Processing**

In [30]:

# Load CLIP model and processor
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# On failure, print a short explanation and re-raise. Re-raising (instead of
# calling exit(1)) stops "Run All" at this cell with a traceback while keeping
# the kernel alive; inside an IPython kernel, exit() requests a kernel shutdown.
try:
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
    model.to(device)
    model.eval()  # inference only: switch off training-time behaviour
except Exception as e:
    print(f"Error loading CLIP model or processor: {e}")
    raise

def predict_categories_batch(images, categories, model, processor, device):
    """
    Assigns to each image the category whose text scores highest against it.

    Args:
        images: Batch of images handed to ``processor``.
        categories: Candidate labels; passed to ``processor`` as text and
            indexed by the winning column of the model's per-image logits.
        model: Callable invoked as ``model(**inputs)``; its result must expose
            ``logits_per_image``.
        processor: Callable that builds the model inputs from text and images.
        device: Device the processed inputs are moved to.

    Returns:
        list: One label per image. An empty list when ``images`` is empty.
        If any stage raises, the error is printed and a list of "Error"
        strings (one per image) is returned instead.
    """
    if not images:
        return []

    def error_labels():
        # Uniform failure result: one "Error" marker per input image.
        return ["Error"] * len(images)

    # Stage 1: tokenise the labels / preprocess the images, move to device
    try:
        encoded = processor(text=categories, images=images, return_tensors="pt", padding=True)
        encoded = encoded.to(device)
    except Exception as err:
        print(f"Error during processing inputs: {err}")
        return error_labels()

    # Stage 2: forward pass without gradient tracking
    try:
        with torch.no_grad():
            image_logits = model(**encoded).logits_per_image
    except Exception as err:
        print(f"Error during model inference: {err}")
        return error_labels()

    # Stage 3: turn logits into per-image probabilities on the CPU
    try:
        probabilities = image_logits.softmax(dim=1).cpu().numpy()
    except Exception as err:
        print(f"Error during probability calculation: {err}")
        return error_labels()

    # Stage 4: pick the most probable category for every image
    try:
        return [categories[best] for best in probabilities.argmax(axis=1)]
    except Exception as err:
        print(f"Error during category assignment: {err}")
        return error_labels()


Using device: cuda





# **Step 4: Predict Categories for All Images**

In [31]:

# BATCH_SIZE and MAX_WORKERS come from the configuration cell; they are
# deliberately not redefined here so there is a single place to tune them.

# One predicted label per DataFrame row, in row order
predicted_categories = []

# Calculate the number of batches (ceiling division)
num_batches = (len(df) + BATCH_SIZE - 1) // BATCH_SIZE

# Iterate over the DataFrame in batches
for batch_num in tqdm(range(num_batches), desc="Processing Batches"):
    start_idx = batch_num * BATCH_SIZE
    end_idx = min(start_idx + BATCH_SIZE, len(df))

    batch_urls = df['PRODUCT_MAIN_IMAGE_URL'].iloc[start_idx:end_idx].tolist()

    # Download images in parallel; failed downloads come back as None
    images = download_images(batch_urls, max_workers=MAX_WORKERS)

    # Remember the in-batch position of every image that loaded, so each
    # prediction can be written back to the row it belongs to.
    valid_indices = [idx for idx, img in enumerate(images) if img is not None]
    valid_images = [images[idx] for idx in valid_indices]

    # Start with "Error" for every row of the batch, then overwrite the
    # positions that received a prediction. This keeps labels aligned with
    # their rows even when some downloads in the batch fail.
    batch_labels = ["Error"] * len(batch_urls)
    if valid_images:
        batch_predictions = predict_categories_batch(
            valid_images, unique_categories_list, model, processor, device
        )
        for idx, pred in zip(valid_indices, batch_predictions):
            batch_labels[idx] = pred

    predicted_categories.extend(batch_labels)

# Every row must have received exactly one label
assert len(predicted_categories) == len(df)

# Assign the predictions to the DataFrame
df['predicted_category'] = predicted_categories

Processing Batches: 100%|██████████| 10/10 [00:16<00:00,  1.61s/it]


# **Step 5: Result Storage**

In [32]:

# Write the DataFrame (including the predictions) to a CSV file without the index column.
# A failure is reported on stdout and does not stop the notebook.
output_csv_path = "product_categories_predictions.csv"
try:
    df.to_csv(output_csv_path, index=False)
    print(f"Predictions saved to {output_csv_path}")
except Exception as save_error:
    print(f"Error saving predictions to CSV: {save_error}")

# Optional: print a preview of the first 150 rows of the DataFrame
print(df.iloc[:150])


Predictions saved to product_categories_predictions.csv
                                PRODUCT_MAIN_IMAGE_URL   predicted_category
0    https://content.jdmagicbox.com/quickquotes/ima...    Embroidery Fabric
1    https://content.jdmagicbox.com/quickquotes/ima...        Salwar Kameez
2    https://content.jdmagicbox.com/quickquotes/ima...   Furniture Hardware
3    https://content.jdmagicbox.com/quickquotes/ima...  Sliding Door Roller
4    https://content.jdmagicbox.com/quickquotes/ima...  Sliding Door Roller
..                                                 ...                  ...
145  https://image1.jdomni.in/product/01042022/FD/8...           Key Chains
146  https://image1.jdomni.in/product/01042022/66/B...           Key Chains
147  https://image1.jdomni.in/product/01042022/8E/8...           Key Chains
148  https://image1.jdomni.in/product/01042022/F5/5...           Key Chains
149  https://image1.jdomni.in/product/01042022/24/7...           Key Chains

[150 rows x 2 columns]
