In [None]:
!pip install torch torchvision transformers pandas requests Pillow tqdm


In [None]:
from IPython.display import HTML, clear_output
from subprocess import getoutput

# Report which of the known GPU models appears in the `nvidia-smi` output.
# The names are checked in this order and the first match wins; if none of
# them is present the banner reads 'DONT PROCEED'.
s = getoutput('nvidia-smi')
gpu = next((name for name in ('K80', 'T4', 'P100') if name in s), 'DONT PROCEED')
display(HTML(f"<h1>{gpu}</h1>"))

## Import Necessary Libraries

In [None]:
import hashlib
import os
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from io import BytesIO

import pandas as pd
import requests
import torch
from PIL import Image
from tqdm import tqdm
from transformers import CLIPProcessor, CLIPModel
torch.cuda.empty_cache()

# **Configuration Parameters**

In [None]:

# Paths and tuning knobs used by the rest of the notebook.
CSV_FILE_PATH = '/content/Final_dataset.csv'          # products to classify
CATEGORY_CSV_PATH = '/content/product_category.csv'   # candidate category labels
BATCH_SIZE = 100   # NOTE: the prediction cell (Step 4) reassigns BATCH_SIZE
MAX_WORKERS = 20   # download threads per batch
CACHE_DIR = 'image_cache'


# Build the list of candidate labels that are later passed as text prompts.
data = pd.read_csv(CATEGORY_CSV_PATH)
df = pd.DataFrame(data)
# Drop missing values and coerce to str: `unique()` alone would keep NaN
# (a float), and the labels are handed to the text processor as strings.
unique_categories = df['Product_category'].dropna().astype(str).unique()
unique_categories_list = unique_categories.tolist()

if not unique_categories_list:
    raise ValueError(
        f"No usable values found in column 'Product_category' of {CATEGORY_CSV_PATH}."
    )



# **Step 1: Data Management**

In [None]:

# Load the product table and verify that it has the image-URL column.
# On failure the message is printed and the exception is re-raised, so that
# "Run All" stops here with a traceback. Without that, `df` would still hold
# the category table from the configuration cell and later cells would run
# against the wrong frame. (The previous `exit(1)` is not a reliable way to
# abort inside a notebook kernel; its effect depends on the front end.)
try:
    data = pd.read_csv(CSV_FILE_PATH)
    df = pd.DataFrame(data)
    required_columns = ['PRODUCT_MAIN_IMAGE_URL']
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"The CSV file must contain a column named '{col}'.")

    print(f"Successfully loaded {len(df)} image URLs from the CSV file.")
except FileNotFoundError:
    print(f"CSV file not found at path: {CSV_FILE_PATH}")
    raise
except pd.errors.EmptyDataError:
    print(f"CSV file at {CSV_FILE_PATH} is empty.")
    raise
except Exception as e:
    print(f"An error occurred while reading the CSV file: {e}")
    raise



# **Step 2: Image Loading**

In [None]:

def load_image_from_url(url, size=(224, 224), retries=3, backoff=5, cache_dir='image_cache'):
    """Return the image at ``url`` as an RGB PIL image resized to ``size``.

    Results are cached on disk under ``cache_dir``. The cache file name is
    derived from a SHA-256 hash of the full URL, so two different URLs that
    happen to share a basename (``.../a/main.jpg`` vs ``.../b/main.jpg``)
    never overwrite or shadow each other.

    Args:
        url: Image URL. Anything that is not a non-empty string (for example
            the NaN pandas produces for a missing cell) is rejected.
        size: Target size passed to ``Image.resize``.
        retries: Maximum number of download attempts.
        backoff: Base delay in seconds; the wait after failed attempt ``k``
            (0-based) is ``backoff * 2**k``. No wait follows the last attempt.
        cache_dir: Directory for cached images; created if it does not exist.

    Returns:
        The resized RGB image, or ``None`` if the URL is invalid or every
        download attempt failed.
    """
    # Reject missing / non-string URLs up front instead of letting
    # os.path / requests raise inside a worker thread.
    if not isinstance(url, str) or not url.strip():
        print(f"Skipping invalid image URL: {url!r}")
        return None

    # exist_ok avoids a FileExistsError when several threads get here at once.
    os.makedirs(cache_dir, exist_ok=True)

    # Keep the URL's extension when it is a known image type (ignoring any
    # query string or fragment); otherwise fall back to '.jpg'.
    known_exts = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')
    url_path = url.split("?", 1)[0].split("#", 1)[0]
    ext = os.path.splitext(url_path)[1].lower()
    if ext not in known_exts:
        ext = '.jpg'
    digest = hashlib.sha256(url.encode("utf-8")).hexdigest()
    filename = os.path.join(cache_dir, f"{digest}{ext}")

    if os.path.exists(filename):
        try:
            # The context manager closes the underlying file handle;
            # convert() returns an independent in-memory copy.
            with Image.open(filename) as cached:
                return cached.convert("RGB").resize(size)
        except Exception as e:
            print(f"Error loading cached image {filename}: {e}")
            try:
                os.remove(filename)
            except OSError:
                # Another thread may already have removed the bad file.
                pass

    for attempt in range(retries):
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
            with Image.open(BytesIO(response.content)) as raw:
                img = raw.convert("RGB").resize(size)
        except Exception as e:
            print(f"Attempt {attempt + 1} failed for {url}: {e}")
            # Only back off when another attempt will actually follow.
            if attempt < retries - 1:
                time.sleep(backoff * (2 ** attempt))
            continue

        # A cache-write failure must not discard a successfully fetched image.
        try:
            img.save(filename)
        except Exception as e:
            print(f"Could not cache image {filename}: {e}")
        return img

    print(f"All attempts failed for {url}.")
    return None

def download_images(urls, max_workers=20):
    """Download images in parallel using a ThreadPoolExecutor.

    Args:
        urls: Iterable of image URLs.
        max_workers: Maximum number of worker threads.

    Returns:
        A list with one entry per URL, in the same order as ``urls``. Each
        entry is whatever ``load_image_from_url`` returned for that URL, or
        ``None`` if the worker raised an unexpected exception.
    """
    urls = list(urls)
    images = [None] * len(urls)
    if not urls:
        return images

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to its position so results keep input order
        # regardless of completion order.
        future_to_idx = {
            executor.submit(load_image_from_url, url, size=(224, 224), cache_dir=CACHE_DIR): idx
            for idx, url in enumerate(urls)
        }
        for future in as_completed(future_to_idx):
            idx = future_to_idx[future]
            try:
                images[idx] = future.result()
            except Exception as e:
                # Surface worker failures instead of silently dropping them;
                # the slot stays None so the caller treats it as a failed image.
                print(f"Unexpected error while downloading {urls[idx]}: {e}")

    return images


# **Step 3: Batch Processing**

In [None]:

# Run on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

# Load the CLIP checkpoint and its matching processor, move the model to the
# selected device and switch it to evaluation mode.
try:
    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch16")
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
    model.to(device)
    model.eval()
except Exception as e:
    print(f"Error loading CLIP model or processor: {e}")
    # Re-raise so the notebook run stops here with a traceback rather than
    # continuing without `model` / `processor`.
    raise

def predict_categories_batch(images, categories, model, processor, device):
    if not images:
        return []


    try:
        inputs = processor(text=categories, images=images, return_tensors="pt", padding=True).to(device)
    except Exception as e:
        print(f"Error during processing inputs: {e}")
        return ["Error"] * len(images)


    try:
        with torch.no_grad():
            outputs = model(**inputs)
            logits_per_image = outputs.logits_per_image
    except Exception as e:
        print(f"Error during model inference: {e}")
        return ["Error"] * len(images)


    try:
        probs = logits_per_image.softmax(dim=1).cpu().numpy()
    except Exception as e:
        print(f"Error during probability calculation: {e}")
        return ["Error"] * len(images)


    try:
        predicted_indices = probs.argmax(axis=1)
        predicted_categories = [categories[idx] for idx in predicted_indices]
    except Exception as e:
        print(f"Error during category assignment: {e}")
        predicted_categories = ["Error"] * len(images)

    return predicted_categories

Using device: cpu



# **Step 4: Predict Categories for All Images**

In [None]:

# Classify every row of `df` in batches and store the label in a new column.
BATCH_SIZE = 35
predicted_categories = []
num_batches = (len(df) + BATCH_SIZE - 1) // BATCH_SIZE  # ceiling division


for batch_num in tqdm(range(num_batches), desc="Processing Batches"):
    start_idx = batch_num * BATCH_SIZE
    end_idx = min(start_idx + BATCH_SIZE, len(df))
    batch_urls = df['PRODUCT_MAIN_IMAGE_URL'].iloc[start_idx:end_idx].tolist()

    images = download_images(batch_urls, max_workers=MAX_WORKERS)

    # Positions (within this batch) of the images that were fetched successfully.
    valid_indices = [idx for idx, img in enumerate(images) if img is not None]
    valid_images = [images[idx] for idx in valid_indices]

    # Start with "Error" for every row of the batch and overwrite only the
    # positions that produced a prediction. Writing by position keeps each
    # prediction on its own row; appending the failures first and the
    # predictions afterwards would shift labels onto the wrong products
    # whenever a download in the middle of a batch fails.
    batch_results = ["Error"] * len(batch_urls)
    if valid_images:
        batch_predictions = predict_categories_batch(
            valid_images, unique_categories_list, model, processor, device
        )
        for idx, pred in zip(valid_indices, batch_predictions):
            batch_results[idx] = pred

    predicted_categories.extend(batch_results)

df['predicted_category'] = predicted_categories[:len(df)]

# **Step 5: Result Storage**

In [None]:

# Write the frame (including the prediction column) to CSV without the index,
# report the outcome, then print the first 150 rows.
output_csv_path = "product_categories_predictions.csv"
try:
    df.to_csv(output_csv_path, index=False)
except Exception as e:
    print(f"Error saving predictions to CSV: {e}")
else:
    print(f"Predictions saved to {output_csv_path}")

print(df.head(150))
