In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import os
import time
from tqdm import tqdm
import pandas as pd
import urllib.request
from concurrent.futures import ThreadPoolExecutor, as_completed
import tempfile
import configparser
import rembg
from PIL import Image

In [2]:
# Load the folder and file locations used by the rest of the notebook from config.ini.
config = configparser.RawConfigParser()
# read() skips files it cannot open and returns the list of files it actually
# parsed, so an empty result means config.ini was not found in the working
# directory. Fail here rather than with an opaque KeyError on the lookups below.
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini could not be read from the current working directory")
img_folder = config['folder_name']['img_download']
product_table_path = config['intermediate_path']['product_table']
image_table_path = config['intermediate_path']['image_table']
# Make sure the image output folder exists before any worker tries to save into it.
os.makedirs(img_folder, exist_ok=True)

In [3]:
def open_browser(path):
    """Open ``path`` in a new Chrome session and click the "Ask Me Later" button.

    Args:
        path: URL of the page to load.

    Returns:
        The live ``webdriver.Chrome`` instance. The caller owns it and must
        call ``quit()`` when done.

    Raises:
        Exception: Whatever is raised while loading the page or locating and
            clicking the button. The browser is shut down before the error
            propagates, so a failed page does not leave a Chrome session behind.
    """
    # set up Selenium WebDriver
    driver = webdriver.Chrome()
    try:
        driver.get(path)

        # Fixed pause to give the page (and its popup) time to load
        time.sleep(5)

        driver.find_element(By.XPATH, '//button[text() ="Ask Me Later"]').click()
    except Exception:
        # No caller holds a reference to this driver yet, so it has to be
        # closed here or the browser process would be leaked.
        driver.quit()
        raise
    return driver
    
def resize_image(img, output_size=(256, 256)):
    """Resize ``img`` to ``output_size`` with the Lanczos filter and return the result.

    ``output_size`` is handed to ``img.resize`` unchanged; nothing here
    preserves the source aspect ratio.
    """
    resample_filter = Image.LANCZOS
    resized = img.resize(output_size, resample_filter)
    return resized

def process_image(driver, product_id, img_url, img_extension=['.jpeg'], folder_name=img_folder):
    """Find the product image on the open page, remove its background and save it.

    The first ``<img>`` whose ``src`` contains one of ``img_extension`` is
    downloaded, passed through ``rembg.remove``, resized with ``resize_image``
    and written to ``{folder_name}/{product_id}.png``.

    Args:
        driver: Open Selenium driver already showing the product page. It is
            always quit by this function, whether or not processing succeeds.
        product_id: Identifier used for the output file name and the result row.
        img_url: Ignored; kept for signature compatibility. The image URL is
            read from the page instead.
        img_extension: Substrings to look for in each ``<img>`` ``src``. The
            default list is never mutated.
        folder_name: Directory the PNG is written to.

    Returns:
        ``{'product_id': ..., 'image_path': ...}`` on success, or ``None`` when
        any step fails (the error is printed, not raised).
    """
    try:
        try:
            images = driver.find_elements(By.TAG_NAME, 'img')
            sources = [img.get_attribute('src') for img in images]
        finally:
            # The browser is only needed to read the page. Release it before
            # the slow download / background removal instead of holding a
            # Chrome session open for the whole duration.
            driver.quit()

        # <img> tags without a src yield None, which would break the
        # substring test, so drop empty values first.
        matches = [
            src for src in sources
            if src and any(extn in src for extn in img_extension)
        ]
        if not matches:
            raise ValueError(f'no <img> src containing any of {list(img_extension)}')
        img_url = matches[0]

        # Download into a private temp directory rather than re-opening a
        # NamedTemporaryFile by name while it is still open, which is not
        # possible on Windows.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = os.path.join(temp_dir, 'download.png')
            urllib.request.urlretrieve(img_url, temp_path) # download temp file

            # Context manager closes the file handle before the directory is removed.
            with Image.open(temp_path) as img:
                rm_back = rembg.remove(img) # Remove background

                if rm_back.mode == 'RGBA':
                    rm_back = rm_back.convert('RGB') # Convert RGBA -> RGB

                output = resize_image(rm_back)
                image_path = f'{folder_name}/{product_id}.png'
                output.save(image_path, 'PNG')

        return {'product_id': product_id, 'image_path': image_path}

    except Exception as e:
        # Best-effort by design: report the failure and let the caller skip this product.
        print(f'Error processing {product_id}: {e}')
        return None

def process_wrapper(id, path):
    """Open ``path`` in a browser and pass the session to ``process_image``.

    Returns whatever ``process_image`` returns for product ``id``. Any
    exception raised while opening the browser or processing propagates to
    the caller.
    """
    browser = open_browser(path=path)
    return process_image(
        browser,
        id,
        path,
        img_extension=['.jpeg'],
        folder_name=img_folder,
    )

def save_img(prod_dict, max_workar=4):
    """Process every product in ``prod_dict`` in a thread pool and tabulate the results.

    Args:
        prod_dict: Mapping of product id -> product page URL. Each pair is
            passed to ``process_wrapper``.
        max_workar: Number of worker threads (spelling kept so existing
            keyword calls continue to work).

    Returns:
        A ``pandas.DataFrame`` built from the truthy rows returned by
        ``process_wrapper``, in the same order as ``prod_dict``. Products whose
        worker raised or returned nothing are left out.
    """
    rows_by_id = {}

    with ThreadPoolExecutor(max_workers=max_workar) as executor:
        futures = {
            executor.submit(process_wrapper, prod_id, url): prod_id
            for prod_id, url in prod_dict.items()
        }

        # Iterate in completion order so the progress bar advances as soon as
        # any worker finishes.
        for future in tqdm(as_completed(futures), total=len(futures), colour='red'):
            prod_id = futures[future]
            try:
                row = future.result()
            except Exception as e:
                # One failing product must not discard the results of the
                # whole batch; report it and carry on.
                print(f'Error processing {prod_id}: {e}')
                continue
            if row:
                rows_by_id[prod_id] = row

    # Re-establish input order, which completion-order iteration does not guarantee.
    rows = [rows_by_id[prod_id] for prod_id in prod_dict if prod_id in rows_by_id]
    return pd.DataFrame(rows)

In [4]:
# Read the product table and derive a {product_id: product_url} lookup from it.
prod_href = pd.read_csv(product_table_path)
prod_id_index = (
    prod_href.loc[:, ['product_id', 'product_url']]
    .set_index('product_id')
    .to_dict(orient='index')
)
# Each inner dict holds the single remaining column, so its first value is the URL.
prod_id_href_dict = {
    prod_id: next(iter(columns.values()))
    for prod_id, columns in prod_id_index.items()
}

In [None]:
# Full run: process every product in the lookup using two worker threads.
# The recorded progress bar below shows roughly 106 s for the first item.
img_path_df = save_img(prod_id_href_dict, max_workar=2)

  0%|[31m          [0m| 1/6036 [01:46<178:04:30, 106.23s/it]


In [None]:
# Write the image table to the CSV path read from config.ini, without the index column.
img_path_df.to_csv(image_table_path, index=False)

In [5]:
# Keep only the first two (product_id, url) pairs as a small trial batch.
k = list(prod_id_href_dict.items())[:2]
dic = dict(k)
dic

{'mp000000022960625': 'https://www.tatacliq.com/american-eagle-burgundy-cotton-regular-fit-t-shirt/p-mp000000022960625',
 'mp000000024030162': 'https://www.tatacliq.com/sf-jeans-by-pantaloons-apple-green-cotton-slim-fit-t-shirts/p-mp000000024030162'}

In [6]:
# Trial run on the two-product subset `dic` (default worker count).
# NOTE(review): this rebinds `img_path_df`, replacing the result of the full
# run; re-running the to_csv cell afterwards would write only these rows —
# confirm that is intended.
img_path_df = save_img(dic)

100%|[31m██████████[0m| 2/2 [01:43<00:00, 51.75s/it] 
