In [12]:
import json
import torch
import torchvision.transforms as transforms
from PIL import Image, UnidentifiedImageError
from torchvision import models
from scipy.spatial import distance
import requests
from io import BytesIO
from tqdm import tqdm
import pandas as pd

In [2]:
# Load the scraped catalogue: material name -> product URL -> list of
# subproduct dicts. The encoding is pinned because open() otherwise falls
# back to the platform's locale encoding, which is not UTF-8 everywhere.
with open("materiais_scrapped_data_v2.json", 'r', encoding='utf-8') as file:
    materiais_data = json.load(file)

In [3]:
# Echo the loaded dict to inspect its layout: each material maps product URLs
# to a list of {'subproduct_name', 'subproduct_img_url'} entries (possibly empty).
# NOTE(review): this displays the entire scrape, not a sample.
materiais_data

{'tecido cetim azul escuro': {'pt.aliexpress.com/item/3256807143248027.html': [],
  'pt.aliexpress.com/item/3256806983023750.html': [{'subproduct_name': 'Hon-white',
    'subproduct_img_url': 'https://ae-pic-a1.aliexpress-media.com/kf/S7690778f73954624a6c5bbd0e12153b59.jpg'},
   {'subproduct_name': 'Purple',
    'subproduct_img_url': 'https://ae-pic-a1.aliexpress-media.com/kf/Sa8b87f3699ab479195c2ea5a4dc5cd76x.jpg'},
   {'subproduct_name': 'Light blue',
    'subproduct_img_url': 'https://ae-pic-a1.aliexpress-media.com/kf/S4b37e7439a0d4c0ab8d666af36ef7705w.jpg'},
   {'subproduct_name': 'Deep purple',
    'subproduct_img_url': 'https://ae-pic-a1.aliexpress-media.com/kf/Sd378b1b90e604ca29b114de626a2a1c8E.jpg'},
   {'subproduct_name': 'Klein Blue',
    'subproduct_img_url': 'https://ae-pic-a1.aliexpress-media.com/kf/Sdff26fa0a0d84cf4bd2fde54e614fdf5U.jpg'},
   {'subproduct_name': 'green',
    'subproduct_img_url': 'https://ae-pic-a1.aliexpress-media.com/kf/S3d899f6ff1564d518f74afb06baf09bf

In [4]:
# Reference photo that every scraped subproduct image is compared against.
main_pic_path = '../imgs/isa-azul.jpeg'
# Force three RGB channels: the preprocessing pipeline normalises with three
# per-channel means/stds, so a greyscale or CMYK JPEG would fail there.
main_img = Image.open(main_pic_path).convert('RGB')

In [5]:
# Define the transformation to preprocess the images for the ResNet model
transform = transforms.Compose([
    transforms.Resize((224, 224)),  # ResNet expects 224x224 input images
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),  # ResNet normalization
])

# Load a pre-trained ResNet-50. The `pretrained=True` flag is deprecated in
# torchvision; IMAGENET1K_V1 is the weight set that flag used to select.
model = models.resnet50(weights=models.ResNet50_Weights.IMAGENET1K_V1)
model.eval()  # Set the model to evaluation mode




ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [6]:
def get_image_from_url(url, timeout=10):
    """Download an image and return it as a PIL Image.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` can block indefinitely on a stalled
            connection.

    Returns:
        The opened PIL Image, or None when the request fails or the payload
        is not a recognisable image (a message is printed in both cases).
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Turn HTTP error statuses into exceptions
        return Image.open(BytesIO(response.content))
    except requests.exceptions.RequestException as e:
        # Covers connection errors, HTTP error statuses and timeouts alike.
        print(f"Failed to fetch image from {url}. Error: {e}")
    except UnidentifiedImageError:
        print(f"Could not identify image file from {url}.")
    return None

def extract_features(image, model):
    """Run an image through the model and return its output as a 1-D array.

    Args:
        image: PIL Image to process.
        model: Callable network applied to the preprocessed single-image batch.

    Returns:
        Flattened numpy array holding the model output for this image.
    """
    # Palette, greyscale and RGBA images would not match the three-channel
    # normalisation in `transform`; convert them first.
    if image.mode != "RGB":
        image = image.convert("RGB")
    batch = transform(image).unsqueeze(0)  # Add batch dimension
    # NOTE(review): `model` is called as-is, so with the stock ResNet-50 loaded
    # in this notebook these are the final-layer outputs rather than pooled
    # embeddings -- confirm that is intended.
    with torch.no_grad():  # Disable gradients for faster computation
        features = model(batch)
    return features.flatten().numpy()

def calculate_similarity(main_img_features, compare_img_features):
    """Return the cosine similarity of two feature vectors.

    SciPy exposes cosine *distance*; similarity is its complement.
    """
    cosine_dist = distance.cosine(main_img_features, compare_img_features)
    return 1 - cosine_dist

def compare_images(main_image, url_to_compare: str, main_image_features=None):
    """Score how similar a remote image is to the main image.

    Args:
        main_image: PIL Image used as the reference.
        url_to_compare: URL of the image to compare against it.
        main_image_features: Optional precomputed features of ``main_image``.
            Pass them when comparing many URLs so the reference image is not
            pushed through the network again on every call.

    Returns:
        Cosine similarity between the two images' features, or NaN when the
        remote image could not be downloaded or identified.
    """
    compare_image = get_image_from_url(url_to_compare)
    if compare_image is None:
        # The download helper already printed the failure; NaN keeps the
        # return value numeric instead of crashing in extract_features.
        return float("nan")

    if main_image_features is None:
        main_image_features = extract_features(main_image, model)
    compare_image_features = extract_features(compare_image, model)
    return calculate_similarity(main_image_features, compare_image_features)


In [7]:
# Embed the reference image once; it is the same for every comparison, so
# recomputing it per URL only doubles the model work.
main_img_features = extract_features(main_img, model)

for m_name, m_data in tqdm(materiais_data.items()):
    for product_url, product_data in m_data.items():
        # Products with no subproducts have an empty list and are skipped naturally.
        for subproducts_data in product_data:
            compare_img = get_image_from_url(subproducts_data['subproduct_img_url'])
            if compare_img is None:
                # Download or identification failed (already printed); record
                # NaN so one bad URL does not abort the whole run.
                subproducts_data['similarity'] = float('nan')
                continue
            subproducts_data['similarity'] = calculate_similarity(
                main_img_features, extract_features(compare_img, model)
            )

100%|██████████| 8/8 [16:13<00:00, 121.64s/it]


In [9]:
# Flatten the nested material -> product -> subproduct mapping into one row
# dict per subproduct, ready to be turned into a DataFrame.
data_df = [
    {
        'material': material_name,
        'produto_link': link,
        'subproduct_img_url': entry['subproduct_img_url'],
        'subproduct_similarity': entry['similarity'],
    }
    for material_name, products in tqdm(materiais_data.items())
    for link, entries in products.items()
    for entry in entries
]

100%|██████████| 8/8 [00:00<00:00, 3589.86it/s]


In [14]:
# Build the DataFrame from the list of row dicts. Note that this rebinds
# `data_df` from a list to a DataFrame.
data_df = pd.DataFrame(data_df)

In [17]:
# Row label of the highest-similarity subproduct within each material,
# then the full rows for those labels.
best_match_idx = data_df.groupby('material')['subproduct_similarity'].idxmax()
data_df.loc[best_match_idx]

Unnamed: 0,material,produto_link,subproduct_img_url,subproduct_similarity
832,fita de strass,pt.aliexpress.com/item/3256806062707856.html,https://ae-pic-a1.aliexpress-media.com/kf/Sd64...,0.615595
1062,forro azul claro,pt.aliexpress.com/item/3256805140400205.html,https://ae-pic-a1.aliexpress-media.com/kf/Se5d...,0.666554
906,renda preta,pt.aliexpress.com/item/3256807280247268.html,https://ae-pic-a1.aliexpress-media.com/kf/Saee...,0.696547
119,tecido cetim azul escuro,pt.aliexpress.com/item/3256807517459320.html,https://ae-pic-a1.aliexpress-media.com/kf/S4df...,0.705826
203,tecido tule preto,pt.aliexpress.com/item/3256806467317182.html,https://ae-pic-a1.aliexpress-media.com/kf/Se57...,0.710536
427,tecido veludo azul,pt.aliexpress.com/item/3256801657548334.html,https://ae-pic-a1.aliexpress-media.com/kf/H9c4...,0.661418


In [21]:
# Keep the five most similar subproducts per material and save them.
# Sorting and then taking each group's head replaces groupby().apply(nlargest),
# which pandas deprecates when the applied function also receives the
# grouping column.
(
    data_df
    .sort_values(['material', 'subproduct_similarity'], ascending=[True, False])
    .groupby('material')
    .head(5)
    .reset_index(drop=True)
    .to_csv('top_5_por_material.csv', index=False)
)


  data_df.groupby('material').apply(lambda x: x.nlargest(5, 'subproduct_similarity')).reset_index(drop=True).to_csv('top_5_por_material.csv', index = False)
