In [9]:
from PIL import Image
import requests

from transformers import CLIPProcessor, CLIPModel

checkpoint = "patrickjohncyh/fashion-clip"

model = CLIPModel.from_pretrained(checkpoint)
processor = CLIPProcessor.from_pretrained(checkpoint)

url = "https://d1pdzcnm6xgxlz.cloudfront.net/tops/8905074863310-18.jpg"
image = Image.open(requests.get(url, stream=True).raw)

image2 = Image.open(requests.get("https://www.kreeva.com/blog/wp-content/uploads/2021/10/stylish-dresses-for-Diwali.jpg", stream=True).raw)

image3 = Image.open("top_wear.png")

inputs = processor(text=["a blue shirt", "a blue shirt with white strips" , "Diwali clothes" , "Pink chunri diwali lehenga"], images=[image ,image2 , image3], return_tensors="pt", padding=True)

outputs = model(**inputs)
logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
print(probs)

tensor([[5.1264e-03, 9.9487e-01, 1.0512e-07, 3.0413e-14],
        [3.4284e-12, 5.5845e-12, 9.5987e-03, 9.9040e-01],
        [9.4266e-03, 9.9057e-01, 6.0318e-09, 4.0575e-15]],
       grad_fn=<SoftmaxBackward0>)


In [20]:
from PIL import Image
import requests

from transformers import CLIPProcessor, CLIPModel

checkpoint = "patrickjohncyh/fashion-clip"

model = CLIPModel.from_pretrained(checkpoint)
processor = CLIPProcessor.from_pretrained(checkpoint)

def getImageEmbeddingsFromPath(image_path):
	image = Image.open(image_path)
	inputs = processor(text=["dummy"] , images=image, return_tensors="pt", padding=True)
	outputs = model(**inputs , return_dict=True)
	return outputs["image_embeds"]

def getImageEmbeddings(image):
	inputs = processor(text=["dummy"] , images=image, return_tensors="pt", padding=True)
	outputs = model(**inputs , return_dict=True)
	return outputs["image_embeds"]


In [29]:
from transformers import SegformerImageProcessor, AutoModelForSemanticSegmentation , AutoFeatureExtractor
from PIL import Image
import requests
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from typing import Dict, List, Any
import numpy as np
import cv2

seg_processor = SegformerImageProcessor.from_pretrained("mattmdjaga/segformer_b2_clothes")
seg_model = AutoModelForSemanticSegmentation.from_pretrained("mattmdjaga/segformer_b2_clothes")

def apply_mask(image, mask):
	# Convert to numpy arrays
	image = np.array(image)
	mask = np.array(mask)
	# Convert grayscale image to RGB
	mask = np.stack((mask,)*3, axis=-1)
	# Multiply arrays
	resultant = image*mask
	print(resultant.shape)
	resultant[mask == 0] = 255

	return resultant

def write_image(image, path):
	image = Image.fromarray(np.uint8(image))
	image.save(path)

def crop_image(image):
	#convert to grayscale
	temp = image[:, :, ::-1].copy() 
	temp = temp.astype('uint8')
	gray = cv2.cvtColor(temp, cv2.COLOR_BGR2GRAY)
	#threshold the image
	thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
	#find contours
	contours = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)[0]
	#find bounding box coordinates
	contours = sorted(contours, key=lambda x: cv2.contourArea(x), reverse=True)
	x,y,w,h = cv2.boundingRect(contours[0])
	#crop image
	crop = image[y:y+h, x:x+w]
	return crop

def segment(image , to_mask):
	inputs = seg_processor(images=image, return_tensors="pt")

	outputs = seg_model(**inputs)

	logits = outputs.logits.cpu()

	upsampled_logits = nn.functional.interpolate(
		logits,
		size=image.size[::-1],
		mode="bilinear",
		align_corners=False,
	)

	pred_seg = upsampled_logits.argmax(dim=1)[0]


	#write a function to create mask for different classes and then apply the mask to the image

	result = []
	
	for i in to_mask:
		mask = pred_seg.numpy().copy()
		mask[mask != i] = 0
		mask[mask == i] = 1

		item = apply_mask(image, mask)
		write_image(item, f"images/{i}.png")
		result.append(item)

	return result
	
	



Labels: 0: "Background", 1: "Hat", 2: "Hair", 3: "Sunglasses", 4: "Upper-clothes", 5: "Skirt", 6: "Pants", 7: "Dress", 8: "Belt", 9: "Left-shoe", 10: "Right-shoe", 11: "Face", 12: "Left-leg", 13: "Right-leg", 14: "Left-arm", 15: "Right-arm", 16: "Bag", 17: "Scarf"

In [25]:
def segmentAndEmbedd(image_path , to_mask):
	result = {}
	image = Image.open(image_path)


	# Get full image embedding

	fullImageEmbedding = getImageEmbeddings(image)
	result['fullImageEmbedding'] = fullImageEmbedding

	# Get image segments

	segements = segment(image , to_mask)

	# Get segment embeddings

	for i in range(len(to_mask)):
		segmentEmbedding = getImageEmbeddings(segements[i])
		result[i] = segmentEmbedding

	return result

	

In [31]:
print(segmentAndEmbedd("full_test.jpg" , [6 , 4 , 8 , 9 , 10 , 3]))

(1134, 638, 3)
(1134, 638, 3)
(1134, 638, 3)
(1134, 638, 3)
(1134, 638, 3)
(1134, 638, 3)
{'fullImageEmbedding': tensor([[-3.7344e-02,  2.0649e-01, -5.0522e-02,  9.0021e-03,  1.9690e-02,
         -6.6172e-04, -1.5248e-02,  7.7745e-03,  1.6250e-02, -1.2663e-02,
          8.9793e-03, -2.5270e-02, -1.1402e-03,  2.8586e-02, -2.8763e-02,
         -2.6833e-02,  4.8742e-03,  3.2035e-02, -8.3520e-03, -1.2855e-02,
         -1.0224e-02,  2.5459e-03, -4.8898e-03, -1.9015e-02, -3.1299e-03,
          7.2104e-03,  3.9921e-02, -2.3996e-02, -6.2076e-03, -9.8668e-03,
         -3.2812e-02,  3.5751e-02,  3.9470e-02, -7.2819e-04, -3.0541e-03,
         -1.3941e-02, -2.2981e-02, -2.4436e-02, -9.9496e-03,  3.9844e-03,
          5.7584e-02,  2.6625e-03,  7.1901e-02,  9.1389e-03, -8.8060e-03,
         -1.3822e-02, -1.4319e-02,  8.8376e-03,  3.9227e-02, -1.3542e-02,
          3.5471e-02,  9.2467e-02, -1.3736e-02,  6.3213e-04, -7.2210e-02,
          3.3231e-02, -5.7419e-03, -2.9399e-02,  4.2010e-02, -4.7042e-02,

# Function get text embedding

In [39]:
from PIL import Image
import requests

from transformers import CLIPProcessor, CLIPModel

checkpoint = "patrickjohncyh/fashion-clip"

model = CLIPModel.from_pretrained(checkpoint)
processor = CLIPProcessor.from_pretrained(checkpoint)


def getTextEmbeddings(text):
	inputs = processor(text=text , images=Image.new('RGB' , (72 , 72)), return_tensors="pt", padding=True)
	outputs = model(**inputs , return_dict=True)
	return outputs["text_embeds"]

getTextEmbeddings("Formal outfit with pink shirt")

tensor([[-3.0969e-02,  3.2595e-02,  2.5459e-02, -2.3454e-03,  2.7888e-02,
         -8.0815e-03,  4.3273e-03,  2.2296e-02, -1.4490e-02, -9.3051e-03,
         -1.6037e-02, -2.1318e-02,  1.1411e-02,  2.5484e-02, -5.9731e-02,
         -3.1086e-02, -1.0816e-01,  6.3198e-02, -2.1068e-02,  6.6639e-02,
         -3.4676e-02,  1.6390e-02, -4.0184e-03,  7.8004e-03,  5.7912e-03,
          3.3051e-02,  1.1349e-02, -2.8740e-02,  9.1858e-02, -2.7375e-03,
         -2.2224e-02, -1.0884e-02,  5.3596e-03, -1.3473e-02,  1.1667e-02,
         -3.4577e-03,  3.1523e-02, -2.3184e-02, -2.0383e-02, -6.5440e-03,
          2.2344e-02, -3.5566e-02,  5.9986e-02, -1.4312e-02, -1.5235e-02,
          6.7203e-02, -7.9798e-02, -1.0125e-02,  9.4251e-03,  2.5425e-02,
         -3.5615e-02, -1.6142e-02,  1.3827e-02,  3.7496e-02, -5.6269e-02,
          7.4935e-02, -2.9099e-02, -2.8049e-03, -6.1234e-03, -5.7611e-02,
          5.1486e-02,  2.7654e-02,  2.3166e-03, -2.2454e-03,  7.5747e-03,
         -1.3407e-02,  1.8393e-02,  1.