In [1]:
!pip install -U fashion-clip

Collecting fashion-clip
  Downloading fashion_clip-0.2.2-py3-none-any.whl (15 kB)
Collecting annoy>=1.17.0
  Using cached annoy-1.17.3.tar.gz (647 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Collecting pyarrow>=7.0.0
  Downloading pyarrow-14.0.1-cp310-cp310-win_amd64.whl (24.6 MB)
     ---------------------------------------- 24.6/24.6 MB 3.0 MB/s eta 0:00:00
Collecting matplotlib>=3.5.1
  Downloading matplotlib-3.8.1-cp310-cp310-win_amd64.whl (7.6 MB)
     ---------------------------------------- 7.6/7.6 MB 3.7 MB/s eta 0:00:00
Collecting transformers>=4.26.1
  Using cached transformers-4.35.0-py3-none-any.whl (7.9 MB)
Collecting validators
  Using cached validators-0.22.0-py3-none-any.whl (26 kB)
Collecting boto3>=1.10.50
  Downloading boto3-1.28.85-py3-none-any.whl (135 kB)
     -------------------------------------- 135.8/135.8 kB 4.1 MB/s eta 0:00:00
Collecting torch>=1.11.0
  Using cached torch-2.1.0-cp310-cp310-win_am


[notice] A new release of pip available: 22.3.1 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import sys
#sys.path.append("fashion-clip/")
from fashion_clip.fashion_clip import FashionCLIP
import pandas as pd
import numpy as np
from collections import Counter
from PIL import Image
import numpy as np
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import *
from sklearn.linear_model import LogisticRegression

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Build the FashionCLIP wrapper from the "fashion-clip" identifier; later cells
# use `fclip` to encode both images and text.
fclip = FashionCLIP("fashion-clip")

In [None]:
# Load the article catalogue.
articles = pd.read_csv("data_for_fashion_clip/articles.csv")

# Keep a single row per distinct description (work on a copy of the frame).
subset = articles.drop_duplicates("detail_desc").copy()

# Remove items whose product group is "Unknown".
subset = subset[subset["product_group_name"] != "Unknown"]

# FashionCLIP has a limit of 77 tokens. As a conservative proxy we count
# whitespace-separated words and keep descriptions with 5 to 39 of them
# (this also drops very short or missing descriptions).
word_counts = subset["detail_desc"].map(lambda desc: len(str(desc).split()))
subset = subset[(word_counts > 4) & (word_counts < 40)]

# Keep only product types that still occur more than 10 times in this subset.
type_counts = Counter(subset["product_type_name"].tolist())
most_frequent_product_types = [name for name, count in type_counts.items() if count > 10]
subset = subset[subset["product_type_name"].isin(most_frequent_product_types)]

# The frame has many columns; the following cells only use the descriptions
# and a couple of others.
subset.head(3)

In [None]:
# Write the filtered subset to "subset_data.csv" without the index column.
subset.to_csv("subset_data.csv", index=False)
# Last expression of the cell: shown as the cell output, reporting the row count.
f"There are {len(subset)} elements in the dataset"

In [None]:
# Image paths and descriptions are built from the same rows in the same order,
# so position i in `images` and in `texts` refers to the same article.
article_ids = subset["article_id"].tolist()
images = ["data_for_fashion_clip/" + str(article_id) + ".jpg" for article_id in article_ids]
texts = subset["detail_desc"].tolist()

# Encode both modalities with FashionCLIP.
image_embeddings = fclip.encode_images(images, batch_size=32)
text_embeddings = fclip.encode_text(texts, batch_size=32)

# Scale every embedding to unit L2 norm so that a plain dot product between
# two embeddings equals their cosine similarity.
image_norms = np.linalg.norm(image_embeddings, ord=2, axis=-1, keepdims=True)
text_norms = np.linalg.norm(text_embeddings, ord=2, axis=-1, keepdims=True)
image_embeddings = image_embeddings / image_norms
text_embeddings = text_embeddings / text_norms

In [None]:
# Count the descriptions whose own image (same position in both embedding
# arrays) is among the five images most similar to that description.
# This scores one description at a time; it could be batched to run faster.
precision = sum(
    1
    for row, text_vector in enumerate(text_embeddings)
    # argsort is ascending, so the last five indices are the five best matches
    if row in np.argsort(np.dot(text_vector, image_embeddings.T))[-5:]
)

# Fraction of descriptions that retrieved their own image in the top five.
round(precision / len(text_embeddings), 2)

In [None]:
# Embed a free-text query and take the single resulting vector.
text_embedding = fclip.encode_text(["a pair of pink shorts"], batch_size=32)[0]

# Score the query against every image and pick the best-scoring position.
# The query vector is not normalized here; the argmax does not depend on its scale.
similarities = np.dot(text_embedding, image_embeddings.T)
id_of_matched_object = similarities.argmax()
found_object = subset["article_id"].iloc[id_of_matched_object].tolist()

# Display the matched image at a fixed height, keeping its aspect ratio.
fixed_height = 224

image = Image.open(f"data_for_fashion_clip/{found_object}.jpg")
original_width, original_height = image.size
height_percent = fixed_height / float(original_height)
width_size = int(float(original_width) * float(height_percent))
image = image.resize((width_size, fixed_height), Image.NEAREST)

image

In [None]:
# Embed a second free-text query and take the single resulting vector.
text_embedding = fclip.encode_text(["a house"], batch_size=32)[0]

# Score the query against every image and pick the best-scoring position.
# argmax always returns some image, whatever the query is.
similarities = np.dot(text_embedding, image_embeddings.T)
id_of_matched_object = similarities.argmax()
found_object = subset["article_id"].iloc[id_of_matched_object].tolist()

# Display the matched image at a fixed height, keeping its aspect ratio.
fixed_height = 224

image = Image.open(f"data_for_fashion_clip/{found_object}.jpg")
original_width, original_height = image.size
height_percent = fixed_height / float(original_height)
width_size = int(float(original_width) * float(height_percent))
image = image.resize((width_size, fixed_height), Image.NEAREST)

image

In [None]:
# Distinct product type names, in order of first appearance; these are the
# candidate classes for zero-shot classification.
labels = pd.unique(subset["product_type_name"])
print(f"These are our labels: {labels}")

In [None]:
# Turn every label into a short caption-style prompt before encoding it.
labels_prompt = [f"a photo of a {k}" for k in labels]

# Encode the prompts and scale each embedding to unit L2 norm, matching the
# normalization applied to the image embeddings.
label_embeddings = fclip.encode_text(labels_prompt, batch_size=32)
label_norms = np.linalg.norm(label_embeddings, ord=2, axis=-1, keepdims=True)
label_embeddings = label_embeddings / label_norms

In [None]:
# Similarity score of every label prompt against every image
# (one row per label, one column per image).
predicted_classes_distribution = np.dot(label_embeddings, image_embeddings.T)

In [None]:
# For every image (column), pick the label with the highest similarity score.
best_label_positions = predicted_classes_distribution.argmax(axis=0)
predicted = [labels[position] for position in best_label_positions]

In [None]:
# Compare the zero-shot predictions with the true product type of each article.
zero_shot_report = classification_report(subset["product_type_name"], predicted)
print(zero_shot_report)

In [None]:
# Targets for the supervised classifier are product *groups* (the zero-shot
# cells above used product *types*).
classes = subset["product_group_name"].tolist()

# Hold out 20% of the image embeddings for testing; the split is stratified by
# class and uses a fixed seed so it is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    image_embeddings,
    classes,
    test_size=0.20,
    random_state=32,
    stratify=classes,
)

In [None]:
# Logistic regression on the image embeddings, with class weights balanced
# to compensate for uneven class frequencies.
clf = LogisticRegression(class_weight="balanced", random_state=0)
clf = clf.fit(X_train, y_train)

In [None]:
# Evaluate the trained classifier on the held-out embeddings.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))