In [82]:
!kaggle datasets download -d cclark/product-item-data

Downloading product-item-data.zip to /home/jovyan/gkabanov/_marketplace
100%|█████████████████████████████████████████| 137k/137k [00:00<00:00, 472kB/s]
100%|█████████████████████████████████████████| 137k/137k [00:00<00:00, 472kB/s]


In [148]:
import numpy as np
import pandas as pd
import random
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

import gensim
import gensim.downloader
from gensim.models import word2vec,KeyedVectors

import nltk
from nltk.tokenize import word_tokenize

import pymorphy2
import re
import spacy

from string import punctuation
from sentence_transformers import SentenceTransformer, util

In [171]:
df = pd.read_csv("product-item-data.zip")
print(df.shape)
df.head(3)

(500, 2)


Unnamed: 0,id,description
0,1,Active classic boxers - There's a reason why o...
1,2,Active sport boxer briefs - Skinning up Glory ...
2,3,Active sport briefs - These superbreathable no...


## TFIDF

In [101]:
# get embeddings
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(df["description"])

# get cosine distance
cos_sim_matrix = cosine_similarity(tfidf_matrix, tfidf_matrix)
cos_dist_df = pd.DataFrame(1-cos_sim_matrix, index=df["id"], columns=df["id"])
print(cos_dist_df.shape)
cos_dist_df.head(3)

(500, 500)


id,1,2,3,4,5,6,7,8,9,10,...,491,492,493,494,495,496,497,498,499,500
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-2.220446e-16,0.6720795,0.7918016,0.78275,0.829141,0.847257,0.844323,0.885709,0.819121,0.897325,...,0.830831,0.857001,0.854509,0.554897,0.639489,0.665624,0.69075,0.82303,0.798561,0.774019
2,0.6720795,4.440892e-16,0.4326491,0.801128,0.819191,0.847378,0.827806,0.891323,0.856591,0.890437,...,0.84284,0.879609,0.846964,0.688518,0.718213,0.744557,0.790819,0.870748,0.788603,0.806036
3,0.7918016,0.4326491,-2.220446e-16,0.83056,0.794694,0.826511,0.799392,0.868152,0.848871,0.880664,...,0.830651,0.880396,0.861136,0.71112,0.629878,0.747056,0.760768,0.864901,0.858142,0.842826


In [120]:
# manual check of the most similiar goods
import random

idx = random.randint(0, len(df)-1)
print("base goods:")
print(df[df.id == idx].description.values[0])

print("\nthe most similiar goods (minimal cosine distance):")
idx = cos_dist_df[idx].argmin()
print(df[df.id == idx].description.values[0])

base goods:
Lithia skirt - After packing the quiver, who has room for clothes? The versatile Lithia Skirt solves the space dilemma by functioning as both a skirt and a top. The multitasker's lightweight organic cotton (55%)/Tencel (45%) fabric is supersoft and scrunchable. As an A-line skirt, the doubled, shirred waistband cinches down with side ties on one side and falls just above the knee. As a top, the adjustable waistband becomes a bandeau-style bodice that ties behind the neck. Waist-to-hem length is 17". Recyclable through the Common Threads Recycling Program.<br><br><b>Details:</b><ul> <li>Soft-handed jersey knit with smooth drape</li> <li>"Snug, wide, cinched waistband; adjustable on right side with ties"</li> <li>"A-line shape skirt, above the knee length"</li> <li>"17"" length, waist to hem, when cinched"</li> <li>Skirt's doubled waistband becomes bandeau style bodice for top and ties behind the neck</li></ul><br><br><b>Fabric: </b>4.8-oz 55% organic cotton/45% Tencel. Recyc

## Word2Vec

In [123]:
# load the model
word_vectors = gensim.downloader.load("word2vec-google-news-300")

In [136]:
# convert description to embeddings

def sentence_to_embedding(desc):
    words = word_tokenize(desc)
    word_embeddings = []

    for word in words:
        if word in word_vectors:
            word_embeddings.append(word_vectors[word])
    if word_embeddings:
        sentence_embedding = np.mean(word_embeddings, axis=0)
    else:
        sentence_embedding = np.zeros(word_vectors.vector_size)

    return sentence_embedding

df["embeddings"] = df["description"].apply(sentence_to_embedding)

In [144]:
# convert embeddings list to np array
embedding_list = df["embeddings"].to_list()
embedding_matrix = np.vstack(embedding_list)
print(embedding_matrix.shape)

(500, 300)


In [145]:
# get cosine distance
cos_sim_matrix = cosine_similarity(embedding_matrix, embedding_matrix)
cos_dist_df = pd.DataFrame(1-cos_sim_matrix, index=df["id"], columns=df["id"])
print(cos_dist_df.shape)
cos_dist_df.head(3)

(500, 500)


id,1,2,3,4,5,6,7,8,9,10,...,491,492,493,494,495,496,497,498,499,500
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.192093e-07,0.031735,0.04411799,0.058153,0.101205,0.062593,0.100866,0.099978,0.042518,0.052501,...,0.055239,0.071541,0.06338,0.02691,0.035152,0.047642,0.074891,0.059498,0.046201,0.040448
2,0.031735,0.0,0.02069259,0.051744,0.081773,0.054929,0.091701,0.106142,0.039475,0.057405,...,0.070862,0.081968,0.072011,0.025773,0.033508,0.046169,0.066931,0.05478,0.04997,0.050544
3,0.04411799,0.020693,-3.576279e-07,0.063497,0.100387,0.063903,0.117196,0.088598,0.046009,0.075642,...,0.057741,0.073017,0.059564,0.037657,0.043264,0.057312,0.083886,0.068514,0.05547,0.052001


In [147]:
# manual check of the most similiar goods

idx = random.randint(0, len(df)-1)
print("base goods:")
print(df[df.id == idx].description.values[0])

print("\nthe most similiar goods (minimal cosine distance):")
idx = cos_dist_df[idx].argmin()
print(df[df.id == idx].description.values[0])

base goods:
Duway skirt - Everyone knows it's what's inside that counts. Beneath this silky, sanded 100% polyester skirt (40% recycled) is a built-in pair of shorts. The travel-friendly fabric is treated with a DWR (durable water repellent) finish that drapes gracefully, dries quickly, and resists wrinkles. Updated fit this season, with a center-back shaping dart and flattering hip seams, the skirt has a more classic, feminine fit. Zippered side vents allow mobility. With a discreet front zip pocket and a waist-to-hem length of 17" (size 8). Built-in shorts inseam is 3". Recyclable through the Common Threads Recycling Program.<br><br><b>Details:</b><ul> <li>Sanded 100% polyester microfiber with DWR (durable water repellent) finish</li> <li>"Slight styling update, same great fit"</li> <li>"Flattering seams at hip, and shaping dart at center back"</li> <li>Front security pocket on right seam</li> <li>Zippered side vents for increased mobility</li> <li>"Built-in 3"" shorts for modesty"</l

## preprocessing + e5

In [153]:
!python -m spacy download en_core_web_sm

In [172]:
# preprocess

def contains_number(input_string):
    return bool(re.search(r'\d', input_string))

def extract_keywords(text, nlp):

    doc = nlp(text)
    keywords = []

    for token in doc:
        if token.text in nlp.Defaults.stop_words:
            continue
        elif token.text in punctuation:
            continue
        elif len(token.text) < 3:
            continue
        elif contains_number(token.text) == True:
            continue
        elif token.pos_ in ["PROPN", 'X']:
            continue
        elif token.pos_ in ["NOUN", "ADJ"] or token.ent_type_:
            keywords.append(token.text.lower())

    return keywords

nlp = spacy.load('en_core_web_sm')
morph = pymorphy2.MorphAnalyzer()

# extract keywords
df["keywords"] = df.description.apply(extract_keywords, nlp=nlp)
# get lemmas
df.keywords = df.keywords.apply(lambda v: [morph.parse(w)[0].normal_form for w in v])
df.keywords = df.keywords.apply(lambda v: " ".join(v))

In [173]:
# get embeddings 
model =  SentenceTransformer('intfloat/multilingual-e5-large')
df.keywords = df.keywords.apply(lambda v: model.encode("query: " + v))

In [176]:
# convert embeddings list to np array
embedding_list = df["keywords"].to_list()
embedding_matrix = np.vstack(embedding_list)
print(embedding_matrix.shape)

# get cosine distance
cos_sim_matrix = cosine_similarity(embedding_matrix, embedding_matrix)
cos_dist_df = pd.DataFrame(1-cos_sim_matrix, index=df["id"], columns=df["id"])
print(cos_dist_df.shape)
cos_dist_df.head(3)

# manual check of the most similiar goods

idx = random.randint(0, len(df)-1)
print("base goods:")
print(df[df.id == idx].description.values[0])

print("\nthe most similiar goods (minimal cosine distance):")
idx = cos_dist_df[idx].argmin()
print(df[df.id == idx].description.values[0])

(500, 1024)
(500, 500)
base goods:
Ultra hw mountaineering socks - Every weather window shuts eventually - some faster than others. These ultra-warm socks stymie the raw temps of skiing, ice climbing and mountaineering so you can keep moving when the barometer hits bottom. Tightly knit from a blend of 75% chlorine-free merino wool for warmth, they also have 12% nylon/10% polyester/3% spandex to balance durability and wicking performance. Soft, felted merino wool yarns throughout cushion, insulate, and last the lifetime of the sock; looped Lintoe construction adds additional comfort. Height is 10.5"  allowing for use under big boots.<br><br><b>Details:</b><ul> <li>Heavyweight wool blend insulates against extreme cold and provides excellent cushioning</li> <li>"Fabric content and strategic knit balances warmth, durability and wicking performance"</li> <li>"Soft, plush merino wool is chlorine-free, itch-free and maintains durability and cushioning for the lifetime of the socks"</li> <li>L