In [7]:
!pip3 install --upgrade gensim

Collecting gensim




  Using cached gensim-4.3.1-cp39-cp39-win_amd64.whl (24.0 MB)
Collecting smart-open>=1.8.1 (from gensim)
  Using cached smart_open-6.3.0-py3-none-any.whl (56 kB)
Installing collected packages: smart-open, gensim
Successfully installed gensim-4.3.1 smart-open-6.3.0


In [3]:
import pandas as pd

# Load the CSV file
df = pd.read_csv('sample-data.csv')

# Preliminary text preprocessing
df['description'] = df['description'].str.lower().str.replace('[^\w\s]', '')  # convert to lowercase and remove punctuation
df

Unnamed: 0,id,description
0,1,active classic boxers - there's a reason why o...
1,2,active sport boxer briefs - skinning up glory ...
2,3,active sport briefs - these superbreathable no...
3,4,"alpine guide pants - skin in, climb ice, switc..."
4,5,"alpine wind jkt - on high ridges, steep ice an..."
...,...,...
495,496,cap 2 bottoms - cut loose from the maddening c...
496,497,cap 2 crew - this crew takes the edge off fick...
497,498,all-time shell - no need to use that morning t...
498,499,all-wear cargo shorts - all-wear cargo shorts ...


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
from sklearn.metrics.pairwise import cosine_similarity

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
tfidf_vectors = tfidf_vectorizer.fit_transform(df['description'])

# Word2Vec Vectorization
sentences = [desc.split() for desc in df['description']]
word2vec_model = Word2Vec(sentences, vector_size=300, window=5, min_count=1, workers=4)
word2vec_model.train(sentences, total_examples=len(sentences), epochs=10)

def get_word2vec_vec(sentence, model):
    vec = [model.wv[word] for word in sentence if word in model.wv]
    return sum(vec) / len(vec)

word2vec_vectors = df['description'].apply(lambda x: get_word2vec_vec(x.split(), word2vec_model))

In [24]:
tfidf_similarities = cosine_similarity(tfidf_vectors)
word2vec_similarities = cosine_similarity(list(word2vec_vectors))

In [57]:
tfidf_similarities

array([[1.        , 0.32792053, 0.20819843, ..., 0.17696975, 0.20143942,
        0.22598052],
       [0.32792053, 1.        , 0.5673509 , ..., 0.12925175, 0.21139731,
        0.19396413],
       [0.20819843, 0.5673509 , 1.        , ..., 0.13509939, 0.14185763,
        0.15717399],
       ...,
       [0.17696975, 0.12925175, 0.13509939, ..., 1.        , 0.14187074,
        0.17045334],
       [0.20143942, 0.21139731, 0.14185763, ..., 0.14187074, 1.        ,
        0.55846363],
       [0.22598052, 0.19396413, 0.15717399, ..., 0.17045334, 0.55846363,
        1.        ]])

In [32]:
word2vec_similarities

array([[1.0000001 , 0.99245334, 0.9746805 , ..., 0.8993511 , 0.9452015 ,
        0.97037023],
       [0.99245334, 1.0000001 , 0.9834    , ..., 0.9094849 , 0.9640587 ,
        0.9826725 ],
       [0.9746805 , 0.9834    , 1.0000002 , ..., 0.9609721 , 0.94972163,
        0.962013  ],
       ...,
       [0.8993511 , 0.9094849 , 0.9609721 , ..., 1.        , 0.876427  ,
        0.8849877 ],
       [0.9452015 , 0.9640587 , 0.94972163, ..., 0.876427  , 1.0000004 ,
        0.99092275],
       [0.97037023, 0.9826725 , 0.962013  , ..., 0.8849877 , 0.99092275,
        0.9999999 ]], dtype=float32)

In [33]:
threshold = 0.7  # example threshold

def get_similar_ids(row_index, similarity_matrix):
    return df['id'][similarity_matrix[row_index] > threshold].tolist()

df['similar_ids_tfidf'] = [get_similar_ids(i, tfidf_similarities) for i in range(len(df))]
df['similar_ids_word2vec'] = [get_similar_ids(i, word2vec_similarities) for i in range(len(df))]

In [34]:
df

Unnamed: 0,id,description,similar_ids_tfidf,similar_ids_word2vec
0,1,active classic boxers - there's a reason why o...,[1],"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
1,2,active sport boxer briefs - skinning up glory ...,[2],"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
2,3,active sport briefs - these superbreathable no...,[3],"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
3,4,"alpine guide pants - skin in, climb ice, switc...","[4, 159]","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
4,5,"alpine wind jkt - on high ridges, steep ice an...","[5, 308]","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
...,...,...,...,...
495,496,cap 2 bottoms - cut loose from the maddening c...,"[173, 496]","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
496,497,cap 2 crew - this crew takes the edge off fick...,"[22, 23, 174, 175, 359, 360, 497]","[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
497,498,all-time shell - no need to use that morning t...,[498],"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."
498,499,all-wear cargo shorts - all-wear cargo shorts ...,[499],"[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14..."


This solution provides you with columns similar_ids_tfidf and similar_ids_word2vec which list similar product IDs based on TF-IDF and Word2Vec, respectively.

Note:

Ensure you've installed pandas, sklearn, and gensim to run the above code.
You can adjust parameters like max_features in TF-IDF or vector_size, window in Word2Vec to fine-tune the representations.
Make sure you replace 'your_dataset_path.csv' with the correct path to your dataset.

In [60]:
print(df[df["id"] == 173]["description"])
print(df[df["id"] == 496]["description"])

172    cap 2 bottoms - simul-climbing the final pitch...
Name: description, dtype: object
495    cap 2 bottoms - cut loose from the maddening c...
Name: description, dtype: object


In [63]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

ds = pd.read_csv("sample-data.csv")

ds['description'] = ds['description'].str.lower().str.replace('[^\w\s]', '')
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 3), min_df=0.0, stop_words='english')
tfidf_matrix = tf.fit_transform(ds['description'])

cosine_similarities = linear_kernel(tfidf_matrix, tfidf_matrix)

results = {}

for idx, row in ds.iterrows():
    similar_indices = cosine_similarities[idx].argsort()[:-100:-1]
    similar_items = [(cosine_similarities[idx][i], ds['id'][i]) for i in similar_indices]

    # First item is the item itself, so remove it.
    # Each dictionary entry is like: [(1,2), (3,4)], with each tuple being (score, item_id)
    results[row['id']] = similar_items[1:]
    
print('done!')

done!


In [66]:
# hacky little function to get a friendly item name from the description field, given an item ID
def item(id):
    return ds.loc[ds['id'] == id]['description']#.tolist()[0].split(' - ')[0]

# Just reads the results out of the dictionary. No real logic here.
def recommend(item_id, num):
    print("Recommending " + str(num) + " products similar to " + item(item_id) + "...")
    print("-------")
    recs = results[item_id][:num]
    for rec in recs:
        print("Recommended: " + item(rec[1]) + " (score:" + str(rec[0]) + ")")

# Just plug in any item id here (1-500), and the number of recommendations you want (1-99)
# You can get a list of valid item IDs by evaluating the variable 'ds', or a few are listed below

recommend(item_id=11, num=5)

10    Recommending 5 products similar to baby sunsha...
Name: description, dtype: object
-------
418    Recommended: sunshade hoody - put an end to th...
Name: description, dtype: object
464    Recommended: baby baggies apron dress - this l...
Name: description, dtype: object
347    Recommended: runshade t-shirt - there's a long...
Name: description, dtype: object
244    Recommended: runshade t-shirt - out in the bac...
Name: description, dtype: object
116    Recommended: runshade top - from the trailhead...
Name: description, dtype: object


In [67]:
recommend(item_id=173, num=5)

172    Recommending 5 products similar to cap 2 botto...
Name: description, dtype: object
-------
495    Recommended: cap 2 bottoms - cut loose from th...
Name: description, dtype: object
21    Recommended: cap 2 t-shirt - this one revels i...
Name: description, dtype: object
496    Recommended: cap 2 crew - this crew takes the ...
Name: description, dtype: object
22    Recommended: cap 2 zip neck - designed for wet...
Name: description, dtype: object
359    Recommended: cap 2 t-shirt - like a sweet bree...
Name: description, dtype: object


In [68]:
recommend(item_id=497, num=5)

496    Recommending 5 products similar to cap 2 crew ...
Name: description, dtype: object
-------
21    Recommended: cap 2 t-shirt - this one revels i...
Name: description, dtype: object
359    Recommended: cap 2 t-shirt - like a sweet bree...
Name: description, dtype: object
358    Recommended: cap 2 cap sleeve - the fastest-dr...
Name: description, dtype: object
22    Recommended: cap 2 zip neck - designed for wet...
Name: description, dtype: object
174    Recommended: cap 2 zip neck - the bugaboos hav...
Name: description, dtype: object
