In [1]:
# import necessary libraries

import pandas as pd # To read data
from sklearn.metrics.pairwise import cosine_similarity # This function calculates cosine similarity
from sklearn.feature_extraction.text import TfidfVectorizer # TfidfVectorizer is a text feature extraction method that transforms text data into numerical vectors using the TF-IDF (Term Frequency-Inverse Document Frequency) weighting scheme.

In [2]:
# Load the product catalogue from disk into a pandas DataFrame.
data = pd.read_csv(filepath_or_buffer='dataset.csv')

# Preview the first five rows as a quick sanity check of the columns.
data.head(n=5)

Unnamed: 0,Position,URL,Item
0,1,https://assets.ajio.com/medias/sys_master/root...,GUFRINA Floral Tunic with Bishop Sleeves
1,2,https://assets.ajio.com/medias/sys_master/root...,Pannkh Novelty Top
2,3,https://assets.ajio.com/medias/sys_master/root...,HAWT Floral Embroidered V-Neck Tunic
3,4,https://assets.ajio.com/medias/sys_master/root...,Ives Floral Print Button-Down Top
4,5,https://assets.ajio.com/medias/sys_master/root...,HAWT Flared Top with Embroidered Yoke


In [3]:
# Build the TF-IDF representation of the catalogue's item names.
vectorizer = TfidfVectorizer()  # default tokenisation and TF-IDF weighting
item_names = data['Item'].tolist()  # the 'Item' column as a plain list of documents

# fit_transform learns the vocabulary and IDF weights and vectorises the
# documents in a single pass; it is equivalent to fit() followed by
# transform() but avoids tokenising the whole corpus twice.
item_name_vectors = vectorizer.fit_transform(item_names)

# Pairwise cosine similarity between all item-name vectors
# (one row and one column per item).
# NOTE(review): nothing else in the visible notebook reads similarity_matrix.
similarity_matrix = cosine_similarity(item_name_vectors)

In [4]:
# Function to show relevant products

def cloth_predict(query, n=5):
  """Print the image URLs of the catalogue items most similar to a text query.

  The query is projected into the TF-IDF space of the module-level
  ``vectorizer`` and scored by cosine similarity against
  ``item_name_vectors``, the TF-IDF matrix of ``data['Item']`` that was built
  when the vectorizer was fitted. Items are ranked purely by score; no
  minimum-similarity threshold is applied.

  Parameters
  ----------
  query : str
      Free-text description of the clothing item to look for.
  n : int, default 5
      Maximum number of suggestions to print. If the catalogue holds fewer
      than ``n`` items, every item is listed.

  Returns
  -------
  None
      The suggestions are printed, one numbered URL per line, best match
      first.

  Raises
  ------
  ValueError
      If ``n`` is smaller than 1. (A slice of ``[-0:]`` would otherwise
      select the whole catalogue instead of nothing.)
  """
  if n < 1:
    raise ValueError("n must be a positive integer")

  # Vectorise only the query; the catalogue matrix is reused instead of being
  # re-computed from scratch on every call.
  query_vector = vectorizer.transform([query])
  scores = cosine_similarity(query_vector, item_name_vectors)[0]

  # argsort is ascending, so take the last n positions and reverse them to get
  # the best match first.
  top_indices = scores.argsort()[-n:][::-1]
  top_urls = data.iloc[top_indices]['URL']

  # Report the number of rows actually shown rather than a hard-coded 5.
  print(f"Top {len(top_urls)} suggestions are")
  for rank, url in enumerate(top_urls, start=1):
    print(rank, ":", url)

In [5]:
# Example search: free-text description of the garment we are looking for.
query = "Black v-neck"

# Print the best-matching product image URLs (default number of suggestions).
cloth_predict(query=query)

Top 5 suggestions are
1 : https://assets.ajio.com/medias/sys_master/root/20230201/jyrV/63da0b40aeb269c651083572/fig_black_floral_print_v-neck_top.jpg
2 : https://assets.ajio.com/medias/sys_master/root/20220719/quR6/62d6e8c6aeb26921af85c524/fig_wine_striped_v-neck_top.jpg
3 : https://assets.ajio.com/medias/sys_master/root/20221123/IAs9/637d446ff997ddfdbd9096c5/wedani_navy_blue_floral_print_v-neck_top.jpg
4 : https://assets.ajio.com/medias/sys_master/root/20230301/PMRG/63fe8442f997dde6f4d2b0a0/wedani_black_floral_print_v-neck_top.jpg
5 : https://assets.ajio.com/medias/sys_master/root/20230308/OCw2/640894a9aeb26924e3af9ef4/wedani_navy_blue_floral_print_v-neck_top.jpg
