In [1]:
import pandas as pd
import random
import os
import numpy as np

In [2]:
# Location of the combined product catalogue, relative to the notebook's working directory
csv_file_path = './csv_files/combined_dataset.csv'

In [3]:
# Read the CSV file into a DataFrame.
# Malformed rows are still skipped, but 'warn' reports each one instead of
# discarding it silently, so any data loss is visible in the cell output.
products_df = pd.read_csv(csv_file_path, on_bad_lines='warn')

# Leave the preview as the cell's last expression so the notebook renders it as a table
products_df.head()

      id gender masterCategory subCategory  articleType baseColour  season  \
0  15970    Men        Apparel     Topwear       Shirts  Navy Blue    Fall   
1  39386    Men        Apparel  Bottomwear        Jeans       Blue  Summer   
2  59263  Women    Accessories     Watches      Watches     Silver  Winter   
3  21379    Men        Apparel  Bottomwear  Track Pants      Black    Fall   
4  53759    Men        Apparel     Topwear      Tshirts       Grey  Summer   

     year   usage                             productDisplayName  seller_id  \
0  2011.0  Casual               Turtle Check Men Navy Blue Shirt        148   
1  2012.0  Casual             Peter England Men Party Blue Jeans         24   
2  2016.0  Casual                       Titan Women Silver Watch        173   
3  2011.0  Casual  Manchester United Men Solid Black Track Pants        191   
4  2012.0  Casual                          Puma Men Grey T-shirt        166   

     status  total_reviewers  average_rating            

In [4]:
# Peek at the first five product display names (head() defaults to 5 rows)
products_df['productDisplayName'].head()

0                 Turtle Check Men Navy Blue Shirt
1               Peter England Men Party Blue Jeans
2                         Titan Women Silver Watch
3    Manchester United Men Solid Black Track Pants
4                            Puma Men Grey T-shirt
Name: productDisplayName, dtype: object

In [5]:
# Text-vectorisation and pairwise-similarity helpers from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

# Missing display names become empty strings so the vectorizer only ever sees text
products_df['productDisplayName'] = products_df['productDisplayName'].fillna('')

# TF-IDF vectorizer that ignores common English stop words
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary and build the (n_products, n_terms) TF-IDF matrix in one pass
tfidf_matrix = tfidf.fit_transform(products_df['productDisplayName'])

# Show (n_products, n_terms)
tfidf_matrix.shape

(44417, 8567)

In [6]:
# Pairwise similarity of every product against every other product.
# TfidfVectorizer L2-normalises each row by default, so the plain dot product
# computed by linear_kernel is already the cosine similarity. Omitting the
# second argument compares the matrix with itself.
# NOTE(review): the result is a dense n_products x n_products float array --
# roughly 16 GB for the 44,417 rows reported above; confirm the kernel has that much memory.
cosine_sim = linear_kernel(tfidf_matrix)

In [8]:
# Construct a reverse map from product title to row index.
# Series.drop_duplicates() compares the *values* (the row labels, which are
# already unique), so it never removed repeated titles. De-duplicate on the
# index instead, keeping the first row for each title, so that indices[title]
# always yields a single row label rather than a Series.
indices = pd.Series(products_df.index, index=products_df['productDisplayName'])
indices = indices[~indices.index.duplicated(keep='first')]

In [9]:
# Function that takes a product title as input and outputs the most similar products
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the display names of the products most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact ``productDisplayName`` of the query product; it is looked up in
        the module-level ``indices`` Series.
    cosine_sim : array-like, optional
        Pairwise similarity matrix; row ``i`` is read as the scores of product
        ``i`` against every product. Defaults to the ``cosine_sim`` matrix that
        exists when this cell is run.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        ``productDisplayName`` values of the highest-scoring products, best
        match first, never including the query product itself.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Get the index of the product that matches the title
    idx = indices[title]

    # Several products can share one display name, in which case the lookup
    # returns a Series of row indices; use the first matching product.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Pairwise similarity scores of every product with the query product
    scores = np.asarray(cosine_sim[idx])

    # Rank products from most to least similar; the stable sort keeps products
    # with equal scores in their original row order.
    ranked = np.argsort(-scores, kind='stable')

    # Exclude the query product explicitly: it is not guaranteed to be ranked
    # first when other products have an identical (or rounding-equal) score.
    product_indices = ranked[ranked != idx][:top_n]

    # Return the top_n most similar products
    return products_df['productDisplayName'].iloc[product_indices]

In [14]:
# Example query: products most similar to one specific pair of jeans
get_recommendations(title='Peter England Men Party Blue Jeans')

3522          Peter England Men Blue Party Jeans
459          Peter England Men Party Black Jeans
26956    Peter England Men Navy Blue Party Jeans
3897                Peter England Men Blue Jeans
10945               Peter England Men Blue Jeans
21240               Peter England Men Blue Jeans
23706               Peter England Men Blue Jeans
27146               Peter England Men Blue Jeans
28026               Peter England Men Blue Jeans
29846               Peter England Men Blue Jeans
Name: productDisplayName, dtype: object