In [14]:
import pandas as pd
import random
import os
import numpy as np

In [15]:
# Input locations: the folder holding product images and the product metadata CSV
image_folder = './mytradataset/images/'
csv_file_path = './csv_files/FashionProductImages.csv'

# Load the product metadata; lines pandas cannot parse are skipped instead of raising
seller_df = pd.read_csv(csv_file_path, on_bad_lines='skip')

# Preview the first five rows
print(seller_df.head(5))

      id gender masterCategory subCategory  articleType baseColour  season  \
0  15970    Men        Apparel     Topwear       Shirts  Navy Blue    Fall   
1  39386    Men        Apparel  Bottomwear        Jeans       Blue  Summer   
2  59263  Women    Accessories     Watches      Watches     Silver  Winter   
3  21379    Men        Apparel  Bottomwear  Track Pants      Black    Fall   
4  53759    Men        Apparel     Topwear      Tshirts       Grey  Summer   

     year   usage                             productDisplayName  
0  2011.0  Casual               Turtle Check Men Navy Blue Shirt  
1  2012.0  Casual             Peter England Men Party Blue Jeans  
2  2016.0  Casual                       Titan Women Silver Watch  
3  2011.0  Casual  Manchester United Men Solid Black Track Pants  
4  2012.0  Casual                          Puma Men Grey T-shirt  


In [16]:
# Seed NumPy's global RNG so the synthetic columns below come out the same on every
# "Restart Kernel -> Run All" (other np.random draws made afterwards, in order,
# become repeatable as well)
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Assign every product a random seller ID in 1..200 (the upper bound 201 is exclusive)
seller_df['seller_id'] = np.random.randint(1, 201, size=len(seller_df))

# Mark each product as 'sold' with probability 0.3 or 'for sale' with probability 0.7
status_values = ['sold', 'for sale']
seller_df['status'] = np.random.choice(status_values, size=len(seller_df), p=[0.3, 0.7])
print(seller_df.head())

      id gender masterCategory subCategory  articleType baseColour  season  \
0  15970    Men        Apparel     Topwear       Shirts  Navy Blue    Fall   
1  39386    Men        Apparel  Bottomwear        Jeans       Blue  Summer   
2  59263  Women    Accessories     Watches      Watches     Silver  Winter   
3  21379    Men        Apparel  Bottomwear  Track Pants      Black    Fall   
4  53759    Men        Apparel     Topwear      Tshirts       Grey  Summer   

     year   usage                             productDisplayName  seller_id  \
0  2011.0  Casual               Turtle Check Men Navy Blue Shirt        148   
1  2012.0  Casual             Peter England Men Party Blue Jeans         24   
2  2016.0  Casual                       Titan Women Silver Watch        173   
3  2011.0  Casual  Manchester United Men Solid Black Track Pants        191   
4  2012.0  Casual                          Puma Men Grey T-shirt        166   

     status  
0  for sale  
1  for sale  
2  for sale  


In [17]:
# Number of distinct values in each column (quick cardinality check)
print(seller_df.nunique())

id                    44424
gender                    5
masterCategory            7
subCategory              45
articleType             143
baseColour               46
season                    4
year                     13
usage                     8
productDisplayName    31121
seller_id               200
status                    2
dtype: int64


In [18]:
# Number of rows whose status is 'sold', reported as a float
(seller_df['status'] == 'sold').astype("float64").sum()

13425.0

In [19]:
def count_empty_values(df):
    """Return, for each column of ``df``, how many entries are missing.

    "Missing" is whatever ``DataFrame.isna`` flags (NaN / None); empty strings
    are not counted. The result is a Series indexed by column name.
    """
    return df.isna().sum()

# Wrap seller_df in a DataFrame object and tally the missing values per column
df = pd.DataFrame(seller_df)
empty_counts = count_empty_values(df)

# Show the per-column missing-value counts
print(empty_counts)

id                      0
gender                  0
masterCategory          0
subCategory             0
articleType             0
baseColour             15
season                 21
year                    1
usage                 317
productDisplayName      7
seller_id               0
status                  0
dtype: int64


In [20]:
def delete_rows_with_nan(df_del, attribute):
    """Return a copy of ``df_del`` without the rows where ``attribute`` is missing.

    ``attribute`` is a single column name. The input frame is not modified and
    the surviving rows keep their original index labels.
    """
    return df_del.dropna(subset=[attribute])

# Column that must be populated for a product row to be kept
attribute_to_check = 'productDisplayName'

# Drop the rows with a missing product name. The index is not reset here, so the
# remaining rows keep their original labels and labels stop matching row positions.
df_del = pd.DataFrame(seller_df)
seller_df = delete_rows_with_nan(df_del=df_del, attribute=attribute_to_check)

In [21]:
# Re-check that the rows with a missing 'productDisplayName' have been removed:
# recompute the per-column missing-value counts on the filtered frame
df = pd.DataFrame(seller_df)
empty_counts = count_empty_values(df)
print(empty_counts)

id                      0
gender                  0
masterCategory          0
subCategory             0
articleType             0
baseColour             10
season                 21
year                    1
usage                 312
productDisplayName      0
seller_id               0
status                  0
dtype: int64


In [22]:
# Keep only the products already sold; each of them receives exactly one rating
sold_items = seller_df.loc[seller_df['status'] == 'sold']
num_sold_items = sold_items.shape[0]

# One rating per sold item: uniform draw from [0.0, 5.0), then rounded to one decimal
rate = np.random.uniform(low=0.0, high=5.0, size=num_sold_items)
rounded_ratings = rate.round(1)

# Synthetic ratings table: a random reviewer (user IDs 1..500, repeats allowed)
# scores each sold product, and the score is attributed to that product's seller
rating_data = dict(
    user_id=np.random.choice(range(1, 501), size=num_sold_items, replace=True),
    fashion_id=sold_items['id'].tolist(),
    rating=rounded_ratings,
    rated_sellers_id=sold_items['seller_id'].tolist(),
)
rating_df = pd.DataFrame(rating_data)

print(rating_df)

       user_id  fashion_id  rating  rated_sellers_id
0          451       53759     3.0               166
1          469        1855     2.6                46
2          424       12369     1.8               199
3          318       29928     2.9                30
4          355        7990     2.9               102
...        ...         ...     ...               ...
13419      284        4336     2.0                31
13420      308       37431     2.9               187
13421      345       55283     4.8                45
13422      181       42234     1.2                23
13423      364       18842     4.5               182

[13424 rows x 4 columns]


In [23]:
# Number of ratings each seller received (one rating row per sold item)
seller_counts = rating_df['rated_sellers_id'].value_counts()

# Add 'total_reviewers' to a fresh copy of the product table, then rebind seller_df
# to it. A seller absent from rating_df has no entry in seller_counts; map() yields
# NaN for those rows, so record them as 0 and keep the column integer-typed.
seller_df_copy = seller_df.copy()
seller_df_copy['total_reviewers'] = (
    seller_df_copy['seller_id'].map(seller_counts).fillna(0).astype('int64')
)
seller_df = seller_df_copy

In [24]:
# Mean rating received by each seller, rounded to one decimal place
average_ratings = rating_df.groupby('rated_sellers_id')['rating'].mean()
rounded_ratings = np.round(average_ratings, decimals=1)

# Attach the seller's average to every product row. map() looks each seller_id up in
# rounded_ratings and leaves NaN for a seller that has no ratings; label-indexing
# rounded_ratings[...] with the full seller_id column would raise KeyError instead.
seller_df['average_rating'] = seller_df['seller_id'].map(rounded_ratings)

print(seller_df)

          id gender masterCategory subCategory            articleType  \
0      15970    Men        Apparel     Topwear                 Shirts   
1      39386    Men        Apparel  Bottomwear                  Jeans   
2      59263  Women    Accessories     Watches                Watches   
3      21379    Men        Apparel  Bottomwear            Track Pants   
4      53759    Men        Apparel     Topwear                Tshirts   
...      ...    ...            ...         ...                    ...   
44419  17036    Men       Footwear       Shoes           Casual Shoes   
44420   6461    Men       Footwear  Flip Flops             Flip Flops   
44421  18842    Men        Apparel     Topwear                Tshirts   
44422  46694  Women  Personal Care   Fragrance  Perfume and Body Mist   
44423  51623  Women    Accessories     Watches                Watches   

      baseColour  season    year   usage  \
0      Navy Blue    Fall  2011.0  Casual   
1           Blue  Summer  2012.0  C

In [25]:
def get_image_path(row):
    """Build the image file path for one product row.

    ``row`` must support ``row['id']``. The file name ``<id>.jpg`` is joined onto
    the notebook-level ``image_folder``; the file is not checked for existence.
    """
    file_name = f"{row['id']}.jpg"
    return os.path.join(image_folder, file_name)

In [26]:
# Derive every product's image path row by row and store it in a new column
seller_df['image_path'] = seller_df.apply(get_image_path, axis='columns')
seller_df.head(5)

Unnamed: 0,id,gender,masterCategory,subCategory,articleType,baseColour,season,year,usage,productDisplayName,seller_id,status,total_reviewers,average_rating,image_path
0,15970,Men,Apparel,Topwear,Shirts,Navy Blue,Fall,2011.0,Casual,Turtle Check Men Navy Blue Shirt,148,for sale,66,2.3,./mytradataset/images/15970.jpg
1,39386,Men,Apparel,Bottomwear,Jeans,Blue,Summer,2012.0,Casual,Peter England Men Party Blue Jeans,24,for sale,68,2.4,./mytradataset/images/39386.jpg
2,59263,Women,Accessories,Watches,Watches,Silver,Winter,2016.0,Casual,Titan Women Silver Watch,173,for sale,55,2.5,./mytradataset/images/59263.jpg
3,21379,Men,Apparel,Bottomwear,Track Pants,Black,Fall,2011.0,Casual,Manchester United Men Solid Black Track Pants,191,for sale,70,2.3,./mytradataset/images/21379.jpg
4,53759,Men,Apparel,Topwear,Tshirts,Grey,Summer,2012.0,Casual,Puma Men Grey T-shirt,166,sold,80,2.5,./mytradataset/images/53759.jpg


In [27]:
# Write the enriched product table to disk; index=False leaves the row index out of the file
seller_df.to_csv('./csv_files/combined_dataset.csv', index=False)

In [37]:
# Content-based recommendations

In [38]:
# Peek at the first five product display names
seller_df['productDisplayName'].head(5)

0                 Turtle Check Men Navy Blue Shirt
1               Peter England Men Party Blue Jeans
2                         Titan Women Silver Watch
3    Manchester United Men Solid Black Track Pants
4                            Puma Men Grey T-shirt
Name: productDisplayName, dtype: object

In [39]:
# Import TfIdfVectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity, linear_kernel

# Make sure every title is a string: any missing display name becomes ''
seller_df['productDisplayName'] = seller_df['productDisplayName'].fillna('')

# Vectorizer that drops common English stop words
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the titles and build the TF-IDF matrix (one row per product)
tfidf_matrix = tfidf.fit_transform(seller_df['productDisplayName'])

# Display the matrix dimensions as (rows, columns)
tfidf_matrix.shape

(44417, 8567)

In [None]:
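# Alternative (disabled): cosine_similarity should give the same matrix as linear_kernel,
# because TfidfVectorizer L2-normalises each row by default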
# cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [None]:
# Compute the cosine similarity matrix via a linear kernel (dot product of TF-IDF rows);
# this equals cosine similarity when rows are unit-normalised, which is presumably the
# case with the vectorizer's default settings — TODO confirm.
# NOTE(review): linear_kernel returns a dense N x N array by default. With N = 44417
# rows (see the recorded tfidf_matrix.shape) and float64 values that is roughly
# 44417**2 * 8 bytes, about 15.8 GB — confirm the kernel has that much memory.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
# Construct a reverse map from product title to that row's index label in seller_df
indices = pd.Series(seller_df.index, index=seller_df['productDisplayName'])

# Several products can share one display name. Keep only the first row per title so
# that indices[title] always yields a single label; Series.drop_duplicates() would
# not help here because it de-duplicates the values (the labels), not the titles.
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
# Function that takes a product title as input and outputs the most similar products
def get_recommendations(title, cosine_sim=None):
    """Return the display names of the 10 products most similar to ``title``.

    Parameters
    ----------
    title : str
        Exact ``productDisplayName`` of the query product. When several rows
        share that name, the first one listed in ``indices`` is the query.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity matrix whose rows and columns follow the row order
        of ``seller_df``. When omitted, the notebook-level ``cosine_sim`` is
        looked up at call time.

    Returns
    -------
    pandas.Series
        ``productDisplayName`` of up to 10 products, most similar first, with
        the query row itself excluded.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    NameError
        If ``cosine_sim`` is omitted and no notebook-level ``cosine_sim`` exists.
    """
    if cosine_sim is None:
        # The parameter shadows the notebook global of the same name, so fetch the
        # global explicitly; doing it at call time picks up a recomputed matrix.
        try:
            cosine_sim = globals()['cosine_sim']
        except KeyError:
            raise NameError(
                "cosine_sim is not defined; compute the similarity matrix first "
                "or pass it explicitly"
            ) from None

    # indices maps a title to a seller_df index label; a title shared by several
    # rows comes back as a Series, in which case the first match is used
    label = indices[title]
    if isinstance(label, pd.Series):
        label = label.iloc[0]

    # The similarity matrix is addressed by row position, while index labels need
    # not equal positions (e.g. after rows were dropped), so translate the label
    idx = seller_df.index.get_loc(label)

    # Pair every other product's position with its similarity to the query. The
    # query row is skipped explicitly rather than assumed to sort first, because
    # products with an identical title tie with it at the top score.
    sim_scores = [(i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx]

    # Sort the products by similarity, highest first
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Keep the 10 most similar products
    sim_scores = sim_scores[:10]

    # Row positions of the selected products
    product_indices = [i[0] for i in sim_scores]

    # Return the top 10 most similar products
    return seller_df['productDisplayName'].iloc[product_indices]

In [None]:
# def get_recommendations(product_id, num_recommendations=5):
#     # Get the index of the product
#     idx = df[df['id'] == product_id].index[0]
    
#     # Get the pairwise similarity scores for the product
#     sim_scores = list(enumerate(cosine_sim[idx]))
    
#     # Sort the products based on similarity scores
#     sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
#     # Get the top N similar products (excluding the same product)
#     top_products = [i for i, _ in sim_scores[1:num_recommendations+1]]
    
#     # Return the recommendations
#     return df.iloc[top_products]

# # Example usage
# recommendations = get_recommendations(product_id, num_recommendations=5)
# print(recommendations[['id', 'productDisplayName']])

In [None]:
# Example query; the title must match a productDisplayName exactly
# (presumably this one exists in the dataset — an unknown title raises KeyError)
get_recommendations('Chimp Men Draculla & Sons Yellow Tshirts')