In [9]:
import pandas as pd

# Source: Addressing Markeiting Bias in Product Recommendations - Mengting Wan, Jianmo Ni, Rishabh Misra, Julian McAuley - WSDM, 2020
# https://github.com/MengtingWan/marketBias
# https://github.com/Kal-Lemma/Clothes-Recommendation-System/blob/master/modcloth_final_data.json
# https://www.kaggle.com/code/agrawaladitya/step-by-step-data-preprocessing-eda/data
df = pd.read_json("../../data/recommendations/modcloth_final_data.json", lines=True)

In [10]:
df.head()

Unnamed: 0,item_id,waist,size,quality,cup size,hips,bra size,category,bust,height,user_name,length,fit,user_id,shoe size,shoe width,review_summary,review_text
0,123373,29.0,7,5.0,d,38.0,34.0,new,36.0,5ft 6in,Emily,just right,small,991571,,,,
1,123373,31.0,13,3.0,b,30.0,36.0,new,,5ft 2in,sydneybraden2001,just right,small,587883,,,,
2,123373,30.0,7,2.0,b,,32.0,new,,5ft 7in,Ugggh,slightly long,small,395665,9.0,,,
3,123373,,21,5.0,dd/e,,,new,,,alexmeyer626,just right,fit,875643,,,,
4,123373,,18,5.0,b,,36.0,new,,5ft 2in,dberrones1,slightly long,small,944840,,,,


In [11]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 82790 entries, 0 to 82789
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   item_id         82790 non-null  int64  
 1   waist           2882 non-null   float64
 2   size            82790 non-null  int64  
 3   quality         82722 non-null  float64
 4   cup size        76535 non-null  str    
 5   hips            56064 non-null  float64
 6   bra size        76772 non-null  float64
 7   category        82790 non-null  str    
 8   bust            11854 non-null  str    
 9   height          81683 non-null  str    
 10  user_name       82790 non-null  str    
 11  length          82755 non-null  str    
 12  fit             82790 non-null  str    
 13  user_id         82790 non-null  int64  
 14  shoe size       27915 non-null  float64
 15  shoe width      18607 non-null  str    
 16  review_summary  76065 non-null  str    
 17  review_text     76065 non-null  str    
dt

In [12]:
df = df[~df["user_id"].isna()]
df = df[~df["item_id"].isna()]
df = df[~df["quality"].isna()]
df = df[~df["review_text"].isna()]
df = df.reset_index()

In [13]:
len(df.item_id.unique())


1322

In [14]:
len(df.user_id.unique())


44811

In [15]:
len(df)

76000

# Неперсоналізовані рекомендаційні системи

Popularity-based recommender systems: Popularity-based recommenders work by suggesting the most frequently purchased products to customers. As the name suggests, Popularity based recommendation system works with the trend. It basically uses the items which are in trend right now.

## Frequency of purchase 
Popularity-based recommenders work by suggesting the most frequently purchased products to customers. This vague idea can be turned into at least two concrete implementations:

Check which articles are bought most often across all customers. Recommend these articles to each customer. 

Source: https://towardsdatascience.com/how-to-build-popularity-based-recommenders-with-polars-cc7920ad3f68#:~:text=Popularity%2Dbased%20recommenders%20work%20by,these%20articles%20to%20each%20customer.

In [17]:
items_popularity = df.groupby("item_id")["user_id"].count().sort_values(ascending=False)
items_popularity = items_popularity.reset_index()
items_popularity

Unnamed: 0,item_id,user_id
0,539980,2007
1,668696,1555
2,397005,1506
3,175771,1438
4,407134,1437
...,...,...
1317,617837,1
1318,631992,1
1319,628198,1
1320,626936,1


In [18]:
items_popularity.iloc[:3]["item_id"].to_list()

[539980, 668696, 397005]

In [21]:
import random
popular_items = items_popularity.iloc[:3]["item_id"].to_list()

def present_recommended_products(popular_items: list):
    print("**Currently trending products**")
    print("")

    for index, item_id_ in enumerate(popular_items):
        slice_df = df[df["item_id"] == item_id_]
        print(f"Recommended item {index+1}/{len(popular_items)}: product {item_id_}")

        category = slice_df["category"].unique()[0]
        print(f"{category=}")

        slice_with_reviews = slice_df[~slice_df["review_text"].isna()]
        review_for_slice = list(slice_with_reviews["review_text"].unique())
        if len(review_for_slice)>0:
            reviews = random.sample(review_for_slice, min(len(review_for_slice), 3))
            print(f"User reviews:")
            for review in reviews:
                print("-", review)
            print("...")
        else:
            print("There are no reviews for this product yet.")
        print("")

present_recommended_products(popular_items)

**Currently trending products**

Recommended item 1/3: product 539980
category='tops'
User reviews:
- I originally purchased this, as well as in red, both in medium. They seemed slightly big, so I ended up exchanging them for smalls... And the medium honey one was actually bigger than the small I received as an exchange. It was bizarre. Apparently the sizes are slightly inconsistent based on batch? Modcloth couldn't guarantee me a small that would be smaller than the medium I purchased. They offered to refund me but wouldn't exchange it again. Oh well. I opted to keep it but it's still a little big for me. It's a pretty color that goes with a lot of things, but a pain to care for.
- Super cute cardigan.
- I ordered this because I needed a go to mustard color sweater, the color is exactly what I wanted! I'm usually a medium, however, after reading the reviews ordered a large, which fits perfect. One thing I noticed is that one of the seams was sewn wrong (the end pokes out instead of in

# Content-based personalized systems

In [22]:
df.head()
df_reviews = df[["item_id", "review_text", "category"]][~df["review_text"].isna()]
df_reviews.head()

Unnamed: 0,item_id,review_text,category
0,152702,"I liked the color, the silhouette, and the fab...",new
1,152702,From the other reviews it seems like this dres...,new
2,152702,I love the design and fit of this dress! I wo...,new
3,152702,I bought this dress for work it is flattering...,new
4,152702,This is a very professional look. It is Great ...,new


In [23]:
len(df_reviews)

76000

In [None]:
df_grouped = df_reviews.groupby(["item_id", "category"]).agg({'review_text': ' '.join}) # Group by item_id and concatenate reviews
df_grouped = df_grouped.reset_index()
df_grouped.head()

Unnamed: 0,item_id,category,review_text
0,152702,new,"I liked the color, the silhouette, and the fab..."
1,153494,new,I wanted to fit in this dress so bad so I made...
2,153798,new,Unfortunately the fabric is soooo thin and wri...
3,154411,new,My only complaint is that people notice when I...
4,154882,new,Most of the other reviews said size up one but...


In [26]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

Creating recommendations based on TF-IDF score:

# TF-IDF (Term Frequency - Inverse Document Frequency)

$TFIDF$ score for term $i$ in document $j = TF(i,j) \cdot IDF(i)$

where
* **IDF** = Inverse Document Frequency
* **TF** = Term Frequency

$$TF(i,j) = \frac{\text{Term } i \text{ frequency in document } j}{\text{Total words in document } j}$$

$$IDF(i) = \log_2 \left( \frac{\text{Total documents}}{\text{documents with term } i} \right)$$

and
* $t$ = Term
* $j$ = Document

---

### General Formula

$$w_{i,j} = tf_{i,j} \times \log \left( \frac{N}{df_i} \right)$$

**Where:**
* $tf_{i,j}$ = number of occurrences of $i$ in $j$
* $df_i$ = number of documents containing $i$
* $N$ = total number of documents

---
**Source:** [Medium Article](https://medium.com/@imamun/creating-a-tf-idf-in-python-e43f05e4d424)

In [27]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(df_grouped['review_text'])
tfidf_matrix

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 420695 stored elements and shape (1322, 22088)>

In [28]:
len(df_grouped['review_text'])

1322

In [29]:
pd.DataFrame(tfidf_matrix.toarray())

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22078,22079,22080,22081,22082,22083,22084,22085,22086,22087
0,0.001489,0.0,0.0,0.0,0.0016,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.000000,0.0,0.0,0.0,0.0000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.001636,0.0,0.0,0.0,0.0
2,0.000000,0.0,0.0,0.0,0.0000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,0.000000,0.0,0.0,0.0,0.0000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
4,0.000000,0.0,0.0,0.0,0.0000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1317,0.000949,0.0,0.0,0.0,0.0000,0.0,0.001298,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1318,0.000000,0.0,0.0,0.0,0.0000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1319,0.000000,0.0,0.0,0.0,0.0000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1320,0.000000,0.0,0.0,0.0,0.0000,0.0,0.000000,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [30]:
tfidf_vectorizer.vocabulary_

{'liked': 10972,
 'the': 19327,
 'color': 4221,
 'silhouette': 17207,
 'and': 1434,
 'fabric': 7073,
 'of': 13143,
 'this': 19449,
 'dress': 6173,
 'but': 3253,
 'ruching': 16247,
 'just': 10353,
 'looked': 11200,
 'bunchy': 3188,
 'ruined': 16261,
 'whole': 21489,
 'thing': 19418,
 'was': 21178,
 'so': 17764,
 'disappointed': 5804,
 'really': 15414,
 'waned': 21119,
 'to': 19692,
 'like': 10971,
 'runs': 16283,
 'little': 11066,
 'small': 17583,
 'would': 21798,
 'need': 12635,
 'size': 17309,
 'up': 20643,
 'make': 11502,
 'it': 10080,
 'workappropriate': 21735,
 'from': 8011,
 'other': 13428,
 'reviews': 15992,
 'seems': 16695,
 'either': 6473,
 'works': 21756,
 'for': 7818,
 'your': 22023,
 'body': 2703,
 'type': 20212,
 'or': 13370,
 'doesn': 5978,
 'have': 8970,
 'waist': 21058,
 'flabby': 7543,
 'tummy': 20099,
 'is': 10058,
 'perfect': 14002,
 'me': 11738,
 'detail': 5615,
 'around': 1702,
 'front': 8017,
 'hides': 9154,
 'everything': 6827,
 'clingyness': 4058,
 'makes': 11507

In [31]:
# Compute cosine similarity between user input and items
user_input = 'Knee-long skirt made of silk'
user_tfidf=tfidf_vectorizer.transform([user_input])
user_df = pd.DataFrame(user_tfidf.toarray())
user_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22078,22079,22080,22081,22082,22083,22084,22085,22086,22087
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
tfidf_vectorizer.vocabulary_["silk"]

17211

In [33]:
user_df[tfidf_vectorizer.vocabulary_["silk"]]

0    0.726013
Name: 17211, dtype: float64

In [34]:
tfidf_vectorizer.idf_[tfidf_vectorizer.vocabulary_["silk"]]

np.float64(4.855452653939752)

In [35]:
cosine_similarities = linear_kernel(user_tfidf, tfidf_matrix).flatten()
cosine_similarities

array([0.04200172, 0.04524237, 0.        , ..., 0.02718421, 0.03061408,
       0.04886547], shape=(1322,))

In [36]:
# Get indices of items sorted by similarity
top_n = 3
item_indices = cosine_similarities.argsort()[:-top_n-1:-1]
print("indices:", item_indices)
#Get recommended item names
recommendations = df_grouped['item_id'].iloc[item_indices].tolist()
print("item_ids:", recommendations)

indices: [ 915 1105 1133]
item_ids: [605558, 701811, 714723]


In [37]:
def content_based_recommender(df, user_input, top_n=3):
    """ 
    Content-based recommender system using TF-DF.

    Parameters:
    - df: pandas DataFrame with 'item_id' and 'review_text' columns.
    - user_input: textual input representing user preferences.
    -top_n: number of top items to recommend.

    Returns:
    - recommendations: a list of top_n recommended item names.
    """

    # Combine relevant text features into a single string
    df['text_features'] = df['category']+' '+df['review_text']

    # TF-IDF Vectorization
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(df['text_features'])

    # Compute cosine similarity between user input and items
    user_tfdf = tfidf_vectorizer.transform([user_input])
    cosine_similarities = linear_kernel(user_tfdf, tfidf_matrix).flatten()

    # Get indices of items sorted by similarity
    item_indices = cosine_similarities.argsort()[:-top_n-1:-1]

    # Get recommended item names
    recommendations = df['item_id'].iloc[item_indices].to_list()

    return recommendations

# Example usage
user_preferences = 'Knee-long skirt made of silk'
recommended_products = content_based_recommender(df_grouped, user_preferences, top_n=3)
present_recommended_products(recommended_products)



**Currently trending products**

Recommended item 1/3: product 605558
category='tops'
User reviews:
- Very good quality. The fabric feels like silk. :)
- Just as pictured; soft material; looks and feels good quality. I usually wear an XL or L (I'm an apple shape) and got an L for this item and it fits well.
...

Recommended item 2/3: product 701811
category='bottoms'
User reviews:
- I liked the librarian look, with the plaid and strap details, however on, this skirt was not pretty.  I'm surprised at the cost with how cheaply made this skirt was.  I sent it back.  The length was great though, just wish it was a better quality.
- When I received this skirt, I was pleasantly surprised on the quality of the skirt. The buckle by the waist is a cute feature that makes the skirt unique. My only complaint is that the skirt is too long for my 5'4 stature. It reached about twothree inches above my ankles. Unfortunately, this skirt will be going back.
- The skirt is made very well but is comicall

# Collaborative Filtering (Колаборативні рекомендаційні системи)

## Item-based filtering
Item-based collaborative filtering recommends items based on the similarity between items, particulary their past interractions with users

In [38]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Sample dataset (user_id, item_id, rating)
data = {'user_id': [1,1,2,2,3,3,4,4],
        'item_id':['A','B','A','C','C','D','B','D'],
        'rating':[5, 4,3,2,4,5,1,3]}

df_sample = pd.DataFrame(data)
df_sample

Unnamed: 0,user_id,item_id,rating
0,1,A,5
1,1,B,4
2,2,A,3
3,2,C,2
4,3,C,4
5,3,D,5
6,4,B,1
7,4,D,3


In [39]:
# Pivot the DataFrame to create  a user-item matrix
user_item_matrix = df_sample.pivot_table(index='user_id', columns='item_id', values='rating', fill_value=0)
user_item_matrix

item_id,A,B,C,D
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,5.0,4.0,0.0,0.0
2,3.0,0.0,2.0,0.0
3,0.0,0.0,4.0,5.0
4,0.0,1.0,0.0,3.0


In [41]:
#Calculate cosine similarity between items
item_similarity = cosine_similarity(user_item_matrix.T)
item_similarity

array([[1.        , 0.83189033, 0.2300895 , 0.        ],
       [0.83189033, 1.        , 0.        , 0.12478355],
       [0.2300895 , 0.        , 1.        , 0.76696499],
       [0.        , 0.12478355, 0.76696499, 1.        ]])