# Recommending Items Based on User Profiles

## Imports

In [183]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.pyplot as plt

## Loading Datasets

In [184]:
item_embeddings_df = pd.read_csv('../data/item_embeddings.csv')
user_profiles_df = pd.read_csv('../data/user_profiles.csv')

In [185]:
item_embeddings_df.shape

(101, 384)

In [186]:
user_profiles_df.shape

(48, 383)

In [187]:
item_embeddings_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.074143,0.06405,-0.009264,0.046263,-0.027863,-0.048036,0.06423,-0.072758,-0.083279,-0.023699,...,-0.032186,-0.009203,-0.015632,0.013018,0.038265,0.057547,0.033841,-0.040459,-0.005354,-0.016443
1,-0.053667,0.086153,0.008184,-0.027359,0.064028,-0.062513,0.064731,0.053492,-0.086663,0.04204,...,-0.039931,-0.055881,0.070485,0.079482,-0.065805,0.01045,0.083477,-0.020643,0.026802,0.023278
2,-0.103944,0.160174,0.006674,0.032914,0.023432,0.015533,0.124695,-0.012912,-0.071057,0.007715,...,-0.04839,-0.047802,-0.056081,0.034009,0.005964,0.015365,0.026713,-0.099851,-0.039444,0.01975
3,-0.068707,0.033529,0.007046,0.08301,0.072068,0.020052,0.061295,0.038851,0.006501,0.060401,...,-0.073263,-0.021329,-0.063976,0.042687,0.028866,0.015143,-0.02514,-0.103199,-0.039904,0.047723
4,-0.069275,0.071025,0.019378,0.039093,0.075728,-0.033218,0.122828,-0.019662,-0.06492,-0.04029,...,-0.048714,-0.067688,-0.049945,-0.00556,-0.028457,0.019639,-0.037464,-0.115767,-0.017989,0.000391


In [188]:
user_profiles_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,373,374,375,376,377,378,379,380,381,382
0,0.060143,0.036061,0.048108,0.008844,-0.007575,0.06545,-0.048654,-0.067773,0.014274,0.024337,...,-0.05487,-0.012844,-0.018181,0.041335,0.007615,0.011697,-0.027718,-0.063241,0.007207,-0.003882
1,0.038324,0.021305,-0.009668,0.062717,-0.027837,0.082639,0.013449,-0.052762,-0.003109,0.028908,...,0.002024,-0.060947,-0.035074,0.033858,0.003707,-0.009584,0.002526,-0.081802,-0.057961,-0.013604
2,0.063626,-0.022125,0.018907,0.037874,-0.007774,0.043303,0.007892,-0.044934,0.031136,0.045607,...,-0.054719,-0.027572,0.020879,0.054398,-0.066077,0.013572,0.070344,-0.072806,-0.012058,0.03805
3,0.078631,0.007365,0.026449,0.023867,0.000867,0.078875,0.040846,-0.059256,0.005375,-0.013123,...,-0.02719,-0.045285,-0.011149,0.047736,0.016887,0.021995,0.036113,-0.101998,-0.036493,0.005175
4,0.116301,0.017553,0.029933,0.044769,-0.029575,0.086199,-0.01863,-0.05432,0.015293,-0.012321,...,-0.020133,-0.030688,-0.024696,0.026634,0.004515,0.004056,-0.016762,-0.056608,-0.036605,-0.030182


In [189]:
interactions = pd.read_csv('../data/fashion_interactions.csv')
interactions.head()

Unnamed: 0,user_id,item_id,liked
0,0,0,1
1,0,8,1
2,0,15,1
3,0,18,1
4,0,36,1


In [190]:
users = pd.read_csv('../data/fashion_users.csv')
users.head()

Unnamed: 0,user_id,interests
0,0,"['boho', 'summer', 'feminine']"
1,1,"['streetwear', 'sneakers', 'urban']"
2,2,"['minimalist', 'neutrals', 'sustainable']"
3,3,"['vintage', 'fall', 'denim']"
4,4,"['formal', 'classic', 'blazer']"


In [191]:
items = pd.read_csv('../data/fashion_items.csv')
items.head()

Unnamed: 0,item_id,title,tags,category
0,0,Boho Summer Maxi Dress,"['boho', 'dress', 'summer']",dresses
1,1,Minimalist Linen Blouse,"['minimalist', 'blouse', 'neutrals']",tops
2,2,Vintage Denim Jacket,"['vintage', 'jacket', 'denim']",outerwear
3,3,Cozy Knit Sweater,"['cozy', 'sweater', 'fall']",tops
4,4,Streetwear Graphic Hoodie,"['streetwear', 'hoodie', 'urban']",outerwear


## Recommendation by Profile Similarity

In [194]:
def recommend_items(user_id, top_n=5):
    """
    Recommend items based on user profile similarity.
    
    Parameters:
    - user_id: ID of the user for whom to recommend items.
    - top_n: Number of top recommendations to return.
    
    Returns:
    - DataFrame of recommended items.
    """
    user_profile = user_profiles_df.loc[user_id]
    item_embeddings = item_embeddings_df.iloc[:, 1:].values

    print(user_profile.shape)
    print(item_embeddings.shape)
    
    # Calculate cosine similarity between user profile and item embeddings
    user_profile = user_profile.values.reshape(1, -1)  # Reshape to 2D array for cosine similarity
    similarities = cosine_similarity(user_profile, item_embeddings).flatten()

    # Remove the user's saved items from recommendations
    # Get saved items based on interactions data
    # This gives a list of indices of saved items
    saved_items = interactions[interactions['user_id'] == user_id]['item_id'].values
    # Remove saved items from the similarity scores
#    similarities[saved_items] = -np.inf  # Set saved items' similarity to -inf to exclude them

    # Get indices of top N similar items
    top_indices = np.argsort(similarities)[-top_n:][::-1]
    
    # Return recommended items from the actual fashion_items dataframe
    return items.iloc[top_indices]

Let's test this function on a sample user!

In [195]:
user = 0  # Example user ID
# Print user profile for reference
print("User Profile for User ID:", user)
print(users.loc[user])
# print the users saved items
print("Saved Items for User ID:", user)
print(items[interactions['user_id'] == user])
# And print the recommendations
recommended_items = recommend_items(user)
print("Recommended Items for User ID:", user)
print(recommended_items)

User Profile for User ID: 0
user_id                                   0
interests    ['boho', 'summer', 'feminine']
Name: 0, dtype: object
Saved Items for User ID: 0
   item_id                      title                                   tags  \
0        0     Boho Summer Maxi Dress            ['boho', 'dress', 'summer']   
1        1    Minimalist Linen Blouse   ['minimalist', 'blouse', 'neutrals']   
2        2       Vintage Denim Jacket         ['vintage', 'jacket', 'denim']   
3        3          Cozy Knit Sweater            ['cozy', 'sweater', 'fall']   
4        4  Streetwear Graphic Hoodie      ['streetwear', 'hoodie', 'urban']   
5        5       Sustainable Yoga Set  ['sustainable', 'athleisure', 'yoga']   

     category  
0     dresses  
1        tops  
2   outerwear  
3        tops  
4   outerwear  
5  activewear  
(383,)
(101, 383)
Recommended Items for User ID: 0
    item_id                    title  \
36       36  Floral Print Wrap Dress   
83       83  Elegant Silk Wrap

  print(items[interactions['user_id'] == user])


Great, we recovered one of the original items the user saved! (Uncomment the line in the function above if recommending unseen items).

In [233]:
import random
user = random.choice(users['user_id'].values)  # Randomly select a user ID
# Print user profile for reference
print("User Profile for User ID:", user)
print(users.loc[user])
# print the users saved items
liked_ids = interactions[interactions['user_id'] == user]["item_id"].values
print("Saved Items for User ID:", user)
print(items.loc[items['item_id'].isin(liked_ids)])
# And print the recommendations
recommended_items = recommend_items(user)
print("Recommended Items for User ID:", user)
print(recommended_items)

User Profile for User ID: 4
user_id                                    4
interests    ['formal', 'classic', 'blazer']
Name: 4, dtype: object
Saved Items for User ID: 4
    item_id                         title  \
6         6          Classic Black Blazer   
10       10          Elegant Evening Gown   
19       19        Classic White Sneakers   
42       42          Tailored Suit Jacket   
53       53          Tailored Dress Pants   
66       66  Tailored Blazer with Pockets   

                                     tags   category  
6         ['classic', 'blazer', 'formal']  outerwear  
10          ['elegant', 'gown', 'formal']    dresses  
19      ['classic', 'sneakers', 'casual']   footwear  
42  ['tailored', 'suit jacket', 'formal']  outerwear  
53  ['tailored', 'dress pants', 'formal']    bottoms  
66      ['tailored', 'blazer', 'pockets']  outerwear  
(383,)
(101, 383)
Recommended Items for User ID: 4
    item_id                         title  \
53       53          Tailored Dress

While we didn't recover any of the original pins in this example, I would argue that the recommendations actually fit the user's profile better than the saved items. It might be interesting in future to explore creating recommendations with the profile and current pins in case tastes changed after creating a profile. Let's try one more!

In [161]:
user = random.choice(users['user_id'].values)  # Randomly select a user ID
# Print user profile for reference
print("User Profile for User ID:", user)
print(users.loc[user])
# print the users saved items
print("Saved Items for User ID:", user)
print(items[interactions['user_id'] == user])
# And print the recommendations
recommended_items = recommend_items(user)
print("Recommended Items for User ID:", user)
print(recommended_items)

User Profile for User ID: 15
user_id                               15
interests    ['woven', 'tote', 'summer']
Name: 15, dtype: object
Saved Items for User ID: 15
    item_id                                           title  \
76       76              Casual Denim Overalls with Pockets   
77       77                Tailored Wool Blazer with Lining   
78       78      Sporty Athletic Skirt with Built-in Shorts   
79       79  Vintage-Inspired Leather Backpack with Buckles   
80       80              Casual Graphic T-Shirt with Slogan   
81       81           Retro Glasses with Pink Tinted Lenses   

                                               tags     category  
76          ['casual', 'denim overalls', 'pockets']      bottoms  
77            ['tailored', 'wool blazer', 'lining']    outerwear  
78  ['sporty', 'athletic skirt', 'built-in shorts']      bottoms  
79       ['vintage', 'leather backpack', 'buckles']  accessories  
80          ['casual', 'graphic t-shirt', 'slogan']         

  print(items[interactions['user_id'] == user])


In this case, the recommendations really leaned into the summer aspect of the user's interests. This might make sense in context of the tag breakdown though, because neither woven nor tote were in the top 20 tags from our EDA.

## Collaborative Filtering

While content filtering is useful for the cold-start problem, it doesn't take into account adaptive preferences as users pin more images. Collaborative filtering looks at the users interactions and compares them with other users to get recommendations.

In [163]:
interactions.head()

Unnamed: 0,user_id,item_id,liked
0,0,0,1
1,0,8,1
2,0,15,1
3,0,18,1
4,0,36,1


In [164]:
# We have interactions dataframe with columns: user_id, item_id, and liked (1 if liked, 0 otherwise)
# We want to create a user-item interaction matrix for our recommendation model

# I think we can use the interactions dataframe to indicate 1's in the user-item matrix,
# but we need to use the items and users dataframes to ensure we have all users and items represented
# giving 0's to items that a user hasn't interacted with

user_item_matrix = [[0]*len(items) for _ in range(len(users))] # Initialize a 2D list with zeros
for _, row in interactions.iterrows():
    user_id = row['user_id']
    item_id = row['item_id']
    liked = row['liked']
    user_item_matrix[user_id][item_id] = liked # Set to 1 if liked, 0 otherwise

user_item_matrix = csr_matrix(user_item_matrix) # Convert to sparse matrix
user_item_matrix.shape # Should be (num_users, num_items)

(51, 101)

In [165]:
user_item_matrix

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 248 stored elements and shape (51, 101)>

In [166]:
user_item_matrix[3]

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 6 stored elements and shape (1, 101)>

In [167]:
import numpy as np
from scipy.sparse import csr_matrix
import implicit

# Initialize and train the model
model = implicit.als.AlternatingLeastSquares(factors=50, regularization=0.01, alpha=40, iterations=20)
# factors: Number of latent factors (embedding dimension)
# regularization: L2 regularization to prevent overfitting
# alpha: A weight for observed interactions (higher alpha means observed interactions have more confidence)
# iterations: Number of ALS iterations

print("Training ALS model...")
model.fit(user_item_matrix) # Fit on the user-item matrix

# 3. Make recommendations
user_id_to_recommend_for = 9
n_recommendations = 5

# Get recommendations for a specific user
# filter_already_liked_items=True ensures you don't recommend items the user already interacted with
recommendations = model.recommend(
    user_id_to_recommend_for,
    user_item_matrix[user_id_to_recommend_for], # Pass the user's interaction vector
    N=n_recommendations,
    filter_already_liked_items=True
)

print(f"\nRecommendations for user {user_id_to_recommend_for}:")
for item_id, score in zip(*recommendations):
    print(f"  Item ID: {item_id}, Score: {score:.4f}")

Training ALS model...


100%|██████████| 20/20 [00:00<00:00, 2573.98it/s]


Recommendations for user 9:
  Item ID: 8, Score: 0.1975
  Item ID: 11, Score: 0.0440
  Item ID: 27, Score: 0.0195
  Item ID: 20, Score: 0.0190
  Item ID: 17, Score: 0.0179





In [168]:
# Compare to original pins
interactions[interactions['user_id'] == user_id_to_recommend_for]

Unnamed: 0,user_id,item_id,liked
45,9,3,1
46,9,12,1
47,9,23,1
48,9,62,1
49,9,84,1


In [169]:
recommendations[0]

array([ 8, 11, 27, 20, 17], dtype=int32)

In [170]:
items.loc[items['item_id'].isin([item_id for item_id in recommendations[0]])]

Unnamed: 0,item_id,title,tags,category
8,8,Ruffle Mini Skirt,"['feminine', 'skirt', 'summer']",bottoms
11,11,Casual Chambray Shirt,"['casual', 'shirt', 'denim']",tops
17,17,Chic Leather Handbag,"['chic', 'handbag', 'leather']",accessories
20,20,Graphic Tee with Vintage Print,"['graphic', 'tee', 'vintage']",tops
27,27,Denim Overalls,"['denim', 'overalls', 'casual']",bottoms


In [171]:
liked_ids = interactions[interactions['user_id'] == user_id_to_recommend_for]["item_id"].values
items.loc[items['item_id'].isin(liked_ids)]

Unnamed: 0,item_id,title,tags,category
3,3,Cozy Knit Sweater,"['cozy', 'sweater', 'fall']",tops
12,12,Plaid Flannel Shirt,"['plaid', 'flannel', 'cozy']",tops
23,23,Plaid A-Line Skirt,"['plaid', 'A-line', 'skirt']",bottoms
62,62,Casual Plaid Shirt Dress,"['casual', 'plaid', 'shirt dress']",dresses
84,84,Casual Plaid Button-Up Shirt,"['casual', 'plaid', 'button-up']",mens tops


In [172]:
users[users['user_id'] == user_id_to_recommend_for]

Unnamed: 0,user_id,interests
9,9,"['plaid', 'flannel', 'cozy']"


Since our current version of Pinterest doesn't contain many users, and most users are in a cold-start stage, collaborative filtering isn't working as well as content based filtering. This is because it's not taking into account user preferences, and there's likely not very much overlap in user pins when each user only has 2 or 3. This makes it so even the top recommendations have a low score. However, content-based filtering didn't really take into account the users current pins, which could be useful to gain more insights into their preferences beyond what was indicated on their profile. So next I think it would be worth trying a hybrid approach. 

## Hybrid Approach

There are several approaches to combining collaborative and content-based filtering, but in this case I think it would be good to try a weighted average of the scores from each method: final_score = alpha * content_score + (1 - alpha) * collaborative_score. 

In [173]:
def hybrid_filter(user_id, top_n=5, alpha=0.5):
    """
    Hybrid recommendation combining content-based and collaborative filtering.
    
    Parameters:
    - user_id: ID of the user for whom to recommend items.
    - top_n: Number of top recommendations to return.
    - alpha: Weighting factor between content-based and collaborative scores (0 <= alpha <= 1).
    
    Returns:
    - DataFrame of recommended items.
    """
    # Get content-based recommendations
    content_recs = recommend_items(user_id, top_n=top_n*2)  # Get more to allow for filtering later
    content_scores = np.ones(len(content_recs))  # Placeholder scores for content-based (could be improved)
    
    # Get collaborative filtering recommendations
    collab_recs = model.recommend(
        user_id,
        user_item_matrix[user_id],
        N=top_n*2,
        filter_already_liked_items=True
    )
    #print(collab_recs)
    collab_item_ids = [item_id for item_id, _ in zip(*collab_recs)]
    collab_scores = [score for _, score in zip(*collab_recs)]
#    collab_item_ids, collab_scores = zip(*collab_recs)
    
    # Combine scores
    combined_scores = {}
    
    for idx, item in enumerate(content_recs['item_id']):
        combined_scores[item] = alpha * content_scores[idx]
    
    for idx, item in enumerate(collab_item_ids):
        if item in combined_scores:
            combined_scores[item] += (1 - alpha) * collab_scores[idx]
        else:
            combined_scores[item] = (1 - alpha) * collab_scores[idx]
    
    # Sort by combined score and get top N
    sorted_items = sorted(combined_scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
    recommended_item_ids = [item[0] for item in sorted_items]
    
    return items.loc[items['item_id'].isin(recommended_item_ids)]

Let's test our system with a few examples. 

In [174]:
user_id = 23
recommendations = hybrid_filter(user_id, top_n=5, alpha=0.5)
print(f"User Profile for User ID: {user_id}")
print(users[users['user_id'] == user_id])
print("Saved Items for User ID:", user_id)
liked_ids = interactions[interactions['user_id'] == user_id]["item_id"].values
print(items.loc[items['item_id'].isin(liked_ids)])
print(f"Hybrid Recommendations for user {user_id}:")
print(recommendations)

User Profile for User ID: 23
    user_id                      interests
23       23  ['elegant', 'silk', 'summer']
Saved Items for User ID: 23
    item_id                    title                             tags  \
0         0   Boho Summer Maxi Dress      ['boho', 'dress', 'summer']   
8         8        Ruffle Mini Skirt  ['feminine', 'skirt', 'summer']   
10       10     Elegant Evening Gown    ['elegant', 'gown', 'formal']   
15       15  Floral Print Maxi Skirt     ['floral', 'maxi', 'summer']   
18       18     Woven Straw Tote Bag      ['woven', 'tote', 'summer']   
26       26       Elegant Silk Scarf     ['elegant', 'silk', 'scarf']   

       category  
0       dresses  
8       bottoms  
10      dresses  
15      bottoms  
18  accessories  
26  accessories  
Hybrid Recommendations for user 23:
    item_id                                 title  \
36       36               Floral Print Wrap Dress   
57       57                   Elegant Silk Blouse   
68       68  Elegant Sil

That seems to work way better, at least for this user! Let's try one or two more examples. 

In [175]:
user_id = 16
recommendations = hybrid_filter(user_id, top_n=5, alpha=0.5)
print(f"User Profile for User ID: {user_id}")
print(users[users['user_id'] == user_id])
print("Saved Items for User ID:", user_id)
liked_ids = interactions[interactions['user_id'] == user_id]["item_id"].values
print(items.loc[items['item_id'].isin(liked_ids)])
print(f"Hybrid Recommendations for user {user_id}:")
print(recommendations)

User Profile for User ID: 16
    user_id                          interests
16       16  ['classic', 'casual', 'sneakers']
Saved Items for User ID: 16
    item_id                    title                                    tags  \
6         6     Classic Black Blazer         ['classic', 'blazer', 'formal']   
7         7      Colorblock Sneakers  ['sneakers', 'colorful', 'streetwear']   
11       11    Casual Chambray Shirt            ['casual', 'shirt', 'denim']   
14       14  Retro High-Top Sneakers         ['retro', 'sneakers', 'casual']   
19       19   Classic White Sneakers       ['classic', 'sneakers', 'casual']   
25       25     Casual Slip-On Shoes       ['casual', 'slip-on', 'footwear']   

     category  
6   outerwear  
7    footwear  
11       tops  
14   footwear  
19   footwear  
25   footwear  
Hybrid Recommendations for user 16:
    item_id                                  title  \
44       44               Chunky Platform Sneakers   
49       49                Casua

In [176]:
user_id = 47
recommendations = hybrid_filter(user_id, top_n=5, alpha=0.5)
print(f"User Profile for User ID: {user_id}")
print(users[users['user_id'] == user_id])
print("Saved Items for User ID:", user_id)
liked_ids = interactions[interactions['user_id'] == user_id]["item_id"].values
print(items.loc[items['item_id'].isin(liked_ids)])
print(f"Hybrid Recommendations for user {user_id}:")
print(recommendations)

User Profile for User ID: 47
    user_id                                  interests
47       47  ['elegant pearl earrings', 'accessories']
Saved Items for User ID: 47
    item_id                        title  \
40       40  Vintage-Inspired Sunglasses   
63       63   Elegant Statement Necklace   

                                                tags     category  
40          ['vintage', 'sunglasses', 'accessories']  accessories  
63  ['elegant', 'statement necklace', 'accessories']  accessories  
Hybrid Recommendations for user 47:
    item_id                                 title  \
26       26                    Elegant Silk Scarf   
30       30              Bohemian Beaded Necklace   
50       50                Elegant Pearl Earrings   
54       54      Vintage-Inspired Leather Satchel   
68       68  Elegant Silk Scarf with Floral Print   

                                   tags     category  
26         ['elegant', 'silk', 'scarf']  accessories  
30   ['bohemian', 'necklace', '

In [177]:
user_id = 17
recommendations = hybrid_filter(user_id, top_n=5, alpha=0.25)
print(f"User Profile for User ID: {user_id}")
print(users[users['user_id'] == user_id])
print("Saved Items for User ID:", user_id)
liked_ids = interactions[interactions['user_id'] == user_id]["item_id"].values
print(items.loc[items['item_id'].isin(liked_ids)])
print(f"Hybrid Recommendations for user {user_id}:")
print(recommendations)

User Profile for User ID: 17
    user_id                      interests
17       17  ['graphic', 'tee', 'vintage']
Saved Items for User ID: 17
    item_id                             title  \
2         2              Vintage Denim Jacket   
20       20    Graphic Tee with Vintage Print   
29       29  Vintage-Inspired Mary Jane Flats   
35       35    Graphic Hoodie with Bold Print   
40       40       Vintage-Inspired Sunglasses   
41       41           Casual Graphic Tank Top   

                                        tags     category  
2             ['vintage', 'jacket', 'denim']    outerwear  
20             ['graphic', 'tee', 'vintage']         tops  
29            ['vintage', 'flats', 'casual']     footwear  
35             ['graphic', 'hoodie', 'bold']    outerwear  
40  ['vintage', 'sunglasses', 'accessories']  accessories  
41         ['casual', 'graphic', 'tank top']         tops  
Hybrid Recommendations for user 17:
    item_id                            title  \
4        

I would say both of those look great! Even with only two saved items, the recommendations did seem to really match the user's profile and future directions (e.g. vintage, which wasn't an indicated interest). This shows that even a simple 50/50 hybrid can dramatically improve recommendation quality by balancing personalization and discovery.