# Creating a User Profile Vector

For each user, we want to create an average of the embeddings of the items they pinned. 

## Imports

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Getting the Embeddings & User Data

In [3]:
# Load in the item embeddings
embeddings_df = pd.read_csv('../data/item_embeddings.csv')
# Display the first few rows of the DataFrame
embeddings_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.074143,0.06405,-0.009264,0.046263,-0.027863,-0.048036,0.06423,-0.072758,-0.083279,-0.023699,...,-0.032186,-0.009203,-0.015632,0.013018,0.038265,0.057547,0.033841,-0.040459,-0.005354,-0.016443
1,-0.053667,0.086153,0.008184,-0.027359,0.064028,-0.062513,0.064731,0.053492,-0.086663,0.04204,...,-0.039931,-0.055881,0.070485,0.079482,-0.065805,0.01045,0.083477,-0.020643,0.026802,0.023278
2,-0.103944,0.160174,0.006674,0.032914,0.023432,0.015533,0.124695,-0.012912,-0.071057,0.007715,...,-0.04839,-0.047802,-0.056081,0.034009,0.005964,0.015365,0.026713,-0.099851,-0.039444,0.01975
3,-0.068707,0.033529,0.007046,0.08301,0.072068,0.020052,0.061295,0.038851,0.006501,0.060401,...,-0.073263,-0.021329,-0.063976,0.042687,0.028866,0.015143,-0.02514,-0.103199,-0.039904,0.047723
4,-0.069275,0.071025,0.019378,0.039093,0.075728,-0.033218,0.122828,-0.019662,-0.06492,-0.04029,...,-0.048714,-0.067688,-0.049945,-0.00556,-0.028457,0.019639,-0.037464,-0.115767,-0.017989,0.000391


In [4]:
# Let's load in the user interactions data
user_data = pd.read_csv('../data/fashion_interactions.csv')
# Display the first few rows of the user interactions DataFrame
user_data.head()

Unnamed: 0,user_id,item_id,liked
0,0,0,1
1,0,8,1
2,0,15,1
3,0,18,1
4,0,36,1


This data will tell us the items each user pinned. So, we can use the item id column to grab the item embeddings and average them. 

## Average Embeddings Function

In [5]:
def average_embeddings(user_id, embeddings_df):
    """
    Given a user ID, return the average of the embeddings for that user.
    """
    # Obtain the user interactions for the specified user ID
    user_interactions = user_data[user_data['user_id'] == user_id]

    # Now get the item IDs that the user has interacted with
    item_ids = user_interactions['item_id'].unique()

    # Filter the DataFrame for the user's items
    user_items = embeddings_df.loc[item_ids]
    
    # If the user has no items, return None
    if user_items.empty:
        # Maybe we should return a zero vector instead?
        # return np.zeros(embeddings_df.shape[1] - 1) 
        return None
    
    # Calculate the average embedding vector
    avg_embedding = user_items.iloc[:, 1:].mean(axis=0)
    
    return avg_embedding

In [7]:
# Let's test the function with a specific user ID
user_id = 1 
avg_embedding = average_embeddings(user_id, embeddings_df)
if avg_embedding is not None:
    print(f"Average embedding for user {user_id}: {avg_embedding.values}")
    print(f"Embedding shape: {avg_embedding.shape}")
else:
    print(f"No items found for user {user_id}.")

Average embedding for user 1: [ 3.83241430e-02  2.13052689e-02 -9.66815612e-03  6.27171981e-02
 -2.78365425e-02  8.26389417e-02  1.34489542e-02 -5.27623308e-02
 -3.10910400e-03  2.89081629e-02  4.25635036e-02  1.42713361e-03
 -4.31805681e-02  7.78079695e-03  3.11547335e-02  8.61427234e-03
  1.59476974e-02  1.08184306e-02 -1.69125103e-02 -5.98434278e-02
 -3.79425588e-02 -1.39859168e-02  5.94864438e-02 -8.58081443e-02
  4.27553932e-02  4.72716715e-02  4.07341697e-02 -1.00810421e-02
 -5.70671776e-02 -1.50938503e-02  3.63902506e-02  4.43624814e-02
  5.06508518e-02 -3.39160850e-02 -7.55275612e-02  8.26384226e-02
  4.28899688e-03 -8.38524358e-02  8.42564714e-02 -6.02585617e-02
 -1.00116519e-01 -5.28329511e-02  3.79733105e-03  2.27441730e-02
 -1.26497123e-02  3.27156178e-02  5.64249838e-02 -2.06584939e-02
  7.89601592e-02  9.65337704e-03 -2.36584752e-02 -1.41362662e-02
  1.80402635e-02  3.75414249e-02  3.18143256e-02 -5.90253465e-02
 -1.43962062e-02  3.35102625e-02 -9.61964149e-02  8.97835670

Now that we have a function to convert a user id to an averaged vector of item embeddings, we can create a new dictionary mapping each user id to this output. 

In [10]:
user_profiles = {}
# Iterate through each unique user ID in the user interactions DataFrame
for user_id in user_data['user_id'].unique():
    avg_embedding = average_embeddings(user_id, embeddings_df)
    if avg_embedding is not None:
        user_profiles[int(user_id)] = avg_embedding.values
    else:
        user_profiles[int(user_id)] = np.zeros(embeddings_df.shape[1] - 1)
print(f"Created user profiles for {len(user_profiles)} users.")

Created user profiles for 48 users.


In [11]:
user_profiles

{0: array([ 6.01434913e-02,  3.60614913e-02,  4.81082359e-02,  8.84425702e-03,
        -7.57456773e-03,  6.54499740e-02, -4.86542202e-02, -6.77731537e-02,
         1.42736598e-02,  2.43370316e-02, -3.64300117e-02,  9.64989653e-03,
        -2.77692017e-03,  4.31468234e-02,  7.35541700e-02,  6.43837458e-02,
         2.15122293e-02, -1.52593323e-02, -1.37876258e-02, -5.50717016e-02,
        -6.01857506e-02,  5.63650916e-03,  9.56774199e-02, -3.55797288e-02,
        -2.87640290e-02,  9.87691277e-03,  4.92873893e-02, -2.91729523e-02,
        -6.44770314e-02, -2.99926214e-02,  5.44672025e-02,  1.30546434e-02,
         2.67935496e-02, -1.38713541e-02,  2.56034556e-02,  4.47188510e-02,
        -5.87365906e-02,  1.60252623e-03,  5.56093936e-02, -4.15061861e-02,
        -1.50164389e-02, -7.90766378e-02,  2.70722132e-02,  3.79064288e-02,
         2.86880586e-02,  5.87822764e-02,  4.84806534e-02, -3.47699408e-02,
         4.29671016e-02,  4.46065667e-02,  3.26858795e-02, -5.49285744e-02,
         

## Saving Dataframe of Averaged Vectors

In [16]:
user_profiles_df = pd.DataFrame.from_dict(user_profiles, orient='index')
#user_profiles_df.columns = [f'embedding_{i}' for i in range(user_profiles_df.shape[1])]
#user_profiles_df.index.name = 'user_id'
#user_profiles_df.reset_index(inplace=True)
user_profiles_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,373,374,375,376,377,378,379,380,381,382
0,0.060143,0.036061,0.048108,0.008844,-0.007575,0.06545,-0.048654,-0.067773,0.014274,0.024337,...,-0.05487,-0.012844,-0.018181,0.041335,0.007615,0.011697,-0.027718,-0.063241,0.007207,-0.003882
1,0.038324,0.021305,-0.009668,0.062717,-0.027837,0.082639,0.013449,-0.052762,-0.003109,0.028908,...,0.002024,-0.060947,-0.035074,0.033858,0.003707,-0.009584,0.002526,-0.081802,-0.057961,-0.013604
2,0.063626,-0.022125,0.018907,0.037874,-0.007774,0.043303,0.007892,-0.044934,0.031136,0.045607,...,-0.054719,-0.027572,0.020879,0.054398,-0.066077,0.013572,0.070344,-0.072806,-0.012058,0.03805
3,0.078631,0.007365,0.026449,0.023867,0.000867,0.078875,0.040846,-0.059256,0.005375,-0.013123,...,-0.02719,-0.045285,-0.011149,0.047736,0.016887,0.021995,0.036113,-0.101998,-0.036493,0.005175
4,0.116301,0.017553,0.029933,0.044769,-0.029575,0.086199,-0.01863,-0.05432,0.015293,-0.012321,...,-0.020133,-0.030688,-0.024696,0.026634,0.004515,0.004056,-0.016762,-0.056608,-0.036605,-0.030182


In [18]:
list(user_profiles_df.loc[1])  # Check the profile for user ID 1

[0.03832414303906258,
 0.0213052689408262,
 -0.00966815611657995,
 0.06271719808379804,
 -0.027836542547447583,
 0.08263894170522687,
 0.013448954249421748,
 -0.052762330820163045,
 -0.0031091040000319516,
 0.028908162882241087,
 0.04256350360810754,
 0.0014271336064363505,
 -0.04318056814372534,
 0.007780796945250283,
 0.031154733461638263,
 0.008614272344857466,
 0.015947697412533032,
 0.01081843064942708,
 -0.016912510269321484,
 -0.05984342781205968,
 -0.03794255883743363,
 -0.0139859168169399,
 0.059486443797747236,
 -0.08580814426143961,
 0.042755393156160865,
 0.0472716715497275,
 0.04073416969428455,
 -0.010081042069941731,
 -0.05706717756887272,
 -0.015093850282331282,
 0.0363902506457331,
 0.04436248137305176,
 0.05065085180103777,
 -0.033916085027158246,
 -0.07552756120761232,
 0.08263842264811194,
 0.004288996880253133,
 -0.08385243577261764,
 0.08425647144516306,
 -0.06025856174528593,
 -0.10011651863654449,
 -0.052832951148350994,
 0.003797331048796534,
 0.022744173037305

In [19]:
# Save the user profiles DataFrame to a CSV file
user_profiles_df.to_csv('../data/user_profiles.csv', index=False)