In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# Latent matrix from metadata

In [2]:
!ls ../data

2019-Dec.csv_10%.csv 2019-Oct.csv_10%.csv [1m[36mlatent_dfs[m[m


In [3]:
filename = '../data/2019-Dec.csv_10%.csv'

In [4]:
!pwd

/Users/julioq/code/lewagon-data-apps/1050_batch/eCommerce/notebooks


In [5]:
import random
p = 0.5  # 1% of the lines
# keep the header, then take only 1% of lines
# if random from [0,1] interval is greater than 0.01 the row will be skipped
df = pd.read_csv(
         filename,
         header=0, 
         skiprows=lambda i: i>0 and random.random() > p
)
df.shape

(3375212, 9)

In [6]:
df.shape

(3375212, 9)

In [7]:
df.category_code.nunique()

134

In [8]:
df.columns

Index(['event_time', 'event_type', 'product_id', 'category_id',
       'category_code', 'brand', 'price', 'user_id', 'user_session'],
      dtype='object')

In [9]:
df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-12-01 00:00:07 UTC,view,11500445,2053013552259662037,computers.components.power_supply,xiaomi,27.77,526844203,5e62045f-58f7-4421-9e0f-977a5d070302
1,2019-12-01 00:00:10 UTC,view,37900170,2152167773222993940,,vega,34.05,539618099,2d82cd98-f9df-4498-b81d-0a678ea2eae2
2,2019-12-01 00:00:14 UTC,view,1004249,2232732093077520756,construction.tools.light,apple,794.1,522786545,0fa3ffd9-7417-4d71-9f80-d37621b840c8
3,2019-12-01 00:00:14 UTC,view,17301106,2232732098446229999,apparel.shoes.sandals,hermes,185.33,513133286,ae3fc5fd-dcbe-492b-b420-4e01d85c5ef2
4,2019-12-01 00:00:15 UTC,view,1005253,2232732093077520756,construction.tools.light,xiaomi,261.27,579969926,c23dcd23-c6d2-42d3-8a90-7ea87f3f672f


In [10]:
def preprocessing_feat(X, drop_event_time=False):
    if drop_event_time:
        X = X.drop("event_time", axis=1)
    
    X_preprocessed = X.dropna(subset = ['category_code', 'brand']) #tbd!!
    X_preprocessed = X_preprocessed.drop_duplicates()
    X_preprocessed['category_code'] = X_preprocessed['category_code'].str.replace('.',' ')
    return X_preprocessed

In [11]:
X_preprocessed = preprocessing_feat(df,  drop_event_time=True)

  X_preprocessed['category_code'] = X_preprocessed['category_code'].str.replace('.',' ')


In [12]:
X_preprocessed.shape

(2604755, 8)

## Pricing criterion

In [13]:
def make_column(row):
    if row["price"] < row["25%"]:
        return "low"
    elif row["price"] < row["75%"]:
        return "medium"
    else:
        return "high"

def pricing_criterion(X):
    pricing_guide = X.groupby('category_code')['price'].describe()[["25%", "75%"]].reset_index()
    X_merged = X.merge(pricing_guide, on="category_code", how="right")
    X_merged["price_category"] = X_merged.apply(lambda row: make_column(row), axis=1)
    return X_merged


In [14]:
X_merged = pricing_criterion(X_preprocessed)

In [15]:
def metadata(X):
    X['metadata'] = X[['category_code', 'brand', 'price_category']].apply(lambda x: ' '.join(x), axis = 1)
    return X


In [16]:
X_meta = metadata(X_merged)
X_meta.nunique()

event_type              3
product_id          89266
category_id           840
category_code         134
brand                3800
price               41117
user_id           1225705
user_session      2127836
25%                   121
75%                   125
price_category          3
metadata            15160
dtype: int64

In [17]:
X_meta.set_index(X_meta['product_id'], inplace=True)

In [18]:
print(X_meta.shape)
X_meta.head()

(2604755, 12)


Unnamed: 0_level_0,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,25%,75%,price_category,metadata
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
16200339,view,16200339,2232732108453839552,accessories bag,huggies,17.5,579851217,898366d8-3605-4dff-93c7-6d233ae5bb04,29.59,73.88,low,accessories bag huggies low
28401034,view,28401034,2053013566209917945,accessories bag,picard,142.86,514567336,41960d57-ea88-4c83-873c-cdc2e8e27e85,29.59,73.88,high,accessories bag picard high
28400999,view,28400999,2232732082935693457,accessories bag,picard,45.05,514567336,41960d57-ea88-4c83-873c-cdc2e8e27e85,29.59,73.88,medium,accessories bag picard medium
28400062,view,28400062,2232732082935693457,accessories bag,roncato,50.97,578575518,ee45a262-2389-4ce4-9076-7986ab68cf28,29.59,73.88,medium,accessories bag roncato medium
16200316,view,16200316,2232732108453839552,accessories bag,huggies,17.5,577054321,6f60f981-907c-4dd7-80a5-8f46fa2711b3,29.59,73.88,low,accessories bag huggies low


### Count Vectorizer

In [19]:
count = CountVectorizer()
count_matrix = count.fit_transform(X_meta['metadata'])
count_df_1 = pd.DataFrame(count_matrix.toarray(), index=X_meta.product_id.tolist())
print(count_df_1.shape)
count_df_1.head()

(2604755, 3990)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3980,3981,3982,3983,3984,3985,3986,3987,3988,3989
16200339,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
28401034,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
28400999,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
28400062,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
16200316,0,0,0,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [20]:
count_df_1 = count_df_1.reset_index()

In [21]:
# len(list(set(count_df_1.index)))#.nunique()

In [None]:
count_df_1.drop_duplicates(subset='index', keep='first', inplace=True) 
count_df_1["product_id"] = count_df_1["index"]
count_df_1.drop("index", axis=1, inplace=True)
count_df_1.head()

### Dimensionality reduction

In [None]:
n = 50
svd = TruncatedSVD(n_components=n)
latent_df_1 = svd.fit_transform(count_df_1.set_index("product_id"))

In [None]:
plt.plot(svd.explained_variance_ratio_.cumsum())
plt.xlabel('number of singular value components')
plt.ylabel('Cumulative percent of variance')   
plt.grid()
plt.show()

In [None]:
#count_df_1=count_df_1.set_index("product_id")
#count_df_1.head()
df_for_latent1 = count_df_1.product_id.tolist()

In [None]:
#n defined above
#n = 50
latent_df_1 = pd.DataFrame(latent_df_1[:,0:n], index=df_for_latent1)
print(latent_df_1.shape)
latent_df_1

In [None]:
latent_df_1.reset_index().drop_duplicates().shape

In [None]:
latent_df_1

In [None]:
! ls ../data/latent_dfs/

In [None]:
filename = f'../data/latent_dfs/latent_df_1_with_{p}%_data_{n}_svd_components.csv'
latent_df_1.to_csv(filename)

In [None]:
#X_meta['product_id'].nunique()

# Latent matrix from event types

In [None]:
def preprocessing_event(X):
    X_preprocessed = X.dropna(subset = ['category_code', 'brand']) #tbd!!
    X_preprocessed = X_preprocessed.drop_duplicates()
    return X_preprocessed

In [None]:
df_event = preprocessing_event(df)
df_event.user_id.nunique()

In [None]:
dct = {'view': 1, 'cart': 3, 'purchase': 5}

In [None]:
df_event['rating'] = df_event['event_type'].map(dct)

In [None]:
df_event["rating"].sum()

In [None]:
# df_rating.shape

In [None]:
# df_rating = df_event.pivot_table(values='rating',
#                                  index='product_id',
#                                  columns='user_id',
#                                  aggfunc="sum").fillna(0)
# df_rating

In [None]:
df_event.nunique()

In [None]:
df_event.drop_duplicates(subset='product_id',inplace=True)
df_event.nunique()

In [None]:
%%time

df_rating = df_event.pivot(values='rating',
               index='product_id',
               columns='user_id').fillna(0)

df_rating

In [None]:
df_rating.sum().reset_index()[0].sum()

In [None]:
df_event.groupby("user_id").agg({"rating":sum}).sort_values(by="rating").sum()

In [None]:
df_event.groupby("user_id").agg({"rating":sum}).sort_values(by="rating").sum()

In [None]:
df_rating.reset_index()

In [None]:
df_rating

In [None]:
# df_rating.index

In [None]:
# df_rating.columns

In [None]:
# df_rating_reset_index_no_productid_col = df_rating.reset_index().iloc[:,1:]
# df_rating_reset_index_no_productid_col

In [None]:
# n_s = 75
# svd = TruncatedSVD(n_components=n_s)
# latent_df_3 = svd.fit_transform(df_rating_reset_index_no_productid_col)
# latent_df_3

In [None]:
n_s = 200
svd = TruncatedSVD(n_components=n_s)
latent_df_2 = svd.fit_transform(df_rating)

In [None]:
latent_df_2

In [None]:
# latent_df_2 = pd.DataFrame(latent_df_2, index=df_rating.index)
# latent_df_2

In [None]:
plt.plot(svd.explained_variance_ratio_.cumsum())
plt.xlabel('number of singular value components')
plt.ylabel('Cumulative percent of variance')   
plt.grid()
plt.show()

In [None]:
latent_df_2 = pd.DataFrame(latent_df_2, index=df_rating.reset_index().product_id.tolist())
latent_df_2.head()

In [None]:
latent_df_2.shape

In [None]:
filename_2 = f'../data/latent_dfs/latent_df_2_with_{p}%_data_{n_s}_svd_components.csv'
latent_df_2.to_csv(filename_2)

# Apply Cosine Similarity in Content and Collaborative Matrices


In [None]:
product_id=5100337

In [None]:
# Get the latent vectors for "Toy Story" from content and collaborative matrices
v1 = np.array(latent_df_1.loc[product_id]).reshape(1, -1)
v2 = np.array(latent_df_2.loc[product_id]).reshape(1, -1)

# Compute the cosine similarity of this movie with the others in the list
sim1 = cosine_similarity(latent_df_1, v1).reshape(-1)
sim2 = cosine_similarity(latent_df_2, v2).reshape(-1)

In [None]:
dictDf_1 = {'features': sim1} 
recommendation_df_1 = pd.DataFrame(dictDf_1, index = latent_df_1.index)
# recommendation_df_1.sort_values('features', ascending=False, inplace=True)

In [None]:
dictDf_2 = {'ratings': sim2} 
recommendation_df_2 = pd.DataFrame(dictDf_2, index = latent_df_2.index)
# recommendation_df_2.sort_values('ratings', ascending=False, inplace=True)

In [None]:
weight_features = 0.8

In [None]:
recommendation_combined = pd.merge(recommendation_df_1, recommendation_df_2, left_index=True, right_index=True)
recommendation_combined['hybrid'] = ((weight_features*recommendation_combined['features'] + (1-weight_features)*recommendation_combined['ratings']))

In [None]:
recommendation_combined.sort_values('ratings', ascending=False, inplace=True)
recommendation_combined

In [None]:
recommendation_combined.head()

In [None]:
def top_n_products(rec_df, meta_df, n=10, ranking='hybrid'):
    
    """Valid inouts for ranking: 'features', 'ratings', 'hybrid'"""
    
    feat_idx = rec_df.sort_values(ranking, ascending=False).index
    
    
    return feat_idx
    
#     for i in feat_idx:
#         meta_text = meta_df[meta_df['product_id'] == i][['metadata']].iloc[0,:][0]
#         lst = []
#         lst.append(meta_text)
#         return lst

In [None]:
indexes = top_n_products(recommendation_combined, X_meta, ranking='features')

In [None]:
len(indexes)

In [None]:
X_meta[X_meta.index.isin(indexes)]['metadata']

In [None]:
# X_meta.head()

In [None]:
def top_n_products(rec_df, meta_df, n=10, ranking='hybrid'):
    
    """Valid inputs for ranking: 'features', 'ratings', 'hybrid'"""

    feat_idx = rec_df.sort_values(ranking, ascending=False).index
    
    counter = 0
    
    product_ids=[]
    metas=[]
    prices=[]
    
    for i in feat_idx:
        meta_text = meta_df[meta_df['product_id'] == i][['metadata']].iloc[0,:][0]
        price = meta_df[meta_df['product_id'] == i][['price']].iloc[0,:][0]
        
#         if counter == 0:
#             print(f"Top {n} recommendations for product_id {product_id}:")
#             print(f"{i} - {meta_text} {price} \n")
#             print("-----------------------")

#         else:
#             print(f"Rec {counter}) {i} - {meta_text} {price}")
        
        product_ids.append(i)
        metas.append(meta_text)
        prices.append(price)
        
        counter += 1
    
    return pd.DataFrame({'product_id':product_ids, 'meta_text':metas, 'price':prices})

In [None]:
# top_n_products(recommendation_combined, X_meta, ranking='features')

In [None]:
# new_df = top_n_products(recommendation_combined, X_meta, ranking='features')

In [None]:
# new_df.drop_duplicates('meta_text').iloc[:10,:]

In [None]:
def top_n_products(rec_df, meta_df, n=10, ranking='hybrid'):
    
    """Valid inputs for ranking: 'features', 'ratings', 'hybrid'"""

    feat_idx = rec_df.sort_values(ranking, ascending=False).index
    
    counter = 0
    
    product_ids=[]
    metas=[]
    prices=[]
    
    for i in feat_idx:
        meta_text = meta_df[meta_df['product_id'] == i][['metadata']].iloc[0,:][0]
        price = meta_df[meta_df['product_id'] == i][['price']].iloc[0,:][0]
        
#         if counter == 0:
#             print(f"Top {n} recommendations for product_id {product_id}:")
#             print(f"{i} - {meta_text} {price} \n")
#             print("-----------------------")

#         else:
#             print(f"Rec {counter}) {i} - {meta_text} {price}")
        
        product_ids.append(i)
        metas.append(meta_text)
        prices.append(price)
        
        counter += 1
    
    new_df = pd.DataFrame({'product_id':product_ids, 'meta_text':metas, 'price':prices}).drop_duplicates('meta_text').iloc[:n,:]
    return new_df.reset_index().drop(columns='index')

In [None]:
# top_n_products(recommendation_combined, X_meta, ranking='features', n=10)