In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

# Latent matrix from metadata

In [None]:
filename = '../data/2019_oct_sort_1000.csv'

In [None]:
!pwd

In [None]:
import random

# Fraction of data rows to keep, in [0, 1]. random.random() returns a value in
# [0, 1), so p = 1 keeps every row; set p = 0.01 for a ~1% sample.
p = 1
# Dedicated, seeded generator so that a sub-sample (p < 1) is the same after
# every kernel restart.
_sample_rng = random.Random(42)
# Row 0 is the header and is always kept; any other row is skipped whenever its
# random draw is greater than p.
df = pd.read_csv(
    filename,
    header=0,
    skiprows=lambda i: i > 0 and _sample_rng.random() > p,
)
df.shape

In [None]:
df.shape

In [None]:
df.category_code.nunique()

In [None]:
df.columns

In [None]:
df.head()

In [None]:
def preprocessing_feat(X, drop_event_time=False):
    """Clean the raw event frame before building the metadata features.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain ``category_code`` and ``brand`` columns, plus
        ``event_time`` when ``drop_event_time`` is true.
    drop_event_time : bool, default False
        Drop the ``event_time`` column before de-duplicating, so rows that
        differ only by timestamp collapse into one.

    Returns
    -------
    pandas.DataFrame
        A new frame (``X`` is not modified) without rows that lack
        ``category_code`` or ``brand``, without exact duplicate rows, and with
        every dot in ``category_code`` replaced by a space.
    """
    if drop_event_time:
        X = X.drop(columns="event_time")

    # Original author's marker on this filter: "tbd!!" (still provisional).
    cleaned = X.dropna(subset=['category_code', 'brand']).drop_duplicates()

    # regex=False makes '.' a literal dot regardless of the pandas version's
    # default; assign() returns a new frame instead of writing into a filtered
    # one, which avoids SettingWithCopyWarning.
    return cleaned.assign(
        category_code=cleaned['category_code'].str.replace('.', ' ', regex=False)
    )

In [None]:
X_preprocessed = preprocessing_feat(df,  drop_event_time=True)

In [None]:
X_preprocessed.shape

## Pricing criterion

In [None]:
def make_column(row):
    """Label a row's price against the quartiles stored on the same row.

    ``row`` must expose ``"price"``, ``"25%"`` and ``"75%"``. Returns
    ``"low"`` below the 25% value, ``"medium"`` below the 75% value and
    ``"high"`` otherwise.
    """
    price = row["price"]
    if price < row["25%"]:
        return "low"
    if price < row["75%"]:
        return "medium"
    return "high"

def pricing_criterion(X):
    """Attach per-category price quartiles and a low/medium/high label.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain ``category_code`` and ``price`` columns.

    Returns
    -------
    pandas.DataFrame
        ``X`` merged with its category's ``"25%"`` and ``"75%"`` price
        quantiles, plus a ``price_category`` column: ``"low"`` when the price
        is below the 25% value, ``"medium"`` when below the 75% value,
        ``"high"`` otherwise.
    """
    pricing_guide = (
        X.groupby('category_code')['price']
        .describe()[["25%", "75%"]]
        .reset_index()
    )
    # how="right" is kept as in the original: the result follows the key order
    # of the pricing guide.
    X_merged = X.merge(pricing_guide, on="category_code", how="right")

    # Vectorised form of the row-by-row rule; the comparisons and their order
    # are unchanged, only the Python-level loop over rows is gone.
    price = X_merged["price"]
    X_merged["price_category"] = np.where(
        price < X_merged["25%"],
        "low",
        np.where(price < X_merged["75%"], "medium", "high"),
    )
    return X_merged


In [None]:
X_merged = pricing_criterion(X_preprocessed)

In [None]:
def metadata(X):
    """Add a ``metadata`` text column to ``X`` and return ``X``.

    The column is the space-separated concatenation of ``category_code``,
    ``brand`` and ``price_category`` for each row. ``X`` is modified in place;
    the returned object is the same frame.
    """
    def _join_row(parts):
        return ' '.join(parts)

    text_columns = ['category_code', 'brand', 'price_category']
    X['metadata'] = X[text_columns].apply(_join_row, axis=1)
    return X


In [None]:
X_meta = metadata(X_merged)
X_meta.nunique()

In [None]:
# Index the frame by product_id. A Series (not a column label) is passed, so
# the product_id column itself stays in the frame.
# NOTE(review): the index and a column now share the name "product_id"; this is
# presumably why the CSV reloaded further down carries a 'product_id.1' column
# -- confirm against the cell that writes that file.
X_meta.set_index(X_meta['product_id'], inplace=True)

In [None]:
print(X_meta.shape)
X_meta.head()

### Count Vectorizer

In [None]:
# Bag-of-words token counts over the 'metadata' strings: one output row per row
# of X_meta, one column per vocabulary term.
count = CountVectorizer()
count_matrix = count.fit_transform(X_meta['metadata'])
# toarray() turns the vectorizer output into a dense array; rows are labelled
# with the product ids taken from X_meta.
# NOTE(review): a dense rows-x-vocabulary frame may be large on the full data
# set -- confirm it fits in memory.
count_df_1 = pd.DataFrame(count_matrix.toarray(), index=X_meta.product_id.tolist())
print(count_df_1.shape)
count_df_1.head()

In [None]:
count_df_1 = count_df_1.reset_index()

In [None]:
# After reset_index() the product ids live in the 'index' column and the
# frame's own index is a plain RangeIndex, so counting the index would only
# return the number of rows. Count distinct product ids instead.
count_df_1['index'].nunique()

In [None]:
count_df_1.drop_duplicates(subset='index', keep='first', inplace=True) 
count_df_1["product_id"] = count_df_1["index"]
count_df_1.drop("index", axis=1, inplace=True)
count_df_1.head()

### Dimensionality reduction

In [None]:
n = 50
# random_state pins TruncatedSVD's randomized solver so the latent factors are
# the same on every Restart & Run All.
svd = TruncatedSVD(n_components=n, random_state=42)
latent_df_1 = svd.fit_transform(count_df_1.set_index("product_id"))

In [None]:
plt.plot(svd.explained_variance_ratio_.cumsum())
plt.xlabel('number of singular value components')
plt.ylabel('Cumulative percent of variance')   
plt.grid()
plt.show()

In [None]:
#count_df_1=count_df_1.set_index("product_id")
#count_df_1.head()
df_for_latent1 = count_df_1.product_id.tolist()

In [None]:
#n defined above
#n = 50
latent_df_1 = pd.DataFrame(latent_df_1[:,0:n], index=df_for_latent1)
print(latent_df_1.shape)
latent_df_1

In [None]:
latent_df_1.reset_index().drop_duplicates().shape

In [None]:
#X_meta['product_id'].nunique()

# Latent matrix from event types

In [None]:
def preprocessing_event(X):
    """Return the rows of ``X`` usable for the event-based ratings.

    Rows missing ``category_code`` or ``brand`` are removed, then exact
    duplicate rows are removed. ``X`` itself is left untouched.
    """
    required = ['category_code', 'brand']
    return X.dropna(subset=required).drop_duplicates()  # tbd!!

In [None]:
df_event = preprocessing_event(df)
df_event.user_id.nunique()

In [None]:
dct = {'view': 1, 'cart': 3, 'purchase': 5}

In [None]:
df_event['rating'] = df_event['event_type'].map(dct)

In [None]:
df_event["rating"].sum()

In [None]:
# df_rating.shape

In [None]:
# df_rating = df_event.pivot_table(values='rating',
#                                  index='product_id',
#                                  columns='user_id',
#                                  aggfunc="sum").fillna(0)
# df_rating

In [None]:
df_event.nunique()

In [None]:
# Keeps only the first remaining row for each product_id (default
# keep='first'), so the pivot in the next cell sees every product_id once.
# NOTE(review): this leaves a single (user_id, rating) entry per product, so
# each row of df_rating has at most one non-zero value. That is probably not
# what a collaborative-filtering matrix should look like. Summing ratings per
# (product_id, user_id), as in the commented-out pivot_table cell above, would
# keep every interaction -- confirm, and check memory first (a dense
# products x users frame may not fit; a sparse matrix is likely needed).
df_event.drop_duplicates(subset='product_id',inplace=True)
df_event.nunique()

In [None]:
%%time

df_rating = df_event.pivot(values='rating',
               index='product_id',
               columns='user_id').fillna(0)

df_rating

In [None]:
df_rating.sum().reset_index()[0].sum()

In [None]:
df_event.groupby("user_id").agg({"rating":sum}).sort_values(by="rating").sum()

In [None]:
df_event.groupby("user_id").agg({"rating":sum}).sort_values(by="rating").sum()

In [None]:
df_rating.reset_index()

In [None]:
# NOTE(review): 75 components here, but the cached file loaded further down is
# named '..._100_svd_components.csv' and has 100 columns -- confirm which value
# is intended.
# random_state pins TruncatedSVD's randomized solver so the latent factors are
# the same on every Restart & Run All.
svd = TruncatedSVD(n_components=75, random_state=42)
latent_df_2 = svd.fit_transform(df_rating)

In [None]:
plt.plot(svd.explained_variance_ratio_.cumsum())
plt.xlabel('number of singular value components')
plt.ylabel('Cumulative percent of variance')   
plt.grid()
plt.show()

In [None]:
latent_df_2 = pd.DataFrame(latent_df_2, index=df_rating.reset_index().product_id.tolist())
latent_df_2.head()

In [None]:
latent_df_2.shape

In [8]:
filename_latent_df_1 = '../data/latent_df_1_with_0.75_data_50_svd_components.csv'

In [9]:
filename_latent_df_2 = '../data/latent_df_2_with_0.75_data_100_svd_components.csv'

In [10]:
filename_meta = '../data/X_meta_with_0.75_data.csv'

In [11]:
latent_df_1 = pd.read_csv(
         filename_latent_df_1, index_col=[0]
)
latent_df_1.shape

(97678, 50)

In [12]:
latent_df_2 = pd.read_csv(
         filename_latent_df_2, index_col=[0]
)
latent_df_2.shape

(97678, 100)

In [13]:
latent_df_2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
1000894,-1.835656e-14,4.52079e-13,-7.985596e-13,-5.905201e-13,1.07343e-12,1.614425e-13,-1.716163e-12,-6.534994e-13,1.819376e-13,-8.999795e-13,...,-2.592643e-11,-2.201411e-10,-1.872037e-10,-2.487008e-10,-8.921736e-11,2.005085e-11,-2.275451e-10,4.343912e-10,-6.525066e-10,-3.039636e-10
1000978,-2.941788e-14,-1.021807e-13,-1.173152e-14,-2.816329e-13,1.834223e-12,3.979463e-13,1.633979e-12,-7.425283e-13,3.71158e-12,2.338082e-12,...,-4.022742e-11,-3.638857e-10,3.895402e-12,2.302426e-10,-1.132427e-10,5.862918e-11,-1.224797e-10,5.106937e-11,-1.402225e-10,2.274363e-11
1001588,3.516446e-14,6.643427e-13,5.443896e-13,5.595949e-13,8.009927e-13,-2.208433e-13,-8.459085e-14,3.144015e-13,9.922731e-13,-1.052064e-11,...,3.107847e-10,3.567679e-11,-4.318511e-10,-3.367824e-10,5.466734e-11,-1.559944e-10,1.840067e-10,2.248758e-10,-3.799162e-11,2.254247e-10
1001605,4.855821e-14,-3.268435e-13,-4.457225e-13,2.409434e-13,7.628716e-13,3.947373e-13,6.325431e-13,-1.888941e-12,3.595318e-12,-3.991449e-12,...,-9.184417e-11,2.393336e-10,6.375184e-12,2.03061e-10,-4.624885e-10,-1.592591e-10,-2.892722e-10,-3.349816e-10,3.38815e-10,-3.307759e-10
1001606,1.424618e-14,-8.109685e-14,3.45739e-13,-8.807927e-13,9.649459e-14,2.09901e-12,8.082827e-13,-2.844927e-13,2.039062e-12,5.939513e-12,...,-3.097525e-10,4.939785e-11,-4.056259e-10,-3.369652e-10,-9.648606e-12,-3.077545e-11,2.516165e-10,5.57032e-10,-3.096536e-10,2.338247e-10


In [57]:
X_meta = pd.read_csv(
         filename_meta, index_col=[0]
)

In [64]:
# Keep only the columns used by the recommender and give the product id column
# its plain name back ('product_id.1' is the name it has in the loaded CSV).
# rename() is chained rather than called with inplace=True: renaming in place
# on a column subset can trigger SettingWithCopyWarning.
X_meta = X_meta[['product_id.1', 'price', 'metadata']].rename(
    columns={'product_id.1': 'product_id'}
)

In [65]:
X_meta.head()

Unnamed: 0_level_0,product_id,price,metadata
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
28400014,28400014,32.95,accessories bag roncato medium
52100003,52100003,86.21,accessories bag nike high
16000911,16000911,24.51,accessories bag metrot low
16200339,16200339,17.5,accessories bag huggies low
16200278,16200278,4.07,accessories bag huggies low


In [66]:
# X_meta.drop(columns='product_id.1', inplace=True)

In [67]:
X_meta.head()

Unnamed: 0_level_0,product_id,price,metadata
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
28400014,28400014,32.95,accessories bag roncato medium
52100003,52100003,86.21,accessories bag nike high
16000911,16000911,24.51,accessories bag metrot low
16200339,16200339,17.5,accessories bag huggies low
16200278,16200278,4.07,accessories bag huggies low


In [68]:
latent_df_1.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
1000894,0.479679,0.703705,-0.347103,-0.527718,0.653986,-0.48669,-0.036662,-0.083995,-0.047555,-0.044862,...,0.05426,0.190516,0.025682,-0.011648,-0.058354,0.053954,0.362089,-0.295604,-0.150441,0.200665
1000978,1.177724,-0.324887,0.81388,-0.550515,-0.759678,-0.284363,-0.160556,-0.054464,-0.050674,-0.09083,...,-0.002556,0.341302,0.000729,0.066601,-0.008678,0.103298,0.208929,-0.285964,-0.170319,0.338
1001588,0.469582,0.767302,-0.390873,-0.540211,0.255217,0.894777,-0.774265,-0.731184,-0.12205,-0.120928,...,0.002785,-0.029138,0.009288,-0.003077,-0.001505,-0.019884,-0.050599,-0.020738,0.003819,0.016766
1001605,0.410104,0.302438,0.591097,0.386202,0.975642,-0.309749,-0.028792,-0.085603,-0.038355,-0.016856,...,0.045989,0.347607,0.073013,0.013252,-0.056804,0.067532,0.344986,-0.303446,-0.117945,0.161882
1001606,0.410104,0.302438,0.591097,0.386202,0.975642,-0.309749,-0.028792,-0.085603,-0.038355,-0.016856,...,0.045989,0.347607,0.073013,0.013252,-0.056804,0.067532,0.344986,-0.303446,-0.117945,0.161882


In [69]:
latent_df_2.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,90,91,92,93,94,95,96,97,98,99
1000894,-1.835656e-14,4.52079e-13,-7.985596e-13,-5.905201e-13,1.07343e-12,1.614425e-13,-1.716163e-12,-6.534994e-13,1.819376e-13,-8.999795e-13,...,-2.592643e-11,-2.201411e-10,-1.872037e-10,-2.487008e-10,-8.921736e-11,2.005085e-11,-2.275451e-10,4.343912e-10,-6.525066e-10,-3.039636e-10
1000978,-2.941788e-14,-1.021807e-13,-1.173152e-14,-2.816329e-13,1.834223e-12,3.979463e-13,1.633979e-12,-7.425283e-13,3.71158e-12,2.338082e-12,...,-4.022742e-11,-3.638857e-10,3.895402e-12,2.302426e-10,-1.132427e-10,5.862918e-11,-1.224797e-10,5.106937e-11,-1.402225e-10,2.274363e-11
1001588,3.516446e-14,6.643427e-13,5.443896e-13,5.595949e-13,8.009927e-13,-2.208433e-13,-8.459085e-14,3.144015e-13,9.922731e-13,-1.052064e-11,...,3.107847e-10,3.567679e-11,-4.318511e-10,-3.367824e-10,5.466734e-11,-1.559944e-10,1.840067e-10,2.248758e-10,-3.799162e-11,2.254247e-10
1001605,4.855821e-14,-3.268435e-13,-4.457225e-13,2.409434e-13,7.628716e-13,3.947373e-13,6.325431e-13,-1.888941e-12,3.595318e-12,-3.991449e-12,...,-9.184417e-11,2.393336e-10,6.375184e-12,2.03061e-10,-4.624885e-10,-1.592591e-10,-2.892722e-10,-3.349816e-10,3.38815e-10,-3.307759e-10
1001606,1.424618e-14,-8.109685e-14,3.45739e-13,-8.807927e-13,9.649459e-14,2.09901e-12,8.082827e-13,-2.844927e-13,2.039062e-12,5.939513e-12,...,-3.097525e-10,4.939785e-11,-4.056259e-10,-3.369652e-10,-9.648606e-12,-3.077545e-11,2.516165e-10,5.57032e-10,-3.096536e-10,2.338247e-10


# Apply Cosine Similarity in Content and Collaborative Matrices


In [20]:
product_id=52100003

In [21]:
# Get the latent vectors of the selected product (product_id) from the content
# and collaborative matrices, each reshaped to a single-row 2-D array
v1 = np.array(latent_df_1.loc[product_id]).reshape(1, -1)
v2 = np.array(latent_df_2.loc[product_id]).reshape(1, -1)

# Compute the cosine similarity of this product with every row of each latent
# matrix; the result is flattened to one score per row
sim1 = cosine_similarity(latent_df_1, v1).reshape(-1)
sim2 = cosine_similarity(latent_df_2, v2).reshape(-1)

In [22]:
dictDf_1 = {'features': sim1} 
recommendation_df_1 = pd.DataFrame(dictDf_1, index = latent_df_1.index)
# recommendation_df_1.sort_values('features', ascending=False, inplace=True)

In [23]:
dictDf_2 = {'ratings': sim2} 
recommendation_df_2 = pd.DataFrame(dictDf_2, index = latent_df_2.index)
# recommendation_df_2.sort_values('ratings', ascending=False, inplace=True)

In [24]:
weight_features = 0.8

In [25]:
recommendation_combined = pd.merge(recommendation_df_1, recommendation_df_2, left_index=True, right_index=True)
recommendation_combined['hybrid'] = ((weight_features*recommendation_combined['features'] + (1-weight_features)*recommendation_combined['ratings']))

In [26]:
recommendation_combined.sort_values('ratings', ascending=False, inplace=True)
recommendation_combined

Unnamed: 0,features,ratings,hybrid
52100003,1.000000,1.000000,1.000000
26205459,0.284285,0.504078,0.328244
100045198,0.284285,0.504078,0.328244
100013032,0.028050,0.495124,0.121464
21411667,0.000459,0.492685,0.098905
...,...,...,...
21412040,0.333792,-0.492154,0.168603
21402544,-0.000405,-0.492154,-0.098755
45601227,0.003253,-0.496981,-0.096794
25100316,0.001559,-0.536462,-0.106045


In [27]:
recommendation_combined.head()

Unnamed: 0,features,ratings,hybrid
52100003,1.0,1.0,1.0
26205459,0.284285,0.504078,0.328244
100045198,0.284285,0.504078,0.328244
100013032,0.02805,0.495124,0.121464
21411667,0.000459,0.492685,0.098905


In [28]:
def top_n_products(rec_df, meta_df, n=10, ranking='hybrid'):
    """Return the index of ``rec_df`` ordered from highest to lowest score.

    Valid inputs for ranking: 'features', 'ratings', 'hybrid'.
    ``meta_df`` and ``n`` are accepted but not used by this version.
    """
    ordered = rec_df.sort_values(by=ranking, ascending=False)
    return ordered.index
    
#     for i in feat_idx:
#         meta_text = meta_df[meta_df['product_id'] == i][['metadata']].iloc[0,:][0]
#         lst = []
#         lst.append(meta_text)
#         return lst

In [29]:
indexes = top_n_products(recommendation_combined, X_meta, ranking='features')

In [30]:
len(indexes)

97678

In [31]:
X_meta[X_meta.index.isin(indexes)]['metadata']

product_id
28400014        accessories bag roncato medium
52100003             accessories bag nike high
16000911            accessories bag metrot low
16200339           accessories bag huggies low
16200278           accessories bag huggies low
                           ...                
25510241         stationery cartrige gerat low
25510206      stationery cartrige gerat medium
12900414     stationery cartrige europrint low
25510333         stationery cartrige gerat low
4100339     stationery cartrige microsoft high
Name: metadata, Length: 3857156, dtype: object

In [32]:
X_meta.head()

Unnamed: 0_level_0,event_type,category_id,category_code,brand,price,user_id,user_session,25%,75%,price_category,metadata
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
28400014,view,2232732082935693457,accessories bag,roncato,32.95,523916611,d5197bd8-b358-4af0-bc8f-7026ef6a5a5c,28.83,73.88,medium,accessories bag roncato medium
52100003,view,2232732097255047630,accessories bag,nike,86.21,553813867,282e31fc-78b0-417c-bb5b-bcedc09e014f,28.83,73.88,high,accessories bag nike high
16000911,view,2053013556856619499,accessories bag,metrot,24.51,575038588,4f70cdf9-9881-4112-a955-4cfafb9f6f4f,28.83,73.88,low,accessories bag metrot low
16200339,view,2232732108453839552,accessories bag,huggies,17.5,579851217,898366d8-3605-4dff-93c7-6d233ae5bb04,28.83,73.88,low,accessories bag huggies low
16200278,view,2232732108453839552,accessories bag,huggies,4.07,512669930,ea20f617-ff8f-425f-a19a-e1489dc2fc35,28.83,73.88,low,accessories bag huggies low


In [40]:
def top_n_products(rec_df, meta_df, n=10, ranking='hybrid'):
    """List every ranked product with its metadata text and price.

    Parameters
    ----------
    rec_df : pandas.DataFrame
        Similarity scores indexed by product id, with a column named after
        ``ranking``.
    meta_df : pandas.DataFrame
        Must contain ``product_id``, ``metadata`` and ``price`` columns. When a
        product has several rows, the first one is used.
    n : int, default 10
        Accepted for interface compatibility; this version returns all rows.
    ranking : str, default 'hybrid'
        Valid inputs for ranking: 'features', 'ratings', 'hybrid'.

    Returns
    -------
    pandas.DataFrame
        Columns ``product_id``, ``meta_text`` and ``price``, ordered from the
        highest to the lowest ``ranking`` score. Products that have no row in
        ``meta_df`` are left out.
    """
    ranked_ids = rec_df.sort_values(ranking, ascending=False).index

    # One lookup table built once, instead of scanning meta_df twice for every
    # ranked product. reset_index(drop=True) discards any existing index so a
    # same-named 'product_id' index level cannot get in the way.
    first_meta = (
        meta_df.reset_index(drop=True)
        .drop_duplicates('product_id', keep='first')
        .set_index('product_id')
    )

    # Products without metadata are skipped rather than failing the lookup.
    ranked_ids = ranked_ids[ranked_ids.isin(first_meta.index)]
    ranked = first_meta.loc[ranked_ids]

    return pd.DataFrame({
        'product_id': ranked.index.to_numpy(),
        'meta_text': ranked['metadata'].to_numpy(),
        'price': ranked['price'].to_numpy(),
    })

In [41]:
top_n_products(recommendation_combined, X_meta, ranking='features')

Unnamed: 0,product_id.1,meta_text,price
0,52100003,accessories bag nike high,86.21
1,100046296,accessories bag nike high,107.85
2,100046300,accessories bag nike high,120.72
3,52100005,accessories bag nike medium,73.36
4,100046299,accessories bag nike high,107.85
...,...,...,...
97673,28715330,computers peripherals mouse respect medium,59.20
97674,28717441,computers peripherals mouse respect medium,44.53
97675,28717361,computers peripherals mouse respect medium,59.20
97676,28716657,computers peripherals mouse respect medium,53.80


In [None]:
new_df = top_n_products(recommendation_combined, X_meta, ranking='features')

In [None]:
new_df.drop_duplicates('meta_text').iloc[:10,:]

In [70]:
def top_n_products(rec_df, meta_df, n=10, ranking='hybrid'):
    """Return the top ``n`` ranked products with distinct metadata text.

    Parameters
    ----------
    rec_df : pandas.DataFrame
        Similarity scores indexed by product id, with a column named after
        ``ranking``.
    meta_df : pandas.DataFrame
        Must contain ``product_id``, ``metadata`` and ``price`` columns. When a
        product has several rows, the first one is used.
    n : int, default 10
        Maximum number of rows to return.
    ranking : str, default 'hybrid'
        Valid inputs for ranking: 'features', 'ratings', 'hybrid'.

    Returns
    -------
    pandas.DataFrame
        Columns ``product_id``, ``meta_text`` and ``price`` with a fresh
        0..k-1 index, ordered from the highest to the lowest ``ranking`` score.
        Only the best-ranked product is kept for each distinct ``meta_text``.
        Products that have no row in ``meta_df`` are left out.
    """
    ranked_ids = rec_df.sort_values(ranking, ascending=False).index

    # One lookup table built once, instead of scanning meta_df twice for every
    # ranked product. reset_index(drop=True) discards any existing index so a
    # same-named 'product_id' index level cannot get in the way.
    first_meta = (
        meta_df.reset_index(drop=True)
        .drop_duplicates('product_id', keep='first')
        .set_index('product_id')
    )

    # Products without metadata are skipped rather than failing the lookup.
    ranked_ids = ranked_ids[ranked_ids.isin(first_meta.index)]
    ranked = first_meta.loc[ranked_ids]

    new_df = pd.DataFrame({
        'product_id': ranked.index.to_numpy(),
        'meta_text': ranked['metadata'].to_numpy(),
        'price': ranked['price'].to_numpy(),
    })
    # De-duplicate on the text first, then cut to n, so that n distinct
    # descriptions are returned whenever that many exist.
    return new_df.drop_duplicates('meta_text').iloc[:n, :].reset_index(drop=True)

In [None]:
top_n_products(recommendation_combined, X_meta, ranking='features', n=10)