# Top 10 selector

In [99]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

In [100]:
df = pd.read_csv('data/2019_oct_sort_1000.csv')

In [101]:
df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-10-18 10:54:45 UTC,view,54900011,2146660887203676486,apparel.costume,,64.35,515483062,00000042-3e3f-42f9-810d-f3d264139c50
1,2019-10-18 10:55:20 UTC,view,54900011,2146660887203676486,apparel.costume,,64.35,515483062,00000042-3e3f-42f9-810d-f3d264139c50
2,2019-10-31 06:25:30 UTC,view,1005105,2053013555631882655,electronics.smartphone,apple,1349.46,513782162,00000056-a206-40dd-b174-a072550fa38c
3,2019-10-31 06:25:52 UTC,view,5100816,2053013553375346967,,xiaomi,29.6,513782162,00000056-a206-40dd-b174-a072550fa38c
4,2019-10-31 06:23:12 UTC,view,1005115,2053013555631882655,electronics.smartphone,apple,955.84,513782162,00000056-a206-40dd-b174-a072550fa38c


In [102]:
def preprocessing_feat(X):    
    X_preprocessed = X.dropna(subset = ['category_code', 'brand']) #tbd!!
    X_preprocessed = X_preprocessed.drop_duplicates()
    X_preprocessed['category_code'] = X_preprocessed['category_code'].str.replace('.',' ')
    return X_preprocessed

def pricing_criterion(X):
    pricing_guide = X.groupby('category_code')['price'].describe()[["25%", "75%"]].reset_index()
    X_merged = X.merge(pricing_guide, on="category_code", how="right")
    X_merged["price_category"] = X_merged.apply(lambda row: make_column(row), axis=1)
    return X_merged

def make_column(row):
    if row["price"] < row["25%"]:
        return "low"
    elif row["price"] < row["75%"]:
        return "medium"
    else:
        return "high"

def metadata(X):
    X['metadata'] = X[['category_code', 'brand', 'price_category']].apply(lambda x: ' '.join(x), axis = 1)
    return X

def rating(X):
    dct = {'view': 1, 'cart': 3, 'purchase': 5}
    X['rating'] = X['event_type'].map(dct)
    return X

In [103]:
X_prep = preprocessing_feat(df)

  X_preprocessed['category_code'] = X_preprocessed['category_code'].str.replace('.',' ')


In [104]:
X_pric = pricing_criterion(X_prep)

In [105]:
X_pric.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,25%,75%,price_category
0,2019-10-09 21:41:23 UTC,view,49800017,2126679654801604876,accessories bag,ritmix,12.32,537956308,00003599-a772-4c8a-9c22-0dfa4f6ecc83,15.43,66.67,low
1,2019-10-09 21:41:39 UTC,view,49800020,2126679654801604876,accessories bag,ritmix,15.43,537956308,00003599-a772-4c8a-9c22-0dfa4f6ecc83,15.43,66.67,medium
2,2019-10-09 16:37:32 UTC,view,28401080,2053013566209917945,accessories bag,respect,66.67,516007189,000037a4-c043-4113-9b53-7bbb5d7c2bfe,15.43,66.67,high
3,2019-10-09 16:39:20 UTC,view,28401080,2053013566209917945,accessories bag,respect,66.67,516007189,000037a4-c043-4113-9b53-7bbb5d7c2bfe,15.43,66.67,high
4,2019-10-09 16:42:31 UTC,view,28401080,2053013566209917945,accessories bag,respect,66.67,516007189,000037a4-c043-4113-9b53-7bbb5d7c2bfe,15.43,66.67,high


In [106]:
X_rating = rating(X_pric)

In [107]:
def metadata(X):
    X['metadata'] = X[['category_code', 'brand', 'price_category']].apply(lambda x: ' '.join(x), axis = 1)
    return X

In [108]:
X_meta = metadata(X_pric)
X_meta.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,25%,75%,price_category,rating,metadata
0,2019-10-09 21:41:23 UTC,view,49800017,2126679654801604876,accessories bag,ritmix,12.32,537956308,00003599-a772-4c8a-9c22-0dfa4f6ecc83,15.43,66.67,low,1,accessories bag ritmix low
1,2019-10-09 21:41:39 UTC,view,49800020,2126679654801604876,accessories bag,ritmix,15.43,537956308,00003599-a772-4c8a-9c22-0dfa4f6ecc83,15.43,66.67,medium,1,accessories bag ritmix medium
2,2019-10-09 16:37:32 UTC,view,28401080,2053013566209917945,accessories bag,respect,66.67,516007189,000037a4-c043-4113-9b53-7bbb5d7c2bfe,15.43,66.67,high,1,accessories bag respect high
3,2019-10-09 16:39:20 UTC,view,28401080,2053013566209917945,accessories bag,respect,66.67,516007189,000037a4-c043-4113-9b53-7bbb5d7c2bfe,15.43,66.67,high,1,accessories bag respect high
4,2019-10-09 16:42:31 UTC,view,28401080,2053013566209917945,accessories bag,respect,66.67,516007189,000037a4-c043-4113-9b53-7bbb5d7c2bfe,15.43,66.67,high,1,accessories bag respect high


In [109]:
X_rating.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session,25%,75%,price_category,rating,metadata
0,2019-10-09 21:41:23 UTC,view,49800017,2126679654801604876,accessories bag,ritmix,12.32,537956308,00003599-a772-4c8a-9c22-0dfa4f6ecc83,15.43,66.67,low,1,accessories bag ritmix low
1,2019-10-09 21:41:39 UTC,view,49800020,2126679654801604876,accessories bag,ritmix,15.43,537956308,00003599-a772-4c8a-9c22-0dfa4f6ecc83,15.43,66.67,medium,1,accessories bag ritmix medium
2,2019-10-09 16:37:32 UTC,view,28401080,2053013566209917945,accessories bag,respect,66.67,516007189,000037a4-c043-4113-9b53-7bbb5d7c2bfe,15.43,66.67,high,1,accessories bag respect high
3,2019-10-09 16:39:20 UTC,view,28401080,2053013566209917945,accessories bag,respect,66.67,516007189,000037a4-c043-4113-9b53-7bbb5d7c2bfe,15.43,66.67,high,1,accessories bag respect high
4,2019-10-09 16:42:31 UTC,view,28401080,2053013566209917945,accessories bag,respect,66.67,516007189,000037a4-c043-4113-9b53-7bbb5d7c2bfe,15.43,66.67,high,1,accessories bag respect high


In [126]:
def ranking(X, n=5):
    X = pd.DataFrame(X_rating.groupby(by='product_id').rating.sum()).sort_values('rating', ascending=False).reset_index().index[0:n]
    return X
    

In [136]:
def top_n(product_id, X_rating, n=10):
    
    """select top n products by ranking'"""
    
    rating_idx = pd.DataFrame(X_rating.groupby(by='product_id').rating.sum()).sort_values('rating', ascending=False).reset_index().index[0:n]
    return rating_idx

    counter = 0
    
    for i in rating_idx:
        meta_text = X_rating[X_rating['product_id'] == i][['metadata']].iloc[0,:][0]
        
        if counter == 0:
            print(f"Top {n} recommendations for product_id {product_id}:")
            print(f"{i} - {meta_text} \n")
#             print("-----------------------")

        else:
            print(f"Rec {counter}) {i} - {meta_text}")
        
        counter += 1

In [198]:
X_rating.shape

(609, 14)

In [208]:
pd.DataFrame(X_rating.groupby(['product_id','brand']).rating.sum()).sort_values('rating', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,rating
product_id,brand,Unnamed: 2_level_1
1004767,samsung,48
1004856,samsung,25
1004834,samsung,22
1004870,samsung,21
1801881,samsung,15
...,...,...
2701646,indesit,1
2701673,indesit,1
2701773,indesit,1
2800477,beko,1


In [209]:
pd.DataFrame(X_rating.groupby(['brand']).rating.sum()).sort_values('rating', ascending=False)

Unnamed: 0_level_0,rating
brand,Unnamed: 1_level_1
samsung,262
apple,112
xiaomi,75
oppo,20
huawei,16
...,...
alteco,1
karcher,1
kenwood,1
llorens,1


In [228]:
def top_n_overall(product_id, X_rating, n=5):
    
    """select top n products by ranking'"""
    
    rating_idx = pd.DataFrame(X_rating.groupby(by='product_id').rating.sum()).sort_values('rating', ascending=False).index[0:n]
#     rec_df.sort_values(ranking, ascending=False).index[0:n]
    counter = 0
    
    for i in rating_idx:
        meta_text = X_meta[X_meta['product_id'] == i][['metadata']].iloc[0,:][0]
        
        if counter == 0:
            print(f"Top {n} recommendations for product_id {product_id}:")
            print(f"{i} - {meta_text} \n")
#             print("-----------------------")

        else:
            print(f"Rec {counter}) {i} - {meta_text}")
        
        counter += 1

In [229]:
top_n_overall(49800017, X_rating)

Top 5 recommendations for product_id 49800017:
1004767 - electronics smartphone samsung medium 

Rec 1) 1004856 - electronics smartphone samsung low
Rec 2) 1004834 - electronics smartphone samsung low
Rec 3) 1004870 - electronics smartphone samsung medium
Rec 4) 1801881 - electronics video tv samsung high


In [212]:
pd.DataFrame(X_rating.groupby(['product_id','brand']).rating.sum()).sort_values('rating', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,rating
product_id,brand,Unnamed: 2_level_1
1004767,samsung,48
1004856,samsung,25
1004834,samsung,22
1004870,samsung,21
1801881,samsung,15
...,...,...
2701646,indesit,1
2701673,indesit,1
2701773,indesit,1
2800477,beko,1


In [238]:
def top_n_brands(X_rating, n=10):
    
    """select top n products by ranking'"""
    
    rating_idx =     pd.DataFrame(X_rating.groupby(['brand']).rating.sum()).sort_values('rating', ascending=False).index[0:n]

    #     rec_df.sort_values(ranking, ascending=False).index[0:n]
    return list(rating_idx)

In [239]:
top_n_brands(X_rating)

['samsung',
 'apple',
 'xiaomi',
 'oppo',
 'huawei',
 'tefal',
 'indesit',
 'starline',
 'bts',
 'respect']

In [270]:
def top_n_by_brand(X_rating, n=10):
    
    """select top n products by ranking'"""
    
    rating_idx = pd.DataFrame(X_rating.groupby(['product_id','brand']).rating.sum()).sort_values('rating', ascending=False)
    
    #     rec_df.sort_values(ranking, ascending=False).index[0:n]
    abc = rating_idx[rating_idx['brand']=='samsung']
    return abc

In [271]:
top_n_by_brand(X_rating)

KeyError: 'brand'

In [258]:
top_n_by_brand(X_rating, brand='samsung')

KeyError: 'samsung'

In [246]:
pd.DataFrame(X_rating.groupby(['product_id','brand']).rating.sum()).sort_values('rating', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,rating
product_id,brand,Unnamed: 2_level_1
1004767,samsung,48
1004856,samsung,25
1004834,samsung,22
1004870,samsung,21
1801881,samsung,15
...,...,...
2701646,indesit,1
2701673,indesit,1
2701773,indesit,1
2800477,beko,1


In [240]:
pd.DataFrame(X_rating.groupby(['product_id','brand']).rating.sum()).sort_values('rating', ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,rating
product_id,brand,Unnamed: 2_level_1
1004767,samsung,48
1004856,samsung,25
1004834,samsung,22
1004870,samsung,21
1801881,samsung,15
...,...,...
2701646,indesit,1
2701673,indesit,1
2701773,indesit,1
2800477,beko,1
