# Top 10 selector

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
! pwd

/home/cjergen/code/sailormoonvicky/eCommerce/notebooks


In [50]:
df = pd.read_csv('../data/2019-Oct.csv_10%.csv')

In [51]:
df.head()

Unnamed: 0,event_time,event_type,product_id,category_id,category_code,brand,price,user_id,user_session
0,2019-10-01 00:00:17 UTC,view,23100006,2053013561638126333,,,357.79,513642368,17566c27-0a8f-4506-9f30-c6a2ccbf583b
1,2019-10-01 00:00:20 UTC,view,4803399,2053013554658804075,electronics.audio.headphone,jbl,33.21,555428858,8a6afed4-77f8-40c9-8e76-e062b28216ce
2,2019-10-01 00:00:23 UTC,view,6200260,2053013552293216471,appliances.environment.air_heater,midea,47.62,538645907,7d9a8784-7b6c-426e-9924-9f688812fd71
3,2019-10-01 00:00:58 UTC,view,4802639,2053013554658804075,electronics.audio.headphone,sony,218.77,514808401,1877639d-46a4-44f8-bae9-a14456952240
4,2019-10-01 00:01:11 UTC,view,1004836,2053013555631882655,electronics.smartphone,samsung,241.19,546259103,6e2984c8-502e-4fe7-bbba-34087f760175


In [52]:
def preprocessing_feat(X):    
    X_preprocessed = X.dropna(subset = ['category_code', 'brand']) #tbd!!
    X_preprocessed = X_preprocessed.drop_duplicates()
    X_preprocessed['category_code'] = X_preprocessed['category_code'].str.replace('.',' ')
    return X_preprocessed

def pricing_criterion(X):
    pricing_guide = X.groupby('category_code')['price'].describe()[["25%", "75%"]].reset_index()
    X_merged = X.merge(pricing_guide, on="category_code", how="right")
    X_merged["price_category"] = X_merged.apply(lambda row: make_column(row), axis=1)
    return X_merged

def make_column(row):
    if row["price"] < row["25%"]:
        return "low"
    elif row["price"] < row["75%"]:
        return "medium"
    else:
        return "high"

def metadata(X):
    X['metadata'] = X[['category_code', 'brand', 'price_category']].apply(lambda x: ' '.join(x), axis = 1)
    return X

def rating(X):
    dct = {'view': 1, 'cart': 3, 'purchase': 5}
    X['rating'] = X['event_type'].map(dct)
    return X

In [53]:
X_prep = preprocessing_feat(df)

  X_preprocessed['category_code'] = X_preprocessed['category_code'].str.replace('.',' ')


In [54]:
X_pric = pricing_criterion(X_prep)

In [55]:
X_rating = rating(X_pric)

In [56]:
def metadata(X):
    X['metadata'] = X[['category_code', 'brand', 'price_category']].apply(lambda x: ' '.join(x), axis = 1)
    return X

In [57]:
X_meta = metadata(X_pric)

In [58]:
def ranking(X, n=5):
    X = pd.DataFrame(X_rating.groupby(by='product_id').rating.sum()).sort_values('rating', ascending=False).reset_index().index[0:n]
    return X
    

In [59]:
# def top_n(product_id, X_rating, n=10):
    
#     """select top n products by ranking'"""
    
#     rating_idx = pd.DataFrame(X_rating.groupby(by='product_id').rating.sum()).sort_values('rating', ascending=False).reset_index().index[0:n]
#     return rating_idx

#     counter = 0
    
#     for i in rating_idx:
#         meta_text = X_rating[X_rating['product_id'] == i][['metadata']].iloc[0,:][0]
        
#         if counter == 0:
#             print(f"Top {n} recommendations for product_id {product_id}:")
#             print(f"{i} - {meta_text} \n")
# #             print("-----------------------")

#         else:
#             print(f"Rec {counter}) {i} - {meta_text}")
        
#         counter += 1

In [60]:
def top_n_overall(product_id, X_rating, n=5):
    
    """select top n products by ranking'"""
    
    rating_idx = pd.DataFrame(X_rating.groupby(by='product_id').rating.sum()).sort_values('rating', ascending=False).index[0:n]
    #     rec_df.sort_values(ranking, ascending=False).index[0:n]
    counter = 0
    
    for i in rating_idx:
        meta_text = X_meta[X_meta['product_id'] == i][['metadata']].iloc[0,:][0]
        
        if counter == 0:
            print(f"Top {n} recommendations for product_id {product_id}:")
            print(f"{i} - {meta_text} \n")
#             print("-----------------------")

        else:
            print(f"Rec {counter}) {i} - {meta_text}")
        
        counter += 1

In [61]:
top_n_overall(49800017, X_rating)

Top 5 recommendations for product_id 49800017:
1004856 - electronics smartphone samsung low 

Rec 1) 1004767 - electronics smartphone samsung medium
Rec 2) 1005115 - electronics smartphone apple high
Rec 3) 1004833 - electronics smartphone samsung low
Rec 4) 4804056 - electronics audio headphone apple high


In [62]:
def top_n_brands(X_rating, n=10):
    
    """select top n products by ranking'"""
    
    rating_idx =     pd.DataFrame(X_rating.groupby(['brand']).rating.sum()).sort_values('rating', ascending=False).index[0:n]

    #     rec_df.sort_values(ranking, ascending=False).index[0:n]
    return list(rating_idx)

In [63]:
top_n_brands(X_rating)

['samsung',
 'apple',
 'xiaomi',
 'huawei',
 'oppo',
 'lg',
 'acer',
 'lenovo',
 'bosch',
 'indesit']

In [64]:
def top_n_by_brand(X_rating, brand, n=10):
    
    """select top n products by brand'"""
    
    rating_idx = pd.DataFrame(X_rating.groupby(['product_id','brand']).rating.sum()).sort_values('rating', ascending=False).reset_index('brand')
    rating_idx = rating_idx[rating_idx['brand']==brand].index[0:n]
#     return rating_idx

    counter = 0

    for i in rating_idx:
            meta_text = X_meta[X_meta['product_id'] == i][['metadata']].iloc[0,:][0]

            if counter == 0:
                print(f"Top {n} recommendations for product_id {brand}:")
                print(f"{i} - {meta_text} \n")
    #             print("-----------------------")

            else:
                print(f"Rec {counter}) {i} - {meta_text}")

            counter += 1

In [72]:
top_n_by_brand(X_rating, brand='samsung', n=5)

Top 5 recommendations for product_id samsung:
1004856 - electronics smartphone samsung low 

Rec 1) 1004767 - electronics smartphone samsung medium
Rec 2) 1004833 - electronics smartphone samsung low
Rec 3) 1004870 - electronics smartphone samsung medium
Rec 4) 1004836 - electronics smartphone samsung medium


In [67]:
top_n_by_brand(X_rating, brand='apple', n=5)

Top 5 recommendations for product_id apple:
1005115 - electronics smartphone apple high 

Rec 1) 4804056 - electronics audio headphone apple high
Rec 2) 1004249 - electronics smartphone apple high
Rec 3) 1002544 - electronics smartphone apple medium
Rec 4) 1005105 - electronics smartphone apple high


In [68]:
top_n_by_brand(X_rating, brand='huawei', n=5)

Top 5 recommendations for product_id huawei:
1004785 - electronics smartphone huawei medium 

Rec 1) 1004565 - electronics smartphone huawei low
Rec 2) 1004781 - electronics smartphone huawei medium
Rec 3) 1004903 - electronics smartphone huawei low
Rec 4) 1004708 - electronics smartphone huawei low


In [69]:
top_n_by_brand(X_rating, brand='lg', n=5)

Top 5 recommendations for product_id lg:
2702277 - appliances kitchen refrigerators lg medium 

Rec 1) 3601485 - appliances kitchen washer lg medium
Rec 2) 3601244 - appliances kitchen washer lg medium
Rec 3) 3601437 - appliances kitchen washer lg medium
Rec 4) 1802037 - electronics video tv lg medium


In [70]:
top_n_by_brand(X_rating, brand='lenovo', n=5)

Top 5 recommendations for product_id lenovo:
1307366 - computers notebook lenovo low 

Rec 1) 1307067 - computers notebook lenovo low
Rec 2) 1307004 - computers notebook lenovo low
Rec 3) 1307237 - computers notebook lenovo low
Rec 4) 1307377 - computers notebook lenovo medium


In [None]:
 
    rating_idx = pd.DataFrame(X_rating.groupby(['product_id','brand']).rating.sum()).sort_values('rating', ascending=False).reset_index('brand')
    rating_idx = rating_idx[rating_idx['brand']==brand].index[0:n]
#     return rating_idx

    counter = 0

    for i in rating_idx:
            meta_text = X_meta[X_meta['product_id'] == i][['metadata']].iloc[0,:][0]

            if counter == 0:
                print(f"Top {n} recommendations for product_id {brand}:")
                print(f"{i} - {meta_text} \n")
    #             print("-----------------------")

            else:
                print(f"Rec {counter}) {i} - {meta_text}")

            counter += 1

In [None]:
def top_n_by_brand(X_rating, brand='apple', n=5):
    
    rating_idx = rec_df.sort_values(ranking, ascending=False).index
    
    counter = 0
    
    product_ids=[]
    metas=[]
    prices=[]
    
    for i in feat_idx:
        meta_text = meta_df[meta_df['product_id'] == i][['metadata']].iloc[0,:][0]
        price = meta_df[meta_df['product_id'] == i][['price']].iloc[0,:][0]
        
#         if counter == 0:
#             print(f"Top {n} recommendations for product_id {product_id}:")
#             print(f"{i} - {meta_text} {price} \n")
#             print("-----------------------")

#         else:
#             print(f"Rec {counter}) {i} - {meta_text} {price}")
        
        product_ids.append(i)
        metas.append(meta_text)
        prices.append(price)
        
        counter += 1
    
    return pd.DataFrame({'product_id':product_ids, 'meta_text':metas, 'price':prices})

In [85]:
def top_n_by_brand(X_rating, brand, n=10):
    
    """select top n products by brand'"""
    
    rating_idx = pd.DataFrame(X_rating.groupby(['product_id','brand']).rating.sum()).sort_values('rating', ascending=False).reset_index('brand')
    rating_idx = rating_idx[rating_idx['brand']==brand].index[0:n]
    return rating_idx
#     return rating_idx

#     counter = 0

#     for i in rating_idx:
#             meta_text = X_meta[X_meta['product_id'] == i][['metadata']].iloc[0,:][0]

#             if counter == 0:
#                 print(f"Top {n} recommendations for product_id {brand}:")
#                 print(f"{i} - {meta_text} \n")
#     #             print("-----------------------")

#             else:
#                 print(f"Rec {counter}) {i} - {meta_text}")

#             counter += 1

In [86]:
top_n_by_brand(X_rating, brand='lenovo', n=5)

Int64Index([1307366, 1307067, 1307004, 1307237, 1307377], dtype='int64', name='product_id')

In [87]:
top_n_by_brand(X_rating, brand='apple', n=5)

Int64Index([1005115, 4804056, 1004249, 1002544, 1005105], dtype='int64', name='product_id')

In [88]:
top_n_by_brand(X_rating, brand='samsung', n=5)

Int64Index([1004856, 1004767, 1004833, 1004870, 1004836], dtype='int64', name='product_id')

In [89]:
top_n_by_brand(X_rating, brand='huawei', n=5)

Int64Index([1004785, 1004565, 1004781, 1004903, 1004708], dtype='int64', name='product_id')

In [90]:
top_n_by_brand(X_rating, brand='lg', n=5)

Int64Index([2702277, 3601485, 3601244, 3601437, 1802037], dtype='int64', name='product_id')

In [91]:
lst = [1307366, 1307067, 1307004, 1307237, 1307377, 1005115, 4804056, 1004249, 1002544, 1005105, 1004856, 1004767, 1004833, 1004870, 1004836,1004785, 1004565, 1004781, 1004903, 1004708, 2702277, 3601485, 3601244, 3601437, 1802037]

In [92]:
len(lst)

25

In [98]:
top25=pd.DataFrame(lst)

In [107]:
top25.rename(columns={0:'product_id'}, inplace=True)

In [109]:
top25

Unnamed: 0,product_id
0,1307366
1,1307067
2,1307004
3,1307237
4,1307377
5,1005115
6,4804056
7,1004249
8,1002544
9,1005105


In [114]:
top25 = top25.merge(X_rating, how='left', left_on='product_id', right_on='product_id').drop_duplicates(['product_id'])

In [118]:
top25 = top25.reset_index()

In [119]:
top25.head()

Unnamed: 0,index,product_id,event_time,event_type,category_id,category_code,brand,price,user_id,user_session,25%,75%,price_category,rating,metadata
0,0,1307366,2019-10-01 02:38:03 UTC,view,2053013558920217191,computers notebook,lenovo,248.62,532340721,e7613958-fa9a-4dba-a199-7e611fac38c6,360.34,873.61,low,1,computers notebook lenovo low
1,2740,1307067,2019-10-01 02:20:03 UTC,view,2053013558920217191,computers notebook,lenovo,251.74,541316253,d15e32ab-efb2-4232-a646-0492a50d2fe1,360.34,873.61,low,1,computers notebook lenovo low
2,4618,1307004,2019-10-01 00:30:16 UTC,view,2053013558920217191,computers notebook,lenovo,290.61,551508458,eb9a8d4b-da50-43c5-b2c2-21fe9487a175,360.34,873.61,low,1,computers notebook lenovo low
3,6228,1307237,2019-10-01 07:18:29 UTC,view,2053013558920217191,computers notebook,lenovo,257.38,513225144,b01b12bb-3bee-412e-abe4-d40ef9e570f7,360.34,873.61,low,1,computers notebook lenovo low
4,7101,1307377,2019-10-01 02:51:52 UTC,view,2053013558920217191,computers notebook,lenovo,432.42,519309066,d60c7cf6-dc82-4776-a920-6cdbd1f22c2d,360.34,873.61,medium,1,computers notebook lenovo medium


In [120]:
top25[['product_id', 'category_code', 'brand', 'price', 'metadata']]

Unnamed: 0,product_id,category_code,brand,price,metadata
0,1307366,computers notebook,lenovo,248.62,computers notebook lenovo low
1,1307067,computers notebook,lenovo,251.74,computers notebook lenovo low
2,1307004,computers notebook,lenovo,290.61,computers notebook lenovo low
3,1307237,computers notebook,lenovo,257.38,computers notebook lenovo low
4,1307377,computers notebook,lenovo,432.42,computers notebook lenovo medium
5,1005115,electronics smartphone,apple,975.57,electronics smartphone apple high
6,4804056,electronics audio headphone,apple,161.98,electronics audio headphone apple high
7,1004249,electronics smartphone,apple,739.81,electronics smartphone apple high
8,1002544,electronics smartphone,apple,464.13,electronics smartphone apple medium
9,1005105,electronics smartphone,apple,1415.48,electronics smartphone apple high


In [121]:
top25.to_csv('top25.csv')

In [122]:
!pwd

/home/cjergen/code/sailormoonvicky/eCommerce/notebooks
