# Top 10 selector

In [101]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

In [102]:
# Load the input CSV; later cells read its category_code, brand, price,
# event_type and product_id columns.
df = pd.read_csv('data/2019_oct_sort_1000.csv')

In [103]:
def preprocessing_feat(X):
    """Return a cleaned copy of X for the later feature-building steps.

    Rows with a missing ``category_code`` or ``brand`` are dropped, exact
    duplicate rows are removed, and every literal dot in ``category_code``
    is replaced by a space. The input frame itself is left unmodified.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain ``category_code`` and ``brand`` columns.

    Returns
    -------
    pandas.DataFrame
        New frame with the cleaned rows.
    """
    # NOTE(review): the original code flagged this dropna with "tbd!!", so the
    # policy for rows missing category_code/brand still needs a decision.
    cleaned = X.dropna(subset=['category_code', 'brand']).drop_duplicates()

    # regex=False: '.' is meant as a literal dot, not the regex wildcard.
    # Being explicit keeps the result stable across pandas versions and
    # avoids the FutureWarning about the changing `regex` default.
    return cleaned.assign(
        category_code=cleaned['category_code'].str.replace('.', ' ', regex=False)
    )

def pricing_criterion(X):
    """Label every row with a price band relative to its category.

    The 25% and 75% price statistics are computed per ``category_code`` and
    right-joined onto ``X``, so the result carries the extra ``"25%"`` and
    ``"75%"`` columns. ``make_column`` then turns each row's price into a
    ``price_category`` value.

    Returns the merged frame (a new DataFrame produced by ``merge``).
    """
    price_stats = X.groupby('category_code')['price'].describe()
    pricing_guide = price_stats[["25%", "75%"]].reset_index()

    X_merged = X.merge(pricing_guide, how="right", on="category_code")
    X_merged["price_category"] = X_merged.apply(make_column, axis=1)
    return X_merged

def make_column(row):
    """Classify one row's price against its ``"25%"`` / ``"75%"`` values.

    ``row`` is indexed by the keys ``"price"``, ``"25%"`` and ``"75%"``.
    Returns ``"low"`` when the price is below the 25% value, ``"medium"``
    when it is below the 75% value, and ``"high"`` in every other case
    (including when neither comparison holds, e.g. for a NaN price).
    """
    price = row["price"]
    if price < row["25%"]:
        return "low"
    if price < row["75%"]:
        return "medium"
    return "high"

def metadata(X):
    """Add a space-joined ``metadata`` text column to X and return X.

    The column concatenates ``category_code``, ``brand`` and
    ``price_category`` (in that order) for every row. X is modified in place
    and the same object is returned.
    """
    text_columns = ['category_code', 'brand', 'price_category']

    def _join_row(row):
        return ' '.join(row)

    X['metadata'] = X[text_columns].apply(_join_row, axis=1)
    return X

def rating(X):
    """Add a numeric ``rating`` column derived from ``event_type``.

    ``view`` maps to 1, ``cart`` to 3 and ``purchase`` to 5; any other event
    type has no entry in the mapping and therefore becomes NaN. X is
    modified in place and the same object is returned.
    """
    event_weights = {'view': 1, 'cart': 3, 'purchase': 5}
    event_types = X['event_type']
    X['rating'] = event_types.map(event_weights)
    return X

In [104]:
# Drop rows missing category_code/brand, drop duplicates, and replace the
# dots in category_code with spaces.
X_prep = preprocessing_feat(df)

  X_preprocessed['category_code'] = X_preprocessed['category_code'].str.replace('.',' ')


In [105]:
# Attach the per-category "25%"/"75%" price columns and the derived
# low/medium/high price_category label.
X_pric = pricing_criterion(X_prep)

In [106]:
# rating() adds the column in place and returns its argument, so X_rating
# and X_pric refer to the same DataFrame object.
X_rating = rating(X_pric)

In [107]:
def metadata(X):
    """Add a space-joined ``metadata`` text column to X and return X.

    The column concatenates ``category_code``, ``brand`` and
    ``price_category`` (in that order) for every row. X is modified in place
    and the same object is returned.
    """
    # NOTE(review): this cell redefines a `metadata` function that an earlier
    # cell already defines with equivalent behavior; one of the two
    # definitions can be removed.
    text_columns = ['category_code', 'brand', 'price_category']

    def _join_row(row):
        return ' '.join(row)

    X['metadata'] = X[text_columns].apply(_join_row, axis=1)
    return X

In [108]:
# metadata() also mutates and returns its argument, so X_meta, X_rating and
# X_pric all refer to the same DataFrame (now carrying rating and metadata).
X_meta = metadata(X_pric)

In [109]:
def ranking(X, n=5):
    """Return the product ids of the n products with the largest summed rating.

    Parameters
    ----------
    X : pandas.DataFrame
        Must contain ``product_id`` and ``rating`` columns.
    n : int, default 5
        Maximum number of product ids to return.

    Returns
    -------
    pandas.Index
        Product ids ordered from highest to lowest total rating.
    """
    # Aggregate the frame that was passed in (not a notebook-level global) and
    # keep product_id as the index so the slice yields ids, not row positions.
    totals = X.groupby(by='product_id')['rating'].sum().to_frame()
    return totals.sort_values('rating', ascending=False).index[0:n]
    

In [117]:
# def top_n(product_id, X_rating, n=10):
    
#     """select top n products by ranking'"""
    
#     rating_idx = pd.DataFrame(X_rating.groupby(by='product_id').rating.sum()).sort_values('rating', ascending=False).reset_index().index[0:n]
#     return rating_idx

#     counter = 0
    
#     for i in rating_idx:
#         meta_text = X_rating[X_rating['product_id'] == i][['metadata']].iloc[0,:][0]
        
#         if counter == 0:
#             print(f"Top {n} recommendations for product_id {product_id}:")
#             print(f"{i} - {meta_text} \n")
# #             print("-----------------------")

#         else:
#             print(f"Rec {counter}) {i} - {meta_text}")
        
#         counter += 1

In [118]:
def top_n_overall(product_id, X_rating, n=5):
    """Print the n products with the largest summed rating in X_rating.

    Parameters
    ----------
    product_id : any
        Only echoed in the header line; it does not influence which products
        are selected.
    X_rating : pandas.DataFrame
        Must contain ``product_id``, ``rating`` and ``metadata`` columns.
    n : int, default 5
        Number of products to print.

    Returns
    -------
    None
        The result is printed: a header, the top product, then the remaining
        products as ``Rec k)`` lines.
    """
    totals = pd.DataFrame(X_rating.groupby(by='product_id').rating.sum())
    rating_idx = totals.sort_values('rating', ascending=False).index[0:n]

    # Build the product_id -> metadata lookup once, from the frame that was
    # passed in rather than a notebook-level global. The first row per product
    # is used, matching a "first matching row" lookup.
    meta_by_product = (
        X_rating.drop_duplicates(subset='product_id')
        .set_index('product_id')['metadata']
    )

    for counter, i in enumerate(rating_idx):
        meta_text = meta_by_product.at[i]

        if counter == 0:
            print(f"Top {n} recommendations for product_id {product_id}:")
            print(f"{i} - {meta_text} \n")
        else:
            print(f"Rec {counter}) {i} - {meta_text}")

In [119]:
# The product id passed here is only echoed in the printed header; the
# ranking itself is computed over every product in X_rating.
top_n_overall(49800017, X_rating)

Top 5 recommendations for product_id 49800017:
1004767 - electronics smartphone samsung medium 

Rec 1) 1004856 - electronics smartphone samsung low
Rec 2) 1004834 - electronics smartphone samsung low
Rec 3) 1004870 - electronics smartphone samsung medium
Rec 4) 1801881 - electronics video tv samsung high


In [120]:
def top_n_brands(X_rating, n=10):
    """Return the n brands with the largest summed rating, best first.

    ``X_rating`` must contain ``brand`` and ``rating`` columns. The result is
    a plain list of brand labels.
    """
    brand_totals = X_rating.groupby(['brand']).rating.sum().to_frame()
    ranked = brand_totals.sort_values('rating', ascending=False)
    top_brands = ranked.index[0:n]
    return list(top_brands)

In [121]:
# List the brands with the largest summed rating (default n=10).
top_n_brands(X_rating)

['samsung',
 'apple',
 'xiaomi',
 'oppo',
 'huawei',
 'tefal',
 'indesit',
 'starline',
 'bts',
 'respect']

In [122]:
def top_n_by_brand(X_rating, brand, n=10):
    """Print the n products of one brand with the largest summed rating.

    Parameters
    ----------
    X_rating : pandas.DataFrame
        Must contain ``product_id``, ``brand``, ``rating`` and ``metadata``
        columns.
    brand : str
        Brand label to filter on (compared with ``==`` against ``brand``).
    n : int, default 10
        Number of products to print.

    Returns
    -------
    None
        The result is printed: a header naming the brand, the top product,
        then the remaining products as ``Rec k)`` lines.
    """
    totals = pd.DataFrame(X_rating.groupby(['product_id', 'brand']).rating.sum())
    # Move `brand` out of the index so product_id alone labels each row.
    ranked = totals.sort_values('rating', ascending=False).reset_index('brand')
    rating_idx = ranked[ranked['brand'] == brand].index[0:n]

    # Build the product_id -> metadata lookup once, from the frame that was
    # passed in rather than a notebook-level global. The first row per product
    # is used, matching a "first matching row" lookup.
    meta_by_product = (
        X_rating.drop_duplicates(subset='product_id')
        .set_index('product_id')['metadata']
    )

    for counter, i in enumerate(rating_idx):
        meta_text = meta_by_product.at[i]

        if counter == 0:
            # The value shown is a brand, so the header labels it as one.
            print(f"Top {n} recommendations for brand {brand}:")
            print(f"{i} - {meta_text} \n")
        else:
            print(f"Rec {counter}) {i} - {meta_text}")

In [123]:
# Print the samsung products with the largest summed rating (default n=10).
top_n_by_brand(X_rating, brand='samsung')

Top 10 recommendations for product_id samsung:
1004767 - electronics smartphone samsung medium 

Rec 1) 1004856 - electronics smartphone samsung low
Rec 2) 1004834 - electronics smartphone samsung low
Rec 3) 1004870 - electronics smartphone samsung medium
Rec 4) 1801881 - electronics video tv samsung high
Rec 5) 1004833 - electronics smartphone samsung low
Rec 6) 1004836 - electronics smartphone samsung medium
Rec 7) 1004857 - electronics smartphone samsung low
Rec 8) 3701056 - appliances environment vacuum samsung medium
Rec 9) 1004768 - electronics smartphone samsung medium
