In [6]:
import pandas as pd
import numpy as np
import pickle
from sklearn import preprocessing

In [7]:
# Load the recommendation pairs together with their level-2 feature columns.
# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted source.
with open('recommendations_all_level2_features.pkl', 'rb') as f:
    rec = pickle.load(f)

In [10]:
# Inspect the incoming columns; some labels (e.g. 'score', 'main_topic_item') occur twice.
rec.columns

Index(['Unnamed: 0', 'itemId', 'rec_id', 'weight', 'source', 'title_x',
       'author_x', 'publisher_x', 'main_topic_item', 'subtopics_item',
       'title_y', 'author_y', 'publisher_y', 'main_topic_rec', 'subtopics_rec',
       'confItemVsRec', 'titleLength', 'identical', 'pages_mean_item',
       'pages_mean_rec', 'max_pageCount', 'Euc_distance', 'rel_distance',
       'score', 'g_categories_item', 'g_categories_rec', 'main_topic_item',
       'main_topic_rec', 'subtopics_item', 'subtopics_rec',
       'main_topic_cluster_item', 'main_topic_cluster_rec', 'category_match',
       'suptopics_matches_count', 'suptopics_matches_weight',
       'main_topic_cluster_match', 'weighted_average_rating',
       'total_ratings_count', 'total_ratings_count_log_scaled', 'itemID_merge',
       'prices_mean_rec', 'prices_mean_item', 'eucl_distance', 'score',
       'cover_similarity_score_squared'],
      dtype='object')

In [11]:
# Remove the columns that the level-2 feature functions below do not read.
# Labels that occur twice in the frame (e.g. 'main_topic_item') are removed in all occurrences.
rec = rec.drop(
    columns=[
        'Unnamed: 0',
        'pages_mean_item',
        'pages_mean_rec',
        'max_pageCount',
        'Euc_distance',
        'rel_distance',
        'g_categories_item',
        'g_categories_rec',
        'main_topic_item',
        'main_topic_rec',
        'subtopics_item',
        'subtopics_rec',
        'main_topic_cluster_item',
        'main_topic_cluster_rec',
        'suptopics_matches_count',
        'itemID_merge',
        'prices_mean_rec',
        'prices_mean_item',
        'eucl_distance',
    ]
)

In [12]:
# Check the remaining columns; two different columns are still both labelled 'score'.
rec.columns

Index(['itemId', 'rec_id', 'weight', 'source', 'title_x', 'author_x',
       'publisher_x', 'title_y', 'author_y', 'publisher_y', 'confItemVsRec',
       'titleLength', 'identical', 'score', 'category_match',
       'suptopics_matches_weight', 'main_topic_cluster_match',
       'weighted_average_rating', 'total_ratings_count',
       'total_ratings_count_log_scaled', 'score',
       'cover_similarity_score_squared'],
      dtype='object')

In [13]:
# Two different columns are both labelled 'score', so a label-based rename cannot
# tell them apart. Relabel them by order of appearance (first -> 'score_pages',
# second -> 'score_price') and leave every other label untouched, instead of
# overwriting the complete label list by position.
# Re-running this cell is harmless: once no 'score' label is left nothing changes.
replacement_labels = iter(['score_pages', 'score_price'])
rec.columns = [
    next(replacement_labels, label) if label == 'score' else label
    for label in rec.columns
]
assert 'score' not in rec.columns, "more than two 'score' columns found"
assert {'score_pages', 'score_price'} <= set(rec.columns), "expected two 'score' columns to relabel"

# Create Final Level 2 Features

In [14]:
# CATEGORY FEATURE
def create_l2_category_feature(df):
    """Compute the level-2 category feature for every row of ``df``.

    The feature is a weighted mix of three match scores read from ``df``:

    * ``category_match`` present:
      ``0.5 * main_topic_cluster_match + 0.3 * category_match
      + 0.2 * suptopics_matches_weight``
    * ``category_match`` missing (NaN/None): the category term is left out and
      the weights become ``0.7 * main_topic_cluster_match
      + 0.3 * suptopics_matches_weight``.

    The computation is vectorised and aligned on ``df.index``, so it gives the
    right result for any index (not only a 0..n-1 range index).

    Side effect: the result is also stored in ``df['l2_category_feature']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``category_match``,
        ``suptopics_matches_weight`` and ``main_topic_cluster_match``.

    Returns
    -------
    pandas.Series
        The ``l2_category_feature`` column, indexed like ``df``.
    """
    # astype(float) turns None into NaN so object-typed columns work as well.
    main_topic = df['main_topic_cluster_match'].astype(float)
    subtopics = df['suptopics_matches_weight'].astype(float)
    category = df['category_match'].astype(float)

    without_category = 0.7 * main_topic + 0.3 * subtopics
    with_category = 0.5 * main_topic + 0.3 * category + 0.2 * subtopics

    # Keep the three-term formula where a category match exists, else fall back.
    df['l2_category_feature'] = with_category.where(category.notna(), without_category)
    return df['l2_category_feature']


In [15]:
def create_l2_cover_feature_hao(df):
    """Derive the level-2 cover feature from ``cover_similarity_score_squared``.

    A similarity of exactly 0 is turned into NaN; every other value (including
    values that are already NaN) is passed through unchanged.

    The computation is vectorised and aligned on ``df.index``, so it gives the
    right result for any index (not only a 0..n-1 range index).

    Side effect: the result is also stored in ``df['l2_cover_feature']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column ``cover_similarity_score_squared``.

    Returns
    -------
    pandas.Series
        The ``l2_cover_feature`` column, indexed like ``df``.
    """
    similarity = df['cover_similarity_score_squared'].astype(float)
    # mask() replaces the entries where the condition holds with NaN.
    df['l2_cover_feature'] = similarity.mask(similarity == 0)
    return df['l2_cover_feature']

In [16]:
# COVER FEATURE
def create_l2_cover_feature(df):
    """Compute the level-2 cover feature from two image-similarity columns.

    ``structural_similarity`` and ``hist_correlation`` are first clamped to
    [-1, 1] (the inputs are not always inside that range), then each is mapped
    to [0, 1] via ``(x + 1) / 2`` and the two are averaged with weight 0.5 each.
    A row is NaN when either input is NaN.

    The computation is vectorised and aligned on ``df.index``, so it gives the
    right result for any index (not only a 0..n-1 range index).

    Side effect: the result is also stored in ``df['l2_cover_feature']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``structural_similarity`` and
        ``hist_correlation``.

    Returns
    -------
    pandas.Series
        The ``l2_cover_feature`` column, indexed like ``df``.
    """
    # clip() leaves NaN untouched, so missing values propagate into the result.
    structural = df['structural_similarity'].astype(float).clip(lower=-1, upper=1)
    histogram = df['hist_correlation'].astype(float).clip(lower=-1, upper=1)

    df['l2_cover_feature'] = 0.5 * (structural + 1) / 2 + 0.5 * (histogram + 1) / 2
    return df['l2_cover_feature']


In [17]:
# RATING FEATURE
def create_l2_rating_feature(df):
    """Compute the level-2 rating feature for every row of ``df``.

    Steps:

    1. Min-max scale the non-missing ``weighted_average_rating`` values to
       [0, 1]. The arithmetic (``x * scale - min * scale`` with
       ``scale = 1 / (max - min)``) is the one scikit-learn's ``MinMaxScaler``
       uses for the default (0, 1) range; a constant column maps to 0.
       If no rating is present at all, the scaling step is skipped instead of
       raising.
    2. ``l2_rating_feature = 0.8 * scaled_rating
       + 0.2 * total_ratings_count_log_scaled``; rows without a rating are NaN.

    The computation is vectorised and aligned on ``df.index``, so it gives the
    right result for any index (not only a 0..n-1 range index).

    Side effects: ``df['weighted_average_rating']`` is overwritten with the
    scaled values and the result is stored in ``df['l2_rating_feature']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``weighted_average_rating`` and
        ``total_ratings_count_log_scaled``.

    Returns
    -------
    pandas.Series
        The ``l2_rating_feature`` column, indexed like ``df``.
    """
    rating = df['weighted_average_rating'].astype(float)

    if rating.notna().any():
        # min()/max() skip NaN, so only existing ratings define the range.
        data_min = rating.min()
        data_range = rating.max() - data_min
        # A (near-)constant column would divide by zero; a unit scale maps it to 0.
        if data_range < 10 * np.finfo(float).eps:
            scale = 1.0
        else:
            scale = 1.0 / data_range
        rating = rating * scale - data_min * scale
        df['weighted_average_rating'] = rating

    ratings_count = df['total_ratings_count_log_scaled'].astype(float)
    # NaN ratings propagate, so rows without a rating get no feature.
    df['l2_rating_feature'] = 0.8 * rating + 0.2 * ratings_count
    return df['l2_rating_feature']


In [18]:
def return_0_or_1(cat):
    """Return 0 when ``cat`` is missing (as judged by ``pd.isna``), otherwise 1."""
    return 0 if pd.isna(cat) else 1


def set_nan_to_0(cat):
    """Return 0 when ``cat`` is missing (as judged by ``pd.isna``), otherwise ``cat`` itself."""
    return 0 if pd.isna(cat) else cat


def calc_weight(category, cover, rating, price, pages):
    """Weighted average of the component scores that are actually present.

    Base weights: category 0.25, cover 0.25, rating 0.225, price 0.05,
    pages 0.225. A missing component contributes neither to the numerator nor
    to the denominator, i.e. the weights of the remaining components are
    rescaled so that they still sum to one. Returns 0 when every component is
    missing.
    """
    weighted_scores = [
        (0.25, category),
        (0.25, cover),
        (0.225, rating),
        (0.05, price),
        (0.225, pages),
    ]

    # Denominator: sum of the base weights of the components that are present.
    present_shares = [return_0_or_1(score) * share for share, score in weighted_scores]
    denominator = present_shares[0]
    for present_share in present_shares[1:]:
        denominator = denominator + present_share

    if denominator == 0:
        return 0

    # Missing scores are replaced by 0 so that a single NaN cannot turn the whole sum into NaN.
    contributions = [
        share / denominator * set_nan_to_0(score) for share, score in weighted_scores
    ]
    weight = contributions[0]
    for contribution in contributions[1:]:
        weight = weight + contribution
    return weight

In [19]:
def create_l2_overall_feature(df):
    """Combine the individual level-2 scores into one overall score per row.

    For every row the values of ``l2_category_feature``, ``l2_cover_feature``,
    ``l2_rating_feature``, ``score_price`` and ``score_pages`` are passed to
    ``calc_weight``, which averages the components that are present and
    rescales the weights when some are missing. Without that rescaling, rows
    with missing components would be biased towards low scores.

    Rows are paired with their result by position and written back under the
    frame's own index, so the function gives the right result for any index
    (not only a 0..n-1 range index).

    Side effect: the result is also stored in ``df['l2_overall_feature']``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the five component columns listed above.

    Returns
    -------
    pandas.Series
        The ``l2_overall_feature`` column, indexed like ``df``.
    """
    component_columns = [
        'l2_category_feature',
        'l2_cover_feature',
        'l2_rating_feature',
        'score_price',
        'score_pages',
    ]
    overall = [
        calc_weight(category, cover, rating, price, pages)
        for category, cover, rating, price, pages in zip(
            *(df[column] for column in component_columns)
        )
    ]
    # Explicit dtype keeps the column float64 even for an empty frame.
    df['l2_overall_feature'] = pd.Series(overall, index=df.index, dtype='float64')
    return df['l2_overall_feature']

In [20]:
# Build the category feature (the function also writes this column into rec itself).
rec['l2_category_feature'] = create_l2_category_feature(rec)

In [21]:
# Build the cover feature from 'cover_similarity_score_squared' (the *_hao variant;
# create_l2_cover_feature is defined above but not called here).
rec['l2_cover_feature'] = create_l2_cover_feature_hao(rec)

In [22]:
# Build the rating feature. Note: the function also min-max scales
# rec['weighted_average_rating'] in place.
rec['l2_rating_feature'] = create_l2_rating_feature(rec)

In [23]:
# Combine category, cover, rating, price and pages scores into the overall level-2 score.
rec['l2_overall_feature'] = create_l2_overall_feature(rec)

### Save data

In [78]:
### Save the current state of the recommendations
with open('recommendations_final_20210628.pkl', 'wb') as f:
    pickle.dump(rec, f)

In [79]:
# Reload the recommendations written by the previous cell.
# NOTE(review): pickle.load can execute arbitrary code - only load files from a trusted source.
with open('recommendations_final_20210628.pkl', 'rb') as f:
    rec = pickle.load(f)

# Keep only the 5 best recommendations per item

In [84]:
# Note: this only binds a second name to the same DataFrame; no copy is made.
rec_final = rec

In [86]:
# Sort by itemId (ascending), then by final_weight (descending) so that the best
# recommendation of each item comes first.
# NOTE(review): 'final_weight' is not created by any cell shown above (execution
# counts jump from 23 to 78) - presumably it was added in cells that are no longer
# part of this notebook; confirm the column exists before re-running from a fresh kernel.
rec_final = rec_final.sort_values(by=['itemId', 'final_weight'], ascending = (True, False))

In [87]:
# Rows are sorted by descending final_weight within each itemId, so head(5) keeps
# the five highest-weighted recommendations per item.
rec_final = rec_final.groupby('itemId').head(5)

In [89]:
# Check which columns are present before reducing the frame to the output columns.
rec_final.columns

Index(['itemId', 'rec_id', 'weight', 'source', 'title_x', 'author_x',
       'publisher_x', 'title_y', 'author_y', 'publisher_y', 'confItemVsRec',
       'titleLength', 'identical', 'score_pages', 'score_price',
       'l2_category_feature', 'l2_cover_feature', 'l2_rating_feature',
       'l2_overall_feature', 'final_weight'],
      dtype='object')

In [90]:
# Remove the metadata and intermediate score columns from the result.
rec_final = rec_final.drop(
    columns=[
        'weight',
        'source',
        'title_x',
        'author_x',
        'publisher_x',
        'title_y',
        'author_y',
        'publisher_y',
        'confItemVsRec',
        'titleLength',
        'identical',
        'score_pages',
        'score_price',
        'l2_category_feature',
        'l2_cover_feature',
        'l2_rating_feature',
        'l2_overall_feature',
    ]
)

In [93]:
# Renumber the rows 0..n-1 and discard the old index in one step, instead of first
# materialising it as an 'index' column and dropping that column again.
rec_final = rec_final.reset_index(drop=True)

In [98]:
# Display the final table: one row per kept recommendation.
rec_final

Unnamed: 0,itemId,rec_id,final_weight
0,12,65244,0.355841
1,12,17952,0.330349
2,12,30847,0.285754
3,12,50607,0.278370
4,12,72230,0.255124
...,...,...,...
4995,79016,53240,0.846500
4996,79016,22802,0.559320
4997,79016,77768,0.452185
4998,79016,6624,0.435231


In [100]:
### Save the top-5 recommendations per item
with open('recommendations_final_20210628_top5.pkl', 'wb') as f:
    pickle.dump(rec_final, f)