#### Model evaluation code for evaluating and comparing the performance of all models.
#### Major metrics: hit rate and recall.

In [1]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join('__file__','../../')))
import import_ipynb
import data_processing
import pandas as pd
import random
from config import configs #import 
# Top-N accuracy metrics constant, read from the project config: it is passed as the
# sample size when drawing non-interacted items for each test-set item.
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = configs.EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS

importing Jupyter notebook from data_processing.ipynb


  """


no. of users: 1895
no. of users with atleast 5 consumptions: 1140
no. of interactions: 72312
no. of interactions from users with at least 5 engagement: 69868
no. of unique user/item interactions: 39106
engagement in train set: 33240
engagement in on test set: 5866
               consumer_id              item_id  type_weightage
2103  -8550167523008133722  -454649054276160610        1.000000
31618  4670267857749552625  6587635730509289343        1.000000
18745  -444330148331768170 -1297580205670251233        2.000000
34490  6686431125336194142 -4994468824009200256        1.000000
27398  3576137684812235192 -5570129644089964821        1.000000
...                    ...                  ...             ...
20971   692689608292948411  2435024834845042614        2.321928
23030  1623838599684589103 -5781461435447152359        1.000000
24932  2416280733544962613  7400903238402587728        1.584963
31680  4778050608932092852  1642787330067525131        2.000000
29484  3829784524040647339  727

In [3]:
class ModelEvaluator:
    """Evaluate recommender models with Top-N hit-rate / recall metrics.

    For every item a user interacted with in the test set, the item is mixed
    with a random sample of items the user never interacted with; a "hit" is
    counted when the model ranks the test item within the first N positions
    of that mixed list.
    """

    # Column layout of the per-user metrics frame; used to build a
    # well-formed (empty) frame when there are no users to evaluate.
    _PERSON_METRIC_COLUMNS = ('hitrate@5_count', 'hitrate@10_count',
                              'interacted_count', 'recallscore@5',
                              'recallscore@10', 'person_id')

    def __init__(self):
        """Load the article catalogue and the full/train/test interaction frames."""
        self.plat_articles = data_processing.reading_content_data()[0]
        self.user_behaviour_full_indexed_df, \
        self.user_behaviour_train_indexed_df, \
        self.user_behaviour_test_indexed_df = data_processing.getting_user_behaviour_indexed()

    def _non_interacted_items(self, person_id):
        """Return a sorted list of catalogue items the user never interacted with.

        The list is sorted so that seeded sampling from it is reproducible
        regardless of set iteration order.
        """
        # presumably a collection of the item ids this user touched in the
        # full data set -- verify against data_processing.get_users_content_data
        interacted_items = data_processing.get_users_content_data(
            person_id, self.user_behaviour_full_indexed_df)
        all_items = set(self.plat_articles['item_id'])
        return sorted(all_items.difference(interacted_items))

    @staticmethod
    def _draw_sample(candidates, sample_size, seed):
        """Draw a reproducible random sample (as a set) from a sequence.

        Args:
            candidates: ordered sequence of items to sample from.
            sample_size: number of items requested; clamped to
                ``len(candidates)`` so a small pool does not raise.
            seed: seed for a private random generator.

        Returns:
            A set with ``min(sample_size, len(candidates))`` distinct items.
        """
        # A private generator keeps the process-wide ``random`` state intact.
        rng = random.Random(seed)
        return set(rng.sample(candidates, min(sample_size, len(candidates))))

    def get_not_interacted_items_sample(self, person_id, sample_size, seed=42):
        """Return a seeded random sample of items the user has not interacted with.

        ``random.sample`` no longer accepts a set (TypeError on Python 3.11+),
        so the candidates are sampled from a sorted list instead.

        Args:
            person_id: user whose interactions are excluded.
            sample_size: number of items to draw (clamped to the pool size).
            seed: seed making the draw reproducible.

        Returns:
            A set of sampled item ids.
        """
        return self._draw_sample(self._non_interacted_items(person_id), sample_size, seed)

    def _verify_hit_top_n(self, item_id, recommended_items, topn):
        """Check whether ``item_id`` is within the first ``topn`` recommendations.

        Returns:
            ``(hit, index)`` where ``index`` is the position of the first match
            in ``recommended_items`` (``-1`` when absent) and ``hit`` is 1 if
            ``0 <= index < topn``, otherwise 0.
        """
        index = next((i for i, c in enumerate(recommended_items) if c == item_id), -1)
        hit = int(0 <= index < topn)
        return hit, index

    def evaluate_model_for_user(self, model, person_id):
        """Compute hit counts and recall@5 / recall@10 for a single user.

        Args:
            model: object exposing
                ``get_item_recommendations(person_id, items_to_ignore=..., topn=...)``
                that returns a frame with an ``'item_id'`` column in ranked order.
            person_id: index label of the user in the test interaction frame.

        Returns:
            Dict with the keys ``'hitrate@5_count'``, ``'hitrate@10_count'``,
            ``'interacted_count'``, ``'recallscore@5'`` and ``'recallscore@10'``.
        """
        # Getting the items in test set. ``.loc`` yields a frame (column is a
        # Series) for several rows, but a single row yields a scalar item id.
        interacted_values_testset = self.user_behaviour_test_indexed_df.loc[person_id]
        test_item_ids = interacted_values_testset['item_id']
        if isinstance(test_item_ids, pd.Series):
            person_interacted_items_testset = set(test_item_ids)
        else:
            person_interacted_items_testset = {int(test_item_ids)}
        interacted_items_count_testset = len(person_interacted_items_testset)

        # Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.get_item_recommendations(
            person_id,
            items_to_ignore=data_processing.get_users_content_data(
                person_id, self.user_behaviour_train_indexed_df),
            topn=10000000000)
        recommended_item_ids = person_recs_df['item_id']

        # The candidate pool only depends on the user, so build it once
        # instead of once per test item.
        non_interacted_pool = self._non_interacted_items(person_id)

        hits_at_5_count = 0
        hits_at_10_count = 0
        # For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            # Random sample of items the user has not interacted with
            # (assumed to be not relevant to the user); seeded per item so the
            # evaluation is reproducible.
            non_interacted_items_sample = self._draw_sample(
                non_interacted_pool,
                EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS,
                seed=item_id % (2**32))

            # Combining the current interacted item with the sampled items
            items_to_filter_recs = non_interacted_items_sample | {item_id}

            # Keep only recommendations that are the interacted item or one of
            # the sampled non-interacted items, preserving the ranking order.
            valid_recs = recommended_item_ids[
                recommended_item_ids.isin(items_to_filter_recs)].values

            # Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, _ = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, _ = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        # Recall is the rate of the interacted items that are ranked among the
        # Top-N recommended items, when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {'hitrate@5_count': hits_at_5_count,
                          'hitrate@10_count': hits_at_10_count,
                          'interacted_count': interacted_items_count_testset,
                          'recallscore@5': recall_at_5,
                          'recallscore@10': recall_at_10}
        return person_metrics

    def evaluate_model(self, model, recommendation_model_type):
        """Evaluate ``model`` over every user present in the test set.

        Args:
            model: recommender passed through to ``evaluate_model_for_user``.
            recommendation_model_type: label stored under ``'model_type'``.

        Returns:
            ``(global_metrics, detailed_results_df)``: a dict with the model
            label and global recall@5 / recall@10, and the per-user metrics
            frame sorted by ``'interacted_count'`` in descending order.
        """
        people_metrics = []
        for person_id in self.user_behaviour_test_indexed_df.index.unique().values:
            person_metrics = self.evaluate_model_for_user(model, person_id)
            person_metrics['person_id'] = person_id
            people_metrics.append(person_metrics)
        # Report the number of users, not the last loop index (which was one
        # short and undefined when there were no users).
        print('%d users processed' % len(people_metrics))

        if people_metrics:
            detailed_results_df = pd.DataFrame(people_metrics)
        else:
            # Keep the expected columns so sorting/summing works on no data.
            detailed_results_df = pd.DataFrame(columns=list(self._PERSON_METRIC_COLUMNS))
        detailed_results_df = detailed_results_df.sort_values('interacted_count', ascending=False)
        print(detailed_results_df)

        total_interacted = detailed_results_df['interacted_count'].sum()
        if total_interacted:
            global_recall_at_5 = detailed_results_df['hitrate@5_count'].sum() / float(total_interacted)
            global_recall_at_10 = detailed_results_df['hitrate@10_count'].sum() / float(total_interacted)
        else:
            # No test interactions at all: define recall as 0 rather than dividing by zero.
            global_recall_at_5 = 0.0
            global_recall_at_10 = 0.0

        global_metrics = {'model_type': recommendation_model_type,
                          'recallscore@5': global_recall_at_5,
                          'recallscore@10': global_recall_at_10}
        return global_metrics, detailed_results_df

    
# Shared evaluator instance created when this cell/module runs; construction
# loads the content and user-behaviour data through data_processing.
model_evaluator = ModelEvaluator()    