<a href="https://colab.research.google.com/github/sparsh-ai/chef-session/blob/main/notebooks/chef_session_01_booking_itempop.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup

In [1]:
import os
# Repository coordinates referenced by the git cells further down.
project_name = "chef-session"
branch = "main"
account = "sparsh-ai"

# Directory under /content where the repository is checked out.
project_path = os.path.join('/content', project_name)

In [None]:
# When project_path does not exist yet: install DVC, copy git credentials,
# create the directory and pull the repository into it.
# Otherwise just change into the existing checkout.
if not os.path.exists(project_path):
    !pip install -U -q dvc dvc[gdrive]
    # NOTE(review): assumes Google Drive is already mounted at /content/drive — confirm.
    !cp -r /content/drive/MyDrive/git_credentials/. ~
    # Rebuilds the same location as project_path by string concatenation.
    path = "/content/" + project_name; 
    !mkdir "{path}"
    %cd "{path}"
    # Initialise an empty repository and pull the configured branch from GitHub.
    !git init
    !git remote add origin https://github.com/"{account}"/"{project_name}".git
    !git pull origin "{branch}"
    !git checkout "{branch}"
else:
    %cd "{project_path}"

In [None]:
# Show the state of the working tree before committing.
!git status

In [None]:
# Stage everything, commit with the fixed message 'commit', then push to the
# configured branch; `&&` stops the chain at the first failing step.
!git add . && git commit -m 'commit' && git push origin "{branch}"

In [None]:
# Pull the DVC-tracked targets under data/bronze/booking (the parquet files
# read below are presumably among them — confirm against the .dvc files).
!dvc pull ./data/bronze/booking/*

## Context

- Dataset: Booking.com trip bookings (`train`, `test`, and `ground_truth` parquet files)
    - Baseline: item-popularity recommender, evaluated with hit rate @ 4


## Prototype

In [None]:
import pandas as pd

In [None]:
# Load the training bookings and order them by trip id, then check-in date.
train = (
    pd.read_parquet('./data/bronze/booking/train.parquet.snappy')
    .sort_values(by=['utrip_id', 'checkin'])
)
train

Unnamed: 0,user_id,checkin,checkout,city_id,device_class,affiliate_id,booker_country,hotel_country,utrip_id
0,1000027,2016-08-13,2016-08-14,8183,desktop,7168,Elbonia,Gondal,1000027_1
1,1000027,2016-08-14,2016-08-16,15626,desktop,7168,Elbonia,Gondal,1000027_1
2,1000027,2016-08-16,2016-08-18,60902,desktop,7168,Elbonia,Gondal,1000027_1
3,1000027,2016-08-18,2016-08-21,30628,desktop,253,Elbonia,Gondal,1000027_1
4,1000033,2016-04-09,2016-04-11,38677,mobile,359,Gondal,Cobra Island,1000033_1
...,...,...,...,...,...,...,...,...,...
1166830,999855,2016-05-01,2016-05-02,20345,mobile,359,Gondal,Fook Island,999855_1
1166831,999944,2016-06-23,2016-06-24,17944,desktop,4541,Gondal,Glubbdubdrib,999944_1
1166832,999944,2016-06-24,2016-06-27,47075,desktop,2322,Gondal,Glubbdubdrib,999944_1
1166833,999944,2016-06-27,2016-06-29,228,desktop,384,Gondal,Glubbdubdrib,999944_1


In [None]:
# Load the test bookings and order them by trip id, then check-in date.
test = (
    pd.read_parquet('./data/bronze/booking/test.parquet.snappy')
    .sort_values(by=['utrip_id', 'checkin'])
)
test

Unnamed: 0,user_id,checkin,checkout,device_class,affiliate_id,booker_country,utrip_id,city_id,hotel_country
0,1000066,2016-07-21,2016-07-23,desktop,9924,Gondal,1000066_2,56430,Urkesh
1,1000066,2016-07-23,2016-07-25,desktop,9924,Gondal,1000066_2,41971,Urkesh
2,1000066,2016-07-25,2016-07-28,desktop,9924,Gondal,1000066_2,5797,Urkesh
3,1000066,2016-07-28,2016-07-31,mobile,2436,Gondal,1000066_2,0,
4,1000270,2016-02-08,2016-02-09,mobile,9452,The Devilfire Empire,1000270_1,50075,The Devilfire Empire
...,...,...,...,...,...,...,...,...,...
378662,999911,2016-10-07,2016-10-08,desktop,9598,Gondal,999911_1,0,
378663,999991,2016-08-15,2016-08-17,desktop,8065,Elbonia,999991_3,29770,Elbonia
378664,999991,2016-08-18,2016-08-19,desktop,8065,Elbonia,999991_3,36170,Carpathia
378665,999991,2016-08-19,2016-08-20,tablet,3631,Elbonia,999991_3,52155,Elbonia


In [None]:
# Dummy predictions: the four most frequent cities in the training set are
# recommended to every test trip as the benchmark.
topcities = train['city_id'].value_counts().index[:4]

# One row per distinct trip id in the test set, re-indexed from zero.
test_trips = test[['utrip_id']].drop_duplicates().reset_index(drop=True)

# Repeat the same four cities for every trip and place them next to the trip id.
cities_prediction = pd.concat(
    [
        test_trips,
        pd.DataFrame(
            [topcities] * len(test_trips),
            columns=['city_id_{}'.format(rank) for rank in range(1, 5)],
        ),
    ],
    axis=1,
)
cities_prediction

Unnamed: 0,utrip_id,city_id_1,city_id_2,city_id_3,city_id_4
0,1000066_2,47499,23921,36063,17013
1,1000270_1,47499,23921,36063,17013
2,1000441_1,47499,23921,36063,17013
3,100048_1,47499,23921,36063,17013
4,1000543_1,47499,23921,36063,17013
...,...,...,...,...,...
70657,999674_1,47499,23921,36063,17013
70658,999797_1,47499,23921,36063,17013
70659,999862_1,47499,23921,36063,17013
70660,999911_1,47499,23921,36063,17013


In [None]:
# Load the held-out answers and key them by trip id so they can be joined
# onto the predictions.
ground_truth = (
    pd.read_parquet('./data/bronze/booking/ground_truth.parquet.snappy')
    .set_index('utrip_id')
)
ground_truth

Unnamed: 0_level_0,city_id,hotel_country
utrip_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1038944_1,54085,Sokovia
1068715_1,29319,Cobra Island
1075528_1,55763,Bozatta
1110462_4,11930,Alvonia
1132565_1,58659,Axphain
...,...,...
881470_1,28422,Cobra Island
886479_1,51291,Glubbdubdrib
90072_1,22175,Cobra Island
96245_1,58135,Nevoruss


In [None]:
def evaluate_accuracy_at_4(predicted, actual):
    '''Share of rows whose true city is one of the four recommended cities.

    predicted: frame with a 'utrip_id' column and 'city_id_1'..'city_id_4'.
    actual: frame indexed by trip id with a 'city_id' column; it is
        left-joined onto predicted, so rows without a match count as misses.
    '''
    merged = predicted.join(actual, on='utrip_id')
    true_city = merged['city_id']

    # One boolean Series per recommendation slot, OR-ed together.
    slot_matches = [
        true_city == merged[slot]
        for slot in ('city_id_1', 'city_id_2', 'city_id_3', 'city_id_4')
    ]
    any_match = slot_matches[0]
    for slot_match in slot_matches[1:]:
        any_match = any_match | slot_match

    hits = any_match * 1
    return hits.mean()

In [None]:
# Score the popularity benchmark against the held-out final destinations.
evaluate_accuracy_at_4(predicted=cities_prediction, actual=ground_truth)

## Tests

In [None]:
# Install ipytest (runs pytest from inside notebook cells) and apply its
# default notebook configuration.
!pip install -q ipytest
import ipytest
ipytest.autoconfig()

In [None]:
# %%ipytest


In [None]:
# API design sketch, not runnable code: the `name: type` annotations inside
# the calls below are not valid Python call syntax.
# NOTE(review): the classes implemented in the Dev section are named
# `Evaluate` and `HitRate`; this sketch uses `Evaluator` and `Metrics().HitRate`.
# NOTE(review): `eval` shadows the Python builtin of the same name.
train = Dataset(path: str)
test = Dataset(path: str)
model = Model()
model.fit(train: pd.DataFrame)
model.recommend(test: pd.DataFrame, topk=4)

metrics = Metrics()
hr = metrics.HitRate(k=4)

eval = Evaluator(model,
                 data = test,
                 metrics=[hr])
eval.evaluate()
eval.save_results(path: str)

## Dev

In [65]:
import numpy as np
import pandas as pd
from typing import List

In [54]:
class Dataset:
    """Thin, chainable wrapper around a pandas DataFrame.

    Each transformation stores its result on ``self.data`` and returns
    ``self`` so calls can be chained, e.g.
    ``Dataset().load(path).sort(by=[...]).filter(keep=[...])``.
    """

    def __init__(self, data=None):
        """Wrap an existing DataFrame, or start empty and call ``load`` later."""
        self.data = data

    def load(self, path, type='parquet'):
        """Read ``path`` into ``self.data``.

        Only ``type='parquet'`` is handled; any other value leaves
        ``self.data`` untouched.

        Returns:
            self, for chaining.
        """
        if type == 'parquet':
            self.data = pd.read_parquet(path)
        return self

    def sort(self, by: List):
        """Sort the rows by the given column names.

        Returns:
            self, for chaining.
        """
        # sort_values returns a new frame; it has to be stored, otherwise
        # the sort is silently discarded.
        self.data = self.data.sort_values(by=by)
        return self

    def filter(self, by='cols', keep=None):
        """Keep only the columns listed in ``keep`` (when ``by == 'cols'``).

        ``keep`` defaults to no columns, i.e. an empty column selection.
        Any other ``by`` value leaves ``self.data`` untouched.

        Returns:
            self, for chaining.
        """
        # None stands in for the empty list to avoid a shared mutable default.
        if keep is None:
            keep = []
        if by == 'cols':
            self.data = self.data[keep]
        return self

    def rename(self, rename_map):
        """Rename columns according to ``rename_map`` (old name -> new name).

        Returns:
            self, for chaining.
        """
        self.data = self.data.rename(columns=rename_map)
        return self

    def cast(self, schema_map):
        """Cast columns according to ``schema_map`` (column name -> dtype).

        Returns:
            self, for chaining.
        """
        self.data = self.data.astype(schema_map)
        return self

    def __repr__(self):
        """Return the frame's ``info()`` summary followed by its first rows."""
        if self.data is None:
            return 'Dataset(data=None)'

        import io

        # DataFrame.info() prints to stdout and returns None, so capture its
        # text in a buffer instead of formatting the return value.
        info_buffer = io.StringIO()
        self.data.info(buf=info_buffer)
        separator = '=' * 100
        return '{}\n{}\n{}\n{}'.format(
            info_buffer.getvalue().rstrip('\n'),
            separator,
            self.data.head(),
            separator,
        )

In [60]:
class Model:
    """Item-popularity recommender: everyone gets the most frequent items."""

    def __init__(self):
        # Item ids ordered from most to least frequent; filled in by fit().
        self.items_by_popularity = []

    def fit(self, train):
        """Rank the items in ``train.data['ITEM_ID']`` by how often they occur."""
        item_counts = train.data['ITEM_ID'].value_counts()
        self.items_by_popularity = item_counts.index.tolist()

    def recommend(self, uid=None, topk=4):
        """Return the ``topk`` most popular items; ``uid`` is not used."""
        ranked_items = self.items_by_popularity
        return ranked_items[:topk]

In [67]:
class HitRate:
    """Hit rate at k: 1 when any actual item is in the first k recommendations."""

    def __init__(self, k=4):
        # Number of leading recommendations that count towards a hit.
        self.k = k

    def calculate(self, recommended_list, actual_list):
        """Return 1 if any element of ``actual_list`` is in the top-k slice, else 0."""
        top_k = np.array(recommended_list)[:self.k]
        is_recommended = np.isin(np.array(actual_list), top_k)
        # Multiplying by 1 turns the boolean into an integer flag.
        return is_recommended.any() * 1

    def __repr__(self):
        return f'HR@{self.k}'

In [107]:
class Evaluate:
    """Scores a recommender against held-out ground truth.

    Recommendations for every id in ``test_ids`` are computed once at
    construction time; ``evaluate`` then averages each metric over those ids.
    """

    def __init__(self, model, test_ids, ground_truth, metrics):
        """
        Args:
            model: object exposing ``recommend(uid)``.
            test_ids: ids to score. Iterated more than once, so pass a
                re-iterable sequence rather than a one-shot iterator.
            ground_truth: mapping indexed as ``ground_truth[test_id]``.
            metrics: metric objects exposing
                ``calculate(recommended_list=..., actual_list=...)``. They are
                used as keys of ``results`` and must be hashable.
        """
        self.model = model
        self.test_ids = test_ids
        self.ground_truth = ground_truth
        self.metrics = metrics
        self.results = {}
        self.recommendations = {}
        self._calculate_recommendations()

    def _calculate_recommendations(self):
        """Cache the model's recommendation for every test id."""
        for test_id in self.test_ids:
            self.recommendations[test_id] = self.model.recommend(test_id)

    def evaluate(self):
        """Store the mean score of each metric in ``results``.

        Returns:
            self, so the call can be chained or displayed directly.

        Raises:
            KeyError: if a test id is missing from ``ground_truth``.
        """
        for metric in self.metrics:
            scores = [
                metric.calculate(recommended_list=self.recommendations[test_id],
                                 actual_list=self.ground_truth[test_id])
                for test_id in self.test_ids
            ]
            # Store a plain float so the repr and the saved file do not depend
            # on how the installed NumPy prints its scalar types. With no ids
            # the mean is undefined, so store NaN without a NumPy warning.
            self.results[metric] = float(np.mean(scores)) if scores else float('nan')
        return self

    def save_results(self, path):
        """Write the string form of ``results`` to the text file at ``path``."""
        with open(path, 'wt') as handle:
            # Write the results to the open handle; the dict has no write().
            handle.write(str(self.results))

    def __repr__(self):
        return str(self.results)

---

In [56]:
train = Dataset()

# Load the training bookings and reduce them to (USER_ID, ITEM_ID) string
# pairs, where a "user" is a trip and an "item" is a city.
train_info = (
    train.load('./data/bronze/booking/train.parquet.snappy')
    .sort(by=['utrip_id', 'checkin'])
    .filter(by='cols', keep=['utrip_id', 'city_id'])
    .rename({'utrip_id': 'USER_ID', 'city_id': 'ITEM_ID'})
    .cast({'USER_ID': 'str', 'ITEM_ID': 'str'})
)
train_info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1166835 entries, 0 to 1166834
Data columns (total 2 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   USER_ID  1166835 non-null  object
 1   ITEM_ID  1166835 non-null  object
dtypes: object(2)
memory usage: 17.8+ MB


None
     USER_ID ITEM_ID
0  1000027_1    8183
1  1000027_1   15626
2  1000027_1   60902
3  1000027_1   30628
4  1000033_1   38677

In [57]:
test = Dataset()

# Same preparation as the training set: (USER_ID, ITEM_ID) string pairs.
test_info = (
    test.load('./data/bronze/booking/test.parquet.snappy')
    .sort(by=['utrip_id', 'checkin'])
    .filter(by='cols', keep=['utrip_id', 'city_id'])
    .rename({'utrip_id': 'USER_ID', 'city_id': 'ITEM_ID'})
    .cast({'USER_ID': 'str', 'ITEM_ID': 'str'})
)
test_info

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 378667 entries, 0 to 378666
Data columns (total 2 columns):
 #   Column   Non-Null Count   Dtype 
---  ------   --------------   ----- 
 0   USER_ID  378667 non-null  object
 1   ITEM_ID  378667 non-null  object
dtypes: object(2)
memory usage: 5.8+ MB


None
     USER_ID ITEM_ID
0  1000066_2   56430
1  1000066_2   41971
2  1000066_2    5797
3  1000066_2       0
4  1000270_1   50075

In [63]:
# Fit the popularity baseline on the training data, then preview the
# recommendation returned for one test trip id.
model = Model()
model.fit(train=train)
model.recommend(uid='1000066_2')

['47499', '23921', '36063', '17013']

In [74]:
hitrate = HitRate(k=4)

# Sanity check: '4' lies inside the top-4 cut-off (hit), '5' just outside (miss).
print(
    *(
        hitrate.calculate(recommended_list=['1', '2', '3', '4', '5'], actual_list=[item])
        for item in ('4', '5')
    ),
    sep='\n',
)

1
0


In [89]:
# Load the held-out answers as (USER_ID, ITEM_ID) string pairs.
gt_info = (
    Dataset()
    .load('./data/bronze/booking/ground_truth.parquet.snappy')
    .filter(by='cols', keep=['utrip_id', 'city_id'])
    .rename({'utrip_id': 'USER_ID', 'city_id': 'ITEM_ID'})
    .cast({'USER_ID': 'str', 'ITEM_ID': 'str'})
)

# Collapse to a {USER_ID: ITEM_ID} lookup, keeping the last row per trip.
ground_truth = (
    gt_info.data
    .drop_duplicates(subset='USER_ID', keep='last')
    .set_index('USER_ID')['ITEM_ID']
    .to_dict()
)

print(type(ground_truth), len(ground_truth))

<class 'dict'> 70662


In [108]:
# Build the evaluator over every distinct test trip id.
# NOTE(review): `eval` shadows the Python builtin of the same name.
eval = Evaluate(
    model=model,
    test_ids=test.data['USER_ID'].unique(),
    ground_truth=ground_truth,
    metrics=[hitrate],
)

In [109]:
# evaluate() returns the Evaluate instance, so the cell output is its
# __repr__, i.e. the results dict keyed by metric.
eval.evaluate()

{HR@4: 0.05271574537941185}