# Метрики

## Imports

In [1]:
import pandas as pd
import numpy as np

from tqdm.auto import tqdm
from rectools import Columns

### Загрузим данные МТС Кион

Датасет необходимо скачать: https://ods.ai/competitions/competition-recsys-21/data

Оттуда нужны файлы interactions.csv, items.csv и users.csv

### Зафиксируем random seed

In [2]:
np.random.seed(23)

## Read data

In [3]:
interactions = pd.read_csv('interactions.csv', parse_dates=['last_watch_dt'])

interactions.rename(
    columns={
        'last_watch_dt': Columns.Datetime,
        'total_dur': Columns.Weight
    }, 
    inplace=True) 


In [4]:
users = pd.read_csv('users.csv')
items = pd.read_csv('items.csv')

In [5]:
def headtail(df):
    return pd.concat([df.head(), df.tail()])

headtail(interactions)


Unnamed: 0,user_id,item_id,datetime,weight,watched_pct
0,176549,9506,2021-05-11,4250,72.0
1,699317,1659,2021-05-29,8317,100.0
2,656683,7107,2021-05-09,10,0.0
3,864613,7638,2021-07-05,14483,100.0
4,964868,9506,2021-04-30,6725,100.0
5476246,648596,12225,2021-08-13,76,0.0
5476247,546862,9673,2021-04-13,2308,49.0
5476248,697262,15297,2021-08-20,18307,63.0
5476249,384202,16197,2021-04-19,6203,100.0
5476250,319709,4436,2021-08-15,3921,45.0


In [6]:
interactions.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5476251 entries, 0 to 5476250
Data columns (total 5 columns):
 #   Column       Dtype         
---  ------       -----         
 0   user_id      int64         
 1   item_id      int64         
 2   datetime     datetime64[ns]
 3   weight       int64         
 4   watched_pct  float64       
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 208.9 MB


Выделим небольшой кусок из данных, чтобы не слишком страдать

In [7]:
sample_users = [57607, 403227, 70720]
df = interactions[interactions[Columns.User].isin(sample_users)].sort_values("user_id").reset_index(drop=True)
del df[Columns.Datetime], df[Columns.Weight], df['watched_pct']
df


Unnamed: 0,user_id,item_id
0,57607,4151
1,57607,10440
2,57607,13865
3,70720,4880
4,70720,4881
5,70720,6327
6,403227,6353
7,403227,1736
8,403227,5336
9,403227,181


In [8]:
print('Users', df[Columns.User].unique())
sample_items = df[Columns.Item].unique()
print('Items', sample_items)


Users [ 57607  70720 403227]
Items [ 4151 10440 13865  4880  4881  6327  6353  1736  5336   181]


## Regression

С регрессией все относительно просто. По (user, item) мы знаем таргет (рейтинг чаще всего) и по такой же паре предсказываем его

In [9]:
df['target'] = np.random.choice([3, 4, 5], df.shape[0])
df['predict'] = np.random.rand(df.shape[0]) * 3 + 2
df


Unnamed: 0,user_id,item_id,target,predict
0,57607,4151,5,2.658958
1,57607,10440,3,4.500017
2,57607,13865,4,3.280296
3,70720,4880,5,4.263719
4,70720,4881,3,4.990597
5,70720,6327,4,2.000249
6,403227,6353,3,4.927865
7,403227,1736,5,3.257364
8,403227,5336,4,4.044672
9,403227,181,5,4.257889


Общая оценка

In [10]:
mae = (df['target'] - df['predict']).abs().mean()
print(mae)

1.374467645324227


Оценка по пользователю с последюущим усреднением

In [11]:
df['diff'] = (df['target'] - df['predict']).abs()
average_mae = df.groupby(Columns.User)['diff'].mean()
print(average_mae.mean())
average_mae

1.4033728201405007


user_id
57607     1.520254
70720     1.575543
403227    1.114321
Name: diff, dtype: float64

Видно, что в данном случае метрики близки к друг другу, но это не всегда так

In [12]:
del df['target'], df['predict'], df['diff']

## Classification

Сгенерируем случайные рекомендации.

In [13]:
top_k = 5
recs = np.array([
    np.random.choice(sample_items, top_k, replace=False),
    np.random.choice(sample_items, top_k, replace=False),
    np.random.choice(sample_items, top_k, replace=False),
])
recs

array([[ 4151,  6353, 13865,  6327,  4880],
       [  181,  6327,  4151,  1736,  4881],
       [10440,  4880,  4881, 13865,  6327]])

Преобразуем в длинный датафрейм

In [14]:
df_rec = pd.DataFrame({
    Columns.User: np.repeat(sample_users, top_k),
    Columns.Item: recs.ravel()
})
df_rec


Unnamed: 0,user_id,item_id
0,57607,4151
1,57607,6353
2,57607,13865
3,57607,6327
4,57607,4880
5,403227,181
6,403227,6327
7,403227,4151
8,403227,1736
9,403227,4881


In [15]:
df_rec[Columns.Rank] = df_rec.groupby(Columns.User).cumcount() + 1
df_rec

Unnamed: 0,user_id,item_id,rank
0,57607,4151,1
1,57607,6353,2
2,57607,13865,3
3,57607,6327,4
4,57607,4880,5
5,403227,181,1
6,403227,6327,2
7,403227,4151,3
8,403227,1736,4
9,403227,4881,5


Ключевой момент. Именно ради него преобразовывали данные и именно это позволяет считать метрики быстрее.

In [16]:
df_rec = df.merge(df_rec, how='left', left_on=Columns.UserItem, right_on=Columns.UserItem)
df_rec = df_rec.sort_values(by=[Columns.User, Columns.Rank])
df_rec

Unnamed: 0,user_id,item_id,rank
0,57607,4151,1.0
2,57607,13865,3.0
1,57607,10440,
3,70720,4880,2.0
4,70720,4881,3.0
5,70720,6327,5.0
9,403227,181,1.0
7,403227,1736,4.0
6,403227,6353,
8,403227,5336,


### Precision@K

In [17]:
df_rec[f'TP@5'] = df_rec['rank'] <= 5
df_rec

Unnamed: 0,user_id,item_id,rank,TP@5
0,57607,4151,1.0,True
2,57607,13865,3.0,True
1,57607,10440,,False
3,70720,4880,2.0,True
4,70720,4881,3.0,True
5,70720,6327,5.0,True
9,403227,181,1.0,True
7,403227,1736,4.0,True
6,403227,6353,,False
8,403227,5336,,False


In [18]:
df_rec[df_rec[Columns.Rank].notnull()]

Unnamed: 0,user_id,item_id,rank,TP@5
0,57607,4151,1.0,True
2,57607,13865,3.0,True
3,70720,4880,2.0,True
4,70720,4881,3.0,True
5,70720,6327,5.0,True
9,403227,181,1.0,True
7,403227,1736,4.0,True


Посчитаем вручную Precision@5 (усредняем по юзерам): (2/5 + 3/5 + 2/5) / 3

In [19]:
(2/5 + 3/5 + 2/5) / 3

0.4666666666666666

Посчитаем через groupby

In [20]:
df_rec['TP@5/5'] = df_rec['TP@5'] / top_k 

p5 = df_rec.groupby(Columns.User)['TP@5/5'].sum().mean()

print(f'Precision@5 = {p5}')

Precision@5 = 0.4666666666666666


In [21]:
df_rec

Unnamed: 0,user_id,item_id,rank,TP@5,TP@5/5
0,57607,4151,1.0,True,0.2
2,57607,13865,3.0,True,0.2
1,57607,10440,,False,0.0
3,70720,4880,2.0,True,0.2
4,70720,4881,3.0,True,0.2
5,70720,6327,5.0,True,0.2
9,403227,181,1.0,True,0.2
7,403227,1736,4.0,True,0.2
6,403227,6353,,False,0.0
8,403227,5336,,False,0.0


Используем тот факт, что мы знаем количество пользователей, а значит groupby не нужен

In [22]:
p5 = df_rec['TP@5/5'].sum() / len(sample_users)
print(f'Precision@5 = {p5}')

Precision@5 = 0.46666666666666673


### Recall@K

In [23]:
df_rec['actual'] = df_rec.groupby(Columns.User)[Columns.Item].transform('count')
df_rec

Unnamed: 0,user_id,item_id,rank,TP@5,TP@5/5,actual
0,57607,4151,1.0,True,0.2,3
2,57607,13865,3.0,True,0.2,3
1,57607,10440,,False,0.0,3
3,70720,4880,2.0,True,0.2,3
4,70720,4881,3.0,True,0.2,3
5,70720,6327,5.0,True,0.2,3
9,403227,181,1.0,True,0.2,4
7,403227,1736,4.0,True,0.2,4
6,403227,6353,,False,0.0,4
8,403227,5336,,False,0.0,4


In [24]:
df_rec['TP@5/actual'] = df_rec['TP@5'] / df_rec['actual']
df_rec

Unnamed: 0,user_id,item_id,rank,TP@5,TP@5/5,actual,TP@5/actual
0,57607,4151,1.0,True,0.2,3,0.333333
2,57607,13865,3.0,True,0.2,3,0.333333
1,57607,10440,,False,0.0,3,0.0
3,70720,4880,2.0,True,0.2,3,0.333333
4,70720,4881,3.0,True,0.2,3,0.333333
5,70720,6327,5.0,True,0.2,3,0.333333
9,403227,181,1.0,True,0.2,4,0.25
7,403227,1736,4.0,True,0.2,4,0.25
6,403227,6353,,False,0.0,4,0.0
8,403227,5336,,False,0.0,4,0.0


In [25]:
(2/3 + 3/3 + 2/4) / 3

0.7222222222222222

In [26]:
r5 = df_rec.groupby(Columns.User)['TP@5/actual'].sum().mean()
print(f'Recall@5 = {r5}')

Recall@5 = 0.7222222222222222


In [27]:
r5 = df_rec['TP@5/actual'].sum() / len(sample_users)
print(f'Recall@5 = {r5}')

Recall@5 = 0.7222222222222222


## Ranking

### MAP@K

In [28]:
df_rec

Unnamed: 0,user_id,item_id,rank,TP@5,TP@5/5,actual,TP@5/actual
0,57607,4151,1.0,True,0.2,3,0.333333
2,57607,13865,3.0,True,0.2,3,0.333333
1,57607,10440,,False,0.0,3,0.0
3,70720,4880,2.0,True,0.2,3,0.333333
4,70720,4881,3.0,True,0.2,3,0.333333
5,70720,6327,5.0,True,0.2,3,0.333333
9,403227,181,1.0,True,0.2,4,0.25
7,403227,1736,4.0,True,0.2,4,0.25
6,403227,6353,,False,0.0,4,0.0
8,403227,5336,,False,0.0,4,0.0


In [29]:
df_rec['cumTP@5'] = df_rec.groupby(Columns.User)['TP@5'].cumsum()
df_rec

Unnamed: 0,user_id,item_id,rank,TP@5,TP@5/5,actual,TP@5/actual,cumTP@5
0,57607,4151,1.0,True,0.2,3,0.333333,1
2,57607,13865,3.0,True,0.2,3,0.333333,2
1,57607,10440,,False,0.0,3,0.0,2
3,70720,4880,2.0,True,0.2,3,0.333333,1
4,70720,4881,3.0,True,0.2,3,0.333333,2
5,70720,6327,5.0,True,0.2,3,0.333333,3
9,403227,181,1.0,True,0.2,4,0.25,1
7,403227,1736,4.0,True,0.2,4,0.25,2
6,403227,6353,,False,0.0,4,0.0,2
8,403227,5336,,False,0.0,4,0.0,2


In [30]:
df_rec['Prec@5'] = df_rec['cumTP@5'] / df_rec[Columns.Rank]
df_rec

Unnamed: 0,user_id,item_id,rank,TP@5,TP@5/5,actual,TP@5/actual,cumTP@5,Prec@5
0,57607,4151,1.0,True,0.2,3,0.333333,1,1.0
2,57607,13865,3.0,True,0.2,3,0.333333,2,0.666667
1,57607,10440,,False,0.0,3,0.0,2,
3,70720,4880,2.0,True,0.2,3,0.333333,1,0.5
4,70720,4881,3.0,True,0.2,3,0.333333,2,0.666667
5,70720,6327,5.0,True,0.2,3,0.333333,3,0.6
9,403227,181,1.0,True,0.2,4,0.25,1,1.0
7,403227,1736,4.0,True,0.2,4,0.25,2,0.5
6,403227,6353,,False,0.0,4,0.0,2,
8,403227,5336,,False,0.0,4,0.0,2,


In [31]:
df_rec['Prec@5/actual'] = df_rec['Prec@5'] / df_rec['actual']
df_rec

Unnamed: 0,user_id,item_id,rank,TP@5,TP@5/5,actual,TP@5/actual,cumTP@5,Prec@5,Prec@5/actual
0,57607,4151,1.0,True,0.2,3,0.333333,1,1.0,0.333333
2,57607,13865,3.0,True,0.2,3,0.333333,2,0.666667,0.222222
1,57607,10440,,False,0.0,3,0.0,2,,
3,70720,4880,2.0,True,0.2,3,0.333333,1,0.5,0.166667
4,70720,4881,3.0,True,0.2,3,0.333333,2,0.666667,0.222222
5,70720,6327,5.0,True,0.2,3,0.333333,3,0.6,0.2
9,403227,181,1.0,True,0.2,4,0.25,1,1.0,0.25
7,403227,1736,4.0,True,0.2,4,0.25,2,0.5,0.125
6,403227,6353,,False,0.0,4,0.0,2,,
8,403227,5336,,False,0.0,4,0.0,2,,


In [32]:
map = df_rec.groupby(Columns.User)['Prec@5/actual'].sum()
print(map.mean())
map

0.5064814814814814


user_id
57607     0.555556
70720     0.588889
403227    0.375000
Name: Prec@5/actual, dtype: float64

In [33]:
def precision_naive(target, users, recs, k):
    precision = []
    for i, user in enumerate(users):
        p = 0
        user_target = target[target[:, 0] == user][:, 1]
        for rec in recs[i]:
            if rec in user_target:
                p += 1
        precision.append(p)
    return sum(precision) / len(users) / k

## Naive vs Pandas

In [34]:
df = interactions[Columns.UserItem]
df

Unnamed: 0,user_id,item_id
0,176549,9506
1,699317,1659
2,656683,7107
3,864613,7638
4,964868,9506
...,...,...
5476246,648596,12225
5476247,546862,9673
5476248,697262,15297
5476249,384202,16197


In [35]:
target = df.values
target


array([[176549,   9506],
       [699317,   1659],
       [656683,   7107],
       ...,
       [697262,  15297],
       [384202,  16197],
       [319709,   4436]])

In [36]:
target[target[:, 0] == 176549][:, 1]  # таргет для 1 случайного юзера


array([ 9506, 15469,  9164, 12250,  5695, 11345,  7582, 10881,  5051,
        7453,  3258,  7626, 13243, 10761,  5600,  5533, 16197, 12302,
        6626,  9352, 10605,  1343,  8972,  6649, 11919,  1132,   899,
        5087, 14684,  4387,  4756, 15096,  1659,  1641, 10954,  2358,
       13018,  1105, 13424, 10732,  4360, 14689,  8211, 12324,   349,
       11654, 14006,  2956,  8437, 12770,  2722,   149, 10688, 14217,
        8283,  7000,  3182, 12104,  5171,  5411, 15915,  5956,  3834,
       11494,  6870, 15719,  2505, 15464, 14317, 11689, 10544, 15531,
       12448,  9728,   101,  7102, 11539, 16166,  4718,  4273,  7401,
       14470])

In [37]:
recs


array([[ 4151,  6353, 13865,  6327,  4880],
       [  181,  6327,  4151,  1736,  4881],
       [10440,  4880,  4881, 13865,  6327]])

In [38]:
sample_users

[57607, 403227, 70720]

#### Напишем функцию расчета precision@k "в лоб"

In [39]:
def precision_naive(target, users, recs, k):
    precision = []
    for i, user in enumerate(users):
        p = 0
        user_target = target[target[:, 0] == user][:, 1]
        for rec in recs[i]:
            if rec in user_target:
                p += 1
        precision.append(p)
    return sum(precision) / len(users) / k

In [40]:
precision_naive(target, sample_users, recs, 5)

0.4666666666666667

#### Напишем реализацию функции с использованием pandas

In [41]:
def precision_pandas(df, users, recs, k):
    df_recs = pd.DataFrame({
        Columns.User: np.repeat(users, k),
        Columns.Item: recs.ravel()
    })
    df_recs[Columns.Rank] = df_recs.groupby(Columns.User).cumcount() + 1
    df_recs = df.merge(df_recs, how='left', left_on=Columns.UserItem, right_on=Columns.UserItem)
    tp_k = f'TP@{k}'
    df_recs[tp_k] = df_recs[Columns.Rank] < (k + 1)
    p = df_recs[tp_k].sum() / k / len(users)
    return p

In [42]:
precision_pandas(df, sample_users, recs, 5)

0.4666666666666666

#### Посмотрим через `timeit`

In [43]:
%timeit precision_naive(target, sample_users, recs, 5)

37.4 ms ± 1.76 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [44]:
%timeit precision_pandas(df, sample_users, recs, 5)

1.18 s ± 25.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Почему так произошло?

#### Попробуем взять больше данных

In [45]:
def generate_subsample(users_count, top_k):
    users = np.random.choice(interactions[Columns.User].unique(), users_count, replace=False)
    df = interactions[interactions[Columns.User].isin(users)].reset_index(drop=True)
    del df[Columns.Datetime], df[Columns.Weight], df['watched_pct']
    
    recs = np.random.choice(df[Columns.Item], size=(users_count, top_k))
    return df, users, recs

In [46]:
top_k = 10
df, users, recs = generate_subsample(10000, top_k)
target = df.values

In [47]:
users

array([628072,  15993, 470329, ..., 721843, 987818,  28484])

In [48]:
precision_naive(target, users, recs, top_k)

0.03048

In [49]:
%timeit precision_naive(target, users, recs, top_k)

1.32 s ± 29.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [50]:
precision_pandas(df, users, recs, top_k)

0.03048

In [51]:
%timeit precision_pandas(df, users, recs, top_k)

17.2 ms ± 132 μs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## Homework Метрики и бейзлайны**

Добавьте реализацию метрик (по аналогии с Precision) MAP, Recall и применить к данным выше (из семинара), полученным с помощью generate_subsample

k взять разным: 1, 5, 10

На выходе - сравнение 

In [58]:
def recall_naive(df, users, recs, top_k):
    target = df.values
    recall = []
    for i, user in enumerate(users):
        p = 0
        n = 0
        user_target = target[target[:, 0] == user][:, 1]
        for rec in recs[i]:
            if rec in user_target:
                p += 1
        for item in user_target:
            if item not in recs[i]:
                n += 1
        recall.append(p / (p + n))
    return sum(recall) / len(users)


def recall_pandas(df, users, recs, top_k):
    df_rec = pd.DataFrame(
        {Columns.User: np.repeat(users, top_k), Columns.Item: recs.ravel()}
    )

    df_rec[Columns.Rank] = df_rec.groupby(Columns.User).cumcount() + 1
    df_rec = df.merge(
        df_rec,
        how="left",
        left_on=Columns.UserItem,
        right_on=Columns.UserItem,
    )
    df_rec = df_rec.sort_values(by=[Columns.User, Columns.Rank])

    tp_k = f"TP@{top_k}"
    df_rec[tp_k] = df_rec[Columns.Rank] < (top_k + 1)
    df_rec["actual"] = df_rec.groupby(Columns.User)[Columns.Item].transform("count")

    df_rec = df_rec[df_rec[Columns.User].isin(users)]
    df_rec[f"{tp_k}/actual"] = df_rec[f"{tp_k}"] / df_rec["actual"]

    return df_rec.groupby(Columns.User)[f"{tp_k}/actual"].sum().mean()


# AP@K metric (sub function for map_naive)
def apk(actual, predicted, k):

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)


def map_naive(df, users, recs, top_k):
    target = df.values
    user_target = []
    for user in users:
        user_target.append(list(target[target[:, 0] == user][:, 1]))
    return np.mean([apk(a, p, top_k) for a, p in zip(user_target, recs)])


def map_pandas(df, users, recs, top_k):
    df_rec = pd.DataFrame(
        {
            Columns.User: np.repeat(users, top_k),
            Columns.Item: recs.ravel(),
        }
    )

    df_rec[Columns.Rank] = df_rec.groupby(Columns.User).cumcount() + 1
    df_rec = df.merge(
        df_rec,
        how="left",
        left_on=Columns.UserItem,
        right_on=Columns.UserItem,
    )
    df_rec = df_rec.sort_values(by=[Columns.User, Columns.Rank])

    df_rec["TP@k"] = df_rec[Columns.Rank] < (top_k + 1)
    df_rec["actual"] = df_rec.groupby(Columns.User)[Columns.Item].transform("count")

    df_rec["cumTP@k"] = df_rec.groupby(Columns.User)["TP@k"].cumsum()

    df_rec["Prec@k"] = df_rec["cumTP@k"] / df_rec[Columns.Rank]
    df_rec["Prec@k/actual"] = df_rec["Prec@k"] / df_rec["actual"]

    map = df_rec.groupby(Columns.User)["Prec@k/actual"].sum().mean()

    return map

In [59]:
%timeit recall_naive(df, users, recs, top_k)

1.47 s ± 23 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [60]:
%timeit recall_pandas(df, users, recs, top_k)

25.3 ms ± 1.52 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [61]:
%timeit map_naive(df, users, recs, top_k)

1.11 s ± 13.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [62]:
%timeit map_pandas(df, users, recs, top_k)

29.1 ms ± 2.45 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


### Критерии успеха

- написана реализация хотя бы одной метрики (реализация "в лоб") - 1 балл
- написаны функции для двух метрик в наивном виде - +2 балла
- если одна метрика реализована двумя способами (наивный и с помощью pandas) +2 балла
- обе метрики сделаны двумя способами +1 балла

Bonus:
- если самостоятельно изучите одну дополнительную метрику (HR/MNAP/NDCG/etc) и добавите реализацию двумя способами, то еще +2 балла

In [174]:
# AP@K metric (sub function for map_naive)
def apk(actual, predicted, k):

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mnap_naive(df, users, recs, top_k):
    target = df.values
    user_target = []
    for user in users:
        user_target.append(list(target[target[:, 0] == user][:, 1]))
    return np.mean([apk(a, p, top_k) / np.min([len(a), top_k]) for a, p in zip(user_target, recs)])


def mnap_pandas(df, users, recs, top_k):
    df_rec = pd.DataFrame(
        {
            Columns.User: np.repeat(users, top_k),
            Columns.Item: recs.ravel(),
        }
    )

    df_rec[Columns.Rank] = df_rec.groupby(Columns.User).cumcount() + 1
    df_rec = df.merge(
        df_rec,
        how="left",
        left_on=Columns.UserItem,
        right_on=Columns.UserItem,
    )
    df_rec = df_rec.sort_values(by=[Columns.User, Columns.Rank])
    
    df_rec["TP@k"] = df_rec[Columns.Rank] < (top_k + 1)
    df_rec["actual"] = df_rec.groupby(Columns.User)[Columns.Item].transform("count")

    df_rec["cumTP@k"] = df_rec.groupby(Columns.User)["TP@k"].cumsum()

    df_rec["Prec@k"] = df_rec["cumTP@k"] / df_rec[Columns.Rank]
    df_rec["Prec@k/actual"] = df_rec["Prec@k"] / df_rec["actual"]

    map = df_rec.groupby(Columns.User, as_index=False)["Prec@k/actual"].sum()


    print(map)
    def sub_func(x):

        # print(x[Columns.User])
        
        # res = x["user_id"]
        res = 4
        # np.min([
        #     top_k,
        #     df_rec[df_rec["user_id"] == x["user_id"]][Columns.UserItem].unique(),
        # ])
        print(x)
        x /= res 

        
        
        return x

    # print(map)

    # print(df_rec)
    map.apply(lambda x: sub_func(x))

    # map["mew"] = df_rec[df_rec["user_id"] == map["user_id"]]["item_id"].unique().shape[0]

    return map.mean()

In [166]:
mnap_naive(df, users, recs, top_k)

0.016290435718003766

In [153]:
df_rec[df_rec["user_id"] == 57607]["item_id"].unique().shape[0]

3

In [175]:
mnap_pandas(df, users, recs, top_k)

      user_id  Prec@k/actual
0          24       0.000000
1         263       0.000000
2         477       0.000000
3         509       0.007692
4         522       0.000000
...       ...            ...
9995  1097062       0.000000
9996  1097151       0.047619
9997  1097227       0.000000
9998  1097343       0.000000
9999  1097398       0.000000

[10000 rows x 2 columns]
0            24
1           263
2           477
3           509
4           522
         ...   
9995    1097062
9996    1097151
9997    1097227
9998    1097343
9999    1097398
Name: user_id, Length: 10000, dtype: int64
0       0.000000
1       0.000000
2       0.000000
3       0.007692
4       0.000000
          ...   
9995    0.000000
9996    0.047619
9997    0.000000
9998    0.000000
9999    0.000000
Name: Prec@k/actual, Length: 10000, dtype: float64


user_id          547018.642700
Prec@k/actual         0.006555
dtype: float64

(10000,)