In [1]:
import zipfile

# Unpack the mission export next to its archive. The `with` block closes the
# archive on exit, so no explicit close() call is needed.
with zipfile.ZipFile('./../data/earth_day_missions.csv.zip', 'r') as myzip:
    myzip.extract('earth_day_missions.csv', './../data/')

In [2]:
import pandas as pd
import numpy as np

# Load the mission log, drop ignored rows, keep the modelling columns and
# normalise dtypes. Everything is built in one chain so no column is ever
# assigned onto a filtered slice (which triggers SettingWithCopyWarning).
df = (
    pd.read_csv('./../data/earth_day_missions.csv', index_col=0, low_memory=False)
    .loc[
        lambda frame: ~frame['ignored'],
        ['start_date', 'user', 'missionId', 'missionType', 'missionSubType', 'period', 'target', 'title', 'satisfied'],
    ]
    .assign(
        start_date=lambda frame: pd.to_datetime(frame['start_date']),
        # Replace raw identifiers with dense integer codes (0..n-1).
        user=lambda frame: frame['user'].astype('category').cat.codes,
        missionId=lambda frame: frame['missionId'].astype('category').cat.codes,
    )
    .astype({'missionType': 'category', 'missionSubType': 'category', 'period': 'category'})
    # ignore_index=True gives the sorted frame a fresh 0..n-1 index.
    .sort_values(by=['start_date'], ignore_index=True)
)

df

Unnamed: 0,start_date,user,missionId,missionType,missionSubType,period,target,title,satisfied
0,2024-03-25,21190,30,TICKET,TICKET,CUSTOM,1,ticket,True
1,2024-03-25,18882,30,TICKET,TICKET,CUSTOM,1,ticket,True
2,2024-03-25,17304,14,MEASURE,TYPEFORM_BASE,CUSTOM,1,calculate_carbon_footprint,True
3,2024-03-25,17304,16,LEARN,JOURNEY,CUSTOM,1,complete_journey,True
4,2024-03-25,17304,30,TICKET,TICKET,CUSTOM,1,ticket,True
...,...,...,...,...,...,...,...,...,...
45115,2024-04-20,8951,28,LEARN,DAILYEPISODE,DAILY,1,watch_the_daily_episode,True
45116,2024-04-20,8951,32,CHECKIN,CHECKIN,DAILY,1,do_checkin,True
45117,2024-04-20,22776,3,CHECKIN,CHECKIN,DAILY,1,do_checkin,True
45118,2024-04-20,19837,28,LEARN,DAILYEPISODE,DAILY,1,watch_the_daily_episode,True


In [3]:
# Count rows per mission definition, with one column per `satisfied` value.
mission_keys = ['missionId', 'title', 'missionType', 'missionSubType', 'period', 'target', 'satisfied']
mission_counts = df.groupby(mission_keys, observed=True).size()

# Order by index levels 2-5 (missionType, missionSubType, period, target).
summary = mission_counts.unstack(fill_value=0).sort_index(level=[2, 3, 4, 5])

summary

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,satisfied,False,True
missionId,title,missionType,missionSubType,period,target,Unnamed: 6_level_1,Unnamed: 7_level_1
8,log_1_action,ACT,ALL,DAILY,1,0,636
24,log_2_action,ACT,ALL,DAILY,2,113,1087
18,log_3_action,ACT,ALL,DAILY,3,115,517
33,log_2_action2_consecutive,ACT,ALL,WEEKLY,2,1425,190
25,log_1_action_energy,ACT,ENERGY,DAILY,1,0,101
21,log_2_action_energy,ACT,ENERGY,DAILY,2,48,814
26,log_1_action_environment,ACT,ENVIRONMENT,DAILY,1,0,554
11,log_1_action_vegetarian,ACT,FIXED,DAILY,1,0,98
31,log_action_take5minutesshower,ACT,FIXED,DAILY,1,0,101
29,log_2_action_vegetarian,ACT,FIXED,WEEKLY,2,134,225


In [4]:
# Interaction pairs only: which user was assigned which mission.
dataset = df.loc[:, ['user', 'missionId']].copy()

# Per-user split: 75% of each user's rows are sampled into the training set,
# every row that was not sampled becomes test data.
train_dataset = dataset.groupby('user').sample(frac=0.75, random_state=0)
held_out_mask = ~dataset.index.isin(train_dataset.index)
test_dataset = dataset.loc[held_out_mask]

# Distinct users / missions on each side of the split.
display(train_dataset.nunique())
display(test_dataset.nunique())

(train_dataset.shape, test_dataset.shape)

user         23233
missionId       36
dtype: int64

user         3298
missionId      36
dtype: int64

((38704, 2), (6416, 2))

In [None]:
# Binary user x mission matrix built from the training split:
# 1 when the user has at least one training row for the mission, 0 otherwise.
user_missions_matrix = (
    train_dataset
    .assign(tried=1)
    # pivot() raises ValueError when an (index, column) pair occurs more than
    # once, so repeated (user, missionId) rows are collapsed first.
    .drop_duplicates(subset=['user', 'missionId'])
    .pivot(index='user', columns='missionId', values='tried')
    .fillna(0)
    .astype('int8')
)

user_missions_matrix

In [None]:
import random
from tqdm.auto import tqdm

random.seed(42)

# Build (user, positive, negative) triplets for pairwise training: for every
# mission a user tried (matrix value 1), draw one random mission the same user
# did not try (matrix value 0).
triplets = []
for user_id, row in tqdm(user_missions_matrix.iterrows(), total=user_missions_matrix.shape[0]):
    # The candidate negatives are the same for every positive of this user, so
    # compute them once per row instead of once per column.
    untried = row.index[(row == 0).to_numpy()]
    if len(untried) == 0:
        # The user tried every mission: no negative can be drawn.
        continue

    # Per-user set retained so rows are emitted in the same order as before.
    couples = set()
    for pos_idx in row.index[(row > 0).to_numpy()]:
        couples.add((user_id, pos_idx, random.choice(untried)))

    triplets.extend(couples)

data = pd.DataFrame(triplets, columns=['user', 'positive', 'negative'])

data

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm

# Fix the torch RNG seed for this notebook.
torch.manual_seed(42)

# Use CUDA when torch reports it as available, otherwise fall back to the CPU.
DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(DEVICE)

class NeuMF(nn.Module):
    """Two-branch user/item scorer.

    One branch multiplies a user embedding and an item embedding element-wise;
    the other feeds a concatenation of a second pair of embeddings through an
    MLP. Both outputs are concatenated and projected to one score per pair.

    Args:
        num_factors: Size of every embedding vector.
        num_users: Number of rows in the user embedding tables.
        num_items: Number of rows in the item embedding tables.
        hiddens: Output width of each MLP layer, in order. One Linear + ReLU
            pair is created per entry.
        **kwargs: Forwarded to ``nn.Module.__init__``.
    """

    def __init__(self, num_factors, num_users, num_items, hiddens, **kwargs) -> None:
        super().__init__(**kwargs)

        # Embeddings for the element-wise product branch.
        self.P = nn.Embedding(num_users, num_factors)
        self.Q = nn.Embedding(num_items, num_factors)

        # Separate embeddings for the MLP branch.
        self.U = nn.Embedding(num_users, num_factors)
        self.V = nn.Embedding(num_items, num_factors)

        self.mlp = nn.Sequential()
        for depth, width in enumerate(hiddens):
            # add_module() replaces an existing child with the same name, so
            # every layer needs a unique key or only the last one survives.
            self.mlp.add_module(f'linear_{depth}', nn.LazyLinear(width))
            self.mlp.add_module(f'relu_{depth}', nn.ReLU())

        # Input width is inferred on the first forward pass.
        self.prediction = nn.LazyLinear(1, bias=False)

    def forward(self, user_id, item_id):
        """Score each (user, item) pair.

        Args:
            user_id: 1-D integer tensor of user indices.
            item_id: 1-D integer tensor of item indices, same length as ``user_id``.

        Returns:
            1-D tensor with one score per pair.
        """
        p_mf = self.P(user_id)
        q_mf = self.Q(item_id)
        gmf = p_mf * q_mf

        p_mlp = self.U(user_id)
        q_mlp = self.V(item_id)
        mlp = self.mlp(torch.cat([p_mlp, q_mlp], dim=1))

        con_res = torch.cat([gmf, mlp], dim=1)
        con_res = self.prediction(con_res)

        return con_res.view(-1)


class BPRLoss(nn.Module):
    """Bayesian Personalised Ranking loss: ``-sum(log(sigmoid(pos - neg)))``."""

    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)

    def forward(self, positive, negative):
        """Return the summed pairwise loss as a tensor of shape ``(1,)``.

        Args:
            positive: Scores of the preferred items.
            negative: Scores of the non-preferred items, same shape as ``positive``.
        """
        # logsigmoid evaluates log(sigmoid(x)) in a numerically stable way;
        # the explicit log(sigmoid(x)) underflows to -inf for very negative x.
        margin = positive - negative
        return -torch.sum(nn.functional.logsigmoid(margin), dim=0, keepdim=True)
    

class PairwiseDataset(Dataset):
    """Torch dataset that serves each row of a DataFrame as a ``long`` tensor.

    Args:
        data: Frame whose rows are returned item by item. Its values are
            converted to int64 once at construction time, so later changes to
            the frame are not reflected in the served tensors.
    """

    def __init__(self, data: pd.DataFrame) -> None:
        super().__init__()

        self.data = data
        # Convert once up front instead of building a Series and a tensor on
        # every __getitem__ call.
        self._rows = torch.from_numpy(data.to_numpy(dtype='int64', copy=True))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Return row ``idx`` as a 1-D int64 tensor."""
        return self._rows[idx]


def train(model: nn.Module, loss, optimizer, data_loader, num_epochs):
    """Run pairwise training for ``num_epochs`` passes over ``data_loader``.

    Each batch is moved to ``DEVICE`` and split by column into user indices
    (column 0), positive item indices (column 1) and negative item indices
    (column 2). The model scores both item columns for the same users and
    ``loss`` is applied to the two score vectors.

    Progress bars show the current batch loss and, after each epoch, the
    accumulated loss divided by the number of batches.
    """
    epoch_bar = tqdm(range(num_epochs))
    for _ in epoch_bar:
        model.train()
        running_loss = 0

        batch_bar = tqdm(data_loader, leave=False)
        for batch in batch_bar:
            batch = batch.to(DEVICE)
            users, pos_items, neg_items = batch[:, 0], batch[:, 1], batch[:, 2]

            optimizer.zero_grad()
            pos_scores = model(users, pos_items)
            neg_scores = model(users, neg_items)
            batch_loss = loss(pos_scores, neg_scores)
            batch_loss.backward()
            optimizer.step()

            batch_value = batch_loss.item()
            running_loss += batch_value
            batch_bar.set_postfix(loss=batch_value)

        epoch_bar.set_postfix(loss=running_loss / len(data_loader))

In [9]:
class RandomRanker:
    """Baseline scorer: returns one uniform random score per input pair."""

    def __call__(self, user_id: torch.Tensor, item_id: torch.Tensor):
        # Only the batch length is used; the ids themselves are ignored.
        batch_size = user_id.shape[0]
        return torch.rand(batch_size).reshape(-1)

In [10]:
class Recommender:
    """Rank the missions a user has not tried yet with a scoring model.

    Args:
        model: Callable taking ``(user_ids, item_ids)`` long tensors and
            returning one score per pair.
        user_missions_matrix: Frame indexed by user with one column per
            mission; a value of 0 marks a mission the user has not tried.
    """

    def __init__(self, model, user_missions_matrix: pd.DataFrame) -> None:
        self.model = model
        self.user_missions_matrix = user_missions_matrix

    def _device(self):
        """Device of the model's first parameter; CPU for parameter-free scorers."""
        parameters = getattr(self.model, 'parameters', None)
        first = next(iter(parameters()), None) if callable(parameters) else None
        return first.device if first is not None else torch.device('cpu')

    def recommend(self, user_id, top_k=10):
        """Return up to ``top_k`` untried mission ids, best score first.

        Raises:
            KeyError: If ``user_id`` is not in the matrix index.
        """
        # Use the matrix given to this instance, not a same-named global.
        row = self.user_missions_matrix.loc[user_id]
        items = row.index[(row == 0).to_numpy()]
        if len(items) == 0:
            # Nothing left to recommend; also avoids passing empty,
            # wrongly-typed index tensors to the model.
            return items.to_numpy()

        device = self._device()
        user = torch.full((len(items),), int(user_id), dtype=torch.long, device=device)
        candidates = torch.as_tensor(items.to_numpy(), dtype=torch.long, device=device)

        # Inference only: no gradient tracking needed.
        with torch.no_grad():
            scores = self.model(user, candidates).detach().cpu().numpy()

        scores = pd.Series(scores, index=items)
        return scores.nlargest(top_k).index.to_numpy()

In [11]:
def evaluate(recommender: Recommender, test_dataset: pd.DataFrame, top_k=10):
    """Compute mean hit rate and mean recall of top-k recommendations.

    For every user in ``test_dataset`` the recommended mission ids are
    compared with the set of mission ids that user has in the test rows.

    Args:
        recommender: Object exposing ``recommend(user_id, top_k)``.
        test_dataset: Frame with ``user`` and ``missionId`` columns.
        top_k: Number of recommendations requested per user.

    Returns:
        ``(hit_rate, recall)``: the fraction of users with at least one
        recommended mission in their test set, and the per-user average of
        matched missions divided by that user's number of test missions.
    """
    hits = []
    recalls = []

    for user_id, group in test_dataset.groupby('user'):
        truth = set(group['missionId'].values)
        predicted = set(recommender.recommend(user_id, top_k))
        matched = len(truth & predicted)

        hits.append(matched > 0)
        recalls.append(matched / len(truth))

    return np.mean(hits), np.mean(recalls)

In [None]:
# The embedding tables are indexed by the integer codes, so they need
# (largest code + 1) rows. Convert to a Python int before adding 1: the codes
# come from `cat.codes`, which uses a compact integer dtype that could overflow.
num_users = int(dataset['user'].max()) + 1
num_items = int(dataset['missionId'].max()) + 1

model = NeuMF(num_factors=16, num_users=num_users, num_items=num_items, hiddens=[16, 16, 16]).to(DEVICE)

# Dry run so the lazy layers materialise their weights before the optimizer
# is created, as the PyTorch lazy-module documentation recommends.
with torch.no_grad():
    warmup_ids = torch.zeros(1, dtype=torch.long, device=DEVICE)
    model(warmup_ids, warmup_ids)

loss = BPRLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5)
data_loader = DataLoader(PairwiseDataset(data), batch_size=64, shuffle=True)

train(model, loss, optimizer, data_loader, num_epochs=10)

recommender = Recommender(model, user_missions_matrix)

# Hit rate and recall of the trained model at several cut-offs.
pd.DataFrame([[k, *evaluate(recommender, test_dataset, k)] for k in [3, 5, 10]], columns=['k', 'hit_rate', 'recall']).set_index('k').round(4)

In [None]:
# Baseline: same evaluation, but candidates are ranked by random scores.
random_recommender = Recommender(RandomRanker(), user_missions_matrix)

baseline_rows = []
for k in [3, 5, 10]:
    hit_rate, recall = evaluate(random_recommender, test_dataset, k)
    baseline_rows.append([k, hit_rate, recall])

pd.DataFrame(baseline_rows, columns=['k', 'hit_rate', 'recall']).set_index('k').round(4)