In [1]:
import zipfile

with zipfile.ZipFile('./../data/earth_day_missions.csv.zip', 'r') as myzip:
    myzip.extract('earth_day_missions.csv', './../data/')
    myzip.close()

In [2]:
import pandas as pd
import numpy as np

df = pd.read_csv('./../data/earth_day_missions.csv', index_col=0, low_memory=False)
df = df[]
df = df[['start_date', 'user', 'missionId', 'missionType', 'missionSubType', 'period', 'target', 'title', 'performance', 'satisfied']]

df['start_date'] = pd.to_datetime(df['start_date'])
df['user'] = df['user'].astype('category').cat.codes
df['missionId'] = df['missionId'].astype('category').cat.codes
df['missionType'] = df['missionType'].astype('category')
df['missionSubType'] = df['missionSubType'].astype('category')
df['period'] = df['period'].astype('category')

df['performance'] = df['performance'].apply(lambda x: -1 if x == 0 else 1-x if x >= 1 else x)

df = df.sort_values(by=['start_date'], ignore_index=True)

df

Unnamed: 0,start_date,user,missionId,missionType,missionSubType,period,target,title,performance,satisfied
0,2024-03-25,8789,16,LEARN,JOURNEY,CUSTOM,1,complete_journey,-1.0,False
1,2024-03-25,1288,14,MEASURE,TYPEFORM_BASE,CUSTOM,1,calculate_carbon_footprint,-1.0,False
2,2024-03-25,4280,4,LEARN,DAILYEPISODE,WEEKLY,2,watch_the_daily_episode_2,-1.0,False
3,2024-03-25,1288,7,ACT,MOBILITY,WEEKLY,2,log_2_action_mobility,-1.0,False
4,2024-03-25,1288,4,LEARN,DAILYEPISODE,WEEKLY,2,watch_the_daily_episode_2,-1.0,False
...,...,...,...,...,...,...,...,...,...,...
267891,2024-04-20,18489,20,CHECKIN,CHECKIN,DAILY,1,do_checkin,0.0,True
267892,2024-04-20,5065,23,CHECKIN,CHECKIN,DAILY,1,do_checkin,0.0,True
267893,2024-04-20,9216,22,CHECKIN,CHECKIN,DAILY,1,do_checkin,0.0,True
267894,2024-04-20,9216,15,CHECKIN,CHECKIN,DAILY,1,do_checkin,0.0,True


In [3]:
summary = df.groupby(['missionId', 'title', 'missionType', 'missionSubType', 'period', 'target', 'satisfied'], observed=True).size().unstack(fill_value=0)
summary.sort_index(level=[2, 3, 4, 5], inplace=True)

summary

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,satisfied,False,True
missionId,title,missionType,missionSubType,period,target,Unnamed: 6_level_1,Unnamed: 7_level_1
8,log_1_action,ACT,ALL,DAILY,1,0,636
24,log_2_action,ACT,ALL,DAILY,2,113,1087
18,log_3_action,ACT,ALL,DAILY,3,115,517
33,log_2_action2_consecutive,ACT,ALL,WEEKLY,2,23043,190
25,log_1_action_energy,ACT,ENERGY,DAILY,1,0,101
21,log_2_action_energy,ACT,ENERGY,DAILY,2,48,814
26,log_1_action_environment,ACT,ENVIRONMENT,DAILY,1,0,554
11,log_1_action_vegetarian,ACT,FIXED,DAILY,1,0,98
31,log_action_take5minutesshower,ACT,FIXED,DAILY,1,0,101
29,log_2_action_vegetarian,ACT,FIXED,WEEKLY,2,23008,225


In [4]:
dataset = df[['user', 'missionId', 'performance']] \
    .drop_duplicates(subset=['user', 'missionId'], keep='last')

train_dataset = dataset.groupby('user').sample(frac=0.8, random_state=0)
test_dataset = dataset.drop(train_dataset.index)

display(train_dataset.nunique())
display(test_dataset.nunique())

print(train_dataset.shape, test_dataset.shape)

user           23233
missionId         36
performance       34
dtype: int64

user           23233
missionId         36
performance       27
dtype: int64

(215743, 3) (48809, 3)


In [5]:
user_missions_matrix = train_dataset \
    .pivot(index='user', columns='missionId', values='performance').fillna(-1)

user_missions_matrix

missionId,0,1,2,3,4,5,6,7,8,9,...,26,27,28,29,30,31,32,33,34,35
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,...,-1.0,0.333333,0.0,-1.0,0.0,-1.0,-1.0,0.5,-1.0,-1.0
1,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.000000,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.000000,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0
3,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.000000,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0
4,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.000000,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23228,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.000000,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0
23229,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.000000,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
23230,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.000000,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0
23231,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,0.0,-1.0,...,-1.0,-1.000000,-1.0,-1.0,0.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [6]:
import random
from tqdm import tqdm

random.seed(42)

data = []
for i, row in tqdm(user_missions_matrix.iterrows(), total=user_missions_matrix.shape[0]):
    couples = set()

    for pos_idx in row.index:
        positive = row[pos_idx]

        neg_idx = row[row < positive].index
        if len(neg_idx) == 0:
            continue
        
        neg_idx = random.choice(neg_idx)

        couples.add((i, pos_idx, neg_idx))

    data.extend(couples)

data = pd.DataFrame(data, columns=['user', 'positive', 'negative'])

data

100%|██████████| 23233/23233 [00:53<00:00, 431.52it/s]


Unnamed: 0,user,positive,negative
0,0,28,8
1,0,14,0
2,0,18,29
3,0,27,17
4,0,33,8
...,...,...,...
59268,23228,30,22
59269,23230,30,28
59270,23231,30,15
59271,23231,8,18


In [7]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm

torch.manual_seed(42)
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(DEVICE)

class NeuMF(nn.Module):
    def __init__(self, num_factors, num_users, num_items, hiddens, **kwargs) -> None:
        super().__init__(**kwargs)

        self.P = nn.Embedding(num_users, num_factors)
        self.Q = nn.Embedding(num_items, num_factors)

        self.U = nn.Embedding(num_users, num_factors)
        self.V = nn.Embedding(num_items, num_factors)

        self.mlp = nn.Sequential()
        for h in hiddens:
            self.mlp.add_module('linear', nn.LazyLinear(h))
            self.mlp.add_module('relu', nn.ReLU())
        
        self.prediction = nn.Sequential(
            nn.LazyLinear(1, bias=False),
            nn.Sigmoid()
        )

    def forward(self, user_id, item_id):
        p_mf = self.P(user_id)
        q_mf = self.Q(item_id)
        gmf = p_mf * q_mf

        p_mlp = self.U(user_id)
        q_mlp = self.V(item_id)

        mlp = self.mlp(torch.cat([p_mlp, q_mlp], dim=1))
        con_res = torch.cat([gmf, mlp], dim=1)
        return self.prediction(con_res)


class BPRLoss(nn.Module):
    def __init__(self, **kwargs) -> None:
        super().__init__(**kwargs)

    def forward(self, positive, negative):
        return - torch.sum(torch.log(torch.sigmoid(positive - negative)), dim=0, keepdim=True)
    

class PairwiseDataset(Dataset):
    def __init__(self, data, num_users, num_items) -> None:
        super().__init__()

        self.data = data
        self.num_users = num_users
        self.num_items = num_items

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        row = torch.from_numpy(row.to_numpy()).long()
        return row


def train(model, loss, optimizer, data_loader, num_epochs):
    for _ in (t := tqdm(range(num_epochs))):
        model.train()
        total_loss = 0

        for d in (b := tqdm(data_loader, leave=False)):
            d = d.to(DEVICE)
            user = d[:, 0]
            positive = d[:, 1]
            negative = d[:, 2]

            optimizer.zero_grad()
            positive = model(user, positive)
            negative = model(user, negative)
            l = loss(positive, negative)
            l.backward()
            optimizer.step()
            total_loss += l.item()
            b.set_postfix(loss=l.item())

        t.set_postfix(loss=total_loss / len(data_loader))

cuda


In [8]:
num_users = user_missions_matrix.shape[0]
num_items = user_missions_matrix.shape[1]

model = NeuMF(num_factors=10, num_users=num_users, num_items=num_items, hiddens=[16, 16, 16]).to(DEVICE)

loss = BPRLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-5)
data_loader = DataLoader(PairwiseDataset(data, num_users, num_items), batch_size=64, shuffle=True)

train(model, loss, optimizer, data_loader, num_epochs=10)



  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/927 [00:00<?, ?it/s]

  0%|          | 0/927 [00:00<?, ?it/s]

  0%|          | 0/927 [00:00<?, ?it/s]

  0%|          | 0/927 [00:00<?, ?it/s]

  0%|          | 0/927 [00:00<?, ?it/s]

  0%|          | 0/927 [00:00<?, ?it/s]

  0%|          | 0/927 [00:00<?, ?it/s]

  0%|          | 0/927 [00:00<?, ?it/s]

  0%|          | 0/927 [00:00<?, ?it/s]

  0%|          | 0/927 [00:00<?, ?it/s]

In [9]:
class Recommender:
    def __init__(self, model, user_missions_matrix) -> None:
        self.model = model
        self.user_missions_matrix = user_missions_matrix
    
    def recommend(self, user_id, top_k=10):
        user = torch.tensor([user_id] * self.user_missions_matrix.shape[1], device=DEVICE)
        items = torch.arange(self.user_missions_matrix.shape[1], device=DEVICE)
        scores: np.ndarray = self.model(user, items).squeeze().detach().cpu().numpy()
        scores = scores.argsort()[::-1][:top_k]

        return scores
    
recommender = Recommender(model, user_missions_matrix)

In [10]:
def evaluate(recommender: Recommender, test_dataset: pd.DataFrame, top_k=10):
    y_true = []
    y_pred = []

    for user_id, group in test_dataset.groupby('user'):
        y_true.append(group['missionId'].values)
        y_pred.append(recommender.recommend(user_id, top_k))

    y_true = [set(y) for y in y_true]
    y_pred = [set(y) for y in y_pred]
    relevant = [len(y & yp) for y, yp in zip(y_true, y_pred)]
    
    hit_rate = [r > 0 for r in relevant]
    recall = [r / len(y) for r, y in zip(relevant, y_true)]

    return np.mean(hit_rate), np.mean(recall)

In [11]:
hit_rate, recall = evaluate(recommender, test_dataset, top_k=3)

print('Hit Rate:', hit_rate.round(2))
print('Recall:', recall.round(2))

Hit Rate: 0.34
Recall: 0.17


In [12]:
hit_rate, recall = evaluate(recommender, test_dataset, top_k=5)

print('Hit Rate:', hit_rate.round(2))
print('Recall:', recall.round(2))

Hit Rate: 0.57
Recall: 0.32


In [13]:
hit_rate, recall = evaluate(recommender, test_dataset, top_k=10)

print('Hit Rate:', hit_rate.round(2))
print('Recall:', recall.round(2))

Hit Rate: 0.86
Recall: 0.59
