In [23]:
import os
import math
import random
import json
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from preprocess import make_multiple_hard_labels
from metrics import mapk

In [2]:
# Dataset root: <parent of the current working directory>/hahow/data.
DATA_ROOT = os.path.join(Path.cwd().parent.absolute(), 'hahow', 'data')

# Each table's path is resolved immediately before the table is loaded.

# Users.
user_csv_path = os.path.join(DATA_ROOT, 'users.csv')
user_df = pd.read_csv(user_csv_path)

# Course chapter items.
course_chapter_item_csv_path = os.path.join(DATA_ROOT, 'course_chapter_items.csv')
course_chapter_item_df = pd.read_csv(course_chapter_item_csv_path)

# Course catalogue.
courses_csv_path = os.path.join(DATA_ROOT, 'courses.csv')
courses_df = pd.read_csv(courses_csv_path)

# Subgroups.
subgroup_csv_path = os.path.join(DATA_ROOT, 'subgroups.csv')
subgroup_df = pd.read_csv(subgroup_csv_path)

# Training files (under the 'train' sub-directory).
train_group_csv_path = os.path.join(DATA_ROOT, 'train', 'train_group.csv')
train_group_df = pd.read_csv(train_group_csv_path)

train_csv_path = os.path.join(DATA_ROOT, 'train', 'train.csv')
train_df = pd.read_csv(train_csv_path)

In [3]:
# Validation splits ('val' sub-directory): seen / unseen course files.
val_seen_course_path = os.path.join(DATA_ROOT, 'val', 'val_seen.csv')
val_seen_course_df = pd.read_csv(val_seen_course_path)

val_unseen_course_path = os.path.join(DATA_ROOT, 'val', 'val_unseen.csv')
val_unseen_course_df = pd.read_csv(val_unseen_course_path)

# Test splits ('test' sub-directory): seen / unseen course files.
test_seen_course_path = os.path.join(DATA_ROOT, 'test', 'test_seen.csv')
test_seen_course_df = pd.read_csv(test_seen_course_path)

test_unseen_course_path = os.path.join(DATA_ROOT, 'test', 'test_unseen.csv')
test_unseen_course_df = pd.read_csv(test_unseen_course_path)

In [21]:
# Ground truth for both validation splits: the 'course_id' column as a plain
# Python list, one entry per row.
# NOTE(review): if a 'course_id' cell holds several ids in one space-separated
# string, each entry here is a str rather than a list of ids -- confirm this is
# the format metrics.mapk expects.
gt_val_seen, gt_val_unseen = (
    split_df['course_id'].tolist()
    for split_df in (val_seen_course_df, val_unseen_course_df)
)

In [4]:
# Output directory, relative to the notebook's working directory. It is not
# referenced by any cell visible here -- presumably where leaderboard
# submission files are written; TODO confirm against the later cells.
OUT_ROOT = './leaderboard_playing'

# Predict only unbought courses

In [9]:
# make_multiple_hard_labels comes from the project's preprocess module (not
# shown here). Only its second return value is kept -- presumably the distinct
# course ids found in train_df['course_id']; TODO confirm against preprocess.py.
# NOTE: binding `_` here also shadows IPython's "last output" variable.
_, course_ids = make_multiple_hard_labels(train_df, 'course_id')
# Re-wrap as a pandas Series; the name `course_ids` is rebound to the new object.
course_ids = pd.Series(course_ids)

In [8]:
# Every course id listed in the course catalogue.
all_course = courses_df['course_id'].tolist()

# Catalogue ids that are absent from `course_ids`.
unbought_courses = set(all_course).difference(course_ids)

print(f"There are {len(unbought_courses)} unbought courses")
print(unbought_courses)

There are 64 unbought courses
{'56e0e9fb4e3ef90900b7cff1', '60dd6d80638ed00aa99b5556', '61888f9bb259500007e2cb20', '5ff6aa28c5cbbcb694532eaa', '6168db780c0dd90006161f32', '60efd3a347b7d70006894784', '55ae66017b4d991000119959', '61a5d89b3b954c0007022c77', '61381a3f33a3960006df1eb3', '610814069871100007ab7b4f', '61b6c2bea7dfb10006498876', '6184ace6b2319400078a8dad', '607ceae6fa76bb60e8a556c8', '60c84de9eb75ca46e0c25e85', '60c1d33a8dd31844b56bfd54', '6083734d17b1e70d08fd3ca6', '6130753a26d20b0006d48dfe', '60e7f0598036260006b71683', '6107cd6c3cc7a0000689c1c6', '60ed88d6b89d2300069ee963', '6184efc3b2319400078aefe7', '61681f49fea517000686d9dc', '61056b63d46a000007a46af7', '568537d0e8ff9b20003cf504', '6136ed9fafdea00006bdd8ee', '6056f8ceab70de413d98e723', '6130495cd5d02a00071f2c3b', '611f7d91bd122100071f2926', '613c4d77323c7f000694dd08', '6135374d94b8350007f7fe43', '61666a458fc5c300073e6f60', '6125b83cdf147200070db995', '61b941f2e8990300069e4e9e', '61237ed5df69e30006dbfff7', '617a0a057bb91d00

In [17]:
def generate_random_courses(len_list, len_sample=50, courses=None, seed=None):
    """Draw `len_list` independent random samples of course ids.

    Every sample holds `len_sample` distinct entries drawn without replacement
    from the candidate pool.

    Args:
        len_list: Number of samples to generate (one per prediction row).
        len_sample: Number of entries in each sample.
        courses: Optional iterable of candidate ids. When None, the
            notebook-level `unbought_courses` is used, as before.
        seed: Optional seed. When given, a private `random.Random(seed)` is
            used so the result is repeatable and the global RNG state is left
            untouched. When None, the module-level `random` generator is used,
            as before.

    Returns:
        A list of `len_list` lists, each of length `len_sample`.

    Raises:
        ValueError: If `len_sample` is larger than the candidate pool
            (raised by `random.sample`).
    """
    candidates = unbought_courses if courses is None else courses
    # Build the pool once instead of once per sample, and sort it: iteration
    # order of a set of strings can change between interpreter sessions, which
    # would make seeded runs irreproducible. `key=str` keeps the sort safe
    # even if the ids are not all of one type.
    pool = sorted(candidates, key=str)
    sampler = random if seed is None else random.Random(seed)
    return [sampler.sample(pool, len_sample) for _ in range(len_list)]

def post_process_label_to_df(df, pred_list):
    """Build a two-column prediction frame from per-row predictions.

    Args:
        df: DataFrame providing the 'user_id' column (its index is kept).
        pred_list: Sequence of iterables, one per row of `df`; the items of
            each iterable are converted with `str` and joined by single spaces.

    Returns:
        DataFrame with columns 'user_id' (taken from `df`) and 'course_id'
        (the space-joined prediction strings).
    """
    joined_rows = []
    for row in pred_list:
        joined_rows.append(' '.join(str(item) for item in row))

    return pd.DataFrame({'user_id': df['user_id'], 'course_id': joined_rows})

In [32]:
# Random baseline on the validation splits: one random sample of candidate
# courses per validation row, scored with the project's `mapk` at k=50
# (50 is also the default sample size of generate_random_courses).
#
# Seed the global RNG first so that re-running this cell draws the same
# sequence of samples instead of printing a different score each time.
# NOTE(review): exact reproducibility across kernel restarts also requires
# generate_random_courses to iterate its candidate pool in a stable order;
# iteration order of a set of strings can differ between interpreter sessions.
random.seed(0)

val_seen_courses = generate_random_courses(val_seen_course_df.shape[0])
val_unseen_courses = generate_random_courses(val_unseen_course_df.shape[0])

mapk_seen = mapk(gt_val_seen, val_seen_courses, k=50)
mapk_unseen = mapk(gt_val_unseen, val_unseen_courses, k=50)
print(f"Valid set mapk, seen: {mapk_seen}, unseen {mapk_unseen}")

Valid set mapk, seen: 0.001536917395655182, unseen 0.0011067415513861029
