## Riiidコンペの実際のデータから(ユーザ×問題)の特徴量のcsv生成

In [None]:
%%time
import cudf

types = {
        'row_id': 'int64',
        'timestamp': 'int64',
        'user_id': 'int32',
        'content_id': 'int16',
        'content_type_id': 'int8',
        'task_container_id': 'int16',
        'user_answer': 'int8',
        'answered_correctly': 'int8',
        'prior_question_elapsed_time': 'float32',
        'prior_question_had_explanation': 'int8'
}
datapath = '/kaggle/input/riiid-test-answer-prediction/train.csv'

train_X = cudf.read_csv(datapath, dtype=types)
train_X = train_X[train_X['content_type_id'] == 0]
feat_float = train_X.groupby(['user_id', 'content_id'])['answered_correctly'].mean().astype('float32')
feat_int = train_X.groupby(['user_id', 'content_id'])['answered_correctly'].count().astype('int16')
del train_X

print('number of keys : ', len(feat_int))
feat_float.reset_index().to_csv('user_content_wise_float.csv', index=False)
feat_int.reset_index().to_csv('user_content_wise_int.csv', index=False)

## C++のコードをriiid_module.cppに保存

In [None]:
%%file riiid_module.cpp

#include <cstdint>
#include <fstream>
#include <iostream>
#include <stdexcept>
#include <string>

#include <parallel_hashmap/phmap.h>

using phmap::flat_hash_map;

// Lookup table keyed by (user_id, content_id) that stores one small unsigned
// value (0..255) per pair, optionally pre-populated from a CSV file.
struct riiid_user_x_content_int {
  // Loads rows of the form "user_id,content_id,value" from the CSV at
  // `datapath`. The first line is treated as a header and discarded.
  //
  // If the file cannot be opened, a message is printed and the table is left
  // empty (no exception), matching the previous behaviour.
  // Blank lines are skipped and a trailing '\r' (CRLF files) is tolerated.
  //
  // Throws std::invalid_argument if a data row has fewer than three
  // comma-separated fields or a field is not a number (the latter comes from
  // std::stoi); std::out_of_range if a field does not fit in an int.
  riiid_user_x_content_int(const std::string& datapath) {
    std::ifstream mycsvfile(datapath);
    if (!mycsvfile.is_open()) {
      std::cout << "Error in opening file" << std::endl;
      return;
    }

    std::string line;
    std::getline(mycsvfile, line);  // header row, discarded

    // Looping on getline itself avoids processing a stale line after EOF.
    while (std::getline(mycsvfile, line)) {
      if (!line.empty() && line.back() == '\r') line.pop_back();
      if (line.empty()) continue;

      // size_type (not int) so that npos is detected reliably.
      const std::string::size_type comma_pos = line.find(',');
      const std::string::size_type comma_pos2 =
          comma_pos == std::string::npos ? std::string::npos
                                         : line.find(',', comma_pos + 1);
      if (comma_pos2 == std::string::npos) {
        throw std::invalid_argument(
            "expected 3 comma-separated fields, got: " + line);
      }

      const uint32_t user_id =
          static_cast<uint32_t>(std::stoi(line.substr(0, comma_pos)));
      const uint16_t content_id = static_cast<uint16_t>(
          std::stoi(line.substr(comma_pos + 1, comma_pos2 - comma_pos - 1)));
      const int value = std::stoi(line.substr(comma_pos2 + 1));

      m_dict[user_id][content_id] = saturate(value);
    }
    std::cout << "csv file loaded" << std::endl;
  }

  // Returns the value stored for (user_id, content_id), or 0 if the pair has
  // never been set. Uses find() rather than operator[] so that looking up an
  // unseen pair does not insert a new entry and grow the table.
  int getval(int user_id, int content_id) const {
    const auto user_it = m_dict.find(static_cast<uint32_t>(user_id));
    if (user_it == m_dict.end()) return 0;

    const auto content_it =
        user_it->second.find(static_cast<uint16_t>(content_id));
    if (content_it == user_it->second.end()) return 0;

    return content_it->second;
  }

  // Stores `value` for (user_id, content_id), creating the entry if needed.
  // `value` is saturated to the storable range 0..255 instead of wrapping.
  void setval(int user_id, int content_id, int value) {
    m_dict[static_cast<uint32_t>(user_id)][static_cast<uint16_t>(content_id)] =
        saturate(value);
  }

  // user_id -> (content_id -> value)
  flat_hash_map<uint32_t, flat_hash_map<uint16_t, uint8_t>> m_dict;

 private:
  // Clamps an int into the uint8_t range so oversized counts pin at 255
  // rather than silently wrapping modulo 256.
  static uint8_t saturate(int value) {
    if (value < 0) return 0;
    if (value > 255) return 255;
    return static_cast<uint8_t>(value);
  }
};

#include <pybind11/pybind11.h>
namespace py = pybind11;

// Python bindings: exposes riiid_user_x_content_int to Python as
// riiid_module.user_x_content_int with get/set methods.
PYBIND11_MODULE(riiid_module, m) {
  using table_t = riiid_user_x_content_int;

  py::class_<table_t> binding(m, "user_x_content_int");

  // Constructor: takes one string, forwarded to the C++ constructor.
  binding.def(py::init<const std::string &>());

  // get(user_id, content_id) -- arguments may be positional or keyword.
  binding.def("get", &table_t::getval,
              py::arg("user_id"), py::arg("content_id"));

  // set(user_id, content_id, value) -- arguments may be positional or keyword.
  binding.def("set", &table_t::setval,
              py::arg("user_id"), py::arg("content_id"), py::arg("value"));
}

In [None]:
# Directory containing the parallel-hashmap headers; passed to the compiler with -I.
path_to_parallelmap_folder = "/kaggle/input/parallel-hashmap"
# C++ source file written by the %%file cell.
path_to_cppfile = "./riiid_module.cpp"
# Base name of the built extension; it is the name used in PYBIND11_MODULE and
# in the later `from riiid_module import ...`.
module_name = "riiid_module"

## 共有ライブラリにコンパイル

In [None]:
# Build the C++ source into a shared library in the working directory.
# The pybind11 include flags and the extension suffix come from the two
# backtick command substitutions; the $names are the notebook variables
# defined in the previous cell.
!c++ -O3 -Wall -shared -std=c++11 -fPIC `python3 -m pybind11 --includes` $path_to_cppfile -I$path_to_parallelmap_folder -o $module_name`python3-config --extension-suffix`

### メモリプロファイラのインストール

In [None]:
!pip install memory_profiler
%load_ext memory_profiler

## Pythonから拡張モジュールをimport

In [None]:
%%time
# Imported here rather than at the top of the notebook: the module only exists
# after the compile cell has produced the shared library.
from riiid_module import user_x_content_int

path_to_csv = "./user_content_wise_int.csv"
# Build the table from the CSV under %memit to report the memory used by the load.
%memit cumulative_count = user_x_content_int(path_to_csv)

# Smoke test of the bindings: write one value and read it back both ways.
cumulative_count.set(user_id=42, content_id=4, value=3) # set a value
assert cumulative_count.get(user_id=42, content_id=4) == 3 # keyword arguments
assert cumulative_count.get(42, 4) == 3 # positional arguments