In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from datetime import datetime
from pathlib import Path
from collections import Counter

from joblib import Parallel, delayed
from tqdm.notebook import tqdm
import dill
import pandas as pd
import polars as pl
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.model_selection import GroupKFold

import lightgbm as lgb

import pytorch_lightning as L
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import matplotlib.pyplot as plt
import seaborn as sns

from tasks.data.dataset.mappers import EntityEncoder
from tasks.jobs import Splitter

In [3]:
from otto_utils import *
from otto_features import *
from otto_candidates_covisit import *
from otto_lgbm_utils import *
from otto_implicit import *
from otto_reranker import *
from otto_jobs_candidates import *
from otto_jobs_datasets import *

In [4]:
# NOTE(review): PYTHONHASHSEED is read once at interpreter start-up, so setting it from
# inside a running kernel does not change this process's own string hashing; it is only
# inherited by processes started afterwards. Confirm that this is sufficient for the
# word2vec reproducibility it is presumably meant to provide.
%env PYTHONHASHSEED=1
from otto_word2vec import *

# w2vec = Word2Vec.load("__valid__word2vec_window=10_negative=20.w2v")

env: PYTHONHASHSEED=1


# Base features

In [7]:
# Build the user features for the "valid" dataset. The recorded output prints two
# timestamps after reading it (presumably the first and last event time — TODO confirm).
build_user_features(_dataset_type="valid")

Reading dataset...:  valid
2022-08-22 00:00:00.192000
2022-08-28 21:59:58.621000


In [9]:
# Build the user features for the "subm" dataset. The recorded output prints two
# timestamps after reading it (presumably the first and last event time — TODO confirm).
build_user_features(_dataset_type="subm")

Reading dataset...:  subm
2022-08-28 22:00:00.278000
2022-09-04 21:59:51.563000


In [12]:
# Build the item features for the "valid" dataset. The timestamps in the recorded output
# start earlier than those printed by build_user_features for the same dataset type
# (presumably a longer history window is used for items — TODO confirm).
build_item_features(_dataset_type="valid")

Reading dataset...:  valid
2022-07-31 22:00:00.025000
2022-08-28 21:59:58.621000


In [14]:
# Build the item features for the "subm" dataset. The timestamps in the recorded output
# start earlier than those printed by build_user_features for the same dataset type
# (presumably a longer history window is used for items — TODO confirm).
build_item_features(_dataset_type="subm")

Reading dataset...:  subm
2022-08-08 00:00:00.265000
2022-09-04 21:59:51.563000


In [6]:
# Candidate parquet files for the validation split, keyed by a short source name.
# The keys are joined with "+" to form the `version` string that ends up in the
# generated dataset file names, so the "tdidf" spelling is kept as-is on purpose:
# renaming the keys would change which files later cells read and write.
candidates_valid = {
    "tdidf_new_top_100": "__valid__candidates_tfidf_new_k=200_topk=100.parquet",
    "tdidf_old_top_100": "__valid__candidates_tfidf_old_k=200_topk=100.parquet",
    "i2i_new_top_100": "__valid__candidates_i2i_new_k=100_topk=100.parquet",
    "i2i_old_top_100": "__valid__candidates_i2i_old_k=100_topk=100.parquet",
    "covisit_all": "__valid__candidates_covisit_all_topk=200.parquet",
}

# The submission files carry the same names with the leading `__valid__` prefix
# replaced by `__subm__`. Deriving them keeps both mappings in sync (same keys, same
# order) instead of maintaining a second hand-written copy that can drift.
candidates_subm = {
    source: filename.replace("__valid__", "__subm__", 1)
    for source, filename in candidates_valid.items()
}

# Candidate datasets

In [15]:
# Stage-2 training dataset for clicks, built from the covisit and both tf-idf
# candidate sources of the validation split.
# The source list is named once (as the sibling cells do) so that the files that are
# read and the `version` suffix of the output file cannot get out of sync.
cands = ["covisit_all", "tdidf_new_top_100", "tdidf_old_top_100"]
generate_dataset_from_candidates(
    candidate_files=[candidates_valid[name] for name in cands],
    mode="train",
    return_users_with_positives=True,
    action_type="clicks",
    version="+".join(cands),
)

generating mode:  train
only return users with positives (for train mode only):  True
action type:  clicks
Reading target file
Reading candidate files...: 
Current file:  reranker_finale/__valid__candidates_covisit_all_topk=200.parquet
Current file:  reranker_finale/__valid__candidates_tfidf_new_k=200_topk=100.parquet
Current file:  reranker_finale/__valid__candidates_tfidf_old_k=200_topk=100.parquet
Drop duplicate candidates, keeping last...: 
Adding target
sessions in test dataframe:  1303355
sessions with positives in candidates:  831551
sessions with positives in test dataframe:  1265979
hit rate:  0.6568442288537172
shape: (2, 2)
┌────────┬───────────┐
│ target ┆ counts    │
│ ---    ┆ ---       │
│ i32    ┆ u32       │
╞════════╪═══════════╡
│ 0      ┆ 159947727 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌╌┤
│ 1      ┆ 831551    │
└────────┴───────────┘
Saving to file...:  reranker_finale/__stage2__train__clicks__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet


In [7]:
# Stage-2 training dataset for clicks using only the two tf-idf candidate sources
# of the validation split; `version` is the "+"-joined source list.
cands = ["tdidf_new_top_100", "tdidf_old_top_100"]
generate_dataset_from_candidates(
    candidate_files=[candidates_valid[name] for name in cands],
    mode="train",
    return_users_with_positives=True,
    action_type="clicks",
    version="+".join(cands),
)

generating mode:  train
only return users with positives (for train mode only):  True
action type:  clicks
Reading target file
Reading candidate files...: 
Current file:  reranker_finale/__valid__candidates_tfidf_new_k=200_topk=100.parquet
Current file:  reranker_finale/__valid__candidates_tfidf_old_k=200_topk=100.parquet
Drop duplicate candidates, keeping last...: 
Adding target
sessions in test dataframe:  1303355
sessions with positives in candidates:  788846
sessions with positives in test dataframe:  1265979
hit rate:  0.6231114418169653
shape: (2, 2)
┌────────┬──────────┐
│ target ┆ counts   │
│ ---    ┆ ---      │
│ i32    ┆ u32      │
╞════════╪══════════╡
│ 0      ┆ 98844851 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 1      ┆ 788846   │
└────────┴──────────┘
Saving to file...:  reranker_finale/__stage2__train__clicks__candidates_tdidf_new_top_100+tdidf_old_top_100.parquet


In [12]:
# Stage-2 submission dataset for clicks using only the two tf-idf candidate sources
# of the submission split.
# NOTE(review): the function's own log labels `return_users_with_positives` as
# "for train mode only", so it is presumably ignored here — TODO confirm.
cands = ["tdidf_new_top_100", "tdidf_old_top_100"]
generate_dataset_from_candidates(
    candidate_files=[candidates_subm[name] for name in cands],
    mode="subm",
    return_users_with_positives=True,
    action_type="clicks",
    version="+".join(cands),
)

generating mode:  subm
only return users with positives (for train mode only):  True
action type:  clicks
Reading candidate files...: 
Current file:  reranker_finale/__subm__candidates_tfidf_new_k=200_topk=100.parquet
Current file:  reranker_finale/__subm__candidates_tfidf_old_k=200_topk=100.parquet
Drop duplicate candidates, keeping last...: 
Saving to file...:  reranker_finale/__stage2__subm__clicks__candidates_tdidf_new_top_100+tdidf_old_top_100.parquet


In [11]:
# Stage-2 training dataset for carts, built from the covisit and both tf-idf
# candidate sources of the validation split.
# The source list is named once (as the sibling cells do) so that the files that are
# read and the `version` suffix of the output file cannot get out of sync.
cands = ["covisit_all", "tdidf_new_top_100", "tdidf_old_top_100"]
generate_dataset_from_candidates(
    candidate_files=[candidates_valid[name] for name in cands],
    mode="train",
    return_users_with_positives=True,
    action_type="carts",
    version="+".join(cands),
)

generating mode:  train
only return users with positives (for train mode only):  True
action type:  carts
Reading target file
Reading candidate files...: 
Current file:  reranker_finale/__valid__candidates_covisit_all_topk=200.parquet
Current file:  reranker_finale/__valid__candidates_tfidf_new_k=200_topk=100.parquet
Current file:  reranker_finale/__valid__candidates_tfidf_old_k=200_topk=100.parquet
Drop duplicate candidates, keeping last...: 
Adding target
sessions with positives in candidates:  202061
sessions with positives in test dataframe:  1303355
hit rate:  0.15503143809629763
shape: (2, 2)
┌────────┬──────────┐
│ target ┆ counts   │
│ ---    ┆ ---      │
│ i32    ┆ u32      │
╞════════╪══════════╡
│ 0      ┆ 42828456 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 1      ┆ 272271   │
└────────┴──────────┘
Saving to file...:  reranker_finale/__stage2__train__carts__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet


In [None]:
# Stage-2 submission dataset for carts, built from the covisit and both tf-idf
# candidate sources.
# Fix: submission mode must read the `__subm__` candidate files. This cell previously
# looked the files up in `candidates_valid`, which would have built the submission
# dataset from validation-period candidates.
# NOTE(review): the function's own log labels `return_users_with_positives` as
# "for train mode only", so it is presumably ignored here — TODO confirm.
cands = ["covisit_all", "tdidf_new_top_100", "tdidf_old_top_100"]
generate_dataset_from_candidates(
    candidate_files=[candidates_subm[name] for name in cands],
    mode="subm",
    return_users_with_positives=True,
    action_type="carts",
    version="+".join(cands),
)

In [15]:
# Stage-2 training dataset for orders, built from the covisit and both tf-idf
# candidate sources of the validation split.
# The source list is named once (as the sibling cells do) so that the files that are
# read and the `version` suffix of the output file cannot get out of sync.
cands = ["covisit_all", "tdidf_new_top_100", "tdidf_old_top_100"]
generate_dataset_from_candidates(
    candidate_files=[candidates_valid[name] for name in cands],
    mode="train",
    return_users_with_positives=True,
    action_type="orders",
    version="+".join(cands),
)

generating mode:  train
only return users with positives (for train mode only):  True
action type:  orders
Reading target file
Reading candidate files...: 
Current file:  reranker_finale/__valid__candidates_covisit_all_topk=200.parquet
Current file:  reranker_finale/__valid__candidates_tfidf_new_k=200_topk=100.parquet
Current file:  reranker_finale/__valid__candidates_tfidf_old_k=200_topk=100.parquet
Drop duplicate candidates, keeping last...: 
Adding target
sessions in test dataframe:  1303355
sessions with positives in candidates:  120359
sessions with positives in test dataframe:  134666
hit rate:  0.09234552366776512
shape: (2, 2)
┌────────┬──────────┐
│ target ┆ counts   │
│ ---    ┆ ---      │
│ i32    ┆ u32      │
╞════════╪══════════╡
│ 0      ┆ 27191735 │
├╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌╌┤
│ 1      ┆ 207981   │
└────────┴──────────┘
Saving to file...:  reranker_finale/__stage2__train__orders__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet


In [58]:
# Stage-2 submission dataset for orders, built from the covisit and both tf-idf
# candidate sources of the submission split.
# The source list is named once (as the sibling cells do) so that the files that are
# read and the `version` suffix of the output file cannot get out of sync.
cands = ["covisit_all", "tdidf_new_top_100", "tdidf_old_top_100"]
generate_dataset_from_candidates(
    candidate_files=[candidates_subm[name] for name in cands],
    mode="subm",
    return_users_with_positives=False,
    action_type="orders",
    version="+".join(cands),
)

generating mode:  subm
only return users with positives (for train mode only):  False
action type:  orders
Reading candidate files...: 
Current file:  reranker_finale/__subm__candidates_covisit_all_topk=200.parquet
Current file:  reranker_finale/__subm__candidates_tfidf_new_k=200_topk=100.parquet
Current file:  reranker_finale/__subm__candidates_tfidf_old_k=200_topk=100.parquet
Drop duplicate candidates, keeping last...: 
Saving to file...:  reranker_finale/__stage2__subm__orders__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet


# Dataset feature groups

## orders

In [5]:
# Candidate/user/item feature group for the orders training dataset. Per the recorded
# output it reads the stage-2 candidate file for this version and steps through five
# candidate files before saving `..._features_cand_user_item.parquet`.
generate_dataset_feature_group_cand_user_item(
    mode="train",
    action_type="orders",
    version="+".join(("covisit_all", "tdidf_new_top_100", "tdidf_old_top_100")),
)

mode:  train
action type:  orders
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__train__orders__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet


  0%|          | 0/5 [00:00<?, ?it/s]

Processing feature file...:  __valid__candidates_covisit_all_topk=200.parquet


 20%|██        | 1/5 [00:05<00:23,  5.84s/it]

Processing feature file...:  __valid__candidates_tfidf_new_k=200_topk=100.parquet


 40%|████      | 2/5 [00:12<00:18,  6.21s/it]

Processing feature file...:  __valid__candidates_tfidf_old_k=200_topk=100.parquet


 60%|██████    | 3/5 [00:18<00:12,  6.38s/it]

Processing feature file...:  __valid__candidates_i2i_new_k=100_topk=100.parquet


 80%|████████  | 4/5 [00:25<00:06,  6.44s/it]

Processing feature file...:  __valid__candidates_i2i_old_k=100_topk=100.parquet


100%|██████████| 5/5 [00:31<00:00,  6.40s/it]


Saving to file...:  reranker_finale/__dataset__features__train__orders__covisit_all+tdidf_new_top_100+tdidf_old_top_100_features_cand_user_item.parquet


In [59]:
# Candidate/user/item feature group for the orders submission dataset. Per the recorded
# output it reads the stage-2 candidate file for this version and steps through five
# `__subm__` candidate files before saving `..._features_cand_user_item.parquet`.
generate_dataset_feature_group_cand_user_item(
    mode="subm",
    action_type="orders",
    version="+".join(("covisit_all", "tdidf_new_top_100", "tdidf_old_top_100")),
)

mode:  subm
action type:  orders
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__orders__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet


  0%|          | 0/5 [00:00<?, ?it/s]

Processing feature file...:  __subm__candidates_covisit_all_topk=200.parquet


 20%|██        | 1/5 [00:16<01:06, 16.62s/it]

Processing feature file...:  __subm__candidates_tfidf_new_k=200_topk=100.parquet


 40%|████      | 2/5 [00:31<00:47, 15.75s/it]

Processing feature file...:  __subm__candidates_tfidf_old_k=200_topk=100.parquet


 60%|██████    | 3/5 [00:51<00:35, 17.57s/it]

Processing feature file...:  __subm__candidates_i2i_new_k=100_topk=100.parquet


 80%|████████  | 4/5 [01:09<00:17, 17.75s/it]

Processing feature file...:  __subm__candidates_i2i_old_k=100_topk=100.parquet


100%|██████████| 5/5 [01:31<00:00, 18.22s/it]


Saving to file...:  reranker_finale/__dataset__features__subm__orders__covisit_all+tdidf_new_top_100+tdidf_old_top_100_features_cand_user_item.parquet


In [6]:
# "Other" feature group for the orders training dataset; the recorded output shows it
# reading the stage-2 candidate file and saving `..._features_other.parquet`.
generate_dataset_feature_group_other(
    mode="train",
    action_type="orders",
    version="+".join(("covisit_all", "tdidf_new_top_100", "tdidf_old_top_100")),
)

mode:  train
action type:  orders
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__train__orders__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet
Saving to file...:  reranker_finale/__dataset__features__train__orders__covisit_all+tdidf_new_top_100+tdidf_old_top_100_features_other.parquet


In [60]:
# "Other" feature group for the orders submission dataset; the recorded output shows it
# reading the stage-2 candidate file and saving `..._features_other.parquet`.
generate_dataset_feature_group_other(
    mode="subm",
    action_type="orders",
    version="+".join(("covisit_all", "tdidf_new_top_100", "tdidf_old_top_100")),
)

mode:  subm
action type:  orders
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__orders__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet
Saving to file...:  reranker_finale/__dataset__features__subm__orders__covisit_all+tdidf_new_top_100+tdidf_old_top_100_features_other.parquet


In [4]:
# Covisitation item-to-item feature group for the orders training dataset; the recorded
# output shows a "recent items" pass followed by a "last type items" pass.
generate_dataset_feature_group_covisit_item2item(
    mode="train",
    action_type="orders",
    version="+".join(("covisit_all", "tdidf_new_top_100", "tdidf_old_top_100")),
)

mode:  train
action type:  orders
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__train__orders__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet
Processing recent items features...
Processing last type items features...
Saving to file...:  reranker_finale/__dataset__features__train__orders__covisit_all+tdidf_new_top_100+tdidf_old_top_100_features_covisit_item2item.parquet


In [12]:
# Covisitation item-to-item feature group for the orders submission dataset; the
# recorded output shows a "recent items" pass followed by a "last type items" pass.
generate_dataset_feature_group_covisit_item2item(
    mode="subm",
    action_type="orders",
    version="+".join(("covisit_all", "tdidf_new_top_100", "tdidf_old_top_100")),
)

mode:  subm
action type:  orders
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__orders__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet
Processing recent items features...
Processing last type items features...
Saving to file...:  reranker_finale/__dataset__features__subm__orders__covisit_all+tdidf_new_top_100+tdidf_old_top_100_features_covisit_item2item.parquet


In [10]:
%env PYTHONHASHSEED=1
generate_dataset_feature_group_w2v_item2item(
    mode = "train",
    action_type = "orders",
    version = "+".join(["covisit_all", "tdidf_new_top_100", "tdidf_old_top_100"]),
)

env: PYTHONHASHSEED=1
mode:  train
action type:  orders
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__train__orders__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet
Processing recent items features...


100%|██████████| 5/5 [01:26<00:00, 17.23s/it]


Processing last type items features...
Saving to file...:  reranker_finale/__dataset__features__train__orders__covisit_all+tdidf_new_top_100+tdidf_old_top_100_features_w2v_item2item.parquet


In [5]:
%env PYTHONHASHSEED=1
generate_dataset_feature_group_w2v_item2item(
    mode = "subm",
    action_type = "orders",
    version = "+".join(["covisit_all", "tdidf_new_top_100", "tdidf_old_top_100"]),
)

env: PYTHONHASHSEED=1
mode:  subm
action type:  orders
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__orders__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet
Processing recent items features...


100%|██████████| 5/5 [12:00<00:00, 144.01s/it]


Processing last type items features...
Saving to file...:  reranker_finale/__dataset__features__subm__orders__covisit_all+tdidf_new_top_100+tdidf_old_top_100_features_w2v_item2item.parquet


## carts

In [7]:
# Candidate/user/item feature group for the carts training dataset. Per the recorded
# output it reads the stage-2 candidate file for this version and steps through five
# candidate files before saving `..._features_cand_user_item.parquet`.
generate_dataset_feature_group_cand_user_item(
    mode="train",
    action_type="carts",
    version="+".join(("covisit_all", "tdidf_new_top_100", "tdidf_old_top_100")),
)

mode:  train
action type:  carts
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__train__carts__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet


  0%|          | 0/5 [00:00<?, ?it/s]

Processing feature file...:  __valid__candidates_covisit_all_topk=200.parquet


 20%|██        | 1/5 [00:06<00:24,  6.24s/it]

Processing feature file...:  __valid__candidates_tfidf_new_k=200_topk=100.parquet


 40%|████      | 2/5 [00:13<00:19,  6.66s/it]

Processing feature file...:  __valid__candidates_tfidf_old_k=200_topk=100.parquet


 60%|██████    | 3/5 [00:20<00:13,  6.88s/it]

Processing feature file...:  __valid__candidates_i2i_new_k=100_topk=100.parquet


 80%|████████  | 4/5 [00:27<00:06,  6.99s/it]

Processing feature file...:  __valid__candidates_i2i_old_k=100_topk=100.parquet


100%|██████████| 5/5 [00:34<00:00,  6.93s/it]


Saving to file...:  reranker_finale/__dataset__features__train__carts__covisit_all+tdidf_new_top_100+tdidf_old_top_100_features_cand_user_item.parquet


In [8]:
# "Other" feature group for the carts training dataset; the recorded output shows it
# reading the stage-2 candidate file and saving `..._features_other.parquet`.
generate_dataset_feature_group_other(
    mode="train",
    action_type="carts",
    version="+".join(("covisit_all", "tdidf_new_top_100", "tdidf_old_top_100")),
)

mode:  train
action type:  carts
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__train__carts__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet
Saving to file...:  reranker_finale/__dataset__features__train__carts__covisit_all+tdidf_new_top_100+tdidf_old_top_100_features_other.parquet


In [9]:
# Covisitation item-to-item feature group for the carts training dataset; the recorded
# output shows a "recent items" pass followed by a "last type items" pass.
generate_dataset_feature_group_covisit_item2item(
    mode="train",
    action_type="carts",
    version="+".join(("covisit_all", "tdidf_new_top_100", "tdidf_old_top_100")),
)

mode:  train
action type:  carts
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__train__carts__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet
Processing recent items features...
Processing last type items features...
Saving to file...:  reranker_finale/__dataset__features__train__carts__covisit_all+tdidf_new_top_100+tdidf_old_top_100_features_covisit_item2item.parquet


In [10]:
%env PYTHONHASHSEED=1
generate_dataset_feature_group_w2v_item2item(
    mode = "train",
    action_type = "carts",
    version = "+".join(["covisit_all", "tdidf_new_top_100", "tdidf_old_top_100"]),
)

env: PYTHONHASHSEED=1
mode:  train
action type:  carts
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__train__carts__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet
Processing recent items features...


100%|██████████| 5/5 [02:02<00:00, 24.56s/it]


Processing last type items features...


100%|██████████| 3/3 [00:52<00:00, 17.47s/it]


Saving to file...:  reranker_finale/__dataset__features__train__carts__covisit_all+tdidf_new_top_100+tdidf_old_top_100_features_w2v_item2item.parquet


## clicks

In [16]:
# Candidate/user/item feature group for the clicks training dataset (covisit + tf-idf
# version). Per the recorded output it reads the stage-2 candidate file and steps
# through five candidate files before saving `..._features_cand_user_item.parquet`.
generate_dataset_feature_group_cand_user_item(
    mode="train",
    action_type="clicks",
    version="+".join(("covisit_all", "tdidf_new_top_100", "tdidf_old_top_100")),
)

mode:  train
action type:  clicks
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__train__clicks__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet


  0%|          | 0/5 [00:00<?, ?it/s]

Processing feature file...:  __valid__candidates_covisit_all_topk=200.parquet


 20%|██        | 1/5 [00:09<00:38,  9.66s/it]

Processing feature file...:  __valid__candidates_tfidf_new_k=200_topk=100.parquet


 40%|████      | 2/5 [00:20<00:30, 10.14s/it]

Processing feature file...:  __valid__candidates_tfidf_old_k=200_topk=100.parquet


 60%|██████    | 3/5 [00:30<00:20, 10.32s/it]

Processing feature file...:  __valid__candidates_i2i_new_k=100_topk=100.parquet


 80%|████████  | 4/5 [00:41<00:10, 10.42s/it]

Processing feature file...:  __valid__candidates_i2i_old_k=100_topk=100.parquet


100%|██████████| 5/5 [00:51<00:00, 10.37s/it]


Saving to file...:  reranker_finale/__dataset__features__train__clicks__covisit_all+tdidf_new_top_100+tdidf_old_top_100_features_cand_user_item.parquet


In [17]:
# "Other" feature group for the clicks training dataset (covisit + tf-idf version); the
# recorded output shows it saving `..._features_other.parquet`.
generate_dataset_feature_group_other(
    mode="train",
    action_type="clicks",
    version="+".join(("covisit_all", "tdidf_new_top_100", "tdidf_old_top_100")),
)

mode:  train
action type:  clicks
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__train__clicks__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet
Saving to file...:  reranker_finale/__dataset__features__train__clicks__covisit_all+tdidf_new_top_100+tdidf_old_top_100_features_other.parquet


In [18]:
# Covisitation item-to-item feature group for the clicks training dataset (covisit +
# tf-idf version); the recorded output shows a "recent items" pass followed by a
# "last type items" pass.
generate_dataset_feature_group_covisit_item2item(
    mode="train",
    action_type="clicks",
    version="+".join(("covisit_all", "tdidf_new_top_100", "tdidf_old_top_100")),
)

mode:  train
action type:  clicks
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__train__clicks__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet
Processing recent items features...
Processing last type items features...
Saving to file...:  reranker_finale/__dataset__features__train__clicks__covisit_all+tdidf_new_top_100+tdidf_old_top_100_features_covisit_item2item.parquet


In [19]:
%env PYTHONHASHSEED=1
generate_dataset_feature_group_w2v_item2item(
    mode = "train",
    action_type = "clicks",
    version = "+".join(["covisit_all", "tdidf_new_top_100", "tdidf_old_top_100"]),
)

env: PYTHONHASHSEED=1
mode:  train
action type:  clicks
version:  covisit_all+tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__train__clicks__candidates_covisit_all+tdidf_new_top_100+tdidf_old_top_100.parquet
Processing recent items features...


100%|██████████| 5/5 [06:10<00:00, 74.11s/it] 


Processing last type items features...


100%|██████████| 3/3 [02:35<00:00, 51.86s/it] 


Saving to file...:  reranker_finale/__dataset__features__train__clicks__covisit_all+tdidf_new_top_100+tdidf_old_top_100_features_w2v_item2item.parquet


In [8]:
# Candidate/user/item feature group for the clicks training dataset (tf-idf-only
# version). The recorded output still steps through all five candidate files,
# including the covisit one, before saving `..._features_cand_user_item.parquet`.
generate_dataset_feature_group_cand_user_item(
    mode="train",
    action_type="clicks",
    version="+".join(("tdidf_new_top_100", "tdidf_old_top_100")),
)

mode:  train
action type:  clicks
version:  tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__train__clicks__candidates_tdidf_new_top_100+tdidf_old_top_100.parquet


  0%|          | 0/5 [00:00<?, ?it/s]

Processing feature file...:  __valid__candidates_covisit_all_topk=200.parquet


 20%|██        | 1/5 [00:12<00:50, 12.73s/it]

Processing feature file...:  __valid__candidates_tfidf_new_k=200_topk=100.parquet


 40%|████      | 2/5 [00:29<00:44, 14.89s/it]

Processing feature file...:  __valid__candidates_tfidf_old_k=200_topk=100.parquet


 60%|██████    | 3/5 [00:45<00:30, 15.43s/it]

Processing feature file...:  __valid__candidates_i2i_new_k=100_topk=100.parquet


 80%|████████  | 4/5 [01:01<00:15, 15.88s/it]

Processing feature file...:  __valid__candidates_i2i_old_k=100_topk=100.parquet


100%|██████████| 5/5 [01:17<00:00, 15.57s/it]


Saving to file...:  reranker_finale/__dataset__features__train__clicks__tdidf_new_top_100+tdidf_old_top_100_features_cand_user_item.parquet


In [9]:
# "Other" feature group for the clicks training dataset (tf-idf-only version); the
# recorded output shows it saving `..._features_other.parquet`.
generate_dataset_feature_group_other(
    mode="train",
    action_type="clicks",
    version="+".join(("tdidf_new_top_100", "tdidf_old_top_100")),
)

mode:  train
action type:  clicks
version:  tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__train__clicks__candidates_tdidf_new_top_100+tdidf_old_top_100.parquet
Saving to file...:  reranker_finale/__dataset__features__train__clicks__tdidf_new_top_100+tdidf_old_top_100_features_other.parquet


In [10]:
# Covisitation item-to-item feature group for the clicks training dataset (tf-idf-only
# version); the recorded output shows a "recent items" pass followed by a
# "last type items" pass.
generate_dataset_feature_group_covisit_item2item(
    mode="train",
    action_type="clicks",
    version="+".join(("tdidf_new_top_100", "tdidf_old_top_100")),
)

mode:  train
action type:  clicks
version:  tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__train__clicks__candidates_tdidf_new_top_100+tdidf_old_top_100.parquet
Processing recent items features...
Processing last type items features...
Saving to file...:  reranker_finale/__dataset__features__train__clicks__tdidf_new_top_100+tdidf_old_top_100_features_covisit_item2item.parquet


In [11]:
%env PYTHONHASHSEED=1
generate_dataset_feature_group_w2v_item2item(
    mode = "train",
    action_type = "clicks",
    version = "+".join(["tdidf_new_top_100", "tdidf_old_top_100"]),
)

env: PYTHONHASHSEED=1
mode:  train
action type:  clicks
version:  tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__train__clicks__candidates_tdidf_new_top_100+tdidf_old_top_100.parquet
Processing recent items features...


100%|██████████| 5/5 [03:51<00:00, 46.36s/it]


Processing last type items features...


100%|██████████| 3/3 [01:40<00:00, 33.55s/it]


Saving to file...:  reranker_finale/__dataset__features__train__clicks__tdidf_new_top_100+tdidf_old_top_100_features_w2v_item2item.parquet


In [13]:
# Candidate/user/item feature group for the clicks submission dataset (tf-idf-only
# version). The recorded output steps through all five `__subm__` candidate files,
# including the covisit one, before saving `..._features_cand_user_item.parquet`.
generate_dataset_feature_group_cand_user_item(
    mode="subm",
    action_type="clicks",
    version="+".join(("tdidf_new_top_100", "tdidf_old_top_100")),
)

mode:  subm
action type:  clicks
version:  tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__clicks__candidates_tdidf_new_top_100+tdidf_old_top_100.parquet


  0%|          | 0/5 [00:00<?, ?it/s]

Processing feature file...:  __subm__candidates_covisit_all_topk=200.parquet


 20%|██        | 1/5 [00:16<01:07, 16.77s/it]

Processing feature file...:  __subm__candidates_tfidf_new_k=200_topk=100.parquet


 40%|████      | 2/5 [00:42<01:05, 21.83s/it]

Processing feature file...:  __subm__candidates_tfidf_old_k=200_topk=100.parquet


 60%|██████    | 3/5 [01:03<00:43, 21.77s/it]

Processing feature file...:  __subm__candidates_i2i_new_k=100_topk=100.parquet


 80%|████████  | 4/5 [01:29<00:23, 23.13s/it]

Processing feature file...:  __subm__candidates_i2i_old_k=100_topk=100.parquet


100%|██████████| 5/5 [01:52<00:00, 22.52s/it]


Saving to file...:  reranker_finale/__dataset__features__subm__clicks__tdidf_new_top_100+tdidf_old_top_100_features_cand_user_item.parquet


In [14]:
# "Other" feature group for the clicks submission dataset (tf-idf-only version); the
# recorded output shows it saving `..._features_other.parquet`.
generate_dataset_feature_group_other(
    mode="subm",
    action_type="clicks",
    version="+".join(("tdidf_new_top_100", "tdidf_old_top_100")),
)

mode:  subm
action type:  clicks
version:  tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__clicks__candidates_tdidf_new_top_100+tdidf_old_top_100.parquet
Saving to file...:  reranker_finale/__dataset__features__subm__clicks__tdidf_new_top_100+tdidf_old_top_100_features_other.parquet


In [15]:
# Covisitation item-to-item feature group for the clicks submission dataset
# (tf-idf-only version); the recorded output shows a "recent items" pass followed by a
# "last type items" pass.
generate_dataset_feature_group_covisit_item2item(
    mode="subm",
    action_type="clicks",
    version="+".join(("tdidf_new_top_100", "tdidf_old_top_100")),
)

mode:  subm
action type:  clicks
version:  tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__clicks__candidates_tdidf_new_top_100+tdidf_old_top_100.parquet
Processing recent items features...
Processing last type items features...
Saving to file...:  reranker_finale/__dataset__features__subm__clicks__tdidf_new_top_100+tdidf_old_top_100_features_covisit_item2item.parquet


In [16]:
%env PYTHONHASHSEED=1
generate_dataset_feature_group_w2v_item2item(
    mode = "subm",
    action_type = "clicks",
    version = "+".join(["tdidf_new_top_100", "tdidf_old_top_100"]),
)

env: PYTHONHASHSEED=1
mode:  subm
action type:  clicks
version:  tdidf_new_top_100+tdidf_old_top_100
Reading generated candidate file...:  reranker_finale/__stage2__subm__clicks__candidates_tdidf_new_top_100+tdidf_old_top_100.parquet
Processing recent items features...


100%|██████████| 5/5 [07:51<00:00, 94.26s/it] 


Processing last type items features...


100%|██████████| 3/3 [03:43<00:00, 74.48s/it] 


Saving to file...:  reranker_finale/__dataset__features__subm__clicks__tdidf_new_top_100+tdidf_old_top_100_features_w2v_item2item.parquet
