In [40]:
# Experiment identifier; reused in this notebook to build Config.OUTPUT_DIR and the
# file name of the CSV written under ../sub.
NOTEBOOK_NAME = "e028_use_markov"

In [47]:
import os


class Config:
    """Notebook-wide settings, grouped as class attributes."""

    # Directory for artifacts saved by this notebook; derived from NOTEBOOK_NAME.
    OUTPUT_DIR = f"../saved_data/{NOTEBOOK_NAME}"
    # NOTE(review): SEED and TARGET_COL are not referenced by any code visible in this
    # notebook -- confirm they are still needed.
    SEED = 33
    TARGET_COL = "reserve"


# Create the output directory up front; exist_ok=True makes re-running this cell safe.
os.makedirs(Config.OUTPUT_DIR, exist_ok=True)

In [34]:
# Combined code for training the Markov Chain model and predicting the top next states
from collections import defaultdict


def train_markov_chain(data):
    """Estimate transition probabilities between consecutive states.

    Args:
        data: Iterable of sequences. Each sequence must support ``len()`` and
            integer indexing; its items are used as dictionary keys.

    Returns:
        A nested ``defaultdict`` mapping ``state -> {next_state: probability}``,
        where the probabilities of each state's successors sum to 1. A state
        that never has a successor (it only ever appears last in a sequence)
        gets no entry.
    """
    transitions = defaultdict(lambda: defaultdict(int))

    # Pass 1: count every adjacent (previous, current) pair.
    for visited in data:
        for pos in range(1, len(visited)):
            transitions[visited[pos - 1]][visited[pos]] += 1

    # Pass 2: turn the raw counts into relative frequencies, in place.
    # Only existing keys are reassigned, so iterating while updating is safe.
    for outgoing in transitions.values():
        denominator = sum(outgoing.values())
        for successor in outgoing:
            outgoing[successor] /= denominator

    return transitions


def predict_top_next_states(model, current_state, num_predictions=10):
    """Return the most probable successors of ``current_state``.

    Args:
        model: Mapping ``state -> {next_state: probability}``, as returned by
            ``train_markov_chain``.
        current_state: State whose successors are requested.
        num_predictions: Length of the returned list.

    Returns:
        A list of successor states ordered by descending probability (ties keep
        the mapping's insertion order). If fewer than ``num_predictions``
        successors are known -- or the state is not in the model at all -- the
        list is right-padded with ``0``.
    """
    if current_state in model:
        successors = model[current_state]
        # Sorting the keys by their probability is stable, so equal
        # probabilities stay in insertion order.
        ranked = sorted(successors, key=successors.get, reverse=True)
        best = ranked[:num_predictions]
    else:
        best = []

    # A non-positive repeat count yields an empty list, i.e. no padding.
    return best + [0] * (num_predictions - len(best))

In [35]:
import pandas as pd

# Training tables.
train_log, train_label = (
    pd.read_csv("../data/train_log.csv"),
    pd.read_csv("../data/train_label.csv"),
)

# Test tables.
test_log, test_session = (
    pd.read_csv("../data/test_log.csv"),
    pd.read_csv("../data/test_session.csv"),
)

# NOTE(review): sample_submission is loaded but not used by the code visible in this
# notebook -- presumably kept as a reference for the expected output format.
sample_submission = pd.read_csv("../data/sample_submission.csv")

In [16]:
# Stack the training log and the label rows, then order by session and step.
# Rows whose seq_no is NaN are placed last within their session (presumably the
# train_label rows, which would then end each session's sequence).
all_train_log = pd.concat([train_log, train_label])
all_train_log = all_train_log.sort_values(
    by=["session_id", "seq_no"], na_position="last"
)

In [23]:
# One yad_no sequence per training session. all_train_log is already sorted by
# (session_id, seq_no), and groupby keeps the row order inside each group.
train_yad_no_list = all_train_log.groupby("session_id")["yad_no"].apply(list).tolist()

# Build the test sequences keyed by session_id, then order them explicitly by
# test_session so that entry i always belongs to row i of test_session. The
# predictions are later written without a session_id column, so relying on
# groupby's sorted key order would silently misalign them whenever test_session
# is ordered differently. .loc raises KeyError if a listed session has no log rows.
test_yad_no_by_session = test_log.groupby("session_id")["yad_no"].apply(list)
test_yad_no_list = test_yad_no_by_session.loc[
    test_session["session_id"].tolist()
].tolist()

In [29]:
# Display test_session to inspect its session_id column.
test_session

Unnamed: 0,session_id
0,00001149e9c73985425197104712478c
1,0000e02747d749a52b7736dfa751e258
2,0000f17ae2628237d78d3a38b009d3be
3,000174a6f7a569b84c5575760d2e9664
4,00017e2a527901c9c41b1acef525d016
...,...
174695,fffee3199ef94b92283239cd5e3534fa
174696,ffff62c6bb49bc9c0fbcf08494a4869c
174697,ffff9a7dcc892875c7a8b821fa436228
174698,ffffb1d30300fe17f661941fd085b04b


In [28]:
# Display the per-session yad_no lists of the test log (indexed by session_id,
# in groupby's sorted key order).
test_log.groupby("session_id")["yad_no"].apply(list)

session_id
00001149e9c73985425197104712478c             [3560, 1959]
0000e02747d749a52b7736dfa751e258                  [11984]
0000f17ae2628237d78d3a38b009d3be              [757, 8922]
000174a6f7a569b84c5575760d2e9664    [13610, 12341, 13610]
00017e2a527901c9c41b1acef525d016                   [4621]
                                            ...          
fffee3199ef94b92283239cd5e3534fa             [1997, 8336]
ffff62c6bb49bc9c0fbcf08494a4869c                  [12062]
ffff9a7dcc892875c7a8b821fa436228                   [8989]
ffffb1d30300fe17f661941fd085b04b                   [6030]
ffffe984aafd6127ce8e43e3ca40c79d                   [8250]
Name: yad_no, Length: 174700, dtype: object

In [42]:
# Display the last five entries of the prediction list.
# NOTE(review): top_predictions_for_each_sequence is assigned in a later cell of this
# notebook, so on a fresh top-to-bottom run this cell raises NameError -- move it
# below the cell that builds the predictions.
top_predictions_for_each_sequence[-5:]

[[1997, 7888, 10997, 2278, 7062, 5744, 9743, 11123, 3440, 1885],
 [12432, 13220, 899, 3802, 1227, 3644, 13702, 2164, 4014, 4962],
 [13241, 13797, 2087, 8143, 7308, 5810, 11796, 13719, 12939, 844],
 [10287, 3100, 2692, 12281, 2373, 13752, 2305, 11496, 4976, 3002],
 [10729, 1891, 12767, 11994, 11723, 634, 867, 3781, 5623, 12620]]

In [24]:
# Fit the transition probabilities on the per-session training sequences
# (built from all_train_log, i.e. the log rows plus the train_label rows).
markov_model = train_markov_chain(train_yad_no_list)

In [26]:
# For every test session, rank the successors of the last yad_no in its sequence.
top_predictions_for_each_sequence = []
for visited in test_yad_no_list:
    top_predictions_for_each_sequence.append(
        predict_top_next_states(markov_model, visited[-1])
    )

# NOTE(review): this displays the entire list (one entry per test session);
# consider showing only a slice such as [:5] to keep the saved output small.
top_predictions_for_each_sequence

[[11561, 4714, 2680, 4420, 5466, 9830, 6766, 6563, 4545, 2811],
 [143, 4066, 6555, 7014, 11237, 7913, 12862, 8108, 6129, 11923],
 [757, 9190, 10485, 7710, 1774, 410, 6730, 9910, 3400, 6721],
 [12341, 3359, 6991, 1542, 5080, 13521, 6489, 10746, 4180, 10861],
 [9020, 3476, 5372, 9623, 3844, 10826, 3854, 6161, 12029, 6565],
 [13292, 3811, 11214, 12785, 6178, 10857, 7202, 109, 5066, 855],
 [11776, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 [2806, 13347, 2824, 11201, 0, 0, 0, 0, 0, 0],
 [10541, 9717, 4522, 3901, 12217, 682, 0, 0, 0, 0],
 [5203, 2322, 1013, 8364, 5357, 11450, 9405, 7727, 2964, 2867],
 [8169, 12305, 5076, 1418, 6959, 12598, 4712, 271, 6425, 2068],
 [12986, 12089, 11037, 6199, 8468, 11112, 6905, 12939, 10155, 2452],
 [9877, 6407, 5907, 9292, 13229, 1510, 5937, 561, 11431, 357],
 [7964, 9153, 289, 6986, 9330, 3843, 358, 2166, 3481, 8460],
 [12645, 1935, 5336, 10827, 8953, 6165, 10710, 8190, 3118, 8985],
 [6693, 2160, 2687, 11888, 9215, 8226, 5116, 2118, 8537, 2201],
 [10515, 12907, 11407, 55

In [39]:
# One column per rank: predict_0 ... predict_9. The 10 matches the default
# num_predictions used when the prediction lists were built.
prediction_columns = [f"predict_{i}" for i in range(10)]
pred_df = pd.DataFrame(
    data=top_predictions_for_each_sequence,
    columns=prediction_columns,
)

In [41]:
# Write the predictions to ../sub, named after this notebook. to_csv does not create
# missing parent directories, so make sure the target directory exists first
# (the same approach the notebook uses for Config.OUTPUT_DIR).
os.makedirs("../sub", exist_ok=True)
pred_df.to_csv(
    f"../sub/{NOTEBOOK_NAME}.csv",
    index=False,
)

In [48]:
import pickle

# Persist both sequence lists under Config.OUTPUT_DIR so they can be reloaded
# without rebuilding them from the raw logs.
for pickle_path, sequences in (
    (f"{Config.OUTPUT_DIR}/train_yad_no_list.pkl", train_yad_no_list),
    (f"{Config.OUTPUT_DIR}/test_yad_no_list.pkl", test_yad_no_list),
):
    with open(pickle_path, "wb") as out_file:
        pickle.dump(sequences, out_file)