## import libraries

In [1]:
import gc
import os
import sys
import time
import glob
import json
import pickle
from dataclasses import dataclass, asdict
from typing import List, Tuple, Dict, Any, Union, Optional

import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
from sklearn.model_selection import GroupKFold


In [2]:

class Config:
    """Run configuration: step-rate constants plus keys loaded from a settings JSON.

    ``__init__`` sets ``steps_per_sec`` (0.2) and derives from it the number of
    steps in a day, 30 minutes, 15 minutes and 1 minute. ``from_json`` then
    attaches every top-level key of a JSON file as an attribute.
    """

    def __init__(self):
        self.steps_per_sec = 0.2
        self.step_for_a_day = 60 * self.steps_per_sec * 60 * 24
        self.step_for_30min = 60 * self.steps_per_sec * 30
        self.step_for_15min = 60 * self.steps_per_sec * 15
        self.step_for_1min = 60 * self.steps_per_sec

    def from_json(self, json_path):
        """Copy every top-level key of the JSON file onto this instance.

        Each key/value pair is printed as it is applied; a key that matches an
        existing attribute overwrites it.

        Args:
            json_path: path of the JSON settings file.

        Returns:
            ``self``, so the call can be chained after the constructor.
        """
        # Use a context manager so the file handle is closed deterministically.
        with open(json_path) as f:
            json_data = json.load(f)
        for k, v in json_data.items():
            print(k, v)
            setattr(self, k, v)
        return self

# Load the run settings once, at cell/module execution time. The path is
# relative, so it is resolved against the current working directory.
setting_file = "SETTINGS.json"
Cfg = Config().from_json(setting_file)


In [3]:
import time
import psutil
from contextlib import contextmanager

def show_memory_usage(name = "unknown"):
    """Print the memory reported as used by ``psutil.virtual_memory()``.

    Args:
        name: label inserted into the printed line to say where it was taken.
    """
    mem = psutil.virtual_memory()
    used_mb = mem.used / 1024 / 1024
    print(f"[MEMUSE] memory usage (in {name}): {used_mb:.2f}MB ({mem.percent}%)")

@contextmanager
def timer(name: str):
    """Context manager that reports wall time and memory usage around a block.

    Prints memory usage before the block, then the elapsed seconds and memory
    usage after it. The closing report is emitted even when the block raises;
    the exception then propagates unchanged.

    Args:
        name: label used in every printed line.
    """
    show_memory_usage(f"before {name}")
    s = time.time()
    try:
        yield
    finally:
        # Report in ``finally`` so a failing block still shows how long it ran.
        elapsed = time.time() - s
        print(f"[{name}] {elapsed:.3f}sec")
        show_memory_usage(f"after {name}")

def reduce_dtype(df):
    """Downcast every float64 column of ``df`` to float32, in place.

    Columns of any other dtype are left untouched.

    Returns:
        The same DataFrame object that was passed in.
    """
    wide_float_cols = [col for col in df.columns if df[col].dtype.name == "float64"]
    for col in wide_float_cols:
        df[col] = df[col].astype(np.float32)
    return df

def load_dataset(path_feather):
    """Read the feather file at ``path_feather`` and return it as a DataFrame."""
    return pd.read_feather(path_feather)

In [4]:
import numpy as np
import pandas as pd
import pandas.api.types
from typing import Dict, List, Tuple

# Matching tolerances, expressed in steps, per event class.
tolerances = {
    "onset":  [12, 36, 60, 90, 120, 150, 180, 240, 300, 360],
    "wakeup": [12, 36, 60, 90, 120, 150, 180, 240, 300, 360],
}
# Author's notes (translated from Japanese):
# With tolerance 12, a prediction placed on a multiple of 12 only gets a 1-min
# allowance; shifting it slightly makes 2 min possible.
# Tolerances 90 and 150 probably behave badly from (multiple of 12) + 6, so
# avoid landing exactly on 30 sec (avoid multiples of 6).
# NOTE(review): the notes mix steps and seconds - confirm the intended unit.

# Default column names read as module globals by the metric helpers; score()
# overwrites them via globals() on every call.
series_id_column_name = "series_id"
time_column_name = "step"
event_column_name = "event"
score_column_name = "score"
use_scoring_intervals = None

class ParticipantVisibleError(Exception):
    """Raised when the submission is malformed (missing or non-numeric column)."""


def score(
        solution: pd.DataFrame,
        submission: pd.DataFrame,
        tolerances: Dict[str, List[float]],
        series_id_column_name: str,
        time_column_name: str,
        event_column_name: str,
        score_column_name: str,
        use_scoring_intervals: bool = False,
) -> float:
    """Validate the inputs and return the event-detection average precision.

    After validation the column names and ``use_scoring_intervals`` are stored
    as module globals (the helper functions read them from there) and the work
    is delegated to ``event_detection_ap``.

    Args:
        solution: ground-truth events.
        submission: predicted events with a confidence column.
        tolerances: event class -> list of matching tolerances.
        series_id_column_name: name of the series-id column.
        time_column_name: name of the time/step column (must be numeric).
        event_column_name: name of the event-class column.
        score_column_name: name of the confidence column (must be numeric).
        use_scoring_intervals: forwarded to ``event_detection_ap`` through the
            module globals.

    Returns:
        The value returned by ``event_detection_ap``.

    Raises:
        AssertionError: ``tolerances`` is empty, its keys differ from the
            solution's event classes (ignoring 'start'/'end'), or the solution
            time column is not numeric.
        ParticipantVisibleError: the submission lacks a required column or its
            time/score column is not numeric.
    """
    # Validate metric parameters
    assert len(tolerances) > 0, "Events must have defined tolerances."
    assert set(tolerances.keys()) == set(solution[event_column_name]).difference({'start', 'end'}),\
        (f"Solution column {event_column_name} must contain the same events "
         "as defined in tolerances.")
    assert pd.api.types.is_numeric_dtype(solution[time_column_name]),\
        f"Solution column {time_column_name} must be of numeric type."

    # Validate submission format
    for column_name in [
        series_id_column_name,
        time_column_name,
        event_column_name,
        score_column_name,
    ]:
        if column_name not in submission.columns:
            # Report the column that is actually missing.
            raise ParticipantVisibleError(f"Submission must have column '{column_name}'.")

    if not pd.api.types.is_numeric_dtype(submission[time_column_name]):
        raise ParticipantVisibleError(
            f"Submission column '{time_column_name}' must be of numeric type."
        )
    if not pd.api.types.is_numeric_dtype(submission[score_column_name]):
        raise ParticipantVisibleError(
            f"Submission column '{score_column_name}' must be of numeric type."
        )

    # Set these globally to avoid passing around a bunch of arguments
    globals()['series_id_column_name'] = series_id_column_name
    globals()['time_column_name'] = time_column_name
    globals()['event_column_name'] = event_column_name
    globals()['score_column_name'] = score_column_name
    globals()['use_scoring_intervals'] = use_scoring_intervals

    return event_detection_ap(solution, submission, tolerances)

def event_detection_ap(
        solution: pd.DataFrame,
        submission: pd.DataFrame,
        tolerances: Dict[str, List[float]],
) -> float:
    """Compute the mean event-detection average precision.

    Column names and ``use_scoring_intervals`` are read from module globals.
    Detections are matched to ground-truth events separately for every
    (event class, tolerance, series id) group, an AP is computed per
    (event class, tolerance), averaged over tolerances and then averaged over
    event classes.

    Args:
        solution: ground-truth rows; rows whose event is 'start' or 'end' are
            treated as scoring-interval markers, all other rows as events.
        submission: detection rows including the score column.
        tolerances: event class -> list of matching tolerances.

    Returns:
        The mean AP.

    NOTE(review): the ``query`` strings hard-code the column name ``event``
    instead of using ``event_column_name``. ``filter_detections`` (used only
    when ``use_scoring_intervals`` is truthy) is not defined in the visible
    part of this file.
    """
    # Ensure solution and submission are sorted properly
    solution = solution.sort_values([series_id_column_name, time_column_name])
    submission = submission.sort_values([series_id_column_name, time_column_name])

    # Extract scoring intervals.
    if use_scoring_intervals:
        intervals = (
            solution
            .query("event in ['start', 'end']")
            .assign(interval=lambda x: x.groupby([series_id_column_name, event_column_name]).cumcount())
            .pivot(
                index='interval',
                columns=[series_id_column_name, event_column_name],
                values=time_column_name,
            )
            .stack(series_id_column_name)
            .swaplevel()
            .sort_index()
            .loc[:, ['start', 'end']]
            .apply(lambda x: pd.Interval(*x, closed='both'), axis=1)
        )

    # Extract ground-truth events.
    ground_truths = (
        solution
        .query("event not in ['start', 'end']")
        .reset_index(drop=True)
    )

    # Map each event class to its prevalence (needed for recall calculation)
    class_counts = ground_truths.value_counts(event_column_name).to_dict()

    # Create table for detections with a column indicating a match to a ground-truth event
    detections = submission.assign(matched = False)

    # Remove detections outside of scoring intervals
    if use_scoring_intervals:
        detections_filtered = []
        for (det_group, dets), (int_group, ints) in zip(
            detections.groupby(series_id_column_name), intervals.groupby(series_id_column_name)
        ):
            assert det_group == int_group
            detections_filtered.append(filter_detections(dets, ints))
        detections_filtered = pd.concat(detections_filtered, ignore_index=True)
    else:
        detections_filtered = detections
    # Create table of event-class x tolerance x series_id values
    aggregation_keys = pd.DataFrame(
        [(ev, tol, vid)
         for ev in tolerances.keys()
         for tol in tolerances[ev]
         for vid in ground_truths[series_id_column_name].unique()],
        columns=[event_column_name, 'tolerance', series_id_column_name],
    )

    # Create match evaluation groups: event-class x tolerance x series_id
    # (left merges keep a row of NaNs for keys that have no detection / no ground truth)
    detections_grouped = (
        aggregation_keys
        .merge(detections_filtered, on=[event_column_name, series_id_column_name], how='left')
        .groupby([event_column_name, 'tolerance', series_id_column_name])
    )
    ground_truths_grouped = (
        aggregation_keys
        .merge(ground_truths, on=[event_column_name, series_id_column_name], how='left')
        .groupby([event_column_name, 'tolerance', series_id_column_name])
    )
    # Match detections to ground truth events by evaluation group
    detections_matched = []
    for key in aggregation_keys.itertuples(index=False):
        dets = detections_grouped.get_group(key)
        gts = ground_truths_grouped.get_group(key)
        detections_matched.append(
            match_detections(dets['tolerance'].iloc[0], gts, dets)
        )
    detections_matched = pd.concat(detections_matched)
    

    # Compute AP per event x tolerance group
    event_classes = ground_truths[event_column_name].unique()
    ap_table = (
        detections_matched
        .query("event in @event_classes")
        .groupby([event_column_name, 'tolerance']).apply(
            lambda group: average_precision_score(
                group['matched'].to_numpy(),
                group[score_column_name].to_numpy(),
                class_counts[group[event_column_name].iat[0]],
            )
        )
    )
    # Average over tolerances, then over event classes
    mean_ap = ap_table.groupby(event_column_name).mean().sum() / len(event_classes)

    return mean_ap

def _match_detections(
        tolerance: float, ground_truths: pd.DataFrame, detections: pd.DataFrame
) -> pd.DataFrame:
    """Reference (row-by-row) matcher of detections to ground-truth events.

    Both frames come from one event x tolerance x series_id evaluation group.
    Detections are visited from highest to lowest score; each one claims the
    closest ground-truth row that is strictly within ``tolerance`` and has not
    been claimed yet.

    Returns:
        The detections sorted by descending score, rows containing NaN dropped,
        with a boolean ``matched`` column.
    """
    ranked = detections.sort_values(score_column_name, ascending=False).dropna()
    matched_flags = np.full_like(ranked[event_column_name], False, dtype=bool)
    claimed = set()

    for position, detection in enumerate(ranked.itertuples(index=False)):
        closest_gt = None
        closest_error = tolerance

        for truth in ground_truths.itertuples(index=False):
            distance = abs(getattr(detection, time_column_name) - getattr(truth, time_column_name))
            if distance >= closest_error:
                continue
            if truth in claimed:
                continue
            closest_gt, closest_error = truth, distance

        if closest_gt is None:
            continue
        matched_flags[position] = True
        claimed.add(closest_gt)

    ranked['matched'] = matched_flags
    return ranked

def match_detections(
        tolerance: float, ground_truths: pd.DataFrame, detections: pd.DataFrame
) -> pd.DataFrame:
    """Match detections to ground truth events (vectorised per detection).

    Arguments are taken from a common event x tolerance x series_id evaluation
    group. Detections are processed from highest to lowest score; each one is
    matched to the *nearest* still-unmatched ground-truth time whose error is
    strictly below ``tolerance`` (the first such time wins on ties), which is
    the same rule as the reference ``_match_detections``. Ground truths are
    identified by their time value, so equal times count as a single event.

    Args:
        tolerance: maximum (exclusive) allowed absolute time error.
        ground_truths: ground-truth rows of the group; NaN times never match.
        detections: detection rows of the group; rows containing NaN are dropped.

    Returns:
        The detections sorted by descending score with a boolean ``matched``
        column.
    """
    detections_sorted = detections.sort_values(score_column_name, ascending=False).dropna()
    is_matched = np.zeros(len(detections_sorted), dtype=bool)

    det_times = detections_sorted[time_column_name].values.reshape(-1)
    gt_times = ground_truths[time_column_name].values.reshape(-1)
    # errors_matrix[i, j] = |time of ground truth j - time of detection i|
    errors_matrix = np.abs(gt_times[np.newaxis, :] - det_times[:, np.newaxis])
    gt_available = np.ones(len(gt_times), dtype=bool)

    for i, errors in enumerate(errors_matrix):
        candidates = np.flatnonzero((errors < tolerance) & gt_available)
        if candidates.size == 0:
            continue
        # Pick the closest candidate; the previous code kept the last one in range.
        best = candidates[np.argmin(errors[candidates])]
        is_matched[i] = True
        # Retire every ground truth sharing this time value.
        gt_available &= gt_times != gt_times[best]

    detections_sorted['matched'] = is_matched

    return detections_sorted


def precision_recall_curve(
        matches: np.ndarray, scores: np.ndarray, p: int
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Precision/recall pairs obtained by sweeping the confidence threshold.

    Args:
        matches: boolean array, True where a detection matched a ground truth.
        scores: confidence of each detection (same length as ``matches``).
        p: total number of ground-truth events, used as the recall denominator.

    Returns:
        ``(precision, recall, thresholds)``. Precision and recall end with the
        fixed point (1, 0) and recall is non-increasing. For empty input the
        lists ``[1], [0], []`` are returned.
    """
    if len(matches) == 0:
        return [1], [0], []

    # Rank detections from most to least confident.
    order = np.argsort(scores, kind='stable')[::-1]
    ranked_scores = scores[order]
    ranked_matches = matches[order]

    # Evaluate only where the score changes, plus the final position.
    score_change_at = np.where(np.diff(ranked_scores))[0]
    cut_points = np.r_[score_change_at, ranked_matches.size - 1]
    thresholds = ranked_scores[cut_points]

    # Lowering the threshold turns matches into TPs and non-matches into FPs.
    true_pos = np.cumsum(ranked_matches)[cut_points]
    false_pos = np.cumsum(~ranked_matches)[cut_points]

    precision = true_pos / (true_pos + false_pos)
    precision[np.isnan(precision)] = 0
    # p may differ from the number of matches, so recall need not reach 1.
    recall = true_pos / p

    # Cut at the first point reaching the final TP count and reverse the outputs.
    first_full = true_pos.searchsorted(true_pos[-1])
    backwards = slice(first_full, None, -1)

    # Append the terminal point: precision 1 at recall 0.
    precision_out = np.r_[precision[backwards], 1]
    recall_out = np.r_[recall[backwards], 0]
    return precision_out, recall_out, thresholds[backwards]

def average_precision_score(matches: np.ndarray, scores: np.ndarray, p: int) -> float:
    """Average precision: step integral of precision over recall.

    ``matches``, ``scores`` and ``p`` are forwarded to ``precision_recall_curve``.
    """
    precision, recall, _thresholds = precision_recall_curve(matches, scores, p)
    # Recall is non-increasing, so its differences are <= 0; negate the sum.
    recall_steps = np.diff(recall)
    step_heights = np.array(precision)[:-1]
    return -np.sum(recall_steps * step_heights)



### feature engineering for 2nd stage


In [7]:
def make_target_2nd_satge(df):
    """Add second-stage target columns derived from ``best_ap``.

    Columns added (groups are series_id x night x event):
        best_ap_in_a_day: maximum ``best_ap`` of the group.
        is_best_of_best: 1 where ``best_ap`` equals that maximum and is > 0.
        best_ap_until: running maximum of ``best_ap`` within the group when
            rows are visited in descending ``score`` order.
        best_ap_until_binary: 1 where ``best_ap_until`` > 0.5 (marked temporary
            by the author).

    The first two columns are written onto the passed frame; the returned
    frame is a copy sorted by descending ``score`` with a fresh index.
    """
    group_keys = ["series_id", "night", "event"]

    def _running_max(values):
        return np.maximum.accumulate(values)

    df["best_ap_in_a_day"] = df.groupby(group_keys)["best_ap"].transform("max")
    equals_daily_best = df["best_ap"] == df["best_ap_in_a_day"]
    is_positive = df["best_ap"]>0.
    df["is_best_of_best"] = (equals_daily_best * is_positive).astype(int)

    # Sort by score, then record the best best_ap seen so far within each group.
    ranked = df.sort_values("score", ascending=False).reset_index(drop=True)
    ranked["best_ap_until"] = ranked.groupby(group_keys)["best_ap"].transform(lambda x: _running_max(x))
    ranked["best_ap_until_binary"] = (ranked["best_ap_until"] > 0.5).astype(int) # temporary
    return ranked


In [8]:
def make_features(df, Cfg, phase="train"):
    """Build the second-stage feature table from candidate event rows.

    Expected input columns (all read below): ``series_id``, ``night``,
    ``event`` ("wakeup" or another label), ``daily_step``, the seven score
    columns in ``score_keys``, and ``before_/after_states_feat_{n}`` plus
    ``before_/after_nan_feat_{n}`` for every window length ``n``.

    Features added, in order:
        * ``event`` encoded as 1 for "wakeup", 0 otherwise; ``daily_step_sleep``
          (``daily_step`` shifted by 0.5, modulo 1);
        * before/after differences of the state and nan window features;
        * per (series, night, event) max / sum / mean of every score, the
          distance to that max and an ``is_peak`` flag;
        * per (series, night) max of every score and the distance to it;
        * each score normalised by the mean/std of the peak scores of its
          (series, event);
        * ``daily_step`` of the peak row, its per-(series, event) mean, and the
          row's distance to both;
        * the same night's aggregates for the opposite event (``*_flip``) and
          the two ``sleep_duration`` features derived from them;
        * rank and (after sorting by descending score) cumulative sum of every
          score within (series, night, event).

    When several rows of a (series, night, event) group tie for the top score,
    only the first of them is used as "the peak" for the ``peak_daily_step_*``
    features, so the output always has exactly one row per input row.

    NOTE: the ``event`` encoding and the columns added before the first merge
    are written onto the passed frame itself, so do not call this twice on the
    same object. ``Cfg`` and ``phase`` are accepted but not used.

    Author's note (translated): some of this processing could perhaps be run
    only on the top-scoring rows.

    Returns:
        A new DataFrame sorted by (series_id, night, event, score), all
        descending, with a fresh index.
    """
    score_keys = ['score', 'score_10p', 'score_10', 'score_8', 'score_6', 'score_4', 'score_2']
    sne = ["series_id", "night", "event"]

    df["event"] = (df["event"] == "wakeup").astype(int)
    # daily_step for night
    df["daily_step_sleep"] = (df["daily_step"] + 0.5) % 1

    # change of state between before and after
    for length in (12, 24, 60, 120, 240, 360, 720):
        df[f"state_diff_{length}"] = df[f"before_states_feat_{length}"] - df[f"after_states_feat_{length}"]
        df[f"nan_diff_{length}"] = df[f"before_nan_feat_{length}"] - df[f"after_nan_feat_{length}"]

    # largest score per (series, night, event)
    for key in score_keys:
        df[f"max_{key}_sne"] = df.groupby(sne)[key].transform("max")
        df[f"max_{key}_sne_diff"] = df[f"max_{key}_sne"] - df[key]
        df[f"max_{key}_sne_is_peak"] = (df[f"max_{key}_sne_diff"] == 0).astype(int)
        df[f"sum_{key}_sne"] = df.groupby(sne)[key].transform("sum")
        df[f"mean_{key}_sne"] = df.groupby(sne)[key].transform("mean")

    # largest score per (series, night), both events together
    for key in score_keys:
        df[f"max_{key}_sn"] = df.groupby(["series_id", "night"])[key].transform("max")
        df[f"max_{key}_sn_diff"] = df[f"max_{key}_sn"] - df[key]

    # score relative to the distribution of nightly peak scores of the series
    for key in score_keys:
        peak_rows = df[df[f"max_{key}_sne_is_peak"] == 1]
        peak_stats = peak_rows.groupby(["series_id", "event"])[f"max_{key}_sne"].agg(["mean", "std"]).reset_index()
        peak_stats.columns = ["series_id", "event", f"max_{key}_sne_mean", f"max_{key}_sne_std"]
        df = df.merge(peak_stats, on=["series_id", "event"], how="left")
        # normalize
        df[f"{key}_relative_to_peak"] = (df[key] - df[f"max_{key}_sne_mean"]) / df[f"max_{key}_sne_std"]

    # daily_step at peak
    for key in score_keys:
        peak_step = f"peak_daily_step_{key}"
        peak_step_sleep = f"peak_daily_step_sleep_{key}"
        peak_step_mean = f"peak_daily_step_{key}_mean"
        peak_step_sleep_mean = f"peak_daily_step_sleep_{key}_mean"

        df_peak = df[df[f"max_{key}_sne_is_peak"] == 1].copy()
        df_peak = df_peak.rename(columns={"daily_step": peak_step, "daily_step_sleep": peak_step_sleep})
        df_peak[peak_step_mean] = df_peak.groupby(["series_id", "event"])[peak_step].transform("mean")
        # Author's note (translated): could perhaps be restricted to high scores.
        df_peak[peak_step_sleep_mean] = df_peak.groupby(["series_id", "event"])[peak_step_sleep].transform("mean")

        # Tied top scores give several peak rows per group; keep one so the
        # merge stays many-to-one and cannot multiply the rows of ``df``.
        df_peak = df_peak[sne + [peak_step, peak_step_sleep, peak_step_mean, peak_step_sleep_mean]]
        df_peak = df_peak.drop_duplicates(subset=sne)

        df = df.merge(df_peak, on=sne, how="left")
        df[f"step_dist_from_peak_{key}"] = df["daily_step"] - df[peak_step]
        df[f"step_dist_from_peak_sleep_{key}"] = df["daily_step_sleep"] - df[peak_step_sleep]
        df[f"step_dist_from_peak_{key}_mean"] = df["daily_step"] - df[peak_step_mean]
        df[f"step_dist_from_peak_sleep_{key}_mean"] = df["daily_step_sleep"] - df[peak_step_sleep_mean]

    # opposite event at same night
    flip_columns = (
        [f"max_{key}_sne" for key in score_keys]
        + [f"sum_{key}_sne" for key in score_keys]
        + [f"peak_daily_step_{key}" for key in score_keys]
        + [f"peak_daily_step_sleep_{key}" for key in score_keys]
    )
    df_flip = df.copy()
    df_flip["event"] = 1 - df_flip["event"]
    df_flip = df_flip.groupby(["series_id", "event", "night"])[flip_columns].max().reset_index()
    df_flip.columns = ["series_id", "event", "night"] + [f"{c}_flip" for c in flip_columns]
    df = df.merge(df_flip, on=sne, how="left")

    for key in score_keys:
        df[f"peak_daily_step_{key}_sleep_duration_01"] = df["daily_step"] + 0.5 - df[f"peak_daily_step_sleep_{key}_flip"]
        df[f"peak_daily_step_{key}_sleep_duration_10"] = df[f"peak_daily_step_{key}_flip"] + 0.5 - df["daily_step_sleep"]

    for key in score_keys:
        df[f"rank_{key}_sne"] = df.groupby(sne)[key].transform("rank")

    # sort by score -> accumulated score
    df = df.sort_values(["series_id", "night", "event", "score"], ascending=False).reset_index(drop=True)
    for key in score_keys:
        df[f"cumsum_{key}_sne"] = df.groupby(sne)[key].transform("cumsum")

    return df


def make_features_dual(df, Cfg, phase="train"):
    """Build features on all rows and, additionally, on high-score rows only.

    ``make_features`` is run once on the full frame and once on the rows with
    ``score > 0.25``. Every column that the second run added is then attached
    to the full feature table by a left merge on (series_id, step); where a
    column name exists in both, the high-score version gets the suffix
    ``_high_score``. Rows without a high-score counterpart get NaN there.

    ``make_features`` returns a single DataFrame, so the added columns are
    derived here by comparing column sets (the previous tuple-unpacking of its
    result could not work).

    Returns:
        The merged feature DataFrame.
    """
    # Copy first: make_features re-encodes ``event`` on the frame it is given.
    df_high_score = df[df["score"] > 0.25].copy()
    input_cols = set(df_high_score.columns)

    df_normal = make_features(df, Cfg, phase)
    df_high_score = make_features(df_high_score, Cfg, phase)
    added_cols = [c for c in df_high_score.columns if c not in input_cols]

    df = df_normal.merge(df_high_score[["series_id", "step"] + added_cols], on=["series_id", "step"], how="left", suffixes=("", "_high_score"))
    return df

    
    

In [9]:
# ensemble after 2nd stage
def weighted_fusion_ensemble(df_0, df_1, distance_threshold=12, model_weights=(0.5, 0.5)):
    """Fuse the event predictions of two models, series by series.

    Both frames need ``series_id``, ``step`` and ``score`` columns; ``df_0`` is
    the base. Only series present in ``df_0`` are processed. Within a series
    the rows of ``df_1`` are visited from highest to lowest score:

    * if the nearest not-yet-fused base row is closer than
      ``distance_threshold`` steps, the two are fused: with
      ``w = model_weight * score`` for each side, the fused score and step are
      the ``w``-weighted averages of the two scores / steps (if both weights
      are zero the base row is kept as is);
    * otherwise the ``df_1`` row is kept on its own with its score multiplied
      by the normalised weight of model 1.

    Base rows that were never fused have their score multiplied by the
    normalised weight of model 0.

    Args:
        df_0: base predictions.
        df_1: predictions merged into the base.
        distance_threshold: maximum (exclusive) step distance for a fusion.
        model_weights: two weights, normalised internally to sum to 1.

    Returns:
        One DataFrame with, per series, the base rows followed by the unfused
        ``df_1`` rows, re-indexed from 0. ``step`` becomes float in a series
        where a fusion took place.
    """
    weight_sum = sum(model_weights)
    weight_0 = model_weights[0] / weight_sum
    weight_1 = model_weights[1] / weight_sum

    out_df = []
    for series_id in df_0['series_id'].unique():
        df_0_id = df_0[df_0['series_id']==series_id].sort_values("score", ascending=False).reset_index(drop=True)
        df_1_id = df_1[df_1['series_id']==series_id].sort_values("score", ascending=False).reset_index(drop=True)

        base_steps = df_0_id['step'].to_numpy()
        base_scores = df_0_id['score'].to_numpy()
        other_steps = df_1_id['step'].to_numpy()
        other_scores = df_1_id['score'].to_numpy()

        # Results are collected in float arrays and written back once, so no
        # float is ever assigned element-wise into an integer column.
        fused_steps = base_steps.astype(float)
        fused_scores = base_scores.astype(float)
        is_fused = np.zeros(len(df_0_id), dtype=bool)
        unassigned_rows = []  # positions in df_1_id, not step values

        for row_1, (step_1, score_1) in enumerate(zip(other_steps, other_scores)):
            # A base row can absorb at most one df_1 row.
            dists = np.where(is_fused, np.inf, np.abs(base_steps - step_1))
            nearest = np.argmin(dists)
            if dists[nearest] < distance_threshold:
                w_0 = weight_0 * base_scores[nearest]
                w_1 = weight_1 * score_1
                w_total = w_0 + w_1
                if w_total != 0:
                    fused_scores[nearest] = (base_scores[nearest] * w_0 + score_1 * w_1) / w_total
                    fused_steps[nearest] = (base_steps[nearest] * w_0 + step_1 * w_1) / w_total
                is_fused[nearest] = True
            else:
                unassigned_rows.append(row_1)

        fused_scores[~is_fused] *= weight_0  # base rows nothing was fused into
        df_0_id["score"] = fused_scores
        if is_fused.any():
            df_0_id["step"] = fused_steps
        out_df.append(df_0_id)

        if unassigned_rows:
            not_assigned = df_1_id.iloc[unassigned_rows].copy()
            not_assigned["score"] = other_scores[unassigned_rows] * weight_1
            out_df.append(not_assigned)

    if not out_df:
        # df_0 has no rows: return an empty frame instead of failing in concat.
        return df_0.iloc[0:0].reset_index(drop=True)
    out_df = pd.concat(out_df).reset_index(drop=True)
    return out_df

def round_step(df):
    """Convert ``step`` to integers, in place, nudging values off multiples of 6.

    The step is cast with ``astype(int)``; 1 is added where ``step % 6 < 1``
    held for the original value.

    Returns:
        The same DataFrame object.
    """
    raw_step = df["step"]
    truncated = raw_step.astype(int)
    bump = (raw_step % 6 < 1).astype(int)
    df["step"] = truncated + bump
    return df

### model functions

In [10]:
class LabelEncoders:
    """Collection of ``LabelEncoder`` objects keyed by column (or Series) name."""

    def __init__(self):
        # name -> fitted LabelEncoder
        self.encoders = {}

    def fit(self, df):
        """Fit (or refit) one encoder for every object-dtype column of ``df``."""
        for column in df:
            if df[column].dtype.name != "object":
                continue
            encoder = self.encoders[column] if column in self.encoders else LabelEncoder()
            encoder.fit(df[column])
            self.encoders[column] = encoder

    def transform(self, df):
        """Encode, in place, every column of ``df`` that has an encoder; return ``df``."""
        for column in df:
            if column not in self.encoders:
                continue
            df[column] = self.encoders[column].transform(df[column])
        return df

    def fit_one(self, s):
        """Fit (or refit) the encoder registered under ``s.name`` on the Series ``s``."""
        encoder = self.encoders[s.name] if s.name in self.encoders else LabelEncoder()
        encoder.fit(s)
        self.encoders[s.name] = encoder

    def transform_one(self, s):
        """Return the encoded values of ``s``, or ``s`` itself if no encoder exists for its name."""
        if s.name not in self.encoders:
            return s
        return self.encoders[s.name].transform(s)

    def fit_transform(self, df):
        """``fit`` followed by ``transform`` on the same frame."""
        self.fit(df)
        return self.transform(df)

    def fit_transform_one(self, s):
        """``fit_one`` followed by ``transform_one`` on the same Series."""
        self.fit_one(s)
        return self.transform_one(s)

In [11]:
def competition_metrics(y_true, y_pred):
    """Root-mean-squared error between ``y_true`` and ``y_pred``."""
    residual = y_true - y_pred
    mean_squared = np.mean(residual**2)
    return np.sqrt(mean_squared)

def make_oof(cvbooster, X_train: np.ndarray, y_train: pd.Series, split):
    """Assemble out-of-fold predictions from the boosters of a CV run.

    Each booster in ``cvbooster.boosters`` is paired, in order, with one
    ``(train_idx, valid_idx)`` entry of ``split`` and predicts its validation
    rows; the per-fold ``competition_metrics`` value is printed.

    Returns:
        Array of length ``len(X_train)``; rows in no validation fold stay 0.
    """
    n_rows = len(X_train)
    oof = np.zeros(n_rows)

    for booster, fold in zip(cvbooster.boosters, split):
        _, valid_rows = fold
        fold_pred = booster.predict(X_train[valid_rows])
        oof[valid_rows] = fold_pred
        print("oof score", competition_metrics(y_train[valid_rows], fold_pred))

    return oof


def plot_importance(cvbooster, figsize=(12, 20)):
    """Show a horizontal box plot of gain importances across the CV boosters.

    One box per feature, ordered by mean importance (highest first), limited
    to the top 100 features, on a log-scaled x axis.
    """
    PLOT_TOP_N = 100

    per_fold = pd.DataFrame(
        data=cvbooster.feature_importance(importance_type='gain'),
        columns=cvbooster.boosters[0].feature_name(),
    )
    # order by average importance across folds
    by_mean = per_fold.mean(axis=0).sort_values(ascending=False).index
    ranked = per_fold.loc[:, by_mean]
    # plot top-n
    shown = ranked.columns[:PLOT_TOP_N]

    _, ax = plt.subplots(figsize=figsize)
    ax.grid()
    ax.set_xscale('log')
    ax.set_ylabel('Feature')
    ax.set_xlabel('Importance')
    sns.boxplot(data=ranked[shown], orient='h', ax=ax)
    plt.show()



def metrics(y_true, y_pred, threshold=None, threshold2=None, threshold2_mask=None):
    """Matthews correlation coefficient of thresholded predictions.

    If ``threshold`` is None it is obtained from ``search_best_threshold``.
    When both ``threshold2`` and ``threshold2_mask`` are given, predictions are
    binarised with ``binarize_pred``; otherwise with ``y_pred > threshold``.

    NOTE(review): ``search_best_threshold``, ``binarize_pred`` and
    ``matthews_corrcoef`` are neither defined nor imported in the visible part
    of this file - confirm they exist before calling this function.
    """
    cutoff = search_best_threshold(y_true, y_pred) if threshold is None else threshold

    if threshold2 is None or threshold2_mask is None:
        return matthews_corrcoef(y_true, y_pred>cutoff)
    return matthews_corrcoef(y_true, binarize_pred(y_pred, cutoff, threshold2, threshold2_mask))


from sklearn.model_selection import KFold, GroupKFold

def groupkfold(groups, n_splits=5, seed=42):
    """Shuffled K-fold split in which every group stays within one side.

    The unique values of ``groups`` are split with a shuffled, seeded
    ``KFold``; each fold is then expanded back to row positions.

    Args:
        groups: pandas Series with one group label per row.
        n_splits: number of folds.
        seed: ``random_state`` given to ``KFold``.

    Returns:
        List of ``(train_positions, valid_positions)`` integer arrays.
    """
    group_ids = groups.unique()
    row_positions = np.arange(len(groups))
    folder = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

    def rows_in(id_positions):
        # Row positions whose group is among the selected unique ids.
        return row_positions[groups.isin(group_ids[id_positions])]

    return [(rows_in(tr), rows_in(va)) for tr, va in folder.split(group_ids)]




def train_cv(train_df: pd.DataFrame, 
             non_feature_cols: List[str], 
             target_column: str = "best_ap",
             encoder: Optional[LabelEncoders] = None,
             lgb_params = None,
             weight = None,
             use_full_data = False,
             num_boost_round=600,
             seed=42,
             show_importance=False):
    """Train LightGBM with ``lgb.cv`` and return the boosters (and OOF predictions).

    Args:
        train_df: training table; must contain ``series_id`` (used as the CV
            group) and ``target_column``.
        non_feature_cols: columns excluded from the feature matrix.
        target_column: name of the target column.
        encoder: existing ``LabelEncoders`` to reuse for object columns; when
            None a new one is created and fitted.
        lgb_params: LightGBM parameters; a built-in default dict is used when falsy.
        weight: sample weights passed to ``lgb.Dataset``.
        use_full_data: when True every fold trains on all rows (rows 0..9 serve
            as a dummy validation set) and no OOF predictions are produced.
        num_boost_round: number of boosting rounds.
        seed: not used by the active code (only by a commented-out split).
        show_importance: when True, plot feature importances after training.

    Returns:
        ``(cvbooster, encoder)`` if ``use_full_data`` else
        ``(cvbooster, oof, encoder)``.
    """

    lgb_params = lgb_params or {
        "objective": "binary", # "binary", #"regression",
        # "binary",
        "metric": "auc", # "mae"
        "max_depth": 5,
        "num_leaves": 128,
        "feature_fraction": 0.3,
        "verbose": -1,
        "learning_rate": 0.02,
        "min_child_samples": 10,
        "min_child_weight": 5,
        "subsample_for_bin": 50000,
        #"reg_lambda": 1
    }

    
    # Group KFold
    split = list(GroupKFold(n_splits=5).split(train_df, groups=train_df["series_id"]))
    # df = train_df.reset_index(drop=True)
    # split = groupkfold(train_df["series_id"], n_splits=5, seed=seed)
    
    # Remember whether the caller supplied encoders before replacing None.
    use_existing_encoders = encoder is not None
    encoder = encoder or LabelEncoders()

    with timer("make dataset"):
        # Something like train_df.values.astype(np.float32) dies with OOM, so
        # allocate the array first and fill it one column at a time.
        feature_names = [c for c in train_df.columns if c not in non_feature_cols]

        X_train = np.empty((len(train_df), len(feature_names)), dtype=np.float32)
        y_train = train_df[target_column].values.astype(np.float32)
        print(X_train.shape)

        for i, c in enumerate(feature_names):
            if train_df[c].dtype.name == "object":
                if use_existing_encoders:
                    X_train[:, i] = encoder.transform_one(train_df[c])
                else:
                    X_train[:, i] = encoder.fit_transform_one(train_df[c])
            else:
                X_train[:, i] = train_df[c]

        gc.collect()
        print(f"features: {feature_names}")
        print(f"category: {list(encoder.encoders.keys())}")

        # Disabled override that would replace the weights with all ones.
        if False:
            weight = np.ones(len(X_train))
            
        

        ds_train = lgb.Dataset(X_train, y_train,
                               feature_name=feature_names, 
                               weight=weight)
        gc.collect()

    with timer("lgb.cv"):
        if use_full_data:
            # Five identical "folds": train on everything, validate on rows 0..9.
            split = [
                (np.arange(len(train_df)), np.arange(10))
            ] * 5
        ret = lgb.cv(lgb_params, ds_train, 
                     num_boost_round=num_boost_round, 
                     folds=split,
                     return_cvbooster=True,
                     callbacks=[
                         # lgb.early_stopping(stopping_rounds=100, verbose=True),
                         lgb.log_evaluation(25)
                     ]) 

        # Copy the CV-level best_iteration onto every fold booster.
        for booster in ret["cvbooster"].boosters:
            booster.best_iteration = ret["cvbooster"].best_iteration

        del ds_train
        gc.collect()

    if show_importance:
        plot_importance(ret["cvbooster"])
    
    if use_full_data:
        return ret["cvbooster"], encoder
    
    oof = make_oof(ret["cvbooster"], X_train, y_train, split)

    
    return ret["cvbooster"], oof, encoder

    
    


In [12]:
class LGBMSerializer:
    """Save and restore a cross-validated LightGBM ensemble with its encoders.

    Two files are written next to each other, both derived from ``filename``:

    * ``{filename}_model.json``  - every fold booster as a LightGBM model
      string plus the shared ``best_iteration``.
    * ``{filename}_encoder.bin`` - the pickled label encoders.
    """

    def __init__(self,
                 booster: lgb.CVBooster,
                 encoders: LabelEncoders,
                 ):
        """Keep references to the CV booster and the fitted encoders."""
        self.booster = booster
        self.encoders = encoders

    def to_file(self, filename: str):
        """Write the boosters and encoders to ``{filename}_model.json`` / ``_encoder.bin``.

        Args:
            filename: Path prefix (without extension) for both output files.
        """
        model = {
            "boosters": [b.model_to_string() for b in self.booster.boosters],
            "best_iteration": self.booster.best_iteration,
        }

        with open(f"{filename}_model.json", "w") as f:
            json.dump(model, f)

        with open(f"{filename}_encoder.bin", "wb") as f:
            pickle.dump(self.encoders, f)

    @classmethod
    def from_file(cls, filename: str) -> "LGBMSerializer":
        """Rebuild a serializer from files previously written by ``to_file``.

        Args:
            filename: The same path prefix that was passed to ``to_file``.

        Returns:
            A new ``LGBMSerializer`` whose ``booster`` is a ``lgb.CVBooster``
            holding the restored fold boosters and whose ``encoders`` is the
            unpickled encoder object.
        """
        with open(f"{filename}_model.json", "r") as f:
            model = json.load(f)

        cvbooster = lgb.CVBooster()
        cvbooster.boosters = [lgb.Booster(model_str=b) for b in model["boosters"]]
        cvbooster.best_iteration = model["best_iteration"]
        # Propagate the shared best iteration to every fold booster.
        for b in cvbooster.boosters:
            b.best_iteration = cvbooster.best_iteration

        # NOTE: pickle executes arbitrary code on load; only open encoder
        # files that were produced by this pipeline.
        with open(f"{filename}_encoder.bin", "rb") as f:
            encoders = pickle.load(f)

        return cls(cvbooster, encoders)
            

In [82]:
# Second-stage driver: for every first-stage prediction file, train the two
# LightGBM targets, blend their out-of-fold (OOF) predictions with the original
# score, save the result for the third stage and print the event-detection AP.
check_score = False  # when True, additionally print AP for several blend weights

save_dir_model = Cfg.weight_dir_lgbm
os.makedirs(save_dir_model, exist_ok=True)

# Inputs (first-stage output) and outputs (second-stage output), paired by index.
path_1st_preds = [os.path.join(Cfg.preprocess_dir, f) for f in ["df_second_model_0.feather", "df_second_model_1.feather"]]
path_2nd_preds = [os.path.join(Cfg.preprocess_dir, f) for f in ["df_third_model_0.feather", "df_third_model_1.feather"]]
pred_indices = [0,1]

# Ground-truth events; rows without a step value are dropped.
train_events = pd.read_csv(Cfg.train_target_path)
train_events = train_events[~np.isnan(train_events['step'])]

for pred_idx, path_pred in zip(pred_indices, path_1st_preds):

    with timer("load dataset and make features"):

        df_train = load_dataset(path_pred)
        # df_train = df_train[df_train["score"] > (df_train["score"].min()+0.02)]
        display(df_train.head())
        print(df_train.shape)

        df_train = make_target_2nd_satge(df_train)
        df_train = make_features(df_train, Cfg, phase="train")
        sample_weight = None

    with timer("train_cv 2nd stage"):

        # Columns that must not be fed to the model as features.
        non_feature_columns = ["step","peak_indices", "best_ap_in_a_day", "is_best_of_best",
                                "best_ap_until", "best_ap_until_binary",
                                # "event", 
                                "offset_step", 
                                # 'night', 
                                'best_ap', # 'binary_best_ap', "target", 
                                'series_id', "min_diff_step",
                                # 'pred_nan_counter', 'pred_nan_span',
                                ]# + [c for c in df_train.columns if "score_4" in c] + [c for c in df_train.columns if "score_6" in c]# + [c for c in df_train.columns if "score_6" in c]

        for target_column in ["best_ap_until_binary", "is_best_of_best"]:
            # The cumulative target uses fewer rounds and half the learning rate.
            num_boost_round = 500 if target_column == "best_ap_until_binary" else 600
            # num_boost_round = int(1.2 * num_boost_round)
            lgb_params = {
                        "objective": "binary", # "binary", #"regression",
                        "metric": "auc", # "mae"
                        'max_depth': 10, 
                        'num_leaves': 103, 
                        'feature_fraction': 0.18647422969488445, 
                        'min_child_samples': 10, 
                        'min_child_weight': 29, 
                        # 'subsample_for_bin': 43949,
                        "verbose": -1,
                        "learning_rate": 0.01/2 if target_column == "best_ap_until_binary" else 0.01,
                        # "reg_lambda": 10
                    }
            if target_column == "best_ap_until_binary":

                # One model per binarisation threshold of "best_ap_until";
                # their OOF predictions are averaged after the loop.
                oofs_acm = []
                for th in [0.3,0.5,0.7]:
                    df_train["best_ap_until_binary"] = (df_train["best_ap_until"] > th).astype(int)
                    cvbooster, oof_acm, encoder = train_cv(df_train, non_feature_columns, target_column, weight=sample_weight, lgb_params=lgb_params, num_boost_round=num_boost_round)
                    # NOTE(review): the OOF file name does not include the
                    # threshold, so every iteration overwrites the previous
                    # file and only the th=0.7 array remains on disk.
                    # NOTE(review): paths are built by plain string
                    # concatenation; this presumably relies on
                    # Cfg.weight_dir_lgbm ending with a path separator - verify.
                    np.save(save_dir_model + f"oof_acm_{pred_idx}.npy", oof_acm)
                    LGBMSerializer(cvbooster, encoder).to_file(save_dir_model + f"lgb_acm_{pred_idx}_th{int(th*100)}")
                    oofs_acm.append(oof_acm)


            else:
                

                cvbooster, oof_simple, encoder = train_cv(df_train, non_feature_columns, target_column, weight=sample_weight, lgb_params=lgb_params, num_boost_round=num_boost_round)
                np.save(save_dir_model + f"oof_simple_{pred_idx}.npy", oof_simple)
                LGBMSerializer(cvbooster, encoder).to_file(save_dir_model + f"lgb_simple_{pred_idx}")

    with timer("postprocess and evaluation"):
                
        def postprocess_2nd_stage(df, oof_acm, oof_simple, weights=[1, 0.4, 1]):
            """Blend both second-stage OOF predictions with the original score.

            ``df`` is modified in place (and also returned): the previous
            "score" is kept as "origin_score", the numeric "event" is mapped to
            "onset" (0) / "wakeup" (anything else), and "score" becomes the
            weighted mean of

            * "pred_score_acm"    - per (series_id, night, event) increments of
              the running maximum of ``oof_acm``,
            * "pred_score_simple" - ``oof_simple`` as is,
            * "origin_score".

            ``weights`` gives the weights in that order. The default list is
            only read, never mutated. As the printed reminder says, the rows
            are expected to be sorted by score beforehand.
            """
            print("need to be sorted by score in advance")
            df["origin_score"] = df["score"].copy()
            df["event"] = df["event"].apply(lambda x: "onset" if x == 0 else "wakeup")
            df["pred_best_ap_until"] = oof_acm
            # Force the cumulative prediction to be non-decreasing inside each
            # group so the differences below can never be negative.
            df["pred_best_ap_until"] = df.groupby(["series_id", "night", "event"])["pred_best_ap_until"].transform(lambda x: np.maximum.accumulate(x)) # avoid minus diff
            # First row of each group has no predecessor: use its own value.
            df["pred_score_acm"] = df.groupby(["series_id", "night", "event"])["pred_best_ap_until"].diff().fillna(df["pred_best_ap_until"])
            df["pred_score_simple"] = oof_simple
            df["score"] = (df["pred_score_acm"]*weights[0] + df["pred_score_simple"]*weights[1]  + df["origin_score"]*weights[2]) / sum(weights)
            return df
        
        # daily auc prediction to confidence score, then save feather for third stage
        # The three threshold models are averaged element-wise.
        oof_acm = np.mean(oofs_acm, axis=0)
        df_train = postprocess_2nd_stage(df_train, oof_acm, oof_simple, weights=[1, 0.4, 1]) # CHANGE TO 04
        df_train.to_feather(path_2nd_preds[pred_idx])


        # check score
        eval_df = df_train[["series_id", "step", "night", "event", "best_ap", "best_ap_until", "score"]].copy()
        eval_df = eval_df.sort_values(["series_id", "step"])
        # NOTE(review): this rebinds the module-level train_events, so the
        # series filter carries over to later loop iterations and later cells.
        train_events = train_events.loc[train_events['series_id'].isin(eval_df['series_id'].unique())]
        print(event_detection_ap(train_events, eval_df.copy(), tolerances))


        if check_score: # scores of many patterns
            eval_df = df_train[["series_id", "step", "night", "event", "best_ap", "best_ap_until", "origin_score", "pred_score_acm", "pred_score_simple"]].copy()

            # eval_df["score"] = eval_df["origin_score"]
            # print("origin score")
            # print(event_detection_ap(train_events, eval_df.copy(), tolerances))

            # eval_df["score"] = eval_df["pred_score_acm"]
            # print("cumsum lgbm score")
            # print(event_detection_ap(train_events, eval_df.copy(), tolerances))

            # eval_df["score"] = eval_df["pred_score_simple"]
            # print("simple lgbm score")
            # print(event_detection_ap(train_events, eval_df.copy(), tolerances))

            # eval_df["score"] = (eval_df["pred_score_acm"]*1 + eval_df["origin_score"]*0.9) / 2
            # print("ensemble score 2cs")
            # print(event_detection_ap(train_events, eval_df.copy(), tolerances))

            # eval_df["score"] = (eval_df["pred_score_simple"]*1 + eval_df["origin_score"]*0.9) / 2
            # print("ensemble score 2ss")
            # print(event_detection_ap(train_events, eval_df.copy(), tolerances))

            # Three-way blends with slightly different weights on the original score.
            w = [1,0.4,1.1]
            eval_df["score"] = (eval_df["pred_score_acm"]*w[0] + eval_df["pred_score_simple"]*w[1] + eval_df["origin_score"]*w[2]) / sum(w)
            print("ensemble score 3css", w)
            print(event_detection_ap(train_events, eval_df.copy(), tolerances))

            w = [1,0.4,1]
            eval_df["score"] = (eval_df["pred_score_acm"]*w[0] + eval_df["pred_score_simple"]*w[1] + eval_df["origin_score"]*w[2]) / sum(w)
            print("ensemble score 3css", w)
            print(event_detection_ap(train_events, eval_df.copy(), tolerances))

            w = [1,0.4,0.9]
            eval_df["score"] = (eval_df["pred_score_acm"]*w[0] + eval_df["pred_score_simple"]*w[1] + eval_df["origin_score"]*w[2]) / sum(w)
            print("ensemble score 3css", w)
            print(event_detection_ap(train_events, eval_df.copy(), tolerances))



[MEMUSE] memory usage (in before load dataset and make features): 16434.22MB (25.1%)
(30084, 117)


Unnamed: 0,step,peak_indices,daily_step,event,score,score_10p,score_10,score_8,score_6,score_4,...,pred_switch_min,pred_switch_max,pred_awake_mean,pred_awake_min,pred_awake_max,offset_step,night,min_diff_step,best_ap,series_id
0,488,488,0.674074,wakeup,0.097351,0.029362,0.114621,0.151416,0.185094,0.233708,...,0.0,0.426974,0.632516,0.000394,0.999994,11648.0,2.0,4504.0,0.0,038441c925bb
1,617,618,0.681597,wakeup,0.089041,0.027944,0.117709,0.130648,0.144085,0.164008,...,0.0,0.426974,0.632516,0.000394,0.999994,11778.0,2.0,4375.0,0.0,038441c925bb
2,4993,4993,0.93478,onset,0.413439,0.220643,0.478017,0.639067,0.653115,0.690274,...,0.0,0.426974,0.632516,0.000394,0.999994,16153.0,2.0,1.0,1.0,038441c925bb
3,9350,9350,0.186921,wakeup,0.053009,0.024013,0.063197,0.07506,0.081973,0.088259,...,0.0,0.426974,0.632516,0.000394,0.999994,20510.0,2.0,1582.0,0.0,038441c925bb
4,9749,9749,0.210012,wakeup,0.064986,0.014936,0.079251,0.098673,0.091745,0.095175,...,0.0,0.426974,0.632516,0.000394,0.999994,20909.0,2.0,1183.0,0.0,038441c925bb


  f"evaluating in Python space because the {repr(op_str)} "


[load dataset and make features] 5.653sec
[MEMUSE] memory usage (in after load dataset and make features): 16405.70MB (25.1%)
[MEMUSE] memory usage (in before train_cv 2nd stage): 16405.70MB (25.1%)
[MEMUSE] memory usage (in before make dataset): 16405.70MB (25.1%)
(30084, 308)
features: ['daily_step', 'event', 'score', 'score_10p', 'score_10', 'score_8', 'score_6', 'score_4', 'score_2', 'pred_nan', 'pred_nan_counter', 'pred_nan_span', 'pred_awake', 'before_states', 'after_states', 'before_states_feat_12', 'after_states_feat_12', 'before_nan_feat_12', 'after_nan_feat_12', 'befaf_pred_switch_6', 'befaf_pred_switch10p_6', 'befaf_pred_switch10_6', 'befaf_pred_switch8_6', 'befaf_pred_switch6_6', 'befaf_pred_switch4_6', 'befaf_pred_switch2_6', 'before_states_feat_24', 'after_states_feat_24', 'before_nan_feat_24', 'after_nan_feat_24', 'befaf_pred_switch_12', 'befaf_pred_switch10p_12', 'befaf_pred_switch10_12', 'befaf_pred_switch8_12', 'befaf_pred_switch6_12', 'befaf_pred_switch4_12', 'befaf_

Unnamed: 0,step,peak_indices,daily_step,event,score,score_10p,score_10,score_8,score_6,score_4,...,pred_switch_min,pred_switch_max,pred_awake_mean,pred_awake_min,pred_awake_max,offset_step,night,min_diff_step,best_ap,series_id
0,489,489,0.674132,wakeup,0.069161,0.015779,0.079967,0.096567,0.133182,0.15149,...,0.0,0.391876,0.632209,0.000268,0.999994,11649.0,2.0,4503.0,0.0,038441c925bb
1,617,617,0.681539,wakeup,0.074454,0.018117,0.100562,0.106553,0.112734,0.120838,...,0.0,0.391876,0.632209,0.000268,0.999994,11777.0,2.0,4375.0,0.0,038441c925bb
2,4993,4993,0.93478,onset,0.382331,0.192437,0.417983,0.636628,0.69687,0.643506,...,0.0,0.391876,0.632209,0.000268,0.999994,16153.0,2.0,1.0,1.0,038441c925bb
3,9349,9349,0.186863,wakeup,0.092179,0.053659,0.116856,0.135798,0.163534,0.177741,...,0.0,0.391876,0.632209,0.000268,0.999994,20509.0,2.0,1583.0,0.0,038441c925bb
4,9445,9445,0.192419,wakeup,0.052276,0.023591,0.047596,0.095143,0.124461,0.151226,...,0.0,0.391876,0.632209,0.000268,0.999994,20605.0,2.0,1487.0,0.0,038441c925bb


  f"evaluating in Python space because the {repr(op_str)} "


[load dataset and make features] 5.550sec
[MEMUSE] memory usage (in after load dataset and make features): 16438.59MB (25.1%)
[MEMUSE] memory usage (in before train_cv 2nd stage): 16438.59MB (25.1%)
[MEMUSE] memory usage (in before make dataset): 16440.06MB (25.1%)
(29161, 308)
features: ['daily_step', 'event', 'score', 'score_10p', 'score_10', 'score_8', 'score_6', 'score_4', 'score_2', 'pred_nan', 'pred_nan_counter', 'pred_nan_span', 'pred_awake', 'before_states', 'after_states', 'before_states_feat_12', 'after_states_feat_12', 'before_nan_feat_12', 'after_nan_feat_12', 'befaf_pred_switch_6', 'befaf_pred_switch10p_6', 'befaf_pred_switch10_6', 'befaf_pred_switch8_6', 'befaf_pred_switch6_6', 'befaf_pred_switch4_6', 'befaf_pred_switch2_6', 'before_states_feat_24', 'after_states_feat_24', 'before_nan_feat_24', 'after_nan_feat_24', 'befaf_pred_switch_12', 'befaf_pred_switch10p_12', 'befaf_pred_switch10_12', 'befaf_pred_switch8_12', 'befaf_pred_switch6_12', 'befaf_pred_switch4_12', 'befaf_

In [89]:
# WBF Ensemble: fuse the two second-stage prediction sets into one frame,
# round the steps, store the result and optionally print its AP.

check_score = True
path_2nd_pred_wbf = os.path.join(Cfg.preprocess_dir, "df_third_model_wbf.feather")
ensemble_dist_th = 100

with timer("weighted fusion ensemble"):
    df_train_2nd_0 = pd.read_feather(path_2nd_preds[0])
    df_train_2nd_1 = pd.read_feather(path_2nd_preds[1])

    # Score history: 0.8331296875571643 0.83456221631326 # 0.8346057313899438 # 0.8346431798496226param tuned 0.8346228992883475
    df_train_2nd_wbf = weighted_fusion_ensemble(
        df_train_2nd_0,
        df_train_2nd_1,
        distance_threshold=ensemble_dist_th,
        model_weights=[0.5, 0.5],
    )
    print("shape", df_train_2nd_0.shape, df_train_2nd_1.shape, df_train_2nd_wbf.shape)

    df_train_2nd_wbf = round_step(df_train_2nd_wbf)
    df_train_2nd_wbf.to_feather(path_2nd_pred_wbf)

    if check_score:
        eval_df = (
            df_train_2nd_wbf[["series_id", "step", "event", "score"]]
            .copy()
            .sort_values(["series_id", "step"])
        )
        # Reload the ground truth, keeping only events that have a step and
        # belong to a series present in the predictions.
        train_events = pd.read_csv(Cfg.train_target_path)
        train_events = train_events.loc[
            lambda ev: ~np.isnan(ev["step"]) & ev["series_id"].isin(eval_df["series_id"].unique())
        ]

        print(event_detection_ap(train_events, eval_df.copy(), tolerances))


[MEMUSE] memory usage (in before weighted fusion ensemble): 16827.23MB (25.7%)
shape (30208, 322) (29539, 322) (33325, 322)
0.8346228992883475
[weighted fusion ensemble] 38.847sec
[MEMUSE] memory usage (in after weighted fusion ensemble): 16875.76MB (25.8%)


In [90]:
def add_many_events(pred_df, score_th=0.0, step_for_1min=None):
    """Augment predictions with copies shifted by +/-2, 4, 8 and 16 minutes.

    Use as many predictions as possible for Average Precision.

    Args:
        pred_df: Predictions with at least the columns "series_id", "event",
            "step" and "score". The frame is not modified.
        score_th: Only rows with ``score > score_th`` are shifted. The
            unshifted rows are always kept. The threshold is applied to both
            shift directions.
        step_for_1min: Number of steps in one minute. Defaults to
            ``Cfg.step_for_1min``.

    Returns:
        A new DataFrame holding the original rows (``offset_from_original``
        == 0) followed by the shifted copies, where ``offset_from_original``
        is the signed shift in minutes. At most one row per
        (series_id, event, minute) is kept.
    """
    if step_for_1min is None:
        step_for_1min = Cfg.step_for_1min

    mins_offset = [2, 4, 8, 16]

    # ``assign`` returns a new frame, so the caller's DataFrame stays untouched.
    base = pred_df.assign(offset_from_original=0)
    shiftable = base[base["score"] > score_th]

    pred_concat = [base]
    for offset in mins_offset:
        # Negative shift first, then positive: this order decides which copy
        # survives the de-duplication below.
        for signed_offset in (-offset, offset):
            pred_concat.append(
                shiftable.assign(
                    step=shiftable["step"] + step_for_1min * signed_offset,
                    offset_from_original=signed_offset,
                )
            )
    pred_concat = pd.concat(pred_concat, ignore_index=True)

    # When several rows fall into the same minute, the first one in
    # concatenation order wins: originals, then the smallest offsets. The
    # scores are not compared.
    pred_concat["step_1min"] = pred_concat["step"] // step_for_1min
    pred_concat = pred_concat.drop_duplicates(["series_id", "event", "step_1min"])
    pred_concat = pred_concat.drop(columns=["step_1min"])

    return pred_concat

def add_10dayoffset_events(pred_df, score_th=0.0, step_for_1min=None):
    """Append a low-confidence copy of every prediction shifted by +10 days.

    Args:
        pred_df: Predictions with at least the columns "series_id", "event",
            "step" and "score". The frame is not modified.
        score_th: Accepted for signature compatibility; it is not used.
        step_for_1min: Number of steps in one minute. Defaults to
            ``Cfg.step_for_1min``.

    Returns:
        A new DataFrame with the original rows (``offset_from_original`` == 0)
        followed by the shifted copies, whose score is multiplied by 0.001 and
        whose ``offset_from_original`` is the shift in minutes. At most one
        row per (series_id, event, minute) is kept.
    """
    if step_for_1min is None:
        step_for_1min = Cfg.step_for_1min

    mins_offset = [1440 * 10]

    # ``assign`` returns a new frame, so the caller's DataFrame stays untouched.
    base = pred_df.assign(offset_from_original=0)
    pred_concat = [base]
    for offset in mins_offset:
        pred_concat.append(
            base.assign(
                step=base["step"] + step_for_1min * offset,
                score=base["score"] * 0.001,
                offset_from_original=offset,
            )
        )
    pred_concat = pd.concat(pred_concat, ignore_index=True)

    # When several rows fall into the same minute, the first one in
    # concatenation order wins, so an original row beats a shifted copy.
    pred_concat["step_1min"] = pred_concat["step"] // step_for_1min
    pred_concat = pred_concat.drop_duplicates(["series_id", "event", "step_1min"])
    pred_concat = pred_concat.drop(columns=["step_1min"])
    return pred_concat


def diff_step_to_AP(diff_step):
    """Return the fraction of the ten step tolerances that ``diff_step`` is strictly below.

    Works element-wise on arrays; each result is one of 0.0, 0.1, ..., 1.0.
    """
    tolerance_steps = (12, 36, 60, 90, 120, 150, 180, 240, 300, 360)
    n_within = 0
    for tol in tolerance_steps:
        n_within = n_within + (diff_step < tol)
    return np.asarray(n_within) / 10.

    

def add_feat_third_stage(df, pred_files):
    """Attach per-step prediction values from parquet files as "*_third" columns.

    For every file, the series id is parsed from the text between the last
    "id_" and the following "_" in the path. The rows of ``df`` belonging to
    that series then receive the prediction row at their (clipped) step.
    Series without a file are absent from the result.
    """
    per_series = []
    for path in pred_files:
        preds = pd.read_parquet(path)
        third_cols = [name + "_third" for name in preds.columns]

        sid = path.rpartition("id_")[2].partition("_")[0]
        rows = df.loc[df["series_id"] == sid].copy()

        # Clamp the lookup positions into the valid row range of the file.
        lookup = rows["step"].values.astype(int).clip(0, len(preds) - 1)
        rows[third_cols] = preds.values[lookup]
        per_series.append(rows)

    return pd.concat(per_series, ignore_index=True)


def add_feat_third_stage_ensemble(df, list_pred_files, suffix="_third"):
    """Attach model-averaged per-step predictions as extra feature columns.

    Args:
        df: Frame with at least the columns "series_id" and "step".
        list_pred_files: One list of parquet paths per model. The lists are
            paired position by position, so entry ``i`` of every list must
            refer to the same series; pairing stops at the shortest list.
            The series id is parsed from the text between the last "id_" and
            the following "_" in the first model's path.
        suffix: Appended to every prediction column name.

    Returns:
        A new DataFrame with the rows of ``df`` whose series has prediction
        files, extended with the prediction columns at each row's (clipped)
        step. Columns containing "pred_switch", plus "pred_awake", are
        averaged over the models; all other columns come from the first
        model's file.

    Raises:
        ValueError: If no model is given or a model has no prediction files.
    """
    list_pred_files = [list(files) for files in list_pred_files]
    num_ensemble = len(list_pred_files)
    if num_ensemble == 0 or any(len(files) == 0 for files in list_pred_files):
        raise ValueError(
            "add_feat_third_stage_ensemble needs at least one prediction file per model"
        )

    out_df = []
    for files in zip(*list_pred_files):
        df_pred_id = pd.read_parquet(files[0])
        # Decide the averaged columns up front so that a single-model
        # ensemble (empty inner loop) also works.
        cols = [c for c in df_pred_id.columns if "pred_switch" in c] + ["pred_awake"]
        for pf in files[1:]:
            df_pred_id_1 = pd.read_parquet(pf)
            for c in cols:
                df_pred_id[c] += df_pred_id_1[c]
        for c in cols:
            df_pred_id[c] /= num_ensemble

        new_cols = [col + suffix for col in df_pred_id.columns]
        series_id = files[0].split("id_")[-1].split("_")[0]
        df_series = df[df["series_id"] == series_id].copy()
        # Clamp the lookup positions into the valid row range of the file.
        steps = df_series["step"].values.astype(int)
        steps = np.clip(steps, 0, len(df_pred_id) - 1)
        df_series[new_cols] = df_pred_id.values[steps]
        out_df.append(df_series)
    out_df = pd.concat(out_df, ignore_index=True)
    return out_df


# Build the third-stage training set: every second-stage prediction is copied
# to nearby minute offsets and each copy is labelled by how close it lands to
# a ground-truth event.
# train_events is reused from an earlier cell instead of being re-read:
# train_events = pd.read_csv(Cfg.train_target_path)

df_train_2nd = pd.read_feather(path_2nd_pred_wbf)

df_train_third = []
for (series_id, event), _df_train in df_train_2nd.groupby(["series_id", "event"]):
    if len(_df_train) == 0:
        continue
    # Ground-truth events of this series / event type that have a step.
    train_event_series = train_events.loc[np.logical_and((train_events["series_id"] == series_id), (train_events["event"] == event))]
    train_event_series = train_event_series[~np.isnan(train_event_series['step'])]
    df_third_series = _df_train.copy()
    df_third_series = add_many_events(df_third_series)
    steps = df_third_series["step"].values
    if len(train_event_series) == 0:
        # Nothing to hit in this series: every candidate scores 0.
        df_third_series["next_best_ap"] = 0
    else:
        target_step = train_event_series["step"].values
        # Distance from each candidate to its nearest true event, converted to
        # the fraction of tolerances it satisfies.
        df_third_series["next_best_ap"] = diff_step_to_AP(np.abs(steps.reshape(-1, 1) - target_step.reshape(1, -1)).min(axis=1))
    df_train_third.append(df_third_series)
df_train_third = pd.concat(df_train_third, ignore_index=True)
print((df_train_third["next_best_ap"] > df_train_third["best_ap_in_a_day"]).sum(), " / ", len(df_train_third), " is better than original. target of third stage")

# Positive target: the candidate strictly beats "best_ap_in_a_day" AND is the
# best candidate of its (series_id, night, event) group.
df_train_third["next_best_ap_in_a_day"] = df_train_third.groupby(["series_id", "night", "event"])["next_best_ap"].transform("max")
df_train_third["third_target"] = np.logical_and(df_train_third["next_best_ap"] > df_train_third["best_ap_in_a_day"], df_train_third["next_best_ap_in_a_day"]==df_train_third["next_best_ap"]).astype(int) # Alternative idea: walk through the offsets starting from the nearest one and keep updating the best AP.
# df_train_third["third_target"] = (df_train_third["next_best_ap"] > df_train_third["best_ap_in_a_day"]).astype(int) # Alternative idea: walk through the offsets starting from the nearest one and keep updating the best AP.
print("pos_samples", df_train_third["third_target"].sum(), " / ", len(df_train_third))
# Weight 2 for candidates that improve the AP by at least 0.5, otherwise 1.
# NOTE(review): computed before the offset == 0 rows are dropped on the next
# line, so it still contains entries for rows that are removed afterwards -
# re-align it before passing it to training as a sample weight.
sample_weight_third = ((df_train_third["next_best_ap"] - df_train_third["best_ap_in_a_day"]) >= 0.5) + 1
# Only the shifted copies are used for training; the unshifted rows are dropped.
df_train_third = df_train_third[df_train_third["offset_from_original"] != 0]
df_train_third.drop(columns=["next_best_ap_in_a_day"], inplace=True)


print("adding feature from the output of 1dcnn")
num_folds = 2 if Cfg.IS_DEBUG=="True" else 5

# Gather the per-series prediction files of the four 1D-CNN runs, fold by fold.
pred_files_A0, pred_files_A1, pred_files_B0, pred_files_B1 = [], [], [], []
for i in range(5):
    pred_files_A0.extend(glob.glob(os.path.join(Cfg.weight_dir_1dcnn, f"exp00_run_00_SplitStem{num_folds}foldSEED42controledStride_fold{i}/pred090/*.parquet")))
    pred_files_A1.extend(glob.glob(os.path.join(Cfg.weight_dir_1dcnn, f"exp00_run_00_SplitStem{num_folds}foldSEED111controledStride_fold{i}/pred090/*.parquet")))
    pred_files_B0.extend(glob.glob(os.path.join(Cfg.weight_dir_1dcnn, f"exp00_run_01_SplitStem{num_folds}foldSEED42normal_fold{i}/pred090/*.parquet")))
    pred_files_B1.extend(glob.glob(os.path.join(Cfg.weight_dir_1dcnn, f"exp00_run_01_SplitStem{num_folds}foldSEED111normal_fold{i}/pred090/*.parquet")))

# Order every list by file name so that position k refers to the same file
# name in all four runs.
pred_A0_base = list(map(os.path.basename, pred_files_A0))
pred_A1_base = list(map(os.path.basename, pred_files_A1))
pred_B0_base = list(map(os.path.basename, pred_files_B0))
pred_B1_base = list(map(os.path.basename, pred_files_B1))

pred_files_A0 = [pred_files_A0[pos] for pos in np.argsort(pred_A0_base)]
pred_files_A1 = [pred_files_A1[pos] for pos in np.argsort(pred_A1_base)]
pred_files_B0 = [pred_files_B0[pos] for pos in np.argsort(pred_B0_base)]
pred_files_B1 = [pred_files_B1[pos] for pos in np.argsort(pred_B1_base)]

df_train_third_f = add_feat_third_stage_ensemble(
    df_train_third,
    [pred_files_A0, pred_files_A1, pred_files_B0, pred_files_B1],
    suffix="_third",
)

8436  /  281351  is better than original.
pos_samples 2973  /  281351


In [99]:
# Quick look at the target-related columns before training the third stage.
display(df_train_third[["next_best_ap", "score", "best_ap", "min_diff_step", "best_ap_in_a_day"]].head())

# Columns excluded from the feature matrix ("event" and "night" stay features).
non_feature_columns = [
    "step",
    "peak_indices",
    "best_ap_in_a_day",
    "is_best_of_best",
    "best_ap_until",
    "best_ap_until_binary",
    "offset_step",
    "best_ap",
    "series_id",
    "min_diff_step",
    "third_target",
    "next_best_ap",
]  # + [col for col in df_train_third_f.columns if "_third" in col]

target_column = "third_target"
num_boost_round = 300
lgb_params = dict(
    objective="binary",
    metric="auc",
    max_depth=10,
    num_leaves=103,
    feature_fraction=0.38647422969488445,
    min_child_samples=10,
    min_child_weight=29,
    subsample_for_bin=43949,
    verbose=-1,
    learning_rate=0.015,
)
# Sample weighting is disabled for the third stage.
sample_weight_third = None
print("---train third stage---")
with timer("train_cv"):
    cvbooster, oof_third, encoder = train_cv(
        df_train_third_f,
        non_feature_columns,
        target_column,
        weight=None,
        lgb_params=lgb_params,
        num_boost_round=num_boost_round,
    )
    np.save(f"{save_dir_model}oof_third.npy", oof_third)
    LGBMSerializer(cvbooster, encoder).to_file(f"{save_dir_model}lgb_third")
    
# [1000]	cv_agg's auc: 0.83465 + 0.0158637


Unnamed: 0,next_best_ap,score,best_ap,min_diff_step,best_ap_in_a_day
69,0.9,0.701376,1.0,1.0,1.0
70,0.9,0.699377,1.0,1.0,1.0
71,0.9,0.69215,1.0,2.0,1.0
72,0.9,0.693019,1.0,1.0,1.0
73,0.9,0.683404,1.0,1.0,1.0


[MEMUSE] memory usage (in before train_cv): 16786.48MB (25.7%)
[MEMUSE] memory usage (in before make dataset): 16791.98MB (25.7%)
(248026, 325)
features: ['daily_step', 'event', 'score', 'score_10p', 'score_10', 'score_8', 'score_6', 'score_4', 'score_2', 'pred_nan', 'pred_nan_counter', 'pred_nan_span', 'pred_awake', 'before_states', 'after_states', 'before_states_feat_12', 'after_states_feat_12', 'before_nan_feat_12', 'after_nan_feat_12', 'befaf_pred_switch_6', 'befaf_pred_switch10p_6', 'befaf_pred_switch10_6', 'befaf_pred_switch8_6', 'befaf_pred_switch6_6', 'befaf_pred_switch4_6', 'befaf_pred_switch2_6', 'before_states_feat_24', 'after_states_feat_24', 'before_nan_feat_24', 'after_nan_feat_24', 'befaf_pred_switch_12', 'befaf_pred_switch10p_12', 'befaf_pred_switch10_12', 'befaf_pred_switch8_12', 'befaf_pred_switch6_12', 'befaf_pred_switch4_12', 'befaf_pred_switch2_12', 'before_states_feat_60', 'after_states_feat_60', 'before_nan_feat_60', 'after_nan_feat_60', 'befaf_pred_switch_30', '

In [77]:

def add_large_offset_events(pred_df, step_for_1min=None):  # 0.843046614024095 1 day
    """Append low-confidence copies of every prediction shifted by +/-1 hour and +/-1 day.

    Args:
        pred_df: Predictions with at least the columns "series_id", "event",
            "step" and "score". The frame is not modified.
        step_for_1min: Number of steps in one minute. Defaults to
            ``Cfg.step_for_1min``.

    Returns:
        A new DataFrame with the original rows followed by the shifted copies,
        whose score is multiplied by 0.0001. At most one row per
        (series_id, event, minute) is kept. Shifted steps may be negative;
        filtering them is left to the caller.
    """
    if step_for_1min is None:
        step_for_1min = Cfg.step_for_1min

    mins_offset = [-1440, -60, 60, 1440]

    # ``assign`` returns a new frame, so the caller's DataFrame stays untouched.
    base = pred_df.assign(offset_from_original=0)
    pred_concat = [base]
    for offset in mins_offset:
        pred_concat.append(
            base.assign(
                step=base["step"] + step_for_1min * offset,
                score=base["score"] * 0.0001,
                offset_from_original=offset,
            )
        )
    pred_concat = pd.concat(pred_concat, ignore_index=True)

    # When several rows fall into the same minute, the first one in
    # concatenation order wins, so an original row beats a shifted copy.
    pred_concat["step_1min"] = pred_concat["step"] // step_for_1min
    pred_concat = pred_concat.drop_duplicates(["series_id", "event", "step_1min"])
    pred_concat = pred_concat.drop(columns=["step_1min", "offset_from_original"])
    return pred_concat

print("---after 3rd stage, add large offset events---")

# Third-stage candidates, scored by the down-weighted OOF prediction.
eval_df_third = df_train_third_f[["series_id", "step", "event", "score"]].assign(
    score=oof_third * 0.35  # 0.8433218836176455
)

# Second-stage predictions plus their large-offset copies.
eval_df_second = df_train_2nd[["series_id", "step", "event", "score"]].copy()
eval_df_second_plus = add_large_offset_events(eval_df_second) # 0.8434865485966365

# Merge both sets, keep only rows whose step is strictly positive, and order
# them by series and step for the metric.
eval_df = (
    pd.concat([eval_df_second_plus, eval_df_third], ignore_index=True)
    .loc[lambda frame: frame["step"] > 0]
    .sort_values(["series_id", "step"])
)
print(eval_df.shape)
print(event_detection_ap(train_events, eval_df, tolerances))



(343820, 4)
0.8438249831768274
