In [1]:
!pip show torchvision

Name: torchvision
Version: 0.15.1+cpu
Summary: image and video datasets and models for torch deep learning
Home-page: https://github.com/pytorch/vision
Author: PyTorch Core Team
Author-email: soumith@pytorch.org
License: BSD
Location: /opt/conda/lib/python3.10/site-packages
Requires: numpy, pillow, requests, torch
Required-by: easyocr, fastai, timm


In [2]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
from datetime import datetime, timezone, timedelta
import matplotlib.pyplot as plt
import matplotlib
matplotlib.use("Agg")
import torch
import torchvision
from torchvision.io import read_image
from torchvision.transforms import v2 as T
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor



In [3]:
import numpy as np
import pandas as pd
import pandas.api.types
from typing import Dict, List, Tuple

# Matching tolerances per event class, expressed in units of the time column
# (`step`, see `column_names` below). Passed to `score` as `tolerances`.
tolerances = {
    'onset': [12, 36, 60, 90, 120, 150, 180, 240, 300, 360], 
    'wakeup': [12, 36, 60, 90, 120, 150, 180, 240, 300, 360]
}

# Keyword arguments unpacked into `score(...)` to tell the metric which
# DataFrame columns hold the series id, event time, event class and confidence.
column_names = {
    'series_id_column_name': 'series_id',
    'time_column_name': 'step',
    'event_column_name': 'event',
    'score_column_name': 'score',
}

In [4]:
class ParticipantVisibleError(Exception):
    """Raised by `score` when the submission itself is malformed."""


def score(
        solution: pd.DataFrame,
        submission: pd.DataFrame,
        tolerances: Dict[str, List[float]],
        series_id_column_name: str,
        time_column_name: str,
        event_column_name: str,
        score_column_name: str,
        use_scoring_intervals: bool = False,
) -> float:
    """Event Detection Average Precision, an AUCPR metric for event detection in
    time series and video.

    This metric is similar to IOU-threshold average precision metrics commonly
    used in object detection. For events occurring in time series, we replace the
    IOU threshold with a time tolerance.

    Submissions are evaluated on the average precision of detected events,
    averaged over timestamp error tolerance thresholds, averaged over event
    classes.

    Detections are matched to ground-truth events within error tolerances, with
    ambiguities resolved in order of decreasing confidence.

    Detailed Description
    --------------------
    Evaluation proceeds in four steps:

    1. Selection - (optional) Predictions not within a series' scoring
    intervals are dropped.
    2. Assignment - Predicted events are matched with ground-truth events.
    3. Scoring - Each group of predictions is scored against its corresponding
    group of ground-truth events via Average Precision.
    4. Reduction - The multiple AP scores are averaged to produce a single
    overall score.

    Selection

    With each series there may be a defined set of scoring intervals giving the
    intervals of time over which zero or more ground-truth events might be
    annotated in that series. A prediction will be evaluated only if it falls
    within a scoring interval. These scoring intervals can be chosen to improve
    the fairness of evaluation by, for instance, ignoring edge-cases or
    ambiguous events.

    It is recommended that, if used, scoring intervals be provided for training
    data but not test data.

    Assignment

    For each set of predictions and ground-truths within the same `event x
    tolerance x series_id` group, we match each ground-truth to the
    highest-confidence unmatched prediction occurring within the allowed
    tolerance.

    Some ground-truths may not be matched to a prediction and some predictions
    may not be matched to a ground-truth. They will still be accounted for in
    the scoring, however.

    Scoring

    Collecting the events within each `series_id`, we compute an Average
    Precision score for each `event x tolerance` group. The average precision
    score is the area under the (step-wise) precision-recall curve generated by
    decreasing confidence score thresholds over the predictions. In this
    calculation, matched predictions over the threshold are scored as TP and
    unmatched predictions as FP. Unmatched ground-truths are scored as FN.

    Reduction

    The final score is the average of the above AP scores, first averaged over
    tolerance, then over event.

    Parameters
    ----------
    solution : pd.DataFrame, with columns:

        `series_id_column_name` identifier for each time series

        `time_column_name` the time of occurrence for each event as a numeric type

        `event_column_name` class label for each event

        The solution contains the time of occurrence of one or more types of
        event within one or more time series. The metric expects the solution to
        contain the same event types as those given in `tolerances`.

        When `use_scoring_intervals == True`, you may include `start` and `end`
        events to delimit intervals within which detections will be scored.
        Detected events (from the user submission) outside of these events will
        be ignored.

    submission : pd.DataFrame, with columns as above and in addition:

        `score_column_name` the predicted confidence score for the detected event

    tolerances : Dict[str, List[float]]

        Maps each event class to a list of timestamp tolerances used
        for matching detections to ground-truth events.

    use_scoring_intervals: bool, default False

        Whether to ignore predicted events outside intervals delimited
        by `'start'` and `'end'` events in the solution. When `False`,
        the solution should not include `'start'` and `'end'` events.
        See the examples for illustration.

    Returns
    -------
    event_detection_ap : float
        The mean average precision of the detected events.

    Raises
    ------
    AssertionError
        If `tolerances` is empty, does not list exactly the event classes
        present in the solution, or the solution time column is not numeric.
    ParticipantVisibleError
        If the submission lacks a required column, or its time or score
        column is not numeric.

    Examples
    --------
    Detecting `'pass'` events in football:
    >>> column_names = {
    ...     'series_id_column_name': 'video_id',
    ...     'time_column_name': 'time',
    ...     'event_column_name': 'event',
    ...     'score_column_name': 'score',
    ... }
    >>> tolerances = {'pass': [1.0]}
    >>> solution = pd.DataFrame({
    ...     'video_id': ['a', 'a'],
    ...     'event': ['pass', 'pass'],
    ...     'time': [0, 15],
    ... })
    >>> submission = pd.DataFrame({
    ...     'video_id': ['a', 'a', 'a'],
    ...     'event': ['pass', 'pass', 'pass'],
    ...     'score': [1.0, 0.5, 1.0],
    ...     'time': [0, 10, 14.5],
    ... })
    >>> score(solution, submission, tolerances, **column_names)
    1.0

    Increasing the confidence score of the false detection above the true
    detections decreases the AP.
    >>> submission.loc[1, 'score'] = 1.5
    >>> score(solution, submission, tolerances, **column_names)
    0.6666666666666666...

    Likewise, decreasing the confidence score of a true detection below the
    false detection also decreases the AP.
    >>> submission.loc[1, 'score'] = 0.5  # reset
    >>> submission.loc[0, 'score'] = 0.0
    >>> score(solution, submission, tolerances, **column_names)
    0.8333333333333333...

    We average AP scores over tolerances. Previously, the detection at 14.5
    would match, but adding smaller tolerances gives AP scores where it does
    not match. This results in both a FN, since the ground-truth wasn't
    detected, and a FP, since the detected event matches no ground-truth.
    >>> tolerances = {'pass': [0.1, 0.2, 1.0]}
    >>> score(solution, submission, tolerances, **column_names)
    0.3888888888888888...

    We also average over time series and over event classes.
    >>> tolerances = {'pass': [0.5, 1.0], 'challenge': [0.25, 0.50]}
    >>> solution = pd.DataFrame({
    ...     'video_id': ['a', 'a', 'b'],
    ...     'event': ['pass', 'challenge', 'pass'],
    ...     'time': [0, 15, 0],  # restart time for new time series b
    ... })
    >>> submission = pd.DataFrame({
    ...     'video_id': ['a', 'a', 'b'],
    ...     'event': ['pass', 'challenge', 'pass'],
    ...     'score': [1.0, 0.5, 1.0],
    ...     'time': [0, 15, 0],
    ... })
    >>> score(solution, submission, tolerances, **column_names)
    1.0

    By adding scoring intervals to the solution, we may choose to ignore
    detections outside of those intervals.
    >>> tolerances = {'pass': [1.0]}
    >>> solution = pd.DataFrame({
    ...     'video_id': ['a', 'a', 'a', 'a'],
    ...     'event': ['start', 'pass', 'pass', 'end'],
    ...     'time': [0, 10, 20, 30],
    ... })
    >>> submission = pd.DataFrame({
    ...     'video_id': ['a', 'a', 'a'],
    ...     'event': ['pass', 'pass', 'pass'],
    ...     'score': [1.0, 1.0, 1.0],
    ...     'time': [10, 20, 40],
    ... })
    >>> score(solution, submission, tolerances, **column_names, use_scoring_intervals=True)
    1.0

    """
    # Validate metric parameters
    assert len(tolerances) > 0, "Events must have defined tolerances."
    assert set(tolerances.keys()) == set(solution[event_column_name]).difference({'start', 'end'}), (
        f"Solution column {event_column_name} must contain the same events "
        "as defined in tolerances."
    )
    assert pd.api.types.is_numeric_dtype(solution[time_column_name]), (
        f"Solution column {time_column_name} must be of numeric type."
    )

    # Validate submission format
    for column_name in [
        series_id_column_name,
        time_column_name,
        event_column_name,
        score_column_name,
    ]:
        if column_name not in submission.columns:
            # The message must name the loop variable; the previous code
            # referenced an undefined name and crashed with NameError instead.
            raise ParticipantVisibleError(f"Submission must have column '{column_name}'.")

    if not pd.api.types.is_numeric_dtype(submission[time_column_name]):
        raise ParticipantVisibleError(
            f"Submission column '{time_column_name}' must be of numeric type."
        )
    if not pd.api.types.is_numeric_dtype(submission[score_column_name]):
        raise ParticipantVisibleError(
            f"Submission column '{score_column_name}' must be of numeric type."
        )

    # Set these globally to avoid passing around a bunch of arguments.
    # The helper functions in this module read these names as globals.
    globals()['series_id_column_name'] = series_id_column_name
    globals()['time_column_name'] = time_column_name
    globals()['event_column_name'] = event_column_name
    globals()['score_column_name'] = score_column_name
    globals()['use_scoring_intervals'] = use_scoring_intervals

    return event_detection_ap(solution, submission, tolerances)


def filter_detections(
        detections: pd.DataFrame, intervals: pd.DataFrame
) -> pd.DataFrame:
    """Drop detections not inside a scoring interval.

    Parameters
    ----------
    detections : pd.DataFrame
        Detections of a single series; the event time is read from the
        module-level global `time_column_name`.
    intervals : sequence of pd.Interval
        Scoring intervals of the same series. Each element must expose
        `.left` and support `in`. The sweep below presumably expects them
        in ascending, non-overlapping order -- TODO confirm with the caller.

    Returns
    -------
    pd.DataFrame
        The rows of `detections` whose time lies inside some interval, in their
        original relative order, with a fresh 0..n-1 index.
    """
    detection_time = detections.loc[:, time_column_name].to_numpy()
    # Walk the detections chronologically but remember each one's row position.
    # The previous implementation built the mask over the *sorted* times and
    # applied it to the unsorted frame, keeping the wrong rows whenever the
    # input was not already ordered by time.
    chronological = np.argsort(detection_time, kind='stable')
    intervals = intervals.to_numpy()
    is_scored = np.zeros(len(detection_time), dtype=bool)

    i, j = 0, 0
    while i < len(chronological) and j < len(intervals):
        row = chronological[i]
        time = detection_time[row]
        int_ = intervals[j]

        # If the detection is prior in time to the interval, go to the next detection.
        if time < int_.left:
            i += 1
        # If the detection is inside the interval, keep it and go to the next detection.
        elif time in int_:
            is_scored[row] = True
            i += 1
        # If the detection is later in time, go to the next interval.
        else:
            j += 1

    return detections.loc[is_scored].reset_index(drop=True)


def match_detections(
        tolerance: float, ground_truths: pd.DataFrame, detections: pd.DataFrame
) -> pd.DataFrame:
    """Greedily pair detections with ground-truth events.

    Both frames belong to one `event x tolerance x series_id` evaluation group.
    Detections are visited from highest to lowest score (rows containing NaN are
    discarded first); each one claims the closest still-unclaimed ground truth
    whose time error is strictly below `tolerance`.

    Returns the score-sorted detections with a boolean `matched` column.
    Column names are read from the module-level globals `score_column_name`
    and `time_column_name`.
    """
    ranked = detections.sort_values(score_column_name, ascending=False).dropna()
    truth_rows = list(ground_truths.itertuples(index=False))

    claimed = set()
    flags = []
    for det in ranked.itertuples(index=False):
        det_time = getattr(det, time_column_name)
        closest, closest_error = None, tolerance

        for gt in truth_rows:
            error = abs(det_time - getattr(gt, time_column_name))
            if error < closest_error and gt not in claimed:
                closest, closest_error = gt, error

        found = closest is not None
        if found:
            claimed.add(closest)
        flags.append(found)

    ranked['matched'] = np.array(flags, dtype=bool)

    return ranked


def precision_recall_curve(
        matches: np.ndarray, scores: np.ndarray, p: int
) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Build the precision-recall curve of a set of scored detections.

    `matches` flags which detections were paired with a ground truth, `scores`
    holds their confidences and `p` is the number of ground-truth events (used
    as the recall denominator; it may exceed the number of matches).

    Returns `(precision, recall, thresholds)` ordered so that recall is
    non-increasing, ending with the conventional point precision=1, recall=0.
    With no detections the plain lists `[1], [0], []` are returned.
    """
    if len(matches) == 0:
        return [1], [0], []

    # Rank detections from most to least confident.
    order = np.argsort(scores, kind='stable')[::-1]
    ranked_scores = scores[order]
    ranked_matches = matches[order]

    # One curve point per distinct score value, plus the final detection.
    change_points = np.flatnonzero(np.diff(ranked_scores))
    cut_idxs = np.append(change_points, ranked_matches.size - 1)
    thresholds = ranked_scores[cut_idxs]

    # Lowering the threshold turns matches into TPs and non-matches into FPs.
    tps = np.cumsum(ranked_matches)[cut_idxs]
    fps = np.cumsum(~ranked_matches)[cut_idxs]

    precision = tps / (tps + fps)
    precision[np.isnan(precision)] = 0
    recall = tps / p

    # Truncate at the first point reaching the final TP count, then reverse.
    stop = tps.searchsorted(tps[-1]) + 1
    precision_rev = precision[:stop][::-1]
    recall_rev = recall[:stop][::-1]
    thresholds_rev = thresholds[:stop][::-1]

    return np.r_[precision_rev, 1], np.r_[recall_rev, 0], thresholds_rev


def average_precision_score(matches: np.ndarray, scores: np.ndarray, p: int) -> float:
    """Area under the step-wise precision-recall curve of the given detections."""
    precision, recall, _ = precision_recall_curve(matches, scores, p)
    # Recall is non-increasing, so its differences are <= 0; negate the sum of
    # (recall step) x (precision at that step) to obtain the step integral.
    recall_steps = np.diff(recall)
    step_heights = np.array(precision)[:-1]
    return -np.sum(recall_steps * step_heights)


def event_detection_ap(
        solution: pd.DataFrame,
        submission: pd.DataFrame,
        tolerances: Dict[str, List[float]],
) -> float:
    """Compute the event-detection average precision of `submission`.

    Column names and the `use_scoring_intervals` switch are read from the
    module-level globals that `score` sets before calling this function.

    Parameters
    ----------
    solution : pd.DataFrame
        Ground-truth events, optionally with `'start'`/`'end'` rows delimiting
        scoring intervals.
    submission : pd.DataFrame
        Detected events with a confidence score.
    tolerances : Dict[str, List[float]]
        Matching tolerances per event class.

    Returns
    -------
    float
        AP averaged over tolerances, then over event classes.
    """
    interval_markers = ['start', 'end']

    # Ensure solution and submission are sorted properly
    solution = solution.sort_values([series_id_column_name, time_column_name])
    submission = submission.sort_values([series_id_column_name, time_column_name])

    # Rows that delimit scoring intervals rather than describe real events.
    # Selected through `event_column_name` so the metric also works when the
    # event column is not literally called "event".
    is_marker = solution[event_column_name].isin(interval_markers)

    # Extract scoring intervals.
    if use_scoring_intervals:
        intervals = (
            solution
            .loc[is_marker]
            .assign(interval=lambda x: x.groupby([series_id_column_name, event_column_name]).cumcount())
            .pivot(
                index='interval',
                columns=[series_id_column_name, event_column_name],
                values=time_column_name,
            )
            .stack(series_id_column_name)
            .swaplevel()
            .sort_index()
            .loc[:, ['start', 'end']]
            .apply(lambda x: pd.Interval(*x, closed='both'), axis=1)
        )

    # Extract ground-truth events.
    ground_truths = (
        solution
        .loc[~is_marker]
        .reset_index(drop=True)
    )

    # Map each event class to its prevalence (needed for recall calculation)
    class_counts = ground_truths.value_counts(event_column_name).to_dict()

    # Create table for detections with a column indicating a match to a ground-truth event
    detections = submission.assign(matched = False)

    # Remove detections outside of scoring intervals
    if use_scoring_intervals:
        detections_filtered = []
        # Both groupbys iterate in sorted key order; the assert guards the
        # assumption that every series has both detections and intervals.
        for (det_group, dets), (int_group, ints) in zip(
            detections.groupby(series_id_column_name), intervals.groupby(series_id_column_name)
        ):
            assert det_group == int_group
            detections_filtered.append(filter_detections(dets, ints))
        detections_filtered = pd.concat(detections_filtered, ignore_index=True)
    else:
        detections_filtered = detections

    # Create table of event-class x tolerance x series_id values
    aggregation_keys = pd.DataFrame(
        [(ev, tol, vid)
         for ev in tolerances.keys()
         for tol in tolerances[ev]
         for vid in ground_truths[series_id_column_name].unique()],
        columns=[event_column_name, 'tolerance', series_id_column_name],
    )

    # Create match evaluation groups: event-class x tolerance x series_id
    detections_grouped = (
        aggregation_keys
        .merge(detections_filtered, on=[event_column_name, series_id_column_name], how='left')
        .groupby([event_column_name, 'tolerance', series_id_column_name])
    )
    ground_truths_grouped = (
        aggregation_keys
        .merge(ground_truths, on=[event_column_name, series_id_column_name], how='left')
        .groupby([event_column_name, 'tolerance', series_id_column_name])
    )
    # Match detections to ground truth events by evaluation group
    detections_matched = []
    for key in aggregation_keys.itertuples(index=False):
        dets = detections_grouped.get_group(key)
        gts = ground_truths_grouped.get_group(key)
        detections_matched.append(
            match_detections(dets['tolerance'].iloc[0], gts, dets)
        )
    detections_matched = pd.concat(detections_matched)

    # Compute AP per event x tolerance group
    event_classes = ground_truths[event_column_name].unique()
    ap_table = (
        detections_matched
        .loc[detections_matched[event_column_name].isin(event_classes)]
        .groupby([event_column_name, 'tolerance']).apply(
            lambda group: average_precision_score(
                group['matched'].to_numpy(),
                group[score_column_name].to_numpy(),
                class_counts[group[event_column_name].iat[0]],
            )
        )
    )
    # Average over tolerances, then over event classes.
    # Dividing by len(event_classes) (rather than taking a mean) counts classes
    # that produced no AP row as zero.
    mean_ap = ap_table.groupby(event_column_name).mean().sum() / len(event_classes)

    return mean_ap

# Object detection data prep

In [5]:
# Parquet file from which the series identifiers are collected.
file = '/kaggle/input/gamma-train-series-updated-11-11-2023/train_series_10112023.parquet'

# Development switch: when True, only every 10th series is kept for quick runs.
DEV = False

# Read just the id column and reduce it to the distinct series identifiers.
series_id = pd.read_parquet(file, columns=['series_id'])['series_id'].unique()

print(len(series_id))

series_id = series_id[::10] if DEV else series_id

269


In [6]:
from sklearn.model_selection import train_test_split

In [7]:
# First split: hold out 25% of the series ids; the remaining 75% are the
# training ids. The fixed random_state makes the shuffle reproducible.
series_id_train, series_id_test = train_test_split(series_id, 
                                   random_state=1234,  
                                   test_size=0.25,  
                                   shuffle=True)

# Second split: divide the held-out ids in half into validation and test ids.
# Note that `series_id_test` is rebound here to the final (smaller) test set.
series_id_val, series_id_test = train_test_split(series_id_test, 
                                   random_state=1234,  
                                   test_size=0.50,  
                                   shuffle=True)

In [8]:
# Folder that receives the rendered per-window images.
working_folder = Path("/kaggle/working/")
images_folder = working_folder/"images"
# exist_ok/parents keep the cell idempotent: a bare mkdir() raises
# FileExistsError when the cell is executed a second time.
images_folder.mkdir(parents=True, exist_ok=True)

In [9]:
data_folder = Path("/kaggle/input/child-mind-institute-detect-sleep-states")
!ls {data_folder}

sample_submission.csv  train_events.csv
test_series.parquet    train_series.parquet


In [10]:
#test_series = pd.read_parquet(data_folder/"test_series.parquet")
#test_series

In [11]:
#test_series['large_enmo'] = test_series['enmo'] > 0.1506

In [12]:
#series_ids = test_series['series_id'].unique()
series_ids = series_id_val#[0:5]
len(series_ids)

34

In [13]:
def feat_eng(df):
    """Add time and movement features to one accelerometer series.

    The frame is modified in place and also returned. It must provide the
    columns `series_id`, `timestamp` and `anglez`.

    Added columns:
      * `timestamp_2` - `timestamp` parsed and stripped of its UTC offset
        (wall-clock time is kept);
      * `hour` - hour of `timestamp_2`;
      * `anglezdiffabs` - absolute first difference of `anglez` (float32);
      * `anglezdiffabs_median_60` - centred rolling median of `anglezdiffabs`
        over a 65-second window requiring at least 10 observations.

    Rows are sorted by `timestamp_2`, and every row containing a NaN in any
    column is dropped -- this removes the rows at both ends of the series
    where the rolling window is not sufficiently populated.
    """
    # Imported locally so the function does not depend on a notebook cell that
    # imports `gc` later than this definition.
    import gc

    df['series_id'] = df['series_id'].astype('category')
    df['timestamp_2'] = pd.to_datetime(df['timestamp']).apply(lambda t: t.tz_localize(None))
    df['hour'] = df["timestamp_2"].dt.hour

    # The time-based rolling window below needs a sorted datetime index.
    df.sort_values(['timestamp_2'], inplace=True)
    df.set_index('timestamp_2', inplace=True)

    df["anglez"] = df["anglez"].astype(np.float32)
    df["anglezdiffabs"] = df["anglez"].diff().abs().astype(np.float32)

    for col in ['anglezdiffabs']:

        # periods in seconds
        periods = [60]

        for n in periods:

            rol_args = {'window':f'{n+5}s', 'min_periods':10, 'center':True}

            for agg in ['median']:
                df[f'{col}_{agg}_{n}'] = df[col].rolling(**rol_args).agg(agg).astype(np.float32).values

    # One collection after all rolling temporaries are released is enough.
    gc.collect()

    df.reset_index(inplace=True)
    df.dropna(inplace=True)

    return df

In [14]:
def feat_eng_by_id(idx):
    """Read the rows of series `idx` from the training parquet and return them
    after feature engineering with `feat_eng`."""
    from warnings import simplefilter

    # Silence pandas PerformanceWarning for the duration of the session.
    simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

    # Push the series filter down to the parquet reader so only this series is loaded.
    only_this_series = [('series_id','=',idx)]
    raw = pd.read_parquet(
        '/kaggle/input/child-mind-institute-detect-sleep-states/train_series.parquet',
        filters=only_this_series,
    )

    return feat_eng(raw)

In [15]:
from tqdm.auto import tqdm 
from joblib import Parallel, delayed
from time import sleep, time
from multiprocessing import cpu_count
import gc

In [16]:
%%time

import joblib

print("Number of jobs: ",int(cpu_count()))

#train_df = []

# One record per rendered image: which series it belongs to, its position in
# the series and how many 5-second steps it (and all windows up to it) spans.
window_properties = []

for idx in tqdm(series_ids): 

    test_series = feat_eng_by_id(idx)
    test_series['large_enmo'] = test_series['enmo'] > 0.1509000062942505

    # Min-max scale the rolling median to [0, 1] for plotting. The Series
    # reductions are vectorised; Python's built-in min()/max() would iterate
    # over every row of the series one element at a time.
    median_60 = test_series['anglezdiffabs_median_60']
    median_60_min = median_60.min()
    test_series['anglezdiffabs_median_60_norm'] = (median_60 - median_60_min) / (median_60.max() - median_60_min)
    
    series = test_series.reset_index(drop=True)
    series['color'] = np.where(series['large_enmo'], "blue", "green")
    series['timestamp'] = pd.to_datetime(series['timestamp'])
    # Vectorised conversion of the offset-aware timestamps to UTC.
    series['timestamp_utc'] = pd.to_datetime(series['timestamp'], utc=True)
    series['anglez_radians'] = (np.pi / 180) * series['anglez']
    series['cos_anglez'] = np.cos(series['anglez_radians'])
    series['enmo'] = np.clip(series['enmo'], 0, 1)

    # Slice the series into 24-hour windows ending at 20:30 UTC each day,
    # starting with the window that ends on the first recorded UTC date.
    min_date_utc = series['timestamp_utc'].dt.date.min()
    last_timestamp_utc = series['timestamp_utc'].max()
    one_day = timedelta(hours=24)
    series_24_hour_windows = {}
    upper_bound = datetime(year=min_date_utc.year, month=min_date_utc.month, day=min_date_utc.day, hour=20, minute=30, tzinfo=timezone.utc)
    lower_bound = upper_bound - one_day # 8:30pm UTC on the previous day.
    while lower_bound < last_timestamp_utc:
        in_window = (series['timestamp_utc'] >= lower_bound) & (series['timestamp_utc'] < upper_bound)
        window_df = series.loc[in_window].reset_index(drop=True)
        if len(window_df) > 0:
            # Key is the window end as ISO text without the "+00:00" suffix.
            series_24_hour_windows[upper_bound.isoformat()[:-6]] = window_df
        upper_bound += one_day
        lower_bound += one_day
    
    num_steps_cumulative = 0
    for window_idx, (window, window_df) in enumerate(series_24_hour_windows.items()):
        image_name = f"{idx}_{window}.jpg"

        fig, ax = plt.subplots(figsize=(14.4, 4))  # (width, height) in inches
        # Alternative signal kept for reference: window_df['cos_anglez'].
        ax.plot(window_df['timestamp_utc'],
                window_df['anglezdiffabs_median_60_norm'],
                color="red")
        ax.scatter(
            window_df['timestamp_utc'],
            window_df['enmo'],
            color=window_df['color'],
            s=1
        )
        # Strip every decoration so the image contains only the data, edge to edge.
        for side in ('top', 'right', 'bottom', 'left'):
            ax.spines[side].set_visible(False)
        ax.set_xticks([])
        ax.set_yticks([])
        ax.margins(0, 0)
        fig.subplots_adjust(left=0, right=1, top=1, bottom=0)
        fig.savefig(images_folder/image_name, bbox_inches="tight", pad_inches=0)
        # Release the figure explicitly; hundreds are created in this loop.
        plt.close(fig)

        min_ts_window = window_df['timestamp_utc'].min()
        max_ts_window = window_df['timestamp_utc'].max()
        # Steps are 5 seconds apart; +1 counts both end points.
        num_steps_window = (max_ts_window - min_ts_window).total_seconds() / 5 + 1
        num_steps_cumulative += num_steps_window
        window_properties.append({
            'series_id': idx, 
            'image_name': image_name, 
            'idx_in_series': window_idx, 
            'num_steps_window': num_steps_window, 
            'num_steps_cumulative': num_steps_cumulative
        })

    #test = feat_eng_by_id(idx)

    #train_df.append(test)
    
#train = pd.concat(train_df, ignore_index=True).reset_index(names='new_row_id')

#len(train)

Number of jobs:  4


  0%|          | 0/34 [00:00<?, ?it/s]

CPU times: user 55min 8s, sys: 3min 27s, total: 58min 36s
Wall time: 43min 48s


In [17]:
window_properties_df = pd.DataFrame(window_properties)
window_properties_df

Unnamed: 0,series_id,image_name,idx_in_series,num_steps_window,num_steps_cumulative
0,b737f8c78ec5,b737f8c78ec5_2018-03-03T20:30:00.jpg,0,15836.0,15836.0
1,b737f8c78ec5,b737f8c78ec5_2018-03-04T20:30:00.jpg,1,17280.0,33116.0
2,b737f8c78ec5,b737f8c78ec5_2018-03-05T20:30:00.jpg,2,17280.0,50396.0
3,b737f8c78ec5,b737f8c78ec5_2018-03-06T20:30:00.jpg,3,17280.0,67676.0
4,b737f8c78ec5,b737f8c78ec5_2018-03-07T20:30:00.jpg,4,17280.0,84956.0
...,...,...,...,...,...
982,eef041dd50aa,eef041dd50aa_2019-01-03T20:30:00.jpg,23,17280.0,399956.0
983,eef041dd50aa,eef041dd50aa_2019-01-04T20:30:00.jpg,24,17280.0,417236.0
984,eef041dd50aa,eef041dd50aa_2019-01-05T20:30:00.jpg,25,17280.0,434516.0
985,eef041dd50aa,eef041dd50aa_2019-01-06T20:30:00.jpg,26,17280.0,451796.0


# Transforms

In [18]:
def get_transforms():
    """Return the evaluation-time preprocessing pipeline.

    The image is converted to a float tensor and then normalised per channel
    (the mean/std values are presumably the usual ImageNet statistics).
    """
    return T.Compose([
        T.ConvertDtype(torch.float),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])

In [19]:
eval_transforms = get_transforms()

# Instantiating the model

In [20]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cpu')

In [21]:
# Build an uninitialised Faster R-CNN (no pre-trained detector or backbone
# weights are downloaded; trained weights are loaded from a checkpoint later).
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(weights=None, weights_backbone=None, max_size=1440)
# Replace the classifier with a new one, that has num_classes which is user-defined:
num_classes = 3 # 2 classes ('onset' & 'wakeup') + the 'background' class
# Get the number of input features for the box classifier:
in_features = model.roi_heads.box_predictor.cls_score.in_features
# Swap the default box predictor head for one sized for `num_classes` outputs:
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

In [22]:
# Load the trained parameters onto the selected device and copy them into the model.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code;
# only load checkpoints from a trusted source (consider `weights_only=True` if
# the installed torch version supports it and the file is a plain state dict).
weights = torch.load("/kaggle/input/seb-train-cnn-models-image-detection/seb_model_v1__4.pth", map_location=device)
model.load_state_dict(weights)

<All keys matched successfully>

In [23]:
model.to(device);

# Inference

In [24]:
# Put the model in evaluation mode before running inference.
model.eval()
# Detections with a score not above this value are discarded in get_submission_df.
thresh = 0.0 # Tune this.
# Maps the model's integer class labels back to event names.
reverse_label_mapping = {1: "onset", 2: "wakeup"}

In [25]:
def get_submission_df(image_width=1440):
    """Run the detector on every rendered window image and build the submission.

    Reads the module-level `series_ids`, `window_properties_df`,
    `images_folder`, `eval_transforms`, `model`, `device`, `thresh` and
    `reverse_label_mapping`.

    Parameters
    ----------
    image_width : int, default 1440
        Pixel width that the horizontal box coordinates are scaled by when
        converting them to steps. The default keeps the value that was
        previously hard-coded.

    Returns
    -------
    pd.DataFrame
        Columns `row_id`, `series_id`, `step`, `event`, `score`, sorted by
        series and step. Empty (with the same columns) if nothing was detected.
    """
    all_test_preds = []
    for series_id in series_ids:
        series_rows = window_properties_df.loc[window_properties_df['series_id'] == series_id].reset_index(drop=True)
        for j in range(len(series_rows)): # j is the index of the image (within the series).
            image_name = series_rows['image_name'][j]
            image = read_image(os.path.join(images_folder, image_name))
            with torch.no_grad():
                x = eval_transforms(image).to(device)
                pred = model([x])[0]

            # Postprocessing: remove 'narrow' boxes as these are false positives,
            # and anything not scoring above the threshold. One combined mask
            # replaces six separate recomputations.
            widths = pred['boxes'][:, 2] - pred['boxes'][:, 0]
            keep = (widths > 10) & (pred['scores'] > thresh)
            boxes = pred['boxes'][keep]
            if len(boxes) == 0:
                continue

            # Move the results to plain Python numbers once, so the step
            # arithmetic below runs in double precision instead of on float32
            # 0-dim tensors (results may differ by one step in rare rounding cases).
            centers_x = ((boxes[:, 0] + boxes[:, 2]) / 2).tolist()
            labels = pred['labels'][keep].tolist()
            scores = pred['scores'][keep].tolist()

            num_steps_window = series_rows['num_steps_window'][j]
            # Steps covered by all earlier windows of this series.
            if j == 0:
                prev_num_steps_cumulative = 0
            else:
                prev_num_steps_cumulative = series_rows['num_steps_cumulative'][j - 1]

            for center_x, label, confidence in zip(centers_x, labels, scores):
                # The image spans the window linearly from left to right edge.
                step_in_window = (center_x / image_width) * num_steps_window
                all_test_preds.append({
                    'series_id': series_id,
                    'step': int(prev_num_steps_cumulative + step_in_window),
                    'event': reverse_label_mapping[label],
                    'score': confidence,
                })
    if len(all_test_preds) > 0:
        submission_df = pd.DataFrame(all_test_preds)
        submission_df = submission_df.sort_values(by=['series_id', 'step']).reset_index(drop=True)
        submission_df['row_id'] = np.arange(len(submission_df))
        submission_df = submission_df[['row_id', 'series_id', 'step', 'event', 'score']]
    else:
        submission_df = pd.DataFrame({'row_id': [], 'series_id': [], 'step': [], 'event': [], 'score': []})
    return submission_df

In [26]:
submission_df = get_submission_df()
submission_df

Unnamed: 0,row_id,series_id,step,event,score
0,0,062dbd4c95e6,7840,onset,0.941166
1,1,062dbd4c95e6,14283,wakeup,0.080451
2,2,062dbd4c95e6,14452,wakeup,0.965852
3,3,062dbd4c95e6,14550,wakeup,0.095775
4,4,062dbd4c95e6,14646,wakeup,0.488864
...,...,...,...,...,...
4366,4366,fcca183903b7,601412,wakeup,0.053166
4367,4367,fcca183903b7,602138,wakeup,0.983011
4368,4368,fcca183903b7,602418,wakeup,0.060061
4369,4369,fcca183903b7,619397,onset,0.929265


In [27]:
# Keep only detections whose confidence exceeds 0.7 for the validation scoring below.
events_submission = submission_df.loc[submission_df['score'] > 0.7]

In [28]:
%%time
import polars as pl
# Parsed form of the `timestamp` text column; reused for every derived field.
timestamp_expr = pl.col("timestamp").str.strptime(pl.Datetime, "%Y-%m-%dT%H:%M:%S%Z")

# Lazily read the event annotations, replace `timestamp` with its parsed value,
# add calendar parts as separate columns, then materialise as a pandas frame.
train_events = (
    pl.scan_csv('/kaggle/input/child-mind-institute-detect-sleep-states/train_events.csv')
    .with_columns(
        [
            timestamp_expr,
            timestamp_expr.dt.year().alias("year"),
            timestamp_expr.dt.month().alias("month"),
            timestamp_expr.dt.day().alias("day"),
            timestamp_expr.dt.hour().alias("hour"),
        ]
    )
    .collect()
    .to_pandas()
)

CPU times: user 120 ms, sys: 47.1 ms, total: 168 ms
Wall time: 403 ms


In [29]:
# Ground truth for the validation series: annotated events that have a step,
# renumbered from zero with the running number exposed as `row_id`.
val_solution = (
    train_events.loc[train_events['series_id'].isin(series_ids), ['series_id', 'event', 'step']]
    .dropna(subset=['step'])
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index': 'row_id'})
)

# Overall metric on the confidence-filtered detections.
print(f"Model score: {score(val_solution, events_submission,tolerances, **column_names)}")

Model score: 0.6701466609652249


In [30]:
val_solution

Unnamed: 0,row_id,series_id,event,step
0,0,062dbd4c95e6,onset,7872.0
1,1,062dbd4c95e6,wakeup,14484.0
2,2,062dbd4c95e6,onset,60720.0
3,3,062dbd4c95e6,wakeup,68400.0
4,4,062dbd4c95e6,onset,77304.0
...,...,...,...,...
1354,1354,fcca183903b7,wakeup,565824.0
1355,1355,fcca183903b7,onset,577344.0
1356,1356,fcca183903b7,wakeup,584052.0
1357,1357,fcca183903b7,onset,595344.0


In [31]:
events_submission

Unnamed: 0,row_id,series_id,step,event,score
0,0,062dbd4c95e6,7840,onset,0.941166
2,2,062dbd4c95e6,14452,wakeup,0.965852
7,7,062dbd4c95e6,60569,onset,0.976574
11,11,062dbd4c95e6,66822,wakeup,0.966833
13,13,062dbd4c95e6,68472,wakeup,0.789991
...,...,...,...,...,...
4354,4354,fcca183903b7,577343,onset,0.987767
4360,4360,fcca183903b7,584029,wakeup,0.984445
4362,4362,fcca183903b7,595403,onset,0.972804
4367,4367,fcca183903b7,602138,wakeup,0.983011


In [32]:
series_ids

array(['b737f8c78ec5', 'bdfce9ce62b9', '5f40907ec171', '5acc9d63b5fd',
       'b7188813d58a', '1762ab70ec76', 'fcca183903b7', '5f94bb3e1bed',
       'c75b4b207bea', '99b829cbad2d', '4a31811f3558', '3d53bfea61d6',
       'f0482490923c', '7df249527c63', 'ece2561f07e9', 'a9a2f7fac455',
       '8a22387617c3', '062dbd4c95e6', '612aa8ba44e2', 'b84960841a75',
       '60d31b0bec3b', '25e2b3dd9c3b', '08db4255286f', '3aceb17ef7bd',
       '8a306e0890c0', '35826366dfc7', '2f7504d0f426', '44d8c02b369e',
       'bfa54bd26187', 'd9e887091a5c', 'c289c8a823e0', '44a41bba1ee7',
       'ea0770830757', 'eef041dd50aa'], dtype=object)

In [33]:
# Score every validation series on its own, in the order of `series_ids`.
scores_series = [
    score(val_solution[val_solution['series_id'] == sid],
          events_submission[events_submission['series_id'] == sid],
          tolerances, **column_names)
    for sid in series_ids
]
# Assemble the per-series table: one column of ids next to one column of scores.
df1 = pd.DataFrame({"Score": scores_series})
df2 = pd.DataFrame({"Series_Id": series_ids})
df_scores = pd.concat([df2, df1], axis=1)
df_scores

Unnamed: 0,Series_Id,Score
0,b737f8c78ec5,0.765363
1,bdfce9ce62b9,0.738664
2,5f40907ec171,0.696493
3,5acc9d63b5fd,0.795093
4,b7188813d58a,0.730164
5,1762ab70ec76,0.591001
6,fcca183903b7,0.748236
7,5f94bb3e1bed,0.753612
8,c75b4b207bea,0.7782
9,99b829cbad2d,0.632857


In [34]:
df_scores.to_csv('valid_scores_cnn_scm_v4.csv', index=False)

In [35]:
!rm -rf /kaggle/working/images

In [36]:
submission_df.to_csv("submission.csv", index=False)