In [5]:
import numpy as np 
import pandas as pd
from sklearn import *
import lightgbm as lgb
from catboost import Pool,CatBoostRegressor

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression

import plotly.graph_objects as go
from plotly.subplots import make_subplots

import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set()

pd.set_option("display.precision", 8)

from functools import partial
import scipy as sp

import os
from sklearn.metrics import f1_score

import pywt 
from statsmodels.robust import mad

import scipy
from scipy import signal
from scipy.signal import butter, deconvolve, find_peaks, peak_widths, peak_prominences

from numpy.fft import *

import time
import math
from numba import jit
from math import log, floor
from sklearn.neighbors import KDTree
 

train = pd.read_csv('/Users/siero5335/channel/train.csv')
test = pd.read_csv('/Users/siero5335/channel/test.csv')

In [6]:
def remove_false_peak(signal, p1, p2, maxDistance=10):
    peak_diff = np.diff(p2)
    if len(peak_diff) == 0:
        return p1
    ticks = []
    for i, d in enumerate(peak_diff):
        ratio = signal[p2[i+1]]/signal[p2[i]]
        if d < maxDistance and -0.25 > ratio and ratio > -4:
            ticks.append((p2[i], p2[i+1]))
    mask = np.array([True]*len(p1))
    for i, j in ticks:
        mask = mask &amp; ((p1 < i) | (p1 > 500+j))
    return p1[mask]


def get_peaks(signal):
    p1_1, _ = find_peaks(signal, height=[5, 100])
    p1_2, _ = find_peaks(-signal, height=[5, 100])
    p1 = np.union1d(p1_1, p1_2)
    n_peaks, _ = find_peaks(-signal, height=[10, 100])
    p_peaks, _ = find_peaks(signal, height=[10, 100])
    p2 = np.union1d(n_peaks, p_peaks)
    p = remove_false_peak(signal, p1, p2, maxDistance=10)
    return np.intersect1d(p1_1, p), np.intersect1d(p1_2, p)


def extract_peak_feature(signal):
    p_peaks, n_peaks = get_peaks(signal)

    num_p, num_n = len(p_peaks), len(n_peaks)

    sig_peak_width = np.concatenate(
        [peak_widths(signal, p_peaks)[0], peak_widths(-signal, n_peaks)[0]])
    sig_peak_height = abs(signal[np.concatenate([p_peaks, n_peaks])])

    if num_n or num_p:
        height_mean = sig_peak_height.mean()
        height_max = sig_peak_height.max()
        height_min = sig_peak_height.min()
        height_median = np.median(sig_peak_height)

        width_mean = sig_peak_width.mean()
        width_max = sig_peak_width.max()
        width_min = sig_peak_width.min()
        width_median = np.median(sig_peak_width)

        return np.array([height_mean, height_max, height_min, height_median,
                         width_mean, width_max, width_min, width_median, num_p, num_n])
    else:
        return np.zeros(10)

In [7]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        if col!='open_channels':
            col_type = df[col].dtypes
            if col_type in numerics:
                c_min = df[col].min()
                c_max = df[col].max()
                if str(col_type)[:3] == 'int':
                    if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        df[col] = df[col].astype(np.int8)
                    elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                        df[col] = df[col].astype(np.int16)
                    elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                        df[col] = df[col].astype(np.int32)
                    elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                        df[col] = df[col].astype(np.int64)  
                else:
                    if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                        df[col] = df[col].astype(np.float16)
                    elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                        df[col] = df[col].astype(np.float32)
                    else:
                        df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [8]:
window_sizes = [100, 500, 1000, 2500, 5000, 25000, 50000]

In [9]:
for window in window_sizes:
    train["rolling_height_max_" + str(window)] = train['signal'].rolling(window=window).apply(lambda x: extract_peak_feature(x)[0])
    train["rolling_width_mean_" + str(window)] = train['signal'].rolling(window=window).apply(lambda x: extract_peak_feature(x)[1])
    train["rolling_width_max_" + str(window)] = train['signal'].rolling(window=window).apply(lambda x: extract_peak_feature(x)[2])

for window in window_sizes:
    test["rolling_height_max_" + str(window)] = test['signal'].rolling(window=window).apply(lambda x: extract_peak_feature(x)[0])
    test["rolling_width_mean_" + str(window)] = test['signal'].rolling(window=window).apply(lambda x: extract_peak_feature(x)[1])
    test["rolling_width_max_" + str(window)] = test['signal'].rolling(window=window).apply(lambda x: extract_peak_feature(x)[2])















In [10]:
train.to_csv('/Users/siero5335/channel/train_rolling_signal.csv')
test.to_csv('/Users/siero5335/channel/test_rolling_signal.csv')

In [11]:
train.head()

Unnamed: 0,time,signal,open_channels,rolling_height_max_100,rolling_width_mean_100,rolling_width_max_100,rolling_height_max_500,rolling_width_mean_500,rolling_width_max_500,rolling_height_max_1000,...,rolling_width_max_2500,rolling_height_max_5000,rolling_width_mean_5000,rolling_width_max_5000,rolling_height_max_25000,rolling_width_mean_25000,rolling_width_max_25000,rolling_height_max_50000,rolling_width_mean_50000,rolling_width_max_50000
0,0.0001,-2.76,0,,,,,,,,...,,,,,,,,,,
1,0.0002,-2.8557,0,,,,,,,,...,,,,,,,,,,
2,0.0003,-2.4074,0,,,,,,,,...,,,,,,,,,,
3,0.0004,-3.1404,0,,,,,,,,...,,,,,,,,,,
4,0.0005,-3.1525,0,,,,,,,,...,,,,,,,,,,


In [12]:
train

Unnamed: 0,time,signal,open_channels,rolling_height_max_100,rolling_width_mean_100,rolling_width_max_100,rolling_height_max_500,rolling_width_mean_500,rolling_width_max_500,rolling_height_max_1000,...,rolling_width_max_2500,rolling_height_max_5000,rolling_width_mean_5000,rolling_width_max_5000,rolling_height_max_25000,rolling_width_mean_25000,rolling_width_max_25000,rolling_height_max_50000,rolling_width_mean_50000,rolling_width_max_50000
0,0.0001,-2.7600,0,,,,,,,,...,,,,,,,,,,
1,0.0002,-2.8557,0,,,,,,,,...,,,,,,,,,,
2,0.0003,-2.4074,0,,,,,,,,...,,,,,,,,,,
3,0.0004,-3.1404,0,,,,,,,,...,,,,,,,,,,
4,0.0005,-3.1525,0,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4999995,499.9996,2.9194,7,6.07681000,7.168,5.1525,5.92979796,7.8028,5.0084,5.92318814,...,5.0084,5.99873054,8.2702,5.0084,6.12901078,8.8303,5.001,6.30090519,9.9871,5.0002
4999996,499.9997,2.6980,7,6.07681000,7.168,5.1525,5.93115833,7.8028,5.0084,5.92318814,...,5.0084,5.99919091,8.2702,5.0084,6.12901078,8.8303,5.001,6.30090519,9.9871,5.0002
4999997,499.9998,4.5164,8,6.07681000,7.168,5.1525,5.93115833,7.8028,5.0084,5.92318814,...,5.0084,5.99919091,8.2702,5.0084,6.12901078,8.8303,5.001,6.30090519,9.9871,5.0002
4999998,499.9999,5.6397,9,6.10098889,7.168,5.1525,5.93115833,7.8028,5.0084,5.92318814,...,5.0084,5.99919091,8.2702,5.0084,6.12901078,8.8303,5.001,6.30090519,9.9871,5.0002
