In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# Any results you write to the current directory are saved as output.

In [None]:
# imports
%matplotlib inline

import matplotlib.pyplot as plt
import seaborn as sns
from scipy import signal
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, f1_score, plot_confusion_matrix
from keras.models import Model
import keras.layers as L
import lightgbm as lgb

!pip install uc==2.2.0

# Load data
Thanks to https://www.kaggle.com/cdeotte/data-without-drift.

In [None]:
# read data: the drift-corrected training set. Its 'signal' column feeds the
# feature engineering and its 'open_channels' column is the target.
data = pd.read_csv('../input/data-without-drift/train_clean.csv')
# preview the first rows
data.head()

# Feature engineering
Augment the raw signal with several derived signals: gradients, rolling statistics (mean, std, min, max, quantiles), exponentially weighted mean/std and low/high pass filters.

The feature engineering (FE) is the same as in this notebook: https://www.kaggle.com/martxelo/fe-and-simple-mlp, with corrections to the filters.

In [None]:
def calc_gradients(s, n_grads=4):
    '''
    Successive numerical derivatives of a pandas series.

    Column "grad_k" is np.gradient applied k times to the values of the
    series, so every column keeps the number of samples of the input.
    '''
    columns = {}
    current = s.values
    for order in range(1, n_grads + 1):
        # differentiate the previous result once more
        current = np.gradient(current)
        columns['grad_' + str(order)] = current

    return pd.DataFrame(columns)

In [None]:
def calc_low_pass(s, n_filts=10):
    '''
    First order Butterworth low pass filters at "n_filts" log-spaced cutoffs.

    Every cutoff wn gives two columns: "lowpass_lf_<wn>" (lfilter, left
    delayed) and "lowpass_ff_<wn>" (filtfilt, not delayed).
    '''
    samples = s.values
    filtered = pd.DataFrame()

    for cutoff in np.logspace(-2, -0.3, n_filts):
        tag = '%.4f' % cutoff
        num, den = signal.butter(1, Wn=cutoff, btype='low')

        # initial filter state scaled by the first sample, so the delayed
        # filter does not start from zero
        state = signal.lfilter_zi(num, den) * samples[0]
        delayed, _ = signal.lfilter(num, den, samples, zi=state)

        filtered['lowpass_lf_' + tag] = delayed
        filtered['lowpass_ff_' + tag] = signal.filtfilt(num, den, samples)

    return filtered

In [None]:
def calc_high_pass(s, n_filts=10):
    '''
    First order Butterworth high pass filters at "n_filts" log-spaced cutoffs.

    Every cutoff wn gives two columns: "highpass_lf_<wn>" (lfilter, left
    delayed) and "highpass_ff_<wn>" (filtfilt, not delayed).
    '''
    values = s.values
    cutoffs = np.logspace(-2, -0.1, n_filts)

    result = pd.DataFrame()
    for cutoff in cutoffs:
        suffix = '%.4f' % cutoff
        num, den = signal.butter(1, Wn=cutoff, btype='high')

        # initial filter state scaled by the first sample of the signal
        start = signal.lfilter_zi(num, den) * values[0]
        delayed = signal.lfilter(num, den, values, zi=start)[0]
        centered = signal.filtfilt(num, den, values)

        result['highpass_lf_' + suffix] = delayed
        result['highpass_ff_' + suffix] = centered

    return result

In [None]:
def calc_roll_stats(s, windows=[10, 50, 100, 500, 1000]):
    '''
    Rolling statistics of the series for every window size in "windows":
    mean, std, min, max, range (max - min) and the 10/25/50/75/90 quantiles.

    Windows use min_periods=1, so the first samples are computed on the
    samples available so far. Remaining NaN values are replaced by 0.
    '''
    quantiles = ((10, 0.10), (25, 0.25), (50, 0.50), (75, 0.75), (90, 0.90))

    stats = pd.DataFrame()
    for size in windows:
        tag = str(size)
        # one rolling view per window size, shared by all the statistics
        rolled = s.rolling(window=size, min_periods=1)
        lowest = rolled.min()
        highest = rolled.max()

        stats['roll_mean_' + tag] = rolled.mean()
        stats['roll_std_' + tag] = rolled.std()
        stats['roll_min_' + tag] = lowest
        stats['roll_max_' + tag] = highest
        stats['roll_range_' + tag] = highest - lowest
        for percent, level in quantiles:
            stats['roll_q' + str(percent) + '_' + tag] = rolled.quantile(level)

    # std has no value for a single sample: use zero there
    return stats.fillna(value=0)

In [None]:
def calc_ewm(s, windows=[10, 50, 100, 500, 1000]):
    '''
    Exponentially weighted mean and std of the series.

    Every span in "windows" gives the columns "ewm_mean_<span>" and
    "ewm_std_<span>". NaN values are replaced by 0.
    '''
    features = pd.DataFrame()
    for span in windows:
        label = str(span)
        weighted = s.ewm(span=span, min_periods=1)
        features['ewm_mean_' + label] = weighted.mean()
        features['ewm_std_' + label] = weighted.std()

    # std has no value for the first sample: use zero there
    return features.fillna(value=0)

In [None]:
def add_features(s):
    '''
    Run every feature calculation on the series and join the results.

    Returns a dataframe with the series itself as first column, followed by
    the gradient, low pass, high pass, rolling and ewm columns.
    '''
    calculators = (calc_gradients, calc_low_pass, calc_high_pass,
                   calc_roll_stats, calc_ewm)

    parts = [s]
    for calculate in calculators:
        parts.append(calculate(s))

    # side by side: one row per sample
    return pd.concat(parts, axis=1)


def divide_and_add_features(s, signal_size=500000):
    '''
    Divide the signal in bags of "signal_size" and calculate the features
    of every bag on its own.
    Normalize the data dividing it by 15.0

    s: pandas series with the raw signal
    signal_size: number of samples per bag

    Returns a dataframe with one row per input sample. The bags are stacked
    in order and the index restarts at 0 in every bag.

    When the length of the signal is not a multiple of "signal_size" the
    remaining samples form a last, shorter bag. Dropping them would leave
    the features with fewer rows than the labels / time column.
    '''
    # normalize
    s = s/15.0

    # ceil division, so a trailing partial bag is processed too
    n_bags = -(-s.shape[0] // signal_size)

    ls = []
    for i in tqdm(range(n_bags)):
        # positional slice; the index is reset so every bag starts at 0
        sig = s.iloc[i*signal_size:(i+1)*signal_size].copy().reset_index(drop=True)
        sig_featured = add_features(sig)
        ls.append(sig_featured)

    return pd.concat(ls, axis=0)

In [None]:
# apply every feature to data: the feature frame is built from the raw
# 'signal' column only
df = divide_and_add_features(data['signal'])
# preview the first rows
df.head()

Let's plot the signals to see what they look like.

In [None]:
# Low pass lfilter columns: trend of the signal for different cutoff
# frequencies (first 200 samples)
df.iloc[:200][['signal',
               'lowpass_lf_0.0100',
               'lowpass_lf_0.0154',
               'lowpass_lf_0.0239',
               'lowpass_lf_0.0369',
               'lowpass_lf_0.5012']].plot()

In [None]:
# Low pass filtfilt columns: trend of the signal for different cutoff
# frequencies, this time without delay (first 200 samples)
df.iloc[:200][['signal',
               'lowpass_ff_0.0100',
               'lowpass_ff_0.0154',
               'lowpass_ff_0.0239',
               'lowpass_ff_0.0369',
               'lowpass_ff_0.5012']].plot()

In [None]:
# High pass lfilter columns: fast variation of the signal for different
# cutoff frequencies (first 100 samples)
df.iloc[:100][['signal',
               'highpass_lf_0.0100',
               'highpass_lf_0.0163',
               'highpass_lf_0.0264',
               'highpass_lf_0.3005',
               'highpass_lf_0.7943']].plot()

In [None]:
# High pass filtfilt columns: fast variation of the signal for different
# cutoff frequencies, this time without delay (first 200 samples)
df.iloc[:200][['signal',
               'highpass_ff_0.0100',
               'highpass_ff_0.0163',
               'highpass_ff_0.0264',
               'highpass_ff_0.3005',
               'highpass_ff_0.7943']].plot()

In [None]:
# Rolling mean, rolling median (q50) and ewm mean also follow the trend
# (first 100 samples)
df.iloc[:100][['signal',
               'roll_mean_10',
               'roll_mean_50',
               'roll_mean_100',
               'roll_q50_100',
               'ewm_mean_10',
               'ewm_mean_50',
               'ewm_mean_100']].plot()

In [None]:
# Rolling min, quantiles and max for the window of 100 samples
# (first 1000 samples)
df.iloc[:1000][['signal',
                'roll_min_100',
                'roll_q10_100',
                'roll_q25_100',
                'roll_q50_100',
                'roll_q75_100',
                'roll_q90_100',
                'roll_max_100']].plot()

In [None]:
# Rolling std and ewm std (first 100 samples)
df.iloc[:100][['signal',
               'roll_std_10',
               'roll_std_50',
               'ewm_std_10',
               'ewm_std_50']].plot()

# Divide in train and test

In [None]:
# Get train and test data: random split of the rows, 20% held out.
# random_state is fixed so a fresh "Restart & Run All" gives the same split.
x_train, x_test, y_train, y_test = train_test_split(
    df.values, data['open_channels'].values, test_size=0.2, random_state=42)

# only the numpy splits are used from here on; release the dataframes
del data, df
print('x_train.shape=', x_train.shape)
print('x_test.shape=', x_test.shape)
print('y_train.shape=', y_train.shape)
print('y_test.shape=', y_test.shape)

# Classes weights

In [None]:
def get_class_weight(classes, exp=1, n_classes=11):
    '''
    Weight of the class is inversely proportional to the population of the class.
    There is an exponent for adding more weight.

    classes: array-like of integer class labels in the range [0, n_classes)
    exp: exponent applied to the population of the class
    n_classes: number of classes (11 by default, labels 0..10)

    Returns a float array of length "n_classes" with
    weight = total_samples / population**exp. Classes without samples get
    weight 0 instead of a division by zero (inf).
    '''
    # one bin per integer label, centered on the label
    hist, _ = np.histogram(classes, bins=np.arange(n_classes + 1) - 0.5)
    # float counts: no integer overflow and negative exponents are allowed
    hist = hist.astype(float)
    total = hist.sum()

    class_weight = np.zeros(n_classes)
    populated = hist > 0
    class_weight[populated] = total / np.power(hist[populated], exp)

    return class_weight

# weights of the classes in the training split
class_weight = get_class_weight(y_train)
print('class_weight=', class_weight)

# population of every class (one bin per label 0..10)
fig_classes, ax_classes = plt.subplots()
ax_classes.set_title('classes')
ax_classes.hist(y_train, bins=np.arange(12)-0.5)

# weight of every class
fig_weights, ax_weights = plt.subplots()
ax_weights.bar(np.arange(11), class_weight)
ax_weights.set_title('class_weight')

# Build a MLP model

In [None]:
# MLP comes from the third-party `uc` package that is pip-installed in the
# imports cell.
# NOTE(review): the meaning of the keyword arguments below is defined by `uc`
# and is not visible in this notebook; confirm against the uc documentation.
from uc.mlp import MLP

mlp = MLP(
    # first entry is the number of feature columns of x_train; presumably
    # followed by three hidden layers of 100 units and 11 outputs (the
    # notebook works with 11 classes, 0..10) - TODO confirm
    layer_size = [x_train.shape[1], 100, 100, 100, 11],
    activation = 'a2m2l',
    op='fc',

    # previous value of rate_init, kept for reference:
    # rate_init = 0.08, 
    leaky = -0.2,
    rate_init = 0.04,   
    bias_rate = [], 
    regularization = 1,
    importance_mul = 0.0001, 
    output_shrink = 0.1, 
    output_range = None, 
    loss_type = "softmax",
    verbose=1, 
    importance_out=False,
    rate_decay = 0.9, 
    # 30 / 10 / 3 evaluates to 1.0
    epoch_train = 30 / 10 / 3, 
    # 3 / 10 / 3 evaluates to roughly 0.1
    epoch_decay = 3 / 10 / 3,
    epoch_log = 0.01,
)




In [None]:
# fit the model on the training split.
# NOTE(review): class_weight and the held-out x_test / y_test are not used
# anywhere in the notebook as shown - confirm whether that is intended.
mlp.fit(x_train, y_train)

In [None]:
print('Reading data...')
data = pd.read_csv('../input/data-without-drift/test_clean.csv')

print('Feature engineering...')
# same feature pipeline that was applied to the training signal
df = divide_and_add_features(data['signal'])

# predict and force the result into integer channel counts in 0..10
mlp_pred = mlp.predict(df.values)
mlp_pred = np.clip(mlp_pred, 0, 10)
# np.int0 (an alias of np.intp) was removed in NumPy 2.0: cast explicitly
mlp_pred = np.rint(mlp_pred).astype(np.intp)

print('Writing submission...')
submission = pd.DataFrame()
submission['time'] = data['time']
submission['open_channels'] = mlp_pred
# float_format only affects the float 'time' column (4 decimals)
submission.to_csv('submission.csv', index=False, float_format='%.4f')

print('Submission finished!')

# NOTE(review): submit_result is neither defined nor imported in the notebook
# as shown; on a fresh kernel this line raises NameError unless it is provided
# elsewhere - confirm where it comes from or remove the call.
submit_result(mlp_pred)