In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import gc
import os
# Walk the Kaggle input directory tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.


# Any results you write to the current directory are saved as output.
import warnings
warnings.filterwarnings('ignore')

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import RobustScaler, MinMaxScaler

from tqdm import tqdm

import lightgbm as lgb
import xgboost as xgb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.utils import class_weight

from sklearn.metrics import accuracy_score, make_scorer
from sklearn.metrics import roc_curve, auc, accuracy_score, cohen_kappa_score
from sklearn.metrics import mean_squared_error, f1_score, confusion_matrix

## Introduction

Identify the number of channels open at each time point

Many diseases, including cancer, are believed to have a contributing factor in common. Ion channels are pore-forming proteins present in animals and plants. They encode learning and memory, help fight infections, enable pain signals, and stimulate muscle contraction. If scientists could better study ion channels, which may be possible with the aid of machine learning, it could have a far-reaching impact.

When ion channels open, they pass electric currents. Existing methods of detecting these state changes are slow and laborious. Humans must supervise the analysis, which imparts considerable bias, in addition to being tedious. These difficulties limit the volume of ion channel current analysis that can be used in research. Scientists hope that technology could enable rapid automatic detection of ion channel current events in raw data.

The University of Liverpool’s Institute of Ageing and Chronic Disease is working to advance ion channel research. Their team of scientists have asked for your help. In this competition, you’ll use ion channel data to better model automatic identification methods. If successful, you’ll be able to detect individual ion channel events in noisy raw signals. The data is simulated and injected with real world noise to emulate what scientists observe in laboratory experiments.

## Acknowledgements

### This kernel uses ideas and some code from this excellent notebook:
### https://www.kaggle.com/vbmokin/ion-switching-advanced-fe-lgb-xgb-confmatrix
### and from this discussion: 
### https://www.kaggle.com/c/liverpool-ion-switching/discussion/143390


### data from
### https://www.kaggle.com/cdeotte/data-without-drift

## Utils

In [None]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` in place to the narrowest dtype that holds their range.

    The ``time`` column and every column whose dtype is not listed in
    ``numerics`` are left untouched. Integer columns are cast to the first of
    int8/int16/int32/int64 whose range contains the column's min and max
    (bounds inclusive, so a column spanning exactly -128..127 becomes int8).
    Float columns are cast to float16 or float32 when their range fits, else
    float64.

    NOTE: the float check looks at the value *range* only. float16 keeps
    roughly 3 significant decimal digits, so downcasting floats is lossy.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.
    verbose : bool, default True
        When true, print the final memory footprint and the relative saving.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        if col == 'time':
            continue
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # An empty column has NaN min/max: no comparison succeeds and the column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # Falls back to float64 when the range is too wide or min/max are NaN/inf.
            target = np.float64
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting footprint.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

def get_stats(df):
    """Build a per-column summary table for ``df``.

    The result has one row per column of ``df`` plus a final ``'Overall'`` row,
    and four columns: ``na_count`` (missing values), ``n_unique`` (distinct
    values, NaN counted), ``type`` (dtype) and ``memory_usage`` (MiB, deep,
    index excluded for the per-column rows and included in the overall figure).
    """
    bytes_per_mb = 1024**2
    summary = pd.DataFrame(index=df.columns, columns=['na_count', 'n_unique', 'type', 'memory_usage'])
    for name in df.columns:
        series = df[name]
        summary.loc[name] = [
            series.isna().sum(),
            series.nunique(dropna=False),
            series.dtypes,
            series.memory_usage(deep=True, index=False) / bytes_per_mb,
        ]
    # The totals are taken before the 'Overall' row exists, so they cover the data columns only.
    total_na = summary['na_count'].sum()
    total_unique = summary['n_unique'].sum()
    total_mb = df.memory_usage(deep=True).sum() / bytes_per_mb
    summary.loc['Overall'] = [total_na, total_unique, None, total_mb]
    return summary

def print_header():
    """Print the column titles of the per-column report, followed by one blank line."""
    header = 'col         conversion        dtype    na    uniq  size'
    print(header, end='\n\n')
    
def print_values(name, conversion, col):
    """Print one fixed-width report row for the Series ``col``.

    The row shows ``name``, ``conversion``, the dtype of ``col``, its number of
    missing values, its number of distinct values (NaN counted) and its deep
    memory usage in MiB (index excluded).
    """
    template = '{:10}  {:16}  {:>7}  {:2}  {:6}  {:1.2f}MB'
    dtype_name = str(col.dtypes)
    n_missing = col.isna().sum()
    n_distinct = col.nunique(dropna=False)
    size_mb = col.memory_usage(deep=True, index=False) / 1024 ** 2
    row = template.format(name, conversion, dtype_name, n_missing, n_distinct, size_mb)
    print(row)

In [None]:
def display_set(df, column, n_sample, figsize ):
    """Line-plot every ``n_sample``-th value of ``df[column]`` against the frame's index.

    A new single-axes figure of size ``figsize`` is created for the plot.
    """
    _, axis = plt.subplots(nrows=1, ncols=1, figsize=figsize)
    x_values = df.index[::n_sample]
    y_values = df[column][::n_sample]
    sns.lineplot(x=x_values, y=y_values, ax=axis)


In [None]:
# Showing Confusion Matrix
# Thanks to https://www.kaggle.com/marcovasquez/basic-nlp-with-tensorflow-and-wordcloud
def plot_cm(y_true, y_pred, title):
    """Draw an annotated confusion-matrix heatmap.

    Rows are the actual classes, columns the predicted classes; only the
    classes present in ``y_true`` are shown. Every non-empty cell is annotated
    with its share of the row (percent) and its count; diagonal cells also
    show the row total.

    Parameters
    ----------
    y_true : array-like
        Ground-truth class labels.
    y_pred : array-like
        Predictions. Float values are rounded to the nearest integer class
        (integer input is unaffected).
    title : str
        Title of the figure.
    """
    figsize = (14, 14)
    labels = np.unique(y_true)
    # Round rather than truncate: truncation would map e.g. 2.9 to class 2.
    y_pred = np.round(np.asarray(y_pred)).astype(int)
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    cm_sum = np.sum(cm, axis=1, keepdims=True)
    cm_perc = cm / cm_sum.astype(float) * 100
    nrows, ncols = cm.shape
    annot_rows = []
    for i in range(nrows):
        row_total = cm_sum[i, 0]  # scalar, not a 1-element array
        annot_row = []
        for j in range(ncols):
            c = cm[i, j]
            p = cm_perc[i, j]
            if i == j:
                annot_row.append('%.1f%%\n%d/%d' % (p, c, row_total))
            elif c == 0:
                annot_row.append('')
            else:
                annot_row.append('%.1f%%\n%d' % (p, c))
        annot_rows.append(annot_row)
    # Building the array from the finished strings lets numpy pick a width that fits the longest label.
    annot = np.array(annot_rows, dtype=str).reshape(cm.shape)
    cm = pd.DataFrame(cm, index=labels, columns=labels)
    cm.index.name = 'Actual'
    cm.columns.name = 'Predicted'
    fig, ax = plt.subplots(figsize=figsize)
    plt.title(title)
    sns.heatmap(cm, cmap= "YlGnBu", annot=annot, fmt='', ax=ax)

In [None]:
def get_class_weight(classes, exp=1, n_classes=11):
    '''
    Weight of the class is inversely proportional to the population of the class.
    There is an exponent for adding more weight.

    classes   -- array-like of integer class labels in 0 .. n_classes-1
    exp       -- exponent applied to each class count before the division
    n_classes -- number of classes to build weights for (default 11)

    Returns a float array of length n_classes holding
    total_count / class_count ** exp. Classes that do not occur get weight 0
    instead of a division-by-zero inf.
    '''
    hist, _ = np.histogram(classes, bins=np.arange(n_classes + 1) - 0.5)
    counts = hist.astype(float)
    weights = np.zeros_like(counts)
    present = counts > 0
    weights[present] = counts.sum() / np.power(counts[present], exp)

    return weights

## Load train and test datasets

**IMPORTANT: While the time series appears continuous, the data is from discrete batches of 50 seconds long 10 kHz samples (500,000 rows per batch). In other words, the data from 0.0001 - 50.0000 is a different batch than 50.0001 - 100.0000, and thus discontinuous between 50.0000 and 50.0001.**

In [None]:
PATH = '/kaggle/input/data-without-drift/'
#PATH = '/kaggle/input/liverpool-ion-switching/'

# Read the train and test CSVs found under PATH.
train_csv = os.path.join(PATH, 'train_clean.csv')
test_csv = os.path.join(PATH, 'test_clean.csv')
train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

train.head()

In [None]:
# Downcast the numeric columns of both frames (the `time` column is skipped by reduce_mem_usage).
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

In [None]:
## Most of the feature engineering (FE) has been taken from:
## - https://www.kaggle.com/vbmokin/ion-switching-advanced-fe-lgb-xgb-confmatrix

In [None]:
# Rolling-window lengths (in rows) used as the default for gen_roll_features.
WINDOW_SIZES = [3, 5, 10, 50, 100, 500]

In [None]:
%%time

def gen_roll_features(full, win_sizes = WINDOW_SIZES):
    """Add rolling-window statistics of ``signal`` for each window size, then zero out inf/NaN.

    For every window ``w`` the columns ``rolling_mean_w``, ``rolling_std_w``,
    ``rolling_var_w``, ``rolling_min_w``, ``rolling_max_w`` and ``norm_w`` are
    added to ``full`` in place. ``norm_w`` is the signal min-max scaled inside
    the window, multiplied by ``floor(rolling max) - ceil(rolling min)``.

    Returns a new frame in which +/-inf and NaN have been replaced by 0; the
    frame passed in keeps the new columns but not that replacement.
    """
    for window in tqdm(win_sizes):
        tag = str(window)
        min_col = 'rolling_min_' + tag
        max_col = 'rolling_max_' + tag

        roller = full['signal'].rolling(window=window)
        full["rolling_mean_" + tag] = roller.mean()
        full["rolling_std_" + tag] = roller.std()
        full["rolling_var_" + tag] = roller.var()
        full[min_col] = roller.min()
        full[max_col] = roller.max()

        # A constant window gives span 0, hence inf/NaN here; those are zeroed below.
        span = full[max_col] - full[min_col]
        scaled = (full['signal'] - full[min_col]) / span
        full["norm_" + tag] = scaled * (np.floor(full[max_col]) - np.ceil(full[min_col]))

    cleaned = full.replace([np.inf, -np.inf], np.nan)
    cleaned.fillna(0, inplace=True)
    return cleaned

# Add the rolling-window features to both frames, using the default WINDOW_SIZES.
train = gen_roll_features(train)
test = gen_roll_features(test)

In [None]:
%%time

# Number of consecutive rows that share one `group` id (passed to batching as batch_size).
GROUP_BATCH_SIZE = 8000

# create batches of GROUP_BATCH_SIZE observations
def batching(df, batch_size, gr_name='group'):
    """Label consecutive blocks of ``batch_size`` rows with an increasing group id.

    Rows are grouped by ``df.index // batch_size`` and the groups are numbered
    0, 1, 2, ... in order of first appearance. The ids are written to
    ``df[gr_name]`` as uint16 (so at most 65,536 distinct groups are
    representable).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to label; modified in place.
    batch_size : int
        Number of index units per group.
    gr_name : str, default 'group'
        Name of the column that receives the group ids.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the extra column.
    """
    # GroupBy.ngroup() is the documented way to number groups.
    group_ids = df.groupby(df.index // batch_size, sort=False).ngroup().values
    df[gr_name] = group_ids
    df[gr_name] = df[gr_name].astype(np.uint16)
    return df

def run_feat_engineering(df, batch_size, gr_name='group'):
    """Add the group id and two squared-signal features to ``df``.

    Columns added: ``gr_name`` (via ``batching``), ``signal_2`` (signal
    squared) and ``signal_2-7500-mean`` (``signal_2`` minus its 7500-row
    rolling mean; NaN for the first 7499 rows). Returns the frame that
    ``batching`` returned.
    """
    df = batching(df, batch_size=batch_size, gr_name=gr_name)
    squared = df['signal'].pow(2)
    df['signal_2'] = squared
    trailing_mean = df['signal_2'].rolling(window=7500).mean()
    df['signal_2-7500-mean'] = df['signal_2'] - trailing_mean
    return df

# Add the group id and the squared-signal features to both frames.
train = run_feat_engineering(train, batch_size = GROUP_BATCH_SIZE, gr_name='group')
test = run_feat_engineering(test, batch_size = GROUP_BATCH_SIZE, gr_name='group')

In [None]:
%%time

## add some noise

STD = 0.01
NOISE_SEED = 13

# A dedicated, seeded generator makes the injected noise (and every feature and
# model derived from it) reproducible across kernel restarts.
# NOTE: this cell is not idempotent -- running it twice adds noise twice.
noise_rng = np.random.RandomState(NOISE_SEED)

train['signal'] = train['signal'] + noise_rng.normal(0, STD, size=len(train))
test['signal'] = test['signal'] + noise_rng.normal(0, STD, size=len(test))

In [None]:
%%time

def gen_roll_features(full, win_sizes = WINDOW_SIZES):
    """Add (or overwrite) the rolling-window statistics of ``signal`` on ``full``.

    This redefines the helper of the same name from an earlier cell. Unlike
    that version it does not replace inf/NaN: the first ``w - 1`` rows of every
    rolling column stay NaN, and ``norm_w`` is inf/NaN wherever the window is
    constant.

    Columns written per window ``w``: ``rolling_mean_w``, ``rolling_std_w``,
    ``rolling_var_w``, ``rolling_min_w``, ``rolling_max_w`` and ``norm_w``.
    ``full`` is modified in place and also returned.
    """
    signal = full['signal']
    for window in tqdm(win_sizes):
        suffix = str(window)
        windowed = signal.rolling(window=window)

        full["rolling_mean_" + suffix] = windowed.mean()
        full["rolling_std_" + suffix] = windowed.std()
        full["rolling_var_" + suffix] = windowed.var()
        full["rolling_min_" + suffix] = windowed.min()
        full["rolling_max_" + suffix] = windowed.max()

        lo = full['rolling_min_' + suffix]
        hi = full['rolling_max_' + suffix]
        a = (full['signal'] - lo) / (hi - lo)
        full["norm_" + suffix] = a * (np.floor(hi) - np.ceil(lo))
    return full

# Recompute the rolling-window features from the current `signal` column;
# this overwrites the same-named columns created earlier in the notebook.
train = gen_roll_features(train)
test = gen_roll_features(test)

In [None]:
%%time

def gen_sig_features(df):
    """Add per-batch and per-slice summary statistics of ``signal``.

    The frame is sorted by ``time`` and re-indexed by the 0-based sample number
    ``round(time * 10_000) - 1``. Rows are then assigned to

    * ``batch``         -- blocks of 25,000 samples,
    * ``batch_index``   -- position inside the batch,
    * ``batch_slices``  -- blocks of 2,500 samples inside a batch,
    * ``batch_slices2`` -- string key identifying (batch, slice).

    For both ``batch`` and ``batch_slices2`` the mean, median, max, min, std,
    mean absolute change, abs max, abs min, range, max/min ratio and the
    average of abs min and abs max of ``signal`` are computed per group and
    broadcast back to the rows as ``<stat><key>`` columns.

    Returns the new frame after ``reduce_mem_usage``.
    """
    df = df.sort_values(by=['time']).reset_index(drop=True)

    # Round before the integer cast so floating-point error in time * 10_000
    # cannot push a sample into the neighbouring batch.
    sample_no = np.round(df['time'].values * 10_000).astype(np.int64) - 1
    df.index = sample_no
    df['batch'] = sample_no // 25_000
    df['batch_index'] = sample_no - df['batch'].values * 25_000
    df['batch_slices'] = df['batch_index'] // 2500
    # Vectorised key construction; the key only identifies a (batch, slice)
    # pair, its exact text is never used as a feature.
    df['batch_slices2'] = (
        df['batch'].astype(str).str.zfill(3)
        + '_'
        + df['batch_slices'].astype(str).str.zfill(3)
    )

    for c in tqdm(['batch','batch_slices2']):
        grouped = df.groupby(c)['signal']
        d = {}
        d['mean'+c] = grouped.mean()
        d['median'+c] = grouped.median()
        d['max'+c] = grouped.max()
        d['min'+c] = grouped.min()
        d['std'+c] = grouped.std()
        d['mean_abs_chg'+c] = grouped.apply(lambda x: np.mean(np.abs(np.diff(x))))
        d['abs_max'+c] = grouped.apply(lambda x: np.max(np.abs(x)))
        d['abs_min'+c] = grouped.apply(lambda x: np.min(np.abs(x)))
        d['range'+c] = d['max'+c] - d['min'+c]
        # May be inf when a group's minimum is 0.
        d['maxtomin'+c] = d['max'+c] / d['min'+c]
        d['abs_avg'+c] = (d['abs_min'+c] + d['abs_max'+c]) / 2
        # Broadcast every per-group statistic back to the rows of that group.
        for v in d:
            df[v] = df[c].map(d[v].to_dict())
    df = reduce_mem_usage(df)
    gc.collect()
    return df

# Add the per-batch / per-slice signal statistics to both frames.
train = gen_sig_features(train)
test = gen_sig_features(test)

In [None]:
%%time

def gen_shift_features(df):
    """Add lag/lead copies of ``signal`` and "<feature> minus signal" columns.

    Steps:

    1. ``signal_shift_+1``, ``signal_shift_-1``, ``signal_shift_+2`` and
       ``signal_shift_-2`` hold the signal 1 or 2 rows earlier (+) / later (-).
       Shifts are computed per ``batch`` so no value crosses a batch edge; the
       rows without a neighbour inside their batch become NaN (0 after step 4).
    2. The helper columns ``batch``, ``batch_index``, ``batch_slices`` and
       ``batch_slices2`` are dropped.
    3. For every remaining feature column ``c`` a column ``c_msignal`` equal to
       ``c - signal`` is added.
    4. +/-inf and NaN are replaced by 0 and the frame is downcast with
       ``reduce_mem_usage``.

    Expects ``df`` to contain ``signal`` and the four helper columns listed in
    step 2. Returns the resulting frame.
    """
    # add shifts
    # Integer group key and float64 values keep the grouping and the shifted
    # values independent of any earlier dtype downcast.
    batch_key = df['batch'].astype(np.int64).values
    signal_by_batch = df['signal'].astype(np.float64).groupby(batch_key, sort=False)
    for lag in (1, 2):
        df['signal_shift_+' + str(lag)] = signal_by_batch.shift(lag).values
        df['signal_shift_-' + str(lag)] = signal_by_batch.shift(-lag).values

    df.drop(columns=['batch', 'batch_index', 'batch_slices', 'batch_slices2'], inplace=True)
    gc.collect()

    # Built once, before the loop, so the *_msignal columns themselves are not re-processed.
    excluded = ['time', 'signal', 'open_channels', 'group', 'category', 'index']
    for c in [c1 for c1 in df.columns if c1 not in excluded]:
        df[c+'_msignal'] = df[c] - df['signal']

    df = df.replace([np.inf, -np.inf], np.nan)
    df.fillna(0, inplace=True)
    df = reduce_mem_usage(df)
    gc.collect()
    return df

# Add the shifted-signal and "<feature> minus signal" columns to both frames.
train = gen_shift_features(train)
test = gen_shift_features(test)

In [None]:
# Columns that are never offered to feature selection.
# (A missing comma used to merge 'group' and 'category' into one string, so `group` slipped through.)
NON_FEATURE_COLUMNS = ['time', 'signal', 'open_channels', 'group', 'category', 'index']
ALL_FEATURES = [c for c in train.columns if c not in NON_FEATURE_COLUMNS]
train.info()

## Display train and test signals

In [None]:
DATA_BATCH_SIZE = 500000

TRAIN_SAMPLE_RATE = 100 ## for display
TRAIN_BATCH_SIZE = int(len(train)/TRAIN_SAMPLE_RATE)

# The Series are passed as wide-form data (x = index, y = values). `hue`/`size`
# cannot be combined with wide-form input, and there is no "size" column to map.
f, ax1 = plt.subplots(nrows = 1, ncols = 1, figsize = (20,4))
sns.lineplot(data=train.signal[::TRAIN_SAMPLE_RATE], ax=ax1)
sns.lineplot(data=train.open_channels[::TRAIN_SAMPLE_RATE], ax=ax1)
ax1.set_title('Full train signal')

f, ax1 = plt.subplots(nrows = 1, ncols = 1, figsize = (10,4))
sns.lineplot(data=test.signal[::TRAIN_SAMPLE_RATE], ax=ax1)
ax1.set_title('Full test signal')

## By batch

In [None]:
# One panel per 500,000-row data batch. `.iloc` slices by row position, so each
# panel holds exactly one batch whatever labels the frame's index carries.
f, axes = plt.subplots(nrows = 2, ncols = 5, figsize = (26,12))
for i in range(10):
    batch_rows = slice(i*DATA_BATCH_SIZE, (i+1)*DATA_BATCH_SIZE)
    XX = train.signal.iloc[batch_rows]
    yy = train.open_channels.iloc[batch_rows]
    batch_ax = axes[i // 5, i % 5]
    sns.scatterplot(data=XX[::TRAIN_SAMPLE_RATE], ax=batch_ax)
    sns.scatterplot(data=yy[::TRAIN_SAMPLE_RATE], ax=batch_ax)
    batch_ax.set_title(f'Train Batch# {i+1}')

# Four test batches are drawn, so four panels are created.
f, axes = plt.subplots(nrows = 1, ncols = 4, figsize = (26,6))
for i in range(4):
    XX = test.signal.iloc[i*DATA_BATCH_SIZE:(i+1)*DATA_BATCH_SIZE]
    sns.scatterplot(data=XX[::TRAIN_SAMPLE_RATE], ax=axes[i])
    axes[i].set_title(f'Test Batch# {i+1}')


## Open channels value count

In [None]:
# Number of rows per open_channels value over the whole training frame.
f, ax = plt.subplots(figsize=(15, 6))
sns.countplot(x="open_channels", data=train, ax=ax)

## Distributions for open channels for each batch

In [None]:
# Class counts for each 500,000-row training batch; `.iloc` keeps every panel to exactly one batch.
f, axes = plt.subplots(nrows = 2, ncols = 5, figsize = (26,12))
for i in range(10):
    batch_channels = train.open_channels.iloc[i*DATA_BATCH_SIZE:(i+1)*DATA_BATCH_SIZE]
    batch_ax = axes[i // 5, i % 5]
    sns.countplot(x=batch_channels, ax=batch_ax)
    batch_ax.set_title(f'Train Batch# {i+1}')

## Feature selection

In [None]:
# Report the shapes of both frames before any feature is removed.
print(f'Original sizes: train: {train.shape}, test: {test.shape}' )

### Remove highly correlated features

In [None]:
%%time

## this may take a lot of time
CORR_THRESHOLD = 0.9

# Spearman correlation on every 20th row; subsample first so only the small frame is copied.
corr = train.iloc[::20][ALL_FEATURES].corr('spearman')
corr_values = corr.values

# Greedy filter: walk the features in order and drop every later feature whose
# correlation with a *kept* feature reaches the threshold. A feature that has
# already been dropped must not eliminate others, otherwise a feature can be
# removed although nothing that is kept correlates with it.
columns = np.full((corr.shape[0],), True, dtype=bool)
for i in range(corr.shape[0]):
    if not columns[i]:
        continue
    for j in range(i+1, corr.shape[0]):
        if columns[j] and corr_values[i, j] >= CORR_THRESHOLD:
            columns[j] = False

selected_columns = corr.columns[columns]
print(len(selected_columns))

### remove all low-variance features

In [None]:
%%time

from sklearn.feature_selection import VarianceThreshold

N_TOP_FEATURES = 25

# VarianceThreshold is fitted only to obtain the per-feature variances_;
# the ranking below does the actual selection.
vt = VarianceThreshold(0.5)
y = train['open_channels'].values
X = train[selected_columns].values
vt.fit(X, y)

## let's take top 25 (or every feature when fewer than 25 survived the correlation filter)
n_top = min(N_TOP_FEATURES, len(selected_columns))
kth = len(vt.variances_) - n_top
# argpartition puts the n_top largest variances at positions kth.. (in no particular order).
top_idx = np.argpartition(vt.variances_, kth)[kth:]
SELECTED_FEATURES = [selected_columns[i] for i in top_idx]
print(SELECTED_FEATURES)

## LGB Model

In [None]:
## restrict the training matrix to the selected feature columns (all rows are kept)
X_train = train[SELECTED_FEATURES]
y_train = train['open_channels'].values

print(f'Original sizes: train: {train.shape}, test: {test.shape}' )
print(f'Reduced train sizes: X_train: {X_train.shape}, y_train: {y_train.shape}' )

In [None]:
# Thanks to https://www.kaggle.com/siavrez/simple-eda-model
def MacroF1Metric(preds, dtrain):
    """Custom LightGBM evaluation function: macro-averaged F1.

    ``preds`` are clipped to [0, 10] and rounded to the nearest integer before
    being compared with the labels of ``dtrain``.

    Returns the tuple ``(metric_name, value, is_higher_better)``.
    """
    y_true = dtrain.get_label()
    clipped = np.clip(preds, 0, 10)
    y_hat = np.round(clipped).astype(int)
    macro_f1 = f1_score(y_true, y_hat, average = 'macro')
    return ('MacroF1Metric', macro_f1, True)

In [None]:
## started from here:
## https://www.kaggle.com/vbmokin/ion-switching-advanced-fe-lgb-xgb-confmatrix

NUM_BOOST_ROUND = 2000 
EARLY_STOPPING_ROUNDS = 40
VERBOSE_EVAL = 100
RANDOM_SEED = 13
LEARNING_RATE = 0.02
MAX_DEPTH = -1
NUM_LEAVES = 200


# Random 70/30 split of the rows into a training and a validation part.
X_train1, X_valid1, y_train1, y_valid1 = train_test_split(X_train, y_train, test_size=0.3, random_state=RANDOM_SEED)

params = {
    # The model is a regressor; its output is rounded to a class afterwards.
    # 'regression' is LightGBM's default objective, spelled out for clarity.
    'objective': 'regression',
    'learning_rate': LEARNING_RATE, 
    'max_depth': MAX_DEPTH, 
    'num_leaves': NUM_LEAVES,
    # 'logloss' is not a LightGBM metric name. The string 'None' disables the
    # built-in metrics, so only the custom MacroF1Metric is evaluated and
    # drives early stopping.
    'metric': 'None', 
    'random_state': RANDOM_SEED, 
    'n_jobs':-1, 
    # NOTE(review): the former 'sample_fraction': 0.33 entry is not a LightGBM
    # parameter and had no effect. To subsample rows, set 'bagging_fraction'
    # together with 'bagging_freq' > 0 -- this changes the trained model.
    }

# NOTE(review): `verbose_eval`, `evals_result` and `early_stopping_rounds` are
# keyword arguments of lgb.train() only in LightGBM < 4.0; later releases expect
# the equivalent callbacks. Confirm the installed version.
evals_result = {}
model = lgb.train(
    params, 
    train_set=lgb.Dataset(X_train1, y_train1), 
    num_boost_round = NUM_BOOST_ROUND,
    valid_sets = lgb.Dataset(X_valid1, y_valid1), 
    verbose_eval = VERBOSE_EVAL,
    evals_result = evals_result,
    early_stopping_rounds = EARLY_STOPPING_ROUNDS, 
    feval = MacroF1Metric)

gc.collect()

In [None]:
# Plot the custom MacroF1Metric values recorded in evals_result during training.
f, ax1 = plt.subplots(nrows = 1, ncols = 1, figsize=(15, 6))
lgb.plot_metric(evals_result, metric='MacroF1Metric', ax=ax1)
plt.show()

## LGB feature importance

In [None]:
# Bar chart of the feature importances of the trained booster.
fig =  plt.figure(figsize = (15,15))
axes = fig.add_subplot(111)
lgb.plot_importance(model,ax = axes,height = 0.5)
plt.show();plt.close()

## LGB Confusion Matrix

In [None]:
# Score the model on the full X_train (training and validation rows together),
# using the iteration chosen by early stopping.
y_pred_train_lgb = model.predict(X_train, num_iteration=model.best_iteration)
y_pred_train_classes = np.round(np.clip(y_pred_train_lgb, 0, 10)).astype(int)
train_macro_f1 = f1_score(y_train, y_pred_train_classes, average="macro")
print('LGB score {0:.4f}'.format(train_macro_f1))
gc.collect()

In [None]:
# Convert the raw regression outputs to classes exactly as for the score above
# (clip to [0, 10], round to nearest) so the matrix and the F1 value agree.
plot_cm(y_train, np.round(np.clip(y_pred_train_lgb, 0, 10)).astype(int), 'LGB Confusion Matrix')

## Submission

In [None]:
# Predict on the test features with the early-stopped iteration, then clip to [0, 10]
# and round to the nearest integer to obtain class predictions.
y_hat = model.predict(test[SELECTED_FEATURES], num_iteration=model.best_iteration)
y_pred = np.round(np.clip(y_hat,0,10)).astype(int)

## Plot test signal with predictions

In [None]:
# Overlay every 1000th test signal value and its predicted class on one axis.
f, ax1 = plt.subplots(nrows = 1, ncols = 1, figsize = (15,6))
plot_index = test.index[::1000]
sns.scatterplot(x=plot_index, y=test.signal[::1000], ax=ax1)
sns.scatterplot(x=plot_index, y=y_pred[::1000], ax=ax1 )
ax1.set_title('Full test signal with predictions')

In [None]:
# Fill the sample submission with the predicted classes and write it to the working directory.
# NOTE(review): assumes the rows of `test` (and therefore y_pred) are in the same order as the sample submission -- confirm.
sub = pd.read_csv('../input/liverpool-ion-switching/sample_submission.csv')
sub['open_channels'] = y_pred
# float_format applies to the float columns that are written.
sub.to_csv('submission.csv', index=False, float_format='%.4f')

sub.head(20)