In [None]:
import bisect
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import accuracy_score, f1_score

plt.style.use('dark_background')

## Load Data

In [None]:
# All datasets are read with the same column dtypes; declaring them once keeps
# the eight read calls consistent (32-bit columns instead of pandas' defaults).
TEST_DTYPES = {'time': np.float32, 'signal': np.float32}
TRAIN_DTYPES = {**TEST_DTYPES, 'open_channels': np.int32}

# Root directory that holds every input dataset used by this notebook.
INPUT_DIR = '/kaggle/input'


def load_csv(dataset: str, filename: str, dtypes: dict) -> pd.DataFrame:
    """Read ``INPUT_DIR/<dataset>/<filename>`` using the given column dtypes."""
    return pd.read_csv(f'{INPUT_DIR}/{dataset}/{filename}', dtype=dtypes)


# Original competition data.
train = load_csv('liverpool-ion-switching', 'train.csv', TRAIN_DTYPES)
test  = load_csv('liverpool-ion-switching', 'test.csv', TEST_DTYPES)

# "clean-kalman" dataset.
train_clean_kalman = load_csv('clean-kalman', 'train_clean_kalman.csv', TRAIN_DTYPES)
test_clean_kalman  = load_csv('clean-kalman', 'test_clean_kalman.csv', TEST_DTYPES)

# "data-without-drift" dataset.
train_clean = load_csv('data-without-drift', 'train_clean.csv', TRAIN_DTYPES)
test_clean  = load_csv('data-without-drift', 'test_clean.csv', TEST_DTYPES)

# "remove-drift-using-a-sine-function" dataset.
train_sin = load_csv('remove-drift-using-a-sine-function', 'train_wo_drift.csv', TRAIN_DTYPES)
test_sin  = load_csv('remove-drift-using-a-sine-function', 'test_wo_drift.csv', TEST_DTYPES)

## Helper Functions

In [None]:
def plot_open_channels_signal(df: pd.DataFrame, vline=()):
    """Scatter ``open_channels`` against ``signal`` with optional reference lines.

    Parameters
    ----------
    df : pd.DataFrame
        Frame providing the ``signal`` and ``open_channels`` columns.
    vline : iterable of float, optional
        Signal values at which a vertical line is drawn. Defaults to no lines.
        The default is an immutable tuple (instead of a shared list) and
        ``None`` is accepted as "no lines".
    """
    if vline is None:
        vline = ()

    fig, ax = plt.subplots(1, 1, figsize=(10, 10))

    ax.plot(df.signal, df.open_channels, '.', color='fuchsia', alpha=0.25)
    for level in vline:
        ax.axvline(level, alpha=0.75, color='tomato')
    ax.set_xlabel('Signal')
    ax.set_ylabel('Open Channels')
    plt.show()
    
    
def plot_data(df: pd.DataFrame):
    """Plot ``signal`` over ``time``, plus ``open_channels`` when it is present.

    Frames that contain an ``open_channels`` column get a two-panel figure
    (signal on top, open channels below); other frames get a single signal
    panel.
    """
    labelled = 'open_channels' in df.columns

    if labelled:
        fig, (signal_ax, channels_ax) = plt.subplots(2, 1, figsize=(24, 16))
    else:
        fig, signal_ax = plt.subplots(1, 1, figsize=(24, 8))

    # Upper (or only) panel: the signal itself.
    signal_ax.plot(df.time, df.signal, color='royalblue', alpha=0.75)
    signal_ax.set_xlabel('time')
    signal_ax.set_ylabel('Signal')

    # Lower panel: the target, only drawn when the frame carries it.
    if labelled:
        channels_ax.plot(df.time, df.open_channels, color='royalblue', alpha=0.75)
        channels_ax.set_xlabel('time')
        channels_ax.set_ylabel('Open Channels')

    plt.show()
    
    
def plot_data_emission(df: pd.DataFrame, timestamps: list = (), roi=None):
    """Plot ``signal`` over ``time`` and highlight the given time windows.

    Parameters
    ----------
    df : pd.DataFrame
        Frame providing the ``time`` and ``signal`` columns.
    timestamps : iterable of (start, end) pairs, optional
        Each pair is shaded as a vertical span. Defaults to no spans; the
        default is an immutable tuple rather than a shared list, and ``None``
        is accepted as "no spans".
    roi : (left, right), optional
        When given, the x axis is limited to this range.
    """
    if timestamps is None:
        timestamps = ()

    fig, ax = plt.subplots(1, 1, figsize=(24, 8))

    ax.plot(df.time, df.signal, color='royalblue', alpha=0.75)
    for start, end in timestamps:
        ax.axvspan(start, end, color='lime', alpha=0.5)
    ax.set_xlabel('time')
    ax.set_ylabel('Signal')
    if roi is not None:
        # Set the limits on the axes created above rather than on pyplot's
        # implicit "current axes".
        ax.set_xlim(roi)
    plt.show()
    
    
def distplot(series: pd.Series, bound: float = None):
    """Draw a histogram with a KDE curve for ``series``, optionally marking ``bound``.

    Parameters
    ----------
    series : pd.Series
        Values whose distribution is drawn.
    bound : float, optional
        When not ``None``, a vertical line is drawn at this value.
    """
    fig, ax = plt.subplots(1, 1, figsize=(10, 10))

    if hasattr(sns, 'histplot'):
        # `sns.distplot` is deprecated; this is the replacement call seaborn
        # suggests (density-normalised histogram + KDE). Default binning may
        # differ slightly from distplot's.
        sns.histplot(series, kde=True, stat='density', kde_kws={'cut': 3},
                     ax=ax, color='darkorange')
    else:
        # Older seaborn releases without `histplot`.
        sns.distplot(series, ax=ax, color='darkorange')
    if bound is not None:
        ax.axvline(bound)
    plt.show()
    
    
def shiftplot(df:pd.DataFrame, open_channel:int, level:float):
    """Plot the signal and highlight one channel's samples split at ``level``.

    The whole ``signal`` column is drawn as a line; samples whose
    ``open_channels`` equals ``open_channel`` are overlaid as dots, coloured by
    whether their signal is below ``level`` ("left part") or at/above it
    ("right part").

    Parameters
    ----------
    df : pd.DataFrame
        Frame providing the ``signal`` and ``open_channels`` columns.
    open_channel : int
        Channel count whose samples are highlighted.
    level : float
        Signal value separating the "left" and "right" parts.
    """
    fig, ax = plt.subplots(1, 1, figsize=(24, 7))

    ax.plot(df.signal, color='royalblue')

    # Build the channel mask once and reuse it for both halves.
    in_channel = df.open_channels == open_channel
    left_part = df.signal[in_channel & (df.signal < level)]
    right_part = df.signal[in_channel & (df.signal >= level)]

    ax.plot(left_part, '.', color='cyan', alpha=0.75, label='left part')
    ax.plot(right_part, '.', color='deeppink', alpha=0.25, label='right part')
    # The series are plotted against the DataFrame index (not df.time), so the
    # axis is labelled accordingly.
    ax.set_xlabel('Sample index')
    ax.set_ylabel('Signal')
    ax.legend()
    plt.show()

## Initial Data

In [None]:
plot_open_channels_signal(train)

In [None]:
plot_data(train)

In [None]:
plot_data(test)

## Data without drift

#### Data without drift by [Chris Deotte](https://www.kaggle.com/cdeotte)

In [None]:
plot_open_channels_signal(train_clean)

In [None]:
plot_data(train_clean)

In [None]:
plot_data(test_clean)

#### Data without drift by [Eunho Lee](https://www.kaggle.com/eunholee)

In [None]:
plot_open_channels_signal(train_sin)

In [None]:
plot_data(train_sin)

In [None]:
plot_data(test_sin)

## Data without drift with Kalman Filter postprocessing by [ragnar](https://www.kaggle.com/ragnar123)

In [None]:
plot_open_channels_signal(train_clean_kalman)

In [None]:
plot_data(train_clean_kalman)

In [None]:
plot_data(test_clean_kalman)

## Outliers 

The training data contains two obvious outlier segments. The simplest way to deal with them is to remove them from the training set.

#### First Outliers

In [None]:
plot_data_emission(train_clean_kalman, roi=[47.5, 48])

In [None]:
# (start, end) of the first outlier window, in the units of the `time` column.
FIRST_EMISSION = (47.857, 47.863)
plot_data_emission(train_clean_kalman, timestamps=[FIRST_EMISSION], roi=[47.5, 48])

#### Second Outliers

In [None]:
plot_data_emission(train_clean_kalman, roi=[360, 390])

In [None]:
# (start, end) of the second outlier window, in the units of the `time` column.
SECOND_EMISSION = (364.229, 382.343)
plot_data_emission(train_clean_kalman, timestamps=[SECOND_EMISSION], roi=[360, 390])

In [None]:
plot_data_emission(train_clean_kalman, [FIRST_EMISSION, SECOND_EMISSION])

In [None]:
def drop_time_windows(df: pd.DataFrame, windows) -> pd.DataFrame:
    """Return a copy of ``df`` without the rows that fall inside any time window.

    Parameters
    ----------
    df : pd.DataFrame
        Frame with a ``time`` column.
    windows : iterable of (start, end) pairs
        Rows with ``start <= time <= end`` are removed for every pair.

    Returns
    -------
    pd.DataFrame
        An independent copy (original index labels are kept), so later in-place
        edits do not operate on a slice of ``df`` and do not trigger
        ``SettingWithCopyWarning``.
    """
    keep = pd.Series(True, index=df.index)
    for start, end in windows:
        keep &= (df.time < start) | (df.time > end)
    return df.loc[keep, :].copy()


# Both outlier windows are removed from every training variant.
EMISSIONS = [FIRST_EMISSION, SECOND_EMISSION]

train_ckwe = train_clean_kalman_without_emission = drop_time_windows(train_clean_kalman, EMISSIONS)

train_cwe = train_clean_without_emission = drop_time_windows(train_clean, EMISSIONS)

train_swe = train_sin_without_emission = drop_time_windows(train_sin, EMISSIONS)

In [None]:
plot_data_emission(train_ckwe, [FIRST_EMISSION, SECOND_EMISSION])

In [None]:
plot_open_channels_signal(train_swe)

## "Ghost" drift
The chart above shows an interesting pattern: for open channels 0 to 5 the signal values form two clusters that mirror each other around a central point. This may indicate that the values in some segments of the signal are shifted.

In [None]:
# One signal value per open-channel count (list position = open_channels value).
# Each value is drawn as a vertical line here and is used further down as the
# split point between the "left" and "right" parts of that channel's samples.
levels = [-4., -2.95, -1.7, -0.45, 0.98, 2.25]

plot_open_channels_signal(train_swe, levels)

In [None]:
# Build the mask from the same frame that is being indexed; a mask taken from
# another frame only works while both indexes happen to be identical.
distplot(train_ckwe.signal[train_ckwe.open_channels==0], -4.)

In [None]:
# Build the mask from the same frame that is being indexed; a mask taken from
# another frame only works while both indexes happen to be identical.
distplot(train_ckwe.signal[train_ckwe.open_channels==1], -2.95)

In [None]:
# Build the mask from the same frame that is being indexed; a mask taken from
# another frame only works while both indexes happen to be identical.
distplot(train_ckwe.signal[train_ckwe.open_channels==2], -1.7)

In [None]:
# Build the mask from the same frame that is being indexed; a mask taken from
# another frame only works while both indexes happen to be identical.
distplot(train_ckwe.signal[train_ckwe.open_channels==3], -0.45)

In [None]:
# Build the mask from the same frame that is being indexed; a mask taken from
# another frame only works while both indexes happen to be identical.
distplot(train_ckwe.signal[train_ckwe.open_channels==4], 0.98)

In [None]:
# Build the mask from the same frame that is being indexed; a mask taken from
# another frame only works while both indexes happen to be identical.
distplot(train_ckwe.signal[train_ckwe.open_channels==5], 2.25)

In [None]:
shiftplot(train_ckwe, 0, -4.)

In [None]:
shiftplot(train_ckwe, 1, -2.95)

In [None]:
shiftplot(train_ckwe, 2, -1.7)

In [None]:
shiftplot(train_ckwe, 3, -.45)

In [None]:
shiftplot(train_ckwe, 4, .98)

In [None]:
shiftplot(train_ckwe, 5, 2.25)

In [None]:
# "Right" part: for each entry of `levels` (position = open_channels value),
# the samples of that channel whose signal is at or above the level.
rights_signal = [
    train_ckwe.signal[(train_ckwe.open_channels == channel) & (train_ckwe.signal >= bound)]
    for channel, bound in enumerate(levels)
]
# Mean signal of each right part, in the same channel order.
rights_mean = [part.mean() for part in rights_signal]

In [None]:
# "Left" part: for each entry of `levels` (position = open_channels value),
# the samples of that channel whose signal is strictly below the level.
left_signal = [
    train_ckwe.signal[(train_ckwe.open_channels == channel) & (train_ckwe.signal < bound)]
    for channel, bound in enumerate(levels)
]
# Mean signal of each left part, in the same channel order.
left_mean = [part.mean() for part in left_signal]

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10,10))

# y coordinate of every curve: the open-channel counts 0..5.
channel_ids = np.arange(0, 6)

# Split levels, drawn as a line plus star markers.
ax.plot(levels, channel_ids, color='darkorange', label='bound')
ax.plot(levels, channel_ids, '*', color='darkorange')

# For each side: the per-channel means (line + stars), then the raw samples
# as faint dots on the row of their channel.
for side_means, side_parts, colour, caption in (
    (left_mean, left_signal, 'crimson', 'left signal part'),
    (rights_mean, rights_signal, 'indigo', 'right signal part'),
):
    ax.plot(side_means, channel_ids, color=colour, label=caption)
    ax.plot(side_means, channel_ids, '*', color=colour)
    for row, values in enumerate(side_parts):
        ax.plot(values, np.ones_like(values)*row, '.', color=colour, alpha=0.01)

ax.set_xlabel('Signal')
ax.set_ylabel('Open Channels')
plt.legend();

In [None]:
# Squared gap between the left-part mean and the right-part mean, per channel.
dist = [
    (left_value - right_value) ** 2
    for left_value, right_value in zip(left_mean, rights_mean)
]

In [None]:
fig, ax = plt.subplots(1, 1, figsize=(10,10))

channel_ids = np.arange(0, 6)
ax.plot(dist, channel_ids, color='darkgreen')
ax.plot(dist, channel_ids, '*', color='darkgreen')

# Reference lines: mean over all channels, mean without channel 0, and e**2.
mean_all = np.mean(dist[:])
mean_without_first = np.mean(dist[1:])
for position, colour, caption in (
    (mean_all, 'lime', f'{mean_all}'),
    (mean_without_first, 'tomato', f'{mean_without_first}'),
    (np.exp(2), 'yellow', 'e**2'),
):
    ax.axvline(position, color=colour, label=caption)

ax.set_xlabel('(LeftSignalMean - RightSignalMean)**2')
ax.set_ylabel('Open Channels')
plt.legend();

In [None]:
# Constant offset (e = np.exp(1)) that later cells add to selected row ranges of `signal`.
# NOTE(review): presumably chosen as the square root of the e**2 reference line
# drawn on the squared-gap chart above — confirm.
# The name is misspelled ("SGNAL") but is referenced as-is by later cells.
SGNAL_SHIFT_CONSTANT = np.exp(1)
# Bare expression so the notebook displays the value.
SGNAL_SHIFT_CONSTANT

## Let's remove "Ghost" drift

In [None]:
# Row ranges (index labels) that receive the shift, written as half-open
# [start, stop) intervals; `None` means "to the end of the frame".
# NOTE(review): assumes the shifted segments end just before labels 2_500_000 —
# confirm against the batch boundaries of the data.
GHOST_DRIFT_TRAIN_ROWS = [(2000000, 2500000), (4500000, None)]

# Cast the offset to the column's float32 dtype so the addition does not upcast
# `signal` to float64.
train_shift = np.float32(SGNAL_SHIFT_CONSTANT)

# NOTE: this cell mutates the frames in place; re-running it applies the shift again.
for frame in (train_ckwe, train_cwe, train_swe):
    for start, stop in GHOST_DRIFT_TRAIN_ROWS:
        # `.loc` label slices include BOTH endpoints, so stop at `stop - 1`;
        # slicing up to `stop` would also shift the first row after the segment.
        last = None if stop is None else stop - 1
        frame.loc[start:last, 'signal'] += train_shift

In [None]:
plot_data(train_cwe)

In [None]:
plot_open_channels_signal(train_ckwe)

In [None]:
distplot(train_ckwe.signal[train_ckwe.open_channels==0])

In [None]:
distplot(train_ckwe.signal[train_ckwe.open_channels==1],)

In [None]:
distplot(train_ckwe.signal[train_ckwe.open_channels==2])

In [None]:
distplot(train_ckwe.signal[train_ckwe.open_channels==3])

In [None]:
distplot(train_ckwe.signal[train_ckwe.open_channels==4])

In [None]:
distplot(train_ckwe.signal[train_ckwe.open_channels==5])

In [None]:
shiftplot(train_ckwe, 0, -4.)

In [None]:
shiftplot(train_ckwe, 1, -2.95)

In [None]:
shiftplot(train_ckwe, 2, -1.7)

In [None]:
shiftplot(train_ckwe, 3, -.45)

In [None]:
shiftplot(train_ckwe, 4, .98)

In [None]:
shiftplot(train_ckwe, 5, 2.25)

In [None]:
# Row ranges (index labels) that receive the shift, written as half-open
# [start, stop) intervals.
# NOTE(review): assumes the shifted segments end just before labels 600_000 and
# 800_000 — confirm against the batch boundaries of the data.
GHOST_DRIFT_TEST_ROWS = [(500000, 600000), (700000, 800000)]

# Cast the offset to the column's float32 dtype so the addition does not upcast
# `signal` to float64.
test_shift = np.float32(SGNAL_SHIFT_CONSTANT)

# NOTE: this cell mutates the frames in place; re-running it applies the shift again.
for frame in (test_clean_kalman, test_clean, test_sin):
    for start, stop in GHOST_DRIFT_TEST_ROWS:
        # `.loc` label slices include BOTH endpoints, so stop at `stop - 1`;
        # slicing up to `stop` would also shift the first row after the segment.
        frame.loc[start:stop - 1, 'signal'] += test_shift

In [None]:
plot_data(test_clean_kalman)

## Save results

In [None]:
# Output file (relative to the working directory) -> frame written to it.
# NOTE(review): `to_csv` is called with its defaults, so each frame's index is
# written as an unnamed first column — confirm that consumers expect it.
outputs = {
    'train_clean_kalman.csv': train_ckwe,
    'test_clean_kalman.csv': test_clean_kalman,
    'train_clean.csv': train_cwe,
    'test_clean.csv': test_clean,
    'train_sin.csv': train_swe,
    'test_sin.csv': test_sin,
}
for out_name, out_frame in outputs.items():
    out_frame.to_csv(out_name)

## Boundary Classifier

In [None]:
# Signal thresholds between consecutive open-channel classes.
# The list has 11 entries so the loop below visits classes 0..10; the last
# entry repeats 8.95 and `boundery_classifier` only reads the first 10.
BOUNDS = [-2.1, -0.9, 0.3, 1.5, 2.78, 3.99, 5.25, 6.5, 7.7, 8.95, 8.95]

fig, ax = plt.subplots(1, 1, figsize=(10, 10))

for channel, bound in enumerate(BOUNDS):
    channel_signal = train_ckwe.signal[train_ckwe.open_channels == channel]
    if hasattr(sns, 'histplot'):
        # `sns.distplot` is deprecated; this is the replacement call seaborn
        # suggests (density-normalised histogram + KDE).
        sns.histplot(channel_signal, kde=True, stat='density', kde_kws={'cut': 3},
                     ax=ax, color='darkorange')
    else:
        # Older seaborn releases without `histplot`.
        sns.distplot(channel_signal, ax=ax, color='darkorange')
    # Every entry of BOUNDS is a number, so the line is always drawn.
    ax.axvline(bound)
plt.show()

In [None]:
def boundery_classifier(x, bounds=None):
    """Map a signal value to a class by counting the thresholds it has reached.

    Returns the position of the first threshold that is strictly greater than
    ``x``; when no threshold is greater, returns the number of thresholds.

    Parameters
    ----------
    x : float
        Signal value to classify.
    bounds : sorted sequence of float, optional
        Ascending thresholds. When omitted, the first 10 entries of the
        module-level ``BOUNDS`` are used (same thresholds as before this
        parameter existed).

    Returns
    -------
    int
        Class index in ``0 .. len(bounds)`` (``0 .. 10`` for the default).
    """
    if bounds is None:
        # Restrict the search to BOUNDS[0:10] without copying the list.
        return bisect.bisect_right(BOUNDS, x, 0, 10)
    # bisect_right == index of the first threshold strictly greater than x.
    return bisect.bisect_right(bounds, x)

In [None]:
# Vectorised form of `train_ckwe.signal.apply(boundery_classifier)`: the class
# is the number of thresholds (first 10 entries of BOUNDS) that are <= signal,
# which `searchsorted(..., side='right')` computes for the whole column at once
# instead of calling a Python function per row.
train_ckwe['pred'] = np.searchsorted(BOUNDS[:10], train_ckwe.signal.to_numpy(), side='right')

In [None]:
# `accuracy_score` / `f1_score` are imported in the notebook's import cell.
# Extract both label arrays once and reuse them for every metric.
y_true = train_ckwe['open_channels'].to_numpy()
y_pred = train_ckwe['pred'].to_numpy()

print("Accuracy =", accuracy_score(y_true=y_true, y_pred=y_pred))
print("F1 macro =", f1_score(y_true=y_true, y_pred=y_pred, average='macro'))

In [None]:
# Same absolute input location as the other reads in this notebook.
df_subm = pd.read_csv('/kaggle/input/liverpool-ion-switching/sample_submission.csv')

# Predictions are assigned by position, so both frames must have the same length.
assert len(df_subm) == len(test_clean_kalman), 'submission and test rows must match one-to-one'

# Vectorised form of `test_clean_kalman.signal.apply(boundery_classifier)`:
# number of thresholds (first 10 entries of BOUNDS) that are <= signal.
df_subm['open_channels'] = np.searchsorted(BOUNDS[:10], test_clean_kalman.signal.to_numpy(), side='right')
df_subm.to_csv("boundary.csv", float_format='%.4f', index=False)

## Conclusion
In this notebook I detected and removed the outliers and the "Ghost" drift. I hope this work helps you build a more accurate model.

## Reference
* [Clean Removal of Data Drift](https://www.kaggle.com/eunholee/remove-drift-using-a-sine-function)
* [remove drift using a sine function](https://www.kaggle.com/eunholee/remove-drift-using-a-sine-function)
* [A signal processing approach - Kalman Filtering](https://www.kaggle.com/teejmahal20/a-signal-processing-approach-kalman-filtering?scriptVersionId=30696729)
* [Data Without Drift](https://www.kaggle.com/cdeotte/data-without-drift)
* [clean_kalman](https://www.kaggle.com/ragnar123/clean-kalman)