In [None]:
# import os
# for dirname, _, filenames in os.walk('/kaggle/input'):
#     for filename in filenames:
#         print(os.path.join(dirname, filename))

## Loading libraries

In [None]:
%%time

import sys
!cp ../input/rapids/rapids.0.12.0 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
sys.path = ["/opt/conda/envs/rapids/lib/python3.6/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.6"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path 
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [None]:
import numpy as np
import pandas as pd
import pickle
from datetime import datetime
import pytz
import feather
import math
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline 
from pathlib import Path
from sklearn.preprocessing import PolynomialFeatures
from hmmlearn.hmm import GaussianHMM
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.metrics import f1_score, confusion_matrix, classification_report
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis, LinearDiscriminantAnalysis
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
import lightgbm as lgb
from tqdm.notebook import tqdm
from scipy.stats import mode
from sklearn.metrics import accuracy_score
# from cuml.neighbors import KNeighborsClassifier, NearestNeighbors
# import cuml; cuml.__version__

In [None]:
from cuml.neighbors import KNeighborsClassifier, NearestNeighbors
import cuml; cuml.__version__

In [None]:
%%time

# ---- Model configuration -------------------------------------------------
NAME = 'KNN'

# Number of negative / positive signal shifts used as features.
NMINUS, NPLUS = 3, 3

# One weight per feature column, in the order:
# original signal, negative shifts, positive shifts.
weights = [4, 1, 0.5, 0.25, 1, 0.5, 0.25]

NFOLDS, RS = 5, 42

PATH = Path('/kaggle/input/is-eda-sine-50hz-exp')

print(f'Loading data for model {NAME}\n')

# ---- Data ----------------------------------------------------------------
train, test = (
    feather.read_dataframe(PATH/file_name)
    for file_name in ('train.feather', 'test.feather')
)

# Pre-computed fold indices, looked up by fold number in the training loop.
# NOTE: unpickling runs arbitrary code; only load files from a trusted source.
folds_train = pickle.loads((PATH/'folds_train.pickle').read_bytes())
folds_val = pickle.loads((PATH/'folds_val.pickle').read_bytes())

In [None]:
# Sorted array of the distinct values in the 'model' column of train.
model_names = train['model'].unique()
models = np.sort(model_names)
models

In [None]:
# Feature list: the drift-free signal, then its NMINUS negative shifts,
# then its NPLUS positive shifts (same order as `weights`).
cols = (
    ['signal_no_drift']
    + [f'signal_shift_-{k}' for k in range(1, NMINUS + 1)]
    + [f'signal_shift_+{k}' for k in range(1, NPLUS + 1)]
)
target = ['open_channels']

print(f"The list of features included in the {NAME} model:\n")
print(cols)

In [None]:
# Keep only the columns used downstream. `.copy()` turns the selections into
# independent frames, so the later in-place column assignments (feature
# weighting, adding prediction columns) do not write into a slice of the
# loaded frames and do not raise SettingWithCopyWarning.
train=train[cols+target+['model', 'batch', 'time']].copy()
test=test[cols+['model', 'batch', 'segment', 'time']].copy()

In [None]:
# (rows, columns) of the reduced train and test frames, one per line.
print(train.shape, test.shape, sep='\n')

## Setting things up for training

In [None]:
# Column labels 'class_0' ... 'class_10', one per open-channel count.
classes = np.array([f'class_{k}' for k in range(11)])

# Out-of-fold class probabilities and hard predictions for train,
# and accumulated class probabilities for test; all start at zero.
oof = pd.DataFrame(0.0, index=train.index, columns=classes)
oof_preds = np.zeros(train.shape[0])
preds_proba = pd.DataFrame(0.0, index=test.index, columns=classes)

# Macro F1 score of every fold, appended by the training loop.
f1_folds = list()

Multiply the original and shifted signal columns by their weights. 

In [None]:
# Every feature column needs exactly one weight: zip() stops at the shorter
# sequence, so a mismatch would silently leave some columns unscaled.
assert len(cols) == len(weights), (
    f'{len(cols)} feature columns but {len(weights)} weights'
)

# Scale each feature in place. Run this cell once per kernel session:
# running it again multiplies the columns by the weights a second time.
for c, w in zip(cols, weights):
    train[c]=w*train[c]
    test[c]=w*test[c]

## Training a KNN model

In [None]:
%%time

KNN=100       # number of nearest neighbours that vote for each sample
batch = 1024  # rows processed at a time when counting neighbour votes


def _neighbor_vote_proba(indices, labels, class_values, n_neighbors, batch_size):
    """Convert nearest-neighbour indices into per-class vote fractions.

    Parameters
    ----------
    indices : array of shape (n_samples, n_neighbors)
        Row positions (into `labels`) of each sample's neighbours.
    labels : 1-D array
        Class label of every training row the neighbours were drawn from.
    class_values : 1-D array
        Sorted distinct class labels; column k of the result belongs to
        class_values[k].
    n_neighbors : int
        Number of neighbours per sample (the denominator of the fractions).
    batch_size : int
        Number of samples handled per chunk, to bound temporary memory.

    Returns
    -------
    numpy.ndarray of shape (n_samples, len(class_values)), dtype float32
        Fraction of each sample's neighbours that carry each class label.
    """
    n_rows = indices.shape[0]
    proba = np.zeros((n_rows, len(class_values)), dtype=np.float32)
    for start in range(0, n_rows, batch_size):
        stop = min(start + batch_size, n_rows)
        neighbor_labels = labels[indices[start:stop].astype(int)]
        for col, cls in enumerate(class_values):
            proba[start:stop, col] = np.sum(neighbor_labels == cls, axis=1) / n_neighbors
    return proba


# Start from a clean slate so that re-running this cell does not keep
# accumulating test probabilities or fold scores from a previous run.
preds_proba.loc[:, :] = 0.0
f1_folds.clear()

for fold_num in range(1, NFOLDS+1):

    print('-'*50)
    print(f'Fold {fold_num}:')

    train_index=folds_train[fold_num]
    val_index=folds_val[fold_num]

    # NOTE: positional (.iloc) and label-based (.loc) lookups are mixed here,
    # which is only consistent while `train` has a default 0..n-1 index.
    X_train, Y_train = train.iloc[train_index, :], train.loc[train_index, target]
    X_val, Y_val = train.iloc[val_index, :], train.loc[val_index, target]

    # A separate nearest-neighbour index is fitted for every model group.
    for m in models:

        mask_model_train=(X_train['model']==m)
        mask_model_val=(X_val['model']==m)
        mask_model_test=(test['model']==m)

        X_mod=X_train.loc[mask_model_train, cols].values
        Y_mod=Y_train[mask_model_train].values.reshape(-1,)

        X_val_mod=X_val.loc[mask_model_val, cols].values
        Y_val_mod=Y_val[mask_model_val].values.reshape(-1,)

        X_test=test.loc[mask_model_test, cols].copy()

        # Distinct labels present for this model in this fold. Probability
        # column k corresponds to class_values[k], so groups whose labels do
        # not start at 0 (previously special-cased for 'M4') or are not
        # contiguous are mapped correctly without a hard-coded offset.
        class_values=np.unique(Y_mod)

        clf = NearestNeighbors(n_neighbors=KNN)
        clf.fit(X_mod)

        # Validation part of the fold.
        distances, indices = clf.kneighbors(X_val_mod)
        Y_val_pred = _neighbor_vote_proba(indices, Y_mod, class_values, KNN, batch)

        # Test rows of the same model group.
        distances, indices = clf.kneighbors(X_test.values)
        Y_test_pred = _neighbor_vote_proba(indices, Y_mod, class_values, KNN, batch)

        classes_mod=classes[class_values]
        oof.loc[val_index[mask_model_val], classes_mod]=Y_val_pred
        preds_proba.loc[mask_model_test, classes_mod]+=Y_test_pred

        # Compute Macro F1 score for the model:
        Y_val_pred=class_values[np.argmax(Y_val_pred, axis=1)]
        f1_model=f1_score(Y_val_mod, Y_val_pred, average='macro')
        print(f'Model {m}: done! Macro F1 score = {f1_model:.5f}')

    # Hard out-of-fold prediction = most probable of the 11 class columns.
    oof_preds[val_index]=np.argmax(oof.iloc[val_index, :].values, axis=1).astype(int).reshape(-1, )
    Y_val_OC=train.loc[val_index, 'open_channels'].values.astype(np.uint8).reshape(-1, )

    f1_fold=f1_score(Y_val_OC, oof_preds[val_index], average='macro')
    f1_folds.append(f1_fold)

    print(f'\nFold {fold_num} is done! Macro F1 score = {f1_fold:.5f}')

# Average the test probabilities over the folds and take the best class.
preds_proba/=NFOLDS
preds=np.argmax(preds_proba.values, axis=1).astype(int).reshape(-1, )

print('-'*50)
print('Summary:')

for m in models:
    print(f"\nModel {m}:")
    mask_model=train['model']==m
    y_true_model=train.loc[mask_model, 'open_channels'].values.reshape(-1,)
    f1_model=f1_score(y_true_model, oof_preds[mask_model], average='macro')
    print(classification_report(y_true_model, oof_preds[mask_model], digits=5))
    print(f'Macro F1 score for model {m}    = {f1_model:.5f}')

f1_av=np.array(f1_folds).mean()
f1_std=np.std(f1_folds)
print(f'Macro F1 score = {f1_av:.5f} (average across the folds); std = {f1_std:.5f}')

f1=f1_score(train['open_channels'].values.reshape(-1,), oof_preds, average='macro')

print(f'Macro F1 score = {f1:.5f} (out-of-folds)')

Reinstate the original values of the signals:

In [None]:
# Undo the feature weighting: divide every feature column by its weight.
for feature_name, weight in zip(cols, weights):
    train[feature_name] = train[feature_name].div(weight)
    test[feature_name] = test[feature_name].div(weight)

To get an idea of the accuracy of our results, let's print a full classification report and also take a look at the confusion matrices for the different models.

In [None]:
%%time
# Classification report over all out-of-fold predictions.
full_report = classification_report(train['open_channels'].values.ravel(), oof_preds, digits=5)
print(full_report)

In [None]:
%%time

# One confusion matrix per model: actual vs out-of-fold predicted open channels.
n_models = len(models)
# squeeze=False keeps a 2-D axes array even when there is a single model.
fig, ax = plt.subplots(n_models, 1, figsize=(10, 10*n_models), squeeze=False)
ax = ax.ravel()

for i, m in enumerate(models):
    mask=train['model']==m
    y_true_model = train.loc[mask, 'open_channels'].values
    y_pred_model = oof_preds[mask]
    cm = confusion_matrix(y_true_model, y_pred_model)
    # confusion_matrix orders rows/columns by the sorted labels that occur in
    # either array; use those labels as ticks so the axes show channel values
    # rather than matrix positions.
    tick_labels = np.union1d(y_true_model, y_pred_model).astype(int)
    # fmt='d' prints the exact integer counts instead of '1.2e+06'-style text.
    sns.heatmap(cm, annot=True, fmt='d', lw=1, ax=ax[i],
                xticklabels=tick_labels, yticklabels=tick_labels)
    ax[i].set_xlabel("Predicted open channels")
    ax[i].set_ylabel("Actual open channels")
    ax[i].set_title(f"Model {m}")
plt.tight_layout()
plt.show()

## Fixing Model 1

Model 1 data in the train set contain only two possible `open_channels` values: 0 and 1. Graphical analysis of the test set has shown that there might be an additional channel present in the Model 1 test data. Earlier, to identify channels that are not present in train, we used the Gaussian Mixture Model (GMM) algorithm. Now, wherever the GMM predicts `open_channels` greater than 1 for model M1, we overwrite our predictions with those of the GMM.

In [None]:
%%time
# Submission file with the Gaussian Mixture Model predictions.
PATH=Path('/kaggle/input/is-gmm-cv5-b7-repl-seq-folds/sub_GMM_110.csv')
preds_GMM=pd.read_csv(PATH)

# The override below pairs rows by position, so all three must have the same
# length (and are assumed to list the test rows in the same order).
assert len(preds_GMM) == len(test) == len(preds), (
    'GMM predictions, test frame and KNN predictions differ in length'
)

# Work on plain arrays so that rows are matched by position rather than by
# whatever index labels the two frames happen to carry.
gmm_channels = preds_GMM['open_channels'].values
mask_M1_GMM = (test['model'] == 'M1').values & (gmm_channels > 1)

# For model M1 rows where the GMM predicts more than one open channel,
# take the GMM prediction instead of the KNN one.
preds[mask_M1_GMM] = gmm_channels[mask_M1_GMM]

## Generating a submission file and saving oof's and predicted probabilities

In [None]:
# The sample submission provides the frame that the predictions are written into.
PATH = Path('/kaggle/input/liverpool-ion-switching/')
sub = pd.read_csv(PATH.joinpath('sample_submission.csv'))

In [None]:
# Write the final predictions into the submission and show how many rows
# fall into each predicted channel count.
sub['open_channels'] = preds
channel_counts = sub['open_channels'].value_counts()
channel_counts.sort_index()

In [None]:
# Sanity check: (rows, columns) of the submission frame.
sub.shape

In [None]:
# Timestamp (month, day, hour, minute, second in America/Chicago time) used
# to give every submission file a distinct name.
time_zone = pytz.timezone('America/Chicago')
current_datetime = datetime.now(tz=time_zone)
ts = current_datetime.strftime("%m%d%H%M%S")

# Only the submission name carries the timestamp; the out-of-fold and
# test-probability files use a fixed name per model.
sub_file_name = f'sub_{NAME}_{ts}.csv'
oof_file_name = f'oof_{NAME}.feather'
preds_file_name = f'preds_{NAME}.feather'

ts, sub_file_name, oof_file_name, preds_file_name

In [None]:
%%time

# Persist the out-of-fold class probabilities (train) and the fold-averaged
# class probabilities (test) as feather files.
oof.to_feather(oof_file_name)
preds_proba.to_feather(preds_file_name)
# Write the submission without the index; float columns are written with
# four decimal places.
# NOTE(review): presumably this targets the time column of the sample
# submission — confirm.
sub.to_csv(sub_file_name, index=False, float_format='%.4f')

## Visualizing the results

In [None]:
%%time

# Colour lookup indexed by open-channel count: black for 0, followed by the
# colours of seaborn's current palette.
palette = [(0, 0, 0), *sns.color_palette()]

# Show the palette with one tick per channel count 0..10.
sns.palplot(palette)
ticks = np.arange(11)
plt.xlabel('Open channels', fontsize=15)
plt.xticks(ticks, ticks, fontsize=12)
plt.show()

In [None]:
def plot_signal_vs_shifted_one(df, mod, target='open_channels', batch=None, segment=None, 
                               col1='signal_no_drift', col2='signal_shift_-1', s=0.05, mk_scale=60,
                               low=math.floor(train['signal_no_drift'].min()),
                               high=math.ceil(train['signal_no_drift'].max())):
    """Scatter `col1` against `col2` for one model on the current axes.

    Rows are restricted to `df['model'] == mod` and, when given, to one
    batch and/or one segment. If `target` is a column of `df`, every distinct
    target value is drawn in its own colour (looked up in the global
    `palette`) and a legend is added; otherwise all points share one colour.

    Parameters
    ----------
    df : DataFrame with 'model', 'batch', `col1`, `col2` columns and,
        optionally, 'segment' and `target` columns.
    mod : value of the 'model' column to plot.
    target : name of the label column used for colouring.
    batch, segment : optional filters on the 'batch' / 'segment' columns.
    col1, col2 : columns plotted on the x and y axes.
    s : marker size.
    mk_scale : marker scale used in the legend.
    low, high : lower and upper limit applied to both axes. The defaults are
        computed once, when the function is defined, from the global `train`.

    Returns
    -------
    None. If `segment` is given but `df` has no 'segment' column, a message
    is printed and nothing is drawn.
    """
    # Guard clause: a segment filter is impossible without the column.
    if segment is not None and 'segment' not in df.columns:
        print("There is no 'segment' column in the data frame! Can't continue!")
        return

    mask_model = df['model'] == mod
    if batch is not None:
        mask_model = np.logical_and(mask_model, df['batch'] == batch)
    if segment is not None:
        mask_model = np.logical_and(mask_model, df['segment'] == segment)

    if target in df.columns:
        mod_chans = np.unique(df.loc[mask_model, target].values)
        for ch in mod_chans:
            mask = np.logical_and(mask_model, df[target] == ch)
            x = df.loc[mask, col1].values
            y = df.loc[mask, col2].values
            plt.plot(x, y, 'o', markersize=s, label=ch, c=palette[ch])
        # Build the legend once, after all channels are drawn, instead of
        # rebuilding it on every iteration.
        if len(mod_chans) > 0:
            plt.legend(markerscale=mk_scale)
    else:
        x = df.loc[mask_model, col1].values
        y = df.loc[mask_model, col2].values
        plt.plot(x, y, 'o', markersize=s)

    plt.xlim((low, high))
    plt.ylim((low, high))

    plt.xlabel('Current now, pA')
    plt.ylabel('Current next, pA')

    plot_title = f'Model {mod}'
    if batch is not None:
        plot_title += f', batch {batch}'
    if segment is not None:
        plot_title += f', {segment}'
    plt.title(plot_title)

In [None]:
def plot_signal_vs_shifted_all(df, mod, target='open_channels', hsize_one=5, 
                               s=0.05, mk_scale=60, n_cols=2, style='seaborn-whitegrid',
                               col1='signal_no_drift', col2='signal_shift_-1',
                               low=math.floor(train['signal_no_drift'].min()),
                               high=math.ceil(train['signal_no_drift'].max()),):
    """Draw a grid of `col1`-vs-`col2` scatter plots for one model.

    One panel is drawn per segment when `df` has a 'segment' column,
    otherwise one panel per batch; each panel is produced by
    `plot_signal_vs_shifted_one`.

    Parameters
    ----------
    df : DataFrame with 'model' and 'batch' (or 'segment') columns plus the
        columns required by `plot_signal_vs_shifted_one`.
    mod : value of the 'model' column to plot.
    target : label column used for colouring.
    hsize_one : width and height of a single panel, in inches.
    s, mk_scale : marker size and legend marker scale.
    n_cols : number of panels per row.
    style : matplotlib style name passed to `plt.style.use`.
    col1, col2 : columns plotted on the x and y axes of every panel.
    low, high : axis limits of every panel. The defaults are computed once,
        when the function is defined, from the global `train`.

    Returns
    -------
    None.
    """
    mask=df['model']==mod

    # Exactly one of the two lists holds real values; the other is a list of
    # None placeholders of the same length so the two can be zipped.
    if 'segment' in df.columns:
        segments=np.sort(df.loc[mask, 'segment'].unique())
        batches=[None for i in range(len(segments))]
    else:
        batches=np.sort(df.loc[mask, 'batch'].unique())
        segments=[None for i in range(len(batches))]

    # Nothing to draw for this model: avoid creating a zero-height figure.
    if len(batches) == 0:
        return

    hsize=n_cols*hsize_one
    n_rows=math.ceil(len(batches) / n_cols)
    vsize= n_rows*hsize_one

    plt.figure(figsize=(hsize, vsize))
    plt.style.use(style)

    for i , (batch, segment) in enumerate(zip(batches, segments), 1):
        plt.subplot(n_rows, n_cols, i)
        # col1/col2 are forwarded so that the requested columns are actually
        # the ones plotted (they were previously accepted but ignored).
        plot_signal_vs_shifted_one(df, target=target, batch=batch, 
                                   segment=segment, mod=mod, s=s, 
                                   mk_scale=mk_scale,
                                   col1=col1, col2=col2,
                                   low=low, high=high)

    plt.tight_layout()

In [None]:
# Per-model lower and upper axis limits used by show_results.
low = dict(M1=-4, M2=-4, M3=-5, M4=-5, M5=-5)
high = dict(M1=2, M2=0, M3=6, M4=10, M5=5)

In [None]:
def show_results(preds, mod='M1', lag=-1, df=train, s=0.3, mk_scale=10):
    """Plot signal vs shifted signal for one model, coloured by `preds`.

    Parameters
    ----------
    preds : array with one label per row of `df`; cast to uint8 and used as
        the 'open_channels' column of the plotted frame.
    mod : model name; also the key into the global `low` / `high` axis-limit
        dictionaries.
    lag : signed shift selecting the 'signal_shift_<lag>' column, e.g. -1 for
        'signal_shift_-1' and 1 for 'signal_shift_+1'. A string is used as
        the column suffix unchanged.
    df : source frame (the default is the global `train`, bound when the
        function is defined).
    s, mk_scale : marker size and legend marker scale.

    Returns
    -------
    None.
    """
    # Shifted columns carry an explicit sign in their name, so positive lags
    # must be rendered as '+1', not '1'.
    suffix = lag if isinstance(lag, str) else f'{int(lag):+d}'
    shifted_col = 'signal_shift_' + suffix

    df_new=df[['model', 'batch', 'signal_no_drift', shifted_col]].copy()
    df_new['open_channels']=preds.astype(np.uint8)
    if 'segment' in df.columns:
        df_new['segment']=df['segment'].copy()
    # Pass the selected column on as col2 so the plot uses the requested lag.
    plot_signal_vs_shifted_all(df_new, mod, low=low[mod], high=high[mod], s=s,
                               mk_scale=mk_scale, col2=shifted_col)

In [None]:
# Out-of-fold predictions on train, one figure per model.
oof_plot_options = dict(lag=-1, df=train, s=0.7, mk_scale=5)
for model_name in models:
    show_results(oof_preds, mod=model_name, **oof_plot_options)

In [None]:
# Plot only the misclassified train rows, coloured by their true label.
y_true = train['open_channels'].values

# mask is True where the out-of-fold prediction matches the true label.
mask = (y_true == oof_preds)
wrong = ~mask
misclassified = train[wrong]
true_labels_of_errors = y_true[wrong]

for m in models:
    show_results(true_labels_of_errors, mod=m, lag=-1, df=misclassified, s=0.7, mk_scale=5)

In [None]:
# Raise the upper axis limit used for model M1 plots from 2 to 4 before the
# test-set predictions are plotted below.
# NOTE(review): presumably because the M1 test data may contain channel
# values above those seen in train — confirm.
high['M1']=4

In [None]:
# Final test predictions, one figure per model.
test_plot_options = dict(lag=-1, df=test, s=0.7, mk_scale=5)
for model_name in models:
    show_results(preds, mod=model_name, **test_plot_options)

In [None]:
# Order in which batch ids are drawn in the subplot grid; used as the default
# `order` argument of signal_scatter_plots.
batches_order=np.array([0, 1, 2, 6, 3, 7, 4, 9, 5, 8])

In [None]:
def signal_scatter_plots(df, col='signal', order=batches_order):
    """Plot `col` against time, one subplot per batch.

    A frame with exactly four distinct batches is treated as the test set
    (2x2 grid, batches 0..3 in natural order); any other frame is treated as
    the train set (5x2 grid, batches in the given `order`). When `df` has an
    'open_channels' column the points are coloured per channel value using
    the global `palette` and a legend is added to every subplot.

    Parameters
    ----------
    df : DataFrame with 'batch', 'time' and `col` columns and, optionally,
        an 'open_channels' column.
    col : name of the column plotted on the y axis.
    order : sequence of batch ids giving the subplot order for train (the
        default is the global `batches_order`, bound when the function is
        defined). Ignored for test.

    Returns
    -------
    None. The figure is shown with `plt.show()`. Note that the call also
    changes the global seaborn font scale via `sns.set`.
    """
    n_batches=df['batch'].nunique()

    if n_batches==4:  # if test
        vsize = 2
        hsize = 2
        fig_vsize=20
        fig_hsize=40
        name='test'
        order=np.arange(4)
    else:             # if train
        vsize = 5
        hsize = 2
        fig_vsize=60
        fig_hsize=40
        name='train'

    # Loop-invariant values, set up front so they exist even when `order`
    # is empty.
    has_channels = 'open_channels' in df.columns
    title_string = 'Signal vs time per batch in '

    plt.figure(figsize=(fig_hsize, fig_vsize), facecolor='white')
    sns.set(font_scale=3.5)

    for i, b in enumerate(order):

        ax = plt.subplot(vsize, hsize, i+1)
        mask_batch=(df['batch'] == b)

        # Slice the batch once; channel selection then works on these arrays
        # instead of building a mask over the whole frame per channel.
        times = df.loc[mask_batch, 'time'].values
        values = df.loc[mask_batch, col].values

        if has_channels:
            batch_channels = df.loc[mask_batch, 'open_channels'].values
            channels = np.unique(batch_channels)
            for ch in channels:
                selected = batch_channels == ch
                plt.plot(times[selected], values[selected],
                         'o', color=palette[ch], ms=0.6, label=ch)
            if len(channels) > 0:
                plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.,
                           fontsize='36', markerscale=30)
        else:
            plt.plot(times, values, 'o', ms=0.1)

        ax.set(xlabel='Time, s', ylabel='Current, pA', title= f'Batch {b}')

    plt.suptitle(title_string + f'{name}', y=1.02)
    plt.tight_layout()
    plt.show()

In [None]:
%%time
# Plot the out-of-fold predictions from a copy of `train`, so the true
# 'open_channels' labels are not overwritten by predictions. Overwriting them
# would make any later (or re-run) scoring cell compare predictions with
# themselves.
train_oof = train.assign(open_channels=oof_preds.astype(np.uint8))
signal_scatter_plots(train_oof, col='signal_no_drift')

In [None]:
%%time
# Attach the final predictions to the test frame (as uint8) and plot the
# signal per batch, coloured by predicted channel count.
test['open_channels'] = preds.astype(np.uint8)
signal_scatter_plots(test, col='signal_no_drift')