In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(folder, name)
        print(full_path)

# Any results you write to the current directory are saved as output.

In [None]:

# Drift-removed versions of the test and train files; preview each after loading.
input_data = pd.read_csv('/kaggle/input/data-without-drift/test_clean.csv')
print(input_data.head())
train_data = pd.read_csv('/kaggle/input/data-without-drift/train_clean.csv')
print(train_data.head())

# Original competition files, loaded so the removed drift can be recovered later
# by comparing their `signal` column with the cleaned one.
input_data_drifted = pd.read_csv('/kaggle/input/liverpool-ion-switching/test.csv')
train_data_drifted = pd.read_csv('/kaggle/input/liverpool-ion-switching/train.csv')

# The string below holds tiny hand-made frames with the same column names,
# kept as a disabled alternative to the real files.
'''input_data=pd.DataFrame(data={'time':[0,2,10],'signal':[0,3,11],'open_channels':[1,3,12]})
input_data_drifted=pd.DataFrame(data={'time':[0,5,17],'signal':[0,4,17],'open_channels':[0,1,17]})

train_data=pd.DataFrame(data={'time':[0,1,19],'signal':[0,2,18],'open_channels':[1,7,18]})
train_data_drifted=pd.DataFrame(data={'time':[0,5,11],'signal':[0,4,15],'open_channels':[0,1,16]})'''


We add some plots to observe the continuous batches and to check whether the drift in the dataset was removed properly.

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Plot signal against time, first for the test frame and then for the train frame,
# each in its own figure.
for frame in (input_data, train_data):
    plt.plot(frame.time, frame.signal)
    plt.show()

# Distinct target values present in the training data.
distinct_channels = train_data.open_channels.unique()
print(distinct_channels)

# Feature Engineering
We split the continuous batches into sub-batches and engineer features on each of them. The reason for sub-batching is to capture local information rather than treating the data as one global series of 10 concatenated continuous functions.

We add a drift feature because the data-cleaning process may have affected the correctness of the data, and including it in our model may help reduce the bias induced by improper drift removal.

Shifting helps us see how the continuous function evolves, taking small discrete steps that approximate the inferred continuous graph.


In [None]:
batch_size=5*(10**4)


def _add_batch_features(frame, drifted, rows_per_batch):
    """Add the per-batch engineered columns to `frame` in place.

    Parameters
    ----------
    frame : DataFrame with a 'signal' column (the drift-removed data).
    drifted : DataFrame whose 'signal' column holds the original signal;
        it is assigned into `frame`, so pandas aligns it on the index.
    rows_per_batch : number of consecutive rows that form one batch.

    Columns are created in a fixed order: original, drift, index_in_batch,
    batch, max, min, mean, signal_shifted{1,-1,2,-2}, diff{1,-1,2,-2},
    median, mean_drift.
    """
    frame['original']=drifted['signal']
    frame['drift']=frame['signal']-frame['original']

    # Row position -> (offset inside batch, batch id), computed vectorised
    # instead of with a Python-level loop over every row.
    position=np.arange(frame.shape[0])
    frame['index_in_batch']=position%rows_per_batch
    frame['batch']=position//rows_per_batch

    # Group only the column that is needed; shifting the whole frame per group
    # and then selecting 'signal' gives the same values at a much higher cost.
    signal_by_batch=frame['signal'].groupby(frame['batch'])

    for stat in ('max','min','mean'):
        frame[stat]=signal_by_batch.transform(stat)

    # Shifts stay inside a batch, so the first/last rows of each batch get NaN.
    steps=(1,-1,2,-2)
    for step in steps:
        frame['signal_shifted%d'%step]=signal_by_batch.shift(step)
    for step in steps:
        frame['diff%d'%step]=frame['signal']-frame['signal_shifted%d'%step]

    frame['median']=signal_by_batch.transform('median')
    frame['mean_drift']=frame['drift'].groupby(frame['batch']).transform('mean')


# Train and test get exactly the same feature set.
_add_batch_features(train_data,train_data_drifted,batch_size)
_add_batch_features(input_data,input_data_drifted,batch_size)

print(train_data.head())

We also try adding some smaller batches, which may give us more "local" information.

In [None]:
# Number of consecutive rows in one "small" batch.
small_batch=5*(10**3)

# Small-batch id of every row. This previously divided by `batch_size`, which
# made the column an exact copy of 'batch'.
train_data['small_batch']=np.arange(train_data.shape[0])//small_batch
input_data['small_batch']=np.arange(input_data.shape[0])//small_batch

# Local statistics of the signal. These previously grouped by 'batch', which
# made them exact copies of the 'median' and 'mean' columns.
train_data['small_batch_median']=train_data.groupby('small_batch')['signal'].transform('median')
input_data['small_batch_median']=input_data.groupby('small_batch')['signal'].transform('median')

train_data['small_batch_mean']=train_data.groupby('small_batch')['signal'].transform('mean')
input_data['small_batch_mean']=input_data.groupby('small_batch')['signal'].transform('mean')

# Model creation
We feed our model the features that experimentally produced the most accurate results and that are diverse enough in terms of the insights they capture.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# Feature columns fed to every model (diff1 / diff-1 are still under consideration).
columns_to_use = [
    'signal',
    'max', 'min', 'mean', 'median',
    'diff1', 'diff-1', 'diff2', 'diff-2',
    'index_in_batch',
    'small_batch_median', 'small_batch_mean',
    'small_batch', 'batch',
]

print("ok")
# TODO: remove heads

# Training features / target, and the features of the frame to predict on.
y = train_data.open_channels
X = train_data[columns_to_use]
X_final = input_data[columns_to_use]

# Hold out 30% of the training rows; the seed is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1
)

Some deprecated model

In [None]:
# Disabled experiment: the string literal below is never executed. It holds a
# RandomForestClassifier depth search (depths 10, 15, 20) scored with mean
# absolute error on the hold-out split. The assignment after it is a placeholder
# statement, so the string is not the cell's last expression.
'''from sklearn.metrics import mean_absolute_error
from sklearn.ensemble import RandomForestClassifier

def get_error(y,actual_y):
    return mean_absolute_error(y,actual_y)

possible_depths=[10,15,20]

best_error=20
best_depth=20

for depth in possible_depths:
    classifier=RandomForestClassifier(max_depth=depth,random_state=0)
    classifier.fit(X_train,y_train)
    predictions=classifier.predict(X_test)
    error=get_error(predictions,y_test)
    if error<best_error:
        best_error=error
        best_depth=depth
    print("solved")
print(best_depth,best_error)'''
nothing_happens_here=1

In [None]:
#inspired by: https://www.kaggle.com/siavrez/simple-eda-model/data
import lightgbm as lgb
# LightGBM booster: first member of the blend.
# NOTE(review): 'sample_fraction' does not look like a documented LightGBM
# parameter name (cf. 'bagging_fraction' / 'feature_fraction') - confirm it has
# any effect; 'verbose': -1 would hide a warning about it.
params = {
    'learning_rate': 0.1,
    'max_depth': -1,
    'num_leaves': 2**7 + 1,
    'metric': 'mae',
    'random_state': 7,
    'n_jobs': -1,
    'sample_fraction': 0.33,
    'verbose': -1,
}
lgb_train_set = lgb.Dataset(X_train, y_train)
lgb_valid_set = lgb.Dataset(X_test, y_test)

# Up to 22222 boosting rounds, validated on the hold-out split with a
# 250-round early-stopping patience.
model1 = lgb.train(
    params,
    lgb_train_set,
    22222,
    lgb_valid_set,
    early_stopping_rounds=250,
)
pred1 = model1.predict(X_final)

In [None]:
#inspired by: https://www.kaggle.com/siavrez/simple-eda-model/data
from catboost import Pool,CatBoostRegressor
    
# CatBoost regressor: second member of the blend, configured to train on GPU.
model2 = CatBoostRegressor(
    task_type='GPU',
    iterations=22222,
    learning_rate=0.1,
    random_seed=7,
    depth=7,
    eval_metric='MAE',
)

# Wrap the train / hold-out splits in CatBoost pools.
fit_pool = Pool(X_train, y_train)
eval_pool = Pool(X_test, y_test)

# Fit silently, validating on the hold-out pool with a 250-round
# early-stopping patience.
model2.fit(fit_pool, eval_set=eval_pool, verbose=0, early_stopping_rounds=250)
pred2 = model2.predict(X_final)

In [None]:
from xgboost import XGBRegressor

# XGBoost regressor: third member of the blend.
# The keyword was previously misspelled `learning_rat`, so the intended step
# size of 0.1 was never set on the model.
# NOTE(review): early_stopping_rounds=250 is larger than n_estimators=50, so
# early stopping can never trigger with these settings - confirm which of the
# two values is intended.
model3=XGBRegressor(n_estimators=50,learning_rate=0.1,n_jobs=4)
model3.fit(X_train,y_train,eval_set=[(X_test,y_test)],verbose=False,early_stopping_rounds=250)
pred3=model3.predict(X_final)

# Model combining
After training 3 different gradient boosters, we try to evaluate the final result as the most accurate linear combination of the 3 evaluations. The function addition behaves linearly.

In [None]:
from sklearn.metrics import f1_score
# Grid-search the blend weights of the three models on the hold-out split.
# Weights are expressed in tenths: coef1 + coef2 + coef3 == 10.
best_score=0
best_coef1=0
best_coef2=0

# Hold-out predictions, converted to arrays once instead of on every iteration.
p1=np.asarray(model1.predict(X_test))
p2=np.asarray(model2.predict(X_test))
p3=np.asarray(model3.predict(X_test))

# Each weight runs over 0..10 and coef2 is capped at 10-coef1, so the third
# weight can never be negative. The earlier range(10) x range(10) grid allowed
# it to drop to -8 and never tried giving a single model the full weight of 10.
for coef1 in range(11):
    for coef2 in range(11-coef1):
        coef3=10-coef1-coef2
        test_pred=np.around((coef1*p1+coef2*p2+coef3*p3)/10).astype(int)
        # scikit-learn's documented argument order is (y_true, y_pred).
        score=f1_score(y_test,test_pred,average='macro')
        if score>best_score:
            best_score=score
            best_coef1=coef1
            best_coef2=coef2

# Apply the best weights found to the predictions made on X_final.
coef1,coef2=best_coef1,best_coef2
coef3=10-coef1-coef2
answer=np.around((coef1*np.asarray(pred1)+coef2*np.asarray(pred2)+coef3*np.asarray(pred3))/10).astype(int)
print(best_score)
print(best_coef1,best_coef2,10-best_coef1-best_coef2)

In [None]:
#classifier=RandomForestClassifier(max_depth=best_depth,random_state=0)
#classifier.fit(X,y)
#answer=[0 for i in range(Input.shape[0])]
#answer=classifier.predict(X_final)
# Pair every test timestamp with its blended prediction and write the result
# to submission.csv; floats are written with 4 decimals and the index is omitted.
submission_columns = {'time': input_data.time, 'open_channels': answer}
Output = pd.DataFrame(submission_columns)
Output.to_csv('submission.csv', index=False, float_format='%.4f')
print(Output.head())