# <span style='color:#A80808'>Problem description</span>

You've been provided with thousands of sixty-second sequences of biological sensor data recorded from several hundred participants who could have been in either of two possible activity states. Can you determine what state a participant was in from the sensor data?

![](https://cdn.usharama.edu.in/blog/biomedical-signal-processing/bms-usha-rama-blog.PNG)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['axes.facecolor'] = 'gray'
from scipy.stats import kurtosis

import warnings
warnings.simplefilter('ignore')

# <span style='color:#A80808'>Data</span>

In [None]:
# Load the training rows and the training labels, then attach the label to
# every training row. No `on=` is passed, so pandas joins on whatever columns
# the two frames share (presumably just `sequence` — TODO confirm).
train = pd.read_csv('../input/tabular-playground-series-apr-2022/train.csv')
train_labels = pd.read_csv('../input/tabular-playground-series-apr-2022/train_labels.csv')
train = train.merge(train_labels, how='left')

test = pd.read_csv('../input/tabular-playground-series-apr-2022/test.csv')

# Pseudo labels for the test set, taken from the public notebook linked below.
# Source: https://www.kaggle.com/code/tyrionlannisterlzy/xgboost-dnn-ensemble-lb-0-980
pseudo_labels=pd.read_csv('../input/tpsapril22pseudolabel/pseudoLabel.csv')
test = test.merge(pseudo_labels, how='left')
# Hard label obtained by rounding the pseudo `state`
# (assumes the pseudo labels are scores in [0, 1] — TODO confirm).
test['state_rounded'] = test.state.round()

# Notebook display: first three merged training rows.
train.head(3)

There are 25968 sequences, and each sequence has 60 steps (one step per second). The total number of rows in the dataset is 25968*60=1558080. No sequence has a missing step.

# <span style='color:#A80808'>Target: state</span>

The target is equally distributed between the two categories in the train set. The rounded pseudo target for the test set also has this characteristic.

In [None]:
# Training target column, kept under a short name for the sensor-vs-target
# plots in later cells.
state = train_labels.state

In [None]:
# Three side-by-side histograms: the train target, the raw pseudo target of
# the test set, and the pseudo target after rounding.
plt.figure(figsize=(15, 3))

_target_panels = [
    ('Train target', train_labels.state),
    ('Pseudo target', pseudo_labels.state),
    ('Pseudo target rounded', pseudo_labels.state.round()),
]
for _position, (_heading, _values) in enumerate(_target_panels, start=1):
    plt.subplot(1, 3, _position)
    _values.hist(color='yellow')
    plt.xlabel('State', fontsize=16)
    plt.ylabel('Count', fontsize=16)
    plt.title(_heading, fontsize=16)
    plt.tight_layout()

plt.show()

# <span style='color:#A80808'>Subject</span>

A unique id for the subjects in the experiments. There are in total 991 subjects: 672 subjects in the train set and 319 subjects in the test set.

Subjects with a high number of sequences tend to have more state 1. More precisely, subjects with more than 100 sequences have more than 80% of their sequences in state 1.

In [None]:
# Scatter of "% of rows in state 1" against "rows / 60" for every subject,
# once for train (true labels) and once for test (rounded pseudo labels).
plt.figure(figsize=(10, 7))

for _frame, _target_col, _marker, _tag in (
    (train, 'state', 'w.', 'train'),
    (test, 'state_rounded', 'k.', 'test rounded'),
):
    _per_subject = _frame.groupby('subject')
    _rows = _per_subject.size()
    plt.plot(_rows/60, _per_subject[_target_col].sum()/_rows*100, _marker, label=_tag)

plt.xlabel('Number of sequences per subject', fontsize=16)
plt.ylabel('% state 1', fontsize=16)
plt.title('The influence of number of sequences per subject on the % of state 1', fontsize=16)
plt.legend()
plt.show()

Besides, we can observe an artificial pattern in the data for subjects with fewer than 50 sequences: the points fall on smooth curves. For example, we can fit the curves y=A/x with A=100, 200, 300, etc., which correspond to subjects having exactly 1, 2, 3, etc. sequences in state 1 (k state-1 sequences out of x sequences gives 100*k/x %). We can also observe that all the subjects with 100% state 0 have fewer than 40 sequences.

In [None]:
# x grid (integers 5..39) on which the reference curves y = A/x are drawn
# in the next cell.
x = np.arange(5,40)

In [None]:
# Same scatter as before, zoomed on subjects with few sequences and overlaid
# with the reference curves y = A/x for A = 100, 200, ..., 600.
plt.figure(figsize=(10, 7))

for _frame, _target_col, _marker, _tag in (
    (train, 'state', 'w.', 'train'),
    (test, 'state_rounded', 'k.', 'test'),
):
    _per_subject = _frame.groupby('subject')
    _rows = _per_subject.size()
    plt.plot(_rows/60, _per_subject[_target_col].sum()/_rows*100, _marker,
             markersize=10, label=_tag)

_curve_colours = ('r', 'b', 'g', 'y', 'k', 'orange')
for _numerator, _colour in zip(range(100, 700, 100), _curve_colours):
    plt.plot(x, _numerator/x, _colour, label=f'y={_numerator}/x')

plt.xlabel('Number of sequences per subject', fontsize=16)
plt.ylabel('% state 1', fontsize=16)
plt.xlim(0, 50)
plt.ylim(-1, 40)
plt.legend()
plt.title('Data follow artificial smooth curves', fontsize=16)
plt.show()

# <span style='color:#A80808'>Subjects with 100% state 0</span>

The subjects with 100% state 0 are listed below

In [None]:
# Sum of `state` over all rows of each subject; a sum of 0 means none of the
# subject's rows is labelled state 1.
val = train.groupby('subject').state.sum()
list_subs_0 = val[val==0].index
# Notebook display of the selected subject ids.
list_subs_0

In [None]:
# Rows of the all-state-0 subjects, indexed by `step` so that plotting a
# column puts the step on the x-axis.
subs_0 = train[train.subject.isin(list_subs_0)]
subs_0 = subs_0.set_index('step')
subs_0.head(10)

In [None]:
# sensor_02 against step for the all-state-0 subjects. The frame is indexed
# by `step`, so all sequences are overlaid on the same x positions.
plt.figure(figsize=(10,7))
plt.plot(subs_0.sensor_02, 'w.')
plt.title('Subjects with 100% state 0', fontsize=16)
plt.xlabel('Steps', fontsize=16)
plt.ylabel('Sensor_02', fontsize=16)
plt.tight_layout()

We can observe that the subjects with 100% state 0 have constant values (within each sequence) for sensor_02.

# <span style='color:#A80808'>Subjects following the curve y=100/x: only one state 1</span>

The subjects whose % of state 1 is linked to the number of sequences by the smooth curve y=100/x (i.e. subjects with exactly one sequence in state 1) are listed below.

In [None]:
# Select the subjects lying on the curve y = 100/x, i.e. whose percentage of
# state-1 rows equals 100 / (number of sequences).
_by_subject = train.groupby('subject')
val1 = _by_subject.state.sum()/_by_subject.size()*100   # % of rows in state 1
val2 = _by_subject.size()/60                            # rows / 60 = sequences per subject

# Compare with a tolerance: both sides are floats, and an exact `==` silently
# drops subjects whenever the two expressions round differently
# (for instance 1/3*100 != 100/3 in double precision).
list_subs_1 = val1[np.isclose(val1, 100/val2)].index
# Notebook display of the selected subject ids.
list_subs_1

In [None]:
# Rows of the subjects selected in `list_subs_1`, indexed by `step` so that
# plotting a column puts the step on the x-axis.
subs_1 = train[train.subject.isin(list_subs_1)]
subs_1 = subs_1.set_index('step')
subs_1.head(3)

There are many sensors with constant values in this case: sensor_00, sensor_01, etc.

In [None]:
# sensor_02 against step for the subjects in `subs_1`. The frame is indexed
# by `step`, so all sequences are overlaid on the same x positions.
plt.figure(figsize=(10,7))
plt.plot(subs_1.sensor_02, 'w.')
plt.title('Subjects following smooth curve y=100/x', fontsize=16)
plt.xlabel('Steps', fontsize=16)
plt.ylabel('Sensor_02', fontsize=16)
plt.tight_layout()

# <span style='color:#A80808'>Subject sequences distribution</span>

In [None]:
# Distribution of rows/60 per subject, train (200 bins) over test (100 bins).
plt.figure(figsize=(10, 5))

for _frame, _n_bins, _colour, _tag in (
    (train, 200, 'yellow', 'Train'),
    (test, 100, 'black', 'Test'),
):
    _sequences_per_subject = _frame.groupby('subject').size()/60
    _sequences_per_subject.hist(bins=_n_bins, color=_colour, label=_tag)

plt.xlabel('Number of sequences per subject', fontsize=16)
plt.ylabel('Count', fontsize=16)
plt.legend(fontsize=16)
plt.show()

The comparison below clearly shows the importance of the number of sequences per subject as a feature for the target.

In [None]:
# One panel per state: histogram of (rows in that state)/60 per subject, for
# train (true labels) and test (rounded pseudo labels). The state-0 panel
# uses 100 bins, the state-1 panel 200.
plt.figure(figsize=(15, 5))

for _panel, (_state_value, _n_bins) in enumerate(((0, 100), (1, 200)), start=1):
    plt.subplot(1, 2, _panel)
    _train_rows = train[train.state==_state_value]
    _test_rows = test[test.state_rounded==_state_value]
    (_train_rows.groupby('subject').size()/60).hist(
        bins=_n_bins, color='white', label=f'train, state={_state_value}')
    (_test_rows.groupby('subject').size()/60).hist(
        bins=_n_bins, color='black', label=f'test, state={_state_value}')
    plt.xlabel('Number of sequences per subject', fontsize=16)
    plt.ylabel('Count', fontsize=16)
    plt.xlim(0, 200)
    plt.ylim(0, 50)
    plt.legend(fontsize=16)

plt.show()

# <span style='color:#A80808'>Subject with high number of sequences</span>

Most subjects in the train set have around 30 sequences, but some subjects have more than 100 sequences. Subject 437 has the highest number of sequences (199), followed by subject 1 with 175 sequences and subject 635 with 158 sequences.

Most signal sequences of the subjects with high number of sequences (subjects 437, 1 and 635) are of state 1 as shown below.

In [None]:
# 2x3 grid of state histograms, one panel per subject. Only the step-0 row of
# each sequence is kept so that every sequence is counted once.
plt.figure(figsize=(15, 6))

_is_first_step = train.step==0
for _panel, _subject_id in enumerate((1, 87, 207, 421, 437, 635), start=1):
    plt.subplot(2, 3, _panel)
    train.state[(train.subject==_subject_id) & _is_first_step].hist(color='yellow')
    plt.xlabel('State', fontsize=16)
    plt.ylabel('Count', fontsize=16)
    plt.title(f'Subject {_subject_id}', fontsize=16)
    plt.tight_layout()

plt.show()

# <span style='color:#A80808'>Subject with low number of sequences</span>



Conversely, the subjects 45, 73, 365, 472, 486, 519 have very few sequences. Most sequences of these subjects show state 0, as shown below.

In [None]:
# 2x3 grid of state histograms, one panel per subject. Only the step-0 row of
# each sequence is kept so that every sequence is counted once.
# NOTE(review): the text introducing this cell lists subject 365, while the
# cell plots subject 265 — confirm which id is intended.
plt.figure(figsize=(15, 6))

_is_first_step = train.step==0
for _panel, _subject_id in enumerate((45, 73, 265, 472, 486, 519), start=1):
    plt.subplot(2, 3, _panel)
    train.state[(train.subject==_subject_id) & _is_first_step].hist(color='yellow')
    plt.xlabel('State', fontsize=16)
    plt.ylabel('Count', fontsize=16)
    plt.title(f'Subject {_subject_id}', fontsize=16)
    plt.tight_layout()

plt.show()

# <span style='color:#A80808'>Add a feature for the number of sequences per subject</span>

In [None]:
def count_sequences(df):
    """Return the number of sequences recorded for each subject.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with at least the columns ``subject`` and
        ``sequence`` (one row per step of a sequence).

    Returns
    -------
    pandas.DataFrame
        One row per subject, sorted by ``subject``, with the columns
        ``subject`` and ``num_sequences`` (integer number of distinct
        sequence ids seen for that subject).
    """
    # Count distinct sequence ids rather than dividing the row count by a
    # hard-coded 60 and truncating: the result is the same when every
    # sequence has 60 rows, and a shorter sequence is still counted once.
    return (
        df.groupby('subject')['sequence']
        .nunique()
        .reset_index(name='num_sequences')
    )

# Attach the per-subject sequence count to every row as `num_sequences`.
# The counts are computed separately for train and for test.
train_count_sequences = count_sequences(train)
train = train.merge(train_count_sequences, on='subject', how='left')

test_count_sequences = count_sequences(test)
test = test.merge(test_count_sequences, on='subject', how='left')

# <span style='color:#A80808'>Sensor_00</span>

In [None]:
# Histogram of the raw sensor_00 readings, restricted to the range between
# the 1st and 99th percentiles so that the tails do not flatten the plot.
_lower, _upper = (train.sensor_00.quantile(_q) for _q in (0.01, 0.99))

plt.figure(figsize=(10, 5))
plt.hist(train.sensor_00, bins=300, color='white', range=(_lower, _upper))
plt.title('Histogram of sensor_00', fontsize=16)
plt.xlabel('Sensor_00', fontsize=16)
plt.ylabel('Count', fontsize=16)
plt.show()

In [None]:
# Every sensor_00 reading against its step, all sequences overlaid.
plt.figure(figsize=(10,7))
plt.plot(train.step,train.sensor_00, 'w.')
plt.xlabel('Step', fontsize=16)
plt.ylabel('Sensor_00', fontsize=16)
# Zoom on [-100, 100]; readings outside this window are not drawn.
plt.ylim(-100,100)
plt.show()

There are many sequences with constant values that are close to 0. To highlight these values, we can plot the inverse of the recorded sensor as shown below. We can observe that the constant sequences are symmetric around 0:

In [None]:
# Inverse of sensor_00 against step: readings close to 0 are stretched far
# from the axis, which makes near-zero constant sequences stand out.
# NOTE(review): a reading of exactly 0 gives +/-inf here — confirm whether
# exact zeros occur in the data.
plt.figure(figsize=(10,7))
plt.plot(train.step,1/train.sensor_00, 'w.')
plt.xlabel('Step', fontsize=16)
plt.ylabel('1/Sensor_00', fontsize=16)
plt.show()

# <span style='color:#A80808'>The influence of sensor_00 on the target</span>

In [None]:
# Standard deviation of sensor_00 within each sequence, for train and test.
sensor_00_std = train.groupby('sequence').sensor_00.std()
sensor_00_std_test = test.groupby('sequence').sensor_00.std()

plt.figure(figsize=(15,7))

# plt.plot pairs x and y by position, not by index label.
# NOTE(review): this assumes the label rows are in the same (sorted) sequence
# order as the groupby result — confirm.
plt.subplot(1,2,1)
plt.plot(state,sensor_00_std,'w.')
plt.xlabel('State', fontsize=16)
plt.ylabel('Std of sensor_00', fontsize=16)
plt.title('Train: The influence of std of sensor_00 on the target', fontsize=16)

# Test panel: the rounded pseudo label stands in for the unknown target.
plt.subplot(1,2,2)
plt.plot(pseudo_labels.state.round(),sensor_00_std_test,'w.')
plt.xlabel('State', fontsize=16)
plt.ylabel('Std of sensor_00', fontsize=16)
plt.title('Test: The influence of std of sensor_00 on the target', fontsize=16)
plt.show()

The standard deviation of sensor_00 has a strong impact on the target. For example, only state 0 has high std of sensor_00 above 40.

In [None]:
# Train: pair each sequence's state with its sensor_00 std, then sort by std.
# Column assignment aligns on index labels.
# NOTE(review): this relies on the index of `state` matching the sequence ids
# that index `sensor_00_std` — confirm.
df = pd.DataFrame()
df['state'] = state
df['feature'] = sensor_00_std
df = df.sort_values('feature')
df = df.reset_index()

# Test: same table with the rounded pseudo labels. `.tolist()` drops the
# sequence-id index, so the feature is paired with the labels by position.
df_test = pd.DataFrame()
df_test['state'] = pseudo_labels.state.round()
df_test['feature'] = sensor_00_std_test.tolist()
df_test = df_test.sort_values('feature')
df_test = df_test.reset_index()

plt.figure(figsize=(15,7))

# Mean state over a sliding window of 1000 sequences ordered by std; the
# first 999 points are NaN and therefore not drawn.
plt.subplot(1,2,1)
plt.plot(df.feature,df.state.rolling(1000).mean(),'w.')
plt.ylabel('State mean', fontsize=16)
plt.xlabel('Std of sensor_00', fontsize=16)
plt.ylim(0,1)
# A std is never negative, so a log axis is usable here; sequences with a std
# of exactly 0 cannot be placed on it.
plt.xscale('log')
plt.title('Train: The influence of std of sensor_00 on the target', fontsize=16)

plt.subplot(1,2,2)
plt.plot(df_test.feature,df_test.state.rolling(1000).mean(),'w.')
plt.ylabel('State mean', fontsize=16)
plt.xlabel('Std of sensor_00', fontsize=16)
plt.ylim(0,1)
plt.xscale('log')
plt.title('Test: The influence of std of sensor_00 on the target', fontsize=16)

plt.show()

In [None]:
# Add the inverse of sensor_00 as a new column of `train`, then take its
# standard deviation within each sequence.
# NOTE(review): a reading of exactly 0 becomes +/-inf, which would turn the
# std of that sequence into NaN — confirm whether exact zeros occur.
train['sensor_00_inv'] = 1./train.sensor_00
sensor_00_inv_std = train.groupby('sequence').sensor_00_inv.std()

plt.figure(figsize=(10,7))
plt.plot(state,sensor_00_inv_std,'w.')
plt.xlabel('State', fontsize=16)
plt.ylabel('Std of 1/sensor_00', fontsize=16)
plt.title('The influence of std of 1/sensor_00 on the target', fontsize=16)
plt.show()

The standard deviation of 1/sensor_00 also shows a high impact on the target: only state 0 has a std of 1/sensor_00 above 400.

In [None]:
# Mean of sensor_00 within each sequence, plotted against the state.
# x and y are paired by position (see the std plot for the same convention).
sensor_00_mean = train.groupby('sequence').sensor_00.mean()

plt.figure(figsize=(10,7))
plt.plot(state,sensor_00_mean,'w.')
plt.xlabel('State', fontsize=16)
plt.ylabel('Mean of sensor_00', fontsize=16)
plt.title('The influence of mean of sensor_00 on the target', fontsize=16)
plt.show()

The impact of the mean value of sensor_00 is not clear. However, the impact of the mean value of 1/sensor_00 is clearer, as shown below.

In [None]:
# Pair each sequence's state with its mean sensor_00 value and sort by mean.
# Column assignment aligns on index labels.
# NOTE(review): this relies on the index of `state` matching the sequence ids
# that index `sensor_00_mean` — confirm.
df = pd.DataFrame()
df['state'] = state
df['feature'] = sensor_00_mean
df = df.sort_values('feature')
df = df.reset_index()

plt.figure(figsize=(10,7))
# Mean state over a sliding window of 1000 sequences ordered by the feature.
plt.plot(df.feature,df.state.rolling(1000).mean(),'w.')
plt.ylabel('State mean', fontsize=16)
plt.xlabel('Mean of sensor_00', fontsize=16)
plt.ylim(0,1)
# A sequence mean can be zero or negative, and a plain log axis cannot show
# such values, so every sequence with a non-positive mean was missing from
# the plot. A symmetric log axis keeps the log spacing on both sides of 0.
plt.xscale('symlog')
plt.title('The influence of mean of sensor_00 on the target', fontsize=16)
plt.show()

In [None]:
sensor_00_inv_mean = train.groupby('sequence').sensor_00_inv.mean()

plt.figure(figsize=(10,7))
plt.plot(state,sensor_00_inv_mean,'w.')
plt.xlabel('State', fontsize=16)
plt.ylabel('Mean of 1/sensor_00', fontsize=16)
plt.title('The influence of mean of 1/sensor_00 on the target', fontsize=16)
plt.show()

In [None]:
sensor_00_min = train.groupby('sequence').sensor_00.min()

plt.figure(figsize=(10,7))
plt.plot(state,sensor_00_min,'w.')
plt.xlabel('State', fontsize=16)
plt.ylabel('Min of sensor_00', fontsize=16)
plt.title('The influence of min of sensor_00 on the target', fontsize=16)
plt.show()

The min value of sensor_00 has a strong impact on the target. All sequences with min of sensor_00 below -220 have the state 0.

In [None]:
# Pair each sequence's state with its minimum sensor_00 value and sort by it.
# Column assignment aligns on index labels.
# NOTE(review): this relies on the index of `state` matching the sequence ids
# that index `sensor_00_min` — confirm.
df = pd.DataFrame()
df['state'] = state
df['feature'] = sensor_00_min
df = df.sort_values('feature')
df = df.reset_index()

plt.figure(figsize=(10,7))
# Mean state over a sliding window of 1000 sequences ordered by the minimum.
plt.plot(df.feature,df.state.rolling(1000).mean(),'w.')
plt.ylabel('State mean', fontsize=16)
plt.xlabel('Min of sensor_00', fontsize=16)
plt.ylim(0,1)
# The x-axis is kept linear: a log scale cannot display non-positive minima.
plt.title('The influence of min of sensor_00 on the target', fontsize=16)
plt.show()

In [None]:
sensor_00_inv_min = train.groupby('sequence').sensor_00_inv.min()

plt.figure(figsize=(10,7))
plt.plot(state,sensor_00_inv_min,'w.')
plt.xlabel('State', fontsize=16)
plt.ylabel('Min of 1/sensor_00', fontsize=16)
plt.title('The influence of min of 1/sensor_00 on the target', fontsize=16)
plt.show()

In [None]:
sensor_00_max = train.groupby('sequence').sensor_00.max()

plt.figure(figsize=(10,7))
plt.plot(state,sensor_00_max,'w.')
plt.xlabel('State', fontsize=16)
plt.ylabel('Max of sensor_00', fontsize=16)
plt.title('The influence of max of sensor_00 on the target', fontsize=16)
plt.show()

Similarly, the max value of sensor_00 has also a strong impact on the target. All sequences with max of sensor_00 above 170 have the state 0.

In [None]:
sensor_00_inv_max = train.groupby('sequence').sensor_00_inv.max()

plt.figure(figsize=(10,7))
plt.plot(state,sensor_00_inv_max,'w.')
plt.xlabel('State', fontsize=16)
plt.ylabel('Max of 1/sensor_00', fontsize=16)
plt.title('The influence of max of 1/sensor_00 on the target', fontsize=16)
plt.show()

In [None]:
sensor_00_skew = train.groupby('sequence').sensor_00.skew()

plt.figure(figsize=(10,7))
plt.plot(state,sensor_00_skew,'w.')
plt.xlabel('State', fontsize=16)
plt.ylabel('Skew of sensor_00', fontsize=16)
plt.title('The influence of skew of sensor_00 on the target', fontsize=16)
plt.show()

The impact of skew range is not clearly manifested.

In [None]:
sensor_00_inv_skew = train.groupby('sequence').sensor_00_inv.skew()

plt.figure(figsize=(10,7))
plt.plot(state,sensor_00_inv_skew,'w.')
plt.xlabel('State', fontsize=16)
plt.ylabel('Skew of 1/sensor_00', fontsize=16)
plt.title('The influence of skew of 1/sensor_00 on the target', fontsize=16)
plt.show()

In [None]:
# Kurtosis (scipy.stats.kurtosis with its default arguments) of sensor_00
# within each sequence, plotted against the state.
_sensor_00_by_sequence = train.groupby('sequence').sensor_00
sensor_00_kurtosis = _sensor_00_by_sequence.apply(kurtosis)

plt.figure(figsize=(10, 7))
plt.plot(state, sensor_00_kurtosis, 'w.')
plt.title('The influence of kurtosis of sensor_00 on the target', fontsize=16)
plt.xlabel('State', fontsize=16)
plt.ylabel('Kurtosis of sensor_00', fontsize=16)
plt.show()

Likewise, it is hard to identify the influence of the kurtosis of sensor_00 on the target.

In [None]:
sensor_00_inv_kurtosis = train.groupby('sequence').sensor_00_inv.apply(lambda x: kurtosis(x))

plt.figure(figsize=(10,7))
plt.plot(state,sensor_00_inv_kurtosis,'w.')
plt.xlabel('State', fontsize=16)
plt.ylabel('Kurtosis of 1/sensor_00', fontsize=16)
plt.title('The influence of kurtosis of 1/sensor_00 on the target', fontsize=16)
plt.show()

In [None]:
sensor_00_q05 = train.groupby('sequence').sensor_00.apply(lambda x: np.quantile(x, 0.05))

plt.figure(figsize=(10,7))
plt.plot(state,sensor_00_q05,'w.')
plt.xlabel('State', fontsize=16)
plt.ylabel('Q05 of sensor_00', fontsize=16)
plt.title('The influence of q05 of sensor_00 on the target', fontsize=16)
plt.show()

The 0.05 quantile of sensor_00 shows a clear impact on the target. All values under -40 correspond to state 0.

In [None]:
sensor_00_inv_q05 = train.groupby('sequence').sensor_00_inv.apply(lambda x: np.quantile(x, 0.05))

plt.figure(figsize=(10,7))
plt.plot(state,sensor_00_inv_q05,'w.')
plt.xlabel('State', fontsize=16)
plt.ylabel('Q05 of 1/sensor_00', fontsize=16)
plt.title('The influence of q05 of 1/sensor_00 on the target', fontsize=16)
plt.show()

In [None]:
sensor_00_q10 = train.groupby('sequence').sensor_00.apply(lambda x: np.quantile(x, 0.10))

plt.figure(figsize=(10,7))
plt.plot(state,sensor_00_q10,'w.')
plt.xlabel('State', fontsize=16)
plt.ylabel('Q10 of sensor_00', fontsize=16)
plt.title('The influence of q10 of sensor_00 on the target', fontsize=16)
plt.show()

A similar impact of the 0.1 quantile of sensor_00 is shown above.

In [None]:
sensor_00_inv_q10 = train.groupby('sequence').sensor_00_inv.apply(lambda x: np.quantile(x, 0.10))

plt.figure(figsize=(10,7))
plt.plot(state,sensor_00_inv_q10,'w.')
plt.xlabel('State', fontsize=16)
plt.ylabel('Q10 of 1/sensor_00', fontsize=16)
plt.title('The influence of q10 of 1/sensor_00 on the target', fontsize=16)
plt.show()

# <span style='color:#A80808'>Sensor_01</span>

In [None]:
plt.figure(figsize=(10,5))
plt.hist(train.sensor_01, bins=300, color='white',
             range=(train.sensor_01.quantile(0.01),
                    train.sensor_01.quantile(0.99)))
plt.title(f'Histogram of sensor_01', fontsize=16)
plt.xlabel('Sensor_01', fontsize=16)
plt.ylabel('Count', fontsize=16)
plt.show()

In [None]:
plt.figure(figsize=(10,7))
plt.plot(train.step,train.sensor_01, 'w.')
plt.xlabel('Step', fontsize=16)
plt.ylabel('Sensor_01', fontsize=16)
plt.ylim(-100,100)
plt.show()

In [None]:
plt.figure(figsize=(10,7))
plt.plot(train.step,1/train.sensor_01, 'w.')
plt.xlabel('Step', fontsize=16)
plt.ylabel('1/Sensor_01', fontsize=16)
plt.show()

# <span style='color:#A80808'>Sensor_01 vs Sensor_00</span>

In [None]:
# Row-by-row scatter of sensor_01 against sensor_00.
plt.figure(figsize=(10,7))
plt.plot(train.sensor_00,train.sensor_01, 'w.')
# Yellow reference line through the origin from (-200, -300) to (200, 300),
# i.e. sensor_01 = 1.5 * sensor_00.
plt.plot([-200,200],[-300,300], 'y')
plt.xlabel('Sensor_00', fontsize=16)
plt.ylabel('Sensor_01', fontsize=16)
plt.title('Sensor_00 vs Sensor_01', fontsize=16)
# Axis limits match the end points of the reference line.
plt.xlim(-200,200)
plt.ylim(-300,300)
plt.show()

The relationship between sensor_00 and sensor_01 is quite interesting as shown above.

In [None]:
plt.figure(figsize=(10,7))
plt.plot(1/train.sensor_00,1/train.sensor_01, 'w.')
plt.xlabel('1/Sensor_00', fontsize=16)
plt.ylabel('1/Sensor_01', fontsize=16)
plt.title('1/Sensor_00 vs 1/Sensor_01', fontsize=16)
plt.show()

# <span style='color:#A80808'>The influence of sensor_01 on the target</span>

In [None]:
sensor_01_std = train.groupby('sequence').sensor_01.std()

plt.figure(figsize=(10,7))
plt.plot(state,sensor_01_std,'w.')
plt.xlabel('State', fontsize=16)
plt.ylabel('Std of sensor_01', fontsize=16)
plt.title('The influence of std of sensor_01 on the target', fontsize=16)
plt.show()

In [None]:
train['sensor_01_inv'] = 1./train.sensor_01
sensor_01_inv_std = train.groupby('sequence').sensor_01_inv.std()

plt.figure(figsize=(10,7))
plt.plot(state,sensor_01_inv_std,'w.')
plt.xlabel('State', fontsize=16)
plt.ylabel('Std of 1/sensor_01', fontsize=16)
plt.title('The influence of std of 1/sensor_01 on the target', fontsize=16)
plt.show()

In [None]:
sensor_01_mean = train.groupby('sequence').sensor_01.mean()

plt.figure(figsize=(10,7))
plt.plot(state,sensor_01_mean,'w.')
plt.xlabel('State', fontsize=16)
plt.ylabel('Mean of sensor_01', fontsize=16)
plt.title('The influence of mean of sensor_01 on the target', fontsize=16)
plt.show()

In [None]:
sensor_01_inv_mean = train.groupby('sequence').sensor_01_inv.mean()

plt.figure(figsize=(10,7))
plt.plot(state,sensor_01_inv_mean,'w.')
plt.xlabel('State', fontsize=16)
plt.ylabel('Mean of 1/sensor_01', fontsize=16)
plt.title('The influence of mean of 1/sensor_01 on the target', fontsize=16)
plt.show()

# <span style='color:#A80808'>Sensor_02</span>

In [None]:
# Histogram of sensor_02 with 30 bins. Unlike the other sensor histograms,
# only the lower tail is trimmed: the range runs from the 1st percentile up
# to quantile(1), which is the maximum.
plt.figure(figsize=(10,5))
plt.hist(train.sensor_02, bins=30, color='white',
             range=(train.sensor_02.quantile(0.01),
                    train.sensor_02.quantile(1)))
plt.title(f'Histogram of sensor_02', fontsize=16)
plt.xlabel('Sensor_02', fontsize=16)
plt.ylabel('Count', fontsize=16)
plt.show()

In [None]:
plt.figure(figsize=(10,7))
plt.plot(train.step,train.sensor_02, 'w.')
plt.xlabel('Step', fontsize=16)
plt.ylabel('Sensor_02', fontsize=16)
plt.ylim(-40,10)
plt.show()

In [None]:
plt.figure(figsize=(10,7))
plt.plot(train.step,1/train.sensor_02, 'w.')
plt.xlabel('Step', fontsize=16)
plt.ylabel('1/Sensor_02', fontsize=16)
plt.ylim(-1000,1000)
plt.show()

# <span style='color:#A80808'>Sensor_02 vs Sensor_00, 01</span>

In [None]:
plt.figure(figsize=(10,7))
plt.plot(train.sensor_00,train.sensor_02, 'w.')
plt.xlabel('Sensor_00', fontsize=16)
plt.ylabel('Sensor_02', fontsize=16)
plt.title('Sensor_00 vs Sensor_02', fontsize=16)
plt.xlim(-200,200)
plt.ylim(-40,10)
plt.show()

In [None]:
plt.figure(figsize=(10,7))
plt.plot(1/train.sensor_00,1/train.sensor_02, 'w.')
plt.xlabel('1/Sensor_00', fontsize=16)
plt.ylabel('1/Sensor_02', fontsize=16)
plt.title('1/Sensor_00 vs 1/Sensor_02', fontsize=16)
plt.ylim(-1000,1000)
plt.show()

In [None]:
plt.figure(figsize=(10,7))
plt.plot(train.sensor_01,train.sensor_02, 'w.')
plt.xlabel('Sensor_01', fontsize=16)
plt.ylabel('Sensor_02', fontsize=16)
plt.title('Sensor_01 vs Sensor_02', fontsize=16)
plt.xlim(-300,300)
plt.ylim(-40,10)
plt.show()

In [None]:
plt.figure(figsize=(10,7))
plt.plot(1/train.sensor_01,1/train.sensor_02, 'w.')
plt.xlabel('1/Sensor_01', fontsize=16)
plt.ylabel('1/Sensor_02', fontsize=16)
plt.title('1/Sensor_01 vs 1/Sensor_02', fontsize=16)
plt.ylim(-1000,1000)
plt.show()

# <span style='color:#A80808'>Sensor_03</span>

In [None]:
plt.figure(figsize=(10,5))
plt.hist(train.sensor_03, bins=300, color='white',
             range=(train.sensor_03.quantile(0.01),
                    train.sensor_03.quantile(0.99)))
plt.title(f'Histogram of sensor_03', fontsize=16)
plt.xlabel('Sensor_03', fontsize=16)
plt.ylabel('Count', fontsize=16)
plt.show()

In [None]:
plt.figure(figsize=(10,7))
plt.plot(train.step,train.sensor_03, 'w.')
plt.xlabel('Step', fontsize=16)
plt.ylabel('Sensor_03', fontsize=16)
plt.ylim(-100,100)
plt.show()

In [None]:
plt.figure(figsize=(10,7))
plt.plot(train.step,1/train.sensor_03, 'w.')
plt.xlabel('Step', fontsize=16)
plt.ylabel('1/Sensor_03', fontsize=16)
plt.show()

# <span style='color:#A80808'>Sensor_03 vs Sensor_00, 01, 02</span>

In [None]:
plt.figure(figsize=(10,7))
plt.plot(train.sensor_00,train.sensor_03, 'w.')
plt.xlabel('Sensor_00', fontsize=16)
plt.ylabel('Sensor_03', fontsize=16)
plt.title('Sensor_00 vs Sensor_03', fontsize=16)
plt.xlim(-200,200)
plt.ylim(-200,200)
plt.show()

In [None]:
plt.figure(figsize=(10,7))
plt.plot(train.sensor_01,train.sensor_03, 'w.')
plt.xlabel('Sensor_01', fontsize=16)
plt.ylabel('Sensor_03', fontsize=16)
plt.title('Sensor_01 vs Sensor_03', fontsize=16)
plt.xlim(-200,200)
plt.ylim(-200,200)
plt.show()

In [None]:
plt.figure(figsize=(10,7))
plt.plot(train.sensor_02,train.sensor_03, 'w.')
plt.xlabel('Sensor_02', fontsize=16)
plt.ylabel('Sensor_03', fontsize=16)
plt.title('Sensor_02 vs Sensor_03', fontsize=16)
plt.xlim(-40,10)
plt.ylim(-200,200)
plt.show()

# <span style='color:#A80808'>Sensor_04</span>

In [None]:
# Histogram of all sensor_04 readings over their full range (no percentile
# trimming for this sensor).
plt.figure(figsize=(10, 5))
plt.hist(
    train.sensor_04,
    bins=300,
    color='white',
)
plt.title('Histogram of sensor_04', fontsize=16)
plt.xlabel('Sensor_04', fontsize=16)
plt.ylabel('Count', fontsize=16)
plt.show()

In [None]:
plt.figure(figsize=(10,7))
plt.plot(train.step,train.sensor_04, 'w.')
plt.xlabel('Step', fontsize=16)
plt.ylabel('Sensor_04', fontsize=16)
plt.ylim(-30,30)
plt.show()

Compared to sensor_01 to sensor_03, sensor_04 is very well bounded in the range from -30 to 30.

In [None]:
plt.figure(figsize=(10,7))
plt.plot(train.step,1/train.sensor_04, 'w.')
plt.xlabel('Step', fontsize=16)
plt.ylabel('1/Sensor_04', fontsize=16)
plt.show()

In [None]:
plt.figure(figsize=(10,7))
plt.plot(train.sensor_03,train.sensor_04, 'w.')
plt.xlabel('Sensor_03', fontsize=16)
plt.ylabel('Sensor_04', fontsize=16)
plt.title('Sensor_03 vs Sensor_04', fontsize=16)
plt.xlim(-200,200)
plt.ylim(-30,30)
plt.show()

In [None]:
plt.figure(figsize=(10,7))
plt.plot(1/(train.sensor_03),1/(train.sensor_04), 'w.')
plt.xlabel('1/Sensor_03', fontsize=16)
plt.ylabel('1/Sensor_04', fontsize=16)
plt.title('1/Sensor_03 vs 1/Sensor_04', fontsize=16)
plt.show()

# <span style='color:#A80808'>Sensor_05</span>

In [None]:
plt.figure(figsize=(10,5))
plt.hist(train.sensor_05, bins=300, color='white',
         range=(train.sensor_05.quantile(0.01),
                    train.sensor_05.quantile(0.99)))
plt.title(f'Histogram of sensor_05', fontsize=16)
plt.xlabel('Sensor_05', fontsize=16)
plt.ylabel('Count', fontsize=16)
plt.show()

In [None]:
plt.figure(figsize=(10,7))
plt.plot(train.step,train.sensor_05, 'w.')
plt.xlabel('Step', fontsize=16)
plt.ylabel('Sensor_05', fontsize=16)
plt.ylim(-30,30)
plt.show()

In [None]:
plt.figure(figsize=(10,7))
plt.plot(train.step,1/train.sensor_05, 'w.')
plt.xlabel('Step', fontsize=16)
plt.ylabel('1/Sensor_05', fontsize=16)
plt.show()

In [None]:
plt.figure(figsize=(10,7))
plt.plot(train.sensor_04,train.sensor_05, 'w.')
plt.xlabel('Sensor_04', fontsize=16)
plt.ylabel('Sensor_05', fontsize=16)
plt.title('Sensor_04 vs Sensor_05', fontsize=16)
plt.xlim(-30,30)
plt.ylim(-30,30)
plt.show()

In [None]:
plt.figure(figsize=(10,7))
plt.plot(1/(train.sensor_04),1/(train.sensor_05), 'w.')
plt.xlabel('1/Sensor_04', fontsize=16)
plt.ylabel('1/Sensor_05', fontsize=16)
plt.title('1/Sensor_04 vs 1/Sensor_05', fontsize=16)
plt.show()

# <span style='color:#A80808'>Sensor_06</span>

In [None]:
plt.figure(figsize=(10,5))
plt.hist(train.sensor_06, bins=300, color='white',
         range=(train.sensor_06.quantile(0.01),
                    train.sensor_06.quantile(0.99)))
plt.title(f'Histogram of sensor_06', fontsize=16)
plt.xlabel('Sensor_06', fontsize=16)
plt.ylabel('Count', fontsize=16)
plt.show()

In [None]:
plt.figure(figsize=(10,7))
plt.plot(train.step,train.sensor_06, 'w.')
plt.xlabel('Step', fontsize=16)
plt.ylabel('Sensor_06', fontsize=16)
plt.ylim(-100,100)
plt.show()

In [None]:
plt.figure(figsize=(10,7))
plt.plot(train.step,1/train.sensor_06, 'w.')
plt.xlabel('Step', fontsize=16)
plt.ylabel('1/Sensor_06', fontsize=16)
plt.show()

In [None]:
plt.figure(figsize=(10,7))
plt.plot(train.sensor_05,train.sensor_06, 'w.')
plt.xlabel('Sensor_05', fontsize=16)
plt.ylabel('Sensor_06', fontsize=16)
plt.title('Sensor_05 vs Sensor_06', fontsize=16)
plt.xlim(-30,30)
plt.ylim(-100,100)
plt.show()

In [None]:
plt.figure(figsize=(10,7))
plt.plot(1/(train.sensor_05),1/(train.sensor_06), 'w.')
plt.xlabel('1/Sensor_05', fontsize=16)
plt.ylabel('1/Sensor_06', fontsize=16)
plt.title('1/Sensor_05 vs 1/Sensor_06', fontsize=16)
plt.show()

# <span style='color:#A80808'>Sensor_07</span>

In [None]:
plt.figure(figsize=(10,5))
plt.hist(train.sensor_07, bins=300, color='white',
         range=(train.sensor_07.quantile(0.01),
                    train.sensor_07.quantile(0.99)))
plt.title(f'Histogram of sensor_07', fontsize=16)
plt.xlabel('Sensor_07', fontsize=16)
plt.ylabel('Count', fontsize=16)
plt.show()

In [None]:
plt.figure(figsize=(10,7))
plt.plot(train.step,train.sensor_07, 'w.')
plt.xlabel('Step', fontsize=16)
plt.ylabel('Sensor_07', fontsize=16)
plt.ylim(-100,100)
plt.show()

In [None]:
plt.figure(figsize=(10,7))
plt.plot(train.step,1/train.sensor_07, 'w.')
plt.xlabel('Step', fontsize=16)
plt.ylabel('1/Sensor_07', fontsize=16)
plt.show()

In [None]:
plt.figure(figsize=(10,7))
plt.plot(train.sensor_06,train.sensor_07, 'w.')
plt.xlabel('Sensor_06', fontsize=16)
plt.ylabel('Sensor_07', fontsize=16)
plt.title('Sensor_06 vs Sensor_07', fontsize=16)
plt.xlim(-100,100)
plt.ylim(-100,100)
plt.show()

In [None]:
plt.figure(figsize=(10,7))
plt.plot(1/(train.sensor_06),1/(train.sensor_07), 'w.')
plt.xlabel('1/Sensor_06', fontsize=16)
plt.ylabel('1/Sensor_07', fontsize=16)
plt.title('1/Sensor_06 vs 1/Sensor_07', fontsize=16)
plt.show()

# <span style='color:#A80808'>Sensor_08</span>

In [None]:
plt.figure(figsize=(10,5))
plt.hist(train.sensor_08, bins=300, color='white',
         range=(train.sensor_08.quantile(0.01),
                    train.sensor_08.quantile(0.99)))
plt.title(f'Histogram of sensor_08', fontsize=16)
plt.xlabel('Sensor_08', fontsize=16)
plt.ylabel('Count', fontsize=16)
plt.show()

In [None]:
plt.figure(figsize=(10,7))
plt.plot(train.step,train.sensor_08, 'w.')
plt.xlabel('Step', fontsize=16)
plt.ylabel('Sensor_08', fontsize=16)
plt.ylim(-100,100)
plt.show()

In [None]:
plt.figure(figsize=(10,7))
plt.plot(train.step,1/train.sensor_08, 'w.')
plt.xlabel('Step', fontsize=16)
plt.ylabel('1/Sensor_08', fontsize=16)
plt.show()

In [None]:
# Row-by-row scatter of sensor_08 against sensor_07.
plt.figure(figsize=(10,7))
plt.plot(train.sensor_07,train.sensor_08, 'w.')
plt.xlabel('Sensor_07', fontsize=16)
plt.ylabel('Sensor_08', fontsize=16)
plt.title('Sensor_07 vs Sensor_08', fontsize=16)
# No axis limits are set here, so matplotlib autoscales to the full range
# of both sensors.
plt.show()

In [None]:
# Pairwise scatter of the reciprocals 1/sensor_07 (x) and 1/sensor_08 (y),
# axes autoscaled.
# NOTE(review): exact zeros in either sensor would become +/-inf here --
# confirm that is acceptable for this plot.
inv_x = 1 / train['sensor_07']
inv_y = 1 / train['sensor_08']
ax = plt.figure(figsize=(10, 7)).gca()
ax.plot(inv_x, inv_y, 'w.')
ax.set_xlabel('1/Sensor_07', fontsize=16)
ax.set_ylabel('1/Sensor_08', fontsize=16)
ax.set_title('1/Sensor_07 vs 1/Sensor_08', fontsize=16)
plt.show()

# <span style='color:#A80808'>Sensor_09</span>

In [None]:
# Histogram of sensor_09 over all training rows (300 bins), restricted to
# the 1st-99th percentile window of the column.
sensor_09 = train['sensor_09']
# Both bounds come from a single quantile call instead of two separate passes.
lower, upper = sensor_09.quantile([0.01, 0.99])

plt.figure(figsize=(10, 5))
plt.hist(sensor_09, bins=300, color='white', range=(lower, upper))
# Plain string: the title has no placeholders, so no f-prefix is needed.
plt.title('Histogram of sensor_09', fontsize=16)
plt.xlabel('Sensor_09', fontsize=16)
plt.ylabel('Count', fontsize=16)
plt.show()

In [None]:
# Scatter of sensor_09 against the step index for every training row,
# drawn as white point markers with the y-axis limited to [-100, 100].
ax = plt.figure(figsize=(10, 7)).gca()
ax.plot(train['step'], train['sensor_09'], 'w.')
ax.set_xlabel('Step', fontsize=16)
ax.set_ylabel('Sensor_09', fontsize=16)
ax.set_ylim(-100, 100)
plt.show()

In [None]:
# Reciprocal view: 1/sensor_09 against the step index, axes autoscaled.
# NOTE(review): rows where sensor_09 is exactly 0 would become +/-inf here --
# confirm that dropping them from the picture is acceptable.
reciprocal_09 = 1 / train['sensor_09']
ax = plt.figure(figsize=(10, 7)).gca()
ax.plot(train['step'], reciprocal_09, 'w.')
ax.set_xlabel('Step', fontsize=16)
ax.set_ylabel('1/Sensor_09', fontsize=16)
plt.show()

In [None]:
# Pairwise scatter of sensor_08 (x) against sensor_09 (y) over all training
# rows. Both axes autoscale; the +/-100 limits below are kept but disabled.
ax = plt.figure(figsize=(10, 7)).gca()
ax.plot(train['sensor_08'], train['sensor_09'], 'w.')
ax.set_xlabel('Sensor_08', fontsize=16)
ax.set_ylabel('Sensor_09', fontsize=16)
ax.set_title('Sensor_08 vs Sensor_09', fontsize=16)
# ax.set_xlim(-100, 100)
# ax.set_ylim(-100, 100)
plt.show()

In [None]:
# Pairwise scatter of the reciprocals 1/sensor_08 (x) and 1/sensor_09 (y),
# axes autoscaled.
# NOTE(review): exact zeros in either sensor would become +/-inf here --
# confirm that is acceptable for this plot.
inv_x = 1 / train['sensor_08']
inv_y = 1 / train['sensor_09']
ax = plt.figure(figsize=(10, 7)).gca()
ax.plot(inv_x, inv_y, 'w.')
ax.set_xlabel('1/Sensor_08', fontsize=16)
ax.set_ylabel('1/Sensor_09', fontsize=16)
ax.set_title('1/Sensor_08 vs 1/Sensor_09', fontsize=16)
plt.show()

# <span style='color:#A80808'>Sensor_10</span>

In [None]:
# Histogram of sensor_10 over all training rows (300 bins), restricted to
# the 1st-99th percentile window of the column.
sensor_10 = train['sensor_10']
# Both bounds come from a single quantile call instead of two separate passes.
lower, upper = sensor_10.quantile([0.01, 0.99])

plt.figure(figsize=(10, 5))
plt.hist(sensor_10, bins=300, color='white', range=(lower, upper))
# Plain string: the title has no placeholders, so no f-prefix is needed.
plt.title('Histogram of sensor_10', fontsize=16)
plt.xlabel('Sensor_10', fontsize=16)
plt.ylabel('Count', fontsize=16)
plt.show()

In [None]:
# Scatter of sensor_10 against the step index for every training row,
# drawn as white point markers with the y-axis limited to [-100, 100].
ax = plt.figure(figsize=(10, 7)).gca()
ax.plot(train['step'], train['sensor_10'], 'w.')
ax.set_xlabel('Step', fontsize=16)
ax.set_ylabel('Sensor_10', fontsize=16)
ax.set_ylim(-100, 100)
plt.show()

In [None]:
# Reciprocal view: 1/sensor_10 against the step index, axes autoscaled.
# NOTE(review): rows where sensor_10 is exactly 0 would become +/-inf here --
# confirm that dropping them from the picture is acceptable.
reciprocal_10 = 1 / train['sensor_10']
ax = plt.figure(figsize=(10, 7)).gca()
ax.plot(train['step'], reciprocal_10, 'w.')
ax.set_xlabel('Step', fontsize=16)
ax.set_ylabel('1/Sensor_10', fontsize=16)
plt.show()

In [None]:
# Pairwise scatter of sensor_09 (x) against sensor_10 (y) over all training
# rows. Only the x-axis is limited to [-100, 100]; the y-axis autoscales
# (its +/-100 limit is kept below but disabled).
ax = plt.figure(figsize=(10, 7)).gca()
ax.plot(train['sensor_09'], train['sensor_10'], 'w.')
ax.set_xlabel('Sensor_09', fontsize=16)
ax.set_ylabel('Sensor_10', fontsize=16)
ax.set_title('Sensor_09 vs Sensor_10', fontsize=16)
ax.set_xlim(-100, 100)
# ax.set_ylim(-100, 100)
plt.show()

In [None]:
# Pairwise scatter of the reciprocals 1/sensor_09 (x) and 1/sensor_10 (y),
# axes autoscaled.
# NOTE(review): exact zeros in either sensor would become +/-inf here --
# confirm that is acceptable for this plot.
inv_x = 1 / train['sensor_09']
inv_y = 1 / train['sensor_10']
ax = plt.figure(figsize=(10, 7)).gca()
ax.plot(inv_x, inv_y, 'w.')
ax.set_xlabel('1/Sensor_09', fontsize=16)
ax.set_ylabel('1/Sensor_10', fontsize=16)
ax.set_title('1/Sensor_09 vs 1/Sensor_10', fontsize=16)
plt.show()

# <span style='color:#A80808'>Sensor_11</span>

In [None]:
# Histogram of sensor_11 over all training rows (300 bins), restricted to
# the 1st-99th percentile window of the column.
sensor_11 = train['sensor_11']
# Both bounds come from a single quantile call instead of two separate passes.
lower, upper = sensor_11.quantile([0.01, 0.99])

plt.figure(figsize=(10, 5))
plt.hist(sensor_11, bins=300, color='white', range=(lower, upper))
# Plain string: the title has no placeholders, so no f-prefix is needed.
plt.title('Histogram of sensor_11', fontsize=16)
plt.xlabel('Sensor_11', fontsize=16)
plt.ylabel('Count', fontsize=16)
plt.show()

In [None]:
# Scatter of sensor_11 against the step index for every training row,
# drawn as white point markers with the y-axis limited to [-100, 100].
ax = plt.figure(figsize=(10, 7)).gca()
ax.plot(train['step'], train['sensor_11'], 'w.')
ax.set_xlabel('Step', fontsize=16)
ax.set_ylabel('Sensor_11', fontsize=16)
ax.set_ylim(-100, 100)
plt.show()

In [None]:
# Reciprocal view: 1/sensor_11 against the step index, axes autoscaled.
# NOTE(review): rows where sensor_11 is exactly 0 would become +/-inf here --
# confirm that dropping them from the picture is acceptable.
reciprocal_11 = 1 / train['sensor_11']
ax = plt.figure(figsize=(10, 7)).gca()
ax.plot(train['step'], reciprocal_11, 'w.')
ax.set_xlabel('Step', fontsize=16)
ax.set_ylabel('1/Sensor_11', fontsize=16)
plt.show()

In [None]:
# Pairwise scatter of sensor_10 (x) against sensor_11 (y) over all training
# rows. Only the x-axis is limited, to [-50, 50]; the y-axis autoscales
# (its +/-100 limit is kept below but disabled).
ax = plt.figure(figsize=(10, 7)).gca()
ax.plot(train['sensor_10'], train['sensor_11'], 'w.')
ax.set_xlabel('Sensor_10', fontsize=16)
ax.set_ylabel('Sensor_11', fontsize=16)
ax.set_title('Sensor_10 vs Sensor_11', fontsize=16)
ax.set_xlim(-50, 50)
# ax.set_ylim(-100, 100)
plt.show()

In [None]:
# Pairwise scatter of the reciprocals 1/sensor_10 (x) and 1/sensor_11 (y),
# axes autoscaled.
# NOTE(review): exact zeros in either sensor would become +/-inf here --
# confirm that is acceptable for this plot.
inv_x = 1 / train['sensor_10']
inv_y = 1 / train['sensor_11']
ax = plt.figure(figsize=(10, 7)).gca()
ax.plot(inv_x, inv_y, 'w.')
ax.set_xlabel('1/Sensor_10', fontsize=16)
ax.set_ylabel('1/Sensor_11', fontsize=16)
ax.set_title('1/Sensor_10 vs 1/Sensor_11', fontsize=16)
plt.show()

# <span style='color:#A80808'>Sensor_12</span>

In [None]:
# Histogram of sensor_12 over all training rows (300 bins).
# The plotted range is the 20th-80th percentile window of the column, which
# is narrower than the 1st-99th window used for the other sensor histograms.
sensor_12 = train['sensor_12']
# Both bounds come from a single quantile call instead of two separate passes.
lower, upper = sensor_12.quantile([0.2, 0.8])

plt.figure(figsize=(10, 5))
plt.hist(sensor_12, bins=300, color='white', range=(lower, upper))
# Plain string: the title has no placeholders, so no f-prefix is needed.
plt.title('Histogram of sensor_12', fontsize=16)
plt.xlabel('Sensor_12', fontsize=16)
plt.ylabel('Count', fontsize=16)
plt.show()

In [None]:
# Scatter of sensor_12 against the step index for every training row,
# drawn as white point markers. The y-axis autoscales; the +/-100 limit
# is kept below but disabled.
ax = plt.figure(figsize=(10, 7)).gca()
ax.plot(train['step'], train['sensor_12'], 'w.')
ax.set_xlabel('Step', fontsize=16)
ax.set_ylabel('Sensor_12', fontsize=16)
# ax.set_ylim(-100, 100)
plt.show()

In [None]:
# Reciprocal view: 1/sensor_12 against the step index, axes autoscaled.
# NOTE(review): rows where sensor_12 is exactly 0 would become +/-inf here --
# confirm that dropping them from the picture is acceptable.
reciprocal_12 = 1 / train['sensor_12']
ax = plt.figure(figsize=(10, 7)).gca()
ax.plot(train['step'], reciprocal_12, 'w.')
ax.set_xlabel('Step', fontsize=16)
ax.set_ylabel('1/Sensor_12', fontsize=16)
plt.show()

In [None]:
# Pairwise scatter of sensor_11 (x) against sensor_12 (y) over all training
# rows. Both axes autoscale; the limits below are kept but disabled.
ax = plt.figure(figsize=(10, 7)).gca()
ax.plot(train['sensor_11'], train['sensor_12'], 'w.')
ax.set_xlabel('Sensor_11', fontsize=16)
ax.set_ylabel('Sensor_12', fontsize=16)
ax.set_title('Sensor_11 vs Sensor_12', fontsize=16)
# ax.set_xlim(-50, 50)
# ax.set_ylim(-100, 100)
plt.show()

In [None]:
# Pairwise scatter of the reciprocals 1/sensor_11 (x) and 1/sensor_12 (y),
# axes autoscaled.
# NOTE(review): exact zeros in either sensor would become +/-inf here --
# confirm that is acceptable for this plot.
inv_x = 1 / train['sensor_11']
inv_y = 1 / train['sensor_12']
ax = plt.figure(figsize=(10, 7)).gca()
ax.plot(inv_x, inv_y, 'w.')
ax.set_xlabel('1/Sensor_11', fontsize=16)
ax.set_ylabel('1/Sensor_12', fontsize=16)
ax.set_title('1/Sensor_11 vs 1/Sensor_12', fontsize=16)
plt.show()

# <span style='color:#A80808'>Submission</span>

In [None]:
# Build the submission file: load the sample submission as a template,
# overwrite its `state` column with the pseudo labels rounded to 0 decimals,
# and write the result to submission.csv without the index column.
# NOTE(review): the column assignment aligns on the row index, not on a
# sequence id -- confirm pseudo_labels rows are in the same order as the
# sample submission.
submission = pd.read_csv(
    '../input/tabular-playground-series-apr-2022/sample_submission.csv'
)
submission['state'] = pseudo_labels['state'].round()
submission.to_csv('submission.csv', index=False)