In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

In [None]:
# Load the three CSV files from the input directory:
# the training rows, the test rows and the per-sequence training labels.
train, test, labels = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-apr-2022/train.csv',
        '../input/tabular-playground-series-apr-2022/test.csv',
        '../input/tabular-playground-series-apr-2022/train_labels.csv',
    )
)

In [None]:
# Preview the first rows of the training frame.
train.head()

In [None]:
# Column names, dtypes and memory usage of the training frame.
train.info()

In [None]:
# Column names, dtypes and memory usage of the labels frame.
labels.info()

In [None]:
# Number of training rows recorded for each sequence id.
train['sequence'].value_counts()

In [None]:
# Number of test rows recorded for each sequence id.
test['sequence'].value_counts()

In [None]:
# Number of training rows contributed by each subject id.
train['subject'].value_counts()

In [None]:
# Number of test rows contributed by each subject id.
test['subject'].value_counts()

In [None]:
# Distinct subject ids in each split, in order of first appearance.
subs = pd.unique(train['subject'])
subs_test = pd.unique(test['subject'])

In [None]:
# Keep every test subject id that also occurs among the training subject ids.
same_subs = [test_sub for test_sub in subs_test if test_sub in subs]

same_subs

Test subjects are different from the train subjects: none of the test subjects appear in the training set.

In [None]:
# Number of training rows observed at each step value.
train['step'].value_counts()

In [None]:
# Class balance of the target: how many labelled sequences fall in each state.
labels['state'].value_counts()

In [None]:
def _subjects_by_sequence(sequences, subjects):
    """Map each sequence id to the set of subject ids seen on its rows.

    Parameters
    ----------
    sequences, subjects : array-like
        Row-aligned sequence ids and subject ids (same length).

    Returns
    -------
    defaultdict
        ``{sequence id: set of subject ids}``.
    """
    grouped = defaultdict(set)
    for sequence_id, subject_id in zip(sequences, subjects):
        grouped[sequence_id].add(subject_id)
    return grouped


seq = np.array(train['sequence'])
subject = np.array(train['subject'])

seq_test = np.array(test['sequence'])
# Read the *subject* column here. Pairing the test sequence ids with themselves
# would give every sequence exactly one "subject" and make the multi-subject
# check on the test data vacuous.
subject_test = np.array(test['subject'])

# sequence id -> set of subjects that recorded rows in that sequence
sub_dict = _subjects_by_sequence(seq, subject)
sub_dict_test = _subjects_by_sequence(seq_test, subject_test)
    

In [None]:
# Turn each sequence's subject collection into a list and report every
# training sequence that was recorded by more than one subject.
for key in sub_dict:
    subjects_seen = list(sub_dict[key])
    sub_dict[key] = subjects_seen
    if len(subjects_seen) > 1:
        print('yes')

In [None]:
# Report every test sequence whose entry holds more than one distinct value.
for subjects_seen in sub_dict_test.values():
    if len(subjects_seen) > 1:
        print('yes')

In [None]:
# Attach the per-sequence `state` label to every row of its sequence.
# Joining on the `sequence` column itself (rather than against the positional
# index of `labels`) does not depend on the labels' row order matching the
# sequence ids, and it leaves a single `sequence` column, so no suffixed
# duplicates need to be dropped afterwards.
# `validate` raises if `labels` ever contains more than one row per sequence.
if 'state' not in train.columns:  # keeps the cell safe to re-run
    train = train.merge(labels, on='sequence', validate='many_to_one')

In [None]:
# Preview the training frame after the labels were merged in.
train.head()

Each sequence consists of 60 steps taken by exactly one subject. There are no sequences in either the train or the test data in which multiple subjects take steps in the same sequence. However, a subject can appear in multiple sequences. The graph below is from [TPS April 2022: EDA & Model](https://www.kaggle.com/code/sytuannguyen/tps-april-2022-eda-model?scriptVersionId=92019269).

In [None]:
# Group once and reuse the result for both axes.
rows_by_subject = train.groupby('subject')
row_counts = rows_by_subject.size()

# x: rows per subject divided by 60, y: share of rows with state 1 in percent.
sequences_per_subject = row_counts / 60
pct_state_1 = rows_by_subject.state.sum() / row_counts * 100

fig, ax = plt.subplots(figsize=(10, 7))
ax.plot(sequences_per_subject, pct_state_1, '.r')
ax.set_xlabel('Number of sequences per subject', fontsize=16)
ax.set_ylabel('% state 1', fontsize=16)
ax.set_title('The influence of number of sequences per subject on the % of state 1', fontsize=16)
plt.show()

In [None]:
# `train` already carries the `state` label from the earlier merge. Merging
# `labels` a second time only produced duplicate `state_x`/`state_y` columns
# that were then collapsed back to the same values, so a plain copy yields the
# same frame. Working on a copy keeps `train` untouched by the later cells
# that modify `df`.
df = train.copy()
assert 'state' in df.columns, "expected `train` to already contain the merged `state` label"

In [None]:
# Preview the working copy used for the visuals and feature selection below.
df.head()

# Sensor Visuals

In [None]:
# Split the rows by target value so each state can be plotted separately.
state_1 = df.loc[df['state'] == 1]
state_0 = df.loc[df['state'] == 0]

In [None]:
sensors = [
    'sensor_00', 'sensor_01', 'sensor_02', 'sensor_03', 'sensor_04',
    'sensor_05', 'sensor_06', 'sensor_07', 'sensor_08', 'sensor_09',
    'sensor_10', 'sensor_11', 'sensor_12',
]

# One figure per sensor: state 1 on the left, state 0 on the right,
# both drawn over the same fixed x-range so the panels are comparable.
for sensor in sensors:
    fig, (ax_state_1, ax_state_0) = plt.subplots(1, 2, figsize=(10, 3))
    panels = ((ax_state_1, state_1, 1), (ax_state_0, state_0, 0))
    for ax, frame, state_value in panels:
        sns.histplot(data=frame, x=frame[sensor], color="skyblue", ax=ax)
        ax.set_title(f'{sensor} State {state_value}')
        ax.set_xlim(xmin=-3.5, xmax=3.5)

    plt.tight_layout()
    plt.show()

# Feature Selection

In [None]:
# Separate the target from the predictors; afterwards `df` holds only the
# feature columns. Running this cell twice fails because `state` is gone.
y_train = df.loc[:, 'state']
df = df.drop(columns=['state'])

In [None]:
from sklearn.feature_selection import mutual_info_classif

# `state` is a class label (0/1), so use the classification estimator of
# mutual information rather than the regression one, which treats the target
# as a continuous variable.
# A fixed `random_state` makes the scores reproducible: the estimator adds
# small random noise to continuous features before computing neighbours.
mi_scores = mutual_info_classif(df, y_train, random_state=42)

# Label each score with its feature name and rank from most to least informative.
mi_scores = pd.Series(mi_scores, name="MI_score", index=df.columns)
mi_scores = mi_scores.sort_values(ascending=False)

# Two-column table (`feature`, `MI_score`) for display.
df_mi_scores1 = pd.DataFrame(mi_scores).reset_index().rename(columns={'index':'feature'})
df_mi_scores1

In [None]:
import eli5
from catboost import CatBoostClassifier
from eli5.sklearn import PermutationImportance

# Fit a CatBoost classifier on all feature columns with a fixed seed.
model1 = CatBoostClassifier(
    loss_function='Logloss',
    eval_metric='Logloss',
    random_state=42,
    logging_level='Silent',
)
model1.fit(df, y_train)

# Permutation importance of each feature for the fitted model.
# NOTE(review): the importances are computed on the same rows the model was
# fitted on, and `df` still contains the `sequence`, `subject` and `step`
# columns -- confirm this is intended before acting on the ranking.
perm1 = PermutationImportance(model1, random_state=1)
perm1.fit(df, y_train)

eli5.show_weights(perm1, feature_names=list(df.columns), top=None)

Although each sequence belongs to a single subject, `subject` is still shown as a very informative feature.
The `step` feature is shown to be uninformative and can most likely be dropped.