Problem statement

Welcome to the April edition of the 2022 Tabular Playground Series! This month's challenge is a time series classification problem.

You've been provided with thousands of sixty-second sequences of biological sensor data recorded from several hundred participants who could have been in either of two possible activity states. Can you determine what state a participant was in from the sensor data?

Import libraries

In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Retrieve files

In [None]:
import os

# Print the full path of every file found anywhere under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))


Read files

In [None]:
# Directory holding the competition files; change this single constant to
# run the notebook against a copy of the data stored elsewhere.
DATA_DIR = "/kaggle/input/tabular-playground-series-apr-2022"

# Sensor readings for the labelled sequences and their per-sequence labels.
train = pd.read_csv(f"{DATA_DIR}/train.csv")
labels = pd.read_csv(f"{DATA_DIR}/train_labels.csv")
# Sensor readings to predict on, and the template of the file to submit.
test = pd.read_csv(f"{DATA_DIR}/test.csv")
submission = pd.read_csv(f"{DATA_DIR}/sample_submission.csv")

In [None]:
# Show the training frame read from train.csv.
train

In [None]:
# Show the labels frame read from train_labels.csv.
labels

In [None]:
# Show the test frame read from test.csv.
test

In [None]:
# Show the submission template read from sample_submission.csv.
submission

Prepare training data

Map labels to train

In [None]:
# Lookup from sequence id to its state, built from the labels frame.
state_by_sequence = labels.set_index('sequence')['state']

# Give every training row the state of the sequence it belongs to.
train['state'] = train['sequence'].map(state_by_sequence)
train

Analyse label

In [None]:
# Distribution of the target label across the training rows.
sns.histplot(data=train, x='state')

Scatter plot

In [None]:
# Split the rows by label once instead of re-filtering the frame for every argument.
state_1_rows = train[train['state'] == 1]
state_0_rows = train[train['state'] == 0]

fig, ax = plt.subplots(figsize=(10, 7))

# sensor_00 against sensor_01, coloured by state; marker area follows sensor_02.
# NOTE(review): matplotlib expects non-negative marker sizes — confirm that
# sensor_02 has no negative readings, otherwise those points are not drawn.
ax.scatter(state_1_rows['sensor_00'], state_1_rows['sensor_01'], c='green',
           s=state_1_rows['sensor_02'], label='state 1')
ax.scatter(state_0_rows['sensor_00'], state_0_rows['sensor_01'], c='red',
           s=state_0_rows['sensor_02'], label='state 0')

# Label the axes and the colours so the figure can be read on its own.
ax.set(xlabel='sensor_00', ylabel='sensor_01', title='sensor_00 vs sensor_01 by state')
ax.legend();

Combine train and test

Drop state from train

In [None]:
# Keep the labels aside as the target vector before removing them from the features.
target = train.loc[:, 'state']

# Feature frame without the label column.
train = train.drop(columns=['state'])
train

In [None]:
# Stack the train rows on top of the test rows so both receive the same
# column processing. pd.concat replaces DataFrame.append, which was
# deprecated in pandas 1.4 and removed in pandas 2.0. The original row
# labels are kept (no ignore_index), matching what append produced.
combi = pd.concat([train, test])
combi

Drop identifier columns (sequence, subject, step)

In [None]:
# Columns removed from the combined frame before modelling.
id_columns = ['sequence', 'subject', 'step']
combi = combi.drop(columns=id_columns)
combi

Remove columns that have high correlation

In [None]:
# A feature is treated as redundant when its absolute correlation with
# another feature exceeds this value.
CORR_THRESHOLD = 0.90

# Absolute pairwise correlation between all columns
corr_matrix = combi.corr().abs()

# Keep only the strict upper triangle (k=1 excludes the diagonal) so each
# pair is considered once and no column is compared with itself
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find features whose correlation with an earlier column is greater than CORR_THRESHOLD
to_drop = [column for column in upper.columns if (upper[column] > CORR_THRESHOLD).any()]

# Drop the redundant features (re-assign instead of mutating in place)
combi = combi.drop(columns=to_drop)
combi


Heatmap

In [None]:
# Correlation between the features that remain in the combined frame.
corr = combi.corr()

# Draw the heatmap on an explicit 8x8 axes rather than the implicit current one.
heat_fig, heat_ax = plt.subplots(figsize=(8, 8))
sns.heatmap(corr, ax=heat_ax)

Define X and y

In [None]:
# The combined frame holds the train rows first, followed by the test rows,
# so a positional split at len(train) separates them again.
n_train_rows = len(train)

y = target
X = combi.iloc[:n_train_rows]
X_test = combi.iloc[n_train_rows:]

Split into train and validation

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 5% of the rows for validation, keeping the class balance of y.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.05,
    random_state=1,
    shuffle=True,
    stratify=y,
)

# Shapes of every split, in the order train / validation / test.
tuple(part.shape for part in (X_train, X_val, y_train, y_val, X_test))

Select model

In [None]:
from sklearn.linear_model import LogisticRegression

# Build the classifier first, then fit it on the training split.
model = LogisticRegression(C=100, random_state=42)
model.fit(X_train, y_train)

# Mean accuracy on the rows the model was trained on.
train_accuracy = model.score(X_train, y_train)
print(train_accuracy)

Predict on Validation set

In [None]:
# Class predictions for the held-out rows (reused by the confusion matrix cell).
y_pred = model.predict(X_val)

# Mean accuracy on the validation split.
val_accuracy = model.score(X_val, y_val)
print(val_accuracy)

Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix

# Rows are the true labels, columns are the predicted labels.
val_confusion = confusion_matrix(y_true=y_val, y_pred=y_pred)
print(val_confusion)

Predict on X_test

In [None]:
# Predict a class label for every row of the test features.
predictions = model.predict(X_test)
predictions

Prepare and submit

In [None]:
# Attach the per-row predictions to the rows of the test frame.
test['state'] = predictions

# Collapse the per-row predictions to one label per sequence: a sequence is
# labelled 1 when any of its rows was predicted 1, otherwise 0 (the max of
# 0/1 labels). Every row of a sequence contributes, including sequences
# that consist of a single row. sort=False keeps the sequences in the order
# in which they first appear in `test`.
# NOTE(review): "any row" is a permissive rule; a majority vote
# (group mean >= 0.5) may be worth comparing on the validation data.
sequence_state = test.groupby('sequence', sort=False)['state'].max()

# Assign plain arrays so the values are written positionally rather than
# aligned on the submission index; pandas raises ValueError if the number
# of sequences differs from the number of submission rows.
submission['sequence'] = sequence_state.index.to_numpy()
submission['state'] = sequence_state.to_numpy()

submission

In [None]:
# Distribution of the predicted state across the submitted sequences.
sns.displot(data=submission, x='state')

Save submission

In [None]:
# Write the submission without the index column, then read the file back
# so the displayed frame is exactly what was saved to disk.
submission_file = "submission.csv"
submission.to_csv(submission_file, index=False)
submission = pd.read_csv(submission_file)
submission