In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt

import os
# Print the full path of every file found under the Kaggle input directory.
input_files = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_files:
    print(path)

## Load and summarize the dataset

In [None]:
# Read the training CSV and preview its first five rows.
train_df = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-apr-2022/train.csv',
)
train_df.head(5)

In [None]:
# Summary statistics, transposed so each column of train_df becomes a row.
train_df.describe().transpose()

In [None]:
# Read the labels CSV and show its last ten rows.
train_labels_df = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-apr-2022/train_labels.csv',
)
train_labels_df.tail(n=10)

## Summarize subject data for each sequence

We can look at the number of unique values to see that there are a lot more sequences than there are subjects in this dataset.

In [None]:
# Count the distinct sequence ids and distinct subject ids.
train_df.loc[:, ['sequence', 'subject']].nunique()

Since each sequence has only one subject, we can use `drop_duplicates` on those two columns to get a dataframe with just the unique sequence/subject pairs. We'll merge in the labels dataframe so we can look at subject and state to see if there's any information in the subject column that we need to keep in our model.

In [None]:
# One row per unique (sequence, subject) pair, with the label columns joined
# in on the sequence id.
subjects_df = (
    train_df.loc[:, ['sequence', 'subject']]
    .drop_duplicates()
    .merge(train_labels_df, on='sequence')
)
subjects_df.head()

In [None]:
# Per-subject label summary: "mean" is the mean state over the subject's
# sequences and "len" is the number of sequences the subject has.
grouped_sub = subjects_df.groupby('subject')

# Named aggregation with string aliases keeps the original column names
# ("mean", "len"). Passing np.mean as a callable relies on pandas translating
# it to the built-in groupby mean, which recent pandas versions warn about,
# and `len` runs as a slow per-group Python function; "size" gives the same
# row count per group.
grouped_sub_states = grouped_sub['state'].agg(mean='mean', len='size')

# Trailing semicolon suppresses the Axes repr in the notebook output.
grouped_sub_states.plot.scatter(x='len', y='mean', figsize=(12, 6),
                                title="Mean state vs. subject appearances",
                                xlabel='Number of sequences', ylabel='Mean state');

From this chart we can see a correlation between the number of times a subject appears in the dataset and the state of that subject's sequences: the more sequences a subject has, the more likely each of them is in state 1. We should keep the subject id when we train our model. We'll also include the number of sequences for each subject and each subject's share of all sequences as features.

In [None]:
# Each subject's share of all sequences: its sequence count divided by the
# total count across subjects.
total_sequences = grouped_sub_states['len'].sum()
grouped_sub_states['prop'] = grouped_sub_states['len'].div(total_sequences)

In [None]:
# Lookup tables used later to attach subject information to each sequence.
# sequence id -> subject id
seq_sub_dict = subjects_df.set_index('sequence')['subject'].to_dict()
# subject id -> number of sequences for that subject
sub_len_dict = grouped_sub_states['len'].to_dict()
# subject id -> that subject's share of all sequences
sub_prop_dict = grouped_sub_states['prop'].to_dict()

## Summarize sensor data for each sequence

Let's try flattening all of the sensor data for each sequence into just a few summary statistics so we can have one row per sequence.

In [None]:
from scipy.stats import iqr, kurtosis, skew

# Column names sensor_00 ... sensor_12.
sensor_columns = ["sensor_{:02d}".format(item) for item in range(13)]

# Reducers applied to every sensor column within each sequence.
# The pandas-native ones are given as string aliases: passing the builtins
# min/max or np.mean/np.std/np.median as callables only works because pandas
# translates them to its own groupby methods, a translation recent pandas
# versions warn about and plan to drop (np.std would then switch from the
# sample standard deviation to the population one). The strings pin the
# current results and produce the same column names.
summary_stats = ['min', 'max', 'mean', 'std', 'median', iqr, kurtosis, skew]

grouped = train_df.groupby('sequence')
train_g = grouped[sensor_columns].agg(summary_stats)

# flatten the (sensor, statistic) multi-index that resulted from grouping
# into single names such as "sensor_00_min"
train_g.columns = ["_".join(a) for a in train_g.columns.to_flat_index()]

# add the subject column back (train_g is indexed by sequence id)
train_g['subject'] = train_g.index.map(seq_sub_dict)

# add the sequence count and proportion for each subject
train_g['subject_seq_count'] = train_g['subject'].map(sub_len_dict)
train_g['subjects_seq_prop'] = train_g['subject'].map(sub_prop_dict)
train_g

In [None]:
from sklearn.model_selection import train_test_split

# Feature matrix: one row per sequence, indexed by sequence id.
X = train_g

# Look the labels up by sequence id rather than relying on train_labels_df
# happening to be in the same row order as train_g; a sequence without a
# label raises a KeyError instead of silently pairing features with the
# wrong state.
y = train_labels_df.set_index('sequence')['state'].loc[X.index]
assert len(y) == len(X), "expected exactly one label per sequence"

# NOTE(review): this is a random split over sequences, so sequences from the
# same subject can land on both sides; the hold-out score may be optimistic
# for unseen subjects — consider a grouped split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

## Build a model

In [None]:
from xgboost import XGBClassifier

# Build the classifier with a fixed random_state so repeated runs give the
# same results, and train it on the training split in one step.
model = XGBClassifier(random_state=1).fit(X_train, y_train)

# Predicted class labels for the held-out split.
y_pred = model.predict(X_test)

### Determine the score

In [None]:
from sklearn.metrics import roc_auc_score

# ROC AUC measures how well the model *ranks* samples, so it must be given a
# continuous score. Scoring the hard 0/1 output of predict() collapses the
# curve to a single threshold and understates the AUC. Column 1 of
# predict_proba is the probability of the class with the greater label.
y_score = model.predict_proba(X_test)[:, 1]

auc = roc_auc_score(y_test, y_score)
print("AUC: %.5f" % auc)

That's not a great score given some of the other submissions already on the leaderboard, but it's not a bad baseline for a very simple model.

## Create a submission

In [None]:
test_df = pd.read_csv(
    filepath_or_buffer='../input/tabular-playground-series-apr-2022/test.csv',
)

# Unique (sequence, subject) pairs found in the test data.
subjects_test_df = test_df.loc[:, ['sequence', 'subject']].drop_duplicates()

# Number of sequences per subject, plus each subject's share of all test
# sequences.
# NOTE(review): 'prop' is relative to the test set's own total, while the
# training feature is relative to the training total — confirm the two are on
# a comparable scale.
grouped_test_sub = subjects_test_df.groupby('subject')
grouped_test_sub_counts = grouped_test_sub['sequence'].agg([len])
total_test_sequences = grouped_test_sub_counts['len'].sum()
grouped_test_sub_counts['prop'] = grouped_test_sub_counts['len'].div(total_test_sequences)

# Lookup tables: sequence -> subject, subject -> count, subject -> proportion.
seq_sub_test_dict = subjects_test_df.set_index('sequence')['subject'].to_dict()
sub_len_test_dict = grouped_test_sub_counts['len'].to_dict()
sub_prop_test_dict = grouped_test_sub_counts['prop'].to_dict()

In [None]:
# Display the per-subject sequence counts ('len') and proportions ('prop')
# computed from the test data.
grouped_test_sub_counts

In [None]:
# Summarize every sensor column of each test sequence with the same eight
# statistics used for the training features.
# The pandas-native reducers are given as string aliases: passing the builtins
# min/max or np.mean/np.std/np.median as callables only works because pandas
# translates them to its own groupby methods, a translation recent pandas
# versions warn about and plan to drop (np.std would then switch from the
# sample standard deviation to the population one). The strings pin the
# current results and produce the same column names.
grouped = test_df.groupby('sequence')
test_g = grouped[sensor_columns].agg(['min', 'max', 'mean', 'std', 'median', iqr, kurtosis, skew])

# flatten the (sensor, statistic) multi-index that resulted from grouping
# into single names such as "sensor_00_min"
test_g.columns = ["_".join(a) for a in test_g.columns.to_flat_index()]

# add the subject column back (test_g is indexed by sequence id)
test_g['subject'] = test_g.index.map(seq_sub_test_dict)

# add the sequence count and proportion for each subject
test_g['subject_seq_count'] = test_g['subject'].map(sub_len_test_dict)
test_g['subjects_seq_prop'] = test_g['subject'].map(sub_prop_test_dict)

test_g.head()

In [None]:
# Fit model to entire training dataset
model.fit(X, y)
y_pred = model.predict(test_g)

In [None]:
# Pair each test sequence id with its prediction, write the result to
# submission.csv without the row index, and display it.
submission_df = pd.DataFrame({'sequence': test_g.index}).assign(state=y_pred)
submission_df.to_csv(path_or_buf='submission.csv', index=False)
submission_df