In [None]:
# https://www.kaggle.com/code/ambrosm/tpsapr22-eda-which-makes-sense

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import MaxNLocator
from cycler import cycler
from IPython.display import display
import datetime

from sklearn.decomposition import PCA
# Plot styling for the whole notebook: grey axes background, and the default
# color cycle with its first entry replaced by lime.
plt.rcParams.update({
    'axes.facecolor': '#575757',
    'axes.prop_cycle': cycler(
        color=['lime', *plt.rcParams['axes.prop_cycle'].by_key()['color'][1:]]),
})

In [None]:
# Read the three competition tables in one pass: training rows, the labels
# for the training sequences, and the test rows.
train, train_labels, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/tabular-playground-series-apr-2022/train.csv',
        '../input/tabular-playground-series-apr-2022/train_labels.csv',
        '../input/tabular-playground-series-apr-2022/test.csv',
    )
)

In [None]:
# Load the sample submission file.
ss = pd.read_csv('../input/tabular-playground-series-apr-2022/sample_submission.csv')

In [None]:
# Show the raw training frame.
train

In [None]:
# Smallest sequence id in the training rows.
train.sequence.min()

In [None]:
# Largest sequence id in the training rows.
train.sequence.max()

In [None]:
# Smallest subject id in the training rows.
train.subject.min()

In [None]:
# Largest subject id in the training rows.
train.subject.max()

In [None]:
# Number of training rows per subject, most frequent first.
train.subject.value_counts()

In [None]:
# Smallest step value in the training rows.
train.step.min()

In [None]:
# Largest step value in the training rows.
train.step.max()

In [None]:
# Show the label frame for the training sequences.
train_labels

In [None]:
# Smallest sequence id that has a label.
train_labels.sequence.min()

In [None]:
# Largest sequence id that has a label.
train_labels.sequence.max()

In [None]:
# Distinct values taken by the `state` label.
train_labels.state.unique()

In [None]:
# Show the raw test frame.
test

In [None]:
# Smallest sequence id in the test rows.
test.sequence.min()

In [None]:
# Largest sequence id in the test rows.
test.sequence.max()

In [None]:
# Smallest subject id in the test rows.
test.subject.min()

In [None]:
# Largest subject id in the test rows.
test.subject.max()

In [None]:
# Number of test rows per subject, most frequent first.
test.subject.value_counts()

In [None]:
# Show the sample submission frame.
ss

In [None]:
def plot_sequence_count_distribution(df, title):
    """Bar-plot and print how many sequences each subject contributes.

    Draws on the current matplotlib axes. The per-subject row count is
    integer-divided by 60 to turn rows into sequences (this assumes 60 rows
    per sequence).

    df: frame with a `subject` column.
    title: text inserted into the plot title and the printed series name.
    """
    sequences_per_subject = df['subject'].value_counts().sort_values() // 60
    bar_positions = range(len(sequences_per_subject))
    plt.bar(bar_positions, sequences_per_subject, width=1)
    plt.xlabel('SUBJECT')
    plt.ylabel('SEQUENCE COUNT')
    plt.title(f'SEQUENCE COUNT DISTRIBUTION ON {title} SUBJECTS')
    printable = sequences_per_subject.sort_values()
    print(printable.rename(f'SEQUENCE COUNT PER {title} SUBJECT'))


# Two side-by-side panels sharing the y axis: training subjects, then test subjects.
plt.subplots(1, 2, sharey=True, figsize=(16, 4))
for panel, (frame, label) in enumerate([(train, 'TRAINING'), (test, 'TEST')], start=1):
    plt.subplot(1, 2, panel)
    plot_sequence_count_distribution(frame, label)

plt.show()

In [None]:
# Row count per sequence id, without sorting by frequency.
train.sequence.value_counts(sort=False)

In [None]:
# Row count per subject id, without sorting by frequency.
train.subject.value_counts(sort=False)

In [None]:
# Smallest subject id observed within each sequence.
temp_min = train.groupby('sequence').subject.min()
temp_min

In [None]:
# Largest subject id observed within each sequence.
temp_max = train.groupby('sequence').subject.max()
temp_max

In [None]:
# compare() keeps only the entries where the two series differ, so an empty
# result means every sequence belongs to exactly one subject.
temp_min.compare(temp_max)

In [None]:
# Show the label frame again before joining the subject ids onto it.
train_labels

In [None]:
# Attach each sequence's subject id (temp_min, keyed by sequence) to its label row.
temp = train_labels.merge(temp_min, on='sequence')
temp

In [None]:
# Mean of `state` per subject.
temp.groupby('subject')\
    .agg({'state': 'mean'})

In [None]:
# Same per-subject mean, plus the number of labelled sequences per subject.
temp.groupby('subject')\
    .agg({'state': 'mean', 'sequence': 'count'})

In [None]:
# One row per subject: mean label (`probability`) and number of labelled
# sequences (`sequence_count`), built with named aggregation.
temp = (
    temp.groupby('subject')
    .agg(probability=('state', 'mean'),
         sequence_count=('sequence', 'count'))
)
temp

In [None]:
# Keep the per-subject probability only for subjects with at least 25 sequences.
enough_sequences = temp['sequence_count'] >= 25
temp_1 = temp.loc[enough_sequences, 'probability'].rename('STATE PROBABILITY == 1')
temp_1

In [None]:
plt.figure(figsize=(14, 6))

# Left panel: distribution of per-subject probabilities (temp_1).
histogram_panel = plt.subplot(1, 2, 1)
histogram_panel.hist(temp_1, bins=20)
histogram_panel.set_ylabel('SUBJECT COUNT')
histogram_panel.set_xlabel('STATE PROBABILITY == 1')
histogram_panel.set_title('SUBJECT STATE PROBABILITIES HISTOGRAM')

# Right panel: per-subject probability against the subject's sequence count.
scatter_panel = plt.subplot(1, 2, 2)
scatter_panel.scatter(temp.sequence_count, temp.probability)
scatter_panel.set_xlabel('SEQUENCE COUNT')
scatter_panel.set_ylabel('PROBABILITY')
scatter_panel.set_title('PROBABILITY ON SEQUENCE COUNT DEPENDENCE')

plt.show()

In [None]:
# Subjects with at least 25 labelled sequences.
temp[temp.sequence_count >= 25]

In [None]:
# Standard deviation of the per-subject probability among those subjects.
temp[temp.sequence_count >= 25].probability.std()

In [None]:
# Number of subjects whose probability is exactly 0.
(temp.probability == 0).sum()

In [None]:
# Spread of the per-subject probability among subjects with >= 25 sequences,
# and the number of subjects whose probability is exactly 0.
frequent_subjects = temp[temp.sequence_count >= 25]
probability_std = frequent_subjects.probability.std()
always_zero_count = (temp.probability == 0).sum()

print(f'\nTHE STANDARD DEVIATION - {probability_std:.2f} IS MUCH HIGHER THAN 0.1.')
print(f'\nSUBJECTS PERMANENTLY IN STATE 0: {always_zero_count}')

In [None]:
# One 100-bin histogram per sensor column (sensor_00 .. sensor_12) on a 4x4 grid,
# using the full value range of each sensor.
figure = plt.figure(figsize=(16, 8))
sensor_names = [f'sensor_{sensor:02d}' for sensor in range(13)]
for position, sensor_name in enumerate(sensor_names, start=1):
    panel = plt.subplot(4, 4, position)
    panel.hist(train[sensor_name], bins=100)
    panel.set_title(f'{sensor_name} HISTOGRAM')

figure.tight_layout(h_pad=1.0, w_pad=0.5)
plt.suptitle('SENSOR HISTOGRAMS BEFORE REMOVING OUTLIERS', y=1.02)
plt.show()

In [None]:
# Same 4x4 grid of sensor histograms, but each histogram's range is limited to
# the sensor's 2%..98% quantile interval so the tails do not dominate the bins.
figure = plt.figure(figsize=(16, 8))
for sensor in range(13):
    sensor_name = f'sensor_{sensor:02d}'
    readings = train[sensor_name]
    central_range = (readings.quantile(0.02), readings.quantile(0.98))
    plt.subplot(4, 4, sensor + 1)
    plt.hist(readings, bins=100, range=central_range)
    plt.title(f'{sensor_name} HISTOGRAM')
figure.tight_layout(h_pad=1.0, w_pad=0.5)
plt.suptitle('SENSOR HISTOGRAMS AFTER REMOVING OUTLIERS')

plt.show()

In [None]:
# Histogram of sensor_12 restricted to its 15%..85% quantile interval.
sensor_name = 'sensor_12'
sensor_12_values = train[sensor_name]
lower_bound, upper_bound = (sensor_12_values.quantile(q) for q in (0.15, 0.85))
plt.hist(sensor_12_values, bins=100, range=(lower_bound, upper_bound))

plt.show()

In [None]:
# Print how many distinct values each sensor column takes in the training rows.
print('UNIQUE VALUES COUNT PER SENSOR:')
for sensor_name in (f'sensor_{sensor:02d}' for sensor in range(13)):
    unique_count = np.unique(train[sensor_name]).size
    print(f'{sensor_name}: {unique_count:6d}')

In [None]:
# Grid of line plots: one column per selected sequence, one row per sensor.
sequences = [0, 1, 2, 8364, 15404]
figure, axes = plt.subplots(13, len(sequences), sharex=True, figsize=(16, 16))
palette = plt.rcParams['axes.prop_cycle'].by_key()['color']

for column, sequence in enumerate(sequences):
    # Rows of the current sequence, selected once and reused for all 13 sensors.
    sequence_rows = train[train.sequence == sequence]
    for sensor in range(13):
        sensor_name = f'sensor_{sensor:02d}'
        panel = axes[sensor, column]
        panel.plot(range(60), sequence_rows[sensor_name],
                   color=palette[column % 10])
        if sensor == 0:
            # Top row carries the sequence id as the column header.
            panel.set_title(f'SEQUENCE {sequence}')
        if column == 0:
            # Leftmost column carries the sensor name as the row label.
            panel.set_ylabel(sensor_name)
figure.tight_layout(w_pad=0.1)
plt.suptitle('SELECTED TIME SERIES', y=1.02)

plt.show()

In [None]:
def stuck_at_constant(seq):
    """Return whether `seq` never changes, i.e. its minimum equals its maximum."""
    lowest, highest = seq.min(), seq.max()
    return lowest == highest

# Key the labels by sequence id so the lookup below matches on the id itself,
# not on train_labels' row positions happening to equal the sequence ids.
labels_by_sequence = train_labels.set_index('sequence').state

for sensor in range(13):
    sensor_name = f'sensor_{sensor:02d}'
    # Boolean Series indexed by sequence id: True where the sensor never changes.
    stuck_sequences = train.groupby('sequence')[sensor_name].apply(stuck_at_constant)
    stuck_ids = stuck_sequences[stuck_sequences].index
    # Prints the number of stuck sequences and the mean label among them
    # (NaN when no sequence is stuck for this sensor).
    print(f'{sensor_name}: ' + \
          f'{stuck_sequences.sum():4d}  {labels_by_sequence.loc[stuck_ids].mean()}')

In [None]:
# Reshape from long to wide: one row per (sequence, subject) and one column
# per (sensor, step) pair.
sensor_columns = [name for name in train.columns if 'sensor_' in name]
train_pivoted = train.pivot(
    index=['sequence', 'subject'],
    columns='step',
    values=sensor_columns,
)
train_pivoted

In [None]:
# Sort by every sensor reading so identical sequences sit next to each other
# in the displayed frame.
temp = train_pivoted.sort_values(by=list(train_pivoted.columns))
# Mark duplicates from both directions so every member of a duplicate group
# is flagged, including its first and last occurrence.
duplicates_first = temp.duplicated(keep='first')
duplicates_last = temp.duplicated(keep='last')
temp['duplicates_first'] = duplicates_first
temp['duplicates_last'] = duplicates_last
duplicates = temp[duplicates_first | duplicates_last]
display(duplicates)
# axis=1 applies the check to each row, i.e. to one sequence's sensor_00
# readings across all steps. The default axis=0 would instead compare each
# step column across the duplicate rows, which is a different question.
dup_sen_00 = duplicates['sensor_00'].apply(stuck_at_constant, axis=1).all()
print(f'{dup_sen_00}')
# Look the labels up by sequence id (index level 0) rather than by row position.
duplicate_ids = duplicates.index.get_level_values(0)
duplicate_labels = train_labels.set_index('sequence').loc[duplicate_ids, 'state']
print(f'{list(duplicate_labels)}')

In [None]:
def plot_pca(df, col, title):
    """Fit a PCA on `df` and draw two panels in column `col` of a 2x2 grid.

    The top panel shows the cumulative explained variance ratio over the
    number of components; the bottom panel scatters the samples on the first
    two principal components. Draws on the current matplotlib figure.

    df: frame whose `.values` is the samples-by-features matrix to decompose.
    col: 0 or 1, the grid column to draw into.
    title: title of the top panel.
    """
    pca = PCA()
    Xt = pca.fit_transform(df.values)

    plt.subplot(2, 2, col + 1)
    plt.plot(np.cumsum(pca.explained_variance_ratio_))
    plt.xlabel('COMPONENTS NUMBER')
    plt.ylabel('CUMULATIVE EXPLAINED VARIANCE')
    plt.title(title)

    plt.subplot(2, 2, col + 3)
    # fit_transform returns one row per sample and one column per component,
    # so the components are selected by column. Xt[0] and Xt[1] would be the
    # first two samples, not the first two components.
    plt.scatter(Xt[:, 0], Xt[:, 1])
    plt.xlabel('FIRST PRINCIPAL COMPONENT')
    plt.ylabel('SECOND PRINCIPAL COMPONENT')
    
# Clip every (sensor, step) column to its own 2%..98% quantile interval, then
# drop all sensor_12 columns from the clipped copy.
lower_bounds = train_pivoted.quantile(0.02, axis=0).values
upper_bounds = train_pivoted.quantile(0.98, axis=0).values
temp = (
    train_pivoted
    .clip(lower_bounds, upper_bounds, axis=1)
    .drop(columns='sensor_12', level=0)
)

# Left column: PCA on the unmodified pivot; right column: PCA on the clipped copy.
plt.figure(figsize=(12, 8))
pca_inputs = [(train_pivoted, 'BEFORE OUTLIER REMOVING'),
              (temp, 'AFTER OUTLIER REMOVING')]
for grid_column, (frame, caption) in enumerate(pca_inputs):
    plot_pca(frame, grid_column, caption)
plt.suptitle('PRINCIPAL COMPONENT ANALYSIS')
plt.tight_layout(h_pad=1.1)

plt.show()