In [16]:
import autorootcwd  # noqa
import numpy as np
import pandas as pd

## Data loading

In [17]:
classes = np.load("data/raw/Classes.npy", allow_pickle=True)
sensor_1 = np.load("data/raw/Dados_1.npy", allow_pickle=True)
sensor_2 = np.load("data/raw/Dados_2.npy", allow_pickle=True)
sensor_3 = np.load("data/raw/Dados_3.npy", allow_pickle=True)
sensor_4 = np.load("data/raw/Dados_4.npy", allow_pickle=True)
sensor_5 = np.load("data/raw/Dados_5.npy", allow_pickle=True)

In [18]:
df_classes = pd.DataFrame(classes, columns=["classes"])
df_classes.value_counts()

classes 
Classe A    10000
Classe B    10000
Classe C    10000
Classe D    10000
Classe E    10000
Name: count, dtype: int64

In [19]:
df_classes

Unnamed: 0,classes
0,Classe D
1,Classe A
2,Classe A
3,Classe B
4,Classe D
...,...
49995,Classe B
49996,Classe E
49997,Classe D
49998,Classe A


## Checking data

By observing the sensor data, the sensor shapes different between the 3 first sensors to the last 2. 

In [20]:
sensor_1.shape, sensor_2.shape, sensor_3.shape, sensor_4.shape, sensor_5.shape

((50000, 201), (50000, 201), (50000, 201), (50000, 200), (50000, 200))

Counting number of samples with NaNs for each sensor.

### Sensor 1

In [21]:
pd.Series(np.isnan(sensor_1).any(axis=1)).value_counts()

True     49999
False        1
Name: count, dtype: int64

In [22]:
pd.Series(sensor_1[:, -1]).value_counts(dropna=False)

NaN    49999
2.0        1
Name: count, dtype: int64

### Sensor 2

In [23]:
pd.Series(np.isnan(sensor_2).any(axis=1)).value_counts()

True     49999
False        1
Name: count, dtype: int64

### Sensor 3

In [24]:
pd.Series(np.isnan(sensor_3).any(axis=1)).value_counts()

True     49999
False        1
Name: count, dtype: int64

### Sensor 4

Sensor 4 has an expressive number of samples with NaNs. Let's check this in detail.

In [25]:
pd.Series(np.isnan(sensor_4).any(axis=1)).value_counts()

False    40869
True      9131
Name: count, dtype: int64

Counting the number of NaNs that occur in each sample

In [26]:
nan_counts = np.isnan(sensor_4).sum(axis=1)
pd.Series(nan_counts).value_counts()

array([0, 0, 0, ..., 1, 0, 0])

### Sensor 5

Sensor 5 has a lot of samples with at least one NaN value.

In [12]:
pd.Series(np.isnan(sensor_5).any(axis=1)).value_counts()

False    30308
True     19692
Name: count, dtype: int64

Sensor 5 seems to be the worst in terms of NaN appearance per sample. 

In [28]:
nan_counts = np.isnan(sensor_5).sum(axis=1)
pd.Series(nan_counts).value_counts()

0     30308
10    12107
9      2817
20     1453
19     1450
18      687
8       252
28      185
17      168
29      159
27      121
30       62
26       61
16       33
37       25
36       18
7        15
25       14
38       10
35       10
34        9
24        8
39        5
15        5
33        5
32        2
46        2
45        1
42        1
48        1
31        1
14        1
55        1
43        1
40        1
44        1
Name: count, dtype: int64

In [32]:
print(f"Worst case having {(44 / 200) * 100:.1f}% of NaNs")

Worst case having 22.0% of NaNs


- For the first three sensors, it is observed that there are only one case that doesn't contain NaN values.
- It can also be observed by checking the last value of each sample, that it matches the same proportion. Meaning that the last value of most of the samples is NaN.

In [13]:
sensor_1, sensor_2, sensor_3 = sensor_1[:,:-1], sensor_2[:,:-1], sensor_3[:,:-1]

sensor_1.shape, sensor_2.shape, sensor_3.shape, sensor_4.shape, sensor_5.shape

((50000, 200), (50000, 200), (50000, 200), (50000, 200), (50000, 200))

- By dropping the last value for the first three sensors, the shapes are now the same and now the first three sensors do not contain NaN values.

In [14]:
assert np.isnan(sensor_1).any(axis=1).any() == False
assert np.isnan(sensor_2).any(axis=1).any() == False
assert np.isnan(sensor_3).any(axis=1).any() == False