# Initial Exploration - SKAB

2026-01-15

Quick look at the data to see what we're dealing with.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Notebook-wide display defaults. The matplotlib style is applied first and the
# seaborn palette second, so the palette call has the final say on line colours.
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette('husl')
plt.rcParams.update({'figure.figsize': (14, 6)})
# None lifts the column cap so wide frames are displayed without truncation.
pd.options.display.max_columns = None

In [None]:
# Read the anomaly-free recording: a ';'-separated CSV whose 'datetime'
# column is parsed as timestamps and used as the index.
data_dir = Path('../data/raw/SKAB/data')
file_path = data_dir.joinpath('anomaly-free', 'anomaly-free.csv')

df = pd.read_csv(
    file_path,
    sep=';',
    index_col='datetime',
    parse_dates=['datetime'],
)
# Last expression of the cell: displays (n_rows, n_columns).
df.shape

In [None]:
# Preview the first rows to eyeball column names and value ranges.
df.head()

In [None]:
# Print the index type, per-column dtypes and non-null counts, and memory usage.
df.info()

In [None]:
# Count missing values per column (all zeros means the file has no gaps).
df.isna().sum()

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
df.describe()

In [None]:
# Does this file carry an 'anomaly' column? Its presence is what this
# notebook treats as "labelled".
label_column = 'anomaly'
has_labels = label_column in df.columns
print(f"Has labels: {has_labels}")

In [None]:
# Plot every sensor channel on its own row, all rows sharing the time axis.
# The 'anomaly' and 'changepoint' columns are skipped (treated as non-sensor
# columns), so only the remaining columns are drawn.
sensors = [col for col in df.columns if col not in ('anomaly', 'changepoint')]

# squeeze=False makes plt.subplots always return a 2-D array of Axes, so the
# single-sensor case needs no special handling; column 0 of that array is a
# 1-D sequence holding exactly one Axes per sensor.
fig, axes = plt.subplots(
    len(sensors),
    1,
    figsize=(14, len(sensors) * 2),  # 2 inches of height per sensor row
    sharex=True,
    squeeze=False,
)
axes = axes[:, 0]

for ax, col in zip(axes, sensors):
    ax.plot(df.index, df[col], linewidth=0.7, alpha=0.8)
    ax.set_ylabel(col)
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

In [None]:
# Pairwise correlation between the sensor columns, drawn as an annotated
# heatmap with the diverging colour map centred on zero.
corr = df[sensors].corr()

corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,
    ax=corr_ax,
)
plt.tight_layout()
plt.show()

## Quick notes

- 9405 rows, ~2.7 hrs of data
- No nulls
- 8 sensors
- Baseline file (no anomaly labels)
- Next: check files with actual faults