In [2]:
import pandas as pd

# Read the training set from its parquet file into a DataFrame.
data = pd.read_parquet('data/train.parquet')

# Print a positional row slice (515998 up to, but not including, 516101).
# In the recorded output this range covers the last rows of seq_ix 515 and
# the first rows of seq_ix 516.
boundary_rows = data.iloc[515998:516101]
print(boundary_rows)

        seq_ix  step_in_seq  need_prediction         0         1         2  \
515998     515          998                1  0.201930  1.380481  1.474917   
515999     515          999                0  1.627059 -0.360091  1.532782   
516000     516            0                0  1.400312 -0.333132  1.289533   
516001     516            1                0  2.121034  2.286821 -0.051208   
516002     516            2                0 -0.549268 -0.381298 -0.397618   
...        ...          ...              ...       ...       ...       ...   
516096     516           96                0  2.216919  2.112215 -0.107503   
516097     516           97                0 -0.360455 -0.613832 -0.260912   
516098     516           98                0 -2.060364 -2.126772 -0.041227   
516099     516           99                0  1.014914 -0.154323  0.896714   
516100     516          100                1  1.305554 -0.043348  1.029617   

               3         4         5         6  ...        22  

In [4]:
# Print one markdown table per column-wise statistic of `data`
# (mean, min, max, std), each preceded by its heading line.
print("\n--- Individual Functions ---")

# (heading line, label of the value column, name of the DataFrame reducer)
stat_tables = (
    ("\nColumn Means:", 'Mean', 'mean'),
    ("\nColumn Mins:", 'Min', 'min'),
    ("\nColumn Maxs:", 'Max', 'max'),
    ("\nColumn Stds:", 'Std', 'std'),
)

for heading, value_label, reducer_name in stat_tables:
    print(heading)
    # Reduce every column to a single number, e.g. data.mean().
    column_stats = getattr(data, reducer_name)()
    stats_table = column_stats.to_frame(name=value_label)
    # Render with two decimal places, as a markdown table.
    print(stats_table.to_markdown(floatfmt=".2f"))


--- Individual Functions ---

Column Means:
|                 |   Mean |
|:----------------|-------:|
| seq_ix          | 258.00 |
| step_in_seq     | 499.50 |
| need_prediction |   0.90 |
| 0               |   0.00 |
| 1               |   0.02 |
| 2               |  -0.01 |
| 3               |  -0.01 |
| 4               |  -0.00 |
| 5               |   0.01 |
| 6               |  -0.02 |
| 7               |  -0.02 |
| 8               |  -0.01 |
| 9               |   0.01 |
| 10              |   0.00 |
| 11              |   0.01 |
| 12              |  -0.01 |
| 13              |   0.00 |
| 14              |   0.03 |
| 15              |  -0.00 |
| 16              |  -0.02 |
| 17              |  -0.02 |
| 18              |   0.02 |
| 19              |  -0.02 |
| 20              |  -0.00 |
| 21              |  -0.00 |
| 22              |   0.02 |
| 23              |  -0.02 |
| 24              |  -0.01 |
| 25              |   0.00 |
| 26              |  -0.02 |
| 27              |  -0.02 

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations between every pair of columns in `data`
# (DataFrame.corr with its default settings).
corr_matrix = data.corr()

# Draw the matrix as a heatmap on an explicitly created 14x12-inch figure.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(14, 12))
heatmap_options = dict(
    cmap='coolwarm',
    center=0,                  # centre the colormap on zero correlation
    square=True,               # square cells
    linewidths=0.5,            # thin separator lines between cells
    cbar_kws={"shrink": 0.8},  # slightly smaller colorbar
)
sns.heatmap(corr_matrix, ax=heatmap_ax, **heatmap_options)
heatmap_ax.set_title('Feature Correlation Matrix')
heatmap_fig.tight_layout()
plt.show()

In [None]:
# Check for missing values: per-column count and percentage of nulls.
# Only columns that actually contain missing values are printed.
missing_counts = data.isnull().sum()  # computed once, reused for both columns
missing_summary = pd.DataFrame({
    'Column': data.columns,
    'Missing_Count': missing_counts,
    'Missing_Percentage': 100 * missing_counts / len(data)
})
print(missing_summary[missing_summary['Missing_Count'] > 0])

# Identify row positions whose preceding window contains missing data.
# For every position i in [WINDOW_SIZE, len(data)) the window is the
# WINDOW_SIZE rows immediately before i (positions i-WINDOW_SIZE .. i-1);
# i is recorded when any cell in that window is null.
# NOTE: windows are taken over the flat frame by position, so a window may
# span rows belonging to two different seq_ix values.
WINDOW_SIZE = 100

# One flag per row: does the row contain any null? Re-indexed 0..n-1 so the
# results are positional, like the iloc-based definition above.
row_has_missing = data.isnull().any(axis=1).reset_index(drop=True)

# Rolling max over WINDOW_SIZE rows ending at position j covers rows
# j-WINDOW_SIZE+1 .. j; shifting by one makes position i describe rows
# i-WINDOW_SIZE .. i-1. Positions without a full window become NaN, which
# compares False below, so only i >= WINDOW_SIZE can be selected.
missing_in_window = (
    row_has_missing.astype(int)
    .rolling(WINDOW_SIZE)
    .max()
    .shift(1)
)
window_mask = (missing_in_window > 0).to_numpy()
windows_with_missing = missing_in_window.index[window_mask].tolist()

print(f"Windows with missing values: {len(windows_with_missing)}")

In [None]:
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.colors import Normalize
from matplotlib.cm import ScalarMappable

# Plot every feature column against step_in_seq, one subplot per feature and
# one line per sequence, then save the figure to 'feature_sequences.png'.

# Get unique sequences (in order of first appearance)
unique_seqs = data['seq_ix'].unique()
n_seqs = len(unique_seqs)

# Use a colormap that handles many sequences: one colour per sequence,
# spread evenly over the 'hsv' colormap.
# plt.get_cmap is used because plt.cm.get_cmap was deprecated in
# Matplotlib 3.7 and removed in 3.9.
cmap = plt.get_cmap('hsv')
norm = Normalize(vmin=0, vmax=n_seqs-1)
colors = [cmap(norm(i)) for i in range(n_seqs)]

# Create 32 subplots (8 rows x 4 columns), one per feature column '0'..'31'
N_FEATURES = 32
fig, axes = plt.subplots(8, 4, figsize=(20, 24))
axes = axes.flatten()

# Draw the lines. Each sequence is extracted and sorted once and then drawn
# on every feature's axes, instead of re-filtering the whole frame for each
# (feature, sequence) pair. sort=False keeps the groups in first-appearance
# order, so lines are added to each axes in the same order as unique_seqs.
for seq_idx, (seq_id, seq_data) in enumerate(data.groupby('seq_ix', sort=False)):
    seq_data = seq_data.sort_values('step_in_seq')
    steps = seq_data['step_in_seq']
    seq_color = colors[seq_idx]
    for feature_idx in range(N_FEATURES):
        # Label with the actual sequence id so the legend identifies the data.
        axes[feature_idx].plot(steps, seq_data[str(feature_idx)],
                               color=seq_color, alpha=0.6, linewidth=1.2,
                               label=f'Seq {seq_id}')

# Decorate each feature's axes
for feature_idx in range(N_FEATURES):
    ax = axes[feature_idx]
    ax.set_title(f'Feature {feature_idx}', fontsize=12, fontweight='bold')
    ax.set_xlabel('Step in Sequence')
    ax.set_ylabel('Value')
    ax.grid(True, alpha=0.3)

# Add a single figure-level legend built from the first subplot's lines;
# only the first 10 sequences are listed to keep it readable.
handles, labels = axes[0].get_legend_handles_labels()
if len(handles) > 0:
    n_legend_entries = min(10, len(handles))
    fig.legend(handles[:n_legend_entries], labels[:n_legend_entries],
               loc='upper center', ncol=5, fontsize=8, bbox_to_anchor=(0.5, 1.00))

plt.tight_layout()
plt.savefig('feature_sequences.png', dpi=150, bbox_inches='tight')
plt.show()

: 