# Wake 4-Probe EDA Notebook

This notebook loads one or more `wake_4probes.csv` files, concatenates them, and performs quick EDA and plots.

### What you get
- Robust loader that can scan a directory for multiple runs.
- Auto-parsing of the CSV header and numeric columns.
- Summary tables and missing-value checks.
- Time conversion to minutes and hours based on `time_s_phys`.
- Line plots for probe pressures and optional emission diagnostics if present.
- Rolling means, histograms, simple correlations.

Set `DATA_DIR` and `FILE_GLOB` in the configuration cell below to point to your data.

In [1]:
# Imports
import os
import glob
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Widen pandas' console/notebook rendering so wide probe tables are not truncated
pd.options.display.max_columns = 200
pd.options.display.width = 120

ImportError: Error importing numpy: you should not try to import numpy from
        its source directory; please exit the numpy source tree, and relaunch
        your python interpreter from there.

In [None]:
# Configuration: update DATA_DIR to your run directory
DATA_DIR = '../data/raw'  # change if needed
FILE_GLOB = 'wake_4probes*.csv'  # supports multiple versions

# Join the pattern onto DATA_DIR so the search happens in the configured
# directory; globbing the bare pattern only looks in the current working
# directory and silently ignores DATA_DIR.
files = sorted(glob.glob(os.path.join(DATA_DIR, FILE_GLOB)))
if not files:
    print('No files found. Check DATA_DIR and FILE_GLOB.')
files

In [None]:
# Read every discovered CSV, tag its rows with the originating file name,
# and stack the results into a single frame (columns are unioned, not sorted).
dfs = []
for csv_path in files:
    try:
        frame = pd.read_csv(csv_path)
        frame['__source_file'] = os.path.basename(csv_path)
    except Exception as err:
        # Best-effort loading: report the bad file and carry on with the rest
        print(f'Failed to load {csv_path}: {err}')
    else:
        dfs.append(frame)

# Fall back to an empty frame so the later cells can guard on `data.empty`
data = pd.concat(dfs, ignore_index=True, sort=False) if dfs else pd.DataFrame()

data.head()

In [None]:
# Basic cleaning and type coercion
if not data.empty:
    # Convert a column to numeric only when every value parses; otherwise
    # leave it untouched. This mirrors the old `errors='ignore'` behaviour,
    # which is deprecated in recent pandas releases in favour of catching
    # the exception explicitly.
    for c in data.columns:
        if c == '__source_file':
            continue
        try:
            data[c] = pd.to_numeric(data[c])
        except (ValueError, TypeError):
            # Non-numeric column (e.g. free text): keep the original values
            pass

    # Derive time in minutes and hours if time_s_phys exists
    if 'time_s_phys' in data.columns:
        data['time_min'] = data['time_s_phys'] / 60.0
        data['time_hr'] = data['time_s_phys'] / 3600.0

data.info()

In [None]:
# Summary stats and NA audit
if not data.empty:
    # The provenance column is bookkeeping, so keep it out of both tables
    display_cols = [name for name in data.columns if name != '__source_file']
    measured = data[display_cols]

    display(measured.describe(include='all'))

    # Count missing entries per column, worst offenders first
    na_counts = measured.isna().sum().sort_values(ascending=False)
    print('Missing values per column:')
    display(na_counts.to_frame('na_count'))

## Line plots for pressures
Plots use `time_min` if available, else row index. One plot per series.

In [None]:
def plot_series(x, y, title, xlabel, ylabel):
    """Draw a single line plot of `y` against `x` on its own 9x4 figure and show it.

    `title`, `xlabel` and `ylabel` are used verbatim as the figure title and
    axis labels. A grid is always drawn.
    """
    _, ax = plt.subplots(figsize=(9, 4))
    ax.plot(x, y)
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    ax.grid(True)
    plt.show()

if not data.empty:
    # Prefer the derived time axis; fall back to the row index without it
    if 'time_min' in data.columns:
        x, xlab = data['time_min'], 'Time (min)'
    else:
        x, xlab = data.index, 'Index'

    pressure_cols = ('p_front_Torr', 'p_wake_Torr', 'p_free_Torr', 'p_gap_Torr')
    for col in pressure_cols:
        if col not in data.columns:
            continue  # this probe is absent from the loaded files
        plot_series(x, data[col], f'{col} vs time', xlab, 'Torr')

## Optional emission diagnostics
These are plotted only if the columns exist in the CSV.

In [None]:
# Diagnostics that only some CSV versions carry; each is plotted if present
optional_cols = [
    'cup_scale',
    'nrho_cup_eff',
    'cup_emit_real_per_s',
    'mbe_emit_real_per_s',
]
if not data.empty:
    use_time = 'time_min' in data.columns
    x = data['time_min'] if use_time else data.index
    xlab = 'Time (min)' if use_time else 'Index'
    present = [name for name in optional_cols if name in data.columns]
    for col in present:
        # Columns named *_per_s get a '#/s' axis label; everything else is generic
        if col.endswith('_per_s'):
            ylab = '#/s'
        else:
            ylab = 'value'
        plot_series(x, data[col], f'{col} vs time', xlab, ylab)

## Rolling means (smoothing)
Applies a centered rolling mean to reduce noise.

In [None]:
if not data.empty:
    window = 5  # adjust as desired
    if 'time_min' in data.columns:
        x, xlab = data['time_min'], 'Time (min)'
    else:
        x, xlab = data.index, 'Index'

    for col in ['p_front_Torr', 'p_wake_Torr', 'p_free_Torr', 'p_gap_Torr']:
        if col not in data.columns:
            continue
        # Centered window; min_periods=1 keeps the edges defined instead of NaN
        rolling = data[col].rolling(window=window, min_periods=1, center=True)
        smoothed = rolling.mean()
        plot_series(x, smoothed, f'{col} rolling mean (w={window})', xlab, 'Torr')

## Histograms and simple distribution checks
One histogram per column of interest.

In [None]:
def hist_col(series, title, bins=40):
    """Show a histogram of the non-missing values of `series` on a 6x4 figure.

    `title` is used as the figure title and `bins` is passed straight to the
    histogram call. Axes are labelled 'Value' / 'Count' and a grid is drawn.
    """
    values = series.dropna()
    plt.figure(figsize=(6, 4))
    ax = plt.gca()
    ax.hist(values, bins=bins)
    ax.set_title(title)
    ax.set_xlabel('Value')
    ax.set_ylabel('Count')
    ax.grid(True)
    plt.show()

if not data.empty:
    pressure_cols = ('p_front_Torr', 'p_wake_Torr', 'p_free_Torr', 'p_gap_Torr')
    # Only histogram the probes that were actually recorded
    available = [name for name in pressure_cols if name in data.columns]
    for col in available:
        hist_col(data[col], f'Histogram: {col}')

## Correlations
Computes Pearson correlation between numeric columns and shows a table.

In [None]:
if not data.empty:
    # Restrict to numeric columns up front; corr() then needs no extra
    # filtering (the former numeric_only=True was redundant here).
    num_df = data.select_dtypes(include=[np.number])
    corr = num_df.corr()
    # A bare expression inside an `if` block is not echoed by the notebook,
    # so render the table explicitly.
    display(corr)

## Simple orbit segmentation helper (optional)
If the data has a `tick` column that advances by 1 per minute starting at 0, and each orbit spans 94 ticks, this cell adds `orbit_idx` and `minute_in_orbit` columns derived from `tick`.

In [None]:
if not data.empty and 'tick' in data.columns:
    # Assumes tick increments by 1 per minute, starting at 0
    ORBIT_LEN = 94
    tick = data['tick']
    # Rows from files without a `tick` column come through concat as NaN, and
    # a plain int cast raises on those; use the nullable integer dtype only
    # when needed so the common all-present case keeps its int dtype.
    int_dtype = 'Int64' if tick.isna().any() else int
    data['orbit_idx'] = (tick // ORBIT_LEN).astype(int_dtype)
    data['minute_in_orbit'] = (tick % ORBIT_LEN).astype(int_dtype)
    # A bare expression inside an `if` block is not echoed by the notebook,
    # so render the preview explicitly.
    display(data[['tick', 'orbit_idx', 'minute_in_orbit']].head())

## Save a cleaned copy (optional)
Uncomment to write a cleaned combined CSV to disk.

In [None]:
# out_path = os.path.join(DATA_DIR, 'wake_4probes_combined_clean.csv')
# if not data.empty:
#     data.to_csv(out_path, index=False)
#     out_path