# 01 - Data Collection



## Objective
Load the raw dataset from Google Drive, verify its structure, and persist a raw snapshot for subsequent notebooks.

---

### Run order
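Execute the notebooks in numeric order: 01 → 02 → 03 → 04 → 05. Each notebook mounts Google Drive itself and reads/writes its intermediate artifacts there, so once the earlier notebooks have produced their outputs, any notebook can be re-run on its own in a fresh session.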

---


In [None]:

# Mount Google Drive (Colab) and load raw CSV
from google.colab import drive
import pandas as pd
import os

# Mount Google Drive only when it is not already available in this session;
# the presence of the MyDrive folder is used as the "already mounted" signal.
if not os.path.exists('/content/drive/MyDrive'):
    drive.mount('/content/drive')
else:
    print('Drive already mounted!')

# Input CSV and the parquet snapshot written for the subsequent notebooks.
CSV_PATH = '/content/drive/MyDrive/dsp-poc/dsp_rul_hrs.csv'
RAW_OUT = '/content/drive/MyDrive/dsp-poc/data/df_raw.parquet'

# Load the raw file as-is. low_memory=False makes pandas read the file in one
# pass instead of in chunks, which avoids mixed-dtype inference per column.
# NOTE(review): the recorded output lists an 'Unnamed: 0' column, which looks
# like a saved index from the CSV export; it is kept so the snapshot stays raw
# -- confirm whether later notebooks drop it.
df = pd.read_csv(CSV_PATH, low_memory=False)
print('Loaded:', df.shape)
print('Columns:', list(df.columns)[:10], '...')

# Persist a raw snapshot for reproducibility.
# The target directory is derived from RAW_OUT (rather than repeated as a
# second literal) so the folder created always matches the file written.
os.makedirs(os.path.dirname(RAW_OUT), exist_ok=True)
df.to_parquet(RAW_OUT)
print('Saved raw snapshot to:', RAW_OUT)


Mounted at /content/drive
Loaded: (166441, 53)
Columns: ['Unnamed: 0', 'timestamp', 'sensor_00', 'sensor_01', 'sensor_02', 'sensor_03', 'sensor_04', 'sensor_05', 'sensor_06', 'sensor_07'] ...
Saved raw snapshot to: /content/drive/MyDrive/dsp-poc/data/df_raw.parquet
