In [9]:
import pandas as pd
import numpy as np
from pathlib import Path
from pptoolbox.platform import preprocess

In [10]:
# Raw pulls are stored as <raw_folder_path>/<version>/<name>_<pull_date>.csv
raw_folder_path = Path("../data/raw")

version = "inspect_only"
pull_date = "260123"  # Updated pull date

# The label and spectra CSVs of one pull share the same versioned sub-folder.
pull_dir = raw_folder_path.joinpath(version)
label = pd.read_csv(pull_dir / f"label_{pull_date}.csv")
spectra = pd.read_csv(pull_dir / f"spectra_{pull_date}.csv")

# Quick sanity check on what was loaded.
print(f"Label shape: {label.shape}")
print(f"Spectra shape: {spectra.shape}")

Label shape: (153, 7)
Spectra shape: (35, 34)


In [11]:
# Reshape the long label table (one row per lot_id / property_name pair) into
# one row per lot_id with one column per property.
#
# The cell rebinds `label`, so it must be safe to run more than once: after the
# pivot the columns axis is named "property_name" (and the "property_name"
# column itself is gone), so that axis name is used as the "already pivoted"
# marker. A freshly loaded CSV has an unnamed columns axis and is pivoted as
# before; malformed input still raises from `pivot`.
if label.columns.name != "property_name":
    label = label.pivot(
        index=["lot_id"], columns="property_name", values="property_value"
    ).reset_index()
label.head()

property_name,lot_id,Quality Control,Source
0,28724,Fail,
1,28725,Fail,
2,28726,Fail,
3,28949,Fail,
4,28950,Fail,


In [12]:
# Preview the first rows of the raw spectra table as loaded from CSV.
spectra.head()

Unnamed: 0,lot_id,active,analyzer_id,calc_data_mini,calc_data_neo,company_id,dark_ref_data_mini,dark_ref_data_neo,dark_ref_scan_time,date_scanned,...,white_ref_data_neo,white_ref_scan_time,raw_data,dark_ref_data,white_ref_data,scope_data,specimen_id,lot_name,product_id,product_name
0,113018,1,118,"[{""wavelength"": 289.1642150878906, ""reflectanc...","[{""wavelength"": 928.2705078125, ""reflectance"":...",1065,"[-3.1217658519744873, -2.4258341789245605, -0....","[26.758460998535156, 10.878236770629883, -3.83...",1764345570,1764345888,...,"[883.9205322265625, 1116.41650390625, 1689.150...",1764345604,,,,,677693,S823 (B1131025),5449,S823 RASPBERRY 392 IAL (STANDARD)
1,113018,1,118,"[{""wavelength"": 289.1642150878906, ""reflectanc...","[{""wavelength"": 928.2705078125, ""reflectance"":...",1065,"[-3.1217658519744873, -2.4258341789245605, -0....","[26.758460998535156, 10.878236770629883, -3.83...",1764345570,1764346114,...,"[883.9205322265625, 1116.41650390625, 1689.150...",1764345604,,,,,677699,S823 (B1131025),5449,S823 RASPBERRY 392 IAL (STANDARD)
2,113018,1,118,"[{""wavelength"": 289.1642150878906, ""reflectanc...","[{""wavelength"": 928.2705078125, ""reflectance"":...",1065,"[-3.1217658519744873, -2.4258341789245605, -0....","[26.758460998535156, 10.878236770629883, -3.83...",1764345570,1764346366,...,"[883.9205322265625, 1116.41650390625, 1689.150...",1764345604,,,,,677703,S823 (B1131025),5449,S823 RASPBERRY 392 IAL (STANDARD)
3,113018,1,118,"[{""wavelength"": 289.1642150878906, ""reflectanc...","[{""wavelength"": 928.2705078125, ""reflectance"":...",1065,"[-3.1217658519744873, -2.4258341789245605, -0....","[26.758460998535156, 10.878236770629883, -3.83...",1764345570,1764346596,...,"[883.9205322265625, 1116.41650390625, 1689.150...",1764345604,,,,,677707,S823 (B1131025),5449,S823 RASPBERRY 392 IAL (STANDARD)
4,113018,1,118,"[{""wavelength"": 289.1642150878906, ""reflectanc...","[{""wavelength"": 928.2705078125, ""reflectance"":...",1065,"[-3.1217658519744873, -2.4258341789245605, -0....","[26.758460998535156, 10.878236770629883, -3.83...",1764345570,1764346857,...,"[883.9205322265625, 1116.41650390625, 1689.150...",1764345604,,,,,677712,S823 (B1131025),5449,S823 RASPBERRY 392 IAL (STANDARD)


In [13]:
# Distinct lot IDs present in each table.
label_lot_id = set(label['lot_id'])
spectra_lot_id = set(spectra['lot_id'])

# Lots that appear in the spectra but have no label row.
missing_label_lot_ids = spectra_lot_id.difference(label_lot_id)
# Lots that are labelled but have no spectra rows in this pull.
missing_spectra_lot_ids = label_lot_id.difference(spectra_lot_id)

print(f"Missing lot IDs in label: {missing_label_lot_ids}")
print(f"Missing lot IDs in spectra: {missing_spectra_lot_ids}")

Missing lot IDs in label: set()
Missing lot IDs in spectra: {37401, 28724, 28725, 28726, 36409, 36410, 36412, 29266, 29267, 29268, 35925, 35926, 51808, 51809, 51306, 51308, 35456, 35462, 29323, 50831, 50832, 29332, 29341, 38058, 38059, 38060, 38061, 43705, 43706, 43707, 36028, 36029, 43708, 37061, 37064, 37065, 37066, 44243, 44244, 44245, 44246, 36061, 36062, 36063, 36064, 36065, 48386, 51446, 39159, 39160, 39161, 39162, 48387, 48388, 44289, 44290, 44291, 44292, 29445, 29446, 29447, 29448, 29449, 43782, 29451, 43783, 44293, 44294, 48390, 51986, 51987, 51988, 28949, 28950, 28951, 28952, 29462, 28954, 28955, 29463, 29467, 36144, 36145, 36150, 36151, 36152, 48389, 43842, 43844, 38216, 38218, 29515, 29516, 29517, 29518, 29520, 29521, 38226, 38228, 38229, 38232, 52059, 52060, 51037, 51038, 51039, 52061, 52062, 43887, 43888, 37235, 37237, 38774, 37243, 29573, 29574, 29575, 37269, 37270, 29080, 29081, 29082, 38814, 38818, 29610, 29611, 29613, 29616, 29617, 29618, 48613, 48614, 48615, 48616, 4

# Preprocessing

In [18]:
# Run the platform preprocessing on the raw spectra and the pivoted labels.
# It returns three values; the third (`add`) is indexed by key below.
# NOTE(review): export_meta=True presumably is what makes the 'metadata'
# entry available — confirm against the pptoolbox documentation.
X, y, add = preprocess(X_input=spectra, y_input=label, export_meta=True)

meta = add['metadata']

# Report the shape of each returned table.
for _name, _frame in (("X", X), ("y", y), ("meta", meta)):
    print(f"{_name} shape: {_frame.shape}")

X shape: (35, 391)
y shape: (35, 2)
meta shape: (35, 6)


In [19]:
# Preview the labels returned by preprocess.
y.head()

Unnamed: 0_level_0,Quality Control,Source
lot_id,Unnamed: 1_level_1,Unnamed: 2_level_1
113018,Pass,Production
113018,Pass,Production
113018,Pass,Production
113018,Pass,Production
113018,Pass,Production


In [20]:
# X, y and meta are saved and used side by side, so their indices must be
# identical (same values, same order). Fail fast here if they are not.
assert X.index.equals(y.index), "Indices of X and y do not match!"
assert X.index.equals(meta.index), "Indices of X and meta do not match!"

In [24]:
# date_scanned is parsed as Unix epoch seconds; store the calendar date as a
# YYYY-MM-DD string in a new column.
scan_timestamps = pd.to_datetime(meta['date_scanned'], unit='s')
meta['date_reformatted'] = scan_timestamps.dt.strftime('%Y-%m-%d')

In [25]:
# Preview the metadata, including the date_reformatted column.
meta.head()

Unnamed: 0_level_0,active,analyzer_id,company_id,specimen_id,date_scanned,lot_name,date_reformatted
lot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
113018,1,118,1065,677693,1764345888,S823 (B1131025),2025-11-28
113018,1,118,1065,677699,1764346114,S823 (B1131025),2025-11-28
113018,1,118,1065,677703,1764346366,S823 (B1131025),2025-11-28
113018,1,118,1065,677707,1764346596,S823 (B1131025),2025-11-28
113018,1,118,1065,677712,1764346857,S823 (B1131025),2025-11-28


In [22]:
# Write the processed tables to ../data/processed/<version>/full/.
# Run this after the cell that adds `date_reformatted` so meta.csv includes it.
output_path = Path("../data/processed") / version / "full"
output_path.mkdir(parents=True, exist_ok=True)

# Each table goes to its own CSV; the index is written as the first column.
for _table, _filename in ((X, "input.csv"), (y, "label.csv"), (meta, "meta.csv")):
    _table.to_csv(output_path / _filename)