In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
from pptoolbox.platform import preprocess, WAVELENGTHS_3NM_V1

In [2]:
# Where the raw export lives and which snapshot of it to load.
raw_folder_path = Path("../data/raw")
version = "v2"
pull_date = "250922"  # date stamp embedded in the raw CSV file names

# Both raw tables sit side by side in the versioned folder.
raw_version_dir = raw_folder_path / version
label = pd.read_csv(raw_version_dir / f"label_{pull_date}.csv")
spectra = pd.read_csv(raw_version_dir / f"spectra_{pull_date}.csv")

# Quick sanity check on what was loaded.
print(f"Label shape: {label.shape}")
print(f"Spectra shape: {spectra.shape}")

Label shape: (167, 7)
Spectra shape: (346, 13)


In [3]:
# Reshape the long-format label table (one row per lot/property pair) into a
# wide table: one row per lot_id, one column per property_name.
# NOTE: this rebinds `label`, so the cell cannot be re-run on its own -- after
# the pivot the `property_name` / `property_value` columns no longer exist.
# Re-run the loading cell first.
label = (
    label
    .pivot(index=['lot_id'], columns='property_name', values='property_value')
    .reset_index()
)
label

property_name,lot_id,Condition
0,89595,Controls
1,89598,Controls
2,90053,Controls
3,90064,Controls
4,90140,BD
...,...,...
162,108795,Controls
163,108899,Controls
164,108903,MDD
165,108925,Controls


In [4]:
# Display the raw spectra table for a visual check (pandas truncates the rendered rows).
spectra

Unnamed: 0,lot_id,specimen_id,lot_name,date_scanned,analyser_id,company_id,product_id,product_name,raw_data,dark_ref_data,white_ref_data,dark_ref_scan_time,white_ref_scan_time
0,89593,528114,Pilot27Jan_Ethanol,1737944898,78,1089,6636,Blood Spot in Ethanol,"[{""wavelength"": 335.45458984375, ""reflectance""...","[27.94798611111111, 31.517777777777777, 23.247...","[27.211597222222224, 30.494305555555556, 22.50...",1737944002,1737944022
1,89593,528115,Pilot27Jan_Ethanol,1737945087,78,1089,6636,Blood Spot in Ethanol,"[{""wavelength"": 335.45458984375, ""reflectance""...","[27.94798611111111, 31.517777777777777, 23.247...","[27.211597222222224, 30.494305555555556, 22.50...",1737944002,1737944022
2,89595,528118,HC001_Ethanol,1737949820,78,1089,6636,Blood Spot in Ethanol,"[{""wavelength"": 335.45458984375, ""reflectance""...","[21.83652777777778, 24.80847222222222, 16.6704...","[22.867291666666667, 26.411527777777778, 17.72...",1737949093,1737949116
3,89595,528119,HC001_Ethanol,1737949921,78,1089,6636,Blood Spot in Ethanol,"[{""wavelength"": 335.45458984375, ""reflectance""...","[21.83652777777778, 24.80847222222222, 16.6704...","[22.867291666666667, 26.411527777777778, 17.72...",1737949093,1737949116
4,89598,528123,HC002_Ethanol,1737954323,78,1089,6636,Blood Spot in Ethanol,"[{""wavelength"": 335.45458984375, ""reflectance""...","[23.084444444444443, 26.259652777777777, 18.00...","[23.239652777777778, 26.354930555555555, 18.09...",1737953796,1737953826
...,...,...,...,...,...,...,...,...,...,...,...,...,...
341,108903,645988,PT102_Ethanol,1758101413,78,1089,6636,Blood Spot in Ethanol,"[{""wavelength"": 335.45458984375, ""reflectance""...","[23.5841935483871, 27.406935483870967, 19.0763...","[23.813870967741934, 27.128387096774194, 19.29...",1758100571,1758100623
342,108925,646397,HC065_Ethanol,1758173205,78,1089,6636,Blood Spot in Ethanol,"[{""wavelength"": 335.45458984375, ""reflectance""...","[26.52741935483871, 30.780645161290323, 22.613...","[26.771209677419353, 30.463064516129034, 22.55...",1758173032,1758173053
343,108925,646398,HC065_Ethanol,1758173224,78,1089,6636,Blood Spot in Ethanol,"[{""wavelength"": 335.45458984375, ""reflectance""...","[26.52741935483871, 30.780645161290323, 22.613...","[26.771209677419353, 30.463064516129034, 22.55...",1758173032,1758173053
344,108927,646402,HC066_Ethanol,1758173407,78,1089,6636,Blood Spot in Ethanol,"[{""wavelength"": 335.45458984375, ""reflectance""...","[26.52741935483871, 30.780645161290323, 22.613...","[26.771209677419353, 30.463064516129034, 22.55...",1758173032,1758173053


In [5]:
# Unique lot IDs on each side of the label <-> spectra pairing.
label_lot_id = set(label['lot_id'])
spectra_lot_id = set(spectra['lot_id'])

# Lots that have spectra but no label row.
missing_label_lot_ids = spectra_lot_id.difference(label_lot_id)
# Lots that have a label but no spectra row.
missing_spectra_lot_ids = label_lot_id.difference(spectra_lot_id)

print(f"Missing lot IDs in label: {missing_label_lot_ids}")
print(f"Missing lot IDs in spectra: {missing_spectra_lot_ids}")

Missing lot IDs in label: {89593, 108901}
Missing lot IDs in spectra: set()


# Preprocessing

In [6]:
# Turn the raw spectra and the wide label table into a feature matrix X and a
# label frame y using the pptoolbox platform helper.
# NOTE(review): presumably spectra whose lot_id has no label are dropped here
# and both outputs are indexed by lot_id -- confirm against pptoolbox.
X, y = preprocess(
    spectra,
    label,
    WAVELENGTHS_3NM_V1,
)

# X and y are expected to have the same number of rows.
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")

X shape: (342, 191)
y shape: (342, 1)


In [7]:
# Peek at the first rows of the label frame returned by preprocess.
y.head()

Unnamed: 0_level_0,Condition
lot_id,Unnamed: 1_level_1
89595,Controls
89595,Controls
89598,Controls
89598,Controls
90053,Controls


In [9]:
# Every labelled lot must survive preprocessing, and preprocessing must not
# introduce lot IDs that were never labelled.
original_label_set = set(label['lot_id'])
processed_y_set = set(y.index)

# Report the offending lot IDs on failure instead of a bare AssertionError.
assert original_label_set == processed_y_set, (
    "lot_id mismatch between label and preprocessed y: "
    f"dropped={original_label_set - processed_y_set}, "
    f"unexpected={processed_y_set - original_label_set}"
)

In [10]:
# Build the per-spectrum metadata table that accompanies X and y.
# It is written as a single chain so that no column is assigned on a filtered
# slice, which is what triggers pandas' SettingWithCopyWarning.
y_meta = (
    spectra[['lot_id', 'lot_name', 'specimen_id', 'date_scanned', 'analyser_id']]
    .set_index('lot_id')
    # Keep only spectra whose lot_id is present in y; row order follows `spectra`.
    .loc[lambda frame: frame.index.isin(y.index)]
    # date_scanned is parsed as epoch seconds and reduced to a YYYY-MM-DD string.
    .assign(
        date_scanned=lambda frame: pd.to_datetime(frame['date_scanned'], unit='s').dt.strftime('%Y-%m-%d')
    )
)
y_meta

Unnamed: 0_level_0,lot_name,specimen_id,date_scanned,analyser_id
lot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
89595,HC001_Ethanol,528118,2025-01-27,78
89595,HC001_Ethanol,528119,2025-01-27,78
89598,HC002_Ethanol,528123,2025-01-27,78
89598,HC002_Ethanol,528124,2025-01-27,78
90053,HC003_Ethanol,530245,2025-02-04,78
...,...,...,...,...
108903,PT102_Ethanol,645988,2025-09-17,78
108925,HC065_Ethanol,646397,2025-09-18,78
108925,HC065_Ethanol,646398,2025-09-18,78
108927,HC066_Ethanol,646402,2025-09-18,78


In [13]:
# X, y and y_meta are saved as separate files, so their row indices must be
# identical (same lot_ids in the same order) for the rows to line up.
labels_aligned = X.index.equals(y.index)
assert labels_aligned, "Indices of X and y do not match!"

meta_aligned = X.index.equals(y_meta.index)
assert meta_aligned, "Indices of X and y_meta do not match!"

In [14]:
# Persist the processed dataset under ../data/processed/<version>/full,
# creating the folder (and any missing parents) if needed.
output_path = Path("../data/processed") / version / "full"
output_path.mkdir(parents=True, exist_ok=True)

# Each frame keeps its index in the CSV so the three files can be re-aligned by row.
X.to_csv(output_path / "input.csv")
y.to_csv(output_path / "label.csv")
y_meta.to_csv(output_path / "meta.csv")