# Overview
Clean up data and preprocess

In [13]:
print('Importing libraries...')

import numpy as np
import pandas as pd
from pathlib import Path

from pptoolbox.platform import preprocess_v2

Importing libraries...


# Clean Labels

In [14]:
pulldate = '250911'
data_folder = Path('../data')
raw_folder = data_folder / 'raw' / f'pulled_{pulldate}'

In [15]:
label = pd.read_csv(raw_folder / f'label_{pulldate}.csv')
label

Unnamed: 0,lot_id,lot_name,property_name,property_value,company_id,product_type_id,product_name
0,108029,VR1,Milling yield,83.78,1374,6907,Brown rice_2025 Production aligned sample
1,108029,VR1,White rice yield,85.71,1374,6907,Brown rice_2025 Production aligned sample
2,108029,VR1,Total yield,83.33,1374,6907,Brown rice_2025 Production aligned sample
3,108029,VR1,Broken rice,0.84,1374,6907,Brown rice_2025 Production aligned sample
4,108029,VR1,Broken rice during soaking,2.80,1374,6907,Brown rice_2025 Production aligned sample
...,...,...,...,...,...,...,...
163,108055,VR26,Total yield,82.05,1374,6907,Brown rice_2025 Production aligned sample
164,108055,VR26,Broken rice,0.35,1374,6907,Brown rice_2025 Production aligned sample
165,108055,VR26,Broken rice during soaking,2.65,1374,6907,Brown rice_2025 Production aligned sample
166,108055,VR26,Damaged rice,0.59,1374,6907,Brown rice_2025 Production aligned sample


In [16]:
label = pd.pivot(label, index=['lot_id','lot_name'], columns='property_name', values='property_value')
label = label.reset_index()
label

property_name,lot_id,lot_name,Broken rice,Broken rice during soaking,Damaged rice,Milling yield,Total yield,White rice yield,Whiteness
0,108029,VR1,0.84,2.8,1.08,83.78,83.33,85.71,42.5
1,108030,VR2,0.98,3.0,0.88,83.54,83.01,85.4,42.5
2,108032,VR3,0.54,2.75,1.05,85.56,85.45,87.15,43.0
3,108034,VR5,1.02,2.7,0.22,86.85,86.45,88.11,41.0
4,108035,VR6,0.68,2.65,0.17,85.36,85.07,86.21,41.0
5,108036,VR7,0.65,2.6,0.15,88.04,87.67,88.04,41.0
6,108037,VR8,0.32,2.5,0.2,89.14,89.07,89.66,40.0
7,108038,VR9,0.42,2.85,0.18,89.0,88.89,89.59,40.0
8,108039,VR10,0.69,2.7,0.35,84.26,83.95,85.31,42.0
9,108040,VR11,0.41,2.5,0.13,88.32,88.1,88.86,40.5


In [17]:
label.isna().sum()

property_name
lot_id                        0
lot_name                      0
Broken rice                   0
Broken rice during soaking    0
Damaged rice                  0
Milling yield                 0
Total yield                   0
White rice yield              0
Whiteness                     0
dtype: int64

# load spectra

In [18]:
spectra = pd.read_csv(raw_folder / f'spectra_{pulldate}.csv')
spectra.head()

Unnamed: 0,lot_id,active,analyzer_id,raw_data_mini,raw_data_neo,company_id,dark_ref_data_mini,dark_ref_data_neo,dark_ref_scan_time,date_scanned,...,white_ref_scan_time,raw_data,dark_ref_data,white_ref_data,scope_data,specimen_id,lot_name,analyser_id,product_id,product_name
0,108029,1,105,"[{""wavelength"": 290.0734558105469, ""reflectanc...","[{""wavelength"": 926.3883666992188, ""reflectanc...",1374,"[-3.6802313327789307, -1.0272743701934814, 0.7...","[39.3853759765625, 41.57359313964844, 37.86637...",1756859457,1756862844,...,1756859477,,,,,639330,VR1,105,6907,Brown rice_2025 Production aligned sample
1,108029,1,105,"[{""wavelength"": 290.0734558105469, ""reflectanc...","[{""wavelength"": 926.3883666992188, ""reflectanc...",1374,"[-3.6802313327789307, -1.0272743701934814, 0.7...","[39.3853759765625, 41.57359313964844, 37.86637...",1756859457,1756862899,...,1756859477,,,,,639331,VR1,105,6907,Brown rice_2025 Production aligned sample
2,108029,1,105,"[{""wavelength"": 290.0734558105469, ""reflectanc...","[{""wavelength"": 926.3883666992188, ""reflectanc...",1374,"[-3.6802313327789307, -1.0272743701934814, 0.7...","[39.3853759765625, 41.57359313964844, 37.86637...",1756859457,1756862950,...,1756859477,,,,,639332,VR1,105,6907,Brown rice_2025 Production aligned sample
3,108029,1,105,"[{""wavelength"": 290.0734558105469, ""reflectanc...","[{""wavelength"": 926.3883666992188, ""reflectanc...",1374,"[-3.6802313327789307, -1.0272743701934814, 0.7...","[39.3853759765625, 41.57359313964844, 37.86637...",1756859457,1756862994,...,1756859477,,,,,639333,VR1,105,6907,Brown rice_2025 Production aligned sample
4,108029,1,105,"[{""wavelength"": 290.0734558105469, ""reflectanc...","[{""wavelength"": 926.3883666992188, ""reflectanc...",1374,"[-3.6802313327789307, -1.0272743701934814, 0.7...","[39.3853759765625, 41.57359313964844, 37.86637...",1756859457,1756863036,...,1756859477,,,,,639334,VR1,105,6907,Brown rice_2025 Production aligned sample


# preprocess spectra

In [19]:
X_visnir, X_exnir,  y, _ = preprocess_v2(spectra, label.drop(columns=['lot_name']))

## check size before and after preprocess

In [None]:
X_visnir_index_set = set(X_visnir.index)
X_exnir_index_set = set(X_exnir.index)
y_index_set = set(y.index)

assert X_visnir_index_set == X_exnir_index_set == y_index_set, "Index mismatch between features and labels"

# get metadata

In [27]:
y_meta = spectra[['lot_id','lot_name','specimen_id','date_scanned','analyzer_id']].copy().set_index('lot_id')

y_meta = y_meta[y_meta.index.isin(y_index_set)]

# change date_scanned to yyyy-mm-dd format
y_meta['date_scanned'] = pd.to_datetime(y_meta['date_scanned'], unit='s').dt.strftime('%Y-%m-%d')

y_meta

Unnamed: 0_level_0,lot_name,specimen_id,date_scanned,analyzer_id
lot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
108029,VR1,639330,2025-09-03,105
108029,VR1,639331,2025-09-03,105
108029,VR1,639332,2025-09-03,105
108029,VR1,639333,2025-09-03,105
108029,VR1,639334,2025-09-03,105
...,...,...,...,...
108055,VR26,639746,2025-09-03,105
108055,VR26,639747,2025-09-03,105
108055,VR26,639748,2025-09-03,105
108055,VR26,639749,2025-09-03,105


In [28]:
X_visnir.shape, X_exnir.shape,  y.shape, y_meta.shape

((260, 191), (260, 76), (260, 7), (260, 4))

In [29]:
assert X_exnir.shape[0] == X_visnir.shape[0] == y.shape[0] == y_meta.shape[0], "Shape mismatch between features and labels"
assert X_exnir.index.equals(X_visnir.index) and X_exnir.index.equals(y.index) and X_exnir.index.equals(y_meta.index), "Index mismatch between features and labels"

# export data

In [None]:
processed_folder = data_folder / 'processed'
version = f'v1.1_{pulldate}' # check this !!
output_folder = processed_folder / version / 'eval'  # check this !!

output_folder.mkdir(parents=True, exist_ok=True)

In [31]:
X_visnir.to_csv(output_folder / f'input_vis.csv', index=True)
X_exnir.to_csv(output_folder / f'input_ex.csv', index=True)
y.to_csv(output_folder / f'label.csv', index=True)
y_meta.to_csv(output_folder / f'meta.csv', index=True)