In [14]:
import pandas as pd
import numpy as np
from pathlib import Path
from pptoolbox.interpolate.data import interpolate_spectra, WAVELENGTHS_QMINI

In [15]:
raw_folder_path = Path("../data/raw")

version = "v1"
pull_date = "260204"  # Updated pull date

# Both exports sit in the same versioned folder and share the pull-date suffix.
raw_version_dir = raw_folder_path / version
label = pd.read_csv(raw_version_dir / f"label_{pull_date}.csv")
nvd = pd.read_csv(raw_version_dir / f"nvd_{pull_date}.csv")

# Report the size of each table right after loading.
for frame_name, frame in (("Label", label), ("nvd", nvd)):
    print(f"{frame_name} shape: {frame.shape}")

Label shape: (3922, 7)
nvd shape: (24084, 2465)


In [16]:
# Load the cupping sheet (CSV export) from the same versioned raw folder.
excel_sheet_path = raw_folder_path / version / "KDP Brazil Sample Data Collection Sheet_20260204.csv"
excel_labels_raw = pd.read_csv(excel_sheet_path)

# Keep only the named columns: anything whose header starts with 'Unnamed' is dropped.
is_unnamed_col = excel_labels_raw.columns.str.startswith('Unnamed')
excel_labels = excel_labels_raw.loc[:, ~is_unnamed_col]

In [17]:
# List the column names that remain in the cupping sheet.
excel_labels.columns

Index(['Sample No', 'Scanned Date', 'Cupped Date', 'Clean', 'Rioy', 'Phenol',
       'Chemical / Medicine', 'Fermented / Pulpy', 'Moldy / Musty', 'Earthy',
       'Dirty', 'Aged', 'Raw Potato', 'Rubbery', 'Others', 'TOTAL DEFECT CUP',
       'DEFECT CUP?', 'Cupping score given ', 'Cupping Score expected',
       'Expected Description', 'Decision', 'Attibutes', 'Acidity Level',
       'Moisture', 'Density', 'Coffee Profile', 'Comments', 'Crop'],
      dtype='object')

In [18]:
# Per-note columns of the cupping sheet, in the order they appear in the sheet.
cup_note_cols = [
    'Clean',
    'Rioy',
    'Phenol',
    'Chemical / Medicine',
    'Fermented / Pulpy',
    'Moldy / Musty',
    'Earthy',
    'Dirty',
    'Aged',
    'Raw Potato',
    'Rubbery',
    'Others',
]

# Columns used as labels: the sample key, the two summary columns, then the per-note columns.
excel_label_cols = ['Sample No', 'TOTAL DEFECT CUP', 'DEFECT CUP?', *cup_note_cols]

# Descriptive (non-label) columns of the sheet, keyed by the same 'Sample No'.
# The spellings 'Attibutes' and 'Cupping score given ' (trailing space) match
# the sheet's actual headers and must not be "corrected" here.
excel_meta_cols = [
    'Sample No',
    'Cupped Date',
    'Cupping Score expected',
    'Expected Description',
    'Attibutes',
    'Comments',
    'Crop',
    'Cupping score given ',
    'Decision',
    'Acidity Level',
    'Moisture',
    'Density',
    'Coffee Profile',
]

In [19]:
# Reshape the long (lot_id, property_name, property_value) table into one row per lot,
# with one column per property.
# NOTE: this rebinds `label`; the pivoted frame no longer has a 'property_name'
# column, so the CSV must be reloaded before this cell can be run again.
label_wide = label.pivot(index=['lot_id'], columns='property_name', values='property_value')
label = label_wide.reset_index()
label.head()

property_name,lot_id,Acidity Level,Coffee Profile,Cupping Score,Decision,Density,Moisture,Presence of Negative Defects
0,40982,No Acidity,Value Brazil,78,Approved,69.4,10.5,Not Present
1,40983,No Acidity,Value Brazil,78,Approved,70.5,10.5,Not Present
2,40984,No Acidity,Value Brazil,78,Approved,69.4,10.5,Not Present
3,40985,No Acidity,Value Brazil,78,Approved,69.4,10.5,Not Present
4,40986,No Acidity,Value Brazil,78,Approved,70.4,10.4,Not Present


In [20]:
# Number of distinct lot_id values in `label` vs distinct 'Sample No' values in `excel_labels`.
len(set(label['lot_id'])), len(set(excel_labels['Sample No']))

(672, 401)

In [21]:
# Count missing values in each column of `label`.
label.isna().sum()

property_name
lot_id                            0
Acidity Level                   147
Coffee Profile                  150
Cupping Score                   148
Decision                        148
Density                          20
Moisture                         21
Presence of Negative Defects    148
dtype: int64

In [22]:
# Preview the first rows of the nvd table.
nvd.head()

Unnamed: 0,lot_id,specimen_id,lot_name,date_scanned,analyzer_id,company_id,product_id,product_name,has_nvd,support,...,1094.85693359375,1095.400146484375,1095.9432373046875,1096.486328125,1097.0291748046875,1097.5721435546875,1098.114990234375,1098.6578369140625,1099.20068359375,1099.7432861328125
0,98905,582758,20250509-A47-S555,1746796578,96,1104,6160,Brazilian green coffee bean,1,18,...,,,,,,,,,,
1,98905,582758,20250509-A47-S555,1746796578,96,1104,6160,Brazilian green coffee bean,1,17,...,,,,,,,,,,
2,98905,582758,20250509-A47-S555,1746796578,96,1104,6160,Brazilian green coffee bean,1,14,...,,,,,,,,,,
3,98905,582758,20250509-A47-S555,1746796578,96,1104,6160,Brazilian green coffee bean,1,13,...,,,,,,,,,,
4,98905,582758,20250509-A47-S555,1746796578,96,1104,6160,Brazilian green coffee bean,1,11,...,,,,,,,,,,


In [23]:
# Compare the lot ids present in each table, in both directions.
label_lot_id = set(label['lot_id'])
nvd_lot_id = set(nvd['lot_id'])

# Lots that appear in `nvd` but have no row in `label`.
missing_label_lot_ids = nvd_lot_id.difference(label_lot_id)
print(f"Missing lot IDs in label: {missing_label_lot_ids}")

# Lots that appear in `label` but have no row in `nvd`.
missing_nvd_lot_ids = label_lot_id.difference(nvd_lot_id)
print(f"Missing lot IDs in nvd: {missing_nvd_lot_ids}")

Missing lot IDs in label: {104463, 104464, 104465, 110628, 100398, 100400, 100401, 104501, 112702, 112740, 112741, 112742, 112743, 112744, 116890, 116891, 116892, 116893, 110750, 116894, 116895, 116896, 108767, 108768, 108769, 108770, 108771, 108772, 108773, 108774, 108775, 108776, 108777, 108778, 108779, 108780, 108781, 112878, 108782, 108783, 108784, 108785, 108786, 112877, 110904, 110905, 110906, 110907, 110908, 110909, 110910, 108864, 108865, 112976, 108904, 108905, 108913, 108914, 108915, 113020, 113021, 113022, 113023, 113024, 113025, 113026, 113027, 113028, 113029, 108937, 108938, 108939, 108940, 108941, 108942, 110988, 110989, 110990, 110991, 110992, 110993, 108975, 109000, 109001, 109002, 109003, 109004, 109005, 109006, 109007, 109049, 109050, 109051, 109052, 109053, 109054, 109055, 109056, 109057, 109058, 116521, 116522, 111122, 111123, 111125, 102967, 102968, 102969, 111163, 111164, 102988, 102990, 117417, 117418, 117419, 109239, 109240, 109241, 109242, 109243, 101053, 10105

# Interpolation

In [41]:
# Show the first 11 column names of `nvd`.
nvd.columns[:11]

Index(['lot_id', 'specimen_id', 'lot_name', 'date_scanned', 'analyzer_id',
       'company_id', 'product_id', 'product_name', 'has_nvd', 'support',
       '400.49560546875'],
      dtype='object')

In [39]:
# Everything from this positional column onwards is treated as spectral data;
# the columns before it are the identifier / descriptive fields.
first_spectral_col = 10
nvd_spectra = nvd.iloc[:, first_spectral_col:]
nvd_spectra

Unnamed: 0,400.49560546875,401.0894470214844,401.68328857421875,402.27703857421875,402.8708190917969,403.46453857421875,404.0582275390625,404.65191650390625,405.2455139160156,405.839111328125,...,1094.85693359375,1095.400146484375,1095.9432373046875,1096.486328125,1097.0291748046875,1097.5721435546875,1098.114990234375,1098.6578369140625,1099.20068359375,1099.7432861328125
0,0.103150,0.103706,0.100516,0.101426,0.098658,0.098830,0.095642,0.096511,0.094822,0.093603,...,,,,,,,,,,
1,0.134927,0.134966,0.131123,0.131407,0.128779,0.128348,0.125814,0.126225,0.124466,0.123354,...,,,,,,,,,,
2,0.069425,0.067911,0.067575,0.066243,0.065814,0.065150,0.064154,0.063878,0.063228,0.062598,...,,,,,,,,,,
3,0.184387,0.183753,0.180906,0.178843,0.176103,0.174758,0.172851,0.171810,0.170592,0.169851,...,,,,,,,,,,
4,0.150794,0.150954,0.148601,0.146527,0.144628,0.143681,0.141732,0.141476,0.139189,0.138763,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24079,0.114544,0.113221,0.110929,0.109125,0.108182,0.107300,0.106139,0.103559,0.103863,0.102361,...,,,,,,,,,,
24080,0.079715,0.080421,0.079053,0.077189,0.076659,0.075578,0.074944,0.074347,0.075339,0.072068,...,,,,,,,,,,
24081,0.074117,0.073699,0.072930,0.071613,0.071119,0.069519,0.068937,0.068550,0.067832,0.066308,...,,,,,,,,,,
24082,0.172463,0.169021,0.169704,0.166473,0.163202,0.162271,0.162810,0.160770,0.159431,0.155709,...,,,,,,,,,,


In [42]:
# The column labels were read from CSV as strings; convert them to floats
# before handing the frame to the interpolation helper.
wavelength_axis = nvd_spectra.columns.astype(float)
nvd_spectra.columns = wavelength_axis

# Resample every spectrum onto the WAVELENGTHS_QMINI grid.
nvd_interpolated = interpolate_spectra(nvd_spectra, wavelengths=WAVELENGTHS_QMINI)
nvd_interpolated

Unnamed: 0,480.0,483.0,486.0,489.0,492.0,495.0,498.0,501.0,504.0,507.0,...,1023.0,1026.0,1029.0,1032.0,1035.0,1038.0,1041.0,1044.0,1047.0,1050.0
0,0.071874,0.072041,0.072913,0.073353,0.074134,0.075063,0.076003,0.077044,0.077574,0.078608,...,0.301933,0.302893,0.303979,0.305078,0.306490,0.307011,0.308314,0.309541,0.309938,0.311425
1,0.096698,0.097686,0.098064,0.098826,0.099887,0.101086,0.101984,0.103158,0.104300,0.105775,...,0.397139,0.398523,0.399527,0.400554,0.402573,0.402444,0.403449,0.405389,0.405117,0.408249
2,0.040805,0.040892,0.041369,0.041413,0.041727,0.042090,0.042612,0.043132,0.043570,0.044208,...,0.230707,0.232482,0.233557,0.234294,0.235265,0.237026,0.237798,0.238576,0.239619,0.240728
3,0.149490,0.150694,0.151771,0.152930,0.154470,0.156213,0.157428,0.159191,0.161175,0.162899,...,0.472497,0.473340,0.475060,0.476163,0.476601,0.479063,0.480005,0.480428,0.481034,0.482880
4,0.119471,0.120408,0.121271,0.122379,0.123435,0.124498,0.125660,0.127190,0.128219,0.129941,...,0.418030,0.419518,0.420670,0.421760,0.422301,0.425089,0.425187,0.427162,0.427534,0.428308
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24079,0.080921,0.081815,0.082360,0.082785,0.083847,0.084930,0.085905,0.087115,0.087923,0.089331,...,0.318796,0.320779,0.321970,0.322233,0.324435,0.324913,0.325379,0.327252,0.327758,0.328833
24080,0.055949,0.055796,0.056334,0.056897,0.056818,0.057663,0.058042,0.058739,0.058683,0.059466,...,0.245052,0.246114,0.246728,0.247983,0.249534,0.249835,0.251809,0.250827,0.252676,0.254113
24081,0.048525,0.048480,0.048873,0.049079,0.049296,0.050084,0.050322,0.050894,0.051076,0.051992,...,0.224726,0.226268,0.227058,0.227753,0.229248,0.229009,0.230501,0.230828,0.231917,0.233002
24082,0.141631,0.142932,0.143936,0.145275,0.147172,0.148721,0.150059,0.152178,0.153903,0.155895,...,0.420686,0.421156,0.421936,0.423349,0.424424,0.424971,0.426032,0.427367,0.428944,0.429149


In [59]:
# Put the scan identifiers next to the interpolated spectra (column-wise, aligned
# on the row index) and index the result by lot_id.
scan_id_cols = nvd[['lot_id', 'specimen_id', 'support']]
X = pd.concat([scan_id_cols, nvd_interpolated], axis=1).set_index('lot_id')
X

Unnamed: 0_level_0,specimen_id,support,480.0,483.0,486.0,489.0,492.0,495.0,498.0,501.0,...,1023.0,1026.0,1029.0,1032.0,1035.0,1038.0,1041.0,1044.0,1047.0,1050.0
lot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
98905,582758,18,0.071874,0.072041,0.072913,0.073353,0.074134,0.075063,0.076003,0.077044,...,0.301933,0.302893,0.303979,0.305078,0.306490,0.307011,0.308314,0.309541,0.309938,0.311425
98905,582758,17,0.096698,0.097686,0.098064,0.098826,0.099887,0.101086,0.101984,0.103158,...,0.397139,0.398523,0.399527,0.400554,0.402573,0.402444,0.403449,0.405389,0.405117,0.408249
98905,582758,14,0.040805,0.040892,0.041369,0.041413,0.041727,0.042090,0.042612,0.043132,...,0.230707,0.232482,0.233557,0.234294,0.235265,0.237026,0.237798,0.238576,0.239619,0.240728
98905,582758,13,0.149490,0.150694,0.151771,0.152930,0.154470,0.156213,0.157428,0.159191,...,0.472497,0.473340,0.475060,0.476163,0.476601,0.479063,0.480005,0.480428,0.481034,0.482880
98905,582758,11,0.119471,0.120408,0.121271,0.122379,0.123435,0.124498,0.125660,0.127190,...,0.418030,0.419518,0.420670,0.421760,0.422301,0.425089,0.425187,0.427162,0.427534,0.428308
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117591,708251,18,0.080921,0.081815,0.082360,0.082785,0.083847,0.084930,0.085905,0.087115,...,0.318796,0.320779,0.321970,0.322233,0.324435,0.324913,0.325379,0.327252,0.327758,0.328833
117591,708251,9,0.055949,0.055796,0.056334,0.056897,0.056818,0.057663,0.058042,0.058739,...,0.245052,0.246114,0.246728,0.247983,0.249534,0.249835,0.251809,0.250827,0.252676,0.254113
117591,708251,16,0.048525,0.048480,0.048873,0.049079,0.049296,0.050084,0.050322,0.050894,...,0.224726,0.226268,0.227058,0.227753,0.229248,0.229009,0.230501,0.230828,0.231917,0.233002
117591,708251,9,0.141631,0.142932,0.143936,0.145275,0.147172,0.148721,0.150059,0.152178,...,0.420686,0.421156,0.421936,0.423349,0.424424,0.424971,0.426032,0.427367,0.428944,0.429149


In [49]:
# Descriptive (non-spectral) fields kept for every scan row, indexed by lot_id.
meta_source_cols = [
    'lot_id',
    'specimen_id',
    'lot_name',
    'date_scanned',
    'analyzer_id',
    'company_id',
    'product_id',
    'product_name',
]
meta = nvd[meta_source_cols].set_index('lot_id')
meta

Unnamed: 0_level_0,specimen_id,lot_name,date_scanned,analyzer_id,company_id,product_id,product_name
lot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
98905,582758,20250509-A47-S555,1746796578,96,1104,6160,Brazilian green coffee bean
98905,582758,20250509-A47-S555,1746796578,96,1104,6160,Brazilian green coffee bean
98905,582758,20250509-A47-S555,1746796578,96,1104,6160,Brazilian green coffee bean
98905,582758,20250509-A47-S555,1746796578,96,1104,6160,Brazilian green coffee bean
98905,582758,20250509-A47-S555,1746796578,96,1104,6160,Brazilian green coffee bean
...,...,...,...,...,...,...,...
117591,708251,20260126-A47-S959,1769626519,96,1104,6160,Brazilian green coffee bean
117591,708251,20260126-A47-S959,1769626519,96,1104,6160,Brazilian green coffee bean
117591,708251,20260126-A47-S959,1769626519,96,1104,6160,Brazilian green coffee bean
117591,708251,20260126-A47-S959,1769626519,96,1104,6160,Brazilian green coffee bean


In [55]:
# Add two derived columns on a fresh frame (assign returns a new DataFrame):
#   lot_name_short   - third dash-separated token of lot_name
#                      (e.g. 'S555' from '20250509-A47-S555')
#   date_reformatted - date_scanned read as seconds since the epoch, reduced to a date
meta = meta.assign(
    lot_name_short=meta['lot_name'].str.split('-').str[2],
    date_reformatted=pd.to_datetime(meta['date_scanned'], unit='s').dt.date,
)

In [56]:
# Display `meta`, now including lot_name_short and date_reformatted.
meta

Unnamed: 0_level_0,specimen_id,lot_name,date_scanned,analyzer_id,company_id,product_id,product_name,lot_name_short,date_reformatted
lot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
98905,582758,20250509-A47-S555,1746796578,96,1104,6160,Brazilian green coffee bean,S555,2025-05-09
98905,582758,20250509-A47-S555,1746796578,96,1104,6160,Brazilian green coffee bean,S555,2025-05-09
98905,582758,20250509-A47-S555,1746796578,96,1104,6160,Brazilian green coffee bean,S555,2025-05-09
98905,582758,20250509-A47-S555,1746796578,96,1104,6160,Brazilian green coffee bean,S555,2025-05-09
98905,582758,20250509-A47-S555,1746796578,96,1104,6160,Brazilian green coffee bean,S555,2025-05-09
...,...,...,...,...,...,...,...,...,...
117591,708251,20260126-A47-S959,1769626519,96,1104,6160,Brazilian green coffee bean,S959,2026-01-28
117591,708251,20260126-A47-S959,1769626519,96,1104,6160,Brazilian green coffee bean,S959,2026-01-28
117591,708251,20260126-A47-S959,1769626519,96,1104,6160,Brazilian green coffee bean,S959,2026-01-28
117591,708251,20260126-A47-S959,1769626519,96,1104,6160,Brazilian green coffee bean,S959,2026-01-28


In [58]:
# Attach the cupping-sheet labels to every scan row by matching the sample code
# parsed from the lot name (lot_name_short) against the sheet's 'Sample No'.
scan_keys = meta.reset_index()[['lot_id', 'lot_name_short']]

# pandas matches null join keys against each other, so sheet rows with a blank
# 'Sample No' would otherwise be attached to any scan whose lot name could not
# be parsed into a sample code. Remove those rows from the lookup table first.
label_lookup = excel_labels.loc[excel_labels['Sample No'].notna(), excel_label_cols]

y = (
    scan_keys
    .merge(
        label_lookup,
        left_on='lot_name_short',
        right_on='Sample No',
        how='left',              # keep every scan row, labelled or not, in its original order
        validate='many_to_one',  # raise if a 'Sample No' repeats, instead of silently duplicating scans
    )
    .set_index('lot_id')
    .drop(columns=['Sample No', 'lot_name_short'])
)
y

Unnamed: 0_level_0,TOTAL DEFECT CUP,DEFECT CUP?,Clean,Rioy,Phenol,Chemical / Medicine,Fermented / Pulpy,Moldy / Musty,Earthy,Dirty,Aged,Raw Potato,Rubbery,Others
lot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
98905,1.0,Defect Cup,,1,,,,,,,,,,
98905,1.0,Defect Cup,,1,,,,,,,,,,
98905,1.0,Defect Cup,,1,,,,,,,,,,
98905,1.0,Defect Cup,,1,,,,,,,,,,
98905,1.0,Defect Cup,,1,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
117591,1.0,Defect Cup,,1,,,,,,,,,,
117591,1.0,Defect Cup,,1,,,,,,,,,,
117591,1.0,Defect Cup,,1,,,,,,,,,,
117591,1.0,Defect Cup,,1,,,,,,,,,,


In [60]:
# X, y and meta are written out as separate files and are related only by row
# position, so their lot_id indices must be identical element-for-element.
assert X.index.equals(y.index), "Indices of X and y do not match!"
assert X.index.equals(meta.index), "Indices of X and meta do not match!"

In [62]:
# Report the final size of each output table.
for table_name, table in (("X", X), ("y", y), ("meta", meta)):
    print(f"Final {table_name} shape: {table.shape}")

Final X shape: (24084, 193)
Final y shape: (24084, 14)
Final meta shape: (24084, 9)


In [63]:
# Destination folder for the processed dataset; created on demand.
output_path = Path("../data/processed") / version / "v1.0" / "full"
output_path.mkdir(parents=True, exist_ok=True)

# Write each table to its own CSV, keeping the lot_id index as the first column.
outputs = {"input.csv": X, "label.csv": y, "meta.csv": meta}
for file_name, table in outputs.items():
    table.to_csv(output_path / file_name)

In [64]:
# Show the last rows of `meta`.
meta.tail()

Unnamed: 0_level_0,specimen_id,lot_name,date_scanned,analyzer_id,company_id,product_id,product_name,lot_name_short,date_reformatted
lot_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
117591,708251,20260126-A47-S959,1769626519,96,1104,6160,Brazilian green coffee bean,S959,2026-01-28
117591,708251,20260126-A47-S959,1769626519,96,1104,6160,Brazilian green coffee bean,S959,2026-01-28
117591,708251,20260126-A47-S959,1769626519,96,1104,6160,Brazilian green coffee bean,S959,2026-01-28
117591,708251,20260126-A47-S959,1769626519,96,1104,6160,Brazilian green coffee bean,S959,2026-01-28
117591,708251,20260126-A47-S959,1769626519,96,1104,6160,Brazilian green coffee bean,S959,2026-01-28
