In [1]:
import pandas as pd
import streamlit as st
import numpy as np
import plotly.express as px
import re

In [2]:
df = pd.read_excel("Raw Data/Copy of NeoBANK Linked Sample.xlsx", sheet_name='Metadata')

In [3]:
df.columns

Index(['Subject ID', 'Sample Type_#', 'CGA', 'DOL ', 'Current Weight',
       'Current Height', 'Current HC', 'Scavenged/Fresh?', 'MBM/DMB?',
       'HMF Y/N?', 'TPN Y/N?', 'Iron Y/N? ', 'Iron Date & time',
       'Duration\n(min)', 'Linked? ', 'feeding time ',
       'Collection date/aliquot time for all samples ', '# Aliquots ',
       'Additional Comments', 'Milk Prep Room Expiration Date & Time '],
      dtype='object')

In [4]:
df = df.rename(columns={
    "HMF Y/N?": "HMF",
    "TPN Y/N?": "TPN",
    "Linked?": "Linked",
    "MBM/DMB?": "Type of Milk",
    "Sample Type_#": "sample_unique_id",
    'Duration\n(min)': "Feeding Duration"
})

In [5]:
df.columns = df.columns.str.strip()
df = df.rename(columns={'Iron Y/N?': 'Iron'})

In [6]:
df.columns

Index(['Subject ID', 'sample_unique_id', 'CGA', 'DOL', 'Current Weight',
       'Current Height', 'Current HC', 'Scavenged/Fresh?', 'Type of Milk',
       'HMF', 'TPN', 'Iron', 'Iron Date & time', 'Feeding Duration', 'Linked?',
       'feeding time', 'Collection date/aliquot time for all samples',
       '# Aliquots', 'Additional Comments',
       'Milk Prep Room Expiration Date & Time'],
      dtype='object')

### Clean Columns

In [7]:
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

  df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)


In [8]:
df.Iron.value_counts()

Iron
Y    54
N    26
Name: count, dtype: int64

In [9]:
df["Type of Milk"].value_counts()

Type of Milk
MBM          108
DBM           23
MBM + DBM      6
DBM+MBM        4
Name: count, dtype: int64

In [10]:
df['Type of Milk'] = df['Type of Milk'].replace({'MBM': 'MOM', 'MBM + DBM': 'MOM+DBM', 'DBM+MBM': 'MOM+DBM'})

In [11]:
df["Type of Milk"].value_counts()

Type of Milk
MOM        108
DBM         23
MOM+DBM     10
Name: count, dtype: int64

In [12]:
df['Scavenged/Fresh?'] = df['Scavenged/Fresh?'].str.strip().str.capitalize()

In [13]:
df["Scavenged/Fresh?"].value_counts()

Scavenged/Fresh?
Scavenged     139
Scaveneged      2
Name: count, dtype: int64

In [14]:
df['Scavenged/Fresh?'] = df['Scavenged/Fresh?'].replace({'Scaveneged': 'Scavenged'})

In [15]:
df["Scavenged/Fresh?"].value_counts()

Scavenged/Fresh?
Scavenged    141
Name: count, dtype: int64

In [16]:
df.HMF.value_counts()

HMF
Y                 127
Nutramigen          4
Y + Nutramigen      4
N                   2
N+Nutramigen        2
Y+Nutramigen        1
Name: count, dtype: int64

In [17]:
df['HMF'] = df['HMF'].replace({'Y + Nutramigen': 'Y+Nutramigen'})

In [18]:
df.HMF.value_counts()

HMF
Y               127
Y+Nutramigen      5
Nutramigen        4
N                 2
N+Nutramigen      2
Name: count, dtype: int64

In [19]:
df['Iron'] = df['Iron'].replace(r'^\s*Y\s*$', 'Y', regex=True)

In [20]:
df["Iron"].value_counts()

Iron
Y    54
N    26
Name: count, dtype: int64

In [21]:
df["Additional Comments"].value_counts()

Additional Comments
Scavenged Feeding Tube              70
Residual from Milk Prep Room        68
Scavenged Bottle - Residual Feed     2
Scavenged Feeding Syringe            1
Name: count, dtype: int64

In [22]:
df['Sample Source'] = df['Additional Comments'].replace({'Residual from Milk Prep Room': 'Prepped in Milk Room', 'Scavenged Feeding Tube': 'Scavenged', 'Scavenged Bottle - Residual Feed': 'Scavenged', 'Scavenged Feeding Syringe': 'Scavenged'})

In [23]:
df["Sample Source"].value_counts()

Sample Source
Scavenged               73
Prepped in Milk Room    68
Name: count, dtype: int64

In [24]:
df = df.drop(columns=['Additional Comments'])

In [25]:
def extract_numeric_aliquots(value):
    match = re.match(r'^\d+', str(value))
    return int(match.group()) if match else None

df['Aliquots_num'] = df['# Aliquots'].apply(extract_numeric_aliquots)

In [26]:
df = df.drop(columns=['# Aliquots'])

In [27]:
df

Unnamed: 0,Subject ID,sample_unique_id,CGA,DOL,Current Weight,Current Height,Current HC,Scavenged/Fresh?,Type of Milk,HMF,TPN,Iron,Iron Date & time,Feeding Duration,Linked?,feeding time,Collection date/aliquot time for all samples,Milk Prep Room Expiration Date & Time,Sample Source,Aliquots_num
0,NB00237,NB00237_M_10,35.0,74.0,2440.0,44.0,29.0,Scavenged,MOM,Y,N,,,,Y,Prepped \n4/22/2025 PM,4/23/2025\nDOL 21 11:13,,Prepped in Milk Room,11
1,NB00237,NB00237_M_8,34.5,72.0,2350.0,43.5,28.5,Scavenged,MOM,Y,N,,,,Y,Prepped 4/20/25 PM,2025-04-21 10:13:00,,Prepped in Milk Room,6
2,NB00237,NB00237_M_17,36.6,87.0,3010.0,46.1,30.0,Scavenged,MOM,Y,N,Y,2025-05-06 20:57:00,unknown,Y,2025-05-06 08:56:00,2025-05-06 13:47:00,Unknown,Scavenged,2
3,NB00237,NB00237_M_19,37.1,89.0,3110.0,46.3,30.5,Scavenged,MOM,Y,N,Y,2025-05-08 20:43:00,45 min,Y,2025-05-08 12:03:00,2025-05-08 14:07:00,2025-05-08 16:49:00,Scavenged,3
4,NB00237,NB00237_M_20,37.5,93.0,3380.0,46.3,30.5,Scavenged,MOM,Y,N,,,,Y,Prepped 5/11/25 PM,2025-05-12 10:58:00,,Prepped in Milk Room,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
136,NB00469,NB00469_M_2,34.1,25.0,2040.0,42.5,30.3,Scavenged,MOM,Y+Nutramigen,N,Y,2025-05-09 21:09:00,unknown,Y,2025-05-09 08:30:00,2025-05-09 12:12:00,2025-05-09 16:06:00,Scavenged,2
137,NB00486,NB00486_M_1,32.1,9.0,1055.0,36.0,26.7,Scavenged,DBM,Y,,,,,Y,Prepped 5/13/25 PM,2025-05-14 10:45:00,,Prepped in Milk Room,8
138,NB00486,NB00486_M_2,32.1,9.0,1055.0,36.0,26.7,Scavenged,DBM,Y,N,N,,unknown,Y,5/14/2025 done @ 08:11,2025-05-14 10:45:00,2025-05-14 17:37:00,Scavenged,1
139,NB00487,NB00487_M_1,34.2,7.0,1940.0,44.3,29.0,Scavenged,MOM+DBM,Y,N,,,,Y,Prepped 5/13/25 PM,2025-05-14 11:00:00,,Prepped in Milk Room,7


In [28]:
df.to_excel("Cleaned Data/cleaned_linkedmeta_updated.xlsx", index=False)

In [None]:
subject_counts = df['Subject ID'].value_counts()
num_subjects_more_than_3 = (subject_counts > 3).sum()
print(num_subjects_more_than_3)

## Area Counts - LINKED

In [None]:
AC = pd.read_excel("Raw Data/Linked AC.xlsx")
AC

In [None]:
AC_long = AC.set_index('sample_unique_id').transpose().reset_index()
AC_long = AC_long.rename(columns={'index': 'Lab_ID_full'})
AC_long

In [None]:
AC_long = AC_long.rename(columns={
    "2'FL": "2FL",
    "3'SL": "3SL",
    "6'SL": "6SL",
})

In [None]:
AC_long.columns

In [None]:
AC_long.to_excel("Raw Data/AC_long.xlsx", index=False)

In [None]:
AC_long

In [None]:
AC_volumes = pd.read_excel("Raw Data/Linked AC.xlsx", sheet_name="volumes")

In [None]:
AC_volumes['Lab_ID_full'] = AC_volumes['Lab ID'].astype(str) + AC_volumes['Unnamed: 4'].astype(str)

In [None]:
AC_volumes

In [None]:
AC_volumes['sample_unique_id'] = AC_volumes['Subject ID'].astype(str) + '_' + AC_volumes['Prepped'].astype(str)

In [None]:
AC_volumes

In [None]:
AC_volumes = AC_volumes[['Lab_ID_full', 'sample_unique_id']]

In [None]:
AC_long = AC_long.merge(AC_volumes, on='Lab_ID_full', how='left')

In [None]:
cols = ['sample_unique_id'] + [col for col in AC_long.columns if col != 'sample_unique_id']
AC_long = AC_long[cols]
AC_long

In [None]:
AC_long = AC_long.drop(columns=['Lab_ID_full'])

### Merge AC to Metadata

In [None]:
AC_long

In [None]:
df

In [None]:
merged_df = df.merge(AC_long, on='sample_unique_id', how='left')
merged_df

In [None]:
merged_df.to_excel("Cleaned Data/Linked_Merged.xlsx", index=False)