In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import glob

In [None]:

# Function to identify log phase growth
def identify_log_phase(well_data, min_growth_rate=0.2, min_duration_hours=.5):
    """
    Identify log phase growth periods for a single well.

    The growth rate of each sample is the natural-log change in concentration
    since the previous sample divided by the elapsed hours, i.e. a specific
    growth rate in 1/hour (a rate r corresponds to a doubling time of
    ln(2) / r hours). A log phase period is a run of consecutive samples whose
    growth rate is at least ``min_growth_rate``. Samples whose rate is NaN
    (the first sample always is) neither start nor end a run.

    Parameters:
    - well_data: DataFrame with 'well', 'time_hours' and
      'cell_concentration_cells_per_ml' columns. It is not modified, and its
      index is ignored, so repeated index labels (e.g. after pd.concat of
      several files) are handled.
    - min_growth_rate: Minimum growth rate (1/hour, natural-log units) for a
      sample to count as log phase
    - min_duration_hours: Minimum duration in hours for a period to be kept

    Returns:
    - Dictionary with:
      - 'well': the 'well' value of the first row after sorting by time
      - 'total_log_duration_hours': sum of the kept periods' durations
      - 'log_phase_periods': list of dicts with 'start_time', 'end_time',
        'duration_hours', 'start_concentration' and 'end_concentration'
      - 'max_growth_rate': largest growth rate over all samples, or 0 when
        every rate is NaN

      A period starts at the first sample of the run. A run that stops before
      the data ends is closed at the first sample whose rate fell below the
      threshold; a run that lasts to the end is closed at the last sample.

    Raises:
    - IndexError: if well_data has no rows.
    """
    # Sort chronologically and renumber the rows 0..n-1. Everything below is
    # positional, so it never depends on the caller's index labels; looking
    # rows up by label breaks when labels repeat (.loc then returns a Series).
    well_data = well_data.sort_values('time_hours').reset_index(drop=True)

    times = well_data['time_hours']
    concentrations = well_data['cell_concentration_cells_per_ml']

    # Specific growth rate between consecutive samples:
    # ln(N_i / N_(i-1)) / (t_i - t_(i-1))
    well_data['growth_rate'] = np.log(concentrations / concentrations.shift(1)) / (times - times.shift(1))

    log_phase_periods = []

    def record_period(start_pos, end_pos):
        # Keep the run only when it lasted at least min_duration_hours.
        period_duration = times.iloc[end_pos] - times.iloc[start_pos]
        if period_duration >= min_duration_hours:
            log_phase_periods.append({
                'start_time': times.iloc[start_pos],
                'end_time': times.iloc[end_pos],
                'duration_hours': period_duration,
                'start_concentration': concentrations.iloc[start_pos],
                'end_concentration': concentrations.iloc[end_pos]
            })

    # Find periods of consistently high growth
    current_period_start = None
    for position, rate in enumerate(well_data['growth_rate']):
        if pd.isna(rate):
            continue

        if rate >= min_growth_rate:
            if current_period_start is None:
                current_period_start = position
        elif current_period_start is not None:
            # Growth dropped below the threshold: close the open run here.
            record_period(current_period_start, position)
            current_period_start = None

    # Handle case where log phase continues to the end
    if current_period_start is not None:
        record_period(current_period_start, len(well_data) - 1)

    # Calculate total log phase duration
    total_log_duration = sum(period['duration_hours'] for period in log_phase_periods)

    growth_rates = well_data['growth_rate']
    return {
        'well': well_data['well'].iloc[0],
        'total_log_duration_hours': total_log_duration,
        'log_phase_periods': log_phase_periods,
        'max_growth_rate': growth_rates.max() if not growth_rates.isna().all() else 0
    }
def check_ye(series):
    """
    Return True when the given value is "E", "F" or "G", otherwise False.

    Despite the parameter name, annotate_wells calls this once per plate-row
    letter (a single string), not with a whole pandas Series.
    """
    return series in ("E", "F", "G")
def annotate_wells(results_df):
    """
    Add plate-layout columns derived from the 'well' label (e.g. "F4").

    The columns are added to the DataFrame that is passed in (it is modified
    in place) and the same object is returned:
    - 'column': the well label without its first character ("4" for "F4")
    - 'row': the first character of the well label ("F" for "F4")
    - 'sugar': "High" for columns 1-6, "Low" for columns 7-12
    - 'ratio': one of six ratios chosen by column; columns 7-12 reuse the
      ratios of columns 1-6 in the same order
    - 'YE': check_ye() applied to the row letter

    Wells whose column label is not "1".."12" get NaN for 'sugar' and 'ratio'.
    """
    # glycerol / glucose
    ratios = [45/255, 60/240, 120/180, 180/120, 240/60, 270/30]

    # Lookup tables keyed by the column label as a string: "1" .. "12".
    plate_columns = [str(number) for number in range(1, 13)]
    sugar_by_column = {
        label: ("High" if position < 6 else "Low")
        for position, label in enumerate(plate_columns)
    }
    ratio_by_column = {
        label: ratios[position % 6]
        for position, label in enumerate(plate_columns)
    }

    well_labels = results_df['well'].astype(str)
    results_df['column'] = well_labels.apply(lambda label: label[1:])
    results_df['row'] = well_labels.apply(lambda label: label[0])
    results_df['sugar'] = results_df['column'].map(sugar_by_column)
    results_df['ratio'] = results_df['column'].map(ratio_by_column)
    results_df['YE'] = results_df['row'].apply(check_ye)
    return results_df

In [3]:
# Load every CSV export in ./princess_data and stack them into one table.
# - sorted(): glob returns paths in a filesystem-dependent order; sorting makes
#   the row order (and everything derived from it) reproducible.
# - annotated_results.csv is written into this same folder by a later cell.
#   It is an output of this notebook, not measurement data, so it must not be
#   read back in when the notebook is re-run.
csv_list = sorted(
    csv_file
    for csv_file in glob.glob('./princess_data/*.csv')
    if not csv_file.endswith('annotated_results.csv')
)
df_list = [pd.read_csv(csv_file) for csv_file in csv_list]
# ignore_index gives every row a unique label instead of repeating each
# file's own 0..n-1 range.
df = pd.concat(df_list, ignore_index=True)
df.head()



Unnamed: 0,timestamp,well,culture_id,cell_concentration_cells_per_ml,absorbance_od,absorbance_wavelength_nm,absorbance_bandwidth_nm,absorbance
0,2025-10-26T07:00:52.533258+00:00,A2,CLTR1JSKV4L7GKEQHW7UVYHRVRVHSAU,254900000,0.2549,600.0,9.0,
1,2025-10-26T07:07:51.380587+00:00,A2,CLTR1JSKV4L7GKEQHW7UVYHRVRVHSAU,248500000,0.2485,600.0,9.0,
2,2025-10-26T07:14:50.178492+00:00,A2,CLTR1JSKV4L7GKEQHW7UVYHRVRVHSAU,251300000,0.2513,600.0,9.0,
3,2025-10-26T07:24:09.487456+00:00,A2,CLTR1JSKV4L7GKEQHW7UVYHRVRVHSAU,267700000,0.2677,600.0,9.0,
4,2025-10-26T07:33:53.439212+00:00,A2,CLTR1JSKV4L7GKEQHW7UVYHRVRVHSAU,274300000,0.2743,600.0,9.0,


In [4]:
# Parse the timestamp column, then express every reading as hours elapsed
# since the earliest reading in the whole table (time zero is shared by all
# wells, not computed per well).
df['timestamp'] = pd.to_datetime(df['timestamp'])
experiment_start = df['timestamp'].min()
df['time_hours'] = (df['timestamp'] - experiment_start).dt.total_seconds() / 3600

df.head()


Unnamed: 0,timestamp,well,culture_id,cell_concentration_cells_per_ml,absorbance_od,absorbance_wavelength_nm,absorbance_bandwidth_nm,absorbance,time_hours
0,2025-10-26 07:00:52.533258+00:00,A2,CLTR1JSKV4L7GKEQHW7UVYHRVRVHSAU,254900000,0.2549,600.0,9.0,,0.0
1,2025-10-26 07:07:51.380587+00:00,A2,CLTR1JSKV4L7GKEQHW7UVYHRVRVHSAU,248500000,0.2485,600.0,9.0,,0.116346
2,2025-10-26 07:14:50.178492+00:00,A2,CLTR1JSKV4L7GKEQHW7UVYHRVRVHSAU,251300000,0.2513,600.0,9.0,,0.232679
3,2025-10-26 07:24:09.487456+00:00,A2,CLTR1JSKV4L7GKEQHW7UVYHRVRVHSAU,267700000,0.2677,600.0,9.0,,0.388043
4,2025-10-26 07:33:53.439212+00:00,A2,CLTR1JSKV4L7GKEQHW7UVYHRVRVHSAU,274300000,0.2743,600.0,9.0,,0.550252


In [25]:

# Analyze each well
# NOTE(review): C7 and C4 are left out of the analysis; the reason is not
# recorded in this notebook - confirm and document it.
skipped_wells = ("C7", "C4")

log_phase_results = []
for well in df['well'].unique():
    if well in skipped_wells:
        continue
    well_data = df[df['well'] == well].copy()
    result = identify_log_phase(well_data, min_growth_rate=0.3, min_duration_hours=0.5)
    log_phase_results.append(result)

# Create results DataFrame: one summary row per analyzed well
summary_rows = []
for entry in log_phase_results:
    summary_rows.append({
        'well': entry['well'],
        'log_phase_duration_hours': entry['total_log_duration_hours'],
        'max_growth_rate': entry['max_growth_rate'],
        'num_log_periods': len(entry['log_phase_periods'])
    })
results_df = pd.DataFrame(summary_rows)

# Longest total log phase first
ranked_results = results_df.sort_values('log_phase_duration_hours', ascending=False)
print("Log Phase Analysis Results:")
print(ranked_results)


Log Phase Analysis Results:
   well  log_phase_duration_hours  max_growth_rate  num_log_periods
12   F4                  1.733168         0.522136                1
11   E4                  1.733168         0.616077                1
13   G4                  1.616798         0.535149                2
56   F1                  1.132720         1.078349                1
5    F2                  1.016387         0.610885                1
..  ...                       ...              ...              ...
35  C11                  0.000000         0.035878                0
34  B11                  0.000000         0.052037                0
33  A11                  0.000000         0.024728                0
32  G11                  0.000000         0.041217                0
86   B6                  0.000000         0.004476                0

[87 rows x 4 columns]


In [32]:
# annotate_wells adds its columns to results_df itself and returns that same
# object, so results_df_annotated and results_df refer to one DataFrame.
results_df_annotated = annotate_wells(results_df)
(
    results_df_annotated[['well', 'log_phase_duration_hours', 'sugar', 'ratio']]
    .sort_values('log_phase_duration_hours', ascending=False)
)

Unnamed: 0,well,log_phase_duration_hours,sugar,ratio
12,F4,1.733168,High,1.500000
11,E4,1.733168,High,1.500000
13,G4,1.616798,High,1.500000
56,F1,1.132720,High,0.176471
5,F2,1.016387,High,0.250000
...,...,...,...,...
35,C11,0.000000,Low,4.000000
34,B11,0.000000,Low,4.000000
33,A11,0.000000,Low,4.000000
32,G11,0.000000,Low,4.000000


In [36]:
# Save the annotated per-well summary (the DataFrame index is written too).
# NOTE: this path is inside ./princess_data, the folder the load cell globs
# for '*.csv', so a re-run will pick this file up as input unless the loader
# filters it out.
annotated_csv_path = './princess_data/annotated_results.csv'
results_df_annotated.to_csv(annotated_csv_path)

In [33]:
# Mean total log-phase duration (hours) per ratio / sugar / YE combination.
(
    results_df_annotated
    .groupby(['ratio', 'sugar', 'YE'])['log_phase_duration_hours']
    .mean()
)

ratio     sugar  YE   
0.176471  High   False    0.359454
                 True     0.938876
          Low    False    0.187512
                 True     0.561421
0.250000  High   False    0.239636
                 True     0.848136
          Low    False    0.000000
                 True     0.000000
0.666667  High   False    0.000000
          Low    False    0.000000
                 True     0.000000
1.500000  High   False    0.653795
                 True     1.694378
          Low    False    0.000000
                 True     0.000000
4.000000  High   False    0.000000
                 True     0.000000
          Low    False    0.000000
                 True     0.000000
9.000000  High   False    0.000000
                 True     0.000000
          Low    False    0.000000
                 True     0.000000
Name: log_phase_duration_hours, dtype: float64

In [35]:
# Same means as the ratio / sugar / YE table, grouped YE-first so the
# True/False halves can be compared side by side.
(
    results_df_annotated
    .groupby(['YE', 'sugar', 'ratio'])['log_phase_duration_hours']
    .mean()
)

YE     sugar  ratio   
False  High   0.176471    0.359454
              0.250000    0.239636
              0.666667    0.000000
              1.500000    0.653795
              4.000000    0.000000
              9.000000    0.000000
       Low    0.176471    0.187512
              0.250000    0.000000
              0.666667    0.000000
              1.500000    0.000000
              4.000000    0.000000
              9.000000    0.000000
True   High   0.176471    0.938876
              0.250000    0.848136
              1.500000    1.694378
              4.000000    0.000000
              9.000000    0.000000
       Low    0.176471    0.561421
              0.250000    0.000000
              0.666667    0.000000
              1.500000    0.000000
              4.000000    0.000000
              9.000000    0.000000
Name: log_phase_duration_hours, dtype: float64

In [34]:
# Mean total log-phase duration (hours) for YE == False vs YE == True wells.
(
    results_df_annotated
    .groupby(['YE'])['log_phase_duration_hours']
    .mean()
)

YE
False    0.117790
True     0.367528
Name: log_phase_duration_hours, dtype: float64

In [37]:
import matplotlib.pyplot as plt

In [38]:
# Spot check of well A6 with the function's default thresholds
# (min_growth_rate=0.2, min_duration_hours=.5), not the 0.3 used in the
# per-well analysis loop.
identify_log_phase(df.loc[df['well'] == 'A6'])

{'well': 'A6',
 'total_log_duration_hours': 0,
 'log_phase_periods': [],
 'max_growth_rate': np.float64(0.06178119208321107)}

In [39]:
# Spot check of well F6 with the function's default thresholds
# (min_growth_rate=0.2, min_duration_hours=.5), not the 0.3 used in the
# per-well analysis loop.
identify_log_phase(df.loc[df['well'] == 'F6'])

{'well': 'F6',
 'total_log_duration_hours': 0,
 'log_phase_periods': [],
 'max_growth_rate': np.float64(0.01572685704992185)}

In [None]:
# Raw readings for well A6 (every row and column of df for that well).
df.loc[df['well'] == 'A6']