**PARALLEL COMPUTING SCRIPT (NO OUTLIER)**

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.interpolate import make_interp_spline
from scipy.optimize import curve_fit
import math

def logistic_function(x, L, k, x0):
    """Evaluate the three-parameter logistic curve at ``x``.

    Args:
        x: Point(s) at which to evaluate; a scalar or a NumPy array.
        L: Upper asymptote of the curve.
        k: Steepness coefficient of the curve.
        x0: Midpoint location, i.e. where the curve equals ``L / 2``.

    Returns:
        ``L / (1 + exp(-k * (x - x0)))``.
    """
    exponent = -k * (x - x0)
    denominator = 1 + np.exp(exponent)
    return L / denominator


def get_cell_line_order(input_file):
    """Return the cell-line prefixes of a workbook in first-seen order.

    The workbook is read with the row at index 7 as the header row. A prefix
    is the text before the first space of a column label. The ``Elapsed``
    and ``Date Time`` columns are ignored.

    Each prefix is returned once: replicate columns that share a prefix
    (e.g. ``"A 1"``, ``"A 2"``) no longer produce repeated entries, so
    callers that loop over the result do the per-prefix work a single time.

    Args:
        input_file: Path of the workbook to inspect.

    Returns:
        list[str]: Unique prefixes, ordered by first appearance.
    """
    df = pd.read_excel(input_file, header=7)
    prefixes = (
        col.split(' ')[0]
        for col in df.columns
        # Non-string labels (e.g. a numeric header cell) cannot carry a
        # "<prefix> <replicate>" name, so they are skipped instead of crashing.
        if isinstance(col, str) and col not in ('Elapsed', 'Date Time')
    )
    # dict.fromkeys removes duplicates while keeping insertion order.
    return list(dict.fromkeys(prefixes))


def calculate_mean_std_dev_and_counts(input_file, output_file):
    """Summarise replicate columns per cell line and write them to Excel.

    Reads ``input_file`` (header row at index 7), keeps the rows whose
    ``Elapsed`` value lies in ``[0, 140]`` and, for every cell-line prefix
    (the text before the first space of a column label), writes three
    columns to ``output_file``:

    * ``"<prefix> Mean"``    - row-wise mean over that prefix's columns
    * ``"<prefix> STD DEV"`` - row-wise standard deviation over them
    * ``"<prefix> COUNTS"``  - the text ``"<prefix> - <number of columns>"``

    The prefixes are taken from the frame that is already loaded, once per
    prefix, so the workbook is read a single time and the statistics are
    not recomputed for every replicate column.

    Args:
        input_file: Path of the source workbook.
        output_file: Path of the workbook to create.
    """
    df = pd.read_excel(input_file, header=7)
    df['Elapsed'] = pd.to_numeric(df['Elapsed'], errors='coerce')
    # Rows whose Elapsed could not be parsed become NaN and fail between().
    df = df[df['Elapsed'].between(0, 140)]
    elapsed_time = df['Elapsed']

    # Unique prefixes in first-seen order (dict keys keep insertion order).
    prefixes = dict.fromkeys(
        col.split(' ')[0]
        for col in df.columns
        if isinstance(col, str) and col not in ('Elapsed', 'Date Time')
    )

    combined_data = {'Elapsed': elapsed_time}
    for prefix in prefixes:
        columns = [
            col for col in df.columns
            if isinstance(col, str)
            and col.startswith(prefix + ' ')
            and col != 'Date Time'
        ]
        if not columns:
            continue

        cell_line_data = df[columns]
        combined_data[f"{prefix} Mean"] = cell_line_data.mean(axis=1)
        combined_data[f"{prefix} STD DEV"] = cell_line_data.std(axis=1)
        combined_data[f"{prefix} COUNTS"] = (
            [f"{prefix} - {cell_line_data.shape[1]}"] * len(elapsed_time)
        )

    combined_df = pd.DataFrame(combined_data)
    combined_df.to_excel(output_file, index=False)


def calculate_growth_rates_and_doubling_times(data_file, output_file):
    """Fit a logistic curve to every ``"... Mean"`` column and save the fit.

    For each column of ``data_file`` whose label ends in ``" Mean"`` the
    points with a non-NaN mean and a non-NaN ``Elapsed`` value are fitted
    with ``logistic_function`` via ``scipy.optimize.curve_fit``. One row per
    column is written to ``output_file`` with the fitted ``k`` (growth
    rate), ``ln(2) / k`` (doubling time), ``L`` and ``x0``.

    All fitted values are NaN when there is nothing to fit, when the data
    are degenerate (a single time value or a flat curve), or when the
    optimiser fails.

    Args:
        data_file: Workbook holding ``Elapsed`` and ``"<name> Mean"`` columns.
        output_file: Path of the workbook to create.
    """
    data = pd.read_excel(data_file)
    elapsed_time = data['Elapsed'].values
    columns_to_calculate = [col for col in data.columns if col.endswith(' Mean')]
    growth_rates_and_doubling_times = []

    for column in columns_to_calculate:
        par_mean = data[column].values
        # Drop a point when either coordinate is missing; curve_fit rejects
        # non-finite input.
        valid = ~np.isnan(par_mean) & ~np.isnan(elapsed_time)
        valid_elapsed = elapsed_time[valid]
        valid_par_mean = par_mean[valid]

        # Defaults that are reported whenever no usable fit is obtained.
        L_opt = k_opt = x0_opt = np.nan
        growth_rate = np.nan
        doubling_time = np.nan

        if valid_par_mean.size > 0:
            y_max, y_min = np.max(valid_par_mean), np.min(valid_par_mean)
            x_max, x_min = np.max(valid_elapsed), np.min(valid_elapsed)

            # A flat curve or a single time value gives no usable starting
            # slope (4 / 0), so no fit is attempted for it.
            if y_max > y_min and x_max > x_min:
                # Start from the observed maximum, a slope spanning the time
                # range, and the centre of the time range as the midpoint.
                initial_guess = [y_max, 4 / (x_max - x_min), (x_max + x_min) / 2]
                try:
                    popt, _ = curve_fit(
                        logistic_function,
                        valid_elapsed,
                        valid_par_mean,
                        p0=initial_guess,
                    )
                except (RuntimeError, ValueError, TypeError, ArithmeticError):
                    # Best effort: a failed fit leaves this column as NaN and
                    # the remaining columns are still processed.
                    pass
                else:
                    L_opt, k_opt, x0_opt = popt
                    growth_rate = k_opt
                    doubling_time = math.log(2) / growth_rate

        growth_rates_and_doubling_times.append({
            'Column': column.replace(' Mean', ''),
            'Growth Rate': growth_rate,
            'Doubling Time': doubling_time,
            'Max Carrying Capacity (L)': L_opt,
            'Midpoint (x0)': x0_opt,
            'Time Range': f"0 to {np.max(valid_elapsed):.2f} hours" if len(valid_elapsed) > 0 else "N/A"
        })

    growth_rate_and_doubling_time_df = pd.DataFrame(growth_rates_and_doubling_times)
    growth_rate_and_doubling_time_df.to_excel(output_file, index=False)

def plot_smooth_growth_curve(input_file, output_image):
    """Plot every ``"... Mean"`` column as a smoothed curve with error bars.

    For each column of ``input_file`` whose label ends in ``" Mean"`` the
    measured points are drawn with error bars taken from the matching
    ``"... STD DEV"`` column, and an interpolating spline through the
    non-missing points is drawn on top. The figure is saved as a PNG to
    ``output_image``.

    Args:
        input_file: Workbook with ``Elapsed``, ``"<name> Mean"`` and
            ``"<name> STD DEV"`` columns.
        output_image: Path of the PNG file to create.

    Raises:
        KeyError: If a ``"... Mean"`` column has no ``"... STD DEV"`` partner.
    """
    df = pd.read_excel(input_file)
    elapsed_time = df['Elapsed']

    columns_to_plot = [col for col in df.columns if col.endswith(' Mean')]

    fig = plt.figure(figsize=(14, 8))
    try:
        # pyplot.get_cmap replaces matplotlib.cm.get_cmap, which newer
        # Matplotlib releases no longer provide.
        colors = plt.get_cmap('tab10', max(len(columns_to_plot), 1))

        for i, column in enumerate(columns_to_plot):
            mean_data = df[column]
            std_data = df[column.replace(' Mean', ' STD DEV')]
            label = column.replace(' Mean', '')

            # The spline cannot be built from NaN values, so only complete
            # points are used for it.
            usable = mean_data.notna() & elapsed_time.notna()
            point_count = int(usable.sum())
            if point_count == 0:
                continue

            if point_count >= 2:
                x = elapsed_time[usable].to_numpy(dtype=float)
                y = mean_data[usable].to_numpy(dtype=float)
                # A degree-k spline needs at least k + 1 points.
                degree = min(3, point_count - 1)
                spline_mean = make_interp_spline(x, y, k=degree)
                smooth_time = np.linspace(x.min(), x.max(), 500)
                plt.plot(smooth_time, spline_mean(smooth_time), label=label, color=colors(i))
                plt.errorbar(elapsed_time, mean_data, yerr=std_data, fmt='o', capsize=2, color=colors(i))
            else:
                # A single point cannot be interpolated; show it on its own
                # and let it carry the legend entry.
                plt.errorbar(elapsed_time, mean_data, yerr=std_data, fmt='o', capsize=2, color=colors(i), label=label)

        plt.xlabel('Time (hours)')
        plt.ylabel('% Confluence')
        plt.title('Growth Curve')
        plt.legend()
        plt.grid(True)
        plt.xticks(np.arange(0, 141, 24))
        plt.savefig(output_image, format='png')
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)

def process_files(file_list, output_dir):
    """Run the summary, fit and plot steps for every workbook in ``file_list``.

    ``output_dir`` is created when missing. For an input named ``<stem>.<ext>``
    three files are produced inside it: ``<stem>_mean_stdev_counts.xlsx``,
    ``<stem>_GR_DT.xlsx`` and ``<stem>_growth_curve.png``.

    Args:
        file_list: Iterable of input workbook paths.
        output_dir: Directory that receives the generated files.
    """
    os.makedirs(output_dir, exist_ok=True)

    suffixes = ("_mean_stdev_counts.xlsx", "_GR_DT.xlsx", "_growth_curve.png")
    for source_path in file_list:
        stem, _extension = os.path.splitext(os.path.basename(source_path))
        summary_xlsx, rates_xlsx, curve_png = (
            os.path.join(output_dir, f"{stem}{suffix}") for suffix in suffixes
        )

        # The summary workbook is the input of both later steps.
        calculate_mean_std_dev_and_counts(source_path, summary_xlsx)
        calculate_growth_rates_and_doubling_times(summary_xlsx, rates_xlsx)
        plot_smooth_growth_curve(summary_xlsx, curve_png)

def main():
    """Process the configured workbooks into the ``output`` directory."""
    # Paths of the workbooks to process; the list is empty as written.
    workbook_paths = []
    results_dir = "output"
    process_files(workbook_paths, results_dir)


if __name__ == "__main__":
    main()

**STEPS FOR OUTLIER REMOVAL**

**FILTERING DATA FOR 24HR**

In [None]:
import os
import pandas as pd

# Function to process data for a given elapsed time
def process_data(elapsed_time, file_name, output_file, n_groups=8, reps_per_group=12):
    """Extract one time point and reshape it to one column per group.

    Reads ``file_name`` (header row at index 7), selects the single row
    whose ``Elapsed`` value equals ``elapsed_time`` and splits its remaining
    columns, in sheet order, into ``n_groups`` consecutive runs of
    ``reps_per_group`` columns. Each run becomes one output column named
    after the text before the first space of the run's first column label.
    The result, with a leading ``Rep`` column numbered from 1, is written
    to ``output_file``.

    Args:
        elapsed_time: ``Elapsed`` value of the row to extract.
        file_name: Path of the source workbook.
        output_file: Path of the workbook to create.
        n_groups: Number of consecutive column runs (default 8).
        reps_per_group: Number of columns per run (default 12).

    Raises:
        ValueError: If the layout parameters are not positive, if zero or
            several rows match ``elapsed_time``, or if the number of data
            columns is not ``n_groups * reps_per_group``.
    """
    if n_groups < 1 or reps_per_group < 1:
        raise ValueError("n_groups and reps_per_group must both be at least 1.")

    df = pd.read_excel(file_name, header=7)

    # Compare numerically and with a tiny tolerance so that text cells or
    # float round-off in the sheet do not hide the requested time point.
    elapsed = pd.to_numeric(df['Elapsed'], errors='coerce')
    matching_rows = df[(elapsed - elapsed_time).abs() <= 1e-6]
    if len(matching_rows) != 1:
        # Same message as before: one matching row is one data column once
        # the selection is transposed.
        raise ValueError("Unexpected number of numerical data columns found.")

    # "Date Time" is dropped when present; its absence is not an error.
    values = matching_rows.drop(columns=["Elapsed", "Date Time"], errors="ignore").iloc[0]

    expected = n_groups * reps_per_group
    if len(values) != expected:
        raise ValueError(
            f"Expected {expected} data columns "
            f"({n_groups} groups x {reps_per_group} replicates), found {len(values)}."
        )

    wide = pd.DataFrame({"Rep": range(1, reps_per_group + 1)})
    for group in range(n_groups):
        start = group * reps_per_group
        run = values.iloc[start:start + reps_per_group]
        # The run is named after its first column; a later run with the
        # same name replaces the earlier one.
        column = str(run.index[0]).split(" ")[0]
        wide[column] = run.values

    wide.to_excel(output_file, index=False)

def process_multiple_files(input_files, output_dir, elapsed_time=24):
    """Extract the ``elapsed_time`` row of every workbook into ``output_dir``.

    ``output_dir`` is created when missing. Each input ``<stem>.<ext>`` is
    written to ``<stem>_<elapsed_time>hr.xlsx``. A failure on one file is
    printed and the remaining files are still processed.

    Args:
        input_files: Iterable of input workbook paths.
        output_dir: Directory that receives the generated workbooks.
        elapsed_time: Time point to extract (default 24).
    """
    os.makedirs(output_dir, exist_ok=True)

    for file_name in input_files:
        try:
            stem, _extension = os.path.splitext(os.path.basename(file_name))
            target = os.path.join(output_dir, f"{stem}_{elapsed_time}hr.xlsx")
            process_data(elapsed_time, file_name, target)
            print(f"Processed {file_name} and saved results to {target}")
        except Exception as exc:
            # Report and continue so one bad workbook does not stop the batch.
            print(f"Error processing {file_name}: {exc}")

# Workbooks to process. The single empty string is presumably a placeholder
# to be replaced with real paths; as written, the loop below is handed "".
input_files = [
    ""
        ]

# Directory that receives one "<name>_24hr.xlsx" workbook per input file
# (it is created by process_multiple_files when missing).
output_dir = "processed_data_24hr"

# Run the batch; elapsed_time is left at its default of 24.
process_multiple_files(input_files, output_dir)


**PROCESSED STATS**

In [None]:
import os
import pandas as pd
from scipy.stats import iqr

def process_file(input_file, output_dir):
    """Compute IQR-based outlier statistics for every column of a workbook.

    Every column of ``input_file`` except ``Rep`` is treated as one cell
    line. For each one the quartiles, the IQR and the Tukey limits
    (``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``) are computed, together with
    the mean and standard deviation before and after removing the values
    outside those limits. The table is written to
    ``<output_dir>/<input stem>_Stats.xlsx``.

    The IQR is computed as ``Q3 - Q1`` from the same pandas quantiles that
    define the limits. pandas skips missing values, so a cell line with a
    missing replicate still gets finite limits instead of NaN ones.

    Args:
        input_file: Workbook with a ``Rep`` column and one column per cell line.
        output_dir: Directory that receives the statistics workbook; it is
            created when missing.
    """
    df = pd.read_excel(input_file)

    # Cell lines in the order they appear in the file.
    file_cell_lines = [col for col in df.columns if col != 'Rep']

    cell_line_stats = []
    for cell_line in file_cell_lines:
        values = df[cell_line]

        Q1 = values.quantile(0.25)
        Q3 = values.quantile(0.75)
        IQR_value = Q3 - Q1

        lower_limit = Q1 - 1.5 * IQR_value
        upper_limit = Q3 + 1.5 * IQR_value

        # Statistics over all values, before any filtering.
        mean_before = values.mean()
        std_dev_before = values.std()

        outliers = values[(values < lower_limit) | (values > upper_limit)]
        filtered_values = values[(values >= lower_limit) & (values <= upper_limit)]

        # Statistics over the values that lie within the limits.
        mean_after = filtered_values.mean()
        std_dev_after = filtered_values.std()

        cell_line_stats.append({
            'Cell Line': cell_line,
            'IQR': IQR_value,
            'Q1': Q1,
            'Q3': Q3,
            'Lower Limit': lower_limit,
            'Upper Limit': upper_limit,
            'Mean Before': mean_before,
            'Std Dev Before': std_dev_before,
            'Mean After': mean_after,
            'Std Dev After': std_dev_after,
            'Outliers': list(outliers)
        })

    cell_line_stats_df = pd.DataFrame(cell_line_stats)

    # Make the function usable on its own, not only through a batch driver.
    os.makedirs(output_dir, exist_ok=True)
    base_name = os.path.splitext(os.path.basename(input_file))[0]
    output_file = os.path.join(output_dir, f"{base_name}_Stats.xlsx")

    cell_line_stats_df.to_excel(output_file, index=False)

    print(f"Processed {input_file} and saved statistics to {output_file}")

def process_multiple_files(input_files, output_dir):
    """Write a statistics workbook for every file in ``input_files``.

    ``output_dir`` is created when missing. A failure on one file is printed
    and the remaining files are still processed.

    Args:
        input_files: Iterable of input workbook paths.
        output_dir: Directory that receives the statistics workbooks.
    """
    os.makedirs(output_dir, exist_ok=True)

    for workbook in input_files:
        try:
            process_file(workbook, output_dir)
        except Exception as exc:
            # Report and continue so one bad workbook does not stop the batch.
            print(f"Error processing {workbook}: {exc}")

# Workbooks to analyse. The single empty string is presumably a placeholder
# to be replaced with real paths; as written, the loop below is handed "".
input_files = [
    ""
    ]

# Directory that receives one "<name>_Stats.xlsx" workbook per input file
# (it is created by process_multiple_files when missing).
output_dir = "processed_stats"

# Run the batch over every listed workbook.
process_multiple_files(input_files, output_dir)

**PARALLEL COMPUTING SCRIPT (OUTLIER_24)**

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.interpolate import make_interp_spline
from scipy.optimize import curve_fit
import math

def logistic_function(x, L, k, x0):
    """Evaluate the three-parameter logistic curve at ``x``.

    Args:
        x: Point(s) at which to evaluate; a scalar or a NumPy array.
        L: Upper asymptote of the curve.
        k: Steepness coefficient of the curve.
        x0: Midpoint location, i.e. where the curve equals ``L / 2``.

    Returns:
        ``L / (1 + exp(-k * (x - x0)))``.
    """
    shifted = x - x0
    decay = np.exp(-k * shifted)
    return L / (1 + decay)

def get_cell_line_order(input_file):
    """Return the cell-line prefixes of the ``Outlier_24`` sheet in first-seen order.

    The sheet is read with the row at index 7 as the header row. A prefix is
    the text before the first space of a column label. The ``Elapsed`` and
    ``Date Time`` columns are ignored.

    Each prefix is returned once: replicate columns that share a prefix no
    longer produce repeated entries, so callers that loop over the result do
    the per-prefix work a single time.

    Args:
        input_file: Path of the workbook to inspect.

    Returns:
        list[str]: Unique prefixes, ordered by first appearance.
    """
    df = pd.read_excel(input_file, header=7, sheet_name="Outlier_24")
    prefixes = (
        col.split(' ')[0]
        for col in df.columns
        # Non-string labels cannot carry a "<prefix> <replicate>" name, so
        # they are skipped instead of crashing on .split().
        if isinstance(col, str) and col not in ('Elapsed', 'Date Time')
    )
    # dict.fromkeys removes duplicates while keeping insertion order.
    return list(dict.fromkeys(prefixes))

def calculate_mean_std_dev_and_counts(input_file, output_file):
    """Summarise replicate columns of the ``Outlier_24`` sheet per cell line.

    Reads the ``Outlier_24`` sheet of ``input_file`` (header row at index
    7), keeps the rows whose ``Elapsed`` value lies in ``[0, 140]`` and, for
    every cell-line prefix (the text before the first space of a column
    label), writes three columns to ``output_file``:

    * ``"<prefix> Mean"``    - row-wise mean over that prefix's columns
    * ``"<prefix> STD DEV"`` - row-wise standard deviation over them
    * ``"<prefix> COUNTS"``  - the text ``"<prefix> - <number of columns>"``

    Columns that are entirely empty are left out of the statistics and of
    the count. The prefixes are taken from the frame that is already
    loaded, once per prefix, so the workbook is read a single time and the
    statistics are not recomputed for every replicate column.

    Args:
        input_file: Path of the source workbook.
        output_file: Path of the workbook to create.
    """
    df = pd.read_excel(input_file, header=7, sheet_name="Outlier_24")
    df['Elapsed'] = pd.to_numeric(df['Elapsed'], errors='coerce')
    # Rows whose Elapsed could not be parsed become NaN and fail between().
    df = df[df['Elapsed'].between(0, 140)]
    elapsed_time = df['Elapsed']

    # Unique prefixes in first-seen order (dict keys keep insertion order).
    prefixes = dict.fromkeys(
        col.split(' ')[0]
        for col in df.columns
        if isinstance(col, str) and col not in ('Elapsed', 'Date Time')
    )

    combined_data = {'Elapsed': elapsed_time}
    for prefix in prefixes:
        columns = [
            col for col in df.columns
            if isinstance(col, str)
            and col.startswith(prefix + ' ')
            and col != 'Date Time'
        ]
        # Replicates without any value in the kept rows are discarded.
        cell_line_data = df[columns].dropna(axis=1, how='all')
        if cell_line_data.empty:
            continue

        combined_data[f"{prefix} Mean"] = cell_line_data.mean(axis=1)
        combined_data[f"{prefix} STD DEV"] = cell_line_data.std(axis=1)
        combined_data[f"{prefix} COUNTS"] = (
            [f"{prefix} - {cell_line_data.shape[1]}"] * len(elapsed_time)
        )

    combined_df = pd.DataFrame(combined_data)
    combined_df.to_excel(output_file, index=False)

def calculate_growth_rates_and_doubling_times(data_file, output_file):
    """Fit a logistic curve to every ``"... Mean"`` column and save the fit.

    For each column of ``data_file`` whose label ends in ``" Mean"`` the
    points with a non-NaN mean and a non-NaN ``Elapsed`` value are fitted
    with ``logistic_function`` via ``scipy.optimize.curve_fit``. One row per
    column is written to ``output_file`` with the fitted ``k``,
    ``ln(2) / k`` (doubling time), ``L`` and ``x0``.

    All fitted values are NaN when there is nothing to fit, when the data
    are degenerate (a single time value or a flat curve), or when the
    optimiser fails.

    Args:
        data_file: Workbook holding ``Elapsed`` and ``"<name> Mean"`` columns.
        output_file: Path of the workbook to create.
    """
    data = pd.read_excel(data_file)
    elapsed_time = data['Elapsed'].values
    columns_to_calculate = [col for col in data.columns if col.endswith(' Mean')]
    results = []

    for column in columns_to_calculate:
        par_mean = data[column].values
        # Drop a point when either coordinate is missing; curve_fit rejects
        # non-finite input.
        valid_indices = ~np.isnan(par_mean) & ~np.isnan(elapsed_time)
        valid_par_mean = par_mean[valid_indices]
        valid_elapsed = elapsed_time[valid_indices]

        # Defaults that are reported whenever no usable fit is obtained.
        L_opt = k_opt = x0_opt = np.nan
        doubling_time = np.nan

        if valid_par_mean.size > 0:
            y_max, y_min = np.max(valid_par_mean), np.min(valid_par_mean)
            x_max, x_min = np.max(valid_elapsed), np.min(valid_elapsed)

            # A flat curve or a single time value gives no usable starting
            # slope (4 / 0), so no fit is attempted for it.
            if y_max > y_min and x_max > x_min:
                # Start from the observed maximum, a slope spanning the time
                # range, and the centre of the time range as the midpoint.
                initial_guess = [y_max, 4 / (x_max - x_min), (x_max + x_min) / 2]
                try:
                    popt, _ = curve_fit(
                        logistic_function,
                        valid_elapsed,
                        valid_par_mean,
                        p0=initial_guess,
                    )
                except (RuntimeError, ValueError, TypeError, ArithmeticError):
                    # Best effort: a failed fit leaves this column as NaN and
                    # the remaining columns are still processed.
                    pass
                else:
                    L_opt, k_opt, x0_opt = popt
                    doubling_time = math.log(2) / k_opt

        results.append({
            'Column': column.replace(' Mean', ''),
            'Growth Rate (k)': k_opt,
            'Doubling Time': doubling_time,
            'Max Carrying Capacity (L)': L_opt,
            'Midpoint (x0)': x0_opt,
            'Time Range': f"0 to {np.max(valid_elapsed):.2f} hours" if len(valid_elapsed) > 0 else "N/A"
        })

    results_df = pd.DataFrame(results)
    results_df.to_excel(output_file, index=False)

def plot_smooth_growth_curve(input_file, output_image):
    """Plot every ``"... Mean"`` column as a smoothed curve with error bars.

    For each column of ``input_file`` whose label ends in ``" Mean"`` the
    measured points are drawn with error bars taken from the matching
    ``"... STD DEV"`` column, and an interpolating spline through the
    non-missing points is drawn on top. The figure, titled
    ``Growth Curve (Outlier)``, is saved as a PNG to ``output_image``.

    Args:
        input_file: Workbook with ``Elapsed``, ``"<name> Mean"`` and
            ``"<name> STD DEV"`` columns.
        output_image: Path of the PNG file to create.

    Raises:
        KeyError: If a ``"... Mean"`` column has no ``"... STD DEV"`` partner.
    """
    df = pd.read_excel(input_file)
    elapsed_time = df['Elapsed']

    columns_to_plot = [col for col in df.columns if col.endswith(' Mean')]

    fig = plt.figure(figsize=(14, 8))
    try:
        # pyplot.get_cmap replaces matplotlib.cm.get_cmap, which newer
        # Matplotlib releases no longer provide.
        colors = plt.get_cmap('tab10', max(len(columns_to_plot), 1))

        for i, column in enumerate(columns_to_plot):
            mean_data = df[column]
            std_data = df[column.replace(' Mean', ' STD DEV')]
            label = column.replace(' Mean', '')

            # The spline cannot be built from NaN values, so only complete
            # points are used for it.
            usable = mean_data.notna() & elapsed_time.notna()
            point_count = int(usable.sum())
            if point_count == 0:
                continue

            if point_count >= 2:
                x = elapsed_time[usable].to_numpy(dtype=float)
                y = mean_data[usable].to_numpy(dtype=float)
                # A degree-k spline needs at least k + 1 points.
                degree = min(3, point_count - 1)
                spline_mean = make_interp_spline(x, y, k=degree)
                smooth_time = np.linspace(x.min(), x.max(), 500)
                plt.plot(smooth_time, spline_mean(smooth_time), label=label, color=colors(i))
                plt.errorbar(elapsed_time, mean_data, yerr=std_data, fmt='o', capsize=2, color=colors(i))
            else:
                # A single point cannot be interpolated; show it on its own
                # and let it carry the legend entry.
                plt.errorbar(elapsed_time, mean_data, yerr=std_data, fmt='o', capsize=2, color=colors(i), label=label)

        plt.xlabel('Time (hours)')
        plt.ylabel('% Confluence')
        plt.title('Growth Curve (Outlier)')
        plt.legend()
        plt.grid(True)
        plt.xticks(np.arange(0, 141, 24))
        plt.savefig(output_image, format='png')
    finally:
        # Release the figure even when plotting or saving fails.
        plt.close(fig)

def process_files(file_list, output_dir):
    """Run the summary, fit and plot steps for every workbook in ``file_list``.

    ``output_dir`` is created when missing. For an input named ``<stem>.<ext>``
    three files are produced inside it: ``<stem>_mean_stdev_counts.xlsx``,
    ``<stem>_GR_DT.xlsx`` and ``<stem>_growth_curve.png``.

    Args:
        file_list: Iterable of input workbook paths.
        output_dir: Directory that receives the generated files.
    """
    os.makedirs(output_dir, exist_ok=True)

    for source_path in file_list:
        stem = os.path.splitext(os.path.basename(source_path))[0]
        targets = {
            kind: os.path.join(output_dir, f"{stem}{suffix}")
            for kind, suffix in (
                ("summary", "_mean_stdev_counts.xlsx"),
                ("rates", "_GR_DT.xlsx"),
                ("curve", "_growth_curve.png"),
            )
        }

        # The summary workbook is the input of both later steps.
        calculate_mean_std_dev_and_counts(source_path, targets["summary"])
        calculate_growth_rates_and_doubling_times(targets["summary"], targets["rates"])
        plot_smooth_growth_curve(targets["summary"], targets["curve"])

def main():
    """Process the configured workbooks into the ``output`` directory."""
    # Paths of the workbooks to process; the only entry as written is an
    # empty string.
    workbook_paths = [""]
    results_dir = "output"
    process_files(workbook_paths, results_dir)


if __name__ == "__main__":
    main()
