<a href="https://colab.research.google.com/github/shivani-git08/BIOL_5930/blob/main/BIOL_5930_PROJECT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**KINOME PROJECT**

**PARALLEL COMPUTING SCRIPT (NO OUTLIER)**

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.interpolate import make_interp_spline
from scipy.optimize import curve_fit
import math

def logistic_function(x, L, k, x0):
    """Evaluate the three-parameter logistic curve at ``x``.

    Args:
        x: Scalar or NumPy array of positions at which to evaluate the curve.
        L: Upper asymptote of the curve.
        k: Steepness of the curve.
        x0: Midpoint position, where the curve equals ``L / 2``.

    Returns:
        ``L / (1 + exp(-k * (x - x0)))``, scalar or array to match ``x``.
    """
    exponent = -k * (x - x0)
    denominator = 1 + np.exp(exponent)
    return L / denominator


def get_cell_line_order(input_file):
    """Return the cell-line names found in the column headers, in order.

    A column's cell-line name is the text before its first space, so
    replicate columns such as ``"A549 (A1)"`` and ``"A549 (A2)"`` both map to
    ``"A549"``. Each name is reported once, at the position of its first
    appearance. The ``'Elapsed'`` and ``'Date Time'`` columns are skipped.

    Args:
        input_file: Anything ``pandas.read_excel`` accepts (the header is
            read from row index 7), or an already-loaded ``DataFrame`` whose
            columns should be inspected directly without touching the disk.

    Returns:
        list[str]: Unique cell-line names in first-appearance order.
    """
    if isinstance(input_file, pd.DataFrame):
        column_labels = input_file.columns
    else:
        column_labels = pd.read_excel(input_file, header=7).columns

    # str() guards against numeric header cells, which pandas keeps as numbers.
    prefixes = (
        str(label).split(' ')[0]
        for label in column_labels
        if label not in ('Elapsed', 'Date Time')
    )
    # dict.fromkeys drops duplicates while keeping insertion order.
    return list(dict.fromkeys(prefixes))


def calculate_mean_std_dev_and_counts(input_file, output_file, header_row=7,
                                      min_elapsed=0, max_elapsed=140):
    """Summarise replicate columns of a workbook and write the result to Excel.

    Replicate columns are grouped by the text before the first space in their
    header. For every group the row-wise mean and standard deviation are
    computed, together with a ``"<name> - <n>"`` label giving the number of
    replicate columns. The workbook is parsed once.

    Args:
        input_file: Excel workbook to read; must contain an ``'Elapsed'``
            column.
        output_file: Destination of the summary workbook.
        header_row: Zero-based row index holding the column headers.
        min_elapsed: Smallest ``'Elapsed'`` value kept (inclusive).
        max_elapsed: Largest ``'Elapsed'`` value kept (inclusive).

    Returns:
        None. The summary is written to ``output_file`` with the columns
        ``'Elapsed'``, ``'<name> Mean'``, ``'<name> STD DEV'`` and
        ``'<name> COUNTS'``.
    """
    df = pd.read_excel(input_file, header=header_row)
    # Non-numeric time stamps become NaN and are dropped by the range filter.
    df['Elapsed'] = pd.to_numeric(df['Elapsed'], errors='coerce')
    df = df[df['Elapsed'].between(min_elapsed, max_elapsed)]
    elapsed_time = df['Elapsed']

    # One pass over the headers: register every prefix in first-appearance
    # order, but only headers of the form "<prefix> <rest>" count as
    # replicate columns of that prefix.
    replicate_columns = {}
    for col in df.columns:
        if col in ('Elapsed', 'Date Time'):
            continue
        prefix, separator, _ = str(col).partition(' ')
        members = replicate_columns.setdefault(prefix, [])
        if separator:
            members.append(col)

    combined_data = {'Elapsed': elapsed_time}
    for prefix, columns in replicate_columns.items():
        if not columns:
            continue
        cell_line_data = df[columns]
        combined_data[f"{prefix} Mean"] = cell_line_data.mean(axis=1)
        combined_data[f"{prefix} STD DEV"] = cell_line_data.std(axis=1)
        combined_data[f"{prefix} COUNTS"] = (
            [f"{prefix} - {cell_line_data.shape[1]}"] * len(elapsed_time)
        )

    combined_df = pd.DataFrame(combined_data)
    combined_df.to_excel(output_file, index=False)


def _fit_logistic_growth(elapsed, values):
    """Fit ``logistic_function`` to one curve; return ``(L, k, x0)`` or NaNs."""
    no_fit = (np.nan, np.nan, np.nan)
    if len(values) == 0:
        return no_fit

    y_max, y_min = np.max(values), np.min(values)
    x_max, x_min = np.max(elapsed), np.min(elapsed)
    # A flat curve or a zero-length time span gives no usable starting point
    # (the initial slope guess would divide by zero), so report "no fit".
    if y_max == y_min or x_max == x_min:
        return no_fit

    # Start from the observed maximum, a slope scaled to the time span, and
    # the centre of the time axis as the midpoint.
    initial_guess = [y_max, 4 / (x_max - x_min), (x_max + x_min) / 2]
    try:
        popt, _ = curve_fit(logistic_function, elapsed, values, p0=initial_guess)
    except (RuntimeError, ValueError, TypeError, ArithmeticError):
        # curve_fit signals non-convergence, bad input and too few points
        # with these types; such a column is reported as NaN.
        return no_fit
    return tuple(popt)


def calculate_growth_rates_and_doubling_times(data_file, output_file):
    """Fit a logistic curve to every ``' Mean'`` column and write the results.

    Args:
        data_file: Excel workbook with an ``'Elapsed'`` column and one or
            more columns whose names end in ``' Mean'``.
        output_file: Destination workbook for the per-column fit results.

    Returns:
        None. One row per ``' Mean'`` column is written to ``output_file``
        with the fitted growth rate ``k``, the doubling time ``ln(2) / k``,
        the carrying capacity ``L``, the midpoint ``x0`` and the time range
        covered. Columns that cannot be fitted get NaN for the fit values.
    """
    data = pd.read_excel(data_file)
    elapsed_time = data['Elapsed'].values
    columns_to_calculate = [col for col in data.columns if col.endswith(' Mean')]
    growth_rates_and_doubling_times = []

    for column in columns_to_calculate:
        par_mean = data[column].values
        # Keep only samples where both the time stamp and the mean are present.
        usable = ~(pd.isna(par_mean) | pd.isna(elapsed_time))
        valid_elapsed = elapsed_time[usable]
        valid_par_mean = par_mean[usable]

        L_opt, k_opt, x0_opt = _fit_logistic_growth(valid_elapsed, valid_par_mean)
        growth_rate = k_opt
        # NaN growth rates propagate to a NaN doubling time.
        doubling_time = math.log(2) / growth_rate

        growth_rates_and_doubling_times.append({
            'Column': column.replace(' Mean', ''),
            'Growth Rate': growth_rate,
            'Doubling Time': doubling_time,
            'Max Carrying Capacity (L)': L_opt,
            'Midpoint (x0)': x0_opt,
            'Time Range': f"0 to {np.max(valid_elapsed):.2f} hours" if len(valid_elapsed) > 0 else "N/A"
        })

    growth_rate_and_doubling_time_df = pd.DataFrame(growth_rates_and_doubling_times)
    growth_rate_and_doubling_time_df.to_excel(output_file, index=False)

def plot_smooth_growth_curve(input_file, output_image):
    """Plot every ``' Mean'`` column as a smoothed curve with error bars.

    Args:
        input_file: Excel workbook with an ``'Elapsed'`` column, columns
            ending in ``' Mean'`` and, optionally, matching ``' STD DEV'``
            columns used for the error bars.
        output_image: Path of the PNG file to write.

    Returns:
        None. The figure is saved to ``output_image`` and then closed.
    """
    df = pd.read_excel(input_file)
    elapsed_time = df['Elapsed']
    columns_to_plot = [col for col in df.columns if col.endswith(' Mean')]

    fig, ax = plt.subplots(figsize=(14, 8))
    try:
        # matplotlib.cm.get_cmap was removed in Matplotlib 3.9;
        # pyplot.get_cmap still accepts (name, lut).
        colors = plt.get_cmap('tab10', max(len(columns_to_plot), 1))

        for i, column in enumerate(columns_to_plot):
            mean_data = df[column]
            std_column = column.replace(' Mean', ' STD DEV')
            std_data = df[std_column] if std_column in df.columns else None

            # The spline cannot interpolate through missing samples.
            usable = mean_data.notna() & elapsed_time.notna()
            if not usable.any():
                continue

            label = column.replace(' Mean', '')
            color = colors(i)
            x_valid = elapsed_time[usable].to_numpy(dtype=float)
            y_valid = mean_data[usable].to_numpy(dtype=float)

            # A degree-k spline needs at least k + 1 points.
            degree = min(3, len(x_valid) - 1)
            curve_drawn = degree >= 1
            if curve_drawn:
                spline_mean = make_interp_spline(x_valid, y_valid, k=degree)
                smooth_time = np.linspace(x_valid.min(), x_valid.max(), 500)
                ax.plot(smooth_time, spline_mean(smooth_time), label=label, color=color)

            # Label the markers only when no curve carries the legend entry.
            ax.errorbar(elapsed_time, mean_data, yerr=std_data, fmt='o', capsize=2,
                        color=color, label=None if curve_drawn else label)

        ax.set_xlabel('Time (hours)')
        ax.set_ylabel('% Confluence')
        ax.set_title('Growth Curve')
        if ax.get_legend_handles_labels()[0]:
            ax.legend()
        ax.grid(True)
        ax.set_xticks(np.arange(0, 141, 24))
        fig.savefig(output_image, format='png')
    finally:
        # Release the figure even if reading, fitting or saving fails.
        plt.close(fig)

def process_files(file_list, output_dir):
    """Run the full analysis pipeline on each workbook in ``file_list``.

    For every input workbook three artefacts are written to ``output_dir``,
    each named after the input file's base name: a mean/std-dev/count
    summary, a growth-rate and doubling-time table, and a growth-curve PNG.

    Args:
        file_list: Iterable of paths to the input workbooks.
        output_dir: Directory for the results; created if it does not exist.

    Returns:
        None.
    """
    os.makedirs(output_dir, exist_ok=True)

    for workbook in file_list:
        stem, _ = os.path.splitext(os.path.basename(workbook))

        summary_xlsx, rates_xlsx, curve_png = (
            os.path.join(output_dir, name)
            for name in (
                f"{stem}_mean_stdev_counts.xlsx",
                f"{stem}_GR_DT.xlsx",
                f"{stem}_growth_curve.png",
            )
        )

        # The summary workbook is the input of both later steps.
        calculate_mean_std_dev_and_counts(workbook, summary_xlsx)
        calculate_growth_rates_and_doubling_times(summary_xlsx, rates_xlsx)
        plot_smooth_growth_curve(summary_xlsx, curve_png)

def main():
    """Process the configured input workbooks into the ``output`` directory."""
    # Paths of the workbooks to analyse
    workbooks = [
    ]
    destination = "output"

    process_files(workbooks, destination)

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()