In [2]:
import os
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
# Resolve the project layout relative to where the notebook is running:
# the project root is the parent of the working directory, and all
# results live in its 'results' sub-folder.
current_dir = os.getcwd()
project_root = os.path.split(current_dir)[0]
results_folder_path = os.path.join(project_root, 'results')

In [4]:
# Echo the resolved project root path as the cell output.
project_root

'C:\\Users\\ShiskaRaut\\Desktop\\Projects\\1_Clonogenic_assay_analysis_sam_multiday'

In [15]:
# Run configuration for the device/well being analysed
device_ID = "C273"
well_ID = "wellA"
seeding_density = 6000
num_microwells = 12000

# results for this run live under <results>/<device>/<well>
output_dir = os.path.join(results_folder_path, device_ID, well_ID)

# the numpy raw counts file sits directly inside that folder
npy_fpath = os.path.join(output_dir, 'raw_counts.npy')

# report whether the raw counts file is present
print("The file exists." if os.path.exists(npy_fpath) else "The file does not exist.")

The file exists.


In [3]:
# Load the raw counts array saved for this device/well.
raw_counts = np.load(npy_fpath)

# The frame below needs exactly two columns. Checking ndim first means a
# 0-D or 1-D array fails with the intended ValueError instead of an opaque
# IndexError from shape[1].
if raw_counts.ndim != 2 or raw_counts.shape[1] != 2:
    raise ValueError("The numpy array must have a shape of (n, 2).")

# First column is labelled 'day1', second 'dayn'.
raw_counts_df = pd.DataFrame(raw_counts, columns=['day1', 'dayn'])

NameError: name 'npy_fpath' is not defined

### Helper Functions

In [19]:
# remove empty well datapoints that were overcounted
def limit_dataframe_size(df, max_rows=12000):
    """Trim surplus all-zero rows so the frame has at most ``max_rows`` rows.

    Only rows where both ``day1`` and ``dayn`` are zero are candidates for
    removal; rows with any non-zero count are always kept. The rows to remove
    are sampled with a fixed seed (42), so the result is reproducible.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with ``day1`` and ``dayn`` columns. It is never modified.
    max_rows : int, default 12000
        Maximum number of rows allowed in the result.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh ``0..n-1`` index.

    Raises
    ------
    ValueError
        If more rows must be removed than there are all-zero rows.
    """
    # Identify rows where both 'day1' and 'dayn' are zero
    zero_rows = df[(df['day1'] == 0) & (df['dayn'] == 0)]

    # Calculate the number of rows to delete
    rows_to_delete = len(df) - max_rows

    if rows_to_delete <= 0:
        # Nothing to trim. reset_index without inplace returns a new frame,
        # so the caller's frame keeps its original index.
        return df.reset_index(drop=True)

    if rows_to_delete > len(zero_rows):
        # Fail with an explicit message rather than the generic sampling
        # error pandas would raise for an oversized sample.
        raise ValueError(
            f"Cannot reduce the frame to {max_rows} rows: {rows_to_delete} rows "
            f"must be removed but only {len(zero_rows)} all-zero rows are available."
        )

    # Randomly (but reproducibly) choose which all-zero rows to discard
    rows_to_remove = zero_rows.sample(n=rows_to_delete, random_state=42)

    # Rows are dropped by index label; assumes the index labels are unique.
    return df.drop(rows_to_remove.index).reset_index(drop=True)


In [20]:
# Cap the frame at the configured number of microwells. Passing
# num_microwells explicitly keeps the cap tied to the run configuration
# instead of the helper's hard-coded default.
raw_counts_df = limit_dataframe_size(raw_counts_df, max_rows=num_microwells)
raw_counts_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype
---  ------  --------------  -----
 0   day1    12000 non-null  uint8
 1   dayn    12000 non-null  uint8
dtypes: uint8(2)
memory usage: 23.6 KB


In [22]:
# Show the last 10 rows of the size-limited frame.
raw_counts_df.tail(10)

Unnamed: 0,day1,dayn
11990,0,0
11991,0,0
11992,0,0
11993,0,0
11994,0,0
11995,0,0
11996,0,0
11997,0,0
11998,0,0
11999,0,0


In [23]:
# Keep every row where at least one of 'day1' / 'dayn' is non-zero
# (i.e. discard rows where both counts are zero), then display the result.
non_zero_df = raw_counts_df[(raw_counts_df['day1'] != 0) | (raw_counts_df['dayn'] != 0)]
non_zero_df

Unnamed: 0,day1,dayn
70,1,0
71,2,0
150,0,1
151,3,0
152,0,1
...,...,...
11919,0,2
11927,0,6
11928,0,5
11932,5,8


In [24]:
# Number of rows with a non-zero count in 'day1' or 'dayn'.
len(non_zero_df)

2181

## Seeding

#### Helper Functions

In [6]:
def get_seeding_counts_df(well_raw_df, total_microwells=None):
    """Summarise how many microwells hold each day-1 cell count.

    Parameters
    ----------
    well_raw_df : pandas.DataFrame
        Frame with a ``day1`` column of per-microwell cell counts.
    total_microwells : int, optional
        Denominator for the percentage column. When omitted, the
        notebook-level ``num_microwells`` constant is used, which matches the
        previous behaviour; passing it explicitly removes the dependency on
        that global being defined before the call.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``day1`` value, sorted by ascending microwell
        count, with columns ``'day-1 number of cells per microwell'``
        (categorical), ``'day-1 microwell count'`` and ``'day 1 microwell %'``.
    """
    if total_microwells is None:
        # Backward-compatible fallback to the notebook configuration constant.
        total_microwells = num_microwells

    # count the number of wells that are empty, singles, doublets, etc.
    d1_cell_catg_count = well_raw_df['day1'].value_counts()  # 'day-1 cell category count'
    d1_cell_catg_count_df = d1_cell_catg_count.reset_index()
    # Columns are renamed positionally, so the names reset_index generates do not matter.
    d1_cell_catg_count_df.columns = ['day-1 number of cells per microwell', 'day-1 microwell count']
    d1_cell_catg_count_df = d1_cell_catg_count_df.sort_values(by='day-1 microwell count', ascending=True)
    d1_cell_catg_count_df['day 1 microwell %'] = (
        d1_cell_catg_count_df['day-1 microwell count'] / total_microwells
    ) * 100
    d1_cell_catg_count_df['day-1 number of cells per microwell'] = (
        d1_cell_catg_count_df['day-1 number of cells per microwell'].astype('category')
    )
    return d1_cell_catg_count_df.reset_index(drop=True)
    
def generate_histograms(df, ID, title,
                        cells_col='day-1 no. of cells',
                        count_col='day-1 well count',
                        pct_col='well %'):
    """Draw side-by-side bar charts of well count and well percentage.

    The left panel plots ``count_col`` and the right panel plots ``pct_col``,
    both against ``cells_col``; each bar is annotated with its value.

    Parameters
    ----------
    df : pandas.DataFrame
        Data passed to seaborn; must provide the three columns named below.
    ID : object
        Currently unused; kept so existing calls keep working.
    title : str
        Prefix for both panel titles.
    cells_col, count_col, pct_col : str, optional
        Column names for the x axis, the count panel and the percentage
        panel. The defaults are the names this function has always used.
        They differ from the columns produced by ``get_seeding_counts_df``,
        so pass that frame's column names here when plotting its output.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    fig, axs = plt.subplots(1, 2, figsize=(14, 6), tight_layout=True)

    # (y column, panel title, y-axis label, bar-label format) for each panel
    panels = [
        (count_col, f'{title} - Values', 'Well Count', '%.0f'),
        (pct_col, f'{title} - %', '% of Wells', '%.2f%%'),
    ]

    for ax, (y_col, panel_title, y_label, label_fmt) in zip(axs, panels):
        sns.barplot(y=y_col, data=df, x=cells_col, ax=ax)
        ax.set_title(panel_title, fontsize=14)
        ax.set_xlabel('Number of Cells/Microwell After Seeding', fontsize=11)
        ax.set_ylabel(y_label, fontsize=11)
        ax.grid(axis='y', linestyle='--', alpha=0.7)

        # Adding values on top of bars
        for container in ax.containers:
            ax.bar_label(container, fmt=label_fmt, label_type='edge', fontsize=10)

    # Display the plots
    plt.show()