In [66]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# Compiled activity-scan CSV for the syngap_t1 cohort; loaded once into `data`,
# which the following cells filter and aggregate.
# NOTE(review): this is an absolute, machine-specific path, so the notebook only
# runs as written on the author's machine.
syngap_path = "/Users/liufanling/Library/CloudStorage/OneDrive-Personal/1 UC DAVIS/2024 Summer/CSRA/Outputs/syngap_t1/Activity/Compiled_ActivityScan.csv"
data=pd.read_csv(syngap_path)

In [83]:
# Keep only the rows whose NeuronType column equals 'WT'; later cells work from this subset.
WT = data.loc[data['NeuronType'].eq('WT')]

WT

Unnamed: 0,Run_ID,DIV,Well,NeuronType,Time,Chip_ID,Mean_FiringRate,Mean_SpikeAmplitude,Active_area
0,1,4,1,WT,25-Mar-2024 11:10:17,M06691,1.630019,40.636041,23.121212
12,3,4,1,WT,25-Mar-2024 11:43:07,M07301,1.446821,34.255155,23.515152
24,9,7,1,WT,28-Mar-2024 09:25:28,M06691,0.75826,68.948117,34.409091
36,11,7,1,WT,28-Mar-2024 09:52:03,M07301,0.745001,59.967954,37.378788
48,17,12,1,WT,02-Apr-2024 09:35:20,M06691,1.254047,87.557557,66.075758
60,19,12,1,WT,02-Apr-2024 10:02:43,M07301,0.913039,76.82544,65.575758
72,25,15,1,WT,05-Apr-2024 10:19:43,M06691,1.115053,90.616355,62.818182
84,27,15,1,WT,05-Apr-2024 10:47:00,M07301,1.393783,85.916692,71.666667
96,33,19,1,WT,09-Apr-2024 10:19:53,M06691,1.018936,76.689648,49.954545
108,35,19,1,WT,09-Apr-2024 10:52:30,M07301,1.278888,90.431808,70.348485


In [68]:
# Distinct Chip_ID values among the WT rows, in order of first appearance.
chip_ids = pd.unique(WT['Chip_ID'])
chip_ids

array(['M06691', 'M07301'], dtype=object)

In [69]:
# Qualification rule applied to the WT rows:
#   * a "line" is one (Chip_ID, Well, DIV) group;
#   * a line is low-activity when more than half of its rows have Active_area < criteria;
#   * the dataset is rejected when more than 50% of the lines are low-activity.

criteria = 50

# Group by (Chip_ID, Well, DIV) and compute, per group, the fraction of rows with Active_area < criteria
grouped_by_run = WT.groupby(['Chip_ID','Well','DIV'])['Active_area'].apply(lambda x: (x < criteria).mean())

# Percentage of lines that are low-activity (more than half of their rows below the threshold)
percentage_low_activity_run_ids = (grouped_by_run > 0.5).mean() * 100

# Percentage of qualified lines
percentage_qualified_run_ids = 100 - percentage_low_activity_run_ids

# Determine if the dataset is qualified or not based on the criteria
if grouped_by_run.empty:
    # With no WT groups both percentages are NaN; `NaN > 50` is False, so without this
    # guard an empty dataset would be reported as qualified.
    print("Missing WT data.")
elif percentage_low_activity_run_ids > 50:
    print("The dataset is not qualified. More than 50% of lines show low activity (<50%).")
else:
    print("The dataset is qualified. Less than or equal to 50% of lines show low activity (<50%).")

# Print the percentage of qualified lines
print(f"Percentage of qualified lines: {percentage_qualified_run_ids:.2f}%")

The dataset is not qualified. More than 50% of lines show low activity (<50%).
Percentage of qualified lines: 43.75%


In [70]:
# Per-(Chip_ID, Well, DIV) fraction of WT rows with Active_area below `criteria`
# (1.0 means every row in that group is below the threshold).
grouped_by_run

Chip_ID  Well  DIV
M06691   1     4      1.0
               7      1.0
               12     0.0
               15     0.0
               19     1.0
               22     0.0
               26     1.0
               29     1.0
M07301   1     4      1.0
               7      1.0
               12     0.0
               15     0.0
               19     0.0
               22     0.0
               26     1.0
               29     1.0
Name: Active_area, dtype: float64

In [71]:
def batch_process_datasets(file_paths, criteria=50, group_by='Run_ID'):
    """
    Batch process multiple datasets to check their qualification.

    For each CSV, the rows with NeuronType == 'WT' are grouped by `group_by`. A group
    ("line") is low-activity when more than 50% of its rows have Active_area < `criteria`.
    A dataset is qualified when at most 50% of its lines are low-activity.

    Parameters:
    - file_paths (list of str, or a single str): Paths to CSV files.
    - criteria (number): Active_area threshold below which a row counts as low activity.
      Defaults to 50.
    - group_by (str or list of str): Column(s) that define one line. Defaults to 'Run_ID';
      pass ['Chip_ID', 'Well', 'DIV'] to group per chip/well/DIV instead.

    Returns:
    - dict: File paths as keys and (message, percentage_of_qualified_lines) tuples as values.
            The percentage is None when the file has no WT rows or could not be processed;
            in the latter case the message starts with "Error processing file:".
    """
    import pandas as pd

    # A bare string would otherwise be iterated character by character.
    if isinstance(file_paths, str):
        file_paths = [file_paths]

    # Function to process each file and determine qualification
    def process_single_file(file_path):
        data = pd.read_csv(file_path)
        WT = data[data['NeuronType'] == 'WT']
        if WT.empty:
            # Without WT rows the percentages are NaN, and `NaN > 50` is False, which
            # would wrongly report the dataset as qualified.
            return "Missing WT data.", None

        # Fraction of rows below the threshold within each line
        grouped_by_run = WT.groupby(group_by)['Active_area'].apply(lambda x: (x < criteria).mean())
        percentage_low_activity_run_ids = (grouped_by_run > 0.5).mean() * 100
        percentage_qualified_run_ids = 100 - percentage_low_activity_run_ids

        if percentage_low_activity_run_ids > 50:
            result_message = f"The dataset is not qualified. More than 50% of lines show low activity (<{criteria})."
        else:
            result_message = f"The dataset is qualified. Less than or equal to 50% of lines show low activity (<{criteria})."

        return result_message, percentage_qualified_run_ids

    # Dictionary to store results
    results = {}

    # Process each file and store the result; one bad file must not abort the batch,
    # so failures are recorded as an error message instead of being raised.
    for path in file_paths:
        try:
            results[path] = process_single_file(path)
        except Exception as e:
            results[path] = (f"Error processing file: {str(e)}", None)

    return results

In [72]:
# Example: run the batch check over two compiled activity scans (extend the list as needed)
file_paths = [
    '/Users/liufanling/Library/CloudStorage/OneDrive-Personal/1 UC DAVIS/2024 Summer/CSRA/Outputs/SYNGAP_T1_ALL/Activity/Compiled_ActivityScan.csv',
    '/Users/liufanling/Library/CloudStorage/OneDrive-Personal/1 UC DAVIS/2024 Summer/CSRA/Outputs/SYNGAP_T1/Activity/Compiled_ActivityScan.csv',
]
batch_results = batch_process_datasets(file_paths)

# Report each file's verdict; the percentage is omitted when the function returned None for it
for path, outcome in batch_results.items():
    message, percentage = outcome
    print(f"File: {path}")
    print(message)
    if percentage is not None:
        print(f"Percentage of qualified lines: {percentage:.2f}%")
    print("---")

File: /Users/liufanling/Library/CloudStorage/OneDrive-Personal/1 UC DAVIS/2024 Summer/CSRA/Outputs/SYNGAP_T1_ALL/Activity/Compiled_ActivityScan.csv
The dataset is not qualified. More than 50% of lines show low activity (<50).
Percentage of qualified lines: 39.29%
---
File: /Users/liufanling/Library/CloudStorage/OneDrive-Personal/1 UC DAVIS/2024 Summer/CSRA/Outputs/SYNGAP_T1/Activity/Compiled_ActivityScan.csv
The dataset is not qualified. More than 50% of lines show low activity (<50).
Percentage of qualified lines: 43.75%
---


In [73]:
def find_and_process_activity_data(base_path):
    """
    Recursively find and process 'Compiled_ActivityScan.csv' in each 'Activity' subfolder
    within the given base directory.

    For every folder that contains an 'Activity' subfolder, the WT rows of
    'Activity/Compiled_ActivityScan.csv' are grouped by (Chip_ID, Well, DIV). A group
    ("line") is low-activity when more than half of its rows have Active_area < 50, and
    the dataset is not qualified when more than 50% of the lines are low-activity.
    Direct subfolders of `base_path` without an 'Activity' folder are reported as such.

    Parameters:
    - base_path (str): The base directory to start the search from. A trailing path
      separator is accepted.

    Returns:
    - None: Results are printed directly.
    """
    import os
    import pandas as pd

    def process_single_file(file_path):
        # Returns (message, percentage_text); percentage_text is None when reading or
        # processing the file raised.
        try:
            data = pd.read_csv(file_path)
            WT = data[data['NeuronType'] == 'WT']
            if WT.empty:
                return ("Missing WT data.", "nan%")

            criteria = 50
            # Fraction of rows below the threshold within each (Chip_ID, Well, DIV) line
            grouped_by_run = WT.groupby(['Chip_ID','Well','DIV'])['Active_area'].apply(lambda x: (x < criteria).mean())
            percentage_low_activity_run_ids = (grouped_by_run > 0.5).mean() * 100
            percentage_qualified_run_ids = 100 - percentage_low_activity_run_ids

            if percentage_low_activity_run_ids > 50:
                result_message = "The dataset is not qualified. More than 50% of lines show low activity (<50%)."
            else:
                result_message = "The dataset is qualified. Less than or equal to 50% of lines show low activity (<50%)."

            return (result_message, f"{percentage_qualified_run_ids:.2f}%")
        except Exception as e:
            return (f"Error processing file: {str(e)}", None)

    for root, dirs, files in os.walk(base_path):
        # Sorting in place makes os.walk visit subfolders in a deterministic order.
        dirs.sort()
        # normpath strips a trailing separator so the folder name is never empty.
        subfolder_name = os.path.basename(os.path.normpath(root))

        if 'Activity' in dirs:
            activity_path = os.path.join(root, 'Activity')
            csv_file = os.path.join(activity_path, 'Compiled_ActivityScan.csv')

            if os.path.exists(csv_file):
                result = process_single_file(csv_file)
                print(f"{subfolder_name}: {result[0]} Percentage of qualified lines: {result[1]}")
            else:
                print(f"{subfolder_name}: Missing 'Compiled_ActivityScan.csv'")
        else:
            # Only report missing in direct subfolders of base_path. Counting separators
            # in the raw strings is off by one when base_path ends with a separator, so
            # the depth is taken from the relative path instead.
            relative = os.path.relpath(root, base_path)
            if relative != os.curdir and os.sep not in relative:
                print(f"{subfolder_name}: Missing 'Activity' folder")

In [74]:
# Updated example usage: scan every folder under the Outputs directory and print one
# verdict line per cohort folder.
base_path = "/Users/liufanling/Library/CloudStorage/OneDrive-Personal/1 UC DAVIS/2024 Summer/CSRA/Outputs"
find_and_process_activity_data(base_path)

ADNP_Therapy_T2: Missing WT data. Percentage of qualified lines: nan%
SYNGAP_T1_ALL: The dataset is not qualified. More than 50% of lines show low activity (<50%). Percentage of qualified lines: 34.38%
TEST: Missing 'Activity' folder
SYNGAP_T1: The dataset is not qualified. More than 50% of lines show low activity (<50%). Percentage of qualified lines: 43.75%
SYNGAP_Therapy_T1: The dataset is not qualified. More than 50% of lines show low activity (<50%). Percentage of qualified lines: 25.00%
SYNGAP_T2: Missing WT data. Percentage of qualified lines: nan%
TEST_2: Missing 'Activity' folder
whatever: Missing WT data. Percentage of qualified lines: nan%


In [82]:
# final function, with saving results to CSV files
def find_and_process_activity_data(base_path):
    """
    Recursively find and process 'Compiled_ActivityScan.csv' in each 'Activity' subfolder
    within the given base directory. Outputs two CSV files listing eligible and ineligible subfolders
    with their corresponding percentages of qualified lines.

    For every folder that contains an 'Activity' subfolder, the WT rows of
    'Activity/Compiled_ActivityScan.csv' are grouped by (Chip_ID, Well, DIV). A group
    ("line") is low-activity when more than half of its rows have Active_area < 50, and
    the dataset is not qualified when more than 50% of the lines are low-activity.
    Folders with a missing CSV, missing WT rows or a processing error, and direct
    subfolders of `base_path` without an 'Activity' folder, are listed as ineligible
    with the percentage text "nan".

    Parameters:
    - base_path (str): The base directory to start the search from. A trailing path
      separator is accepted.

    Returns:
    - None: Results are printed directly and saved into 'Eligible_Cohorts.csv' and
      'Ineligible_Cohorts.csv' inside `base_path`.
    """
    import os
    import pandas as pd

    eligible = []
    ineligible = []

    def process_single_file(file_path):
        # Returns (message, percentage_text, is_eligible).
        try:
            data = pd.read_csv(file_path)
            WT = data[data['NeuronType'] == 'WT']
            if WT.empty:
                return ("Missing WT data.", "nan", False)

            criteria = 50
            # Fraction of rows below the threshold within each (Chip_ID, Well, DIV) line
            grouped_by_run = WT.groupby(['Chip_ID','Well','DIV'])['Active_area'].apply(lambda x: (x < criteria).mean())
            percentage_low_activity_lines = (grouped_by_run > 0.5).mean() * 100
            percentage_qualified_lines = 100 - percentage_low_activity_lines

            if percentage_low_activity_lines > 50:
                result_message = "The dataset is not qualified. More than 50% of lines show low activity (<50%)."
                return (result_message, f"{percentage_qualified_lines:.2f}%", False)
            else:
                result_message = "The dataset is qualified. Less than or equal to 50% of lines show low activity (<50%)."
                return (result_message, f"{percentage_qualified_lines:.2f}%", True)
        except Exception as e:
            return (f"Error processing file: {str(e)}", "nan", False)

    for root, dirs, files in os.walk(base_path):
        # Sorting in place makes os.walk visit subfolders in a deterministic order, so
        # the printed report and the CSV rows are stable across runs.
        dirs.sort()
        # normpath strips a trailing separator so the folder name is never empty.
        subfolder_name = os.path.basename(os.path.normpath(root))

        if 'Activity' in dirs:
            activity_path = os.path.join(root, 'Activity')
            csv_file = os.path.join(activity_path, 'Compiled_ActivityScan.csv')

            if os.path.exists(csv_file):
                result = process_single_file(csv_file)
                print(f"{subfolder_name}: {result[0]} Percentage of qualified lines: {result[1]}")
                if result[2]:
                    eligible.append((subfolder_name, result[1]))
                else:
                    ineligible.append((subfolder_name, result[1]))
            else:
                print(f"{subfolder_name}: Missing 'Compiled_ActivityScan.csv'")
                ineligible.append((subfolder_name, "nan"))
        else:
            # Only report missing in direct subfolders of base_path. Counting separators
            # in the raw strings is off by one when base_path ends with a separator
            # (direct subfolders are skipped and nested 'Activity' folders are reported),
            # so the depth is taken from the relative path instead.
            relative = os.path.relpath(root, base_path)
            if relative != os.curdir and os.sep not in relative:
                print(f"{subfolder_name}: Missing 'Activity' folder")
                ineligible.append((subfolder_name, "nan"))

    # Save results to CSV files
    columns = ['Subfolder', 'Percentage of Qualified Lines']
    pd.DataFrame(eligible, columns=columns).to_csv(os.path.join(base_path, 'Eligible_Cohorts.csv'), index=False)
    pd.DataFrame(ineligible, columns=columns).to_csv(os.path.join(base_path, 'Ineligible_Cohorts.csv'), index=False)

In [81]:
# Example usage: scan every folder under the Outputs directory, print one verdict line per
# cohort folder and write the eligible/ineligible CSV summaries into that directory.
base_path = "/Users/liufanling/Library/CloudStorage/OneDrive-Personal/1 UC DAVIS/2024 Summer/CSRA/Outputs"
find_and_process_activity_data(base_path)

ADNP_Therapy_T2: Missing WT data. Percentage of qualified lines: nan
SYNGAP_T1_ALL: The dataset is not qualified. More than 50% of lines show low activity (<50%). Percentage of qualified lines: 34.38%
TEST: Missing 'Activity' folder
SYNGAP_T1: The dataset is not qualified. More than 50% of lines show low activity (<50%). Percentage of qualified lines: 43.75%
SYNGAP_Therapy_T1: The dataset is not qualified. More than 50% of lines show low activity (<50%). Percentage of qualified lines: 25.00%
SYNGAP_T2: Missing WT data. Percentage of qualified lines: nan
TEST_2: Missing 'Activity' folder
whatever: Missing WT data. Percentage of qualified lines: nan
