In [None]:
# code written by Abhishek Sule @ Chatham University, M.S. Biology for use with Single Cell Portal by Broad Institute

import pandas as pd
import anndata
import os
from watchdog.observers import Observer
from watchdog.events import FileSystemEventHandler

# Load the AnnData file; observations (rows) are the arrays/cells and
# variables (columns) are the genes.
print("Loading data...")
adata = anndata.read_h5ad("all_cells.h5ad")
arrays = list(adata.obs.index)  # the names of the arrays
genes = list(adata.var.index)  # names of all genes in the dataset

# Create the DataFrame.
# AnnData.to_df() returns X as a DataFrame indexed by the observation names
# with the variable names as columns, and it densifies X when it is stored as
# a sparse matrix (pd.DataFrame(adata.X) only works for a dense array).
print("Creating DataFrame...")
df = adata.to_df()

# Quick sanity checks on the GZMB expression column before filtering.
_gzmb_column = df['GZMB']

# Distinct expression values observed for GZMB
print("Unique values in GZMB column:", _gzmb_column.unique())

# Storage dtype of the GZMB column
print("Data type of GZMB column:", _gzmb_column.dtype)

# First rows of the full expression table
print("Sample of the DataFrame:", df.head())

# Split the cells into two groups by their GZMB value: at or below the
# threshold ("non-GZMB") and strictly above it ("GZMB").
# NOTE(review): the printed sample shows negative values, so the matrix looks
# scaled/centred; a near-zero cutoff may not correspond to "not expressed" --
# confirm the threshold against the preprocessing of all_cells.h5ad.
print("Filtering data...")
GZMB_THRESHOLD = 1e-6
gzmb_values = df['GZMB']

# Select rows with boolean masks instead of round-tripping through a list of
# index labels: label-based .loc[...] would repeat rows if any cell name
# occurs more than once, and it needs an extra lookup per label.
# The two comparisons are kept separate so rows with NaN fall into neither
# group, exactly as before.
non_gzmb_df = df[gzmb_values <= GZMB_THRESHOLD]
gzmb_df = df[gzmb_values > GZMB_THRESHOLD]

# Names of the cells in each group
non_gzmb_cells = non_gzmb_df.index.tolist()
gzmb_cells = gzmb_df.index.tolist()

# Extract gene names.
# NOTE(review): filtering rows does not change the columns, so both lists
# contain every gene in the dataset and are identical to each other.
non_gzmb_genes = non_gzmb_df.columns.tolist()
gzmb_genes = gzmb_df.columns.tolist()

# Folder that receives every CSV produced by this script; created on demand.
output_dir = "output_files"
os.makedirs(output_dir, exist_ok=True)

# Write each filtered expression table to its own CSV file.
# NOTE(review): index=False means the cell names (the row index) are not
# written to these files.
print("Saving filtered DataFrames...")
for frame, csv_name in (
    (non_gzmb_df, "non_gzmb_cells_genes_complete.csv"),
    (gzmb_df, "gzmb_cells_genes_complete.csv"),
):
    frame.to_csv(os.path.join(output_dir, csv_name), index=False)

# Function to save gene names in fixed-size groups, one CSV file per group
def save_gene_groups(genes, group_size, prefix, directory=None):
    """Write *genes* to consecutive CSV files of at most *group_size* names.

    Each file is named ``{prefix}_group_{k}.csv`` (k starts at 1) and holds a
    single column called ``Gene``. The last file may contain fewer names.

    Args:
        genes: List of gene names to split up.
        group_size: Maximum number of gene names per file; must be >= 1.
        prefix: File-name prefix placed before ``_group_{k}.csv``.
        directory: Folder to write into. Defaults to the module-level
            ``output_dir`` when omitted. It is created if missing.

    Returns:
        The paths of the files that were written, in order. Empty when
        *genes* is empty.

    Raises:
        ValueError: If *group_size* is smaller than 1.
    """
    if group_size < 1:
        raise ValueError(f"group_size must be a positive integer, got {group_size!r}")
    if directory is None:
        # Fall back to the script-wide output folder for existing callers.
        directory = output_dir
    os.makedirs(directory, exist_ok=True)

    written = []
    starts = range(0, len(genes), group_size)
    for group_number, start in enumerate(starts, start=1):
        file_name = os.path.join(directory, f"{prefix}_group_{group_number}.csv")
        group = genes[start:start + group_size]
        pd.DataFrame(group, columns=["Gene"]).to_csv(file_name, index=False)
        print(f"Saved {file_name}")
        written.append(file_name)
    return written

# Number of gene names written to each group file
group_size = 50

# Write the gene-name lists of both cell populations as grouped CSV files.
print("Saving gene names in groups of 50...")
for gene_list, file_prefix in (
    (non_gzmb_genes, "non_gzmb_genes"),
    (gzmb_genes, "gzmb_genes"),
):
    save_gene_groups(gene_list, group_size, file_prefix)

print("Gene names for cells that do and do not produce granzyme B have been saved in groups of 50 and as complete files.")

# Define the file handler class for monitoring the folder
class FileHandler(FileSystemEventHandler):
    """Watchdog event handler that processes newly created gene-list CSVs.

    When a ``.csv`` file appears in the watched folder, its ``Gene`` column is
    read and handed to :meth:`search_database`. CSV files without that column
    (for example the full expression tables saved to the same folder) are
    skipped instead of raising ``KeyError`` inside the observer thread.
    """

    # Column a CSV must contain to be treated as a gene list.
    GENE_COLUMN = "Gene"

    def __init__(self, folder_to_watch, database_url):
        """Store the watched folder and the database URL used for searches."""
        super().__init__()
        self.folder_to_watch = folder_to_watch
        self.database_url = database_url

    def on_created(self, event):
        """Handle a creation event; only non-directory ``.csv`` paths are processed."""
        if event.is_directory:
            return
        file_path = event.src_path
        if file_path.endswith('.csv'):
            print(f"New CSV file detected: {file_path}")
            self.process_file(file_path)

    def process_file(self, file_path):
        """Read the gene names from *file_path* and search the database for them.

        The file is skipped (with a message) when it cannot be read yet, is
        empty, or has no ``Gene`` column.
        """
        print(f"Processing file: {file_path}")
        try:
            # Read only the header first: the folder also receives very wide
            # expression tables that must not be loaded in full just to find
            # out that they are not gene lists.
            columns = pd.read_csv(file_path, nrows=0).columns
            if self.GENE_COLUMN not in columns:
                print(f"Skipping {file_path}: no '{self.GENE_COLUMN}' column found.")
                return
            gene_frame = pd.read_csv(file_path, usecols=[self.GENE_COLUMN])
        except (pd.errors.EmptyDataError, OSError) as exc:
            # The creation event can arrive before the writer has finished
            # (empty file) or while the file is still locked.
            print(f"Skipping {file_path}: could not read CSV ({exc})")
            return
        gene_names = gene_frame[self.GENE_COLUMN].tolist()
        self.search_database(gene_names)

    def search_database(self, genes):
        """Placeholder search: report how many gene names would be queried."""
        # Placeholder for actual search logic
        print(f"Searching database at {self.database_url} with {len(genes)} gene names.")
        # Implement your search logic here

# Function to start monitoring the folder
def start_monitoring(folder_to_watch, database_url):
    """Watch *folder_to_watch* for new files until interrupted.

    A FileHandler is scheduled on a watchdog Observer (non-recursive) and the
    calling thread blocks until the observer thread ends or the user presses
    Ctrl+C. The observer is always stopped and joined before returning.

    Args:
        folder_to_watch: Path of the directory to monitor.
        database_url: URL passed through to the FileHandler.
    """
    event_handler = FileHandler(folder_to_watch, database_url)
    observer = Observer()
    observer.schedule(event_handler, folder_to_watch, recursive=False)
    observer.start()
    print(f"Started monitoring folder: {folder_to_watch}")
    try:
        # Wait on the observer thread with a timeout instead of spinning in
        # `while True: pass`, which keeps one CPU core fully busy. The loop
        # also ends if the observer thread stops on its own.
        while observer.is_alive():
            observer.join(timeout=1)
    except KeyboardInterrupt:
        # Ctrl+C is the intended way to end monitoring; fall through to cleanup.
        pass
    finally:
        observer.stop()
        observer.join()

if __name__ == "__main__":
    # Folder whose new CSV files trigger a database search
    folder_to_watch = "output_files"
    # Single Cell Portal study page used as the search target; the literal is
    # split into its URL components purely for readability.
    database_url = (
        "https://singlecell.broadinstitute.org/single_cell/study/SCP642/"
        "cd8-lymphocytes-are-critical-for-early-control-of-tuberculosis-in-macaques"
        "?cluster=All%20Cells%20Clustering"
        "&spatialGroups=--"
        "&annotation=General_Celltypes--group--study"
        "&subsample=all"
        "#study-visualize"
    )
    start_monitoring(folder_to_watch=folder_to_watch, database_url=database_url)


Loading data...
Creating DataFrame...
Unique values in GZMB column: [-0.4437231  -0.6130368  -0.6747464  ... -0.16763632  2.6062613
  0.9008085 ]
Data type of GZMB column: float32
Sample of the DataFrame:                               AADAT     ABCA1     ABCA3     ABCA6     ABCA8  \
Array2_28918_CAACGACACATC -0.167468  0.638589  1.007928 -0.227317 -0.065573   
Array2_28918_CAGTATGAGATC -0.167468  0.790691 -0.415276  0.098851 -0.065573   
Array2_28918_ACGTCGTCCCTT -0.167468  1.923116  0.261594 -0.227317 -0.065573   
Array2_28918_GCCAGCACATTA -0.167468  0.903512  0.122951  3.366497 -0.065573   
Array2_28918_CGAGTCTGTTAG -0.167468  1.757663  0.396551 -0.227317 -0.065573   

                              ABCB1    ABCB11     ABCC4     ABCC8     ABCC9  \
Array2_28918_CAACGACACATC -0.193275 -0.080952  0.713645 -0.059069 -0.061314   
Array2_28918_CAGTATGAGATC -0.193275 -0.080952  0.661741 -0.059069 -0.061314   
Array2_28918_ACGTCGTCCCTT  0.226489 -0.080952 -0.483003 -0.059069 -0.061314   
Arra

Exception in thread Thread-5:
Traceback (most recent call last):
  File "C:\Users\abhic\anaconda3\Lib\site-packages\pandas\core\indexes\base.py", line 3805, in get_loc
    return self._engine.get_loc(casted_key)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "index.pyx", line 167, in pandas._libs.index.IndexEngine.get_loc
  File "index.pyx", line 196, in pandas._libs.index.IndexEngine.get_loc
  File "pandas\\_libs\\hashtable_class_helper.pxi", line 7081, in pandas._libs.hashtable.PyObjectHashTable.get_item
  File "pandas\\_libs\\hashtable_class_helper.pxi", line 7089, in pandas._libs.hashtable.PyObjectHashTable.get_item
KeyError: 'Gene'

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "C:\Users\abhic\anaconda3\Lib\threading.py", line 1073, in _bootstrap_inner
    self.run()
  File "C:\Users\abhic\anaconda3\Lib\site-packages\watchdog\observers\api.py", line 223, in run
    self.dispatch_events(self.event_queue)
  File "