# scRNA-ML Pipeline
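Clusters the PBMC 3k single-cell RNA-seq dataset with Scanpy (filtering, normalization, PCA, neighborhood graph, Leiden clustering) and visualizes the clusters on a UMAP embedding.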

In [ ]:
# Install dependencies if not already installed
!pip install scanpy pandas numpy matplotlib seaborn scikit-learn h5py

In [ ]:
from pathlib import Path

import matplotlib.pyplot as plt
import pandas as pd
import scanpy as sc
import seaborn as sns

# Load data (10x Genomics HDF5 count matrix)
adata = sc.read_10x_h5('../data/pbmc_3k_filtered_feature_bc_matrix.h5')
# 10x matrices can list the same gene symbol more than once; de-duplicate the
# variable names so that gene-based indexing is unambiguous.
adata.var_names_make_unique()

# Preprocessing
# Drop cells with fewer than 200 detected genes and genes seen in fewer than 3 cells.
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
# Scale every cell to 10,000 total counts, then log-transform (log(1 + x)).
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
# Keep only the 2000 most variable genes; subset=True discards the rest from adata.
sc.pp.highly_variable_genes(adata, n_top_genes=2000, subset=True)

# PCA
sc.tl.pca(adata, svd_solver='arpack')

# Compute neighborhood graph (10 neighbors per cell, first 40 principal components)
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)

# UMAP
sc.tl.umap(adata)

# Clustering (labels are stored in adata.obs['leiden'])
sc.tl.leiden(adata, resolution=0.5)

# Plot UMAP colored by clusters; `save` makes scanpy also write the figure to
# disk, using the given string as the file-name suffix.
sc.pl.umap(adata, color='leiden', save='_clusters.png')

# Save results
# Create the output directory first: to_csv raises OSError when it is missing,
# which would otherwise discard the whole run at the very last step.
results_dir = Path('../results')
results_dir.mkdir(parents=True, exist_ok=True)
adata.obs.to_csv(results_dir / 'cell_clusters.csv')