In [10]:
# Step 1: Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Step 2: Install Required Python and R Dependencies
# Install Python dependencies
!pip install anndata requests rpy2

# Install R dependencies
# Set up R in Colab
!apt-get install r-base

# Install necessary R packages
!R -e 'install.packages("remotes", repos="https://cloud.r-project.org")'
!R -e 'remotes::install_github("satijalab/azimuth")'

!R -e 'library(azimuth)'

# Step 3: Load a Subset of the expr.h5ad File
import anndata
import numpy as np

# Path to the h5ad file in Google Drive
file_path = '/content/drive/My Drive/expr.h5ad'

# Load the h5ad file
adata = anndata.read_h5ad(file_path)

# Randomly keep at most 1000 cells (to save time). The sample size is capped
# at adata.n_obs because np.random.choice(..., replace=False) raises
# ValueError when asked for more items than the population holds.
np.random.seed(42)  # For reproducibility
subset_size = min(1000, adata.n_obs)
subset_indices = np.random.choice(adata.n_obs, size=subset_size, replace=False)
adata_subset = adata[subset_indices, :]

# Save the subset to a new h5ad file in the current working directory
adata_subset.write("subset_expr.h5ad")

# Step 4: Run Azimuth Cell Type Annotation in R
# Write the R script to a file
r_script = """
library(azimuth)
library(Seurat)

# Load the subset h5ad file
adata_subset <- ReadH5AD("subset_expr.h5ad")

# Run Azimuth on the subset
annotated_adata <- RunAzimuth(adata_subset, reference = "pbmc")

# Save the annotated h5ad file
WriteH5AD(annotated_adata, "annotated_subset_expr.h5ad")
"""

# Save the R script to a file
with open("run_azimuth.R", "w") as f:
    f.write(r_script)

# Run the R script
!Rscript run_azimuth.R

# Step 5: Read the Annotated h5ad File in Python
import anndata

# Load the annotated h5ad file
annotated_adata = anndata.read_h5ad("annotated_subset_expr.h5ad")

# Display the annotated data
print(annotated_adata)

# Step 6: Compute the Number of Cells per Cell Type
import pandas as pd

# Azimuth's PBMC reference stores its labels per annotation level
# (predicted.celltype.l1 / .l2 / .l3) rather than in a single
# 'predicted.celltype' column, so use the first candidate that is present.
label_candidates = [
    'predicted.celltype',
    'predicted.celltype.l2',
    'predicted.celltype.l1',
    'predicted.celltype.l3',
]
label_column = next(
    (name for name in label_candidates if name in annotated_adata.obs.columns),
    None,
)
if label_column is None:
    raise KeyError(
        f"None of {label_candidates} found in obs; "
        f"available columns: {list(annotated_adata.obs.columns)}"
    )

# Extract cell type annotations
cell_types = annotated_adata.obs[label_column]

# Count the number of cells per cell type
cell_type_counts = cell_types.value_counts()

# Display the counts
print("Number of cells per cell type:")
print(cell_type_counts)

# Step 7: Visualize the Distribution of Cells per Cell Type
import matplotlib.pyplot as plt
import seaborn as sns

# White-grid seaborn theme for the figure below
sns.set(style="whitegrid")

# One bar per cell type; bar height is the number of cells with that label
plt.figure(figsize=(10, 6))
bar_axes = sns.barplot(
    x=cell_type_counts.index,
    y=cell_type_counts.values,
    palette="viridis",
)
bar_axes.set_title('Distribution of Cells per Cell Type (Subset)')
bar_axes.set_xlabel('Cell Type')
bar_axes.set_ylabel('Number of Cells')
# Tilt the category labels so long cell type names do not overlap
for tick_label in bar_axes.get_xticklabels():
    tick_label.set_rotation(45)
plt.show()

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
r-base is already the newest version (4.4.2-1.2204.0).
0 upgraded, 0 newly installed, 0 to remove and 19 not upgraded.

R version 4.4.2 (2024-10-31) -- "Pile of Leaves"
Copyright (C) 2024 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

  Natural language support but running in an English locale

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to hel

FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = 'annotated_subset_expr.h5ad', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [14]:
# ===========================
# 🚀 STEP 1: Mount Google Drive
# ===========================
from google.colab import drive
drive.mount('/content/drive')

# ===========================
# 🚀 STEP 2: Install Dependencies
# ===========================

# Install Python dependencies
!pip install anndata requests rpy2

# Install R and required packages
!apt-get install -y r-base
!R -e 'install.packages("remotes", repos="https://cloud.r-project.org")'
!R -e 'remotes::install_github("satijalab/azimuth")'
!R -e 'install.packages("Seurat", repos="https://cloud.r-project.org")'

# ===========================
# 🚀 STEP 3: Load & Subset Data
# ===========================

import anndata
import numpy as np

# Define file path (update the path if needed)
file_path = '/content/drive/My Drive/expr.h5ad'

# Load the h5ad file
adata = anndata.read_h5ad(file_path)

# Randomly keep at most 1000 cells (to save time). The sample size is capped
# at adata.n_obs because np.random.choice(..., replace=False) raises
# ValueError when asked for more items than the population holds.
np.random.seed(42)  # fixed seed so the same cells are picked on every run
subset_size = min(1000, adata.n_obs)
subset_indices = np.random.choice(adata.n_obs, size=subset_size, replace=False)
adata_subset = adata[subset_indices, :]

# Save the subset in the current working directory
adata_subset.write("subset_expr.h5ad")

# ===========================
# 🚀 STEP 4: Run Azimuth in R
# ===========================

r_script = """
library(Seurat)
library(azimuth)

# Load subset
adata_subset <- ReadH5AD("subset_expr.h5ad")

# Run Azimuth annotation
annotated_adata <- RunAzimuth(adata_subset, reference = "pbmc")

# Print available metadata columns (for debugging)
print(colnames(annotated_adata@meta.data))

# Save the annotated dataset
WriteH5AD(annotated_adata, "annotated_subset_expr.h5ad")
"""

# Save the R script
with open("run_azimuth.R", "w") as f:
    f.write(r_script)

# Run the R script
!Rscript run_azimuth.R

# ===========================
# 🚀 STEP 5: Read Annotated Data
# ===========================

import anndata

# Load the annotated file
annotated_adata = anndata.read_h5ad("annotated_subset_expr.h5ad")

# Print available metadata columns
print("Metadata columns:", annotated_adata.obs.columns)

# ===========================
# 🚀 STEP 6: Compute & Visualize Cell Type Counts
# ===========================

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Azimuth's PBMC reference stores its labels per annotation level
# (predicted.celltype.l1 / .l2 / .l3) rather than in a single
# 'predicted.celltype' column, so use the first candidate that is present.
label_candidates = [
    'predicted.celltype',
    'predicted.celltype.l2',
    'predicted.celltype.l1',
    'predicted.celltype.l3',
]
label_column = next(
    (name for name in label_candidates if name in annotated_adata.obs.columns),
    None,
)

if label_column is not None:
    cell_types = annotated_adata.obs[label_column]

    # Count the number of cells per type
    cell_type_counts = cell_types.value_counts()

    # Display the counts
    print("Number of cells per cell type:")
    print(cell_type_counts)

    # Visualization: one bar per cell type, height = number of cells
    sns.set(style="whitegrid")
    plt.figure(figsize=(10, 6))
    sns.barplot(x=cell_type_counts.index, y=cell_type_counts.values, palette="viridis")
    plt.title('Distribution of Cells per Cell Type (Subset)')
    plt.xlabel('Cell Type')
    plt.ylabel('Number of Cells')
    plt.xticks(rotation=45)
    plt.show()
else:
    raise KeyError(
        f"❌ None of {label_candidates} found in metadata. Check the Azimuth output."
    )


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
r-base is already the newest version (4.4.2-1.2204.0).
0 upgraded, 0 newly installed, 0 to remove and 19 not upgraded.

R version 4.4.2 (2024-10-31) -- "Pile of Leaves"
Copyright (C) 2024 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

  Natural language support but running in an English locale

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to hel

FileNotFoundError: [Errno 2] Unable to synchronously open file (unable to open file: name = 'annotated_subset_expr.h5ad', errno = 2, error message = 'No such file or directory', flags = 0, o_flags = 0)

In [16]:
# Install R dependencies properly
!R -e "install.packages(c('remotes', 'devtools'), repos='https://cloud.r-project.org')"
!R -e "devtools::install_github('satijalab/azimuth')"
!R -e "install.packages(c('Seurat', 'SeuratObject', 'BiocManager'), repos='https://cloud.r-project.org')"
!R -e "BiocManager::install('SeuratDisk')"

# Verify Azimuth installation
!R -e "if (!requireNamespace('azimuth', quietly = TRUE)) stop('Azimuth failed to install!')"

# Run R script to annotate the dataset using Azimuth
r_script = """
library(Seurat)
library(SeuratDisk)
library(azimuth)

# Check if Azimuth loaded
if (!requireNamespace("azimuth", quietly = TRUE)) {
    stop("❌ Azimuth failed to load. Check installation.")
}

# Load Seurat object
seurat_obj <- readRDS("seurat_input.rds")  # Ensure this file exists

# Run Azimuth annotation
annotated_obj <- RunAzimuth(seurat_obj, reference = "pbmc")

# Save output as H5AD
SeuratDisk::SaveH5Seurat(annotated_obj, filename = "annotated_subset_expr.h5Seurat")
SeuratDisk::Convert("annotated_subset_expr.h5Seurat", dest = "h5ad")
"""

with open("script.R", "w") as f:
    f.write(r_script)

!Rscript script.R

# Verify if annotation file was created; `!Rscript` does not raise on failure,
# so this is where a failed R run is detected.
import os
if not os.path.exists("annotated_subset_expr.h5ad"):
    raise FileNotFoundError("❌ 'annotated_subset_expr.h5ad' was not generated. Check R script execution.")

# Load Python dependencies
import anndata
import pandas as pd

# Load annotated data
annotated_adata = anndata.read_h5ad("annotated_subset_expr.h5ad")

# Print available metadata columns
print("Metadata columns:", annotated_adata.obs.columns)

# Azimuth's PBMC reference stores its labels per annotation level
# (predicted.celltype.l1 / .l2 / .l3) rather than in a single
# 'predicted.celltype' column, so use the first candidate that is present.
label_candidates = [
    'predicted.celltype',
    'predicted.celltype.l2',
    'predicted.celltype.l1',
    'predicted.celltype.l3',
]
label_column = next(
    (name for name in label_candidates if name in annotated_adata.obs.columns),
    None,
)
if label_column is None:
    raise KeyError(
        f"❌ None of {label_candidates} found in metadata. Check the Azimuth output."
    )
cell_types = annotated_adata.obs[label_column]

# Compute cell type counts
cell_counts = cell_types.value_counts()
print("Cell Type Counts:\n", cell_counts)



R version 4.4.2 (2024-10-31) -- "Pile of Leaves"
Copyright (C) 2024 The R Foundation for Statistical Computing
Platform: x86_64-pc-linux-gnu

R is free software and comes with ABSOLUTELY NO WARRANTY.
You are welcome to redistribute it under certain conditions.
Type 'license()' or 'licence()' for distribution details.

  Natural language support but running in an English locale

R is a collaborative project with many contributors.
Type 'contributors()' for more information and
'citation()' on how to cite R or R packages in publications.

Type 'demo()' for some demos, 'help()' for on-line help, or
'help.start()' for an HTML browser interface to help.
Type 'q()' to quit R.

> install.packages(c('remotes', 'devtools'), repos='https://cloud.r-project.org')
Installing packages into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)
trying URL 'https://cloud.r-project.org/src/contrib/remotes_2.5.0.tar.gz'
Content type 'application/x-gzip' length 164496 bytes (160 KB)
downloaded 160 KB

FileNotFoundError: ❌ 'annotated_subset_expr.h5ad' was not generated. Check R script execution.

In [None]:
import os
import requests
import anndata
import subprocess
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Install dependencies (if needed)
!pip install anndata requests seaborn matplotlib rpy2

  # Replace with actual HuBMAP URL
filename = "expr.h5ad"

response = requests.get(url)
with open(filename, "wb") as f:
    f.write(response.content)
print("Downloaded the dataset successfully.")

# Step 2: Run Azimuth cell type annotation (R script)
# The R code is written to run_azimuth.R and run with Rscript. It reads
# expr.h5ad and writes annotated_cells.csv with one row per cell and a
# `cell_type` column, which is the column the next step reads.
r_script = """
library(Seurat)
library(Azimuth)

# Read the h5ad file into a Seurat object (Read10X_h5 is for 10x HDF5 files,
# not for h5ad).
data <- Azimuth::LoadFileInput("expr.h5ad")

# Annotate against the PBMC reference
annotated <- RunAzimuth(data, reference = "pbmcref")

# NOTE(review): uses the level-2 labels of the PBMC reference; switch to
# predicted.celltype.l1 / .l3 if a different granularity is wanted.
result <- data.frame(
    cell_id = colnames(annotated),
    cell_type = annotated$predicted.celltype.l2
)
write.csv(result, "annotated_cells.csv", row.names = FALSE)
"""
with open("run_azimuth.R", "w") as f:
    f.write(r_script)

# check=True raises CalledProcessError if Rscript exits with a non-zero status
subprocess.run(["Rscript", "run_azimuth.R"], check=True)
print("Azimuth annotation completed.")

# Step 3: Load the annotated data into Python
annotated_df = pd.read_csv("annotated_cells.csv")

# Step 4: Compute the number of cells per cell type
cell_type_column = annotated_df["cell_type"]
cell_counts = cell_type_column.value_counts()

# Step 5: Visualize the distribution (one bar per cell type)
plt.figure(figsize=(10, 6))
count_axes = sns.barplot(
    x=cell_counts.index,
    y=cell_counts.values,
    palette="viridis",
)
# Tilt the category labels so long cell type names do not overlap
for tick_label in count_axes.get_xticklabels():
    tick_label.set_rotation(45)
count_axes.set_xlabel("Cell Type")
count_axes.set_ylabel("Count")
count_axes.set_title("Distribution of Cells per Cell Type")
plt.show()

print("Analysis complete. Please check the GitHub repository for further documentation.")


In [None]:
# Install the required Python dependencies
!pip install anndata requests rpy2 --quiet

import os
import anndata
import subprocess
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from google.colab import drive

# Mount Google Drive to access the uploaded file
drive.mount('/content/drive')

# Path to your uploaded h5ad file in Google Drive (change this path to your file's location)
h5ad_file_path = "/content/drive/MyDrive/expr.h5ad"  # Replace with your actual path

# Stop here if the input is missing: every later step depends on it, and
# failing now gives a clearer error than the read below would.
if not os.path.exists(h5ad_file_path):
    raise FileNotFoundError(f"File not found: {h5ad_file_path}")
print(f"File found: {h5ad_file_path}")

# Read the .h5ad file using anndata
adata = anndata.read_h5ad(h5ad_file_path)

# Subset the data for faster testing (first 100 cells; fewer if the dataset is smaller)
adata_subset = adata[:100, :]  # Use fewer cells for testing

# Save the subsetted data into a temporary file for R script input
adata_subset.write("/content/drive/MyDrive/subset_adata.h5ad")

# Run the R script for Azimuth annotation (assuming 'run_azimuth_subset.R' already exists)
# stdout/stderr are captured as text so they can be echoed into the notebook.
r_script_command = ["Rscript", "/content/run_azimuth_subset.R"]  # Path to your downloaded R script
try:
    print("Running the Azimuth R script...")
    result = subprocess.run(
        r_script_command,
        check=True,
        capture_output=True,
        text=True,
    )
except subprocess.CalledProcessError as e:
    # Non-zero exit status: report it and show what R wrote to stderr
    print(f"Error occurred while running R script: {e}")
    print("R script error output:\n", e.stderr)
else:
    print("Azimuth annotation R script executed successfully.")
    print("R script output:\n", result.stdout)

# Load the annotated output produced by the R script.
# anndata reads .h5ad files; an .h5Seurat file is a different on-disk layout,
# so the .h5ad sibling of the expected output is what gets loaded.
# NOTE(review): assumes the R script (not visible in this notebook) converts
# its .h5Seurat output to .h5ad next to it, e.g. via SeuratDisk::Convert - confirm.
annotated_h5seurat_path = "/content/drive/MyDrive/annotated_output_azimuth_subset.h5Seurat"
annotated_h5ad_path = os.path.splitext(annotated_h5seurat_path)[0] + ".h5ad"

if os.path.exists(annotated_h5ad_path):
    adata_annotated = anndata.read_h5ad(annotated_h5ad_path)
    print("Annotated data loaded successfully.")
elif os.path.exists(annotated_h5seurat_path):
    print(
        "Error: 'annotated_output_azimuth_subset.h5Seurat' exists but cannot be read by anndata. "
        "Convert it to .h5ad in R with SeuratDisk::Convert(..., dest = 'h5ad')."
    )
    adata_annotated = None
else:
    print("Error: 'annotated_output_azimuth_subset.h5Seurat' not found. R script may have failed.")
    adata_annotated = None

# If annotation was successful, proceed with further analysis
if adata_annotated is not None:
    # Check available annotations in the dataset
    print("Available annotations in adata_annotated.obs:", adata_annotated.obs.columns)

    # Prefer the 'azimuth_cell_types' column; fall back to the
    # predicted.celltype.l* columns written by Azimuth's PBMC reference.
    label_candidates = [
        'azimuth_cell_types',
        'predicted.celltype.l2',
        'predicted.celltype.l1',
        'predicted.celltype.l3',
    ]
    label_column = next(
        (name for name in label_candidates if name in adata_annotated.obs.columns),
        None,
    )
    if label_column is None:
        raise KeyError("Error: 'azimuth_cell_types' annotation not found in adata_annotated.obs.")
    cell_types = adata_annotated.obs[label_column]

    # Compute the number of cells per cell type
    cell_counts = cell_types.value_counts()

    # Visualize the distribution of cell types
    plt.figure(figsize=(12, 6))
    sns.barplot(x=cell_counts.index, y=cell_counts.values, palette="viridis")
    plt.xticks(rotation=90)
    plt.xlabel("Cell Type")
    plt.ylabel("Count")
    plt.title("Distribution of Cell Types")
    plt.show()
else:
    print("Annotation failed, no further analysis performed.")


Mounted at /content/drive
File found: /content/drive/MyDrive/expr.h5ad
Running the Azimuth R script...


KeyboardInterrupt: 