In [1]:
import scanpy as sc
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from scipy.stats import spearmanr
import os

# Create output directory if it doesn't exist
output_dir = "output"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Load the .h5ad file from the study
adata = sc.read_h5ad("all_cells.h5ad")

# Subset to CD8+ lymphocytes if they are labeled
adata_cd8 = adata[adata.obs['cell_type'] == 'CD8+ lymphocyte']

# Extract relevant data
data_clustering = pd.DataFrame({
    'GZMB': adata_cd8[:, 'GZMB'].X.flatten(), 
    'GNLY': adata_cd8[:, 'GNLY'].X.flatten()
})
data_clustering['CFU/granuloma'] = adata_cd8.obs['CFU/granuloma'].values

# Apply KMeans clustering
kmeans = KMeans(n_clusters=3, random_state=42)
data_clustering['Cluster'] = kmeans.fit_predict(data_clustering[['GZMB', 'GNLY']])

# Visualize GZMB and GNLY expression within clusters
gzm_gnly_scatter = sns.lmplot(x='GZMB', y='GNLY', data=data_clustering, hue='Cluster', palette='coolwarm', fit_reg=False, height=6, aspect=1.5)
plt.title('GZMB vs GNLY within Clusters')
plt.xlabel('GZMB Expression')
plt.ylabel('GNLY Expression')
plt.savefig(os.path.join(output_dir, 'GZMB_vs_GNLY_Clusters.png'))
plt.show()

# Visualize GZMB expression and CFU/granuloma within clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(x=data_clustering['GZMB'], y=data_clustering['CFU/granuloma'], hue=data_clustering['Cluster'], palette='coolwarm', alpha=0.6)
plt.title('GZMB Expression vs CFU/granuloma within Clusters')
plt.xlabel('GZMB Expression')
plt.ylabel('CFU/granuloma')
plt.legend(title='Cluster')
plt.grid(True)
plt.savefig(os.path.join(output_dir, 'GZMB_vs_CFU_Clusters.png'))
plt.show()

# Visualize GNLY expression and CFU/granuloma within clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(x=data_clustering['GNLY'], y=data_clustering['CFU/granuloma'], hue=data_clustering['Cluster'], palette='coolwarm', alpha=0.6)
plt.title('GNLY Expression vs CFU/granuloma within Clusters')
plt.xlabel('GNLY Expression')
plt.ylabel('CFU/granuloma')
plt.legend(title='Cluster')
plt.grid(True)
plt.savefig(os.path.join(output_dir, 'GNLY_vs_CFU_Clusters.png'))
plt.show()

# Compare GZMB and GNLY Expression within Clusters
for cluster in data_clustering['Cluster'].unique():
    cluster_data = data_clustering[data_clustering['Cluster'] == cluster]
    for gene in ['GZMB', 'GNLY']:
        correlation, p_value = spearmanr(cluster_data[gene], cluster_data['CFU/granuloma'])
        print(f"Cluster {cluster}:")
        print(f"  Correlation between {gene} and CFU/granuloma: {correlation:.2f} (p-value: {p_value:.2e})")


MemoryError: Unable to allocate 4.65 GiB for an array with shape (41976, 29766) and data type float32

In [3]:
import scanpy as sc
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from scipy.stats import spearmanr
import os

# Create output directory if it doesn't exist
output_dir = "output"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Load the .h5ad file from the study in backed mode
adata = sc.read_h5ad("all_cells.h5ad", backed='r')

# Subset to CD8+ lymphocytes if they are labeled
adata_cd8 = adata[adata.obs['cell_type'] == 'CD8+ lymphocyte']

# Extract relevant data in backed mode
data_clustering = pd.DataFrame({
    'GZMB': adata_cd8[:, 'GZMB'].X.toarray().flatten(), 
    'GNLY': adata_cd8[:, 'GNLY'].X.toarray().flatten()
})
data_clustering['CFU/granuloma'] = adata_cd8.obs['CFU/granuloma'].values

# Apply KMeans clustering
kmeans = KMeans(n_clusters=3, random_state=42)
data_clustering['Cluster'] = kmeans.fit_predict(data_clustering[['GZMB', 'GNLY']])

# Visualize GZMB and GNLY expression within clusters
gzm_gnly_scatter = sns.lmplot(x='GZMB', y='GNLY', data=data_clustering, hue='Cluster', palette='coolwarm', fit_reg=False, height=6, aspect=1.5)
plt.title('GZMB vs GNLY within Clusters')
plt.xlabel('GZMB Expression')
plt.ylabel('GNLY Expression')
plt.savefig(os.path.join(output_dir, 'GZMB_vs_GNLY_Clusters.png'))
plt.show()

# Visualize GZMB expression and CFU/granuloma within clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(x=data_clustering['GZMB'], y=data_clustering['CFU/granuloma'], hue=data_clustering['Cluster'], palette='coolwarm', alpha=0.6)
plt.title('GZMB Expression vs CFU/granuloma within Clusters')
plt.xlabel('GZMB Expression')
plt.ylabel('CFU/granuloma')
plt.legend(title='Cluster')
plt.grid(True)
plt.savefig(os.path.join(output_dir, 'GZMB_vs_CFU_Clusters.png'))
plt.show()

# Visualize GNLY expression and CFU/granuloma within clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(x=data_clustering['GNLY'], y=data_clustering['CFU/granuloma'], hue=data_clustering['Cluster'], palette='coolwarm', alpha=0.6)
plt.title('GNLY Expression vs CFU/granuloma within Clusters')
plt.xlabel('GNLY Expression')
plt.ylabel('CFU/granuloma')
plt.legend(title='Cluster')
plt.grid(True)
plt.savefig(os.path.join(output_dir, 'GNLY_vs_CFU_Clusters.png'))
plt.show()

# Compare GZMB and GNLY Expression within Clusters
for cluster in data_clustering['Cluster'].unique():
    cluster_data = data_clustering[data_clustering['Cluster'] == cluster]
    for gene in ['GZMB', 'GNLY']:
        correlation, p_value = spearmanr(cluster_data[gene], cluster_data['CFU/granuloma'])
        print(f"Cluster {cluster}:")
        print(f"  Correlation between {gene} and CFU/granuloma: {correlation:.2f} (p-value: {p_value:.2e})")


KeyError: 'cell_type'

In [5]:
import scanpy as sc

# Load the .h5ad file in backed mode
adata = sc.read_h5ad("all_cells.h5ad", backed='r')

# Print the available observation columns
print("Obs columns:", adata.obs.columns)



Obs columns: Index(['sample', 'batch', 'doublet_scores', 'predicted_doublet',
       'n_genes_by_counts', 'log1p_n_genes_by_counts', 'total_counts',
       'log1p_total_counts', 'percent_mito', 'M.Number', 'treatment',
       'infection dose', 'Tissue name', 'CFU/granuloma', 'total thoracic CFU',
       'lung region', 'Gran State', 'n_counts', 'n_genes',
       'General Celltypes'],
      dtype='object')


In [7]:
import scanpy as sc
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from scipy.stats import spearmanr
import os

# Create output directory if it doesn't exist
output_dir = "output"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Load the .h5ad file from the study in backed mode
adata = sc.read_h5ad("all_cells.h5ad", backed='r')

# Subset to CD8+ lymphocytes if they are labeled
adata_cd8 = adata[adata.obs['General Celltypes'] == 'CD8+ lymphocyte']

# Extract relevant data in backed mode
data_clustering = pd.DataFrame({
    'GZMB': adata_cd8[:, 'GZMB'].X.flatten(), 
    'GNLY': adata_cd8[:, 'GNLY'].X.flatten()
})
data_clustering['CFU/granuloma'] = adata_cd8.obs['CFU/granuloma'].values

# Apply KMeans clustering
kmeans = KMeans(n_clusters=3, random_state=42)
data_clustering['Cluster'] = kmeans.fit_predict(data_clustering[['GZMB', 'GNLY']])

# Visualize GZMB and GNLY expression within clusters
gzm_gnly_scatter = sns.lmplot(x='GZMB', y='GNLY', data=data_clustering, hue='Cluster', palette='coolwarm', fit_reg=False, height=6, aspect=1.5)
plt.title('GZMB vs GNLY within Clusters')
plt.xlabel('GZMB Expression')
plt.ylabel('GNLY Expression')
plt.savefig(os.path.join(output_dir, 'GZMB_vs_GNLY_Clusters.png'))
plt.show()

# Visualize GZMB expression and CFU/granuloma within clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(x=data_clustering['GZMB'], y=data_clustering['CFU/granuloma'], hue=data_clustering['Cluster'], palette='coolwarm', alpha=0.6)
plt.title('GZMB Expression vs CFU/granuloma within Clusters')
plt.xlabel('GZMB Expression')
plt.ylabel('CFU/granuloma')
plt.legend(title='Cluster')
plt.grid(True)
plt.savefig(os.path.join(output_dir, 'GZMB_vs_CFU_Clusters.png'))
plt.show()

# Visualize GNLY expression and CFU/granuloma within clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(x=data_clustering['GNLY'], y=data_clustering['CFU/granuloma'], hue=data_clustering['Cluster'], palette='coolwarm', alpha=0.6)
plt.title('GNLY Expression vs CFU/granuloma within Clusters')
plt.xlabel('GNLY Expression')
plt.ylabel('CFU/granuloma')
plt.legend(title='Cluster')
plt.grid(True)
plt.savefig(os.path.join(output_dir, 'GNLY_vs_CFU_Clusters.png'))
plt.show()

# Compare GZMB and GNLY Expression within Clusters
for cluster in data_clustering['Cluster'].unique():
    cluster_data = data_clustering[data_clustering['Cluster'] == cluster]
    for gene in ['GZMB', 'GNLY']:
        correlation, p_value = spearmanr(cluster_data[gene], cluster_data['CFU/granuloma'])
        print(f"Cluster {cluster}:")
        print(f"  Correlation between {gene} and CFU/granuloma: {correlation:.2f} (p-value: {p_value:.2e})")


ValueError: Currently, you cannot index repeatedly into a backed AnnData, that is, you cannot make a view of a view.

In [9]:
import scanpy as sc
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from scipy.stats import spearmanr
import os

# Create output directory if it doesn't exist
output_dir = "output"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Load the .h5ad file from the study in backed mode
adata = sc.read_h5ad("all_cells.h5ad", backed='r')

# Extract relevant observation indices
cd8_indices = adata.obs.index[adata.obs['General Celltypes'] == 'CD8+ lymphocyte']
adata_cd8 = adata[cd8_indices, :].to_memory()

# Extract relevant data in backed mode
data_clustering = pd.DataFrame({
    'GZMB': adata_cd8[:, 'GZMB'].X.flatten(), 
    'GNLY': adata_cd8[:, 'GNLY'].X.flatten()
})
data_clustering['CFU/granuloma'] = adata_cd8.obs['CFU/granuloma'].values

# Apply KMeans clustering
kmeans = KMeans(n_clusters=3, random_state=42)
data_clustering['Cluster'] = kmeans.fit_predict(data_clustering[['GZMB', 'GNLY']])

# Visualize GZMB and GNLY expression within clusters
gzm_gnly_scatter = sns.lmplot(x='GZMB', y='GNLY', data=data_clustering, hue='Cluster', palette='coolwarm', fit_reg=False, height=6, aspect=1.5)
plt.title('GZMB vs GNLY within Clusters')
plt.xlabel('GZMB Expression')
plt.ylabel('GNLY Expression')
plt.savefig(os.path.join(output_dir, 'GZMB_vs_GNLY_Clusters.png'))
plt.show()

# Visualize GZMB expression and CFU/granuloma within clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(x=data_clustering['GZMB'], y=data_clustering['CFU/granuloma'], hue=data_clustering['Cluster'], palette='coolwarm', alpha=0.6)
plt.title('GZMB Expression vs CFU/granuloma within Clusters')
plt.xlabel('GZMB Expression')
plt.ylabel('CFU/granuloma')
plt.legend(title='Cluster')
plt.grid(True)
plt.savefig(os.path.join(output_dir, 'GZMB_vs_CFU_Clusters.png'))
plt.show()

# Visualize GNLY expression and CFU/granuloma within clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(x=data_clustering['GNLY'], y=data_clustering['CFU/granuloma'], hue=data_clustering['Cluster'], palette='coolwarm', alpha=0.6)
plt.title('GNLY Expression vs CFU/granuloma within Clusters')
plt.xlabel('GNLY Expression')
plt.ylabel('CFU/granuloma')
plt.legend(title='Cluster')
plt.grid(True)
plt.savefig(os.path.join(output_dir, 'GNLY_vs_CFU_Clusters.png'))
plt.show()

# Compare GZMB and GNLY Expression within Clusters
for cluster in data_clustering['Cluster'].unique():
    cluster_data = data_clustering[data_clustering['Cluster'] == cluster]
    for gene in ['GZMB', 'GNLY']:
        correlation, p_value = spearmanr(cluster_data[gene], cluster_data['CFU/granuloma'])
        print(f"Cluster {cluster}:")
        print(f"  Correlation between {gene} and CFU/granuloma: {correlation:.2f} (p-value: {p_value:.2e})")


ValueError: Found array with 0 sample(s) (shape=(0, 2)) while a minimum of 1 is required by KMeans.

In [11]:
import scanpy as sc
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from scipy.stats import spearmanr
import os

# Create output directory if it doesn't exist
output_dir = "output"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Load the .h5ad file from the study in backed mode
adata = sc.read_h5ad("all_cells.h5ad", backed='r')

# Subset to CD8 expressing cells
adata_cd8 = adata[adata.obs['General Celltypes'] == 'CD8']

# Extract relevant data in backed mode
data_clustering = pd.DataFrame({
    'GZMB': adata_cd8[:, 'GZMB'].X.flatten(), 
    'GNLY': adata_cd8[:, 'GNLY'].X.flatten()
})
data_clustering['CFU/granuloma'] = adata_cd8.obs['CFU/granuloma'].values

# Apply KMeans clustering
kmeans = KMeans(n_clusters=3, random_state=42)
data_clustering['Cluster'] = kmeans.fit_predict(data_clustering[['GZMB', 'GNLY']])

# Visualize GZMB and GNLY expression within clusters
gzm_gnly_scatter = sns.lmplot(x='GZMB', y='GNLY', data=data_clustering, hue='Cluster', palette='coolwarm', fit_reg=False, height=6, aspect=1.5)
plt.title('GZMB vs GNLY within Clusters')
plt.xlabel('GZMB Expression')
plt.ylabel('GNLY Expression')
plt.savefig(os.path.join(output_dir, 'GZMB_vs_GNLY_Clusters.png'))
plt.show()

# Visualize GZMB expression and CFU/granuloma within clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(x=data_clustering['GZMB'], y=data_clustering['CFU/granuloma'], hue=data_clustering['Cluster'], palette='coolwarm', alpha=0.6)
plt.title('GZMB Expression vs CFU/granuloma within Clusters')
plt.xlabel('GZMB Expression')
plt.ylabel('CFU/granuloma')
plt.legend(title='Cluster')
plt.grid(True)
plt.savefig(os.path.join(output_dir, 'GZMB_vs_CFU_Clusters.png'))
plt.show()

# Visualize GNLY expression and CFU/granuloma within clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(x=data_clustering['GNLY'], y=data_clustering['CFU/granuloma'], hue=data_clustering['Cluster'], palette='coolwarm', alpha=0.6)
plt.title('GNLY Expression vs CFU/granuloma within Clusters')
plt.xlabel('GNLY Expression')
plt.ylabel('CFU/granuloma')
plt.legend(title='Cluster')
plt.grid(True)
plt.savefig(os.path.join(output_dir, 'GNLY_vs_CFU_Clusters.png'))
plt.show()

# Compare GZMB and GNLY Expression within Clusters
for cluster in data_clustering['Cluster'].unique():
    cluster_data = data_clustering[data_clustering['Cluster'] == cluster]
    for gene in ['GZMB', 'GNLY']:
        correlation, p_value = spearmanr(cluster_data[gene], cluster_data['CFU/granuloma'])
        print(f"Cluster {cluster}:")
        print(f"  Correlation between {gene} and CFU/granuloma: {correlation:.2f} (p-value: {p_value:.2e})")


ValueError: Currently, you cannot index repeatedly into a backed AnnData, that is, you cannot make a view of a view.

In [13]:
import scanpy as sc
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from scipy.stats import spearmanr
import os

# Create output directory if it doesn't exist
output_dir = "output"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Load the .h5ad file from the study in backed mode
adata = sc.read_h5ad("all_cells.h5ad", backed='r')

# Subset to CD8 expressing cells
cd8_indices = adata.obs.index[adata.obs['General Celltypes'] == 'CD8+ T cells']

# Bring the subset to memory
adata_cd8 = adata[cd8_indices, :].copy()

# Extract relevant data
data_clustering = pd.DataFrame({
    'GZMB': adata_cd8[:, 'GZMB'].X.toarray().flatten(), 
    'GNLY': adata_cd8[:, 'GNLY'].X.toarray().flatten()
})
data_clustering['CFU/granuloma'] = adata_cd8.obs['CFU/granuloma'].values

# Apply KMeans clustering
kmeans = KMeans(n_clusters=3, random_state=42)
data_clustering['Cluster'] = kmeans.fit_predict(data_clustering[['GZMB', 'GNLY']])

# Visualize GZMB and GNLY expression within clusters
gzm_gnly_scatter = sns.lmplot(x='GZMB', y='GNLY', data=data_clustering, hue='Cluster', palette='coolwarm', fit_reg=False, height=6, aspect=1.5)
plt.title('GZMB vs GNLY within Clusters')
plt.xlabel('GZMB Expression')
plt.ylabel('GNLY Expression')
plt.savefig(os.path.join(output_dir, 'GZMB_vs_GNLY_Clusters.png'))
plt.show()

# Visualize GZMB expression and CFU/granuloma within clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(x=data_clustering['GZMB'], y=data_clustering['CFU/granuloma'], hue=data_clustering['Cluster'], palette='coolwarm', alpha=0.6)
plt.title('GZMB Expression vs CFU/granuloma within Clusters')
plt.xlabel('GZMB Expression')
plt.ylabel('CFU/granuloma')
plt.legend(title='Cluster')
plt.grid(True)
plt.savefig(os.path.join(output_dir, 'GZMB_vs_CFU_Clusters.png'))
plt.show()

# Visualize GNLY expression and CFU/granuloma within clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(x=data_clustering['GNLY'], y=data_clustering['CFU/granuloma'], hue=data_clustering['Cluster'], palette='coolwarm', alpha=0.6)
plt.title('GNLY Expression vs CFU/granuloma within Clusters')
plt.xlabel('GNLY Expression')
plt.ylabel('CFU/granuloma')
plt.legend(title='Cluster')
plt.grid(True)
plt.savefig(os.path.join(output_dir, 'GNLY_vs_CFU_Clusters.png'))
plt.show()

# Compare GZMB and GNLY Expression within Clusters
for cluster in data_clustering['Cluster'].unique():
    cluster_data = data_clustering[data_clustering['Cluster'] == cluster]
    for gene in ['GZMB', 'GNLY']:
        correlation, p_value = spearmanr(cluster_data[gene], cluster_data['CFU/granuloma'])
        print(f"Cluster {cluster}:")
        print(f"  Correlation between {gene} and CFU/granuloma: {correlation:.2f} (p-value: {p_value:.2e})")



ValueError: To copy an AnnData object in backed mode, pass a filename: `.copy(filename='myfilename.h5ad')`. To load the object into memory, use `.to_memory()`.

In [15]:
import scanpy as sc
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from scipy.stats import spearmanr
import os

# Create output directory if it doesn't exist
output_dir = "output"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Load the .h5ad file from the study in backed mode
adata = sc.read_h5ad("all_cells.h5ad", backed='r')

# Subset to CD8 expressing cells
cd8_indices = adata.obs.index[adata.obs['General Celltypes'] == 'CD8']

# Bring the subset to memory
adata_cd8 = adata[cd8_indices, :].to_memory()

# Extract relevant data
data_clustering = pd.DataFrame({
    'GZMB': adata_cd8[:, 'GZMB'].X.flatten(), 
    'GNLY': adata_cd8[:, 'GNLY'].X.flatten()
})
data_clustering['CFU/granuloma'] = adata_cd8.obs['CFU/granuloma'].values

# Apply KMeans clustering
kmeans = KMeans(n_clusters=3, random_state=42)
data_clustering['Cluster'] = kmeans.fit_predict(data_clustering[['GZMB', 'GNLY']])

# Visualize GZMB and GNLY expression within clusters
gzm_gnly_scatter = sns.lmplot(x='GZMB', y='GNLY', data=data_clustering, hue='Cluster', palette='coolwarm', fit_reg=False, height=6, aspect=1.5)
plt.title('GZMB vs GNLY within Clusters')
plt.xlabel('GZMB Expression')
plt.ylabel('GNLY Expression')
plt.savefig(os.path.join(output_dir, 'GZMB_vs_GNLY_Clusters.png'))
plt.show()

# Visualize GZMB expression and CFU/granuloma within clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(x=data_clustering['GZMB'], y=data_clustering['CFU/granuloma'], hue=data_clustering['Cluster'], palette='coolwarm', alpha=0.6)
plt.title('GZMB Expression vs CFU/granuloma within Clusters')
plt.xlabel('GZMB Expression')
plt.ylabel('CFU/granuloma')
plt.legend(title='Cluster')
plt.grid(True)
plt.savefig(os.path.join(output_dir, 'GZMB_vs_CFU_Clusters.png'))
plt.show()

# Visualize GNLY expression and CFU/granuloma within clusters
plt.figure(figsize=(10, 6))
sns.scatterplot(x=data_clustering['GNLY'], y=data_clustering['CFU/granuloma'], hue=data_clustering['Cluster'], palette='coolwarm', alpha=0.6)
plt.title('GNLY Expression vs CFU/granuloma within Clusters')
plt.xlabel('GNLY Expression')
plt.ylabel('CFU/granuloma')
plt.legend(title='Cluster')
plt.grid(True)
plt.savefig(os.path.join(output_dir, 'GNLY_vs_CFU_Clusters.png'))
plt.show()

# Compare GZMB and GNLY Expression within Clusters
for cluster in data_clustering['Cluster'].unique():
    cluster_data = data_clustering[data_clustering['Cluster'] == cluster]
    for gene in ['GZMB', 'GNLY']:
        correlation, p_value = spearmanr(cluster_data[gene], cluster_data['CFU/granuloma'])
        print(f"Cluster {cluster}:")
        print(f"  Correlation between {gene} and CFU/granuloma: {correlation:.2f} (p-value: {p_value:.2e})")

ValueError: Found array with 0 sample(s) (shape=(0, 2)) while a minimum of 1 is required by KMeans.