# Data Collection

We will collect the data from the Xenium Datasets on the 10x Genomics platform.

In [1]:
import requests

In [2]:
def download_file(url, dest_dir="data", chunk_size=1024 * 1024, timeout=60):
    """
    Streams the file at `url` to disk, named after the last path segment of the URL.

    Parameters:
    url (str): Address of the file to download.
    dest_dir (str): Directory the file is written into; created if it does not exist.
        Use "." for the current working directory.
    chunk_size (int): Number of bytes requested per streamed chunk.
    timeout (float or tuple): Passed to `requests.get` so a stalled connection
        raises instead of hanging the notebook indefinitely.

    Returns:
    None. A success message is printed after the file is written; for any HTTP
    status other than 200 an error message is printed and nothing is written.

    Raises:
    Exceptions raised by `requests` (connection errors, timeouts) and by the file
    system are not caught here. A failure part-way through the transfer leaves a
    partially written file at the destination.
    """
    import os  # local import so the function does not depend on another cell's imports

    # Get the filename from the URL
    filename = f"{dest_dir}/{url.split('/')[-1]}"

    # open() below fails if the target directory is missing, so create it up front
    os.makedirs(dest_dir, exist_ok=True)

    # Using the response as a context manager releases the streamed connection
    # on every path, including the non-200 branch and exceptions.
    with requests.get(url, stream=True, timeout=timeout) as response:
        # Check if the request was successful
        if response.status_code == 200:
            # Save the file
            with open(filename, 'wb') as f:
                for chunk in response.iter_content(chunk_size=chunk_size):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
            print(f"File '{filename}' downloaded successfully.")
        else:
            print(f"Error: Unable to download file. HTTP status code: {response.status_code}")

In [3]:
# Archive to fetch. download_file saves it under data/ using the last path
# segment of the URL as the file name.
url = "https://cf.10xgenomics.com/samples/xenium/1.0.2/Xenium_V1_FFPE_Human_Breast_ILC/Xenium_V1_FFPE_Human_Breast_ILC_outs.zip"
download_file(url)

File 'data/Xenium_V1_FFPE_Human_Breast_ILC_outs.zip' downloaded successfully.


# Get Data Array

In [4]:
import zipfile

In [5]:
# Unpack every member of the downloaded archive into data/hBreast.
with zipfile.ZipFile(file='data/Xenium_V1_FFPE_Human_Breast_ILC_outs.zip', mode='r') as data:
    data.extractall(path='data/hBreast')

This should have extracted all of the files in the archive into `data/hBreast`. For now, we are only interested in the transcripts data.

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Location of the gzipped transcript table inside the extracted archive
file_path = 'data/hBreast/transcripts.csv.gz'

# Load the table into a DataFrame, stating the gzip compression explicitly
df_transcripts = pd.read_csv(file_path, compression='gzip')

# Derive an error probability from the quality value of each row: 10^(-qv / 10).
# NOTE(review): this is the Phred-score conversion; confirm that "qv" is Phred-scaled.
df_transcripts["error_prob"] = np.power(10, -df_transcripts["qv"] / 10)

In [None]:
# Display the first few rows of the DataFrame (plain-text rendering via print)
print(df_transcripts.head())

# Total number of rows in the transcript table
print(len(df_transcripts))

In [None]:
# Function to read a Parquet file and convert it to a pandas DataFrame
def read_parquet_to_dataframe(file_path):
    """
    Best-effort loader that turns a Parquet file into a pandas DataFrame.

    Parameters:
    file_path (str): Location of the Parquet file to load.

    Returns:
    pd.DataFrame or None: The loaded table, or None when loading fails for any
    reason. In the failure case the error is printed rather than raised, so
    callers must check the result for None.
    """
    try:
        return pd.read_parquet(file_path)
    except Exception as read_error:
        # Deliberately broad: any failure (missing file, bad format, missing
        # Parquet engine, ...) is reported and converted into a None result.
        print(f"Error reading the Parquet file: {read_error}")

    return None

In [None]:
# NOTE: this rebinds file_path, which earlier pointed at the gzipped CSV.
file_path = 'data/hBreast/transcripts.parquet' # Replace with your Parquet file path
dataframe = read_parquet_to_dataframe(file_path)

# read_parquet_to_dataframe returns None when the read fails, so only print on success
if dataframe is not None:
    print(dataframe)

# Convert the dataset such that each row is a cell.

In [None]:
# Mean x/y/z position of the transcripts in each cell, one row per cell_id,
# with cell_id kept as an ordinary column.
location_means = (
    df_transcripts
    .groupby('cell_id')[['x_location', 'y_location', 'z_location']]
    .mean()
    .reset_index()
)

In [None]:
# Number of transcript rows for every (cell_id, feature_name) pair, in long format
counts = (
    df_transcripts
    .groupby(['cell_id', 'feature_name'])
    .size()
    .reset_index(name='count')
)
print(counts.head())

# log(1 + count); despite the column name, no other normalisation is applied here
counts["normalized_log1p_count"] = np.log1p(counts['count'])

In [None]:
# Reshape the long (cell_id, feature_name) table into a wide one: one row per
# cell_id, one column per feature_name, holding the log1p counts.
# Combinations missing from `counts` are filled with 0. No aggfunc is passed,
# so pandas' default aggregation applies; `counts` has one row per
# (cell_id, feature_name) pair because it was built with groupby(...).size().
counts_pivot = counts.pivot_table(index='cell_id', 
                                  columns='feature_name', 
                                  values='normalized_log1p_count', 
                                  fill_value=0)

In [None]:
# Attach the per-feature columns to the per-cell mean coordinates. `on='cell_id'`
# matches the cell_id column of location_means against the index of counts_pivot
# (which was pivoted with index='cell_id').
breast_cells_df = location_means.join(counts_pivot, on='cell_id')

In [None]:
# Preview the joined per-cell table, then show its (rows, columns) shape as the cell output
print(breast_cells_df.head())
breast_cells_df.shape

In [None]:
# Drop the unassigned cell id
# NOTE(review): assumes transcripts not assigned to any cell carry cell_id == -1
# and that cell_id is numeric. Confirm against this dataset version.
breast_cells_df = breast_cells_df.query("cell_id != -1")

In [None]:
# Shape after removing the cell_id == -1 row, followed by a preview of the table
print(breast_cells_df.shape)
breast_cells_df.head()

In [None]:
# Persist the per-cell table (cell_id, mean coordinates, per-feature columns).
# No `index=` argument is passed, so pandas' default index handling applies to the file.
breast_cells_df.to_csv("data/hBreast/hBreastST.csv")

# Plotting

We can plot the locations of the cells (the mean position of each cell's transcripts) using a 3D scatterplot, with the marker size scaled by the cell's ABCC11 value.

In [None]:
import matplotlib.pyplot as plt

In [None]:
# One marker per cell at its mean transcript position; the marker size is
# proportional to the cell's ABCC11 value (the log1p count from the pivot table).
fig = plt.figure()
ax = fig.add_subplot(projection='3d')

ax.scatter(
    breast_cells_df["x_location"],
    breast_cells_df["y_location"],
    breast_cells_df["z_location"],
    s=5 * breast_cells_df["ABCC11"],
    alpha=0.1,
)

# Label the figure so it can be read on its own when the notebook is skimmed
ax.set_xlabel("x_location")
ax.set_ylabel("y_location")
ax.set_zlabel("z_location")
ax.set_title("Cell positions (marker size scaled by ABCC11)")

# Render the figure without echoing the scatter artist's repr as cell output
plt.show()

# Clustering

For now, we will use a very trivial clustering technique just to showcase what is possible with the data we have collected so far.

### Clustering with Scanpy

In [None]:
import scanpy as sc
import anndata as ad
import matplotlib.pyplot as plt

In [None]:
# Load the per-cell table written earlier and build the AnnData from the gene
# columns only. Handing the file to ad.read_csv directly would put every numeric
# column into X, including the saved pandas index, cell_id and the x/y/z
# coordinates, so PCA / neighbours / Leiden would cluster on identifiers and
# positions rather than on expression.
cells_table = pd.read_csv('data/hBreast/hBreastST.csv')

# Columns that are not expression features: the saved row index (read back by
# pandas under an "Unnamed: ..." header), the cell id and the mean coordinates.
coordinate_columns = ['x_location', 'y_location', 'z_location']
non_gene_columns = [
    column for column in cells_table.columns
    if column.startswith('Unnamed') or column == 'cell_id' or column in coordinate_columns
]
gene_table = cells_table.drop(columns=non_gene_columns)

# float32 matches the default dtype ad.read_csv would have produced
breast_sc_adata = ad.AnnData(gene_table.to_numpy(dtype='float32'))
breast_sc_adata.obs_names = cells_table['cell_id'].astype(str).to_numpy()
breast_sc_adata.var_names = gene_table.columns.astype(str)

# Keep the coordinates available as per-cell metadata instead of as features
breast_sc_adata.obsm['spatial'] = cells_table[coordinate_columns].to_numpy()

In [None]:
# Run PCA on the AnnData object using the 'arpack' SVD solver
sc.tl.pca(breast_sc_adata, svd_solver='arpack')
# Set the matplotlib figure background to white before plotting
plt.rcParams['figure.facecolor'] = 'white'
# Plot the variance ratio of the principal components on a log scale
sc.pl.pca_variance_ratio(breast_sc_adata, log=True)

In [None]:
# Compute the neighbourhood graph with 15 neighbours per cell, using 20 principal components
sc.pp.neighbors(breast_sc_adata, n_neighbors=15, n_pcs=20)

In [None]:
# Leiden clustering at a range of resolutions. Each run is stored under its own
# key in breast_sc_adata.obs so the results can be compared side by side.
leiden_runs = (
    ('leiden_2_2', 2.2),
    ('leiden_1_8', 1.8),
    ('leiden_1_4', 1.4),
    ('leiden_1_0', 1.0),
    ('leiden_0_8', 0.8),
    ('leiden_0_6', 0.6),
    ('leiden_0_025', 0.025),
)
for obs_key, leiden_resolution in leiden_runs:
    sc.tl.leiden(breast_sc_adata, resolution=leiden_resolution, key_added=obs_key)

In [None]:
# One more Leiden run at a very low resolution (0.001), stored under 'leiden_0_001'
sc.tl.leiden(breast_sc_adata,resolution=0.001,key_added='leiden_0_001')

In [None]:
# Compute the UMAP embedding with min_dist=0.1
sc.tl.umap(breast_sc_adata, min_dist=0.1)

In [None]:
# Show the cluster labels from three of the Leiden runs (resolutions 2.2, 0.6 and 0.001) side by side
breast_sc_adata.obs['leiden_2_2'], breast_sc_adata.obs['leiden_0_6'], breast_sc_adata.obs['leiden_0_001']

In [None]:
# Optional figure settings, currently disabled:
# sc.set_figure_params(scanpy=True, dpi=150,figsize=(10,10))
# plt.rcParams['figure.facecolor'] = 'white'

# UMAP coloured by the 'leiden_0_001' clustering, with cluster labels drawn on
# the data. show=False makes the call hand back its result instead of showing
# the figure itself.
sc.pl.umap(
    breast_sc_adata,
    size=30,
    color='leiden_0_001',
    legend_loc='on data',
    legend_fontsize=3,
    legend_fontoutline=1,
    show=False,
    palette="rainbow",
)

### Obviously, this looks way too cluttered. The reason? 