In [7]:
import os
import zipfile
from io import StringIO

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [3]:
def download_data(url, directory):
    """
    Downloads data from the given URL to the specified directory.

    The response is streamed to disk in chunks, so the whole payload is never
    held in memory. Data is first written to a temporary ``<name>.part`` file
    and renamed only after the transfer completes, so an interrupted download
    cannot leave behind a truncated file under the final name.

    Parameters:
    url (str): The URL to download the data from.
    directory (str): The directory where the data will be saved.

    Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status.
    requests.RequestException: On connection problems or timeouts.
    """
    # Ensure the directory exists (no error if it is already there)
    os.makedirs(directory, exist_ok=True)

    # Get the filename from the URL
    filename = os.path.join(directory, url.split('/')[-1])
    partial_path = filename + ".part"

    try:
        # (connect timeout, read timeout) in seconds
        with requests.get(url, stream=True, timeout=(10, 60)) as response:
            # Fail loudly instead of saving an HTML error page as the data file
            response.raise_for_status()
            with open(partial_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    file.write(chunk)
    except BaseException:
        # Also covers KeyboardInterrupt: remove the incomplete download
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise

    # Publish the complete file under its final name in one step
    os.replace(partial_path, filename)

    print(f"Data downloaded to {filename}")

In [4]:
# FILENAME = "Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs.zip"
# DATA_DIR = "data"
# EXTRACTED_DATA_PATH = 'extracted_data'
# if not os.path.isfile(zip_path := os.path.join(DATA_DIR, FILENAME)) and not os.path.isdir('extracted_data'):
#     download_data('https://cf.10xgenomics.com/samples/xenium/1.0.2/Xenium_V1_FF_Mouse_Brain_MultiSection_1/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs.zip', DATA_DIR)

KeyboardInterrupt: 

In [4]:
def extract_zip(zip_path, extract_to):
    """
    Unpack every member of a zip archive into a target directory.

    The target directory is created first when it does not exist yet.

    Parameters:
    zip_path (str): Location of the zip archive to read.
    extract_to (str): Directory that receives the extracted contents.
    """
    # Create the destination only when nothing exists at that path
    destination_missing = not os.path.exists(extract_to)
    if destination_missing:
        os.makedirs(extract_to)

    # Open the archive read-only and unpack all members; the archive is
    # closed when the with-block exits
    archive = zipfile.ZipFile(zip_path, 'r')
    with archive:
        archive.extractall(path=extract_to)

    print(f"Data extracted to {extract_to}")

In [5]:
# Extract the downloaded archive (once), then delete the zip file.
#
# The locations are defined here so the cell does not depend on variables
# left over from an earlier kernel session.
FILENAME = "Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs.zip"
DATA_DIR = "data"
EXTRACTED_DATA_PATH = 'extracted_data'
zip_path = os.path.join(DATA_DIR, FILENAME)

if os.path.exists(EXTRACTED_DATA_PATH):
    print(f"Zip file {zip_path} has already been extracted.")
elif os.path.exists(zip_path):
    extract_zip(zip_path, f'./{EXTRACTED_DATA_PATH}')
    print(f"Zip file {zip_path} extracted.")
else:
    raise FileNotFoundError(
        f"Neither {EXTRACTED_DATA_PATH} nor {zip_path} exists; download the data first."
    )

# Only remove the archive once the extracted data is known to be in place
if os.path.exists(zip_path) and os.path.isdir(EXTRACTED_DATA_PATH):
    os.remove(zip_path)
    print(f"Zip file {zip_path} has been deleted.")

Zip file data/Xenium_V1_FF_Mouse_Brain_MultiSection_1_outs.zip has already been extracted.


In [6]:
# Read the per-cell table from the extracted data, indexed by cell_id
csv_file_path = './extracted_data/cells.csv.gz'
cells_df = pd.read_csv(csv_file_path, index_col='cell_id')

# Show the table's dimensions, then its first few rows
print(cells_df.shape, cells_df.head(), sep="\n")


(162033, 8)
          x_centroid   y_centroid  transcript_counts  control_probe_counts  \
cell_id                                                                      
1        1557.532239  2528.022437                327                     0   
2        1560.669312  2543.632678                354                     0   
3        1570.462885  2530.810461                422                     0   
4        1573.927734  2546.454529                250                     0   
5        1581.344379  2557.024951                550                     1   

         control_codeword_counts  total_counts   cell_area  nucleus_area  
cell_id                                                                   
1                              0           327  240.953750     63.038125  
2                              0           354  211.692500     65.476562  
3                              0           422  186.946875     69.540625  
4                              0           250  239.237812     61.

In [7]:
# Load the per-transcript table from the extracted data.
# The two columns the analysis never uses are skipped at parse time, so they
# are not loaded for every row only to be dropped afterwards.
csv_file_path = './extracted_data/transcripts.csv.gz'
unused_transcript_columns = {"transcript_id", "overlaps_nucleus"}
transcripts_df = pd.read_csv(
    csv_file_path,
    usecols=lambda column: column not in unused_transcript_columns,
)

print(transcripts_df.shape)

# Display the first few rows of the DataFrame
print(transcripts_df.head())


(62744602, 6)
   cell_id feature_name  x_location  y_location  z_location         qv
0    67490      Bhlhe40   4843.0460   6427.7300   19.068869  40.000000
1    67957        Parm1   4844.6330   6223.1826   18.520160  40.000000
2    67539      Bhlhe40   4842.9434   6478.3105   18.500109  13.813585
3    68003         Lyz2   4843.9414   6344.5503   15.016154  19.471884
4    67344         Dkk3   4843.1626   6632.1120   15.394680  40.000000


In [8]:
# Build one row per cell: per-gene transcript counts plus the mean transcript
# position and quality value.

# cell_id value removed from the result below.
# NOTE(review): presumably the placeholder for transcripts that were not
# assigned to any cell - confirm against the Xenium output specification.
UNASSIGNED_CELL_ID = -1

gene_counts = (
    transcripts_df.groupby(['cell_id', 'feature_name'])
    .size()  # number of transcripts per (cell, gene) pair
    .unstack(fill_value=0)  # pivot gene names into columns; absent pairs become 0
)

spatial_means = (
    transcripts_df.groupby('cell_id')[['x_location', 'y_location', 'z_location', 'qv']]
    .mean()
)

final_df = (
    gene_counts
    .join(spatial_means, how='inner', on='cell_id')
    # errors='ignore': do not fail when the data contains no such row
    .drop(index=UNASSIGNED_CELL_ID, errors='ignore')
)
final_df.head()

Unnamed: 0_level_0,2010300C02Rik,Acsbg1,Acta2,Acvrl1,Adamts2,Adamtsl1,Adgrl4,Aldh1a2,Angpt1,Ano1,...,Vip,Vwc2l,Wfs1,Zfp366,Zfp536,Zfpm2,x_location,y_location,z_location,qv
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,6,1,0,0,0,2,0,0,1,1,...,0,1,2,0,0,2,1557.470384,2529.235501,14.000948,31.390449
2,0,1,0,0,0,1,0,0,0,0,...,0,0,2,0,8,0,1560.893115,2541.72752,14.789414,30.481819
3,5,5,0,0,0,1,0,0,2,0,...,0,0,7,0,0,1,1570.152317,2532.381,15.395041,30.390635
4,5,10,0,0,0,1,0,0,2,0,...,0,0,3,0,3,2,1575.010769,2545.660127,14.47816,32.96024
5,23,8,0,0,0,2,0,0,1,0,...,0,0,8,0,2,2,1580.884363,2555.688014,14.901122,31.854846


In [9]:
# Replace the transcript-averaged x/y positions with the centroids stored in
# cells_df; the values are aligned on the frames' index labels.
final_df = final_df.assign(
    x_location=cells_df["x_centroid"],
    y_location=cells_df["y_centroid"],
)
final_df.head()

Unnamed: 0_level_0,2010300C02Rik,Acsbg1,Acta2,Acvrl1,Adamts2,Adamtsl1,Adgrl4,Aldh1a2,Angpt1,Ano1,...,Vip,Vwc2l,Wfs1,Zfp366,Zfp536,Zfpm2,x_location,y_location,z_location,qv
cell_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,6,1,0,0,0,2,0,0,1,1,...,0,1,2,0,0,2,1557.532239,2528.022437,14.000948,31.390449
2,0,1,0,0,0,1,0,0,0,0,...,0,0,2,0,8,0,1560.669312,2543.632678,14.789414,30.481819
3,5,5,0,0,0,1,0,0,2,0,...,0,0,7,0,0,1,1570.462885,2530.810461,15.395041,30.390635
4,5,10,0,0,0,1,0,0,2,0,...,0,0,3,0,3,2,1573.927734,2546.454529,14.47816,32.96024
5,23,8,0,0,0,2,0,0,1,0,...,0,0,8,0,2,2,1581.344379,2557.024951,14.901122,31.854846


In [15]:
# Keep only the columns whose lower-cased name does not contain one of the
# blank / negative-control markers.
control_name_pattern = 'blank_|negcontrolcodeword_|negcontrolprobe_'
lowercase_names = final_df.columns.str.lower()
is_control_column = lowercase_names.str.contains(control_name_pattern)
final_df = final_df.loc[:, ~is_control_column]

In [16]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column of final_df.
final_df.describe()

Unnamed: 0,2010300C02Rik,Acsbg1,Acta2,Acvrl1,Adamts2,Adamtsl1,Adgrl4,Aldh1a2,Angpt1,Ano1,...,Vip,Vwc2l,Wfs1,Zfp366,Zfp536,Zfpm2,x_location,y_location,z_location,qv
count,162033.0,162033.0,162033.0,162033.0,162033.0,162033.0,162033.0,162033.0,162033.0,162033.0,...,162033.0,162033.0,162033.0,162033.0,162033.0,162033.0,162033.0,162033.0,162033.0,162033.0
mean,4.722698,4.794708,0.624601,0.448668,0.437979,0.442799,0.823227,0.414076,0.521264,0.140194,...,0.411762,0.935229,2.370838,0.177149,1.816112,0.633482,5100.189971,3688.966891,16.636496,32.758069
std,7.27756,6.617659,2.790731,1.299424,1.418455,1.04433,2.222647,2.189851,1.000984,0.636379,...,2.514632,1.701964,4.696733,0.617197,2.747544,1.388443,2485.187494,1608.313597,1.445612,1.731354
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,180.890002,22.949301,12.286758,18.128212
25%,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,2998.991125,2325.958386,15.611929,31.542031
50%,2.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,1.0,0.0,5108.107788,3752.095898,16.525967,32.720428
75%,6.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,2.0,0.0,2.0,1.0,7253.035059,5008.46521,17.597927,33.993172
max,124.0,78.0,82.0,20.0,57.0,15.0,34.0,70.0,24.0,16.0,...,81.0,49.0,88.0,21.0,44.0,27.0,9885.679297,6965.974951,29.200815,40.0


In [17]:
# Show every column name of final_df as a plain Python list.
list(final_df.columns)

['2010300C02Rik',
 'Acsbg1',
 'Acta2',
 'Acvrl1',
 'Adamts2',
 'Adamtsl1',
 'Adgrl4',
 'Aldh1a2',
 'Angpt1',
 'Ano1',
 'Aqp4',
 'Arc',
 'Arhgap12',
 'Arhgap25',
 'Arhgap6',
 'Arhgef28',
 'Bcl11b',
 'Bdnf',
 'Bhlhe22',
 'Bhlhe40',
 'Btbd11',
 'Cabp7',
 'Cacna2d2',
 'Calb1',
 'Calb2',
 'Car4',
 'Carmn',
 'Cbln1',
 'Cbln4',
 'Ccn2',
 'Cd24a',
 'Cd300c2',
 'Cd44',
 'Cd53',
 'Cd68',
 'Cd93',
 'Cdh13',
 'Cdh20',
 'Cdh4',
 'Cdh6',
 'Cdh9',
 'Chat',
 'Chodl',
 'Chrm2',
 'Cldn5',
 'Clmn',
 'Cntn6',
 'Cntnap4',
 'Cntnap5b',
 'Cobll1',
 'Col19a1',
 'Col1a1',
 'Col6a1',
 'Cort',
 'Cplx3',
 'Cpne4',
 'Cpne6',
 'Cpne8',
 'Crh',
 'Cspg4',
 'Cux2',
 'Cwh43',
 'Cyp1b1',
 'Dcn',
 'Deptor',
 'Dkk3',
 'Dner',
 'Dpy19l1',
 'Dpyd',
 'Ebf3',
 'Emcn',
 'Epha4',
 'Eya4',
 'Fezf2',
 'Fgd5',
 'Fhod3',
 'Fibcd1',
 'Fign',
 'Fmod',
 'Fn1',
 'Fos',
 'Foxp2',
 'Gad1',
 'Gad2',
 'Gadd45a',
 'Galnt14',
 'Garnl3',
 'Gfap',
 'Gfra2',
 'Gjb2',
 'Gjc3',
 'Gli3',
 'Gm19410',
 'Gm2115',
 'Gng12',
 'Gpr17',
 'Grik3',
 'Gsg1l

In [9]:
# On-disk copy of the assembled per-cell table.
XENIUM_CSV_PATH = "../data/raw/xenium.csv"

if os.path.isfile(XENIUM_CSV_PATH):
    # Reuse the saved table
    final_df = pd.read_csv(XENIUM_CSV_PATH, index_col="cell_id")
else:
    # No saved copy yet: write the in-memory table. The index column name is
    # passed as index_label (to_csv's `index` argument is a boolean switch).
    os.makedirs(os.path.dirname(XENIUM_CSV_PATH), exist_ok=True)
    final_df.to_csv(XENIUM_CSV_PATH, index_label="cell_id")

In [11]:
# Annotate every column of final_df as ligand / receptor / both / neither by
# looking its lower-cased name up in four public ligand-receptor resources.

# Seconds to wait on each HTTP request before giving up
HTTP_TIMEOUT_SECONDS = 60


def _fetch_text(url, timeout=HTTP_TIMEOUT_SECONDS):
    """Download `url` and return the response body as text.

    Raises requests.HTTPError on a 4xx/5xx status so that an error page is
    never parsed as if it were data.
    """
    reply = requests.get(url, timeout=timeout)
    reply.raise_for_status()
    return reply.text


# Lower-cased column names of final_df; all lookups below are on lower-cased symbols.
# NOTE(review): final_df.columns also contains the non-gene columns
# x_location, y_location, z_location and qv, so they are annotated as well -
# confirm whether they should be excluded here.
mouse_genes = [gene.lower() for gene in final_df.columns]

# === 1. OmniPath Intercellular Annotations ===
omnipath_url = "https://omnipathdb.org/intercell?fields=genesymbol&format=tab"
omni_intercell = pd.read_csv(StringIO(_fetch_text(omnipath_url)), sep="\t")
omni_lig = set(omni_intercell.loc[omni_intercell['category'] == 'ligand', 'genesymbol'].str.lower())
omni_rec = set(omni_intercell.loc[omni_intercell['category'] == 'receptor', 'genesymbol'].str.lower())

# === 2. OmniPath Ligand-Receptor pairs (includes FANTOM5) ===
fantom_url = "https://omnipathdb.org/interactions?format=tab&datasets=ligrecextra"
fantom_df = pd.read_csv(StringIO(_fetch_text(fantom_url)), sep="\t")
# NOTE(review): these sets are compared against gene symbols below. Confirm
# that `source` / `target` hold gene symbols rather than protein accessions;
# if they hold accessions, this resource can never match - TODO confirm.
fantom_lig = set(fantom_df['source'].str.lower())
fantom_rec = set(fantom_df['target'].str.lower())

# === 3. NATMI ConnectomeDB2020 ===
# Define the URL
url = "https://asrhou.github.io/NATMI/"

# Send a GET request and fail on HTTP errors
response = requests.get(url, timeout=HTTP_TIMEOUT_SECONDS)
response.raise_for_status()

# Parse the HTML content
soup = BeautifulSoup(response.text, 'html.parser')

# Find the first table in the HTML
table = soup.find('table')
if table is None:
    raise ValueError(f"No <table> element found at {url}")

# Convert the table to a pandas DataFrame. The markup is wrapped in StringIO
# because passing a literal HTML string to read_html is deprecated.
natmi_df = pd.read_html(StringIO(str(table)))[0]
natmi_lig = set(natmi_df['Ligand gene symbol'].str.lower())
natmi_rec = set(natmi_df['Receptor gene symbol'].str.lower())

# === 4. Jin 2020 Mouse L-R Dataset ===
jin_url = "https://github.com/LewisLabUCSD/Ligand-Receptor-Pairs/raw/master/Mouse/Mouse-2020-Jin-LR-pairs.csv"
jin_df = pd.read_csv(jin_url)
jin_lig = set(jin_df['ligand'].str.lower())
jin_rec = set(jin_df['receptor'].str.lower())

# === Annotate each gene ===
# Build the combined lookups once instead of once per gene
all_ligands = omni_lig.union(fantom_lig, natmi_lig, jin_lig)
all_receptors = omni_rec.union(fantom_rec, natmi_rec, jin_rec)
source_members = [
    ("OmniPath", omni_lig | omni_rec),
    ("FANTOM5 (OmniPath)", fantom_lig | fantom_rec),
    ("NATMI ConnectomeDB2020", natmi_lig | natmi_rec),
    ("Jin2020 Mouse", jin_lig | jin_rec),
]

results = []
for gene in mouse_genes:
    # Every resource that lists the gene, in a fixed order
    sources = [label for label, members in source_members if gene in members]

    is_ligand = gene in all_ligands
    is_receptor = gene in all_receptors
    if is_ligand and is_receptor:
        role = "ligand_and_receptor"
    elif is_ligand:
        role = "ligand"
    elif is_receptor:
        role = "receptor"
    else:
        role = "neither"

    results.append({"gene": gene, "role": role, "sources": ", ".join(sources)})

# One row per entry of mouse_genes, in the same order as final_df.columns
result_df = pd.DataFrame(results)
# Save the result_df DataFrame to 'annotated_genes.csv' in the current working directory
result_df.to_csv('annotated_genes.csv', index=False)

In [12]:
# Number of annotated entries in each role category.
result_df.value_counts("role")

role
neither                152
receptor                53
ligand                  30
ligand_and_receptor     17
dtype: int64

In [13]:
# Row labels of result_df for every entry annotated as a ligand, a receptor, or both
interaction_roles = ['ligand', 'receptor', 'ligand_and_receptor']
has_interaction_role = result_df['role'].isin(interaction_roles)
ligand_receptor_indexes = result_df[has_interaction_role].index

# Sanity check on the expected number of ligand/receptor entries
assert len(ligand_receptor_indexes) == 100

In [16]:
# (number of rows, number of columns) of final_df.
final_df.shape

(162033, 252)

In [15]:
# Column names of the ligand/receptor entries. The result_df row labels are
# used as positions into final_df.columns, which lines up because result_df
# was built row by row in the order of final_df.columns.
final_df.columns[ligand_receptor_indexes]

Index(['Acvrl1', 'Adamts2', 'Adgrl4', 'Angpt1', 'Ano1', 'Aqp4', 'Bdnf',
       'Cbln1', 'Cbln4', 'Ccn2', 'Cd44', 'Cd53', 'Cd68', 'Cd93', 'Cdh13',
       'Cdh4', 'Cdh6', 'Chat', 'Chrm2', 'Cldn5', 'Cntn6', 'Cntnap4', 'Col19a1',
       'Col1a1', 'Col6a1', 'Cort', 'Crh', 'Cspg4', 'Cyp1b1', 'Dcn', 'Dkk3',
       'Dner', 'Dpyd', 'Epha4', 'Fn1', 'Gad1', 'Gad2', 'Gfra2', 'Gpr17',
       'Grik3', 'Hapln1', 'Hat1', 'Htr1f', 'Igf1', 'Igf2', 'Igfbp4', 'Inpp4b',
       'Kctd8', 'Kdr', 'Mapk4', 'Neto2', 'Npnt', 'Npy2r', 'Nr2f2', 'Nrn1',
       'Nrp2', 'Nts', 'Ntsr2', 'Nxph3', 'Opn3', 'Paqr5', 'Pcsk5', 'Pde11a',
       'Pde7b', 'Pdgfra', 'Pdyn', 'Pecam1', 'Penk', 'Pglyrp1', 'Pip5k1b',
       'Plch1', 'Prph', 'Pthlh', 'Rbp4', 'Ror1', 'Rspo1', 'Rspo2', 'Rxfp1',
       'Sdk2', 'Sema3a', 'Sema3d', 'Sema3e', 'Sema5b', 'Sema6a', 'Shisa6',
       'Slc13a4', 'Slc17a6', 'Slc39a12', 'Slc6a3', 'Slit2', 'Sorcs3', 'Spp1',
       'Sst', 'Tacr1', 'Th', 'Trbc2', 'Trem2', 'Trpc4', 'Vip', 'Vwc2l'],
      dtype='object