In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import scanpy as sc
import warnings
import os
from tqdm import tqdm
import numpy as np
from scipy.spatial import ConvexHull
warnings.filterwarnings('ignore')
from shapely.geometry import Point, Polygon
from shapely.prepared import prep
from rtree import index

In [2]:
# Load the processed AnnData object; its `obs` index is used further down to
# keep only the segmented cells that are present in this dataset.
adata_file = '/mnt/Data16Tc/home/haichao/code/SpaCon/Data/N_20231213_zxw/mouse_1/adata_processed.h5ad'
adata = sc.read_h5ad(adata_file)
print(adata.shape)
# Last expression of the cell: rich display of the per-cell metadata table.
adata.obs

(2616328, 1122)


Unnamed: 0_level_0,brain_section_label,x,y,z,x_section_mean,x_ccf,y_ccf,z_ccf
cell_label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
182941331246012878296807398333956011710,Zhuang-ABCA-1.089,79.021898,30.484256,5.829622,78.263674,7.902190,3.048426,0.582962
221260934538535633595532020856387724686,Zhuang-ABCA-1.089,79.065131,31.452003,5.776024,78.263674,7.906513,3.145200,0.577602
22228792606814781533240955623030943708,Zhuang-ABCA-1.089,79.061104,31.827607,5.537314,78.263674,7.906110,3.182761,0.553731
272043042552227961220474294517855477150,Zhuang-ABCA-1.089,79.046267,31.318079,5.635250,78.263674,7.904627,3.131808,0.563525
110116287883089187971185374239350249328,Zhuang-ABCA-1.089,79.072356,32.306472,5.430477,78.263674,7.907236,3.230647,0.543048
...,...,...,...,...,...,...,...,...
94310525370042131911495836073267655162,Zhuang-ABCA-1.110,96.812435,44.539795,8.520273,97.196243,9.681244,4.453979,0.852027
298798481479578578007190103666214714353,Zhuang-ABCA-1.110,96.769993,42.916474,8.995306,97.196243,9.676999,4.291647,0.899531
330756942354980576352210203729462562749,Zhuang-ABCA-1.110,96.787599,43.632823,8.940819,97.196243,9.678760,4.363282,0.894082
47305871059582831548494138048361484565,Zhuang-ABCA-1.110,96.786411,43.603459,9.011947,97.196243,9.678641,4.360346,0.901195


In [3]:
# Root folder of the raw MERFISH data. The 'decode/' subfolder holds the
# decoded-spot tables that are read with `pd.read_csv` in the assignment loop.
path = '/mnt/Data18Td/Data/haichao/merfish_raw_data_zxw1/'
decode_dir = path + 'decode/'
# NOTE: `os.listdir` returns entries in arbitrary order.
decode_files_name = os.listdir(decode_dir)
decode_files_name

['spots_220720_co2_9_merfish4_adaptor.csv',
 'spots_220516_wb3_co2_7_5z18R_merfish5.csv',
 'spots_220708_co2_17B_merfish4_adaptor.csv',
 'spots_220726_co2_1B_merfish4_adaptor.csv',
 'spots_220616_co2_8B_merfish4_adaptor.csv',
 'spots_220507_wb3_co2_11_5z18R_merfish5.csv',
 'spots_220623_co2_4B_merfish4_adaptor.csv',
 'spots_220530_wb3_co2_4_5z18R_merfish5.csv',
 'spots_220518_wb3_co2_17_5z18R_merfish5.csv',
 'spots_220623_wb3_co2_24_5z18R_merfish6.csv',
 'spots_220506_wb3_co2_10_5z18R_merfish2.csv',
 'spots_220530_wb3_co2_5_5z18R_merfish6.csv',
 'spots_220630_wb3_co2_25_5z18R_merfish5.csv',
 'spots_220620_wb3_co2_B_18_5z18R_merfish5.csv',
 'spots_220620_wb3_co2_B_22_5z18R_merfish6.csv',
 'spots_220630_wb3_co2_20_5z18R_merfish6.csv',
 'spots_220717_co2_13B_merfish4_adaptor.csv',
 'spots_220723_co2_6B_merfish4_adaptor.csv',
 'spots_220503_co2_13_merfish4_adaptor_5z18r.csv',
 'spots_220516_wb3_co2_8_5z18R_merfish6.csv',
 'spots_220525_wb3_co2_22_5z18R2bd_merfish5.csv',
 'spots_220605_co2_

In [5]:
def create_cell_index(cells):
    """Build an rtree spatial index over the bounding boxes of cell outlines.

    Parameters
    ----------
    cells : iterable
        Cell outlines, each one a sequence of (x, y) vertices accepted by
        `shapely.geometry.Polygon`.

    Returns
    -------
    rtree.index.Index
        Index in which entry ``i`` is the bounding box of ``cells[i]``, so the
        ids returned by a query are positions in `cells`.
    """
    spatial_index = index.Index()
    position = 0
    for outline in cells:
        # Only the bounding box is indexed; exact containment is tested later.
        bounding_box = Polygon(outline).bounds
        spatial_index.insert(position, bounding_box)
        position += 1
    return spatial_index

def point_in_polygon(point, polygon):
    """Return True when `point` lies within the polygon described by `polygon`.

    Parameters
    ----------
    point : sequence
        Coordinates accepted by `shapely.geometry.Point`, e.g. ``(x, y)``.
    polygon : sequence
        Vertex list accepted by `shapely.geometry.Polygon`.

    Returns
    -------
    bool
        Result of shapely's `within` predicate (per the shapely documentation,
        a point lying exactly on the outline is not counted as within).
    """
    candidate = Point(point)
    region = Polygon(polygon)
    return candidate.within(region)

def assign_rna_to_cells(rna_points, cells, cell_name):
    """Assign every RNA spot to the first cell outline that contains it.

    Parameters
    ----------
    rna_points : iterable
        RNA spot coordinates; each item is indexable as ``(x, y)``.
    cells : sequence
        Cell outlines, each a sequence of (x, y) vertices. ``cells[i]`` is the
        outline of the cell labelled ``cell_name[i]``.
    cell_name : sequence or pandas.Index
        Cell labels, positionally aligned with `cells`.

    Returns
    -------
    pandas.DataFrame
        One row per RNA spot, in input order, with columns ``cell_id``,
        ``rna_x`` and ``rna_y``. ``cell_id`` is ``-1`` for spots that fall
        inside no cell. The three columns are present even for empty input.
    """
    cell_index = create_cell_index(cells)
    # Build each polygon exactly once and prepare it: the same outlines are
    # tested against a very large number of spots, so re-creating a Polygon
    # for every single test (as a per-call helper would) is wasted work.
    prepared_cells = [prep(Polygon(cell)) for cell in cells]

    # Column-wise accumulation is much lighter than one dict per spot.
    cell_ids = []
    xs = []
    ys = []

    for rna_point in tqdm(rna_points):
        point = Point(rna_point)
        assigned = -1  # sentinel for "not inside any cell"
        # The R-tree only filters by bounding box; confirm with an exact test.
        for cell_id in cell_index.intersection(point.bounds):
            # `polygon.contains(point)` is the converse of `point.within(polygon)`.
            if prepared_cells[cell_id].contains(point):
                assigned = cell_name[cell_id]
                break  # first matching candidate wins
        cell_ids.append(assigned)
        xs.append(rna_point[0])
        ys.append(rna_point[1])

    return pd.DataFrame(
        {'cell_id': cell_ids, 'rna_x': xs, 'rna_y': ys},
        columns=['cell_id', 'rna_x', 'rna_y'],
    )

In [6]:
# Assign decoded RNA spots to segmented cells, writing one CSV per
# (decode file, z-plane) pair into the 'RNA_assign_result' folder under `path`.
result_dir = os.path.join(path, 'RNA_assign_result')
# Create the output folder up front so a missing directory cannot fail the run
# only after the first (hour-long) assignment has finished.
os.makedirs(result_dir, exist_ok=True)

for f in decode_files_name:
    print(f)
    decode = pd.read_csv(path + 'decode/' + f)
    # The boundary table is named like the decode file minus its first six
    # characters (the 'spots_' prefix).
    cellpose = pd.read_csv(path + 'cell_boundary/' + f[6:], index_col=0)
    print('raw: ', cellpose.shape)
    # Keep only the cells that are present in the AnnData object.
    cellpose = cellpose[cellpose.index.isin(adata.obs.index)]
    print('in adata: ', cellpose.shape)

    for z in decode['global_z'].unique():
        z = int(z)
        boundary_x_col = f'boundaryX_z{z}'
        boundary_y_col = f'boundaryY_z{z}'
        # Cells without an outline in this z-plane have NaN here and are dropped.
        # `cellpose` is already restricted to cells in `adata`, so no second
        # membership filter is needed.
        cellpose_z = cellpose[[boundary_x_col, boundary_y_col]].dropna()

        # Each boundary cell is a ', '-separated string of coordinates. Iterate
        # the two columns directly instead of a `.loc` lookup per row.
        cells = [
            list(zip(np.array(x_str.split(', '), dtype=float),
                     np.array(y_str.split(', '), dtype=float)))
            for x_str, y_str in zip(cellpose_z[boundary_x_col], cellpose_z[boundary_y_col])
        ]

        dec_z = decode[decode['global_z'] == z]
        rna_points = dec_z[['global_x', 'global_y']].values
        result_df = assign_rna_to_cells(rna_points, cells, cellpose_z.index)
        result_df.to_csv(os.path.join(result_dir, f'z{z}_{f}'))

spots_220629_co2_15B_merfish4_adaptor.csv


raw:  (130659, 10)
in adata:  (69084, 10)


100%|██████████| 26828744/26828744 [1:12:36<00:00, 6158.98it/s]  
100%|██████████| 29836347/29836347 [1:21:12<00:00, 6123.68it/s]  
100%|██████████| 30105592/30105592 [1:26:40<00:00, 5788.82it/s]  
100%|██████████| 27387694/27387694 [1:19:22<00:00, 5750.69it/s]  
100%|██████████| 21543123/21543123 [1:05:22<00:00, 5492.36it/s] 


spots_220523_wb3_co2_20_5z18R2bd_merfish5.csv
raw:  (154644, 10)
in adata:  (72912, 10)


100%|██████████| 14461296/14461296 [33:57<00:00, 7096.11it/s] 
100%|██████████| 16681367/16681367 [42:23<00:00, 6557.49it/s]  
100%|██████████| 17459528/17459528 [45:50<00:00, 6348.76it/s]  
100%|██████████| 16690638/16690638 [44:47<00:00, 6211.47it/s]  
100%|██████████| 14793027/14793027 [39:37<00:00, 6220.85it/s] 


spots_220601_wb3_co2_1_5z18R2bd_merfish5.csv
raw:  (131976, 10)
in adata:  (29149, 10)


100%|██████████| 19007426/19007426 [28:50<00:00, 10982.11it/s] 
100%|██████████| 19511354/19511354 [32:41<00:00, 9948.87it/s]  
100%|██████████| 18848612/18848612 [34:31<00:00, 9099.52it/s]  
100%|██████████| 16467394/16467394 [31:54<00:00, 8599.66it/s]  
100%|██████████| 12944472/12944472 [26:32<00:00, 8127.75it/s] 


spots_220510_wb3_co2_12_5z18R_merfish5.csv
spots_220714_co2_19B_merfish4_adaptor.csv
raw:  (153665, 10)
in adata:  (73017, 10)


100%|██████████| 16940652/16940652 [39:48<00:00, 7093.41it/s]  
100%|██████████| 17636567/17636567 [44:21<00:00, 6626.83it/s]  
100%|██████████| 19189732/19189732 [50:47<00:00, 6296.57it/s]  
100%|██████████| 18673161/18673161 [50:59<00:00, 6103.84it/s]  
100%|██████████| 16702576/16702576 [44:38<00:00, 6236.15it/s]  
