In [6]:
import os
import shutil
import gzip
import tarfile
import pandas as pd
import scipy.io
import anndata
import scanpy as sc
from functools import reduce

# 1. 解压tar文件
raw_dir = r"D:\data\GDK_sc_retina\raw_data_normal\GSE220661"
os.makedirs(raw_dir, exist_ok=True)

def extract_raw_tar_files(folder_path):
    for filename in os.listdir(folder_path):
        if filename.endswith('.tar') and 'RAW' in filename:
            file_path = os.path.join(folder_path, filename)
            try:
                with tarfile.open(file_path, 'r:*') as tar:
                    tar.extractall(path=folder_path)
                print(f"成功解压: {filename} 到 {folder_path}")
            except (tarfile.TarError, EOFError) as e:
                print(f"解压失败 {filename}: {e}")

extract_raw_tar_files(raw_dir)

# 2. 重命名和组织文件
all_files = [f for f in os.listdir(raw_dir) 
             if os.path.isfile(os.path.join(raw_dir, f)) and 'GSM' in f]

samples = set()
for f in all_files:
    # 提取形如GSM3148575的文件名部分
    base_name = f.split('.')[0]  # 移除文件扩展名
    
    # 关键修改：使用split()从左侧分割一次，提取第一部分
    sample_name = base_name.split('_')[0]
    
    samples.add(sample_name)

for sample in samples:
    # 找到当前样本的相关文件
    related_files = [f for f in all_files if sample in f and any(key in f for key in ['barcodes', 'features', 'genes', 'matrix'])]
    
    # 创建样本文件夹
    sample_dir = os.path.join(raw_dir, sample)
    os.makedirs(sample_dir, exist_ok=True)
    
    # 移动并重命名文件
    for file in related_files:
        src = os.path.join(raw_dir, file)
        
        # 确定目标文件名
        if 'barcodes' in file:
            dest_name = "barcodes.tsv.gz"
        elif 'features' in file or 'genes' in file:
            dest_name = "features.tsv.gz"
        elif 'matrix' in file:
            dest_name = "matrix.mtx.gz"
        else:
            continue
            
        dest = os.path.join(sample_dir, dest_name)
        shutil.move(src, dest)
    
    print(f"样本文件已整理: {sample}")

# 3. 解压gz文件并读取数据
adata_list = []
samples = [d for d in os.listdir(raw_dir) 
           if os.path.isdir(os.path.join(raw_dir, d)) and 'GSM' in d]

def gunzip_file(gz_path, output_path):
    """解压.gz文件"""
    try:
        with gzip.open(gz_path, 'rb') as f_in:
            with open(output_path, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        return True
    except Exception as e:
        print(f"解压失败 {gz_path}: {e}")
        return False

def make_index_unique(index):
    """使索引唯一化：重复值添加后缀"""
    if index.is_unique:
        return index
    new_index = []
    count = {}
    for item in index:
        if item not in count:
            count[item] = 1
            new_index.append(item)
        else:
            count[item] += 1
            new_index.append(f"{item}.{count[item]}")
    return pd.Index(new_index)

for sample in samples:
    print(f"\n处理样本: {sample}")
    sample_dir = os.path.join(raw_dir, sample)
    
    # 解压所有gz文件
    files_to_unzip = [
        ('barcodes.tsv.gz', 'barcodes.tsv'),
        ('features.tsv.gz', 'features.tsv'),
        ('matrix.mtx.gz', 'matrix.mtx')
    ]
    
    for gz_name, unzipped_name in files_to_unzip:
        gz_path = os.path.join(sample_dir, gz_name)
        unzip_path = os.path.join(sample_dir, unzipped_name)
        
        if os.path.exists(gz_path):
            if gunzip_file(gz_path, unzip_path):
                # 删除压缩文件
                os.remove(gz_path)
                print(f"已解压: {gz_name} -> {unzipped_name}")
        else:
            print(f"文件不存在: {gz_path}")
    
    # 读取10X Genomics数据
    print(f"读取10X数据: {sample}")
    try:
        # 直接使用解压后的文件路径
        barcodes_path = os.path.join(sample_dir, "barcodes.tsv")
        features_path = os.path.join(sample_dir, "features.tsv")
        matrix_path = os.path.join(sample_dir, "matrix.mtx")
        
        cell = pd.read_csv(barcodes_path, header=None)
        genes = pd.read_csv(features_path, sep='\t', header=None, comment='#')
        mtx = scipy.io.mmread(matrix_path)
        
        # 修正1：确保基因索引唯一
        gene_symbols = make_index_unique(genes[1])
        
        # 创建AnnData对象
        adata = anndata.AnnData(X=mtx.T.tocsr())  # 转换为稀疏矩阵并转置
        
        # 修正2：添加样本名前缀到细胞索引
        adata.obs.index = pd.Index([f"{sample}_{barcode}" for barcode in cell[0].values], 
                                  name='barcode')
        
        adata.var.index = gene_symbols
        adata.var.index.name = 'gene_symbol'
        adata.var['gene_ids'] = genes[0].values
        
        # 过滤数据
        print(f"过滤数据: {sample} (原始细胞数={adata.n_obs})")
        sc.pp.filter_cells(adata, min_genes=300)
        sc.pp.filter_genes(adata, min_cells=5)
        print(f"过滤后: {adata.n_obs}细胞, {adata.n_vars}基因")
        
        # 添加样本信息
        adata.obs['Sample'] = sample
        adata_list.append(adata)
        
    except Exception as e:
        print(f"处理样本 {sample} 失败: {str(e)}")
        import traceback
        traceback.print_exc()

# 4. 合并数据
if len(adata_list) == 0:
    print("无可合并的数据")
elif len(adata_list) == 1:
    all_data = adata_list[0]
    print("仅单个样本:", all_data)
else:
    print(f"\n合并 {len(adata_list)} 个样本数据...")
    
    # 提取样本名称列表
    batch_categories = [adata.obs['Sample'].iloc[0] for adata in adata_list]
    
    # 使用外部concat方法，避免索引问题
    all_data = anndata.concat(
        adata_list,
        join='outer',   # 使用外连接以包含所有基因
        merge='same',    # 保持非矩阵数据相同
        axis=0,         # 按行(细胞)合并
        label='Sample', # 保存原始样本的字段名
        keys=batch_categories, # 样本ID作为键
        index_unique='-' # 确保最终细胞索引唯一
    )
    
    print("合并完成!\n总数据:", all_data)
    print(f"总细胞数: {all_data.n_obs}, 总基因数: {all_data.n_vars}")


成功解压: GSE220661_RAW.tar 到 D:\data\GDK_sc_retina\raw_data_normal\GSE220661
样本文件已整理: GSM6808629
样本文件已整理: GSM6808632
样本文件已整理: GSM6808630
样本文件已整理: GSM6808633
样本文件已整理: GSM6808628
样本文件已整理: GSM6808631

处理样本: GSM6808628
已解压: barcodes.tsv.gz -> barcodes.tsv
已解压: features.tsv.gz -> features.tsv
已解压: matrix.mtx.gz -> matrix.mtx
读取10X数据: GSM6808628
过滤数据: GSM6808628 (原始细胞数=7984)
过滤后: 7973细胞, 19943基因

处理样本: GSM6808629
已解压: barcodes.tsv.gz -> barcodes.tsv
已解压: features.tsv.gz -> features.tsv
已解压: matrix.mtx.gz -> matrix.mtx
读取10X数据: GSM6808629
过滤数据: GSM6808629 (原始细胞数=6962)
过滤后: 6930细胞, 20173基因

处理样本: GSM6808630
已解压: barcodes.tsv.gz -> barcodes.tsv
已解压: features.tsv.gz -> features.tsv
已解压: matrix.mtx.gz -> matrix.mtx
读取10X数据: GSM6808630
过滤数据: GSM6808630 (原始细胞数=2860)
过滤后: 2847细胞, 18621基因

处理样本: GSM6808631
文件不存在: D:\data\GDK_sc_retina\raw_data_normal\GSE220661\GSM6808631\barcodes.tsv.gz
文件不存在: D:\data\GDK_sc_retina\raw_data_normal\GSE220661\GSM6808631\features.tsv.gz
文件不存在: D:\data\GDK_sc_retina\raw_dat

Traceback (most recent call last):
  File "C:\Users\zhyx\AppData\Local\Temp\ipykernel_29160\1977073458.py", line 131, in <module>
    cell = pd.read_csv(barcodes_path, header=None)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\tools\anaconda3\envs\scvi-env\Lib\site-packages\pandas\io\parsers\readers.py", line 1026, in read_csv
    return _read(filepath_or_buffer, kwds)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\tools\anaconda3\envs\scvi-env\Lib\site-packages\pandas\io\parsers\readers.py", line 620, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\tools\anaconda3\envs\scvi-env\Lib\site-packages\pandas\io\parsers\readers.py", line 1620, in __init__
    self._engine = self._make_engine(f, self.engine)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "d:\tools\anaconda3\envs\scvi-env\Lib\site-packages\pandas\io\parsers\readers.py", line 1880, in _make_engine
    self.han

合并完成!
总数据: AnnData object with n_obs × n_vars = 17750 × 21452
    obs: 'n_genes', 'Sample'
总细胞数: 17750, 总基因数: 21452


In [7]:
adata = all_data
adata

AnnData object with n_obs × n_vars = 17750 × 21452
    obs: 'n_genes', 'Sample'

In [8]:
import pandas as pd
import scanpy as sc
import numpy as np
# 2. 读取样本信息表
sample_info = pd.read_excel(r'D:\data\try\job\样本信息V1.xlsx')
# 3. 创建Sample-Barcode到GDK barcode的映射字典
barcode_mapping = sample_info.set_index("Sample-Barcode")["GDK barcode"].to_dict()

# 4. 替换adata.obs中的Sample列
adata.obs["Sample"] = adata.obs["Sample"].map(barcode_mapping)

# 5. 创建样本信息子表 (移除Sample-Barcode列)
sample_meta = sample_info.drop(columns=["Sample-Barcode"]).set_index("GDK barcode")

# 6. 将样本元数据添加到adata.obs中
adata.obs = adata.obs.join(sample_meta, on="Sample")

# 验证结果
print("样本信息整合完成！")
print(f"添加的样本信息列: {list(sample_meta.columns)}")
print(adata.obs.head())



样本信息整合完成！
添加的样本信息列: ['Barcode', 'Sample label', 'platform-seq', 'Sample-type/age', 'Species', 'seq-type', 'gender', 'age', 'ethnicity']
                                          n_genes        Sample    Barcode  \
barcode                                                                      
GSM6808628_AAACCCAAGGAACTCG-1-GSM6808628     2527  GDK000000104  GSE220661   
GSM6808628_AAACCCAAGGTAAGGA-1-GSM6808628     2525  GDK000000104  GSE220661   
GSM6808628_AAACCCAAGGTTGAGC-1-GSM6808628     2234  GDK000000104  GSE220661   
GSM6808628_AAACCCACAAGCGCTC-1-GSM6808628     2049  GDK000000104  GSE220661   
GSM6808628_AAACCCACAGTAGAAT-1-GSM6808628     6917  GDK000000104  GSE220661   

                                                                   Sample label  \
barcode                                                                           
GSM6808628_AAACCCAAGGAACTCG-1-GSM6808628  ES cell line-derived retinal organoid   
GSM6808628_AAACCCAAGGTAAGGA-1-GSM6808628  ES cell line-derived retin

In [9]:
adata.obs['Sample']

barcode
GSM6808628_AAACCCAAGGAACTCG-1-GSM6808628    GDK000000104
GSM6808628_AAACCCAAGGTAAGGA-1-GSM6808628    GDK000000104
GSM6808628_AAACCCAAGGTTGAGC-1-GSM6808628    GDK000000104
GSM6808628_AAACCCACAAGCGCTC-1-GSM6808628    GDK000000104
GSM6808628_AAACCCACAGTAGAAT-1-GSM6808628    GDK000000104
                                                ...     
GSM6808630_TTTGGAGGTGAGTTTC-1-GSM6808630    GDK000000106
GSM6808630_TTTGGAGTCCGACGGT-1-GSM6808630    GDK000000106
GSM6808630_TTTGGTTAGAGCCATG-1-GSM6808630    GDK000000106
GSM6808630_TTTGGTTCACTTCTCG-1-GSM6808630    GDK000000106
GSM6808630_TTTGGTTCAGCACAAG-1-GSM6808630    GDK000000106
Name: Sample, Length: 17750, dtype: object

In [10]:
import os
import scanpy as sc
import omicverse as ov

# 设置输出目录
output_dir = r"D:\data\GDK_sc_retina\GDK_samples"
os.makedirs(output_dir, exist_ok=True)

# 获取所有唯一GDK barcode（样本ID）
unique_samples = adata.obs['Sample'].unique()

# 遍历每个样本单独处理
for sample_id in unique_samples:
    print(f"\n正在处理样本: {sample_id}")
    
    # 提取当前样本的子集
    sample_adata = adata[adata.obs['Sample'] == sample_id].copy()
    
    print(f"原始细胞数: {sample_adata.n_obs}")
    
   
    # 构建输出文件名 (使用GDK barcode)
    output_path = os.path.join(output_dir, f"{sample_id}.h5ad")
    
    # 保存处理后的数据
    sample_adata.write(output_path)
    print(f"样本已保存至: {output_path}")

print("\n所有样本处理完成!")



正在处理样本: GDK000000104
原始细胞数: 7973
样本已保存至: D:\data\GDK_sc_retina\GDK_samples\GDK000000104.h5ad

正在处理样本: GDK000000105
原始细胞数: 6930
样本已保存至: D:\data\GDK_sc_retina\GDK_samples\GDK000000105.h5ad

正在处理样本: GDK000000106
原始细胞数: 2847
样本已保存至: D:\data\GDK_sc_retina\GDK_samples\GDK000000106.h5ad

所有样本处理完成!
