In [10]:
import h5py
import pandas
import scanpy
import numpy

In [2]:
def read_barcodes(h5):
    """Read the barcode mapping stored in an .h5 count-matrix file.

    Parameters
    ----------
    h5 : str or path-like
        Path to an HDF5 file that contains the datasets ``matrix/barcodes``
        and ``matrix/observations/original_barcodes``.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``barcodes`` and ``original_barcodes``, with one row per
        stored entry, in the order the entries appear in the file.
    """
    def _to_text(values):
        # Entries are expected to be bytes; anything already decoded to str
        # is passed through unchanged instead of failing on .decode().
        return [v.decode() if isinstance(v, bytes) else v for v in values]

    # Open read-only and guarantee the handle is closed on exit. Both
    # datasets are read completely inside the block, so nothing returned
    # from this function refers to the closed file.
    with h5py.File(h5, 'r') as h5_file:
        matrix = h5_file['matrix']
        barcodes = _to_text(matrix['barcodes'][:])
        original_barcodes = _to_text(matrix['observations']['original_barcodes'][:])

    return pandas.DataFrame(
        {'barcodes' : barcodes,
         'original_barcodes' : original_barcodes}
    )

In [6]:
# Input files for sample GSM5123955, resolved relative to the working directory:
# `h5` is the RNA counts file (HDF5), `adt` is the ADT counts table (gzipped CSV).
h5 = 'GSM5123955_X066-RP0C1W1_leukopak_perm-cells_cite_200M_rna_counts.h5'
adt = 'GSM5123955_X066-RP0C1W1_leukopak_perm-cells_cite_48M_adt_counts.csv.gz'

In [3]:
# Barcode lookup table with 'barcodes' and 'original_barcodes' columns, read from the .h5 file.
bc_convert = read_barcodes(h5)

In [48]:
# Load the ADT counts table from the gzipped CSV.
adt_df = pandas.read_csv(adt)

Keep only the ADT rows whose barcodes are also in the .h5 file:

In [49]:
# Keep only rows whose original barcode also appears in the .h5 file.
# .copy() makes the result an independent frame, so adding the 'barcodes'
# column in a later cell does not raise pandas' SettingWithCopyWarning.
in_h5 = adt_df['cell_barcode'].isin(bc_convert['original_barcodes'])
adt_df = adt_df.loc[in_h5, :].copy()

Convert from original to new barcodes using a dictionary:

In [50]:
# Lookup table: original barcode -> new barcode (the 'barcodes' column of bc_convert).
bc_dict = {
    original: updated
    for original, updated in zip(bc_convert['original_barcodes'], bc_convert['barcodes'])
}

In [51]:
# Look up the new barcode for every ADT row. assign() returns a new frame
# instead of writing into adt_df, which may be a filtered slice of the raw
# table, so no SettingWithCopyWarning is raised. A barcode missing from
# bc_dict still raises KeyError rather than silently becoming NaN.
adt_df = adt_df.assign(barcodes=[bc_dict[x] for x in adt_df['cell_barcode']])

Use the updated barcodes as the index and reorder the rows to match the cell order in the .h5 file:

In [52]:
# Index the table by the new barcodes, then select the rows in the order
# the barcodes appear in bc_convert.
adt_df = adt_df.set_index('barcodes').loc[bc_convert['barcodes'], :]

Drop the old barcodes and the 'total' column for use in analysis

In [55]:
# Remove the original-barcode column and the 'total' column.
adt_df = adt_df.drop(columns=['cell_barcode', 'total'])

In [56]:
# Preview the first rows of the reordered ADT table.
adt_df.head()

Unnamed: 0_level_0,CD10,CD11b,CD11c,CD123,CD127,CD14,CD141,CD16,CD172a,CD185,...,FceRI,HLA-DR,IgD,IgG1-K-Isotype-Control,IgM,KLRG1,TCR-Va24-Ja18,TCR-Va7.2,TCR-a/b,TCR-g/d
barcodes,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
195d17cc4f8711eb841542010a19c80f,6,2,243,29,0,272,60,54,136,0,...,5,30,0,0,3,23,10,0,11,80
195d18b24f8711eb841542010a19c80f,2,9,558,9,0,169,55,3,235,0,...,8,187,1,3,5,13,6,2,9,72
195d19204f8711eb841542010a19c80f,2,0,2,20,0,8,2,2,9,18,...,2,203,28,1,19,10,2,0,15,74
195d19844f8711eb841542010a19c80f,2,6,417,9,0,319,26,5,275,1,...,1,489,2,0,3,14,8,0,8,43
195d19e84f8711eb841542010a19c80f,0,0,2,3,3,3,1,6,3,0,...,1,1,0,1,1,7,4,0,15,40


Save the updated version for later use

In [57]:
# Write the table to CSV; the 'barcodes' index is written as the first column (to_csv default).
adt_df.to_csv('GSM5123955_X066-RP0C1W1_leukopak_perm-cells_cite_48M_adt_counts_fixed.csv')