In [1]:
import pandas as pd
import numpy as np
import gc
from scipy.io import mmread,mmwrite
from scipy.sparse import csr_matrix

In [2]:
# genes.tsv and barcodes.tsv are read with header=None, so their columns are
# addressed positionally (0, 1, ...) in later cells.
df_genes = pd.read_csv(
    '../data/raw/pbmc/10x/68k_pbmc/genes.tsv',
    sep='\t',
    header=None,
)
df_barcodes = pd.read_csv(
    '../data/raw/pbmc/10x/68k_pbmc/barcodes.tsv',
    sep='\t',
    header=None,
)
# Annotation table fetched from the 10XGenomics single-cell-3prime-paper
# repository; read with its own header row (later cells use the `barcodes`
# and `celltype` columns).
df_mapping = pd.read_csv(
    'https://raw.githubusercontent.com/10XGenomics/single-cell-3prime-paper/master/pbmc68k_analysis/68k_pbmc_barcodes_annotation.tsv',
    sep='\t',
)

In [3]:
# Read the Matrix Market file into memory; the result is wrapped in a sparse
# DataFrame in the next cell.
matrix = mmread('../data/raw/pbmc/10x/68k_pbmc/matrix.mtx')

In [4]:
# Wrap the SciPy sparse matrix in a DataFrame with sparse columns so it can be
# filtered with pandas indexing. Row and column labels are positional integers.
df_matrix = pd.DataFrame.sparse.from_spmatrix(matrix)

In [5]:
# (rows, columns). In the recorded run the row count equals len(df_genes) and
# the column count equals len(df_mapping) — presumably genes x cells.
df_matrix.shape

(32738, 68579)

In [6]:
# Preview the first five rows; both axes still carry positional integer labels.
df_matrix.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,68569,68570,68571,68572,68573,68574,68575,68576,68577,68578
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
# Preview the barcode strings; the single unnamed column is addressed as 0.
df_barcodes.head()

Unnamed: 0,0
0,AAACATACACCCAA-1
1,AAACATACCCCTCA-1
2,AAACATACCGGAGA-1
3,AAACATACTAACCG-1
4,AAACATACTCTTCA-1


In [8]:
# Sanity check: the row count should equal the number of matrix rows.
df_genes.shape

(32738, 2)

In [9]:
# Sanity check: the row count should equal the number of matrix columns.
df_mapping.shape

(68579, 4)

In [10]:
# Row position in barcodes.tsv -> barcode string; used later to relabel the
# matrix columns.
dict_barcode_cell = df_barcodes[0].to_dict()

In [11]:
# Row position in genes.tsv -> value of its second column; used later to
# relabel the matrix rows.
dict_gene_index = df_genes[1].to_dict()

In [12]:
# Number of annotated barcodes per cell type (class balance before filtering).
df_mapping.celltype.value_counts()

CD8+ Cytotoxic T                20773
CD8+/CD45RA+ Naive Cytotoxic    16666
CD56+ NK                         8776
CD4+/CD25 T Reg                  6187
CD19+ B                          5908
CD4+/CD45RO+ Memory              3061
CD14+ Monocyte                   2862
Dendritic                        2099
CD4+/CD45RA+/CD25- Naive T       1873
CD34+                             277
CD4+ T Helper2                     97
Name: celltype, dtype: int64

In [13]:
# Force a garbage-collection pass before the memory-heavy filtering below.
# The displayed integer is gc.collect()'s return value.
gc.collect()

40

### Cell Filtering

In [14]:
# Per column: how many entries are >= 3. Entries below the threshold are
# masked to NaN first, so count() tallies only the entries that pass.
# Despite the name, this is a count, not a sum.
colSum = df_matrix.where(df_matrix >= 3).count().to_frame()

In [15]:
# l1: column labels at or above the 0.1% quantile of colSum.
# l2: column labels at or below the 100% quantile (the maximum), so this
# upper cut does not exclude anything as written.
l1 = set(colSum.index[colSum[0].ge(colSum[0].quantile(0.001))].tolist())
l2 = set(colSum.index[colSum[0].le(colSum[0].quantile(1))].tolist())

In [16]:
# Number of columns that fail the quantile filter and will be dropped.
len(colSum)-len(l1.intersection(l2))

64

In [17]:
# Keep the columns that pass both quantile cuts. The labels are sorted because
# set iteration order is unspecified; sorting keeps the surviving columns in
# their original positional order, so the files written at the end of the
# notebook follow the input barcode order deterministically.
cell_filtered_matrix = df_matrix[sorted(l1.intersection(l2))]

In [18]:
# Same rows as df_matrix; fewer columns after the quantile filter.
cell_filtered_matrix.shape

(32738, 68515)

In [19]:
# Force a garbage-collection pass after the column filter.
# The displayed integer is gc.collect()'s return value.
gc.collect()

40

### Gene Filtering

In [20]:
# Per row: how many of the retained columns hold a value > 2. Entries at or
# below the threshold are masked to NaN so count() skips them.
# Despite the name, this is a count, not a sum.
rowSum = cell_filtered_matrix.where(cell_filtered_matrix > 2).count(axis=1).to_frame()

In [21]:
# Keep only the rows whose tally in rowSum exceeds 3.
filtered_df = cell_filtered_matrix[
    cell_filtered_matrix.index.isin(rowSum.index[rowSum[0].gt(3)])
]

### Write Filtered DF

In [22]:
# Relabel columns from positional integers to barcode strings.
# rename() returns a new frame and leaves unknown labels untouched, so
# re-running this cell is a no-op instead of raising KeyError on labels that
# have already been converted.
filtered_df = filtered_df.rename(columns=dict_barcode_cell)

In [23]:
# Relabel rows from positional integers to the names taken from genes.tsv.
# rename() returns a new frame and leaves unknown labels untouched, so
# re-running this cell is a no-op instead of raising KeyError on labels that
# have already been converted.
filtered_df = filtered_df.rename(index=dict_gene_index)

In [24]:
# Force a garbage-collection pass before writing the outputs.
# The displayed integer is gc.collect()'s return value.
gc.collect()

60

In [25]:
# Preview after relabelling: columns are barcode strings and rows carry the
# names taken from the second column of genes.tsv.
filtered_df.head()

Unnamed: 0,AAACATACACCCAA-1,AAACATACCCCTCA-1,AAACATACCGGAGA-1,AAACATACTAACCG-1,AAACATACTCTTCA-1,AAACATACTGGATC-1,AAACATACTGTCTT-1,AAACATACTTCTAC-1,AAACATTGCTGCTC-1,AAACATTGCTTCGC-1,...,TTTGACTGCTTTAC-8,TTTGACTGTATCGG-8,TTTGACTGTCGTTT-8,TTTGACTGTGCTAG-8,TTTGCATGACACCA-8,TTTGCATGAGCCTA-8,TTTGCATGCTAGCA-8,TTTGCATGCTGCAA-8,TTTGCATGGCTCCT-8,TTTGCATGTGGTAC-8
NOC2L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
HES4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ISG15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TNFRSF18,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
TNFRSF4,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Write one cell-type label per retained column, in the same order as the
# columns of filtered_df. Reindexing by barcode (instead of filtering
# df_mapping with isin) keeps celltype.csv row-aligned with barcodes.csv and
# matrix.mtx even if the annotation file lists barcodes in a different order.
# A barcode missing from the annotation yields an empty row rather than
# silently shifting every following label.
(
    df_mapping
    .set_index('barcodes')
    .reindex(filtered_df.columns)[['celltype']]
    .to_csv('../data/processed/pbmc68k/celltype.csv', index=False)
)

In [27]:
# Persist the row labels of the filtered matrix as a one-column CSV.
filtered_df.index.to_frame(index=False).to_csv(
    '../data/processed/pbmc68k/genes.csv',
    index=False,
)

In [28]:
# Persist the column labels (barcodes) of the filtered matrix as a one-column CSV.
filtered_df.columns.to_frame(index=False).to_csv(
    '../data/processed/pbmc68k/barcodes.csv',
    index=False,
)

In [29]:
# Final dimensions: rows kept by the gene filter x columns kept by the cell filter.
filtered_df.shape

(6977, 68515)

In [30]:
# Write the filtered matrix in Matrix Market format.
# The frame is converted through the pandas sparse accessor rather than
# `.values`, which would first materialise the whole matrix as a dense array
# (several GB at this shape) only to re-sparsify it. Wrapping the result in
# csr_matrix keeps the same matrix type being handed to mmwrite as before.
mmwrite('../data/processed/pbmc68k/matrix.mtx', csr_matrix(filtered_df.sparse.to_coo()))