# H5AD Testing Notebook

In this notebook, we'll test the to_h5ad method and make sure it is working as intended, as well as keeping the original columns of our dummy dataframe

In [1]:
import pandas as pd 
import numpy as np
from bigcsv import BigCSV
import os 
import anndata as an
from scipy.sparse import csr_matrix

In [2]:
df = pd.DataFrame(index=[f'obs_{i}' for i in range(25)], columns=[f'col_{i}' for i in range(10)])

for i in range(25):
    df.iloc[i, :] = [i]*10

cols = df.columns 
obs = df.index 
df.index.name = 'index_col'

# df = an.AnnData(df.values)
# df.var.index = cols
# df.obs['name'] == 

df.to_csv('test_data.csv')
pd.read_csv('test_data.csv').head()

Unnamed: 0,index_col,col_0,col_1,col_2,col_3,col_4,col_5,col_6,col_7,col_8,col_9
0,obs_0,0,0,0,0,0,0,0,0,0,0
1,obs_1,1,1,1,1,1,1,1,1,1,1
2,obs_2,2,2,2,2,2,2,2,2,2,2
3,obs_3,3,3,3,3,3,3,3,3,3,3
4,obs_4,4,4,4,4,4,4,4,4,4,4


In [3]:
# converter = BigCSV(
#     file='test_data.csv',
#     outfile='test_data.h5ad',
#     insep=',',
#     chunkfolder='chunks/',
#     save_chunks=True,
#     chunksize=10,
# )

In [4]:
# converter.to_h5ad(index_col='index_col')

In [22]:
chunksize=5
file='test_data.csv'

with open(file) as f:
    lines = len(f.readlines()) - 1

index_col='index_col'
sparsify=True

num_chunks = lines//chunksize + int(lines % chunksize == 0)
chunkified = pd.read_csv(file, chunksize=chunksize, index_col=index_col)

anndatas = []
for chunk, data in zip(range(0, num_chunks + 1), chunkified):
    print(f'Working on chunk {chunk}/{num_chunks}')
    
    if sparsify:
        df = an.AnnData(
            X=csr_matrix(data.values),
        )
        df.var.index = data.columns.values 
        df.obs.index = data.index.values

    else:
        df = an.AnnData(data)
        print(df.obs)
        print(df.var)

    anndatas.append(df)

Working on chunk 0/6
Working on chunk 1/6
Working on chunk 2/6
Working on chunk 3/6
Working on chunk 4/6


In [23]:
df = an.concat(anndatas)

In [29]:
orig = pd.read_csv('test_data.csv', index_col='index_col')


In [31]:
df.var.index == orig.columns

array([ True,  True,  True,  True,  True,  True,  True,  True,  True,
        True])

In [12]:
total = pd.read_csv(
    '../../organoid-classification/data/retina/raw_matrix.tsv', 
    sep='\t', 
    index_col='gene', 
#     dtype=np.float32
)

In [13]:
all(total.index == df.obs['gene'])

True

In [23]:
res = np.linalg.norm(df.X.todense() - total.values)
res

0.0

In [24]:
total

Unnamed: 0_level_0,0024369980fd003553cbc9dfe29f7f95,002f621cb1c922521e488d9cbf8b707d,0037f1b36684cb59b84d3585ca55ff69,00390952646f52d11a9ab9bba7d6ac51,005b3351658380695a5dc46c384d72d7,008ef517b5adc96fcfe041bebf820658,00906832f470fc434a52ac7d678a95bc,00a9aedfbe8af54deb525fedc0261060,0122c7064395ffed6091c6577babaed4,0136e826d795619b4004b2c147101ad0,...,ff61c0c282f41e4a37885b05342441da,ff686d05b08ace5f171d6a68f2317a83,ff758dccecbdbc1775c903208d9e90fd,ff7ff693256ca29698f8fd8fe5606116,ffa1993eb6098fa1f4c9b8472658cbd9,ffa4633bef82949d2c6ac17b3ddf46e9,ffa5758b0600f47722fdc755444dfe0c,ffd3fd6119de767f3c3b8c47b2c28bf0,ffe21ae45a7fad28db64783d0697fc4f,fffd3d11e75c6c544e6741967b3edb7a
gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ENSG00000000003|TSPAN6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000000419|DPM1,0,0,0,0,0,1,0,1,0,0,...,0,1,0,0,1,0,1,0,0,1
ENSG00000000457|SCYL3,0,1,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000000460|C1orf112,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
ENSG00000000938|FGR,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ENSG00000284738|AL358472.5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000284740|AL645728.2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000284744|AL591163.1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
ENSG00000284747|AL034417.4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
all(pd.read_csv('../../organoid-classification/data/retina/raw_labels.tsv', sep='\t')['index'] == total.columns)

True

In [28]:
for x in idk:
    print(len(x))

3
3
3
3
3
3
3
3
1
