# Adding an expression matrix to the CHARTS HDF5 database

This is the first step required to run the CHARTS pipeline on a new dataset.

### Load the toy dataset from the file, "GSE70630_MGH36_MGH53.tsv".

In [1]:
import os
from os.path import join

import h5py
import numpy as np
import pandas as pd

# Read the tab-separated toy expression matrix. The first column of the file
# becomes the row index of the resulting frame.
print("Loading data...")
df = pd.read_csv('./GSE70630_MGH36_MGH53.tsv', index_col=0, sep='\t')

# Leave the frame as the cell's final expression so the notebook renders it.
df

Loading data...


Unnamed: 0,'A1BG','A1BG-AS1','A1CF','A2M','A2M-AS1','A2ML1','A2MP1','A4GALT','A4GNT','AA06',...,'ZWINT','ZXDA','ZXDB','ZXDC','ZYG11A','ZYG11B','ZYX','ZZEF1','ZZZ3',tumor
MGH36_P6_A12,0.0,0.000000,0.000000,6.240009,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,1.702933,1.324423,0.000000,2.815442,3.059183,0.0,MGH36
MGH36_P6_H09,0.0,0.000000,0.000000,5.335641,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.722701,0.000000,3.625432,4.465193,0.000000,0.0,MGH36
MGH36_P10_G12,0.0,0.000000,0.000000,6.185816,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,2.540779,0.000000,2.666543,0.000000,1.115133,0.0,MGH36
MGH36_P6_B07,0.0,0.000000,0.000000,8.071685,4.521649,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,1.568619,2.310552,4.824850,0.000000,0.385260,0.0,MGH36
MGH36_P10_B12,0.0,0.000000,1.688248,7.444788,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,1.597364,0.000000,1.040277,1.437462,0.000000,1.950180,0.470003,0.0,MGH36
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
MGH53_P4_G08,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.000000,5.137924,2.163324,1.214924,0.810948,5.500930,0.000000,0.0,MGH53
MGH53_P4_G09,0.0,0.000000,0.254643,1.504079,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.657507,0.824185,1.547571,0.000000,0.000000,0.0,MGH53
MGH53_P4_H10,0.0,5.060341,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.955497,1.054312,1.175564,0.000000,0.000000,0.0,MGH53
MGH53_P4_H11,0.0,0.000000,0.000000,1.238385,0.000000,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.000000,1.175564,1.345484,1.519512,0.000000,0.000000,0.0,MGH53


### Set location of the CHARTS database

This is the path to the "charts.h5" HDF5 database; change it to the file's location on your machine.

In [2]:
# Path to the CHARTS HDF5 database ("charts.h5"). Set the CHARTS_H5_LOC
# environment variable to point at your own copy of the file; when it is
# unset, the original hard-coded location below is used.
h5_loc = os.environ.get(
    'CHARTS_H5_LOC',
    '/Users/matthewbernstein/Development/charts/charts_db_mod/charts.h5'
)

### Add tumor MGH36 to the CHARTS database

Because MGH36 is already in the database and we are only doing this as a demonstration, we will rename the tumor to "My_Tumor_1" to emphasize that this is a user-provided tumor.

In [5]:
tumor_1_name = 'My_Tumor_1' # Name of the tumor

# Keep only the rows labelled MGH36. The 'tumor' label column is dropped by
# name so that only the gene columns are written to the database.
df_tumor1 = df.loc[df['tumor'] == 'MGH36'].drop(columns='tumor')
if df_tumor1.empty:
    # Fail before touching the database so an existing entry is not deleted
    # and replaced with empty datasets.
    raise ValueError("No expression data found for tumor 'MGH36'")

with h5py.File(h5_loc, 'r+') as f:
    # Open the 'per_tumor' group, creating it only when it does not exist.
    # Unlike a blanket try/except, this still surfaces real errors (for
    # example, 'per_tumor' existing but not being a group).
    per_tumor = f.require_group('per_tumor')

    # Remove any previous copy of this tumor so the cell can be re-run.
    if tumor_1_name in per_tumor:
        del per_tumor[tumor_1_name]

    tumor_group = per_tumor.create_group(tumor_1_name)

    # Expression matrix: one row per entry in 'cell', one column per entry
    # in 'gene_name'.
    tumor_group.create_dataset(
        'log1_tpm',
        data=np.array(df_tumor1),
        compression="gzip"
    )
    # Row labels (the frame's index), stored as UTF-8 byte strings.
    tumor_group.create_dataset(
        'cell',
        data=np.array([
            x.encode('utf-8')
            for x in df_tumor1.index
        ]),
        compression="gzip"
    )
    # Column labels (the gene columns), stored as UTF-8 byte strings.
    tumor_group.create_dataset(
        'gene_name',
        data=np.array([
            x.encode('utf-8')
            for x in df_tumor1.columns
        ]),
        compression="gzip"
    )

### Add tumor MGH53 to the CHARTS database

Because MGH53 is already in the database and we are only doing this as a demonstration, we will rename the tumor to "My_Tumor_2" to emphasize that this is a user-provided tumor.

In [7]:
tumor_2_name = 'My_Tumor_2' # Name of the tumor

# Keep only the rows labelled MGH53. The 'tumor' label column is dropped by
# name so that only the gene columns are written to the database.
df_tumor2 = df.loc[df['tumor'] == 'MGH53'].drop(columns='tumor')
if df_tumor2.empty:
    # Fail before touching the database so an existing entry is not deleted
    # and replaced with empty datasets.
    raise ValueError("No expression data found for tumor 'MGH53'")

with h5py.File(h5_loc, 'r+') as f:
    # Open the 'per_tumor' group, creating it only when it does not exist.
    # Unlike a blanket try/except, this still surfaces real errors (for
    # example, 'per_tumor' existing but not being a group).
    per_tumor = f.require_group('per_tumor')

    # Remove any previous copy of this tumor so the cell can be re-run.
    if tumor_2_name in per_tumor:
        del per_tumor[tumor_2_name]

    tumor_group = per_tumor.create_group(tumor_2_name)

    # Expression matrix: one row per entry in 'cell', one column per entry
    # in 'gene_name'.
    tumor_group.create_dataset(
        'log1_tpm',
        data=np.array(df_tumor2),
        compression="gzip"
    )
    # Row labels (the frame's index), stored as UTF-8 byte strings.
    tumor_group.create_dataset(
        'cell',
        data=np.array([
            x.encode('utf-8')
            for x in df_tumor2.index
        ]),
        compression="gzip"
    )
    # Column labels (the gene columns), stored as UTF-8 byte strings.
    tumor_group.create_dataset(
        'gene_name',
        data=np.array([
            x.encode('utf-8')
            for x in df_tumor2.columns
        ]),
        compression="gzip"
    )