# Cancer Cell Lines Data Testing Suite

### Let's test the functions and outputs in the `cancer_cell_lines_data.ipynb` notebook.

In [1]:
%%capture
%run cancer_cell_lines_data.ipynb

In [66]:
# pandas.testing is the public home of these assertion helpers;
# pandas.util.testing was deprecated in pandas 1.0 and later removed.
from pandas.testing import assert_frame_equal
from pandas.testing import assert_series_equal

### Test Data Imports and Preparation

In [2]:
def test_prepare_dataset(ds):
    return prepare_dataset(ds)

# Run the wrapper on the COAD counts CSV; the returned value is shown as the cell output.
test_prepare_dataset('datasets/COAD_hgnc.csv')

Unnamed: 0,hgnc_symbol,X00106523.5b1d.44ad.a9f1.7d84db08722c.htseq.counts.gz,X00589871.e54f.492f.988f.502670edd606.htseq.counts.gz,X00cc9b4d.a847.464e.979a.7751e1a87ae3.htseq.counts.gz,X00f768f9.9e6c.4e84.bdba.c19368f7e522.htseq.counts.gz,X0134d0cc.e66b.4fda.804b.c4434ec00bd2.htseq.counts.gz,X020aa019.a3a4.4055.92ee.be824a597501.htseq.counts.gz,X02a81bc3.4672.4d2f.808d.d27f4b63bf85.htseq.counts.gz,X02ddfa3d.13b3.4624.994d.62e740fa4a3d.htseq.counts.gz,X03109baa.936f.49ae.acb3.2d00ac03c7ab.htseq.counts.gz,...,fabefb10.5546.4017.8ea1.29982a10fb3c.htseq.counts.gz,fc477ec6.19af.4eb7.91cf.a8a16392f034.htseq.counts.gz,fc5dea3c.dfa2.4d52.ac5a.49335d6d5eaf.htseq.counts.gz,fc822b3c.b73f.4529.960b.e3bdfcff80ed.htseq.counts.gz,fcb5f382.f06f.4885.8ea1.c0699b408838.htseq.counts.gz,fd40f0e5.0825.409b.8a4c.86da09f0a84c.htseq.counts.gz,fda5b844.1fa4.462e.bd3b.482025f6ee30.htseq.counts.gz,fdd24a0e.f3ef.4a5a.87cf.f99884b90fb9.htseq.counts.gz,fe898d64.faa4.452c.b8ef.cfec7250a286.htseq.counts.gz,ff8de5e6.76c5.491a.8fac.41492723b780.htseq.counts.gz
0,A1BG,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,A1BG-AS1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,A1CF,4257,1604,132,5425,1155,3972,2906,4103,1763,...,4069,2523,851,2478,1181,1081,5392,3540,2737,954
3,A2M,183,39,14,221,20,134,44,90,10,...,92,102,6,289,88,22,87,67,32,10
4,A2M-AS1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37333,ZYG11AP1,24,1,0,12,2,22,0,9,2,...,17,23,0,21,3,0,19,44,22,3
37334,ZYG11B,1646,1237,1097,3918,1115,2751,3616,1165,691,...,2641,3569,1117,3304,2102,2210,3389,2932,3984,1512
37335,ZYX,624,128,93,2609,91,687,534,156,185,...,325,555,74,2169,58,242,537,310,2565,190
37336,ZYXP1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Test TPM calculation

1. Test the gene-length calculation. If this test passes, the dataframe will show the same number in every column of a given row.

2. Test the creation of the rpk table

3. Test the calculation of rpk with the rpk function

4. Test the total-reads calculation. This checks that the sum of each column is being taken.

5. Test the per-million table. This checks that the per-million scaling factor is being taken and that the final TPM step is being calculated properly.

In [3]:
def test_calculate_gene_length(table):
    missing_genes_dict = {'C12orf74':'PLEKHG7',
                     'LINC00856':'LINC00595'}

    precalculated_gene_length_dict = {'CCL3L1': 3.090, 'RP11-34P13.7': 2.748, 
                                      'RP11-34P13.15':0.755, 'RP11-34P13.14':0.323,
                                      'RP11-34P13.13':1.301, 'RP11-34P13.9':0.457,
                                      'RP4-669L17.10':0.457, 'RP4-669L17.8':1.239,
                                      'RP4-669L17.4':2.017, 'RP4-669L17.2':0.324,
                                      'RP5-857K21.15':2.3, 'RP4-669L17.1':0.385,
                                      'RP5-857K21.1':0.845, 'RP5-857K21.2':123.115,
                                      'RP5-857K21.3':1.543, 'RP5-857K21.4':0.413,
                                      'RP5-857K21.5':1.634, 'hsa-mir-6723':1.543,
                                      'RP5-857K21.7':0.682, 'RP5-857K21.11':0.547,
                                      'RP11-206L10.1':13.770, 'RP11-206L10.3':4.860,
                                      'RP11-206L10.5':9.283, 'RP11-206L10.4':7.604, 'RP11-206L10.2':13.770,
                                      'RP11-206L10.9':8.204, 'RP11-206L10.8':2.746, 'RP11-206L10.10':2.823,
                                      'RP11-206L10.11':1.079, 'RP11-54O7.16':0.351, 'RP11-54O7.1':3.043,
                                      'RP11-54O7.2':0.156, 'RP11-54O7.3':1.389, 'C1orf170':3.035}

    table_numpy = table.to_numpy(copy=True)
    
    
    ## -------- FOR CALCULATING RPK ----------
    rpk_table = table_numpy
    gene_length = 0
    
    for index, value in np.ndenumerate(table_numpy):

        if index[1] == 0:
            gene = value
            if gene in precalculated_gene_length_dict:
                gene_length = precalculated_gene_length_dict.get(value)
            else:
                if gene in missing_genes_dict:
                    gene = missing_genes_dict.get(value)
                gene_length = find_gene_length_ensembl(gene)
            

        else:
            rpk_table[index[0], index[1]] = gene_length
            
    return pd.DataFrame(rpk_table)

# Gene-length check on coad_all (not defined in the visible cells; presumably created by the %run above).
test_calculate_gene_length(coad_all)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,512,513,514,515,516,517,518,519,520,521
0,A1BG,8.314,8.314,8.314,8.314,8.314,8.314,8.314,8.314,8.314,...,8.314,8.314,8.314,8.314,8.314,8.314,8.314,8.314,8.314,8.314
1,A1BG-AS1,7.737,7.737,7.737,7.737,7.737,7.737,7.737,7.737,7.737,...,7.737,7.737,7.737,7.737,7.737,7.737,7.737,7.737,7.737,7.737
2,A1CF,86.266,86.266,86.266,86.266,86.266,86.266,86.266,86.266,86.266,...,86.266,86.266,86.266,86.266,86.266,86.266,86.266,86.266,86.266,86.266
3,A2M,48.565,48.565,48.565,48.565,48.565,48.565,48.565,48.565,48.565,...,48.565,48.565,48.565,48.565,48.565,48.565,48.565,48.565,48.565,48.565
4,A2M-AS1,3.526,3.526,3.526,3.526,3.526,3.526,3.526,3.526,3.526,...,3.526,3.526,3.526,3.526,3.526,3.526,3.526,3.526,3.526,3.526
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37333,ZYG11AP1,0.993,0.993,0.993,0.993,0.993,0.993,0.993,0.993,0.993,...,0.993,0.993,0.993,0.993,0.993,0.993,0.993,0.993,0.993,0.993
37334,ZYG11B,100.883,100.883,100.883,100.883,100.883,100.883,100.883,100.883,100.883,...,100.883,100.883,100.883,100.883,100.883,100.883,100.883,100.883,100.883,100.883
37335,ZYX,9.816,9.816,9.816,9.816,9.816,9.816,9.816,9.816,9.816,...,9.816,9.816,9.816,9.816,9.816,9.816,9.816,9.816,9.816,9.816
37336,ZYXP1,0.117,0.117,0.117,0.117,0.117,0.117,0.117,0.117,0.117,...,0.117,0.117,0.117,0.117,0.117,0.117,0.117,0.117,0.117,0.117


In [4]:
def test_calculate_rpk_table(table):
    missing_genes_dict = {'C12orf74':'PLEKHG7',
                     'LINC00856':'LINC00595'}

    precalculated_gene_length_dict = {'CCL3L1': 3.090}

    table_numpy = table.to_numpy(copy=True)
    
    
    ## -------- FOR CALCULATING RPK ----------
    rpk_table = table_numpy
    gene_length = 0
    
    for index, value in np.ndenumerate(table_numpy):

        if index[1] == 0:
            gene = value
            if gene in precalculated_gene_length_dict:
                gene_length = precalculated_gene_length_dict.get(value)
            else:
                if gene in missing_genes_dict:
                    gene = missing_genes_dict.get(value)
                gene_length = find_gene_length_ensembl(gene)
            

        else:
            rpk_table[index[0], index[1]] = calculate_rpk(value, gene_length)
     
    # compare original table with the resultant rpk values
    display (table)
    return pd.DataFrame(rpk_table)

# RPK check on coad_all: the raw table is displayed first, then the RPK table is the cell output.
test_calculate_rpk_table(coad_all)

Unnamed: 0,hgnc_symbol,X00106523.5b1d.44ad.a9f1.7d84db08722c.htseq.counts.gz,X00589871.e54f.492f.988f.502670edd606.htseq.counts.gz,X00cc9b4d.a847.464e.979a.7751e1a87ae3.htseq.counts.gz,X00f768f9.9e6c.4e84.bdba.c19368f7e522.htseq.counts.gz,X0134d0cc.e66b.4fda.804b.c4434ec00bd2.htseq.counts.gz,X020aa019.a3a4.4055.92ee.be824a597501.htseq.counts.gz,X02a81bc3.4672.4d2f.808d.d27f4b63bf85.htseq.counts.gz,X02ddfa3d.13b3.4624.994d.62e740fa4a3d.htseq.counts.gz,X03109baa.936f.49ae.acb3.2d00ac03c7ab.htseq.counts.gz,...,fabefb10.5546.4017.8ea1.29982a10fb3c.htseq.counts.gz,fc477ec6.19af.4eb7.91cf.a8a16392f034.htseq.counts.gz,fc5dea3c.dfa2.4d52.ac5a.49335d6d5eaf.htseq.counts.gz,fc822b3c.b73f.4529.960b.e3bdfcff80ed.htseq.counts.gz,fcb5f382.f06f.4885.8ea1.c0699b408838.htseq.counts.gz,fd40f0e5.0825.409b.8a4c.86da09f0a84c.htseq.counts.gz,fda5b844.1fa4.462e.bd3b.482025f6ee30.htseq.counts.gz,fdd24a0e.f3ef.4a5a.87cf.f99884b90fb9.htseq.counts.gz,fe898d64.faa4.452c.b8ef.cfec7250a286.htseq.counts.gz,ff8de5e6.76c5.491a.8fac.41492723b780.htseq.counts.gz
0,A1BG,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,A1BG-AS1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,A1CF,4257,1604,132,5425,1155,3972,2906,4103,1763,...,4069,2523,851,2478,1181,1081,5392,3540,2737,954
3,A2M,183,39,14,221,20,134,44,90,10,...,92,102,6,289,88,22,87,67,32,10
4,A2M-AS1,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37333,ZYG11AP1,24,1,0,12,2,22,0,9,2,...,17,23,0,21,3,0,19,44,22,3
37334,ZYG11B,1646,1237,1097,3918,1115,2751,3616,1165,691,...,2641,3569,1117,3304,2102,2210,3389,2932,3984,1512
37335,ZYX,624,128,93,2609,91,687,534,156,185,...,325,555,74,2169,58,242,537,310,2565,190
37336,ZYXP1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,512,513,514,515,516,517,518,519,520,521
0,A1BG,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,A1BG-AS1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,A1CF,49.3474,18.5937,1.53015,62.8869,13.3888,46.0436,33.6865,47.5622,20.4368,...,47.1681,29.2467,9.86484,28.7251,13.6902,12.531,62.5043,41.0359,31.7274,11.0588
3,A2M,3.76815,0.803047,0.288273,4.5506,0.411819,2.75919,0.906002,1.85319,0.20591,...,1.89437,2.10028,0.123546,5.95079,1.812,0.453001,1.79141,1.37959,0.658911,0.20591
4,A2M-AS1,0,0,0,0,0,0,0,0.283607,0,...,0,0,0,0,0,0,0.283607,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37333,ZYG11AP1,24.1692,1.00705,0,12.0846,2.0141,22.1551,0,9.06344,2.0141,...,17.1198,23.1621,0,21.148,3.02115,0,19.1339,44.3102,22.1551,3.02115
37334,ZYG11B,16.3159,12.2617,10.874,38.8371,11.0524,27.2692,35.8435,11.548,6.84952,...,26.1788,35.3776,11.0722,32.7508,20.836,21.9066,33.5934,29.0634,39.4913,14.9877
37335,ZYX,63.5697,13.0399,9.47433,265.791,9.27058,69.9878,54.401,15.8924,18.8468,...,33.1092,56.5403,7.53871,220.966,5.90872,24.6536,54.7066,31.5811,261.308,19.3562
37336,ZYXP1,0,0,0,8.54701,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
def test_calculate_rpk(sample_count, gene_length, expected):
    test = calculate_rpk(sample_count, gene_length)
    return test - expected

# Spot-check ZYG11B in the second sample column, using the values displayed in
# the tables above: 1237 reads, gene length 100.883, expected RPK 12.2617.
# The cell output is computed minus expected, so it should be close to zero.
test_calculate_rpk(1237, 100.883, 12.2617)

2.8933517045359736e-05

In [6]:
def test_calculate_total_reads(table):
    missing_genes_dict = {'C12orf74':'PLEKHG7',
                     'LINC00856':'LINC00595'}

    precalculated_gene_length_dict = {'CCL3L1': 3.090}

    table_numpy = table.to_numpy(copy=True)
    
    
    ## -------- FOR CALCULATING RPK ----------
    rpk_table = table_numpy
    gene_length = 0

    for index, value in np.ndenumerate(table_numpy):
        if index[1] == 0:
            gene = value
            if gene in precalculated_gene_length_dict:
                gene_length = precalculated_gene_length_dict.get(value)
            else:
                if gene in missing_genes_dict:
                    gene = missing_genes_dict.get(value)
                gene_length = find_gene_length_ensembl(gene)
            
                
        else:
            rpk_table[index[0], index[1]] = calculate_rpk(value, gene_length)
    
    ## ---- FOR CALCULATING PER MILLION SCALING FACTOR -----
    per_mil_table = rpk_table
    
    total_reads = np.sum(rpk_table[:, 1:], axis=0)
    
    rpk_pandas = pd.DataFrame(rpk_table)
    rpk_pandas.loc['total by col', :] = rpk_table.sum(axis=0)
    
    # comparing first table against second table, where second table is our function output
    display (rpk_pandas.iloc[[-1]])
    
    return pd.DataFrame(total_reads).transpose()

# Total-reads check on coad_all: the displayed 'total by col' row should match the returned totals.
test_calculate_total_reads(coad_all)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,512,513,514,515,516,517,518,519,520,521
total by col,A1BGA1BG-AS1A1CFA2MA2M-AS1A2ML1A2ML1-AS1A2ML1-...,32191200.0,13454300.0,3634560.0,46051800.0,11008300.0,23936600.0,16546500.0,9899670.0,10086700.0,...,20973600.0,21669100.0,8804130.0,22032800.0,8257830.0,13364400.0,21699900.0,29101400.0,41387800.0,11366800.0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,511,512,513,514,515,516,517,518,519,520
0,32191200.0,13454300.0,3634560.0,46051800.0,11008300.0,23936600.0,16546500.0,9899670.0,10086700.0,30071800.0,...,20973600.0,21669100.0,8804130.0,22032800.0,8257830.0,13364400.0,21699900.0,29101400.0,41387800.0,11366800.0


In [7]:
def test_per_million_table(table):
    missing_genes_dict = {'C12orf74':'PLEKHG7',
                     'LINC00856':'LINC00595'}

    precalculated_gene_length_dict = {'CCL3L1': 3.090}
    table_numpy = table.to_numpy(copy=True)
    
    table_columns = table.columns.values
    
    ## -------- FOR CALCULATING RPK ----------
    rpk_table = table_numpy
    gene_length = 0
    
    for index, value in np.ndenumerate(table_numpy):
        if index[1] == 0:
            gene = value
            if gene in precalculated_gene_length_dict:
                gene_length = precalculated_gene_length_dict.get(value)
            else:
                if gene in missing_genes_dict:
                    gene = missing_genes_dict.get(value)
                gene_length = find_gene_length_ensembl(gene)
            
                
        else:
            rpk_table[index[0], index[1]] = calculate_rpk(value, gene_length)
    
    ## ---- FOR CALCULATING PER MILLION SCALING FACTOR -----
    per_mil_table = rpk_table
    
    total_reads = np.sum(rpk_table[:, 1:], axis=0)
    
    for index, value in np.ndenumerate(rpk_table):
        if (index[1] == 0):
            continue
        total_for_column = total_reads[index[1] - 1]
        
        scaling_factor = calculate_per_million(total_for_column)
        
        per_mil_table[index[0], index[1]] = str(value) + '/' + str(scaling_factor)
        
    return pd.DataFrame(per_mil_table, columns=table_columns)

# Per-million check on coad_all: each sample cell is shown as "<rpk>/<scaling factor>".
test_per_million_table(coad_all)

Unnamed: 0,hgnc_symbol,X00106523.5b1d.44ad.a9f1.7d84db08722c.htseq.counts.gz,X00589871.e54f.492f.988f.502670edd606.htseq.counts.gz,X00cc9b4d.a847.464e.979a.7751e1a87ae3.htseq.counts.gz,X00f768f9.9e6c.4e84.bdba.c19368f7e522.htseq.counts.gz,X0134d0cc.e66b.4fda.804b.c4434ec00bd2.htseq.counts.gz,X020aa019.a3a4.4055.92ee.be824a597501.htseq.counts.gz,X02a81bc3.4672.4d2f.808d.d27f4b63bf85.htseq.counts.gz,X02ddfa3d.13b3.4624.994d.62e740fa4a3d.htseq.counts.gz,X03109baa.936f.49ae.acb3.2d00ac03c7ab.htseq.counts.gz,...,fabefb10.5546.4017.8ea1.29982a10fb3c.htseq.counts.gz,fc477ec6.19af.4eb7.91cf.a8a16392f034.htseq.counts.gz,fc5dea3c.dfa2.4d52.ac5a.49335d6d5eaf.htseq.counts.gz,fc822b3c.b73f.4529.960b.e3bdfcff80ed.htseq.counts.gz,fcb5f382.f06f.4885.8ea1.c0699b408838.htseq.counts.gz,fd40f0e5.0825.409b.8a4c.86da09f0a84c.htseq.counts.gz,fda5b844.1fa4.462e.bd3b.482025f6ee30.htseq.counts.gz,fdd24a0e.f3ef.4a5a.87cf.f99884b90fb9.htseq.counts.gz,fe898d64.faa4.452c.b8ef.cfec7250a286.htseq.counts.gz,ff8de5e6.76c5.491a.8fac.41492723b780.htseq.counts.gz
0,A1BG,0.0/32.19124003556955,0.0/13.454347186168276,0.0/3.6345606252618006,0.0/46.05183341128436,0.0/11.008285516897024,0.0/23.936563100436654,0.0/16.546506360946584,0.0/9.899670043963393,0.0/10.086729281811511,...,0.0/20.973595215242202,0.0/21.669142813995663,0.0/8.804126081065075,0.0/22.032798283074595,0.0/8.257825603582537,0.0/13.36435828164764,0.0/21.699925905068614,0.0/29.10137223709744,0.0/41.38783695785632,0.0/11.36680044459077
1,A1BG-AS1,0.0/32.19124003556955,0.0/13.454347186168276,0.0/3.6345606252618006,0.0/46.05183341128436,0.0/11.008285516897024,0.0/23.936563100436654,0.0/16.546506360946584,0.0/9.899670043963393,0.0/10.086729281811511,...,0.0/20.973595215242202,0.0/21.669142813995663,0.0/8.804126081065075,0.0/22.032798283074595,0.0/8.257825603582537,0.0/13.36435828164764,0.0/21.699925905068614,0.0/29.10137223709744,0.0/41.38783695785632,0.0/11.36680044459077
2,A1CF,49.34736744487979/32.19124003556955,18.593652192057124/13.454347186168276,1.5301509285234043/3.6345606252618006,62.88688475181415/46.05183341128436,13.388820624579788/11.008285516897024,46.04363248556789/23.936563100436654,33.68650453249252/16.546506360946584,47.56219136160248/9.899670043963393,20.436788537778497/10.086729281811511,...,47.168061576982815/20.973595215242202,29.246748429276884/21.669142813995663,9.864836667980432/8.804126081065075,28.725106067280272/22.032798283074595,13.690213989288942/8.257825603582537,12.531008740407575/13.36435828164764,62.5043470196833/21.699925905068614,41.03586581040039/29.10137223709744,31.727447661883012/41.38783695785632,11.05881807432824/11.36680044459077
3,A2M,3.768145784000824/32.19124003556955,0.80304746216411/13.454347186168276,0.2882734479563472/3.6345606252618006,4.550602285596623/46.05183341128436,0.41181921136621025/11.008285516897024,2.759188716153609/23.936563100436654,0.9060022650056626/16.546506360946584,1.8531864511479461/9.899670043963393,0.20590960568310512/10.086729281811511,...,1.8943683722845672/20.973595215242202,2.1002779779676723/21.669142813995663,0.12354576340986308/8.804126081065075,5.9507876042417385/22.032798283074595,1.8120045300113252/8.257825603582537,0.4530011325028313/13.36435828164764,1.7914135694430147/21.699925905068614,1.3795943580768044/29.10137223709744,0.6589107381859364/41.38783695785632,0.20590960568310512/11.36680044459077
4,A2M-AS1,0.0/32.19124003556955,0.0/13.454347186168276,0.0/3.6345606252618006,0.0/46.05183341128436,0.0/11.008285516897024,0.0/23.936563100436654,0.0/16.546506360946584,0.2836074872376631/9.899670043963393,0.0/10.086729281811511,...,0.0/20.973595215242202,0.0/21.669142813995663,0.0/8.804126081065075,0.0/22.032798283074595,0.0/8.257825603582537,0.0/13.36435828164764,0.2836074872376631/21.699925905068614,0.0/29.10137223709744,0.0/41.38783695785632,0.0/11.36680044459077
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37333,ZYG11AP1,24.169184290030213/32.19124003556955,1.0070493454179255/13.454347186168276,0.0/3.6345606252618006,12.084592145015106/46.05183341128436,2.014098690835851/11.008285516897024,22.15508559919436/23.936563100436654,0.0/16.546506360946584,9.06344410876133/9.899670043963393,2.014098690835851/10.086729281811511,...,17.119838872104733/20.973595215242202,23.162134944612287/21.669142813995663,0.0/8.804126081065075,21.148036253776436/22.032798283074595,3.0211480362537766/8.257825603582537,0.0/13.36435828164764,19.133937562940584/21.699925905068614,44.31017119838872/29.10137223709744,22.15508559919436/41.38783695785632,3.0211480362537766/11.36680044459077
37334,ZYG11B,16.31593033514071/32.19124003556955,12.261728933517045/13.454347186168276,10.873982732472271/3.6345606252618006,38.83706868352448/46.05183341128436,11.05240724403517/11.008285516897024,27.269212850529822/23.936563100436654,35.84350187841361/16.546506360946584,11.548030887265446/9.899670043963393,6.8495187494424234/10.086729281811511,...,26.178840835423212/20.973595215242202,35.37761565377715/21.669142813995663,11.072232189764382/8.804126081065075,32.750810344656685/22.032798283074595,20.83601796140083/8.257825603582537,21.90656503077823/13.36435828164764,33.59337053814815/21.699925905068614,29.063370439023423/29.10137223709744,39.49129189258844/41.38783695785632,14.987658971283567/11.36680044459077
37335,ZYX,63.569682151589234/32.19124003556955,13.039934800325998/13.454347186168276,9.474327628361857/3.6345606252618006,265.79054604726974/46.05183341128436,9.270578647106763/11.008285516897024,69.9877750611247/23.936563100436654,54.40097799511002/16.546506360946584,15.892420537897308/9.899670043963393,18.846780766096167/10.086729281811511,...,33.10920945395273/20.973595215242202,56.54034229828851/21.669142813995663,7.538712306438467/8.804126081065075,220.96577017114913/22.032798283074595,5.908720456397718/8.257825603582537,24.65362673186634/13.36435828164764,54.70660146699266/21.699925905068614,31.581092094539525/29.10137223709744,261.3080684596577/41.38783695785632,19.356153219233903/11.36680044459077
37336,ZYXP1,0.0/32.19124003556955,0.0/13.454347186168276,0.0/3.6345606252618006,8.547008547008547/46.05183341128436,0.0/11.008285516897024,0.0/23.936563100436654,0.0/16.546506360946584,0.0/9.899670043963393,0.0/10.086729281811511,...,0.0/20.973595215242202,0.0/21.669142813995663,0.0/8.804126081065075,0.0/22.032798283074595,0.0/8.257825603582537,0.0/13.36435828164764,0.0/21.699925905068614,0.0/29.10137223709744,0.0/41.38783695785632,0.0/11.36680044459077


### With other cell lines:

In [8]:
# Repeat the gene-length check on lihc_all.
test_calculate_gene_length(lihc_all)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,416,417,418,419,420,421,422,423,424,425
0,A1BG,8.314,8.314,8.314,8.314,8.314,8.314,8.314,8.314,8.314,...,8.314,8.314,8.314,8.314,8.314,8.314,8.314,8.314,8.314,8.314
1,A1BG-AS1,7.737,7.737,7.737,7.737,7.737,7.737,7.737,7.737,7.737,...,7.737,7.737,7.737,7.737,7.737,7.737,7.737,7.737,7.737,7.737
2,A1CF,86.266,86.266,86.266,86.266,86.266,86.266,86.266,86.266,86.266,...,86.266,86.266,86.266,86.266,86.266,86.266,86.266,86.266,86.266,86.266
3,A2M,48.565,48.565,48.565,48.565,48.565,48.565,48.565,48.565,48.565,...,48.565,48.565,48.565,48.565,48.565,48.565,48.565,48.565,48.565,48.565
4,A2M-AS1,3.526,3.526,3.526,3.526,3.526,3.526,3.526,3.526,3.526,...,3.526,3.526,3.526,3.526,3.526,3.526,3.526,3.526,3.526,3.526
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37333,ZYG11AP1,0.993,0.993,0.993,0.993,0.993,0.993,0.993,0.993,0.993,...,0.993,0.993,0.993,0.993,0.993,0.993,0.993,0.993,0.993,0.993
37334,ZYG11B,100.883,100.883,100.883,100.883,100.883,100.883,100.883,100.883,100.883,...,100.883,100.883,100.883,100.883,100.883,100.883,100.883,100.883,100.883,100.883
37335,ZYX,9.816,9.816,9.816,9.816,9.816,9.816,9.816,9.816,9.816,...,9.816,9.816,9.816,9.816,9.816,9.816,9.816,9.816,9.816,9.816
37336,ZYXP1,0.117,0.117,0.117,0.117,0.117,0.117,0.117,0.117,0.117,...,0.117,0.117,0.117,0.117,0.117,0.117,0.117,0.117,0.117,0.117


In [9]:
# Repeat the RPK check on lihc_all.
# NOTE(review): the displayed frame ends with an `ID` column, and the function
# converts it like a sample column -- confirm whether it should be dropped first.
test_calculate_rpk_table(lihc_all)

Unnamed: 0,hgnc_symbol,X004c60cf.c08e.49df.b4ce.baca41e11250.htseq.counts.gz,X0069f64b.8d8f.4426.968d.23483929ee58.htseq.counts.gz,X014b9b85.3128.416b.93d4.7ace3b676d4e.htseq.counts.gz,X03011a57.3e95.49d1.a927.cff4111d2d5b.htseq.counts.gz,X0415a9b4.a58d.4641.ab1a.927ed7a04824.htseq.counts.gz,X047bd029.d63b.4f25.8a73.b95ad72d434f.htseq.counts.gz,X04dc4da1.1d1a.46da.a9d8.da9964591aec.htseq.counts.gz,X04e7f1a4.3173.4a6f.af60.04e1f2e29868.htseq.counts.gz,X05ac7b05.e459.4833.97fc.530185a7a55f.htseq.counts.gz,...,fbbb6d26.8bd4.40da.870b.d8db0f653cfa.htseq.counts.gz,fca37687.75b4.4ca6.9963.36b468ca01a7.htseq.counts.gz,fdb62f73.33a7.44c3.950c.739383b9dd30.htseq.counts.gz,fe506b98.0733.43c9.943a.be9b12f1c2fb.htseq.counts.gz,fe625352.dd2e.478d.8d21.06659f854945.htseq.counts.gz,fe76a5ca.f70a.4ab7.b080.5a19ae36dc2b.htseq.counts.gz,feae9113.b2f3.4dd4.9faf.6076eb32c925.htseq.counts.gz,ff8776f1.5499.459c.989f.0bc5268e6631.htseq.counts.gz,ffeed225.c2a3.4b4c.954c.4816903782a9.htseq.counts.gz,ID
0,A1BG,0,0,1,0,0,0,0,0,0,...,0,3,1,0,0,0,0,1,0,19205
1,A1BG-AS1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,19206
2,A1CF,1670,2730,1638,1205,2029,2815,1338,2422,1587,...,358,1939,5003,1390,1146,900,2995,1872,3534,19207
3,A2M,5,15,4,0,14,6,0,6,5,...,0,27,14,14,9,4,13,5,6,19208
4,A2M-AS1,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,19209
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37333,ZYG11AP1,4,17,17,32,12,15,29,56,0,...,3,11,16,21,0,0,43,48,13,60478
37334,ZYG11B,523,1068,1024,318,1203,844,845,1253,833,...,558,916,1529,588,1700,706,1160,1320,2179,60479
37335,ZYX,101,81,286,30,162,458,34,106,192,...,88,146,274,362,233,181,99,134,475,60480
37336,ZYXP1,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,60481


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,416,417,418,419,420,421,422,423,424,425
0,A1BG,0,0,0.120279,0,0,0,0,0,0,...,0,0.360837,0.120279,0,0,0,0,0.120279,0,2309.96
1,A1BG-AS1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0.129249,0,2482.36
2,A1CF,19.3587,31.6463,18.9878,13.9684,23.5203,32.6316,15.5102,28.076,18.3966,...,4.14995,22.477,57.995,16.113,13.2845,10.4328,34.7182,21.7003,40.9663,222.649
3,A2M,0.102955,0.308864,0.0823638,0,0.288273,0.123546,0,0.123546,0.102955,...,0,0.555956,0.288273,0.288273,0.185319,0.0823638,0.267682,0.102955,0.123546,395.511
4,A2M-AS1,0,0,0.283607,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5447.82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37333,ZYG11AP1,4.0282,17.1198,17.1198,32.2256,12.0846,15.1057,29.2044,56.3948,0,...,3.02115,11.0775,16.1128,21.148,0,0,43.3031,48.3384,13.0916,60904.3
37334,ZYG11B,5.18422,10.5865,10.1504,3.15217,11.9247,8.36613,8.37604,12.4203,8.25709,...,5.53116,9.07983,15.1562,5.82853,16.8512,6.99821,11.4985,13.0845,21.5993,599.496
37335,ZYX,10.2893,8.25183,29.1361,3.05623,16.5037,46.6585,3.46373,10.7987,19.5599,...,8.96496,14.8737,27.9136,36.8786,23.7368,18.4393,10.0856,13.6512,48.3904,6161.37
37336,ZYXP1,0,0,0,0,0,0,0,0,0,...,0,0,0,8.54701,0,0,0,0,0,516932


In [10]:
# Repeat the total-reads check on lihc_all.
# NOTE(review): the last total in the output is far larger than the others; it
# looks like a trailing `ID` column is being summed as a sample -- confirm.
test_calculate_total_reads(lihc_all)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,416,417,418,419,420,421,422,423,424,425
total by col,A1BGA1BG-AS1A1CFA2MA2M-AS1A2ML1A2ML1-AS1A2ML1-...,73488000.0,20514200.0,38085100.0,33618300.0,21308900.0,24802900.0,20969800.0,31650700.0,24576300.0,...,18086300.0,28959200.0,47616800.0,39553100.0,15439400.0,18321100.0,23979500.0,20080500.0,23136000.0,3108500000.0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,415,416,417,418,419,420,421,422,423,424
0,73488000.0,20514200.0,38085100.0,33618300.0,21308900.0,24802900.0,20969800.0,31650700.0,24576300.0,46750500.0,...,18086300.0,28959200.0,47616800.0,39553100.0,15439400.0,18321100.0,23979500.0,20080500.0,23136000.0,3108500000.0


In [11]:
# Repeat the per-million check on lihc_all.
# NOTE(review): the output includes an `ID` column rendered as "<rpk>/<scaling factor>"
# like a sample -- confirm whether it should be excluded.
test_per_million_table(lihc_all)

Unnamed: 0,hgnc_symbol,X004c60cf.c08e.49df.b4ce.baca41e11250.htseq.counts.gz,X0069f64b.8d8f.4426.968d.23483929ee58.htseq.counts.gz,X014b9b85.3128.416b.93d4.7ace3b676d4e.htseq.counts.gz,X03011a57.3e95.49d1.a927.cff4111d2d5b.htseq.counts.gz,X0415a9b4.a58d.4641.ab1a.927ed7a04824.htseq.counts.gz,X047bd029.d63b.4f25.8a73.b95ad72d434f.htseq.counts.gz,X04dc4da1.1d1a.46da.a9d8.da9964591aec.htseq.counts.gz,X04e7f1a4.3173.4a6f.af60.04e1f2e29868.htseq.counts.gz,X05ac7b05.e459.4833.97fc.530185a7a55f.htseq.counts.gz,...,fbbb6d26.8bd4.40da.870b.d8db0f653cfa.htseq.counts.gz,fca37687.75b4.4ca6.9963.36b468ca01a7.htseq.counts.gz,fdb62f73.33a7.44c3.950c.739383b9dd30.htseq.counts.gz,fe506b98.0733.43c9.943a.be9b12f1c2fb.htseq.counts.gz,fe625352.dd2e.478d.8d21.06659f854945.htseq.counts.gz,fe76a5ca.f70a.4ab7.b080.5a19ae36dc2b.htseq.counts.gz,feae9113.b2f3.4dd4.9faf.6076eb32c925.htseq.counts.gz,ff8776f1.5499.459c.989f.0bc5268e6631.htseq.counts.gz,ffeed225.c2a3.4b4c.954c.4816903782a9.htseq.counts.gz,ID
0,A1BG,0.0/73.48803662891481,0.0/20.51419563911485,0.12027904738994467/38.085110772238416,0.0/33.618263441608335,0.0/21.3089393031896,0.0/24.802869384832825,0.0/20.969767839983877,0.0/31.650738239464083,0.0/24.57634375277428,...,0.0/18.086311191987,0.36083714216983404/28.959194250620474,0.12027904738994467/47.61681694784602,0.0/39.55310616082073,0.0/15.439407361000027,0.0/18.32110683018386,0.0/23.979497457936535,0.12027904738994467/20.08054798228364,0.0/23.13604665947981,2309.959105123887/3108.500726345128
1,A1BG-AS1,0.0/73.48803662891481,0.0/20.51419563911485,0.0/38.085110772238416,0.0/33.618263441608335,0.0/21.3089393031896,0.0/24.802869384832825,0.0/20.969767839983877,0.0/31.650738239464083,0.0/24.57634375277428,...,0.0/18.086311191987,0.0/28.959194250620474,0.0/47.61681694784602,0.0/39.55310616082073,0.0/15.439407361000027,0.0/18.32110683018386,0.0/23.979497457936535,0.12924906294429364/20.08054798228364,0.0/23.13604665947981,2482.357502908104/3108.500726345128
2,A1CF,19.358727656318827/73.48803662891481,31.646303294461315/20.51419563911485,18.98778197667679/38.085110772238416,13.968423249020471/33.618263441608335,23.520274499802934/21.3089393031896,32.631627756010474/24.802869384832825,15.510166230032688/20.969767839983877,28.075951127906706/31.650738239464083,18.396587299747292/24.57634375277428,...,4.149954790995293/18.086311191987,22.476989775809702/28.959194250620474,57.99503860153479/47.61681694784602,16.112952959451/39.55310616082073,13.284492152180464/15.439407361000027,10.432847239932302/18.32110683018386,34.71819720399694/23.979497457936535,21.700322259059188/20.08054798228364,40.9663134954675/23.13604665947981,222.64855215264413/3108.500726345128
3,A2M,0.10295480284155256/73.48803662891481,0.3088644085246577/20.51419563911485,0.08236384227324205/38.085110772238416,0.0/33.618263441608335,0.2882734479563472/21.3089393031896,0.12354576340986308/24.802869384832825,0.0/20.969767839983877,0.12354576340986308/31.650738239464083,0.10295480284155256/24.57634375277428,...,0.0/18.086311191987,0.5559559353443838/28.959194250620474,0.2882734479563472/47.61681694784602,0.2882734479563472/39.55310616082073,0.18531864511479462/15.439407361000027,0.08236384227324205/18.32110683018386,0.2676824873880367/23.979497457936535,0.10295480284155256/20.08054798228364,0.12354576340986308/23.13604665947981,395.51117059610834/3108.500726345128
4,A2M-AS1,0.0/73.48803662891481,0.0/20.51419563911485,0.2836074872376631/38.085110772238416,0.0/33.618263441608335,0.0/21.3089393031896,0.0/24.802869384832825,0.0/20.969767839983877,0.0/31.650738239464083,0.0/24.57634375277428,...,0.0/18.086311191987,0.0/28.959194250620474,0.0/47.61681694784602,0.0/39.55310616082073,0.0/15.439407361000027,0.0/18.32110683018386,0.0/23.979497457936535,0.0/20.08054798228364,0.0/23.13604665947981,5447.8162223482705/3108.500726345128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37333,ZYG11AP1,4.028197381671702/73.48803662891481,17.119838872104733/20.51419563911485,17.119838872104733/38.085110772238416,32.225579053373615/33.618263441608335,12.084592145015106/21.3089393031896,15.105740181268882/24.802869384832825,29.204431017119838/20.969767839983877,56.394763343403824/31.650738239464083,0.0/24.57634375277428,...,3.0211480362537766/18.086311191987,11.07754279959718/28.959194250620474,16.112789526686807/47.61681694784602,21.148036253776436/39.55310616082073,0.0/15.439407361000027,0.0/18.32110683018386,43.3031218529708/23.979497457936535,48.338368580060425/20.08054798228364,13.091641490433032/23.13604665947981,60904.330312185295/3108.500726345128
37334,ZYG11B,5.184223308188694/73.48803662891481,10.58652101939871/20.51419563911485,10.150372213356066/38.085110772238416,3.1521663709445598/33.618263441608335,11.924704856120456/21.3089393031896,8.36612709772707/24.802869384832825,8.376039570591676/20.969767839983877,12.420328499350733/31.650738239464083,8.25708989621641/24.57634375277428,...,5.531159858449888/18.086311191987,9.079825143978669/28.959194250620474,15.156171009981861/47.61681694784602,5.828534044388054/39.55310616082073,16.85120386982941/15.439407361000027,6.998205842411506/18.32110683018386,11.498468522942419/23.979497457936535,13.084464181279305/20.08054798228364,21.59927837197546/23.13604665947981,599.496446378478/3108.500726345128
37335,ZYX,10.289323553382232/73.48803662891481,8.251833740831295/20.51419563911485,29.1361043194784/38.085110772238416,3.056234718826406/33.618263441608335,16.50366748166259/21.3089393031896,46.65851670741646/24.802869384832825,3.463732681336593/20.969767839983877,10.798696006519966/31.650738239464083,19.559902200488995/24.57634375277428,...,8.964955175224123/18.086311191987,14.873675631621841/28.959194250620474,27.913610431947838/47.61681694784602,36.878565607171964/39.55310616082073,23.73675631621842/15.439407361000027,18.439282803585982/18.32110683018386,10.085574572127138/23.979497457936535,13.651181744091279/20.08054798228364,48.39038304808476/23.13604665947981,6161.369193154033/3108.500726345128
37336,ZYXP1,0.0/73.48803662891481,0.0/20.51419563911485,0.0/38.085110772238416,0.0/33.618263441608335,0.0/21.3089393031896,0.0/24.802869384832825,0.0/20.969767839983877,0.0/31.650738239464083,0.0/24.57634375277428,...,0.0/18.086311191987,0.0/28.959194250620474,0.0/47.61681694784602,8.547008547008547/39.55310616082073,0.0/15.439407361000027,0.0/18.32110683018386,0.0/23.979497457936535,0.0/20.08054798228364,0.0/23.13604665947981,516931.6239316239/3108.500726345128


In [12]:
test_calculate_gene_length(prad_all)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,542,543,544,545,546,547,548,549,550,551
0,A1BG,8.314,8.314,8.314,8.314,8.314,8.314,8.314,8.314,8.314,...,8.314,8.314,8.314,8.314,8.314,8.314,8.314,8.314,8.314,8.314
1,A1BG-AS1,7.737,7.737,7.737,7.737,7.737,7.737,7.737,7.737,7.737,...,7.737,7.737,7.737,7.737,7.737,7.737,7.737,7.737,7.737,7.737
2,A1CF,86.266,86.266,86.266,86.266,86.266,86.266,86.266,86.266,86.266,...,86.266,86.266,86.266,86.266,86.266,86.266,86.266,86.266,86.266,86.266
3,A2M,48.565,48.565,48.565,48.565,48.565,48.565,48.565,48.565,48.565,...,48.565,48.565,48.565,48.565,48.565,48.565,48.565,48.565,48.565,48.565
4,A2M-AS1,3.526,3.526,3.526,3.526,3.526,3.526,3.526,3.526,3.526,...,3.526,3.526,3.526,3.526,3.526,3.526,3.526,3.526,3.526,3.526
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37333,ZYG11AP1,0.993,0.993,0.993,0.993,0.993,0.993,0.993,0.993,0.993,...,0.993,0.993,0.993,0.993,0.993,0.993,0.993,0.993,0.993,0.993
37334,ZYG11B,100.883,100.883,100.883,100.883,100.883,100.883,100.883,100.883,100.883,...,100.883,100.883,100.883,100.883,100.883,100.883,100.883,100.883,100.883,100.883
37335,ZYX,9.816,9.816,9.816,9.816,9.816,9.816,9.816,9.816,9.816,...,9.816,9.816,9.816,9.816,9.816,9.816,9.816,9.816,9.816,9.816
37336,ZYXP1,0.117,0.117,0.117,0.117,0.117,0.117,0.117,0.117,0.117,...,0.117,0.117,0.117,0.117,0.117,0.117,0.117,0.117,0.117,0.117


In [13]:
test_calculate_rpk_table(prad_all)

Unnamed: 0,hgnc_symbol,X00eb7c26.84fd.4ab9.93a1.3d209dfc0f43.htseq.counts.gz,X010d5c94.b440.4927.bc8c.ddc25a56650e.htseq.counts.gz,X0160fa3a.3a99.48b5.8bbc.6f0add02ae78.htseq.counts.gz,X02599613.1566.40af.8606.a438c2b17061.htseq.counts.gz,X02d936a2.14a6.43f1.90f5.eb5b14823f2d.htseq.counts.gz,X03870270.25eb.4552.8845.684df866074a.htseq.counts.gz,X03ba09dd.2589.42ab.afe1.cf0493ba15a1.htseq.counts.gz,X045ec00a.3938.4280.959a.f853e53261dd.htseq.counts.gz,X0490a280.03ac.4135.8f37.fe71f64a8054.htseq.counts.gz,...,fcf3715b.ecfc.4554.bb0b.96d3171d85be.htseq.counts.gz,fd876ee6.c7a1.445c.a4d8.2325a4ed4b84.htseq.counts.gz,fd973a6f.54c1.4da1.afb8.7c74cf7aafcd.htseq.counts.gz,fe2ee510.b575.4983.b16e.c8c8dc699916.htseq.counts.gz,fe350add.e157.4a2c.8305.6588c7b592e0.htseq.counts.gz,fe7abfa5.005e.4436.9855.03bd8c372508.htseq.counts.gz,fe9e58ac.246f.47b3.b2a6.6bd86b06329b.htseq.counts.gz,ff052832.df71.405d.9cc9.8e1f55f65610.htseq.counts.gz,ff73b9bb.de12.434d.9550.3148c642442b.htseq.counts.gz,ID
0,A1BG,0,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,19205
1,A1BG-AS1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,19206
2,A1CF,2914,2072,3997,3340,1964,2869,3043,3327,1282,...,1361,1864,6191,2917,4690,1639,2241,2595,2406,19207
3,A2M,89,84,90,145,167,30,91,80,44,...,36,19,64,71,54,58,95,155,54,19208
4,A2M-AS1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,19209
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37333,ZYG11AP1,27,20,21,54,34,5,9,37,2,...,1,3,13,14,22,2,3,44,30,60478
37334,ZYG11B,1468,2900,2552,2233,2212,3006,1418,2755,1947,...,1472,4548,1553,1176,4960,1088,2405,2555,772,60479
37335,ZYX,163,138,290,220,180,310,270,105,435,...,1502,332,901,85,254,462,264,127,284,60480
37336,ZYXP1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,60481


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,542,543,544,545,546,547,548,549,550,551
0,A1BG,0,0,0,0,0.240558,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2309.96
1,A1BG-AS1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,2482.36
2,A1CF,33.7792,24.0187,46.3334,38.7175,22.7668,33.2576,35.2746,38.5668,14.861,...,15.7768,21.6076,71.7664,33.814,54.3667,18.9994,25.9778,30.0814,27.8905,222.649
3,A2M,1.8326,1.72964,1.85319,2.98569,3.43869,0.617729,1.87378,1.64728,0.906002,...,0.741275,0.391228,1.31782,1.46196,1.11191,1.19428,1.95614,3.1916,1.11191,395.511
4,A2M-AS1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5447.82
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37333,ZYG11AP1,27.1903,20.141,21.148,54.3807,34.2397,5.03525,9.06344,37.2608,2.0141,...,1.00705,3.02115,13.0916,14.0987,22.1551,2.0141,3.02115,44.3102,30.2115,60904.3
37334,ZYG11B,14.5515,28.7462,25.2966,22.1346,21.9264,29.7969,14.0559,27.3089,19.2996,...,14.5912,45.0819,15.3941,11.6571,49.1659,10.7848,23.8395,25.3264,7.65243,599.496
37335,ZYX,16.6055,14.0587,29.5436,22.4124,18.3374,31.5811,27.5061,10.6968,44.3154,...,153.015,33.8223,91.7889,8.65933,25.8761,47.066,26.8949,12.9381,28.9324,6161.37
37336,ZYXP1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,516932


In [14]:
test_calculate_total_reads(prad_all)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,542,543,544,545,546,547,548,549,550,551
total by col,A1BGA1BG-AS1A1CFA2MA2M-AS1A2ML1A2ML1-AS1A2ML1-...,28702000.0,22043700.0,33525500.0,29282600.0,20285700.0,39598800.0,23818100.0,21699300.0,21153300.0,...,14525500.0,25607400.0,21316900.0,18969500.0,40769900.0,19561200.0,72854200.0,48225700.0,27378900.0,3108500000.0


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,541,542,543,544,545,546,547,548,549,550
0,28702000.0,22043700.0,33525500.0,29282600.0,20285700.0,39598800.0,23818100.0,21699300.0,21153300.0,26857400.0,...,14525500.0,25607400.0,21316900.0,18969500.0,40769900.0,19561200.0,72854200.0,48225700.0,27378900.0,3108500000.0


In [15]:
test_per_million_table(prad_all)

Unnamed: 0,hgnc_symbol,X00eb7c26.84fd.4ab9.93a1.3d209dfc0f43.htseq.counts.gz,X010d5c94.b440.4927.bc8c.ddc25a56650e.htseq.counts.gz,X0160fa3a.3a99.48b5.8bbc.6f0add02ae78.htseq.counts.gz,X02599613.1566.40af.8606.a438c2b17061.htseq.counts.gz,X02d936a2.14a6.43f1.90f5.eb5b14823f2d.htseq.counts.gz,X03870270.25eb.4552.8845.684df866074a.htseq.counts.gz,X03ba09dd.2589.42ab.afe1.cf0493ba15a1.htseq.counts.gz,X045ec00a.3938.4280.959a.f853e53261dd.htseq.counts.gz,X0490a280.03ac.4135.8f37.fe71f64a8054.htseq.counts.gz,...,fcf3715b.ecfc.4554.bb0b.96d3171d85be.htseq.counts.gz,fd876ee6.c7a1.445c.a4d8.2325a4ed4b84.htseq.counts.gz,fd973a6f.54c1.4da1.afb8.7c74cf7aafcd.htseq.counts.gz,fe2ee510.b575.4983.b16e.c8c8dc699916.htseq.counts.gz,fe350add.e157.4a2c.8305.6588c7b592e0.htseq.counts.gz,fe7abfa5.005e.4436.9855.03bd8c372508.htseq.counts.gz,fe9e58ac.246f.47b3.b2a6.6bd86b06329b.htseq.counts.gz,ff052832.df71.405d.9cc9.8e1f55f65610.htseq.counts.gz,ff73b9bb.de12.434d.9550.3148c642442b.htseq.counts.gz,ID
0,A1BG,0.0/28.70199030433525,0.0/22.043740888065592,0.0/33.525543978778785,0.0/29.282612466371102,0.24055809477988935/20.28567831672448,0.0/39.598844807379855,0.0/23.81805110650167,0.0/21.69926734339955,0.0/21.153279281773717,...,0.0/14.525520705318566,0.0/25.607397921167216,0.0/21.316850502962232,0.0/18.969467213780806,0.0/40.76986021317264,0.0/19.561227977640268,0.0/72.85422332950527,0.0/48.22570475314917,0.0/27.378917434137776,2309.959105123887/3108.500726345128
1,A1BG-AS1,0.0/28.70199030433525,0.0/22.043740888065592,0.0/33.525543978778785,0.0/29.282612466371102,0.0/20.28567831672448,0.0/39.598844807379855,0.0/23.81805110650167,0.0/21.69926734339955,0.0/21.153279281773717,...,0.0/14.525520705318566,0.0/25.607397921167216,0.0/21.316850502962232,0.0/18.969467213780806,0.0/40.76986021317264,0.0/19.561227977640268,0.0/72.85422332950527,0.0/48.22570475314917,0.0/27.378917434137776,2482.357502908104/3108.500726345128
2,A1CF,33.77924095240303/28.70199030433525,24.018732756821922/22.043740888065592,46.33343379778823/33.525543978778785,38.717455312637654/29.282612466371102,22.766791088030047/20.28567831672448,33.25759859040642/39.598844807379855,35.274615723459995/23.81805110650167,38.566758630283076/21.69926734339955,14.861011290659123/21.153279281773717,...,15.776783437275403/14.525520705318566,21.60758583914868/25.607397921167216,71.76639695824542/21.316850502962232,33.81401710986947/18.969467213780806,54.366726172536104/40.76986021317264,18.999374029165605/19.561227977640268,25.97778962743143/72.85422332950527,30.08137620847147/48.22570475314917,27.890478288085685/27.378917434137776,222.64855215264413/3108.500726345128
3,A2M,1.8325954905796356/28.70199030433525,1.729640687738083/22.043740888065592,1.8531864511479461/33.525543978778785,2.9856892824050245/29.282612466371102,3.4386904149078554/20.28567831672448,0.6177288170493154/39.598844807379855,1.8737774117162567/23.81805110650167,1.647276845464841/21.69926734339955,0.9060022650056626/21.153279281773717,...,0.7412745804591785/14.525520705318566,0.39122825079789975/25.607397921167216,1.3178214763718727/21.316850502962232,1.4619582003500464/18.969467213780806,1.1119118706887676/40.76986021317264,1.1942757129620096/19.561227977640268,1.9561412539894987/72.85422332950527,3.1915988880881296/48.22570475314917,1.1119118706887676/27.378917434137776,395.51117059610834/3108.500726345128
4,A2M-AS1,0.0/28.70199030433525,0.0/22.043740888065592,0.0/33.525543978778785,0.0/29.282612466371102,0.0/20.28567831672448,0.0/39.598844807379855,0.0/23.81805110650167,0.0/21.69926734339955,0.0/21.153279281773717,...,0.0/14.525520705318566,0.0/25.607397921167216,0.0/21.316850502962232,0.0/18.969467213780806,0.0/40.76986021317264,0.0/19.561227977640268,0.0/72.85422332950527,0.0/48.22570475314917,0.0/27.378917434137776,5447.8162223482705/3108.500726345128
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37333,ZYG11AP1,27.19033232628399/28.70199030433525,20.14098690835851/22.043740888065592,21.148036253776436/33.525543978778785,54.38066465256798/29.282612466371102,34.239677744209466/20.28567831672448,5.0352467270896275/39.598844807379855,9.06344410876133/23.81805110650167,37.26082578046324/21.69926734339955,2.014098690835851/21.153279281773717,...,1.0070493454179255/14.525520705318566,3.0211480362537766/25.607397921167216,13.091641490433032/21.316850502962232,14.098690835850956/18.969467213780806,22.15508559919436/40.76986021317264,2.014098690835851/19.561227977640268,3.0211480362537766/72.85422332950527,44.31017119838872/48.22570475314917,30.211480362537763/27.378917434137776,60904.330312185295/3108.500726345128
37334,ZYG11B,14.551510165240924/28.70199030433525,28.746171307356047/22.043740888065592,25.296630750473323/33.525543978778785,22.134551906664157/29.282612466371102,21.92638997650744/20.28567831672448,29.796893431004236/39.598844807379855,14.055886522010647/23.81805110650167,27.308862741988246/21.69926734339955,19.299584667386974/21.153279281773717,...,14.591160056699346/14.525520705318566,45.08192658822597/25.607397921167216,15.394070358732394/21.316850502962232,11.657068088776107/18.969467213780806,49.165865408443445/40.76986021317264,10.78477047669082/19.561227977640268,23.839497239376307/72.85422332950527,25.326368169067138/48.22570475314917,7.652429051475472/27.378917434137776,599.496446378478/3108.500726345128
37335,ZYX,16.605541972290137/28.70199030433525,14.058679706601467/22.043740888065592,29.543602281988587/33.525543978778785,22.41238793806031/29.282612466371102,18.337408312958434/20.28567831672448,31.581092094539525/39.598844807379855,27.50611246943765/23.81805110650167,10.69682151589242/21.69926734339955,44.31540342298288/21.153279281773717,...,153.01548492257538/14.525520705318566,33.822330888345554/25.607397921167216,91.78891605541972/21.316850502962232,8.659331703341483/18.969467213780806,25.8761206193969/40.76986021317264,47.066014669926645/19.561227977640268,26.89486552567237/72.85422332950527,12.93806030969845/48.22570475314917,28.932355338223307/27.378917434137776,6161.369193154033/3108.500726345128
37336,ZYXP1,0.0/28.70199030433525,0.0/22.043740888065592,0.0/33.525543978778785,0.0/29.282612466371102,0.0/20.28567831672448,0.0/39.598844807379855,0.0/23.81805110650167,0.0/21.69926734339955,0.0/21.153279281773717,...,0.0/14.525520705318566,0.0/25.607397921167216,0.0/21.316850502962232,0.0/18.969467213780806,0.0/40.76986021317264,0.0/19.561227977640268,0.0/72.85422332950527,0.0/48.22570475314917,0.0/27.378917434137776,516931.6239316239/3108.500726345128


### Test filter and sort

6. Test filter_genes_of_interest 1 checks that every gene in the filtered table is part of the neurotransmitter gene list.

7. Test filter_genes_of_interest 2 checks that all of the neurotransmitter genes successfully made it into the table (meaning that the original table contained all 107 of the neurotransmitter genes).

In [16]:
def test_filter_genes_of_interest_1(table):
    """Check that every gene kept by ``filter_genes_of_interest`` is a receptor gene.

    Parameters
    ----------
    table
        Passed unchanged to ``filter_genes_of_interest``; the filtered result
        must expose an ``hgnc_symbol`` column.

    Returns
    -------
    bool
        True when every ``hgnc_symbol`` of the filtered table appears in
        ``neurotransmitter_genes["receptor gene"]`` (also True for an empty
        filtered table), False as soon as one symbol is not in that list.
    """
    function_filter = filter_genes_of_interest(table)
    receptor_genes = neurotransmitter_genes["receptor gene"].tolist()
    is_receptor_gene = function_filter['hgnc_symbol'].isin(receptor_genes)
    # Reduce the boolean mask directly. Looping over (index, value) pairs and
    # comparing each pair with False can never match, so such a check would
    # report success no matter what the filter returned.
    return bool(is_receptor_gene.all())

test_filter_genes_of_interest_1(coad_all)

True

In [17]:
test_filter_genes_of_interest_1(lihc_all)

True

In [18]:
test_filter_genes_of_interest_1(prad_all)

True

In [19]:
def test_filter_genes_of_interest_2(table):
    """Check that the filtered table holds as many symbols as the receptor gene list.

    Counts the non-null ``hgnc_symbol`` entries returned by
    ``filter_genes_of_interest(table)`` and the non-null entries of
    ``neurotransmitter_genes["receptor gene"]``, and returns whether the two
    counts are equal.
    """
    kept_symbols = filter_genes_of_interest(table)['hgnc_symbol']
    expected_symbols = neurotransmitter_genes["receptor gene"]
    return kept_symbols.count() == expected_symbols.count()

test_filter_genes_of_interest_2(coad_all)

True

In [20]:
test_filter_genes_of_interest_2(lihc_all)

True

In [21]:
test_filter_genes_of_interest_2(prad_all)

True

In [22]:
def test_sort_genes_of_interest_1(table):
    """Check that ``sort_genes_of_interest`` keeps every row of ``table``.

    A snapshot of ``table`` is inner-merged with the sorted table on all of
    the snapshot's columns, and the per-column non-null counts of the merge
    are compared with those of the snapshot.

    Returns
    -------
    pandas.Series
        One boolean per column: True where the merged count equals the
        snapshot's count.
    """
    # Take the snapshot before handing ``table`` to the sort function.
    before = table.copy()
    after = sort_genes_of_interest(table)

    join_keys = before.columns.tolist()
    matched = pd.merge(before, after, how='inner', on=join_keys)

    return matched.count() == before.count()

test_sort_genes_of_interest_1(filter_genes_of_interest(coad_all))

hgnc_symbol                                              True
X00106523.5b1d.44ad.a9f1.7d84db08722c.htseq.counts.gz    True
X00589871.e54f.492f.988f.502670edd606.htseq.counts.gz    True
X00cc9b4d.a847.464e.979a.7751e1a87ae3.htseq.counts.gz    True
X00f768f9.9e6c.4e84.bdba.c19368f7e522.htseq.counts.gz    True
                                                         ... 
fd40f0e5.0825.409b.8a4c.86da09f0a84c.htseq.counts.gz     True
fda5b844.1fa4.462e.bd3b.482025f6ee30.htseq.counts.gz     True
fdd24a0e.f3ef.4a5a.87cf.f99884b90fb9.htseq.counts.gz     True
fe898d64.faa4.452c.b8ef.cfec7250a286.htseq.counts.gz     True
ff8de5e6.76c5.491a.8fac.41492723b780.htseq.counts.gz     True
Length: 522, dtype: bool

In [23]:
test_sort_genes_of_interest_1(filter_genes_of_interest(lihc_all))

hgnc_symbol                                              True
X004c60cf.c08e.49df.b4ce.baca41e11250.htseq.counts.gz    True
X0069f64b.8d8f.4426.968d.23483929ee58.htseq.counts.gz    True
X014b9b85.3128.416b.93d4.7ace3b676d4e.htseq.counts.gz    True
X03011a57.3e95.49d1.a927.cff4111d2d5b.htseq.counts.gz    True
                                                         ... 
fe76a5ca.f70a.4ab7.b080.5a19ae36dc2b.htseq.counts.gz     True
feae9113.b2f3.4dd4.9faf.6076eb32c925.htseq.counts.gz     True
ff8776f1.5499.459c.989f.0bc5268e6631.htseq.counts.gz     True
ffeed225.c2a3.4b4c.954c.4816903782a9.htseq.counts.gz     True
ID                                                       True
Length: 426, dtype: bool

In [24]:
test_sort_genes_of_interest_1(filter_genes_of_interest(prad_all))

hgnc_symbol                                              True
X00eb7c26.84fd.4ab9.93a1.3d209dfc0f43.htseq.counts.gz    True
X010d5c94.b440.4927.bc8c.ddc25a56650e.htseq.counts.gz    True
X0160fa3a.3a99.48b5.8bbc.6f0add02ae78.htseq.counts.gz    True
X02599613.1566.40af.8606.a438c2b17061.htseq.counts.gz    True
                                                         ... 
fe7abfa5.005e.4436.9855.03bd8c372508.htseq.counts.gz     True
fe9e58ac.246f.47b3.b2a6.6bd86b06329b.htseq.counts.gz     True
ff052832.df71.405d.9cc9.8e1f55f65610.htseq.counts.gz     True
ff73b9bb.de12.434d.9550.3148c642442b.htseq.counts.gz     True
ID                                                       True
Length: 552, dtype: bool

In [25]:
def test_sort_genes_of_interest_2(table):
    """Check that the sorted table lists its genes in ``receptor_gene_list`` order.

    Walks the rows of ``sort_genes_of_interest(table)`` and compares each
    row's ``hgnc_symbol`` with the entry of ``receptor_gene_list`` at the same
    position. Returns False at the first mismatch and True if every row
    matches. Only as many list entries as there are rows are compared.
    """
    ordered = sort_genes_of_interest(table)
    for position, (_, record) in enumerate(ordered.iterrows()):
        if record['hgnc_symbol'] != receptor_gene_list[position]:
            return False
    return True

test_sort_genes_of_interest_2(filter_genes_of_interest(coad_all))

True

In [26]:
test_sort_genes_of_interest_2(filter_genes_of_interest(lihc_all))

True

In [27]:
test_sort_genes_of_interest_2(filter_genes_of_interest(prad_all))

True

### Test Log Expression Heatmap data manipulations

10. Test compute zscore

In [28]:
def test_z_score(table):
    """Run ``z_score`` on the numeric part of ``table``.

    The ``hgnc_symbol`` column is dropped, the remaining columns are copied
    into a float NumPy array, and that array is passed to ``z_score``. The
    return value is whatever ``z_score`` returns.
    """
    numeric_grid = table.drop(columns='hgnc_symbol').to_numpy(dtype=float, copy=True)
    return z_score(numeric_grid)

test_z_score(coad_tpm)

array([[ 0.06983846, -0.02559923, -0.01850794, ...,  0.51539318,
         0.23956633,  0.29935491],
       [-0.02610668, -0.13003073,  0.12397274, ...,  0.6463984 ,
        -0.13541218,  0.17783404],
       [-0.22794959, -0.21587521, -0.23852426, ..., -0.23977366,
        -0.2436163 , -0.22474902],
       ...,
       [-0.26148573, -0.26065939, -0.27016962, ..., -0.27071671,
        -0.27362645, -0.27029494],
       [ 3.73287373,  2.95478442,  0.75360975, ...,  4.65062567,
         2.37954088,  2.49015479],
       [-0.27641879, -0.27641879, -0.27641879, ..., -0.27641879,
        -0.27641879, -0.27641879]])

Test comparative zscore columns 1 checks that the list generated for each column (normally by the zscore function) is added to the resulting dataframe correctly.

List 0 - 6 gets added to each column

In [38]:
def test_comparative_zscore_columns_1(table, compute_zscore):
    htseq_count_values = table.drop('hgnc_symbol', axis=1)
    expression_grid = htseq_count_values.to_numpy(copy=True, dtype=float)
    rnaseq_columns = list(table.columns.values)

    expression_logged = expression_grid
        
    if compute_zscore:
        comparative_zscore = []
        if htseq_count_values.columns.values[0] == 'Tumor':
            for i in range(len(htseq_count_values.columns.values)):
                column_zscore = []
                for row in range(len(expression_logged[0])):
                    column_zscore.append(i)
                #column_zscore = z_score(expression_logged[:, i])
                print (column_zscore)
                comparative_zscore.append(column_zscore)
            expression_logged = np.transpose(np.array(comparative_zscore)) 
        else:
            expression_logged = z_score(expression_logged)
    
    return pd.DataFrame(expression_logged)
            
test_comparative_zscore_columns_1(tumor_panc_tpm, True)

[0, 0, 0, 0, 0, 0, 0]
[1, 1, 1, 1, 1, 1, 1]
[2, 2, 2, 2, 2, 2, 2]
[3, 3, 3, 3, 3, 3, 3]
[4, 4, 4, 4, 4, 4, 4]
[5, 5, 5, 5, 5, 5, 5]
[6, 6, 6, 6, 6, 6, 6]


Unnamed: 0,0,1,2,3,4,5,6
0,0,1,2,3,4,5,6
1,0,1,2,3,4,5,6
2,0,1,2,3,4,5,6
3,0,1,2,3,4,5,6
4,0,1,2,3,4,5,6
5,0,1,2,3,4,5,6
6,0,1,2,3,4,5,6


Test comparative zscore columns 2 checks that the first and last z-scores computed for each column match the first and last entries of that column in the resulting dataframe.

In [40]:
def test_comparative_zscore_columns_2(table, compute_zscore):
    htseq_count_values = table.drop('hgnc_symbol', axis=1)
    expression_grid = htseq_count_values.to_numpy(copy=True, dtype=float)
    rnaseq_columns = list(table.columns.values)

    expression_logged = expression_grid
        
    if compute_zscore:
        comparative_zscore = []
        if htseq_count_values.columns.values[0] == 'Tumor':
            for i in range(len(htseq_count_values.columns.values)):
                column_zscore = z_score(expression_logged[:, i])
                print ([column_zscore[0], column_zscore[len(column_zscore) -1]])
                comparative_zscore.append(column_zscore)
            expression_logged = np.transpose(np.array(comparative_zscore)) 
        else:
            expression_logged = z_score(expression_logged)
    
    return pd.DataFrame(expression_logged)
            
test_comparative_zscore_columns_2(tumor_panc_tpm, True)

[-0.17070420506258807, -0.35867455245227675]
[2.057309825406346, -0.2871773896346283]
[-0.19325467023053522, -0.1994133837234751]
[-0.20707350649277678, -0.20453940001599996]
[0.05893978467797574, -0.2330601242433928]
[-0.12330516518635026, -0.1269132672602102]
[1.2378431355291852, -0.2096646706058193]


Unnamed: 0,0,1,2,3,4,5,6
0,-0.170704,2.057310,-0.193255,-0.207074,0.058940,-0.123305,1.237843
1,0.116642,-0.200955,-0.199725,-0.206170,-0.224562,-0.127285,-0.209545
2,-0.275934,-0.287703,-0.200653,-0.206907,-0.234517,-0.127482,-0.210146
3,-0.270730,-0.199384,0.772149,-0.075043,-0.043099,-0.074578,-0.006828
4,0.198861,-0.287703,-0.194195,-0.207074,-0.230723,-0.094671,-0.210146
...,...,...,...,...,...,...,...
102,1.384329,-0.281757,-0.193644,-0.198887,-0.227652,-0.125174,-0.197444
103,-0.185972,-0.287532,-0.200653,-0.206443,-0.234458,-0.127482,-0.210146
104,-0.352508,-0.287621,-0.200595,-0.201858,-0.233972,-0.125255,-0.209681
105,6.144871,1.636437,0.388460,-0.135897,-0.019652,0.015871,0.347299


In [57]:
def test_comparative_zscore_and_sort(table, sort):
    """Z-score ``table`` and optionally sort it, returning both stages.

    Parameters
    ----------
    table : pandas.DataFrame
        Must contain an ``hgnc_symbol`` column; every other column is
        converted to float.
    sort : bool
        When true, the z-scored grid is converted to a DataFrame with
        ``convert_numpy_to_pandas``, sorted with ``sort_table`` and converted
        back with ``convert_pandas_to_numpy`` (all three are defined outside
        this cell).

    Returns
    -------
    tuple
        ``(result, pre_sort_tpm)`` where ``result`` is a DataFrame of the
        final grid and ``pre_sort_tpm`` is the z-scored table as it was handed
        to ``sort_table``, or ``None`` when ``sort`` is false.
    """
    htseq_count_values = table.drop('hgnc_symbol', axis=1)
    expression_logged = htseq_count_values.to_numpy(copy=True, dtype=float)
    # Labels of exactly the columns that made it into the grid. This does not
    # depend on where ``hgnc_symbol`` sits in ``table``.
    value_columns = list(htseq_count_values.columns.values)

    if htseq_count_values.columns.values[0] == 'Tumor':
        # Score each column on its own, then put the columns back side by side.
        comparative_zscore = [
            z_score(expression_logged[:, i])
            for i in range(len(value_columns))
        ]
        expression_logged = np.transpose(np.array(comparative_zscore))
    else:
        expression_logged = z_score(expression_logged)

    # Bound up front so the return below is valid when ``sort`` is false.
    pre_sort_tpm = None
    if sort:
        expression_logged_pandas = convert_numpy_to_pandas(expression_logged, value_columns)
        pre_sort_tpm = expression_logged_pandas
        expression_logged_pandas_sorted = sort_table(expression_logged_pandas)
        expression_logged = convert_pandas_to_numpy(expression_logged_pandas_sorted)

    return pd.DataFrame(expression_logged), pre_sort_tpm

tested_sorted_tpm, pre_sort_tpm = test_comparative_zscore_and_sort(tumor_panc_tpm, True)

In [58]:
pre_sort_tpm

Unnamed: 0,hgnc_symbol,Tumor,PANC1,MIAPACA2,HPAC,CAPAN2,BXPC3,CAPAN1
0,DRD1,-0.170704,2.057310,-0.193255,-0.207074,0.058940,-0.123305,1.237843
1,DRD2,0.116642,-0.200955,-0.199725,-0.206170,-0.224562,-0.127285,-0.209545
2,DRD3,-0.275934,-0.287703,-0.200653,-0.206907,-0.234517,-0.127482,-0.210146
3,DRD4,-0.270730,-0.199384,0.772149,-0.075043,-0.043099,-0.074578,-0.006828
4,DRD5,0.198861,-0.287703,-0.194195,-0.207074,-0.230723,-0.094671,-0.210146
...,...,...,...,...,...,...,...,...
102,CHRM1,1.384329,-0.281757,-0.193644,-0.198887,-0.227652,-0.125174,-0.197444
103,CHRM2,-0.185972,-0.287532,-0.200653,-0.206443,-0.234458,-0.127482,-0.210146
104,CHRM3,-0.352508,-0.287621,-0.200595,-0.201858,-0.233972,-0.125255,-0.209681
105,CHRM4,6.144871,1.636437,0.388460,-0.135897,-0.019652,0.015871,0.347299


In [59]:
tested_sorted_tpm

Unnamed: 0,0,1,2,3,4,5,6
0,-0.234517,-0.210146,-0.200653,-0.206907,-0.287703,-0.127482,-0.275934
1,-0.224562,-0.209545,-0.199725,-0.206170,-0.200955,-0.127285,0.116642
2,-0.230723,-0.210146,-0.194195,-0.207074,-0.287703,-0.094671,0.198861
3,-0.043099,-0.006828,0.772149,-0.075043,-0.199384,-0.074578,-0.270730
4,0.058940,1.237843,-0.193255,-0.207074,2.057310,-0.123305,-0.170704
...,...,...,...,...,...,...,...
102,0.672508,0.087483,0.016395,-0.097888,0.232694,-0.053846,-0.319092
103,-0.207768,-0.187520,-0.142385,-0.174465,-0.278436,-0.127482,2.239505
104,0.177469,-0.001931,0.579835,0.410925,0.618944,-0.052801,-0.249490
105,0.148807,0.183480,0.264957,-0.088118,1.213220,0.198009,-0.328789


In [72]:
# Save tested_sorted_tpm (the sorted grid returned by
# test_comparative_zscore_and_sort above) as a comma-separated file under datasets/.
tested_sorted_tpm.to_csv('datasets/test_cell_lines_sorting.csv', sep=',')

It is not clear how the table was being sorted, so let's run the sorting tests on it.

In [60]:
def test_create_sum_column(table):
    rnaseq_orig = table.copy()
    
    excluded = rnaseq_orig.loc[:, 'hgnc_symbol']
    rnaseq_orig.drop('hgnc_symbol', axis=1, inplace=True)
    rnaseq_orig.loc[:, 'Total by row'] = rnaseq_orig.sum(axis=1)
    rnaseq_with_total = pd.concat([excluded.rename('hgnc_symbol'), rnaseq_orig], axis=1)
    
    return rnaseq_with_total

test_create_sum_column(pre_sort_tpm)

Unnamed: 0,hgnc_symbol,Tumor,PANC1,MIAPACA2,HPAC,CAPAN2,BXPC3,CAPAN1,Total by row
0,DRD1,-0.170704,2.057310,-0.193255,-0.207074,0.058940,-0.123305,1.237843,2.659755
1,DRD2,0.116642,-0.200955,-0.199725,-0.206170,-0.224562,-0.127285,-0.209545,-1.051601
2,DRD3,-0.275934,-0.287703,-0.200653,-0.206907,-0.234517,-0.127482,-0.210146,-1.543343
3,DRD4,-0.270730,-0.199384,0.772149,-0.075043,-0.043099,-0.074578,-0.006828,0.102486
4,DRD5,0.198861,-0.287703,-0.194195,-0.207074,-0.230723,-0.094671,-0.210146,-1.025651
...,...,...,...,...,...,...,...,...,...
102,CHRM1,1.384329,-0.281757,-0.193644,-0.198887,-0.227652,-0.125174,-0.197444,0.159772
103,CHRM2,-0.185972,-0.287532,-0.200653,-0.206443,-0.234458,-0.127482,-0.210146,-1.452686
104,CHRM3,-0.352508,-0.287621,-0.200595,-0.201858,-0.233972,-0.125255,-0.209681,-1.611491
105,CHRM4,6.144871,1.636437,0.388460,-0.135897,-0.019652,0.015871,0.347299,8.377388


In [68]:
def test_sorting_rows_1(table):
    """Check that family-by-family row sorting keeps the table's content.

    The table gets a ``Total by row`` column, is cut into consecutive slices
    whose lengths are the family sizes found in ``neurotransmitter_genes``
    (first column = family, in order of first appearance), and each slice is
    sorted by ``Total by row``. The re-assembled table is then compared with
    the unsorted one, ignoring row and column order.

    Parameters
    ----------
    table : pandas.DataFrame
        Must contain an ``hgnc_symbol`` column; the other columns are summed
        per row.

    Returns
    -------
    bool
        True when the sorted and unsorted tables hold the same data, False
        when the comparison fails. Any other error is left to propagate so a
        broken setup is not mistaken for a failed check.
    """
    # --- Test setup ---
    # Only the size of each family matters for slicing, so count members
    # instead of collecting them.
    family_sizes = {}
    for _, row in neurotransmitter_genes.iterrows():
        family = row.iloc[0]
        family_sizes[family] = family_sizes.get(family, 0) + 1

    numeric = table.drop('hgnc_symbol', axis=1)
    numeric_with_total = numeric.assign(**{'Total by row': numeric.sum(axis=1)})
    rnaseq_with_total = pd.concat(
        [table['hgnc_symbol'], numeric_with_total], axis=1
    ).reset_index(drop=True)

    # Sort the rows section by section, one section per family.
    appended_data = []
    index_begin = 0
    for size in family_sizes.values():
        index_end = index_begin + size
        section = rnaseq_with_total[index_begin:index_end]
        appended_data.append(section.sort_values('Total by row', ascending=True))
        index_begin = index_end
    # The families were sorted as separate frames and are stitched back together.
    rnaseq_sorted = pd.concat(appended_data)

    try:
        # check_like=True ignores the order of index and columns, so only
        # lost, added or altered rows make the comparison fail.
        pd.testing.assert_frame_equal(rnaseq_sorted, rnaseq_with_total, check_like=True)
    except AssertionError:
        return False
    return True
    
test_sorting_rows_1(pre_sort_tpm)

True

In [70]:
def test_sorting_rows_2_helper(to_sort):
    """Display ``to_sort`` with pandas' row and column display limits lifted."""
    unlimited = ('display.max_rows', None, 'display.max_columns', None)
    with pd.option_context(*unlimited):
        display(to_sort)

def test_sorting_rows_2(table):
    """Display each family's rows of ``table`` ordered by their row total.

    A ``Total by row`` column is added to a copy of ``table``, the copy is cut
    into consecutive slices sized by the families in ``neurotransmitter_genes``
    (first column = family key, second column = member), each slice is sorted
    by ``Total by row`` in ascending order, and its ``hgnc_symbol`` and
    ``Total by row`` columns are shown via ``test_sorting_rows_2_helper``.

    Returns None; the displayed frames are the result.
    """
    # Members grouped by family, families kept in order of first appearance.
    members_by_family = {}
    for _, entry in neurotransmitter_genes.iterrows():
        members_by_family.setdefault(entry[0], []).append(entry[1])

    # The symbol column is set aside so that only numeric columns are summed.
    numeric = table.copy()
    symbols = numeric.pop('hgnc_symbol')
    numeric.loc[:, 'Total by row'] = numeric.sum(axis=1)
    with_total = pd.concat([symbols.rename('hgnc_symbol'), numeric], axis=1)

    # Walk the table one family-sized slice at a time.
    start = 0
    for members in members_by_family.values():
        stop = start + len(members)
        family_rows = with_total[start:stop].sort_values('Total by row', ascending=True)
        test_sorting_rows_2_helper(family_rows[['hgnc_symbol', 'Total by row']])
        start = stop
    
    
test_sorting_rows_2(pre_sort_tpm)

Unnamed: 0,hgnc_symbol,Total by row
2,DRD3,-1.543343
1,DRD2,-1.051601
4,DRD5,-1.025651
3,DRD4,0.102486
0,DRD1,2.659755


Unnamed: 0,hgnc_symbol,Total by row
12,GRM8,-1.625629
19,GRIK1,-1.624741
9,GRM5,-1.62473
26,GRIN2B,-1.624415
20,GRIK2,-1.61823
14,GRIA2,-1.617003
22,GRIK4,-1.616477
21,GRIK3,-1.615999
18,GRID2,-1.613555
8,GRM4,-1.605186


Unnamed: 0,hgnc_symbol,Total by row
36,GABRA6,-1.625733
31,GABRA1,-1.624861
46,GABRQ,-1.618512
49,GABRR3,-1.615512
47,GABRR1,-1.615167
35,GABRA5,-1.614407
33,GABRA3,-1.607974
32,GABRA2,-1.605356
44,GABRG3,-1.604996
37,GABRB1,-1.602103


Unnamed: 0,hgnc_symbol,Total by row
60,ADRB3,-1.546686
52,ADRA1A,-1.459963
54,ADRA1D,-1.298888
53,ADRA1B,-1.223738
56,ADRA2B,-0.96539
55,ADRA2A,0.306859
58,ADRB1,4.983343
57,ADRA2C,15.508092
59,ADRB2,41.320897


Unnamed: 0,hgnc_symbol,Total by row
63,TACR3,-1.613572
61,TACR1,-1.593291
62,TACR2,-1.39004


Unnamed: 0,hgnc_symbol,Total by row
71,HTR2C,-1.623943
72,HTR4,-1.618133
80,HTR3D,-1.613938
81,HTR3E,-1.61288
69,HTR2A,-1.610807
73,HTR5A,-1.586409
67,HTR1E,-1.581914
74,HTR5BP,-1.570655
79,HTR3C,-1.515795
76,HTR7,-1.503743


Unnamed: 0,hgnc_symbol,Total by row
83,HRH2,-1.470533
85,HRH4,-1.024427
82,HRH1,0.001796
84,HRH3,1.528582


Unnamed: 0,hgnc_symbol,Total by row
106,CHRM5,-1.619443
104,CHRM3,-1.611491
98,CHRNB4,-1.599027
89,CHRNA4,-1.584834
86,CHRNA1,-1.552654
92,CHRNA7,-1.551263
93,CHRNA9,-1.487361
100,CHRNE,-1.455937
103,CHRM2,-1.452686
96,CHRNB2,-1.418298


In [71]:
def test_create_sum_row(table):
    """Return the ``Total by col`` row produced after family-wise row sorting.

    Steps: add a ``Total by row`` column to a copy of ``table``; sort the rows
    slice by slice, with slice lengths taken from the family sizes in
    ``neurotransmitter_genes``; rebuild the sorted rows as a fresh frame with
    default row labels; append a ``Total by col`` row holding the column sums
    of the unsorted table.

    Returns
    -------
    pandas.DataFrame
        The ``Total by col`` row, transposed into a single column.
    """
    # Members grouped by family, families kept in order of first appearance.
    members_by_family = {}
    for _, entry in neurotransmitter_genes.iterrows():
        members_by_family.setdefault(entry[0], []).append(entry[1])

    # The symbol column is set aside so that only numeric columns are summed.
    numeric = table.copy()
    symbols = numeric.pop('hgnc_symbol')
    numeric.loc[:, 'Total by row'] = numeric.sum(axis=1)
    with_total = pd.concat([symbols.rename('hgnc_symbol'), numeric], axis=1)

    # SORTING THE ROWS: one family-sized slice at a time, then stitched together.
    family_blocks = []
    start = 0
    for members in members_by_family.values():
        stop = start + len(members)
        family_blocks.append(
            with_total[start:stop].sort_values('Total by row', ascending=True)
        )
        start = stop
    stitched = pd.concat(family_blocks)

    # Going through NumPy discards the old row labels, so the rebuilt frame
    # is numbered from zero; the column names are restored right after.
    sorted_rows = pd.DataFrame(stitched.to_numpy(copy=True))
    sorted_rows.columns = list(table.columns.values) + ['Total by row']
    sorted_rows.loc['Total by col', :] = with_total.sum(axis=0)

    return sorted_rows.iloc[[-1]].transpose()

test_create_sum_row(pre_sort_tpm)

Unnamed: 0,Total by col
hgnc_symbol,DRD1DRD2DRD3DRD4DRD5GRM1GRM2GRM3GRM4GRM5GRM6GR...
Tumor,5.9952e-15
PANC1,1.60982e-15
MIAPACA2,-3.33067e-16
HPAC,1.11022e-16
CAPAN2,-9.07607e-15
BXPC3,3.19189e-15
CAPAN1,-4.10783e-15
Total by row,-3.77476e-15


These values are all essentially zero because the z-scores within each column sum to zero. Should the z-score map be updated to skip the column step altogether?

Furthermore, why were the column names not changed after sorting the log expression map?
FIXED

In [74]:
def test_sorting_column_2(table):
    rnaseq_orig = table.copy()
    
    neuro_genes_dict = {}
    for index, row in neurotransmitter_genes.iterrows():
        value = row[0]
        if (value in neuro_genes_dict):
            gene_list = neuro_genes_dict[value]
        else:
            gene_list = []
        gene_list.append(row[1])
        neuro_genes_dict[value] = gene_list
    
    # sort table wasn't working right with decimals, so hgnc symbol column was removed
    excluded = rnaseq_orig.loc[:, 'hgnc_symbol']
    rnaseq_orig.drop('hgnc_symbol', axis=1, inplace=True)
    rnaseq_orig.loc[:, 'Total by row'] = rnaseq_orig.sum(axis=1)
    rnaseq_with_total = pd.concat([excluded.rename('hgnc_symbol'), rnaseq_orig], axis=1)
    
    table_columns = list(table.columns.values)
    
    # SORTING THE ROWS -------
    rnaseq_sorted = pd.DataFrame(columns=table_columns)
    
    # sorts the rows section by section, based on the size of each family of neurotransmitters
    index_begin = 0
    index_end = 0
    appended_data = []
    for family, gene_list in neuro_genes_dict.items():
        index_end = len(gene_list) + index_begin
        to_sort = rnaseq_with_total[index_begin : index_end].sort_values('Total by row', ascending=True)
        appended_data.append(to_sort)
        index_begin = index_end
    # the families were sorted as separate dataframes and then concat together
    rnaseq_sorted = pd.concat(appended_data)
    
    # adding the column sum back in so now we can sort by column
    rnaseq_sorted_2 = rnaseq_sorted.to_numpy(copy=True)
    rnaseq_sorted = pd.DataFrame(rnaseq_sorted_2)
    
    table_columns.append('Total by row')
    rnaseq_sorted.columns = table_columns
    rnaseq_sorted.loc['Total by col', :] = rnaseq_with_total.sum(axis=0)
    table_columns.remove('Total by row')
    
    # SORTING THE COLUMNS ----------
    
    # remove hgnc_symbol column, sort the values, and then remove the total col and total row   
    excluded_after_row_sorting = rnaseq_sorted.loc[:, 'hgnc_symbol']
    del rnaseq_sorted['hgnc_symbol']
    
    sorted_cases = rnaseq_sorted.sort_values('Total by col', axis=1, ascending=True)

    display (rnaseq_sorted)
    display (sorted_cases)
    try:
        assert_frame_equal(rnaseq_sorted, sorted_cases, check_like=True)
        return True
    except:
        return False

# Run the column-sorting check on pre_sort_tpm; it displays the frame before and after the column sort and returns True/False.
test_sorting_column_2(pre_sort_tpm)

Unnamed: 0,Tumor,PANC1,MIAPACA2,HPAC,CAPAN2,BXPC3,CAPAN1,Total by row
0,-0.275934,-0.287703,-0.200653,-0.206907,-0.234517,-0.127482,-0.210146,-1.54334
1,0.116642,-0.200955,-0.199725,-0.20617,-0.224562,-0.127285,-0.209545,-1.0516
2,0.198861,-0.287703,-0.194195,-0.207074,-0.230723,-0.0946707,-0.210146,-1.02565
3,-0.27073,-0.199384,0.772149,-0.0750432,-0.043099,-0.0745785,-0.00682806,0.102486
4,-0.170704,2.05731,-0.193255,-0.207074,0.0589398,-0.123305,1.23784,2.65976
...,...,...,...,...,...,...,...,...
103,2.23951,-0.278436,-0.142385,-0.174465,-0.207768,-0.127482,-0.18752,1.12145
104,-0.24949,0.618944,0.579835,0.410925,0.177469,-0.0528008,-0.00193059,1.48295
105,-0.328789,1.21322,0.264957,-0.0881182,0.148807,0.198009,0.18348,1.59157
106,6.14487,1.63644,0.38846,-0.135897,-0.0196523,0.0158713,0.347299,8.37739


Unnamed: 0,CAPAN2,CAPAN1,Total by row,MIAPACA2,HPAC,PANC1,BXPC3,Tumor
0,-0.234517,-0.210146,-1.54334,-0.200653,-0.206907,-0.287703,-0.127482,-0.275934
1,-0.224562,-0.209545,-1.0516,-0.199725,-0.20617,-0.200955,-0.127285,0.116642
2,-0.230723,-0.210146,-1.02565,-0.194195,-0.207074,-0.287703,-0.0946707,0.198861
3,-0.043099,-0.00682806,0.102486,0.772149,-0.0750432,-0.199384,-0.0745785,-0.27073
4,0.0589398,1.23784,2.65976,-0.193255,-0.207074,2.05731,-0.123305,-0.170704
...,...,...,...,...,...,...,...,...
103,-0.207768,-0.18752,1.12145,-0.142385,-0.174465,-0.278436,-0.127482,2.23951
104,0.177469,-0.00193059,1.48295,0.579835,0.410925,0.618944,-0.0528008,-0.24949
105,0.148807,0.18348,1.59157,0.264957,-0.0881182,1.21322,0.198009,-0.328789
106,-0.0196523,0.347299,8.37739,0.38846,-0.135897,1.63644,0.0158713,6.14487


True