# Data Loading

In [1]:

import os
import sys
import logging
import scgenome.db.search

# Route INFO-and-above log records to stderr with a timestamped format.
LOGGING_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, stream=sys.stderr, format=LOGGING_FORMAT)

# Local cache root comes from the environment; an unset variable raises KeyError.
local_cache_directory = os.environ['TANTALUS_CACHE_DIR']

library_id = 'A96157C'

# Search for the hmmcopy analysis of this library, passing the aligner name
# as an additional search argument.
analysis = scgenome.db.search.search_hmmcopy_analysis(library_id, aligner_name='BWA_MEM_0_7_6A')

# Show which jira ticket the returned analysis record carries.
print(analysis['jira_ticket'])


2019-11-04 18:41:04,237 - INFO - searching for hmmcopy data for A96157C
2019-11-04 18:41:06,287 - INFO - found results from jira ticket SC-3041


SC-3041


In [2]:

import scgenome.db.qc

# Jira tickets and sample ids handed to the QC data loader.
hmmcopy_tickets = ['SC-1935', 'SC-1936', 'SC-1937']
sample_ids = ['SA1090', 'SA921', 'SA922']

# Fetch the QC result tables for those tickets; do_caching is switched off.
results_tables = scgenome.db.qc.get_qc_data(
    hmmcopy_tickets, local_cache_directory, sample_ids=sample_ids, do_caching=False)

# Extract the two tables of interest. Both lookups are evaluated before
# either name is bound, so a missing key leaves neither variable assigned.
cn_data, metrics_data = [
    results_tables[table_name]
    for table_name in ('hmmcopy_reads', 'annotation_metrics')
]

print(cn_data.head())


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df[col] = df[col].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  df[col] = df[col].cat.set_categories(col_categories[col])


  chr    start      end  reads        gc      copy  state  \
0   1        1   500000     13 -1.000000       NaN      6   
1   1   500001  1000000    442 -1.000000       NaN      6   
2   1  1000001  1500000    461  0.598332  6.672340      6   
3   1  1500001  2000000    478  0.539498  5.211916      6   
4   1  2000001  2500000    594  0.594508  8.384862      6   

                 cell_id sample_id library_id  
0  SA922-A90554B-R34-C70     SA922    A90554B  
1  SA922-A90554B-R34-C70     SA922    A90554B  
2  SA922-A90554B-R34-C70     SA922    A90554B  
3  SA922-A90554B-R34-C70     SA922    A90554B  
4  SA922-A90554B-R34-C70     SA922    A90554B  


In [3]:

import dbclients.tantalus
import datamanagement.transfer_files

ticket_id = 'SC-1939'
local_cache_directory = os.environ['TANTALUS_CACHE_DIR']

# Client used both to query results records and to drive the file caching below.
tantalus_api = dbclients.tantalus.TantalusApi()

# All 'results' records whose analysis carries this jira ticket.
ticket_results = tantalus_api.list('results', analysis__jira_ticket=ticket_id)

# Cache every results dataset into the local cache directory.
# `filepaths` is rebound on each pass, so only the last dataset's value remains.
for results in ticket_results:
    filepaths = datamanagement.transfer_files.cache_dataset(
        tantalus_api, results['id'], 'resultsdataset', 'singlecellresults', local_cache_directory)


2019-11-04 18:42:41,178 - INFO - 702a37f8-c0d2-46e5-9177-91ece06a7afc - TokenRequest:Getting token with client credentials.
2019-11-04 18:42:41,600 - INFO - 702a37f8-c0d2-46e5-9177-91ece06a7afc - OAuth2Client:Get Token Server returned this correlation_id: 702a37f8-c0d2-46e5-9177-91ece06a7afc
2019-11-04 18:42:48,665 - INFO - starting caching SC-1939/results/SA921_A90554A_snv_annotations.h5 to /Users/mcphera1/Scratch/tantalus_data/
2019-11-04 18:42:49,094 - INFO - skipping transfer of file resource SC-1939/results/SA921_A90554A_snv_annotations.h5 that matches existing file
2019-11-04 18:42:49,094 - INFO - starting caching SC-1939/results/SA921_A90554A_snv_counts.h5 to /Users/mcphera1/Scratch/tantalus_data/
2019-11-04 18:42:49,160 - INFO - skipping transfer of file resource SC-1939/results/SA921_A90554A_snv_counts.h5 that matches existing file
2019-11-04 18:42:49,160 - INFO - starting caching SC-1939/results/SA921_A90554A_museq.vcf.gz to /Users/mcphera1/Scratch/tantalus_data/
2019-11-04 1

2019-11-04 18:42:51,050 - INFO - starting caching SC-1939/results/SA1090_A96213A_strelka_snv.vcf.gz.csi to /Users/mcphera1/Scratch/tantalus_data/
2019-11-04 18:42:51,117 - INFO - skipping transfer of file resource SC-1939/results/SA1090_A96213A_strelka_snv.vcf.gz.csi that matches existing file
2019-11-04 18:42:51,118 - INFO - starting caching SC-1939/results/SA1090_A96213A_strelka_snv.vcf.gz.tbi to /Users/mcphera1/Scratch/tantalus_data/
2019-11-04 18:42:51,185 - INFO - skipping transfer of file resource SC-1939/results/SA1090_A96213A_strelka_snv.vcf.gz.tbi that matches existing file
2019-11-04 18:42:51,186 - INFO - starting caching SC-1939/results/SA1090_A96213A_strelka_indel.vcf.gz to /Users/mcphera1/Scratch/tantalus_data/
2019-11-04 18:42:51,254 - INFO - skipping transfer of file resource SC-1939/results/SA1090_A96213A_strelka_indel.vcf.gz that matches existing file
2019-11-04 18:42:51,255 - INFO - starting caching SC-1939/results/SA1090_A96213A_strelka_indel.vcf.gz.csi to /Users/mcp

2019-11-04 18:42:53,125 - INFO - skipping transfer of file resource SC-1939/results/SA922_A90554B_snv_allele_counts.csv.gz.yaml that matches existing file
2019-11-04 18:42:53,126 - INFO - starting caching SC-1939/results/SA922_A90554B_snv_strelka.csv.gz to /Users/mcphera1/Scratch/tantalus_data/
2019-11-04 18:42:53,192 - INFO - skipping transfer of file resource SC-1939/results/SA922_A90554B_snv_strelka.csv.gz that matches existing file
2019-11-04 18:42:53,193 - INFO - starting caching SC-1939/results/SA922_A90554B_snv_strelka.csv.gz.yaml to /Users/mcphera1/Scratch/tantalus_data/
2019-11-04 18:42:53,260 - INFO - skipping transfer of file resource SC-1939/results/SA922_A90554B_snv_strelka.csv.gz.yaml that matches existing file
2019-11-04 18:42:53,260 - INFO - starting caching SC-1939/results/SA922_A90554B_snv_cosmic_status.csv.gz to /Users/mcphera1/Scratch/tantalus_data/
2019-11-04 18:42:53,328 - INFO - skipping transfer of file resource SC-1939/results/SA922_A90554B_snv_cosmic_status.cs

2019-11-04 18:42:55,054 - INFO - skipping transfer of file resource SC-1939/results/SA1090_A96213A_snv_snpeff.csv.gz.yaml that matches existing file
2019-11-04 18:42:55,055 - INFO - starting caching SC-1939/results/SA1090_A96213A_snv_trinuc.csv.gz to /Users/mcphera1/Scratch/tantalus_data/
2019-11-04 18:42:55,124 - INFO - skipping transfer of file resource SC-1939/results/SA1090_A96213A_snv_trinuc.csv.gz that matches existing file
2019-11-04 18:42:55,125 - INFO - starting caching SC-1939/results/SA1090_A96213A_snv_trinuc.csv.gz.yaml to /Users/mcphera1/Scratch/tantalus_data/
2019-11-04 18:42:55,194 - INFO - skipping transfer of file resource SC-1939/results/SA1090_A96213A_snv_trinuc.csv.gz.yaml that matches existing file
2019-11-04 18:42:55,195 - INFO - starting caching SC-1939/results/SA1090_A96213A_snv_museq.csv.gz to /Users/mcphera1/Scratch/tantalus_data/
2019-11-04 18:42:55,269 - INFO - skipping transfer of file resource SC-1939/results/SA1090_A96213A_snv_museq.csv.gz that matches ex

In [4]:

import scgenome.loaders.snv

# Route INFO-and-above log records to stderr with a timestamped format.
LOGGING_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, stream=sys.stderr, format=LOGGING_FORMAT)

ticket_id = 'SC-1939'
local_cache_directory = os.environ['TANTALUS_CACHE_DIR']

# The loader is pointed at the ticket's sub-directory of the local cache.
ticket_directory = os.path.join(local_cache_directory, ticket_id)

snv_results = scgenome.loaders.snv.load_snv_data(ticket_directory)

# List the tables that came back, then preview the 'snv_data' one.
print(snv_results.keys())
print(snv_results['snv_data'].head())


2019-11-04 18:42:56,675 - INFO - starting load
2019-11-04 18:42:56,688 - INFO - Loading snv mappability annotations from /Users/mcphera1/Scratch/tantalus_data/SC-1939/results/SA921_A90554A_snv_mappability.csv.gz
2019-11-04 18:42:56,891 - INFO - Loading snv mappability annotations from /Users/mcphera1/Scratch/tantalus_data/SC-1939/results/SA922_A90554B_snv_mappability.csv.gz
  strelka_filter=strelka_filter)
2019-11-04 18:42:57,119 - INFO - Loading snv mappability annotations from /Users/mcphera1/Scratch/tantalus_data/SC-1939/results/SA1090_A96213A_snv_mappability.csv.gz
2019-11-04 18:42:58,016 - INFO - Loading snv strelka annotations from /Users/mcphera1/Scratch/tantalus_data/SC-1939/results/SA921_A90554A_snv_strelka.csv.gz
2019-11-04 18:42:58,053 - INFO - Loading snv strelka annotations from /Users/mcphera1/Scratch/tantalus_data/SC-1939/results/SA922_A90554B_snv_strelka.csv.gz
2019-11-04 18:42:58,089 - INFO - Loading snv strelka annotations from /Users/mcphera1/Scratch/tantalus_data/SC

dict_keys(['snv_data', 'snv_count_data'])
   chrom    coord ref alt  alt_counts_sum  ref_counts_sum  mappability  \
17     1   985349   G   A            32.0            27.0          1.0   
24     1  1079129   G   T            80.0             0.0          1.0   
66     1  2032634   T   C            65.0            62.0          1.0   
68     1  2063033   C   A           354.0             1.0          1.0   
74     1  2117392   G   A           430.0             2.0          1.0   

   is_cosmic gene_name effect effect_impact amino_acid_change  \
17      True       NaN    NaN           NaN               NaN   
24       NaN       NaN    NaN           NaN               NaN   
66       NaN       NaN    NaN           NaN               NaN   
68       NaN       NaN    NaN           NaN               NaN   
74       NaN       NaN    NaN           NaN               NaN   

   tri_nucleotide_context  max_strelka_score  max_museq_score  
17                    CGT                 93             0

In [5]:

import scgenome.loaders.breakpoint

# Route INFO-and-above log records to stderr with a timestamped format.
LOGGING_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, stream=sys.stderr, format=LOGGING_FORMAT)

ticket_id = 'SC-1939'
local_cache_directory = os.environ['TANTALUS_CACHE_DIR']

# The loader is pointed at the ticket's sub-directory of the local cache.
ticket_directory = os.path.join(local_cache_directory, ticket_id)

breakpoint_results = scgenome.loaders.breakpoint.load_breakpoint_data(ticket_directory)

# List the tables that came back, then preview each of the two that are read here.
print(breakpoint_results.keys())
print(breakpoint_results['breakpoint_data'].head())
print(breakpoint_results['breakpoint_count_data'].head())


dict_keys(['breakpoint_data', 'breakpoint_count_data'])
   prediction_id chromosome_1 strand_1  position_1 chromosome_2 strand_2  \
0            456            1        -    17084897            1        -   
1            569            1        -   234914910            1        -   
2           1032            1        -   148902774            1        -   
3           1313            1        -   204051209            1        -   
4           2081            1        -    64520958            1        -   

   position_2  homology  num_split inserted  ...  dgv_ids  is_germline  \
0    17084866         8          3      nan  ...      NaN        False   
1   234914883        13          6      nan  ...      NaN        False   
2   148902756         6          2      nan  ...      NaN        False   
3   204051202         0          2      nan  ...      NaN        False   
4    64520949         3          2      nan  ...      NaN        False   

   is_dgv  num_patients  is_filtered  dist

In [6]:

import scgenome.loaders.allele

# Route INFO-and-above log records to stderr with a timestamped format.
LOGGING_FORMAT = "%(asctime)s - %(levelname)s - %(message)s"
logging.basicConfig(format=LOGGING_FORMAT, stream=sys.stderr, level=logging.INFO)

ticket_id = 'SC-1939'

# Read the cache root from the environment, as the other cells in this
# notebook do, instead of hard-coding one user's absolute path. An unset
# variable raises KeyError rather than silently reading from a wrong location.
local_cache_directory = os.environ['TANTALUS_CACHE_DIR']

# The loader is pointed at the ticket's sub-directory of the local cache.
ticket_directory = os.path.join(local_cache_directory, ticket_id)

allele_results = scgenome.loaders.allele.load_haplotype_allele_data(
    ticket_directory,
)

# List the tables that came back, then preview the 'allele_counts' one.
print(allele_results.keys())
print(allele_results['allele_counts'].head())


2019-11-04 18:53:33,567 - INFO - Loading haplotype allele counts from /Users/mcphera1/Scratch/tantalus_data/SC-1939/results/SA921_A90554A_allele_counts.csv
2019-11-04 18:54:00,981 - INFO - Loaded haplotype allele counts table with shape (29543456, 7), memory 1418112296
2019-11-04 18:54:00,982 - INFO - Loading haplotype allele counts from /Users/mcphera1/Scratch/tantalus_data/SC-1939/results/SA922_A90554B_allele_counts.csv
2019-11-04 18:54:26,368 - INFO - Loaded haplotype allele counts table with shape (27822447, 7), memory 1335503608
2019-11-04 18:54:26,369 - INFO - Loading haplotype allele counts from /Users/mcphera1/Scratch/tantalus_data/SC-1939/results/SA1090_A96213A_allele_counts.csv
2019-11-04 18:55:01,056 - INFO - Loaded haplotype allele counts table with shape (34468856, 7), memory 1654532408
2019-11-04 18:55:15,163 - INFO - Loaded all haplotype allele counts table with shape (91834759, 7), memory 4408166984


dict_keys(['allele_counts'])
   allele_id                cell_id chromosome      end  hap_label  readcount  \
0          0  SA921-A90554A-R12-C09          1  1000000         27          1   
1          0  SA921-A90554A-R12-C09          1  2500000        151          1   
2          1  SA921-A90554A-R12-C09          1  3845268        259          1   
3          0  SA921-A90554A-R12-C09          1  4500000        285          1   
4          0  SA921-A90554A-R12-C09          1  7000000        406          1   

     start  
0   521368  
1  2000000  
2  3500000  
3  4000000  
4  6500000  
