From 05ed18568ca9744c189a6bd48d895d9db1b82697 Mon Sep 17 00:00:00 2001 From: Sigve Nakken Date: Sun, 18 Feb 2024 21:38:23 +0100 Subject: [PATCH] simplified tier assignment --- pcgr/arg_checker.py | 8 +- pcgr/biomarker.py | 28 +- pcgr/cna.py | 16 +- pcgr/config.py | 2 +- pcgr/cpsr.py | 4 +- pcgr/main.py | 5 +- pcgr/pcgr_vars.py | 4 +- pcgrr/DESCRIPTION | 4 +- pcgrr/NAMESPACE | 7 +- pcgrr/R/acmg.R | 309 ++- pcgrr/R/biomarkers.R | 93 +- pcgrr/R/germline.R | 355 ++-- pcgrr/R/input_data.R | 321 ++- pcgrr/R/kataegis.R | 5 +- pcgrr/R/main.R | 79 +- pcgrr/R/main2.R | 1801 +++++++++++++++++ pcgrr/R/msi.R | 216 +- pcgrr/R/mutation.R | 92 - pcgrr/R/mutational_burden.R | 4 +- pcgrr/R/mutational_signatures.R | 327 +-- pcgrr/R/reference_data.R | 84 +- pcgrr/R/report.R | 132 +- pcgrr/R/utils.R | 150 +- pcgrr/R/validate.R | 10 - pcgrr/R/value_boxes.R | 4 +- pcgrr/data-raw/data-raw.R | 196 ++ pcgrr/data/cancer_phenotypes_regex.rda | Bin 539 -> 540 bytes pcgrr/data/color_palette.rda | Bin 686 -> 686 bytes pcgrr/data/data_coltype_defs.rda | Bin 1898 -> 1922 bytes pcgrr/data/effect_prediction_algos.rda | Bin 1015 -> 1006 bytes pcgrr/data/evidence_levels.rda | Bin 110 -> 110 bytes pcgrr/data/evidence_types.rda | Bin 138 -> 139 bytes pcgrr/data/tcga_cohorts.rda | Bin 767 -> 762 bytes pcgrr/data/variant_db_url.rda | Bin 549 -> 544 bytes .../pcgr_flexdb/flexdb_scna_tier1.Rmd | 12 +- .../pcgr_flexdb/flexdb_scna_tier2.Rmd | 12 +- .../pcgr_flexdb/flexdb_snv_tier1.Rmd | 18 +- .../pcgr_flexdb/flexdb_snv_tier2.Rmd | 18 +- .../pcgr_rmarkdown/cna_biomarkers.Rmd | 16 +- .../templates/pcgr_rmarkdown/snv_tier1.Rmd | 14 +- .../templates/pcgr_rmarkdown/snv_tier2.Rmd | 10 +- pcgrr/man/append_cancer_gene_evidence.Rd | 11 +- pcgrr/man/assign_acmg_tiers.Rd | 29 + pcgrr/man/assign_germline_popfreq_status.Rd | 2 +- pcgrr/man/assign_somatic_classification.Rd | 6 +- pcgrr/man/assign_somatic_germline_evidence.Rd | 4 +- pcgrr/man/generate_pcgr_report2.Rd | 14 + pcgrr/man/generate_report_data_msi.Rd | 10 +- 
.../man/generate_report_data_signatures_mp.Rd | 20 +- pcgrr/man/generate_report_data_snv_indel.Rd | 20 +- pcgrr/man/generate_report_data_snv_indel2.Rd | 25 + pcgrr/man/generate_report_data_tumor_only.Rd | 11 +- pcgrr/man/generate_tier_tsv.Rd | 324 ++- pcgrr/man/get_population_tag.Rd | 23 - pcgrr/man/get_prevalent_site_signatures.Rd | 8 +- pcgrr/man/get_proper_maf_alleles.Rd | 23 - pcgrr/man/init_kataegis_content.Rd | 11 + pcgrr/man/init_msi_content.Rd | 11 + pcgrr/man/load_dna_variants.Rd | 14 +- pcgrr/man/load_somatic_cna.Rd | 4 +- pcgrr/man/make_upset_plot_data.Rd | 5 +- pcgrr/man/predict_msi_status.Rd | 8 +- pcgrr/man/write_report_output.Rd | 12 +- scripts/pcgrr.R | 34 +- 64 files changed, 3932 insertions(+), 1053 deletions(-) create mode 100644 pcgrr/R/main2.R delete mode 100644 pcgrr/R/validate.R create mode 100644 pcgrr/man/assign_acmg_tiers.Rd create mode 100644 pcgrr/man/generate_pcgr_report2.Rd create mode 100644 pcgrr/man/generate_report_data_snv_indel2.Rd delete mode 100644 pcgrr/man/get_population_tag.Rd delete mode 100644 pcgrr/man/get_proper_maf_alleles.Rd create mode 100644 pcgrr/man/init_kataegis_content.Rd create mode 100644 pcgrr/man/init_msi_content.Rd diff --git a/pcgr/arg_checker.py b/pcgr/arg_checker.py index af13ad8c..bf20378e 100644 --- a/pcgr/arg_checker.py +++ b/pcgr/arg_checker.py @@ -106,12 +106,12 @@ def check_args(arg_dict): # if assay is targeted or mode is Tumor-Only, MSI prediction will not be performed/switched off assay_type = 'Tumor-Control' - if arg_dict['estimate_msi_status'] is True and (arg_dict['assay'] == 'TARGETED' or arg_dict['tumor_only'] is True): + if arg_dict['estimate_msi'] is True and (arg_dict['assay'] == 'TARGETED' or arg_dict['tumor_only'] is True): if arg_dict['tumor_only'] is True: assay_type = 'Tumor-Only' warn_msg = f"MSI status prediction can be applied for WGS/WES tumor-control assays only (query type: {arg_dict['assay']}|{assay_type}) - analysis will be omitted" warn_message(warn_msg, logger) - 
arg_dict['estimate_msi_status'] = 0 + arg_dict['estimate_msi'] = 0 # minimum number of mutations required for mutational signature reconstruction cannot be less than 100 (somewhat arbitrary lower threshold, recommended value is 200) if int(arg_dict['min_mutations_signatures']) < int(pcgr_vars.RECOMMENDED_N_MUT_SIGNATURE): @@ -124,8 +124,8 @@ def check_args(arg_dict): error_message(err_msg, logger) # if MSI status is to be estimated, mutational burden must be turned on - if arg_dict['estimate_msi_status'] is True and arg_dict['estimate_tmb'] is False: - err_msg = "Prediction of MSI status ('--estimate_msi_status') requires mutational burden analysis ('--estimate_tmb')" + if arg_dict['estimate_msi'] is True and arg_dict['estimate_tmb'] is False: + err_msg = "Prediction of MSI status ('--estimate_msi') requires mutational burden analysis ('--estimate_tmb')" error_message(err_msg, logger) if arg_dict['tumor_only'] is True: diff --git a/pcgr/biomarker.py b/pcgr/biomarker.py index 01c627a1..31e3ea02 100644 --- a/pcgr/biomarker.py +++ b/pcgr/biomarker.py @@ -21,7 +21,7 @@ def load_biomarkers(logger, biomarker_variant_fname, biomarker_clinical_fname, b Returns: - variant_biomarkers: A dictionary containing variant biomarkers. The keys are variant alias types - ('dbsnp', 'hgvsp', 'hgvsc', 'genomic', 'exon', 'other', 'aa_region'), and the values are + ('dbsnp', 'hgvsp', 'hgvsc', 'genomic', 'exon', 'other_gene', 'aa_region'), and the values are dictionaries containing variant information. 
Note: @@ -33,7 +33,7 @@ def load_biomarkers(logger, biomarker_variant_fname, biomarker_clinical_fname, b """ variant_biomarkers = {} ##dictionary to return - for variant_alias_type in ['dbsnp','hgvsp','hgvsc','genomic','exon','other','aa_region']: + for variant_alias_type in ['dbsnp','hgvsp','hgvsc','genomic','exon','other_gene','aa_region']: variant_biomarkers[variant_alias_type] = {} check_file_exists(biomarker_clinical_fname, logger) @@ -92,12 +92,12 @@ def load_biomarkers(logger, biomarker_variant_fname, biomarker_clinical_fname, b entry_alias_type = str(row['alias_type']).replace("_grch37", "") entry_alias_type = entry_alias_type.replace("_grch38", "") - if entry_alias_type == "other": + if entry_alias_type == "other_gene": if bool(re.search(r'^((ACTIVATING )?MUTATION|LOSS|START LOSS)$', row['variant_alias'])) is True: varkey = str(row['entrezgene']) - if not varkey in variant_biomarkers['other']: - variant_biomarkers['other'][varkey] = [] - variant_biomarkers['other'][varkey].append(row) + if not varkey in variant_biomarkers['other_gene']: + variant_biomarkers['other_gene'][varkey] = [] + variant_biomarkers['other_gene'][varkey].append(row) if entry_alias_type == 'exon': exons = row['variant_exon'] @@ -131,20 +131,18 @@ def load_biomarkers(logger, biomarker_variant_fname, biomarker_clinical_fname, b if biomarker_vartype == 'CNA' and (row['alteration_type'].startswith('CNA')): row['clinical_evidence_items'] = '.' 
if row['variant_id'] in variant_to_clinical_evidence.keys(): - row['clinical_evidence_items'] = variant_to_clinical_evidence[row['variant_id']] - entry_alias_type = str(row['alias_type']).replace("_grch37", "") - entry_alias_type = entry_alias_type.replace("_grch38", "") + row['clinical_evidence_items'] = variant_to_clinical_evidence[row['variant_id']] - if entry_alias_type == "other": + if row['alias_type'] == "other_gene": if bool(re.search(r'^(AMPLIFICATION|DELETION)$', row['variant_alias'])) is True: varkey = str(row['entrezgene']) + "_" + \ re.sub(r"transcript_","",str(row['variant_consequence'])) - if not varkey in variant_biomarkers['other']: - variant_biomarkers['other'][varkey] = [] + if not varkey in variant_biomarkers['other_gene']: + variant_biomarkers['other_gene'][varkey] = [] del row['variant_exon'] del row['gene'] del row['alias_type'] - variant_biomarkers['other'][varkey].append(row) + variant_biomarkers['other_gene'][varkey].append(row) @@ -327,8 +325,8 @@ def match_csq_biomarker(transcript_csq_elements, variant_biomarkers, rec, princi ## Match biomarkers indicated by gene only - "gene level" resolution if entrezgene != "." 
and principal_csq_entrezgene is True: - if str(entrezgene) in variant_biomarkers['other'].keys(): - hits_gene = variant_biomarkers['other'][str(entrezgene)] + if str(entrezgene) in variant_biomarkers['other_gene'].keys(): + hits_gene = variant_biomarkers['other_gene'][str(entrezgene)] for ghit in hits_gene: bkey3 = f"{ghit['biomarker_source']}|{ghit['variant_id']}|{ghit['clinical_evidence_items']}" ## match biomarkers annotated as "Mutation" only for a given gene - diff --git a/pcgr/cna.py b/pcgr/cna.py index 478deafd..fc63b332 100644 --- a/pcgr/cna.py +++ b/pcgr/cna.py @@ -10,7 +10,7 @@ from pcgr import utils from pybedtools import BedTool from pcgr.annoutils import nuclear_chromosomes -from pcgr.utils import error_message, warn_message, check_file_exists +from pcgr.utils import error_message, warn_message, check_file_exists, remove_file from pcgr.biomarker import load_biomarkers def annotate_cna_segments(output_fname: str, @@ -129,8 +129,8 @@ def annotate_cna_segments(output_fname: str, biomarkers[db] = load_biomarkers( logger, variant_fname, clinical_fname, biomarker_vartype = 'CNA') - for key in biomarkers[db]['other']: - biomarker_data = biomarkers[db]['other'][key] + for key in biomarkers[db]['other_gene']: + biomarker_data = biomarkers[db]['other_gene'][key] biomarker_item = str(db) + '|' + str(biomarker_data[0]['variant_id']) + \ '|' + str(biomarker_data[0]['clinical_evidence_items']) + '|by_cna_segment' if not key in cna_actionable_dict: @@ -154,6 +154,10 @@ def annotate_cna_segments(output_fname: str, cna_query_segment_df.loc[cna_query_segment_df['n_major'] + cna_query_segment_df['n_minor'] > 0,"loss_cond"] = False cna_query_segment_df.loc[cna_query_segment_df['n_major'] + cna_query_segment_df['n_minor'] == 0,"loss_cond"] = True + cna_query_segment_df['variant_class'] = 'undefined' + cna_query_segment_df.loc[cna_query_segment_df.amp_cond, 'variant_class'] = 'gain' + cna_query_segment_df.loc[cna_query_segment_df.loss_cond, 'variant_class'] = 'homdel' + 
cna_query_segment_df.loc[cna_query_segment_df.loss_cond, 'aberration_key'] = \ cna_query_segment_df.loc[cna_query_segment_df.loss_cond, 'entrezgene'].astype(str) + '_ablation' @@ -165,7 +169,7 @@ def annotate_cna_segments(output_fname: str, ## remove all temporary files for fname in temp_files: - utils.remove(fname) + remove_file(fname) cna_query_segment_df.columns = map(str.upper, cna_query_segment_df.columns) cna_query_segment_df.rename(columns = {'CHROMOSOME':'CHROM','SEGMENT_ID':'VAR_ID'}, inplace = True) @@ -253,7 +257,7 @@ def annotate_cytoband(cna_segments_bt: BedTool, output_dir: str, pcgr_build_db_d ## remove all temporary files for fname in temp_files: - utils.remove(fname) + remove_file(fname) return cytoband_annotated_segments @@ -363,7 +367,7 @@ def annotate_transcripts(cna_segments_bt: BedTool, output_dir: str, ## remove all temporary files for fname in temp_files: - utils.remove(fname) + remove_file(fname) return(cna_segments_annotated) diff --git a/pcgr/config.py b/pcgr/config.py index c17e4d4d..636a6d3f 100644 --- a/pcgr/config.py +++ b/pcgr/config.py @@ -94,7 +94,7 @@ def create_config(arg_dict, workflow = "PCGR"): 'exclude_nonexonic': int(arg_dict['exclude_nonexonic']) } conf_options['somatic_snv']['msi'] = { - 'run': int(arg_dict['estimate_msi_status']) + 'run': int(arg_dict['estimate_msi']) } conf_options['somatic_snv']['tmb'] = { 'run': int(arg_dict['estimate_tmb']), diff --git a/pcgr/cpsr.py b/pcgr/cpsr.py index ab33ef4d..07834b97 100755 --- a/pcgr/cpsr.py +++ b/pcgr/cpsr.py @@ -38,7 +38,7 @@ def get_args(): optional_panel.add_argument('--panel_id',dest = "virtual_panel_id",type = str, default = "-1", help="Comma-separated string with identifier(s) of predefined virtual cancer predisposition gene panels,\nchoose any combination of the following identifiers (GEP = Genomics England PanelApp):\n" + str(pcgr_vars.panels)) optional_panel.add_argument('--custom_list',dest = "custom_list",help="Provide custom list of genes from virtual panel 0 
(single-column .txt/.tsv file with Ensembl gene identifiers),\n alternative to predefined panels provided with --panel_id)") optional_panel.add_argument('--custom_list_name',dest = "custom_list_name", default="None", help="Set name for custom made panel/list (single word - no whitespace), will be displayed in the report") - optional_panel.add_argument('--diagnostic_grade_only', action="store_true",help="For panel_id's 1-42 (Genomics England PanelApp) - consider genes with a GREEN status only, default: %(default)s") + optional_panel.add_argument('--diagnostic_grade_only', action="store_true",help="For panel_id's 1-44 (Genomics England PanelApp) - consider genes with a GREEN status only, default: %(default)s") optional_other.add_argument('--force_overwrite', action = "store_true", help='By default, the script will fail with an error if any output file already exists.\n You can force the overwrite of existing result files by using this flag, default: %(default)s') optional_other.add_argument('--version', action='version', version=str(utils.get_cpsr_version())) @@ -204,7 +204,6 @@ def run_cpsr(conf_options, cpsr_paths): output_vcf = vep_vcf) logger = getlogger('cpsr-vep') - #print(str(vep_command["main"])) logger.info(( f"CPSR - STEP 1: Basic variant annotation with Variant Effect Predictor (version {pcgr_vars.VEP_VERSION}, " @@ -224,7 +223,6 @@ def run_cpsr(conf_options, cpsr_paths): check_subprocess(logger, vep_command["tabix"], debug) logger.info("Finished cpsr-vep") print('----') - #exit(0) ## CPSR|vcfanno - run vcfanno on query VCF with a number of relevant annotated VCFs logger = getlogger('cpsr-vcfanno') diff --git a/pcgr/main.py b/pcgr/main.py index 3e010e91..130ad3b6 100755 --- a/pcgr/main.py +++ b/pcgr/main.py @@ -62,10 +62,9 @@ def cli(): optional_allelic_support.add_argument("--control_af_max", type=float, default=1, dest="control_af_max", help="If VCF INFO tag for variant allelic fraction (control) is specified and found, set maximum tolerated AF for 
inclusion in report (default: %(default)s)") optional_tmb_msi.add_argument("--estimate_tmb", action="store_true", help="Estimate tumor mutational burden from the total number of somatic mutations and target region size, default: %(default)s") - #optional_tmb_msi.add_argument("--tmb_algorithm", dest="tmb_algorithm", default="all_coding", choices=[ "all_coding", "nonsyn"], help="Method for calculation of TMB, all coding variants (Chalmers et al., Genome Medicine, 2017), or non-synonymous variants only, default: %(default)s") optional_tmb_msi.add_argument("--tmb_dp_min", dest="tmb_dp_min", default=0, help="If VCF INFO tag for sequencing depth (tumor) is specified and found, set minimum required sequencing depth for TMB calculation: default: %(default)s") optional_tmb_msi.add_argument("--tmb_af_min", dest="tmb_af_min", default=0, help="If VCF INFO tag for allelic fraction (tumor) is specified and found, set minimum required allelic fraction for TMB calculation: default: %(default)s") - optional_tmb_msi.add_argument("--estimate_msi_status", action="store_true", help="Predict microsatellite instability status from patterns of somatic mutations/indels, default: %(default)s") + optional_tmb_msi.add_argument("--estimate_msi", action="store_true", help="Predict microsatellite instability status from patterns of somatic mutations/indels, default: %(default)s") optional_assay.add_argument("--assay", dest="assay", default="WES", choices=[ "WGS", "WES","TARGETED"], help="Type of DNA sequencing assay performed for input data (VCF), default: %(default)s") @@ -77,7 +76,7 @@ def cli(): optional_signatures.add_argument("--min_mutations_signatures", type=int, default=200, dest="min_mutations_signatures", help="Minimum number of SNVs required for reconstruction of mutational signatures (SBS) by MutationalPatterns (default: %(default)s, minimum n = 100)") optional_signatures.add_argument("--all_reference_signatures", action="store_true", help="Use all reference mutational signatures 
(SBS, n = 67) in signature reconstruction rather than only those already attributed to the tumor type (default: %(default)s)") optional_signatures.add_argument("--include_artefact_signatures", action="store_true", help="Include sequencing artefacts in the collection of reference signatures (default: %(default)s") - optional_signatures.add_argument("--prevalence_reference_signatures", type=int, default=5, choices=[1,2,5,10,15,20], help="Minimum tumor-type prevalence (in percent) of reference signatures to be included in refitting procedure (default: %(default)s)") + optional_signatures.add_argument("--prevalence_reference_signatures", type=int, default=1, choices=[1,2,5,10,15,20], help="Minimum tumor-type prevalence (in percent) of reference signatures to be included in refitting procedure (default: %(default)s)") optional_other.add_argument("--cpsr_report", dest="cpsr_report", help="CPSR report file (Gzipped JSON - file ending with 'cpsr..json.gz' - germline report of patient's blood/control sample") optional_other.add_argument("--vcf2maf", action="store_true", help="Generate a MAF file for input VCF using https://github.com/mskcc/vcf2maf (default: %(default)s)") diff --git a/pcgr/pcgr_vars.py b/pcgr/pcgr_vars.py index aff86bc3..0a959d20 100644 --- a/pcgr/pcgr_vars.py +++ b/pcgr/pcgr_vars.py @@ -3,7 +3,7 @@ from pcgr._version import __version__ PCGR_VERSION = __version__ -DB_VERSION = '20240203' +DB_VERSION = '20240209' ## MISCELLANEOUS NCBI_BUILD_MAF = 'GRCh38' @@ -103,7 +103,7 @@ 37: "Renal cancer pertinent cancer susceptibility (GEP)", 38: "Rhabdoid tumour predisposition (GEP)", 39: "Sarcoma cancer susceptibility (GEP)", - 40: "Sarcoma susceptbility (GEP)", + 40: "Sarcoma susceptibility (GEP)", 41: "Thyroid cancer pertinent cancer susceptibility (GEP)", 42: "Tumour predisposition - childhood onset (GEP)", 43: "Upper gastrointestinal cancer pertinent cancer susceptibility (GEP)", diff --git a/pcgrr/DESCRIPTION b/pcgrr/DESCRIPTION index d6f71612..a42c4e84 100644 
--- a/pcgrr/DESCRIPTION +++ b/pcgrr/DESCRIPTION @@ -2,7 +2,7 @@ Package: pcgrr Type: Package Title: Personal Cancer Genome ReporteR Version: 1.4.1.9001 -Date: 2023-12-30 +Date: 2024-12-18 Authors@R: c(person(given = "Sigve", family = "Nakken", @@ -69,5 +69,5 @@ Suggests: BSgenome.Hsapiens.UCSC.hg38 Encoding: UTF-8 LazyData: true -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.1 Roxygen: list(markdown = TRUE) diff --git a/pcgrr/NAMESPACE b/pcgrr/NAMESPACE index 954a4b6a..b15fa866 100644 --- a/pcgrr/NAMESPACE +++ b/pcgrr/NAMESPACE @@ -9,6 +9,7 @@ export(append_gwas_citation_phenotype) export(append_otargets_pheno_link) export(append_tcga_var_link) export(append_tfbs_annotation) +export(assign_acmg_tiers) export(assign_germline_popfreq_status) export(assign_mutation_type) export(assign_somatic_classification) @@ -28,11 +29,13 @@ export(filter_eitems_by_site) export(filter_read_support) export(generate_annotation_link) export(generate_pcgr_report) +export(generate_pcgr_report2) export(generate_report_data_kataegis) export(generate_report_data_msi) export(generate_report_data_rainfall) export(generate_report_data_signatures_mp) export(generate_report_data_snv_indel) +export(generate_report_data_snv_indel2) export(generate_report_data_tmb) export(generate_report_data_trials) export(generate_report_data_tumor_only) @@ -45,15 +48,15 @@ export(get_cna_overlapping_transcripts) export(get_genome_obj) export(get_oncogene_tsgene_target_sets) export(get_ordinary_chromosomes) -export(get_population_tag) export(get_prevalent_site_signatures) -export(get_proper_maf_alleles) export(get_valid_chromosomes) export(het_af_germline_status) export(hom_af_status) export(init_cna_content) export(init_germline_content) +export(init_kataegis_content) export(init_m_signature_content) +export(init_msi_content) export(init_rainfall_content) export(init_report) export(init_report_display_content) diff --git a/pcgrr/R/acmg.R b/pcgrr/R/acmg.R index 34aec75f..e3c9630a 100644 --- a/pcgrr/R/acmg.R +++ 
b/pcgrr/R/acmg.R @@ -18,8 +18,8 @@ assign_tier1_tier2_acmg <- function(pcg_report_snv_indel) { unique_variants_tier2 <- data.frame() ## eitems - eitems_specific_ttype <- - pcg_report_snv_indel[["clin_eitem"]][["specific_ttype"]] + eitems_query_ttype <- + pcg_report_snv_indel[["clin_eitem"]][["query_ttype"]] eitems_any_ttype <- pcg_report_snv_indel[["clin_eitem"]][["any_ttype"]] eitems_other_ttype <- @@ -27,9 +27,9 @@ assign_tier1_tier2_acmg <- function(pcg_report_snv_indel) { for (etype in c("diagnostic", "predictive", "prognostic")) { - if (nrow(eitems_specific_ttype[[etype]][["A_B"]]) > 0) { + if (nrow(eitems_query_ttype[[etype]][["A_B"]]) > 0) { vars <- - dplyr::select(eitems_specific_ttype[[etype]][["A_B"]], + dplyr::select(eitems_query_ttype[[etype]][["A_B"]], .data$GENOMIC_CHANGE) |> dplyr::distinct() unique_variants_tier1 <- @@ -43,16 +43,16 @@ assign_tier1_tier2_acmg <- function(pcg_report_snv_indel) { eitems_other_ttype[[etype]][["A_B"]] <- eitems_any_ttype[[etype]][["A_B"]] - if (nrow(eitems_specific_ttype[[etype]][["A_B"]]) > 0) { + if (nrow(eitems_query_ttype[[etype]][["A_B"]]) > 0) { if (pcgrr::check_common_colnames( df1 = eitems_any_ttype[[etype]][["A_B"]], - df2 = eitems_specific_ttype[[etype]][["A_B"]], + df2 = eitems_query_ttype[[etype]][["A_B"]], cnames = c("GENOMIC_CHANGE"))) { eitems_other_ttype[[etype]][["A_B"]] <- dplyr::anti_join(eitems_any_ttype[[etype]][["A_B"]], - eitems_specific_ttype[[etype]][["A_B"]], + eitems_query_ttype[[etype]][["A_B"]], by = c("GENOMIC_CHANGE")) } } @@ -61,7 +61,7 @@ assign_tier1_tier2_acmg <- function(pcg_report_snv_indel) { if (pcgrr::check_common_colnames( df1 = unique_variants_tier1, df2 = eitems_other_ttype[[etype]][["A_B"]], - cnames = c("GENOMIC_CHANGE"))){ + cnames = c("GENOMIC_CHANGE"))) { eitems_other_ttype[[etype]][["A_B"]] <- dplyr::anti_join(eitems_other_ttype[[etype]][["A_B"]], unique_variants_tier1, @@ -77,22 +77,22 @@ assign_tier1_tier2_acmg <- function(pcg_report_snv_indel) { } } } - if 
(nrow(eitems_specific_ttype[[etype]][["C_D_E"]]) > 0) { + if (nrow(eitems_query_ttype[[etype]][["C_D_E"]]) > 0) { if (nrow(unique_variants_tier1) > 0) { if (pcgrr::check_common_colnames( df1 = unique_variants_tier1, - df2 = eitems_specific_ttype[[etype]][["C_D_E"]], + df2 = eitems_query_ttype[[etype]][["C_D_E"]], cnames = c("GENOMIC_CHANGE"))) { - eitems_specific_ttype[[etype]][["C_D_E"]] <- + eitems_query_ttype[[etype]][["C_D_E"]] <- dplyr::anti_join( - eitems_specific_ttype[[etype]][["C_D_E"]], + eitems_query_ttype[[etype]][["C_D_E"]], unique_variants_tier1, by = c("GENOMIC_CHANGE")) } } - if (nrow(eitems_specific_ttype[[etype]][["C_D_E"]]) > 0) { + if (nrow(eitems_query_ttype[[etype]][["C_D_E"]]) > 0) { unique_variants_tier2 <- unique_variants_tier2 |> dplyr::bind_rows( - dplyr::select(eitems_specific_ttype[[etype]][["C_D_E"]], + dplyr::select(eitems_query_ttype[[etype]][["C_D_E"]], .data$GENOMIC_CHANGE)) |> dplyr::distinct() } @@ -103,8 +103,8 @@ assign_tier1_tier2_acmg <- function(pcg_report_snv_indel) { unique_variants_tier1 pcg_report_snv_indel[["disp"]][["tier2"]] <- unique_variants_tier2 - pcg_report_snv_indel[["clin_eitem"]][["specific_ttype"]] <- - eitems_specific_ttype + pcg_report_snv_indel[["clin_eitem"]][["query_ttype"]] <- + eitems_query_ttype pcg_report_snv_indel[["clin_eitem"]][["any_ttype"]] <- eitems_any_ttype pcg_report_snv_indel[["clin_eitem"]][["other_ttype"]] <- @@ -167,18 +167,18 @@ assign_tier1_tier2_acmg_cna <- function(pcg_report_cna) { unique_variants_tier2 <- data.frame() ## eitems - eitems_specific_ttype <- pcg_report_cna[["clin_eitem"]][["specific_ttype"]] + eitems_query_ttype <- pcg_report_cna[["clin_eitem"]][["query_ttype"]] eitems_any_ttype <- pcg_report_cna[["clin_eitem"]][["any_ttype"]] eitems_other_ttype <- pcg_report_cna[["clin_eitem"]][["other_ttype"]] for (etype in c("diagnostic", "predictive", "prognostic")) { - if (nrow(eitems_specific_ttype[[etype]][["A_B"]]) > 0) { + if (nrow(eitems_query_ttype[[etype]][["A_B"]]) > 0) { 
- assertable::assert_colnames(eitems_specific_ttype[[etype]][["A_B"]], + assertable::assert_colnames(eitems_query_ttype[[etype]][["A_B"]], c("SYMBOL", "SEGMENT", "CNA_TYPE"), only_colnames = F, quiet = T) - vars <- dplyr::select(eitems_specific_ttype[[etype]][["A_B"]], + vars <- dplyr::select(eitems_query_ttype[[etype]][["A_B"]], .data$SYMBOL, .data$SEGMENT, .data$CNA_TYPE) |> dplyr::distinct() unique_variants_tier1 <- rbind(unique_variants_tier1, vars) |> @@ -191,16 +191,16 @@ assign_tier1_tier2_acmg_cna <- function(pcg_report_cna) { eitems_other_ttype[[etype]][["A_B"]] <- eitems_any_ttype[[etype]][["A_B"]] - if (nrow(eitems_specific_ttype[[etype]][["A_B"]]) > 0) { + if (nrow(eitems_query_ttype[[etype]][["A_B"]]) > 0) { if (pcgrr::check_common_colnames( df1 = eitems_any_ttype[[etype]][["A_B"]], - df2 = eitems_specific_ttype[[etype]][["A_B"]], + df2 = eitems_query_ttype[[etype]][["A_B"]], cnames = c("SYMBOL", "SEGMENT", "CNA_TYPE"))) { eitems_other_ttype[[etype]][["A_B"]] <- dplyr::anti_join(eitems_any_ttype[[etype]][["A_B"]], - eitems_specific_ttype[[etype]][["A_B"]], + eitems_query_ttype[[etype]][["A_B"]], by = c("SYMBOL", "SEGMENT", "CNA_TYPE")) } } @@ -230,28 +230,28 @@ assign_tier1_tier2_acmg_cna <- function(pcg_report_cna) { } } } - if (nrow(eitems_specific_ttype[[etype]][["C_D_E"]]) > 0) { + if (nrow(eitems_query_ttype[[etype]][["C_D_E"]]) > 0) { if (nrow(unique_variants_tier1) > 0) { if (pcgrr::check_common_colnames( df1 = unique_variants_tier1, - df2 = eitems_specific_ttype[[etype]][["C_D_E"]], + df2 = eitems_query_ttype[[etype]][["C_D_E"]], cnames = c("SYMBOL", "SEGMENT", "CNA_TYPE"))) { - eitems_specific_ttype[[etype]][["C_D_E"]] <- + eitems_query_ttype[[etype]][["C_D_E"]] <- dplyr::anti_join( - eitems_specific_ttype[[etype]][["C_D_E"]], + eitems_query_ttype[[etype]][["C_D_E"]], unique_variants_tier1, by = c("SYMBOL", "SEGMENT", "CNA_TYPE")) } } - if (nrow(eitems_specific_ttype[[etype]][["C_D_E"]]) > 0) { + if 
(nrow(eitems_query_ttype[[etype]][["C_D_E"]]) > 0) { - assertable::assert_colnames(eitems_specific_ttype[[etype]][["C_D_E"]], + assertable::assert_colnames(eitems_query_ttype[[etype]][["C_D_E"]], c("SYMBOL", "SEGMENT", "CNA_TYPE"), only_colnames = F, quiet = T) unique_variants_tier2 <- unique_variants_tier2 |> dplyr::bind_rows( - dplyr::select(eitems_specific_ttype[[etype]][["C_D_E"]], + dplyr::select(eitems_query_ttype[[etype]][["C_D_E"]], .data$SYMBOL, .data$SEGMENT, .data$CNA_TYPE)) |> dplyr::distinct() } @@ -260,10 +260,257 @@ assign_tier1_tier2_acmg_cna <- function(pcg_report_cna) { pcg_report_cna[["disp"]][["tier1"]] <- unique_variants_tier1 pcg_report_cna[["disp"]][["tier2"]] <- unique_variants_tier2 - pcg_report_cna[["clin_eitem"]][["specific_ttype"]] <- eitems_specific_ttype + pcg_report_cna[["clin_eitem"]][["query_ttype"]] <- eitems_query_ttype pcg_report_cna[["clin_eitem"]][["any_ttype"]] <- eitems_any_ttype pcg_report_cna[["clin_eitem"]][["other_ttype"]] <- eitems_other_ttype return(pcg_report_cna) } + +#' Function that assigns tier classifications to somatic CNA segments and +#' SNVs/InDels, based on the presence of biomarker evidence found in +#' the variant set +#' +#' @param vartype variant type ('snv_indel' or 'cna') +#' @param primary_site primary tumor site +#' @param variants_df data frame with variants (SNVs/InDels or CNAs) +#' @param biomarker_items data frame with biomarker evidence items +#' +#' @export +assign_acmg_tiers <- function( + vartype = "snv_indel", + primary_site = "Any", + variants_df = NULL, + biomarker_items = NULL) { + + invisible(assertthat::assert_that( + is.data.frame(variants_df), + msg = paste0("Argument variants_df needs be of type data.frame"))) + assertable::assert_colnames( + variants_df, c("TUMOR_SUPPRESSOR", + "VAR_ID", + "VARIANT_CLASS", + "ONCOGENE", + "ENTREZGENE"), + only_colnames = F, quiet = T) + invisible(assertthat::assert_that( + is.data.frame(biomarker_items), + msg = paste0("Argument 'biomarker_items' 
needs be of type data.frame"))) + assertable::assert_colnames( + biomarker_items, + c("VAR_ID", + "ENTREZGENE", + "BM_EVIDENCE_LEVEL", + "BM_PRIMARY_SITE"), + only_colnames = F, quiet = T) + + results_acmg <- list() + tier_classification <- data.frame() + + if (NROW(biomarker_items) > 0) { + tier_classification <- + biomarker_items |> + #results[['biomarker_evidence']][['items']] |> + dplyr::select( + c("VAR_ID", + "VARIANT_CLASS", + "ENTREZGENE", + "BM_EVIDENCE_LEVEL", + "BM_PRIMARY_SITE")) |> + dplyr::distinct() |> + dplyr::mutate(ACMG_AMP_TIER = dplyr::case_when( + .data$BM_PRIMARY_SITE == primary_site & + primary_site != "Any" & + stringr::str_detect( + .data$BM_EVIDENCE_LEVEL, "^(A|B)" + ) ~ as.integer(1), + .data$BM_PRIMARY_SITE != primary_site & + #primary_site != "Any" & + stringr::str_detect( + .data$BM_EVIDENCE_LEVEL, "^(A|B)" + ) ~ as.integer(2), + .data$BM_PRIMARY_SITE == primary_site & + primary_site != "Any" & + stringr::str_detect( + .data$BM_EVIDENCE_LEVEL, "^(C|D|E)" + ) ~ as.integer(2), + TRUE ~ as.integer(100) + )) |> + dplyr::group_by( + .data$VAR_ID, + .data$ENTREZGENE, + .data$VARIANT_CLASS) |> + #c("VAR_ID", "ENTREZGENE", "VARIANT_CLASS")) |> + dplyr::summarise( + ACMG_AMP_TIER = min(.data$ACMG_AMP_TIER, na.rm = T), + .groups = "drop") |> + dplyr::mutate(ACMG_AMP_TIER = dplyr::if_else( + .data$ACMG_AMP_TIER == 100, + as.integer(NA), + as.integer(.data$ACMG_AMP_TIER) + )) + + if (vartype == 'snv_indel' & + "CODING_STATUS" %in% colnames(variants_df)) { + + variants_df <- variants_df |> + dplyr::left_join( + tier_classification, + by = c("VAR_ID","ENTREZGENE","VARIANT_CLASS")) |> + + dplyr::mutate(ACMG_TIER2 = dplyr::if_else( + (!is.na(.data$TUMOR_SUPPRESSOR) & + .data$TUMOR_SUPPRESSOR == TRUE) | + (!is.na(.data$ONCOGENE) & + .data$ONCOGENE == TRUE) & + .data$CODING_STATUS == "coding", + as.integer(3), + as.integer(NA) + )) |> + dplyr::mutate(ACMG_TIER2 = dplyr::if_else( + is.na(.data$ACMG_TIER2) | + (!is.na(.data$ACMG_TIER2) & + 
.data$ACMG_TIER2 != 3) & + .data$CODING_STATUS == "coding", + as.integer(4), + as.integer(.data$ACMG_TIER2) + )) |> + dplyr::mutate(ACMG_AMP_TIER = dplyr::if_else( + .data$CODING_STATUS == "noncoding", + as.integer(5), + as.integer(.data$ACMG_AMP_TIER) + )) |> + dplyr::mutate(ACMG_AMP_TIER = dplyr::case_when( + is.na(.data$ACMG_AMP_TIER) & + !is.na(.data$ACMG_TIER2) ~ .data$ACMG_TIER2, + TRUE ~ as.integer(.data$ACMG_AMP_TIER) + )) |> + dplyr::select(-c("ACMG_TIER2")) |> + dplyr::arrange(.data$ACMG_AMP_TIER) + }else{ + + if (vartype == 'cna') { + + variants_df <- variants_df |> + dplyr::left_join( + tier_classification, + by = c("VAR_ID", + "ENTREZGENE", + "VARIANT_CLASS")) |> + dplyr::mutate(ACMG_TIER2 = dplyr::if_else( + (!is.na(.data$TUMOR_SUPPRESSOR) & + .data$TUMOR_SUPPRESSOR == TRUE & + .data$VARIANT_CLASS == "homdel") | + (!is.na(.data$ONCOGENE) & + .data$ONCOGENE == TRUE & + .data$VARIANT_CLASS == "gain"), + as.integer(3), + as.integer(.data$ACMG_AMP_TIER) + )) |> + dplyr::mutate(ACMG_AMP_TIER = dplyr::case_when( + is.na(.data$ACMG_AMP_TIER) & + !is.na(.data$ACMG_TIER2) ~ .data$ACMG_TIER2, + TRUE ~ as.integer(.data$ACMG_AMP_TIER) + )) |> + dplyr::select(-c("ACMG_TIER2")) |> + dplyr::arrange(.data$ACMG_AMP_TIER) |> + dplyr::distinct() + + } + } + } + else{ + if (vartype == 'snv_indel') { + variants_df <- variants_df |> + dplyr::mutate(ACMG_AMP_TIER = dplyr::if_else( + (!is.na(.data$TUMOR_SUPPRESSOR) & + .data$TUMOR_SUPPRESSOR == TRUE) | + (!is.na(.data$ONCOGENE) & + .data$ONCOGENE == TRUE) & + .data$CODING_STATUS == "coding", + as.integer(3), + as.integer(NA) + )) |> + dplyr::mutate(ACMG_AMP_TIER = dplyr::if_else( + is.na(.data$ACMG_AMP_TIER) & + .data$CODING_STATUS == "coding", + as.integer(4), + as.integer(.data$ACMG_AMP_TIER) + )) |> + dplyr::mutate(ACMG_AMP_TIER = dplyr::if_else( + .data$CODING_STATUS == "noncoding", + as.integer(5), + as.integer(.data$ACMG_AMP_TIER) + )) |> + dplyr::arrange(.data$ACMG_AMP_TIER) |> + dplyr::distinct() + } + if (vartype == 
'cna') { + + variants_df <- variants_df |> + dplyr::mutate(ACMG_AMP_TIER = dplyr::if_else( + (!is.na(.data$TUMOR_SUPPRESSOR) & + .data$TUMOR_SUPPRESSOR == TRUE & + .data$VARIANT_CLASS == "homdel") | + (!is.na(.data$ONCOGENE) & + .data$ONCOGENE == TRUE & + .data$VARIANT_CLASS == "gain"), + as.integer(3), + as.integer(NA) + )) |> + dplyr::distinct() |> + dplyr::arrange(.data$ACMG_AMP_TIER) + } + } + + biomarker_items <- biomarker_items |> + dplyr::left_join( + dplyr::select( + variants_df, + c("VAR_ID", + "ENTREZGENE", + "ACMG_AMP_TIER") + ), + by = c("VAR_ID","ENTREZGENE") + ) |> + dplyr::mutate(ACMG_AMP_TIER = dplyr::case_when( + .data$BM_PRIMARY_SITE == primary_site & + primary_site != "Any" & + as.integer(.data$ACMG_AMP_TIER) == 1 & + stringr::str_detect( + .data$BM_EVIDENCE_LEVEL,"^(C|D|E)" + ) ~ as.integer(NA), + .data$BM_PRIMARY_SITE != primary_site & + primary_site != "Any" & + .data$ACMG_AMP_TIER == 2 & + stringr::str_detect( + .data$BM_EVIDENCE_LEVEL,"^(C|D|E)" + ) ~ as.integer(NA), + .data$BM_PRIMARY_SITE != primary_site & + primary_site != "Any" & + .data$ACMG_AMP_TIER == 1 ~ as.integer(NA), + TRUE ~ as.integer(.data$ACMG_AMP_TIER) + )) |> + dplyr::arrange(.data$ACMG_AMP_TIER, + .data$BM_EVIDENCE_LEVEL, + dplyr::desc(.data$BM_RATING)) |> + dplyr::distinct() + + results_acmg[['variant']] <- variants_df |> + dplyr::rename(TIER = .data$ACMG_AMP_TIER) |> + dplyr::mutate(TIER_GUIDELINE = "ACMG_AMP") + + results_acmg[['biomarker_evidence']][['items']] <- + biomarker_items |> + dplyr::rename(TIER = .data$ACMG_AMP_TIER) |> + dplyr::mutate(TIER_GUIDELINE = "ACMG_AMP") + + results_acmg[['biomarker_evidence']][['tier_classification']] <- + tier_classification |> + dplyr::rename(TIER = .data$ACMG_AMP_TIER) |> + dplyr::mutate(TIER_GUIDELINE = "ACMG_AMP") + + return(results_acmg) + +} diff --git a/pcgrr/R/biomarkers.R b/pcgrr/R/biomarkers.R index 1887a793..47bd82c3 100644 --- a/pcgrr/R/biomarkers.R +++ b/pcgrr/R/biomarkers.R @@ -104,7 +104,7 @@ 
get_clin_assocs_snv_indel <- function(sample_calls, pcgrr::log_var_eitem_stats(var_eitems = var_eitems, target_type = "exon") ## Organize all variants in a list object 'clin_items', organized through - ## 1) tumor type (specific_ttype|any_ttype|other_ttype) + ## 1) tumor type (query_ttype|any_ttype|other_ttype) ## 2) evidence type (diagnostic|prognostic|predictive) ## 3) clinical significance ('A_B','C_D_E','any') @@ -145,7 +145,7 @@ get_clin_assocs_snv_indel <- function(sample_calls, #' get_clin_assocs_cna <- function(onco_ts_sets, annotation_tags = NULL, - eitems = NULL){ + eitems = NULL) { assertthat::assert_that( "oncogene_gain" %in% names(onco_ts_sets) & @@ -193,7 +193,7 @@ get_clin_assocs_cna <- function(onco_ts_sets, ## Organize all variants in a list object 'clin_items', organized through - ## 1) tumor type (specific_ttype|any_ttype|other_ttype) + ## 1) tumor type (query_ttype|any_ttype|other_ttype) ## 2) evidence type (diagnostic|prognostic|predictive) ## 3) clinical significance ('A_B','C_D_E','any') @@ -243,7 +243,7 @@ load_eitems <- function(eitems_raw = NULL, "two values: 'Germline' or 'Somatic' and NOT: ", origin))) - if(origin == "Somatic"){ + if (origin == "Somatic") { invisible( assertthat::assert_that( !is.null(tumor_type_specificity), @@ -291,7 +291,7 @@ assertthat::assert_that( ## mutation type and origin eitems_all <- data.frame() - for(alteration_type in alteration_types){ + for(alteration_type in alteration_types) { eitems_alteration_type <- pcgrr::load_all_eitems( eitems_raw = eitems_raw, @@ -397,7 +397,7 @@ load_all_eitems <- function(eitems_raw = NULL, only_colnames = F, quiet = T) - if(alteration_type == "CNA") { + if (alteration_type == "CNA") { selected_eitems[[db]] <- eitems_raw[[db]] |> dplyr::filter(.data$ALTERATION_TYPE == alteration_type & @@ -489,7 +489,7 @@ load_all_eitems <- function(eitems_raw = NULL, # dplyr::distinct() # # -# if(db == "cgi"){ +# if (db == "cgi") { # evidence_identifiers <- c("CGI_ID", "CGI_ID_SEGMENT") # if 
(region_marker == T) { # evidence_identifiers <- c("CGI_ID_SEGMENT", "CGI_ID") @@ -529,7 +529,7 @@ load_all_eitems <- function(eitems_raw = NULL, # cnames = c("HGVS_ALIAS", evidence_identifiers)) # ) # -# if(NROW(var_eitems[['by_id']]) > 0){ +# if (NROW(var_eitems[['by_id']]) > 0) { # var_eitems[['all']] <- var_eitems[['by_id']] # } # @@ -539,11 +539,11 @@ load_all_eitems <- function(eitems_raw = NULL, # ## Add additional var_eitems based on matching against # ## HGVS (protein_change) + SYMBOL # -# if(region_marker == F){ +# if (region_marker == F) { # eitems_hgvs <- eitems_db |> # dplyr::filter(!is.na(.data$HGVS_ALIAS)) # -# if(NROW(eitems_hgvs) > 0){ +# if (NROW(eitems_hgvs) > 0) { # eitems_hgvs <- eitems_hgvs |> # tidyr::separate_rows( # .data$HGVS_ALIAS, sep = "\\|") |> @@ -555,7 +555,7 @@ load_all_eitems <- function(eitems_raw = NULL, # dplyr::filter(!is.na(.data$PROTEIN_CHANGE)) |> # dplyr::select(dplyr::one_of(colset)) # -# if(NROW(vars_hgvs_mapped) > 0){ +# if (NROW(vars_hgvs_mapped) > 0) { # var_eitems_hgvs_mapped <- as.data.frame(vars_hgvs_mapped |> # dplyr::inner_join( # eitems_hgvs, by = c("SYMBOL","PROTEIN_CHANGE")) |> @@ -565,15 +565,15 @@ load_all_eitems <- function(eitems_raw = NULL, # # ## skip duplicate evidence items already found from # ## exact matching at genomic level -# if(NROW(var_eitems_hgvs_mapped) > 0){ -# if(NROW(var_eitems[['by_id']]) > 0){ +# if (NROW(var_eitems_hgvs_mapped) > 0) { +# if (NROW(var_eitems[['by_id']]) > 0) { # var_eitems_hgvs_mapped <- # var_eitems_hgvs_mapped |> # dplyr::anti_join( # var_eitems[['by_id']], by = c("GENOMIC_CHANGE")) # } # -# if(NROW(var_eitems_hgvs_mapped) > 0){ +# if (NROW(var_eitems_hgvs_mapped) > 0) { # var_eitems[['all']] <- var_eitems_exact |> # dplyr::bind_rows(var_eitems_hgvs_mapped) |> # dplyr::distinct() @@ -591,7 +591,7 @@ load_all_eitems <- function(eitems_raw = NULL, # dplyr::filter(!is.na(.data$HGVS_ALIAS)) |> # dplyr::filter(BIOMARKER_MAPPING == "codon") # -# if(NROW(eitems_hgvs_codon) > 
0){ +# if (NROW(eitems_hgvs_codon) > 0) { # eitems_hgvs_codon <- eitems_hgvs_codon |> # tidyr::separate_rows(.data$HGVS_ALIAS, sep = "\\|") |> # dplyr::filter( @@ -615,7 +615,7 @@ load_all_eitems <- function(eitems_raw = NULL, # )) |> # dplyr::select(dplyr::one_of(colset)) # -# if(NROW(vars_codon_mapped) > 0){ +# if (NROW(vars_codon_mapped) > 0) { # var_eitems_codon_mapped <- as.data.frame( # vars_codon_mapped |> # dplyr::inner_join( @@ -627,8 +627,8 @@ load_all_eitems <- function(eitems_raw = NULL, # # ## skip duplicate evidence items already found from # ## exact matching at genomic level -# if(nrow(var_eitems_codon_mapped) > 0){ -# if(NROW(var_eitems[['by_id']]) > 0){ +# if (nrow(var_eitems_codon_mapped) > 0) { +# if (NROW(var_eitems[['by_id']]) > 0) { # var_eitems_codon_mapped <- var_eitems_codon_mapped |> # dplyr::select(-c("AA_CODON")) |> # dplyr::anti_join( @@ -719,8 +719,8 @@ qc_var_eitems <- function(var_eitems = NULL, if (nrow(filtered_var_eitems) > 0) { - if("LOSS_OF_FUNCTION" %in% colnames(filtered_var_eitems) & - "ALTERATION_TYPE" %in% colnames(filtered_var_eitems)){ + if ("LOSS_OF_FUNCTION" %in% colnames(filtered_var_eitems) & + "ALTERATION_TYPE" %in% colnames(filtered_var_eitems)) { filtered_var_eitems <- filtered_var_eitems |> dplyr::filter((.data$LOSS_OF_FUNCTION == T & @@ -877,7 +877,7 @@ structure_var_eitems <- function(var_eitems, #' @export deduplicate_eitems <- function(var_eitems = NULL, target_type = "exact", - target_other = c("codon","exon","gene")){ + target_other = c("codon","exon","gene")) { invisible( assertthat::assert_that(!is.null(var_eitems), @@ -887,7 +887,7 @@ deduplicate_eitems <- function(var_eitems = NULL, msg = paste0("Argument 'target_type' can only", "take on values 'codon' or 'exact'"))) - if(target_type == "exact"){ + if (target_type == "exact") { invisible( assertthat::assert_that( ("codon" %in% target_other & @@ -945,7 +945,7 @@ deduplicate_eitems <- function(var_eitems = NULL, #' @export log_var_eitem_stats <- 
function(var_eitems = NULL, - target_type = "exact"){ + target_type = "exact") { invisible( assertthat::assert_that(!is.null(var_eitems), @@ -981,7 +981,7 @@ log_var_eitem_stats <- function(var_eitems = NULL, var_eitems[[target_type]]$PROTEIN_CHANGE, sep = ":")), collapse = ", ") - if(nchar(variants_found_log) <= 200){ + if (nchar(variants_found_log) <= 200) { pcgrr::log4r_info( variants_found_log ) @@ -1001,51 +1001,62 @@ log_var_eitem_stats <- function(var_eitems = NULL, expand_biomarker_items <- function( callset = NULL, variant_origin = "somatic", - target_genes = NULL){ + target_genes = NULL) { - if("variant" %in% names(callset) & - "biomarker_evidence" %in% names(callset)){ + if ("variant" %in% names(callset) & + "biomarker_evidence" %in% names(callset)) { variant_properties <- c("VAR_ID", "GENOMIC_CHANGE", "GENOME_VERSION", "SAMPLE_ID", - "GENOTYPE", "VARIANT_CLASS", "SYMBOL", "GENENAME", "ENTREZGENE", + "REFSEQ_TRANSCRIPT_ID", + "ENSEMBL_TRANSCRIPT_ID", + "ENSEMBL_PROTEIN_ID", "CONSEQUENCE", "PROTEIN_CHANGE", "MUTATION_HOTSPOT", + "MUTATION_HOTSPOT_CANCERTYPE", "CDS_CHANGE", "LOSS_OF_FUNCTION", + "ONCOGENICITY", + "ONCOGENICITY_CLASSIFICATION_CODE", + "ONCOGENICITY_SCORE", "HGVSc", "HGVSp", "REFSEQ", "OFFICIAL_GENENAME", + "TARGETED_CANCER_DRUGS", "PREDICTED_EFFECT", "PROTEIN_DOMAIN", + "TCGA_FREQUENCY", "DBSNP", "CLINVAR", "COSMIC", "VEP_ALL_CSQ") - if(variant_origin == "germline"){ + if (variant_origin == "germline") { variant_properties <- c( variant_properties, + "GENOTYPE", "CLINVAR_CLASSIFICATION", "CPSR_CLASSIFICATION" ) } - if(variant_origin == "somatic"){ + if (variant_origin == "somatic") { variant_properties <- c( variant_properties, + "CALL_CONFIDENCE", "DP_TUMOR", "AF_TUMOR", "DP_CONTROL", - "AF_CONTROL" + "AF_CONTROL", + "GENOME_VERSION" ) } @@ -1054,7 +1065,7 @@ expand_biomarker_items <- function( for (type in c(pcgrr::evidence_types, "all")) { for (elevel in c("any", "A_B", "C_D_E")) { - 
if(NROW(callset[['biomarker_evidence']][[type]][[elevel]]) > 0){ + if (NROW(callset[['biomarker_evidence']][[type]][[elevel]]) > 0) { callset[['biomarker_evidence']][[type]][[elevel]] <- callset[['biomarker_evidence']][[type]][[elevel]] |> dplyr::left_join( @@ -1068,23 +1079,23 @@ expand_biomarker_items <- function( dplyr::desc( .data$RATING)) - if(variant_origin == "germline"){ + if (variant_origin == "germline") { callset[['biomarker_evidence']][[type]][[elevel]] <- callset[['biomarker_evidence']][[type]][[elevel]] |> dplyr::filter( - (!is.na(CLINVAR_CLASSIFICATION) & + (!is.na(.data$CLINVAR_CLASSIFICATION) & stringr::str_detect( - tolower(CLINVAR_CLASSIFICATION), "pathogenic")) | - (is.na(CLINVAR_CLASSIFICATION) & - !is.na(CPSR_CLASSIFICATION) & + tolower(.data$CLINVAR_CLASSIFICATION), "pathogenic")) | + (is.na(.data$CLINVAR_CLASSIFICATION) & + !is.na(.data$CPSR_CLASSIFICATION) & stringr::str_detect( - tolower(CPSR_CLASSIFICATION), "pathogenic")) + tolower(.data$CPSR_CLASSIFICATION), "pathogenic")) ) - if(NROW(callset[['biomarker_evidence']][[type]][[elevel]]) > 0 & + if (NROW(callset[['biomarker_evidence']][[type]][[elevel]]) > 0 & is.data.frame(target_genes) & NROW(target_genes) > 0 & - "ENTREZGENE" %in% colnames(target_genes)){ + "ENTREZGENE" %in% colnames(target_genes)) { callset[['biomarker_evidence']][[type]][[elevel]] <- callset[['biomarker_evidence']][[type]][[elevel]] |> dplyr::semi_join(target_genes, by = "ENTREZGENE") @@ -1099,3 +1110,5 @@ expand_biomarker_items <- function( return(callset) } + +#assign_classification <- diff --git a/pcgrr/R/germline.R b/pcgrr/R/germline.R index afec4089..72576f16 100644 --- a/pcgrr/R/germline.R +++ b/pcgrr/R/germline.R @@ -5,7 +5,7 @@ #' @param sample_calls data frame with sample variant calls #' #' @export -max_af_gnomad <- function(sample_calls){ +max_af_gnomad <- function(sample_calls) { invisible( assertthat::assert_that(is.data.frame(sample_calls), msg = paste0("Argument 'sample_calls' must be of ", @@ -23,7 
+23,7 @@ max_af_gnomad <- function(sample_calls){ "gnomADe_OTH_AF") sample_calls$MAX_AF_GNOMAD <- 0 for (c in gnomad_cols) { - if(c %in% colnames(sample_calls)){ + if (c %in% colnames(sample_calls)) { if (nrow( sample_calls[!is.na(sample_calls[, c]) & sample_calls[, c] > sample_calls$MAX_AF_GNOMAD, ]) > 0) { @@ -48,7 +48,7 @@ max_af_gnomad <- function(sample_calls){ #' #' #' @export -clinvar_germline_status <- function(sample_calls){ +clinvar_germline_status <- function(sample_calls) { invisible( assertthat::assert_that(is.data.frame(sample_calls), @@ -77,7 +77,7 @@ clinvar_germline_status <- function(sample_calls){ #' @param sample_calls data frame with sample variant calls #' #' @export -dbsnp_germline_status <- function(sample_calls){ +dbsnp_germline_status <- function(sample_calls) { invisible( assertthat::assert_that(is.data.frame(sample_calls), @@ -87,18 +87,12 @@ dbsnp_germline_status <- function(sample_calls){ ## assign STATUS_DBSNP_GERMLINE status to all calls recorded in ## dbSNP (except relevant in a somatic setting, as defined by ClinVar/DoCM) if ("DBSNPRSID" %in% colnames(sample_calls) & - "DOCM_PMID" %in% colnames(sample_calls) & "CLINVAR_MSID" %in% colnames(sample_calls) & "CLINVAR_VARIANT_ORIGIN" %in% colnames(sample_calls)) { sample_calls <- sample_calls |> dplyr::mutate( STATUS_DBSNP_GERMLINE = dplyr::if_else(!is.na(.data$DBSNPRSID), TRUE, FALSE)) |> - dplyr::mutate( - STATUS_DBSNP_GERMLINE = - dplyr::if_else(.data$STATUS_DBSNP_GERMLINE == T & - !is.na(.data$DOCM_PMID), - FALSE, .data$STATUS_DBSNP_GERMLINE)) |> dplyr::mutate( STATUS_DBSNP_GERMLINE = dplyr::if_else( @@ -116,7 +110,7 @@ dbsnp_germline_status <- function(sample_calls){ #' @param sample_calls data frame with sample variant calls #' #' @export -tcga_somatic_status <- function(sample_calls){ +tcga_somatic_status <- function(sample_calls) { invisible( assertthat::assert_that(is.data.frame(sample_calls), @@ -129,7 +123,9 @@ tcga_somatic_status <- function(sample_calls){ sample_calls <- 
sample_calls |> dplyr::mutate( STATUS_TCGA_SOMATIC = - dplyr::if_else(!is.na(.data$TCGA_PANCANCER_COUNT), TRUE, FALSE)) + dplyr::if_else( + !is.na(.data$TCGA_PANCANCER_COUNT), + TRUE, FALSE)) } return(sample_calls) @@ -141,7 +137,7 @@ tcga_somatic_status <- function(sample_calls){ #' @param sample_calls data frame with sample variant calls #' #' @export -cosmic_somatic_status <- function(sample_calls){ +cosmic_somatic_status <- function(sample_calls) { invisible( assertthat::assert_that(is.data.frame(sample_calls), @@ -154,7 +150,9 @@ cosmic_somatic_status <- function(sample_calls){ sample_calls <- sample_calls |> dplyr::mutate( STATUS_COSMIC = - dplyr::if_else(!is.na(.data$COSMIC_MUTATION_ID), TRUE, FALSE)) + dplyr::if_else( + !is.na(.data$COSMIC_MUTATION_ID), + TRUE, FALSE)) } return(sample_calls) @@ -166,7 +164,7 @@ cosmic_somatic_status <- function(sample_calls){ #' @param sample_calls data frame with sample variant calls #' #' @export -hom_af_status <- function(sample_calls){ +hom_af_status <- function(sample_calls) { invisible( @@ -180,8 +178,11 @@ hom_af_status <- function(sample_calls){ sample_calls <- sample_calls |> dplyr::mutate( STATUS_LIKELY_GERMLINE_HOMOZYGOUS = - dplyr::if_else(!is.na(.data$AF_TUMOR) & .data$AF_TUMOR == 1, - TRUE, FALSE)) + dplyr::if_else( + !is.na(.data$AF_TUMOR) & + .data$AF_TUMOR == 1, + TRUE, + FALSE)) } return(sample_calls) } @@ -193,7 +194,7 @@ hom_af_status <- function(sample_calls){ #' @param sample_calls data frame with sample variant calls #' #' @export -pon_status <- function(sample_calls){ +pon_status <- function(sample_calls) { invisible( assertthat::assert_that(is.data.frame(sample_calls), @@ -206,8 +207,10 @@ pon_status <- function(sample_calls){ sample_calls <- sample_calls |> dplyr::mutate( STATUS_PON = - dplyr::if_else(.data$PANEL_OF_NORMALS == TRUE, - TRUE, FALSE)) + dplyr::if_else( + .data$PANEL_OF_NORMALS == TRUE, + TRUE, + FALSE)) } return(sample_calls) } @@ -219,7 +222,7 @@ pon_status <- function(sample_calls){ 
#' @param sample_calls data frame with sample variant calls #' #' @export -het_af_germline_status <- function(sample_calls){ +het_af_germline_status <- function(sample_calls) { invisible( assertthat::assert_that(is.data.frame(sample_calls), @@ -238,12 +241,13 @@ het_af_germline_status <- function(sample_calls){ sample_calls <- sample_calls |> dplyr::mutate( STATUS_LIKELY_GERMLINE_HETEROZYGOUS = - dplyr::if_else(!is.na(.data$MAX_AF_GNOMAD) & - .data$STATUS_DBSNP_GERMLINE == TRUE & - !is.na(.data$AF_TUMOR) & - .data$AF_TUMOR >= 0.40 & .data$AF_TUMOR <= 0.60 & - .data$STATUS_TCGA_SOMATIC == FALSE & - .data$STATUS_COSMIC == FALSE, TRUE, FALSE)) + dplyr::if_else( + !is.na(.data$MAX_AF_GNOMAD) & + .data$STATUS_DBSNP_GERMLINE == TRUE & + !is.na(.data$AF_TUMOR) & + .data$AF_TUMOR >= 0.40 & .data$AF_TUMOR <= 0.60 & + .data$STATUS_TCGA_SOMATIC == FALSE & + .data$STATUS_COSMIC == FALSE, TRUE, FALSE)) } return(sample_calls) } @@ -253,14 +257,14 @@ het_af_germline_status <- function(sample_calls){ #' based on evidence found in variant set, #' potentially limited by user-defined options #' -#' @param sample_calls data frame with variants -#' @param config configuration object +#' @param sample_calls data frame with putative somatic variants +#' @param settings PCGR configuration settings #' #' @return sample_calls #' #' @export -assign_somatic_classification <- function(sample_calls, config) { +assign_somatic_classification <- function(sample_calls, settings) { sample_calls$SOMATIC_CLASSIFICATION <- "SOMATIC" @@ -285,7 +289,7 @@ assign_somatic_classification <- function(sample_calls, config) { sample_calls <- sample_calls |> dplyr::mutate( SOMATIC_CLASSIFICATION = - dplyr::if_else(.data$STATUS_POPFREQ_GNOMAD_ABOVE_TOLERATED == TRUE & + dplyr::if_else(.data$gnomADe_AF_ABOVE_TOLERATED == TRUE & .data$SOMATIC_CLASSIFICATION == "SOMATIC", "GERMLINE_GNOMAD", .data$SOMATIC_CLASSIFICATION)) |> dplyr::mutate( @@ -296,7 +300,7 @@ assign_somatic_classification <- function(sample_calls, 
config) { dplyr::mutate( SOMATIC_CLASSIFICATION = dplyr::if_else(.data$STATUS_PON == TRUE & - config[["tumor_only"]][["exclude_pon"]] == TRUE & + settings$conf$somatic_snv[["exclude_pon"]] == TRUE & .data$SOMATIC_CLASSIFICATION == "SOMATIC", "GERMLINE_PON", .data$SOMATIC_CLASSIFICATION)) |> @@ -304,28 +308,31 @@ assign_somatic_classification <- function(sample_calls, config) { SOMATIC_CLASSIFICATION = dplyr::if_else( .data$STATUS_LIKELY_GERMLINE_HOMOZYGOUS == TRUE & - config[["tumor_only"]][["exclude_likely_hom_germline"]] == TRUE & + settings$conf$somatic_snv[["exclude_likely_hom_germline"]] == TRUE & .data$SOMATIC_CLASSIFICATION == "SOMATIC", - "GERMLINE_HOMOZYGOUS", .data$SOMATIC_CLASSIFICATION)) |> + "GERMLINE_HOMOZYGOUS", + .data$SOMATIC_CLASSIFICATION)) |> dplyr::mutate( SOMATIC_CLASSIFICATION = dplyr::if_else( .data$STATUS_LIKELY_GERMLINE_HETEROZYGOUS == TRUE & - config[["tumor_only"]][["exclude_likely_het_germline"]] == TRUE & + settings$conf$somatic_snv[["exclude_likely_het_germline"]] == TRUE & .data$SOMATIC_CLASSIFICATION == "SOMATIC", - "GERMLINE_HETEROZYGOUS", .data$SOMATIC_CLASSIFICATION)) + "GERMLINE_HETEROZYGOUS", + .data$SOMATIC_CLASSIFICATION)) ## set variants found in DBSNP as germline if this option is set to TRUE - if (config[["tumor_only"]][["exclude_dbsnp_nonsomatic"]] == TRUE) { + if (settings$conf$somatic_snv[["exclude_dbsnp_nonsomatic"]] == TRUE) { sample_calls <- sample_calls |> dplyr::mutate( SOMATIC_CLASSIFICATION = - dplyr::if_else(.data$STATUS_DBSNP_GERMLINE == TRUE & - .data$STATUS_TCGA_SOMATIC == FALSE & - .data$STATUS_COSMIC == FALSE & - .data$SOMATIC_CLASSIFICATION == "SOMATIC", - "GERMLINE_DBSNP", .data$SOMATIC_CLASSIFICATION)) + dplyr::if_else( + .data$STATUS_DBSNP_GERMLINE == TRUE & + .data$STATUS_TCGA_SOMATIC == FALSE & + .data$STATUS_COSMIC == FALSE & + .data$SOMATIC_CLASSIFICATION == "SOMATIC", + "GERMLINE_DBSNP", .data$SOMATIC_CLASSIFICATION)) } @@ -336,13 +343,15 @@ assign_somatic_classification <- function(sample_calls, 
config) { #' evidence for somatic/germline status of variants #' #' @param sample_calls data frame with variants -#' @param config configuration object +#' @param settings PCGR configuration settings #' #' @return sample_calls #' #' @export -assign_somatic_germline_evidence <- function(sample_calls, config) { +assign_somatic_germline_evidence <- function( + sample_calls, + settings = NULL) { invisible( assertthat::assert_that( @@ -360,7 +369,7 @@ assign_somatic_germline_evidence <- function(sample_calls, config) { pop = pop, dbquery = "gnomADe", max_tolerated_af = - config[["tumor_only"]][[paste0("maf_gnomad_", tolower(pop))]]) + settings$conf$somatic_snv$tumor_only[[paste0("maf_gnomad_", tolower(pop))]]) } sample_calls <- sample_calls |> @@ -389,26 +398,26 @@ assign_somatic_germline_evidence <- function(sample_calls, config) { #' #' @export assign_germline_popfreq_status <- function(sample_calls, - pop = "EUR", + pop = "NFE", dbquery = "gnomADe", max_tolerated_af = 0.01) { if (dbquery == "gnomADe") { - if (!("STATUS_POPFREQ_GNOMAD_ABOVE_TOLERATED" %in% colnames(sample_calls))) { - sample_calls$STATUS_POPFREQ_GNOMAD_ABOVE_TOLERATED <- FALSE + if (!("gnomADe_AF_ABOVE_TOLERATED" %in% colnames(sample_calls))) { + sample_calls$gnomADe_AF_ABOVE_TOLERATED <- FALSE } col <- paste0(dbquery,"_",pop, "_AF") if (any(grepl(paste0("^", col, "$"), names(sample_calls)))) { sample_calls$max_tolerated_af <- max_tolerated_af - if(nrow( + if (nrow( sample_calls[!is.na(sample_calls[, col]) & - sample_calls[, col] > sample_calls$max_tolerated_af, ]) > 0){ + sample_calls[, col] > sample_calls$max_tolerated_af, ]) > 0) { sample_calls[!is.na(sample_calls[, col]) & sample_calls[, col] > sample_calls$max_tolerated_af, - "STATUS_POPFREQ_GNOMAD_ABOVE_TOLERATED"] <- TRUE + "gnomADe_AF_ABOVE_TOLERATED"] <- TRUE } sample_calls$max_tolerated_af <- NULL } @@ -421,132 +430,126 @@ assign_germline_popfreq_status <- function(sample_calls, #' Function that retrieves name of VCF INFO tag and #' 
population description for gnomad/1000G population #' -#' @param population_code three-letter code -#' @param db 1KG or GNOMAD -#' @param subset NA or "non_cancer" (for GNOMAD) #' -#' @return pop_tag_info -#' -#' @export -get_population_tag <- function(population_code, db = "1KG", subset = NA) { - pop_tag_info <- - list("vcf_tag" = paste0(toupper(population_code), "_AF_", db), - "pop_description" = NA) - if (db == "GNOMAD"){ - if(!is.na(subset)){ - if(subset == "non_cancer"){ - pop_tag_info <- - list("vcf_tag" = - paste0("NON_CANCER_AF_", toupper(population_code)), - "pop_description" = NA) - } - } - } - - pop_descriptions_1KG <- - data.frame(code = "afr", - pop_description = "African", stringsAsFactors = F) |> - rbind(data.frame( - code = "amr", - pop_description = "Admixed American", stringsAsFactors = F)) |> - rbind(data.frame( - code = "eur", - pop_description = "European", stringsAsFactors = F)) |> - rbind(data.frame( - code = "eas", - pop_description = "East Asian", stringsAsFactors = F)) |> - rbind(data.frame( - code = "sas", - pop_description = "South Asian", stringsAsFactors = F)) |> - rbind(data.frame( - code = "global", - pop_description = "global", stringsAsFactors = F)) - - pop_descriptions_gnomad <- - data.frame(code = "afr", - pop_description = "African", stringsAsFactors = F) |> - rbind(data.frame( - code = "amr", - pop_description = "Admixed American", stringsAsFactors = F)) |> - rbind(data.frame( - code = "nfe", - pop_description = "Non-Finnish European", stringsAsFactors = F)) |> - rbind(data.frame( - code = "fin", - pop_description = "Finnish", stringsAsFactors = F)) |> - rbind(data.frame( - code = "oth", - pop_description = "Other", stringsAsFactors = F)) |> - rbind(data.frame( - code = "asj", - pop_description = "Ashkenazi Jewish", stringsAsFactors = F)) |> - rbind(data.frame( - code = "eas", - pop_description = "East Asian", stringsAsFactors = F)) |> - rbind(data.frame( - code = "sas", - pop_description = "South Asian", stringsAsFactors = F)) 
|> - rbind(data.frame( - code = "global", - pop_description = "global", stringsAsFactors = F)) - - pop_descriptions_gnomad_non_cancer <- - data.frame(code = "afr", - pop_description = "African non-cancer subset", - stringsAsFactors = F) |> - rbind(data.frame( - code = "amr", - pop_description = "Admixed American non-cancer subset", - stringsAsFactors = F)) |> - rbind(data.frame( - code = "nfe", - pop_description = "Non-Finnish European non-cancer subset", - stringsAsFactors = F)) |> - rbind(data.frame( - code = "fin", - pop_description = "Finnish non-cancer subset", - stringsAsFactors = F)) |> - rbind(data.frame( - code = "oth", - pop_description = "Other non-cancer subset", - stringsAsFactors = F)) |> - rbind(data.frame( - code = "asj", - pop_description = "Ashkenazi Jewish non-cancer subset", - stringsAsFactors = F)) |> - rbind(data.frame( - code = "eas", - pop_description = "East Asian non-cancer subset", - stringsAsFactors = F)) |> - rbind(data.frame( - code = "sas", - pop_description = "South Asian non-cancer subset", - stringsAsFactors = F)) |> - rbind(data.frame( - code = "global", - pop_description = "Global non-cancer subset", - stringsAsFactors = F)) - - if (db == "1KG") { - pop_entry <- dplyr::filter(pop_descriptions_1KG, - .data$code == population_code) - pop_tag_info[["pop_description"]] <- pop_entry$pop_description - } - if (db == "GNOMAD") { - pop_entry <- dplyr::filter(pop_descriptions_gnomad, - .data$code == tolower(population_code)) - pop_tag_info[["pop_description"]] <- pop_entry$pop_description - if(!is.na(subset)){ - if (subset == "non_cancer") { - pop_entry <- dplyr::filter(pop_descriptions_gnomad_non_cancer, - .data$code == tolower(population_code)) - pop_tag_info[["pop_description"]] <- pop_entry$pop_description - } - } - - } - return(pop_tag_info) -} +# get_population_tag <- function(population_code, db = "1KG", subset = NA) { +# pop_tag_info <- +# list("vcf_tag" = paste0(toupper(population_code), "_AF_", db), +# "pop_description" = NA) +# 
if (db == "GNOMAD") { +# if (!is.na(subset)) { +# if (subset == "non_cancer") { +# pop_tag_info <- +# list("vcf_tag" = +# paste0("NON_CANCER_AF_", toupper(population_code)), +# "pop_description" = NA) +# } +# } +# } +# +# pop_descriptions_1KG <- +# data.frame(code = "afr", +# pop_description = "African", stringsAsFactors = F) |> +# rbind(data.frame( +# code = "amr", +# pop_description = "Admixed American", stringsAsFactors = F)) |> +# rbind(data.frame( +# code = "eur", +# pop_description = "European", stringsAsFactors = F)) |> +# rbind(data.frame( +# code = "eas", +# pop_description = "East Asian", stringsAsFactors = F)) |> +# rbind(data.frame( +# code = "sas", +# pop_description = "South Asian", stringsAsFactors = F)) |> +# rbind(data.frame( +# code = "global", +# pop_description = "global", stringsAsFactors = F)) +# +# pop_descriptions_gnomad <- +# data.frame(code = "afr", +# pop_description = "African", stringsAsFactors = F) |> +# rbind(data.frame( +# code = "amr", +# pop_description = "Admixed American", stringsAsFactors = F)) |> +# rbind(data.frame( +# code = "nfe", +# pop_description = "Non-Finnish European", stringsAsFactors = F)) |> +# rbind(data.frame( +# code = "fin", +# pop_description = "Finnish", stringsAsFactors = F)) |> +# rbind(data.frame( +# code = "oth", +# pop_description = "Other", stringsAsFactors = F)) |> +# rbind(data.frame( +# code = "asj", +# pop_description = "Ashkenazi Jewish", stringsAsFactors = F)) |> +# rbind(data.frame( +# code = "eas", +# pop_description = "East Asian", stringsAsFactors = F)) |> +# rbind(data.frame( +# code = "sas", +# pop_description = "South Asian", stringsAsFactors = F)) |> +# rbind(data.frame( +# code = "global", +# pop_description = "global", stringsAsFactors = F)) +# +# pop_descriptions_gnomad_non_cancer <- +# data.frame(code = "afr", +# pop_description = "African non-cancer subset", +# stringsAsFactors = F) |> +# rbind(data.frame( +# code = "amr", +# pop_description = "Admixed American non-cancer subset", +# 
stringsAsFactors = F)) |> +# rbind(data.frame( +# code = "nfe", +# pop_description = "Non-Finnish European non-cancer subset", +# stringsAsFactors = F)) |> +# rbind(data.frame( +# code = "fin", +# pop_description = "Finnish non-cancer subset", +# stringsAsFactors = F)) |> +# rbind(data.frame( +# code = "oth", +# pop_description = "Other non-cancer subset", +# stringsAsFactors = F)) |> +# rbind(data.frame( +# code = "asj", +# pop_description = "Ashkenazi Jewish non-cancer subset", +# stringsAsFactors = F)) |> +# rbind(data.frame( +# code = "eas", +# pop_description = "East Asian non-cancer subset", +# stringsAsFactors = F)) |> +# rbind(data.frame( +# code = "sas", +# pop_description = "South Asian non-cancer subset", +# stringsAsFactors = F)) |> +# rbind(data.frame( +# code = "global", +# pop_description = "Global non-cancer subset", +# stringsAsFactors = F)) +# +# if (db == "1KG") { +# pop_entry <- dplyr::filter(pop_descriptions_1KG, +# .data$code == population_code) +# pop_tag_info[["pop_description"]] <- pop_entry$pop_description +# } +# if (db == "GNOMAD") { +# pop_entry <- dplyr::filter(pop_descriptions_gnomad, +# .data$code == tolower(population_code)) +# pop_tag_info[["pop_description"]] <- pop_entry$pop_description +# if (!is.na(subset)) { +# if (subset == "non_cancer") { +# pop_entry <- dplyr::filter(pop_descriptions_gnomad_non_cancer, +# .data$code == tolower(population_code)) +# pop_tag_info[["pop_description"]] <- pop_entry$pop_description +# } +# } +# +# } +# return(pop_tag_info) +# } #' Function that makes input data for an UpSet plot #' (filtering/intersection results) for the somatic-germline diff --git a/pcgrr/R/input_data.R b/pcgrr/R/input_data.R index 399c9ae3..2f6f7b50 100644 --- a/pcgrr/R/input_data.R +++ b/pcgrr/R/input_data.R @@ -3,21 +3,45 @@ #' #' @param fname Path to file name #' @param ref_data Object with reference data +#' @param settings Object with PCGR report configuration #' #' @export -load_somatic_cna <- function(fname, ref_data = 
NULL){ +load_somatic_cna <- function( + fname, + ref_data = NULL, + settings = NULL) { log4r_info(paste0( "Reading annotated molecular dataset (DNA) - somatic copy number aberrations")) - callset <- load_dna_variants( + callset_cna <- pcgrr::load_dna_variants( fname = fname, cols = pcgrr::data_coltype_defs$cna_somatic_raw, ref_data = ref_data, + vartype = 'cna', + primary_site = settings[['conf']][['sample_properties']]$site, retained_info_tags = "None", variant_origin = "Somatic") - return(callset) + tumor_site <- + settings[['conf']][['sample_properties']][['site']] + + if (NROW(callset_cna$variant) > 0) { + callset_cna[['variant']] <- callset_cna[['variant']] |> + pcgrr::append_cancer_gene_evidence( + ref_data = ref_data, + site = tumor_site, + pos_var = 'SEGMENT_START') |> + pcgrr::append_drug_var_link( + ref_data = ref_data + ) |> + dplyr::arrange( + .data$TIER, + dplyr::desc(.data$TISSUE_ASSOC_RANK), + dplyr::desc(.data$GLOBAL_ASSOC_RANK)) + } + + return(callset_cna) } @@ -32,19 +56,26 @@ load_somatic_cna <- function(fname, ref_data = NULL){ load_somatic_snv_indel <- function( fname = NA, ref_data = NULL, - settings = NULL){ + settings = NULL) { log4r_info(paste0( "Reading annotated molecular dataset (DNA) - somatic SNV/InDels")) - callset <- load_dna_variants( + callset <- pcgrr::load_dna_variants( fname = fname, cols = pcgrr::data_coltype_defs$snv_indel_somatic_raw, ref_data = ref_data, + vartype = 'snv_indel', + primary_site = settings[['conf']][['sample_properties']]$site, retained_info_tags = settings[['conf']][['other']]$retained_vcf_info_tags, variant_origin = "Somatic") + tumor_site <- + settings[['conf']][['sample_properties']][['site']] + + + callset[['variant_unfiltered']] <- data.frame() callset[['variant']] <- callset[['variant']] |> pcgrr::append_dbnsfp_var_link() |> pcgrr::append_dbmts_var_link() |> @@ -52,12 +83,34 @@ load_somatic_snv_indel <- function( pcgrr::append_annotation_links() |> pcgrr::append_drug_var_link(ref_data = ref_data) |> 
pcgrr::append_tfbs_annotation() |> - pcgrr::append_cancer_gene_evidence(ref_data = ref_data) + pcgrr::append_cancer_gene_evidence(ref_data = ref_data, + site = tumor_site) + + if (settings$conf$assay_properties$vcf_tumor_only == 1) { + callset[['variant_unfiltered']] <- callset[['variant']] + callset[['variant']] <- callset[['variant']] |> + ## assign evidence tags for germline/somatic state of variants, + ## partially based on user-defined criteria + ## (population allele frequency thresholds) + pcgrr::assign_somatic_germline_evidence2( + settings = settings) |> + + ## assign somatic classification based on accumulation + ## of evidence tags and user-defined options + pcgrr::assign_somatic_classification( + settings = settings) + } + + callset[['variant']] <- callset[['variant']] |> + dplyr::arrange(.data$TIER, + dplyr::desc(.data$ONCOGENICITY_SCORE), + dplyr::desc(.data$TISSUE_ASSOC_RANK), + dplyr::desc(.data$GLOBAL_ASSOC_RANK)) - callset <- - pcgrr::expand_biomarker_items( - callset = callset, - variant_origin = "somatic") + # callset <- + # pcgrr::expand_biomarker_items( + # callset = callset, + # variant_origin = "somatic") return(callset) @@ -65,11 +118,13 @@ load_somatic_snv_indel <- function( } #' Function that reads and validates CNA or SNV/InDel TSV files -#' file from PCGR/CPSR pre-report pipeline +#' file from PCGR/CPSR pre-report (Python) pipeline #' -#' @param fname Path to raw file with DNA aberrations (PCGR/CPSR) -#' @param cols column type definitions of input +#' @param fname Path to raw input file with DNA aberrations (PCGR/CPSR) +#' @param cols column type definitions of raw input file #' @param ref_data reference data object +#' @param vartype type of DNA aberrations ('snv_indel','cna') +#' @param primary_site primary site of tumor #' @param retained_info_tags VCF INFO tags to be retained in output (SNVs/InDels) #' @param variant_origin Germline/Somatic #' @@ -79,8 +134,10 @@ load_dna_variants <- function( fname = NA, cols = NULL, ref_data = 
NULL, + vartype = 'snv_indel', + primary_site = "Any", retained_info_tags = "None", - variant_origin = "Somatic"){ + variant_origin = "Somatic") { pcgrr::check_file_exists(fname) calls_raw <- suppressWarnings( @@ -101,7 +158,7 @@ load_dna_variants <- function( raw_col_check <- tibble::has_name(calls_raw, compulsary_cols) - if(FALSE %in% raw_col_check){ + if (FALSE %in% raw_col_check) { missing_cols <- compulsary_cols[!raw_col_check] log4r_fatal( @@ -112,11 +169,11 @@ load_dna_variants <- function( cols_including_retained <- cols retained_cols <- NULL - if(retained_info_tags != "None"){ + if (retained_info_tags != "None") { retained_cols <- stringr::str_split( retained_info_tags, pattern = ",")[[1]] - for(c in retained_cols){ - if(c %in% colnames(calls_raw)){ + for(c in retained_cols) { + if (c %in% colnames(calls_raw)) { col_retain <- readr::cols_only( !!rlang::sym(c) := readr::col_character() ) @@ -139,9 +196,9 @@ load_dna_variants <- function( ) retained_cols_renamed <- c() - if(!is.null(retained_cols)){ - for(c in retained_cols){ - if(c %in% colnames(calls)){ + if (!is.null(retained_cols)) { + for(c in retained_cols) { + if (c %in% colnames(calls)) { new_col <- paste0('VCF_INFO_', c) retained_cols_renamed <- c( retained_cols_renamed, new_col @@ -155,30 +212,27 @@ load_dna_variants <- function( results <- list() results[['variant']] <- calls results[['biomarker_evidence']] <- list() - results[['biomarker_evidence']][['all']] <- list() - for (elevel in pcgrr::evidence_levels) { - results[['biomarker_evidence']][['all']][[elevel]] <- data.frame() - } + results[['biomarker_evidence']][['items']] <- + data.frame() - for (type in pcgrr::evidence_types) { - results[['biomarker_evidence']][[type]] <- list() - for (elevel in pcgrr::evidence_levels) { - results[['biomarker_evidence']][[type]][[elevel]] <- data.frame() - } + ## Rename annotations for more clarity + if ("TSG" %in% colnames(results[['variant']])) { + results[['variant']] <- + results[['variant']] |> + 
dplyr::rename( + TUMOR_SUPPRESSOR = "TSG" + ) } - results[['retained_info_tags']] <- paste( - retained_cols_renamed, collapse="," - ) - - if("TSG" %in% colnames(results[['variant']])){ + if ("ONCOGENICITY_CLASSIFICATION" %in% colnames(results[['variant']])) { results[['variant']] <- results[['variant']] |> dplyr::rename( - TUMOR_SUPPRESSOR = "TSG" + ONCOGENICITY = "ONCOGENICITY_CLASSIFICATION" ) } - if("VEP_ALL_CSQ" %in% colnames(results[['variant']])){ + + if ("VEP_ALL_CSQ" %in% colnames(results[['variant']])) { results[['variant']] <- results[['variant']] |> dplyr::mutate( @@ -187,14 +241,16 @@ load_dna_variants <- function( ) ) } - if("HGVSp_short" %in% colnames(results[['variant']])){ + + if ("HGVSp_short" %in% colnames(results[['variant']])) { results[['variant']] <- results[['variant']] |> dplyr::rename( HGVSP = "HGVSp_short" ) } - if("TSG_RANK" %in% colnames(results[['variant']])){ + + if ("TSG_RANK" %in% colnames(results[['variant']])) { results[['variant']] <- results[['variant']] |> dplyr::rename( @@ -202,39 +258,75 @@ load_dna_variants <- function( ) } - if("BIOMARKER_MATCH" %in% colnames(calls) & - "VAR_ID" %in% colnames(calls)){ + if (vartype == 'cna') { - biomarker_set <- - calls |> - dplyr::filter(!is.na(.data$BIOMARKER_MATCH)) - - citations <- as.data.frame( - ref_data[['biomarker']][['literature']] |> + results[['variant']] <- + results[['variant']] |> + dplyr::mutate(REFSEQ_TRANSCRIPT_ID = dplyr::if_else( + is.na(.data$REFSEQ_TRANSCRIPT_ID), + "", + as.character(.data$REFSEQ_TRANSCRIPT_ID) + )) |> + dplyr::mutate( + TRANSCRIPT_OVERLAP = paste( + .data$ENSEMBL_TRANSCRIPT_ID, + .data$REFSEQ_TRANSCRIPT_ID, + .data$TRANSCRIPT_START, + .data$TRANSCRIPT_END, + .data$TRANSCRIPT_OVERLAP_PERCENT, sep="|" + )) |> dplyr::select( - c("EVIDENCE_ID", - "LINK") - ) |> - tidyr::separate_rows( - .data$EVIDENCE_ID, sep=";" - ) |> + -c("ENSEMBL_TRANSCRIPT_ID", + "REFSEQ_TRANSCRIPT_ID", + "TRANSCRIPT_START", + "TRANSCRIPT_END")) |> dplyr::group_by( - EVIDENCE_ID - ) 
|> + dplyr::across(-c("TRANSCRIPT_OVERLAP", + "TRANSCRIPT_OVERLAP_PERCENT"))) |> dplyr::summarise( - CITATION = paste( - unique(.data$LINK), collapse = ", " - ) + TRANSCRIPT_OVERLAP = paste(.data$TRANSCRIPT_OVERLAP, collapse=", "), + MAX_TRANSCRIPT_OVERLAP_PERCENT = + max(.data$TRANSCRIPT_OVERLAP_PERCENT, na.rm = T), + .groups = "drop" ) - ) - if(NROW(biomarker_set) > 0){ - results[['biomarker_evidence']][['all']][['any']] <- + } + + if ("BIOMARKER_MATCH" %in% colnames(results[['variant']]) & + "VAR_ID" %in% colnames(results[['variant']])) { + + biomarker_set <- + results[['variant']] |> + dplyr::filter(!is.na(.data$BIOMARKER_MATCH)) + + if (NROW(biomarker_set) > 0) { + + citations <- as.data.frame( + ref_data[['biomarker']][['literature']] |> + dplyr::select( + c("EVIDENCE_ID", + "LINK") + ) |> + tidyr::separate_rows( + c("EVIDENCE_ID"), + sep=";" + ) |> + dplyr::group_by( + EVIDENCE_ID + ) |> + dplyr::summarise( + CITATION = paste( + unique(.data$LINK), collapse = ", " + ) + ) + ) + results[['biomarker_evidence']][['items']] <- as.data.frame( biomarker_set |> dplyr::select( c("VAR_ID", - "BIOMARKER_MATCH"), + "VARIANT_CLASS", + "BIOMARKER_MATCH"), ) |> dplyr::distinct() |> tidyr::separate_rows( @@ -247,8 +339,15 @@ load_dna_variants <- function( "BIOMARKER_MATCHTYPE"), sep = "\\|" ) |> - dplyr::rename(BIOMARKER_MATCH = BIOMARKER_MATCHTYPE) |> + dplyr::mutate(VARIANT_ID = as.character(.data$VARIANT_ID)) |> + dplyr::left_join( + dplyr::select( + ref_data[['biomarker']][['variant']], + c("VARIANT_ID", "ENTREZGENE","BIOMARKER_SOURCE")), + by = c("VARIANT_ID","BIOMARKER_SOURCE")) |> + dplyr::rename(BIOMARKER_MATCH = .data$BIOMARKER_MATCHTYPE) |> dplyr::mutate(BIOMARKER_RESOLUTION = dplyr::case_when( + stringr::str_detect(.data$BIOMARKER_MATCH,"by_cna_segment") ~ "gene", stringr::str_detect(.data$BIOMARKER_MATCH,"by_genomic_coord") ~ "genomic", !stringr::str_detect(.data$BIOMARKER_MATCH,"by_genomic_coord") & 
stringr::str_detect(.data$BIOMARKER_MATCH,"by_hgvsp_principal") ~ "hgvsp", @@ -310,54 +409,70 @@ load_dna_variants <- function( ), by = c("EVIDENCE_ID"), relationship = "many-to-many" ) |> + dplyr::rename( + BM_VARIANT_ID = .data$VARIANT_ID, + BM_EVIDENCE_ID = .data$EVIDENCE_ID, + BM_SOURCE = .data$BIOMARKER_SOURCE, + BM_RESOLUTION = .data$BIOMARKER_RESOLUTION, + BM_MATCH = .data$BIOMARKER_MATCH, + BM_PRIMARY_SITE = .data$PRIMARY_SITE, + BM_EVIDENCE_TYPE = .data$EVIDENCE_TYPE, + BM_CANCER_TYPE = .data$CANCER_TYPE, + BM_VARIANT_ORIGIN = .data$VARIANT_ORIGIN, + BM_EVIDENCE_LEVEL = .data$EVIDENCE_LEVEL, + BM_EVIDENCE_DESCRIPTION = .data$EVIDENCE_DESCRIPTION, + BM_THERAPEUTIC_CONTEXT = .data$THERAPEUTIC_CONTEXT, + BM_CLINICAL_SIGNIFICANCE = .data$CLINICAL_SIGNIFICANCE, + BM_CITATION = .data$CITATION, + BM_RATING = .data$RATING, + BM_EVIDENCE_DIRECTION = .data$EVIDENCE_DIRECTION, + BM_MOLECULAR_PROFILE_NAME = .data$MOLECULAR_PROFILE_NAME, + BM_MOLECULAR_PROFILE_TYPE = .data$MOLECULAR_PROFILE_TYPE + ) |> + dplyr::select( + c("VAR_ID", + "VARIANT_CLASS", + "ENTREZGENE", + "BM_SOURCE", + "BM_VARIANT_ID", + "BM_EVIDENCE_ID", + "BM_EVIDENCE_TYPE", + "BM_EVIDENCE_LEVEL", + "BM_EVIDENCE_DESCRIPTION", + "BM_EVIDENCE_DIRECTION", + "BM_CLINICAL_SIGNIFICANCE", + "BM_VARIANT_ORIGIN", + "BM_CANCER_TYPE", + "BM_PRIMARY_SITE", + "BM_MATCH", + "BM_RESOLUTION"), + dplyr::everything() + ) |> + dplyr::filter( + .data$BM_VARIANT_ORIGIN == variant_origin & + .data$BM_MOLECULAR_PROFILE_TYPE == "Any") |> dplyr::distinct() ) + } - if(NROW(results[['biomarker_evidence']][['all']][['any']]) > 0){ - - for (type in pcgrr::evidence_types) { - results[['biomarker_evidence']][[type]][["any"]] <- - results[['biomarker_evidence']][['all']][['any']] |> - dplyr::filter( - .data$VARIANT_ORIGIN == variant_origin & - .data$EVIDENCE_TYPE == stringr::str_to_title(type)) - if (NROW(results[['biomarker_evidence']][[type]][["any"]]) > 0) { - results[['biomarker_evidence']][[type]][["A_B"]] <- - 
results[['biomarker_evidence']][[type]][["any"]] |> - dplyr::filter( - stringr::str_detect( - .data$EVIDENCE_LEVEL, "^(A|B|B1|B2):")) - - if (NROW(results[['biomarker_evidence']][[type]][["A_B"]]) > 0) { - results[['biomarker_evidence']][[type]][["A_B"]] <- - results[['biomarker_evidence']][[type]][["A_B"]] |> - dplyr::arrange( - .data$EVIDENCE_LEVEL, - dplyr::desc( - .data$RATING)) - } - - results[['biomarker_evidence']][[type]][["C_D_E"]] <- - results[['biomarker_evidence']][[type]][["any"]] |> - dplyr::filter( - stringr::str_detect( - .data$EVIDENCE_LEVEL, "^(C|D|E):")) - - if (NROW(results[['biomarker_evidence']][[type]][["C_D_E"]]) > 0) { - results[['biomarker_evidence']][[type]][["C_D_E"]] <- - results[['biomarker_evidence']][[type]][["C_D_E"]] |> - dplyr::arrange( - .data$EVIDENCE_LEVEL, - dplyr::desc(.data$RATING)) - } - } - } - } + if (variant_origin == "Somatic") { + results <- pcgrr::assign_acmg_tiers( + vartype = vartype, + variants_df = results$variant, + primary_site = primary_site, + biomarker_items = + results$biomarker_evidence$items + ) } + }else{ log4r_fatal("Input data does not contain 'BIOMARKER_MATCH' column - fatal") } + results[['retained_info_tags']] <- paste( + retained_cols_renamed, collapse="," + ) + return(results) } diff --git a/pcgrr/R/kataegis.R b/pcgrr/R/kataegis.R index 6e739f3c..95ec58c9 100644 --- a/pcgrr/R/kataegis.R +++ b/pcgrr/R/kataegis.R @@ -205,8 +205,9 @@ generate_report_data_kataegis <- function(variant_set, sample_name = "SampleX", build = "grch37") { - pcg_report_kataegis <- pcgrr::init_report(class = "kataegis") - if(NROW(variant_set) == 0){ + pcg_report_kataegis <- + pcgrr::init_kataegis_content() + if (NROW(variant_set) == 0) { return(pcg_report_kataegis) } diff --git a/pcgrr/R/main.R b/pcgrr/R/main.R index ad155b40..220b29ee 100644 --- a/pcgrr/R/main.R +++ b/pcgrr/R/main.R @@ -44,7 +44,7 @@ generate_pcgr_report <- if (!is.null(cna_segments_tsv)) { - if(length(cna_segments_tsv) > 0){ + if (length(cna_segments_tsv) > 
0) { invisible(assertthat::assert_that( file.exists(cna_segments_tsv), msg = paste0("Filename provided for argument 'cna_segments_tsv' (", @@ -67,7 +67,7 @@ generate_pcgr_report <- } if (!is.null(cpsr_report_fname)) { - if(length(cpsr_report_fname) > 0){ + if (length(cpsr_report_fname) > 0) { invisible(assertthat::assert_that( file.exists(cpsr_report_fname), @@ -204,7 +204,7 @@ generate_pcgr_report <- ## Estimate contribution of mutational signatures if (pcg_report[["metadata"]][["config"]][["msigs"]][["run"]] == T) { - if(NROW(pcg_report$content$snv_indel$variant_set$tsv) > 0){ + if (NROW(pcg_report$content$snv_indel$variant_set$tsv) > 0) { pcgrr::write_processed_vcf( calls = pcg_report$content$snv_indel$variant_set$tsv, sample_name = sample_name, @@ -329,7 +329,7 @@ generate_pcgr_report <- pcg_report[["content"]][["snv_indel"]][["variant_set"]][["noncoding"]] <- NULL pcg_report[["content"]][["snv_indel"]][["variant_set"]][["coding"]] <- NULL pcg_report[["content"]][["snv_indel"]][["variant_set"]][["all"]] <- NULL - if(!is.null(pcg_report[["content"]][["tumor_only"]])){ + if (!is.null(pcg_report[["content"]][["tumor_only"]])) { pcg_report[["content"]][["snv_indel"]][["variant_set"]][["tsv_unfiltered"]] <- pcg_report[["content"]][["tumor_only"]][["variant_set"]][["tsv_unfiltered"]] pcg_report[["content"]][["tumor_only"]][["variant_set"]][["tsv_unfiltered"]] <- NULL @@ -350,24 +350,16 @@ generate_pcgr_report <- #' Function that generates tiered variant sets for SNVs/InDels #' -#' @param sample_calls variant calls subject to mutational signature analysis -#' @param pcgr_data object with PCGR annotation data -#' @param sample_name sample identifier -#' @param config Object with PCGR configuration parameters -#' @param callset type of calls -#' @param biomarker_mapping_stringency quality level for biomarkers +#' @param pcg_report PCGR report object +#' @param callset Object with input calls (CNA, SNV/InDel) #' @param tier_model tier model (pcgr_acmg) #' #' @return 
pcg_report_data data frame with all report elements #' #' @export generate_report_data_snv_indel <- function( - sample_calls, - pcgr_data, - sample_name, - config, - callset = "somatic calls", - biomarker_mapping_stringency = 1, + pcg_report = NULL, + callset = NULL, tier_model = "pcgr_acmg") { pcgrr::log4r_info("------") @@ -375,8 +367,7 @@ generate_report_data_snv_indel <- function( paste0("Generating data for tiered cancer genome report - ", callset, " tier model '", tier_model, "'")) - pcg_report_snv_indel <- pcgrr::init_report(config = config, - class = "snv_indel") + pcg_report_snv_indel <- pcg_report[['content']][['snv_indel']] pcg_report_snv_indel[["eval"]] <- TRUE pcg_report_snv_indel[["variant_set"]][["all"]] <- sample_calls @@ -411,10 +402,10 @@ generate_report_data_snv_indel <- function( ## remove REGULATORY_ANNOTATION from display tags ## if regulatory annotation is not turned on - if(!is.null(config)){ - if(config[['other']][['vep_regulatory']] == FALSE){ + if (!is.null(config)) { + if (config[['other']][['vep_regulatory']] == FALSE) { for(e in c('all','tier4_display','tier5_display', - 'tsv')){ + 'tsv')) { annotation_tags[[e]] <- annotation_tags[[e]][!annotation_tags[[e]] == "REGULATORY_ANNOTATION"] } @@ -469,7 +460,7 @@ generate_report_data_snv_indel <- function( eitems = eitems_specific_tt) ## Assign putative TIER 1 variant set - pcg_report_snv_indel[["clin_eitem"]][["specific_ttype"]] <- + pcg_report_snv_indel[["clin_eitem"]][["query_ttype"]] <- biomarker_hits_snv_indels_specific$clin_eitem pcg_report_snv_indel[["variant_set"]][["tier1"]] <- biomarker_hits_snv_indels_specific$variant_set @@ -1041,7 +1032,7 @@ generate_report_data_tumor_only <- #' eitems = eitems_specific_tt) #' #' ## Assign putative TIER 1 variant set -#' pcg_report_cna[["clin_eitem"]][["specific_ttype"]] <- +#' pcg_report_cna[["clin_eitem"]][["query_ttype"]] <- #' biomarker_hits_cna_specific$clin_eitem #' pcg_report_cna[["variant_set"]][["tier1"]] <- #' 
biomarker_hits_cna_specific$variant_set @@ -1345,7 +1336,7 @@ generate_report_data_tumor_only <- # eitems = eitems_specific_tt) # # ## Assign putative TIER 1 variant set -# pcg_report_cna[["clin_eitem"]][["specific_ttype"]] <- +# pcg_report_cna[["clin_eitem"]][["query_ttype"]] <- # biomarker_hits_cna_specific$clin_eitem # pcg_report_cna[["variant_set"]][["tier1"]] <- # biomarker_hits_cna_specific$variant_set @@ -1538,7 +1529,7 @@ write_report_output <- function(report, settings[["conf"]][["visual_reporting"]][["visual_theme"]] ## Somatic/tumor report settings - if (tier_model == "pcgr_acmg"){ + if (tier_model == "pcgr_acmg") { pcgrr_tmpl <- system.file("templates", package = "pcgrr") disclaimer <- file.path(pcgrr_tmpl, "disclaimer.md") @@ -1563,7 +1554,7 @@ write_report_output <- function(report, sequencing_design <- "Tumor-Only" css_fname <- file.path(pcgrr_tmpl, "pcgr_flexdb_tumor_only.css") - if (flexdb == FALSE){ + if (flexdb == FALSE) { css_fname <- file.path(pcgrr_tmpl, "pcgr_rmarkdown_tumor_only.css") } } @@ -1612,7 +1603,7 @@ write_report_output <- function(report, toc_depth <- 3 ## Ignore collapsing menu for CPSR - if(tier_model == 'cpsr'){ + if (tier_model == 'cpsr') { toc_float <- list(collapsed = FALSE, smooth_scroll = FALSE, @@ -1622,7 +1613,7 @@ write_report_output <- function(report, ## If nonfloating TOC is chosen (PCGR/CPSR), set toc_float to FALSE nonfloating_toc <- as.logical(settings[["conf"]][["visual_reporting"]][["nonfloating_toc"]]) - if(nonfloating_toc == T){ + if (nonfloating_toc == T) { toc_float <- F } @@ -1635,7 +1626,7 @@ write_report_output <- function(report, "templates", "_header.html", package = "pcgrr") - if(tier_model == "cpsr"){ + if (tier_model == "cpsr") { header <- system.file( "templates", "_header.html", @@ -1681,28 +1672,28 @@ write_report_output <- function(report, report_strip <- report - if(tier_model != "cpsr"){ - if(!is.null(report_strip$content$rainfall)){ + if (tier_model != "cpsr") { + if 
(!is.null(report_strip$content$rainfall)) { report_strip$content$rainfall <- NULL } - if(!is.null(report_strip$content$tmb)){ + if (!is.null(report_strip$content$tmb)) { report_strip$content$tmb$tcga_tmb <- NULL } - if(!is.null(report_strip$content$clinicaltrials)){ + if (!is.null(report_strip$content$clinicaltrials)) { report_strip$content$clinicaltrials <- NULL } - if(!is.null(report_strip$content$msi)){ - if(!is.null(report_strip$content$msi$prediction)){ + if (!is.null(report_strip$content$msi)) { + if (!is.null(report_strip$content$msi$prediction)) { report_strip$content$msi$prediction$tcga_dataset <- NULL } } - if(!is.null(report_strip$content$snv_indel$disp)){ + if (!is.null(report_strip$content$snv_indel$disp)) { report_strip$content$snv_indel$disp <- NULL } - if(!is.null(report_strip$content$snv_indel$variant_set)){ - if(!is.null(report_strip$content$snv_indel$variant_set$maf)){ + if (!is.null(report_strip$content$snv_indel$variant_set)) { + if (!is.null(report_strip$content$snv_indel$variant_set$maf)) { report_strip$content$snv_indel$variant_set$maf <- NULL } } @@ -1730,13 +1721,13 @@ write_report_output <- function(report, "AF_CONTROL", "TIER") - if(!is.null(report_strip$content$snv_indel$variant_set)){ + if (!is.null(report_strip$content$snv_indel$variant_set)) { - for(o in c('tsv')){ + for(o in c('tsv')) { - if(!is.null(report_strip$content$snv_indel$variant_set[[o]])){ + if (!is.null(report_strip$content$snv_indel$variant_set[[o]])) { - if(nrow(report_strip$content$snv_indel$variant_set[[o]]) == 0){ + if (nrow(report_strip$content$snv_indel$variant_set[[o]]) == 0) { next } assertable::assert_colnames( @@ -1766,7 +1757,7 @@ write_report_output <- function(report, ## NOTE: set max size of report object to 750 Mb - have not figured out ## what the exact size should be for jsonlite::toJSON to succeed/fail - if(utils::object.size(report_strip) < 750000000){ + if (utils::object.size(report_strip) < 750000000) { pcgr_json <- jsonlite::toJSON( report_strip, 
pretty = T, na = "string", @@ -1802,7 +1793,7 @@ write_report_output <- function(report, file = fnames[[output_format]], sep = "\t", col.names = T, row.names = F, quote = F) - # if(tier_model == "pcgr_acmg"){ + # if (tier_model == "pcgr_acmg") { # pcgrr::log4r_info( # paste0("Writing SNV/InDel Excel output file with ", # "PCGR annotations")) diff --git a/pcgrr/R/main2.R b/pcgrr/R/main2.R new file mode 100644 index 00000000..12bb5db4 --- /dev/null +++ b/pcgrr/R/main2.R @@ -0,0 +1,1801 @@ +#' Function that generates all contents of the cancer genome report (PCGR) +#' +#' @param yaml_fname Name of PCGR configuration file (yaml) +#' +#' @export + +generate_pcgr_report2 <- + function(yaml_fname = NULL) { + + invisible(assertthat::assert_that( + !is.null(yaml_fname), + msg = "Object 'yaml_fname' cannot be NULL" + )) + pcgrr::check_file_exists(yaml_fname) + + pcg_report <- pcgrr::init_report( + yaml_fname = yaml_fname, + report_mode = "PCGR") + + settings <- pcg_report$settings + ref_data <- pcg_report$ref_data + + callset_snv <- + pcgrr::load_somatic_snv_indel( + fname = settings$molecular_data$fname_mut_tsv, + ref_data = ref_data, + settings = settings + ) + + callset_cna <- NULL + if (settings$molecular_data$fname_cna_tsv != "None") { + callset_cna <- + pcgrr::load_somatic_cna( + fname = settings$molecular_data$fname_cna_tsv, + ref_data = ref_data, + settings = settings + ) + } + + + conf_somatic_snv <- + settings$conf$somatic_snv + conf_somatic_cna <- + settings$conf$somatic_cna + conf_other <- + settings$conf$other + assay_properties <- + settings$conf$assay_properties + sample_properties <- + settings$conf$sample_properties + + #pcgrr::log4r_info(paste0("Initializing PCGR report - sample ", sample_name)) + #pcgrr::log4r_info("------") + + # if (!is.null(cpsr_report_fname)) { + # if (length(cpsr_report_fname) > 0) { + # + # invisible(assertthat::assert_that( + # file.exists(cpsr_report_fname), + # msg = paste0("Filename provided for argument 'cpsr_report' (", + # 
cpsr_report_fname, ") does not exist"))) + # invisible(assertthat::assert_that( + # file.size(cpsr_report_fname) > 0, + # msg = paste0("File provided for argument 'cpsr_report' (", + # cpsr_report_fname, ") has a filesize of zero"))) + # } + # + # } + + ## Retrieve relevant clinical trials for the tumor type in question + + if (as.logical(settings$conf$clinicaltrials$run) == T) { + # pcg_report_trials <- + # pcgrr::generate_report_data_trials( + # ref_data = ref_data, + # settings = settings) + # ## Update genome report with trial data + # pcg_report <- + # pcgrr::update_report(pcg_report, pcg_report_trials, + # a_elem = "clinicaltrials") + } + + if (NROW(callset_snv$variant) > 0) { + + ## Perform analyses in tumor-only mode + if (assay_properties[["vcf_tumor_only"]] == TRUE) { + pcg_report_tumor_only <- + pcgrr::generate_report_data_tumor_only( + sample_calls, + sample_name, config) + + ## Generate data for SNVs/InDels + ## - + # pcg_report_snv_indel_filtered <- + # pcgrr::generate_report_data_snv_indel( + # pcg_report_tumor_only[["variant_set"]][["filtered"]], + # pcgr_data, + # sample_name, + # config, + # callset = "germline-filtered callset", + # tier_model = tier_model) + + pcg_report_tumor_only[["upset_data"]] <- + pcgrr::make_upset_plot_data( + pcg_report_tumor_only$variant_set$tsv_unfiltered, config) + num_upset_sources <- 0 + for (c in colnames(pcg_report_tumor_only[["upset_data"]])) { + if (c != "VAR_ID") { + if (sum(pcg_report_tumor_only[["upset_data"]][, c]) > 0) { + num_upset_sources <- num_upset_sources + 1 + } + } + } + if (num_upset_sources >= 2) { + pcg_report_tumor_only[["upset_plot_valid"]] <- TRUE + } + + ## Update genome report with SNV/InDels (display, tiers etc) + pcg_report <- + pcgrr::update_report(pcg_report, pcg_report_snv_indel_filtered, + a_elem = "snv_indel") + pcg_report <- + pcgrr::update_report(pcg_report, pcg_report_tumor_only, + a_elem = "tumor_only") + + ## Generate data for rainfall plot (SNVs) + pcg_report_rainfall <- + 
pcgrr::generate_report_data_rainfall( + pcg_report$content$snv_indel$variant_set$tsv, + build = pcg_report$metadata$genome_assembly) + ## Update genome report + pcg_report <- + pcgrr::update_report(pcg_report, pcg_report_rainfall, + a_elem = "rainfall") + + }else{ + ## Generate report data for SNVs/InDels + pcg_report_snv_indel <- + pcgrr::generate_report_data_snv_indel( + pcg_report, + callset = callset_snv, + tier_model = "pcgr_acmg") + + ## Update genome report + pcg_report <- pcgrr::update_report( + pcg_report, pcg_report_snv_indel, + a_elem = "snv_indel") + } + + ## Estimate contribution of mutational signatures + if (conf_somatic_snv[["mutational_signatures"]][["run"]] == T) { + + if (NROW(callset_snv$variant) > 0) { + pcg_report_signatures <- + pcgrr::generate_report_data_signatures_mp( + callset_snv = callset_snv, + settings = pcg_report$settings, + ref_data = pcg_report$ref_data) + + ## Update genome report with signature info + pcg_report <- pcgrr::update_report( + pcg_report, + pcg_report_signatures, + a_elem = "m_signature_mp") + } + + ## Generate report data for rainfall plot + pcg_report_rainfall <- + pcgrr::generate_report_data_rainfall( + variant_set = callset_snv$variant, + build = pcg_report$settings$genome_assembly) + + ## Update genome report + pcg_report <- + pcgrr::update_report(pcg_report, + pcg_report_rainfall, + a_elem = "rainfall") + + ## Generate report data for kataegis events (for WES/WGS runs) + if (stringr::str_detect( + assay_properties[["type"]], + "WGS|WES")) { + pcg_report_kataegis <- + pcgrr::generate_report_data_kataegis( + variant_set = callset_snv$variant, + sample_name = settings$sample_id, + build = settings$genome_assembly) + ## Update genome report + pcg_report <- pcgrr::update_report( + pcg_report, + pcg_report_kataegis, + a_elem = "kataegis") + } + } + + ## If assay is Tumor-Control and WES/WGS - perform MSI prediction + if (as.logical(settings$conf$somatic_snv$msi$run) == T & + 
stringr::str_detect(assay_properties[["type"]], "WGS|WES") & + as.logical(assay_properties[["vcf_tumor_only"]]) == FALSE) { + pcg_report_msi <- + pcgrr::generate_report_data_msi( + variant_set = callset_snv$variant, + ref_data = ref_data, + settings = settings) + + ## Update genome report with MSI info + pcg_report <- + pcgrr::update_report( + pcg_report, + pcg_report_msi, + a_elem = "msi") + } + + ## Generate report contents for analysis of mutational burden (TMB) + if (settings$conf$somatic_snv$tmb$run == T) { + pcg_report_tmb <- + pcgrr::generate_report_data_tmb( + pcg_report[["content"]][["snv_indel"]][["variant_set"]][["tsv"]], + pcgr_data, sample_name, config) + + ## Update genome report with TMB info + pcg_report <- pcgrr::update_report( + pcg_report, + pcg_report_tmb, + a_elem = "tmb") + } + }else{ + pcg_report[["content"]][["snv_indel"]][["zero"]] <- TRUE + pcg_report[["metadata"]][["config"]][["other"]][["list_noncoding"]] <- FALSE + } + + # if (!is.null(cpsr_report_fname)) { + # pcg_report[["content"]][["cpsr"]][['eval']] <- TRUE + # + # pcg_report[['content']][['cpsr']][['report']] <- + # jsonlite::fromJSON( + # gzfile(cpsr_report_fname) + # ) + # + # ## append report elements in pcg_report[['content']][['cpsr]][['cpsr_json']] + # } + + if (!is.null(cna_segments_tsv)) { + pcg_report_cna <- + pcgrr::generate_report_data_cna( + cna_segments_tsv, + pcgr_data, + sample_name, + config, + oncotree = pcg_report[["metadata"]][["phenotype"]][["oncotree_query"]], + transcript_overlap_pct = config[["cna"]][["cna_overlap_pct"]]) + pcg_report <- + pcgrr::update_report(pcg_report, + pcg_report_cna, + a_elem = "cna") + + } + + pcg_report_value_box <- pcgrr::generate_report_data_value_box( + pcg_report, pcgr_data, sample_name, config) + pcg_report <- pcgrr::update_report( + pcg_report, pcg_report_value_box, + a_elem = "value_box") + + for (elem in c("tier1", "tier2", "tier3", "tier4")) { + stat <- paste0("n_", elem) + 
pcg_report[["content"]][["snv_indel"]][["v_stat"]][[stat]] <- + nrow(pcg_report[["content"]][["snv_indel"]][["variant_set"]][[elem]]) + pcg_report[["content"]][["snv_indel"]][["variant_set"]][[elem]] <- NULL + } + pcg_report[["content"]][["snv_indel"]][["variant_set"]][["noncoding"]] <- NULL + pcg_report[["content"]][["snv_indel"]][["variant_set"]][["coding"]] <- NULL + pcg_report[["content"]][["snv_indel"]][["variant_set"]][["all"]] <- NULL + if (!is.null(pcg_report[["content"]][["tumor_only"]])) { + pcg_report[["content"]][["snv_indel"]][["variant_set"]][["tsv_unfiltered"]] <- + pcg_report[["content"]][["tumor_only"]][["variant_set"]][["tsv_unfiltered"]] + pcg_report[["content"]][["tumor_only"]][["variant_set"]][["tsv_unfiltered"]] <- NULL + pcg_report[["content"]][["tumor_only"]][["variant_set"]][["filtered"]] <- NULL + } + pcg_report[["content"]][["snv_indel"]][["variant_set"]][["all"]] <- NULL + pcg_report[["content"]][["cna"]][["variant_set"]][["cna_print"]] <- NULL + pcg_report[["metadata"]][["phenotype"]] <- list() + gc() + + # if (!is.null(cna_plot) && cna_plot != "None") { + # pcg_report[["content"]][["cna_plot"]][["png"]] <- cna_plot + # pcg_report[["content"]][["cna_plot"]][["eval"]] <- TRUE + # } + return(pcg_report) + } + + +#' Function that generates tiered variant sets for SNVs/InDels +#' +#' @param pcg_report PCGR report object +#' @param callset Object with input calls (CNA, SNV/InDel) +#' @param tier_model tier model (pcgr_acmg) +#' +#' @return pcg_report_data data frame with all report elements +#' +#' @export +generate_report_data_snv_indel2 <- function( + pcg_report = NULL, + callset = NULL, + tier_model = "pcgr_acmg") { + + pcgrr::log4r_info("------") + pcgrr::log4r_info( + paste0("Generating data for tiered cancer genome report - ", + " tier model '", tier_model, "'")) + + pcg_report_snv_indel <- pcg_report[['content']][['snv_indel']] + pcg_report_snv_indel[["eval"]] <- TRUE + pcg_report_snv_indel[["variant_set"]][["all"]] <- 
callset[['variant']] + + ## Get basic variant statistics (type, coding status) + call_stats <- pcgrr::variant_stats_report( + callset[['variant']], + name = "v_stat") + for (stat in c("n", "n_snv", "n_indel", "n_coding", "n_noncoding")) { + pcg_report_snv_indel[["v_stat"]][[stat]] <- + call_stats[["v_stat"]][[stat]] + } + pcgrr::log4r_info( + paste0("Number of protein-coding variants: ", + pcg_report_snv_indel[["v_stat"]][["n_coding"]])) + + # if (pcg_report_snv_indel[["v_stat"]][["n"]] > 0) { + # + # tumor_type <- + # pcg_report[["settings"]][["conf"]][["sample_properties"]][["site"]] + # + # ## Assign putative TIER 2 variant set + # for(etype in c('predictive','prognostic','diagnostic')) { + # pcg_report_snv_indel[["clin_eitem"]][["any_ttype"]][[etype]] <- + # callset[['biomarker_evidence']][[etype]] + # } + # + # pcg_report_snv_indel[["variant_set"]][["tier2"]] <- + # callset[["biomarker_evidence"]][["all"]][["any"]] |> + # dplyr::filter(EVIDENCE_TYPE == "Prognostic" | + # EVIDENCE_TYPE == "Predictive" | + # EVIDENCE_TYPE == "Diagnostic") + # + # if (NROW(pcg_report_snv_indel[["variant_set"]][["tier2"]]) > 0) { + # pcg_report_snv_indel[["variant_set"]][["tier2"]] <- + # pcg_report_snv_indel[["variant_set"]][["tier2"]] |> + # dplyr::select("VAR_ID") |> + # dplyr::distinct() |> + # dplyr::inner_join( + # pcg_report_snv_indel[["variant_set"]][["all"]], + # by = "VAR_ID") + # } + # + # ## Get all clinical evidence items that + # ## overlap query set (if tumor type is specified) + # if (tumor_type != "Cancer, NOS") { + # + # ## Assign putative TIER 1 variant set + # for(etype in c('predictive','prognostic','diagnostic')) { + # for(elevel in c('any','A_B','C_D_E')) { + # if (NROW(callset[['biomarker_evidence']][[etype]][[elevel]]) > 0) { + # pcg_report_snv_indel[["clin_eitem"]][["query_ttype"]][[etype]][[elevel]] <- + # callset[['biomarker_evidence']][[etype]][[elevel]] |> + # dplyr::filter(!is.na(PRIMARY_SITE) & PRIMARY_SITE == tumor_type) + # + # 
pcg_report_snv_indel[["clin_eitem"]][["other_ttype"]][[etype]][[elevel]] <- + # callset[['biomarker_evidence']][[etype]][[elevel]] |> + # dplyr::filter(is.na(PRIMARY_SITE) | PRIMARY_SITE != tumor_type) + # } + # } + # } + # + # pcg_report_snv_indel[["variant_set"]][["tier1"]] <- + # callset[["biomarker_evidence"]][["all"]][["any"]] |> + # dplyr::filter(PRIMARY_SITE == tumor_type & + # stringr::str_detect( + # EVIDENCE_LEVEL,"^(A|B)") & + # (EVIDENCE_TYPE == "Prognostic" | + # EVIDENCE_TYPE == "Predictive" | + # EVIDENCE_TYPE == "Diagnostic")) + # + # if (NROW(pcg_report_snv_indel[["variant_set"]][["tier1"]]) > 0) { + # pcg_report_snv_indel[["variant_set"]][["tier1"]] <- + # pcg_report_snv_indel[["variant_set"]][["tier1"]] |> + # dplyr::select("VAR_ID") |> + # dplyr::distinct() |> + # dplyr::inner_join( + # pcg_report_snv_indel[["variant_set"]][["all"]], + # by = "VAR_ID") + # + # if (NROW(pcg_report_snv_indel[["variant_set"]][["tier1"]]) > 0) { + # pcg_report_snv_indel[["variant_set"]][["tier2"]] <- + # pcg_report_snv_indel[["variant_set"]][["tier2"]] |> + # dplyr::anti_join( + # dplyr::select(pcg_report_snv_indel[["variant_set"]][["tier1"]], + # VAR_ID), + # by = "VAR_ID" + # ) + # } + # } + # } + # + # ## Remove potential overlap/redundancies and assign final + # ## TIER1/TIER2 classification + # #pcg_report_snv_indel <- pcgrr::assign_tier1_tier2_acmg(pcg_report_snv_indel) + # tier12 <- dplyr::bind_rows( + # data.frame( + # 'VAR_ID' = unique( + # pcg_report_snv_indel[["variant_set"]][["tier1"]]$VAR_ID)), + # data.frame( + # 'VAR_ID' = unique( + # pcg_report_snv_indel[["variant_set"]][["tier2"]]$VAR_ID))) + # + # ## Determine TIER 3 variant set: coding mutations in + # ## oncogenes/tumor suppressors/cancer census genes + # pcg_report_snv_indel[["variant_set"]][["tier3"]] <- + # pcg_report_snv_indel[["variant_set"]][["all"]] |> + # dplyr::filter(.data$CODING_STATUS == "coding") |> + # dplyr::filter( + # (!is.na(.data$ONCOGENE) & .data$ONCOGENE == TRUE) | + # 
(!is.na(.data$TUMOR_SUPPRESSOR) & .data$TUMOR_SUPPRESSOR == TRUE)) + # + # if (NROW(tier12) > 0 & + # NROW(pcg_report_snv_indel[["variant_set"]][["tier3"]]) > 0) { + # pcg_report_snv_indel[["variant_set"]][["tier3"]] <- + # dplyr::anti_join(pcg_report_snv_indel[["variant_set"]][["tier3"]], + # tier12, by = c("VAR_ID")) + # } + # tier123 <- tier12 + # if (nrow(pcg_report_snv_indel[["variant_set"]][["tier3"]]) > 0) { + # pcg_report_snv_indel[["variant_set"]][["tier3"]] <- + # pcg_report_snv_indel[["variant_set"]][["tier3"]] |> + # dplyr::arrange(dplyr::desc(.data$ONCOGENICITY_SCORE), + # dplyr::desc(.data$TISSUE_ASSOC_RANK), + # dplyr::desc(.data$GLOBAL_ASSOC_RANK)) + # tier123 <- tier12 |> + # dplyr::bind_rows( + # dplyr::select(pcg_report_snv_indel[["variant_set"]][["tier3"]], + # "VAR_ID")) |> + # dplyr::distinct() + # pcg_report_snv_indel[["disp"]][["tier3"]][["proto_oncogene"]] <- + # dplyr::select( + # pcg_report_snv_indel[["variant_set"]][["tier3"]], + # dplyr::any_of(annotation_tags[["tier3_display"]])) |> + # dplyr::filter(.data$ONCOGENE == TRUE & + # (is.na(.data$TUMOR_SUPPRESSOR) | + # .data$TUMOR_SUPPRESSOR == FALSE)) + # pcg_report_snv_indel[["disp"]][["tier3"]][["tumor_suppressor"]] <- + # dplyr::select( + # pcg_report_snv_indel[["variant_set"]][["tier3"]], + # dplyr::any_of(annotation_tags[["tier3_display"]])) |> + # dplyr::filter(!is.na(.data$TUMOR_SUPPRESSOR) & + # .data$TUMOR_SUPPRESSOR == TRUE) + # } + # + # ## Determine TIER 4: Other coding mutations + # pcg_report_snv_indel[["variant_set"]][["tier4"]] <- + # dplyr::select(pcg_report_snv_indel[["variant_set"]][["all"]], + # dplyr::any_of(annotation_tags[["all"]])) |> + # dplyr::filter(.data$CODING_STATUS == "coding") + # if (NROW(tier123) > 0 & + # NROW(pcg_report_snv_indel[["variant_set"]][["tier4"]]) > 0) { + # pcg_report_snv_indel[["variant_set"]][["tier4"]] <- + # dplyr::anti_join(pcg_report_snv_indel[["variant_set"]][["tier4"]], + # tier123, by = c("GENOMIC_CHANGE")) + # } + # if 
(nrow(pcg_report_snv_indel[["variant_set"]][["tier4"]]) > 0) { + # pcg_report_snv_indel[["variant_set"]][["tier4"]] <- + # pcg_report_snv_indel[["variant_set"]][["tier4"]] |> + # dplyr::arrange(dplyr::desc(.data$TISSUE_ASSOC_RANK), + # dplyr::desc(.data$GLOBAL_ASSOC_RANK)) + # pcg_report_snv_indel[["disp"]][["tier4"]] <- + # dplyr::select( + # pcg_report_snv_indel[["variant_set"]][["tier4"]], + # dplyr::any_of(annotation_tags[["tier4_display"]])) + # } + # + # ## Determine non-coding mutation set + # pcg_report_snv_indel[["variant_set"]][["noncoding"]] <- + # dplyr::select(pcg_report_snv_indel[["variant_set"]][["all"]], + # dplyr::any_of(annotation_tags[["all"]])) |> + # dplyr::filter(.data$CODING_STATUS == "noncoding") + # if (nrow(pcg_report_snv_indel[["variant_set"]][["noncoding"]]) > 0) { + # if (nrow(tier123) > 0) { + # pcg_report_snv_indel[["variant_set"]][["noncoding"]] <- + # dplyr::anti_join(pcg_report_snv_indel[["variant_set"]][["noncoding"]], + # tier123, + # by = c("VAR_ID")) + # } + # pcg_report_snv_indel[["variant_set"]][["noncoding"]] <- + # pcg_report_snv_indel[["variant_set"]][["noncoding"]] |> + # dplyr::arrange(dplyr::desc(.data$OPENTARGETS_RANK)) + # pcg_report_snv_indel[["disp"]][["noncoding"]] <- + # dplyr::select( + # pcg_report_snv_indel[["variant_set"]][["noncoding"]], + # dplyr::any_of(annotation_tags[["tier5_display"]])) + # } + + ## Make TSV content with variant set + # pcg_report_snv_indel[["v_stat"]][["n_noncoding"]] <- + # pcg_report_snv_indel[["variant_set"]][["noncoding"]] |> nrow() + # pcg_report_snv_indel[["variant_set"]][["tsv"]] <- + # pcgrr::generate_tier_tsv( + # pcg_report_snv_indel[["variant_set"]], + # config, + # annotation_tags, + # sample_name = sample_name) + + # } + + return(pcg_report_snv_indel) + +} + + +#' Function that generates germline-filtered callset and PCGR +#' report statistics for a given tumor-only callsets +#' +#' @param unfiltered_sample_calls variant calls +#' @param sample_name sample identifier +#' 
#' @param pcgr_config Object with PCGR configuration parameters. The
#' \code{tumor_only} element is read for the switches \code{exclude_pon},
#' \code{exclude_likely_hom_germline}, \code{exclude_likely_het_germline},
#' \code{exclude_dbsnp_nonsomatic} and \code{exclude_nonexonic}
#'
#' @details
#' Calls are tagged with germline/somatic evidence and given a
#' \code{SOMATIC_CLASSIFICATION}. The number of calls remaining after each
#' successive germline filter (gnomAD, ClinVar, PON, homozygous,
#' heterozygous, dbSNP) is logged and recorded, and only calls classified
#' as \code{"SOMATIC"} (optionally restricted to exonic calls) are kept in
#' the filtered set. The \code{sample_name} argument is not used by the
#' function body.
#'
#' @return The list created by \code{init_tumor_only_content()} with
#' \code{eval} set to \code{TRUE}, \code{variant_set$tsv_unfiltered} (all
#' classified calls, selected columns), \code{variant_set$filtered}
#' (retained calls) and \code{v_stat} counters
#' (\code{unfiltered_n}, \code{<filter>_n_remain}, \code{<filter>_frac_remain})
#'
#' @export
generate_report_data_tumor_only <-
  function(unfiltered_sample_calls,
           sample_name,
           pcgr_config) {

    ## initiate report
    pcg_report_to <-
      pcgrr::init_tumor_only_content()

    ## assign evidence tags for germline/somatic state of variants,
    ## partially based on user-defined criteria
    ## (population allele frequency thresholds)
    vcalls <-
      pcgrr::assign_somatic_germline_evidence(
        unfiltered_sample_calls, pcgr_config)

    ## assign somatic classification based on accumulation
    ## of evidence tags and user-defined options
    vcalls <-
      pcgrr::assign_somatic_classification(vcalls, pcgr_config)

    ## Number of calls carrying a given SOMATIC_CLASSIFICATION value
    ## (same row-subsetting expression as used previously)
    n_classified <- function(class_label) {
      nrow(vcalls[vcalls$SOMATIC_CLASSIFICATION == class_label, ])
    }

    ## TRUE only when the tumor-only option is set; a missing/NA option
    ## is treated as "off" instead of raising an error in `if`
    filter_enabled <- function(option) {
      isTRUE(pcgr_config[["tumor_only"]][[option]] == TRUE)
    }

    log_remaining <- function(n) {
      pcgrr::log4r_info(paste0("Total sample calls remaining: ", n))
    }

    ## Successive germline filters: each enabled step subtracts the calls
    ## of one classification from the running total. Disabled steps carry
    ## the previous total forward unchanged.
    germline_steps <- list(
      list(stat = "remain_post_gnomad",
           class_label = "GERMLINE_GNOMAD",
           enabled = TRUE,
           msg = paste0("Excluding coinciding germline variants in ",
                        "gnomAD populations")),
      list(stat = "remain_post_clinvar",
           class_label = "GERMLINE_CLINVAR",
           enabled = TRUE,
           msg = "Excluding coinciding germline variants in ClinVar"),
      list(stat = "remain_post_pon",
           class_label = "GERMLINE_PON",
           enabled = filter_enabled("exclude_pon"),
           msg = paste0("Excluding putative germline variants found in ",
                        "calls from panel-of-normals (PON)")),
      list(stat = "remain_post_hom",
           class_label = "GERMLINE_HOMOZYGOUS",
           enabled = filter_enabled("exclude_likely_hom_germline"),
           msg = paste0("Excluding likely homozygous germline variants ",
                        "found as variants with 100% allelic fraction")),
      list(stat = "remain_post_het",
           class_label = "GERMLINE_HETEROZYGOUS",
           enabled = filter_enabled("exclude_likely_het_germline"),
           msg = paste0(
             "Excluding likely heterozygous germline variants found as ",
             "variants with 40-60% allelic fraction and recorded in ",
             "gnomAD + dbSNP")),
      list(stat = "remain_post_dbsnp",
           class_label = "GERMLINE_DBSNP",
           enabled = filter_enabled("exclude_dbsnp_nonsomatic"),
           ## message previously lacked a space before "and" and the
           ## closing parenthesis
           msg = paste0(
             "Excluding non-somatically associated dbSNP variants ",
             "(dbSNP - not recorded as somatic in DoCM/ClinVar ",
             "and not registered in COSMIC or found in TCGA)"))
    )

    gline_filter_stats <- list()
    n_remaining <- nrow(vcalls)
    for (step in germline_steps) {
      if (step[["enabled"]]) {
        pcgrr::log4r_info(step[["msg"]])
        n_remaining <- n_remaining - n_classified(step[["class_label"]])
        log_remaining(n_remaining)
      }
      gline_filter_stats[[step[["stat"]]]] <- n_remaining
    }

    ## Keep the fully classified set for the unfiltered TSV, then retain
    ## somatic calls only
    classified_calls <- vcalls
    vcalls <- vcalls |>
      dplyr::filter(.data$SOMATIC_CLASSIFICATION == "SOMATIC")

    gline_filter_stats[["remain_post_nonexonic"]] <-
      gline_filter_stats[["remain_post_dbsnp"]]
    if (filter_enabled("exclude_nonexonic")) {
      pcgrr::log4r_info(paste0("Excluding non-exonic variants"))
      vcalls <- dplyr::filter(vcalls, .data$EXONIC_STATUS == "exonic")
      log_remaining(nrow(vcalls))
      gline_filter_stats[["remain_post_nonexonic"]] <- nrow(vcalls)
    }

    pcg_report_to[["eval"]] <- TRUE
    ## Column names are passed as strings: `.data$` inside select() is
    ## deprecated. all_of() still errors on a missing column, as before.
    pcg_report_to[["variant_set"]][["tsv_unfiltered"]] <-
      classified_calls |>
      dplyr::select(dplyr::all_of(c(
        "GENOMIC_CHANGE",
        "VAR_ID",
        "DP_TUMOR",
        "AF_TUMOR",
        "SYMBOL",
        "EXONIC_STATUS",
        "CONSEQUENCE",
        "STATUS_PON",
        "STATUS_LIKELY_GERMLINE_HOMOZYGOUS",
        "STATUS_LIKELY_GERMLINE_HETEROZYGOUS",
        "STATUS_DBSNP_GERMLINE",
        "STATUS_POPFREQ_1KG_ABOVE_TOLERATED",
        "STATUS_POPFREQ_GNOMAD_ABOVE_TOLERATED",
        "STATUS_CLINVAR_GERMLINE",
        "SOMATIC_CLASSIFICATION")))
    pcg_report_to[["variant_set"]][["filtered"]] <- vcalls

    n_unfiltered <- nrow(classified_calls)
    pcg_report_to[["v_stat"]][["unfiltered_n"]] <- n_unfiltered

    for (db_filter in c("onekg", "gnomad", "dbsnp", "pon",
                        "clinvar", "hom", "het", "nonexonic")) {
      n_key <- paste0(db_filter, "_n_remain")
      n_remain <- gline_filter_stats[[paste0("remain_post_", db_filter)]]
      if (is.null(n_remain)) {
        ## No counter is tracked for this filter (currently "onekg").
        ## Assigning NULL would delete the list element and make the
        ## comparison below fail, so the initial value is left untouched.
        ## NOTE(review): confirm whether the "onekg" statistic should be
        ## dropped from the report altogether.
        n_remain <- pcg_report_to[["v_stat"]][[n_key]]
      } else {
        pcg_report_to[["v_stat"]][[n_key]] <- n_remain
      }
      if (isTRUE(n_remain > 0) && n_unfiltered > 0) {
        pcg_report_to[["v_stat"]][[paste0(db_filter, "_frac_remain")]] <-
          round((as.numeric(n_remain) / n_unfiltered) * 100, digits = 2)
      }
    }
    return(pcg_report_to)

  }
pcgr_config, +#' class = "cna") +#' log_r_homdel <- pcgr_config[["cna"]][["log_r_homdel"]] +#' log_r_gain <- pcgr_config[["cna"]][["log_r_gain"]] +#' tumor_type <- pcgr_config[["t_props"]][["tumor_type"]] +#' MEGABASE <- 1000000 +#' +#' pcgrr::log4r_info("------") +#' pcgrr::log4r_info(paste0("Generating report data for copy number segment file ", +#' cna_segments_tsv)) +#' +#' ## READ INPUT FILE, VALIDATE INPUT CHROMOSOMES AND SEGMENTS, ADD CYTOBAND INFO +#' cna_df <- utils::read.table(file = cna_segments_tsv, header = T, +#' stringsAsFactors = F, sep = "\t", +#' comment.char = "", quote = "") |> +#' dplyr::rename(chromosome = Chromosome, +#' LogR = Segment_Mean, +#' segment_start = Start, +#' segment_end = End) |> +#' dplyr::distinct() |> +#' dplyr::select( +#' c("chromosome","LogR", +#' "segment_start","segment_end")) |> +#' dplyr::mutate( +#' chromosome = stringr::str_replace( +#' .data$chromosome, "^chr", "")) |> +#' pcgrr::get_valid_chromosomes( +#' chromosome_column = "chromosome", +#' bsg = pcgr_data[["assembly"]][["bsg"]]) |> +#' pcgrr::get_valid_chromosome_segments( +#' genome_assembly = pcgr_data[["assembly"]][["grch_name"]], +#' bsg = pcgr_data[["assembly"]][["bsg"]]) |> +#' dplyr::filter(!is.na(.data$LogR)) |> +#' dplyr::mutate(LogR = round(as.numeric(.data$LogR), digits = 3)) |> +#' dplyr::mutate(SEGMENT_ID = paste0(.data$chromosome, ":", +#' .data$segment_start, "-", +#' .data$segment_end)) |> +#' pcgrr::get_cna_cytoband(pcgr_data = pcgr_data) |> +#' dplyr::mutate(SAMPLE_ID = sample_name) |> +#' pcgrr::append_ucsc_segment_link( +#' hgname = pcgr_data[["assembly"]][["hg_name"]], +#' chrom = "chromosome", +#' start = "segment_start", +#' end = "segment_end") |> +#' dplyr::mutate( +#' SEGMENT_LENGTH_MB = +#' round((as.numeric((.data$segment_end - .data$segment_start) / +#' MEGABASE)), +#' digits = 5)) |> +#' dplyr::rename(SEGMENT = .data$SEGMENT_LINK, LOG_R = .data$LogR) +#' +#' ## MAKE SIMPLE SEGMENTS DATA FRAME FOR FILTERING IN REPORT +#' cna_segments 
<- cna_df |> +#' dplyr::select(.data$SEGMENT, +#' .data$SEGMENT_LENGTH_MB, +#' .data$CYTOBAND, +#' .data$LOG_R, +#' .data$EVENT_TYPE) |> +#' dplyr::distinct() +#' +#' #### FIND AND APPEND GENCODE TRANSCRIPTS THAT OVERLAP +#' cna_transcript_df <- +#' pcgrr::get_cna_overlapping_transcripts( +#' cna_df, pcgr_data = pcgr_data) +#' #get_cna_overlapping_transcripts( +#' # cna_df, pcgr_data = pcgr_data) +#' +#' #### GENERATE DATAFRAME OF UNIQUE TRANSCRIPT-CNA SEGMENTS FOR OUTPUT TSV +#' cna_transcript_df_print <- cna_transcript_df |> +#' dplyr::select(.data$chrom, +#' .data$segment_start, +#' .data$segment_end, +#' .data$SEGMENT_ID, +#' .data$SEGMENT_LENGTH_MB, +#' .data$EVENT_TYPE, +#' .data$CYTOBAND, +#' .data$LOG_R, +#' .data$SAMPLE_ID, +#' .data$ensembl_gene_id, +#' .data$symbol, +#' .data$ensembl_transcript_id, +#' .data$transcript_start, +#' .data$transcript_end, +#' .data$transcript_overlap_percent, +#' .data$name, +#' .data$biotype, +#' .data$tumor_suppressor, +#' .data$oncogene, +#' .data$intogen_driver, +#' .data$chembl_compound_id, +#' .data$gencode_tag, +#' .data$gencode_release) |> +#' magrittr::set_colnames(tolower(names(.))) +#' +#' avg_transcript_overlap <- as.data.frame( +#' cna_transcript_df |> +#' dplyr::filter(.data$biotype == "protein_coding") |> +#' dplyr::group_by(.data$SEGMENT_ID, .data$symbol) |> +#' dplyr::summarise( +#' MEAN_TRANSCRIPT_CNA_OVERLAP = mean( +#' .data$transcript_overlap_percent), +#' TRANSCRIPTS = paste0(.data$ensembl_transcript_id, collapse = ", "), +#' .groups = "drop") |> +#' dplyr::rename(SYMBOL = .data$symbol) |> +#' dplyr::mutate( +#' MEAN_TRANSCRIPT_CNA_OVERLAP = +#' round(.data$MEAN_TRANSCRIPT_CNA_OVERLAP, digits = 2)) +#' ) +#' +#' cna_transcript_df <- +#' dplyr::select(cna_transcript_df, -.data$ensembl_transcript_id) |> +#' dplyr::filter(.data$biotype == "protein_coding") |> +#' dplyr::distinct() |> +#' dplyr::mutate(VAR_ID = as.character(rep(1:nrow(.)))) |> +#' magrittr::set_colnames(toupper(names(.))) |> +#' 
pcgrr::append_otargets_pheno_link( +#' pcgr_data = pcgr_data, +#' oncotree = oncotree) |> +#' dplyr::rename(OPENTARGETS_ASSOCIATIONS = +#' .data$OT_DISEASE_LINK) |> +#' dplyr::select(.data$VAR_ID, +#' .data$SEGMENT_ID, +#' .data$SYMBOL, +#' .data$ONCOGENE, +#' .data$ONCOGENE_EVIDENCE, +#' .data$TUMOR_SUPPRESSOR, +#' .data$TUMOR_SUPPRESSOR_EVIDENCE, +#' .data$CANCERGENE_SUPPORT, +#' .data$OPENTARGETS_ASSOCIATIONS, +#' .data$OPENTARGETS_RANK, +#' .data$ENTREZ_ID, +#' .data$CHROM, +#' .data$NAME, +#' .data$EVENT_TYPE, +#' .data$SEGMENT_LENGTH_MB, +#' .data$SEGMENT, +#' .data$TRANSCRIPT_OVERLAP_PERCENT, +#' .data$LOG_R) |> +#' dplyr::mutate(ENTREZ_ID = as.character(.data$ENTREZ_ID)) |> +#' dplyr::rename(GENENAME = .data$NAME, +#' TRANSCRIPT_OVERLAP = .data$TRANSCRIPT_OVERLAP_PERCENT, +#' CHROMOSOME = .data$CHROM) |> +#' dplyr::left_join(pcgr_data[["kegg"]][["pathway_links"]], +#' by = c("ENTREZ_ID" = "gene_id")) |> +#' dplyr::rename(KEGG_PATHWAY = .data$kegg_pathway_urls) +#' +#' ## Get gene annotation links +#' entrezgene_annotation_links <- +#' pcgrr::generate_annotation_link( +#' cna_transcript_df, +#' vardb = "GENE_NAME", +#' group_by_var = "VAR_ID", +#' link_key_var = "ENTREZ_ID", +#' link_display_var = "GENENAME", +#' url_prefix = "http://www.ncbi.nlm.nih.gov/gene/") +#' +#' cna_transcript_df <- cna_transcript_df |> +#' dplyr::left_join( +#' dplyr::rename(entrezgene_annotation_links, +#' GENE_NAME = .data$link), +#' by = c("VAR_ID")) |> +#' dplyr::select(.data$SEGMENT_ID, +#' .data$CHROMOSOME, +#' .data$SYMBOL, +#' .data$GENE_NAME, +#' .data$KEGG_PATHWAY, +#' .data$TUMOR_SUPPRESSOR, +#' .data$TUMOR_SUPPRESSOR_EVIDENCE, +#' .data$ONCOGENE, +#' .data$ONCOGENE_EVIDENCE, +#' .data$CANCERGENE_SUPPORT, +#' .data$OPENTARGETS_ASSOCIATIONS, +#' .data$OPENTARGETS_RANK, +#' .data$SEGMENT_LENGTH_MB, +#' .data$SEGMENT, +#' .data$EVENT_TYPE, +#' .data$LOG_R) |> +#' dplyr::distinct() |> +#' dplyr::left_join(avg_transcript_overlap, +#' by = c("SEGMENT_ID", "SYMBOL")) +#' +#' +#' 
n_cna_loss <- +#' dplyr::filter(cna_segments, .data$LOG_R <= log_r_homdel) |> +#' nrow() +#' n_cna_gain <- +#' dplyr::filter(cna_segments, .data$LOG_R >= log_r_gain) |> +#' nrow() +#' cna_segments_filtered <- cna_segments |> +#' dplyr::filter(.data$LOG_R >= log_r_gain | .data$LOG_R <= log_r_homdel) |> +#' dplyr::arrange(dplyr::desc(.data$LOG_R)) +#' pcgrr::log4r_info( +#' paste0("Detected ", nrow(cna_segments_filtered), +#' " segments subject to amplification/deletion (", +#' n_cna_loss, " deletions, ", n_cna_gain, +#' " gains according to user-defined log(2) ratio thresholds)")) +#' +#' +#' ## Get aberration sets related to tumor suppressor genes +#' ## /oncogenes/drug targets +#' onco_ts_sets <- +#' get_oncogene_tsgene_target_sets( +#' cna_transcript_df, +#' transcript_overlap_pct = transcript_overlap_pct, +#' log_r_homdel = log_r_homdel, +#' log_r_gain = log_r_gain, +#' tumor_type = tumor_type, +#' pcgr_data = pcgr_data) +#' +#' ## load all clinical evidence items () +#' eitems_any_tt <- pcgrr::load_eitems( +#' eitems_raw = pcgr_data$biomarkers, +#' alteration_types = "CNA", +#' ontology = +#' pcgr_data$phenotype$oncotree, +#' origin = "Somatic", +#' tumor_type_specificity = "any") +#' +#' +#' +#' ## Get all clinical evidence items that are related to +#' ## tumor suppressor genes/oncogenes/drug targets (NOT tumor-type specific) +#' biomarker_hits_cna_any <- +#' pcgrr::get_clin_assocs_cna( +#' onco_ts_sets, +#' annotation_tags = pcgr_data$annotation_tags, +#' eitems = eitems_any_tt) +#' +#' pcg_report_cna[["clin_eitem"]][["any_ttype"]] <- +#' biomarker_hits_cna_any[["clin_eitem"]] +#' pcg_report_cna[["variant_set"]][["tier2"]] <- +#' biomarker_hits_cna_any$variant_set +#' +#' ## Get all clinical evidence items that +#' ## overlap query set (if tumor type is specified) +#' if (tumor_type != "Cancer, NOS") { +#' +#' ## load tumor-type specific evidence items () +#' eitems_specific_tt <- pcgrr::load_eitems( +#' eitems_raw = pcgr_data$biomarkers, +#' 
alteration_types = "CNA", +#' ontology = +#' pcgr_data$phenotype$oncotree, +#' origin = "Somatic", +#' tumor_type_specificity = "specific", +#' tumor_type = tumor_type) +#' +#' biomarker_hits_cna_specific <- +#' pcgrr::get_clin_assocs_cna( +#' onco_ts_sets, +#' annotation_tags = pcgr_data$annotation_tags, +#' eitems = eitems_specific_tt) +#' +#' ## Assign putative TIER 1 variant set +#' pcg_report_cna[["clin_eitem"]][["query_ttype"]] <- +#' biomarker_hits_cna_specific$clin_eitem +#' pcg_report_cna[["variant_set"]][["tier1"]] <- +#' biomarker_hits_cna_specific$variant_set +#' } +#' +#' pcg_report_cna[["eval"]] <- T +#' pcg_report_cna[["variant_set"]][["tsv"]] <- +#' cna_transcript_df_print +#' pcg_report_cna[["v_stat"]][["n_cna_gain"]] <- +#' n_cna_gain +#' pcg_report_cna[["v_stat"]][["n_cna_loss"]] <- +#' n_cna_loss +#' pcg_report_cna[["disp"]][["segment"]] <- +#' cna_segments_filtered +#' pcg_report_cna[["disp"]][["oncogene_gain"]] <- +#' onco_ts_sets[["oncogene_gain"]] +#' pcg_report_cna[["disp"]][["tsgene_loss"]] <- +#' onco_ts_sets[["tsgene_loss"]] +#' pcg_report_cna[["disp"]][["other_target"]] <- +#' onco_ts_sets[["other_target"]] +#' +#' +#' pcg_report_cna <- +#' pcgrr::assign_tier1_tier2_acmg_cna(pcg_report_cna) +#' +#' return(pcg_report_cna) +#' } +#' + +#' Function that annotates CNV segment files +#' +#' param yaml_fname PCGR yaml file +#' param ref_data PCGR/CPSR reference data object +#' +#' export +# generate_report_data_cna2 <- +# function(yaml_fname, +# ref_data) { +# +# ## 1. 
Validate CNA segments tsv +# ## - check file exists +# ## - check columns necessary +# ## - check types +# +# invisible( +# assertthat::assert_that( +# file.exists(cna_segments_tsv), +# msg = paste0("File 'cna_segments_tsv' (", +# cna_segments_tsv, ") does not exist"))) +# pcg_report_cna <- pcgrr::init_report( +# yaml_fname, report_mode = "PCGR") +# #log_r_homdel <- pcgr_config[["cna"]][["log_r_homdel"]] +# #log_r_gain <- pcgr_config[["cna"]][["log_r_gain"]] +# tumor_type <- pcgr_config[["t_props"]][["tumor_type"]] +# MEGABASE <- 1000000 +# +# pcgrr::log4r_info("------") +# pcgrr::log4r_info(paste0("Generating report data for copy number segment file ", +# cna_segments_tsv)) +# +# # ## READ INPUT FILE, VALIDATE INPUT CHROMOSOMES AND SEGMENTS, ADD CYTOBAND INFO +# # cna_df <- utils::read.table(file = cna_segments_tsv, header = T, +# # stringsAsFactors = F, sep = "\t", +# # comment.char = "", quote = "") |> +# # dplyr::rename(chromosome = Chromosome, +# # LogR = Segment_Mean, +# # segment_start = Start, +# # segment_end = End) |> +# # dplyr::distinct() |> +# # dplyr::select( +# # c("chromosome","LogR", +# # "segment_start","segment_end")) |> +# # dplyr::mutate( +# # chromosome = stringr::str_replace( +# # .data$chromosome, "^chr", "")) |> +# # pcgrr::get_valid_chromosomes( +# # chromosome_column = "chromosome", +# # bsg = pcgr_data[["assembly"]][["bsg"]]) |> +# # pcgrr::get_valid_chromosome_segments( +# # genome_assembly = pcgr_data[["assembly"]][["grch_name"]], +# # bsg = pcgr_data[["assembly"]][["bsg"]]) |> +# # dplyr::filter(!is.na(.data$LogR)) |> +# # dplyr::mutate(LogR = round(as.numeric(.data$LogR), digits = 3)) |> +# # dplyr::mutate(SEGMENT_ID = paste0(.data$chromosome, ":", +# # .data$segment_start, "-", +# # .data$segment_end)) |> +# # pcgrr::get_cna_cytoband(pcgr_data = pcgr_data) |> +# dplyr::mutate(SAMPLE_ID = sample_name) |> +# pcgrr::append_ucsc_segment_link( +# hgname = pcgr_data[["assembly"]][["hg_name"]], +# chrom = "chromosome", +# start = 
"segment_start", +# end = "segment_end") |> +# # dplyr::mutate( +# # SEGMENT_LENGTH_MB = +# # round((as.numeric((.data$segment_end - .data$segment_start) / +# # MEGABASE)), +# # digits = 5)) |> +# dplyr::rename(SEGMENT = .data$SEGMENT_LINK, LOG_R = .data$LogR) +# +# ## MAKE SIMPLE SEGMENTS DATA FRAME FOR FILTERING IN REPORT +# cna_segments <- cna_df |> +# dplyr::select(.data$SEGMENT, +# .data$SEGMENT_LENGTH_MB, +# .data$CYTOBAND, +# .data$LOG_R, +# .data$EVENT_TYPE) |> +# dplyr::distinct() +# +# #### FIND AND APPEND GENCODE TRANSCRIPTS THAT OVERLAP +# cna_transcript_df <- +# pcgrr::get_cna_overlapping_transcripts( +# cna_df, pcgr_data = pcgr_data) +# #get_cna_overlapping_transcripts( +# # cna_df, pcgr_data = pcgr_data) +# +# #### GENERATE DATAFRAME OF UNIQUE TRANSCRIPT-CNA SEGMENTS FOR OUTPUT TSV +# cna_transcript_df_print <- cna_transcript_df |> +# dplyr::select(.data$chrom, +# .data$segment_start, +# .data$segment_end, +# .data$SEGMENT_ID, +# .data$SEGMENT_LENGTH_MB, +# .data$EVENT_TYPE, +# .data$CYTOBAND, +# .data$LOG_R, +# .data$SAMPLE_ID, +# .data$ensembl_gene_id, +# .data$symbol, +# .data$ensembl_transcript_id, +# .data$transcript_start, +# .data$transcript_end, +# .data$transcript_overlap_percent, +# .data$name, +# .data$biotype, +# .data$tumor_suppressor, +# .data$oncogene, +# .data$intogen_driver, +# .data$chembl_compound_id, +# .data$gencode_tag, +# .data$gencode_release) |> +# magrittr::set_colnames(tolower(names(.))) +# +# avg_transcript_overlap <- as.data.frame( +# cna_transcript_df |> +# dplyr::filter(.data$biotype == "protein_coding") |> +# dplyr::group_by(.data$SEGMENT_ID, .data$symbol) |> +# dplyr::summarise( +# MEAN_TRANSCRIPT_CNA_OVERLAP = mean( +# .data$transcript_overlap_percent), +# TRANSCRIPTS = paste0(.data$ensembl_transcript_id, collapse = ", "), +# .groups = "drop") |> +# dplyr::rename(SYMBOL = .data$symbol) |> +# dplyr::mutate( +# MEAN_TRANSCRIPT_CNA_OVERLAP = +# round(.data$MEAN_TRANSCRIPT_CNA_OVERLAP, digits = 2)) +# ) +# +# 
cna_transcript_df <- +# dplyr::select(cna_transcript_df, -.data$ensembl_transcript_id) |> +# dplyr::filter(.data$biotype == "protein_coding") |> +# dplyr::distinct() |> +# dplyr::mutate(VAR_ID = as.character(rep(1:nrow(.)))) |> +# magrittr::set_colnames(toupper(names(.))) |> +# pcgrr::append_otargets_pheno_link( +# pcgr_data = pcgr_data, +# oncotree = oncotree) |> +# dplyr::rename(OPENTARGETS_ASSOCIATIONS = +# .data$OT_DISEASE_LINK) |> +# dplyr::select(.data$VAR_ID, +# .data$SEGMENT_ID, +# .data$SYMBOL, +# .data$ONCOGENE, +# .data$ONCOGENE_EVIDENCE, +# .data$TUMOR_SUPPRESSOR, +# .data$TUMOR_SUPPRESSOR_EVIDENCE, +# .data$CANCERGENE_SUPPORT, +# .data$OPENTARGETS_ASSOCIATIONS, +# .data$OPENTARGETS_RANK, +# .data$ENTREZ_ID, +# .data$CHROM, +# .data$NAME, +# .data$EVENT_TYPE, +# .data$SEGMENT_LENGTH_MB, +# .data$SEGMENT, +# .data$TRANSCRIPT_OVERLAP_PERCENT, +# .data$LOG_R) |> +# dplyr::mutate(ENTREZ_ID = as.character(.data$ENTREZ_ID)) |> +# dplyr::rename(GENENAME = .data$NAME, +# TRANSCRIPT_OVERLAP = .data$TRANSCRIPT_OVERLAP_PERCENT, +# CHROMOSOME = .data$CHROM) |> +# dplyr::left_join(pcgr_data[["kegg"]][["pathway_links"]], +# by = c("ENTREZ_ID" = "gene_id")) |> +# dplyr::rename(KEGG_PATHWAY = .data$kegg_pathway_urls) +# +# ## Get gene annotation links +# entrezgene_annotation_links <- +# pcgrr::generate_annotation_link( +# cna_transcript_df, +# vardb = "GENE_NAME", +# group_by_var = "VAR_ID", +# link_key_var = "ENTREZ_ID", +# link_display_var = "GENENAME", +# url_prefix = "http://www.ncbi.nlm.nih.gov/gene/") +# +# cna_transcript_df <- cna_transcript_df |> +# dplyr::left_join( +# dplyr::rename(entrezgene_annotation_links, +# GENE_NAME = .data$link), +# by = c("VAR_ID")) |> +# dplyr::select(.data$SEGMENT_ID, +# .data$CHROMOSOME, +# .data$SYMBOL, +# .data$GENE_NAME, +# .data$KEGG_PATHWAY, +# .data$TUMOR_SUPPRESSOR, +# .data$TUMOR_SUPPRESSOR_EVIDENCE, +# .data$ONCOGENE, +# .data$ONCOGENE_EVIDENCE, +# .data$CANCERGENE_SUPPORT, +# .data$OPENTARGETS_ASSOCIATIONS, +# 
.data$OPENTARGETS_RANK, +# .data$SEGMENT_LENGTH_MB, +# .data$SEGMENT, +# .data$EVENT_TYPE, +# .data$LOG_R) |> +# dplyr::distinct() |> +# dplyr::left_join(avg_transcript_overlap, +# by = c("SEGMENT_ID", "SYMBOL")) +# +# +# n_cna_loss <- +# dplyr::filter(cna_segments, .data$LOG_R <= log_r_homdel) |> +# nrow() +# n_cna_gain <- +# dplyr::filter(cna_segments, .data$LOG_R >= log_r_gain) |> +# nrow() +# cna_segments_filtered <- cna_segments |> +# dplyr::filter(.data$LOG_R >= log_r_gain | .data$LOG_R <= log_r_homdel) |> +# dplyr::arrange(dplyr::desc(.data$LOG_R)) +# pcgrr::log4r_info( +# paste0("Detected ", nrow(cna_segments_filtered), +# " segments subject to amplification/deletion (", +# n_cna_loss, " deletions, ", n_cna_gain, +# " gains according to user-defined log(2) ratio thresholds)")) +# +# +# ## Get aberration sets related to tumor suppressor genes +# ## /oncogenes/drug targets +# onco_ts_sets <- +# get_oncogene_tsgene_target_sets( +# cna_transcript_df, +# transcript_overlap_pct = transcript_overlap_pct, +# log_r_homdel = log_r_homdel, +# log_r_gain = log_r_gain, +# tumor_type = tumor_type, +# pcgr_data = pcgr_data) +# +# ## load all clinical evidence items () +# eitems_any_tt <- pcgrr::load_eitems( +# eitems_raw = pcgr_data$biomarkers, +# alteration_types = "CNA", +# ontology = +# pcgr_data$phenotype$oncotree, +# origin = "Somatic", +# tumor_type_specificity = "any") +# +# +# +# ## Get all clinical evidence items that are related to +# ## tumor suppressor genes/oncogenes/drug targets (NOT tumor-type specific) +# biomarker_hits_cna_any <- +# pcgrr::get_clin_assocs_cna( +# onco_ts_sets, +# annotation_tags = pcgr_data$annotation_tags, +# eitems = eitems_any_tt) +# +# pcg_report_cna[["clin_eitem"]][["any_ttype"]] <- +# biomarker_hits_cna_any[["clin_eitem"]] +# pcg_report_cna[["variant_set"]][["tier2"]] <- +# biomarker_hits_cna_any$variant_set +# +# ## Get all clinical evidence items that +# ## overlap query set (if tumor type is specified) +# if (tumor_type != 
#' Function that generates dense and tiered annotated variant datasets
#'
#' Stacks the non-empty tier data frames of \code{variant_set}
#' (\code{tier1}-\code{tier4}, \code{noncoding}), labels each row with
#' \code{TIER}, \code{TIER_DESCRIPTION} and \code{VCF_SAMPLE_ID}, restricts
#' the columns to \code{annotation_tags[["tsv"]]} plus any preserved INFO
#' tags present in the data, and reduces the link columns
#' (\code{GENE_NAME}, \code{CLINVAR}, \code{PROTEIN_DOMAIN}) to the text
#' found between \code{>} and \code{<}.
#'
#' @param variant_set List with tiered variants
#' @param config PCGR configuration settings; \code{preserved_info_tags}
#' (comma-separated string, or "None") is read from it
#' @param annotation_tags List with display columns; element \code{tsv}
#' gives the output columns
#' @param sample_name Sample identifier
#'
#' @return tsv_variants data frame with tier-annotated list of
#' variants for tab-separated output (zero rows when all tiers are empty)
#'
#' @export
generate_tier_tsv <- function(variant_set,
                              config,
                              annotation_tags,
                              sample_name = "test") {

  ## Optional user-defined INFO tags to carry over to the TSV
  tags <- NULL
  preserved_tags <- config[["preserved_info_tags"]]
  if (!is.null(preserved_tags) && isTRUE(preserved_tags != "None")) {
    tags <- stringr::str_trim(
      stringr::str_split(preserved_tags, pattern = ",")[[1]])
  }

  pcgrr::log4r_info(paste0(
    "Generating tiered set of result variants for output",
    " in tab-separated values (TSV) file"))

  ## Tier key -> TIER label and TIER_DESCRIPTION
  tier_labels <- list(
    tier1 = c("TIER 1", "Variants of strong clinical significance"),
    tier2 = c("TIER 2", "Variants of potential clinical significance"),
    tier3 = c("TIER 3", "Variants of uncertain significance"),
    tier4 = c("TIER 4", "Other coding mutation"),
    noncoding = c("NONCODING", "Noncoding mutation")
  )

  tier_tables <- list()
  for (tier in names(tier_labels)) {
    tierset <- variant_set[[tier]]
    ## A missing tier (NULL) is skipped just like an empty one;
    ## nrow(NULL) > 0 would otherwise fail inside `if`
    if (!is.data.frame(tierset) || nrow(tierset) == 0) {
      next
    }
    tierset$VCF_SAMPLE_ID <- sample_name
    tierset$TIER_DESCRIPTION <- tier_labels[[tier]][2]
    tierset$TIER <- tier_labels[[tier]][1]

    ## Preserved tags are only emitted when present in this tier
    tsv_columns <- c(annotation_tags[["tsv"]],
                     tags[tags %in% colnames(tierset)])

    tier_tables[[tier]] <- tierset |>
      dplyr::select(dplyr::any_of(tsv_columns)) |>
      dplyr::distinct()
  }

  tsv_variants <- dplyr::bind_rows(unname(tier_tables))
  if (nrow(tsv_variants) == 0) {
    ## Nothing to post-process; previously this path failed because
    ## the accumulated result was still NULL
    return(tsv_variants)
  }

  ## Keep only the text between '>' and '<' for link-formatted columns;
  ## columns dropped by any_of() above are skipped
  for (link_col in c("GENE_NAME", "CLINVAR", "PROTEIN_DOMAIN")) {
    if (!(link_col %in% colnames(tsv_variants))) {
      next
    }
    link_text <- unlist(
      lapply(stringr::str_match_all(tsv_variants[[link_col]], ">.+<"),
             paste, collapse = ","))
    tsv_variants[[link_col]] <-
      stringr::str_replace_all(link_text, ">|<", "")
  }

  if ("TCGA_FREQUENCY" %in% colnames(tsv_variants)) {
    ## NOTE(review): the pattern is reproduced exactly as it appears in
    ## this source; a bare "|" reads like a truncated alternation
    ## (presumably link markup on either side was lost) - confirm
    ## against the original file.
    tsv_variants$TCGA_FREQUENCY <-
      stringr::str_replace_all(
        tsv_variants$TCGA_FREQUENCY,
        "|", "")
  }
  tsv_variants <- tsv_variants |> dplyr::distinct()

  return(tsv_variants)
}
".flexdb.html")) + } + + + + ## Set to CPSR/germline settings as default + sequencing_design <- "Germline" + cpsr_tmpl <- system.file("templates", package = "cpsr") + disclaimer <- file.path(cpsr_tmpl, "disclaimer_predisposition.md") + markdown_input <- file.path(cpsr_tmpl, "cpsr_rmarkdown_report.Rmd") + css_fname <- file.path(cpsr_tmpl, "cpsr.css") + report_theme <- + settings[["conf"]][["visual_reporting"]][["visual_theme"]] + + ## Somatic/tumor report settings + if (tier_model == "pcgr_acmg") { + pcgrr_tmpl <- system.file("templates", package = "pcgrr") + + disclaimer <- file.path(pcgrr_tmpl, "disclaimer.md") + assay_props <- + settings[["conf"]][["assay_properties"]] + sequencing_assay <- + assay_props[["type"]] + + ## Flexdashboard layout + sequencing_design <- "Tumor-Control" + markdown_input <- file.path(pcgrr_tmpl, "pcgr_flexdb_report.Rmd") + css_fname <- file.path(pcgrr_tmpl, "pcgr_flexdb_tumor_control.css") + + ## Rmarkdown layout + if (flexdb == FALSE) { + markdown_input <- file.path(pcgrr_tmpl, "pcgr_rmarkdown_report.Rmd") + css_fname <- file.path(pcgrr_tmpl, "pcgr_rmarkdown_tumor_control.css") + } + + ## Tumor-only settings (CSS) + if (assay_props[["vcf_tumor_only"]] == T) { + sequencing_design <- "Tumor-Only" + css_fname <- file.path(pcgrr_tmpl, "pcgr_flexdb_tumor_only.css") + + if (flexdb == FALSE) { + css_fname <- file.path(pcgrr_tmpl, "pcgr_rmarkdown_tumor_only.css") + } + } + } + + if (output_format == "html") { + + if (flexdb == T & tier_model == "pcgr_acmg") { + pcgrr::log4r_info("------") + pcgrr::log4r_info( + "Writing HTML file (.html) with report contents - flexdashboard") + navbar_items <- list() + navbar_items[[1]] <- + list("title" = paste0( + "", sample_name, " | ", + report[["metadata"]][["config"]][["t_props"]][["tumor_type"]], + " | ", sequencing_design, " | ", sequencing_assay), + href = "", target = "_blank", align = "right") + navbar_items[[2]] <- + list("icon" = "fa-github", + href = "https://github.com/sigven/pcgr", target = 
"_blank", + align = "right") + + rmarkdown::render( + markdown_input, + output_format = + flexdashboard::flex_dashboard( + orientation = "rows", + favicon = system.file( + "templates","favicon-16x16.png", + package = "pcgrr"), + theme = "cosmo", + css = css_fname, + navbar = navbar_items), + output_file = fnames[["html"]], + output_dir = project_directory, + clean = T, + intermediates_dir = project_directory, + quiet = T) + }else{ + + toc_float <- + list(collapsed = TRUE, + smooth_scroll = TRUE, + print = TRUE) + toc_depth <- 3 + + ## Ignore collapsing menu for CPSR + if (tier_model == 'cpsr') { + toc_float <- + list(collapsed = FALSE, + smooth_scroll = FALSE, + print = TRUE) + } + + ## If nonfloating TOC is chosen (PCGR/CPSR), set toc_float to FALSE + nonfloating_toc <- + as.logical(settings[["conf"]][["visual_reporting"]][["nonfloating_toc"]]) + if (nonfloating_toc == T) { + toc_float <- F + } + + disclaimer <- system.file( + "templates", + "disclaimer.md", + package = "pcgrr") + + header <- system.file( + "templates", + "_header.html", + package = "pcgrr") + if (tier_model == "cpsr") { + header <- system.file( + "templates", + "_header.html", + package = "cpsr") + } + + pcgrr::log4r_info("------") + pcgrr::log4r_info(paste0( + "Writing HTML file (.html) with report contents - rmarkdown (theme = '", + report_theme,"')")) + rmarkdown::render( + markdown_input, + output_format = + rmarkdown::html_document( + theme = report_theme, + fig_width = 5, + fig_height = 4, + toc = T, + toc_depth = toc_depth, + toc_float = toc_float, + number_sections = F, + css = css_fname, + includes = + rmarkdown::includes( + in_header = header, + after_body = disclaimer)), + output_file = fnames[["html"]], + output_dir = project_directory, + clean = T, + intermediates_dir = project_directory, + quiet = T) + } + } + if (output_format == "json") { + if (!is.null(report[["cna_plot"]][["png"]])) { + report[["cna_plot"]][["png"]] <- NULL + } + if (!is.null(report[["tmb"]][["tcga_tmb"]])) { + 
report[["tmb"]][["tcga_tmb"]] <- NULL + } + pcgrr::log4r_info("------") + pcgrr::log4r_info("Writing JSON file (.json) with key report contents") + + report_strip <- report + + if (tier_model != "cpsr") { + if (!is.null(report_strip$content$rainfall)) { + report_strip$content$rainfall <- NULL + } + if (!is.null(report_strip$content$tmb)) { + report_strip$content$tmb$tcga_tmb <- NULL + } + if (!is.null(report_strip$content$clinicaltrials)) { + report_strip$content$clinicaltrials <- NULL + } + if (!is.null(report_strip$content$msi)) { + if (!is.null(report_strip$content$msi$prediction)) { + report_strip$content$msi$prediction$tcga_dataset <- NULL + } + } + + if (!is.null(report_strip$content$snv_indel$disp)) { + report_strip$content$snv_indel$disp <- NULL + } + + if (!is.null(report_strip$content$snv_indel$variant_set)) { + if (!is.null(report_strip$content$snv_indel$variant_set$maf)) { + report_strip$content$snv_indel$variant_set$maf <- NULL + } + } + + key_tsv_cols <- c("GENOMIC_CHANGE", + "VARIANT_CLASS", + "SYMBOL", + "ENTREZ_ID", + "ENSEMBL_TRANSCRIPT_ID", + "TUMOR_SUPPRESSOR", + "ONCOGENE", + "CONSEQUENCE", + "PROTEIN_CHANGE", + "PROTEIN_DOMAIN", + "CODING_STATUS", + "EXONIC_STATUS", + "HGVSp", + "MUTATION_HOTSPOT", + "DBSNPRSID", + "COSMIC_MUTATION_ID", + "CALL_CONFIDENCE", + "DP_TUMOR", + "AF_TUMOR", + "DP_CONTROL", + "AF_CONTROL", + "TIER") + + if (!is.null(report_strip$content$snv_indel$variant_set)) { + + for(o in c('tsv')) { + + if (!is.null(report_strip$content$snv_indel$variant_set[[o]])) { + + if (nrow(report_strip$content$snv_indel$variant_set[[o]]) == 0) { + next + } + assertable::assert_colnames( + report_strip$content$snv_indel$variant_set[[o]], + colnames = key_tsv_cols, + only_colnames = F, + quiet = T + ) + + report_strip$content$snv_indel$variant_set[[o]] <- + dplyr::select( + report_strip$content$snv_indel$variant_set[[o]], + dplyr::any_of(key_tsv_cols) + ) + + } + } + } + + } ## if tier_model != "cpsr" + + + size <- 
format(utils::object.size(report_strip), units = "auto") + #hsize <- R.utils::hsize.object_size(size) + pcgrr::log4r_info(paste0("Size of PCGR report object for JSON output: ", size)) + + + ## NOTE: set max size of report object to 750 Mb - have not figured out + ## what the exact size should be for jsonlite::toJSON to succeed/fail + if (utils::object.size(report_strip) < 750000000) { + + pcgr_json <- jsonlite::toJSON( + report_strip, pretty = T, na = "string", + null = "null", force = T) + write(pcgr_json, fnames[["json"]]) + gzip_command <- paste0("gzip -f ", fnames[["json"]]) + system(gzip_command, intern = F) + }else{ + pcgrr::log4r_info("JSON output not possible - report contents too large (> 750Mb)") + + } + } + + if (output_format == "snv_tsv" | output_format == "snv_tsv_unfiltered") { + output_format_slim <- stringr::str_replace(output_format, "snv_", "") + if (NROW( + report[["content"]][["snv_indel"]][["variant_set"]][[output_format_slim]]) > 0) { + pcgrr::log4r_info("------") + if (tier_model == "pcgr_acmg") { + pcgrr::log4r_info( + paste0("Writing SNV/InDel tab-separated output file with ", + "PCGR annotations - ('", + output_format_slim, "')")) + } + if (tier_model == "cpsr") { + pcgrr::log4r_info( + paste0("Writing SNV/InDel tab-separated output file ", + "with CPSR annotations - ('", + output_format_slim, "')")) + } + utils::write.table( + report[["content"]][["snv_indel"]][["variant_set"]][[output_format_slim]], + file = fnames[[output_format]], sep = "\t", col.names = T, + row.names = F, quote = F) + + # if (tier_model == "pcgr_acmg") { + # pcgrr::log4r_info( + # paste0("Writing SNV/InDel Excel output file with ", + # "PCGR annotations")) + # workbook <- openxlsx::createWorkbook() + # openxlsx::addWorksheet(workbook, + # sheetName = "SNV_INDELS") + # + # ## set automatic column widths + # openxlsx::setColWidths( + # workbook, + # sheet = "SNV_INDELS", + # cols = 1:ncol(report[["content"]][["snv_indel"]][["variant_set"]][[output_format_slim]]), + # 
widths = "auto") + # + # ## write with default Excel Table style + # openxlsx::writeDataTable( + # workbook, + # sheet = "SNV_INDELS", + # x = report[["content"]][["snv_indel"]][["variant_set"]][[output_format_slim]], + # startRow = 1, + # startCol = 1, + # colNames = TRUE, + # tableStyle = "TableStyleMedium15") + # + # openxlsx::saveWorkbook( + # workbook, + # fnames[['excel']], + # overwrite = TRUE) + # } + } + } + + if (output_format == "msigs_tsv") { + if ( + NROW(report[["content"]][["m_signature_mp"]][["result"]][["tsv"]]) > 0) { + pcgrr::log4r_info("------") + pcgrr::log4r_info(paste0( + "Writing tab-separated output file with details ", + "of contributing mutational signatures - ('tsv')")) + utils::write.table(report[["content"]][["m_signature_mp"]][["result"]][["tsv"]], + file = fnames[[output_format]], sep = "\t", col.names = T, + row.names = F, quote = F) + } + } + + if (output_format == "cna_tsv") { + if (NROW(report[["content"]][["cna"]][["variant_set"]][["tsv"]]) > 0) { + pcgrr::log4r_info("------") + pcgrr::log4r_info( + "Writing CNA tab-separated output file with PCGR annotations (.tsv.gz)") + utils::write.table(report[["content"]][["cna"]][["variant_set"]][["tsv"]], + file = fnames[["cna_tsv"]], sep = "\t", col.names = T, + row.names = F, quote = F) + gzip_command <- paste0("gzip -f ", fnames[["cna_tsv"]]) + system(gzip_command, intern = F) + } + } + +} + diff --git a/pcgrr/R/msi.R b/pcgrr/R/msi.R index f1e2a7ae..06a489b5 100644 --- a/pcgrr/R/msi.R +++ b/pcgrr/R/msi.R @@ -1,7 +1,7 @@ #' Function that predicts MSI status based on fraction of indels among calls #' -#' @param vcf_data_df data frame with somatic mutations/indels -#' @param pcgr_data object with PCGR datasets +#' @param variant_set data frame with somatic mutations/indels +#' @param ref_data PCGR reference data object #' @param msi_prediction_model statistical model for MSI prediction #' @param msi_prediction_dataset underlying dataset from TCGA used for #' development of statistical 
classifier @@ -10,40 +10,66 @@ #' @return msi_data #' #' @export -predict_msi_status <- function(vcf_data_df, pcgr_data, +predict_msi_status <- function(variant_set, + ref_data, msi_prediction_model, msi_prediction_dataset, - target_size_mb, sample_name = "Test") { + target_size_mb, + sample_name = "Test") { mutations_valid <- pcgrr::get_valid_chromosomes( - vcf_data_df, + variant_set, chromosome_column = "CHROM", - bsg = pcgr_data[["assembly"]][["bsg"]]) + bsg = ref_data[["assembly"]][["bsg"]]) mutations_valid <- mutations_valid |> - dplyr::select(.data$CHROM, .data$POS, .data$REF, .data$ALT, .data$CONSEQUENCE, .data$SYMBOL, - .data$GENOMIC_CHANGE, .data$VARIANT_CLASS, .data$PROTEIN_DOMAIN, - .data$GENE_NAME, .data$PROTEIN_CHANGE, .data$MUTATION_HOTSPOT, - .data$CLINVAR, .data$TCGA_FREQUENCY, .data$AF_TUMOR, .data$DP_TUMOR, - .data$AF_CONTROL, .data$DP_CONTROL, .data$CALL_CONFIDENCE, - .data$SIMPLEREPEATS_HIT, .data$WINMASKER_HIT) + dplyr::select( + .data$CHROM, + .data$POS, + .data$REF, + .data$ALT, + .data$CONSEQUENCE, + .data$SYMBOL, + .data$GENOMIC_CHANGE, + .data$VARIANT_CLASS, + .data$PROTEIN_DOMAIN, + .data$GENENAME, + .data$PROTEIN_CHANGE, + .data$MUTATION_HOTSPOT, + .data$CLINVAR, + .data$TCGA_FREQUENCY, + .data$AF_TUMOR, + .data$DP_TUMOR, + .data$AF_CONTROL, + .data$DP_CONTROL, + .data$CALL_CONFIDENCE, + .data$SIMPLEREPEATS_HIT, + .data$WINMASKER_HIT) vcf_df_repeatAnnotated <- mutations_valid |> - dplyr::mutate(repeatStatus = - dplyr::if_else(.data$SIMPLEREPEATS_HIT == T, - "simpleRepeat", as.character(NA))) |> - dplyr::mutate(winMaskStatus = - dplyr::if_else(.data$WINMASKER_HIT == T, - "winMaskDust", as.character(NA))) - - msi_stats <- data.frame("sample_name" = sample_name, stringsAsFactors = F) + dplyr::mutate( + repeatStatus = + dplyr::if_else( + .data$SIMPLEREPEATS_HIT == T, + "simpleRepeat", as.character(NA))) |> + dplyr::mutate( + winMaskStatus = + dplyr::if_else( + .data$WINMASKER_HIT == T, + "winMaskDust", as.character(NA))) + + msi_stats <- 
data.frame( + "sample_name" = sample_name, stringsAsFactors = F) msi_stats1 <- vcf_df_repeatAnnotated |> - dplyr::filter(!is.na(.data$repeatStatus) & (.data$VARIANT_CLASS == "insertion" | - .data$VARIANT_CLASS == "deletion")) |> + dplyr::filter( + !is.na(.data$repeatStatus) & + (.data$VARIANT_CLASS == "insertion" | + .data$VARIANT_CLASS == "deletion")) |> dplyr::summarise(repeat_indels = dplyr::n()) msi_stats2 <- vcf_df_repeatAnnotated |> - dplyr::filter(!is.na(.data$repeatStatus) & .data$VARIANT_CLASS == "SNV") |> + dplyr::filter(!is.na(.data$repeatStatus) & + .data$VARIANT_CLASS == "SNV") |> dplyr::summarise(repeat_SNVs = dplyr::n()) msi_stats3 <- vcf_df_repeatAnnotated |> @@ -57,7 +83,8 @@ predict_msi_status <- function(vcf_data_df, pcgr_data, dplyr::summarise(winmask_indels = dplyr::n()) winmask_snvs <- vcf_df_repeatAnnotated |> - dplyr::filter(!is.na(.data$winMaskStatus) & .data$VARIANT_CLASS == "SNV") |> + dplyr::filter(!is.na(.data$winMaskStatus) & + .data$VARIANT_CLASS == "SNV") |> dplyr::summarise(winmask_SNVs = dplyr::n()) winmask_tot <- vcf_df_repeatAnnotated |> @@ -71,7 +98,8 @@ predict_msi_status <- function(vcf_data_df, pcgr_data, dplyr::summarise(nonRepeat_indels = dplyr::n()) msi_stats5 <- vcf_df_repeatAnnotated |> - dplyr::filter(is.na(.data$repeatStatus) & .data$VARIANT_CLASS == "SNV") |> + dplyr::filter(is.na(.data$repeatStatus) & + .data$VARIANT_CLASS == "SNV") |> dplyr::summarise(nonRepeat_SNVs = dplyr::n()) msi_stats6 <- vcf_df_repeatAnnotated |> @@ -118,42 +146,48 @@ predict_msi_status <- function(vcf_data_df, pcgr_data, dplyr::filter( .data$SYMBOL == "MSH3" & stringr::str_detect( - .data$CONSEQUENCE, "frameshift_|missense_|splice_|stop_|start_|frame_")) |> + .data$CONSEQUENCE, + "frameshift_|missense_|splice_|stop_|start_|frame_")) |> dplyr::summarise(MSH3 = dplyr::n()) msi_stats14 <- vcf_df_repeatAnnotated |> dplyr::filter( .data$SYMBOL == "MSH6" & stringr::str_detect( - .data$CONSEQUENCE, 
"frameshift_|missense_|splice_|stop_|start_|frame_")) |> + .data$CONSEQUENCE, + "frameshift_|missense_|splice_|stop_|start_|frame_")) |> dplyr::summarise(MSH6 = dplyr::n()) msi_stats15 <- vcf_df_repeatAnnotated |> dplyr::filter( .data$SYMBOL == "PMS1" & stringr::str_detect( - .data$CONSEQUENCE, "frameshift_|missense_|splice_|stop_|start_|frame_")) |> + .data$CONSEQUENCE, + "frameshift_|missense_|splice_|stop_|start_|frame_")) |> dplyr::summarise(PMS1 = dplyr::n()) msi_stats16 <- vcf_df_repeatAnnotated |> dplyr::filter( .data$SYMBOL == "PMS2" & stringr::str_detect( - .data$CONSEQUENCE, "frameshift_|missense_|splice_|stop_|start_|frame_")) |> + .data$CONSEQUENCE, + "frameshift_|missense_|splice_|stop_|start_|frame_")) |> dplyr::summarise(PMS2 = dplyr::n()) msi_stats17 <- vcf_df_repeatAnnotated |> dplyr::filter( .data$SYMBOL == "POLE" & stringr::str_detect( - .data$CONSEQUENCE, "frameshift_|missense_|splice_|stop_|start_|frame_")) |> + .data$CONSEQUENCE, + "frameshift_|missense_|splice_|stop_|start_|frame_")) |> dplyr::summarise(POLE = dplyr::n()) msi_stats18 <- vcf_df_repeatAnnotated |> dplyr::filter( .data$SYMBOL == "POLD1" & stringr::str_detect( - .data$CONSEQUENCE, "frameshift_|missense_|splice_|stop_|start_|frame_")) |> + .data$CONSEQUENCE, + "frameshift_|missense_|splice_|stop_|start_|frame_")) |> dplyr::summarise(POLD1 = dplyr::n()) msi_stats1$sample_name <- sample_name @@ -214,9 +248,14 @@ predict_msi_status <- function(vcf_data_df, pcgr_data, msi_stats$tmb <- as.numeric(msi_stats$indelSNVs) / target_size_mb msi_stats$tmb_indel <- as.numeric(msi_stats$indels) / target_size_mb msi_stats$tmb_snv <- as.numeric(msi_stats$SNVs) / target_size_mb - for (stat in c("fracWinMaskIndels", "fracWinMaskSNVs", "fracRepeatIndels", - "fracRepeatIndels", "fracNonRepeatIndels", "fracIndels", - "tmb", "tmb_snv", "tmb_indel")) { + for (stat in c("fracWinMaskIndels", + "fracWinMaskSNVs", + "fracRepeatIndels", + "fracRepeatIndels", + "fracNonRepeatIndels", + "fracIndels", + "tmb", 
"tmb_snv", + "tmb_indel")) { if (nrow(msi_stats[is.na(msi_stats[stat]), ]) > 0) { msi_stats[is.na(msi_stats[stat]), ][stat] <- 0 } @@ -224,34 +263,45 @@ predict_msi_status <- function(vcf_data_df, pcgr_data, mmr_pol_df <- mutations_valid |> dplyr::filter( - stringr::str_detect(.data$SYMBOL, - "^(MLH1|MLH3|MSH2|MSH3|MSH6|PMS1|PMS2|POLD1|POLE)$") & - stringr::str_detect(.data$CONSEQUENCE, - "frameshift_|missense_|splice_|stop_|inframe_")) - mmr_pol_df <- dplyr::select(mmr_pol_df, -c(.data$CHROM, .data$POS, .data$REF, .data$ALT)) - mmr_pol_df <- dplyr::rename(mmr_pol_df, GENE = .data$SYMBOL) - mmr_pol_df <- mmr_pol_df |> - dplyr::select(.data$GENE, .data$CONSEQUENCE, .data$PROTEIN_CHANGE, - .data$GENE_NAME, .data$VARIANT_CLASS, .data$PROTEIN_DOMAIN, - dplyr::everything()) - - msi_predictors <- c("fracWinMaskIndels", "fracWinMaskSNVs", - "fracRepeatIndels", - "fracNonRepeatIndels", - "fracIndels", "MLH1", "MLH3", "MSH2", - "MSH3", "MSH6", "PMS1", - "PMS2", "POLD1", "POLE", "tmb", - "tmb_indel", "tmb_snv") - msi_class <- stats::predict(msi_prediction_model, - dplyr::select(msi_stats, msi_predictors)) + stringr::str_detect( + .data$SYMBOL, + "^(MLH1|MLH3|MSH2|MSH3|MSH6|PMS1|PMS2|POLD1|POLE)$") & + stringr::str_detect( + .data$CONSEQUENCE, + "frameshift_|missense_|splice_|stop_|inframe_")) |> + dplyr::select(-c("CHROM","POS","REF","ALT")) |> + dplyr::rename(GENE = .data$SYMBOL) |> + dplyr::select( + .data$GENE, + .data$CONSEQUENCE, + .data$PROTEIN_CHANGE, + .data$GENENAME, + .data$VARIANT_CLASS, + .data$PROTEIN_DOMAIN, + dplyr::everything()) + + msi_predictors <- c( + "fracWinMaskIndels", + "fracWinMaskSNVs", + "fracRepeatIndels", + "fracNonRepeatIndels", + "fracIndels", "MLH1", + "MLH3", "MSH2", + "MSH3", "MSH6", "PMS1", + "PMS2", "POLD1", + "POLE", "tmb", + "tmb_indel", "tmb_snv") + + msi_class <- stats::predict( + msi_prediction_model, + dplyr::select(msi_stats, msi_predictors)) + if (msi_class == "MSS") { msi_stats$predicted_class <- "MSS (Microsatellite stable)" - 
#msi_stats$vb <- "MSI status:\nMSS" msi_stats$vb <- "MSS" } else{ msi_stats$predicted_class <- "MSI.H (Microsatellite instability - high)" - #msi_stats$vb <- "MSI status:\nMSI - High" msi_stats$vb <- "MSI - High" } pcgrr::log4r_info(paste0("Predicted MSI status: ", @@ -268,34 +318,38 @@ predict_msi_status <- function(vcf_data_df, pcgr_data, #' Function that generates MSI prediction data for PCGR report #' -#' @param sample_calls variant calls subject to mutational signature analysis -#' @param pcgr_data object with PCGR annotation data -#' @param sample_name sample identifier -#' @param pcgr_config Object with PCGR configuration parameters +#' @param variant_set variant calls subject to MSI classification +#' @param ref_data PCGR reference data object +#' @param settings PCGR run configuration settings #' #' @export -generate_report_data_msi <- function(sample_calls, - pcgr_data, - sample_name, - pcgr_config) { +generate_report_data_msi <- function( + variant_set, + ref_data = NULL, + settings = NULL) { + + pcg_report_msi <- pcgrr::init_msi_content() - pcg_report_msi <- pcgrr::init_report(config =pcgr_config, - class = "msi") pcgrr::log4r_info("------") pcgrr::log4r_info("Predicting microsatellite instability status") - msi_sample_calls <- sample_calls |> dplyr::filter(.data$EXONIC_STATUS == "exonic") - pcgrr::log4r_info(paste0("n = ", - nrow(msi_sample_calls), - " exonic variants used for MSI prediction")) + msi_sample_calls <- variant_set |> + dplyr::filter(.data$EXONIC_STATUS == "exonic") + pcgrr::log4r_info( + paste0("n = ", + nrow(msi_sample_calls), + " exonic variants used for MSI prediction")) if (nrow(msi_sample_calls) >= 1) { pcg_report_msi[["prediction"]] <- pcgrr::predict_msi_status( - msi_sample_calls, pcgr_data, - msi_prediction_model = pcgr_data[["msi"]][["model"]][["model"]], - msi_prediction_dataset = pcgr_data[["msi"]][["model"]][["tcga_dataset"]], - target_size_mb = pcgr_config$assay_props$target_size_mb, - sample_name = sample_name) + variant_set 
= msi_sample_calls, + ref_data, + msi_prediction_model = ref_data[["msi"]][["model"]], + msi_prediction_dataset = ref_data[["msi"]][["tcga_dataset"]], + target_size_mb = + settings$conf$assay_properties$effective_target_size_mb, + sample_name = settings$sample_id) + pcg_report_msi[["eval"]] <- TRUE } else{ @@ -318,14 +372,16 @@ generate_report_data_msi <- function(sample_calls, msi_indel_fraction_plot <- function(tcga_msi_dataset, indel_fraction) { - color_vec <- utils::head(pcgrr::color_palette[["tier"]][["values"]],2) + color_vec <- utils::head( + pcgrr::color_palette[["tier"]][["values"]], 2) names(color_vec) <- c("MSS", "MSI.H") p <- ggplot2::ggplot(data = tcga_msi_dataset) + - ggplot2::geom_histogram(mapping = ggplot2::aes(x = .data$fracIndels, - color = .data$MSI_status, - fill = .data$MSI_status), - position = "dodge", binwidth = 0.01) + + ggplot2::geom_histogram( + mapping = ggplot2::aes(x = .data$fracIndels, + color = .data$MSI_status, + fill = .data$MSI_status), + position = "dodge", binwidth = 0.01) + ggplot2::ylab("Number of TCGA samples") + ggplot2::scale_fill_manual(values = color_vec) + ggplot2::scale_color_manual(values = color_vec) + diff --git a/pcgrr/R/mutation.R b/pcgrr/R/mutation.R index 75056746..04721cff 100644 --- a/pcgrr/R/mutation.R +++ b/pcgrr/R/mutation.R @@ -44,95 +44,3 @@ assign_mutation_type <- function(var_df) { return(var_df) } - -#' Function that transforms a tier-structured variant data frame -#' into a MAF-like data frame (for input to 2020plus, MutSigCV) -#' -#' @param maf_df data frame with somatic mutations -#' @param genome_seq BSgenome object -#' @param seqinfo seqinfo object - -#' @return maf_all -#' -#' @export -get_proper_maf_alleles <- function(maf_df, genome_seq, seqinfo) { - - maf_df_valid <- - pcgrr::get_valid_chromosomes(maf_df, - chromosome_column = "Chromosome", - bsg = genome_seq) - if ("end" %in% colnames(maf_df_valid)) { - maf_df_valid <- dplyr::select(maf_df_valid, -.data$end) - } - - maf_snv <- maf_df_valid |> 
- dplyr::filter(.data$Variant_Type == "SNP") |> - dplyr::mutate(REF = .data$Reference_Allele, - ALT = .data$Tumor_Seq_Allele2, POS = .data$Start_Position) - - maf_all <- maf_snv - maf_ins <- dplyr::filter(maf_df_valid, .data$Variant_Type == "INS") - maf_del <- dplyr::filter(maf_df_valid, .data$Variant_Type == "DEL") - - if (nrow(maf_del) > 0) { - ## get appropriate alleles (VCF-like) of reference and alternate (DELETIONS) - maf_del_gr <- - GenomicRanges::makeGRangesFromDataFrame(maf_del, keep.extra.columns = T, - seqinfo = seqinfo, - seqnames.field = "Chromosome", - start.field = "Start_Position", - end.field = "End_Position", - ignore.strand = T, - starts.in.df.are.0based = F) - - maf_del_flank_gr <- GenomicRanges::flank(maf_del_gr, width = 1, start = T) - maf_del_flank_seq <- Biostrings::getSeq(genome_seq, maf_del_flank_gr) - maf_del_seq <- Biostrings::getSeq(genome_seq, maf_del_gr) - vcf_alleles_alt <- - data.frame(ALT = - toupper(unlist(strsplit(toString(maf_del_flank_seq), ", "))), - stringsAsFactors = F) - vcf_alleles_ref <- - data.frame(REF = - toupper(unlist(strsplit(toString(maf_del_seq), ", "))), - stringsAsFactors = F) - vcf_alleles <- cbind(vcf_alleles_ref, vcf_alleles_alt) - vcf_alleles$REF <- paste0(vcf_alleles$ALT, vcf_alleles$REF) - maf_del <- cbind(maf_del, vcf_alleles) - maf_del$POS <- maf_del$Start_Position - 1 - - maf_all <- rbind(maf_all, maf_del) - } - - if (nrow(maf_ins) > 0) { - ## get appropriate alleles (VCF-like) of reference and alternate (INSERTIONS) - maf_ins_gr <- - GenomicRanges::makeGRangesFromDataFrame(maf_ins, - keep.extra.columns = T, - seqinfo = seqinfo, - seqnames.field = "Chromosome", - start.field = "Start_Position", - end.field = "Start_Position", - ignore.strand = T, - starts.in.df.are.0based = F) - maf_ins_seq <- Biostrings::getSeq(genome_seq, maf_ins_gr) - vcf_alleles_alt <- - data.frame(REF = - toupper(unlist(strsplit(toString(maf_ins_seq), ", "))), - stringsAsFactors = F) - maf_ins <- cbind(maf_ins, vcf_alleles_alt) - 
maf_ins$ALT <- paste0(maf_ins$REF, maf_ins$Tumor_Seq_Allele2) - maf_ins$POS <- maf_ins$Start_Position - - maf_all <- rbind(maf_all, maf_ins) - } - - - maf_all$CHROM <- stringr::str_replace(maf_all$Chromosome, "chr", "") - maf_all$GENOMIC_CHANGE <- - paste(paste(paste(paste0("g.chr", maf_all$CHROM), - maf_all$POS, sep = ":"), maf_all$REF, sep = ":"), - maf_all$ALT, sep = ">") - return(maf_all) - -} diff --git a/pcgrr/R/mutational_burden.R b/pcgrr/R/mutational_burden.R index 8b5c2469..64dcbd52 100644 --- a/pcgrr/R/mutational_burden.R +++ b/pcgrr/R/mutational_burden.R @@ -30,7 +30,7 @@ generate_report_data_tmb <- function(sample_calls, pcg_report_tmb[["eval"]] <- TRUE - if(NROW(sample_calls) > 0){ + if (NROW(sample_calls) > 0) { pcg_report_tmb[["v_stat"]][["n_tmb"]] <- sample_calls |> dplyr::filter( @@ -132,7 +132,7 @@ plot_tmb_primary_site_tcga <- function(tcga_tmb, p_site = "Liver", legend.text = ggplot2::element_text(family = "Helvetica", size = 14)) - if(tmb_estimate > 0){ + if (tmb_estimate > 0) { tmb_plot_site <- tmb_plot_site + ggplot2::geom_hline( yintercept = as.numeric(tmb_estimate), size = 0.9, diff --git a/pcgrr/R/mutational_signatures.R b/pcgrr/R/mutational_signatures.R index 5a2353f1..841debca 100644 --- a/pcgrr/R/mutational_signatures.R +++ b/pcgrr/R/mutational_signatures.R @@ -1,62 +1,84 @@ #' Function that generates mutational signatures data for PCGR report #' -#' @param vcf_fname VCF file processed with PCGR annotation pipeline - -#' possibly filtered for depth/allelic fraction -#' @param pcgr_data object with PCGR annotation data -#' @param sample_name sample identifier -#' @param pcgr_config Object with PCGR configuration parameters -#' @param type_specific logical indicating if all reference signatures are to be -#' included (F) rather than those known to be prevalent in the tumor (T) +#' @param callset_snv Somatic callset (SNV) +#' @param ref_data PCGR reference data object +#' @param settings PCGR configuration settings object #' #' @export 
generate_report_data_signatures_mp <- - function(vcf_fname, - pcgr_data, - sample_name, - pcgr_config, - type_specific = T) { + function(callset_snv = NULL, + ref_data = NULL, + settings = NULL) { + + cosmic_metadata <- + ref_data$metadata |> + dplyr::filter(.data$source_abbreviation == "cosmic_mutsigs") |> + dplyr::select(c("source_version")) |> + dplyr::mutate( + source_version = stringr::str_replace_all( + .data$source_version, "[\r\n]" , "")) pcgrr::log4r_info("------") - pcgrr::log4r_info(paste0("Identifying weighted contributions of reference ", - "mutational signatures (COSMIC v3.2) using ", - "MutationalPatterns")) - assay <- tolower(pcgr_config$assay_props$type) + pcgrr::log4r_info( + paste0("Identifying weighted contributions of reference ", + "mutational signatures (COSMIC ", + cosmic_metadata$source_version,") using ", + "MutationalPatterns")) + #assay <- tolower(pcgr_config$assay_props$type) + assay <- tolower(settings$conf$assay_properties$type) + + vcf_name_mutsig_analysis <- + file.path(settings$output_dir, + paste( + settings$sample_id, + stringi::stri_rand_strings( + 1, 15, pattern = "[A-Za-z0-9]"), + "mutational_patterns_input.vcf", + sep=".")) + + pcgrr::write_processed_vcf( + calls = callset_snv$variant, + sample_name = settings$sample_id, + output_directory = settings$output_dir, + vcf_fname = vcf_name_mutsig_analysis) pcg_report_signatures <- - pcgrr::init_report(config = pcgr_config, - class = "m_signature_mp") + pcgrr::init_m_signature_content() + + fit_signatures_to_ttype <- !as.logical( + settings$conf$somatic_snv$mutational_signatures$all_reference_signatures + ) ## Retrieve relevant signatures for the tumor in question prevalent_site_signatures <- NULL - if(type_specific == T){ + if (fit_signatures_to_ttype == T) { prevalent_site_signatures <- - pcgrr::get_prevalent_site_signatures( - site = pcgr_config[["t_props"]][["tumor_type"]], + pcgrr::get_prevalent_site_signatures2( + site = settings$conf$sample_properties$site, 
min_prevalence_pct = - pcgr_config[["msigs"]][["prevalence_reference_signatures"]], - pcgr_data = pcgr_data, + settings$conf$somatic_snv$mutational_signatures$prevalence_reference_signatures, + ref_data = ref_data, incl_poss_artifacts = - pcgr_config[["msigs"]][["include_artefact_signatures"]]) - } - if(type_specific == F){ + settings$conf$somatic_snv$mutational_signatures$include_artefact_signatures) + }else{ prevalent_site_signatures <- - pcgrr::get_prevalent_site_signatures( + pcgrr::get_prevalent_site_signatures2( site = "Any", min_prevalence_pct = - pcgr_config[["msigs"]][["prevalence_reference_signatures"]], - pcgr_data = pcgr_data, + settings$conf$somatic_snv$mutational_signatures$prevalence_reference_signatures, + ref_data = ref_data, incl_poss_artifacts = - pcgr_config[["msigs"]][["include_artefact_signatures"]]) + settings$conf$somatic_snv$mutational_signatures$include_artefact_signatures) } ## read MutationalPattern VCF file - if(file.exists(vcf_fname)){ + if (file.exists(glue::glue("{vcf_name_mutsig_analysis}.gz"))) { vcfs <- suppressMessages(suppressWarnings( MutationalPatterns::read_vcfs_as_granges( - vcf_files = vcf_fname, - sample_names = sample_name, - genome = pcgr_data[["assembly"]][["ref_genome"]], + vcf_files = glue::glue("{vcf_name_mutsig_analysis}.gz"), + sample_names = settings$sample_id, + genome = ref_data$assembly$bsg, predefined_dbs_mbs = T), ) ) @@ -66,7 +88,7 @@ generate_report_data_signatures_mp <- pcg_report_signatures[["eval"]] <- TRUE - if (length(vcfs[[1]]) >= pcgr_config[["msigs"]][["mutation_limit"]]) { + if (length(vcfs[[1]]) >= settings$conf$somatic_snv$mutational_signatures[["mutation_limit"]]) { ## assign variants to variant set pcg_report_signatures[["variant_set"]][["all"]] <- @@ -84,25 +106,27 @@ generate_report_data_signatures_mp <- mut_mat <- MutationalPatterns::mut_matrix( vcf_list = vcfs, - ref_genome = pcgr_data[["assembly"]][["ref_genome"]], + ref_genome = ref_data$assembly$bsg, extension = 1) mut_mat <- mut_mat + 
0.0001 - ## get reference signatures (COSMIC v3.2) + ## get reference signatures (COSMIC v3.4) all_reference_signatures <- MutationalPatterns::get_known_signatures( muttype = "snv", genome = stringr::str_replace( - pcgr_data[["assembly"]][["grch_name"]], "grc", "GRC" + ref_data$assembly$grch_name, "grc", "GRC" ), incl_poss_artifacts = - pcgr_config[["msigs"]][["include_artefact_signatures"]] + as.logical( + settings$conf$somatic_snv$mutational_signatures$include_artefact_signatures + ) ) ## select subset of signatures based on those prevalent in tumor type/tissue selected_sigs <- intersect( colnames(all_reference_signatures), - unique(prevalent_site_signatures$aetiology$signature_id) + unique(prevalent_site_signatures$aetiology$SIGNATURE_ID) ) selected_reference_signatures <- all_reference_signatures[, selected_sigs] @@ -124,33 +148,38 @@ generate_report_data_signatures_mp <- ## assess the relative contribution of each reference mutational signature tot <- as.data.frame( stats::setNames(reshape2::melt(colSums(fit_ref[["contribution"]])), - c("tot"))) |> - dplyr::mutate(sample_id = as.character(rownames(.))) |> - magrittr::set_rownames(NULL) + c("tot"))) + tot$sample_id <- rownames(tot) + rownames(tot) <- NULL ## add information on aetiologies, and aggregate contributions ## pr. 
aetiology contributions_per_signature <- as.data.frame(stats::setNames(reshape2::melt(fit_ref[["contribution"]]), - c("signature_id", "sample_id", + c("SIGNATURE_ID", "sample_id", "contribution_raw"))) |> - dplyr::mutate(signature_id = as.character(.data$signature_id)) |> + dplyr::mutate(SIGNATURE_ID = as.character(.data$SIGNATURE_ID)) |> dplyr::mutate(sample_id = as.character(.data$sample_id)) |> dplyr::left_join(tot, by = "sample_id") |> dplyr::mutate(prop_signature = round(as.numeric(.data$contribution_raw) / tot, digits = 3)) |> - dplyr::select(.data$signature_id, .data$sample_id, .data$prop_signature) |> + dplyr::select(.data$SIGNATURE_ID, + .data$sample_id, + .data$prop_signature) |> dplyr::filter(.data$prop_signature > 0) |> dplyr::arrange(dplyr::desc(.data$prop_signature)) |> dplyr::left_join( dplyr::select( - pcgr_data[["mutational_signature"]], - .data$signature_id, - .data$aetiology, - .data$comments, - .data$aetiology_keyword), - by = c("signature_id")) |> - dplyr::rename(group = .data$aetiology_keyword) |> + ref_data$misc$mutational_signature, + .data$SIGNATURE_ID, + .data$AETIOLOGY, + .data$COMMENTS, + .data$AETIOLOGY_KEYWORD), + by = c("SIGNATURE_ID")) |> + dplyr::rename(group = .data$AETIOLOGY_KEYWORD, + signature_id = .data$SIGNATURE_ID, + aetiology = .data$AETIOLOGY, + comments = .data$COMMENTS) |> dplyr::mutate( contribution = paste0(round(.data$prop_signature * 100, digits = 2), "%")) |> @@ -160,7 +189,7 @@ generate_report_data_signatures_mp <- contributions_per_signature |> dplyr::group_by(.data$group) |> dplyr::summarise(prop_group = sum(.data$prop_signature), - signature_id_group = paste(.data$signature_id, collapse=", "), + signature_id_group = paste(.data$SIGNATURE_ID, collapse=", "), .groups = "drop") ) @@ -187,7 +216,7 @@ generate_report_data_signatures_mp <- ## choose only signatures attributed to 25 different aetiologies missing_aetiologies <- contributions_per_signature |> dplyr::filter(is.na(.data$col)) - if(nrow(missing_aetiologies) 
> 0){ + if (nrow(missing_aetiologies) > 0) { log4r_warn(paste0("Found contributions from more than 25 aetiologies - ", "showing signatures from 25 different aetiologies only")) contributions_per_signature <- contributions_per_signature |> @@ -201,31 +230,33 @@ generate_report_data_signatures_mp <- contributions <- list() contributions[["per_group"]] <- contributions_per_group contributions[["per_signature"]] <- contributions_per_signature + tsv_data <- data.frame() ## Get output for tab-separated file ## - contribution per signature id and reference signatures used - if(!is.null(prevalent_site_signatures$aetiology) & - NROW(contributions[["per_signature"]]) > 0){ - if("signature_id" %in% colnames(prevalent_site_signatures$aetiology)){ - reference_sigs <- paste(sort(prevalent_site_signatures$aetiology$signature_id), + if (!is.null(prevalent_site_signatures$aetiology) & + NROW(contributions[["per_signature"]]) > 0) { + if ("SIGNATURE_ID" %in% colnames(prevalent_site_signatures$aetiology)) { + reference_sigs <- paste(sort(prevalent_site_signatures$aetiology$SIGNATURE_ID), collapse=",") tsv_data <- contributions[["per_signature"]] |> pcgrr::remove_cols_from_df( - cnames = c("contribution","col","aetiology","comments")) |> + cnames = c("contribution","col","AETIOLOGY","COMMENTS")) |> dplyr::mutate( - all_reference_signatures = !type_specific, - tumor_type = pcgr_config[["t_props"]][["tumor_type"]], - reference_collection = "COSMIC_v32", + all_reference_signatures = !fit_signatures_to_ttype, + tumor_type = settings$conf$sample_properties$site, + reference_collection = "COSMIC_v34", reference_signatures = reference_sigs, fitting_accuracy = - round(sim_original_reconstructed$cosine_sim * 100, digits = 1)) + round(sim_original_reconstructed$cosine_sim * 100, digits = 1)) |> + dplyr::rename(signature_id = SIGNATURE_ID) } } - vr <- vcfs[[sample_name]] + vr <- vcfs[[settings$sample_id]] GenomeInfoDb::seqlengths(vr) <- - 
GenomeInfoDb::seqlengths(pcgr_data[["assembly"]][["bsg"]])[GenomeInfoDb::seqlevels(pcgr_data[["assembly"]][["bsg"]]) %in% unique(GenomeInfoDb::seqlevels(vr))] - chromosomes <- utils::head(GenomeInfoDb::seqnames(pcgr_data[["assembly"]][["bsg"]]), 24) + GenomeInfoDb::seqlengths(ref_data$assembly$bsg)[GenomeInfoDb::seqlevels(ref_data$assembly$bsg) %in% unique(GenomeInfoDb::seqlevels(vr))] + chromosomes <- utils::head(GenomeInfoDb::seqnames(ref_data$assembly$bsg), 24) pcg_report_signatures[["result"]][["vr"]] <- vr pcg_report_signatures[["result"]][["mut_mat"]] <- mut_mat @@ -252,21 +283,23 @@ generate_report_data_signatures_mp <- nrow(pcg_report_signatures[["variant_set"]][["all"]]), ") for reconstruction of mutational signatures by ", "MutationalPatterns, limit set to ", - pcgr_config[["msigs"]][["mutation_limit"]])) + settings$conf$somatic_snv$mutational_signatures$mutation_limit)) } } } + system(glue::glue("rm -f {vcf_name_mutsig_analysis}*")) + return(pcg_report_signatures) } #' Function that retrieves prevalent signatures for a given tumor type/primary site -#' Data is collected from COSMIC v3.2. +#' Data is collected from COSMIC v3.4. 
#' #' @param site Primary tumor site #' @param custom_collection Custom collection of signatures from COSMIC -#' @param pcgr_data PCGR data object +#' @param ref_data PCGR reference data object #' @param min_prevalence_pct Minimum prevalence (pct) of signature in #' cohorts associated with primary site - #' used to select reference signatures for inclusion in signature reconstruction @@ -277,15 +310,25 @@ generate_report_data_signatures_mp <- get_prevalent_site_signatures <- function(site = "Any", custom_collection = NULL, - pcgr_data = NULL, + ref_data = NULL, min_prevalence_pct = 5, incl_poss_artifacts = T) { - if(is.null(custom_collection)){ + cosmic_metadata <- + ref_data$metadata |> + dplyr::filter(source_abbreviation == "cosmic_mutsigs") |> + dplyr::select(source_version) |> + dplyr::mutate( + source_version = stringr::str_replace_all( + source_version, "[\r\n]" , "")) + + if (is.null(custom_collection)) { pcgrr::log4r_info(paste0( "Retrieving prevalent (prevalence >= ", min_prevalence_pct, " percent) reference signatures for ", - site, ", using COSMIC v3.2 collection")) + site, ", using COSMIC ", + cosmic_metadata$source_version, + " collection")) } pcgrr::log4r_info(paste0( "Inclusion of mutational signature artefacts (e.g. sequencing artefacts): ", @@ -293,136 +336,146 @@ get_prevalent_site_signatures <- invisible( assertthat::assert_that( - !is.null(pcgr_data[["mutational_signature"]]), + !is.null(ref_data$misc$mutational_signature), msg = - "Cannot load ref. aetiologies (COSMIC v3.2) of mutational signatures")) + paste0( + "Cannot load ref. 
aetiologies (COSMIC ", + cosmic_metadata$source_version, + ") of mutational signatures"))) invisible( assertthat::assert_that( - is.data.frame(pcgr_data[["mutational_signature"]]), + is.data.frame(ref_data$misc$mutational_signature), msg = "Reference aetiologies must be of type data.frame()")) invisible( assertthat::assert_that( min_prevalence_pct == 1 | - min_prevalence_pct == 2 | min_prevalence_pct == 5 | - min_prevalence_pct == 10 | min_prevalence_pct == 15 | + min_prevalence_pct == 2 | + min_prevalence_pct == 5 | + min_prevalence_pct == 10 | + min_prevalence_pct == 15 | min_prevalence_pct == 20, msg = "Argument 'min_prevalence_pct' must be any of '0, 2, 5, 10, 15 or 20'")) valid_signature_ids <- - unique(pcgr_data[["mutational_signature"]]$signature_id) + unique(ref_data$misc$mutational_signature$SIGNATURE_ID) signatures_prevalence <- data.frame() - if(!is.null(custom_collection)){ + if (!is.null(custom_collection)) { invisible( assertthat::assert_that( is.character(custom_collection), msg = "Argument 'custom_collection' must be a character vector")) pcgrr::log4r_info(paste0( - "Retrieving reference signatures from COSMIC v3.2 collection based on user-defined collection (", + "Retrieving reference signatures from COSMIC ", + cosmic_metadata$source_version, + " collection based on user-defined collection (", paste(unique(custom_collection), collapse=", "), ")") ) i <- 1 - while(i <= length(custom_collection)){ - if(!(custom_collection[i] %in% valid_signature_ids)){ + while(i <= length(custom_collection)) { + if (!(custom_collection[i] %in% valid_signature_ids)) { log4r_warn(paste0( "Could not find specified custom signature id '", - custom_collection[i], "' in COSMIC v3.2 reference collection", + custom_collection[i], "' in COSMIC ", + cosmic_metadata$source_version, + " reference collection", " - ignoring")) } i <- i + 1 } signatures_prevalence <- - pcgr_data[["mutational_signature"]] |> - dplyr::select(c("signature_id", - "aetiology_keyword", - "aetiology", - 
"associated_signatures", - "comments")) |> - dplyr::filter(.data$signature_id %in% custom_collection) |> + ref_data$misc$mutational_signature |> + dplyr::select(c("SIGNATURE_ID", + "AETIOLOGY_KEYWORD", + "AETIOLOGY", + "ASSOCIATED_SIGNATURES", + "COMMENTS")) |> + dplyr::filter(.data$SIGNATURE_ID %in% custom_collection) |> dplyr::distinct() }else{ unique_sites_with_signature_prevalence <- - unique(pcgr_data[["mutational_signatures"]][["primary_site"]]) + unique(ref_data$misc$mutational_signature[["PRIMARY_SITE"]]) if (!(site %in% unique_sites_with_signature_prevalence)) { pcgrr::log4r_info( paste0("Primary tumor site '", site, "' ", "does not have any signatures with significant ", "prevalence - considering all")) signatures_prevalence <- - pcgr_data[["mutational_signature"]] |> - dplyr::select(.data$signature_id, - .data$aetiology_keyword, - .data$aetiology, - .data$associated_signatures, - .data$comments) |> + ref_data$misc$mutational_signature |> + dplyr::select(c("SIGNATURE_ID", + "AETIOLOGY_KEYWORD", + "AETIOLOGY", + "ASSOCIATED_SIGNATURES", + "COMMENTS")) |> dplyr::distinct() }else{ signatures_prevalence <- - pcgr_data[["mutational_signature"]] |> - dplyr::filter(.data$primary_site == site) |> - dplyr::select(.data$signature_id, - .data$primary_site, - .data$prevalence_pct, - .data$prevalence_above_5pct, - .data$prevalence_above_10pct, - .data$prevalence_above_15pct, - .data$prevalence_above_20pct, - .data$aetiology_keyword, - .data$aetiology, - .data$associated_signatures, - .data$comments) |> + ref_data$misc$mutational_signature |> + dplyr::filter(.data$PRIMARY_SITE == site) |> + dplyr::select(.data$SIGNATURE_ID, + .data$PRIMARY_SITE, + .data$PREVALENCE_PCT, + .data$PREVALENCE_ABOVE_5PCT, + .data$PREVALENCE_ABOVE_10PCT, + .data$PREVALENCE_ABOVE_15PCT, + .data$PREVALENCE_ABOVE_20PCT, + .data$AETIOLOGY_KEYWORD, + .data$AETIOLOGY, + .data$ASSOCIATED_SIGNATURES, + .data$COMMENTS) |> dplyr::distinct() if (min_prevalence_pct > 0) { if (min_prevalence_pct == 5) { 
signatures_prevalence <- signatures_prevalence |> - dplyr::filter(.data$prevalence_above_5pct == T | - is.na(.data$prevalence_above_5pct)) + dplyr::filter(.data$PREVALENCE_ABOVE_5PCT == T | + is.na(.data$PREVALENCE_ABOVE_5PCT)) }else if (min_prevalence_pct == 10) { signatures_prevalence <- signatures_prevalence |> - dplyr::filter(.data$prevalence_above_10pct == T | - is.na(.data$prevalence_above_10pct)) + dplyr::filter(.data$PREVALENCE_ABOVE_10PCT == T | + is.na(.data$PREVALENCE_ABOVE_10PCT)) } else if (min_prevalence_pct == 15) { signatures_prevalence <- signatures_prevalence |> - dplyr::filter(.data$prevalence_above_15pct == T | - is.na(.data$prevalence_above_15pct)) + dplyr::filter(.data$PREVALENCE_ABOVE_15PCT == T | + is.na(.data$PREVALENCE_ABOVE_15PCT)) }else if (min_prevalence_pct == 20) { signatures_prevalence <- signatures_prevalence |> - dplyr::filter(.data$prevalence_above_20pct == T | - is.na(.data$prevalence_above_20pct)) - }else if (min_prevalence_pct == 2 | min_prevalence_pct == 1){ + dplyr::filter(.data$PREVALENCE_ABOVE_20PCT == T | + is.na(.data$PREVALENCE_ABOVE_20PCT)) + }else if (min_prevalence_pct == 2 | min_prevalence_pct == 1) { signatures_prevalence <- signatures_prevalence |> - dplyr::filter(!is.na(.data$prevalence_pct)) |> - dplyr::filter(.data$prevalence_pct >= min_prevalence_pct) + dplyr::filter(!is.na(.data$PREVALENCE_PCT)) |> + dplyr::filter(.data$PREVALENCE_PCT >= min_prevalence_pct) } } signatures_prevalence <- signatures_prevalence |> - dplyr::select(-c(.data$primary_site, - .data$prevalence_above_5pct, - .data$prevalence_above_10pct, - .data$prevalence_above_15pct, - .data$prevalence_above_20pct)) |> + dplyr::select(-c(.data$PRIMARY_SITE, + .data$PREVALENCE_ABOVE_5PCT, + .data$PREVALENCE_ABOVE_10PCT, + .data$PREVALENCE_ABOVE_15PCT, + .data$PREVALENCE_ABOVE_20PCT)) |> dplyr::distinct() |> - dplyr::arrange(dplyr::desc(.data$prevalence_pct)) |> - dplyr::select(-.data$prevalence_pct) + dplyr::arrange(dplyr::desc(.data$PREVALENCE_PCT)) |> 
+ dplyr::select(-.data$PREVALENCE_PCT) } } - if(incl_poss_artifacts == F){ + if (incl_poss_artifacts == F) { signatures_prevalence <- signatures_prevalence |> - dplyr::filter(!stringr::str_detect(.data$aetiology_keyword,"artefact")) + dplyr::filter(!stringr::str_detect( + .data$AETIOLOGY_KEYWORD,"artefact")) } signatures_prevalence <- signatures_prevalence |> dplyr::distinct() ## Subset signature matrix - keeping only columns (signatures) ## to those defined by primary site/custom collection - sigs <- unique(signatures_prevalence$signature_id) + sigs <- unique(signatures_prevalence$SIGNATURE_ID) pcgrr::log4r_info(paste0("Limiting reference collection to signatures: ", paste(sigs, collapse = ", "))) @@ -443,11 +496,13 @@ get_prevalent_site_signatures <- #' @param build genome assembly (grch37/grch38) #' #' @export -generate_report_data_rainfall <- function(variant_set, colors = NULL, - autosomes = FALSE, build = NULL) { +generate_report_data_rainfall <- function(variant_set, + colors = NULL, + autosomes = FALSE, + build = NULL) { - pcg_report_rainfall <- pcgrr::init_report(class = "rainfall") - if(NROW(variant_set) == 0){ + pcg_report_rainfall <- pcgrr::init_rainfall_content() + if (NROW(variant_set) == 0) { return(pcg_report_rainfall) } diff --git a/pcgrr/R/reference_data.R b/pcgrr/R/reference_data.R index 32013882..2889e5e8 100644 --- a/pcgrr/R/reference_data.R +++ b/pcgrr/R/reference_data.R @@ -7,7 +7,7 @@ #' load_reference_data <- function( pcgr_db_assembly_dir = NULL, - genome_assembly = "grch38"){ + genome_assembly = "grch38") { pcgr_ref_data <- list() @@ -17,11 +17,11 @@ load_reference_data <- function( pcgr_ref_data[["assembly"]] <- list() pcgr_ref_data[["assembly"]][["grch_name"]] <- genome_assembly - pcgr_ref_data[["assembly"]][["grch_name"]] <- "hg19" + pcgr_ref_data[["assembly"]][["hg_name"]] <- "hg19" pcgr_ref_data[["assembly"]][["ref_genome"]] <- "BSgenome.Hsapiens.UCSC.hg19" if (genome_assembly == "grch38") { pcgr_ref_data[["assembly"]][["grch_name"]] 
<- genome_assembly - pcgr_ref_data[["assembly"]][["grch_name"]] <- "hg38" + pcgr_ref_data[["assembly"]][["hg_name"]] <- "hg38" pcgr_ref_data[["assembly"]][["ref_genome"]] <- "BSgenome.Hsapiens.UCSC.hg38" } @@ -36,7 +36,7 @@ load_reference_data <- function( pcgr_ref_data[['vcf_infotags']] <- data.frame() - for(t in c('vep','other')){ + for(t in c('vep','other')) { infotag_fname <- file.path( pcgr_db_assembly_dir, paste0("vcf_infotags_", t, ".tsv")) @@ -53,32 +53,32 @@ load_reference_data <- function( ) } for(cat in c('tcga','clinvar','gwas','gnomad_non_cancer', - 'dbmts','dbnsfp','panel_of_normals')){ + 'dbmts','dbnsfp','panel_of_normals')) { vcfanno_fname <- file.path( pcgr_db_assembly_dir,"variant","vcf",cat, paste0(cat,".vcfanno.vcf_info_tags.txt")) raw_lines <- readLines(vcfanno_fname) - for(l in raw_lines){ - if(startsWith(l,"##INFO")){ + for(l in raw_lines) { + if (startsWith(l,"##INFO")) { tag <- stringr::str_replace( stringr::str_match(l,"ID=[A-Za-z|_]{1,}")[,1], "ID=","") number <- NA - if(stringr::str_detect(l, "Number=1,")){ + if (stringr::str_detect(l, "Number=1,")) { number <- 1 } - if(stringr::str_detect(l, "Number=0,")){ + if (stringr::str_detect(l, "Number=0,")) { number <- 0 } type <- "String" - if(stringr::str_detect(l, "Type=Integer,")){ + if (stringr::str_detect(l, "Type=Integer,")) { type <- "Integer" } - if(stringr::str_detect(l, "Type=Float,")){ + if (stringr::str_detect(l, "Type=Float,")) { type <- "Float" } - if(stringr::str_detect(l, "Type=Flag,")){ + if (stringr::str_detect(l, "Type=Flag,")) { type <- "Flag" } description <- @@ -89,10 +89,10 @@ load_reference_data <- function( "Description=\\\"|\\\">","") category <- "pcgr_cpsr" - if(cat == "dbmts" | cat == "gnomad_non_cancer"){ + if (cat == "dbmts" | cat == "gnomad_non_cancer") { category <- "cpsr" } - if(cat == "panel_of_normals"){ + if (cat == "panel_of_normals") { category <- "pcgr" } df <- data.frame( @@ -116,6 +116,7 @@ load_reference_data <- function( 
pcgr_ref_data[["gene"]][["cpg"]] <- data.frame() pcgr_ref_data[['gene']][['gene_xref']] <- data.frame() pcgr_ref_data[['gene']][['transcript_xref']] <- data.frame() + pcgr_ref_data[['gene']][['otp_rank']] <- data.frame() cpg_tsv_fname <- file.path( pcgr_db_assembly_dir, "gene", "tsv", @@ -170,9 +171,23 @@ load_reference_data <- function( ) |> dplyr::distinct() - colnames(pcgr_ref_data[['gene']][['transcript_xref']]) <- - toupper(colnames(pcgr_ref_data[['gene']][['transcript_xref']])) + otp_rank_tsv_fname <- file.path( + pcgr_db_assembly_dir, "gene", "tsv", + "gene_transcript_xref", + "otp_rank.tsv.gz" + ) + check_file_exists(otp_rank_tsv_fname) + pcgr_ref_data[['gene']][['otp_rank']] <- as.data.frame( + readr::read_tsv( + otp_rank_tsv_fname, show_col_types = F, + na = c('.'))) |> + dplyr::filter(!is.na(.data$entrezgene)) |> + dplyr::mutate(entrezgene = as.character(.data$entrezgene)) |> + dplyr::distinct() + + colnames(pcgr_ref_data[['gene']][['otp_rank']]) <- + toupper(colnames(pcgr_ref_data[['gene']][['otp_rank']])) pcgr_ref_data[['gene']][['gene_xref']] <- as.data.frame( readr::read_tsv(gene_xref_tsv_fname, show_col_types = F)) |> @@ -193,7 +208,7 @@ load_reference_data <- function( "cancergene_evidence") ) |> dplyr::rename( - genename = name + genename = .data$name ) |> dplyr::mutate( entrezgene = as.character(.data$entrezgene) @@ -273,14 +288,14 @@ load_reference_data <- function( ## Get variant statistics for(vardb in c('clinvar','gwas','tcga', 'gnomad_non_cancer','dbmts', - 'dbnsfp')){ + 'dbnsfp')) { varstats_fname <- file.path( pcgr_db_assembly_dir, "variant", "vcf", vardb, paste0(vardb,".vcf_varstats.tsv") ) - if(file.exists(varstats_fname)){ + if (file.exists(varstats_fname)) { pcgr_ref_data[['variant']][['varstats']][[vardb]] <- as.data.frame( readr::read_tsv( @@ -346,14 +361,14 @@ load_reference_data <- function( 'mutational_signature', 'pathway', 'hotspot', - 'protein_domain')){ + 'protein_domain')) { fname_misc <- file.path( pcgr_db_assembly_dir, 
"misc", "tsv", elem, paste0(elem,".tsv.gz") ) - # if(elem == 'hotspot'){ + # if (elem == 'hotspot') { # fname_misc <- file.path( # pcgr_db_assembly_dir, "misc", "tsv", elem, # paste0(elem,".tsv.gz") @@ -409,9 +424,9 @@ load_reference_data <- function( ## 7. Biomarkers pcgr_ref_data[['biomarker']] <- list() - for(elem in c('clinical','variant','literature')){ + for(elem in c('clinical','variant','literature')) { pcgr_ref_data[['biomarker']][[elem]] <- data.frame() - for(db in c('cgi','civic')){ + for(db in c('cgi','civic')) { fname <- file.path( pcgr_db_assembly_dir, "biomarker", "tsv", @@ -420,12 +435,23 @@ load_reference_data <- function( check_file_exists(fname) bm_data <- as.data.frame( readr::read_tsv(fname, show_col_types = F, na = ".")) - if("source_id" %in% colnames(bm_data)){ + if ("source_id" %in% colnames(bm_data)) { bm_data <- bm_data |> dplyr::mutate( source_id = as.character(.data$source_id)) } + if ('entrezgene' %in% colnames(bm_data)) { + bm_data <- bm_data |> + dplyr::mutate( + entrezgene = as.character(.data$entrezgene)) + } + if ('variant_id' %in% colnames(bm_data)) { + bm_data <- bm_data |> + dplyr::mutate( + variant_id = as.character(.data$variant_id)) + } + pcgr_ref_data[['biomarker']][[elem]] <- dplyr::bind_rows( pcgr_ref_data[['biomarker']][[elem]], bm_data @@ -439,7 +465,7 @@ load_reference_data <- function( ## Metadata pcgr_ref_data[['metadata']] <- data.frame() for(dtype in c('gene','gwas','hotspot','other', - 'phenotype','biomarker','drug')){ + 'phenotype','biomarker','drug')) { fname <- file.path( pcgr_db_assembly_dir, ".METADATA", "tsv", @@ -451,12 +477,12 @@ load_reference_data <- function( dplyr::mutate(datatype = dtype) |> dplyr::mutate(wflow = dplyr::case_when( stringr::str_detect( - source_abbreviation, + .data$source_abbreviation, paste0( "^(gepa|cpg_other|maxwell2016|acmg_sf|dbmts|", "woods_dnarepair|gerp|tcga_pancan_2018|gwas_catalog)")) ~ "cpsr", stringr::str_detect( - source_abbreviation, + .data$source_abbreviation, 
"^(cytoband|mitelmandb|tcga|nci|intogen|opentargets|dgidb|pubchem)$") ~ "pcgr", TRUE ~ as.character("pcgr_cpsr") )) @@ -465,8 +491,8 @@ load_reference_data <- function( pcgr_ref_data[['metadata']] |> dplyr::bind_rows(metadata_dtype) |> dplyr::filter( - source_abbreviation != "foundation_one" & - source_abbreviation != "illumina" + .data$source_abbreviation != "foundation_one" & + .data$source_abbreviation != "illumina" ) } diff --git a/pcgrr/R/report.R b/pcgrr/R/report.R index f397b509..2e7a9dca 100644 --- a/pcgrr/R/report.R +++ b/pcgrr/R/report.R @@ -48,7 +48,7 @@ init_report <- function(yaml_fname = NULL, vcf_tag_AN <- "gnomADe_non_cancer_AN" vcf_tag_AC <- "gnomADe_non_cancer_AC" vcf_tag_NHOMALT <- "gnomADe_non_cancer_NHOMALT" - if(population != "global"){ + if (population != "global") { vcf_tag_AF <- paste0("gnomADe_non_cancer_",toupper(population),"_AF") vcf_tag_AN <- @@ -61,7 +61,7 @@ init_report <- function(yaml_fname = NULL, pop_desc_df <- report$ref_data$vcf_infotags[ report$ref_data$vcf_infotags$tag == vcf_tag_AF,] - if(NROW(pop_desc_df) == 1){ + if (NROW(pop_desc_df) == 1) { population_description <- pop_desc_df$description report[["settings"]][["conf"]][["variant_classification"]][["vcftag_gnomad_AF"]] <- vcf_tag_AF @@ -98,7 +98,7 @@ init_report <- function(yaml_fname = NULL, report[["content"]][[a_elem]] <- list() report[["content"]][[a_elem]][["eval"]] <- FALSE - if(a_elem == "tumor_purity" | a_elem == "tumor_ploidy"){ + if (a_elem == "tumor_purity" | a_elem == "tumor_ploidy") { report[["content"]][[a_elem]][["eval"]] <- TRUE } @@ -203,7 +203,7 @@ update_report <- function(report, report_data, #' #' @export init_tmb_content <- function(tcga_tmb = NULL, - config = NULL){ + config = NULL) { invisible(assertthat::assert_that(!is.null(tcga_tmb))) invisible(assertthat::assert_that(is.data.frame(tcga_tmb) & @@ -235,7 +235,7 @@ init_tmb_content <- function(tcga_tmb = NULL, # #' @return rep updated PCGR report structure - initialized for CNA content #' @export 
-init_cna_content <- function(rep = NULL){ +init_cna_content <- function(rep = NULL) { invisible(assertthat::assert_that(!is.null(rep))) invisible(assertthat::assert_that(!is.null(rep[['disp']]))) @@ -271,7 +271,7 @@ init_cna_content <- function(rep = NULL){ # #' @return rep updated PCGR report structure - initialized for SNV/InDel content #' @export -init_snv_indel_content <- function(rep = NULL){ +init_snv_indel_content <- function(rep = NULL) { invisible(assertthat::assert_that(!is.null(rep))) invisible(assertthat::assert_that(!is.null(rep[['disp']]))) @@ -306,7 +306,7 @@ init_snv_indel_content <- function(rep = NULL){ #' #' @return rep Report structure initialized for signature data #' @export -init_m_signature_content <- function(){ +init_m_signature_content <- function() { rep <- list() rep[["eval"]] <- FALSE @@ -332,14 +332,38 @@ init_m_signature_content <- function(){ return(rep) } -#init_msi_content <- function(){} -#init_kataegis_content <- function(){} +#' Function that initiates report element with MSI classification +#' +#' @export +init_msi_content <- function() { + rep <- list() + + rep[["eval"]] <- FALSE + rep[["missing_data"]] <- FALSE + rep[["prediction"]] <- list() + + return(rep) + +} + +#' Function that initiates report element with kataegis information +#' +#' @export +init_kataegis_content <- function() { + rep <- list() + + rep[["eval"]] <- FALSE + rep[["events"]] <- data.frame() + + return(rep) + +} #' Function that initiates report element with rainfall information #' #' @return rep Report structure initialized for rainfall data #' @export -init_rainfall_content <- function(){ +init_rainfall_content <- function() { rep <- list() @@ -367,7 +391,7 @@ init_rainfall_content <- function(){ #' #' @return rep Report structure initialized for tumor-only data #' @export -init_tumor_only_content <- function(){ +init_tumor_only_content <- function() { rep <- list() rep[["eval"]] <- FALSE @@ -399,7 +423,7 @@ init_tumor_only_content <- function(){ #' 
#' @return rep Report structure initialized for value box data #' @export -init_valuebox_content <- function(){ +init_valuebox_content <- function() { rep <- list() rep[["eval"]] <- FALSE @@ -431,7 +455,7 @@ init_valuebox_content <- function(){ #' #' @return rep Report structure initialized for ranked display #' @export -init_report_display_content <- function(){ +init_report_display_content <- function() { rep <- list() rep[["eval"]] <- FALSE @@ -451,7 +475,7 @@ init_report_display_content <- function(){ #' #' @return rep Report structure initialized for variant data #' @export -init_var_content <- function(){ +init_var_content <- function() { rep <- list() @@ -461,7 +485,7 @@ init_var_content <- function(){ rep[["variant_set"]] <- list() rep[["v_stat"]] <- list() rep[["zero"]] <- FALSE - for (tumorclass in c("any_ttype", "other_ttype", "specific_ttype")) { + for (tumorclass in c("any_ttype", "other_ttype", "query_ttype")) { rep[["clin_eitem"]][[tumorclass]] <- list() for (e_type in c("prognostic", "diagnostic", "predictive")) { for (e_level in c("A_B", "C_D_E", "any")) { @@ -477,7 +501,7 @@ init_var_content <- function(){ #' #' @return rep Report structure initialized for germline data (CPSR) #' @export -init_germline_content <- function(){ +init_germline_content <- function() { rep <- list() rep[["max_dt_rows"]] <- 0 @@ -498,12 +522,12 @@ init_germline_content <- function(){ rep[["clin_eitem"]] <- list() for (evidence_type in pcgrr::evidence_types) { rep[["clin_eitem"]][[evidence_type]] <- list() - for(level in pcgrr::evidence_levels){ + for(level in pcgrr::evidence_levels) { rep[["clin_eitem"]][[evidence_type]][[level]] <- data.frame() } rep[['clin_eitem']][['all']] <- list() - for(level in pcgrr::evidence_levels){ + for(level in pcgrr::evidence_levels) { rep[["clin_eitem"]][['all']][[level]] <- data.frame() } @@ -532,7 +556,7 @@ init_germline_content <- function(){ #' @export load_yaml <- function(yml_fname, report_mode = "CPSR") { - 
if(!file.exists(yml_fname)){ + if (!file.exists(yml_fname)) { log4r_fatal( paste0("YAML file '",yml_fname,"' does not exist - exiting")) } @@ -541,11 +565,11 @@ load_yaml <- function(yml_fname, report_mode = "CPSR") { for(t in c('sample_id', 'genome_assembly', 'workflow', - 'output_dir')){ - if(is.null(report_settings[[t]])){ + 'output_dir')) { + if (is.null(report_settings[[t]])) { missing_yaml_info <- T }else{ - if(identical(typeof(report_settings[[t]]),"character") == F){ + if (identical(typeof(report_settings[[t]]),"character") == F) { missing_yaml_info <- T } } @@ -553,16 +577,16 @@ load_yaml <- function(yml_fname, report_mode = "CPSR") { for(t in c('conf', 'molecular_data', 'reference_data', - 'software')){ - if(is.null(report_settings[[t]])){ + 'software')) { + if (is.null(report_settings[[t]])) { missing_yaml_info <- T }else{ - if(identical(typeof(report_settings[[t]]),"list") == F){ + if (identical(typeof(report_settings[[t]]),"list") == F) { missing_yaml_info <- T } } } - if(missing_yaml_info == F){ + if (missing_yaml_info == F) { log4r_info(paste0( "Successfully parsed YAML configuration file - reporting mode: ", report_mode)) }else{ @@ -574,7 +598,7 @@ load_yaml <- function(yml_fname, report_mode = "CPSR") { ## check that it matches the report_mode ## return it - if(report_settings[['workflow']] != report_mode){ + if (report_settings[['workflow']] != report_mode) { log4r_fatal( paste0("Cannot read YAML file from ", report_settings[['workflow']], @@ -583,9 +607,9 @@ load_yaml <- function(yml_fname, report_mode = "CPSR") { } ref_data <- list() - if(dir.exists( + if (dir.exists( report_settings[['reference_data']][['path']] - )){ + )) { ref_data <- load_reference_data( pcgr_db_assembly_dir = report_settings[['reference_data']][['path']], genome_assembly = report_settings[['genome_assembly']] @@ -609,7 +633,7 @@ load_yaml <- function(yml_fname, report_mode = "CPSR") { if (identical( typeof( report_settings[['conf']][['sample_properties']][['phenotype']]), - 
"list")){ + "list")) { report_settings[['conf']][['sample_properties']][['phenotype']] <- as.data.frame( rrapply::rrapply( @@ -619,26 +643,26 @@ load_yaml <- function(yml_fname, report_mode = "CPSR") { for(col in c('do_id','do_name','efo_id','efo_name', 'icd10_code','ot_name','ot_primary_site', - 'primary_site','ot_code','ot_code_path')){ + 'primary_site','ot_code','ot_code_path')) { - if(NROW(report_settings[['conf']][['sample_properties']][['phenotype']][ - report_settings[['conf']][['sample_properties']][['phenotype']][[col]] == "NaN",]) > 0){ + if (NROW(report_settings[['conf']][['sample_properties']][['phenotype']][ + report_settings[['conf']][['sample_properties']][['phenotype']][[col]] == "NaN",]) > 0) { report_settings[['conf']][['sample_properties']][['phenotype']][ report_settings[['conf']][['sample_properties']][['phenotype']][[col]] == "NaN",col] <- as.character(NA) } - if(NROW(report_settings[['conf']][['sample_properties']][['phenotype']][ - is.nan(report_settings[['conf']][['sample_properties']][['phenotype']][[col]]),]) > 0){ + if (NROW(report_settings[['conf']][['sample_properties']][['phenotype']][ + is.nan(report_settings[['conf']][['sample_properties']][['phenotype']][[col]]),]) > 0) { report_settings[['conf']][['sample_properties']][['phenotype']][ is.nan(report_settings[['conf']][['sample_properties']][['phenotype']]),col] <- as.character(NA) } } - for(col in c('do_cancer_slim','ot_level')){ - if(NROW(report_settings[['conf']][['sample_properties']][['phenotype']][ - is.nan(report_settings[['conf']][['sample_properties']][['phenotype']][[col]]),]) > 0){ + for(col in c('do_cancer_slim','ot_level')) { + if (NROW(report_settings[['conf']][['sample_properties']][['phenotype']][ + is.nan(report_settings[['conf']][['sample_properties']][['phenotype']][[col]]),]) > 0) { report_settings[['conf']][['sample_properties']][['phenotype']][ is.nan(report_settings[['conf']][['sample_properties']][['phenotype']][[col]]),col] <- as.numeric(NA) @@ -656,47 
+680,47 @@ load_yaml <- function(yml_fname, report_mode = "CPSR") { 'source_license', 'source_license_url', 'source_url', - 'source_citation')){ + 'source_citation')) { - if(NROW(report_settings[['reference_data']][['source_metadata']][ - report_settings[['reference_data']][['source_metadata']][[col]] == "NaN",]) > 0){ + if (NROW(report_settings[['reference_data']][['source_metadata']][ + report_settings[['reference_data']][['source_metadata']][[col]] == "NaN",]) > 0) { report_settings[['reference_data']][['source_metadata']][ report_settings[['reference_data']][['source_metadata']][[col]] == "NaN",col] <- as.character(NA) } } - if(report_mode == "CPSR"){ + if (report_mode == "CPSR") { report_settings[['conf']][['gene_panel']][['panel_genes']] <- as.data.frame( rrapply::rrapply( report_settings$conf$gene_panel$panel_genes, how = "bind")) - if(NROW(report_settings[['conf']][['gene_panel']][['panel_genes']]) == 1){ - for(e in c('panel_id','panel_url','panel_version')){ + if (NROW(report_settings[['conf']][['gene_panel']][['panel_genes']]) == 1) { + for(e in c('panel_id','panel_url','panel_version')) { report_settings[['conf']][['gene_panel']][['panel_genes']][,e] <- NA } - for(e in c('mod','moi')){ - if(is.nan(report_settings$conf$gene_panel$panel_genes[,e])){ + for(e in c('mod','moi')) { + if (is.nan(report_settings$conf$gene_panel$panel_genes[,e])) { report_settings[['conf']][['gene_panel']][['panel_genes']][,e] <- NA } } }else{ - for(col in c('panel_id','panel_version')){ + for(col in c('panel_id','panel_version')) { - if(NROW(report_settings[['conf']][['gene_panel']][['panel_genes']][ - is.nan(report_settings[['conf']][['gene_panel']][['panel_genes']][[col]]),]) > 0){ + if (NROW(report_settings[['conf']][['gene_panel']][['panel_genes']][ + is.nan(report_settings[['conf']][['gene_panel']][['panel_genes']][[col]]),]) > 0) { report_settings[['conf']][['gene_panel']][['panel_genes']][ is.nan(report_settings[['conf']][['gene_panel']][['panel_genes']][[col]]),col] <- 
as.numeric(NA) } } - for(col in c('mod','moi')){ + for(col in c('mod','moi')) { - if(NROW(report_settings[['conf']][['gene_panel']][['panel_genes']][ - is.nan(report_settings[['conf']][['gene_panel']][['panel_genes']][[col]]),]) > 0){ + if (NROW(report_settings[['conf']][['gene_panel']][['panel_genes']][ + is.nan(report_settings[['conf']][['gene_panel']][['panel_genes']][[col]]),]) > 0) { report_settings[['conf']][['gene_panel']][['panel_genes']][ is.nan(report_settings[['conf']][['gene_panel']][['panel_genes']][[col]]),col] <- as.character(NA) @@ -717,9 +741,9 @@ load_yaml <- function(yml_fname, report_mode = "CPSR") { pcgrr::color_palette[["none"]][["values"]][1] report_settings$conf$visual_reporting[["color_value_box"]] <- pcgrr::color_palette[["report_color"]][["values"]][1] - if(report_mode == "PCGR" & + if (report_mode == "PCGR" & !is.null(report_settings$conf$assay_properties)) { - if(report_settings$conf$assay_properties$vcf_tumor_only == 1) { + if (report_settings$conf$assay_properties$vcf_tumor_only == 1) { report_settings$conf$visual_reporting[["color_value_box"]] <- pcgrr::color_palette[["report_color"]][["values"]][2] } diff --git a/pcgrr/R/utils.R b/pcgrr/R/utils.R index 5c2ca658..31f37a26 100644 --- a/pcgrr/R/utils.R +++ b/pcgrr/R/utils.R @@ -466,7 +466,7 @@ append_tcga_var_link <- function(var_df, #' #' @export append_tfbs_annotation <- - function(var_df){ + function(var_df) { if (any(grepl(paste0("^CONSEQUENCE$"), names(var_df))) & any(grepl(paste0("^VAR_ID$"), names(var_df))) & @@ -485,7 +485,7 @@ append_tfbs_annotation <- )) |> dplyr::distinct() - if(nrow(var_df_unique_slim) > 0){ + if (nrow(var_df_unique_slim) > 0) { var_df_unique_slim_melted <- as.data.frame( var_df_unique_slim |> tidyr::separate_rows(.data$REGULATORY_ANNOTATION, sep=",") |> @@ -495,7 +495,7 @@ append_tfbs_annotation <- )) ) - if(nrow(var_df_unique_slim_melted) > 0){ + if (nrow(var_df_unique_slim_melted) > 0) { pcgrr::log4r_info(paste0( "Found TF binding site annotations for ", 
@@ -595,7 +595,7 @@ append_dbmts_var_link <- sep = "\\|", convert = T) |> dplyr::filter(.data$ens_trans_id == .data$ENSEMBL_TRANSCRIPT_ID) ) - if(nrow(var_df_unique_slim_melted) > 0){ + if (nrow(var_df_unique_slim_melted) > 0) { var_df_unique_slim_melted <- var_df_unique_slim_melted |> dplyr::select(-c(.data$ENSEMBL_TRANSCRIPT_ID, .data$algorithms_call)) |> dplyr::mutate(miRNA_TARGET_HIT = dplyr::case_when( @@ -707,7 +707,7 @@ append_dbnsfp_var_link <- function(var_df) { #' @export append_drug_var_link <- function( vcf_data_df, - ref_data = NULL){ + ref_data = NULL) { pcgrr::log4r_info("Adding annotation links - targeted cancer drugs") @@ -719,27 +719,38 @@ append_drug_var_link <- function( dplyr::filter(!is.na(.data$SYMBOL)) |> dplyr::distinct() if (nrow(var_drug_df) > 0) { + cancer_drugs <- - dplyr::select(ref_data[["drug"]], - c("SYMBOL","DRUG_NAME", + dplyr::select(ref_data[["drug"]][['targeted']], + c("SYMBOL", + "ATC_TREATMENT_CATEGORY", + "ATC_LEVEL3", + "DRUG_NAME", "DRUG_MAX_PHASE_INDICATION", "DRUG_ACTION_TYPE", "DRUG_LINK")) |> + dplyr::filter( + .data$ATC_TREATMENT_CATEGORY != "cancer_unclassified") |> dplyr::distinct() |> dplyr::mutate( DRUG_ACTION_TYPE = stringr::str_to_title( .data$DRUG_ACTION_TYPE )) |> + dplyr::filter( + .data$DRUG_ACTION_TYPE != "Other") |> + dplyr::filter( + .data$DRUG_MAX_PHASE_INDICATION > 1) |> dplyr::arrange( .data$SYMBOL, dplyr::desc(.data$DRUG_MAX_PHASE_INDICATION)) |> dplyr::select(-c("DRUG_MAX_PHASE_INDICATION")) |> dplyr::distinct() |> dplyr::group_by( - .data$SYMBOL, - .data$DRUG_ACTION_TYPE + .data$SYMBOL ) |> dplyr::summarise( + DRUG_ACTION_TYPE = paste( + unique(.data$DRUG_ACTION_TYPE), collapse=", "), TARGETED_CANCER_DRUGS = paste( .data$DRUG_LINK, collapse=", "), TARGETED_CANCER_DRUGS2 = paste( @@ -749,13 +760,17 @@ append_drug_var_link <- function( var_drug_df <- var_drug_df |> dplyr::left_join( cancer_drugs, - by = c("SYMBOL" = "SYMBOL")) |> + by = c("SYMBOL")) |> 
dplyr::filter(!is.na(.data$TARGETED_CANCER_DRUGS2)) |> + dplyr::select(-c("DRUG_ACTION_TYPE")) |> dplyr::distinct() if (NROW(var_drug_df) > 0) { - vcf_data_df <- dplyr::left_join( - vcf_data_df, var_drug_df, - by = c("VAR_ID" = "VAR_ID")) + vcf_data_df <- + dplyr::left_join( + vcf_data_df, + var_drug_df, + by = c("VAR_ID","SYMBOL") + ) }else{ vcf_data_df$TARGETED_CANCER_DRUGS <- NA vcf_data_df$TARGETED_CANCER_DRUGS2 <- NA @@ -896,18 +911,33 @@ append_otargets_pheno_link <- function(var_df, #' #' @param vcf_data_df Data frame of sample variants from VCF #' @param ref_data PCGR reference data bundle object +#' @param site Primary tumor site +#' @param pos_var variable reflecting chromosome order (POS/SEGMENT_START) #' @return vcf_data_df #' #' @export append_cancer_gene_evidence <- function(vcf_data_df = NULL, - ref_data = NULL){ + ref_data = NULL, + site = 'Any', + pos_var = 'POS') { if (any(grepl(paste0("^ENTREZGENE$"), names(vcf_data_df))) & - any(grepl(paste0("^ENSEMBL_GENE_ID$"), names(vcf_data_df)))){ + any(grepl(paste0("^ENSEMBL_GENE_ID$"), names(vcf_data_df)))) { pcgrr::log4r_info(paste0("Adding literature evidence for cancer-relevant genes")) + + tissue_gene_ranks <- ref_data[['gene']][['otp_rank']] |> + dplyr::select(c("ENTREZGENE", "PRIMARY_SITE", "TISSUE_ASSOC_RANK")) |> + dplyr::filter(.data$PRIMARY_SITE == site) |> + dplyr::distinct() + + global_gene_ranks <- ref_data[['gene']][['otp_rank']] |> + dplyr::select(c("ENTREZGENE", "GLOBAL_ASSOC_RANK")) |> + dplyr::distinct() + + vcf_data_df_1 <- vcf_data_df |> dplyr::filter(!is.na(.data$ENTREZGENE)) vcf_data_df_2 <- vcf_data_df |> @@ -919,7 +949,7 @@ append_cancer_gene_evidence <- is.na(.data$ENTREZGENE) & is.na(.data$ENSEMBL_GENE_ID)) - if(NROW(vcf_data_df_1) > 0){ + if (NROW(vcf_data_df_1) > 0) { vcf_data_df_1 <- vcf_data_df_1 |> dplyr::left_join( @@ -931,20 +961,48 @@ append_cancer_gene_evidence <- "CANCERGENE_EVIDENCE")), !is.na(.data$ENTREZGENE)), by = c("ENTREZGENE" = "ENTREZGENE", - "ENSEMBL_GENE_ID" 
= "ENSEMBL_GENE_ID")) + "ENSEMBL_GENE_ID" = "ENSEMBL_GENE_ID")) |> + dplyr::distinct() + + ## Add gene ranks (Open Targets Platform) + ## - according to primary tumor types/sites + ## - globally (across all tumor types/sites) + vcf_data_df_1 <- vcf_data_df_1 |> + dplyr::left_join( + global_gene_ranks, by = "ENTREZGENE") |> + dplyr::mutate(GLOBAL_ASSOC_RANK = dplyr::if_else( + is.na(.data$GLOBAL_ASSOC_RANK), + as.numeric(0), + as.numeric(.data$GLOBAL_ASSOC_RANK) + )) + if (NROW(tissue_gene_ranks) > 0) { + tissue_gene_ranks$PRIMARY_SITE <- NULL + vcf_data_df_1 <- vcf_data_df_1 |> + dplyr::left_join( + tissue_gene_ranks, by = "ENTREZGENE") |> + dplyr::mutate(TISSUE_ASSOC_RANK = dplyr::if_else( + is.na(.data$TISSUE_ASSOC_RANK), + as.numeric(0), + as.numeric(.data$TISSUE_ASSOC_RANK) + )) + }else{ + vcf_data_df_1 <- vcf_data_df_1 |> + dplyr::mutate(TISSUE_ASSOC_RANK = as.numeric(0)) + } } - if(NROW(vcf_data_df_2) > 0){ + if (NROW(vcf_data_df_2) > 0) { vcf_data_df_2 <- vcf_data_df_2 |> dplyr::left_join( dplyr::filter( dplyr::select( ref_data[["gene"]][["gene_xref"]], - c("ENTREZGENE", "ENSEMBL_GENE_ID","CANCERGENE_EVIDENCE")), - !is.na(.data$ENTREZGENE)), - by = c("ENSEMBL_GENE_ID" = "ENSEMBL_GENE_ID")) + c("ENSEMBL_GENE_ID","CANCERGENE_EVIDENCE")), + !is.na(.data$ENSEMBL_GENE_ID)), + by = c("ENSEMBL_GENE_ID")) |> + dplyr::distinct() } @@ -953,7 +1011,12 @@ append_cancer_gene_evidence <- vcf_data_df_2, vcf_data_df_3) |> dplyr::distinct() |> - pcgrr::order_variants() + dplyr::mutate(CANCERGENE_EVIDENCE = dplyr::if_else( + .data$CANCERGENE_EVIDENCE == ".", + as.character(NA), + as.character(.data$CANCERGENE_EVIDENCE) + )) |> + pcgrr::order_variants(pos_var = pos_var) } @@ -971,7 +1034,7 @@ append_cancer_gene_evidence <- #' @export append_gwas_citation_phenotype <- function(vcf_data_df = NULL, - ref_data = NULL){ + ref_data = NULL) { invisible(assertthat::assert_that( @@ -1435,11 +1498,11 @@ get_calls <- function(tsv_gz_file, ## convert all columns with only NA values to 
character type - if(NROW(vcf_data_df) > 0){ + if (NROW(vcf_data_df) > 0) { num_rows <- NROW(vcf_data_df) - for (n in colnames(vcf_data_df)){ - if(length(vcf_data_df[is.na(vcf_data_df[,n]),n]) == num_rows){ - if(typeof(vcf_data_df[,n]) == "logical"){ + for (n in colnames(vcf_data_df)) { + if (length(vcf_data_df[is.na(vcf_data_df[,n]),n]) == num_rows) { + if (typeof(vcf_data_df[,n]) == "logical") { vcf_data_df[,n] <- as.character(vcf_data_df[,n]) } } @@ -1450,9 +1513,9 @@ get_calls <- function(tsv_gz_file, af_pop_columns_numeric <- colnames(vcf_data_df)[stringr::str_detect(colnames(vcf_data_df), "_AF_[0-9A-Z]{1,}$")] - if(NROW(vcf_data_df) > 0){ - for (col in af_pop_columns_numeric){ - if(typeof(vcf_data_df[, col]) != "double"){ + if (NROW(vcf_data_df) > 0) { + for (col in af_pop_columns_numeric) { + if (typeof(vcf_data_df[, col]) != "double") { vcf_data_df[, col] <- as.numeric(vcf_data_df[, col]) } } @@ -1463,9 +1526,9 @@ get_calls <- function(tsv_gz_file, colnames(vcf_data_df), "NON_CANCER_(AC|AN|NHOMALT)")] - if(NROW(vcf_data_df) > 0){ - for (col in af_pop_columns_integer){ - if(typeof(vcf_data_df[, col]) != "integer"){ + if (NROW(vcf_data_df) > 0) { + for (col in af_pop_columns_integer) { + if (typeof(vcf_data_df[, col]) != "integer") { vcf_data_df[, col] <- as.integer(vcf_data_df[, col]) } } @@ -1488,7 +1551,8 @@ get_calls <- function(tsv_gz_file, #' #' #' @export -write_processed_vcf <- function(calls, sample_name = NULL, +write_processed_vcf <- function(calls, + sample_name = NULL, output_directory = NULL, vcf_fname = NULL) { @@ -1512,7 +1576,9 @@ write_processed_vcf <- function(calls, sample_name = NULL, sample_vcf_content_fname <- file.path(output_directory, paste0(sample_name, ".", - sample(100000, 1), ".vcf_content.tsv")) + stringi::stri_rand_strings( + 1, 15, pattern = "[A-Za-z0-9]"), + ".vcf_content.tsv")) write(header_lines, file = vcf_fname, sep = "\n") sample_vcf <- vcf_df[, c("CHROM", "POS", "ID", "REF", @@ -1579,7 +1645,7 @@ detect_vcf_sample_name 
<- function(df, sample_name = NULL, cpsr = FALSE) { #' @export targeted_drugs_pr_ttype <- function(ttype, pcgr_data, - ignore_on_label_early_phase = T){ + ignore_on_label_early_phase = T) { pcgrr::log4r_info( paste0("Retrieving targeted drugs (on-label and off-label) for ", @@ -1604,7 +1670,7 @@ targeted_drugs_pr_ttype <- function(ttype, site_candidates[["off_label"]] ## If tumor type not specified, off-label indications make no sense - if(ttype == "Any"){ + if (ttype == "Any") { drug_candidates[["off_label"]] <- data.frame() } @@ -1634,7 +1700,7 @@ targeted_drugs_pr_ttype <- function(ttype, } all_candidates <- drug_candidates[["on_label_early_phase"]] - if(NROW(drug_candidates[['off_label']]) > 0){ + if (NROW(drug_candidates[['off_label']]) > 0) { all_candidates <- dplyr::full_join(drug_candidates[["off_label"]], drug_candidates[["on_label_early_phase"]], by = "symbol") @@ -1677,7 +1743,7 @@ targeted_drugs_pr_ttype <- function(ttype, targeted_drugs_summarise <- function( candidate_drugs = NULL, link_label = "DRUGS_ON_LABEL", - indication_label = "DRUGS_ON_LABEL_INDICATIONS"){ + indication_label = "DRUGS_ON_LABEL_INDICATIONS") { invisible(assertthat::assert_that(!is.null(candidate_drugs))) invisible(assertthat::assert_that(is.data.frame(candidate_drugs))) @@ -1814,10 +1880,10 @@ pkg_exists <- function(p) { #' @param fname Name of file to check #' #' @export -check_file_exists <- function(fname){ +check_file_exists <- function(fname) { - if(file.exists(fname)){ - if(file.size(fname) == 0){ + if (file.exists(fname)) { + if (file.size(fname) == 0) { log4r_fatal( paste0("File ", fname, " has zero size - exiting") ) diff --git a/pcgrr/R/validate.R b/pcgrr/R/validate.R deleted file mode 100644 index f6dedc1c..00000000 --- a/pcgrr/R/validate.R +++ /dev/null @@ -1,10 +0,0 @@ -#' -#' -#' -validate_settings <- function(settings = NULL, type = "PCGR") { - -} - -validate_ref_data <- function(ref_data = NULL) { - -} diff --git a/pcgrr/R/value_boxes.R b/pcgrr/R/value_boxes.R 
index 44d259b2..68a28a6a 100644 --- a/pcgrr/R/value_boxes.R +++ b/pcgrr/R/value_boxes.R @@ -92,10 +92,10 @@ generate_report_data_value_box <- function(pcg_report, } } - if (rep_cont[['kataegis']][["eval"]]){ + if (rep_cont[['kataegis']][["eval"]]) { pcg_report_value_box[["kataegis"]] <- "None" num_events <- NROW(rep_cont$kataegis$events) - if(num_events > 0){ + if (num_events > 0) { num_events <- NROW(rep_cont$kataegis$events |> dplyr::filter(.data$confidence == 3)) # pcg_report_value_box[["kataegis"]] <- diff --git a/pcgrr/data-raw/data-raw.R b/pcgrr/data-raw/data-raw.R index 0dfc1a56..3683bf30 100755 --- a/pcgrr/data-raw/data-raw.R +++ b/pcgrr/data-raw/data-raw.R @@ -82,6 +82,7 @@ data_coltype_defs[['cna_somatic_raw']] <- readr::cols_only( SEGMENT_START = readr::col_double(), SEGMENT_END = readr::col_double(), VAR_ID = readr::col_character(), + VARIANT_CLASS = readr::col_character(), N_MAJOR = readr::col_integer(), N_MINOR = readr::col_integer(), CHROMOSOME_ARM = readr::col_character(), @@ -178,6 +179,7 @@ data_coltype_defs[['snv_indel_somatic_raw']] <- readr::cols_only( CLINVAR_CONFLICTED = readr::col_logical(), CLINVAR_REVIEW_STATUS_STARS = readr::col_integer(), CLINVAR_NUM_SUBMITTERS = readr::col_integer(), + CLINVAR_VARIANT_ORIGIN = readr::col_character(), PANEL_OF_NORMALS = readr::col_logical(), DBSNPRSID = readr::col_character(), COSMIC_MUTATION_ID = readr::col_character(), @@ -358,6 +360,200 @@ data_coltype_defs[['snv_indel_germline_raw']] <- readr::cols_only( usethis::use_data(data_coltype_defs, overwrite = T) +tsv_cols <- + c('CHROM', + 'POS', + 'REF', + 'ALT', + 'GENOMIC_CHANGE', + 'GENOME_VERSION', + 'SAMPLE_ID', + 'VARIANT_CLASS', + 'SYMBOL', + 'PROTEIN_CHANGE', + 'CONSEQUENCE', + 'LOSS_OF_FUNCTION', + 'GENENAME', + 'PROTEIN_DOMAIN', + 'CDS_CHANGE', + 'CODING_STATUS', + 'EXONIC_STATUS', + 'MUTATION_HOTSPOT', + 'MUTATION_HOTSPOT_CANCERTYPE', + 'HGVSc', + 'HGVSp', + 'ENTREZGENE', + 'CANONICAL', + 'CCDS', + 'UNIPROT_ACC', + 'ENSEMBL_TRANSCRIPT_ID', + 
'ENSEMBL_PROTEIN_ID', + 'REFSEQ_TRANSCRIPT_ID', + 'REFSEQ_PROTEIN_ID', + 'TRANSCRIPT_MANE_SELECT', + 'ONCOGENE', + 'TUMOR_SUPPRESSOR', + 'PREDICTED_EFFECT', + 'REGULATORY_ANNOTATION', + 'ONCOGENICITY', + 'ONCOGENICITY_CLASSIFICATION_CODE', + 'ONCOGENICITY_SCORE', + 'VEP_ALL_CSQ', + 'WINMASKER_HIT', + 'SIMPLEREPEATS_HIT', + 'gnomADe_AF', + 'DBSNPRSID', + 'COSMIC_MUTATION_ID', + 'TCGA_FREQUENCY', + 'TCGA_PANCANCER_COUNT', + 'CLINVAR', + 'CLINVAR_CLNSIG', + 'BIOMARKER_MATCH', + 'TARGETED_CANCER_DRUGS2', + 'CALL_CONFIDENCE', + 'DP_TUMOR', + 'AF_TUMOR', + 'DP_CONTROL', + 'AF_CONTROL', + 'TIER', + 'TIER_DESCRIPTION', + 'GENOMIC_CHANGE', + 'GENOME_VERSION') + +display_cols <- list() +display_cols[['tier1_2']] <- + c('SYMBOL', + 'PROTEIN_CHANGE', + 'CONSEQUENCE', + 'CANCER_TYPE', + 'EVIDENCE_LEVEL', + 'CLINICAL_SIGNIFICANCE', + 'EVIDENCE_TYPE', + 'THERAPEUTIC_CONTEXT', + 'EVIDENCE_DIRECTION', + 'VARIANT_ORIGIN', + 'DISEASE_ONTOLOGY_ID', + 'DESCRIPTION', + 'BIOMARKER_MATCH', + 'BIOMARKER_SOURCE_DB', + 'EVIDENCE_ID', + 'CITATION', + 'RATING', + 'GENENAME', + 'PROTEIN_DOMAIN', + 'CDS_CHANGE', + 'MUTATION_HOTSPOT', + 'MUTATION_HOTSPOT_CANCERTYPE', + 'TCGA_FREQUENCY', + 'HGVSc', + 'HGVSp', + 'ENSEMBL_TRANSCRIPT_ID', + 'ENSEMBL_PROTEIN_ID', + 'REFSEQ_TRANSCRIPT_ID', + 'PREDICTED_EFFECT', + 'ONCOGENICITY', + 'ONCOGENICITY_CLASSIFICATION_CODE', + 'ONCOGENICITY_SCORE', + 'VEP_ALL_CSQ', + 'DBSNP', + 'COSMIC', + 'CLINVAR', + 'TARGETED_CANCER_DRUGS', + 'CALL_CONFIDENCE', + 'DP_TUMOR', + 'AF_TUMOR', + 'DP_CONTROL', + 'AF_CONTROL', + 'GENOMIC_CHANGE', + 'GENOME_VERSION') + +display_cols[['tier3']] <- + c('SYMBOL', + 'PROTEIN_CHANGE', + 'GENENAME', + 'CONSEQUENCE', + 'ONCOGENICITY', + 'PROTEIN_DOMAIN', + 'MUTATION_HOTSPOT', + 'COSMIC', + 'CDS_CHANGE', + 'HGVSc', + 'HGVSp', + 'MUTATION_HOTSPOT_CANCERTYPE', + 'TCGA_FREQUENCY', + 'ENSEMBL_TRANSCRIPT_ID', + 'ENSEMBL_PROTEIN_ID', + 'REFSEQ_TRANSCRIPT_ID', + 'PREDICTED_EFFECT', + 'ONCOGENICITY_CLASSIFICATION_CODE', + 'ONCOGENICITY_SCORE', + 
'VEP_ALL_CSQ', + 'DBSNP', + 'CLINVAR', + 'TARGETED_CANCER_DRUGS', + 'ONCOGENE', + 'TUMOR_SUPPRESSOR', + 'CANCERGENE_EVIDENCE', + 'CALL_CONFIDENCE', + 'DP_TUMOR', + 'AF_TUMOR', + 'DP_CONTROL', + 'AF_CONTROL', + 'GENOMIC_CHANGE', + 'GENOME_VERSION') + +display_cols[['tier4']] <- + c('SYMBOL', + 'PROTEIN_CHANGE', + 'GENENAME', + 'CONSEQUENCE', + 'ONCOGENICITY', + 'PROTEIN_DOMAIN', + 'COSMIC', + 'CDS_CHANGE', + 'TCGA_FREQUENCY', + 'HGVSc', + 'HGVSp', + 'ENSEMBL_TRANSCRIPT_ID', + 'ENSEMBL_PROTEIN_ID', + 'REFSEQ_TRANSCRIPT_ID', + 'PREDICTED_EFFECT', + 'REGULATORY_ANNOTATION', + 'ONCOGENICITY_CLASSIFICATION_CODE', + 'ONCOGENICITY_SCORE', + 'VEP_ALL_CSQ', + 'DBSNP', + 'CLINVAR', + 'TARGETED_CANCER_DRUGS', + 'CALL_CONFIDENCE', + 'DP_TUMOR', + 'AF_TUMOR', + 'DP_CONTROL', + 'AF_CONTROL', + 'GENOMIC_CHANGE', + 'GENOME_VERSION') + +display_cols[['tier5']] <- + c('SYMBOL', + 'GENENAME', + 'CONSEQUENCE', + 'COSMIC', + 'DBSNP', + 'CLINVAR', + 'TCGA_FREQUENCY', + 'ENSEMBL_TRANSCRIPT_ID', + 'ENSEMBL_PROTEIN_ID', + 'REFSEQ_TRANSCRIPT_ID', + 'REGULATORY_ANNOTATION', + 'VEP_ALL_CSQ', + 'CALL_CONFIDENCE', + 'DP_TUMOR', + 'AF_TUMOR', + 'DP_CONTROL', + 'AF_CONTROL', + 'GENOMIC_CHANGE', + 'GENOME_VERSION') + #---- variant_db_url ----# variant_db_url <- data.frame( diff --git a/pcgrr/data/cancer_phenotypes_regex.rda b/pcgrr/data/cancer_phenotypes_regex.rda index d6b05ba290992d4886bbaf1dee218a850d11eb7e..381c553036f30093d8addab85afe820ff0eb37cc 100644 GIT binary patch delta 525 zcmV+o0`mQv1e^p8LRx4!F+o`-Q(4Q`YUz;<8-M3ZUAEl~OeFM;G@hC%>7q1fH1eLN zo{)*%B}YclLXR!H|L49!Rq zIDZv5@-RZ6ldI?M>?-0pF~O1SRJz5>OKVi1nxZviBDudBV(TikRdE~hKkn&JH7+nq zh4muUR`J`X?%hL!?B`=&Pu0hm#0v zFsX$|N(q%yNsTOk;~+>DSYXu@Rh7VvuLUckI$;EuDi4n}75m~Qz zwo%}3YE_eNqkV*6iy4NlEs?;?HN{wKNh2{wU7J(`mSaLEZPa_cAtKYc?gb#iQ+9(f zMBEGmrC5nwax1~fY>t$-A5L6i&GM(ng6D>u2700002CTM~Q={+WZ0002fK}<}hLliwgXa}fip=BuhourCUCP*`&RFMi% zl|p5Tje2?f^4pfzw!3+D)6P%RpKZ5t8&$dGYPa)5KGH>)&eJYL6=scMkks6PBY9(o 
zGJlF#5_fxiV&bqa$<6MuV#XTiFD@$u1sD;mP>Tru#)Au?daC9+@IU^E0BT$0m<(~# zSBX8<{K{EgI?9#JrpH`QJ7}usTML7j&@2E#DY?uYz^*sClEf2*a%LH&@pg5&?Ga|A zB(N$@ow`-kq||J*TU~Re<|wE@W>rhU#0gihMhF^VLM&lA)NMTM5dn02IX z1_92DM6S6N;OtmE2*DymfzfY~0+6f(35}oxF=7Y7z_S|Sy8GmaCXBkC3sP(_F~pRK z@!w(DGeLtZQ&&Kftv5B&leO6diIp(Ecc^4EQ%57&Qvk$eHhiLp31jqSKgQ5%r$0Ua Oi@744C`dGuxbOfWFzw9% diff --git a/pcgrr/data/color_palette.rda b/pcgrr/data/color_palette.rda index ebcda276f2b43c8fc0d7c41d8dfb07c561423fbc..3fd337f6dc478c20e85e26d48bf23b68576a52df 100644 GIT binary patch delta 648 zcmV;30(bqc1+E1SLRx4!F+o`-Q(3cL=`4{B8Gn4phSUR6BTrKvsL7ycHlP3i$TVaP z0BAKdm^D2nfKbrT27#fbfHY_T0me+4X@tng7!ir11Yrh^FaRKtK&FO`29)&CJv7jb zsgMBEO{s)v4XJ`H4#c2>y6!ZhFhM9mB!)2){QY*s+!AMud>m;5lB8W4l}T>GfiTL+ zRDYDV5AEpL(aj640ZIxGAOO#mxX%SLmj?{EvFJuIGcz;ZV;M#Z6slCLuC*5wziu&Y zbGFFRRnv6s_yIs?9u<(+>DYLJBYeY(he52{EM7=71KA zK`B5;1a~`F0U#K@NI$tlt~KFw#Dtp?o-+R({36u#jMC4U!qGMzRG6Z|1M?Z@?@(zg ze1_H`h^I=Lh>028RWD9}sgP9S2#GX`lUV^De;o&KcoYj2Or<1=2igc1tut83r1F80 z#h^4+#jX#0?jX1e7AJs#kdg|P=hBm|@aotdbGv$LfPML>70eki2goV9CKo$vOGKB|!s3!hG$Z=nTO{TwX}_c7&#Dm|L$Mz2TxbffLW&%byi7ZITgQp5h=O zf9!xFdQsZa9pLgpt&Qd%3&{Lqb#^e>L+C82jTlra$llk%#^Oi{Tz4K=F4$y=y_l7m zQvVPag6S!P#|l_$GXaAe4ZRdME5z}t$$@tMF-}K0I1L!^-Y@ndx73r*PbzdGl#pJs zBy2MrvKKu@1redDi1d$gjb&zdWZoXBI&j~-+Ty|4ptU*iTmjz#4sbWS%Q4-eGSJCY i>o!>u?gX>I-%6w_3Cn^D?t{TE_`8xR!i0sI|4Crj$0e-* delta 648 zcmV;30(bqc1+E1SLRx4!F+o`-Q(2nOrs0te8GoF}hSU(!h|@-TqfCLI+JFE9AkmOC z01Zth4NpV?AT$A>XlbAg8UO%slO~#BGBO4PVran_L8D9n2qcgom`xf}#7&bzHl{!W zO*W^aQR)L~R`x?$laN(Z%#+3h5Ob19VT4EX_1N=(L5~;jXUWGr=%8f@(&1tR!HtMc zlz%mi_VjD(>2(ePN(vYV05j!oGj%jegN9sK@J2B+Gc(>}87(W+s#L75wHFk>ZZU0j z$eO@GftekVdy$bF*B33I)?;A4uTOk1zq)NU@^j0P+noI9B)|gJgoMcg>0lf(Op^h4 z!U#RWKqJAgU;+RzzDPf~!@2cX(!zw9gfyeY&4R&cZ9^Q4e9a7quy)EKdji33ndk3N zX)An&)**luK~miMQgz-PTLZ3lZ%uFyzcjGSVKj_BVNJ0xx~;7x zCSOAIQ_cVvz<>zq2pTpM)n zm=J&UKoLD5R4I4Vfosv-p#E)-yO5pc-5%1j$kQ+A};W#vsF=|KfH zynUuaq5|ritvqBhoPS{46Y455yB1-z!$vp?S(hz)bC;%%EWkDZ9MN%`ac$H|802D} 
in|0dqiqgvCi7C*nBMvt&x((!C#oUoj6eK3JDY!tCCmzWF diff --git a/pcgrr/data/data_coltype_defs.rda b/pcgrr/data/data_coltype_defs.rda index fd1eff8b5cdb8c4681d687993819e471f4645b6a..fa531718093fa55b1227b69b9e8382127c2931f0 100644 GIT binary patch literal 1922 zcmV-|2YvWLT4*^jL0KkKS)$8{@Bkl_f1v;W|GPi{5CA{_|M36s-=Gix00H0$e|ogS zfHVLA0009hKpL7!>KZa>f&c&j0MGyc00#{;!3>6)WMsktrhqV-G|7Mj4GtPK_}tCKm^MSh-DjbXweiA!XTnP zBM8bI7jGnR_{u1%e!}80%uOZ%0tyHDnK3#GXE+WVdoV8~%T*nz9(>7SbLFdh;nmP2cn9@NNb8gSN z^YUzPk> z#6N=%xw7x%yY0I<_c#*MlAhB=otKhX=6cR;SF)F54$Pl2l%PP2kRXCFfe1d(U}0qN@Uzr_}}x=9@h8uXOA?G&kpWHI*J0b+>OkYcg56^`G0rj;>r> zU7r`eF+4fZ!tPC;6jLgU>4i+*myWd}Ztf|-(J;-(zEV@u)g{@cah=!6bvIdda?7JZ zmVczQ@|Tu}u)T}BwQg%qp)}re$!W$X$oDsn=5VrPUQcIdbeCp#9p$AxPXg_c$w^9v zdOVnm6yZLFzGoIVxT<{|y$_S0zc@Naif*^jIl97G=I%O%`+6f&XGXI{@S4e2nOTF> zYWG`~oFw@AJq+EBo>q2R8#*>;P**tMyZV=9QZ#VsHki86cJ6hh8{KGFso7}t-EBw0 zZ6}e8qs+GzZ0p{pe%Hy$ZzEQ&7Iwc|=U!%K+r`A~rc~Udv@5o?EiHAW7`ZcIVwh4{ zNlKKdl4UM0h9OKZDUp^cD_b%teA%>R)pl}Ai#a6jVA+;j3rj4-yrn%&n256Fw4W|jz%Jb=#vOZJio_Xi|{T(LFoSQJJU1=4+>F8HUi?p0wk6?9b%6%&x0R zIlSI_94XsU{8>ZRs-;y|gJh{EJN_9ty1u95KBmoiEZ%Tiq?T;&l2=A1wUWEhr!G-Qj-Y(K7CXs3g{;e1nz57?P?n^ikYTe}xw zT^7p_=EH%N+!1u?>w27%Q$)|3c1%a>t2Q^`Z$%);#*?p9CH?|z>i5_xKxmYCM}g@|&l& zRExFCcVhZ5=O%V?_SkiKy3UDzGSeyhuKii{%jdp}`LCn*v{`nIS}x(1cg6D0CVDDp z@mlIMX_k8He{%Y2sl#`lM<$J#PA!?N$16B^*>+gi;L*E^o6ECb3Yqk7*J;VWAF9>G zGv!ugEL|>HW|ZZN87W;cjyPSiu-(g=jNN(X%X{k0XD25Ol`YZ7X6!HNYPWpOZdSUr z;@fpj-LTbab#He!1=xLV)#j59ZoO$ux7#c!qyGqVkVpFUvVH%}WP;=>1%5B$yw5ZD zzaAM#BSaek4Jra<$3gscnt*SPcWBfL*#tPCgisSkj{~}(fr@~egSsdY0R6M^oIGR^ z%^?PV%43t1vUjJ_hm{b@xC#gy;MxTuPwYr8sQ|J|Xch?&tzr@kioy&avMC5HfXhWo z0>V5R3pk&&C#v6++v(q}V>>^xOOkLNO;D4iWA4#d?V z1}A1o2sk2C_e%E45?TijeaLO-!3^g#iKz<`DhDSNO(H}dDH7@-)gcGP;+oHa{WP59 zAnhUwkb|6pIZNG(e7Ve{@ATR=?qx01k=Y=NDIo>-OUy_rc6{K6{X{|dC;yANBAh5l IQDwyV0C#4#YybcN literal 1898 zcmV-w2bK6jT4*^jL0KkKSx9-_e*hm?|DgZ>|GPi{5CA{_|M0)>-=Gix00H0$e|r1e z=4R-1=l}o!20-QE1{$8F9-sgopc()G0000007^!aCX*(J^%?*G00z|2=zstKI0g}f 
z(-6QQ82~ViGGxLq5YR%EDWGYiO#z?)000Jn0BG8PMMS3cK*$;cCV&9Zk)upa4^RQ7 zfC`cjOoYUlG-)sjo~G2vwKN$9pQ<+0fCf(ZAiJ!pf(|%>3>P@VN=YI=q?gMw}E89#Z+z zR&nwxOG!l*$@Tuo?M(0yyq0fEMXkjG?h!ntkoFV~vq@ZB6((1%G)c?InV0x~LsaiPFe2}MEC0g@n~ zTB3pqks%MYB=4yp84(5oDx%9SSrcw4Q0UAiH^mqzN&pq?)~8TB)#7NXS=mtK&Q6h% zb2wTVH9fX!dPeD+HKd=D)ffDGu^P=iW#lZE!@7*A%etjXueGh3bW?VUrBKnGlc(HU zKg*Al{+Gic_Il)&l;rd>g&Jz?{YzlU@M!ilc~w!EUM>Y?W`L+p1!;mYM*)a&VKAp? zrUiOfa3!ZDJ?4u#-brVn?K!nx%3X{)lAS9t7q-YwTz0#Q))eUNd zwMh=?D?4*p5;e*OQn`= zy5p6n#fomkhSWakv(q|v&Zk`s`Q6QBM~&Sr+nzOrAwR5Q`$#8{^X;J4ky zjxH*nhc8p;=kd%tqscd0@SNQtEcADs1AVIDZGtZxLMixpL+8%KHOYR)XJO6OG3L=rKPU4qZcM@Oj8O=DJfEwGEAk#@Wd&F z>D!6foWx!UR6Dfm?I@h!i9={3rjBcc}_YQxiE9( zDLvCzE6>p_WPK;!Ju}nx^>WDd^qi@pwEB9#tLRe4=M0=jY?}`DV=>qm7bp4$dqY86xvr59XH|DdO;W-xT7*^QK)U)lS8&-;1!W<+8*%u;68P1YJA5 zuVa#GXqoeF$%y>bXJw2o(Ne0E^HBKnFuO;Cm6PiDzTZ9)dHko!d?!@u=T0jn{YqaF z#|hiQ_DZ$aF15Qcd)1%XW5I^LD?Hqvox5bL&tjdC=GpO9>F<<_waa%^^kL3S?B(yU zTq?Mfh_ww#68U3|xLtnI$}U2{#6W1I2}lY?N2``c$UXxS23XY?5FsPwm30M#qY^fZ zs20dJWa8PH%yP4bs>`y*#|DkuRNh^h_*BoMcDqhZ`2IGoDW59J>R7s5vdt;W7BW)1 zv5q)hvasFDnvC5!=gWKQ&1WYr8!B75$7bv==GAW*oZPK-YsI$eowmbOsnNaM+!tZ` zotK(SIl6SEHs5TptsnS9oCyD3mQSznSs=LzfnSsS*O}&iTH*v|iE1EWKoW)t&zBX7 z0s1suL+lrH5TXb{07>Vr-atV3KoZE_6bK+4FN=?)gmXa#f4XCnl(Kjy=tI{;GPJq^ z2Ov2>q?Y86x@ZimGXTg*3~`X5i5P~3r6Ly~Y{63?$gs*&m6(H{xK0NOOa z$+H}qX|}P+wNM1yHH~2%cYsha{3cV$6S>_ubxlw~iPt0$a73x_mG6`!v<@E|klWRQ z8O~`FQWhjs4o)bVLLl-;mr)LgAo;wLS@J)QlbjHCLJEQoa3hqx*tgT1%09lkM!oE% kx)M6Tizx&b+|1%A5OZ#dpkYuzoPWjKkxmpO5*~M-z`*jXvH$=8 diff --git a/pcgrr/data/effect_prediction_algos.rda b/pcgrr/data/effect_prediction_algos.rda index 0f3e66f20ba697d5c15e44e3ee9274af4332bac2..dad8df10e53f247a00f2d7c296d6158178359be6 100644 GIT binary patch delta 991 zcmV<510ekO2kr+BLRx4!F+o`-Q(3ocwC#}&AAh={>dm0)8YI(3nx3URh6y&Aq}14_ z>S*;RsL`g>^o;celgQF(gHI`-8YG^km>QUv(m(@1GyrL)hJa`Q^$$?cXc9Uk<>S?E>G-v<-Xk;`102*i#D5t7vp`bJaKmY&$GynhtKmnlPp`#{^8Z^+z$YB6v z0Ds6d&;SD*E3+J9@+9V+^en-Z9L1m7k+JS|)dUWdGky6>&z 
z1qGNBiKRF;pl6$75IBfxaO28Pvh2R}A;y5QYLBw3+erb8ENwLuSfKOH>6j51^&?S& z%vH?;Ffgkk-2K}3C^5UZIhDP=E19hkwm0gP&#vdaRLV<8@`iNCsniRkvI`6Ha13 z(19>8A&LBj(o=R&Dn07p!cB$LNG>D6S$M-RFwR-j6v?d%&$fy4>Abk~!s;0bGMR$J zNU#*q9^D`%Y)G3cDkLe1O%CBI>tUIgcN0klK?Vkss&2X*aBPPOuoX~I1b>0$9Jg}I zFHo3t{vdVSmHZGT@+w@e)B+?ev-4MjV9&Fbr#KEdch!3t+FVN$S9) ziDpt7;!Og*k%$-W<<=mTE?NlOYU3i$a<$J!|0!De*c1pmLY#WTIVPlI6Hkr>_ezAN zld$Z?N*mcWS`sK6v--I<_}IXSt2Em(hD}%HCb+=z(TI?mXw*3|AxNT?10+Clkkn#_ z2{@BjP>twO5}Y)kgi*K$MR95s>&?u@Y45p;kx0YT?5r_KJ(CqUS%r*&LeYEwdklZ# N?ntK!5*F>2ouE+o#Z~|S delta 1000 zcmVouB9#d*GVtQ>;)X*|S5vEM1qfMlPKxhVl4Kx5W&;g;LpaM}E1Vd7L z6x7qyG%*1683Q2EkkDutKmn!+6jc3FO*GM_fEoY*000000iXcTB@$?e$?AHI57j+K zsDJe|VgLYSGyr4(WY7RXP;;y3)lXH7Kzgw=!ZhlXlR&>}RGxxsUEEFxkVHsL%lDzW zo>ylnFUFp!*8}uX`>{g}1zN7eh6K4vCsX5st4dHio0poI8~Pk?Eh!lc6KQPHD|W*I zh$a+70YpJtG(Z}QAkc)5CuYPFVlEBq<$q~T1dwc%pMJZkXJ+EY*4*bO@<#dFcL5wa z-|kMlJ^QR4AyA-_B?9rnyk`nD{TqGve;GF^k@z5k2|^^o9*T38n9)d7zauHe@cZu6 zP?b*fSCmZ&Wmsqwdf9r9qRt9Lr3i}m+$!ILfSd~ku#uWN-D4?2LXatlbxk*wn171$iPF%TiV^e6KJ>y9#aoE|OufLKD6`!=Q@J>W# z(T+RTl~}6w{&@3a$qzkzI^?aeIYKZG}^M^KeVQGs|zHnD3>GCigi7mQ4}j zU1lg^y4=~GeDff5v7pswmf=A9hJTXFRL^k>1<$^FZpa3NP#H_%)0bPPCNiLiHzi}$ zL;<%ZDfkp72n2Mf<1wgSu%S8+Im9FIvg%-9kaW)+D5?r85=3e~i$ENDOEyYwBpp${ zr$cCFS;K5s5YH*H&=R~VH(`eLLotSU?Gmz{5-N`5kr=2kLt)pX+hdLw$A1z?lE9LU z4(wX6=hdsEocnP_7xQ0NXY|;uxmHx#JmcOFTl{#f1=NP(ZqXoy?*p=xqi$WKts;YyoA2pc=Tiz0z-n6rYqQ^=Lh}pDMNuL&AsWXbk`^gV z86pFehNBcXNyM1K&NqQbN@&u85lY-4R;pN)*D}VUVeP?_ky*)3AK+1W1>BKN6eJ1wFO*~PUnxB>xX m5;7x6@3SK(s*T0O8C7@GW=c16yZWb{UYsr|*3|ipsDwbc|FBj_eOyuy!J!bfV+3jD{4G6Glo#Qp{Hv7!!Ua`CA>BEGFe` zZkMT=vXIGRh0H1r-l|D!Gv3Zw=Aj}G;bvK$hJ9MIVtB4Tk<N(s>gRpa5y7 zks37F5YPdpp^-Er)YQsvDD_4JJwpbB0BMksVHtl3l+`^)qI#NSF&K|fFpU9(^*u%= znhhRN3k$^;--V*hAU_S2$t00cftN;nA}rCPEy@Ui1H=K!ALQ($-=V&yTs_#aGShD; z1{Ls2TQx3tZxHvKg)Bkf3I(P?|+IV1)H2F zBUs&d?83sZY!}qb5o=UyHd%@^Kr6<;$Wh>uvF9j%90ON8FU;KpPcqi`eAej@?P#WM zIMWr?g&vb0hMB8YoGX!Xwla}95;m+dOt62^-*ji*o@gPZZ2GJ|&FlI~iOk)CoKD$o 
zDsrbd8n(ew>#p=wUz@CUlaMB&5GRcZA1*tn^E}qM##uO=Eb5aynu`VaSvPx)iniD^ z*x1_oZ1?n4vd=eSS@W2K_(Bf1jCESEs3c(Pu<;13TrhMACRmsdsFt!Ye!%TlTYZ1g z3!!ml&q3wYNSsQ7(K<9XApkAK#RiBRd_*%_R5A$jF>=Cg55L6)rKt*nuEH-cATzY6 z1P@AAAtlt#m=yh}T^0f$dSa8=hMVNX5J@DIKCHeF!WPSWLt~J*0kTBrGfF`K4$&q> z&vD%~B330sCdZay_ZJbguOPAsfJ1)<&xS}gfbd$f@amd)@EzNdy^?ihSfpAz+AnS`9DoG> delta 730 zcmV<00ww+W1^)#OLRx4!F+o`-Q&}RW3ipu?9yO(@n@o%})ih|-)Yy^f14oH5003w` zG@3m}i46c6YG@Nc)K4luDYY7m(mg{)o}kmz(WZcrT^WA}O;gf(nW3c7qY<=>5s9GF zPgBt}Y3egc9jl z^zGW$K1u^0%+)-uuug0c-U&iV^GJ*OK}M2ZbO;!dJgpE6QiwuH9Ull1qY!>N`z^`W zl2c9hc0zw8dT%?bDN89xG7clDjeG<1xxuD@`oB&Rj17PfgF6>+;0JQ=mv^UEOxQ4X zk(eGZ85ftyq8Kk%7aw@b%l4NFkk#`Y=A;99hbV zrFP12J#}eQwL7}x%@s1b@WoasI!9SF1kgeR%g&?i&s82bj>?>3lZnF2sWWp?T(;{b z?QyYNZG%mXjjo?YzIxj%c>Ge$n7ke2Aj9?VGv7vrkmTZ2)u#`B$9`1yg}i$$A&>SPX$X&RL~;iWKK-HG)<1l6&!;GWn%yc%I3L@5?IV|al5JDFhJ)tPXLcE z7`=k0D#VwbvC!7kAQn)?Su5#&e#BU=x$E5(fvDrX9h>@YGZ6;VvqfOO{%`F7gZ>IG MCLRx4!F+o`-Q(5W9V5gA|9Dh)@SZZlZLrj6701W^QGzNeT21bnp zO(7m6^qWu`00000000J>4l-oXh-smKni!fi$%q*kO&V#088DJUYB8yl^wU#Q)cqs| zfHtE{(guyGrjI6|@`ozgnxiana%3x^z zu{_qkG%2q(TcK^2Pd!m$08p6NX;Zs^#D6*yrxHxCtSQB$*v(*g2nI2bshb0MrBowPi-Jb`8*!Nj;IGy^OFmU$2Y z%R2AJe$63fGR#4FTU&n@UULE6ptmlZuZT!H$j?r^-S!Eyvx7Lnf$xS#Ry2ojgI#TS zD7G((H_1qAn#?ymkl{NFbQL03#006Nijvrfm-6;_;g SKX(KR_`8xR!i0xTI|Vp4v*B?7 delta 533 zcmV+w0_y#s1f>KHLRx4!F+o`-Q(4l0#io%C9Df4T4MJ#!jSVz3GH3t+ri~36001-w zr=*iK(`t>XW}`p=01SXK0MHLp)Es2VqY%?W0W>i*X_F8#Fq$;e2r^+Lgw$;lDB7o~ z>SwAA4^g0PMwzI{$+a^{y){E+4pp@kMx1eSWH2MShA85 zKOs`4s?(Myoy(p|YfG%oucxDysIdSfn18s?r*`3pbSF+JOt7pe#f0P8Kp;8+Fxq#M zj4DPwlA^WIZ{&R@RS+jKER~qnzhe@-gyvN(+DZr3CN1#U;)G4=w^@*IRe_DX zTY<5;4^9yu15=L^E;ii(;)Ok&#yUVW_)5B9SZ%K@smt`}7=+W(`y>`T(Ry)xZ+@4$ z4rm=2KsB}3`CJVUD<~?U3<(8UiaZ$*g#^OOy1EGg7zT(Li`IQ2fQuyn&;$vZ*)Ju) zU=u6@Rojz3?5&l8$T9%bV(OY85H2@P^iq 0, "#00a65a", pcg_report$metadata$color_none), icon = "fa-file-prescription") @@ -40,7 +40,7 @@ flexdashboard::valueBox(entries_diagnostic, color = ifelse(entries_diagnostic > ### Prognostic evidence 
items ```{r} -entries_prognostic <- NROW(pcg_report$content$cna$clin_eitem$specific_ttype$prognostic$A_B) +entries_prognostic <- NROW(pcg_report$content$cna$clin_eitem$query_ttype$prognostic$A_B) flexdashboard::valueBox(entries_prognostic, color = ifelse(entries_prognostic > 0, "#00a65a", pcg_report$metadata$color_none), icon = "fa-file-prescription") @@ -49,7 +49,7 @@ flexdashboard::valueBox(entries_prognostic, color = ifelse(entries_prognostic > ### Predictive evidence items ```{r} -entries_predictive <- NROW(pcg_report$content$cna$clin_eitem$specific_ttype$predictive$A_B) +entries_predictive <- NROW(pcg_report$content$cna$clin_eitem$query_ttype$predictive$A_B) flexdashboard::valueBox(entries_predictive, color = ifelse(entries_predictive > 0, "#00a65a", pcg_report$metadata$color_none), icon = "fa-file-prescription") @@ -69,9 +69,9 @@ Row cat('\nEvidence items associated with variants in tier 1 (right panel) can be filtered according to various criteria:\n') variants_tier1 <- dplyr::bind_rows( - pcg_report[['content']][['cna']][['clin_eitem']][['specific_ttype']][['diagnostic']][['A_B']], - pcg_report[['content']][['cna']][['clin_eitem']][['specific_ttype']][['predictive']][['A_B']], - pcg_report[['content']][['cna']][['clin_eitem']][['specific_ttype']][['prognostic']][['A_B']]) + pcg_report[['content']][['cna']][['clin_eitem']][['query_ttype']][['diagnostic']][['A_B']], + pcg_report[['content']][['cna']][['clin_eitem']][['query_ttype']][['predictive']][['A_B']], + pcg_report[['content']][['cna']][['clin_eitem']][['query_ttype']][['prognostic']][['A_B']]) variants_tier1_shared <- crosstalk::SharedData$new(variants_tier1) crosstalk::bscols( diff --git a/pcgrr/inst/templates/pcgr_flexdb/flexdb_scna_tier2.Rmd b/pcgrr/inst/templates/pcgr_flexdb/flexdb_scna_tier2.Rmd index d826ba35..6220c650 100755 --- a/pcgrr/inst/templates/pcgr_flexdb/flexdb_scna_tier2.Rmd +++ b/pcgrr/inst/templates/pcgr_flexdb/flexdb_scna_tier2.Rmd @@ -31,7 +31,7 @@ 
flexdashboard::valueBox(length(unique(pcg_report$content$cna$disp$tier2$SEGMENT) ```{r} entries_diagnostic <- NROW(pcg_report$content$cna$clin_eitem$other_ttype$diagnostic$A_B) + - NROW(pcg_report$content$cna$clin_eitem$specific_ttype$diagnostic$C_D_E) + NROW(pcg_report$content$cna$clin_eitem$query_ttype$diagnostic$C_D_E) flexdashboard::valueBox(entries_diagnostic, color = ifelse(entries_diagnostic > 0, pcg_report[['metadata']][['color_palette']][['success']][['values']][1], pcg_report$metadata$color_none), icon = "fa-file-prescription") @@ -41,7 +41,7 @@ flexdashboard::valueBox(entries_diagnostic, color = ifelse(entries_diagnostic > ```{r} entries_prognostic <- NROW(pcg_report$content$cna$clin_eitem$other_ttype$prognostic$A_B) + - NROW(pcg_report$content$cna$clin_eitem$specific_ttype$prognosticc$C_D_E) + NROW(pcg_report$content$cna$clin_eitem$query_ttype$prognosticc$C_D_E) flexdashboard::valueBox(entries_prognostic, color = ifelse(entries_prognostic > 0, pcg_report[['metadata']][['color_palette']][['success']][['values']][1], pcg_report$metadata$color_none), icon = "fa-file-prescription") @@ -51,7 +51,7 @@ flexdashboard::valueBox(entries_prognostic, color = ifelse(entries_prognostic > ```{r} entries_predictive <- NROW(pcg_report$content$cna$clin_eitem$other_ttype$predictive$A_B) + - NROW(pcg_report$content$cna$clin_eitem$specific_ttype$predictive$C_D_E) + NROW(pcg_report$content$cna$clin_eitem$query_ttype$predictive$C_D_E) flexdashboard::valueBox(entries_predictive, color = ifelse(entries_predictive > 0, "#00a65a", pcg_report$metadata$color_none), icon = "fa-file-prescription") @@ -74,9 +74,9 @@ variants_tier2 <- dplyr::bind_rows( pcg_report[['content']][['cna']][['clin_eitem']][['other_ttype']][['diagnostic']][['A_B']], pcg_report[['content']][['cna']][['clin_eitem']][['other_ttype']][['predictive']][['A_B']], pcg_report[['content']][['cna']][['clin_eitem']][['other_ttype']][['prognostic']][['A_B']], - 
pcg_report[['content']][['cna']][['clin_eitem']][['specific_ttype']][['diagnostic']][['C_D_E']], - pcg_report[['content']][['cna']][['clin_eitem']][['specific_ttype']][['predictive']][['C_D_E']], - pcg_report[['content']][['cna']][['clin_eitem']][['specific_ttype']][['prognostic']][['C_D_E']]) |> + pcg_report[['content']][['cna']][['clin_eitem']][['query_ttype']][['diagnostic']][['C_D_E']], + pcg_report[['content']][['cna']][['clin_eitem']][['query_ttype']][['predictive']][['C_D_E']], + pcg_report[['content']][['cna']][['clin_eitem']][['query_ttype']][['prognostic']][['C_D_E']]) |> dplyr::arrange(EVIDENCE_LEVEL, RATING) diff --git a/pcgrr/inst/templates/pcgr_flexdb/flexdb_snv_tier1.Rmd b/pcgrr/inst/templates/pcgr_flexdb/flexdb_snv_tier1.Rmd index 9a1368b6..9b66cb86 100755 --- a/pcgrr/inst/templates/pcgr_flexdb/flexdb_snv_tier1.Rmd +++ b/pcgrr/inst/templates/pcgr_flexdb/flexdb_snv_tier1.Rmd @@ -14,9 +14,9 @@ flexdashboard::valueBox("TIER 1", caption = NULL, color = '#000000', icon = NULL ```{r} -tier1_evidence_items <- pcg_report$content$snv_indel$clin_eitem$specific_ttype$diagnostic$A_B |> - dplyr::bind_rows(pcg_report$content$snv_indel$clin_eitem$specific_ttype$prognostic$A_B) |> - dplyr::bind_rows(pcg_report$content$snv_indel$clin_eitem$specific_ttype$predictive$A_B) +tier1_evidence_items <- pcg_report$content$snv_indel$clin_eitem$query_ttype$diagnostic$A_B |> + dplyr::bind_rows(pcg_report$content$snv_indel$clin_eitem$query_ttype$prognostic$A_B) |> + dplyr::bind_rows(pcg_report$content$snv_indel$clin_eitem$query_ttype$predictive$A_B) flexdashboard::valueBox(length(unique(tier1_evidence_items$SYMBOL)), color = pcg_report$metadata$color_value_box) @@ -34,7 +34,7 @@ flexdashboard::valueBox(pcg_report$content$snv_indel$v_stat$n_tier1, ### Diagnostic evidence items ```{r} -entries_diagnostic <- NROW(pcg_report$content$snv_indel$clin_eitem$specific_ttype$diagnostic$A_B) +entries_diagnostic <- NROW(pcg_report$content$snv_indel$clin_eitem$query_ttype$diagnostic$A_B) 
flexdashboard::valueBox(entries_diagnostic, color = ifelse(entries_diagnostic > 0, pcg_report[['metadata']][['color_palette']][['success']][['values']][1], pcg_report$metadata$color_none), icon = "fa-file-prescription") @@ -43,7 +43,7 @@ flexdashboard::valueBox(entries_diagnostic, color = ifelse(entries_diagnostic > ### Prognostic evidence items ```{r} -entries_prognostic <- NROW(pcg_report$content$snv_indel$clin_eitem$specific_ttype$prognostic$A_B) +entries_prognostic <- NROW(pcg_report$content$snv_indel$clin_eitem$query_ttype$prognostic$A_B) flexdashboard::valueBox(entries_prognostic, color = ifelse(entries_prognostic > 0, pcg_report[['metadata']][['color_palette']][['success']][['values']][1], pcg_report$metadata$color_none), icon = "fa-file-prescription") @@ -52,7 +52,7 @@ flexdashboard::valueBox(entries_prognostic, color = ifelse(entries_prognostic > ### Predictive evidence items ```{r} -entries_predictive <- NROW(pcg_report$content$snv_indel$clin_eitem$specific_ttype$predictive$A_B) +entries_predictive <- NROW(pcg_report$content$snv_indel$clin_eitem$query_ttype$predictive$A_B) flexdashboard::valueBox(entries_predictive, color = ifelse(entries_predictive > 0, pcg_report[['metadata']][['color_palette']][['success']][['values']][1], pcg_report$metadata$color_none), icon = "fa-file-prescription") @@ -71,9 +71,9 @@ Row cat('\nEvidence items associated with variants in tier 1 (right panel) can be filtered according to various criteria:\n') variants_tier1 <- dplyr::bind_rows( - pcg_report[['content']][['snv_indel']][['clin_eitem']][['specific_ttype']][['diagnostic']][['A_B']], - pcg_report[['content']][['snv_indel']][['clin_eitem']][['specific_ttype']][['predictive']][['A_B']], - pcg_report[['content']][['snv_indel']][['clin_eitem']][['specific_ttype']][['prognostic']][['A_B']]) + pcg_report[['content']][['snv_indel']][['clin_eitem']][['query_ttype']][['diagnostic']][['A_B']], + 
pcg_report[['content']][['snv_indel']][['clin_eitem']][['query_ttype']][['predictive']][['A_B']], + pcg_report[['content']][['snv_indel']][['clin_eitem']][['query_ttype']][['prognostic']][['A_B']]) variants_tier1_shared <- crosstalk::SharedData$new(variants_tier1) crosstalk::bscols( diff --git a/pcgrr/inst/templates/pcgr_flexdb/flexdb_snv_tier2.Rmd b/pcgrr/inst/templates/pcgr_flexdb/flexdb_snv_tier2.Rmd index 96c8dc6b..325b2534 100755 --- a/pcgrr/inst/templates/pcgr_flexdb/flexdb_snv_tier2.Rmd +++ b/pcgrr/inst/templates/pcgr_flexdb/flexdb_snv_tier2.Rmd @@ -16,11 +16,11 @@ flexdashboard::valueBox("TIER 2", caption = NULL, color = '#000000', icon = NULL ```{r} # flexdashboard::valueBox(stringr::str_replace_all(pcg_report$content$value_box$tier2,"Tier 2 variants:\n",""), color = pcg_report$metadata$color_value_box, icon = "fa-dna") -tier2_evidence_items <- dplyr::bind_rows(pcg_report$content$snv_indel$clin_eitem$specific_ttype$diagnostic$C_D_E, +tier2_evidence_items <- dplyr::bind_rows(pcg_report$content$snv_indel$clin_eitem$query_ttype$diagnostic$C_D_E, pcg_report$content$snv_indel$clin_eitem$other_ttype$diagnostic$A_B, - pcg_report$content$snv_indel$clin_eitem$specific_ttype$prognostic$C_D_E, + pcg_report$content$snv_indel$clin_eitem$query_ttype$prognostic$C_D_E, pcg_report$content$snv_indel$clin_eitem$other_ttype$prognostic$A_B, - pcg_report$content$snv_indel$clin_eitem$specific_ttype$predictive$C_D_E, + pcg_report$content$snv_indel$clin_eitem$query_ttype$predictive$C_D_E, pcg_report$content$snv_indel$clin_eitem$other_ttype$predictive$A_B) @@ -39,7 +39,7 @@ flexdashboard::valueBox(pcg_report$content$snv_indel$v_stat$n_tier2, color = pcg ### Diagnostic evidence items ```{r} -entries_diagnostic <- NROW(pcg_report$content$snv_indel$clin_eitem$specific_ttype$diagnostic$C_D_E) + +entries_diagnostic <- NROW(pcg_report$content$snv_indel$clin_eitem$query_ttype$diagnostic$C_D_E) + NROW(pcg_report$content$snv_indel$clin_eitem$other_ttype$diagnostic$A_B) 
flexdashboard::valueBox(entries_diagnostic, color = ifelse(entries_diagnostic > 0, "#00a65a", @@ -49,7 +49,7 @@ flexdashboard::valueBox(entries_diagnostic, color = ifelse(entries_diagnostic > ### Prognostic evidence items ```{r} -entries_prognostic <- NROW(pcg_report$content$snv_indel$clin_eitem$specific_ttype$prognostic$C_D_E) + +entries_prognostic <- NROW(pcg_report$content$snv_indel$clin_eitem$query_ttype$prognostic$C_D_E) + NROW(pcg_report$content$snv_indel$clin_eitem$other_ttype$prognostic$A_B) flexdashboard::valueBox(entries_prognostic, color = ifelse(entries_prognostic > 0, "#00a65a", @@ -59,7 +59,7 @@ flexdashboard::valueBox(entries_prognostic, color = ifelse(entries_prognostic > ### Predictive evidence items ```{r} -entries_predictive <- NROW(pcg_report$content$snv_indel$clin_eitem$specific_ttype$predictive$C_D_E) + +entries_predictive <- NROW(pcg_report$content$snv_indel$clin_eitem$query_ttype$predictive$C_D_E) + NROW(pcg_report$content$snv_indel$clin_eitem$other_ttype$predictive$A_B) flexdashboard::valueBox(entries_predictive, color = ifelse(entries_predictive > 0, "#00a65a", @@ -79,9 +79,9 @@ Row cat('\nEvidence items associated with variants in TIER 2 (right panel) can be interactively explored according to various criteria\n:') variants_tier2 <- dplyr::bind_rows( - pcg_report[['content']][['snv_indel']][['clin_eitem']][['specific_ttype']][['diagnostic']][['C_D_E']], - pcg_report[['content']][['snv_indel']][['clin_eitem']][['specific_ttype']][['predictive']][['C_D_E']], - pcg_report[['content']][['snv_indel']][['clin_eitem']][['specific_ttype']][['diagnostic']][['C_D_E']], + pcg_report[['content']][['snv_indel']][['clin_eitem']][['query_ttype']][['diagnostic']][['C_D_E']], + pcg_report[['content']][['snv_indel']][['clin_eitem']][['query_ttype']][['predictive']][['C_D_E']], + pcg_report[['content']][['snv_indel']][['clin_eitem']][['query_ttype']][['prognostic']][['C_D_E']], 
pcg_report[['content']][['snv_indel']][['clin_eitem']][['other_ttype']][['diagnostic']][['A_B']], pcg_report[['content']][['snv_indel']][['clin_eitem']][['other_ttype']][['predictive']][['A_B']], pcg_report[['content']][['snv_indel']][['clin_eitem']][['other_ttype']][['prognostic']][['A_B']]) |> diff --git a/pcgrr/inst/templates/pcgr_rmarkdown/cna_biomarkers.Rmd b/pcgrr/inst/templates/pcgr_rmarkdown/cna_biomarkers.Rmd index 45371e4b..21675328 100644 --- a/pcgrr/inst/templates/pcgr_rmarkdown/cna_biomarkers.Rmd +++ b/pcgrr/inst/templates/pcgr_rmarkdown/cna_biomarkers.Rmd @@ -3,10 +3,10 @@ A total of __`r NROW(pcg_report[['content']][['cna']][["disp"]][['tier1']]) + NROW(pcg_report[['content']][['cna']][["disp"]][['tier2']])`__ aberrations are associated with clinical evidence items in the [database for clinical interpretations of variants in cancer, CIViC](https://civic.genome.wustl.edu/#/home), with the following number of evidence items: - * Predictive: __`r NROW(pcg_report[['content']][['cna']][['clin_eitem']][['specific_ttype']][['predictive']][['A_B']]) + NROW(pcg_report[['content']][['cna']][['clin_eitem']][['other_ttype']][['predictive']][['A_B']]) + NROW(pcg_report[['content']][['cna']][['clin_eitem']][['specific_ttype']][['predictive']][['C_D_E']])`__ evidence items - * Prognostic: __`r NROW(pcg_report[['content']][['cna']][['clin_eitem']][['specific_ttype']][['prognostic']][['A_B']]) + NROW(pcg_report[['content']][['cna']][['clin_eitem']][['other_ttype']][['prognostic']][['A_B']]) + NROW(pcg_report[['content']][['cna']][['clin_eitem']][['specific_ttype']][['prognostic']][['C_D_E']])`__ evidence items - * Diagnostic: __`r NROW(pcg_report[['content']][['cna']][['clin_eitem']][['specific_ttype']][['diagnostic']][['A_B']]) + NROW(pcg_report[['content']][['cna']][['clin_eitem']][['other_ttype']][['diagnostic']][['A_B']]) + -NROW(pcg_report[['content']][['cna']][['clin_eitem']][['specific_ttype']][['diagnostic']][['C_D_E']])`__ evidence items + * Predictive: __`r 
NROW(pcg_report[['content']][['cna']][['clin_eitem']][['query_ttype']][['predictive']][['A_B']]) + NROW(pcg_report[['content']][['cna']][['clin_eitem']][['other_ttype']][['predictive']][['A_B']]) + NROW(pcg_report[['content']][['cna']][['clin_eitem']][['query_ttype']][['predictive']][['C_D_E']])`__ evidence items + * Prognostic: __`r NROW(pcg_report[['content']][['cna']][['clin_eitem']][['query_ttype']][['prognostic']][['A_B']]) + NROW(pcg_report[['content']][['cna']][['clin_eitem']][['other_ttype']][['prognostic']][['A_B']]) + NROW(pcg_report[['content']][['cna']][['clin_eitem']][['query_ttype']][['prognostic']][['C_D_E']])`__ evidence items + * Diagnostic: __`r NROW(pcg_report[['content']][['cna']][['clin_eitem']][['query_ttype']][['diagnostic']][['A_B']]) + NROW(pcg_report[['content']][['cna']][['clin_eitem']][['other_ttype']][['diagnostic']][['A_B']]) + +NROW(pcg_report[['content']][['cna']][['clin_eitem']][['query_ttype']][['diagnostic']][['C_D_E']])`__ evidence items

@@ -23,17 +23,17 @@ for(sig in c('strong_significance','potential_significance')){ show_cna_filters[[sig]][[type]] <- F missing_cna_variants[[sig]][[type]] <- T cna_evidence_items[[sig]][[type]] <- data.frame() - if(sig == 'strong_significance' & NROW(pcg_report[['content']][['cna']][['clin_eitem']][['specific_ttype']][[type]][['A_B']]) > 0){ + if(sig == 'strong_significance' & NROW(pcg_report[['content']][['cna']][['clin_eitem']][['query_ttype']][[type]][['A_B']]) > 0){ show_cna_filters[[sig]][[type]] <- T missing_cna_variants[[sig]][[type]] <- F - cna_evidence_items[[sig]][[type]] <- pcg_report[['content']][['cna']][['clin_eitem']][['specific_ttype']][[type]][['A_B']] + cna_evidence_items[[sig]][[type]] <- pcg_report[['content']][['cna']][['clin_eitem']][['query_ttype']][[type]][['A_B']] } - if(sig == 'potential_significance' & (NROW(pcg_report[['content']][['cna']][['clin_eitem']][['specific_ttype']][[type]][['C_D_E']]) > 0 | NROW(pcg_report[['content']][['cna']][['clin_eitem']][['other_ttype']][[type]][['A_B']]) > 0)){ + if(sig == 'potential_significance' & (NROW(pcg_report[['content']][['cna']][['clin_eitem']][['query_ttype']][[type]][['C_D_E']]) > 0 | NROW(pcg_report[['content']][['cna']][['clin_eitem']][['other_ttype']][[type]][['A_B']]) > 0)){ show_cna_filters[[sig]][[type]] <- T missing_cna_variants[[sig]][[type]] <- F cna_evidence_items[[sig]][[type]] <- - dplyr::bind_rows(pcg_report[['content']][['cna']][['clin_eitem']][['other_ttype']][[type]][['A_B']], pcg_report[['content']][['cna']][['clin_eitem']][['specific_ttype']][[type]][['C_D_E']]) + dplyr::bind_rows(pcg_report[['content']][['cna']][['clin_eitem']][['other_ttype']][[type]][['A_B']], pcg_report[['content']][['cna']][['clin_eitem']][['query_ttype']][[type]][['C_D_E']]) } } } diff --git a/pcgrr/inst/templates/pcgr_rmarkdown/snv_tier1.Rmd b/pcgrr/inst/templates/pcgr_rmarkdown/snv_tier1.Rmd index b892d41b..eb843213 100644 --- a/pcgrr/inst/templates/pcgr_rmarkdown/snv_tier1.Rmd +++ 
b/pcgrr/inst/templates/pcgr_rmarkdown/snv_tier1.Rmd @@ -2,9 +2,9 @@ ### Tier 1 - Variants of strong clinical significance {.tabset} - Considering evidence items with strongest evidence levels (A & B) for the query tumor type (_`r pcg_report[['metadata']][['config']][["t_props"]][['tumor_type']]`_) in the [database for clinical interpretations of variants in cancer, CIViC](https://civic.genome.wustl.edu/#/home) or [Cancer Biomarkers database](https://www.cancergenomeinterpreter.org/biomarkers), a total of __`r NROW(pcg_report[['content']][['snv_indel']][["disp"]][['tier1']])`__ unique, somatic variants were found, with the following number of evidence items: - * Tier 1 - Predictive/Therapeutic: __`r NROW(pcg_report[['content']][['snv_indel']][['clin_eitem']][['specific_ttype']][['predictive']][['A_B']])`__ evidence items - * Tier 1 - Prognostic: __`r NROW(pcg_report[['content']][['snv_indel']][['clin_eitem']][['specific_ttype']][['prognostic']][['A_B']])`__ evidence items - * Tier 1 - Diagnostic: __`r NROW(pcg_report[['content']][['snv_indel']][['clin_eitem']][['specific_ttype']][['diagnostic']][['A_B']])`__ evidence items + * Tier 1 - Predictive/Therapeutic: __`r NROW(pcg_report[['content']][['snv_indel']][['clin_eitem']][['query_ttype']][['predictive']][['A_B']])`__ evidence items + * Tier 1 - Prognostic: __`r NROW(pcg_report[['content']][['snv_indel']][['clin_eitem']][['query_ttype']][['prognostic']][['A_B']])`__ evidence items + * Tier 1 - Diagnostic: __`r NROW(pcg_report[['content']][['snv_indel']][['clin_eitem']][['query_ttype']][['diagnostic']][['A_B']])`__ evidence items

@@ -28,7 +28,7 @@ missing_tier1_items <- list() for(type in c('diagnostic','prognostic','predictive')){ show_tier1_filters[[type]] <- F missing_tier1_items[[type]] <- T - if(NROW(pcg_report[['content']][['snv_indel']][['clin_eitem']][['specific_ttype']][[type]][['A_B']]) > 0){ + if(NROW(pcg_report[['content']][['snv_indel']][['clin_eitem']][['query_ttype']][[type]][['A_B']]) > 0){ show_tier1_filters[[type]] <- T missing_tier1_items[[type]] <- F } @@ -55,7 +55,7 @@ if(missing_tier1_items[['predictive']] == F | ```{r tier1_predictive, echo=F, results = 'asis', eval = show_tier1_filters[['predictive']]} -variants_tier1_predictive_shared <- crosstalk::SharedData$new(pcg_report[['content']][['snv_indel']][['clin_eitem']][['specific_ttype']][['predictive']][['A_B']]) +variants_tier1_predictive_shared <- crosstalk::SharedData$new(pcg_report[['content']][['snv_indel']][['clin_eitem']][['query_ttype']][['predictive']][['A_B']]) crosstalk::bscols( list( crosstalk::filter_select("CANCER_TYPE", "Cancer type", variants_tier1_predictive_shared, ~CANCER_TYPE), @@ -116,7 +116,7 @@ if(missing_tier1_items[['prognostic']] == F & ```{r tier1_prognostic, echo=F, results='asis', eval = show_tier1_filters[['prognostic']]} -variants_tier1_prognostic_shared <- crosstalk::SharedData$new(pcg_report[['content']][['snv_indel']][['clin_eitem']][['specific_ttype']][['prognostic']][['A_B']]) +variants_tier1_prognostic_shared <- crosstalk::SharedData$new(pcg_report[['content']][['snv_indel']][['clin_eitem']][['query_ttype']][['prognostic']][['A_B']]) crosstalk::bscols( list( crosstalk::filter_select("CONSEQUENCE", "Consequence", variants_tier1_prognostic_shared, ~CONSEQUENCE), @@ -175,7 +175,7 @@ if(missing_tier1_items[['diagnostic']] == F & ```{r tier1_diagnostic, echo=F, results='asis', eval = show_tier1_filters[['diagnostic']]} -variants_tier1_diagnostic_shared <- crosstalk::SharedData$new(pcg_report[['content']][['snv_indel']][['clin_eitem']][['specific_ttype']][['diagnostic']][['A_B']]) 
+variants_tier1_diagnostic_shared <- crosstalk::SharedData$new(pcg_report[['content']][['snv_indel']][['clin_eitem']][['query_ttype']][['diagnostic']][['A_B']]) crosstalk::bscols( list( crosstalk::filter_select("CONSEQUENCE", "Consequence", variants_tier1_diagnostic_shared, ~CONSEQUENCE), diff --git a/pcgrr/inst/templates/pcgr_rmarkdown/snv_tier2.Rmd b/pcgrr/inst/templates/pcgr_rmarkdown/snv_tier2.Rmd index 0109a3d9..73029250 100644 --- a/pcgrr/inst/templates/pcgr_rmarkdown/snv_tier2.Rmd +++ b/pcgrr/inst/templates/pcgr_rmarkdown/snv_tier2.Rmd @@ -2,9 +2,9 @@ ### Tier 2 - Variants of potential clinical significance {.tabset} - Tier 2 considers evidence items of _i)_ strong evidence levels (A & B) in other tumor types, and _ii)_ weak evidence levels (C, D & E) in the query tumor type (_`r pcg_report[['metadata']][['config']][["t_props"]][['tumor_type']]`_). Using the [database for clinical interpretations of variants in cancer (CIViC) ](https://civic.genome.wustl.edu/#/home) and [Cancer Biomarkers database](https://www.cancergenomeinterpreter.org/biomarkers), a total of __`r NROW(pcg_report[['content']][['snv_indel']][["disp"]][['tier2']])`__ unique, somatic variants were found in the tumor sample: - * Tier 2 - Predictive/Therapeutic: __`r NROW(pcg_report[['content']][['snv_indel']][['clin_eitem']][['other_ttype']][['predictive']][['A_B']]) + NROW(pcg_report[['content']][['snv_indel']][['clin_eitem']][['specific_ttype']][['predictive']][['C_D_E']])`__ evidence items - * Tier 2 - Prognostic: __`r NROW(pcg_report[['content']][['snv_indel']][['clin_eitem']][['other_ttype']][['prognostic']][['A_B']]) + NROW(pcg_report[['content']][['snv_indel']][['clin_eitem']][['specific_ttype']][['prognostic']][['C_D_E']])`__ evidence items - * Tier 2 - Diagnostic: __`r NROW(pcg_report[['content']][['snv_indel']][['clin_eitem']][['other_ttype']][['diagnostic']][['A_B']]) + NROW(pcg_report[['content']][['snv_indel']][['clin_eitem']][['specific_ttype']][['diagnostic']][['C_D_E']])`__ 
evidence items + * Tier 2 - Predictive/Therapeutic: __`r NROW(pcg_report[['content']][['snv_indel']][['clin_eitem']][['other_ttype']][['predictive']][['A_B']]) + NROW(pcg_report[['content']][['snv_indel']][['clin_eitem']][['query_ttype']][['predictive']][['C_D_E']])`__ evidence items + * Tier 2 - Prognostic: __`r NROW(pcg_report[['content']][['snv_indel']][['clin_eitem']][['other_ttype']][['prognostic']][['A_B']]) + NROW(pcg_report[['content']][['snv_indel']][['clin_eitem']][['query_ttype']][['prognostic']][['C_D_E']])`__ evidence items + * Tier 2 - Diagnostic: __`r NROW(pcg_report[['content']][['snv_indel']][['clin_eitem']][['other_ttype']][['diagnostic']][['A_B']]) + NROW(pcg_report[['content']][['snv_indel']][['clin_eitem']][['query_ttype']][['diagnostic']][['C_D_E']])`__ evidence items

@@ -29,8 +29,8 @@ for(type in c('diagnostic','prognostic','predictive')){ show_tier2_filters[[type]] <- F missing_tier2_items[[type]] <- T eitems[[type]] <- data.frame() - if(NROW(pcg_report[['content']][['snv_indel']][['clin_eitem']][['other_ttype']][[type]][['A_B']]) > 0 | NROW(pcg_report[['content']][['snv_indel']][['clin_eitem']][['specific_ttype']][[type]][['C_D_E']]) > 0){ - eitems[[type]] <- dplyr::bind_rows(pcg_report[['content']][['snv_indel']][['clin_eitem']][['other_ttype']][[type]][['A_B']], pcg_report[['content']][['snv_indel']][['clin_eitem']][['specific_ttype']][[type]][['C_D_E']]) |> + if(NROW(pcg_report[['content']][['snv_indel']][['clin_eitem']][['other_ttype']][[type]][['A_B']]) > 0 | NROW(pcg_report[['content']][['snv_indel']][['clin_eitem']][['query_ttype']][[type]][['C_D_E']]) > 0){ + eitems[[type]] <- dplyr::bind_rows(pcg_report[['content']][['snv_indel']][['clin_eitem']][['other_ttype']][[type]][['A_B']], pcg_report[['content']][['snv_indel']][['clin_eitem']][['query_ttype']][[type]][['C_D_E']]) |> dplyr::arrange(desc(RATING)) show_tier2_filters[[type]] <- T missing_tier2_items[[type]] <- F diff --git a/pcgrr/man/append_cancer_gene_evidence.Rd b/pcgrr/man/append_cancer_gene_evidence.Rd index cb61d000..6c87e938 100644 --- a/pcgrr/man/append_cancer_gene_evidence.Rd +++ b/pcgrr/man/append_cancer_gene_evidence.Rd @@ -4,12 +4,21 @@ \alias{append_cancer_gene_evidence} \title{Function that appends cancer gene evidence links} \usage{ -append_cancer_gene_evidence(vcf_data_df = NULL, ref_data = NULL) +append_cancer_gene_evidence( + vcf_data_df = NULL, + ref_data = NULL, + site = "Any", + pos_var = "POS" +) } \arguments{ \item{vcf_data_df}{Data frame of sample variants from VCF} \item{ref_data}{PCGR reference data bundle object} + +\item{site}{Primary tumor site} + +\item{pos_var}{variable reflecting chromosome order (POS/SEGMENT_START)} } \value{ vcf_data_df diff --git a/pcgrr/man/assign_acmg_tiers.Rd b/pcgrr/man/assign_acmg_tiers.Rd new file mode 
100644 index 00000000..6b0e10d2 --- /dev/null +++ b/pcgrr/man/assign_acmg_tiers.Rd @@ -0,0 +1,29 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/acmg.R +\name{assign_acmg_tiers} +\alias{assign_acmg_tiers} +\title{Function that assigns tier classifications to somatic CNA segments and +SNVs/InDels, based on the presence of biomarker evidence found in +the variant set} +\usage{ +assign_acmg_tiers( + vartype = "snv_indel", + primary_site = "Any", + variants_df = NULL, + biomarker_items = NULL +) +} +\arguments{ +\item{vartype}{variant type ('snv_indel' or 'cna')} + +\item{primary_site}{primary tumor site} + +\item{variants_df}{data frame with variants (SNVs/InDels or CNAs)} + +\item{biomarker_items}{data frame with biomarker evidence items} +} +\description{ +Function that assigns tier classifications to somatic CNA segments and +SNVs/InDels, based on the presence of biomarker evidence found in +the variant set +} diff --git a/pcgrr/man/assign_germline_popfreq_status.Rd b/pcgrr/man/assign_germline_popfreq_status.Rd index 8eaa2aca..dc454718 100644 --- a/pcgrr/man/assign_germline_popfreq_status.Rd +++ b/pcgrr/man/assign_germline_popfreq_status.Rd @@ -8,7 +8,7 @@ if any population frequency exceeds max_tolerated_af} \usage{ assign_germline_popfreq_status( sample_calls, - pop = "EUR", + pop = "NFE", dbquery = "gnomADe", max_tolerated_af = 0.01 ) diff --git a/pcgrr/man/assign_somatic_classification.Rd b/pcgrr/man/assign_somatic_classification.Rd index 2ae141f9..c0af6ed2 100644 --- a/pcgrr/man/assign_somatic_classification.Rd +++ b/pcgrr/man/assign_somatic_classification.Rd @@ -6,12 +6,12 @@ based on evidence found in variant set, potentially limited by user-defined options} \usage{ -assign_somatic_classification(sample_calls, config) +assign_somatic_classification(sample_calls, settings) } \arguments{ -\item{sample_calls}{data frame with variants} +\item{sample_calls}{data frame with putative somatic variants} -\item{config}{configuration 
object} +\item{settings}{PCGR configuration settings} } \value{ sample_calls diff --git a/pcgrr/man/assign_somatic_germline_evidence.Rd b/pcgrr/man/assign_somatic_germline_evidence.Rd index 705a2ec2..a94cb608 100644 --- a/pcgrr/man/assign_somatic_germline_evidence.Rd +++ b/pcgrr/man/assign_somatic_germline_evidence.Rd @@ -5,12 +5,12 @@ \title{Function that appends several tags denoting evidence for somatic/germline status of variants} \usage{ -assign_somatic_germline_evidence(sample_calls, config) +assign_somatic_germline_evidence(sample_calls, settings = NULL) } \arguments{ \item{sample_calls}{data frame with variants} -\item{config}{configuration object} +\item{settings}{PCGR configuration settings} } \value{ sample_calls diff --git a/pcgrr/man/generate_pcgr_report2.Rd b/pcgrr/man/generate_pcgr_report2.Rd new file mode 100644 index 00000000..84273f02 --- /dev/null +++ b/pcgrr/man/generate_pcgr_report2.Rd @@ -0,0 +1,14 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/main2.R +\name{generate_pcgr_report2} +\alias{generate_pcgr_report2} +\title{Function that generates all contents of the cancer genome report (PCGR)} +\usage{ +generate_pcgr_report2(yaml_fname = NULL) +} +\arguments{ +\item{yaml_fname}{Name of PCGR configuration file (yaml)} +} +\description{ +Function that generates all contents of the cancer genome report (PCGR) +} diff --git a/pcgrr/man/generate_report_data_msi.Rd b/pcgrr/man/generate_report_data_msi.Rd index bac9da71..1fd7da9e 100644 --- a/pcgrr/man/generate_report_data_msi.Rd +++ b/pcgrr/man/generate_report_data_msi.Rd @@ -4,16 +4,14 @@ \alias{generate_report_data_msi} \title{Function that generates MSI prediction data for PCGR report} \usage{ -generate_report_data_msi(sample_calls, pcgr_data, sample_name, pcgr_config) +generate_report_data_msi(variant_set, ref_data = NULL, settings = NULL) } \arguments{ -\item{sample_calls}{variant calls subject to mutational signature analysis} +\item{variant_set}{variant calls 
subject to MSI classification} -\item{pcgr_data}{object with PCGR annotation data} +\item{ref_data}{PCGR reference data object} -\item{sample_name}{sample identifier} - -\item{pcgr_config}{Object with PCGR configuration parameters} +\item{settings}{PCGR run configuration settings} } \description{ Function that generates MSI prediction data for PCGR report diff --git a/pcgrr/man/generate_report_data_signatures_mp.Rd b/pcgrr/man/generate_report_data_signatures_mp.Rd index 07ecdbf6..36e27347 100644 --- a/pcgrr/man/generate_report_data_signatures_mp.Rd +++ b/pcgrr/man/generate_report_data_signatures_mp.Rd @@ -5,25 +5,17 @@ \title{Function that generates mutational signatures data for PCGR report} \usage{ generate_report_data_signatures_mp( - vcf_fname, - pcgr_data, - sample_name, - pcgr_config, - type_specific = T + callset_snv = NULL, + ref_data = NULL, + settings = NULL ) } \arguments{ -\item{vcf_fname}{VCF file processed with PCGR annotation pipeline - -possibly filtered for depth/allelic fraction} +\item{callset_snv}{Somatic callset (SNV)} -\item{pcgr_data}{object with PCGR annotation data} +\item{ref_data}{PCGR reference data object} -\item{sample_name}{sample identifier} - -\item{pcgr_config}{Object with PCGR configuration parameters} - -\item{type_specific}{logical indicating if all reference signatures are to be -included (F) rather than those known to be prevalent in the tumor (T)} +\item{settings}{PCGR configuration settings object} } \description{ Function that generates mutational signatures data for PCGR report diff --git a/pcgrr/man/generate_report_data_snv_indel.Rd b/pcgrr/man/generate_report_data_snv_indel.Rd index f076c749..79b56a02 100644 --- a/pcgrr/man/generate_report_data_snv_indel.Rd +++ b/pcgrr/man/generate_report_data_snv_indel.Rd @@ -5,27 +5,15 @@ \title{Function that generates tiered variant sets for SNVs/InDels} \usage{ generate_report_data_snv_indel( - sample_calls, - pcgr_data, - sample_name, - config, - callset = "somatic calls", - 
biomarker_mapping_stringency = 1, + pcg_report = NULL, + callset = NULL, tier_model = "pcgr_acmg" ) } \arguments{ -\item{sample_calls}{variant calls subject to mutational signature analysis} +\item{pcg_report}{PCGR report object} -\item{pcgr_data}{object with PCGR annotation data} - -\item{sample_name}{sample identifier} - -\item{config}{Object with PCGR configuration parameters} - -\item{callset}{type of calls} - -\item{biomarker_mapping_stringency}{quality level for biomarkers} +\item{callset}{Object with input calls (CNA, SNV/InDel)} \item{tier_model}{tier model (pcgr_acmg)} } diff --git a/pcgrr/man/generate_report_data_snv_indel2.Rd b/pcgrr/man/generate_report_data_snv_indel2.Rd new file mode 100644 index 00000000..ed7c8d19 --- /dev/null +++ b/pcgrr/man/generate_report_data_snv_indel2.Rd @@ -0,0 +1,25 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/main2.R +\name{generate_report_data_snv_indel2} +\alias{generate_report_data_snv_indel2} +\title{Function that generates tiered variant sets for SNVs/InDels} +\usage{ +generate_report_data_snv_indel2( + pcg_report = NULL, + callset = NULL, + tier_model = "pcgr_acmg" +) +} +\arguments{ +\item{pcg_report}{PCGR report object} + +\item{callset}{Object with input calls (CNA, SNV/InDel)} + +\item{tier_model}{tier model (pcgr_acmg)} +} +\value{ +pcg_report_data data frame with all report elements +} +\description{ +Function that generates tiered variant sets for SNVs/InDels +} diff --git a/pcgrr/man/generate_report_data_tumor_only.Rd b/pcgrr/man/generate_report_data_tumor_only.Rd index 4ea8c3d6..e966b78d 100644 --- a/pcgrr/man/generate_report_data_tumor_only.Rd +++ b/pcgrr/man/generate_report_data_tumor_only.Rd @@ -1,10 +1,16 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/main.R +% Please edit documentation in R/main.R, R/main2.R \name{generate_report_data_tumor_only} \alias{generate_report_data_tumor_only} \title{Function that generates germline-filtered 
callset and PCGR report statistics for a given tumor-only callsets} \usage{ +generate_report_data_tumor_only( + unfiltered_sample_calls, + sample_name, + pcgr_config +) + generate_report_data_tumor_only( unfiltered_sample_calls, sample_name, @@ -19,6 +25,9 @@ generate_report_data_tumor_only( \item{pcgr_config}{Object with PCGR configuration parameters} } \description{ +Function that generates germline-filtered callset and PCGR +report statistics for a given tumor-only callsets + Function that generates germline-filtered callset and PCGR report statistics for a given tumor-only callsets } diff --git a/pcgrr/man/generate_tier_tsv.Rd b/pcgrr/man/generate_tier_tsv.Rd index f218cd5a..60df86f9 100644 --- a/pcgrr/man/generate_tier_tsv.Rd +++ b/pcgrr/man/generate_tier_tsv.Rd @@ -1,9 +1,11 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/main.R +% Please edit documentation in R/main.R, R/main2.R \name{generate_tier_tsv} \alias{generate_tier_tsv} \title{Function that annotates CNV segment files} \usage{ +generate_tier_tsv(variant_set, config, annotation_tags, sample_name = "test") + generate_tier_tsv(variant_set, config, annotation_tags, sample_name = "test") } \arguments{ @@ -16,10 +18,21 @@ generate_tier_tsv(variant_set, config, annotation_tags, sample_name = "test") \item{sample_name}{Sample identifier} } \value{ +tsv_variants data frame with tier-annotated list of +variants for tab-separated output + tsv_variants data frame with tier-annotated list of variants for tab-separated output } \description{ +param cna_segments_tsv CNV file name with chromosomal log(2)-ratio segments +param pcgr_data object with PCGR annotation data +param sample_name sample identifier +param pcgr_config Object with PCGR configuration parameters +param oncotree Data frame with phenotype terms relevant for tumor type +param transcript_overlap_pct required aberration overlap fraction +(percent) for reported transcripts (default 100 percent) + param cna_segments_tsv 
CNV file name with chromosomal log(2)-ratio segments param pcgr_data object with PCGR annotation data param sample_name sample identifier @@ -297,7 +310,314 @@ if (tumor_type != "Cancer, NOS") \{ eitems = eitems_specific_tt) ## Assign putative TIER 1 variant set - pcg_report_cna[["clin_eitem"]][["specific_ttype"]] <- + pcg_report_cna[["clin_eitem"]][["query_ttype"]] <- + biomarker_hits_cna_specific$clin_eitem + pcg_report_cna[["variant_set"]][["tier1"]] <- + biomarker_hits_cna_specific$variant_set +\} + +pcg_report_cna[["eval"]] <- T +pcg_report_cna[["variant_set"]][["tsv"]] <- + cna_transcript_df_print +pcg_report_cna[["v_stat"]][["n_cna_gain"]] <- + n_cna_gain +pcg_report_cna[["v_stat"]][["n_cna_loss"]] <- + n_cna_loss +pcg_report_cna[["disp"]][["segment"]] <- + cna_segments_filtered +pcg_report_cna[["disp"]][["oncogene_gain"]] <- + onco_ts_sets[["oncogene_gain"]] +pcg_report_cna[["disp"]][["tsgene_loss"]] <- + onco_ts_sets[["tsgene_loss"]] +pcg_report_cna[["disp"]][["other_target"]] <- + onco_ts_sets[["other_target"]] + + +pcg_report_cna <- + pcgrr::assign_tier1_tier2_acmg_cna(pcg_report_cna) + +return(pcg_report_cna) +}\if{html}{\out{}} + +} + +Function that annotates CNV segment files + +param yaml_fname PCGR yaml file +param ref_data PCGR/CPSR reference data object + +export +Function that generates dense and tiered annotated variant datasets + +export +generate_report_data_cna <- +function(cna_segments_tsv, +pcgr_data, +sample_name, +pcgr_config, +oncotree, +transcript_overlap_pct = 100) { + +\if{html}{\out{
}}\preformatted{invisible( + assertthat::assert_that( + file.exists(cna_segments_tsv), + msg = paste0("File 'cna_segments_tsv' (", + cna_segments_tsv, ") does not exist"))) +pcg_report_cna <- pcgrr::init_report(config = pcgr_config, + class = "cna") +log_r_homdel <- pcgr_config[["cna"]][["log_r_homdel"]] +log_r_gain <- pcgr_config[["cna"]][["log_r_gain"]] +tumor_type <- pcgr_config[["t_props"]][["tumor_type"]] +MEGABASE <- 1000000 + +pcgrr::log4r_info("------") +pcgrr::log4r_info(paste0("Generating report data for copy number segment file ", + cna_segments_tsv)) + +## READ INPUT FILE, VALIDATE INPUT CHROMOSOMES AND SEGMENTS, ADD CYTOBAND INFO +cna_df <- utils::read.table(file = cna_segments_tsv, header = T, + stringsAsFactors = F, sep = "\\t", + comment.char = "", quote = "") |> + dplyr::rename(chromosome = Chromosome, + LogR = Segment_Mean, + segment_start = Start, + segment_end = End) |> + dplyr::distinct() |> + dplyr::select( + c("chromosome","LogR", + "segment_start","segment_end")) |> + dplyr::mutate( + chromosome = stringr::str_replace( + .data$chromosome, "^chr", "")) |> + pcgrr::get_valid_chromosomes( + chromosome_column = "chromosome", + bsg = pcgr_data[["assembly"]][["bsg"]]) |> + pcgrr::get_valid_chromosome_segments( + genome_assembly = pcgr_data[["assembly"]][["grch_name"]], + bsg = pcgr_data[["assembly"]][["bsg"]]) |> + dplyr::filter(!is.na(.data$LogR)) |> + dplyr::mutate(LogR = round(as.numeric(.data$LogR), digits = 3)) |> + dplyr::mutate(SEGMENT_ID = paste0(.data$chromosome, ":", + .data$segment_start, "-", + .data$segment_end)) |> + pcgrr::get_cna_cytoband(pcgr_data = pcgr_data) |> + dplyr::mutate(SAMPLE_ID = sample_name) |> + pcgrr::append_ucsc_segment_link( + hgname = pcgr_data[["assembly"]][["hg_name"]], + chrom = "chromosome", + start = "segment_start", + end = "segment_end") |> + dplyr::mutate( + SEGMENT_LENGTH_MB = + round((as.numeric((.data$segment_end - .data$segment_start) / + MEGABASE)), + digits = 5)) |> + dplyr::rename(SEGMENT = 
.data$SEGMENT_LINK, LOG_R = .data$LogR) + +## MAKE SIMPLE SEGMENTS DATA FRAME FOR FILTERING IN REPORT +cna_segments <- cna_df |> + dplyr::select(.data$SEGMENT, + .data$SEGMENT_LENGTH_MB, + .data$CYTOBAND, + .data$LOG_R, + .data$EVENT_TYPE) |> + dplyr::distinct() + +#### FIND AND APPEND GENCODE TRANSCRIPTS THAT OVERLAP +cna_transcript_df <- + pcgrr::get_cna_overlapping_transcripts( + cna_df, pcgr_data = pcgr_data) +#get_cna_overlapping_transcripts( +# cna_df, pcgr_data = pcgr_data) + +#### GENERATE DATAFRAME OF UNIQUE TRANSCRIPT-CNA SEGMENTS FOR OUTPUT TSV +cna_transcript_df_print <- cna_transcript_df |> + dplyr::select(.data$chrom, + .data$segment_start, + .data$segment_end, + .data$SEGMENT_ID, + .data$SEGMENT_LENGTH_MB, + .data$EVENT_TYPE, + .data$CYTOBAND, + .data$LOG_R, + .data$SAMPLE_ID, + .data$ensembl_gene_id, + .data$symbol, + .data$ensembl_transcript_id, + .data$transcript_start, + .data$transcript_end, + .data$transcript_overlap_percent, + .data$name, + .data$biotype, + .data$tumor_suppressor, + .data$oncogene, + .data$intogen_driver, + .data$chembl_compound_id, + .data$gencode_tag, + .data$gencode_release) |> + magrittr::set_colnames(tolower(names(.))) + +avg_transcript_overlap <- as.data.frame( + cna_transcript_df |> + dplyr::filter(.data$biotype == "protein_coding") |> + dplyr::group_by(.data$SEGMENT_ID, .data$symbol) |> + dplyr::summarise( + MEAN_TRANSCRIPT_CNA_OVERLAP = mean( + .data$transcript_overlap_percent), + TRANSCRIPTS = paste0(.data$ensembl_transcript_id, collapse = ", "), + .groups = "drop") |> + dplyr::rename(SYMBOL = .data$symbol) |> + dplyr::mutate( + MEAN_TRANSCRIPT_CNA_OVERLAP = + round(.data$MEAN_TRANSCRIPT_CNA_OVERLAP, digits = 2)) +) + +cna_transcript_df <- + dplyr::select(cna_transcript_df, -.data$ensembl_transcript_id) |> + dplyr::filter(.data$biotype == "protein_coding") |> + dplyr::distinct() |> + dplyr::mutate(VAR_ID = as.character(rep(1:nrow(.)))) |> + magrittr::set_colnames(toupper(names(.))) |> + 
pcgrr::append_otargets_pheno_link( + pcgr_data = pcgr_data, + oncotree = oncotree) |> + dplyr::rename(OPENTARGETS_ASSOCIATIONS = + .data$OT_DISEASE_LINK) |> + dplyr::select(.data$VAR_ID, + .data$SEGMENT_ID, + .data$SYMBOL, + .data$ONCOGENE, + .data$ONCOGENE_EVIDENCE, + .data$TUMOR_SUPPRESSOR, + .data$TUMOR_SUPPRESSOR_EVIDENCE, + .data$CANCERGENE_SUPPORT, + .data$OPENTARGETS_ASSOCIATIONS, + .data$OPENTARGETS_RANK, + .data$ENTREZ_ID, + .data$CHROM, + .data$NAME, + .data$EVENT_TYPE, + .data$SEGMENT_LENGTH_MB, + .data$SEGMENT, + .data$TRANSCRIPT_OVERLAP_PERCENT, + .data$LOG_R) |> + dplyr::mutate(ENTREZ_ID = as.character(.data$ENTREZ_ID)) |> + dplyr::rename(GENENAME = .data$NAME, + TRANSCRIPT_OVERLAP = .data$TRANSCRIPT_OVERLAP_PERCENT, + CHROMOSOME = .data$CHROM) |> + dplyr::left_join(pcgr_data[["kegg"]][["pathway_links"]], + by = c("ENTREZ_ID" = "gene_id")) |> + dplyr::rename(KEGG_PATHWAY = .data$kegg_pathway_urls) + +## Get gene annotation links +entrezgene_annotation_links <- + pcgrr::generate_annotation_link( + cna_transcript_df, + vardb = "GENE_NAME", + group_by_var = "VAR_ID", + link_key_var = "ENTREZ_ID", + link_display_var = "GENENAME", + url_prefix = "http://www.ncbi.nlm.nih.gov/gene/") + +cna_transcript_df <- cna_transcript_df |> + dplyr::left_join( + dplyr::rename(entrezgene_annotation_links, + GENE_NAME = .data$link), + by = c("VAR_ID")) |> + dplyr::select(.data$SEGMENT_ID, + .data$CHROMOSOME, + .data$SYMBOL, + .data$GENE_NAME, + .data$KEGG_PATHWAY, + .data$TUMOR_SUPPRESSOR, + .data$TUMOR_SUPPRESSOR_EVIDENCE, + .data$ONCOGENE, + .data$ONCOGENE_EVIDENCE, + .data$CANCERGENE_SUPPORT, + .data$OPENTARGETS_ASSOCIATIONS, + .data$OPENTARGETS_RANK, + .data$SEGMENT_LENGTH_MB, + .data$SEGMENT, + .data$EVENT_TYPE, + .data$LOG_R) |> + dplyr::distinct() |> + dplyr::left_join(avg_transcript_overlap, + by = c("SEGMENT_ID", "SYMBOL")) + + +n_cna_loss <- + dplyr::filter(cna_segments, .data$LOG_R <= log_r_homdel) |> + nrow() +n_cna_gain <- + dplyr::filter(cna_segments, 
.data$LOG_R >= log_r_gain) |> + nrow() +cna_segments_filtered <- cna_segments |> + dplyr::filter(.data$LOG_R >= log_r_gain | .data$LOG_R <= log_r_homdel) |> + dplyr::arrange(dplyr::desc(.data$LOG_R)) +pcgrr::log4r_info( + paste0("Detected ", nrow(cna_segments_filtered), + " segments subject to amplification/deletion (", + n_cna_loss, " deletions, ", n_cna_gain, + " gains according to user-defined log(2) ratio thresholds)")) + + +## Get aberration sets related to tumor suppressor genes +## /oncogenes/drug targets +onco_ts_sets <- + get_oncogene_tsgene_target_sets( + cna_transcript_df, + transcript_overlap_pct = transcript_overlap_pct, + log_r_homdel = log_r_homdel, + log_r_gain = log_r_gain, + tumor_type = tumor_type, + pcgr_data = pcgr_data) + +## load all clinical evidence items () +eitems_any_tt <- pcgrr::load_eitems( + eitems_raw = pcgr_data$biomarkers, + alteration_types = "CNA", + ontology = + pcgr_data$phenotype$oncotree, + origin = "Somatic", + tumor_type_specificity = "any") + + + +## Get all clinical evidence items that are related to +## tumor suppressor genes/oncogenes/drug targets (NOT tumor-type specific) +biomarker_hits_cna_any <- + pcgrr::get_clin_assocs_cna( + onco_ts_sets, + annotation_tags = pcgr_data$annotation_tags, + eitems = eitems_any_tt) + +pcg_report_cna[["clin_eitem"]][["any_ttype"]] <- + biomarker_hits_cna_any[["clin_eitem"]] +pcg_report_cna[["variant_set"]][["tier2"]] <- + biomarker_hits_cna_any$variant_set + +## Get all clinical evidence items that +## overlap query set (if tumor type is specified) +if (tumor_type != "Cancer, NOS") \{ + + ## load tumor-type specific evidence items () + eitems_specific_tt <- pcgrr::load_eitems( + eitems_raw = pcgr_data$biomarkers, + alteration_types = "CNA", + ontology = + pcgr_data$phenotype$oncotree, + origin = "Somatic", + tumor_type_specificity = "specific", + tumor_type = tumor_type) + + biomarker_hits_cna_specific <- + pcgrr::get_clin_assocs_cna( + onco_ts_sets, + annotation_tags = 
pcgr_data$annotation_tags, + eitems = eitems_specific_tt) + + ## Assign putative TIER 1 variant set + pcg_report_cna[["clin_eitem"]][["query_ttype"]] <- biomarker_hits_cna_specific$clin_eitem pcg_report_cna[["variant_set"]][["tier1"]] <- biomarker_hits_cna_specific$variant_set diff --git a/pcgrr/man/get_population_tag.Rd b/pcgrr/man/get_population_tag.Rd deleted file mode 100644 index 161947c6..00000000 --- a/pcgrr/man/get_population_tag.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/germline.R -\name{get_population_tag} -\alias{get_population_tag} -\title{Function that retrieves name of VCF INFO tag and -population description for gnomad/1000G population} -\usage{ -get_population_tag(population_code, db = "1KG", subset = NA) -} -\arguments{ -\item{population_code}{three-letter code} - -\item{db}{1KG or GNOMAD} - -\item{subset}{NA or "non_cancer" (for GNOMAD)} -} -\value{ -pop_tag_info -} -\description{ -Function that retrieves name of VCF INFO tag and -population description for gnomad/1000G population -} diff --git a/pcgrr/man/get_prevalent_site_signatures.Rd b/pcgrr/man/get_prevalent_site_signatures.Rd index 4d67dfa8..489e52aa 100644 --- a/pcgrr/man/get_prevalent_site_signatures.Rd +++ b/pcgrr/man/get_prevalent_site_signatures.Rd @@ -3,12 +3,12 @@ \name{get_prevalent_site_signatures} \alias{get_prevalent_site_signatures} \title{Function that retrieves prevalent signatures for a given tumor type/primary site -Data is collected from COSMIC v3.2.} +Data is collected from COSMIC v3.4.} \usage{ get_prevalent_site_signatures( site = "Any", custom_collection = NULL, - pcgr_data = NULL, + ref_data = NULL, min_prevalence_pct = 5, incl_poss_artifacts = T ) @@ -18,7 +18,7 @@ get_prevalent_site_signatures( \item{custom_collection}{Custom collection of signatures from COSMIC} -\item{pcgr_data}{PCGR data object} +\item{ref_data}{PCGR reference data object} \item{min_prevalence_pct}{Minimum prevalence (pct) of 
signature in cohorts associated with primary site - @@ -29,5 +29,5 @@ are to be included} } \description{ Function that retrieves prevalent signatures for a given tumor type/primary site -Data is collected from COSMIC v3.2. +Data is collected from COSMIC v3.4. } diff --git a/pcgrr/man/get_proper_maf_alleles.Rd b/pcgrr/man/get_proper_maf_alleles.Rd deleted file mode 100644 index 80b2ef37..00000000 --- a/pcgrr/man/get_proper_maf_alleles.Rd +++ /dev/null @@ -1,23 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/mutation.R -\name{get_proper_maf_alleles} -\alias{get_proper_maf_alleles} -\title{Function that transforms a tier-structured variant data frame -into a MAF-like data frame (for input to 2020plus, MutSigCV)} -\usage{ -get_proper_maf_alleles(maf_df, genome_seq, seqinfo) -} -\arguments{ -\item{maf_df}{data frame with somatic mutations} - -\item{genome_seq}{BSgenome object} - -\item{seqinfo}{seqinfo object} -} -\value{ -maf_all -} -\description{ -Function that transforms a tier-structured variant data frame -into a MAF-like data frame (for input to 2020plus, MutSigCV) -} diff --git a/pcgrr/man/init_kataegis_content.Rd b/pcgrr/man/init_kataegis_content.Rd new file mode 100644 index 00000000..0d39335c --- /dev/null +++ b/pcgrr/man/init_kataegis_content.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/report.R +\name{init_kataegis_content} +\alias{init_kataegis_content} +\title{Function that initiates report element with kataegis information} +\usage{ +init_kataegis_content() +} +\description{ +Function that initiates report element with kataegis information +} diff --git a/pcgrr/man/init_msi_content.Rd b/pcgrr/man/init_msi_content.Rd new file mode 100644 index 00000000..da404e1a --- /dev/null +++ b/pcgrr/man/init_msi_content.Rd @@ -0,0 +1,11 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/report.R +\name{init_msi_content} 
+\alias{init_msi_content} +\title{Function that initiates report element with MSI classification} +\usage{ +init_msi_content() +} +\description{ +Function that initiates report element with MSI classification +} diff --git a/pcgrr/man/load_dna_variants.Rd b/pcgrr/man/load_dna_variants.Rd index c01b72af..ac17e3b9 100644 --- a/pcgrr/man/load_dna_variants.Rd +++ b/pcgrr/man/load_dna_variants.Rd @@ -3,28 +3,34 @@ \name{load_dna_variants} \alias{load_dna_variants} \title{Function that reads and validates CNA or SNV/InDel TSV files -file from PCGR/CPSR pre-report pipeline} +from PCGR/CPSR pre-report (Python) pipeline} \usage{ load_dna_variants( fname = NA, cols = NULL, ref_data = NULL, + vartype = "snv_indel", + primary_site = "Any", retained_info_tags = "None", variant_origin = "Somatic" ) } \arguments{ -\item{fname}{Path to raw file with DNA aberrations (PCGR/CPSR)} +\item{fname}{Path to raw input file with DNA aberrations (PCGR/CPSR)} -\item{cols}{column type definitions of input} +\item{cols}{column type definitions of raw input file} \item{ref_data}{reference data object} +\item{vartype}{type of DNA aberrations ('snv_indel','cna')} + +\item{primary_site}{primary site of tumor} + \item{retained_info_tags}{VCF INFO tags to be retained in output (SNVs/InDels)} \item{variant_origin}{Germline/Somatic} } \description{ Function that reads and validates CNA or SNV/InDel TSV files -file from PCGR/CPSR pre-report pipeline +from PCGR/CPSR pre-report (Python) pipeline } diff --git a/pcgrr/man/load_somatic_cna.Rd b/pcgrr/man/load_somatic_cna.Rd index 6c37394f..65db2c4f 100644 --- a/pcgrr/man/load_somatic_cna.Rd +++ b/pcgrr/man/load_somatic_cna.Rd @@ -5,12 +5,14 @@ \title{Function that reads and validates a fully annotated CNA file from PCGR pre-report pipeline} \usage{ -load_somatic_cna(fname, ref_data = NULL) +load_somatic_cna(fname, ref_data = NULL, settings = NULL) } \arguments{ \item{fname}{Path to file name} \item{ref_data}{Object with reference data} +
+\item{settings}{Object with PCGR report configuration} } \description{ Function that reads and validates a fully annotated CNA file from PCGR diff --git a/pcgrr/man/make_upset_plot_data.Rd b/pcgrr/man/make_upset_plot_data.Rd index 97927100..d3c0a0ee 100644 --- a/pcgrr/man/make_upset_plot_data.Rd +++ b/pcgrr/man/make_upset_plot_data.Rd @@ -2,9 +2,8 @@ % Please edit documentation in R/germline.R \name{make_upset_plot_data} \alias{make_upset_plot_data} -\title{Function that makes input data for an UpSet plot -(filtering/intersection results) for the somatic-germline -classification procedure} +\title{Function that makes input data for an UpSet plot (filtering/intersection +results) for the somatic-germline classification procedure} \usage{ make_upset_plot_data(calls, config) } diff --git a/pcgrr/man/predict_msi_status.Rd b/pcgrr/man/predict_msi_status.Rd index 8dc47f0c..1b172054 100644 --- a/pcgrr/man/predict_msi_status.Rd +++ b/pcgrr/man/predict_msi_status.Rd @@ -5,8 +5,8 @@ \title{Function that predicts MSI status based on fraction of indels among calls} \usage{ predict_msi_status( - vcf_data_df, - pcgr_data, + variant_set, + ref_data, msi_prediction_model, msi_prediction_dataset, target_size_mb, @@ -14,9 +14,9 @@ predict_msi_status( ) } \arguments{ -\item{vcf_data_df}{data frame with somatic mutations/indels} +\item{variant_set}{data frame with somatic mutations/indels} -\item{pcgr_data}{object with PCGR datasets} +\item{ref_data}{PCGR reference data object} \item{msi_prediction_model}{statistical model for MSI prediction} diff --git a/pcgrr/man/write_report_output.Rd b/pcgrr/man/write_report_output.Rd index 3a628bb3..82633a4a 100644 --- a/pcgrr/man/write_report_output.Rd +++ b/pcgrr/man/write_report_output.Rd @@ -1,10 +1,17 @@ % Generated by roxygen2: do not edit by hand -% Please edit documentation in R/main.R +% Please edit documentation in R/main.R, R/main2.R \name{write_report_output} \alias{write_report_output} \title{Function that writes contents of PCGR object to various output formats
(Rmarkdown/flexdashboard HTML reports, JSON, tab-separated etc)} \usage{ +write_report_output( + report, + tier_model = "pcgr_acmg", + output_format = "html", + flexdb = FALSE +) + write_report_output( report, tier_model = "pcgr_acmg", @@ -23,6 +30,9 @@ write_report_output( \item{flexdb}{logical indicating if HTML output should be dashboard} } \description{ +Function that writes contents of PCGR object to various output formats +(Rmarkdown/flexdashboard HTML reports, JSON, tab-separated etc) + Function that writes contents of PCGR object to various output formats (Rmarkdown/flexdashboard HTML reports, JSON, tab-separated etc) } diff --git a/scripts/pcgrr.R b/scripts/pcgrr.R index 0253b08e..155e8b14 100755 --- a/scripts/pcgrr.R +++ b/scripts/pcgrr.R @@ -1,28 +1,36 @@ #!/usr/bin/env Rscript +options(warn=-1) .libPaths(R.home("library")) # use conda R pkgs, not e.g. user's local installation -suppressWarnings(suppressPackageStartupMessages(library(argparse))) suppressWarnings(suppressPackageStartupMessages(library(pcgrr))) -suppressWarnings(suppressPackageStartupMessages(library(stringr))) +suppressWarnings(suppressPackageStartupMessages(library(log4r))) +suppressWarnings(suppressPackageStartupMessages(library(argparse))) +args <- commandArgs(trailingOnly=TRUE) -# my_log4r_layout <- function(level, ...) { -# paste0(format(Sys.time()), " - pcgr-report-generation - ", -# level, " - ", ..., "\n", collapse = "") -# } +yaml_fname <- as.character(args[1]) + +my_log4r_layout <- function(level, ...)
{ + paste0(format(Sys.time()), " - pcgr-report-generation - ", + level, " - ", ..., "\n", collapse = "") +} + +log4r_logger <- + log4r::logger( + threshold = "INFO", appenders = log4r::console_appender(my_log4r_layout)) -# log4r_logger <- log4r::logger(threshold = "INFO", -# appenders = log4r::console_appender(my_log4r_layout)) +# this gets passed on to all the log4r_* functions inside the pkg +options("PCGRR_LOG4R_LOGGER" = log4r_logger) -# # this gets passed on to all the log4r_* functions inside the pkg -# options("PCGRR_LOG4R_LOGGER" = log4r_logger) +if (!file.exists(yaml_fname)) stop(paste0("YAML configuration file '", yaml_fname, "' does not exist")) -# pcg_report <- NULL +## Generate report content +pcg_report <- pcgrr::generate_pcgr_report2( + yaml_fname = yaml_fname +) -# defaultW <- getOption("warn") -# options(warn = -1) # # ## Generate report object # pcg_report <-