simplified tier assignment

sigven · Feb 18, 2024 · 05ed185 · 05ed185
1 parent b2cb64f
commit 05ed185
Show file tree

Hide file tree

Showing 64 changed files with 3,932 additions and 1,053 deletions.
diff --git a/pcgr/arg_checker.py b/pcgr/arg_checker.py
@@ -106,12 +106,12 @@ def check_args(arg_dict):
 
     # MSI prediction is switched off (not performed) when the assay is targeted or the run is tumor-only
     assay_type = 'Tumor-Control'
-    if arg_dict['estimate_msi_status'] is True and (arg_dict['assay'] == 'TARGETED' or arg_dict['tumor_only'] is True):
+    if arg_dict['estimate_msi'] is True and (arg_dict['assay'] == 'TARGETED' or arg_dict['tumor_only'] is True):
         if arg_dict['tumor_only'] is True:
             assay_type = 'Tumor-Only'
         warn_msg = f"MSI status prediction can be applied for WGS/WES tumor-control assays only (query type: {arg_dict['assay']}|{assay_type}) - analysis will be omitted"
         warn_message(warn_msg, logger)
-        arg_dict['estimate_msi_status'] = 0
+        arg_dict['estimate_msi'] = 0
 
     # minimum number of mutations required for mutational signature reconstruction cannot be less than 100 (somewhat arbitrary lower threshold, recommended value is 200)
     if int(arg_dict['min_mutations_signatures']) < int(pcgr_vars.RECOMMENDED_N_MUT_SIGNATURE):
@@ -124,8 +124,8 @@ def check_args(arg_dict):
             error_message(err_msg, logger)
 
     # if MSI status is to be estimated, mutational burden must be turned on
-    if arg_dict['estimate_msi_status'] is True and arg_dict['estimate_tmb'] is False:
-        err_msg = "Prediction of MSI status ('--estimate_msi_status') requires mutational burden analysis ('--estimate_tmb')"
+    if arg_dict['estimate_msi'] is True and arg_dict['estimate_tmb'] is False:
+        err_msg = "Prediction of MSI status ('--estimate_msi') requires mutational burden analysis ('--estimate_tmb')"
         error_message(err_msg, logger)
 
     if arg_dict['tumor_only'] is True:

diff --git a/pcgr/biomarker.py b/pcgr/biomarker.py
@@ -21,7 +21,7 @@ def load_biomarkers(logger, biomarker_variant_fname, biomarker_clinical_fname, b
 
    Returns:
    - variant_biomarkers: A dictionary containing variant biomarkers. The keys are variant alias types 
-     ('dbsnp', 'hgvsp', 'hgvsc', 'genomic', 'exon', 'other', 'aa_region'), and the values are 
+     ('dbsnp', 'hgvsp', 'hgvsc', 'genomic', 'exon', 'other_gene', 'aa_region'), and the values are 
      dictionaries containing variant information.
 
    Note:
@@ -33,7 +33,7 @@ def load_biomarkers(logger, biomarker_variant_fname, biomarker_clinical_fname, b
    """
 
    variant_biomarkers = {} ##dictionary to return
-   for variant_alias_type in ['dbsnp','hgvsp','hgvsc','genomic','exon','other','aa_region']:
+   for variant_alias_type in ['dbsnp','hgvsp','hgvsc','genomic','exon','other_gene','aa_region']:
       variant_biomarkers[variant_alias_type] = {}
    check_file_exists(biomarker_clinical_fname, logger)
 
@@ -92,12 +92,12 @@ def load_biomarkers(logger, biomarker_variant_fname, biomarker_clinical_fname, b
                entry_alias_type = str(row['alias_type']).replace("_grch37", "")
                entry_alias_type = entry_alias_type.replace("_grch38", "")
 
-               if entry_alias_type == "other":
+               if entry_alias_type == "other_gene":
                   if bool(re.search(r'^((ACTIVATING )?MUTATION|LOSS|START LOSS)$', row['variant_alias'])) is True:
                      varkey = str(row['entrezgene'])
-                     if not varkey in variant_biomarkers['other']:
-                        variant_biomarkers['other'][varkey] = []
-                     variant_biomarkers['other'][varkey].append(row)
+                     if not varkey in variant_biomarkers['other_gene']:
+                        variant_biomarkers['other_gene'][varkey] = []
+                     variant_biomarkers['other_gene'][varkey].append(row)
 
                if entry_alias_type == 'exon':
                   exons = row['variant_exon']
@@ -131,20 +131,18 @@ def load_biomarkers(logger, biomarker_variant_fname, biomarker_clinical_fname, b
                if biomarker_vartype == 'CNA' and (row['alteration_type'].startswith('CNA')):
                   row['clinical_evidence_items'] = '.'
                   if row['variant_id'] in variant_to_clinical_evidence.keys():
-                     row['clinical_evidence_items'] = variant_to_clinical_evidence[row['variant_id']]                 
-                  entry_alias_type = str(row['alias_type']).replace("_grch37", "")
-                  entry_alias_type = entry_alias_type.replace("_grch38", "")
+                     row['clinical_evidence_items'] = variant_to_clinical_evidence[row['variant_id']]                                 
 
-                  if entry_alias_type == "other":
+                  if row['alias_type'] == "other_gene":
                      if bool(re.search(r'^(AMPLIFICATION|DELETION)$', row['variant_alias'])) is True:
                         varkey = str(row['entrezgene']) + "_" + \
                            re.sub(r"transcript_","",str(row['variant_consequence']))
-                        if not varkey in variant_biomarkers['other']:
-                           variant_biomarkers['other'][varkey] = []
+                        if not varkey in variant_biomarkers['other_gene']:
+                           variant_biomarkers['other_gene'][varkey] = []
                         del row['variant_exon']
                         del row['gene']
                         del row['alias_type']
-                        variant_biomarkers['other'][varkey].append(row)
+                        variant_biomarkers['other_gene'][varkey].append(row)
 
 
 
@@ -327,8 +325,8 @@ def match_csq_biomarker(transcript_csq_elements, variant_biomarkers, rec, princi
 
       ## Match biomarkers indicated by gene only - "gene level" resolution
       if entrezgene != "." and principal_csq_entrezgene is True:
-         if str(entrezgene) in variant_biomarkers['other'].keys():
-            hits_gene = variant_biomarkers['other'][str(entrezgene)]
+         if str(entrezgene) in variant_biomarkers['other_gene'].keys():
+            hits_gene = variant_biomarkers['other_gene'][str(entrezgene)]
             for ghit in hits_gene:
                bkey3 = f"{ghit['biomarker_source']}|{ghit['variant_id']}|{ghit['clinical_evidence_items']}"
                ## match biomarkers annotated as "Mutation" only for a given gene - 

diff --git a/pcgr/cna.py b/pcgr/cna.py
@@ -10,7 +10,7 @@
 from pcgr import utils
 from pybedtools import BedTool
 from pcgr.annoutils import nuclear_chromosomes
-from pcgr.utils import error_message, warn_message, check_file_exists
+from pcgr.utils import error_message, warn_message, check_file_exists, remove_file
 from pcgr.biomarker import load_biomarkers
 
 def annotate_cna_segments(output_fname: str, 
@@ -129,8 +129,8 @@ def annotate_cna_segments(output_fname: str,
         biomarkers[db] = load_biomarkers(
             logger, variant_fname, clinical_fname, biomarker_vartype = 'CNA')
 
-        for key in biomarkers[db]['other']:
-            biomarker_data = biomarkers[db]['other'][key]
+        for key in biomarkers[db]['other_gene']:
+            biomarker_data = biomarkers[db]['other_gene'][key]
             biomarker_item = str(db) + '|' + str(biomarker_data[0]['variant_id']) + \
                     '|' + str(biomarker_data[0]['clinical_evidence_items']) + '|by_cna_segment'
             if not key in cna_actionable_dict:               
@@ -154,6 +154,10 @@ def annotate_cna_segments(output_fname: str,
     cna_query_segment_df.loc[cna_query_segment_df['n_major'] + cna_query_segment_df['n_minor'] > 0,"loss_cond"] = False
     cna_query_segment_df.loc[cna_query_segment_df['n_major'] + cna_query_segment_df['n_minor'] == 0,"loss_cond"] = True
 
+    cna_query_segment_df['variant_class'] = 'undefined'
+    cna_query_segment_df.loc[cna_query_segment_df.amp_cond, 'variant_class'] = 'gain'
+    cna_query_segment_df.loc[cna_query_segment_df.loss_cond, 'variant_class'] = 'homdel'
+
     cna_query_segment_df.loc[cna_query_segment_df.loss_cond, 'aberration_key'] =  \
         cna_query_segment_df.loc[cna_query_segment_df.loss_cond, 'entrezgene'].astype(str) + '_ablation'
 
@@ -165,7 +169,7 @@ def annotate_cna_segments(output_fname: str,
 
     ## remove all temporary files
     for fname in temp_files:
-        utils.remove(fname)
+        remove_file(fname)
 
     cna_query_segment_df.columns = map(str.upper, cna_query_segment_df.columns)
     cna_query_segment_df.rename(columns = {'CHROMOSOME':'CHROM','SEGMENT_ID':'VAR_ID'}, inplace = True)
@@ -253,7 +257,7 @@ def annotate_cytoband(cna_segments_bt: BedTool, output_dir: str, pcgr_build_db_d
 
     ## remove all temporary files
     for fname in temp_files:
-        utils.remove(fname)
+        remove_file(fname)
 
     return cytoband_annotated_segments
 
@@ -363,7 +367,7 @@ def annotate_transcripts(cna_segments_bt: BedTool, output_dir: str,
 
     ## remove all temporary files
     for fname in temp_files:
-        utils.remove(fname)
+        remove_file(fname)
 
     return(cna_segments_annotated)
 

diff --git a/pcgr/config.py b/pcgr/config.py
@@ -94,7 +94,7 @@ def create_config(arg_dict, workflow = "PCGR"):
             'exclude_nonexonic': int(arg_dict['exclude_nonexonic'])
         }
         conf_options['somatic_snv']['msi'] = {
-            'run': int(arg_dict['estimate_msi_status'])
+            'run': int(arg_dict['estimate_msi'])
         }
         conf_options['somatic_snv']['tmb'] = {
             'run': int(arg_dict['estimate_tmb']),            

diff --git a/pcgr/cpsr.py b/pcgr/cpsr.py
@@ -38,7 +38,7 @@ def get_args():
     optional_panel.add_argument('--panel_id',dest = "virtual_panel_id",type = str, default = "-1", help="Comma-separated string with identifier(s) of predefined virtual cancer predisposition gene panels,\nchoose any combination of the following identifiers (GEP = Genomics England PanelApp):\n" + str(pcgr_vars.panels))
     optional_panel.add_argument('--custom_list',dest = "custom_list",help="Provide custom list of genes from virtual panel 0 (single-column .txt/.tsv file with Ensembl gene identifiers),\n alternative to predefined panels provided with --panel_id)")
     optional_panel.add_argument('--custom_list_name',dest = "custom_list_name", default="None", help="Set name for custom made panel/list (single word - no whitespace), will be displayed in the report")
-    optional_panel.add_argument('--diagnostic_grade_only', action="store_true",help="For panel_id's 1-42 (Genomics England PanelApp) - consider genes with a GREEN status only, default: %(default)s")
+    optional_panel.add_argument('--diagnostic_grade_only', action="store_true",help="For panel_id's 1-44 (Genomics England PanelApp) - consider genes with a GREEN status only, default: %(default)s")
 
     optional_other.add_argument('--force_overwrite', action = "store_true", help='By default, the script will fail with an error if any output file already exists.\n You can force the overwrite of existing result files by using this flag, default: %(default)s')
     optional_other.add_argument('--version', action='version', version=str(utils.get_cpsr_version()))
@@ -204,7 +204,6 @@ def run_cpsr(conf_options, cpsr_paths):
                                       output_vcf = vep_vcf)
 
         logger = getlogger('cpsr-vep')
-        #print(str(vep_command["main"]))
 
         logger.info((
             f"CPSR - STEP 1: Basic variant annotation with Variant Effect Predictor (version {pcgr_vars.VEP_VERSION}, "
@@ -224,7 +223,6 @@ def run_cpsr(conf_options, cpsr_paths):
         check_subprocess(logger, vep_command["tabix"], debug)
         logger.info("Finished cpsr-vep")
         print('----')
-        #exit(0)
 
         ## CPSR|vcfanno - run vcfanno on query VCF with a number of relevant annotated VCFs
         logger = getlogger('cpsr-vcfanno')

diff --git a/pcgr/main.py b/pcgr/main.py
@@ -62,10 +62,9 @@ def cli():
     optional_allelic_support.add_argument("--control_af_max", type=float, default=1, dest="control_af_max", help="If VCF INFO tag for variant allelic fraction (control) is specified and found, set maximum tolerated AF for inclusion in report (default: %(default)s)")
 
     optional_tmb_msi.add_argument("--estimate_tmb", action="store_true", help="Estimate tumor mutational burden from the total number of somatic mutations and target region size, default: %(default)s")
-    #optional_tmb_msi.add_argument("--tmb_algorithm", dest="tmb_algorithm", default="all_coding", choices=[ "all_coding", "nonsyn"], help="Method for calculation of TMB, all coding variants (Chalmers et al., Genome Medicine, 2017), or non-synonymous variants only, default: %(default)s")
     optional_tmb_msi.add_argument("--tmb_dp_min", dest="tmb_dp_min", default=0, help="If VCF INFO tag for sequencing depth (tumor) is specified and found, set minimum required sequencing depth for TMB calculation: default: %(default)s")
     optional_tmb_msi.add_argument("--tmb_af_min", dest="tmb_af_min", default=0, help="If VCF INFO tag for allelic fraction (tumor) is specified and found, set minimum required allelic fraction for TMB calculation: default: %(default)s")
-    optional_tmb_msi.add_argument("--estimate_msi_status", action="store_true", help="Predict microsatellite instability status from patterns of somatic mutations/indels, default: %(default)s")
+    optional_tmb_msi.add_argument("--estimate_msi", action="store_true", help="Predict microsatellite instability status from patterns of somatic mutations/indels, default: %(default)s")
 
 
     optional_assay.add_argument("--assay", dest="assay", default="WES", choices=[ "WGS", "WES","TARGETED"], help="Type of DNA sequencing assay performed for input data (VCF), default: %(default)s")
@@ -77,7 +76,7 @@ def cli():
     optional_signatures.add_argument("--min_mutations_signatures", type=int, default=200, dest="min_mutations_signatures", help="Minimum number of SNVs required for reconstruction of mutational signatures (SBS) by MutationalPatterns (default: %(default)s, minimum n = 100)")
     optional_signatures.add_argument("--all_reference_signatures", action="store_true", help="Use all reference mutational signatures (SBS, n = 67) in signature reconstruction rather than only those already attributed to the tumor type (default: %(default)s)")
     optional_signatures.add_argument("--include_artefact_signatures", action="store_true", help="Include sequencing artefacts in the collection of reference signatures (default: %(default)s")
-    optional_signatures.add_argument("--prevalence_reference_signatures", type=int, default=5, choices=[1,2,5,10,15,20], help="Minimum tumor-type prevalence (in percent) of reference signatures to be included in refitting procedure (default: %(default)s)")
+    optional_signatures.add_argument("--prevalence_reference_signatures", type=int, default=1, choices=[1,2,5,10,15,20], help="Minimum tumor-type prevalence (in percent) of reference signatures to be included in refitting procedure (default: %(default)s)")
 
     optional_other.add_argument("--cpsr_report", dest="cpsr_report", help="CPSR report file (Gzipped JSON - file ending with 'cpsr.<genome_assembly>.json.gz' -  germline report of patient's blood/control sample")
     optional_other.add_argument("--vcf2maf", action="store_true", help="Generate a MAF file for input VCF using https://github.com/mskcc/vcf2maf (default: %(default)s)")

diff --git a/pcgr/pcgr_vars.py b/pcgr/pcgr_vars.py
@@ -3,7 +3,7 @@
 from pcgr._version import __version__
 
 PCGR_VERSION = __version__
-DB_VERSION = '20240203'
+DB_VERSION = '20240209'
 
 ## MISCELLANEOUS
 NCBI_BUILD_MAF = 'GRCh38'
@@ -103,7 +103,7 @@
       37: "Renal cancer pertinent cancer susceptibility (GEP)",
       38: "Rhabdoid tumour predisposition (GEP)",
       39: "Sarcoma cancer susceptibility (GEP)",
-      40: "Sarcoma susceptbility (GEP)",
+      40: "Sarcoma susceptibility (GEP)",
       41: "Thyroid cancer pertinent cancer susceptibility (GEP)",
       42: "Tumour predisposition - childhood onset (GEP)",
       43: "Upper gastrointestinal cancer pertinent cancer susceptibility (GEP)",

diff --git a/pcgrr/DESCRIPTION b/pcgrr/DESCRIPTION
@@ -2,7 +2,7 @@ Package: pcgrr
 Type: Package
 Title: Personal Cancer Genome ReporteR
 Version: 1.4.1.9001
-Date: 2023-12-30
+Date: 2024-02-18
 Authors@R:
     c(person(given = "Sigve",
              family = "Nakken",
@@ -69,5 +69,5 @@ Suggests:
     BSgenome.Hsapiens.UCSC.hg38
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 Roxygen: list(markdown = TRUE)
diff --git a/pcgrr/NAMESPACE b/pcgrr/NAMESPACE
@@ -9,6 +9,7 @@ export(append_gwas_citation_phenotype)
 export(append_otargets_pheno_link)
 export(append_tcga_var_link)
 export(append_tfbs_annotation)
+export(assign_acmg_tiers)
 export(assign_germline_popfreq_status)
 export(assign_mutation_type)
 export(assign_somatic_classification)
@@ -28,11 +29,13 @@ export(filter_eitems_by_site)
 export(filter_read_support)
 export(generate_annotation_link)
 export(generate_pcgr_report)
+export(generate_pcgr_report2)
 export(generate_report_data_kataegis)
 export(generate_report_data_msi)
 export(generate_report_data_rainfall)
 export(generate_report_data_signatures_mp)
 export(generate_report_data_snv_indel)
+export(generate_report_data_snv_indel2)
 export(generate_report_data_tmb)
 export(generate_report_data_trials)
 export(generate_report_data_tumor_only)
@@ -45,15 +48,15 @@ export(get_cna_overlapping_transcripts)
 export(get_genome_obj)
 export(get_oncogene_tsgene_target_sets)
 export(get_ordinary_chromosomes)
-export(get_population_tag)
 export(get_prevalent_site_signatures)
-export(get_proper_maf_alleles)
 export(get_valid_chromosomes)
 export(het_af_germline_status)
 export(hom_af_status)
 export(init_cna_content)
 export(init_germline_content)
+export(init_kataegis_content)
 export(init_m_signature_content)
+export(init_msi_content)
 export(init_rainfall_content)
 export(init_report)
 export(init_report_display_content)