Skip to content

Commit

Permalink
simplified tier assignment
Browse files Browse the repository at this point in the history
  • Loading branch information
sigven committed Feb 18, 2024
1 parent b2cb64f commit 05ed185
Show file tree
Hide file tree
Showing 64 changed files with 3,932 additions and 1,053 deletions.
8 changes: 4 additions & 4 deletions pcgr/arg_checker.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,12 +106,12 @@ def check_args(arg_dict):

# if assay is targeted or mode is Tumor-Only, MSI prediction is switched off (it will not be performed)
assay_type = 'Tumor-Control'
if arg_dict['estimate_msi_status'] is True and (arg_dict['assay'] == 'TARGETED' or arg_dict['tumor_only'] is True):
if arg_dict['estimate_msi'] is True and (arg_dict['assay'] == 'TARGETED' or arg_dict['tumor_only'] is True):
if arg_dict['tumor_only'] is True:
assay_type = 'Tumor-Only'
warn_msg = f"MSI status prediction can be applied for WGS/WES tumor-control assays only (query type: {arg_dict['assay']}|{assay_type}) - analysis will be omitted"
warn_message(warn_msg, logger)
arg_dict['estimate_msi_status'] = 0
arg_dict['estimate_msi'] = 0

# minimum number of mutations required for mutational signature reconstruction cannot be less than 100 (somewhat arbitrary lower threshold, recommended value is 200)
if int(arg_dict['min_mutations_signatures']) < int(pcgr_vars.RECOMMENDED_N_MUT_SIGNATURE):
Expand All @@ -124,8 +124,8 @@ def check_args(arg_dict):
error_message(err_msg, logger)

# if MSI status is to be estimated, mutational burden must be turned on
if arg_dict['estimate_msi_status'] is True and arg_dict['estimate_tmb'] is False:
err_msg = "Prediction of MSI status ('--estimate_msi_status') requires mutational burden analysis ('--estimate_tmb')"
if arg_dict['estimate_msi'] is True and arg_dict['estimate_tmb'] is False:
err_msg = "Prediction of MSI status ('--estimate_msi') requires mutational burden analysis ('--estimate_tmb')"
error_message(err_msg, logger)

if arg_dict['tumor_only'] is True:
Expand Down
28 changes: 13 additions & 15 deletions pcgr/biomarker.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def load_biomarkers(logger, biomarker_variant_fname, biomarker_clinical_fname, b
Returns:
- variant_biomarkers: A dictionary containing variant biomarkers. The keys are variant alias types
('dbsnp', 'hgvsp', 'hgvsc', 'genomic', 'exon', 'other', 'aa_region'), and the values are
('dbsnp', 'hgvsp', 'hgvsc', 'genomic', 'exon', 'other_gene', 'aa_region'), and the values are
dictionaries containing variant information.
Note:
Expand All @@ -33,7 +33,7 @@ def load_biomarkers(logger, biomarker_variant_fname, biomarker_clinical_fname, b
"""

variant_biomarkers = {} ##dictionary to return
for variant_alias_type in ['dbsnp','hgvsp','hgvsc','genomic','exon','other','aa_region']:
for variant_alias_type in ['dbsnp','hgvsp','hgvsc','genomic','exon','other_gene','aa_region']:
variant_biomarkers[variant_alias_type] = {}
check_file_exists(biomarker_clinical_fname, logger)

Expand Down Expand Up @@ -92,12 +92,12 @@ def load_biomarkers(logger, biomarker_variant_fname, biomarker_clinical_fname, b
entry_alias_type = str(row['alias_type']).replace("_grch37", "")
entry_alias_type = entry_alias_type.replace("_grch38", "")

if entry_alias_type == "other":
if entry_alias_type == "other_gene":
if bool(re.search(r'^((ACTIVATING )?MUTATION|LOSS|START LOSS)$', row['variant_alias'])) is True:
varkey = str(row['entrezgene'])
if not varkey in variant_biomarkers['other']:
variant_biomarkers['other'][varkey] = []
variant_biomarkers['other'][varkey].append(row)
if not varkey in variant_biomarkers['other_gene']:
variant_biomarkers['other_gene'][varkey] = []
variant_biomarkers['other_gene'][varkey].append(row)

if entry_alias_type == 'exon':
exons = row['variant_exon']
Expand Down Expand Up @@ -131,20 +131,18 @@ def load_biomarkers(logger, biomarker_variant_fname, biomarker_clinical_fname, b
if biomarker_vartype == 'CNA' and (row['alteration_type'].startswith('CNA')):
row['clinical_evidence_items'] = '.'
if row['variant_id'] in variant_to_clinical_evidence.keys():
row['clinical_evidence_items'] = variant_to_clinical_evidence[row['variant_id']]
entry_alias_type = str(row['alias_type']).replace("_grch37", "")
entry_alias_type = entry_alias_type.replace("_grch38", "")
row['clinical_evidence_items'] = variant_to_clinical_evidence[row['variant_id']]

if entry_alias_type == "other":
if row['alias_type'] == "other_gene":
if bool(re.search(r'^(AMPLIFICATION|DELETION)$', row['variant_alias'])) is True:
varkey = str(row['entrezgene']) + "_" + \
re.sub(r"transcript_","",str(row['variant_consequence']))
if not varkey in variant_biomarkers['other']:
variant_biomarkers['other'][varkey] = []
if not varkey in variant_biomarkers['other_gene']:
variant_biomarkers['other_gene'][varkey] = []
del row['variant_exon']
del row['gene']
del row['alias_type']
variant_biomarkers['other'][varkey].append(row)
variant_biomarkers['other_gene'][varkey].append(row)



Expand Down Expand Up @@ -327,8 +325,8 @@ def match_csq_biomarker(transcript_csq_elements, variant_biomarkers, rec, princi

## Match biomarkers indicated by gene only - "gene level" resolution
if entrezgene != "." and principal_csq_entrezgene is True:
if str(entrezgene) in variant_biomarkers['other'].keys():
hits_gene = variant_biomarkers['other'][str(entrezgene)]
if str(entrezgene) in variant_biomarkers['other_gene'].keys():
hits_gene = variant_biomarkers['other_gene'][str(entrezgene)]
for ghit in hits_gene:
bkey3 = f"{ghit['biomarker_source']}|{ghit['variant_id']}|{ghit['clinical_evidence_items']}"
## match biomarkers annotated as "Mutation" only for a given gene -
Expand Down
16 changes: 10 additions & 6 deletions pcgr/cna.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from pcgr import utils
from pybedtools import BedTool
from pcgr.annoutils import nuclear_chromosomes
from pcgr.utils import error_message, warn_message, check_file_exists
from pcgr.utils import error_message, warn_message, check_file_exists, remove_file
from pcgr.biomarker import load_biomarkers

def annotate_cna_segments(output_fname: str,
Expand Down Expand Up @@ -129,8 +129,8 @@ def annotate_cna_segments(output_fname: str,
biomarkers[db] = load_biomarkers(
logger, variant_fname, clinical_fname, biomarker_vartype = 'CNA')

for key in biomarkers[db]['other']:
biomarker_data = biomarkers[db]['other'][key]
for key in biomarkers[db]['other_gene']:
biomarker_data = biomarkers[db]['other_gene'][key]
biomarker_item = str(db) + '|' + str(biomarker_data[0]['variant_id']) + \
'|' + str(biomarker_data[0]['clinical_evidence_items']) + '|by_cna_segment'
if not key in cna_actionable_dict:
Expand All @@ -154,6 +154,10 @@ def annotate_cna_segments(output_fname: str,
cna_query_segment_df.loc[cna_query_segment_df['n_major'] + cna_query_segment_df['n_minor'] > 0,"loss_cond"] = False
cna_query_segment_df.loc[cna_query_segment_df['n_major'] + cna_query_segment_df['n_minor'] == 0,"loss_cond"] = True

cna_query_segment_df['variant_class'] = 'undefined'
cna_query_segment_df.loc[cna_query_segment_df.amp_cond, 'variant_class'] = 'gain'
cna_query_segment_df.loc[cna_query_segment_df.loss_cond, 'variant_class'] = 'homdel'

cna_query_segment_df.loc[cna_query_segment_df.loss_cond, 'aberration_key'] = \
cna_query_segment_df.loc[cna_query_segment_df.loss_cond, 'entrezgene'].astype(str) + '_ablation'

Expand All @@ -165,7 +169,7 @@ def annotate_cna_segments(output_fname: str,

## remove all temporary files
for fname in temp_files:
utils.remove(fname)
remove_file(fname)

cna_query_segment_df.columns = map(str.upper, cna_query_segment_df.columns)
cna_query_segment_df.rename(columns = {'CHROMOSOME':'CHROM','SEGMENT_ID':'VAR_ID'}, inplace = True)
Expand Down Expand Up @@ -253,7 +257,7 @@ def annotate_cytoband(cna_segments_bt: BedTool, output_dir: str, pcgr_build_db_d

## remove all temporary files
for fname in temp_files:
utils.remove(fname)
remove_file(fname)

return cytoband_annotated_segments

Expand Down Expand Up @@ -363,7 +367,7 @@ def annotate_transcripts(cna_segments_bt: BedTool, output_dir: str,

## remove all temporary files
for fname in temp_files:
utils.remove(fname)
remove_file(fname)

return(cna_segments_annotated)

Expand Down
2 changes: 1 addition & 1 deletion pcgr/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ def create_config(arg_dict, workflow = "PCGR"):
'exclude_nonexonic': int(arg_dict['exclude_nonexonic'])
}
conf_options['somatic_snv']['msi'] = {
'run': int(arg_dict['estimate_msi_status'])
'run': int(arg_dict['estimate_msi'])
}
conf_options['somatic_snv']['tmb'] = {
'run': int(arg_dict['estimate_tmb']),
Expand Down
4 changes: 1 addition & 3 deletions pcgr/cpsr.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ def get_args():
optional_panel.add_argument('--panel_id',dest = "virtual_panel_id",type = str, default = "-1", help="Comma-separated string with identifier(s) of predefined virtual cancer predisposition gene panels,\nchoose any combination of the following identifiers (GEP = Genomics England PanelApp):\n" + str(pcgr_vars.panels))
optional_panel.add_argument('--custom_list',dest = "custom_list",help="Provide custom list of genes from virtual panel 0 (single-column .txt/.tsv file with Ensembl gene identifiers),\n alternative to predefined panels provided with --panel_id)")
optional_panel.add_argument('--custom_list_name',dest = "custom_list_name", default="None", help="Set name for custom made panel/list (single word - no whitespace), will be displayed in the report")
optional_panel.add_argument('--diagnostic_grade_only', action="store_true",help="For panel_id's 1-42 (Genomics England PanelApp) - consider genes with a GREEN status only, default: %(default)s")
optional_panel.add_argument('--diagnostic_grade_only', action="store_true",help="For panel_id's 1-44 (Genomics England PanelApp) - consider genes with a GREEN status only, default: %(default)s")

optional_other.add_argument('--force_overwrite', action = "store_true", help='By default, the script will fail with an error if any output file already exists.\n You can force the overwrite of existing result files by using this flag, default: %(default)s')
optional_other.add_argument('--version', action='version', version=str(utils.get_cpsr_version()))
Expand Down Expand Up @@ -204,7 +204,6 @@ def run_cpsr(conf_options, cpsr_paths):
output_vcf = vep_vcf)

logger = getlogger('cpsr-vep')
#print(str(vep_command["main"]))

logger.info((
f"CPSR - STEP 1: Basic variant annotation with Variant Effect Predictor (version {pcgr_vars.VEP_VERSION}, "
Expand All @@ -224,7 +223,6 @@ def run_cpsr(conf_options, cpsr_paths):
check_subprocess(logger, vep_command["tabix"], debug)
logger.info("Finished cpsr-vep")
print('----')
#exit(0)

## CPSR|vcfanno - run vcfanno on query VCF with a number of relevant annotated VCFs
logger = getlogger('cpsr-vcfanno')
Expand Down
5 changes: 2 additions & 3 deletions pcgr/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,10 +62,9 @@ def cli():
optional_allelic_support.add_argument("--control_af_max", type=float, default=1, dest="control_af_max", help="If VCF INFO tag for variant allelic fraction (control) is specified and found, set maximum tolerated AF for inclusion in report (default: %(default)s)")

optional_tmb_msi.add_argument("--estimate_tmb", action="store_true", help="Estimate tumor mutational burden from the total number of somatic mutations and target region size, default: %(default)s")
#optional_tmb_msi.add_argument("--tmb_algorithm", dest="tmb_algorithm", default="all_coding", choices=[ "all_coding", "nonsyn"], help="Method for calculation of TMB, all coding variants (Chalmers et al., Genome Medicine, 2017), or non-synonymous variants only, default: %(default)s")
optional_tmb_msi.add_argument("--tmb_dp_min", dest="tmb_dp_min", default=0, help="If VCF INFO tag for sequencing depth (tumor) is specified and found, set minimum required sequencing depth for TMB calculation: default: %(default)s")
optional_tmb_msi.add_argument("--tmb_af_min", dest="tmb_af_min", default=0, help="If VCF INFO tag for allelic fraction (tumor) is specified and found, set minimum required allelic fraction for TMB calculation: default: %(default)s")
optional_tmb_msi.add_argument("--estimate_msi_status", action="store_true", help="Predict microsatellite instability status from patterns of somatic mutations/indels, default: %(default)s")
optional_tmb_msi.add_argument("--estimate_msi", action="store_true", help="Predict microsatellite instability status from patterns of somatic mutations/indels, default: %(default)s")


optional_assay.add_argument("--assay", dest="assay", default="WES", choices=[ "WGS", "WES","TARGETED"], help="Type of DNA sequencing assay performed for input data (VCF), default: %(default)s")
Expand All @@ -77,7 +76,7 @@ def cli():
optional_signatures.add_argument("--min_mutations_signatures", type=int, default=200, dest="min_mutations_signatures", help="Minimum number of SNVs required for reconstruction of mutational signatures (SBS) by MutationalPatterns (default: %(default)s, minimum n = 100)")
optional_signatures.add_argument("--all_reference_signatures", action="store_true", help="Use all reference mutational signatures (SBS, n = 67) in signature reconstruction rather than only those already attributed to the tumor type (default: %(default)s)")
optional_signatures.add_argument("--include_artefact_signatures", action="store_true", help="Include sequencing artefacts in the collection of reference signatures (default: %(default)s")
optional_signatures.add_argument("--prevalence_reference_signatures", type=int, default=5, choices=[1,2,5,10,15,20], help="Minimum tumor-type prevalence (in percent) of reference signatures to be included in refitting procedure (default: %(default)s)")
optional_signatures.add_argument("--prevalence_reference_signatures", type=int, default=1, choices=[1,2,5,10,15,20], help="Minimum tumor-type prevalence (in percent) of reference signatures to be included in refitting procedure (default: %(default)s)")

optional_other.add_argument("--cpsr_report", dest="cpsr_report", help="CPSR report file (Gzipped JSON - file ending with 'cpsr.<genome_assembly>.json.gz' - germline report of patient's blood/control sample")
optional_other.add_argument("--vcf2maf", action="store_true", help="Generate a MAF file for input VCF using https://github.com/mskcc/vcf2maf (default: %(default)s)")
Expand Down
4 changes: 2 additions & 2 deletions pcgr/pcgr_vars.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
from pcgr._version import __version__

PCGR_VERSION = __version__
DB_VERSION = '20240203'
DB_VERSION = '20240209'

## MISCELLANEOUS
NCBI_BUILD_MAF = 'GRCh38'
Expand Down Expand Up @@ -103,7 +103,7 @@
37: "Renal cancer pertinent cancer susceptibility (GEP)",
38: "Rhabdoid tumour predisposition (GEP)",
39: "Sarcoma cancer susceptibility (GEP)",
40: "Sarcoma susceptbility (GEP)",
40: "Sarcoma susceptibility (GEP)",
41: "Thyroid cancer pertinent cancer susceptibility (GEP)",
42: "Tumour predisposition - childhood onset (GEP)",
43: "Upper gastrointestinal cancer pertinent cancer susceptibility (GEP)",
Expand Down
4 changes: 2 additions & 2 deletions pcgrr/DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ Package: pcgrr
Type: Package
Title: Personal Cancer Genome ReporteR
Version: 1.4.1.9001
Date: 2023-12-30
Date: 2024-12-18
Authors@R:
c(person(given = "Sigve",
family = "Nakken",
Expand Down Expand Up @@ -69,5 +69,5 @@ Suggests:
BSgenome.Hsapiens.UCSC.hg38
Encoding: UTF-8
LazyData: true
RoxygenNote: 7.2.3
RoxygenNote: 7.3.1
Roxygen: list(markdown = TRUE)
7 changes: 5 additions & 2 deletions pcgrr/NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ export(append_gwas_citation_phenotype)
export(append_otargets_pheno_link)
export(append_tcga_var_link)
export(append_tfbs_annotation)
export(assign_acmg_tiers)
export(assign_germline_popfreq_status)
export(assign_mutation_type)
export(assign_somatic_classification)
Expand All @@ -28,11 +29,13 @@ export(filter_eitems_by_site)
export(filter_read_support)
export(generate_annotation_link)
export(generate_pcgr_report)
export(generate_pcgr_report2)
export(generate_report_data_kataegis)
export(generate_report_data_msi)
export(generate_report_data_rainfall)
export(generate_report_data_signatures_mp)
export(generate_report_data_snv_indel)
export(generate_report_data_snv_indel2)
export(generate_report_data_tmb)
export(generate_report_data_trials)
export(generate_report_data_tumor_only)
Expand All @@ -45,15 +48,15 @@ export(get_cna_overlapping_transcripts)
export(get_genome_obj)
export(get_oncogene_tsgene_target_sets)
export(get_ordinary_chromosomes)
export(get_population_tag)
export(get_prevalent_site_signatures)
export(get_proper_maf_alleles)
export(get_valid_chromosomes)
export(het_af_germline_status)
export(hom_af_status)
export(init_cna_content)
export(init_germline_content)
export(init_kataegis_content)
export(init_m_signature_content)
export(init_msi_content)
export(init_rainfall_content)
export(init_report)
export(init_report_display_content)
Expand Down

0 comments on commit 05ed185

Please sign in to comment.