## genotype preprocessing

Somatic-mutation genotypes were preprocessed with nextflow/sarek (variant calling and annotation), followed by VCF processing in R. This document is a how-to tutorial for that workflow.

### 1. CHIP panel preprocessing

1. liftOver 

```
# Lift the hg19 CHIP panel BED over to hg38.
# Positional arguments: input BED, chain file, lifted output, unlifted records.
liftOver -bedPlus=3 -tab \
    UCSC_SGI_CHIP_V1_Rev2_TE-93684413_hg19_210301150844.modified.bed \
    /ssd-data/support/tool/liftover/hg19ToHg38.over.chain.gz \
    UCSC_SGI_CHIP_V1_Rev2_TE-93684413_hg19_210301150844_liftover_to_hg38.bed \
    UCSC_SGI_CHIP_V1_Rev2_TE-93684413_hg19_210301150844_nolifted.bed
```
2. calling CHIP using nextflow/sarek

__calling somatic variant__
```
# Targeted CHIP panel: Mutect2 calling plus VEP and snpEff annotation,
# restricted to the lifted-over panel intervals.
nextflow run nf-core/sarek \
    -r 3.1.1 \
    -c nextflow_targeted_hg38.config \
    -profile docker \
    -resume \
    --genome GATK.GRCh38 \
    --wes \
    --intervals UCSC_SGI_CHIP_V1_Rev2_TE-93684413_hg19_210301150844_liftover_to_hg38.bed \
    --nucleotides_per_second 30000 \
    --input samplelist.csv \
    --tools mutect2,vep,snpeff \
    --outdir .
```

### 2. WES preprocessing

* This pipeline needs access to a cloud service provider such as AWS. Running the following command on a local computer is not recommended.

1. calling somatic mutations in WES using nextflow/sarek
```
# WES cohort on AWS Batch: Mutect2 calling plus VEP and snpEff annotation.
# Work directory, references, intervals and results all live on S3.
./nextflow run nf-core/sarek \
    -r 3.1.1 \
    -c ./aws_nf.conf \
    -profile docker,awsbatch \
    -resume \
    -bucket-dir s3://bucket/ \
    --genome GATK.GRCh38 \
    --igenomes_base s3://bucket/references/ \
    --wes \
    --intervals s3://bucket/references/S04380110_Regions.bed \
    --input ./samplesheet_WES_pilot_aws.csv \
    --tools mutect2,vep,snpeff \
    --outdir s3://bucket/key_out/
```


### 3. Gathering VCFs and finalizing files

1. retain PASS calls

```
# Keep records whose FILTER is PASS or unset (".").
# -Oz writes bgzip-compressed VCF so the content matches the .vcf.gz name;
# redirecting the default output with ">" would store plain-text VCF instead.
bcftools view -f 'PASS,.' -Oz -o each_sample.PASS.VEP.vcf.gz each_sample.VEP.vcf.gz
bcftools view -f 'PASS,.' -Oz -o each_sample.PASS.snpeff.vcf.gz each_sample.snpeff.vcf.gz
```

2. use following scripts in R

In [None]:
# Packages attached for the cells below.
vlib <- c("tidyverse", "data.table", "vcfR", "ggpubr")
# library() stops with an error when a package is missing; require() would
# only return FALSE and let a later cell fail with a less obvious message.
invisible(suppressMessages(lapply(vlib, library, character.only = TRUE)))

# Both paths are relative, so they resolve against the working directory
# that is current when they are first used.
chr_anno_path <- "202111-CHIP/CHIPv4_batch5/annotation/"
chr_probeset <- "202111-CHIP/CHIP/SGI_CHIP/all_target_segments_covered_by_probes_SGI_CHIP_V1_Rev2_TE-93684413_GRCh38_lifted.bed"

In [None]:
# Build the per-sample table of annotated VCF paths.
#
# The probe BED is read BEFORE changing directory: chr_probeset is a relative
# path, so reading it after setwd() would look for it inside the annotation
# directory.
probeset <- fread(chr_probeset)
setwd(chr_anno_path)

# list.files() takes a regular expression, not a shell glob. "D_*" would match
# any name containing a "D"; "^D_" selects names that start with "D_".
c_list_samples <- list.files(pattern = "^D_")
if (length(c_list_samples) == 0) {
  stop("no sample directories starting with 'D_' found in ", getwd(),
       call. = FALSE)
}

# One row per sample: <ID>/<ID>.mutect2.filtered_{snpEff,VEP}.ann.vcf.gz
df_fname <- data.frame(
  ID = c_list_samples,
  snpeff = file.path(
    c_list_samples,
    paste0(c_list_samples, ".mutect2.filtered_snpEff.ann.vcf.gz")
  ),
  vep = file.path(
    c_list_samples,
    paste0(c_list_samples, ".mutect2.filtered_VEP.ann.vcf.gz")
  )
)
cat("sample size is", nrow(df_fname), "\n")
df_fname %>% head()

In [None]:
# Column names used to split the "|"-delimited annotation strings.

# Sub-fields of the snpEff ANN INFO field, in order.
c_ANN_colnames <- c(
  "Allele", "Annotation", "Annotation_Impact", "Gene_Name",
  "Gene_ID", "Feature_Type", "Feature_ID", "Transcript_BioType",
  "RANK", "HGVS.c", "HGVS.p", "cDNA_Pos",
  "CDS_Pos", "AA_Pos", "Distance", "NOTES"
)

# Sub-fields of the VEP CSQ INFO field, in order.
c_CSQ_colnames <- c(
  # consequence and transcript description
  "Allele", "Consequence", "IMPACT", "SYMBOL", "Gene", "Feature_type",
  "Feature", "BIOTYPE", "EXON", "INTRON", "HGVSc", "HGVSp",
  "cDNA_position", "CDS_position", "Protein_position",
  "Amino_acids", "Codons", "Existing_variation",
  "DISTANCE", "STRAND", "FLAGS", "VARIANT_CLASS", "SYMBOL_SOURCE",
  "HGNC_ID", "CANONICAL", "MANE_SELECT", "MANE_PLUS_CLINICAL",
  "TSL", "APPRIS", "CCDS", "ENSP", "SWISSPROT",
  "TREMBL", "UNIPARC", "UNIPROT_ISOFORM", "GENE_PHENO",
  "SIFT", "PolyPhen", "DOMAINS", "miRNA",
  # population allele frequencies
  "AF", "AFR_AF", "AMR_AF", "EAS_AF", "EUR_AF", "SAS_AF",
  # gnomAD exome allele frequencies
  "gnomADe_AF", "gnomADe_AFR_AF", "gnomADe_AMR_AF", "gnomADe_ASJ_AF",
  "gnomADe_EAS_AF", "gnomADe_FIN_AF", "gnomADe_NFE_AF", "gnomADe_OTH_AF",
  "gnomADe_SAS_AF",
  # gnomAD genome allele frequencies
  "gnomADg_AF", "gnomADg_AFR_AF", "gnomADg_AMI_AF", "gnomADg_AMR_AF",
  "gnomADg_ASJ_AF", "gnomADg_EAS_AF", "gnomADg_FIN_AF", "gnomADg_MID_AF",
  "gnomADg_NFE_AF", "gnomADg_OTH_AF", "gnomADg_SAS_AF",
  "MAX_AF", "MAX_AF_POPS",
  # clinical, literature and regulatory-motif fields
  "FREQS", "CLIN_SIG", "SOMATIC", "PHENO", "PUBMED",
  "MOTIF_NAME", "MOTIF_POS", "HIGH_INF_POS", "MOTIF_SCORE_CHANGE",
  "TRANSCRIPTION_FACTORS"
)

In [None]:
# Parse every snpEff-annotated VCF into one tidy table per sample.
#
# The absolute path (same one the VEP cell uses) keeps this cell runnable
# whatever working directory an earlier cell left behind.
setwd("/mnt/workspace/202111-CHIP/CHIPv4_batch5/annotation/")

# vector("list", n) preallocates n empty slots. list(mode = , length = ) would
# instead create a two-element list holding the values "vector" and n.
list_res <- vector("list", nrow(df_fname))

# seq_len() yields an empty sequence for zero samples (1:0 would run twice).
for (each in seq_len(nrow(df_fname))) {
  vcf <- read.vcfR(as.character(df_fname$snpeff[each]), verbose = FALSE)
  tidy_vcf <- vcfR2tidy(vcf) |> suppressMessages()
  if (nrow(tidy_vcf$fix) > 0) {
    # Fixed/INFO columns of PASS records, tagged with the sample ID.
    df_snpeff_each_selected <- tidy_vcf$fix %>%
      filter(FILTER == "PASS") %>%
      select(ChromKey, CHROM, POS, REF, ALT, AS_SB_TABLE, ANN) %>%
      add_column(.before = 1, ID = df_fname$ID[each])
    # Per-genotype allelic depth, allele fraction and total depth.
    df_snpeff_each_gt <- tidy_vcf$gt %>%
      select(ChromKey, POS, gt_AD, gt_AF, gt_DP)
    # Join on contig key AND position (as the VEP loop does): POS alone would
    # cross-match variants sharing a coordinate on different contigs.
    # Only the first 16 "|"-separated ANN pieces are kept (extra = "drop").
    df_snpeff_each_res <- left_join(
      df_snpeff_each_selected,
      df_snpeff_each_gt,
      by = c("ChromKey", "POS")
    ) %>%
      select(ID, CHROM:AS_SB_TABLE, gt_AD:gt_DP, ANN) %>%
      separate(col = ANN, into = c_ANN_colnames, sep = "\\|", extra = "drop")
    list_res[[each]] <- df_snpeff_each_res
  }
  # Empty VCFs leave their slot NULL: assigning NULL with [[<- would delete
  # the element, and bind_rows() skips NULL entries anyway.
  cat(each, "th dataset was processed\n")
}

In [None]:
# Stack the per-sample snpEff tables, add a variant key and its cohort count,
# split allelic depth into REF/ALT counts, and save the result.
# write_delim() returns its input invisibly, so df_snpeff_output holds the
# table that was written (still grouped by `bind`).
# The output path is absolute because the working directory is the annotation
# folder at this point, where the relative project path does not exist.
df_snpeff_output <- list_res %>%
  bind_rows() %>%
  # variant key: CHROM:POS_REF_ALT
  mutate(bind = paste0(CHROM, ":", POS, "_", REF, "_", ALT)) %>%
  group_by(bind) %>%
  add_count() %>% # n = rows sharing the same variant key
  separate(gt_AD, into = c("AC_REF", "AC_ALT"), sep = ",") %>%
  #filter(!(Annotation %in% c("synonymous_variant", "intergenic_region", "intron_variant"))) %>%
  write_delim(
    "/mnt/workspace/202111-CHIP/CHIPv4_batch5/230324_vcf_snpeff_output_v1.txt",
    delim = "\t"
  )

In [None]:
# Move to the annotation folder and reset the per-sample result container.
setwd("/mnt/workspace/202111-CHIP/CHIPv4_batch5/annotation/")
# vector("list", n) preallocates n empty slots. list(mode = , length = ) would
# instead create a two-element list holding the values "vector" and n.
list_res <- vector("list", nrow(df_fname))

In [None]:
# Parse every VEP-annotated VCF into one tidy table per sample.
# The container is (re)initialised here so this cell does not depend on the
# shape of any list_res left over from an earlier cell.
list_res <- vector("list", nrow(df_fname))

# seq_len() yields an empty sequence for zero samples (1:0 would run twice).
for (each in seq_len(nrow(df_fname))) {
  vcf <- read.vcfR(as.character(df_fname$vep[each]), verbose = FALSE)
  tidy_vcf <- vcfR2tidy(vcf) |> suppressMessages()
  if (nrow(tidy_vcf$fix) > 0) {
    # Fixed/INFO columns of PASS records, tagged with the sample ID.
    df_vep_each_selected <- tidy_vcf$fix %>%
      filter(FILTER == "PASS") %>%
      select(ChromKey, CHROM, POS, REF, ALT, AS_SB_TABLE, CSQ) %>%
      add_column(.before = 1, ID = df_fname$ID[each])
    # Per-genotype allelic depth, allele fraction and total depth.
    df_vep_each_gt <- tidy_vcf$gt %>%
      select(ChromKey, POS, gt_AD, gt_AF, gt_DP)
    # Join keys spelled out; these are the only columns the two tables share.
    # Only the first length(c_CSQ_colnames) "|"-separated CSQ pieces are kept.
    df_vep_each_res <- left_join(
      df_vep_each_selected,
      df_vep_each_gt,
      by = c("ChromKey", "POS")
    ) %>%
      select(ID:AS_SB_TABLE, gt_AD:gt_DP, CSQ) %>%
      separate(col = CSQ, into = c_CSQ_colnames, sep = "\\|", extra = "drop") %>%
      select(
        ID:AS_SB_TABLE, gt_AD:gt_DP, Consequence, SIFT, PolyPhen,
        SYMBOL:Feature, Existing_variation, AF:MAX_AF_POPS, -ChromKey
      )
    list_res[[each]] <- df_vep_each_res
  }
  # Empty VCFs leave their slot NULL: assigning NULL with [[<- would delete
  # the element, and bind_rows() skips NULL entries anyway.
  cat(each, "th dataset was processed\n")
}


In [None]:
# Stack the per-sample VEP tables and attach, per variant key
# (CHROM:POS_REF_ALT), the number of rows that carry it.
# The result stays grouped by `bind`.
df_vep_output <- bind_rows(list_res) |>
  mutate(bind = paste0(CHROM, ":", POS, "_", REF, "_", ALT)) |>
  group_by(bind) |>
  add_count()

In [None]:
# Merge snpEff and VEP annotations and save the raw combined table.
# No `by` is given, so the join uses every column the two tables share.
setwd("/mnt/workspace/202111-CHIP/CHIPv4_batch5/")
write_delim(
  left_join(df_snpeff_output, df_vep_output),
  "230324_annotation_merged_raw_output.txt",
  delim = "\t"
)

In [None]:
# Apply technical, population-frequency and cohort-recurrence filters to the
# merged annotation table and write the filtered result.
#
# Absolute path: the previous cell already moved into this folder, so the
# relative form of the same path would no longer resolve.
setwd("/mnt/workspace/202111-CHIP/CHIPv4_batch5/")
fread("230324_annotation_merged_raw_output.txt") %>%
  # Technically reliable calls: >= 10 reads for each allele, depth >= 200 and
  # an allele fraction within 0.02-0.40 or 0.60-0.98.
  filter(
    AC_ALT >= 10 & AC_REF >= 10 & gt_DP >= 200 &
      ((gt_AF >= 0.02 & gt_AF <= 0.40) | (gt_AF >= 0.60 & gt_AF <= 0.98))
  ) %>%
  # Larger of the gnomAD exome/genome frequencies; NA only when both are NA.
  mutate(temp_AF_gnomAD = pmax(gnomADe_AF, gnomADg_AF, na.rm = TRUE)) %>%
  # Keep COSV-annotated calls; otherwise keep only calls that are absent from
  # gnomAD or rarer than 1e-5. Matching the literal "COSV" (not a bare "C")
  # avoids rescuing unrelated identifiers that merely contain a "C".
  filter(
    grepl("COSV", Existing_variation, fixed = TRUE) |
      is.na(temp_AF_gnomAD) |
      temp_AF_gnomAD < 1e-5
  ) %>%
  select(-temp_AF_gnomAD, -n) %>%
  # Recount each variant after filtering and drop those seen in 10% or more of
  # the samples.
  # NOTE(review): with fewer than 20 samples the threshold is 0 or 1 and
  # removes every row - confirm the intended cohort size.
  group_by(bind) %>%
  add_count() %>%
  filter(n < floor(nrow(df_fname) / 10)) %>%
  write_delim("230324_annotation_merged_filtered_output.txt", delim = "\t")

# manually filter after this step.