# Tests

In [1]:
import os
import sys
import logging
import requests
import json
import re
import pandas as pd

sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))
from lib.nanuq import Nanuq
from lib.gapp import Phenotips
from lib.gapp import BSSH
from lib.samplesheet import SampleSheet

# Create one instance of each class imported from lib.nanuq and lib.gapp;
# the instances are built in the same order as before (Nanuq, Phenotips, BSSH).
nq, pho, bssh = Nanuq(), Phenotips(), BSSH()

class Args:
    """Minimal stand-in for an argparse namespace.

    Stores the two constructor arguments as the attributes ``run`` and
    ``level`` so that ``vars(args)`` looks like parsed command-line options.
    """

    def __init__(self, run='LH00336_0006', level='info'):
        # Assigned in this order so that vars() lists 'run' before 'level'.
        self.run, self.level = run, level

# Simulated command-line arguments: default run, 'debug' level.
args = Args(level='debug')
# Show the attributes as a plain dict (same mapping that vars(args) returns).
args.__dict__

{'run': 'LH00336_0006', 'level': 'debug'}

In [2]:
# Load a sample sheet and display the first row of its DragenGermline_Data
# section (the column names, per the output recorded below).
#
# The file location can be overridden with the SAMPLESHEET_PATH environment
# variable, so the notebook can run on machines other than the original
# workstation; without the variable the original path is used unchanged.
samplesheet_path = os.environ.get(
    'SAMPLESHEET_PATH',
    'D:\\HSJ\\Projects\\PRAGMatIQ\\Runs\\Seq_10B_PRAG_20240130.csv',
)
sheet = SampleSheet(samplesheet_path)
sheet.sections['DragenGermline_Data'][0]

['Sample_ID',
 'ReferenceGenomeDir',
 'VariantCallingMode',
 'QcCoverage1BedFile',
 'QcCoverage2BedFile',
 'QcCoverage3BedFile',
 'QcCrossContaminationVcfFile']

In [4]:
# Fetch the sample sheet of run LH00336_0009 through the Nanuq client and
# split the text of the response into a list of lines.
samplesheet_response = nq.get_samplesheet('LH00336_0009')
content = samplesheet_response.text.splitlines()
content

['[Header]',
 'FileFormatVersion,2',
 'RunName,LH00336_0009',
 'InstrumentPlatform,NovaSeq',
 'IndexOrientation,Forward',
 '',
 '[Reads]',
 'Read1Cycles,151',
 'Read2Cycles,151',
 'Index1Cycles,10',
 'Index2Cycles,10',
 '',
 '[BCLConvert_Settings]',
 'SoftwareVersion,3.8.4',
 'FastqCompressionFormat,gzip',
 '',
 '[BCLConvert_Data]',
 'Lane,Sample_ID,Index,Index2,Sample_Project',
 '1,24159,TCTCATGATA,AACACGTGGA,PRAGMatIQ_CHUSJ',
 '1,24160,CGAGGCCAAG,GTGTTACCGG,PRAGMatIQ_CHUSJ',
 '1,24161,TTCACGAGAC,AGATTGTTAC,PRAGMatIQ_CHUSJ',
 '1,24162,GCGTGGATGG,TTGACCAATG,PRAGMatIQ_CHUSJ',
 '1,24163,TCTGGTATCC,CGTTGCTTAC,PRAGMatIQ_CHUSJ',
 '1,24164,CATTAGTGCG,TGACTACATA,PRAGMatIQ_CHUSJ',
 '1,24212,ACGGTCAGGA,CGGCCTCGTT,PRAGMatIQ_CUSM',
 '1,24213,GGCAAGCCAG,CAAGCATCCG,PRAGMatIQ_CUSM',
 '1,24214,TGTCGCTGGT,TCGTCTGACT,PRAGMatIQ_CUSM',
 '1,24215,ACCGTTACAA,CTCATAGCGA,PRAGMatIQ_CHUQ',
 '1,24216,TATGCCTTAC,AGACACATTA,PRAGMatIQ_CHUQ',
 '1,24217,ACTGGATCTA,TCGCCGCTAG,PRAGMatIQ_CHUQ',
 '1,24251,TGGTACCTAA,CAT

In [5]:
# Collect the first field (Sample_ID) of every data row that belongs to the
# [DragenGermline_Data] section of the sample sheet lines held in `content`.
# The result is an empty list when the sheet has no such section.
samples = []
# No section seen yet. Initialising here prevents a NameError when the first
# line is not a "[...]" header, and prevents reusing a stale value left in the
# kernel by a previous run of this cell.
section = ''
for line in content:
    if line.startswith('['):
        # Section header: remember it and move on to the next line.
        section = line
        continue
    if not section.startswith('[DragenGermline_Data]'):
        continue
    cols = line.split(',')
    # Skip the column-name row, lines without a comma (e.g. blank separators)
    # and padding rows whose first field is empty (e.g. ",,,").
    if line.startswith('Sample_ID') or len(cols) <= 1 or not cols[0]:
        continue
    samples.append(cols[0])
samples


[]

In [None]:
# Get authorization token from Emedgene
# Please note - the Authorization header is only valid for a limited time, and
# expires after 8H. In that case, any request made with an expired token will
# return a 403 code. To resolve, re-do the Login procedure to get a new token.
#
# Credentials are read from the EMG_USERNAME and EMG_PASSWORD environment
# variables so that they are never stored in the notebook; a missing variable
# raises KeyError before any request is sent.
# NOTE(review): the password that used to be hardcoded in this cell has been
# exposed in the file history and should be rotated.
#
url      = "https://chusaintejustine.emedgene.com/api/auth/api_login/"
payload  = json.dumps({
    "username": os.environ["EMG_USERNAME"],
    "password": os.environ["EMG_PASSWORD"],
})
headers  = {'Content-Type': 'application/json'}
# A timeout keeps the cell from hanging forever if the server does not answer.
response = requests.request("POST", url, headers=headers, data=payload, timeout=60)
# Fail here on a rejected login instead of on a missing "Authorization" key.
response.raise_for_status()
auth_header = response.json()["Authorization"]
# Display the HTTP status rather than the token, so that the secret does not
# end up in the saved cell output.
response.status_code


In [None]:
# Get case for EMG634551172 (GM240123)
#
case_id = "EMG634551172"
get_test_response = requests.get(
    f'https://chusaintejustine.emedgene.com/api/test/{case_id}/',
    # Empty JSON body kept from the original request.
    # NOTE(review): confirm that the API actually requires a body on GET.
    json={},
    headers={'Authorization': auth_header},
    # Without a timeout the call can block indefinitely on a silent server.
    timeout=60,
)
# Surface HTTP errors (e.g. the 403 returned for an expired token) right away
# instead of failing later on an unexpected response body.
get_test_response.raise_for_status()
# with open('D:\\HSJ\\Projects\\PRAGMatIQ\\emg_scripts\\emg_case_example-GM240123.json', 'w') as fh:
#     json.dump(get_test_response.json(), fh, indent=4)
get_test_response.json().keys()


In [None]:
# Transfer files to aws
# --profile "emedgene" or "emedgene-eval"
aws s3 ls --profile emedgene-eval s3://cac1-prodca-emg-downloads/Ste_Justine_eval/upload

In [20]:
# Read "<sample>.metrics.json" from the current working directory and keep the
# mapping found under Attributes > illumina_dragen_complete_v0_4, stored in a
# dict keyed by sample name.
sample = '24159'
samples_metrics = {}
metrics_path = f"{sample}.metrics.json"
with open(metrics_path, mode="r") as fh:
    report = json.load(fh)
metrics = report['Attributes']['illumina_dragen_complete_v0_4']

samples_metrics.update({sample: metrics})
samples_metrics

{'24159': {'aligned_bases': 116701731183,
  'aligned_bases_in_genome': 116701731183,
  'aligned_bases_in_genome_pct': 100.0,
  'aligned_reads': 813636857,
  'aligned_reads_in_genome': 813636857,
  'aligned_reads_in_genome_pct': 100.0,
  'autosomal_median_coverage': 52.79,
  'average_alignment_coverage_over_genome': 38.61,
  'average_autosomal_coverage_over_genome': 40.86,
  'average_chr_x_coverage_over_genome': 20.09,
  'average_chr_y_coverage_over_genome': 17.58,
  'average_mitochondrial_coverage_over_genome': 2981.83,
  'bases_in_reference_genome': 3706390998,
  'chr10_autosomal_median_coverage_ratio': 1.0,
  'chr11_autosomal_median_coverage_ratio': 1.0,
  'chr12_autosomal_median_coverage_ratio': 1.0,
  'chr13_autosomal_median_coverage_ratio': 1.0,
  'chr14_autosomal_median_coverage_ratio': 1.0,
  'chr15_autosomal_median_coverage_ratio': 1.0,
  'chr16_autosomal_median_coverage_ratio': 1.01,
  'chr17_autosomal_median_coverage_ratio': 1.01,
  'chr18_autosomal_median_coverage_ratio': 0.

In [23]:
# Turn the per-sample metrics dict into a table: with orient="index" each
# outer key (sample name) becomes a row label and each metric a column.
df_samples_metrics = pd.DataFrame.from_dict(
    data=samples_metrics,
    orient="index",
)
df_samples_metrics

Unnamed: 0,aligned_bases,aligned_bases_in_genome,aligned_bases_in_genome_pct,aligned_reads,aligned_reads_in_genome,aligned_reads_in_genome_pct,autosomal_median_coverage,average_alignment_coverage_over_genome,average_autosomal_coverage_over_genome,average_chr_x_coverage_over_genome,...,variants_snp_transversions_pass,variants_snps_pass,variants_snps_pass_pct,variants_ti_to_tv_ratio_pass,variants_total_pass,variants_total_pass_pct,x_median_autosomal_median,x_median_coverage,y_median_autosomal_median,y_median_coverage
24159,116701731183,116701731183,100.0,813636857,813636857,100.0,52.79,38.61,40.86,20.09,...,1899623,4144821,80.51,1.98,5148371,100.0,0.5,26.23,0.51,26.78


In [None]:
# Names of three metric columns, presumably meant to select a reduced view of
# the metrics table.
# NOTE(review): in the lines visible here `subset_cols` is never applied, so
# `df` is only a second name for the full `df_samples_metrics` frame (not a
# copy) - confirm whether `df_samples_metrics[subset_cols]` was intended.
subset_cols = ['aligned_reads', 'aligned_reads_in_genome_pct', 'cnv_coverage_uniformity']
df = df_samples_metrics