In [1]:
%load_ext autoreload
%autoreload 2
import os
import pandas as pd
import numpy as np 
import random
from pyfaidx import Fasta
from tqdm.auto import tqdm
import pickle as pk
from flaml import AutoML
from collections import Counter

# ------------
from metaCH.src.feature_extraction.extract_features import extract_features
from metaCH.src.classification.training import cfDNA_classifier_train_save, seq_classifier_train_save,cfDNA_classifier_train, meta_classifier_train_save
from metaCH.src.utils import load_config
# Project configuration; the cells below read dataset locations from config["path_info"].
config = load_config()

# Stage 1

## Training the METk model
- Instructions for training the METk model can be found in the [METk repo](https://github.com/gaarangoa/METk).
- METk weights can be downloaded by following the
  [download instructions in the METk README](https://github.com/gaarangoa/METk?tab=readme-ov-file#download-metk-embeddings).

# Stage 2

## 2.a cfDNA-based classifier training

In [None]:
### Load and preprocess the Razavi cfDNA data
path_razavi_dataset = config["path_info"]["path_razavi_dataset"]

genomics_table_path = f"{path_razavi_dataset}/Razavi.CHIP.tsv"
clinical_table_path = f"{path_razavi_dataset}/41591_2019_652_MOESM8_ESM.xlsx"

genomics_table = pd.read_csv(genomics_table_path, sep='\t')
# Third sheet of the supplementary workbook, one row kept per patient_id.
clinical_table = pd.read_excel(
    clinical_table_path,
    sheet_name=2,
    skiprows=0,
    engine='openpyxl',
).drop_duplicates(subset=['patient_id'])

# Lookup tables used to derive the standardised columns below.
variant_type_codes = {
    'Small insertion': 'INS',
    'Small deletion': 'DEL',
    'Single base substitution': 'SNP',
}
cancer_type_by_tissue = {
    'Lung': 'Non-Small Cell Lung Cancer',
    'Prostate': 'Prostate Cancer',
    'Breast': 'Breast Cancer',
}

# Add the column names handed to extract_features; any tissue that is not
# Lung/Prostate/Breast is labelled 'Control'.
genomics_table = genomics_table.assign(
    Gene_Name=genomics_table['Hugo_Symbol'],
    Tumor_Allele_1=genomics_table['Tumor_Allele'],
    Tumor_Allele_2=genomics_table['Tumor_Allele'],
    Variant_Type=genomics_table['Mutation_Type'].replace(variant_type_codes),
    Reference_Genome='GRCh37',
    CANCER_TYPE=genomics_table['tissue'].map(cancer_type_by_tissue).fillna('Control'),
)

# Keep only the two matched trials; the WBC-matched trial is labelled 'Blood',
# the biopsy-matched trial 'Tumor'.
matched_trials = ['Razavi_2019_wbc_matched', 'Razavi_2019_biopsy_matched']
genomics_table = genomics_table.loc[
    genomics_table['Trial'].isin(matched_trials)
].reset_index(drop=True)
genomics_table['label'] = (
    genomics_table['Trial']
    .eq('Razavi_2019_wbc_matched')
    .map({True: 'Blood', False: 'Tumor'})
)

# Discard rows whose tissue fell into the 'Control' bucket.
genomics_table = genomics_table.loc[
    genomics_table['CANCER_TYPE'].ne('Control')
].reset_index(drop=True)
parsed_data = genomics_table.copy()

### Extract features from the parsed data
print(genomics_table.shape)
features_metk_cfdna = extract_features(parsed_data, 'paper')


### Train the cfDNA-based classifier and save the model
save_path = './models/cfDNA_classifier.pk'
cfDNA_classifier_train_save(features_metk_cfdna, save_path)

# Alternative call kept for reference (not executed):
# from metaCH.src.classification.training import cfDNA_classifier_train
# results = cfDNA_classifier_train(features_metk_cfdna, 'paper')

## 2.b Sequence-based classifier training

In [None]:
## Read/combine/preprocess the tumor + blood sequencing datasets
path_cBio_MSK_dataset = config["path_info"]["path_cBio_MSK_datasets"]

# ----------------- msk_impact_2017: mutations tagged with SAMPLE_ORIGIN='Tumor'
tumor = pd.read_csv(
    path_cBio_MSK_dataset + '/msk_impact_2017/data_mutations_extended.txt',
    sep='\t', low_memory=False, comment='#'
)
tumor['SAMPLE_ORIGIN'] = 'Tumor'

clinical_file = path_cBio_MSK_dataset + '/msk_impact_2017/data_clinical_sample.txt'
clinical_tumor = pd.read_csv(clinical_file, sep='\t', low_memory=False, comment='#')

clinical_patient_file = path_cBio_MSK_dataset + '/msk_impact_2017/data_clinical_patient.txt'
clinical_patient_tumor = pd.read_csv(clinical_patient_file, sep='\t', low_memory=False, comment='#')

# Attach patient-level columns to every sample row (left join keeps all samples).
clinical_tumor = pd.merge(clinical_tumor, clinical_patient_tumor, on='PATIENT_ID', how='left')
clinical_tumor['dataset_origin'] = 'Tumor'

# ----------------- msk_ch_2020: mutations tagged with SAMPLE_ORIGIN='Blood'
ch = pd.read_csv(
    path_cBio_MSK_dataset + '/msk_ch_2020/data_mutations_extended.txt',
    sep='\t', low_memory=False, comment='#'
)
ch['SAMPLE_ORIGIN'] = 'Blood'

clinical_file = path_cBio_MSK_dataset + '/msk_ch_2020/data_clinical_sample.txt'
clinical_chip = pd.read_csv(clinical_file, sep='\t', low_memory=False, comment='#')

clinical_patient_file = path_cBio_MSK_dataset + '/msk_ch_2020/data_clinical_patient.txt'
clinical_patient_chip = pd.read_csv(clinical_patient_file, sep='\t', low_memory=False, comment='#')

clinical_chip = pd.merge(clinical_chip, clinical_patient_chip, on='PATIENT_ID', how='left')
clinical_chip['dataset_origin'] = 'Blood'

# ----------------- Combine both studies
# One clinical row per SAMPLE_ID; on a collision the tumor-study row wins
# because it comes first in the concat.
clinical_data = pd.concat([clinical_tumor, clinical_chip]).drop_duplicates(subset=['SAMPLE_ID'])
dataset = pd.concat([tumor, ch])
print(Counter(dataset.Variant_Type))

# Keep only insertions, deletions and single-nucleotide variants.
dataset = dataset[dataset.Variant_Type.isin(['INS', 'DEL', 'SNP', 'SNV'])].reset_index(drop=True)

# Alias the MAF-style columns to the names handed to extract_features.
dataset['Gene_Name'] = dataset['Hugo_Symbol']
dataset['Tumor_Allele_1'] = dataset['Tumor_Seq_Allele1']
dataset['Tumor_Allele_2'] = dataset['Tumor_Seq_Allele2']
dataset['Sample_ID'] = dataset['Tumor_Sample_Barcode']
dataset['Reference_Genome'] = dataset['NCBI_Build']

# ----------------- Drop silent variants and attach clinical annotations
dataset = dataset[dataset.Variant_Classification != 'Silent'].reset_index(drop=True)
# Inner join: mutations whose sample has no clinical record are dropped.
dataset = pd.merge(dataset, clinical_data, left_on='Sample_ID', right_on='SAMPLE_ID', how='inner')
print(Counter(dataset.Variant_Type))

# Rows without a cbp_driver_annotation are given the value 'Tumor'.
# Assign the filled column back instead of calling fillna(inplace=True) on the
# attribute accessor: that chained in-place form triggers a FutureWarning in
# pandas 2.2 and no longer updates `dataset` under Copy-on-Write, which would
# leave NaNs in a column used as a de-duplication key just below.
dataset['cbp_driver_annotation'] = dataset['cbp_driver_annotation'].fillna('Tumor')

# Collapse repeated observations of the same variant/annotation combination.
dataset = dataset.drop_duplicates(
    subset=['Chromosome', 'Start_Position', 'Reference_Allele', 'Tumor_Allele_1', 'Tumor_Allele_2', 'cbp_driver_annotation', 'HGVSp_Short']
)
print(Counter(dataset.Variant_Type))

# ------------------------- Extract features from the parsed data
features_metk_seq = extract_features(dataset, 'paper')

# ------------------------- Train the sequence-based classifier and save the model
# NOTE(review): '{n}' is passed through literally; presumably
# seq_classifier_train_save fills it in (Stage 3 loads
# './models/seq1_classifier_final.pk' and './models/seq2_classifier_final.pk')
# -- confirm against its implementation.
save_path = './models/seq{n}_classifier.pk'
seq_classifier_train_save(features_metk_seq, save_path)

# Stage 3

## Training the meta classifier

In [None]:
# Base table for the meta classifier, built from the cfDNA features of Stage 2.a.
# NOTE(review): the result is used below as a DataFrame-like object that already
# carries a "CH_cfdna" column -- confirm against cfDNA_classifier_train. The
# commented-out example in section 2.a passes a second argument ('paper');
# confirm which signature is intended.
master_cfdna_dataset = cfDNA_classifier_train(features_metk_cfdna)

# Each pickle holds a [model, feature_list] pair. Keep every model together with
# the feature list stored next to it: the previous unpacking bound the lists
# crosswise, so each model was scored on the other model's features.
# Files are opened with `with` so the handles are closed after loading.
# SECURITY: pickle.load executes arbitrary code -- only load model files that
# were produced locally by the Stage 2 training cells.
with open('./models/seq2_classifier_final.pk', 'rb') as model_file:
    [prod_CH, ch_features] = pk.load(model_file)
with open('./models/seq1_classifier_final.pk', 'rb') as model_file:
    [prod_CHPD, chpd_features] = pk.load(model_file)

# Column 0 of predict_proba is the probability of the model's first class.
# NOTE(review): presumably that is the CH/'Blood' class -- verify via classes_.
master_cfdna_dataset['CH_seq2'] = prod_CH.predict_proba(master_cfdna_dataset[ch_features])[:, 0]
master_cfdna_dataset['CH_seq1'] = prod_CHPD.predict_proba(master_cfdna_dataset[chpd_features])[:, 0]

# The meta classifier is trained on the three base-classifier scores.
metaclass_features = ["CH_cfdna", "CH_seq2", "CH_seq1"]

# ------------------------- Train the meta classifier and save the model
save_path = './models/metaClassifier.pk'
meta_classifier_train_save(master_cfdna_dataset, metaclass_features, save_path)
    