In [1]:
# Bare `pip install` relies on IPython's automagic resolving it to the %pip
# magic; the "restart the kernel" note in the recorded output comes from %pip.
pip install astropy

Note: you may need to restart the kernel to use updated packages.


In [3]:
import os
import statistics
import sys

# `sys` must be imported before it is used, and the project's Code directory
# must be on sys.path before the project-local imports further down.
sys.path.insert(0, '../../Code')

import astropy
import numpy as np
import pandas as pd
from astropy.table import Table
from Bio import SeqIO
from sklearn.metrics import classification_report

import train_and_predict
import util
# NOTE(review): names such as GenomesData and download_genomes are not defined
# in this notebook and presumably arrive through this star import — confirm.
from data import *



In [4]:
# Switch for the optional download step below; it is skipped while False.
download = False

# Target folder handed to download_genomes. Windows-style relative path; the
# raw literal ends in two backslash characters.
output_dir_path = r'..\..\Data\Bacpacs\\'

if download:
    # NOTE(review): `df2` is not defined anywhere in the visible notebook, so
    # turning `download` on would raise NameError on a fresh kernel unless the
    # star import of `data` happens to provide it — confirm.
    all_genomes = df2['Genome ID']
    # `download_genomes` is not defined in this notebook either; presumably it
    # comes from `from data import *` — verify.
    download_genomes(all_genomes, output_dir_path)

# Parse PATRIC genome files

In [5]:
# All Bacpacs inputs live under one folder (Windows-style relative path).
bacpacs_dir_path = r'..\..\Data\Bacpacs\\'
# Folder scanned by write_genomes_files for per-genome PATRIC files.
bacpacs_patric_files_path = bacpacs_dir_path + 'patric_files\\' 
# Combined genomes file: written by write_genomes_files, read by GenomesData.
bacpacs_genomes_path = bacpacs_dir_path + 'bacpacs_genomes.fasta'
# Metadata CSV passed to GenomesData together with the genomes file.
bacpacs_metadata_path = bacpacs_dir_path + 'bacpacs_test.csv'

In [6]:
# Placeholder substituted for missing `pgfam_id` values in parse_patric_file.
UNKNOWN_PGFAM = 'X'

In [7]:
def parse_patric_file(file_path):
    """Read one tab-separated PATRIC file and return its ``pgfam_id`` column.

    Rows whose ``pgfam_id`` is missing are filled with ``UNKNOWN_PGFAM`` so the
    returned Series has one entry per row of the file.
    """
    return pd.read_csv(file_path, sep='\t')['pgfam_id'].fillna(UNKNOWN_PGFAM)

In [8]:
def write_genomes_files(patric_files_folder, output_path):
    """Write every PATRIC file in a folder into one FASTA-like text file.

    For each regular file in ``patric_files_folder`` a header line
    ``>{genome_id}`` is written, followed by one pgfam id per line (as returned
    by ``parse_patric_file``). The genome id is the part of the file name that
    precedes ``FILE_SUFFIX``.

    Args:
        patric_files_folder: directory containing one PATRIC file per genome.
        output_path: path of the combined file to create (overwritten).
    """
    with os.scandir(patric_files_folder) as entries:
        # os.scandir yields entries in arbitrary order; sort by file name so
        # the generated file is identical from run to run and across machines.
        genome_files = sorted(
            (entry for entry in entries if entry.is_file()),
            key=lambda entry: entry.name,
        )

    with open(output_path, 'w') as out_f:
        for entry in genome_files:
            # NOTE(review): FILE_SUFFIX is not defined in this notebook;
            # presumably it comes from `from data import *` — confirm.
            genome_id = entry.name.split(FILE_SUFFIX)[0]
            genome_pgfams = parse_patric_file(entry)

            out_f.write(f'>{genome_id}\n')
            out_f.write('\n'.join(genome_pgfams))
            out_f.write('\n')
                    
#write_genomes_files(bacpacs_patric_files_path, bacpacs_genomes_path)

# Load Bacpacs Dataset

In [9]:
# Build the dataset from the combined genomes file and the metadata CSV.
# NOTE(review): the genomes file is produced by write_genomes_files, whose call
# is commented out above, so the file must already exist on disk.
bacpacs_dataset = GenomesData(bacpacs_genomes_path, bacpacs_metadata_path)

In [10]:
# Size of the loaded dataset (94 in the recorded run).
len(bacpacs_dataset)

94

In [11]:
# Ground-truth labels as exposed by the dataset's `y` attribute.
labels = bacpacs_dataset.y

## Balanced dataset (one genome per species)

In [12]:
# Index labels of the metadata rows whose 'Balanced Test' column equals 'Yes'.
balanced_dataset_genomes = bacpacs_dataset.metadata[bacpacs_dataset.metadata['Balanced Test'] == 'Yes'].index

In [13]:
# Data restricted to the balanced genome ids selected above.
bacpacs_data_balanced = bacpacs_dataset.data[balanced_dataset_genomes]

In [14]:
# Size of the balanced subset (40 in the recorded run).
len(bacpacs_data_balanced)

40

In [15]:
# Labels selected with the same balanced genome ids as the data subset.
balanced_labels = labels[balanced_dataset_genomes]

In [16]:
# Display names passed to classification_report.
# Assumes labels are encoded 0 = NHP, 1 = HP (consistent with the
# 'nhp' -> 0 / 'hp' -> 1 mapping applied to the supplementary table) — confirm.
target_names = ['NHP', 'HP']

# WSPC model

In [17]:
# Raw string: the plain literal contained the invalid escape sequences `\.` and
# `\W`, which Python only tolerates with a warning. The resulting path value is
# unchanged.
model_path = r'..\..\WSPC Model\WSPC_model.pkl'

# NOTE(review): the .pkl extension suggests util.load_model unpickles this
# file — only load model files that come from a trusted source.
WSPC = util.load_model(model_path)

In [18]:
# WSPC predictions for every genome in the full test set.
preds = WSPC.predict(bacpacs_dataset.data) 

#### Entire test

In [19]:
# Per-class precision / recall / F1 of WSPC on the full test set.
print(classification_report(labels, preds, target_names=target_names))

              precision    recall  f1-score   support

         NHP       0.93      0.88      0.90        16
          HP       0.97      0.99      0.98        78

    accuracy                           0.97        94
   macro avg       0.95      0.93      0.94        94
weighted avg       0.97      0.97      0.97        94



NHP recall = specificity   
HP recall = sensitivity

In [20]:
# WSPC predictions for the balanced subset only.
balanced_preds = WSPC.predict(bacpacs_data_balanced)

#### Balanced test

In [21]:
# Per-class precision / recall / F1 of WSPC on the balanced subset.
print(classification_report(balanced_labels, balanced_preds, target_names=target_names))

              precision    recall  f1-score   support

         NHP       1.00      0.87      0.93        15
          HP       0.93      1.00      0.96        25

    accuracy                           0.95        40
   macro avg       0.96      0.93      0.95        40
weighted avg       0.95      0.95      0.95        40



# deepac model

In [22]:
# Prerequisite: %pip install patool (patoolib also needs an external RAR extractor installed, e.g. WinRAR's rar.exe)


In [23]:
import patoolib


In [28]:
# Archive holding the DeePaC outputs for the Bacpacs test set.
results_zip_path = r'..\..\Data\Bacpacs\deepac_on_bacpacsTest.rar'
# Extraction target; same value as bacpacs_dir_path.
outdir =  r'..\..\Data\Bacpacs\\'
# Folder the *_withp*.fasta prediction files are read from in later cells.
deepac_on_bacpacs_dir_path = r'..\..\Data\Bacpacs\deepac_on_bacpacsTest\\'

In [25]:
# Unpack the archive into `outdir`. patoolib delegates to an external program
# (the recorded run used WinRAR's rar.EXE), so one must be installed.
patoolib.extract_archive(results_zip_path, outdir=outdir)

patool: Extracting ..\..\Data\Bacpacs\deepac_on_bacpacsTest.rar ...
patool: running "C:\Program Files\WinRAR\rar.EXE" x -- C:\Users\Shaked\Desktop\wspc_rep_git_18.2\wspc_rep\Data\Bacpacs\deepac_on_bacpacsTest.rar
patool:     with cwd=..\..\Data\Bacpacs\\
patool: ... ..\..\Data\Bacpacs\deepac_on_bacpacsTest.rar extracted to `..\..\Data\Bacpacs\\'.


'..\\..\\Data\\Bacpacs\\\\'

In [29]:
def create_all_preds_per_genome(seqFile):
    """Group per-fragment scores by genome.

    Every record's ``description`` is expected to contain, in order, the text
    ``pathogenic/<genome>.fq.<fragment> | ... pp=<score>``. The genome name is
    the text between ``pathogenic/`` and the first ``.fq``, the fragment id is
    the text between ``.fq.`` and `` | ``, and the score is whatever follows
    ``pp=`` parsed as a float.

    Returns:
        dict mapping genome name -> {fragment id -> score}. The number of
        genomes is also printed.
    """
    preds_by_genome = {}
    for record in seqFile:
        header = record.description
        after_class_dir = header.split('pathogenic/')[1]
        genome_name = after_class_dir.split('.fq')[0]
        after_suffix = header.split('.fq.')[1]
        frag = after_suffix.split(' | ')[0]
        score_text = header.split('pp=')[1]
        preds_by_genome.setdefault(genome_name, {})[frag] = float(score_text)

    print(f'results len: {len(preds_by_genome)}')          
    return preds_by_genome

In [30]:

"""
 1. "One of the major challenges of pathogenic potential prediction from single
reads is the lack of biological context. However, if all the reads in a sample
originate from the exactly same organism, we can predict the pathogenic
potential of that organism by a ---majority vote---. In the context of probabilistic
estimates of the class label (returned by both PaPrBaG and our neural
networks), we can implement that as a ---simple mean over predictions for
all the individual reads---. For BLAST, we can just assign the label predicted
for the majority of reads.
Building upon this idea, we can boost read-based performance if
we consider ***read pairs***, assumed to originate from the same organism
even in metagenomic samples. To this end, ---we average predictions for
the corresponding pairs in our test set---. The classifiers may still predict
pathogenic potentials for isolated sequences if so desired. We can integrate
binary predictions (e.g. returned by BLAST), taking into account the
missing and conflicting predictions for some of the reads. We treat
missing predictions as undefined values and implement the accept anything
operator of ternary logic. It returns a positive label if and only if one of
the input values is positive, and the other is not negative. Conversely, it
returns a negative label if and only if one of the input values is negative,
and the other is not positive. ---The result is undefined when both inputs are
undefined, or in case of conflicting input values.---
"""

def calculate_final_genomes_predictions(test_preds_1, test_preds_2):
    """Compute one score per genome from two paired prediction files.

    Each input is parsed with ``create_all_preds_per_genome``. For every genome
    of the first input, the scores of matching fragment ids in the two inputs
    are averaged, and the genome score is the mean of those averages.

    Args:
        test_preds_1: iterable of records for the first prediction file.
        test_preds_2: iterable of records for the second prediction file.

    Returns:
        dict mapping genome name -> mean score (float).

    Raises:
        KeyError: a genome of the first input is absent from the second.
        ValueError: a fragment of the first input has no counterpart in the
            second. (Previously this printed 'Error' and returned None, which
            only surfaced later as an unrelated failure.)
    """
    all_p_per_genome_test1 = create_all_preds_per_genome(test_preds_1)
    all_p_per_genome_test2 = create_all_preds_per_genome(test_preds_2)

    genomes_predictions = {}
    for genome, all_frags1 in all_p_per_genome_test1.items():
        if genome not in all_p_per_genome_test2:
            raise KeyError(f'genome {genome!r} is missing from the second predictions input')
        all_frags2 = all_p_per_genome_test2[genome]

        # Only fragments of the first input are required to have a partner;
        # extra fragments in the second input are ignored, as before.
        missing_frags = set(all_frags1) - set(all_frags2)
        if missing_frags:
            raise ValueError(
                f'genome {genome!r}: {len(missing_frags)} fragment(s) of the first input '
                f'have no match in the second input, e.g. {sorted(missing_frags)[:5]}'
            )

        pair_means = [(all_frags1[frag_id] + all_frags2[frag_id]) / 2 for frag_id in all_frags1]
        genomes_predictions[genome] = statistics.mean(pair_means)

    return genomes_predictions

In [31]:
def get_deepac_predictions(test_genomes, test_labels, patho_genomes_predictions, nonpatho_genomes_predictions):
    """Tabulate true labels next to rounded DeePaC genome-level predictions.

    Args:
        test_genomes: genome ids; used as the index of the returned frame.
        test_labels: object supporting ``.at[genome_id]`` (e.g. a pandas
            Series) that yields each genome's label.
        patho_genomes_predictions: dict of genome id (as string) -> score.
        nonpatho_genomes_predictions: dict of genome id (as string) -> score;
            consulted only when the genome is absent from the first dict.

    Returns:
        DataFrame with columns 'Label' and 'deepac_pred'. A genome found in
        neither dict is reported with a printed message and keeps a missing
        'deepac_pred'.
    """
    deepac_preds_df = pd.DataFrame(index=test_genomes, columns=['Label', 'deepac_pred'])
    for genome_id in test_genomes:
        deepac_preds_df.at[genome_id, 'Label'] = int(test_labels.at[genome_id])

        # Test and look up with the SAME string key. The original tested
        # str(genome_id) but indexed with the raw id, which raised KeyError
        # whenever the ids were not already strings.
        genome_key = str(genome_id)
        if genome_key in patho_genomes_predictions:
            score = patho_genomes_predictions[genome_key]
        elif genome_key in nonpatho_genomes_predictions:
            score = nonpatho_genomes_predictions[genome_key]
        else:
            print(f'problem with genome: {genome_id}')
            continue

        # Built-in round(): a score of exactly 0.5 rounds to 0.
        deepac_preds_df.at[genome_id, 'deepac_pred'] = round(score)

    return deepac_preds_df

## deepac s

In [32]:
# SeqIO.parse reads lazily, so both files must stay open while the per-genome
# scores are computed; the context manager closes them afterwards (the bare
# open() calls were never closed).
with open(deepac_on_bacpacs_dir_path + 'patho_bacpacs_test1_withp_s.fasta') as _test1_handle, \
        open(deepac_on_bacpacs_dir_path + 'patho_bacpacs_test2_withp_s.fasta') as _test2_handle:
    patho_test_1_preds_s = SeqIO.parse(_test1_handle, 'fasta')
    patho_test_2_preds_s = SeqIO.parse(_test2_handle, 'fasta')
    patho_genomes_predictions_s = calculate_final_genomes_predictions(patho_test_1_preds_s, patho_test_2_preds_s)

results len: 60
results len: 60


In [33]:
# SeqIO.parse reads lazily, so both files must stay open while the per-genome
# scores are computed; the context manager closes them afterwards (the bare
# open() calls were never closed).
with open(deepac_on_bacpacs_dir_path + 'nonpatho_bacpacs_test1_withp_s.fasta') as _test1_handle, \
        open(deepac_on_bacpacs_dir_path + 'nonpatho_bacpacs_test2_withp_s.fasta') as _test2_handle:
    nonpatho_test_1_preds_s = SeqIO.parse(_test1_handle, 'fasta')
    nonpatho_test_2_preds_s = SeqIO.parse(_test2_handle, 'fasta')
    nonpatho_genomes_predictions_s = calculate_final_genomes_predictions(nonpatho_test_1_preds_s, nonpatho_test_2_preds_s)

results len: 40
results len: 40


In [34]:
# Join the true labels with the per-genome DeePaC scores for every genome id in
# the metadata index (pathogenic dict is checked first, then non-pathogenic).
deepac_s_preds_df = get_deepac_predictions(bacpacs_dataset.metadata.index, labels,
                                                  patho_genomes_predictions_s, nonpatho_genomes_predictions_s)

#### Balanced test

In [35]:
# Rows of the DeePaC table for the balanced genome ids only.
deepac_s_balanced_preds_df = deepac_s_preds_df.loc[balanced_dataset_genomes]

In [36]:
# Report on the balanced subset.
# NOTE(review): a genome missing from both prediction dicts leaves a missing
# value in 'deepac_pred' — check for 'problem with genome' messages first.
print(classification_report(list(deepac_s_balanced_preds_df['Label']),
                            list(deepac_s_balanced_preds_df['deepac_pred']), target_names=target_names))

              precision    recall  f1-score   support

         NHP       1.00      0.33      0.50        15
          HP       0.71      1.00      0.83        25

    accuracy                           0.75        40
   macro avg       0.86      0.67      0.67        40
weighted avg       0.82      0.75      0.71        40



#### Entire test

In [37]:
# Report on the full test set (same caveat about missing 'deepac_pred' values).
print(classification_report(list(deepac_s_preds_df['Label']),
                            list(deepac_s_preds_df['deepac_pred']), target_names=target_names))

              precision    recall  f1-score   support

         NHP       0.83      0.31      0.45        16
          HP       0.88      0.99      0.93        78

    accuracy                           0.87        94
   macro avg       0.85      0.65      0.69        94
weighted avg       0.87      0.87      0.85        94



## deepac r

In [38]:
# SeqIO.parse reads lazily, so both files must stay open while the per-genome
# scores are computed; the context manager closes them afterwards (the bare
# open() calls were never closed).
with open(deepac_on_bacpacs_dir_path + 'patho_bacpacs_test1_withp.fasta') as _test1_handle, \
        open(deepac_on_bacpacs_dir_path + 'patho_bacpacs_test2_withp.fasta') as _test2_handle:
    patho_test_1_preds_r = SeqIO.parse(_test1_handle, 'fasta')
    patho_test_2_preds_r = SeqIO.parse(_test2_handle, 'fasta')
    patho_genomes_predictions_r = calculate_final_genomes_predictions(patho_test_1_preds_r, patho_test_2_preds_r)

results len: 60
results len: 60


In [39]:
# SeqIO.parse reads lazily, so both files must stay open while the per-genome
# scores are computed; the context manager closes them afterwards (the bare
# open() calls were never closed).
with open(deepac_on_bacpacs_dir_path + 'nonpatho_bacpacs_test1_withp.fasta') as _test1_handle, \
        open(deepac_on_bacpacs_dir_path + 'nonpatho_bacpacs_test2_withp.fasta') as _test2_handle:
    nonpatho_test_1_preds_r = SeqIO.parse(_test1_handle, 'fasta')
    nonpatho_test_2_preds_r = SeqIO.parse(_test2_handle, 'fasta')
    nonpatho_genomes_predictions_r = calculate_final_genomes_predictions(nonpatho_test_1_preds_r, nonpatho_test_2_preds_r)

results len: 40
results len: 40


In [40]:
# Join the true labels with the per-genome DeePaC scores for every genome id in
# the metadata index (pathogenic dict is checked first, then non-pathogenic).
deepac_r_preds_df = get_deepac_predictions(bacpacs_dataset.metadata.index, labels,
                                                  patho_genomes_predictions_r, nonpatho_genomes_predictions_r)

#### Balanced test

In [41]:
# Rows of the DeePaC table for the balanced genome ids only.
deepac_r_balanced_preds_df = deepac_r_preds_df.loc[balanced_dataset_genomes]

In [42]:
# Report on the balanced subset.
# NOTE(review): a genome missing from both prediction dicts leaves a missing
# value in 'deepac_pred' — check for 'problem with genome' messages first.
print(classification_report(list(deepac_r_balanced_preds_df['Label']),
                            list(deepac_r_balanced_preds_df['deepac_pred']), target_names=target_names))

              precision    recall  f1-score   support

         NHP       1.00      0.33      0.50        15
          HP       0.71      1.00      0.83        25

    accuracy                           0.75        40
   macro avg       0.86      0.67      0.67        40
weighted avg       0.82      0.75      0.71        40



#### Entire test

In [43]:
# Report on the full test set (same caveat about missing 'deepac_pred' values).
print(classification_report(list(deepac_r_preds_df['Label']),
                            list(deepac_r_preds_df['deepac_pred']), target_names=target_names))

              precision    recall  f1-score   support

         NHP       0.83      0.31      0.45        16
          HP       0.88      0.99      0.93        78

    accuracy                           0.87        94
   macro avg       0.85      0.65      0.69        94
weighted avg       0.87      0.87      0.85        94



# Analyzing PaPrBaG, PathogenFinder and Bacpacs results from the Bacpacs supplementary file

In [44]:
# Load the supplementary table with every column read as a string and use
# 'Genome ID' as the index.
bacpacs_comp_analysis_sup = pd.read_csv(bacpacs_dir_path + 'Bacpacs-Supplementary.csv', dtype=str).set_index('Genome ID')
# Map cells equal to 'hp' -> 1 and 'nhp' -> 0. The replacement runs over the
# whole frame, not only the prediction columns.
bacpacs_comp_analysis_sup = bacpacs_comp_analysis_sup.replace(['hp'], 1)
bacpacs_comp_analysis_sup = bacpacs_comp_analysis_sup.replace(['nhp'], 0)

In [45]:
# Preview the first rows to check the 'hp'/'nhp' -> 1/0 mapping.
bacpacs_comp_analysis_sup.head()

Unnamed: 0_level_0,pathogenicity,bacpacs,pathogenfinder,paprbag_fold1,paprbag_fold2,paprbag_fold3,paprbag_fold4,paprbag_fold5,Genome Name,Organism Name,Isolation Source,Habitat,Disease,Host Health,Other Clinical,Comments
Genome ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
1041522.28,1,1,1,1,1,1,1,1,Mycobacterium colombiense CECT 3035 strain CEC...,,blood,,,HIV,,M. colombiense infections were initially descr...
1196162.3,1,1,1,1,1,1,1,1,Listeria monocytogenes serotype 4b str. 10-080...,,Stool,,,Listeriosis,,Whole genome sequencing and comparison of List...
1196172.3,1,0,1,1,1,1,1,1,Listeria monocytogenes serotype 4b str. 02-128...,,Stool,,,Listeriosis,,Whole genome sequencing and comparison of List...
1280.11681,1,1,1,1,1,1,1,1,Staphylococcus aureus strain USA300-SUR15,,Nose,,,,,The study characterizes a Staphylococcus aureu...
1280.12234,1,1,1,1,1,1,1,1,Staphylococcus aureus strain JE2,,skin and soft tissue,,,Soft Tissue Infections,,Sequence the genome of Staphylococcus aureus JE2


In [46]:
def find_preds_from_sup(genomes_list, sup_results_df, method_name):
    """Select one method's prediction column(s) for the given genomes.

    Args:
        genomes_list: index labels of the rows to keep (any iterable).
        sup_results_df: table holding one column per method / fold.
        method_name: a column name, or 'paprbag' to get the five
            ``paprbag_fold1`` .. ``paprbag_fold5`` columns.

    Returns:
        DataFrame with the requested rows and column(s).
    """
    if method_name == 'paprbag':
        wanted_columns = [f'paprbag_fold{fold_no}' for fold_no in range(1, 6)]
    else:
        wanted_columns = [method_name]

    selected_rows = sup_results_df.loc[list(genomes_list)]
    return selected_rows[wanted_columns]

# Bacpacs

In [47]:
# Bacpacs predictions for the balanced genomes, read from the supplementary table.
bacpacs_preds = find_preds_from_sup(balanced_dataset_genomes, bacpacs_comp_analysis_sup, 'bacpacs')

In [48]:
# bacpacs_preds is a single-column DataFrame (see find_preds_from_sup).
print(classification_report(balanced_labels, bacpacs_preds, target_names=target_names))

              precision    recall  f1-score   support

         NHP       0.73      0.73      0.73        15
          HP       0.84      0.84      0.84        25

    accuracy                           0.80        40
   macro avg       0.79      0.79      0.79        40
weighted avg       0.80      0.80      0.80        40



# Pathogenfinder

In [49]:
# PathogenFinder predictions for the balanced genomes, read from the supplementary table.
pathogenfinder_preds = find_preds_from_sup(balanced_dataset_genomes, bacpacs_comp_analysis_sup, 'pathogenfinder')

In [50]:
# pathogenfinder_preds is a single-column DataFrame (see find_preds_from_sup).
print(classification_report(balanced_labels, pathogenfinder_preds, target_names=target_names))

              precision    recall  f1-score   support

         NHP       0.91      0.67      0.77        15
          HP       0.83      0.96      0.89        25

    accuracy                           0.85        40
   macro avg       0.87      0.81      0.83        40
weighted avg       0.86      0.85      0.84        40



# Paprbag

In [51]:
# Score each of the five PaPrBaG fold columns on the balanced test set: keep the
# dict form of every report for the averages below and print the text form.
paprbag_preds = find_preds_from_sup(balanced_dataset_genomes, bacpacs_comp_analysis_sup, 'paprbag')
folds_results = []
for fold in (f'paprbag_fold{fold_no}' for fold_no in range(1, 6)):
    fold_preds = paprbag_preds[fold]
    fold_results = classification_report(balanced_labels, fold_preds, target_names=target_names, output_dict=True)
    folds_results.append(fold_results)
    print(f'{fold}: {classification_report(balanced_labels, fold_preds, target_names=target_names)}')

paprbag_fold1:               precision    recall  f1-score   support

         NHP       1.00      0.20      0.33        15
          HP       0.68      1.00      0.81        25

    accuracy                           0.70        40
   macro avg       0.84      0.60      0.57        40
weighted avg       0.80      0.70      0.63        40

paprbag_fold2:               precision    recall  f1-score   support

         NHP       1.00      0.20      0.33        15
          HP       0.68      1.00      0.81        25

    accuracy                           0.70        40
   macro avg       0.84      0.60      0.57        40
weighted avg       0.80      0.70      0.63        40

paprbag_fold3:               precision    recall  f1-score   support

         NHP       1.00      0.13      0.24        15
          HP       0.66      1.00      0.79        25

    accuracy                           0.68        40
   macro avg       0.83      0.57      0.51        40
weighted avg       0.79      

In [52]:
# Specificity is the NHP recall; average it over the PaPrBaG fold reports.
nhp_recalls = [fold_report['NHP']['recall'] for fold_report in folds_results]
specificity = np.mean(nhp_recalls)
print(f'specificity: {round(specificity,2)}')

specificity: 0.17


In [53]:
# Sensitivity is the HP recall; average it over the PaPrBaG fold reports.
hp_recalls = [fold_report['HP']['recall'] for fold_report in folds_results]
sensitivity = np.mean(hp_recalls)
print(f'sensitivity: {round(sensitivity, 2)}')

sensitivity: 1.0
