In [17]:
import os
import astropy
from astropy.table import Table
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report

import sys  
sys.path.insert(0, '../../Code')

import util
from data import *
import train_and_predict

# Download Bacpacs Genomes

In [21]:
from ftplib import FTP
import pandas as pd
import sys

# Suffix appended to a genome ID to form the name of its PATRIC feature-table
# file (used both for the remote file name and the local copy).
FILE_SUFFIX = '.PATRIC.features.tab'
# Plain-text log file named 'logger' in the current working directory.
# Opening with 'w' truncates any existing file every time this cell runs.
# grab_file() and download_genomes() write their error messages to it.
logger = open('logger', 'w')

def read_genome_ids(input_file):
    """Read genome IDs from a text file that holds one ID per line.

    Surrounding whitespace is stripped from every line and blank lines are
    skipped, so an empty file yields an empty list (rather than a list with
    a single empty string) and stray blank lines never become bogus IDs.

    Args:
        input_file: Path of the text file to read.

    Returns:
        list[str]: The genome IDs in file order.
    """
    with open(input_file) as f:
        stripped_lines = (line.strip() for line in f)
        return [genome_id for genome_id in stripped_lines if genome_id]

def grab_file(ftp, filename, local_path):
    """Download one remote file over an open FTP connection.

    Args:
        ftp: Connected FTP client whose current directory contains
            ``filename`` (only ``retrbinary`` is used).
        filename: Name of the remote file to retrieve.
        local_path: Path of the local file to write (binary mode).

    Raises:
        Exception: Whatever the transfer (or opening the local file) raised.
            The error is written to the module-level ``logger`` first and the
            incomplete local file is removed, then the exception is re-raised
            so the caller does not treat the genome as downloaded.
    """
    try:
        # The context manager guarantees the local file is closed on every path.
        with open(local_path, 'wb') as local_file:
            ftp.retrbinary('RETR ' + filename, local_file.write, 1024)
    except Exception as e:
        # A text-mode file only accepts str, so format the exception explicitly.
        logger.write(f'failed to retrieve {filename}: {e}\n')
        # Do not leave an empty/truncated file behind: it would later be
        # picked up as if it were a valid feature table.
        try:
            os.remove(local_path)
        except OSError:
            # Nothing to clean up (e.g. the local file could not be created).
            pass
        raise


def download_genomes(genomes, output_dir_path):
    """Download the PATRIC feature table of every genome in ``genomes``.

    For each genome ID the function changes into ``/genomes/<id>`` on the
    PATRIC FTP server and fetches ``<id>`` + ``FILE_SUFFIX`` into
    ``output_dir_path`` via ``grab_file``. A failure for one genome is written
    to the module-level ``logger`` and the loop continues with the next one.

    Args:
        genomes: Iterable of genome IDs. Each ID is converted with ``str`` so
            values taken from a DataFrame column work too.
            NOTE(review): if the column was parsed as float, trailing zeros
            of an ID are already lost upstream -- read the IDs as strings.
        output_dir_path: Directory prefix for the local files. It is
            concatenated as-is, so it must end with a path separator.
    """
    # connect to host, default port
    ftp = FTP(r'ftp.patricbrc.org')

    count = 0
    try:
        ftp.login()  # user anonymous, passwd anonymous@

        for i, genome_id in enumerate(genomes):
            genome_id = str(genome_id)

            try:
                ftp.cwd("/genomes/" + genome_id)
                local_file = output_dir_path + genome_id + FILE_SUFFIX
                remote_file = genome_id + FILE_SUFFIX
                grab_file(ftp, remote_file, local_file)

                print('downloaded genome: ' + genome_id + ' index: ' + str(i) + '\n')
                count += 1

            except Exception as e:
                logger.write(f'genome: {genome_id} index: {i} error: {e}\n')
    finally:
        # Always release the connection, even if login or the loop blew up.
        try:
            ftp.quit()
        except Exception:
            # quit() can fail on an already broken connection; close locally.
            ftp.close()
        # Flush instead of closing: the logger is shared module state, and
        # closing it here would make a second call fail on the first error.
        logger.flush()

    print('downloaded {} genomes'.format(count))

In [22]:
# Switch for the (slow, network-bound) download step; leave False to skip it.
download = False

# Directory prefix handed to download_genomes(). The raw literal ends with
# two backslash characters and is concatenated directly with the file name.
output_dir_path = r'..\..\Data\Bacpacs\\'

# NOTE(review): df2 is not defined in any cell shown here, and this line is
# evaluated even when download is False -- confirm where df2 is created so
# the notebook survives Restart & Run All.
all_genomes = df2['Genome ID']
if download:
    download_genomes(all_genomes, output_dir_path)

downloaded genome: 470.3353 index: 0

downloaded genome: 106654.48 index: 1

downloaded genome: 520.659 index: 2

downloaded genome: 28450.385 index: 3

downloaded genome: 83554.74 index: 4

downloaded genome: 813.141 index: 5

downloaded genome: 545.38 index: 6

downloaded genome: 777.186 index: 7

downloaded genome: 1352.1760 index: 8

downloaded genome: 562.22306 index: 9

downloaded genome: 210.2912 index: 10

downloaded genome: 573.16474 index: 11

downloaded genome: 1639.2624 index: 12

downloaded genome: 1041522.28 index: 13

downloaded genome: 722731.3 index: 14

downloaded genome: 1773.8714 index: 15

downloaded genome: 2104.190 index: 16

downloaded genome: 37326.9 index: 17

downloaded genome: 28131.10 index: 18

downloaded genome: 287.4623 index: 19

downloaded genome: 1280.11681 index: 20

downloaded genome: 1302.83 index: 21

downloaded genome: 1338.30 index: 22

downloaded genome: 1902136.3 index: 23

downloaded genome: 730.54 index: 24

downloaded genome: 28025.19 index

# Parse PATRIC genome files

In [6]:
# Locations of the BacPaCS inputs/outputs, all derived from one base directory.
# The paths are Windows-style and relative to the notebook's working directory.
bacpacs_dir_path = r'..\..\Data\Bacpacs\\'
# Folder scanned by write_genomes_files() for the per-genome PATRIC tables.
bacpacs_patric_files_path = bacpacs_dir_path + 'patric_files\\' 
# Combined genomes file (written by write_genomes_files, read by GenomesData).
bacpacs_genomes_path = bacpacs_dir_path + 'bacpacs_genomes.fasta'
# Metadata CSV passed to GenomesData together with the genomes file.
bacpacs_metadata_path = bacpacs_dir_path + 'bacpacs_test.csv'

In [6]:
# Placeholder substituted for missing 'pgfam_id' values in parse_patric_file().
UNKNOWN_PGFAM = 'X'

In [5]:
def parse_patric_file(file_path):
    """Return the 'pgfam_id' column of a tab-separated PATRIC feature table.

    Args:
        file_path: Path (or path-like object) of the feature-table file.

    Returns:
        pandas.Series: The 'pgfam_id' values in file order, with missing
        entries replaced by ``UNKNOWN_PGFAM``.
    """
    features_table = pd.read_csv(file_path, sep='\t')
    return features_table['pgfam_id'].fillna(UNKNOWN_PGFAM)

In [48]:
def write_genomes_files(patric_files_folder, output_path):
    """Merge all PATRIC feature tables of a folder into one FASTA-like file.

    Every regular file in ``patric_files_folder`` whose name ends with
    ``FILE_SUFFIX`` contributes one record: a ``>genome_id`` header line
    followed by one pgfam ID per line (as returned by ``parse_patric_file``).
    Other files and sub-directories are ignored. Records are written in
    file-name order so repeated runs produce the same output.

    Args:
        patric_files_folder: Directory that holds the downloaded tables.
        output_path: Path of the combined file to (over)write.
    """
    with os.scandir(patric_files_folder) as entries:
        # Materialise inside the ``with`` so the directory handle can be
        # released before the (slow) parsing starts.
        genome_files = sorted(
            (entry for entry in entries
             if entry.is_file() and entry.name.endswith(FILE_SUFFIX)),
            key=lambda entry: entry.name,
        )

    with open(output_path, 'w') as out_f:
        for entry in genome_files:
            # The suffix is guaranteed by the filter above; cut it off.
            genome_id = entry.name[:-len(FILE_SUFFIX)]
            genome_pgfams = parse_patric_file(entry.path)

            out_f.write(f'>{genome_id}\n')
            # map(str, ...) keeps join() safe if a value is not a string.
            out_f.write('\n'.join(map(str, genome_pgfams)))
            out_f.write('\n')
                    
#write_genomes_files(bacpacs_patric_files_path, bacpacs_genomes_path)

# Load Bacpacs Dataset

In [11]:
# Build the dataset from the combined genomes file and the metadata CSV.
# NOTE(review): GenomesData is not imported by name; presumably it comes from
# the star import of `data` -- confirm.
bacpacs_dataset = GenomesData(bacpacs_genomes_path, bacpacs_metadata_path)

In [12]:
# Size of the loaded dataset (94 in the recorded output).
len(bacpacs_dataset)

94

In [13]:
# Display the metadata table attached to the dataset (indexed by 'Genome ID'
# in the recorded output).
bacpacs_dataset.metadata

Unnamed: 0_level_0,Unnamed: 0,Genome Name,BacPaCS Label,Label,Ref.,OPP,Balanced Test
Genome ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
470.3353,1,Acinetobacter baumannii strain HWBA8,NHP,HP,Clinical isolate \cite{yoon2017bla},\cite{peleg2008acinetobacter},Yes
106654.48,2,Acinetobacter nosocomialis strain SSA3,NHP,HP,Clinical isolate \cite{yoon2017bla},\cite{peleg2008acinetobacter},Yes
520.659,3,Bordetella pertussis strain B227,HP,HP,\cite{bowden2016genome},,Yes
28450.385,4,Burkholderia pseudomallei strain MSHR5864,HP,HP,\cite{sarovich2018raising},\cite{mangalea2017nitrate},Yes
83554.74,5,Chlamydia psittaci strain GIMC 2005:CpsCP1,HP,HP,\cite{feodorova2020data},,Yes
...,...,...,...,...,...,...,...
487.1548,90,Neisseria meningitidis strain M26417,HP,HP,\cite{folaranmi2017increased},\cite{bernardini2004proteome},No
287.3868,91,Pseudomonas aeruginosa strain RIVM-EMC2982,NHP,HP,Clinical isolate \cite{botelho2018two},\cite{diggle2020microbe},No
1280.12234,92,Staphylococcus aureus strain JE2,HP,HP,"\cite{1280.12234,o2017into}",\cite{o2017into},No
1280.11677,93,Staphylococcus aureus strain USA300-SUR11,HP,HP,"\cite{1280.11677,o2017into}",\cite{o2017into},No


In [19]:
# Ground-truth labels used by the classification reports.
# presumably aligned with bacpacs_dataset.data -- verify in GenomesData.
labels = bacpacs_dataset.y

## Balanced dataset (one genome per species)

In [21]:
# Index values of the metadata rows whose 'Balanced Test' column equals 'Yes'.
balanced_dataset_genomes = bacpacs_dataset.metadata[bacpacs_dataset.metadata['Balanced Test'] == 'Yes'].index

In [22]:
# Restrict the genome data to the balanced-test selection.
# assumes bacpacs_dataset.data can be indexed by genome ID -- TODO confirm.
bacpacs_data_balanced = bacpacs_dataset.data[balanced_dataset_genomes]

In [23]:
# Size of the balanced subset (40 in the recorded output).
len(bacpacs_data_balanced)

40

In [24]:
# Labels of the balanced subset, selected with the same index as the data.
balanced_labels = labels[balanced_dataset_genomes]

# WSPC model

In [7]:
# Location of the serialized WSPC model, relative to the notebook directory.
# Raw string: '\.' and '\W' are not valid escape sequences, so a plain literal
# triggers an invalid-escape warning (an error in future Python versions).
# The resulting path text is unchanged.
model_path = r'..\..\WSPC Model\WSPC_model.pkl'

# NOTE(review): a .pkl file is presumably unpickled inside util.load_model;
# unpickling can execute arbitrary code, so only load files from a trusted source.
WSPC = util.load_model(model_path)

In [18]:
# Predict on the complete BacPaCS dataset with the loaded WSPC model.
preds = WSPC.predict(bacpacs_dataset.data)

In [20]:
# Display names for the report rows, in the sorted order of the label values.
# NOTE(review): assumes the smaller label value means NHP and the larger HP --
# confirm against the label encoding used by GenomesData.
target_names = ['NHP', 'HP']
# Per-class precision/recall/F1 on the full dataset.
print(classification_report(labels, preds, target_names=target_names))

              precision    recall  f1-score   support

         NHP       0.93      0.88      0.90        16
          HP       0.97      0.99      0.98        78

    accuracy                           0.97        94
   macro avg       0.95      0.93      0.94        94
weighted avg       0.97      0.97      0.97        94



Taking HP as the positive class:   
NHP recall = specificity   
HP recall = sensitivity

In [25]:
# Predict on the balanced subset only.
balanced_preds = WSPC.predict(bacpacs_data_balanced)

In [26]:
# Same display names as for the full-dataset report (re-assigned here).
# NOTE(review): assumes the smaller label value means NHP -- confirm.
target_names = ['NHP', 'HP']
# Per-class precision/recall/F1 on the balanced subset.
print(classification_report(balanced_labels, balanced_preds, target_names=target_names))

              precision    recall  f1-score   support

         NHP       1.00      0.87      0.93        15
          HP       0.93      1.00      0.96        25

    accuracy                           0.95        40
   macro avg       0.96      0.93      0.95        40
weighted avg       0.95      0.95      0.95        40

