In [18]:
import os
from pathlib import Path
import pandas as pd
from shutil import copyfile
import sklearn
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier

import sys  
sys.path.insert(0, '../Code')

from data import *

In [21]:
# Never truncate the rows of a displayed frame or series.
pd.options.display.max_rows = None

# Column names of the genome metadata / prediction tables used in this notebook.
GROUP, GENOME_ID, LABEL = 'Group', 'Genome ID', 'Label'

In [23]:
# Build the data directory from its components: the literal '..\Data' only works on
# Windows and contains the invalid escape sequence '\D'.
path = Path('..') / 'Data'
test_genome_verified = MetadataReader.read(path / 'test_genomes_verified.csv')
# Convert every label through GenomesData.label_to_int; `.get` is used, so a label that
# is absent from that mapping does not raise (presumably it becomes None - verify).
test_genome_verified[LABEL] = test_genome_verified[LABEL].apply(lambda label: GenomesData.label_to_int.get(label))

test_genome_verified.head()

Unnamed: 0_level_0,Genome Name,Label,HP/NHP,species,References,Group,Host Name,Isolation Source,Isolation Comments,Genome Quality,Collection Date,Date Inserted,Other Clinical,Host Health,Disease,Comments
Genome ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
163603.4,Actinomadura latina strain ATCC BAA-277,1,1/0,Actinomadura latina,\cite{trujillo1997polyphasic},HP,"Human, Homo sapiens",arm of patient,,Good,,2020-04-26 17:38:00.943000+00:00,,,,MicrobeNet genomes
648.157,Aeromonas caviae strain ScAc2001,1,9/0,Aeromonas caviae,\cite{tang2020co},HP,"Human, Homo sapiens",Watery diarrhea,,Good,2019-05,2020-01-31 23:41:12.327000+00:00,,Diarrhea,,Whole genome sequenced and analysis MDR isolat...
565.15,Atlantibacter hermannii strain 3608,1,1/0,Atlantibacter hermannii,\cite{ioannou2019escherichia},HP,"Human, Homo sapiens",Wound swab,,Good,2015,2020-02-07 13:50:54.571000+00:00,,Wound infection 8,,Extended Spectrum beta-lactamase producing Ent...
29459.655,Brucella melitensis strain HN20190002,1,13/0,Brucella melitensis,\cite{li2020molecular},HP,"Human, Homo sapiens",,,Good,2019-02-04,2020-01-31 21:07:14.657000+00:00,,,,"strain was obtained from patients in Hainan,China"
87883.284,Burkholderia multivorans strain C1576,1,107/0,Burkholderia multivorans,\cite{silva2016long},HP,"Human, Homo sapiens",sputum,isolated from a cystic fibrosis patient,Good,1992,2020-03-27 14:33:49.442000+00:00,,Cystic fibrosis,,Whole genome sequencing of a clinical isolate ...


In [24]:
# Number of verified test genomes in each group.
test_genome_verified.loc[:, GROUP].value_counts()

OHP     77
NHP     61
ONHP    41
HP      25
Name: Group, dtype: int64

In [30]:
def calculate_results_per_group(test_genomes, preds_arr):
    
    for group_name, genomes in test_genomes.groupby(GROUP):
        
        y_test_group = test_genomes.loc[genomes.index][LABEL]
        y_pred = preds_arr.loc[genomes.index]['predictions']
        correct = sklearn.metrics.accuracy_score(y_test_group, y_pred, normalize=False)
        accuracy = sklearn.metrics.accuracy_score(y_test_group, y_pred)
        print(f'{group_name}\taccuracy={accuracy:.2f} correctly predicted={correct}/{len(y_test_group)}')


# BacPaCS results

In [32]:
# Read the genome ids as text so the `.str` accessor below always applies.
bacpacs_pred_arr = pd.read_csv('bacpacs_prediction_on_newTest14.2.21.csv', dtype={'genome_id': str})
# Remove the literal '.PATRIC' suffix. `str.strip('.PATRIC')` would instead strip any of
# the characters '.', 'P', 'A', 'T', 'R', 'I', 'C' from both ends of the id.
bacpacs_pred_arr['genome_id'] = bacpacs_pred_arr['genome_id'].str.replace(r'\.PATRIC$', '', regex=True)
bacpacs_pred_arr = bacpacs_pred_arr.set_index('genome_id')

In [33]:
# Per-group accuracy of the BacPaCS predictions on the verified test genomes.
print('BacPaCS results:')
calculate_results_per_group(preds_arr=bacpacs_pred_arr, test_genomes=test_genome_verified)

BacPaCS results:
HP	accuracy=0.96 correctly predicted=24/25
NHP	accuracy=0.72 correctly predicted=44/61
OHP	accuracy=0.82 correctly predicted=63/77
ONHP	accuracy=0.73 correctly predicted=30/41


# PathogenFinder results

In [45]:
folder = 'results pathogenfinder/'
files = os.listdir(folder)
# float dtype: the column starts out all-NaN (no prediction read yet), which an
# integer dtype cannot represent.
pathogenfinder_results = pd.DataFrame(index=test_genome_verified.index, columns=['predictions'], dtype=float)

for name in files:
    if name.startswith('PathogenFinder_cdhit_raw'):
        continue

    # The genome id is the third '_'-separated field of the file name (before '.txt').
    # Anything that does not have that shape is reported and skipped instead of
    # raising an IndexError.
    name_fields = name.split('.txt', 1)[0].split('_')
    if len(name_fields) < 3:
        print(f'Skipping unexpected file: {name}')
        continue
    genome_id = name_fields[2]
    if genome_id not in test_genome_verified.index:
        continue

    with open(os.path.join(folder, name), 'r') as input_file:
        for line in input_file:
            if line.startswith('The organisms is predicted as human pathogenic ::'):
                # The verdict ('Yes' / 'No') is the second tab-separated field.
                line_fields = line.strip().split('\t')
                predicted_pathogen = line_fields[1] if len(line_fields) > 1 else None
                if predicted_pathogen == 'Yes':
                    pathogenfinder_results.at[genome_id, 'predictions'] = 1
                elif predicted_pathogen == 'No':
                    pathogenfinder_results.at[genome_id, 'predictions'] = 0
                else:
                    print(f'Error in results of file: {name}')
            # Validate that the file name matches the input sequence
            if line.startswith('#input_seq:'):
                seq_fields = line.split('fig|')
                # A line without 'fig|' yields an empty id and is reported as a mismatch.
                seq_id = seq_fields[1].split('.peg')[0] if len(seq_fields) > 1 else ''
                if genome_id != seq_id:
                    print(f'Error in file name: {name}')
                # Nothing after the first '#input_seq:' line is read.
                break

In [46]:
# Per-group accuracy of the PathogenFinder predictions on the verified test genomes.
print('Pathogenfinder results:')
calculate_results_per_group(preds_arr=pathogenfinder_results, test_genomes=test_genome_verified)

Pathogenfinder results:
HP	accuracy=0.96 correctly predicted=24/25
NHP	accuracy=0.59 correctly predicted=36/61
OHP	accuracy=0.86 correctly predicted=66/77
ONHP	accuracy=0.29 correctly predicted=12/41


# WSPC results

In [39]:
# Genome ids are read as text (not parsed as numbers) and become the index.
wspc_preds = (
    pd.read_csv('wspc_test_preds.csv', dtype={GENOME_ID: str})
    .set_index(GENOME_ID)
)

In [47]:
# Per-group accuracy of the WSPC predictions on the verified test genomes.
calculate_results_per_group(preds_arr=wspc_preds, test_genomes=test_genome_verified)

HP	accuracy=1.00 correctly predicted=25/25
NHP	accuracy=0.93 correctly predicted=57/61
OHP	accuracy=0.92 correctly predicted=71/77
ONHP	accuracy=0.59 correctly predicted=24/41
