In [1]:
import os
from shutil import copyfile

import pandas as pd
import sklearn
import sklearn.metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline

In [2]:
# Column names of the verified test-genome table that later cells refer to.
GENOME_ID = 'Genome ID'
GROUP = 'Group'

# Never truncate the rows of a displayed DataFrame/Series.
pd.set_option('display.max_rows', None)

In [3]:
# Load the verified test genomes. Every column is read as text (dtype=str), so
# ids such as 565.15 are kept verbatim rather than being parsed as numbers.
# The frame is indexed by genome id while the 'Genome ID' column itself is kept.
test_genome_verified = (
    pd.read_csv('test_genomes_verified.csv', index_col=0, dtype=str)
    .set_index(GENOME_ID, drop=False)
)
test_genome_verified.head()

Unnamed: 0_level_0,Genome ID,Genome Name,Label,HP/NHP entire dataset,species,References,Group
Genome ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
163603.4,163603.4,Actinomadura latina strain ATCC BAA-277,HP,1/0,Actinomadura latina,\cite{trujillo1997polyphasic},HP
648.157,648.157,Aeromonas caviae strain ScAc2001,HP,9/0,Aeromonas caviae,\cite{tang2020co},HP
565.15,565.15,Atlantibacter hermannii strain 3608,HP,1/0,Atlantibacter hermannii,\cite{ioannou2019escherichia},HP
2026190.21,2026190.21,Bacillus mobilis strain 1428.155,HP,1/0,Bacillus mobilis,\cite{2026190.21},HP
29459.655,29459.655,Brucella melitensis strain HN20190002,HP,13/0,Brucella melitensis,\cite{li2020molecular},HP


In [4]:
# How many verified test genomes fall into each group.
test_genome_verified.loc[:, GROUP].value_counts()

HP-OPP     76
NHP        61
NHP-OPP    41
HP         26
Name: Group, dtype: int64

In [20]:
def add_true_label_col(preds_arr, true_labels=None):
    """Return the predictions aligned to the verified genomes, plus a numeric 'Label' column.

    Parameters
    ----------
    preds_arr : pandas.DataFrame
        Predictions indexed by genome id. It must contain every id found in the
        index of ``true_labels``; otherwise the ``.loc`` lookup raises ``KeyError``.
        The frame passed in is not modified.
    true_labels : pandas.DataFrame, optional
        Table indexed by genome id with a 'Label' column holding 'HP' or 'NHP'.
        Defaults to the notebook-level ``test_genome_verified`` table.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows of ``preds_arr`` in the order of
        ``true_labels.index`` and an integer 'Label' column (NHP -> 0, HP -> 1).

    Raises
    ------
    ValueError
        If 'Label' contains a value other than 'HP' / 'NHP' (including missing values).
    """
    if true_labels is None:
        true_labels = test_genome_verified

    label_codes = {'NHP': 0, 'HP': 1}

    # Explicit copy so that adding a column never writes into (or warns about) the caller's frame.
    aligned = preds_arr.loc[true_labels.index].copy()

    # Encode with an explicit mapping: unmapped labels become NaN and are rejected below,
    # instead of being passed through as strings next to integer codes.
    encoded = true_labels['Label'].map(label_codes)
    unknown = encoded.isna()
    if unknown.any():
        bad_values = sorted(true_labels['Label'][unknown].astype(str).unique())
        raise ValueError(f"Unexpected values in 'Label' (expected 'HP' or 'NHP'): {bad_values}")

    # Both objects share the same index, so this assigns every row in one aligned step.
    aligned['Label'] = encoded.astype(int)

    return aligned
    

In [21]:
def calculate_results_per_group(test_genomes, preds_arr):
    """Print, for every genome group, the accuracy of the given predictions.

    Parameters
    ----------
    test_genomes : pandas.DataFrame
        Genome table indexed by genome id; it is grouped by its ``GROUP`` column.
    preds_arr : pandas.DataFrame
        Predictions indexed by genome id with a 'predictions' column. The true
        labels are attached through ``add_true_label_col``.

    Returns
    -------
    None
        One line per group is printed: accuracy and correct/total counts.
    """
    preds_arr = add_true_label_col(preds_arr)

    for group_name, genomes in test_genomes.groupby(GROUP):
        # Single row lookup per group; both columns are selected in the same .loc call.
        group_rows = preds_arr.loc[genomes.index, ['Label', 'predictions']]
        y_test_group = group_rows['Label']
        y_pred = group_rows['predictions']
        total = len(y_test_group)

        # normalize=False yields the number of correct predictions. The int() cast keeps
        # the printed "correct/total" stable in case the installed scikit-learn version
        # returns that count as a float.
        correct = int(sklearn.metrics.accuracy_score(y_test_group, y_pred, normalize=False))
        # groupby never produces an empty group, so total is at least 1 here.
        accuracy = correct / total
        print(f'{group_name}\taccuracy={accuracy:.2f} correctly predicted={correct}/{total}')

    

# BacPaCS results

In [6]:
# The prediction file is paired with the feature table row by row: the genome
# ids are copied over from the feature table's 'genome_id' column.
bacpacs_x = pd.read_csv('bacpacs_test7.2.21_X.csv')
bacpacs_pred_arr = pd.read_csv('bacpacs_prediction_on_newTest7.2.21.csv')
bacpacs_pred_arr['genome_id'] = bacpacs_x['genome_id']

# Keep only the part of each id that precedes the '.PATRIC' suffix.
ids = [raw_id.split('.PATRIC')[0] for raw_id in bacpacs_pred_arr['genome_id']]
bacpacs_pred_arr['genome_id'] = ids

bacpacs_pred_arr['true'] = ''
bacpacs_pred_arr = bacpacs_pred_arr.set_index('genome_id')

In [22]:
# Per-group accuracy of the BacPaCS predictions on the verified test genomes.
print('BacPaCS results:')
calculate_results_per_group(test_genomes=test_genome_verified, preds_arr=bacpacs_pred_arr)

BacPaCS results:
HP	accuracy=0.92 correctly predicted=24/26
HP-OPP	accuracy=0.83 correctly predicted=63/76
NHP	accuracy=0.72 correctly predicted=44/61
NHP-OPP	accuracy=0.73 correctly predicted=30/41


# PathogenFinder results

In [80]:
# Collect one PathogenFinder call per verified test genome (1 = human pathogen, 0 = not)
# from the per-genome result files in `folder`.
folder = 'results pathogenfinder/'
files = os.listdir(folder)
pathogenfinder_results = pd.DataFrame(index = test_genome_verified.index, columns = ['predictions'], dtype=int)
for name in files:
    if name.startswith('PathogenFinder_cdhit_raw'): continue
    # The genome id is the third '_'-separated token of the file name (before '.txt').
    # Entries with fewer tokens (e.g. a '.ipynb_checkpoints' directory) are skipped
    # instead of aborting the whole loop with an IndexError.
    name_tokens = name.split('.txt', 1)[0].split('_')
    if len(name_tokens) < 3: continue
    genome_id = name_tokens[2]
    if genome_id not in test_genome_verified.index: continue
    with open(os.path.join(folder, name), 'r') as input_file:
        for line in input_file:
            if line.startswith('Probability of being human pathogen ::'):
                prob = line.strip().split('\t')[1]
                # A reported probability of at least 0.5 counts as a pathogen prediction.
                pathogenfinder_results.at[genome_id, 'predictions'] = 1 if float(prob) >= 0.5 else 0
            # Validate that the file name matches the input sequence
            if line.startswith('#input_seq:'):
                seq_id = line.split('fig|')[1].split('.peg')[0]
                if genome_id != seq_id:
                    print(f'ERROR: {name} holds results for {seq_id}, expected {genome_id}')
                # Nothing after the '#input_seq:' line is read.
                break

# Cells that were never assigned are expected to still be missing (NaN); report them
# here rather than letting them surface later as a metric error.
missing_ids = pathogenfinder_results.index[pathogenfinder_results['predictions'].isna()]
if len(missing_ids) > 0:
    print(f'WARNING: no PathogenFinder prediction for {len(missing_ids)} genome(s): {list(missing_ids)}')

In [81]:
# Per-group accuracy of the PathogenFinder predictions on the verified test genomes.
print('Pathogenfinder results:')
calculate_results_per_group(test_genomes=test_genome_verified, preds_arr=pathogenfinder_results)

Pathogenfinder results:
HP	accuracy=0.96 correctly predicted=25/26
HP-OPP	accuracy=0.84 correctly predicted=64/76
NHP	accuracy=0.52 correctly predicted=32/61
NHP-OPP	accuracy=0.17 correctly predicted=7/41
