In [1]:
import sys  
sys.path.insert(0, 'Code')

import numpy as np
import json
import pandas as pd
import random
from collections import Counter
from pathlib import Path

import util
import taxa
import warnings
warnings.filterwarnings('ignore')

from ete3 import NCBITaxa
ncbi = NCBITaxa()

In [2]:
# Show full cell contents and every row when DataFrames are displayed in this notebook.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

# Directory that holds the labeled genome tables read below (labeled_train.csv, labeled_test.csv).
# Built from path components instead of a backslash-separated literal: '\D' and '\P' are
# invalid string escapes, and a backslash is only treated as a separator on Windows.
input_path = Path('..') / 'Data' / 'PATRIC_labeled_genomes'

In [3]:
## Consts
# Column names, label values and rank names shared by the cells below.

UNCLASSIFIED = 'unclassified'
LABEL = 'Label'  # column holding each genome's label
GENOME_ID = 'Genome ID'  # column used as the DataFrame index when the CSVs are loaded
GENOME_NAME = 'Genome Name'
SPECIES = 'species'  # rank name passed to taxa.get_first_tax_desc; also the species column name
HP = 'HP'  # label value, also used as a per-species count column
NHP = 'NHP'  # label value, also used as a per-species count column
RATIO = 'Ratio'  # column: smaller label count divided by larger label count of a species
MAJORITY_LABEL = 'Majority Label'  # column: label with the larger count for a species


In [4]:
def get_genome_to_species(selected_genomes):
    """Look up the species of every genome and tally genomes per species.

    Args:
        selected_genomes: iterable of genome IDs.

    Returns:
        A pair ``(species_to_count, genome_to_species)``: a Counter mapping each
        species to the number of genomes resolved to it, and a dict mapping each
        genome ID to its species as returned by ``taxa.get_first_tax_desc``.
    """
    # Resolve each genome once, in input order.
    resolved = [(genome_id, taxa.get_first_tax_desc(genome_id, SPECIES))
                for genome_id in selected_genomes]

    species_to_count = Counter(species_name for _, species_name in resolved)
    genome_to_species = dict(resolved)

    return species_to_count, genome_to_species


In [5]:
def choose_species_representative(genome_to_species, species, genomes):
    """Randomly pick one genome of ``species`` out of ``genomes``.

    Args:
        genome_to_species: dict mapping genome ID to species.
        species: the species a representative is wanted for.
        genomes: container of eligible genome IDs (only membership is tested).

    Returns:
        A genome ID drawn with ``random.choice`` from the eligible genomes of the
        species, or None when there is no such genome.
    """
    # Candidates keep the iteration order of genome_to_species, so a seeded
    # random generator yields the same pick on every run.
    candidates = []
    for genome_id, genome_species in genome_to_species.items():
        if species == genome_species and genome_id in genomes:
            candidates.append(genome_id)

    if not candidates:
        return None

    return random.choice(candidates)

In [6]:
def calc_hp_nhp_species(genome_to_species, genomes_df):
    """Count, for every species, how many of its genomes carry each label.

    Args:
        genome_to_species: dict mapping genome ID to species.
        genomes_df: DataFrame indexed by genome ID with a ``LABEL`` column.

    Returns:
        dict mapping species to a Counter of label -> number of genomes. Every
        Counter starts with explicit zero entries for ``HP`` and ``NHP``.
    """
    label_counts_by_species = {}

    for genome_id in genomes_df.index:
        species_name = genome_to_species[genome_id]
        genome_label = genomes_df.at[genome_id, LABEL]

        if species_name not in label_counts_by_species:
            # Seed both labels so later min()/max() over the counts see both keys.
            label_counts_by_species[species_name] = Counter({HP: 0, NHP: 0})

        label_counts_by_species[species_name][genome_label] += 1

    return label_counts_by_species

In [7]:
def get_genomes_by_label(genomes_df, genome_to_species, curr_species, label):
    """Return the genomes of ``curr_species`` whose label equals ``label``.

    Args:
        genomes_df: DataFrame indexed by genome ID with a ``LABEL`` column.
        genome_to_species: dict mapping genome ID to species.
        curr_species: species whose genomes are wanted.
        label: label value the returned genomes must carry.

    Returns:
        list of genome IDs, in the iteration order of ``genome_to_species``.
    """
    return [genome for genome, species in genome_to_species.items()
            # Test the species first: the DataFrame lookup is then only evaluated
            # for genomes of the requested species, instead of for every genome,
            # and genomes of other species need not be present in genomes_df.
            if species == curr_species
            and genomes_df.loc[genome, LABEL] == label]

def get_minority_ratio(curr_species_to_hp_nhp, majority_label, minority_label):
    """Return the minority-label count divided by the majority-label count.

    Args:
        curr_species_to_hp_nhp: mapping of label -> genome count for one species.
        majority_label: key of the larger count (the denominator).
        minority_label: key of the smaller count (the numerator).
    """
    minority_count = curr_species_to_hp_nhp[minority_label]
    majority_count = curr_species_to_hp_nhp[majority_label]

    return minority_count / majority_count

def update_rep_df(species_representatives_df, rep_genome, species, genomes_df):
    """Record ``rep_genome`` as the representative of ``species`` (in place).

    Does nothing when ``rep_genome`` is falsy (e.g. None because no genome was
    eligible). Otherwise the row labelled ``rep_genome`` gets the species and the
    genome's label taken from ``genomes_df``.
    """
    if not rep_genome:
        return

    species_representatives_df.loc[rep_genome, SPECIES] = species
    rep_label = genomes_df.loc[rep_genome, LABEL]
    species_representatives_df.loc[rep_genome, LABEL] = rep_label
        
def update_species_without_majority_df(species_without_majority, species, ratio, curr_species_to_hp_nhp):
    """Store the ratio and per-label counts of ``species`` in its row (in place).

    Args:
        species_without_majority: DataFrame indexed by species with the columns
            ``HP``, ``NHP`` and ``RATIO``.
        species: row label to write.
        ratio: value written to the ``RATIO`` column.
        curr_species_to_hp_nhp: mapping of label -> genome count for the species.
    """
    species_without_majority.loc[species, RATIO] = ratio

    for label in (HP, NHP):
        species_without_majority.loc[species, label] = curr_species_to_hp_nhp[label]

In [8]:
def choose_species_for_train_and_test(genomes_df, genome_to_species, species_to_count, train_df, test_df,
                                      minority_ratio_threshold=0.1):
    """Pick one random representative genome per species for train and for test.

    For every species that has genomes of more than one label, the ratio
    (minority count / majority count) decides how the species is treated:

    * ratio below ``minority_ratio_threshold``: the minority-labelled genomes are
      excluded from the train candidates, so the train representative carries
      the majority label. The test candidates are not filtered.
    * otherwise: the species gets no train representative at all, is recorded in
      the "without majority" table, and only receives a test representative.

    Args:
        genomes_df: DataFrame indexed by genome ID with a ``LABEL`` column,
            covering both train and test genomes.
        genome_to_species: dict mapping genome ID to species.
        species_to_count: mapping of species -> number of genomes.
        train_df: DataFrame whose index holds the train genome IDs.
        test_df: DataFrame whose index holds the test genome IDs.
        minority_ratio_threshold: ratio below which a species is considered to
            have a clear majority label. Defaults to 0.1.

    Returns:
        ``(species_representatives_train, species_representatives_test,
        species_without_majority)``. The first two are indexed by representative
        genome ID with ``SPECIES`` and ``LABEL`` columns; the last is indexed by
        species with ``HP``, ``NHP`` and ``RATIO`` columns.
    """
    # Fixed seed (global `random` state) so the chosen representatives are reproducible.
    random.seed(10)

    species_to_hp_nhp_train_test = calc_hp_nhp_species(genome_to_species, genomes_df)

    species_without_majority = pd.DataFrame(columns=[HP, NHP, RATIO])

    species_representatives_train = pd.DataFrame(columns=[SPECIES, LABEL])
    species_representatives_test = pd.DataFrame(columns=[SPECIES, LABEL])

    # Most species drop nothing, so the unfiltered train index is reused for them.
    all_train_genomes = train_df.index

    for species, count in species_to_count.items():

        genomes_to_drop = []

        if count > 1:
            curr_species_to_hp_nhp = species_to_hp_nhp_train_test[species]

            # Only species that have genomes of every label need the ratio test.
            if all(label_count > 0 for label_count in curr_species_to_hp_nhp.values()):

                majority_label = max(curr_species_to_hp_nhp, key=curr_species_to_hp_nhp.get)
                minority_label = min(curr_species_to_hp_nhp, key=curr_species_to_hp_nhp.get)

                ratio = get_minority_ratio(curr_species_to_hp_nhp, majority_label, minority_label)
                if ratio < minority_ratio_threshold:  # remove the minority labeled genomes
                    genomes_to_drop += get_genomes_by_label(genomes_df, genome_to_species, species, minority_label)
                else:  # remove the entire species from train; its test representative is added below
                    update_species_without_majority_df(species_without_majority, species, ratio, curr_species_to_hp_nhp)
                    continue

        if genomes_to_drop:
            train_genomes = train_df.loc[~train_df.index.isin(genomes_to_drop)].index
        else:
            train_genomes = all_train_genomes

        rep_genome = choose_species_representative(genome_to_species, species, train_genomes)
        update_rep_df(species_representatives_train, rep_genome, species, genomes_df)

        rep_genome = choose_species_representative(genome_to_species, species, test_df.index)
        update_rep_df(species_representatives_test, rep_genome, species, genomes_df)

    # add species without majority to test
    for species in species_without_majority.index:
        rep_genome = choose_species_representative(genome_to_species, species, test_df.index)
        update_rep_df(species_representatives_test, rep_genome, species, genomes_df)

    return species_representatives_train, species_representatives_test, species_without_majority

In [9]:
def get_unclassified_species_to_genomes(genomes_df):
    """Group genome IDs by the name ``taxa.get_unclassified_species`` returns for them.

    Genomes for which the lookup returns a falsy value are left out.

    Args:
        genomes_df: DataFrame whose index holds the genome IDs.

    Returns:
        dict mapping each returned name to the list of its genome IDs, in index order.
    """
    grouped = {}

    for genome_id in genomes_df.index:
        unclassified_name = taxa.get_unclassified_species(genome_id)
        if not unclassified_name:
            continue

        if unclassified_name in grouped:
            grouped[unclassified_name].append(genome_id)
        else:
            grouped[unclassified_name] = [genome_id]

    return grouped

In [10]:
def choose_genomes_unclassified_species(species_to_genomes):
    """Pick one random genome for every key of ``species_to_genomes``.

    Args:
        species_to_genomes: dict mapping a species name to a non-empty list of
            genome IDs.

    Returns:
        DataFrame indexed by the chosen genome IDs with a ``SPECIES`` column.
    """
    # Fixed seed (global `random` state) so the picks are reproducible.
    random.seed(10)

    species_representatives = pd.DataFrame(columns=[SPECIES])

    for species_name in species_to_genomes:
        chosen_genome = random.choice(species_to_genomes[species_name])
        species_representatives.loc[chosen_genome, SPECIES] = species_name

    return species_representatives

In [11]:
def remove_duplicates(main_df, test_genomes_df, train_genomes_df):
    """Drop rows with a repeated 'Genome Name' from all three frames, in place.

    A row of ``main_df`` counts as a duplicate when an earlier row has the same
    'Genome Name'. The index labels of those rows are removed from ``main_df``
    and, where present, from ``test_genomes_df`` and ``train_genomes_df``.
    Prints how many duplicate labels belonged to train and to test.
    """
    is_repeat = main_df.duplicated(subset=['Genome Name'])
    dup_ids = main_df.loc[is_repeat].index

    # Count before the train/test frames are modified.
    dups_in_train = dup_ids.isin(train_genomes_df.index).sum()
    dups_in_test = dup_ids.isin(test_genomes_df.index).sum()

    main_df.drop(index=dup_ids, inplace=True)
    # A duplicate label lives in only one of the two splits, so missing labels are ignored.
    for split_df in (test_genomes_df, train_genomes_df):
        split_df.drop(index=dup_ids, inplace=True, errors='ignore')

    print(f'Removed {dups_in_train} duplicates from train, Removed {dups_in_test} duplicates from test')


## Load data and remove duplicates

In [12]:
# Read the labeled train table: every column is kept as a string and the
# Genome ID column becomes the index.
train_path = input_path / 'labeled_train.csv'
train_genomes_df  = pd.read_csv(train_path, dtype=str).set_index(GENOME_ID)

# Same for the labeled test table.
test_path = input_path / 'labeled_test.csv'
test_genomes_df  = pd.read_csv(test_path, dtype=str).set_index(GENOME_ID)

# Combined table, train rows first; the Genome ID index is preserved.
main_df = pd.concat([train_genomes_df, test_genomes_df], ignore_index=False)

# Mutates all three frames in place: rows whose 'Genome Name' repeats an
# earlier row of main_df are dropped.
remove_duplicates(main_df, test_genomes_df, train_genomes_df)

# Sanity check: the combined table still holds exactly the train and test rows.
assert len(train_genomes_df) + len(test_genomes_df) == len(main_df)

print(f'{len(train_genomes_df)} train genomes, {len(test_genomes_df)} test genomes, {len(main_df)} total genomes')

Removed 0 duplicates from train, Removed 16 duplicates from test
35860 train genomes, 3660 test genomes, 39520 total genomes


In [20]:
train_genomes_df.Label.value_counts()

HP     32908
NHP     2952
Name: Label, dtype: int64

In [21]:
test_genomes_df.Label.value_counts()

HP     2941
NHP     719
Name: Label, dtype: int64

In [22]:
train_genomes_df['Date Inserted'] = pd.to_datetime(train_genomes_df['Date Inserted'])
train_genomes_df['Date Inserted'][:10]

Genome ID
28450.1894   2019-10-02 19:02:02.913000+00:00
158836.408   2019-10-02 13:36:38.670000+00:00
881260.35    2019-10-02 13:36:35.344000+00:00
158836.407   2019-10-02 13:36:27.278000+00:00
158836.406   2019-10-02 13:35:41.856000+00:00
881260.34    2019-10-02 13:35:17.975000+00:00
1639.6892    2019-10-02 13:35:16.011000+00:00
1639.6890    2019-10-02 13:35:14.080000+00:00
1639.6891    2019-10-02 13:35:11.994000+00:00
1639.6889    2019-10-02 13:35:09.973000+00:00
Name: Date Inserted, dtype: datetime64[ns, UTC]

In [23]:
min(train_genomes_df['Date Inserted'])

Timestamp('2014-12-08 22:09:24.393000+0000', tz='UTC')

In [24]:
max(train_genomes_df['Date Inserted'])

Timestamp('2019-10-02 19:02:02.913000+0000', tz='UTC')

### Check that all train genomes were inserted before 2019-11-01

In [25]:
assert all(train_genomes_df['Date Inserted'] < pd.Timestamp('2019-11-01', tz='US/Pacific'))

In [26]:
test_genomes_df['Date Inserted'] = pd.to_datetime(test_genomes_df['Date Inserted'])
test_genomes_df['Date Inserted'][-10:]

Genome ID
1313.19274   2019-11-20 17:59:11.446000+00:00
1313.19086   2019-11-20 16:03:51.188000+00:00
1313.19043   2019-11-20 15:34:09.843000+00:00
1313.19022   2019-11-20 15:20:14.361000+00:00
1313.18914   2019-11-20 12:48:23.499000+00:00
1313.18874   2019-11-20 11:43:16.391000+00:00
1313.18749   2019-11-20 09:15:26.687000+00:00
1313.18642   2019-11-20 06:42:28.108000+00:00
1313.18633   2019-11-20 06:27:31.038000+00:00
1313.18492   2019-11-20 04:02:51.300000+00:00
Name: Date Inserted, dtype: datetime64[ns, UTC]

### Check that all test genomes were inserted after 2019-11-01

In [35]:
min(test_genomes_df['Date Inserted'])

Timestamp('2019-11-20 04:02:51.300000+0000', tz='UTC')

In [36]:
max(test_genomes_df['Date Inserted'])

Timestamp('2020-07-02 08:24:47.271000+0000', tz='UTC')

In [37]:
assert all(test_genomes_df['Date Inserted'] > pd.Timestamp('2019-11-01', tz='US/Pacific'))

In [38]:
selected_genomes = list(train_genomes_df.index) + list(test_genomes_df.index)

species_to_count, genome_to_species = get_genome_to_species(selected_genomes)

assert len(genome_to_species) == len(main_df)

# Select Representatives

In [39]:
species_representatives_train, species_representatives_test, species_without_majority = choose_species_for_train_and_test(main_df, genome_to_species, species_to_count, train_genomes_df, test_genomes_df)

In [40]:
species_representatives_train.head()

Unnamed: 0,species,Label
360118.7,Burkholderia pseudomallei,HP
158836.88,Enterobacter hormaechei,HP
881260.26,Enterobacter bugandensis,HP
1639.6889,Listeria monocytogenes,HP
1571816.3,Enterobacter roggenkampii,HP


In [41]:
len(species_representatives_train), len(species_representatives_test)

(1501, 277)

In [42]:
species_representatives_test.head()

Unnamed: 0,species,Label
28450.2018,Burkholderia pseudomallei,HP
158836.472,Enterobacter hormaechei,NHP
1639.7756,Listeria monocytogenes,HP
1812935.59,Enterobacter roggenkampii,HP
299767.1,Enterobacter ludwigii,HP


### Species that were removed from train because they don't have a majority label

In [43]:
len(species_without_majority)

39

In [44]:
species_without_majority

Unnamed: 0,HP,NHP,Ratio
Bacteroides dorei,8,37,0.216216
Bacillus cereus,16,7,0.4375
Neisseria meningitidis,278,43,0.154676
Leptotrichia wadei,2,1,0.5
Bacteroides fragilis,61,45,0.737705
Enterococcus avium,4,5,0.8
Staphylococcus pseudintermedius,2,1,0.5
Lactobacillus rhamnosus,2,20,0.1
Clostridiales genomosp. BVAB1,6,1,0.166667
Bacteroides massiliensis,1,1,1.0


In [45]:
assert all(species_without_majority[RATIO] >= 0.1)
assert not any(species_without_majority.index.isin(species_representatives_train[SPECIES]))

Number of species without a majority label that appear in the original train set (before selecting representatives):

In [46]:
len(set(genome_to_species[genome] for genome in train_genomes_df.index).intersection(species_without_majority.index))

38

Make sure these species were removed from train (the count below should match the count above):

In [47]:
species_removed_from_train = set(genome_to_species[genome] 
                                 for genome in train_genomes_df.index).difference(species_representatives_train[SPECIES])

len(species_removed_from_train)

38

Make sure no species were removed from test (the count below should be 0):

In [48]:
species_removed_from_test = set(genome_to_species[genome] 
                                for genome in test_genomes_df.index).difference(species_representatives_test[SPECIES])

len(species_removed_from_test)

0

In [49]:
def create_ratio_df(rep_df, species_to_hp_nhp):
    """Return a copy of ``rep_df`` extended with per-species label statistics.

    Args:
        rep_df: DataFrame with a ``SPECIES`` column (one row per representative).
        species_to_hp_nhp: dict mapping species to a label -> count mapping.

    Returns:
        A new DataFrame with the added columns ``HP`` and ``NHP`` (label counts of
        the row's species), ``RATIO`` (smallest count divided by largest count)
        and ``MAJORITY_LABEL`` (whichever of ``HP``/``NHP`` has the larger count).
    """
    def count_of(label):
        # Build a per-species lookup of the count for one label (0 when absent).
        return lambda species: species_to_hp_nhp[species].get(label, 0)

    def smallest_over_largest(species):
        counts = species_to_hp_nhp[species].values()
        return min(counts) / max(counts)

    ratio_df = rep_df.copy()
    species_column = ratio_df[SPECIES]

    ratio_df[HP] = species_column.apply(count_of(HP))
    ratio_df[NHP] = species_column.apply(count_of(NHP))
    ratio_df[RATIO] = species_column.apply(smallest_over_largest)

    ratio_df[MAJORITY_LABEL] = ratio_df[[HP, NHP]].idxmax(axis=1)

    return ratio_df

In [50]:
species_to_hp_nhp = calc_hp_nhp_species(genome_to_species, main_df)
species_representatives_train_ratio = create_ratio_df(species_representatives_train, species_to_hp_nhp)

species_representatives_train_ratio.head(15)

Unnamed: 0,species,Label,HP,NHP,Ratio,Majority Label
360118.7,Burkholderia pseudomallei,HP,168,0,0.0,HP
158836.88,Enterobacter hormaechei,HP,305,6,0.019672,HP
881260.26,Enterobacter bugandensis,HP,12,0,0.0,HP
1639.6889,Listeria monocytogenes,HP,197,0,0.0,HP
1571816.3,Enterobacter roggenkampii,HP,12,0,0.0,HP
299767.86,Enterobacter ludwigii,HP,6,0,0.0,HP
61645.126,Enterobacter asburiae,HP,20,0,0.0,HP
562.21575,Escherichia coli,HP,4352,396,0.090993,HP
1338.6,Streptococcus intermedius,HP,13,0,0.0,HP
184250.3,Streptococcus constellatus,HP,3,0,0.0,HP


In [51]:
assert all(species_representatives_train_ratio[RATIO] < 0.1)
assert all(species_representatives_train_ratio.apply(lambda row: row[LABEL] == row[MAJORITY_LABEL], axis=1))

In [52]:
species_representatives_test_ratio = create_ratio_df(species_representatives_test, species_to_hp_nhp)
species_representatives_test_ratio.head(15)

Unnamed: 0,species,Label,HP,NHP,Ratio,Majority Label
28450.2018,Burkholderia pseudomallei,HP,168,0,0.0,HP
158836.472,Enterobacter hormaechei,NHP,305,6,0.019672,HP
1639.7756,Listeria monocytogenes,HP,197,0,0.0,HP
1812935.59,Enterobacter roggenkampii,HP,12,0,0.0,HP
299767.1,Enterobacter ludwigii,HP,6,0,0.0,HP
61645.317,Enterobacter asburiae,HP,20,0,0.0,HP
562.55247,Escherichia coli,HP,4352,396,0.090993,HP
573.29103,Klebsiella pneumoniae,HP,3846,8,0.00208,HP
287.11178,Pseudomonas aeruginosa,HP,1955,0,0.0,HP
44275.61,Leptospira interrogans,HP,6,0,0.0,HP


In [53]:
assert any(species_representatives_test_ratio[RATIO] >= 0.1)

In [54]:
species_representatives_test_ratio[species_representatives_test_ratio[RATIO] >= 0.1]

Unnamed: 0,species,Label,HP,NHP,Ratio,Majority Label
1396.2563,Bacillus cereus,NHP,16,7,0.4375,HP
487.3238,Neisseria meningitidis,HP,278,43,0.154676,HP
817.1653,Bacteroides fragilis,HP,61,45,0.737705,HP
33945.49,Enterococcus avium,NHP,4,5,0.8,NHP
47715.653,Lactobacillus rhamnosus,NHP,2,20,0.1,NHP
699240.9,Clostridiales genomosp. BVAB1,HP,6,1,0.166667,HP
1505.72,Paeniclostridium sordellii,NHP,4,2,0.5,HP
1351.348,Enterococcus faecalis,HP,98,13,0.132653,HP
1318.781,Streptococcus parasanguinis,NHP,3,25,0.12,NHP
1282.3805,Staphylococcus epidermidis,NHP,85,69,0.811765,HP


In [55]:
assert any(species_representatives_test_ratio.apply(lambda row: row[LABEL] != row[MAJORITY_LABEL], axis=1))

In [56]:
species_representatives_test_ratio[species_representatives_test_ratio.apply(lambda row: 
                                                                            row[LABEL] != row[MAJORITY_LABEL], axis=1)]

Unnamed: 0,species,Label,HP,NHP,Ratio,Majority Label
158836.472,Enterobacter hormaechei,NHP,305,6,0.019672,HP
1352.8997,Enterococcus faecium,NHP,666,7,0.010511,HP
1311.2752,Streptococcus agalactiae,NHP,236,5,0.021186,HP
1396.2563,Bacillus cereus,NHP,16,7,0.4375,HP
1505.72,Paeniclostridium sordellii,NHP,4,2,0.5,HP
1282.3805,Staphylococcus epidermidis,NHP,85,69,0.811765,HP
1260.144,Finegoldia magna,NHP,10,2,0.2,HP


In [57]:
print('test', len(species_representatives_test), 'train', len(species_representatives_train))

test 277 train 1501


### Handle unclassified species

In [58]:
# Representatives for which taxa.get_unclassified_species returns a truthy value.
test_genomes_with_unclassified_species = [genome for genome in species_representatives_test.index 
                                          if taxa.get_unclassified_species(genome)]

train_genomes_with_unclassified_species = [genome
                                           for genome in species_representatives_train.index 
                                           if taxa.get_unclassified_species(genome)]


# Drop those representatives; the *_filtered frames are new objects and the
# originals stay untouched. One representative per unclassified species is
# chosen separately in the following cells.
species_representatives_test_filtered = species_representatives_test.drop(test_genomes_with_unclassified_species, 
                                                                          axis='index')
species_representatives_train_filtered = species_representatives_train.drop(train_genomes_with_unclassified_species, 
                                                                            axis='index')

# Report how many representatives were removed from each split.
print('test before', len(species_representatives_test), 'train before', len(species_representatives_train))
print('test after', len(species_representatives_test_filtered), 'train after', len(species_representatives_train_filtered))

test before 277 train before 1501
test after 170 train after 536


Count classified species with label ratio != 0

In [59]:
sum(species_representatives_train_ratio.loc[species_representatives_train_filtered.index, RATIO] != 0)

29

## Choose one representative from each unclassified species

In [60]:
def create_unclassified_rep_df(genomes_df, label_df):
    """Choose one representative genome per unclassified species and attach its label.

    Args:
        genomes_df: DataFrame whose index holds the candidate genome IDs.
        label_df: DataFrame indexed by genome ID with a ``LABEL`` column; must
            contain every chosen genome.

    Returns:
        DataFrame indexed by the chosen genome IDs with ``SPECIES`` and ``LABEL`` columns.
    """
    representatives_unclassified = choose_genomes_unclassified_species(
        get_unclassified_species_to_genomes(genomes_df)
    )

    chosen_ids = representatives_unclassified.index
    representatives_unclassified[LABEL] = label_df.loc[chosen_ids, LABEL]

    return representatives_unclassified

In [61]:
representatives_unclassified_test = create_unclassified_rep_df(test_genomes_df, label_df=main_df)
representatives_unclassified_train = create_unclassified_rep_df(train_genomes_df, label_df=main_df)

In [62]:
len(representatives_unclassified_test), len(representatives_unclassified_train)

(36, 105)

In [63]:
representatives_unclassified_test.head()

Unnamed: 0,species,Label
2026720.132,unclassified Saccharibacteria,NHP
1871037.25,unclassified Flavobacteriaceae,NHP
2044936.47,unclassified Bacteroidia,NHP
2044938.12,unclassified Bacteria,NHP
59823.699,unclassified Prevotella,NHP


In [64]:
representatives_unclassified_train.head()

Unnamed: 0,species,Label
2584561.3,unclassified Akkermansia,NHP
1906334.4,unclassified Corynebacterium,HP
1972757.3,unclassified Klebsiella,HP
713049.5,unclassified Bacteria,NHP
2508709.3,unclassified Lactobacillus,NHP


### Merge all representatives from classified and unclassified species

In [65]:
final_train = pd.concat([species_representatives_train_filtered, representatives_unclassified_train])

final_test = pd.concat([species_representatives_test_filtered, representatives_unclassified_test])

In [66]:
assert len(final_test) == len(species_representatives_test_filtered) + len(representatives_unclassified_test)
assert len(final_train) == len(species_representatives_train_filtered) + len(representatives_unclassified_train)

In [67]:
print('final_test', len(final_test), 'final_train', len(final_train))

final_test 206 final_train 641


In [68]:
final_train[LABEL].value_counts()

HP     428
NHP    213
Name: Label, dtype: int64

In [69]:
final_test.head()

Unnamed: 0,species,Label
28450.2018,Burkholderia pseudomallei,HP
158836.472,Enterobacter hormaechei,NHP
1639.7756,Listeria monocytogenes,HP
1812935.59,Enterobacter roggenkampii,HP
299767.1,Enterobacter ludwigii,HP


In [70]:
final_train.head()

Unnamed: 0,species,Label
360118.7,Burkholderia pseudomallei,HP
158836.88,Enterobacter hormaechei,HP
881260.26,Enterobacter bugandensis,HP
1639.6889,Listeria monocytogenes,HP
1571816.3,Enterobacter roggenkampii,HP


# Write files

In [71]:
fields = ['Genome Name', 'Host Name', 'Isolation Source', 'Isolation Comments', 'Genome Quality', 'Collection Date', 'Date Inserted', 
          'Other Clinical', 'Host Health', 'Disease', 'Comments']

In [72]:
final_train.head()

Unnamed: 0,species,Label
360118.7,Burkholderia pseudomallei,HP
158836.88,Enterobacter hormaechei,HP
881260.26,Enterobacter bugandensis,HP
1639.6889,Listeria monocytogenes,HP
1571816.3,Enterobacter roggenkampii,HP


In [73]:
final_train_df = pd.concat([final_train, train_genomes_df.loc[final_train.index, fields]], axis=1)

final_train_df = final_train_df.sort_values(by=[LABEL])
final_train_df[LABEL].value_counts()

HP     428
NHP    213
Name: Label, dtype: int64

In [74]:
final_test_df = pd.concat([final_test, test_genomes_df.loc[final_test.index, fields]], axis=1)

final_test_df[LABEL].value_counts()

HP     106
NHP    100
Name: Label, dtype: int64

In [75]:
# Guard flag: set to True to write the final train/test tables as CSV files.
# The file names are relative, so they are created in the current working directory.
save = False

if save:
    final_train_df.to_csv('train_genomes.csv')
    final_test_df.to_csv('test_genomes.csv')