In [1]:
import pandas as pd
import re

In [2]:
# Silence pandas' SettingWithCopyWarning for the whole session. Later cells add
# columns to a frame obtained by filtering df_kraken, which would otherwise
# emit that warning on every assignment.
pd.set_option('mode.chained_assignment', None)

In [3]:
# Root of the local project checkout. Both inputs live beneath it, so this is
# the only line that has to change when the notebook runs on another machine.
BASE_DIR = '/Users/luke.thompson/deep-metagenomics'

# Tab-separated label table; read below for its Training_Label and Species columns.
path_labels = BASE_DIR + '/MarRef_training/MarRef.training.label.tsv'
# Tab-separated Kraken output, one row per read pair with five columns.
path_kraken = BASE_DIR + '/kraken2-output/MarRef.training.vs.standard.paired.kraken'

In [4]:
# Read the label table with every value kept as a string (the first column
# becomes the index), then build a lookup from training label to species name.
# Labels are kept as strings so they compare equal to the label text that is
# later cut out of the sequence IDs.
df_labels = pd.read_csv(
    path_labels,
    sep='\t',
    index_col=0,
    dtype='str',
)
dict_labels = {
    label: species
    for label, species in zip(df_labels['Training_Label'], df_labels['Species'])
}

In [5]:
# The Kraken output file has no header row, so read it raw and then attach
# descriptive column names (assigning the names raises if the file does not
# have exactly five columns).
df_kraken = pd.read_csv(path_kraken, header=None, sep='\t')
df_kraken.columns = [
    'classified',
    'sequence_id',
    'taxonomy_id',
    'length_bp',
    'kmer_mapping',
]

In [6]:
# Tally reads by the value of the `classified` column and report the fraction
# flagged 'C'.
classified_vc = df_kraken.classified.value_counts()
# .get() instead of ['C'] so that an input with no 'C' rows reports a count of
# 0 rather than raising KeyError.
n_classified = classified_vc.get('C', 0)
n_reads = classified_vc.sum()
# Reused by later cells to rescale per-classified-read rates to all reads.
classified_proportion = n_classified / n_reads
print('classified: %s/%s (%s)' % (n_classified, n_reads, classified_proportion))

classified: 7174271/7432328 (0.9652791157763758)


In [7]:
# Keep only the rows flagged 'C'. Later cells add columns to this frame; the
# explicit .copy() makes it an independent object, so those assignments are
# unambiguous and do not rely on SettingWithCopyWarning being silenced.
# (The variable name's spelling is kept because later cells reference it.)
df_kraken_classfied = df_kraken[df_kraken.classified == 'C'].copy()

In [8]:
# Each Kraken call is reported as "<name> (taxid <id>)". Derive the bare name,
# then a two-word (species-level) form, then the first word (genus-level).
def _strip_taxid(taxon):
    """Drop the trailing " (taxid ...)" annotation from a taxon string."""
    return re.sub(r'(.*) \(taxid.*\)', r'\1', taxon)


def _first_two_words(name):
    """Remove every "Candidatus " occurrence and keep the first two space-separated words."""
    without_candidatus = re.sub(r'Candidatus ', '', name)
    return ' '.join(without_candidatus.split(' ')[:2])


df_kraken_classfied['predicted_strain'] = list(
    map(_strip_taxid, df_kraken_classfied['taxonomy_id'])
)
df_kraken_classfied['predicted_species_clean'] = list(
    map(_first_two_words, df_kraken_classfied['predicted_strain'])
)
# A name above species rank (e.g. a bare genus or family) has a single word,
# so its "species" and "genus" forms are both that one word.
df_kraken_classfied['predicted_genus_clean'] = [
    species.split(' ', 1)[0]
    for species in df_kraken_classfied['predicted_species_clean']
]

In [9]:
# Sequence IDs are pipe-delimited (e.g. "label|286|MMP00000031-8676"); the
# second field is taken as the read's true label.
id_fields = (seq_id.split('|') for seq_id in df_kraken_classfied['sequence_id'])
df_kraken_classfied['true_label'] = [fields[1] for fields in id_fields]

In [10]:
# Translate each true label into its species name through dict_labels; a label
# missing from the table raises KeyError.
df_kraken_classfied['true_species'] = [
    dict_labels[label] for label in df_kraken_classfied['true_label']
]

# Strip every "_<word characters>" run (e.g. "maripaludis_A" -> "maripaludis",
# "Pseudomonas_E" -> "Pseudomonas") so names can be compared with the
# predicted ones.
underscore_suffix = re.compile(r'_\w+')
df_kraken_classfied['true_species_clean'] = [
    underscore_suffix.sub('', species)
    for species in df_kraken_classfied['true_species']
]

# Genus is the first space-separated word of the cleaned species name.
df_kraken_classfied['true_genus_clean'] = [
    species.split(' ', 1)[0]
    for species in df_kraken_classfied['true_species_clean']
]

In [11]:
# Row-wise flags: does the predicted name equal the true name at genus level
# and at species level (exact string equality on the cleaned names)?
df_kraken_classfied['match_genus'] = df_kraken_classfied['predicted_genus_clean'].eq(
    df_kraken_classfied['true_genus_clean']
)
df_kraken_classfied['match_species'] = df_kraken_classfied['predicted_species_clean'].eq(
    df_kraken_classfied['true_species_clean']
)

In [12]:
# Summarise how many classified reads were assigned the correct genus.
match_genus_vc = df_kraken_classfied.match_genus.value_counts()
# .get() instead of [True] so that a run with zero matching reads reports 0
# rather than raising KeyError on the absent True label.
n_match_genus = match_genus_vc.get(True, 0)
n_classified_reads = match_genus_vc.sum()
match_genus_proportion = n_match_genus / n_classified_reads
print('match genus: %s/%s' % (n_match_genus, n_classified_reads))
print('proportion of classified: %s' % match_genus_proportion)
# Multiply by the classified fraction to express the rate over all reads,
# counting unclassified reads as non-matches.
print('proportion of all reads: %s' % (match_genus_proportion * classified_proportion))

match genus: 5588605/7174271
proportion of classified: 0.778978797985189
proportion of all reads: 0.7519319653276874


In [13]:
# Summarise how many classified reads were assigned the correct species.
match_species_vc = df_kraken_classfied.match_species.value_counts()
# .get() instead of [True] so that a run with zero matching reads reports 0
# rather than raising KeyError on the absent True label.
n_match_species = match_species_vc.get(True, 0)
n_classified_reads = match_species_vc.sum()
match_species_proportion = n_match_species / n_classified_reads
print('match species: %s/%s' % (n_match_species, n_classified_reads))
print('proportion of classified: %s' % match_species_proportion)
# Multiply by the classified fraction to express the rate over all reads,
# counting unclassified reads as non-matches.
print('proportion of all reads: %s' % (match_species_proportion * classified_proportion))

match species: 3690136/7174271
proportion of classified: 0.5143569290872898
proportion of all reads: 0.49649800170283115


In [14]:
# Display the classified reads whose predicted genus differs from the true genus.
df_kraken_classfied.loc[~df_kraken_classfied['match_genus']]

Unnamed: 0,classified,sequence_id,taxonomy_id,length_bp,kmer_mapping,predicted_strain,predicted_species_clean,predicted_genus_clean,true_label,true_species,true_species_clean,true_genus_clean,match_genus,match_species
662,C,label|286|MMP00000031-8676,Homo sapiens (taxid 9606),146|128,0:64 9606:4 0:27 9606:5 0:12 |:| 0:94,Homo sapiens,Homo sapiens,Homo,286,Methanococcus maripaludis_A,Methanococcus maripaludis,Methanococcus,False,False
3673,C,label|286|MMP00000031-2654,Aliarcobacter cryaerophilus (taxid 28198),124|125,0:87 426368:1 0:2 |:| 0:39 28198:4 0:48,Aliarcobacter cryaerophilus,Aliarcobacter cryaerophilus,Aliarcobacter,286,Methanococcus maripaludis_A,Methanococcus maripaludis,Methanococcus,False,False
5793,C,label|863|MMP00000346-8414,Flavobacteriaceae (taxid 49546),82|117,2:48 |:| 49546:5 976:3 68336:4 49546:2 976:5 4...,Flavobacteriaceae,Flavobacteriaceae,Flavobacteriaceae,863,Winogradskyella sp000828715,Winogradskyella sp000828715,Winogradskyella,False,False
6266,C,label|863|MMP00000346-7468,Flavobacteriaceae (taxid 49546),104|110,2:5 49546:4 2:5 49546:3 2:5 49546:4 2:18 49546...,Flavobacteriaceae,Flavobacteriaceae,Flavobacteriaceae,863,Winogradskyella sp000828715,Winogradskyella sp000828715,Winogradskyella,False,False
8823,C,label|863|MMP00000346-2354,Flavobacteriaceae (taxid 49546),113|139,49546:13 2:66 |:| 2:18 200644:4 2:5 200644:8 9...,Flavobacteriaceae,Flavobacteriaceae,Flavobacteriaceae,863,Winogradskyella sp000828715,Winogradskyella sp000828715,Winogradskyella,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7431078,C,label|543|MMP5137566-2500,Proteobacteria (taxid 1224),103|113,1224:12 0:26 1224:1 0:4 1224:26 |:| 1224:79,Proteobacteria,Proteobacteria,Proteobacteria,543,Pseudomonas_E marincola,Pseudomonas marincola,Pseudomonas,False,False
7431485,C,label|543|MMP5137566-1686,Proteobacteria (taxid 1224),110|133,1224:2 2:2 1224:6 2:4 1224:46 2:5 1224:11 |:| ...,Proteobacteria,Proteobacteria,Proteobacteria,543,Pseudomonas_E marincola,Pseudomonas marincola,Pseudomonas,False,False
7431499,C,label|543|MMP5137566-1658,Proteobacteria (taxid 1224),85|115,1224:51 |:| 1224:81,Proteobacteria,Proteobacteria,Proteobacteria,543,Pseudomonas_E marincola,Pseudomonas marincola,Pseudomonas,False,False
7432005,C,label|543|MMP5137566-646,Gammaproteobacteria (taxid 1236),146|118,1236:4 0:27 1224:2 1236:18 0:16 1236:3 0:12 12...,Gammaproteobacteria,Gammaproteobacteria,Gammaproteobacteria,543,Pseudomonas_E marincola,Pseudomonas marincola,Pseudomonas,False,False


In [15]:
# Display the classified reads whose predicted species differs from the true
# species (this includes reads that were only resolved to genus or higher).
df_kraken_classfied.loc[~df_kraken_classfied['match_species']]

Unnamed: 0,classified,sequence_id,taxonomy_id,length_bp,kmer_mapping,predicted_strain,predicted_species_clean,predicted_genus_clean,true_label,true_species,true_species_clean,true_genus_clean,match_genus,match_species
629,C,label|286|MMP00000031-8742,Methanococcus (taxid 2184),83|91,2182:2 28890:3 131567:5 28890:3 2182:10 2184:2...,Methanococcus,Methanococcus,Methanococcus,286,Methanococcus maripaludis_A,Methanococcus maripaludis,Methanococcus,True,False
662,C,label|286|MMP00000031-8676,Homo sapiens (taxid 9606),146|128,0:64 9606:4 0:27 9606:5 0:12 |:| 0:94,Homo sapiens,Homo sapiens,Homo,286,Methanococcus maripaludis_A,Methanococcus maripaludis,Methanococcus,False,False
1592,C,label|286|MMP00000031-6816,Methanococcus (taxid 2184),87|149,2184:14 2183:34 2182:5 |:| 2183:31 2184:5 2183...,Methanococcus,Methanococcus,Methanococcus,286,Methanococcus maripaludis_A,Methanococcus maripaludis,Methanococcus,True,False
3325,C,label|286|MMP00000031-3350,Methanococcus (taxid 2184),112|125,2184:25 2182:5 28890:2 2283794:10 131567:4 218...,Methanococcus,Methanococcus,Methanococcus,286,Methanococcus maripaludis_A,Methanococcus maripaludis,Methanococcus,True,False
3673,C,label|286|MMP00000031-2654,Aliarcobacter cryaerophilus (taxid 28198),124|125,0:87 426368:1 0:2 |:| 0:39 28198:4 0:48,Aliarcobacter cryaerophilus,Aliarcobacter cryaerophilus,Aliarcobacter,286,Methanococcus maripaludis_A,Methanococcus maripaludis,Methanococcus,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7432041,C,label|543|MMP5137566-574,Pseudomonas (taxid 286),127|105,286:93 |:| 286:71,Pseudomonas,Pseudomonas,Pseudomonas,543,Pseudomonas_E marincola,Pseudomonas marincola,Pseudomonas,True,False
7432093,C,label|543|MMP5137566-470,Pseudomonas (taxid 286),127|103,1236:1 1224:12 1236:3 1224:7 2:9 1224:14 2:18 ...,Pseudomonas,Pseudomonas,Pseudomonas,543,Pseudomonas_E marincola,Pseudomonas marincola,Pseudomonas,True,False
7432158,C,label|543|MMP5137566-340,Pseudomonas (taxid 286),111|102,1236:1 286:76 |:| 286:32 0:10 286:5 0:18 286:3,Pseudomonas,Pseudomonas,Pseudomonas,543,Pseudomonas_E marincola,Pseudomonas marincola,Pseudomonas,True,False
7432230,C,label|543|MMP5137566-196,Pseudomonas (taxid 286),78|91,286:44 |:| 286:57,Pseudomonas,Pseudomonas,Pseudomonas,543,Pseudomonas_E marincola,Pseudomonas marincola,Pseudomonas,True,False


In [16]:
# Preview the first five rows of the annotated frame, including all derived columns.
df_kraken_classfied.head(n=5)

Unnamed: 0,classified,sequence_id,taxonomy_id,length_bp,kmer_mapping,predicted_strain,predicted_species_clean,predicted_genus_clean,true_label,true_species,true_species_clean,true_genus_clean,match_genus,match_species
0,C,label|286|MMP00000031-10000,Methanococcus maripaludis C7 (taxid 426368),129|88,426368:17 2184:2 39152:5 426368:13 39152:1 426...,Methanococcus maripaludis C7,Methanococcus maripaludis,Methanococcus,286,Methanococcus maripaludis_A,Methanococcus maripaludis,Methanococcus,True,True
1,C,label|286|MMP00000031-9998,Methanococcus maripaludis C7 (taxid 426368),109|149,39152:7 426368:12 39152:12 0:7 426368:3 0:5 42...,Methanococcus maripaludis C7,Methanococcus maripaludis,Methanococcus,286,Methanococcus maripaludis_A,Methanococcus maripaludis,Methanococcus,True,True
2,C,label|286|MMP00000031-9996,Methanococcus maripaludis C7 (taxid 426368),106|100,0:54 402880:2 0:16 |:| 426368:55 39152:7 426368:4,Methanococcus maripaludis C7,Methanococcus maripaludis,Methanococcus,286,Methanococcus maripaludis_A,Methanococcus maripaludis,Methanococcus,True,True
3,C,label|286|MMP00000031-9994,Methanococcus maripaludis C7 (taxid 426368),110|101,426368:76 |:| 2184:1 39152:7 2184:1 2182:7 402...,Methanococcus maripaludis C7,Methanococcus maripaludis,Methanococcus,286,Methanococcus maripaludis_A,Methanococcus maripaludis,Methanococcus,True,True
4,C,label|286|MMP00000031-9992,Methanococcus maripaludis C7 (taxid 426368),110|124,39152:21 426368:35 39152:5 426368:15 |:| 42636...,Methanococcus maripaludis C7,Methanococcus maripaludis,Methanococcus,286,Methanococcus maripaludis_A,Methanococcus maripaludis,Methanococcus,True,True
