In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import uuid

fpath_fasta = 'current_Bacteria_unaligned.fa'
seq_data = open(fpath_fasta, 'r').read()
lines = seq_data.split('\n')

In [2]:
def get_seq_dict(desc, seq):
    try:
        keys = ['domain', 'phylum', 'class', 'subclass', 'order', 'suborder', 'family', 'genus']
        txn_info = desc.split('rootrank;')[1].split(';')
        seq_info = {}
        for i, key in enumerate(keys):
            try:
                idx = txn_info.index(key)
                seq_info[key] = txn_info[idx-1].replace('"', '')
            except ValueError:
                seq_info[key] = None
        seq_info['id'] = uuid.uuid4().hex[:10].upper()
        seq_info['seq_len'] = len(seq)
        seq_info['sequence'] = seq.upper()
    except Exception as e:
        print(i)
        print(txn_info)
        print(desc)
        raise IndexError(e)
    
    return seq_info

In [3]:
desc = ''
seq = ''

seq_dicts = []

for i, line in enumerate(tqdm(lines, total=len(lines))):
    if line.startswith('>'):
        if len(desc) > 0 and len(seq) > 0:
            seq_dicts.append(get_seq_dict(desc, seq))
        desc = line
        seq = ''
    else:
        seq += line

HBox(children=(IntProgress(value=0, max=46267413), HTML(value='')))

In [4]:
df_seqs = pd.DataFrame(seq_dicts)

In [5]:
seq_lens = df_seqs['seq_len'].values

lens, counts = np.unique(seq_lens, return_counts=True)

print(np.sum(counts[np.where(np.logical_and(lens >= 1270, lens <= 1370))]))

552082


In [6]:
# filter based on sequence length
df_len_filtered = df_seqs[df_seqs['seq_len'] >= 1270]
df_len_filtered = df_len_filtered[df_seqs['seq_len'] <= 1370]

print(df_len_filtered.size)

  This is separate from the ipykernel package so we can avoid doing imports until


6072902


In [7]:
top_phylum_names = list(df_len_filtered['phylum'].value_counts()[:3].index)
print(top_phylum_names)

df_phy_filtered = df_len_filtered[df_len_filtered['phylum'].isin(top_phylum_names)]
print(df_phy_filtered.shape)

['Actinobacteria', 'Proteobacteria', 'Firmicutes']
(443483, 11)


In [8]:
top_class_names = list(df_phy_filtered['class'].value_counts()[:5].index)
print(top_class_names)
df_class_filtered = df_phy_filtered[df_phy_filtered['class'].isin(top_class_names)]
print(df_class_filtered.shape)

['Actinobacteria', 'Clostridia', 'Gammaproteobacteria', 'Bacilli', 'Betaproteobacteria']
(393891, 11)


In [9]:
top_order_names = list(df_class_filtered['order'].value_counts()[:19].index)
print(top_order_names)

df_order_filtered = df_class_filtered[df_class_filtered['order'].isin(top_order_names)]
print(df_order_filtered.shape)

['Actinomycetales', 'Clostridiales', 'Lactobacillales', 'Burkholderiales', 'Pseudomonadales', 'Enterobacteriales', 'Bacillales', 'Pasteurellales', 'Neisseriales', 'Xanthomonadales', 'Coriobacteriales', 'Acidimicrobiales', 'Alteromonadales', 'Vibrionales', 'Rhodocyclales', 'Cardiobacteriales', 'Aeromonadales', 'Oceanospirillales', 'Chromatiales']
(386059, 11)


In [10]:
top_family_names = list(df_order_filtered['family'].value_counts()[:65].index)

df_family_filtered = df_order_filtered[df_order_filtered['family'].isin(top_family_names)]
print(df_family_filtered.shape)

(371247, 11)


In [11]:
top_genus_names = list(df_family_filtered['genus'].value_counts()[:393].index)

df_final = df_family_filtered[df_family_filtered['genus'].isin(top_genus_names)]
print(df_final.shape)

(339308, 11)


In [12]:
df_final['phylum'].value_counts()

Actinobacteria    147401
Firmicutes        102589
Proteobacteria     89318
Name: phylum, dtype: int64

In [13]:
df_final['class'].value_counts()

Actinobacteria         147401
Bacilli                 61977
Gammaproteobacteria     53550
Clostridia              40612
Betaproteobacteria      35768
Name: class, dtype: int64

In [14]:
df_final['order'].value_counts()

Actinomycetales      144873
Lactobacillales       52706
Clostridiales         40612
Burkholderiales       28799
Pseudomonadales       23083
Enterobacteriales     15064
Bacillales             9271
Pasteurellales         8469
Neisseriales           5978
Xanthomonadales        1880
Coriobacteriales       1773
Vibrionales            1306
Alteromonadales        1203
Aeromonadales          1084
Rhodocyclales           991
Acidimicrobiales        755
Oceanospirillales       612
Cardiobacteriales       595
Chromatiales            254
Name: order, dtype: int64

In [15]:
df_final['family'].value_counts()

Corynebacteriaceae                   62174
Propionibacteriaceae                 57240
Streptococcaceae                     41496
Moraxellaceae                        16263
Lachnospiraceae                      15097
Enterobacteriaceae                   15064
Comamonadaceae                       13460
Ruminococcaceae                      10140
Micrococcaceae                        8650
Pasteurellaceae                       8469
Burkholderiales_incertae_sedis        7087
Clostridiales_Incertae Sedis XI       7054
Pseudomonadaceae                      6820
Neisseriaceae                         5978
Burkholderiaceae                      5074
Bacillaceae 1                         4476
Carnobacteriaceae                     4047
Aerococcaceae                         3477
Staphylococcaceae                     3211
Peptostreptococcaceae                 3152
Microbacteriaceae                     2828
Lactobacillaceae                      2442
Clostridiaceae 1                      2441
Streptomyce

In [16]:
df_final['genus'].value_counts()

Corynebacterium                  61768
Propionibacterium                56755
Streptococcus                    40628
Aquabacterium                     7025
Diaphorobacter                    6891
Moraxella                         6719
Acinetobacter                     6696
Pseudomonas                       6603
Serratia                          5912
Faecalibacterium                  5217
Neisseria                         5182
Haemophilus                       4380
Anaerococcus                      4361
Acidovorax                        4227
Bacillus                          4186
Rothia                            4186
Lachnospiracea_incertae_sedis     3443
Dolosigranulum                    3435
Aerococcus                        3216
Burkholderia                      3190
Staphylococcus                    2834
Escherichia/Shigella              2784
Blautia                           2762
Klebsiella                        2612
Lactobacillus                     2408
Clostridium sensu stricto

In [17]:
df_final = df_final.reset_index(drop=True)
df_final = df_final.drop(['seq_len', 'subclass', 'suborder', 'domain'], axis=1)

In [18]:
df_final.sample(frac=1)[:10]

Unnamed: 0,class,family,genus,id,order,phylum,sequence
16692,Actinobacteria,Corynebacteriaceae,Corynebacterium,AECE89CE5F,Actinomycetales,Actinobacteria,GACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGGAAAG...
52127,Actinobacteria,Corynebacteriaceae,Corynebacterium,8E8CE2E9DF,Actinomycetales,Actinobacteria,GACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGGAAAG...
88614,Actinobacteria,Propionibacteriaceae,Propionibacterium,4B67B78BFA,Actinomycetales,Actinobacteria,GACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGGAAAG...
272762,Bacilli,Streptococcaceae,Streptococcus,59B9E6B437,Lactobacillales,Firmicutes,GACGAACGCTGGCGGCGTGCCTAATACATGCAAGTAGAACGCTGAA...
62619,Actinobacteria,Corynebacteriaceae,Corynebacterium,54CCBD9202,Actinomycetales,Actinobacteria,GACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGGAAAG...
180905,Betaproteobacteria,Neisseriaceae,Neisseria,EEF726192E,Neisseriales,Proteobacteria,ATTGAACGCTGGCGGCATGCTTTACACATGCAAGTCGGACGGCAGC...
248511,Bacilli,Aerococcaceae,Aerococcus,2764B3AE8F,Lactobacillales,Firmicutes,GACGAACGCTGGCGGCATGCCTAATACATGCAAGTCGAGCGAACAG...
161215,Betaproteobacteria,Comamonadaceae,Diaphorobacter,CC582E3A10,Burkholderiales,Proteobacteria,ATTGAACGCTGGCGGCATGCCTTACACATGCAAGTCGAACGGTAAC...
53296,Actinobacteria,Corynebacteriaceae,Corynebacterium,F233F278B3,Actinomycetales,Actinobacteria,GACGAACGCTGGTGGCGTGCTTAACACATGCTAGTCGAACGGAAAG...
193594,Gammaproteobacteria,Enterobacteriaceae,Klebsiella,2ADD9278FB,Enterobacteriales,Proteobacteria,ATTGAACGCTGGCGGCAGGCCTAACACATGCAAGTCGAGCGGTAGC...


In [20]:
df_final.to_csv('taxa.csv')

In [19]:
print(len(df_final['id'].values))
print(len(np.unique(df_final['id'].values)))

339308
339308
