# Preprocessing and Exploratory Data Analysis

### Import libraries and read source FASTA file

In [2]:
import numpy as np
import pandas as pd
from tqdm import tqdm_notebook as tqdm
import uuid

# Location of the source FASTA file, relative to the notebook's working directory.
fpath_fasta = '../data/current_Bacteria_unaligned.fa'

# Read the whole file inside a context manager so the handle is closed
# deterministically instead of being left open for the life of the kernel.
with open(fpath_fasta, 'r') as fasta_file:
    seq_data = fasta_file.read()

# One entry per physical line; header lines start with '>' and the lines that
# follow a header hold (possibly wrapped) sequence text.
lines = seq_data.split('\n')

### Function to process sample rows

In [3]:
def get_seq_dict(desc, seq):
    """Build a flat record for one FASTA entry from its header and sequence.

    The part of ``desc`` after the first ``'rootrank;'`` is split on ``';'``.
    For every rank label in a fixed list, the token immediately *before* the
    label is stored as that rank's name (double quotes removed). Ranks whose
    label is absent, or whose label has no preceding token, are stored as
    ``None``.

    Args:
        desc: Header line of the entry; must contain ``'rootrank;'``.
        seq: Sequence text belonging to the entry.

    Returns:
        dict with one key per rank (``domain``, ``phylum``, ``class``,
        ``subclass``, ``order``, ``suborder``, ``family``, ``genus``) plus
        ``id`` (10 upper-case hex characters from a random UUID),
        ``seq_len`` (``len(seq)``) and ``sequence`` (``seq`` upper-cased).

    Raises:
        IndexError: if the header cannot be processed for any reason. The
            original exception is chained as ``__cause__``.
    """
    keys = ['domain', 'phylum', 'class', 'subclass', 'order', 'suborder', 'family', 'genus']
    # Bound before the try block so the error handler can always print it,
    # even when the failure happens while splitting the header.
    txn_info = None
    try:
        txn_info = desc.split('rootrank;')[1].split(';')
        seq_info = {}
        for key in keys:
            try:
                idx = txn_info.index(key)
            except ValueError:
                # Rank label not present in this lineage.
                seq_info[key] = None
                continue
            # A label at position 0 has no preceding name; without this guard
            # idx - 1 == -1 would silently pick up the *last* token.
            seq_info[key] = txn_info[idx - 1].replace('"', '') if idx > 0 else None
        seq_info['id'] = uuid.uuid4().hex[:10].upper()
        seq_info['seq_len'] = len(seq)
        seq_info['sequence'] = seq.upper()
    except Exception as e:
        # Print the offending input for debugging, then surface the failure
        # as IndexError (the type callers of this function already expect).
        print(txn_info)
        print(desc)
        raise IndexError(e) from e

    return seq_info

### Compile DNA sequence samples as an array of dicts

In [4]:
# Header and accumulated sequence text of the record currently being read.
desc = ''
seq = ''

# One dict per parsed record, in file order.
seq_dicts = []

for line in tqdm(lines, total=len(lines)):
    if line.startswith('>'):
        # A new header closes the previous record; store it if it is non-empty.
        if len(desc) > 0 and len(seq) > 0:
            seq_dicts.append(get_seq_dict(desc, seq))
        desc = line
        seq = ''
    else:
        # Sequence text may be wrapped over several lines; concatenate them.
        seq += line

# The final record has no following header to trigger the append above, so
# flush it explicitly; otherwise the last sequence in the file is lost.
if len(desc) > 0 and len(seq) > 0:
    seq_dicts.append(get_seq_dict(desc, seq))

HBox(children=(IntProgress(value=0, max=46267413), HTML(value='')))




### Create dataframe from sequence dicts

In [6]:
# One row per parsed record; the columns come from the keys of the dicts
# collected in seq_dicts.
df_seqs = pd.DataFrame(seq_dicts)

### Filter by sequence length

In [7]:
# Keep sequences whose length is within [1270, 1370] (both bounds inclusive).
# A single mask built from df_seqs is applied to df_seqs itself; the previous
# two-step version indexed the already-filtered frame with a boolean Series
# derived from the unfiltered frame, which makes pandas reindex the mask and
# emit a warning.
df_len_filtered = df_seqs[df_seqs['seq_len'].between(1270, 1370)]

# Note: .size is the number of cells (rows x columns), not the number of rows.
print(df_len_filtered.size)

  


6072902


### Filter by sample counts per phylum

In [8]:
# Rank phyla by number of samples and keep the names of the three largest.
top_phylum_names = df_len_filtered['phylum'].value_counts().index[:3].tolist()
print(top_phylum_names)

# Retain only the rows that belong to one of those phyla.
df_phy_filtered = df_len_filtered.loc[df_len_filtered['phylum'].isin(top_phylum_names)]
print(df_phy_filtered.shape)

['Actinobacteria', 'Proteobacteria', 'Firmicutes']
(443483, 11)


### Filter by sample counts per class

In [9]:
# Names of the five classes with the most samples among the retained phyla.
top_class_names = df_phy_filtered['class'].value_counts().index[:5].tolist()
print(top_class_names)

# Drop every row whose class is not one of those five.
df_class_filtered = df_phy_filtered.loc[df_phy_filtered['class'].isin(top_class_names)]
print(df_class_filtered.shape)

['Actinobacteria', 'Clostridia', 'Gammaproteobacteria', 'Bacilli', 'Betaproteobacteria']
(393891, 11)


### Filter by sample counts per order

In [10]:
# Names of the 19 orders with the most samples among the retained classes.
top_order_names = df_class_filtered['order'].value_counts().index[:19].tolist()
print(top_order_names)

# Drop every row whose order is not in that list.
df_order_filtered = df_class_filtered.loc[df_class_filtered['order'].isin(top_order_names)]
print(df_order_filtered.shape)

['Actinomycetales', 'Clostridiales', 'Lactobacillales', 'Burkholderiales', 'Pseudomonadales', 'Enterobacteriales', 'Bacillales', 'Pasteurellales', 'Neisseriales', 'Xanthomonadales', 'Coriobacteriales', 'Acidimicrobiales', 'Alteromonadales', 'Vibrionales', 'Rhodocyclales', 'Cardiobacteriales', 'Aeromonadales', 'Oceanospirillales', 'Chromatiales']
(386059, 11)


### Filter by sample counts per family

In [11]:
# Names of the 65 families with the most samples among the retained orders.
top_family_names = df_order_filtered['family'].value_counts().index[:65].tolist()

# Drop every row whose family is not in that list.
df_family_filtered = df_order_filtered.loc[df_order_filtered['family'].isin(top_family_names)]
print(df_family_filtered.shape)

(371247, 11)


### Filter by sample counts per genus

In [12]:
# Names of the 393 genera with the most samples among the retained families.
top_genus_names = df_family_filtered['genus'].value_counts().index[:393].tolist()

# Rows belonging to those genera form the final working set.
df_final = df_family_filtered.loc[df_family_filtered['genus'].isin(top_genus_names)]
print(df_final.shape)

(339308, 11)


### Sample count per phylum

In [13]:
# Number of samples per phylum in the final frame.
df_final['phylum'].value_counts()

Actinobacteria    147411
Firmicutes        102599
Proteobacteria     89298
Name: phylum, dtype: int64

### Sample count per class

In [14]:
# Number of samples per class in the final frame.
df_final['class'].value_counts()

Actinobacteria         147411
Bacilli                 61977
Gammaproteobacteria     53530
Clostridia              40622
Betaproteobacteria      35768
Name: class, dtype: int64

### Sample count per order

In [15]:
# Number of samples per order in the final frame.
df_final['order'].value_counts()

Actinomycetales      144873
Lactobacillales       52706
Clostridiales         40622
Burkholderiales       28799
Pseudomonadales       23083
Enterobacteriales     15054
Bacillales             9271
Pasteurellales         8469
Neisseriales           5978
Xanthomonadales        1880
Coriobacteriales       1783
Vibrionales            1306
Alteromonadales        1203
Aeromonadales          1084
Rhodocyclales           991
Acidimicrobiales        755
Oceanospirillales       602
Cardiobacteriales       595
Chromatiales            254
Name: order, dtype: int64

### Sample count per family

In [17]:
# Sample counts for the first 15 families returned by value_counts().
df_final['family'].value_counts()[:15]

Corynebacteriaceae                 62174
Propionibacteriaceae               57240
Streptococcaceae                   41496
Moraxellaceae                      16263
Lachnospiraceae                    15107
Enterobacteriaceae                 15054
Comamonadaceae                     13460
Ruminococcaceae                    10140
Micrococcaceae                      8650
Pasteurellaceae                     8469
Burkholderiales_incertae_sedis      7087
Clostridiales_Incertae Sedis XI     7054
Pseudomonadaceae                    6820
Neisseriaceae                       5978
Burkholderiaceae                    5074
Name: family, dtype: int64

### Sample count per genus

In [19]:
# Sample counts for the first 15 genera returned by value_counts().
df_final['genus'].value_counts()[:15]

Corynebacterium      61768
Propionibacterium    56755
Streptococcus        40628
Aquabacterium         7025
Diaphorobacter        6891
Moraxella             6719
Acinetobacter         6696
Pseudomonas           6603
Serratia              5912
Faecalibacterium      5217
Neisseria             5182
Haemophilus           4380
Anaerococcus          4361
Acidovorax            4227
Bacillus              4186
Name: genus, dtype: int64

### Drop extra columns and create final dataframe

In [20]:
# Replace the index inherited from the successive filters with a fresh 0..n-1 range.
df_final = df_final.reset_index(drop=True)

# Remove columns that are not needed in the saved dataset. errors='ignore'
# keeps this cell re-runnable: df_final is reassigned in place, so on a second
# execution the columns are already gone and a plain drop would raise KeyError.
df_final = df_final.drop(
    ['seq_len', 'subclass', 'suborder', 'domain'],
    axis=1,
    errors='ignore',
)

### Display 10 randomly sampled rows of the dataset (the stored dataframe itself is not shuffled)

In [21]:
# Show up to 10 randomly chosen rows as a quick sanity check. Sampling the rows
# directly avoids shuffling (and copying) the whole frame just to display a
# handful, and the fixed seed makes the displayed rows reproducible on re-run.
# df_final itself is left unchanged.
df_final.sample(n=min(10, len(df_final)), random_state=0)

Unnamed: 0,class,family,genus,id,order,phylum,sequence
280576,Bacilli,Streptococcaceae,Streptococcus,631F43DE7C,Lactobacillales,Firmicutes,GACGAACGCTGGCGGCGTGCCTAATACATGCAAGTAGAACGCTGAA...
224526,Gammaproteobacteria,Moraxellaceae,Moraxella,94AB77B150,Pseudomonadales,Proteobacteria,ATTGAACGCTGGCGGCAGGCTTAACACATGCAAGTCGAACGAAGTT...
306015,Clostridia,Clostridiales_Incertae Sedis XI,Finegoldia,E001E4D1A0,Clostridiales,Firmicutes,ACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGGGATTT...
257451,Bacilli,Streptococcaceae,Streptococcus,065EDB51C9,Lactobacillales,Firmicutes,GCAGTAGAACGCTGAAGGAGGAGCTTGCTCTTCTGGATGAGTTGCG...
319473,Clostridia,Lachnospiraceae,Lachnospiracea_incertae_sedis,BD258CF42F,Clostridiales,Firmicutes,GATGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGAAGCA...
167514,Betaproteobacteria,Oxalobacteraceae,Janthinobacterium,7E0F265176,Burkholderiales,Proteobacteria,GTTTGATCCTGGCTCAGATTGAACGCTGGCGGCATGCCTTACACAT...
31565,Actinobacteria,Corynebacteriaceae,Corynebacterium,80C984F525,Actinomycetales,Actinobacteria,GACGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGGAAAG...
302231,Clostridia,Clostridiales_Incertae Sedis XI,Anaerococcus,47379ACF86,Clostridiales,Firmicutes,GATTAACGTTGGCGGCGTGCATAACACATGCAAGTCGAACGATGAA...
224052,Gammaproteobacteria,Moraxellaceae,Moraxella,2CCAA0F783,Pseudomonadales,Proteobacteria,ATTGAACGCTGGCGGCAGGCTTAACACATGCAAGTCGAACGATGAA...
296983,Bacilli,Streptococcaceae,Streptococcus,331173DFBD,Lactobacillales,Firmicutes,GACGAACGCTGGCGGCGTGCCTAATACATGCAAGTAGAACGCTGAA...


### Save the final processed dataframe as CSV

In [22]:
# Write the processed frame to 'taxa.csv' in the current working directory.
# NOTE(review): to_csv writes the index as an unnamed first column by default;
# if downstream readers do not rely on it, index=False would avoid that —
# confirm against the consumers of this file before changing.
df_final.to_csv('taxa.csv')